2015-03-16 21:18:11

by Anna Schumaker

Subject: [PATCH v3 0/3] NFSD: Add READ_PLUS support

These patches add server support for the NFS v4.2 operation READ_PLUS.

I tested these patches using xfstests and compared the runtime between NFS
v4.1, NFS v4.2 without READ_PLUS, and NFS v4.2 with READ_PLUS. Here are
the results:


Test         |   v4.1    | no READ_PLUS | READ_PLUS
----------------------------------------------------
generic/013  |   135s    |     143s     |   123s
generic/075  |     4s    |      11s     |     4s
generic/091  |     9s    |      16s     |    15s
generic/112  |     4s    |       7s     |     6s
generic/127  |    62s    |     117s     |   114s
generic/213  | [not run] |       1s     |     1s
generic/214  | [not run] |       0s     |     0s
generic/228  | [not run] |       1s     |     2s
generic/236  |     1s    |       1s     |     1s
generic/263  |     4s    |       6s     |     8s
generic/285  |     0s    |       1s     |     3s
generic/315  | [not run] |       2s     |     1s
----------------------------------------------------
Total        |  3:47.47  |   5:11.85    |  4:43.77


Using the READ_PLUS operation does improve read performance for sparse
files over the network. There is still a sizable gap between the v4.1
and v4.2 total runtimes, but that appears to come from the fallocate()
tests that only run over v4.2.


Changes since v2:
- Encode data segments using splice reads whenever possible.
- Combine patches for adding HOLE and DATA segment support.


These patches and the corresponding client changes are available in the
[read_plus] branch of

git://git.linux-nfs.org/projects/anna/linux-nfs.git

Questions? Comments? Thoughts?

Anna


Anna Schumaker (3):
NFSD: nfsd4_encode_read{v}() should encode eof and maxcount
NFSD: Add basic READ_PLUS support
NFSD: Add support for encoding multiple segments

fs/nfsd/nfs4proc.c | 16 +++++
fs/nfsd/nfs4xdr.c | 182 ++++++++++++++++++++++++++++++++++++++++++-----------
2 files changed, 160 insertions(+), 38 deletions(-)

--
2.3.3



2015-03-16 21:18:12

by Anna Schumaker

Subject: [PATCH v3 1/3] NFSD: nfsd4_encode_read{v}() should encode eof and maxcount

I intend to reuse nfsd4_encode_readv() for READ_PLUS, so I need an
alternate way to encode these values. I think it makes sense for
nfsd4_encode_read() to handle this in a single place for both splice and
readv paths.

Signed-off-by: Anna Schumaker <[email protected]>
---
fs/nfsd/nfs4xdr.c | 59 ++++++++++++++++++++++---------------------------------
1 file changed, 23 insertions(+), 36 deletions(-)

diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index df5e66c..22cd001 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -3286,21 +3286,19 @@ nfsd4_encode_open_downgrade(struct nfsd4_compoundres *resp, __be32 nfserr, struc
static __be32 nfsd4_encode_splice_read(
struct nfsd4_compoundres *resp,
struct nfsd4_read *read,
- struct file *file, unsigned long maxcount)
+ struct file *file, unsigned long *maxcount)
{
struct xdr_stream *xdr = &resp->xdr;
struct xdr_buf *buf = xdr->buf;
- u32 eof;
int space_left;
__be32 nfserr;
- __be32 *p = xdr->p - 2;

/* Make sure there will be room for padding if needed */
if (xdr->end - xdr->p < 1)
return nfserr_resource;

nfserr = nfsd_splice_read(read->rd_rqstp, file,
- read->rd_offset, &maxcount);
+ read->rd_offset, maxcount);
if (nfserr) {
/*
* nfsd_splice_actor may have already messed with the
@@ -3311,27 +3309,21 @@ static __be32 nfsd4_encode_splice_read(
return nfserr;
}

- eof = (read->rd_offset + maxcount >=
- read->rd_fhp->fh_dentry->d_inode->i_size);
-
- *(p++) = htonl(eof);
- *(p++) = htonl(maxcount);
-
- buf->page_len = maxcount;
- buf->len += maxcount;
- xdr->page_ptr += (buf->page_base + maxcount + PAGE_SIZE - 1)
+ buf->page_len = *maxcount;
+ buf->len += *maxcount;
+ xdr->page_ptr += (buf->page_base + *maxcount + PAGE_SIZE - 1)
/ PAGE_SIZE;

/* Use rest of head for padding and remaining ops: */
buf->tail[0].iov_base = xdr->p;
buf->tail[0].iov_len = 0;
xdr->iov = buf->tail;
- if (maxcount&3) {
- int pad = 4 - (maxcount&3);
+ if (*maxcount&3) {
+ int pad = 4 - (*maxcount&3);

*(xdr->p++) = 0;

- buf->tail[0].iov_base += maxcount&3;
+ buf->tail[0].iov_base += *maxcount&3;
buf->tail[0].iov_len = pad;
buf->len += pad;
}
@@ -3346,21 +3338,19 @@ static __be32 nfsd4_encode_splice_read(

static __be32 nfsd4_encode_readv(struct nfsd4_compoundres *resp,
struct nfsd4_read *read,
- struct file *file, unsigned long maxcount)
+ struct file *file, unsigned long *maxcount)
{
struct xdr_stream *xdr = &resp->xdr;
- u32 eof;
int v;
- int starting_len = xdr->buf->len - 8;
+ int starting_len = xdr->buf->len;
long len;
int thislen;
__be32 nfserr;
- __be32 tmp;
__be32 *p;
u32 zzz = 0;
int pad;

- len = maxcount;
+ len = *maxcount;
v = 0;

thislen = min_t(long, len, ((void *)xdr->end - (void *)xdr->p));
@@ -3383,22 +3373,13 @@ static __be32 nfsd4_encode_readv(struct nfsd4_compoundres *resp,
read->rd_vlen = v;

nfserr = nfsd_readv(file, read->rd_offset, resp->rqstp->rq_vec,
- read->rd_vlen, &maxcount);
+ read->rd_vlen, maxcount);
if (nfserr)
return nfserr;
- xdr_truncate_encode(xdr, starting_len + 8 + ((maxcount+3)&~3));
+ xdr_truncate_encode(xdr, starting_len + ((*maxcount+3)&~3));

- eof = (read->rd_offset + maxcount >=
- read->rd_fhp->fh_dentry->d_inode->i_size);
-
- tmp = htonl(eof);
- write_bytes_to_xdr_buf(xdr->buf, starting_len , &tmp, 4);
- tmp = htonl(maxcount);
- write_bytes_to_xdr_buf(xdr->buf, starting_len + 4, &tmp, 4);
-
- pad = (maxcount&3) ? 4 - (maxcount&3) : 0;
- write_bytes_to_xdr_buf(xdr->buf, starting_len + 8 + maxcount,
- &zzz, pad);
+ pad = (*maxcount&3) ? 4 - (*maxcount&3) : 0;
+ write_bytes_to_xdr_buf(xdr->buf, starting_len + *maxcount, &zzz, pad);
return 0;

}
@@ -3414,6 +3395,7 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr,
struct raparms *ra;
__be32 *p;
__be32 err;
+ u32 eof;

if (nfserr)
return nfserr;
@@ -3441,9 +3423,14 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr,
}

if (file->f_op->splice_read && test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags))
- err = nfsd4_encode_splice_read(resp, read, file, maxcount);
+ err = nfsd4_encode_splice_read(resp, read, file, &maxcount);
else
- err = nfsd4_encode_readv(resp, read, file, maxcount);
+ err = nfsd4_encode_readv(resp, read, file, &maxcount);
+
+ eof = (read->rd_offset + maxcount >=
+ read->rd_fhp->fh_dentry->d_inode->i_size);
+ *p++ = cpu_to_be32(eof);
+ *p++ = cpu_to_be32(maxcount);

if (!read->rd_filp)
nfsd_put_tmp_read_open(file, ra);
--
2.3.3


2015-03-16 21:18:13

by Anna Schumaker

Subject: [PATCH v3 2/3] NFSD: Add basic READ_PLUS support

This patch adds READ_PLUS support for both NFS4_CONTENT_DATA and
NFS4_CONTENT_HOLE segments. I keep things simple for now by only
returning one segment at a time to clients issuing the READ_PLUS call.
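
For reference, here is a rough user-space sketch (not part of the patch) of
the reply layout this produces for a single HOLE segment. Field order and
sizes follow the xdr_reserve_space() calls below; the content-type values
(DATA = 0, HOLE = 1) are an assumption taken from the NFS v4.2 spec.

#define _GNU_SOURCE
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <arpa/inet.h>
#include <endian.h>

enum { CONTENT_DATA = 0, CONTENT_HOLE = 1 };	/* assumed spec values */

static size_t encode_hole_reply(uint8_t *buf, uint32_t eof,
				uint64_t offset, uint64_t length)
{
	uint32_t w;
	uint64_t q;
	size_t n = 0;

	w = htonl(eof);          memcpy(buf + n, &w, 4); n += 4; /* eof flag */
	w = htonl(1);            memcpy(buf + n, &w, 4); n += 4; /* segment count */
	w = htonl(CONTENT_HOLE); memcpy(buf + n, &w, 4); n += 4; /* segment type */
	q = htobe64(offset);     memcpy(buf + n, &q, 8); n += 8; /* hole offset */
	q = htobe64(length);     memcpy(buf + n, &q, 8); n += 8; /* hole length */
	/* a DATA segment carries a 4-byte count plus the data instead */
	return n;
}

int main(void)
{
	uint8_t buf[64];

	printf("single-hole READ_PLUS reply body: %zu bytes\n",
	       encode_hole_reply(buf, 1, 0, 1 << 20));
	return 0;
}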

Signed-off-by: Anna Schumaker <[email protected]>
---
fs/nfsd/nfs4proc.c | 16 ++++++++
fs/nfsd/nfs4xdr.c | 110 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
2 files changed, 124 insertions(+), 2 deletions(-)

diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index d30bea8..e9f4d8f 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -1858,6 +1858,16 @@ static inline u32 nfsd4_read_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
return (op_encode_hdr_size + 2 + XDR_QUADLEN(rlen)) * sizeof(__be32);
}

+static inline u32 nfsd4_read_plus_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+ u32 maxcount = svc_max_payload(rqstp);
+ u32 rlen = min(op->u.read.rd_length, maxcount);
+ /* enough extra xdr space for encoding either a hole or data segment. */
+ u32 xdr = 5;
+
+ return (op_encode_hdr_size + 2 + xdr + XDR_QUADLEN(rlen)) * sizeof(__be32);
+}
+
static inline u32 nfsd4_readdir_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
{
u32 maxcount = 0, rlen = 0;
@@ -2290,6 +2300,12 @@ static struct nfsd4_operation nfsd4_ops[] = {
.op_name = "OP_DEALLOCATE",
.op_rsize_bop = (nfsd4op_rsize)nfsd4_write_rsize,
},
+ [OP_READ_PLUS] = {
+ .op_func = (nfsd4op_func)nfsd4_read,
+ .op_name = "OP_READ_PLUS",
+ .op_rsize_bop = (nfsd4op_rsize)nfsd4_read_plus_rsize,
+ .op_get_currentstateid = (stateid_getter)nfsd4_get_readstateid,
+ },
[OP_SEEK] = {
.op_func = (nfsd4op_func)nfsd4_seek,
.op_name = "OP_SEEK",
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 22cd001..799d52c 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -1769,7 +1769,7 @@ static nfsd4_dec nfsd4_dec_ops[] = {
[OP_LAYOUTSTATS] = (nfsd4_dec)nfsd4_decode_notsupp,
[OP_OFFLOAD_CANCEL] = (nfsd4_dec)nfsd4_decode_notsupp,
[OP_OFFLOAD_STATUS] = (nfsd4_dec)nfsd4_decode_notsupp,
- [OP_READ_PLUS] = (nfsd4_dec)nfsd4_decode_notsupp,
+ [OP_READ_PLUS] = (nfsd4_dec)nfsd4_decode_read,
[OP_SEEK] = (nfsd4_dec)nfsd4_decode_seek,
[OP_WRITE_SAME] = (nfsd4_dec)nfsd4_decode_notsupp,
};
@@ -4116,6 +4116,112 @@ nfsd4_encode_layoutreturn(struct nfsd4_compoundres *resp, __be32 nfserr,
#endif /* CONFIG_NFSD_PNFS */

static __be32
+nfsd4_encode_read_plus_data(struct nfsd4_compoundres *resp, struct nfsd4_read *read,
+ struct file *file)
+{
+ __be32 *p, err;
+ unsigned long maxcount;
+ struct xdr_stream *xdr = &resp->xdr;
+
+ p = xdr_reserve_space(xdr, 4 + 8 + 4);
+ if (!p)
+ return nfserr_resource;
+ xdr_commit_encode(xdr);
+
+ maxcount = svc_max_payload(resp->rqstp);
+ maxcount = min_t(unsigned long, maxcount, (xdr->buf->buflen - xdr->buf->len));
+ maxcount = min_t(unsigned long, maxcount, read->rd_length);
+
+ if (file->f_op->splice_read && test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags))
+ err = nfsd4_encode_splice_read(resp, read, file, &maxcount);
+ else
+ err = nfsd4_encode_readv(resp, read, file, &maxcount);
+
+ *p++ = cpu_to_be32(NFS4_CONTENT_DATA);
+ p = xdr_encode_hyper(p, read->rd_offset);
+ *p++ = cpu_to_be32(maxcount);
+
+ read->rd_offset += maxcount;
+ return err;
+}
+
+static __be32
+nfsd4_encode_read_plus_hole(struct nfsd4_compoundres *resp, struct nfsd4_read *read,
+ struct file *file)
+{
+ __be32 *p;
+ unsigned long maxcount;
+ loff_t data_pos = vfs_llseek(file, read->rd_offset, SEEK_DATA);
+
+ if (data_pos == -ENXIO)
+ data_pos = i_size_read(file_inode(file));
+ if (data_pos <= read->rd_offset)
+ return nfsd4_encode_read_plus_data(resp, read, file);
+
+ maxcount = data_pos - read->rd_offset;
+ p = xdr_reserve_space(&resp->xdr, 4 + 8 + 8);
+ *p++ = cpu_to_be32(NFS4_CONTENT_HOLE);
+ p = xdr_encode_hyper(p, read->rd_offset);
+ p = xdr_encode_hyper(p, maxcount);
+
+ read->rd_offset += maxcount;
+ return nfs_ok;
+}
+
+static __be32
+nfsd4_encode_read_plus(struct nfsd4_compoundres *resp, __be32 nfserr,
+ struct nfsd4_read *read)
+{
+ struct xdr_stream *xdr = &resp->xdr;
+ struct file *file = read->rd_filp;
+ int starting_len = xdr->buf->len;
+ struct raparms *ra;
+ loff_t hole_pos;
+ __be32 *p;
+ __be32 err;
+ u32 eof, segments = 0;
+
+ if (nfserr)
+ return nfserr;
+
+ /* eof flag, segment count */
+ p = xdr_reserve_space(xdr, 4 + 4 );
+ if (!p)
+ return nfserr_resource;
+ xdr_commit_encode(xdr);
+
+ if (!read->rd_filp) {
+ err = nfsd_get_tmp_read_open(resp->rqstp, read->rd_fhp,
+ &file, &ra);
+ if (err)
+ goto err_truncate;
+ }
+
+ hole_pos = vfs_llseek(file, read->rd_offset, SEEK_HOLE);
+ if (hole_pos == -ENXIO)
+ goto out_encode;
+
+ if (hole_pos == read->rd_offset)
+ err = nfsd4_encode_read_plus_hole(resp, read, file);
+ else
+ err = nfsd4_encode_read_plus_data(resp, read, file);
+ segments++;
+
+out_encode:
+ eof = (read->rd_offset >= i_size_read(file_inode(file)));
+ *p++ = cpu_to_be32(eof);
+ *p++ = cpu_to_be32(segments);
+
+ if (!read->rd_filp)
+ nfsd_put_tmp_read_open(file, ra);
+
+err_truncate:
+ if (err)
+ xdr_truncate_encode(xdr, starting_len);
+ return err;
+}
+
+static __be32
nfsd4_encode_seek(struct nfsd4_compoundres *resp, __be32 nfserr,
struct nfsd4_seek *seek)
{
@@ -4222,7 +4328,7 @@ static nfsd4_enc nfsd4_enc_ops[] = {
[OP_LAYOUTSTATS] = (nfsd4_enc)nfsd4_encode_noop,
[OP_OFFLOAD_CANCEL] = (nfsd4_enc)nfsd4_encode_noop,
[OP_OFFLOAD_STATUS] = (nfsd4_enc)nfsd4_encode_noop,
- [OP_READ_PLUS] = (nfsd4_enc)nfsd4_encode_noop,
+ [OP_READ_PLUS] = (nfsd4_enc)nfsd4_encode_read_plus,
[OP_SEEK] = (nfsd4_enc)nfsd4_encode_seek,
[OP_WRITE_SAME] = (nfsd4_enc)nfsd4_encode_noop,
};
--
2.3.3


2015-03-16 21:18:13

by Anna Schumaker

Subject: [PATCH v3 3/3] NFSD: Add support for encoding multiple segments

This patch implements sending an array of segments back to the client.
Clients should be prepared to handle multiple segment reads to make this
useful. We try to splice the first data segment into the XDR result,
and remaining segments are encoded directly.

Signed-off-by: Anna Schumaker <[email protected]>
---
fs/nfsd/nfs4proc.c | 4 ++--
fs/nfsd/nfs4xdr.c | 35 ++++++++++++++++++++++++-----------
2 files changed, 26 insertions(+), 13 deletions(-)

diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index e9f4d8f..6801973 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -1862,8 +1862,8 @@ static inline u32 nfsd4_read_plus_rsize(struct svc_rqst *rqstp, struct nfsd4_op
{
u32 maxcount = svc_max_payload(rqstp);
u32 rlen = min(op->u.read.rd_length, maxcount);
- /* enough extra xdr space for encoding either a hole or data segment. */
- u32 xdr = 5;
+ /* Extra xdr padding for encoding multiple segments. */
+ u32 xdr = 20;

return (op_encode_hdr_size + 2 + xdr + XDR_QUADLEN(rlen)) * sizeof(__be32);
}
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 799d52c..5eaecd2 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -4117,7 +4117,7 @@ nfsd4_encode_layoutreturn(struct nfsd4_compoundres *resp, __be32 nfserr,

static __be32
nfsd4_encode_read_plus_data(struct nfsd4_compoundres *resp, struct nfsd4_read *read,
- struct file *file)
+ struct file *file, loff_t hole_pos)
{
__be32 *p, err;
unsigned long maxcount;
@@ -4128,20 +4128,26 @@ nfsd4_encode_read_plus_data(struct nfsd4_compoundres *resp, struct nfsd4_read *r
return nfserr_resource;
xdr_commit_encode(xdr);

+ if (hole_pos <= read->rd_offset)
+ hole_pos = i_size_read(file_inode(file));
+
maxcount = svc_max_payload(resp->rqstp);
maxcount = min_t(unsigned long, maxcount, (xdr->buf->buflen - xdr->buf->len));
maxcount = min_t(unsigned long, maxcount, read->rd_length);
+ maxcount = min_t(unsigned long, maxcount, hole_pos - read->rd_offset);

if (file->f_op->splice_read && test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags))
err = nfsd4_encode_splice_read(resp, read, file, &maxcount);
else
err = nfsd4_encode_readv(resp, read, file, &maxcount);
+ clear_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags);

*p++ = cpu_to_be32(NFS4_CONTENT_DATA);
p = xdr_encode_hyper(p, read->rd_offset);
*p++ = cpu_to_be32(maxcount);

read->rd_offset += maxcount;
+ read->rd_length -= maxcount;
return err;
}

@@ -4156,7 +4162,7 @@ nfsd4_encode_read_plus_hole(struct nfsd4_compoundres *resp, struct nfsd4_read *r
if (data_pos == -ENXIO)
data_pos = i_size_read(file_inode(file));
if (data_pos <= read->rd_offset)
- return nfsd4_encode_read_plus_data(resp, read, file);
+ return nfsd4_encode_read_plus_data(resp, read, file, 0);

maxcount = data_pos - read->rd_offset;
p = xdr_reserve_space(&resp->xdr, 4 + 8 + 8);
@@ -4165,6 +4171,10 @@ nfsd4_encode_read_plus_hole(struct nfsd4_compoundres *resp, struct nfsd4_read *r
p = xdr_encode_hyper(p, maxcount);

read->rd_offset += maxcount;
+ if (maxcount > read->rd_length)
+ read->rd_length = 0;
+ else
+ read->rd_length -= maxcount;
return nfs_ok;
}

@@ -4197,17 +4207,20 @@ nfsd4_encode_read_plus(struct nfsd4_compoundres *resp, __be32 nfserr,
goto err_truncate;
}

- hole_pos = vfs_llseek(file, read->rd_offset, SEEK_HOLE);
- if (hole_pos == -ENXIO)
- goto out_encode;
+ do {
+ hole_pos = vfs_llseek(file, read->rd_offset, SEEK_HOLE);
+ if (hole_pos == -ENXIO)
+ break;

- if (hole_pos == read->rd_offset)
- err = nfsd4_encode_read_plus_hole(resp, read, file);
- else
- err = nfsd4_encode_read_plus_data(resp, read, file);
- segments++;
+ if (hole_pos == read->rd_offset)
+ err = nfsd4_encode_read_plus_hole(resp, read, file);
+ else
+ err = nfsd4_encode_read_plus_data(resp, read, file, hole_pos);
+ if (err)
+ break;
+ segments++;
+ } while (read->rd_length > 0);

-out_encode:
eof = (read->rd_offset >= i_size_read(file_inode(file)));
*p++ = cpu_to_be32(eof);
*p++ = cpu_to_be32(segments);
--
2.3.3


2015-03-17 19:56:34

by J. Bruce Fields

Subject: Re: [PATCH v3 3/3] NFSD: Add support for encoding multiple segments

On Mon, Mar 16, 2015 at 05:18:08PM -0400, Anna Schumaker wrote:
> This patch implements sending an array of segments back to the client.
> Clients should be prepared to handle multiple segment reads to make this
> useful. We try to splice the first data segment into the XDR result,
> and remaining segments are encoded directly.

I'm still interested in what would happen if we started with an
implementation like:

- if the entire requested range falls within a hole, return that
single hole.
- otherwise, just treat the thing as one big data segment.

That would provide a benefit in the case there are large-ish holes
with minimal impact otherwise.

(Though patches for full support are still useful even if only for
client-testing purposes.)
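
Roughly, something like this untested sketch is what I have in mind, reusing
the nfsd4_encode_read_plus_{hole,data} helpers added in patch 2 and leaving
out the eof/segment-count bookkeeping:

/* Hypothetical, untested sketch of the "one hole or one big data
 * segment" approach; helper names are the ones added in patch 2. */
static __be32
nfsd4_encode_read_plus_one_segment(struct nfsd4_compoundres *resp,
				   struct nfsd4_read *read, struct file *file)
{
	loff_t data_pos = vfs_llseek(file, read->rd_offset, SEEK_DATA);

	/* No data at or after rd_offset, or none before the end of the
	 * requested range: the whole range is a hole. */
	if (data_pos == -ENXIO ||
	    data_pos >= (loff_t)(read->rd_offset + read->rd_length))
		return nfsd4_encode_read_plus_hole(resp, read, file);

	/* Otherwise just encode the whole range as one DATA segment. */
	return nfsd4_encode_read_plus_data(resp, read, file);
}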

--b.

>
> Signed-off-by: Anna Schumaker <[email protected]>
> ---
> fs/nfsd/nfs4proc.c | 4 ++--
> fs/nfsd/nfs4xdr.c | 35 ++++++++++++++++++++++++-----------
> 2 files changed, 26 insertions(+), 13 deletions(-)
>
> diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
> index e9f4d8f..6801973 100644
> --- a/fs/nfsd/nfs4proc.c
> +++ b/fs/nfsd/nfs4proc.c
> @@ -1862,8 +1862,8 @@ static inline u32 nfsd4_read_plus_rsize(struct svc_rqst *rqstp, struct nfsd4_op
> {
> u32 maxcount = svc_max_payload(rqstp);
> u32 rlen = min(op->u.read.rd_length, maxcount);
> - /* enough extra xdr space for encoding either a hole or data segment. */
> - u32 xdr = 5;
> + /* Extra xdr padding for encoding multiple segments. */
> + u32 xdr = 20;
>
> return (op_encode_hdr_size + 2 + xdr + XDR_QUADLEN(rlen)) * sizeof(__be32);
> }
> diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
> index 799d52c..5eaecd2 100644
> --- a/fs/nfsd/nfs4xdr.c
> +++ b/fs/nfsd/nfs4xdr.c
> @@ -4117,7 +4117,7 @@ nfsd4_encode_layoutreturn(struct nfsd4_compoundres *resp, __be32 nfserr,
>
> static __be32
> nfsd4_encode_read_plus_data(struct nfsd4_compoundres *resp, struct nfsd4_read *read,
> - struct file *file)
> + struct file *file, loff_t hole_pos)
> {
> __be32 *p, err;
> unsigned long maxcount;
> @@ -4128,20 +4128,26 @@ nfsd4_encode_read_plus_data(struct nfsd4_compoundres *resp, struct nfsd4_read *r
> return nfserr_resource;
> xdr_commit_encode(xdr);
>
> + if (hole_pos <= read->rd_offset)
> + hole_pos = i_size_read(file_inode(file));
> +
> maxcount = svc_max_payload(resp->rqstp);
> maxcount = min_t(unsigned long, maxcount, (xdr->buf->buflen - xdr->buf->len));
> maxcount = min_t(unsigned long, maxcount, read->rd_length);
> + maxcount = min_t(unsigned long, maxcount, hole_pos - read->rd_offset);
>
> if (file->f_op->splice_read && test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags))
> err = nfsd4_encode_splice_read(resp, read, file, &maxcount);
> else
> err = nfsd4_encode_readv(resp, read, file, &maxcount);
> + clear_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags);
>
> *p++ = cpu_to_be32(NFS4_CONTENT_DATA);
> p = xdr_encode_hyper(p, read->rd_offset);
> *p++ = cpu_to_be32(maxcount);
>
> read->rd_offset += maxcount;
> + read->rd_length -= maxcount;
> return err;
> }
>
> @@ -4156,7 +4162,7 @@ nfsd4_encode_read_plus_hole(struct nfsd4_compoundres *resp, struct nfsd4_read *r
> if (data_pos == -ENXIO)
> data_pos = i_size_read(file_inode(file));
> if (data_pos <= read->rd_offset)
> - return nfsd4_encode_read_plus_data(resp, read, file);
> + return nfsd4_encode_read_plus_data(resp, read, file, 0);
>
> maxcount = data_pos - read->rd_offset;
> p = xdr_reserve_space(&resp->xdr, 4 + 8 + 8);
> @@ -4165,6 +4171,10 @@ nfsd4_encode_read_plus_hole(struct nfsd4_compoundres *resp, struct nfsd4_read *r
> p = xdr_encode_hyper(p, maxcount);
>
> read->rd_offset += maxcount;
> + if (maxcount > read->rd_length)
> + read->rd_length = 0;
> + else
> + read->rd_length -= maxcount;
> return nfs_ok;
> }
>
> @@ -4197,17 +4207,20 @@ nfsd4_encode_read_plus(struct nfsd4_compoundres *resp, __be32 nfserr,
> goto err_truncate;
> }
>
> - hole_pos = vfs_llseek(file, read->rd_offset, SEEK_HOLE);
> - if (hole_pos == -ENXIO)
> - goto out_encode;
> + do {
> + hole_pos = vfs_llseek(file, read->rd_offset, SEEK_HOLE);
> + if (hole_pos == -ENXIO)
> + break;
>
> - if (hole_pos == read->rd_offset)
> - err = nfsd4_encode_read_plus_hole(resp, read, file);
> - else
> - err = nfsd4_encode_read_plus_data(resp, read, file);
> - segments++;
> + if (hole_pos == read->rd_offset)
> + err = nfsd4_encode_read_plus_hole(resp, read, file);
> + else
> + err = nfsd4_encode_read_plus_data(resp, read, file, hole_pos);
> + if (err)
> + break;
> + segments++;
> + } while (read->rd_length > 0);
>
> -out_encode:
> eof = (read->rd_offset >= i_size_read(file_inode(file)));
> *p++ = cpu_to_be32(eof);
> *p++ = cpu_to_be32(segments);
> --
> 2.3.3
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html

2015-03-17 20:07:39

by J. Bruce Fields

Subject: Re: [PATCH v3 3/3] NFSD: Add support for encoding multiple segments

On Tue, Mar 17, 2015 at 03:56:33PM -0400, J. Bruce Fields wrote:
> On Mon, Mar 16, 2015 at 05:18:08PM -0400, Anna Schumaker wrote:
> > This patch implements sending an array of segments back to the client.
> > Clients should be prepared to handle multiple segment reads to make this
> > useful. We try to splice the first data segment into the XDR result,
> > and remaining segments are encoded directly.
>
> I'm still interested in what would happen if we started with an
> implementation like:
>
> - if the entire requested range falls within a hole, return that
> single hole.
> - otherwise, just treat the thing as one big data segment.
>
> That would provide a benefit in the case there are large-ish holes
> with minimal impact otherwise.
>
> (Though patches for full support are still useful even if only for
> client-testing purposes.)

Also, looks like

xfs_io -c "fiemap -v" <file>

will give hole sizes for a given <file>. (Thanks, esandeen.) Running
that on a few of my test vm images shows a fair number of large
(hundreds of megs) files, which suggests identifying only >=rwsize holes
might still be useful.

--b.

2015-03-17 21:36:54

by J. Bruce Fields

Subject: Re: [PATCH v3 3/3] NFSD: Add support for encoding multiple segments

On Tue, Mar 17, 2015 at 04:07:38PM -0400, J. Bruce Fields wrote:
> On Tue, Mar 17, 2015 at 03:56:33PM -0400, J. Bruce Fields wrote:
> > On Mon, Mar 16, 2015 at 05:18:08PM -0400, Anna Schumaker wrote:
> > > This patch implements sending an array of segments back to the client.
> > > Clients should be prepared to handle multiple segment reads to make this
> > > useful. We try to splice the first data segment into the XDR result,
> > > and remaining segments are encoded directly.
> >
> > I'm still interested in what would happen if we started with an
> > implementation like:
> >
> > - if the entire requested range falls within a hole, return that
> > single hole.
> > - otherwise, just treat the thing as one big data segment.
> >
> > That would provide a benefit in the case there are large-ish holes
> > with minimal impact otherwise.
> >
> > (Though patches for full support are still useful even if only for
> > client-testing purposes.)
>
> Also, looks like
>
> xvs_io -c "fiemap -v" <file>
>
> will give hole sizes for a given <file>. (Thanks, esandeen.) Running
> that on a few of my test vm images shows a fair number of large
> (hundreds of megs) files, which suggests identifying only >=rwsize holes
> might still be useful.

Just for fun.... I wrote the following test program and ran it on my
collection of testing vm's. Some looked like this:

f21-1.qcow2
144784 -rw-------. 1 qemu qemu 8591507456 Mar 16 10:13 f21-1.qcow2
total hole bytes: 8443252736 (98%)
in aligned 1MB chunks: 8428453888 (98%)

So, basically, read_plus would save transferring most of the data even
when only handling 1MB holes.

But some looked like this:

501524 -rw-------. 1 qemu qemu 8589934592 May 20 2014 rhel6-1-1.img
total hole bytes: 8077516800 (94%)
in aligned 1MB chunks: 0 (0%)

So the READ_PLUS that caught every hole might save a lot, while the one
that only caught 1MB holes wouldn't help at all.

And there were lots of examples in between those two extremes.

(But, check my math, I haven't tested this carefully.)

--b.

#define _GNU_SOURCE
#include <stdio.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include <errno.h>
#include <err.h>

long round_up(long n, long b)
{
	return ((n + b - 1)/b) * b;
}

long round_down(long n, long b)
{
	return (n/b) * b;
}

long hbytes = 0;
long rplusbytes = 0;

/* Account for one hole: total bytes, plus the bytes that fall within
 * whole, aligned 1MB chunks. */
void do_stats(off_t hole_start, off_t hole_end)
{
	off_t hole_start_up, hole_end_down;

	hole_start_up = round_up(hole_start, 1024*1024);
	hole_end_down = round_down(hole_end, 1024*1024);

	hbytes += hole_end - hole_start;
	if (hole_start_up < hole_end_down)
		rplusbytes += hole_end_down - hole_start_up;
}

int main(int argc, char *argv[])
{
	off_t hole_start, hole_end;
	int fd;
	char *name;

	/* Map out holes with SEEK_HOLE, SEEK_DATA */
	/* Useful statistics:
	 *  - what percentage of file is in holes?
	 *  - what percentage of file would be skipped if we read it
	 *    sequentially in 1MB chunks?
	 */

	if (argc != 2)
		errx(1, "usage: %s <filename>\n", argv[0]);
	name = argv[1];
	fd = open(name, O_RDONLY);
	if (fd == -1)
		err(1, "open");

	hole_end = 0;
	while (1) {
		hole_start = lseek(fd, hole_end, SEEK_HOLE);
		if (hole_start == -1)
			err(1, "lseek");
		hole_end = lseek(fd, hole_start, SEEK_DATA);
		if (hole_end == -1) {
			if (errno == ENXIO)
				break;
			err(1, "lseek");
		}
		do_stats(hole_start, hole_end);
	}
	hole_end = lseek(fd, 0, SEEK_END);
	do_stats(hole_start, hole_end);
	printf("total hole bytes: %ld (%.0f%%)\n", hbytes,
	       100 * (float)hbytes/hole_end);
	printf("in aligned 1MB chunks: %ld (%.0f%%)\n", rplusbytes,
	       100 * (float)rplusbytes/hole_end);
	return 0;
}

2015-03-18 18:16:31

by Anna Schumaker

Subject: Re: [PATCH v3 3/3] NFSD: Add support for encoding multiple segments

On 03/17/2015 05:36 PM, J. Bruce Fields wrote:
> On Tue, Mar 17, 2015 at 04:07:38PM -0400, J. Bruce Fields wrote:
>> On Tue, Mar 17, 2015 at 03:56:33PM -0400, J. Bruce Fields wrote:
>>> On Mon, Mar 16, 2015 at 05:18:08PM -0400, Anna Schumaker wrote:
>>>> This patch implements sending an array of segments back to the client.
>>>> Clients should be prepared to handle multiple segment reads to make this
>>>> useful. We try to splice the first data segment into the XDR result,
>>>> and remaining segments are encoded directly.
>>>
>>> I'm still interested in what would happen if we started with an
>>> implementation like:
>>>
>>> - if the entire requested range falls within a hole, return that
>>> single hole.
>>> - otherwise, just treat the thing as one big data segment.
>>>
>>> That would provide a benefit in the case there are large-ish holes
>>> with minimal impact otherwise.
>>>
>>> (Though patches for full support are still useful even if only for
>>> client-testing purposes.)
>>
>> Also, looks like
>>
>> xvs_io -c "fiemap -v" <file>
>>
>> will give hole sizes for a given <file>. (Thanks, esandeen.) Running
>> that on a few of my test vm images shows a fair number of large
>> (hundreds of megs) files, which suggests identifying only >=rwsize holes
>> might still be useful.
>
> Just for fun.... I wrote the following test program and ran it on my
> collection of testing vm's. Some looked like this:
>
> f21-1.qcow2
> 144784 -rw-------. 1 qemu qemu 8591507456 Mar 16 10:13 f21-1.qcow2
> total hole bytes: 8443252736 (98%)
> in aligned 1MB chunks: 8428453888 (98%)
>
> So, basically, read_plus would save transferring most of the data even
> when only handling 1MB holes.
>
> But some looked like this:
>
> 501524 -rw-------. 1 qemu qemu 8589934592 May 20 2014 rhel6-1-1.img
> total hole bytes: 8077516800 (94%)
> in aligned 1MB chunks: 0 (0%)
>
> So the READ_PLUS that caught every hole might save a lot, the one that
> only caught 1MB holes wouldn't help at all.
>
> And there were lots of examples in between those two extremes.

I tested with three different 512 MB files: 100% data, 100% hole, and alternating every megabyte. The results were surprising:

      |  v4.1  |  v4.2
------+--------+---------
data  | 0.685s |  0.714s
hole  | 0.485s | 15.547s
mixed | 1.283s |  0.448s

From what I can tell, the 100% hole case takes so long because of the SEEK_DATA call in nfsd4_encode_read_plus_hole(). I took this out to trick the function into thinking that the entire file was already a hole, and runtime dropped to the levels of v4.1 and of v4.2 without READ_PLUS. I wonder if this is filesystem dependent? My server is exporting ext4.

Anna
>
> (But, check my math, I haven't tested this carefully.)
>
> --b.
>
> #define _GNU_SOURCE
> #include <stdio.h>
> #include <sys/types.h>
> #include <sys/stat.h>
> #include <fcntl.h>
> #include <unistd.h>
> #include <errno.h>
> #include <err.h>
>
> long round_up(long n, long b)
> {
> return ((n + b - 1)/b) * b;
> }
>
> long round_down(long n, long b)
> {
> return (n/b) * b;
> }
>
> long hbytes = 0;
> long rplusbytes = 0;
>
> do_stats(off_t hole_start, off_t hole_end)
> {
> off_t hole_start_up, hole_end_down;
>
> hole_start_up = round_up(hole_start, 1024*1024);
> hole_end_down = round_down(hole_end, 1024*1024);
>
> hbytes += hole_end - hole_start;
> if (hole_start_up < hole_end_down)
> rplusbytes += hole_end_down - hole_start_up;
> }
>
> int main(int argc, char *argv[])
> {
> off_t hole_start, hole_end;
> int fd;
> char *name;
>
> /* Map out holes with SEEK_HOLE, SEEK_DATA */
> /* Useful statistics:
> * - what percentage of file is in holes?
> * - what percentage of file would be skipped if we read it
> * sequentially in 1MB chunks?
> */
>
> if (argc != 2)
> errx(1, "usage: %s <filename>\n", argv[0]);
> name = argv[1];
> fd = open(name, O_RDONLY);
> if (fd == -1)
> err(1, "open");
>
> hole_end = 0;
> while (1) {
> hole_start = lseek(fd, hole_end, SEEK_HOLE);
> if (hole_start == -1)
> err(1, "lseek");
> hole_end = lseek(fd, hole_start, SEEK_DATA);
> if (hole_end == -1) {
> if (errno == ENXIO)
> break;
> err(1, "lseek");
> }
> do_stats(hole_start, hole_end);
> }
> hole_end = lseek(fd, 0, SEEK_END);
> do_stats(hole_start, hole_end);
> printf("total hole bytes: %ld (%.0f%)\n", hbytes,
> 100 * (float)hbytes/hole_end);
> printf("in aligned 1MB chunks: %ld (%.0f%)\n", rplusbytes,
> 100 * (float)rplusbytes/hole_end);
> }
>


2015-03-18 18:55:45

by J. Bruce Fields

Subject: Re: [PATCH v3 3/3] NFSD: Add support for encoding multiple segments

On Wed, Mar 18, 2015 at 02:16:29PM -0400, Anna Schumaker wrote:
> On 03/17/2015 05:36 PM, J. Bruce Fields wrote:
> > On Tue, Mar 17, 2015 at 04:07:38PM -0400, J. Bruce Fields wrote:
> >> On Tue, Mar 17, 2015 at 03:56:33PM -0400, J. Bruce Fields wrote:
> >>> On Mon, Mar 16, 2015 at 05:18:08PM -0400, Anna Schumaker wrote:
> >>>> This patch implements sending an array of segments back to the client.
> >>>> Clients should be prepared to handle multiple segment reads to make this
> >>>> useful. We try to splice the first data segment into the XDR result,
> >>>> and remaining segments are encoded directly.
> >>>
> >>> I'm still interested in what would happen if we started with an
> >>> implementation like:
> >>>
> >>> - if the entire requested range falls within a hole, return that
> >>> single hole.
> >>> - otherwise, just treat the thing as one big data segment.
> >>>
> >>> That would provide a benefit in the case there are large-ish holes
> >>> with minimal impact otherwise.
> >>>
> >>> (Though patches for full support are still useful even if only for
> >>> client-testing purposes.)
> >>
> >> Also, looks like
> >>
> >> xvs_io -c "fiemap -v" <file>
> >>
> >> will give hole sizes for a given <file>. (Thanks, esandeen.) Running
> >> that on a few of my test vm images shows a fair number of large
> >> (hundreds of megs) files, which suggests identifying only >=rwsize holes
> >> might still be useful.
> >
> > Just for fun.... I wrote the following test program and ran it on my
> > collection of testing vm's. Some looked like this:
> >
> > f21-1.qcow2
> > 144784 -rw-------. 1 qemu qemu 8591507456 Mar 16 10:13 f21-1.qcow2
> > total hole bytes: 8443252736 (98%)
> > in aligned 1MB chunks: 8428453888 (98%)
> >
> > So, basically, read_plus would save transferring most of the data even
> > when only handling 1MB holes.
> >
> > But some looked like this:
> >
> > 501524 -rw-------. 1 qemu qemu 8589934592 May 20 2014 rhel6-1-1.img
> > total hole bytes: 8077516800 (94%)
> > in aligned 1MB chunks: 0 (0%)
> >
> > So the READ_PLUS that caught every hole might save a lot, the one that
> > only caught 1MB holes wouldn't help at all.
> >
> > And there were lots of examples in between those two extremes.
>
> I tested with three different 512 MB files: 100% data, 100% hole, and alternating every megabyte. The results were surprising:
>
> | v4.1 | v4.2
> -----------------------
> data | 0.685s | 0.714s
> hole | 0.485s | 15.547s
> mixed | 1.283s | 0.448
>
> >From what I can tell, the 100% hole case takes so long because of the
> >SEEK_DATA call in nfsd4_encode_read_plus_hole(). I took this out to
> >trick the function into thinking that the entire file was already a
> >hole, and runtime dropped to the levels of v4.1 and v4.2.

Wait, that 15s is due to just one SEEK_DATA?

> I wonder
> >if this is filesystem dependent? My server is exporting ext4.

Sounds like just a bug. I've been doing lots of lseek(.,.,SEEK_DATA) on
both ext4 and xfs without seeing anything that weird.

I believe it does return -ENXIO in the case SEEK_DATA is called at an
offset beyond which there's no more data. At least that's what I saw in
userspace. So maybe your code just isn't handling that case correctly?

--b.

2015-03-18 20:39:28

by Anna Schumaker

Subject: Re: [PATCH v3 3/3] NFSD: Add support for encoding multiple segments

On 03/18/2015 02:55 PM, J. Bruce Fields wrote:
> On Wed, Mar 18, 2015 at 02:16:29PM -0400, Anna Schumaker wrote:
>> On 03/17/2015 05:36 PM, J. Bruce Fields wrote:
>>> On Tue, Mar 17, 2015 at 04:07:38PM -0400, J. Bruce Fields wrote:
>>>> On Tue, Mar 17, 2015 at 03:56:33PM -0400, J. Bruce Fields wrote:
>>>>> On Mon, Mar 16, 2015 at 05:18:08PM -0400, Anna Schumaker wrote:
>>>>>> This patch implements sending an array of segments back to the client.
>>>>>> Clients should be prepared to handle multiple segment reads to make this
>>>>>> useful. We try to splice the first data segment into the XDR result,
>>>>>> and remaining segments are encoded directly.
>>>>>
>>>>> I'm still interested in what would happen if we started with an
>>>>> implementation like:
>>>>>
>>>>> - if the entire requested range falls within a hole, return that
>>>>> single hole.
>>>>> - otherwise, just treat the thing as one big data segment.
>>>>>
>>>>> That would provide a benefit in the case there are large-ish holes
>>>>> with minimal impact otherwise.
>>>>>
>>>>> (Though patches for full support are still useful even if only for
>>>>> client-testing purposes.)
>>>>
>>>> Also, looks like
>>>>
>>>> xvs_io -c "fiemap -v" <file>
>>>>
>>>> will give hole sizes for a given <file>. (Thanks, esandeen.) Running
>>>> that on a few of my test vm images shows a fair number of large
>>>> (hundreds of megs) files, which suggests identifying only >=rwsize holes
>>>> might still be useful.
>>>
>>> Just for fun.... I wrote the following test program and ran it on my
>>> collection of testing vm's. Some looked like this:
>>>
>>> f21-1.qcow2
>>> 144784 -rw-------. 1 qemu qemu 8591507456 Mar 16 10:13 f21-1.qcow2
>>> total hole bytes: 8443252736 (98%)
>>> in aligned 1MB chunks: 8428453888 (98%)
>>>
>>> So, basically, read_plus would save transferring most of the data even
>>> when only handling 1MB holes.
>>>
>>> But some looked like this:
>>>
>>> 501524 -rw-------. 1 qemu qemu 8589934592 May 20 2014 rhel6-1-1.img
>>> total hole bytes: 8077516800 (94%)
>>> in aligned 1MB chunks: 0 (0%)
>>>
>>> So the READ_PLUS that caught every hole might save a lot, the one that
>>> only caught 1MB holes wouldn't help at all.
>>>
>>> And there were lots of examples in between those two extremes.
>>
>> I tested with three different 512 MB files: 100% data, 100% hole, and alternating every megabyte. The results were surprising:
>>
>> | v4.1 | v4.2
>> -----------------------
>> data | 0.685s | 0.714s
>> hole | 0.485s | 15.547s
>> mixed | 1.283s | 0.448
>>
>> >From what I can tell, the 100% hole case takes so long because of the
>>> SEEK_DATA call in nfsd4_encode_read_plus_hole(). I took this out to
>>> trick the function into thinking that the entire file was already a
>>> hole, and runtime dropped to the levels of v4.1 and v4.2.
>
> Wait, that 15s is due to just one SEEK_DATA?

The server is returning a larger hole than the client can read at once, so there are several SEEK_DATA calls made to verify that there are no data segments before the end of the file.

>
>> I wonder
>>> if this is filesystem dependent? My server is exporting ext4.
>
> Sounds like just a bug. I've been doing lots of lseek(.,.,SEEK_DATA) on
> both ext4 and xfs without seeing anything that weird.

It looks like something weird on ext4. I switched my exported filesystem to xfs:

      |  v4.1  |  v4.2
------+--------+-------
data  | 0.764s | 1.343s
hole  | 0.572s | 0.205s
mixed | 0.634s | 0.472s


I bumped up the test to 1G files:

      |  v4.1  |  v4.2
------+--------+-------
data  | 1.578s | 1.743s
hole  | 1.241s | 0.443s
mixed | 1.884s | 0.913s

Let me know if I should test anything larger!

Anna
>
> I believe it does return -ENXIO in the case SEEK_DATA is called at an
> offset beyond which there's no more data. At least that's what I saw in
> userspace. So maybe your code just isn't handling that case correctly?
>
> --b.
>


2015-03-18 20:55:54

by J. Bruce Fields

Subject: Re: [PATCH v3 3/3] NFSD: Add support for encoding multiple segments

On Wed, Mar 18, 2015 at 04:39:24PM -0400, Anna Schumaker wrote:
> On 03/18/2015 02:55 PM, J. Bruce Fields wrote:
> > On Wed, Mar 18, 2015 at 02:16:29PM -0400, Anna Schumaker wrote:
> >> On 03/17/2015 05:36 PM, J. Bruce Fields wrote:
> >>> On Tue, Mar 17, 2015 at 04:07:38PM -0400, J. Bruce Fields wrote:
> >>>> On Tue, Mar 17, 2015 at 03:56:33PM -0400, J. Bruce Fields wrote:
> >>>>> On Mon, Mar 16, 2015 at 05:18:08PM -0400, Anna Schumaker wrote:
> >>>>>> This patch implements sending an array of segments back to the client.
> >>>>>> Clients should be prepared to handle multiple segment reads to make this
> >>>>>> useful. We try to splice the first data segment into the XDR result,
> >>>>>> and remaining segments are encoded directly.
> >>>>>
> >>>>> I'm still interested in what would happen if we started with an
> >>>>> implementation like:
> >>>>>
> >>>>> - if the entire requested range falls within a hole, return that
> >>>>> single hole.
> >>>>> - otherwise, just treat the thing as one big data segment.
> >>>>>
> >>>>> That would provide a benefit in the case there are large-ish holes
> >>>>> with minimal impact otherwise.
> >>>>>
> >>>>> (Though patches for full support are still useful even if only for
> >>>>> client-testing purposes.)
> >>>>
> >>>> Also, looks like
> >>>>
> >>>> xvs_io -c "fiemap -v" <file>
> >>>>
> >>>> will give hole sizes for a given <file>. (Thanks, esandeen.) Running
> >>>> that on a few of my test vm images shows a fair number of large
> >>>> (hundreds of megs) files, which suggests identifying only >=rwsize holes
> >>>> might still be useful.
> >>>
> >>> Just for fun.... I wrote the following test program and ran it on my
> >>> collection of testing vm's. Some looked like this:
> >>>
> >>> f21-1.qcow2
> >>> 144784 -rw-------. 1 qemu qemu 8591507456 Mar 16 10:13 f21-1.qcow2
> >>> total hole bytes: 8443252736 (98%)
> >>> in aligned 1MB chunks: 8428453888 (98%)
> >>>
> >>> So, basically, read_plus would save transferring most of the data even
> >>> when only handling 1MB holes.
> >>>
> >>> But some looked like this:
> >>>
> >>> 501524 -rw-------. 1 qemu qemu 8589934592 May 20 2014 rhel6-1-1.img
> >>> total hole bytes: 8077516800 (94%)
> >>> in aligned 1MB chunks: 0 (0%)
> >>>
> >>> So the READ_PLUS that caught every hole might save a lot, the one that
> >>> only caught 1MB holes wouldn't help at all.
> >>>
> >>> And there were lots of examples in between those two extremes.
> >>
> >> I tested with three different 512 MB files: 100% data, 100% hole, and alternating every megabyte. The results were surprising:
> >>
> >> | v4.1 | v4.2
> >> -----------------------
> >> data | 0.685s | 0.714s
> >> hole | 0.485s | 15.547s
> >> mixed | 1.283s | 0.448
> >>
> >> >From what I can tell, the 100% hole case takes so long because of the
> >>> SEEK_DATA call in nfsd4_encode_read_plus_hole(). I took this out to
> >>> trick the function into thinking that the entire file was already a
> >>> hole, and runtime dropped to the levels of v4.1 and v4.2.
> >
> > Wait, that 15s is due to just one SEEK_DATA?
>
> The server is returning a larger hole than the client can read at once, so there are several SEEK_DATA calls made to verify that there are no data segments before the end of the file.
>
> >
> >> I wonder
> >>> if this is filesystem dependent? My server is exporting ext4.
> >
> > Sounds like just a bug. I've been doing lots of lseek(.,.,SEEK_DATA) on
> > both ext4 and xfs without seeing anything that weird.
>
> It looks like something weird on ext4. I switched my exported filesystem to xfs:

Huh. Maybe we should report a bug....

>
> | v4.1 | v4.2
> ------+--------+-------
> data | 0.764s | 1.343s

That's too bad. Non-sparse files are surely still a common case and
we'd like to not see a slowdown there.... I wonder if we can figure out
where it's coming from?

> hole | 0.572s | 0.205s
> mixed | 0.634s | 0.472s
>
>
> I bumped up the test to 1G files:
>
> | v4.1 | v4.2
> ------+--------+-------
> data | 1.578s | 1.743s
> hole | 1.241s | 0.443s
> mixed | 1.884s | 0.913s
>
> Let me know if I should test anything larger!

The other thing I'd be interested in would be a "mixed" case that
alternates every 4k. That will test the worst case where we do a 1MB
read and get back only a 4k hole. Aligned 1MB holes are somewhat of a
best case.
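
For what it's worth, a quick (hypothetical, untested) generator for that kind
of file: write 4k of data, skip 4k, repeat, then truncate up at the end so the
trailing hole counts toward the size. The skipped ranges only become holes on
a filesystem that supports sparse files.

#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <err.h>

int main(int argc, char *argv[])
{
	char data[4096];
	long i, chunks;
	int fd;

	if (argc != 3)
		errx(1, "usage: %s <file> <size-in-MB>", argv[0]);
	/* each chunk is 4k data followed by a 4k hole */
	chunks = atol(argv[2]) * 1024 * 1024 / (2 * 4096);

	fd = open(argv[1], O_WRONLY | O_CREAT | O_TRUNC, 0644);
	if (fd == -1)
		err(1, "open");
	memset(data, 0xaa, sizeof(data));

	for (i = 0; i < chunks; i++) {
		if (write(fd, data, sizeof(data)) != sizeof(data))
			err(1, "write");
		/* never written, so this 4k range stays a hole */
		if (lseek(fd, 4096, SEEK_CUR) == -1)
			err(1, "lseek");
	}
	/* extend the size so the final hole is part of the file */
	if (ftruncate(fd, (off_t)chunks * 2 * 4096) == -1)
		err(1, "ftruncate");
	return 0;
}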

--b.

2015-03-18 21:03:35

by Anna Schumaker

Subject: Re: [PATCH v3 3/3] NFSD: Add support for encoding multiple segments

On 03/18/2015 04:55 PM, J. Bruce Fields wrote:
> On Wed, Mar 18, 2015 at 04:39:24PM -0400, Anna Schumaker wrote:
>> On 03/18/2015 02:55 PM, J. Bruce Fields wrote:
>>> On Wed, Mar 18, 2015 at 02:16:29PM -0400, Anna Schumaker wrote:
>>>> On 03/17/2015 05:36 PM, J. Bruce Fields wrote:
>>>>> On Tue, Mar 17, 2015 at 04:07:38PM -0400, J. Bruce Fields wrote:
>>>>>> On Tue, Mar 17, 2015 at 03:56:33PM -0400, J. Bruce Fields wrote:
>>>>>>> On Mon, Mar 16, 2015 at 05:18:08PM -0400, Anna Schumaker wrote:
>>>>>>>> This patch implements sending an array of segments back to the client.
>>>>>>>> Clients should be prepared to handle multiple segment reads to make this
>>>>>>>> useful. We try to splice the first data segment into the XDR result,
>>>>>>>> and remaining segments are encoded directly.
>>>>>>>
>>>>>>> I'm still interested in what would happen if we started with an
>>>>>>> implementation like:
>>>>>>>
>>>>>>> - if the entire requested range falls within a hole, return that
>>>>>>> single hole.
>>>>>>> - otherwise, just treat the thing as one big data segment.
>>>>>>>
>>>>>>> That would provide a benefit in the case there are large-ish holes
>>>>>>> with minimal impact otherwise.
>>>>>>>
>>>>>>> (Though patches for full support are still useful even if only for
>>>>>>> client-testing purposes.)
>>>>>>
>>>>>> Also, looks like
>>>>>>
>>>>>> xvs_io -c "fiemap -v" <file>
>>>>>>
>>>>>> will give hole sizes for a given <file>. (Thanks, esandeen.) Running
>>>>>> that on a few of my test vm images shows a fair number of large
>>>>>> (hundreds of megs) files, which suggests identifying only >=rwsize holes
>>>>>> might still be useful.
>>>>>
>>>>> Just for fun.... I wrote the following test program and ran it on my
>>>>> collection of testing vm's. Some looked like this:
>>>>>
>>>>> f21-1.qcow2
>>>>> 144784 -rw-------. 1 qemu qemu 8591507456 Mar 16 10:13 f21-1.qcow2
>>>>> total hole bytes: 8443252736 (98%)
>>>>> in aligned 1MB chunks: 8428453888 (98%)
>>>>>
>>>>> So, basically, read_plus would save transferring most of the data even
>>>>> when only handling 1MB holes.
>>>>>
>>>>> But some looked like this:
>>>>>
>>>>> 501524 -rw-------. 1 qemu qemu 8589934592 May 20 2014 rhel6-1-1.img
>>>>> total hole bytes: 8077516800 (94%)
>>>>> in aligned 1MB chunks: 0 (0%)
>>>>>
>>>>> So the READ_PLUS that caught every hole might save a lot, the one that
>>>>> only caught 1MB holes wouldn't help at all.
>>>>>
>>>>> And there were lots of examples in between those two extremes.
>>>>
>>>> I tested with three different 512 MB files: 100% data, 100% hole, and alternating every megabyte. The results were surprising:
>>>>
>>>> | v4.1 | v4.2
>>>> -----------------------
>>>> data | 0.685s | 0.714s
>>>> hole | 0.485s | 15.547s
>>>> mixed | 1.283s | 0.448
>>>>
>>>> >From what I can tell, the 100% hole case takes so long because of the
>>>>> SEEK_DATA call in nfsd4_encode_read_plus_hole(). I took this out to
>>>>> trick the function into thinking that the entire file was already a
>>>>> hole, and runtime dropped to the levels of v4.1 and v4.2.
>>>
>>> Wait, that 15s is due to just one SEEK_DATA?
>>
>> The server is returning a larger hole than the client can read at once, so there are several SEEK_DATA calls made to verify that there are no data segments before the end of the file.
>>
>>>
>>>> I wonder
>>>>> if this is filesystem dependent? My server is exporting ext4.
>>>
>>> Sounds like just a bug. I've been doing lots of lseek(.,.,SEEK_DATA) on
>>> both ext4 and xfs without seeing anything that weird.
>>
>> It looks like something weird on ext4. I switched my exported filesystem to xfs:
>
> Huh. Maybe we should report a bug....
>
>>
>> | v4.1 | v4.2
>> ------+--------+-------
>> data | 0.764s | 1.343s
>
> That's too bad. Non-sparse files are surely still a common case and
> we'd like to not see a slowdown there.... I wonder if we can figure out
> where it's coming from?

That's a good question, especially since the 1G file didn't double this time. Maybe a VM quirk?


>
>> hole | 0.572s | 0.205s
>> mixed | 0.634s | 0.472s
>>
>>
>> I bumped up the test to 1G files:
>>
>> | v4.1 | v4.2
>> ------+--------+-------
>> data | 1.578s | 1.743s
>> hole | 1.241s | 0.443s
>> mixed | 1.884s | 0.913s
>>
>> Let me know if I should test anything larger!
>
> The other thing I'd be interested in would be a "mixed" case that
> alternates every 4k. That will test the worst case where we we do a 1MB
> read and get back only a 4k hole. Aligned 1MB holes are somewhat of a
> best case.

I probably won't get a chance to test this until I'm back from my vacation, but I'll keep the suggestion in mind!

Anna
>
> --b.
>


2015-03-18 21:11:45

by J. Bruce Fields

Subject: Re: [PATCH v3 3/3] NFSD: Add support for encoding multiple segments

On Wed, Mar 18, 2015 at 05:03:32PM -0400, Anna Schumaker wrote:
> On 03/18/2015 04:55 PM, J. Bruce Fields wrote:
> > On Wed, Mar 18, 2015 at 04:39:24PM -0400, Anna Schumaker wrote:
> >> On 03/18/2015 02:55 PM, J. Bruce Fields wrote:
> >>> On Wed, Mar 18, 2015 at 02:16:29PM -0400, Anna Schumaker wrote:
> >>>> On 03/17/2015 05:36 PM, J. Bruce Fields wrote:
> >>>>> On Tue, Mar 17, 2015 at 04:07:38PM -0400, J. Bruce Fields wrote:
> >>>>>> On Tue, Mar 17, 2015 at 03:56:33PM -0400, J. Bruce Fields wrote:
> >>>>>>> On Mon, Mar 16, 2015 at 05:18:08PM -0400, Anna Schumaker wrote:
> >>>>>>>> This patch implements sending an array of segments back to the client.
> >>>>>>>> Clients should be prepared to handle multiple segment reads to make this
> >>>>>>>> useful. We try to splice the first data segment into the XDR result,
> >>>>>>>> and remaining segments are encoded directly.
> >>>>>>>
> >>>>>>> I'm still interested in what would happen if we started with an
> >>>>>>> implementation like:
> >>>>>>>
> >>>>>>> - if the entire requested range falls within a hole, return that
> >>>>>>> single hole.
> >>>>>>> - otherwise, just treat the thing as one big data segment.
> >>>>>>>
> >>>>>>> That would provide a benefit in the case there are large-ish holes
> >>>>>>> with minimal impact otherwise.
> >>>>>>>
> >>>>>>> (Though patches for full support are still useful even if only for
> >>>>>>> client-testing purposes.)
> >>>>>>
> >>>>>> Also, looks like
> >>>>>>
> >>>>>> xvs_io -c "fiemap -v" <file>
> >>>>>>
> >>>>>> will give hole sizes for a given <file>. (Thanks, esandeen.) Running
> >>>>>> that on a few of my test vm images shows a fair number of large
> >>>>>> (hundreds of megs) files, which suggests identifying only >=rwsize holes
> >>>>>> might still be useful.
> >>>>>
> >>>>> Just for fun.... I wrote the following test program and ran it on my
> >>>>> collection of testing vm's. Some looked like this:
> >>>>>
> >>>>> f21-1.qcow2
> >>>>> 144784 -rw-------. 1 qemu qemu 8591507456 Mar 16 10:13 f21-1.qcow2
> >>>>> total hole bytes: 8443252736 (98%)
> >>>>> in aligned 1MB chunks: 8428453888 (98%)
> >>>>>
> >>>>> So, basically, read_plus would save transferring most of the data even
> >>>>> when only handling 1MB holes.
> >>>>>
> >>>>> But some looked like this:
> >>>>>
> >>>>> 501524 -rw-------. 1 qemu qemu 8589934592 May 20 2014 rhel6-1-1.img
> >>>>> total hole bytes: 8077516800 (94%)
> >>>>> in aligned 1MB chunks: 0 (0%)
> >>>>>
> >>>>> So the READ_PLUS that caught every hole might save a lot, the one that
> >>>>> only caught 1MB holes wouldn't help at all.
> >>>>>
> >>>>> And there were lots of examples in between those two extremes.
> >>>>
> >>>> I tested with three different 512 MB files: 100% data, 100% hole, and alternating every megabyte. The results were surprising:
> >>>>
> >>>> | v4.1 | v4.2
> >>>> -----------------------
> >>>> data | 0.685s | 0.714s
> >>>> hole | 0.485s | 15.547s
> >>>> mixed | 1.283s | 0.448
> >>>>
> >>>> >From what I can tell, the 100% hole case takes so long because of the
> >>>>> SEEK_DATA call in nfsd4_encode_read_plus_hole(). I took this out to
> >>>>> trick the function into thinking that the entire file was already a
> >>>>> hole, and runtime dropped to the levels of v4.1 and v4.2.
> >>>
> >>> Wait, that 15s is due to just one SEEK_DATA?
> >>
> >> The server is returning a larger hole than the client can read at once, so there are several SEEK_DATA calls made to verify that there are no data segments before the end of the file.
> >>
> >>>
> >>>> I wonder
> >>>>> if this is filesystem dependent? My server is exporting ext4.
> >>>
> >>> Sounds like just a bug. I've been doing lots of lseek(.,.,SEEK_DATA) on
> >>> both ext4 and xfs without seeing anything that weird.
> >>
> >> It looks like something weird on ext4. I switched my exported filesystem to xfs:
> >
> > Huh. Maybe we should report a bug....
> >
> >>
> >> | v4.1 | v4.2
> >> ------+--------+-------
> >> data | 0.764s | 1.343s
> >
> > That's too bad. Non-sparse files are surely still a common case and
> > we'd like to not see a slowdown there.... I wonder if we can figure out
> > where it's coming from?
>
> That's a good question, especially since the 1G file didn't double this time. Maybe a VM quirk?

We definitely need to figure it out, I think. If we can't make
READ_PLUS perform as well as READ (or very close to it) in the
non-sparse case then I don't think we'll want it, and as Trond suggested
we may want to consider something more fiemap-like instead.

I don't know, maybe the client could try to be clever and only use
READ_PLUS if the space_used/size ratio is lower than some threshold,
but it could get a little complicated to tune.
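
Just to illustrate the idea (nothing like this exists in any patch, and the
50% threshold below is arbitrary), the client-side check could key off the
allocated-blocks-to-size ratio from stat:

#include <stdio.h>
#include <stdbool.h>
#include <sys/stat.h>
#include <err.h>

/* Hypothetical heuristic: call the file "sparse enough" for READ_PLUS
 * when less than half of its logical size is actually allocated.
 * st_blocks is in 512-byte units. */
static bool prefer_read_plus(const struct stat *st)
{
	if (st->st_size == 0)
		return false;
	return (unsigned long long)st->st_blocks * 512 <
	       (unsigned long long)st->st_size / 2;
}

int main(int argc, char *argv[])
{
	struct stat st;

	if (argc != 2)
		errx(1, "usage: %s <file>", argv[0]);
	if (stat(argv[1], &st) == -1)
		err(1, "stat");
	printf("%s: %s\n", argv[1],
	       prefer_read_plus(&st) ? "use READ_PLUS" : "use READ");
	return 0;
}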

It's annoying that asking "does this range contain zeroes" is actually
taking longer than just reading the whole range....

--b.

2015-03-19 15:36:27

by J. Bruce Fields

Subject: Re: [PATCH v3 3/3] NFSD: Add support for encoding multiple segments

On Thu, Mar 19, 2015 at 08:00:05AM -0700, Marc Eshel wrote:
> [email protected] wrote on 03/18/2015 02:11:44 PM:
>
> > From: "J. Bruce Fields" <[email protected]>
> > To: Anna Schumaker <[email protected]>
> > Cc: [email protected]
> > Date: 03/18/2015 02:14 PM
> > Subject: Re: [PATCH v3 3/3] NFSD: Add support for encoding multiple
> segments
> > Sent by: [email protected]
> >
> > On Wed, Mar 18, 2015 at 05:03:32PM -0400, Anna Schumaker wrote:
> > > On 03/18/2015 04:55 PM, J. Bruce Fields wrote:
> > > > On Wed, Mar 18, 2015 at 04:39:24PM -0400, Anna Schumaker wrote:
> > > >> On 03/18/2015 02:55 PM, J. Bruce Fields wrote:
> > > >>> On Wed, Mar 18, 2015 at 02:16:29PM -0400, Anna Schumaker wrote:
> > > >>>> On 03/17/2015 05:36 PM, J. Bruce Fields wrote:
> > > >>>>> On Tue, Mar 17, 2015 at 04:07:38PM -0400, J. Bruce Fields wrote:
> > > >>>>>> On Tue, Mar 17, 2015 at 03:56:33PM -0400, J. Bruce Fields
> wrote:
> > > >>>>>>> On Mon, Mar 16, 2015 at 05:18:08PM -0400, Anna Schumaker
> wrote:
> > > >>>>>>>> This patch implements sending an array of segments back
> > to the client.
> > > >>>>>>>> Clients should be prepared to handle multiple segment
> > reads to make this
> > > >>>>>>>> useful. We try to splice the first data segment into the
> > XDR result,
> > > >>>>>>>> and remaining segments are encoded directly.
> > > >>>>>>>
> > > >>>>>>> I'm still interested in what would happen if we started with
> an
> > > >>>>>>> implementation like:
> > > >>>>>>>
> > > >>>>>>> - if the entire requested range falls within a hole, return
> that
> > > >>>>>>> single hole.
> > > >>>>>>> - otherwise, just treat the thing as one big data segment.
> > > >>>>>>>
> > > >>>>>>> That would provide a benefit in the case there are large-ish
> holes
> > > >>>>>>> with minimal impact otherwise.
> > > >>>>>>>
> > > >>>>>>> (Though patches for full support are still useful even if only
> for
> > > >>>>>>> client-testing purposes.)
> > > >>>>>>
> > > >>>>>> Also, looks like
> > > >>>>>>
> > > >>>>>> xvs_io -c "fiemap -v" <file>
> > > >>>>>>
> > > >>>>>> will give hole sizes for a given <file>. (Thanks,
> > esandeen.) Running
> > > >>>>>> that on a few of my test vm images shows a fair number of large
> > > >>>>>> (hundreds of megs) files, which suggests identifying only
> > >=rwsize holes
> > > >>>>>> might still be useful.
> > > >>>>>
> > > >>>>> Just for fun.... I wrote the following test program and ran it
> on my
> > > >>>>> collection of testing vm's. Some looked like this:
> > > >>>>>
> > > >>>>> f21-1.qcow2
> > > >>>>> 144784 -rw-------. 1 qemu qemu 8591507456 Mar 16 10:13
> f21-1.qcow2
> > > >>>>> total hole bytes: 8443252736 (98%)
> > > >>>>> in aligned 1MB chunks: 8428453888 (98%)
> > > >>>>>
> > > >>>>> So, basically, read_plus would save transferring most of thedata
> even
> > > >>>>> when only handling 1MB holes.
> > > >>>>>
> > > >>>>> But some looked like this:
> > > >>>>>
> > > >>>>> 501524 -rw-------. 1 qemu qemu 8589934592 May 20 2014
> > rhel6-1-1.img
> > > >>>>> total hole bytes: 8077516800 (94%)
> > > >>>>> in aligned 1MB chunks: 0 (0%)
> > > >>>>>
> > > >>>>> So the READ_PLUS that caught every hole might save a lot, the
> one that
> > > >>>>> only caught 1MB holes wouldn't help at all.
> > > >>>>>
> > > >>>>> And there were lots of examples in between those two extremes.
> > > >>>>
> > > >>>> I tested with three different 512 MB files: 100% data, 100%
> > hole, and alternating every megabyte. The results were surprising:
> > > >>>>
> > > >>>> | v4.1 | v4.2
> > > >>>> -----------------------
> > > >>>> data | 0.685s | 0.714s
> > > >>>> hole | 0.485s | 15.547s
> > > >>>> mixed | 1.283s | 0.448
> > > >>>>
> > > >>>> >From what I can tell, the 100% hole case takes so long because
> of the
> > > >>>>> SEEK_DATA call in nfsd4_encode_read_plus_hole(). I took this
> out to
> > > >>>>> trick the function into thinking that the entire file was
> already a
> > > >>>>> hole, and runtime dropped to the levels of v4.1 and v4.2.
> > > >>>
> > > >>> Wait, that 15s is due to just one SEEK_DATA?
> > > >>
> > > >> The server is returning a larger hole than the client can read
> > at once, so there are several SEEK_DATA calls made to verify that
> > there are no data segments before the end of the file.
> > > >>
> > > >>>
> > > >>>> I wonder
> > > >>>>> if this is filesystem dependent? My server is exporting ext4.
> > > >>>
> > > >>> Sounds like just a bug. I've been doing lots of
> lseek(.,.,SEEK_DATA) on
> > > >>> both ext4 and xfs without seeing anything that weird.
> > > >>
> > > >> It looks like something weird on ext4. I switched my exported
> > filesystem to xfs:
> > > >
> > > > Huh. Maybe we should report a bug....
> > > >
> > > >>
> > > >> | v4.1 | v4.2
> > > >> ------+--------+-------
> > > >> data | 0.764s | 1.343s
> > > >
> > > > That's too bad. Non-sparse files are surely still a common case and
> > > > we'd like to not see a slowdown there.... I wonder if we can figure
> out
> > > > where it's coming from?
> > >
> > > That's a good question, especially since the 1G file didn't double
> > this time. Maybe a VM quirk?
> >
> > We definitely need to figure it out, I think. If we can't make
> > READ_PLUS perform as well as READ (or very close to it) in the
> > non-sparse case then I don't think we'll want it, and as Trond suggested
> > we may want to consider something more fiemap-like instead.
>
> Testing Anna's NFS client with the Ganesha NFS server and GPFS file system
> shows the same numbers for READ with v4.1 and READ_PLUS with v4.2 of a
> data file. Using sparse files READ_PLUS is 5 times faster than READ.

Thanks! Is it possible to report the exact numbers?

Is Ganesha also implementing READ_PLUS with SEEK_HOLE/SEEK_DATA? If so
then maybe the difference is the filesystem. Might be interesting to
run the same sort of test with ganesha exporting xfs and/or knfsd
exporting GPFS.

--b.

2015-03-19 16:29:00

by Marc Eshel

[permalink] [raw]
Subject: Re: [PATCH v3 3/3] NFSD: Add support for encoding multiple segments

[email protected] wrote on 03/19/2015 08:36:27 AM:

> From: "J. Bruce Fields" <[email protected]>
> To: Marc Eshel/Almaden/IBM@IBMUS
> Cc: Anna Schumaker <[email protected]>, linux-
> [email protected], [email protected]
> Date: 03/19/2015 08:36 AM
> Subject: Re: [PATCH v3 3/3] NFSD: Add support for encoding multiple
segments
> Sent by: [email protected]
>
> On Thu, Mar 19, 2015 at 08:00:05AM -0700, Marc Eshel wrote:
> > [email protected] wrote on 03/18/2015 02:11:44 PM:
> >
> > > From: "J. Bruce Fields" <[email protected]>
> > > To: Anna Schumaker <[email protected]>
> > > Cc: [email protected]
> > > Date: 03/18/2015 02:14 PM
> > > Subject: Re: [PATCH v3 3/3] NFSD: Add support for encoding multiple
> > segments
> > > Sent by: [email protected]
> > >
> > > On Wed, Mar 18, 2015 at 05:03:32PM -0400, Anna Schumaker wrote:
> > > > On 03/18/2015 04:55 PM, J. Bruce Fields wrote:
> > > > > On Wed, Mar 18, 2015 at 04:39:24PM -0400, Anna Schumaker wrote:
> > > > >> On 03/18/2015 02:55 PM, J. Bruce Fields wrote:
> > > > >>> On Wed, Mar 18, 2015 at 02:16:29PM -0400, Anna Schumaker
wrote:
> > > > >>>> On 03/17/2015 05:36 PM, J. Bruce Fields wrote:
> > > > >>>>> On Tue, Mar 17, 2015 at 04:07:38PM -0400, J. Bruce Fields
wrote:
> > > > >>>>>> On Tue, Mar 17, 2015 at 03:56:33PM -0400, J. Bruce Fields
> > wrote:
> > > > >>>>>>> On Mon, Mar 16, 2015 at 05:18:08PM -0400, Anna Schumaker
> > wrote:
> > > > >>>>>>>> This patch implements sending an array of segments back
> > > to the client.
> > > > >>>>>>>> Clients should be prepared to handle multiple segment
> > > reads to make this
> > > > >>>>>>>> useful. We try to splice the first data segment into the
> > > XDR result,
> > > > >>>>>>>> and remaining segments are encoded directly.
> > > > >>>>>>>
> > > > >>>>>>> I'm still interested in what would happen if we started
with
> > an
> > > > >>>>>>> implementation like:
> > > > >>>>>>>
> > > > >>>>>>> - if the entire requested range falls within a hole,
return
> > that
> > > > >>>>>>> single hole.
> > > > >>>>>>> - otherwise, just treat the thing as one big data
segment.
> > > > >>>>>>>
> > > > >>>>>>> That would provide a benefit in the case there are
large-ish
> > holes
> > > > >>>>>>> with minimal impact otherwise.
> > > > >>>>>>>
> > > > >>>>>>> (Though patches for full support are still useful even if
only
> > for
> > > > >>>>>>> client-testing purposes.)
> > > > >>>>>>
> > > > >>>>>> Also, looks like
> > > > >>>>>>
> > > > >>>>>> xvs_io -c "fiemap -v" <file>
> > > > >>>>>>
> > > > >>>>>> will give hole sizes for a given <file>. (Thanks,
> > > esandeen.) Running
> > > > >>>>>> that on a few of my test vm images shows a fair number of
large
> > > > >>>>>> (hundreds of megs) files, which suggests identifying only
> > > >=rwsize holes
> > > > >>>>>> might still be useful.
> > > > >>>>>
> > > > >>>>> Just for fun.... I wrote the following test program and ran
it
> > on my
> > > > >>>>> collection of testing vm's. Some looked like this:
> > > > >>>>>
> > > > >>>>> f21-1.qcow2
> > > > >>>>> 144784 -rw-------. 1 qemu qemu 8591507456 Mar 16 10:13
> > f21-1.qcow2
> > > > >>>>> total hole bytes: 8443252736 (98%)
> > > > >>>>> in aligned 1MB chunks: 8428453888 (98%)
> > > > >>>>>
> > > > >>>>> So, basically, read_plus would save transferring most of
thedata
> > even
> > > > >>>>> when only handling 1MB holes.
> > > > >>>>>
> > > > >>>>> But some looked like this:
> > > > >>>>>
> > > > >>>>> 501524 -rw-------. 1 qemu qemu 8589934592 May 20 2014
> > > rhel6-1-1.img
> > > > >>>>> total hole bytes: 8077516800 (94%)
> > > > >>>>> in aligned 1MB chunks: 0 (0%)
> > > > >>>>>
> > > > >>>>> So the READ_PLUS that caught every hole might save a lot,
the
> > one that
> > > > >>>>> only caught 1MB holes wouldn't help at all.
> > > > >>>>>
> > > > >>>>> And there were lots of examples in between those two
extremes.
> > > > >>>>
> > > > >>>> I tested with three different 512 MB files: 100% data, 100%
> > > hole, and alternating every megabyte. The results were surprising:
> > > > >>>>
> > > > >>>> | v4.1 | v4.2
> > > > >>>> -----------------------
> > > > >>>> data | 0.685s | 0.714s
> > > > >>>> hole | 0.485s | 15.547s
> > > > >>>> mixed | 1.283s | 0.448
> > > > >>>>
> > > > >>>> >From what I can tell, the 100% hole case takes so long
because
> > of the
> > > > >>>>> SEEK_DATA call in nfsd4_encode_read_plus_hole(). I took
this
> > out to
> > > > >>>>> trick the function into thinking that the entire file was
> > already a
> > > > >>>>> hole, and runtime dropped to the levels of v4.1 and v4.2.
> > > > >>>
> > > > >>> Wait, that 15s is due to just one SEEK_DATA?
> > > > >>
> > > > >> The server is returning a larger hole than the client can read
> > > at once, so there are several SEEK_DATA calls made to verify that
> > > there are no data segments before the end of the file.
> > > > >>
> > > > >>>
> > > > >>>> I wonder
> > > > >>>>> if this is filesystem dependent? My server is exporting
ext4.
> > > > >>>
> > > > >>> Sounds like just a bug. I've been doing lots of
> > lseek(.,.,SEEK_DATA) on
> > > > >>> both ext4 and xfs without seeing anything that weird.
> > > > >>
> > > > >> It looks like something weird on ext4. I switched my exported
> > > filesystem to xfs:
> > > > >
> > > > > Huh. Maybe we should report a bug....
> > > > >
> > > > >>
> > > > >> | v4.1 | v4.2
> > > > >> ------+--------+-------
> > > > >> data | 0.764s | 1.343s
> > > > >
> > > > > That's too bad. Non-sparse files are surely still a common case
and
> > > > > we'd like to not see a slowdown there.... I wonder if we can
figure
> > out
> > > > > where it's coming from?
> > > >
> > > > That's a good question, especially since the 1G file didn't double
> > > this time. Maybe a VM quirk?
> > >
> > > We definitely need to figure it out, I think. If we can't make
> > > READ_PLUS perform as well as READ (or very close to it) in the
> > > non-sparse case then I don't think we'll want it, and as Trond
suggested
> > > we may want to consider something more fiemap-like instead.
> >
> > Testing Anna's NFS client with the Ganesha NFS server and GPFS file
system
> > shows the same numbers for READ with v4.1 and READ_PLUS with v4.2 of a

> > data file. Using sparse files READ_PLUS is 5 times faster than READ.
>
> Thanks! Is it possible to report the exact numbers?

This is a copy of a 100M file.

[root@fin16 ~]# umount /mnt
[root@fin16 ~]# mount -t nfs4 -o minorversion=1 9.1.74.120:/gpfsA /mnt
[root@fin16 ~]# time cp /mnt/100M /dev/null

real 0m1.597s
user 0m0.000s
sys 0m0.062s
[root@fin16 ~]# umount /mnt
[root@fin16 ~]# mount -t nfs4 -o minorversion=2 9.1.74.120:/gpfsA /mnt
[root@fin16 ~]# time cp /mnt/100M /dev/null

real 0m1.595s
user 0m0.002s
sys 0m0.057s

>
> Is Ganesha also implementing READ_PLUS with SEEK_HOLE/SEEK_DATA? If so
> then maybe the difference is the filesystem. Might be interesting to
> run the same sort of test with ganesha exporting xfs and/or knfsd
> exporting GPFS.

GPFS did not implement it using SEEK; it just calls the fs read, and if
there is no data the fs returns an ENODATA return code. It is not yet
implemented on other FSALs.

>
> --b.


2015-03-20 15:17:18

by J. Bruce Fields

[permalink] [raw]
Subject: Re: [PATCH v3 3/3] NFSD: Add support for encoding multiple segments

Maybe this is a question for xfs developers.

So, we have a new READ_PLUS call that's basically just a version of READ
optimized for sparse files:

http://tools.ietf.org/html/draft-ietf-nfsv4-minorversion2-33#section-15.10

It allows an NFS server to return either file data (like a normal READ
call) or, at the server's discretion, records saying "this range of the
data is all zeroes".

Anna tried implementing READ_PLUS for knfsd using
vfs_llseek(.,.,SEEK_HOLE) followed by an ordinary read if that
determines we're not at a hole.
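
A minimal user-space sketch of that idea looks something like the
following (illustrative only; the emit_* helpers are placeholders for
the real XDR encoding and are not part of the patches):

    /*
     * Split a range into HOLE and DATA segments with SEEK_DATA/SEEK_HOLE.
     * User-space illustration only, not the nfsd code.
     */
    #define _GNU_SOURCE
    #include <stdio.h>
    #include <unistd.h>
    #include <fcntl.h>
    #include <errno.h>

    static void emit_hole(off_t off, off_t len)
    {
        printf("HOLE  offset=%lld length=%lld\n", (long long)off, (long long)len);
    }

    static void emit_data(off_t off, off_t len)
    {
        /* A real server would read/splice the bytes into the reply here. */
        printf("DATA  offset=%lld length=%lld\n", (long long)off, (long long)len);
    }

    /* Encode the range [offset, offset + count) as HOLE/DATA segments. */
    static int encode_read_plus(int fd, off_t offset, off_t count)
    {
        off_t end = offset + count;

        while (offset < end) {
            off_t data = lseek(fd, offset, SEEK_DATA);

            if (data < 0 && errno == ENXIO) {
                emit_hole(offset, end - offset);    /* hole through EOF */
                break;
            }
            if (data < 0)
                return -1;
            if (data > offset) {
                /* The range starts in a hole. */
                emit_hole(offset, (data < end ? data : end) - offset);
                offset = data;
                continue;
            }
            /* In data; it extends to the next hole (there is always one at EOF). */
            off_t hole = lseek(fd, offset, SEEK_HOLE);
            if (hole < 0)
                return -1;
            emit_data(offset, (hole < end ? hole : end) - offset);
            offset = hole;
        }
        return 0;
    }

    int main(int argc, char **argv)
    {
        if (argc != 2) {
            fprintf(stderr, "usage: %s <file>\n", argv[0]);
            return 1;
        }
        int fd = open(argv[1], O_RDONLY);
        if (fd < 0) {
            perror("open");
            return 1;
        }
        off_t size = lseek(fd, 0, SEEK_END);
        int ret = encode_read_plus(fd, 0, size);
        close(fd);
        return ret ? 1 : 0;
    }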

(Very) preliminary results suggest that's slower than a plain READ for
an xfs file with no holes. (And *much* slower in the ext4 case for some
reason.)

Is that expected, and should we be doing this some other way instead?

--b.

On Thu, Mar 19, 2015 at 09:28:09AM -0700, Marc Eshel wrote:
> [email protected] wrote on 03/19/2015 08:36:27 AM:
>
> > From: "J. Bruce Fields" <[email protected]>
> > To: Marc Eshel/Almaden/IBM@IBMUS
> > Cc: Anna Schumaker <[email protected]>, linux-
> > [email protected], [email protected]
> > Date: 03/19/2015 08:36 AM
> > Subject: Re: [PATCH v3 3/3] NFSD: Add support for encoding multiple
> segments
> > Sent by: [email protected]
> >
> > On Thu, Mar 19, 2015 at 08:00:05AM -0700, Marc Eshel wrote:
> > > [email protected] wrote on 03/18/2015 02:11:44 PM:
> > > > From: "J. Bruce Fields" <[email protected]>
> > > > On Wed, Mar 18, 2015 at 05:03:32PM -0400, Anna Schumaker wrote:
> > > > > On 03/18/2015 04:55 PM, J. Bruce Fields wrote:
> > > > > > On Wed, Mar 18, 2015 at 04:39:24PM -0400, Anna Schumaker wrote:
> > > > > >> | v4.1 | v4.2
> > > > > >> ------+--------+-------
> > > > > >> data | 0.764s | 1.343s
> > > > > >
> > > > > > That's too bad. Non-sparse files are surely still a common case
> and
> > > > > > we'd like to not see a slowdown there.... I wonder if we can
> figure
> > > out
> > > > > > where it's coming from?
> > > > >
> > > > > That's a good question, especially since the 1G file didn't double
> > > > this time. Maybe a VM quirk?
> > > >
> > > > We definitely need to figure it out, I think. If we can't make
> > > > READ_PLUS perform as well as READ (or very close to it) in the
> > > > non-sparse case then I don't think we'll want it, and as Trond
> suggested
> > > > we may want to consider something more fiemap-like instead.
> > >
> > > Testing Anna's NFS client with the Ganesha NFS server and GPFS file
> system
> > > shows the same numbers for READ with v4.1 and READ_PLUS with v4.2 of a
>
> > > data file. Using sparse files READ_PLUS is 5 times faster than READ.
> >
> > Thanks! Is it possible to report the exact numbers?
>
> This is a copy of a 100M file.
>
> [root@fin16 ~]# umount /mnt
> [root@fin16 ~]# mount -t nfs4 -o minorversion=1 9.1.74.120:/gpfsA /mnt
> [root@fin16 ~]# time cp /mnt/100M /dev/null
>
> real 0m1.597s
> user 0m0.000s
> sys 0m0.062s
> [root@fin16 ~]# umount /mnt
> [root@fin16 ~]# mount -t nfs4 -o minorversion=2 9.1.74.120:/gpfsA /mnt
> [root@fin16 ~]# time cp /mnt/100M /dev/null
>
> real 0m1.595s
> user 0m0.002s
> sys 0m0.057s
>
> >
> > Is Ganesha also implementing READ_PLUS with SEEK_HOLE/SEEK_DATA? If so
> > then maybe the difference is the filesystem. Might be interesting to
> > run the same sort of test with ganesha exporting xfs and/or knfsd
> > exporting GPFS.
>
> GPFS did not implement it using SEEK it just calls the fs read and if
> there is no data the fs returns ENODATA return code. It is not yet
> implemented on other FSLAs
>
> >
> > --b.

On Thu, Mar 19, 2015 at 09:28:09AM -0700, Marc Eshel wrote:
> [email protected] wrote on 03/19/2015 08:36:27 AM:
>
> > From: "J. Bruce Fields" <[email protected]>
> > To: Marc Eshel/Almaden/IBM@IBMUS
> > Cc: Anna Schumaker <[email protected]>, linux-
> > [email protected], [email protected]
> > Date: 03/19/2015 08:36 AM
> > Subject: Re: [PATCH v3 3/3] NFSD: Add support for encoding multiple
> segments
> > Sent by: [email protected]
> >
> > On Thu, Mar 19, 2015 at 08:00:05AM -0700, Marc Eshel wrote:
> > > [email protected] wrote on 03/18/2015 02:11:44 PM:
> > >
> > > > From: "J. Bruce Fields" <[email protected]>
> > > > To: Anna Schumaker <[email protected]>
> > > > Cc: [email protected]
> > > > Date: 03/18/2015 02:14 PM
> > > > Subject: Re: [PATCH v3 3/3] NFSD: Add support for encoding multiple
> > > segments
> > > > Sent by: [email protected]
> > > >
> > > > On Wed, Mar 18, 2015 at 05:03:32PM -0400, Anna Schumaker wrote:
> > > > > On 03/18/2015 04:55 PM, J. Bruce Fields wrote:
> > > > > > On Wed, Mar 18, 2015 at 04:39:24PM -0400, Anna Schumaker wrote:
> > > > > >> On 03/18/2015 02:55 PM, J. Bruce Fields wrote:
> > > > > >>> On Wed, Mar 18, 2015 at 02:16:29PM -0400, Anna Schumaker
> wrote:
> > > > > >>>> On 03/17/2015 05:36 PM, J. Bruce Fields wrote:
> > > > > >>>>> On Tue, Mar 17, 2015 at 04:07:38PM -0400, J. Bruce Fields
> wrote:
> > > > > >>>>>> On Tue, Mar 17, 2015 at 03:56:33PM -0400, J. Bruce Fields
> > > wrote:
> > > > > >>>>>>> On Mon, Mar 16, 2015 at 05:18:08PM -0400, Anna Schumaker
> > > wrote:
> > > > > >>>>>>>> This patch implements sending an array of segments back
> > > > to the client.
> > > > > >>>>>>>> Clients should be prepared to handle multiple segment
> > > > reads to make this
> > > > > >>>>>>>> useful. We try to splice the first data segment into the
> > > > XDR result,
> > > > > >>>>>>>> and remaining segments are encoded directly.
> > > > > >>>>>>>
> > > > > >>>>>>> I'm still interested in what would happen if we started
> with
> > > an
> > > > > >>>>>>> implementation like:
> > > > > >>>>>>>
> > > > > >>>>>>> - if the entire requested range falls within a hole,
> return
> > > that
> > > > > >>>>>>> single hole.
> > > > > >>>>>>> - otherwise, just treat the thing as one big data
> segment.
> > > > > >>>>>>>
> > > > > >>>>>>> That would provide a benefit in the case there are
> large-ish
> > > holes
> > > > > >>>>>>> with minimal impact otherwise.
> > > > > >>>>>>>
> > > > > >>>>>>> (Though patches for full support are still useful even if
> only
> > > for
> > > > > >>>>>>> client-testing purposes.)
> > > > > >>>>>>
> > > > > >>>>>> Also, looks like
> > > > > >>>>>>
> > > > > >>>>>> xvs_io -c "fiemap -v" <file>
> > > > > >>>>>>
> > > > > >>>>>> will give hole sizes for a given <file>. (Thanks,
> > > > esandeen.) Running
> > > > > >>>>>> that on a few of my test vm images shows a fair number of
> large
> > > > > >>>>>> (hundreds of megs) files, which suggests identifying only
> > > > >=rwsize holes
> > > > > >>>>>> might still be useful.
> > > > > >>>>>
> > > > > >>>>> Just for fun.... I wrote the following test program and ran
> it
> > > on my
> > > > > >>>>> collection of testing vm's. Some looked like this:
> > > > > >>>>>
> > > > > >>>>> f21-1.qcow2
> > > > > >>>>> 144784 -rw-------. 1 qemu qemu 8591507456 Mar 16 10:13
> > > f21-1.qcow2
> > > > > >>>>> total hole bytes: 8443252736 (98%)
> > > > > >>>>> in aligned 1MB chunks: 8428453888 (98%)
> > > > > >>>>>
> > > > > >>>>> So, basically, read_plus would save transferring most of
> thedata
> > > even
> > > > > >>>>> when only handling 1MB holes.
> > > > > >>>>>
> > > > > >>>>> But some looked like this:
> > > > > >>>>>
> > > > > >>>>> 501524 -rw-------. 1 qemu qemu 8589934592 May 20 2014
> > > > rhel6-1-1.img
> > > > > >>>>> total hole bytes: 8077516800 (94%)
> > > > > >>>>> in aligned 1MB chunks: 0 (0%)
> > > > > >>>>>
> > > > > >>>>> So the READ_PLUS that caught every hole might save a lot,
> the
> > > one that
> > > > > >>>>> only caught 1MB holes wouldn't help at all.
> > > > > >>>>>
> > > > > >>>>> And there were lots of examples in between those two
> extremes.
> > > > > >>>>
> > > > > >>>> I tested with three different 512 MB files: 100% data, 100%
> > > > hole, and alternating every megabyte. The results were surprising:
> > > > > >>>>
> > > > > >>>> | v4.1 | v4.2
> > > > > >>>> -----------------------
> > > > > >>>> data | 0.685s | 0.714s
> > > > > >>>> hole | 0.485s | 15.547s
> > > > > >>>> mixed | 1.283s | 0.448
> > > > > >>>>
> > > > > >>>> >From what I can tell, the 100% hole case takes so long
> because
> > > of the
> > > > > >>>>> SEEK_DATA call in nfsd4_encode_read_plus_hole(). I took
> this
> > > out to
> > > > > >>>>> trick the function into thinking that the entire file was
> > > already a
> > > > > >>>>> hole, and runtime dropped to the levels of v4.1 and v4.2.
> > > > > >>>
> > > > > >>> Wait, that 15s is due to just one SEEK_DATA?
> > > > > >>
> > > > > >> The server is returning a larger hole than the client can read
> > > > at once, so there are several SEEK_DATA calls made to verify that
> > > > there are no data segments before the end of the file.
> > > > > >>
> > > > > >>>
> > > > > >>>> I wonder
> > > > > >>>>> if this is filesystem dependent? My server is exporting
> ext4.
> > > > > >>>
> > > > > >>> Sounds like just a bug. I've been doing lots of
> > > lseek(.,.,SEEK_DATA) on
> > > > > >>> both ext4 and xfs without seeing anything that weird.
> > > > > >>
> > > > > >> It looks like something weird on ext4. I switched my exported
> > > > filesystem to xfs:
> > > > > >
> > > > > > Huh. Maybe we should report a bug....
> > > > > >
> > > > > >>
> > > > > >> | v4.1 | v4.2
> > > > > >> ------+--------+-------
> > > > > >> data | 0.764s | 1.343s
> > > > > >
> > > > > > That's too bad. Non-sparse files are surely still a common case
> and
> > > > > > we'd like to not see a slowdown there.... I wonder if we can
> figure
> > > out
> > > > > > where it's coming from?
> > > > >
> > > > > That's a good question, especially since the 1G file didn't double
> > > > this time. Maybe a VM quirk?
> > > >
> > > > We definitely need to figure it out, I think. If we can't make
> > > > READ_PLUS perform as well as READ (or very close to it) in the
> > > > non-sparse case then I don't think we'll want it, and as Trond
> suggested
> > > > we may want to consider something more fiemap-like instead.
> > >
> > > Testing Anna's NFS client with the Ganesha NFS server and GPFS file
> system
> > > shows the same numbers for READ with v4.1 and READ_PLUS with v4.2 of a
>
> > > data file. Using sparse files READ_PLUS is 5 times faster than READ.
> >
> > Thanks! Is it possible to report the exact numbers?
>
> This is a copy of a 100M file.
>
> [root@fin16 ~]# umount /mnt
> [root@fin16 ~]# mount -t nfs4 -o minorversion=1 9.1.74.120:/gpfsA /mnt
> [root@fin16 ~]# time cp /mnt/100M /dev/null
>
> real 0m1.597s
> user 0m0.000s
> sys 0m0.062s
> [root@fin16 ~]# umount /mnt
> [root@fin16 ~]# mount -t nfs4 -o minorversion=2 9.1.74.120:/gpfsA /mnt
> [root@fin16 ~]# time cp /mnt/100M /dev/null
>
> real 0m1.595s
> user 0m0.002s
> sys 0m0.057s
>
> >
> > Is Ganesha also implementing READ_PLUS with SEEK_HOLE/SEEK_DATA? If so
> > then maybe the difference is the filesystem. Might be interesting to
> > run the same sort of test with ganesha exporting xfs and/or knfsd
> > exporting GPFS.
>
> GPFS did not implement it using SEEK it just calls the fs read and if
> there is no data the fs returns ENODATA return code. It is not yet
> implemented on other FSLAs
>
> >
> > --b.

2015-03-20 16:23:10

by Christoph Hellwig

[permalink] [raw]
Subject: Re: [PATCH v3 3/3] NFSD: Add support for encoding multiple segments

On Fri, Mar 20, 2015 at 11:17:18AM -0400, J. Bruce Fields wrote:
> Maybe this is a question for xfs developers.
>
> So, we have a new READ_PLUS call that's basically just a version of READ
> optimized for sparse files:
>
> http://tools.ietf.org/html/draft-ietf-nfsv4-minorversion2-33#section-15.10
>
> It allows an NFS server to return either file data (like a normal READ
> call) or, at the server's discretion, records saying "this range of the
> data is all zeroes".
>
> Anna tried implementing READ_PLUS for knfsd using
> vfs_llseek(.,.,SEEK_HOLE) followed by an ordinary read if that
> determines we're not at a hole.
>
> (Very) preliminary results suggest that's slower than a plain READ for
> an xfs file with no holes. (And *much* slower in the ext4 case for some
> reason.)

It should be a fairly cheap operation, and it does extent tree operations
that are pretty similar to an (uncached) read. Do you have profiles?

> Is that expected, and should we be doing this some other way instead?

Are the reads cached or uncached? If they are from pagecache, just
copying the zeroes is pretty much unbeatable compared to extent
tree lookups, so we'd need a new page flag (difficult..) to see
that a page is a hole (and then it would only work for the whole page).
But for uncached reads an optimization would be to tell a read that it's
an NFS READ_PLUS so that it could just read until it reaches a hole,
and then we'd need some way to communicate the hole size (or just fall
back to SEEK_HOLE for that case).

2015-03-20 18:26:22

by J. Bruce Fields

[permalink] [raw]
Subject: Re: [PATCH v3 3/3] NFSD: Add support for encoding multiple segments

On Fri, Mar 20, 2015 at 09:23:03AM -0700, Christoph Hellwig wrote:
> On Fri, Mar 20, 2015 at 11:17:18AM -0400, J. Bruce Fields wrote:
> > Maybe this is a question for xfs developers.
> >
> > So, we have a new READ_PLUS call that's basically just a version of READ
> > optimized for sparse files:
> >
> > http://tools.ietf.org/html/draft-ietf-nfsv4-minorversion2-33#section-15.10
> >
> > It allows an NFS server to return either file data (like a normal READ
> > call) or, at the server's discretion, records saying "this range of the
> > data is all zeroes".
> >
> > Anna tried implementing READ_PLUS for knfsd using
> > vfs_llseek(.,.,SEEK_HOLE) followed by an ordinary read if that
> > determines we're not at a hole.
> >
> > (Very) preliminary results suggest that's slower than a plain READ for
> > an xfs file with no holes. (And *much* slower in the ext4 case for some
> > reason.)
>
> It should be a fairly cheap operastion, and does extent tree operations
> that are pretty similar to an (uncached) read. Do you have profiles?
>
> > Is that expected, and should we be doing this some other way instead?
>
> Are the read cached or uncached?

I don't know, and don't have profiles. I'll either try to reproduce or
wait till Anna's back from vacation.

> If they are from pagecache just copying the zeroes is pretty much
> unbeatable compared to extent tree lookups, so we'd need a new page
> flag (difficult..) to see that a page is a hole (and then it would
> only work for the whole page), but for uncached reads an optimization
> would be to tell a read that it's an NFS READ_PLUS so that it could
> just read until it reach a hole, and then we'd need some way to
> communicate the hole size (or just fall back to SEEK_HOLE for that
> case).

Ugh, OK. We'll do some more tests before coming back to ask about
that....

--b.

2015-03-24 12:43:53

by Anna Schumaker

[permalink] [raw]
Subject: Re: [PATCH v3 3/3] NFSD: Add support for encoding multiple segments

On Fri, Mar 20, 2015 at 2:26 PM, J. Bruce Fields <[email protected]> wrote:
> On Fri, Mar 20, 2015 at 09:23:03AM -0700, Christoph Hellwig wrote:
>> On Fri, Mar 20, 2015 at 11:17:18AM -0400, J. Bruce Fields wrote:
>> > Maybe this is a question for xfs developers.
>> >
>> > So, we have a new READ_PLUS call that's basically just a version of READ
>> > optimized for sparse files:
>> >
>> > http://tools.ietf.org/html/draft-ietf-nfsv4-minorversion2-33#section-15.10
>> >
>> > It allows an NFS server to return either file data (like a normal READ
>> > call) or, at the server's discretion, records saying "this range of the
>> > data is all zeroes".
>> >
>> > Anna tried implementing READ_PLUS for knfsd using
>> > vfs_llseek(.,.,SEEK_HOLE) followed by an ordinary read if that
>> > determines we're not at a hole.
>> >
>> > (Very) preliminary results suggest that's slower than a plain READ for
>> > an xfs file with no holes. (And *much* slower in the ext4 case for some
>> > reason.)
>>
>> It should be a fairly cheap operastion, and does extent tree operations
>> that are pretty similar to an (uncached) read. Do you have profiles?
>>
>> > Is that expected, and should we be doing this some other way instead?
>>
>> Are the read cached or uncached?
>
> I don't know, and don't have profiles. I'll either try to reproduce or
> wait till Anna's back from vacation.

I'm using whatever functions NFSD already uses for reading files,
which I expect go through the VFS. Is there a flag that controls
cache behavior?

>
>> If they are from pagecache just copying the zeroes is pretty much
>> unbeatable compared to extent tree lookups, so we'd need a new page
>> flag (difficult..) to see that a page is a hole (and then it would
>> only work for the whole page), but for uncached reads an optimization
>> would be to tell a read that it's an NFS READ_PLUS so that it could
>> just read until it reach a hole, and then we'd need some way to
>> communicate the hole size (or just fall back to SEEK_HOLE for that
>> case).
>
> Ugh, OK. We'll do some more tests before coming back to ask about
> that....

I only had time for the one run, so I'll do more trials and see if
that one read is always so long. I'm still hoping it was something in
the way my VM was scheduling its tasks!

Anna

>
> --b.

2015-03-24 17:49:23

by Christoph Hellwig

[permalink] [raw]
Subject: Re: [PATCH v3 3/3] NFSD: Add support for encoding multiple segments

On Tue, Mar 24, 2015 at 08:43:31AM -0400, Anna Schumaker wrote:
> > I don't know, and don't have profiles. I'll either try to reproduce or
> > wait till Anna's back from vacation.
>
> I'm using whatever functions NFSD already uses for reading files,
> which I expect go through the VFS. Is there a flag that controls
> cache behavior?

There's the O_DIRECT flag, but that's not what I mean. If you just
wrote to the file it's a cached read; if you unmounted the filesystem after
writing, or did an echo to /proc/sys/vm/drop_caches, you get uncached
read behavior.
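
One (hypothetical) way to see that difference locally is to drop the
page cache and then time a plain sequential read both ways, e.g.:

    /*
     * Time an uncached vs. cached read of a file.  Writing "3" to
     * /proc/sys/vm/drop_caches (as root) drops the page cache first.
     * Sketch only; not part of the patches.
     */
    #define _POSIX_C_SOURCE 200112L
    #include <stdio.h>
    #include <stdlib.h>
    #include <unistd.h>
    #include <fcntl.h>
    #include <time.h>

    static void drop_caches(void)
    {
        int fd = open("/proc/sys/vm/drop_caches", O_WRONLY);

        if (fd < 0) {
            perror("open drop_caches (need root)");
            return;
        }
        if (write(fd, "3\n", 2) != 2)
            perror("write drop_caches");
        close(fd);
    }

    static double time_read(const char *path)
    {
        static char buf[1 << 16];   /* 64KB read buffer */
        struct timespec t0, t1;
        ssize_t n;
        int fd = open(path, O_RDONLY);

        if (fd < 0) {
            perror("open");
            exit(1);
        }
        clock_gettime(CLOCK_MONOTONIC, &t0);
        while ((n = read(fd, buf, sizeof(buf))) > 0)
            ;   /* data is discarded; only the elapsed time matters */
        clock_gettime(CLOCK_MONOTONIC, &t1);
        close(fd);
        return (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1e9;
    }

    int main(int argc, char **argv)
    {
        if (argc != 2) {
            fprintf(stderr, "usage: %s <file>\n", argv[0]);
            return 1;
        }
        drop_caches();
        printf("uncached read: %.3fs\n", time_read(argv[1]));
        printf("cached read:   %.3fs\n", time_read(argv[1]));
        return 0;
    }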

2015-03-25 17:15:09

by Anna Schumaker

[permalink] [raw]
Subject: Re: [PATCH v3 3/3] NFSD: Add support for encoding multiple segments

On 03/24/2015 01:49 PM, Christoph Hellwig wrote:
> On Tue, Mar 24, 2015 at 08:43:31AM -0400, Anna Schumaker wrote:
>>> I don't know, and don't have profiles. I'll either try to reproduce or
>>> wait till Anna's back from vacation.
>>
>> I'm using whatever functions NFSD already uses for reading files,
>> which I expect go through the VFS. Is there a flag that controls
>> cache behavior?
>
> There's the O_DIRECT flag, but that's not what I mean. If you just
> wrote to it it's a cached read, if you did unmount the filesystem after
> writing, or did an echo to /proc/sys/vm/drop_caches you get uncached
> read behavior.

Oh, I'm doing uncached reads for my tests. I'm collecting updated numbers now!

Anna
>


2015-03-26 15:21:44

by Anna Schumaker

[permalink] [raw]
Subject: Re: [PATCH v3 3/3] NFSD: Add support for encoding multiple segments

Here are my updated numbers! I tested with files 5G in size: one 100% data, one 100% hole, and one alternating between hole and data every 4K. I collected data for both v4.1 and v4.2 with and without the READ_PLUS patches:

##########################
# #
# Without READ_PLUS #
# #
##########################


NFS v4.1:
Trial
|---------|---------|---------|---------|---------|---------|---------|
| | 1 | 2 | 3 | 4 | 5 | Average |
|---------|---------|---------|---------|---------|---------|---------|
| Data | 8.723s | 7.243s | 8.252s | 6.997s | 6.980s | 7.639s |
| Hole | 5.271s | 5.224s | 5.060s | 4.897s | 5.321s | 5.155s |
| Mixed | 8.050s | 10.057s | 7.919s | 8.060s | 9.557s | 8.729s |
|---------|---------|---------|---------|---------|---------|---------|




NFS v4.2:
Trial
|---------|---------|---------|---------|---------|---------|---------|
| | 1 | 2 | 3 | 4 | 5 | Average |
|---------|---------|---------|---------|---------|---------|---------|
| Data | 6.707s | 7.070s | 6.722s | 6.761s | 6.810s | 6.814s |
| Hole | 5.152s | 5.149s | 5.213s | 5.206s | 5.312s | 5.206s |
| Mixed | 7.979s | 7.985s | 8.177s | 7.772s | 8.280s | 8.039s |
|---------|---------|---------|---------|---------|---------|---------|





#######################
# #
# With READ_PLUS #
# #
#######################


NFS v4.1:
Trial
|---------|---------|---------|---------|---------|---------|---------|
| | 1 | 2 | 3 | 4 | 5 | Average |
|---------|---------|---------|---------|---------|---------|---------|
| Data | 9.082s | 7.008s | 7.116s | 6.771s | 7.902s | 7.576s |
| Hole | 5.333s | 5.358s | 5.380s | 5.161s | 5.282s | 5.303s |
| Mixed | 8.189s | 8.308s | 9.540s | 7.937s | 8.420s | 8.479s |
|---------|---------|---------|---------|---------|---------|---------|




NFS v4.2:
Trial
|---------|---------|---------|---------|---------|---------|---------|
| | 1 | 2 | 3 | 4 | 5 | Average |
|---------|---------|---------|---------|---------|---------|---------|
| Data | 7.033s | 6.829s | 7.025s | 6.873s | 7.134s | 6.979s |
| Hole | 1.794s | 1.800s | 1.905s | 1.811s | 1.725s | 1.807s |
| Mixed | 7.590s | 8.777s | 9.423s | 10.366s | 8.024s | 8.836s |
|---------|---------|---------|---------|---------|---------|---------|
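
For reference, sparse test files with the three layouts above can be
generated with something like the sketch below (names, sizes and the
per-chunk pwrite loop are only for illustration; dd or fallocate would
work just as well):

    /*
     * Create three 5G files: 100% data, 100% hole, and 4K data
     * alternating with 4K hole.  Unwritten ranges stay holes after
     * the final ftruncate().  Example only.
     */
    #define _XOPEN_SOURCE 700
    #define _FILE_OFFSET_BITS 64
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>
    #include <fcntl.h>

    #define CHUNK 4096

    static char buf[CHUNK];

    /* write_even/write_odd select which 4K chunks get real data */
    static int make_file(const char *name, off_t size, int write_even, int write_odd)
    {
        int fd = open(name, O_WRONLY | O_CREAT | O_TRUNC, 0644);
        off_t off;

        if (fd < 0) {
            perror(name);
            return -1;
        }
        memset(buf, 'x', sizeof(buf));
        for (off = 0; off < size; off += CHUNK) {
            int even = ((off / CHUNK) % 2) == 0;

            if ((even && write_even) || (!even && write_odd)) {
                if (pwrite(fd, buf, CHUNK, off) != CHUNK) {
                    perror("pwrite");
                    close(fd);
                    return -1;
                }
            }
        }
        /* Extend to the full size; anything not written remains a hole. */
        if (ftruncate(fd, size) < 0)
            perror("ftruncate");
        close(fd);
        return 0;
    }

    int main(void)
    {
        off_t size = 5LL * 1024 * 1024 * 1024;  /* 5G, as in the runs above */

        make_file("data",  size, 1, 1);   /* 100% data */
        make_file("hole",  size, 0, 0);   /* 100% hole */
        make_file("mixed", size, 1, 0);   /* alternating 4K data / 4K hole */
        return 0;
    }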



On 03/24/2015 01:49 PM, Christoph Hellwig wrote:
> On Tue, Mar 24, 2015 at 08:43:31AM -0400, Anna Schumaker wrote:
>>> I don't know, and don't have profiles. I'll either try to reproduce or
>>> wait till Anna's back from vacation.
>>
>> I'm using whatever functions NFSD already uses for reading files,
>> which I expect go through the VFS. Is there a flag that controls
>> cache behavior?
>
> There's the O_DIRECT flag, but that's not what I mean. If you just
> wrote to it it's a cached read, if you did unmount the filesystem after
> writing, or did an echo to /proc/sys/vm/drop_caches you get uncached
> read behavior.
>


2015-03-26 15:32:26

by Trond Myklebust

[permalink] [raw]
Subject: Re: [PATCH v3 3/3] NFSD: Add support for encoding multiple segments

On Thu, Mar 26, 2015 at 11:21 AM, Anna Schumaker
<[email protected]> wrote:
> Here are my updated numbers! I tested with files 5G in size: one 100% data, one 100% hole, and one alternating between hole and data every 4K. I collected data for both v4.1 and v4.2 with and without the READ_PLUS patches:
>
> ##########################
> # #
> # Without READ_PLUS #
> # #
> ##########################
>
>
> NFS v4.1:
> Trial
> |---------|---------|---------|---------|---------|---------|---------|
> | | 1 | 2 | 3 | 4 | 5 | Average |
> |---------|---------|---------|---------|---------|---------|---------|
> | Data | 8.723s | 7.243s | 8.252s | 6.997s | 6.980s | 7.639s |
> | Hole | 5.271s | 5.224s | 5.060s | 4.897s | 5.321s | 5.155s |
> | Mixed | 8.050s | 10.057s | 7.919s | 8.060s | 9.557s | 8.729s |
> |---------|---------|---------|---------|---------|---------|---------|
>
>
>
>
> NFS v4.2:
> Trial
> |---------|---------|---------|---------|---------|---------|---------|
> | | 1 | 2 | 3 | 4 | 5 | Average |
> |---------|---------|---------|---------|---------|---------|---------|
> | Data | 6.707s | 7.070s | 6.722s | 6.761s | 6.810s | 6.814s |
> | Hole | 5.152s | 5.149s | 5.213s | 5.206s | 5.312s | 5.206s |
> | Mixed | 7.979s | 7.985s | 8.177s | 7.772s | 8.280s | 8.039s |
> |---------|---------|---------|---------|---------|---------|---------|
>
>
>
>
>
> #######################
> # #
> # With READ_PLUS #
> # #
> #######################
>
>
> NFS v4.1:
> Trial
> |---------|---------|---------|---------|---------|---------|---------|
> | | 1 | 2 | 3 | 4 | 5 | Average |
> |---------|---------|---------|---------|---------|---------|---------|
> | Data | 9.082s | 7.008s | 7.116s | 6.771s | 7.902s | 7.576s |
> | Hole | 5.333s | 5.358s | 5.380s | 5.161s | 5.282s | 5.303s |
> | Mixed | 8.189s | 8.308s | 9.540s | 7.937s | 8.420s | 8.479s |
> |---------|---------|---------|---------|---------|---------|---------|
>
>
>
>
> NFS v4.2:
> Trial
> |---------|---------|---------|---------|---------|---------|---------|
> | | 1 | 2 | 3 | 4 | 5 | Average |
> |---------|---------|---------|---------|---------|---------|---------|
> | Data | 7.033s | 6.829s | 7.025s | 6.873s | 7.134s | 6.979s |
> | Hole | 1.794s | 1.800s | 1.905s | 1.811s | 1.725s | 1.807s |
> | Mixed | 7.590s | 8.777s | 9.423s | 10.366s | 8.024s | 8.836s |
> |---------|---------|---------|---------|---------|---------|---------|
>

So there is a clear win in the 100% hole case here, but otherwise the
statistical fluctuations are dominating the numbers. Can you get us a
little more stats and then perhaps run the results through nfsometer?

>
>
> On 03/24/2015 01:49 PM, Christoph Hellwig wrote:
>> On Tue, Mar 24, 2015 at 08:43:31AM -0400, Anna Schumaker wrote:
>>>> I don't know, and don't have profiles. I'll either try to reproduce or
>>>> wait till Anna's back from vacation.
>>>
>>> I'm using whatever functions NFSD already uses for reading files,
>>> which I expect go through the VFS. Is there a flag that controls
>>> cache behavior?
>>
>> There's the O_DIRECT flag, but that's not what I mean. If you just
>> wrote to it it's a cached read, if you did unmount the filesystem after
>> writing, or did an echo to /proc/sys/vm/drop_caches you get uncached
>> read behavior.
>>
>



--
Trond Myklebust
Linux NFS client maintainer, PrimaryData
[email protected]

2015-03-26 15:36:39

by Anna Schumaker

[permalink] [raw]
Subject: Re: [PATCH v3 3/3] NFSD: Add support for encoding multiple segments

On 03/26/2015 11:32 AM, Trond Myklebust wrote:
> On Thu, Mar 26, 2015 at 11:21 AM, Anna Schumaker
> <[email protected]> wrote:
>> Here are my updated numbers! I tested with files 5G in size: one 100% data, one 100% hole, and one alternating between hole and data every 4K. I collected data for both v4.1 and v4.2 with and without the READ_PLUS patches:
>>
>> ##########################
>> # #
>> # Without READ_PLUS #
>> # #
>> ##########################
>>
>>
>> NFS v4.1:
>> Trial
>> |---------|---------|---------|---------|---------|---------|---------|
>> | | 1 | 2 | 3 | 4 | 5 | Average |
>> |---------|---------|---------|---------|---------|---------|---------|
>> | Data | 8.723s | 7.243s | 8.252s | 6.997s | 6.980s | 7.639s |
>> | Hole | 5.271s | 5.224s | 5.060s | 4.897s | 5.321s | 5.155s |
>> | Mixed | 8.050s | 10.057s | 7.919s | 8.060s | 9.557s | 8.729s |
>> |---------|---------|---------|---------|---------|---------|---------|
>>
>>
>>
>>
>> NFS v4.2:
>> Trial
>> |---------|---------|---------|---------|---------|---------|---------|
>> | | 1 | 2 | 3 | 4 | 5 | Average |
>> |---------|---------|---------|---------|---------|---------|---------|
>> | Data | 6.707s | 7.070s | 6.722s | 6.761s | 6.810s | 6.814s |
>> | Hole | 5.152s | 5.149s | 5.213s | 5.206s | 5.312s | 5.206s |
>> | Mixed | 7.979s | 7.985s | 8.177s | 7.772s | 8.280s | 8.039s |
>> |---------|---------|---------|---------|---------|---------|---------|
>>
>>
>>
>>
>>
>> #######################
>> # #
>> # With READ_PLUS #
>> # #
>> #######################
>>
>>
>> NFS v4.1:
>> Trial
>> |---------|---------|---------|---------|---------|---------|---------|
>> | | 1 | 2 | 3 | 4 | 5 | Average |
>> |---------|---------|---------|---------|---------|---------|---------|
>> | Data | 9.082s | 7.008s | 7.116s | 6.771s | 7.902s | 7.576s |
>> | Hole | 5.333s | 5.358s | 5.380s | 5.161s | 5.282s | 5.303s |
>> | Mixed | 8.189s | 8.308s | 9.540s | 7.937s | 8.420s | 8.479s |
>> |---------|---------|---------|---------|---------|---------|---------|
>>
>>
>>
>>
>> NFS v4.2:
>> Trial
>> |---------|---------|---------|---------|---------|---------|---------|
>> | | 1 | 2 | 3 | 4 | 5 | Average |
>> |---------|---------|---------|---------|---------|---------|---------|
>> | Data | 7.033s | 6.829s | 7.025s | 6.873s | 7.134s | 6.979s |
>> | Hole | 1.794s | 1.800s | 1.905s | 1.811s | 1.725s | 1.807s |
>> | Mixed | 7.590s | 8.777s | 9.423s | 10.366s | 8.024s | 8.836s |
>> |---------|---------|---------|---------|---------|---------|---------|
>>
>
> So there is a clear win in the 100% hole case here, but otherwise the
> statistical fluctuations are dominating the numbers. Can you get us a
> little more stats and then perhaps run the results through nfsometer?

Sure! Do you want any information besides runtime?

Anna

>
>>
>>
>> On 03/24/2015 01:49 PM, Christoph Hellwig wrote:
>>> On Tue, Mar 24, 2015 at 08:43:31AM -0400, Anna Schumaker wrote:
>>>>> I don't know, and don't have profiles. I'll either try to reproduce or
>>>>> wait till Anna's back from vacation.
>>>>
>>>> I'm using whatever functions NFSD already uses for reading files,
>>>> which I expect go through the VFS. Is there a flag that controls
>>>> cache behavior?
>>>
>>> There's the O_DIRECT flag, but that's not what I mean. If you just
>>> wrote to it it's a cached read, if you did unmount the filesystem after
>>> writing, or did an echo to /proc/sys/vm/drop_caches you get uncached
>>> read behavior.
>>>
>>
>
>
>


2015-03-26 15:38:51

by J. Bruce Fields

[permalink] [raw]
Subject: Re: [PATCH v3 3/3] NFSD: Add support for encoding multiple segments

On Thu, Mar 26, 2015 at 11:32:25AM -0400, Trond Myklebust wrote:
> On Thu, Mar 26, 2015 at 11:21 AM, Anna Schumaker
> <[email protected]> wrote:
> > Here are my updated numbers! I tested with files 5G in size: one 100% data, one 100% hole, and one alternating between hole and data every 4K. I collected data for both v4.1 and v4.2 with and without the READ_PLUS patches:
> >
> > ##########################
> > # #
> > # Without READ_PLUS #
> > # #
> > ##########################
> >
> >
> > NFS v4.1:
> > Trial
> > |---------|---------|---------|---------|---------|---------|---------|
> > | | 1 | 2 | 3 | 4 | 5 | Average |
> > |---------|---------|---------|---------|---------|---------|---------|
> > | Data | 8.723s | 7.243s | 8.252s | 6.997s | 6.980s | 7.639s |
> > | Hole | 5.271s | 5.224s | 5.060s | 4.897s | 5.321s | 5.155s |
> > | Mixed | 8.050s | 10.057s | 7.919s | 8.060s | 9.557s | 8.729s |
> > |---------|---------|---------|---------|---------|---------|---------|
> >
> >
> >
> >
> > NFS v4.2:
> > Trial
> > |---------|---------|---------|---------|---------|---------|---------|
> > | | 1 | 2 | 3 | 4 | 5 | Average |
> > |---------|---------|---------|---------|---------|---------|---------|
> > | Data | 6.707s | 7.070s | 6.722s | 6.761s | 6.810s | 6.814s |
> > | Hole | 5.152s | 5.149s | 5.213s | 5.206s | 5.312s | 5.206s |
> > | Mixed | 7.979s | 7.985s | 8.177s | 7.772s | 8.280s | 8.039s |
> > |---------|---------|---------|---------|---------|---------|---------|
> >
> >
> >
> >
> >
> > #######################
> > # #
> > # With READ_PLUS #
> > # #
> > #######################
> >
> >
> > NFS v4.1:
> > Trial
> > |---------|---------|---------|---------|---------|---------|---------|
> > | | 1 | 2 | 3 | 4 | 5 | Average |
> > |---------|---------|---------|---------|---------|---------|---------|
> > | Data | 9.082s | 7.008s | 7.116s | 6.771s | 7.902s | 7.576s |
> > | Hole | 5.333s | 5.358s | 5.380s | 5.161s | 5.282s | 5.303s |
> > | Mixed | 8.189s | 8.308s | 9.540s | 7.937s | 8.420s | 8.479s |
> > |---------|---------|---------|---------|---------|---------|---------|
> >
> >
> >
> >
> > NFS v4.2:
> > Trial
> > |---------|---------|---------|---------|---------|---------|---------|
> > | | 1 | 2 | 3 | 4 | 5 | Average |
> > |---------|---------|---------|---------|---------|---------|---------|
> > | Data | 7.033s | 6.829s | 7.025s | 6.873s | 7.134s | 6.979s |
> > | Hole | 1.794s | 1.800s | 1.905s | 1.811s | 1.725s | 1.807s |
> > | Mixed | 7.590s | 8.777s | 9.423s | 10.366s | 8.024s | 8.836s |
> > |---------|---------|---------|---------|---------|---------|---------|
> >
>
> So there is a clear win in the 100% hole case here, but otherwise the
> statistical fluctuations are dominating the numbers. Can you get us a
> little more stats and then perhaps run the results through nfsometer?

Also, could you describe the setup (are these still kvm's), and how
you're clearing the cache between runs?

--b.

2015-03-26 15:47:07

by Anna Schumaker

[permalink] [raw]
Subject: Re: [PATCH v3 3/3] NFSD: Add support for encoding multiple segments

On 03/26/2015 11:38 AM, J. Bruce Fields wrote:
> On Thu, Mar 26, 2015 at 11:32:25AM -0400, Trond Myklebust wrote:
>> On Thu, Mar 26, 2015 at 11:21 AM, Anna Schumaker
>> <[email protected]> wrote:
>>> Here are my updated numbers! I tested with files 5G in size: one 100% data, one 100% hole, and one alternating between hole and data every 4K. I collected data for both v4.1 and v4.2 with and without the READ_PLUS patches:
>>>
>>> ##########################
>>> # #
>>> # Without READ_PLUS #
>>> # #
>>> ##########################
>>>
>>>
>>> NFS v4.1:
>>> Trial
>>> |---------|---------|---------|---------|---------|---------|---------|
>>> | | 1 | 2 | 3 | 4 | 5 | Average |
>>> |---------|---------|---------|---------|---------|---------|---------|
>>> | Data | 8.723s | 7.243s | 8.252s | 6.997s | 6.980s | 7.639s |
>>> | Hole | 5.271s | 5.224s | 5.060s | 4.897s | 5.321s | 5.155s |
>>> | Mixed | 8.050s | 10.057s | 7.919s | 8.060s | 9.557s | 8.729s |
>>> |---------|---------|---------|---------|---------|---------|---------|
>>>
>>>
>>>
>>>
>>> NFS v4.2:
>>> Trial
>>> |---------|---------|---------|---------|---------|---------|---------|
>>> | | 1 | 2 | 3 | 4 | 5 | Average |
>>> |---------|---------|---------|---------|---------|---------|---------|
>>> | Data | 6.707s | 7.070s | 6.722s | 6.761s | 6.810s | 6.814s |
>>> | Hole | 5.152s | 5.149s | 5.213s | 5.206s | 5.312s | 5.206s |
>>> | Mixed | 7.979s | 7.985s | 8.177s | 7.772s | 8.280s | 8.039s |
>>> |---------|---------|---------|---------|---------|---------|---------|
>>>
>>>
>>>
>>>
>>>
>>> #######################
>>> # #
>>> # With READ_PLUS #
>>> # #
>>> #######################
>>>
>>>
>>> NFS v4.1:
>>> Trial
>>> |---------|---------|---------|---------|---------|---------|---------|
>>> | | 1 | 2 | 3 | 4 | 5 | Average |
>>> |---------|---------|---------|---------|---------|---------|---------|
>>> | Data | 9.082s | 7.008s | 7.116s | 6.771s | 7.902s | 7.576s |
>>> | Hole | 5.333s | 5.358s | 5.380s | 5.161s | 5.282s | 5.303s |
>>> | Mixed | 8.189s | 8.308s | 9.540s | 7.937s | 8.420s | 8.479s |
>>> |---------|---------|---------|---------|---------|---------|---------|
>>>
>>>
>>>
>>>
>>> NFS v4.2:
>>> Trial
>>> |---------|---------|---------|---------|---------|---------|---------|
>>> | | 1 | 2 | 3 | 4 | 5 | Average |
>>> |---------|---------|---------|---------|---------|---------|---------|
>>> | Data | 7.033s | 6.829s | 7.025s | 6.873s | 7.134s | 6.979s |
>>> | Hole | 1.794s | 1.800s | 1.905s | 1.811s | 1.725s | 1.807s |
>>> | Mixed | 7.590s | 8.777s | 9.423s | 10.366s | 8.024s | 8.836s |
>>> |---------|---------|---------|---------|---------|---------|---------|
>>>
>>
>> So there is a clear win in the 100% hole case here, but otherwise the
>> statistical fluctuations are dominating the numbers. Can you get us a
>> little more stats and then perhaps run the results through nfsometer?
>
> Also, could you describe the setup (are these still kvm's), and how
> you're clearing the cache between runs?

These are still KVMs and my server is exporting an xfs filesystem. I clear caches by running "echo 3 > /proc/sys/vm/drop_caches" on the server before every read, and I remount my client after reading each set of three files once.

>
> --b.
>


2015-03-26 16:06:51

by Trond Myklebust

[permalink] [raw]
Subject: Re: [PATCH v3 3/3] NFSD: Add support for encoding multiple segments

On Thu, Mar 26, 2015 at 11:47 AM, Anna Schumaker
<[email protected]> wrote:
> On 03/26/2015 11:38 AM, J. Bruce Fields wrote:
>> On Thu, Mar 26, 2015 at 11:32:25AM -0400, Trond Myklebust wrote:
>>> On Thu, Mar 26, 2015 at 11:21 AM, Anna Schumaker
>>> <[email protected]> wrote:
>>>> Here are my updated numbers! I tested with files 5G in size: one 100% data, one 100% hole, and one alternating between hole and data every 4K. I collected data for both v4.1 and v4.2 with and without the READ_PLUS patches:
>>>>
>>>> ##########################
>>>> # #
>>>> # Without READ_PLUS #
>>>> # #
>>>> ##########################
>>>>
>>>>
>>>> NFS v4.1:
>>>> Trial
>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>> | | 1 | 2 | 3 | 4 | 5 | Average |
>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>> | Data | 8.723s | 7.243s | 8.252s | 6.997s | 6.980s | 7.639s |
>>>> | Hole | 5.271s | 5.224s | 5.060s | 4.897s | 5.321s | 5.155s |
>>>> | Mixed | 8.050s | 10.057s | 7.919s | 8.060s | 9.557s | 8.729s |
>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>
>>>>
>>>>
>>>>
>>>> NFS v4.2:
>>>> Trial
>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>> | | 1 | 2 | 3 | 4 | 5 | Average |
>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>> | Data | 6.707s | 7.070s | 6.722s | 6.761s | 6.810s | 6.814s |
>>>> | Hole | 5.152s | 5.149s | 5.213s | 5.206s | 5.312s | 5.206s |
>>>> | Mixed | 7.979s | 7.985s | 8.177s | 7.772s | 8.280s | 8.039s |
>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>
>>>>
>>>>
>>>>
>>>>
>>>> #######################
>>>> # #
>>>> # With READ_PLUS #
>>>> # #
>>>> #######################
>>>>
>>>>
>>>> NFS v4.1:
>>>> Trial
>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>> | | 1 | 2 | 3 | 4 | 5 | Average |
>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>> | Data | 9.082s | 7.008s | 7.116s | 6.771s | 7.902s | 7.576s |
>>>> | Hole | 5.333s | 5.358s | 5.380s | 5.161s | 5.282s | 5.303s |
>>>> | Mixed | 8.189s | 8.308s | 9.540s | 7.937s | 8.420s | 8.479s |
>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>
>>>>
>>>>
>>>>
>>>> NFS v4.2:
>>>> Trial
>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>> | | 1 | 2 | 3 | 4 | 5 | Average |
>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>> | Data | 7.033s | 6.829s | 7.025s | 6.873s | 7.134s | 6.979s |
>>>> | Hole | 1.794s | 1.800s | 1.905s | 1.811s | 1.725s | 1.807s |
>>>> | Mixed | 7.590s | 8.777s | 9.423s | 10.366s | 8.024s | 8.836s |
>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>
>>>
>>> So there is a clear win in the 100% hole case here, but otherwise the
>>> statistical fluctuations are dominating the numbers. Can you get us a
>>> little more stats and then perhaps run the results through nfsometer?
>>
>> Also, could you describe the setup (are these still kvm's), and how
>> you're clearing the cache between runs?
>
> These are still KVMs and my server is exporting an xfs filesystem. I clear caches by running "echo 3 > /proc/sys/vm/drop_caches" on the server before every read, and I remount my client after reading each set of three files once.

I agree that you have to use the 'drop_caches' interface on the
server, but why not just use O_DIRECT on the clients?

--
Trond Myklebust
Linux NFS client maintainer, PrimaryData
[email protected]

2015-03-26 16:11:35

by Anna Schumaker

[permalink] [raw]
Subject: Re: [PATCH v3 3/3] NFSD: Add support for encoding multiple segments

On 03/26/2015 12:06 PM, Trond Myklebust wrote:
> On Thu, Mar 26, 2015 at 11:47 AM, Anna Schumaker
> <[email protected]> wrote:
>> On 03/26/2015 11:38 AM, J. Bruce Fields wrote:
>>> On Thu, Mar 26, 2015 at 11:32:25AM -0400, Trond Myklebust wrote:
>>>> On Thu, Mar 26, 2015 at 11:21 AM, Anna Schumaker
>>>> <[email protected]> wrote:
>>>>> Here are my updated numbers! I tested with files 5G in size: one 100% data, one 100% hole, and one alternating between hole and data every 4K. I collected data for both v4.1 and v4.2 with and without the READ_PLUS patches:
>>>>>
>>>>> ##########################
>>>>> # #
>>>>> # Without READ_PLUS #
>>>>> # #
>>>>> ##########################
>>>>>
>>>>>
>>>>> NFS v4.1:
>>>>> Trial
>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>> | | 1 | 2 | 3 | 4 | 5 | Average |
>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>> | Data | 8.723s | 7.243s | 8.252s | 6.997s | 6.980s | 7.639s |
>>>>> | Hole | 5.271s | 5.224s | 5.060s | 4.897s | 5.321s | 5.155s |
>>>>> | Mixed | 8.050s | 10.057s | 7.919s | 8.060s | 9.557s | 8.729s |
>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>
>>>>>
>>>>>
>>>>>
>>>>> NFS v4.2:
>>>>> Trial
>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>> | | 1 | 2 | 3 | 4 | 5 | Average |
>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>> | Data | 6.707s | 7.070s | 6.722s | 6.761s | 6.810s | 6.814s |
>>>>> | Hole | 5.152s | 5.149s | 5.213s | 5.206s | 5.312s | 5.206s |
>>>>> | Mixed | 7.979s | 7.985s | 8.177s | 7.772s | 8.280s | 8.039s |
>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>
>>>>>
>>>>>
>>>>>
>>>>>
>>>>> #######################
>>>>> # #
>>>>> # With READ_PLUS #
>>>>> # #
>>>>> #######################
>>>>>
>>>>>
>>>>> NFS v4.1:
>>>>> Trial
>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>> | | 1 | 2 | 3 | 4 | 5 | Average |
>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>> | Data | 9.082s | 7.008s | 7.116s | 6.771s | 7.902s | 7.576s |
>>>>> | Hole | 5.333s | 5.358s | 5.380s | 5.161s | 5.282s | 5.303s |
>>>>> | Mixed | 8.189s | 8.308s | 9.540s | 7.937s | 8.420s | 8.479s |
>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>
>>>>>
>>>>>
>>>>>
>>>>> NFS v4.2:
>>>>> Trial
>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>> | | 1 | 2 | 3 | 4 | 5 | Average |
>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>> | Data | 7.033s | 6.829s | 7.025s | 6.873s | 7.134s | 6.979s |
>>>>> | Hole | 1.794s | 1.800s | 1.905s | 1.811s | 1.725s | 1.807s |
>>>>> | Mixed | 7.590s | 8.777s | 9.423s | 10.366s | 8.024s | 8.836s |
>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>
>>>>
>>>> So there is a clear win in the 100% hole case here, but otherwise the
>>>> statistical fluctuations are dominating the numbers. Can you get us a
>>>> little more stats and then perhaps run the results through nfsometer?
>>>
>>> Also, could you describe the setup (are these still kvm's), and how
>>> you're clearing the cache between runs?
>>
>> These are still KVMs and my server is exporting an xfs filesystem. I clear caches by running "echo 3 > /proc/sys/vm/drop_caches" on the server before every read, and I remount my client after reading each set of three files once.
>
> I agree that you have to use the 'drop_caches' interface on the
> server, but why not just use O_DIRECT on the clients?

I've been reading by using cat from my test shell script: `time cat /nfs/file > /dev/null`. I can write something to read files with O_DIRECT if that would be more useful!
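
For example, a minimal O_DIRECT reader could look like this (a sketch
only; the 4K alignment is an assumption and error handling is minimal):

    /*
     * Read a file with O_DIRECT and discard the data, roughly the
     * O_DIRECT equivalent of `cat file > /dev/null`.  Sketch only.
     */
    #define _GNU_SOURCE
    #include <stdio.h>
    #include <stdlib.h>
    #include <unistd.h>
    #include <fcntl.h>

    #define ALIGNMENT 4096
    #define BUFLEN    (1024 * 1024)

    int main(int argc, char **argv)
    {
        void *buf;
        ssize_t n;
        long long total = 0;
        int fd;

        if (argc != 2) {
            fprintf(stderr, "usage: %s <file>\n", argv[0]);
            return 1;
        }
        if (posix_memalign(&buf, ALIGNMENT, BUFLEN)) {
            fprintf(stderr, "posix_memalign failed\n");
            return 1;
        }
        fd = open(argv[1], O_RDONLY | O_DIRECT);
        if (fd < 0) {
            perror("open");
            return 1;
        }
        while ((n = read(fd, buf, BUFLEN)) > 0)
            total += n;
        if (n < 0)
            perror("read");
        printf("read %lld bytes\n", total);
        close(fd);
        free(buf);
        return 0;
    }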

Anna

>


2015-03-26 16:11:55

by J. Bruce Fields

[permalink] [raw]
Subject: Re: [PATCH v3 3/3] NFSD: Add support for encoding multiple segments

On Thu, Mar 26, 2015 at 11:47:03AM -0400, Anna Schumaker wrote:
> On 03/26/2015 11:38 AM, J. Bruce Fields wrote:
> > On Thu, Mar 26, 2015 at 11:32:25AM -0400, Trond Myklebust wrote:
> >> On Thu, Mar 26, 2015 at 11:21 AM, Anna Schumaker
> >> <[email protected]> wrote:
> >>> Here are my updated numbers! I tested with files 5G in size: one 100% data, one 100% hole, and one alternating between hole and data every 4K. I collected data for both v4.1 and v4.2 with and without the READ_PLUS patches:
> >>>
> >>> ##########################
> >>> # #
> >>> # Without READ_PLUS #
> >>> # #
> >>> ##########################
> >>>
> >>>
> >>> NFS v4.1:
> >>> Trial
> >>> |---------|---------|---------|---------|---------|---------|---------|
> >>> | | 1 | 2 | 3 | 4 | 5 | Average |
> >>> |---------|---------|---------|---------|---------|---------|---------|
> >>> | Data | 8.723s | 7.243s | 8.252s | 6.997s | 6.980s | 7.639s |
> >>> | Hole | 5.271s | 5.224s | 5.060s | 4.897s | 5.321s | 5.155s |
> >>> | Mixed | 8.050s | 10.057s | 7.919s | 8.060s | 9.557s | 8.729s |
> >>> |---------|---------|---------|---------|---------|---------|---------|
> >>>
> >>>
> >>>
> >>>
> >>> NFS v4.2:
> >>> Trial
> >>> |---------|---------|---------|---------|---------|---------|---------|
> >>> | | 1 | 2 | 3 | 4 | 5 | Average |
> >>> |---------|---------|---------|---------|---------|---------|---------|
> >>> | Data | 6.707s | 7.070s | 6.722s | 6.761s | 6.810s | 6.814s |
> >>> | Hole | 5.152s | 5.149s | 5.213s | 5.206s | 5.312s | 5.206s |
> >>> | Mixed | 7.979s | 7.985s | 8.177s | 7.772s | 8.280s | 8.039s |
> >>> |---------|---------|---------|---------|---------|---------|---------|
> >>>
> >>>
> >>>
> >>>
> >>>
> >>> #######################
> >>> # #
> >>> # With READ_PLUS #
> >>> # #
> >>> #######################
> >>>
> >>>
> >>> NFS v4.1:
> >>> Trial
> >>> |---------|---------|---------|---------|---------|---------|---------|
> >>> | | 1 | 2 | 3 | 4 | 5 | Average |
> >>> |---------|---------|---------|---------|---------|---------|---------|
> >>> | Data | 9.082s | 7.008s | 7.116s | 6.771s | 7.902s | 7.576s |
> >>> | Hole | 5.333s | 5.358s | 5.380s | 5.161s | 5.282s | 5.303s |
> >>> | Mixed | 8.189s | 8.308s | 9.540s | 7.937s | 8.420s | 8.479s |
> >>> |---------|---------|---------|---------|---------|---------|---------|
> >>>
> >>>
> >>>
> >>>
> >>> NFS v4.2:
> >>> Trial
> >>> |---------|---------|---------|---------|---------|---------|---------|
> >>> | | 1 | 2 | 3 | 4 | 5 | Average |
> >>> |---------|---------|---------|---------|---------|---------|---------|
> >>> | Data | 7.033s | 6.829s | 7.025s | 6.873s | 7.134s | 6.979s |
> >>> | Hole | 1.794s | 1.800s | 1.905s | 1.811s | 1.725s | 1.807s |
> >>> | Mixed | 7.590s | 8.777s | 9.423s | 10.366s | 8.024s | 8.836s |
> >>> |---------|---------|---------|---------|---------|---------|---------|
> >>>
> >>
> >> So there is a clear win in the 100% hole case here, but otherwise the
> >> statistical fluctuations are dominating the numbers. Can you get us a
> >> little more stats and then perhaps run the results through nfsometer?
> >
> > Also, could you describe the setup (are these still kvm's), and how
> > you're clearing the cache between runs?
>
> These are still KVMs and my server is exporting an xfs filesystem. I clear caches by running "echo 3 > /proc/sys/vm/drop_caches" on the server before every read, and I remount my client after reading each set of three files once.

What sort of device is the exported xfs filesystem on? (Can't there
be a second level of caching on the guest, depending on how it's set
up?)

Can we get results on bare metal? (The kvm test might be a good
worst-case for read_plus, as I'd expect bandwidth to be relatively high
compared to the cost of the extra memcpy's or seek calls. But it also
seems more complicated.)

--b.

2015-03-26 16:13:27

by Trond Myklebust

[permalink] [raw]
Subject: Re: [PATCH v3 3/3] NFSD: Add support for encoding multiple segments

On Thu, Mar 26, 2015 at 12:11 PM, Anna Schumaker
<[email protected]> wrote:
> On 03/26/2015 12:06 PM, Trond Myklebust wrote:
>> On Thu, Mar 26, 2015 at 11:47 AM, Anna Schumaker
>> <[email protected]> wrote:
>>> On 03/26/2015 11:38 AM, J. Bruce Fields wrote:
>>>> On Thu, Mar 26, 2015 at 11:32:25AM -0400, Trond Myklebust wrote:
>>>>> On Thu, Mar 26, 2015 at 11:21 AM, Anna Schumaker
>>>>> <[email protected]> wrote:
>>>>>> Here are my updated numbers! I tested with files 5G in size: one 100% data, one 100% hole, and one alternating between hole and data every 4K. I collected data for both v4.1 and v4.2 with and without the READ_PLUS patches:
>>>>>>
>>>>>> ##########################
>>>>>> # #
>>>>>> # Without READ_PLUS #
>>>>>> # #
>>>>>> ##########################
>>>>>>
>>>>>>
>>>>>> NFS v4.1:
>>>>>> Trial
>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>> | | 1 | 2 | 3 | 4 | 5 | Average |
>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>> | Data | 8.723s | 7.243s | 8.252s | 6.997s | 6.980s | 7.639s |
>>>>>> | Hole | 5.271s | 5.224s | 5.060s | 4.897s | 5.321s | 5.155s |
>>>>>> | Mixed | 8.050s | 10.057s | 7.919s | 8.060s | 9.557s | 8.729s |
>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>
>>>>>>
>>>>>>
>>>>>>
>>>>>> NFS v4.2:
>>>>>> Trial
>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>> | | 1 | 2 | 3 | 4 | 5 | Average |
>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>> | Data | 6.707s | 7.070s | 6.722s | 6.761s | 6.810s | 6.814s |
>>>>>> | Hole | 5.152s | 5.149s | 5.213s | 5.206s | 5.312s | 5.206s |
>>>>>> | Mixed | 7.979s | 7.985s | 8.177s | 7.772s | 8.280s | 8.039s |
>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>
>>>>>>
>>>>>>
>>>>>>
>>>>>>
>>>>>> #######################
>>>>>> # #
>>>>>> # With READ_PLUS #
>>>>>> # #
>>>>>> #######################
>>>>>>
>>>>>>
>>>>>> NFS v4.1:
>>>>>> Trial
>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>> | | 1 | 2 | 3 | 4 | 5 | Average |
>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>> | Data | 9.082s | 7.008s | 7.116s | 6.771s | 7.902s | 7.576s |
>>>>>> | Hole | 5.333s | 5.358s | 5.380s | 5.161s | 5.282s | 5.303s |
>>>>>> | Mixed | 8.189s | 8.308s | 9.540s | 7.937s | 8.420s | 8.479s |
>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>
>>>>>>
>>>>>>
>>>>>>
>>>>>> NFS v4.2:
>>>>>> Trial
>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>> | | 1 | 2 | 3 | 4 | 5 | Average |
>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>> | Data | 7.033s | 6.829s | 7.025s | 6.873s | 7.134s | 6.979s |
>>>>>> | Hole | 1.794s | 1.800s | 1.905s | 1.811s | 1.725s | 1.807s |
>>>>>> | Mixed | 7.590s | 8.777s | 9.423s | 10.366s | 8.024s | 8.836s |
>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>
>>>>>
>>>>> So there is a clear win in the 100% hole case here, but otherwise the
>>>>> statistical fluctuations are dominating the numbers. Can you get us a
>>>>> little more stats and then perhaps run the results through nfsometer?
>>>>
>>>> Also, could you describe the setup (are these still kvm's), and how
>>>> you're clearing the cache between runs?
>>>
>>> These are still KVMs and my server is exporting an xfs filesystem. I clear caches by running "echo 3 > /proc/sys/vm/drop_caches" on the server before every read, and I remount my client after reading each set of three files once.
>>
>> I agree that you have to use the 'drop_caches' interface on the
>> server, but why not just use O_DIRECT on the clients?
>
> I've been reading by using cat from my test shell script: `time cat /nfs/file > /dev/null`. I can write something to read files with O_DIRECT if that would be more useful!
>

'dd' can do that for you if the appropriate incantations are performed.

--
Trond Myklebust
Linux NFS client maintainer, PrimaryData
[email protected]

2015-03-26 16:14:49

by Anna Schumaker

[permalink] [raw]
Subject: Re: [PATCH v3 3/3] NFSD: Add support for encoding multiple segments

On 03/26/2015 12:13 PM, Trond Myklebust wrote:
> On Thu, Mar 26, 2015 at 12:11 PM, Anna Schumaker
> <[email protected]> wrote:
>> On 03/26/2015 12:06 PM, Trond Myklebust wrote:
>>> On Thu, Mar 26, 2015 at 11:47 AM, Anna Schumaker
>>> <[email protected]> wrote:
>>>> On 03/26/2015 11:38 AM, J. Bruce Fields wrote:
>>>>> On Thu, Mar 26, 2015 at 11:32:25AM -0400, Trond Myklebust wrote:
>>>>>> On Thu, Mar 26, 2015 at 11:21 AM, Anna Schumaker
>>>>>> <[email protected]> wrote:
>>>>>>> Here are my updated numbers! I tested with files 5G in size: one 100% data, one 100% hole, and one alternating between hole and data every 4K. I collected data for both v4.1 and v4.2 with and without the READ_PLUS patches:
>>>>>>>
>>>>>>> ##########################
>>>>>>> # #
>>>>>>> # Without READ_PLUS #
>>>>>>> # #
>>>>>>> ##########################
>>>>>>>
>>>>>>>
>>>>>>> NFS v4.1:
>>>>>>> Trial
>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>> | | 1 | 2 | 3 | 4 | 5 | Average |
>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>> | Data | 8.723s | 7.243s | 8.252s | 6.997s | 6.980s | 7.639s |
>>>>>>> | Hole | 5.271s | 5.224s | 5.060s | 4.897s | 5.321s | 5.155s |
>>>>>>> | Mixed | 8.050s | 10.057s | 7.919s | 8.060s | 9.557s | 8.729s |
>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>> NFS v4.2:
>>>>>>> Trial
>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>> | | 1 | 2 | 3 | 4 | 5 | Average |
>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>> | Data | 6.707s | 7.070s | 6.722s | 6.761s | 6.810s | 6.814s |
>>>>>>> | Hole | 5.152s | 5.149s | 5.213s | 5.206s | 5.312s | 5.206s |
>>>>>>> | Mixed | 7.979s | 7.985s | 8.177s | 7.772s | 8.280s | 8.039s |
>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>> #######################
>>>>>>> # #
>>>>>>> # With READ_PLUS #
>>>>>>> # #
>>>>>>> #######################
>>>>>>>
>>>>>>>
>>>>>>> NFS v4.1:
>>>>>>> Trial
>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>> | | 1 | 2 | 3 | 4 | 5 | Average |
>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>> | Data | 9.082s | 7.008s | 7.116s | 6.771s | 7.902s | 7.576s |
>>>>>>> | Hole | 5.333s | 5.358s | 5.380s | 5.161s | 5.282s | 5.303s |
>>>>>>> | Mixed | 8.189s | 8.308s | 9.540s | 7.937s | 8.420s | 8.479s |
>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>> NFS v4.2:
>>>>>>> Trial
>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>> | | 1 | 2 | 3 | 4 | 5 | Average |
>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>> | Data | 7.033s | 6.829s | 7.025s | 6.873s | 7.134s | 6.979s |
>>>>>>> | Hole | 1.794s | 1.800s | 1.905s | 1.811s | 1.725s | 1.807s |
>>>>>>> | Mixed | 7.590s | 8.777s | 9.423s | 10.366s | 8.024s | 8.836s |
>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>>
>>>>>>
>>>>>> So there is a clear win in the 100% hole case here, but otherwise the
>>>>>> statistical fluctuations are dominating the numbers. Can you get us a
>>>>>> little more stats and then perhaps run the results through nfsometer?
>>>>>
>>>>> Also, could you describe the setup (are these still kvm's), and how
>>>>> you're clearing the cache between runs?
>>>>
>>>> These are still KVMs and my server is exporting an xfs filesystem. I clear caches by running "echo 3 > /proc/sys/vm/drop_caches" on the server before every read, and I remount my client after reading each set of three files once.
>>>
>>> I agree that you have to use the 'drop_caches' interface on the
>>> server, but why not just use O_DIRECT on the clients?
>>
>> I've been reading by using cat from my test shell script: `time cat /nfs/file > /dev/null`. I can write something to read files with O_DIRECT if that would be more useful!
>>
>
> 'dd' can do that for you if the appropriate incantations are performed.

Got it. I'll sacrifice a goat to 'dd' and rerun the tests with O_DIRECT!
>


2015-03-26 16:18:50

by Anna Schumaker

[permalink] [raw]
Subject: Re: [PATCH v3 3/3] NFSD: Add support for encoding multiple segments

On 03/26/2015 12:11 PM, J. Bruce Fields wrote:
> On Thu, Mar 26, 2015 at 11:47:03AM -0400, Anna Schumaker wrote:
>> On 03/26/2015 11:38 AM, J. Bruce Fields wrote:
>>> On Thu, Mar 26, 2015 at 11:32:25AM -0400, Trond Myklebust wrote:
>>>> On Thu, Mar 26, 2015 at 11:21 AM, Anna Schumaker
>>>> <[email protected]> wrote:
>>>>> Here are my updated numbers! I tested with files 5G in size: one 100% data, one 100% hole, and one alternating between hole and data every 4K. I collected data for both v4.1 and v4.2 with and without the READ_PLUS patches:
>>>>>
>>>>> ##########################
>>>>> # #
>>>>> # Without READ_PLUS #
>>>>> # #
>>>>> ##########################
>>>>>
>>>>>
>>>>> NFS v4.1:
>>>>> Trial
>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>> | | 1 | 2 | 3 | 4 | 5 | Average |
>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>> | Data | 8.723s | 7.243s | 8.252s | 6.997s | 6.980s | 7.639s |
>>>>> | Hole | 5.271s | 5.224s | 5.060s | 4.897s | 5.321s | 5.155s |
>>>>> | Mixed | 8.050s | 10.057s | 7.919s | 8.060s | 9.557s | 8.729s |
>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>
>>>>>
>>>>>
>>>>>
>>>>> NFS v4.2:
>>>>> Trial
>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>> | | 1 | 2 | 3 | 4 | 5 | Average |
>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>> | Data | 6.707s | 7.070s | 6.722s | 6.761s | 6.810s | 6.814s |
>>>>> | Hole | 5.152s | 5.149s | 5.213s | 5.206s | 5.312s | 5.206s |
>>>>> | Mixed | 7.979s | 7.985s | 8.177s | 7.772s | 8.280s | 8.039s |
>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>
>>>>>
>>>>>
>>>>>
>>>>>
>>>>> #######################
>>>>> # #
>>>>> # With READ_PLUS #
>>>>> # #
>>>>> #######################
>>>>>
>>>>>
>>>>> NFS v4.1:
>>>>> Trial
>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>> | | 1 | 2 | 3 | 4 | 5 | Average |
>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>> | Data | 9.082s | 7.008s | 7.116s | 6.771s | 7.902s | 7.576s |
>>>>> | Hole | 5.333s | 5.358s | 5.380s | 5.161s | 5.282s | 5.303s |
>>>>> | Mixed | 8.189s | 8.308s | 9.540s | 7.937s | 8.420s | 8.479s |
>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>
>>>>>
>>>>>
>>>>>
>>>>> NFS v4.2:
>>>>> Trial
>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>> | | 1 | 2 | 3 | 4 | 5 | Average |
>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>> | Data | 7.033s | 6.829s | 7.025s | 6.873s | 7.134s | 6.979s |
>>>>> | Hole | 1.794s | 1.800s | 1.905s | 1.811s | 1.725s | 1.807s |
>>>>> | Mixed | 7.590s | 8.777s | 9.423s | 10.366s | 8.024s | 8.836s |
>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>
>>>>
>>>> So there is a clear win in the 100% hole case here, but otherwise the
>>>> statistical fluctuations are dominating the numbers. Can you get us a
>>>> little more stats and then perhaps run the results through nfsometer?
>>>
>>> Also, could you describe the setup (are these still kvm's), and how
>>> you're clearing the cache between runs?
>>
>> These are still KVMs and my server is exporting an xfs filesystem. I clear caches by running "echo 3 > /proc/sys/vm/drop_caches" on the server before every read, and I remount my client after reading each set of three files once.
>
> What sort of device is the exported xfs filesystem on? (Can't there
> be a second level of caching on the guest, depending on how it's set
> up?)

My host is a MacBook Pro running Arch Linux, and I have all my virtio disks set to "cache mode = none". Let me know if you were asking something different!
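In qemu terms that's the cache= option on each virtio drive; something along these lines, where the image path is just a placeholder:

    # cache=none makes qemu open the image with O_DIRECT, so the host
    # page cache is bypassed for this guest disk
    -drive file=/path/to/guest.img,if=virtio,cache=none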


>
> Can we get results on bare metal? (The kvm test might be a good
> worst-case for read_plus, as I'd expect bandwidth to be relatively high
> compared to the cost of the extra memcpy's or seek calls. But it also
> seems more complicated.)

I do all of my testing on KVM these days! I'll see how difficult it is to set up rEFInd with a custom kernel to test between my laptop and my desktop (or I could run the test between my Raspberry Pis!)

Anna

>
> --b.
>


2015-03-27 19:04:44

by Anna Schumaker

[permalink] [raw]
Subject: Re: [PATCH v3 3/3] NFSD: Add support for encoding multiple segments

I did two separate dd tests with the same 5G files from yesterday, still using the same virtual machines. First, I ran dd using direct I/O for reads:
dd if=/nfs/file iflag=direct of=/dev/null bs=128K

Mixed file performance was awful, so I reran without direct IO enabled for comparison:
dd if=/nfs/file iflag=nocache of=/dev/null oflag=nocache bs=128K

bs=128K sets the block size used by dd to match the NFS rsize; without it, dd only reads 512 bytes at a time and takes forever to complete.
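For anyone who wants to reproduce this, each trial boils down to a loop along these lines (shown for the iflag=direct case; the server hostname, the export and mount paths, and passwordless ssh are assumptions about my setup):

    # one trial = read the data, hole, and mixed files once each,
    # dropping the server's page cache before every read
    for f in data hole mixed; do
            ssh root@server 'echo 3 > /proc/sys/vm/drop_caches'
            /usr/bin/time -f "%es" dd if=/nfs/$f iflag=direct of=/dev/null bs=128K
    done
    # remount the client (fstab entry assumed) between trials
    # so its cache starts cold as well
    umount /nfs && mount /nfs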


##########################
# #
# Without READ_PLUS #
# #
##########################


NFS v4.1, iflag=direct:
Trial
|---------|---------|---------|---------|---------|---------|---------|
| | 1 | 2 | 3 | 4 | 5 | Average |
|---------|---------|---------|---------|---------|---------|---------|
| Data | 11.704s | 11.055s | 11.329s | 11.453s | 10.741s | 11.256s |
| Hole | 9.839s | 9.326s | 9.381s | 9.430s | 8.875s | 9.370s |
| Mixed | 19.150s | 19.468s | 18.650s | 18.537s | 19.312s | 19.023s |
|---------|---------|---------|---------|---------|---------|---------|


NFS v4.2, iflag=direct:
Trial
|---------|---------|---------|---------|---------|---------|---------|
| | 1 | 2 | 3 | 4 | 5 | Average |
|---------|---------|---------|---------|---------|---------|---------|
| Data | 10.927s | 10.885s | 11.114s | 11.283s | 10.371s | 10.916s |
| Hole | 9.515s | 9.039s | 9.116s | 8.867s | 8.905s | 9.088s |
| Mixed | 19.149s | 18.656s | 19.400s | 18.834s | 20.041s | 19.216s |
|---------|---------|---------|---------|---------|---------|---------|




NFS v4.1, iflag=nocache oflag=nocache:
Trial
|---------|---------|---------|---------|---------|---------|---------|
| | 1 | 2 | 3 | 4 | 5 | Average |
|---------|---------|---------|---------|---------|---------|---------|
| Data | 6.808s | 6.698s | 7.482s | 6.761s | 7.235s | 6.995s |
| Hole | 5.350s | 5.148s | 5.161s | 5.070s | 5.089s | 5.164s |
| Mixed | 9.316s | 8.731s | 9.072s | 9.145s | 8.627s | 8.978s |
|---------|---------|---------|---------|---------|---------|---------|


NFS v4.2, iflag=nocache oflag=nocache:
Trial
|---------|---------|---------|---------|---------|---------|---------|
| | 1 | 2 | 3 | 4 | 5 | Average |
|---------|---------|---------|---------|---------|---------|---------|
| Data | 6.686s | 6.848s | 6.876s | 6.799s | 7.815s | 7.004s |
| Hole | 5.092s | 5.330s | 5.050s | 5.280s | 5.030s | 5.156s |
| Mixed | 8.142s | 7.897s | 8.040s | 7.960s | 8.050s | 8.018s |
|---------|---------|---------|---------|---------|---------|---------|






#######################
# #
# With READ_PLUS #
# #
#######################


NFS v4.1, iflag=direct:
Trial
|---------|---------|---------|---------|---------|---------|---------|
| | 1 | 2 | 3 | 4 | 5 | Average |
|---------|---------|---------|---------|---------|---------|---------|
| Data | 9.464s | 10.181s | 10.048s | 9.452s | 10.795s | 9.988s |
| Hole | 7.954s | 8.486s | 7.762s | 7.969s | 8.299s | 8.094s |
| Mixed | 19.037s | 18.323s | 18.965s | 18.156s | 19.185s | 18.733s |
|---------|---------|---------|---------|---------|---------|---------|


NFS v4.2, iflag=direct:
Trial
|---------|---------|---------|---------|---------|---------|---------|
| | 1 | 2 | 3 | 4 | 5 | Average |
|---------|---------|---------|---------|---------|---------|---------|
| Data | 11.923s | 10.026s | 10.222s | 12.387s | 11.431s | 11.198s |
| Hole | 3.247s | 3.155s | 3.191s | 3.243s | 3.202s | 3.208s |
| Mixed | 54.677s | 54.697s | 52.978s | 53.704s | 54.054s | 54.022s |
|---------|---------|---------|---------|---------|---------|---------|




NFS v4.1, iflag=nocache oflag=nocache:
Trial
|---------|---------|---------|---------|---------|---------|---------|
| | 1 | 2 | 3 | 4 | 5 | Average |
|---------|---------|---------|---------|---------|---------|---------|
| Data | 6.788s | 6.802s | 6.750s | 6.756s | 6.852s | 6.790s |
| Hole | 5.143s | 5.165s | 5.104s | 5.154s | 5.116s | 5.136s |
| Mixed | 7.902s | 7.693s | 9.169s | 8.186s | 9.157s | 8.421s |
|---------|---------|---------|---------|---------|---------|---------|


NFS v4.2, iflag=nocache oflag=nocache:
Trial
|---------|---------|---------|---------|---------|---------|---------|
| | 1 | 2 | 3 | 4 | 5 | Average |
|---------|---------|---------|---------|---------|---------|---------|
| Data | 6.897s | 6.862s | 7.054s | 6.961s | 7.081s | 6.971s |
| Hole | 1.690s | 1.673s | 1.553s | 1.554s | 1.490s | 1.592s |
| Mixed | 9.009s | 7.840s | 7.661s | 8.945s | 7.649s | 8.221s |
|---------|---------|---------|---------|---------|---------|---------|


On 03/26/2015 12:13 PM, Trond Myklebust wrote:
> On Thu, Mar 26, 2015 at 12:11 PM, Anna Schumaker
> <[email protected]> wrote:
>> On 03/26/2015 12:06 PM, Trond Myklebust wrote:
>>> On Thu, Mar 26, 2015 at 11:47 AM, Anna Schumaker
>>> <[email protected]> wrote:
>>>> On 03/26/2015 11:38 AM, J. Bruce Fields wrote:
>>>>> On Thu, Mar 26, 2015 at 11:32:25AM -0400, Trond Myklebust wrote:
>>>>>> On Thu, Mar 26, 2015 at 11:21 AM, Anna Schumaker
>>>>>> <[email protected]> wrote:
>>>>>>> Here are my updated numbers! I tested with files 5G in size: one 100% data, one 100% hole, and one alternating between hole and data every 4K. I collected data for both v4.1 and v4.2 with and without the READ_PLUS patches:
>>>>>>>
>>>>>>> ##########################
>>>>>>> # #
>>>>>>> # Without READ_PLUS #
>>>>>>> # #
>>>>>>> ##########################
>>>>>>>
>>>>>>>
>>>>>>> NFS v4.1:
>>>>>>> Trial
>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>> | | 1 | 2 | 3 | 4 | 5 | Average |
>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>> | Data | 8.723s | 7.243s | 8.252s | 6.997s | 6.980s | 7.639s |
>>>>>>> | Hole | 5.271s | 5.224s | 5.060s | 4.897s | 5.321s | 5.155s |
>>>>>>> | Mixed | 8.050s | 10.057s | 7.919s | 8.060s | 9.557s | 8.729s |
>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>> NFS v4.2:
>>>>>>> Trial
>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>> | | 1 | 2 | 3 | 4 | 5 | Average |
>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>> | Data | 6.707s | 7.070s | 6.722s | 6.761s | 6.810s | 6.814s |
>>>>>>> | Hole | 5.152s | 5.149s | 5.213s | 5.206s | 5.312s | 5.206s |
>>>>>>> | Mixed | 7.979s | 7.985s | 8.177s | 7.772s | 8.280s | 8.039s |
>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>> #######################
>>>>>>> # #
>>>>>>> # With READ_PLUS #
>>>>>>> # #
>>>>>>> #######################
>>>>>>>
>>>>>>>
>>>>>>> NFS v4.1:
>>>>>>> Trial
>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>> | | 1 | 2 | 3 | 4 | 5 | Average |
>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>> | Data | 9.082s | 7.008s | 7.116s | 6.771s | 7.902s | 7.576s |
>>>>>>> | Hole | 5.333s | 5.358s | 5.380s | 5.161s | 5.282s | 5.303s |
>>>>>>> | Mixed | 8.189s | 8.308s | 9.540s | 7.937s | 8.420s | 8.479s |
>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>> NFS v4.2:
>>>>>>> Trial
>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>> | | 1 | 2 | 3 | 4 | 5 | Average |
>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>> | Data | 7.033s | 6.829s | 7.025s | 6.873s | 7.134s | 6.979s |
>>>>>>> | Hole | 1.794s | 1.800s | 1.905s | 1.811s | 1.725s | 1.807s |
>>>>>>> | Mixed | 7.590s | 8.777s | 9.423s | 10.366s | 8.024s | 8.836s |
>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>>
>>>>>>
>>>>>> So there is a clear win in the 100% hole case here, but otherwise the
>>>>>> statistical fluctuations are dominating the numbers. Can you get us a
>>>>>> little more stats and then perhaps run the results through nfsometer?
>>>>>
>>>>> Also, could you describe the setup (are these still kvm's), and how
>>>>> you're clearing the cache between runs?
>>>>
>>>> These are still KVMs and my server is exporting an xfs filesystem. I clear caches by running "echo 3 > /proc/sys/vm/drop_caches" on the server before every read, and I remount my client after reading each set of three files once.
>>>
>>> I agree that you have to use the 'drop_caches' interface on the
>>> server, but why not just use O_DIRECT on the clients?
>>
>> I've been reading by using cat from my test shell script: `time cat /nfs/file > /dev/null`. I can write something to read files with O_DIRECT if that would be more useful!
>>
>
> 'dd' can do that for you if the appropriate incantations are performed.
>


2015-03-27 20:22:51

by Trond Myklebust

[permalink] [raw]
Subject: Re: [PATCH v3 3/3] NFSD: Add support for encoding multiple segments

On Fri, Mar 27, 2015 at 3:04 PM, Anna Schumaker
<[email protected]> wrote:
> I did two separate dd tests with the same 5G file from yesterday, and still using the same virtual machines. First, I ran dd using direct IO for reads:
> dd if=/nfs/file iflag=direct of=/dev/null bs=128K
>
> Mixed file performance was awful, so I reran without direct IO enabled for comparison:
> dd if=/nfs/file iflag=nocache of=/dev/null oflag=nocache bs=128K
>
> bs=128K sets the block size used by dd to the NFS rsize, without this dd will only read 512 bytes at a time and take forever to complete.
>
>
> ##########################
> # #
> # Without READ_PLUS #
> # #
> ##########################
>
>
> NFS v4.1, iflag=direct:
> Trial
> |---------|---------|---------|---------|---------|---------|---------|
> | | 1 | 2 | 3 | 4 | 5 | Average |
> |---------|---------|---------|---------|---------|---------|---------|
> | Data | 11.704s | 11.055s | 11.329s | 11.453s | 10.741s | 11.256s |
> | Hole | 9.839s | 9.326s | 9.381s | 9.430s | 8.875s | 9.370s |
> | Mixed | 19.150s | 19.468s | 18.650s | 18.537s | 19.312s | 19.023s |
> |---------|---------|---------|---------|---------|---------|---------|
>
>
> NFS v4.2, iflag=direct:
> Trial
> |---------|---------|---------|---------|---------|---------|---------|
> | | 1 | 2 | 3 | 4 | 5 | Average |
> |---------|---------|---------|---------|---------|---------|---------|
> | Data | 10.927s | 10.885s | 11.114s | 11.283s | 10.371s | 10.916s |
> | Hole | 9.515s | 9.039s | 9.116s | 8.867s | 8.905s | 9.088s |
> | Mixed | 19.149s | 18.656s | 19.400s | 18.834s | 20.041s | 19.216s |
> |---------|---------|---------|---------|---------|---------|---------|
>
>
>
>
> NFS v4.1, iflag=nocache oflag=nocache:
> Trial
> |---------|---------|---------|---------|---------|---------|---------|
> | | 1 | 2 | 3 | 4 | 5 | Average |
> |---------|---------|---------|---------|---------|---------|---------|
> | Data | 6.808s | 6.698s | 7.482s | 6.761s | 7.235s | 6.995s |
> | Hole | 5.350s | 5.148s | 5.161s | 5.070s | 5.089s | 5.164s |
> | Mixed | 9.316s | 8.731s | 9.072s | 9.145s | 8.627s | 8.978s |
> |---------|---------|---------|---------|---------|---------|---------|
>
>
> NFS v4.2, iflag=nocache oflag=nocache:
> Trial
> |---------|---------|---------|---------|---------|---------|---------|
> | | 1 | 2 | 3 | 4 | 5 | Average |
> |---------|---------|---------|---------|---------|---------|---------|
> | Data | 6.686s | 6.848s | 6.876s | 6.799s | 7.815s | 7.004s |
> | Hole | 5.092s | 5.330s | 5.050s | 5.280s | 5.030s | 5.156s |
> | Mixed | 8.142s | 7.897s | 8.040s | 7.960s | 8.050s | 8.018s |
> |---------|---------|---------|---------|---------|---------|---------|
>
>
>
>
>
>
> #######################
> # #
> # With READ_PLUS #
> # #
> #######################
>
>
> NFS v4.1, iflag=direct:
> Trial
> |---------|---------|---------|---------|---------|---------|---------|
> | | 1 | 2 | 3 | 4 | 5 | Average |
> |---------|---------|---------|---------|---------|---------|---------|
> | Data | 9.464s | 10.181s | 10.048s | 9.452s | 10.795s | 9.988s |
> | Hole | 7.954s | 8.486s | 7.762s | 7.969s | 8.299s | 8.094s |
> | Mixed | 19.037s | 18.323s | 18.965s | 18.156s | 19.185s | 18.733s |
> |---------|---------|---------|---------|---------|---------|---------|
>
>
> NFS v4.2, iflag=direct:
> Trial
> |---------|---------|---------|---------|---------|---------|---------|
> | | 1 | 2 | 3 | 4 | 5 | Average |
> |---------|---------|---------|---------|---------|---------|---------|
> | Data | 11.923s | 10.026s | 10.222s | 12.387s | 11.431s | 11.198s |
> | Hole | 3.247s | 3.155s | 3.191s | 3.243s | 3.202s | 3.208s |
> | Mixed | 54.677s | 54.697s | 52.978s | 53.704s | 54.054s | 54.022s |

That's a bit nasty. Any idea what is going on with the Mixed case here?

> |---------|---------|---------|---------|---------|---------|---------|
>
>
>
>
> NFS v4.1, iflag=nocache oflag=nocache:
> Trial
> |---------|---------|---------|---------|---------|---------|---------|
> | | 1 | 2 | 3 | 4 | 5 | Average |
> |---------|---------|---------|---------|---------|---------|---------|
> | Data | 6.788s | 6.802s | 6.750s | 6.756s | 6.852s | 6.790s |
> | Hole | 5.143s | 5.165s | 5.104s | 5.154s | 5.116s | 5.136s |
> | Mixed | 7.902s | 7.693s | 9.169s | 8.186s | 9.157s | 8.421s |
> |---------|---------|---------|---------|---------|---------|---------|
>
>
> NFS v4.2, iflag=nocache oflag=nocache:
> Trial
> |---------|---------|---------|---------|---------|---------|---------|
> | | 1 | 2 | 3 | 4 | 5 | Average |
> |---------|---------|---------|---------|---------|---------|---------|
> | Data | 6.897s | 6.862s | 7.054s | 6.961s | 7.081s | 6.971s |
> | Hole | 1.690s | 1.673s | 1.553s | 1.554s | 1.490s | 1.592s |
> | Mixed | 9.009s | 7.840s | 7.661s | 8.945s | 7.649s | 8.221s |
> |---------|---------|---------|---------|---------|---------|---------|
>
>
> On 03/26/2015 12:13 PM, Trond Myklebust wrote:
>> On Thu, Mar 26, 2015 at 12:11 PM, Anna Schumaker
>> <[email protected]> wrote:
>>> On 03/26/2015 12:06 PM, Trond Myklebust wrote:
>>>> On Thu, Mar 26, 2015 at 11:47 AM, Anna Schumaker
>>>> <[email protected]> wrote:
>>>>> On 03/26/2015 11:38 AM, J. Bruce Fields wrote:
>>>>>> On Thu, Mar 26, 2015 at 11:32:25AM -0400, Trond Myklebust wrote:
>>>>>>> On Thu, Mar 26, 2015 at 11:21 AM, Anna Schumaker
>>>>>>> <[email protected]> wrote:
>>>>>>>> Here are my updated numbers! I tested with files 5G in size: one 100% data, one 100% hole, and one alternating between hole and data every 4K. I collected data for both v4.1 and v4.2 with and without the READ_PLUS patches:
>>>>>>>>
>>>>>>>> ##########################
>>>>>>>> # #
>>>>>>>> # Without READ_PLUS #
>>>>>>>> # #
>>>>>>>> ##########################
>>>>>>>>
>>>>>>>>
>>>>>>>> NFS v4.1:
>>>>>>>> Trial
>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>>> | | 1 | 2 | 3 | 4 | 5 | Average |
>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>>> | Data | 8.723s | 7.243s | 8.252s | 6.997s | 6.980s | 7.639s |
>>>>>>>> | Hole | 5.271s | 5.224s | 5.060s | 4.897s | 5.321s | 5.155s |
>>>>>>>> | Mixed | 8.050s | 10.057s | 7.919s | 8.060s | 9.557s | 8.729s |
>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>>>
>>>>>>>>
>>>>>>>>
>>>>>>>>
>>>>>>>> NFS v4.2:
>>>>>>>> Trial
>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>>> | | 1 | 2 | 3 | 4 | 5 | Average |
>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>>> | Data | 6.707s | 7.070s | 6.722s | 6.761s | 6.810s | 6.814s |
>>>>>>>> | Hole | 5.152s | 5.149s | 5.213s | 5.206s | 5.312s | 5.206s |
>>>>>>>> | Mixed | 7.979s | 7.985s | 8.177s | 7.772s | 8.280s | 8.039s |
>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>>>
>>>>>>>>
>>>>>>>>
>>>>>>>>
>>>>>>>>
>>>>>>>> #######################
>>>>>>>> # #
>>>>>>>> # With READ_PLUS #
>>>>>>>> # #
>>>>>>>> #######################
>>>>>>>>
>>>>>>>>
>>>>>>>> NFS v4.1:
>>>>>>>> Trial
>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>>> | | 1 | 2 | 3 | 4 | 5 | Average |
>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>>> | Data | 9.082s | 7.008s | 7.116s | 6.771s | 7.902s | 7.576s |
>>>>>>>> | Hole | 5.333s | 5.358s | 5.380s | 5.161s | 5.282s | 5.303s |
>>>>>>>> | Mixed | 8.189s | 8.308s | 9.540s | 7.937s | 8.420s | 8.479s |
>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>>>
>>>>>>>>
>>>>>>>>
>>>>>>>>
>>>>>>>> NFS v4.2:
>>>>>>>> Trial
>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>>> | | 1 | 2 | 3 | 4 | 5 | Average |
>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>>> | Data | 7.033s | 6.829s | 7.025s | 6.873s | 7.134s | 6.979s |
>>>>>>>> | Hole | 1.794s | 1.800s | 1.905s | 1.811s | 1.725s | 1.807s |
>>>>>>>> | Mixed | 7.590s | 8.777s | 9.423s | 10.366s | 8.024s | 8.836s |
>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>>>
>>>>>>>
>>>>>>> So there is a clear win in the 100% hole case here, but otherwise the
>>>>>>> statistical fluctuations are dominating the numbers. Can you get us a
>>>>>>> little more stats and then perhaps run the results through nfsometer?
>>>>>>
>>>>>> Also, could you describe the setup (are these still kvm's), and how
>>>>>> you're clearing the cache between runs?
>>>>>
>>>>> These are still KVMs and my server is exporting an xfs filesystem. I clear caches by running "echo 3 > /proc/sys/vm/drop_caches" on the server before every read, and I remount my client after reading each set of three files once.
>>>>
>>>> I agree that you have to use the 'drop_caches' interface on the
>>>> server, but why not just use O_DIRECT on the clients?
>>>
>>> I've been reading by using cat from my test shell script: `time cat /nfs/file > /dev/null`. I can write something to read files with O_DIRECT if that would be more useful!
>>>
>>
>> 'dd' can do that for you if the appropriate incantations are performed.
>>
>



--
Trond Myklebust
Linux NFS client maintainer, PrimaryData
[email protected]

2015-03-27 20:46:59

by Anna Schumaker

[permalink] [raw]
Subject: Re: [PATCH v3 3/3] NFSD: Add support for encoding multiple segments

On 03/27/2015 04:22 PM, Trond Myklebust wrote:
> On Fri, Mar 27, 2015 at 3:04 PM, Anna Schumaker
> <[email protected]> wrote:
>> I did two separate dd tests with the same 5G file from yesterday, and still using the same virtual machines. First, I ran dd using direct IO for reads:
>> dd if=/nfs/file iflag=direct of=/dev/null bs=128K
>>
>> Mixed file performance was awful, so I reran without direct IO enabled for comparison:
>> dd if=/nfs/file iflag=nocache of=/dev/null oflag=nocache bs=128K
>>
>> bs=128K sets the block size used by dd to the NFS rsize, without this dd will only read 512 bytes at a time and take forever to complete.
>>
>>
>> ##########################
>> # #
>> # Without READ_PLUS #
>> # #
>> ##########################
>>
>>
>> NFS v4.1, iflag=direct:
>> Trial
>> |---------|---------|---------|---------|---------|---------|---------|
>> | | 1 | 2 | 3 | 4 | 5 | Average |
>> |---------|---------|---------|---------|---------|---------|---------|
>> | Data | 11.704s | 11.055s | 11.329s | 11.453s | 10.741s | 11.256s |
>> | Hole | 9.839s | 9.326s | 9.381s | 9.430s | 8.875s | 9.370s |
>> | Mixed | 19.150s | 19.468s | 18.650s | 18.537s | 19.312s | 19.023s |
>> |---------|---------|---------|---------|---------|---------|---------|
>>
>>
>> NFS v4.2, iflag=direct:
>> Trial
>> |---------|---------|---------|---------|---------|---------|---------|
>> | | 1 | 2 | 3 | 4 | 5 | Average |
>> |---------|---------|---------|---------|---------|---------|---------|
>> | Data | 10.927s | 10.885s | 11.114s | 11.283s | 10.371s | 10.916s |
>> | Hole | 9.515s | 9.039s | 9.116s | 8.867s | 8.905s | 9.088s |
>> | Mixed | 19.149s | 18.656s | 19.400s | 18.834s | 20.041s | 19.216s |
>> |---------|---------|---------|---------|---------|---------|---------|
>>
>>
>>
>>
>> NFS v4.1, iflag=nocache oflag=nocache:
>> Trial
>> |---------|---------|---------|---------|---------|---------|---------|
>> | | 1 | 2 | 3 | 4 | 5 | Average |
>> |---------|---------|---------|---------|---------|---------|---------|
>> | Data | 6.808s | 6.698s | 7.482s | 6.761s | 7.235s | 6.995s |
>> | Hole | 5.350s | 5.148s | 5.161s | 5.070s | 5.089s | 5.164s |
>> | Mixed | 9.316s | 8.731s | 9.072s | 9.145s | 8.627s | 8.978s |
>> |---------|---------|---------|---------|---------|---------|---------|
>>
>>
>> NFS v4.2, iflag=nocache oflag=nocache:
>> Trial
>> |---------|---------|---------|---------|---------|---------|---------|
>> | | 1 | 2 | 3 | 4 | 5 | Average |
>> |---------|---------|---------|---------|---------|---------|---------|
>> | Data | 6.686s | 6.848s | 6.876s | 6.799s | 7.815s | 7.004s |
>> | Hole | 5.092s | 5.330s | 5.050s | 5.280s | 5.030s | 5.156s |
>> | Mixed | 8.142s | 7.897s | 8.040s | 7.960s | 8.050s | 8.018s |
>> |---------|---------|---------|---------|---------|---------|---------|
>>
>>
>>
>>
>>
>>
>> #######################
>> # #
>> # With READ_PLUS #
>> # #
>> #######################
>>
>>
>> NFS v4.1, iflag=direct:
>> Trial
>> |---------|---------|---------|---------|---------|---------|---------|
>> | | 1 | 2 | 3 | 4 | 5 | Average |
>> |---------|---------|---------|---------|---------|---------|---------|
>> | Data | 9.464s | 10.181s | 10.048s | 9.452s | 10.795s | 9.988s |
>> | Hole | 7.954s | 8.486s | 7.762s | 7.969s | 8.299s | 8.094s |
>> | Mixed | 19.037s | 18.323s | 18.965s | 18.156s | 19.185s | 18.733s |
>> |---------|---------|---------|---------|---------|---------|---------|
>>
>>
>> NFS v4.2, iflag=direct:
>> Trial
>> |---------|---------|---------|---------|---------|---------|---------|
>> | | 1 | 2 | 3 | 4 | 5 | Average |
>> |---------|---------|---------|---------|---------|---------|---------|
>> | Data | 11.923s | 10.026s | 10.222s | 12.387s | 11.431s | 11.198s |
>> | Hole | 3.247s | 3.155s | 3.191s | 3.243s | 3.202s | 3.208s |
>> | Mixed | 54.677s | 54.697s | 52.978s | 53.704s | 54.054s | 54.022s |
>
> That's a bit nasty. Any idea what is going on with the Mixed case here?

Not offhand, but my first guess would be something to do with the extra seeks needed to find out how long each hole and data segment is.
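For what it's worth, running filefrag against the server's copies gives a rough idea of how many data segments there are to discover (the /exports paths are just examples):

    # each isolated 4K data block in the mixed file is its own extent,
    # so there are on the order of 5G / 8K = ~650k data/hole boundaries
    # for the server to walk when it builds the READ_PLUS reply
    filefrag /exports/data
    filefrag /exports/hole
    filefrag /exports/mixed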

Anna

>
>> |---------|---------|---------|---------|---------|---------|---------|
>>
>>
>>
>>
>> NFS v4.1, iflag=nocache oflag=nocache:
>> Trial
>> |---------|---------|---------|---------|---------|---------|---------|
>> | | 1 | 2 | 3 | 4 | 5 | Average |
>> |---------|---------|---------|---------|---------|---------|---------|
>> | Data | 6.788s | 6.802s | 6.750s | 6.756s | 6.852s | 6.790s |
>> | Hole | 5.143s | 5.165s | 5.104s | 5.154s | 5.116s | 5.136s |
>> | Mixed | 7.902s | 7.693s | 9.169s | 8.186s | 9.157s | 8.421s |
>> |---------|---------|---------|---------|---------|---------|---------|
>>
>>
>> NFS v4.2, iflag=nocache oflag=nocache:
>> Trial
>> |---------|---------|---------|---------|---------|---------|---------|
>> | | 1 | 2 | 3 | 4 | 5 | Average |
>> |---------|---------|---------|---------|---------|---------|---------|
>> | Data | 6.897s | 6.862s | 7.054s | 6.961s | 7.081s | 6.971s |
>> | Hole | 1.690s | 1.673s | 1.553s | 1.554s | 1.490s | 1.592s |
>> | Mixed | 9.009s | 7.840s | 7.661s | 8.945s | 7.649s | 8.221s |
>> |---------|---------|---------|---------|---------|---------|---------|
>>
>>
>> On 03/26/2015 12:13 PM, Trond Myklebust wrote:
>>> On Thu, Mar 26, 2015 at 12:11 PM, Anna Schumaker
>>> <[email protected]> wrote:
>>>> On 03/26/2015 12:06 PM, Trond Myklebust wrote:
>>>>> On Thu, Mar 26, 2015 at 11:47 AM, Anna Schumaker
>>>>> <[email protected]> wrote:
>>>>>> On 03/26/2015 11:38 AM, J. Bruce Fields wrote:
>>>>>>> On Thu, Mar 26, 2015 at 11:32:25AM -0400, Trond Myklebust wrote:
>>>>>>>> On Thu, Mar 26, 2015 at 11:21 AM, Anna Schumaker
>>>>>>>> <[email protected]> wrote:
>>>>>>>>> Here are my updated numbers! I tested with files 5G in size: one 100% data, one 100% hole, and one alternating between hole and data every 4K. I collected data for both v4.1 and v4.2 with and without the READ_PLUS patches:
>>>>>>>>>
>>>>>>>>> ##########################
>>>>>>>>> # #
>>>>>>>>> # Without READ_PLUS #
>>>>>>>>> # #
>>>>>>>>> ##########################
>>>>>>>>>
>>>>>>>>>
>>>>>>>>> NFS v4.1:
>>>>>>>>> Trial
>>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>>>> | | 1 | 2 | 3 | 4 | 5 | Average |
>>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>>>> | Data | 8.723s | 7.243s | 8.252s | 6.997s | 6.980s | 7.639s |
>>>>>>>>> | Hole | 5.271s | 5.224s | 5.060s | 4.897s | 5.321s | 5.155s |
>>>>>>>>> | Mixed | 8.050s | 10.057s | 7.919s | 8.060s | 9.557s | 8.729s |
>>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>> NFS v4.2:
>>>>>>>>> Trial
>>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>>>> | | 1 | 2 | 3 | 4 | 5 | Average |
>>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>>>> | Data | 6.707s | 7.070s | 6.722s | 6.761s | 6.810s | 6.814s |
>>>>>>>>> | Hole | 5.152s | 5.149s | 5.213s | 5.206s | 5.312s | 5.206s |
>>>>>>>>> | Mixed | 7.979s | 7.985s | 8.177s | 7.772s | 8.280s | 8.039s |
>>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>> #######################
>>>>>>>>> # #
>>>>>>>>> # With READ_PLUS #
>>>>>>>>> # #
>>>>>>>>> #######################
>>>>>>>>>
>>>>>>>>>
>>>>>>>>> NFS v4.1:
>>>>>>>>> Trial
>>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>>>> | | 1 | 2 | 3 | 4 | 5 | Average |
>>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>>>> | Data | 9.082s | 7.008s | 7.116s | 6.771s | 7.902s | 7.576s |
>>>>>>>>> | Hole | 5.333s | 5.358s | 5.380s | 5.161s | 5.282s | 5.303s |
>>>>>>>>> | Mixed | 8.189s | 8.308s | 9.540s | 7.937s | 8.420s | 8.479s |
>>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>> NFS v4.2:
>>>>>>>>> Trial
>>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>>>> | | 1 | 2 | 3 | 4 | 5 | Average |
>>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>>>> | Data | 7.033s | 6.829s | 7.025s | 6.873s | 7.134s | 6.979s |
>>>>>>>>> | Hole | 1.794s | 1.800s | 1.905s | 1.811s | 1.725s | 1.807s |
>>>>>>>>> | Mixed | 7.590s | 8.777s | 9.423s | 10.366s | 8.024s | 8.836s |
>>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>>>>
>>>>>>>>
>>>>>>>> So there is a clear win in the 100% hole case here, but otherwise the
>>>>>>>> statistical fluctuations are dominating the numbers. Can you get us a
>>>>>>>> little more stats and then perhaps run the results through nfsometer?
>>>>>>>
>>>>>>> Also, could you describe the setup (are these still kvm's), and how
>>>>>>> you're clearing the cache between runs?
>>>>>>
>>>>>> These are still KVMs and my server is exporting an xfs filesystem. I clear caches by running "echo 3 > /proc/sys/vm/drop_caches" on the server before every read, and I remount my client after reading each set of three files once.
>>>>>
>>>>> I agree that you have to use the 'drop_caches' interface on the
>>>>> server, but why not just use O_DIRECT on the clients?
>>>>
>>>> I've been reading by using cat from my test shell script: `time cat /nfs/file > /dev/null`. I can write something to read files with O_DIRECT if that would be more useful!
>>>>
>>>
>>> 'dd' can do that for you if the appropriate incantations are performed.
>>>
>>
>
>
>


2015-03-27 20:54:16

by J. Bruce Fields

[permalink] [raw]
Subject: Re: [PATCH v3 3/3] NFSD: Add support for encoding multiple segments

On Fri, Mar 27, 2015 at 04:46:55PM -0400, Anna Schumaker wrote:
> On 03/27/2015 04:22 PM, Trond Myklebust wrote:
> > On Fri, Mar 27, 2015 at 3:04 PM, Anna Schumaker
> > <[email protected]> wrote:
> >> I did two separate dd tests with the same 5G file from yesterday, and still using the same virtual machines. First, I ran dd using direct IO for reads:
> >> dd if=/nfs/file iflag=direct of=/dev/null bs=128K
> >>
> >> Mixed file performance was awful, so I reran without direct IO enabled for comparison:
> >> dd if=/nfs/file iflag=nocache of=/dev/null oflag=nocache bs=128K
> >>
> >> bs=128K sets the block size used by dd to the NFS rsize, without this dd will only read 512 bytes at a time and take forever to complete.
> >>
> >>
> >> ##########################
> >> # #
> >> # Without READ_PLUS #
> >> # #
> >> ##########################
> >>
> >>
> >> NFS v4.1, iflag=direct:
> >> Trial
> >> |---------|---------|---------|---------|---------|---------|---------|
> >> | | 1 | 2 | 3 | 4 | 5 | Average |
> >> |---------|---------|---------|---------|---------|---------|---------|
> >> | Data | 11.704s | 11.055s | 11.329s | 11.453s | 10.741s | 11.256s |
> >> | Hole | 9.839s | 9.326s | 9.381s | 9.430s | 8.875s | 9.370s |
> >> | Mixed | 19.150s | 19.468s | 18.650s | 18.537s | 19.312s | 19.023s |
> >> |---------|---------|---------|---------|---------|---------|---------|
> >>
> >>
> >> NFS v4.2, iflag=direct:
> >> Trial
> >> |---------|---------|---------|---------|---------|---------|---------|
> >> | | 1 | 2 | 3 | 4 | 5 | Average |
> >> |---------|---------|---------|---------|---------|---------|---------|
> >> | Data | 10.927s | 10.885s | 11.114s | 11.283s | 10.371s | 10.916s |
> >> | Hole | 9.515s | 9.039s | 9.116s | 8.867s | 8.905s | 9.088s |
> >> | Mixed | 19.149s | 18.656s | 19.400s | 18.834s | 20.041s | 19.216s |
> >> |---------|---------|---------|---------|---------|---------|---------|
> >>
> >>
> >>
> >>
> >> NFS v4.1, iflag=nocache oflag=nocache:
> >> Trial
> >> |---------|---------|---------|---------|---------|---------|---------|
> >> | | 1 | 2 | 3 | 4 | 5 | Average |
> >> |---------|---------|---------|---------|---------|---------|---------|
> >> | Data | 6.808s | 6.698s | 7.482s | 6.761s | 7.235s | 6.995s |
> >> | Hole | 5.350s | 5.148s | 5.161s | 5.070s | 5.089s | 5.164s |
> >> | Mixed | 9.316s | 8.731s | 9.072s | 9.145s | 8.627s | 8.978s |
> >> |---------|---------|---------|---------|---------|---------|---------|
> >>
> >>
> >> NFS v4.2, iflag=nocache oflag=nocache:
> >> Trial
> >> |---------|---------|---------|---------|---------|---------|---------|
> >> | | 1 | 2 | 3 | 4 | 5 | Average |
> >> |---------|---------|---------|---------|---------|---------|---------|
> >> | Data | 6.686s | 6.848s | 6.876s | 6.799s | 7.815s | 7.004s |
> >> | Hole | 5.092s | 5.330s | 5.050s | 5.280s | 5.030s | 5.156s |
> >> | Mixed | 8.142s | 7.897s | 8.040s | 7.960s | 8.050s | 8.018s |
> >> |---------|---------|---------|---------|---------|---------|---------|
> >>
> >>
> >>
> >>
> >>
> >>
> >> #######################
> >> # #
> >> # With READ_PLUS #
> >> # #
> >> #######################
> >>
> >>
> >> NFS v4.1, iflag=direct:
> >> Trial
> >> |---------|---------|---------|---------|---------|---------|---------|
> >> | | 1 | 2 | 3 | 4 | 5 | Average |
> >> |---------|---------|---------|---------|---------|---------|---------|
> >> | Data | 9.464s | 10.181s | 10.048s | 9.452s | 10.795s | 9.988s |
> >> | Hole | 7.954s | 8.486s | 7.762s | 7.969s | 8.299s | 8.094s |
> >> | Mixed | 19.037s | 18.323s | 18.965s | 18.156s | 19.185s | 18.733s |
> >> |---------|---------|---------|---------|---------|---------|---------|
> >>
> >>
> >> NFS v4.2, iflag=direct:
> >> Trial
> >> |---------|---------|---------|---------|---------|---------|---------|
> >> | | 1 | 2 | 3 | 4 | 5 | Average |
> >> |---------|---------|---------|---------|---------|---------|---------|
> >> | Data | 11.923s | 10.026s | 10.222s | 12.387s | 11.431s | 11.198s |
> >> | Hole | 3.247s | 3.155s | 3.191s | 3.243s | 3.202s | 3.208s |
> >> | Mixed | 54.677s | 54.697s | 52.978s | 53.704s | 54.054s | 54.022s |
> >
> > That's a bit nasty. Any idea what is going on with the Mixed case here?
>
> Not offhand, but my first guess would be something to do with extra seeks to find how long each hole and data segment is.

Remind us what "mixed" means? (I think you were alternating, but how
large is each segment?)

--b.

>
> Anna
>
> >
> >> |---------|---------|---------|---------|---------|---------|---------|
> >>
> >>
> >>
> >>
> >> NFS v4.1, iflag=nocache oflag=nocache:
> >> Trial
> >> |---------|---------|---------|---------|---------|---------|---------|
> >> | | 1 | 2 | 3 | 4 | 5 | Average |
> >> |---------|---------|---------|---------|---------|---------|---------|
> >> | Data | 6.788s | 6.802s | 6.750s | 6.756s | 6.852s | 6.790s |
> >> | Hole | 5.143s | 5.165s | 5.104s | 5.154s | 5.116s | 5.136s |
> >> | Mixed | 7.902s | 7.693s | 9.169s | 8.186s | 9.157s | 8.421s |
> >> |---------|---------|---------|---------|---------|---------|---------|
> >>
> >>
> >> NFS v4.2, iflag=nocache oflag=nocache:
> >> Trial
> >> |---------|---------|---------|---------|---------|---------|---------|
> >> | | 1 | 2 | 3 | 4 | 5 | Average |
> >> |---------|---------|---------|---------|---------|---------|---------|
> >> | Data | 6.897s | 6.862s | 7.054s | 6.961s | 7.081s | 6.971s |
> >> | Hole | 1.690s | 1.673s | 1.553s | 1.554s | 1.490s | 1.592s |
> >> | Mixed | 9.009s | 7.840s | 7.661s | 8.945s | 7.649s | 8.221s |
> >> |---------|---------|---------|---------|---------|---------|---------|
> >>
> >>
> >> On 03/26/2015 12:13 PM, Trond Myklebust wrote:
> >>> On Thu, Mar 26, 2015 at 12:11 PM, Anna Schumaker
> >>> <[email protected]> wrote:
> >>>> On 03/26/2015 12:06 PM, Trond Myklebust wrote:
> >>>>> On Thu, Mar 26, 2015 at 11:47 AM, Anna Schumaker
> >>>>> <[email protected]> wrote:
> >>>>>> On 03/26/2015 11:38 AM, J. Bruce Fields wrote:
> >>>>>>> On Thu, Mar 26, 2015 at 11:32:25AM -0400, Trond Myklebust wrote:
> >>>>>>>> On Thu, Mar 26, 2015 at 11:21 AM, Anna Schumaker
> >>>>>>>> <[email protected]> wrote:
> >>>>>>>>> Here are my updated numbers! I tested with files 5G in size: one 100% data, one 100% hole, and one alternating between hole and data every 4K. I collected data for both v4.1 and v4.2 with and without the READ_PLUS patches:
> >>>>>>>>>
> >>>>>>>>> ##########################
> >>>>>>>>> # #
> >>>>>>>>> # Without READ_PLUS #
> >>>>>>>>> # #
> >>>>>>>>> ##########################
> >>>>>>>>>
> >>>>>>>>>
> >>>>>>>>> NFS v4.1:
> >>>>>>>>> Trial
> >>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
> >>>>>>>>> | | 1 | 2 | 3 | 4 | 5 | Average |
> >>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
> >>>>>>>>> | Data | 8.723s | 7.243s | 8.252s | 6.997s | 6.980s | 7.639s |
> >>>>>>>>> | Hole | 5.271s | 5.224s | 5.060s | 4.897s | 5.321s | 5.155s |
> >>>>>>>>> | Mixed | 8.050s | 10.057s | 7.919s | 8.060s | 9.557s | 8.729s |
> >>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
> >>>>>>>>>
> >>>>>>>>>
> >>>>>>>>>
> >>>>>>>>>
> >>>>>>>>> NFS v4.2:
> >>>>>>>>> Trial
> >>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
> >>>>>>>>> | | 1 | 2 | 3 | 4 | 5 | Average |
> >>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
> >>>>>>>>> | Data | 6.707s | 7.070s | 6.722s | 6.761s | 6.810s | 6.814s |
> >>>>>>>>> | Hole | 5.152s | 5.149s | 5.213s | 5.206s | 5.312s | 5.206s |
> >>>>>>>>> | Mixed | 7.979s | 7.985s | 8.177s | 7.772s | 8.280s | 8.039s |
> >>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
> >>>>>>>>>
> >>>>>>>>>
> >>>>>>>>>
> >>>>>>>>>
> >>>>>>>>>
> >>>>>>>>> #######################
> >>>>>>>>> # #
> >>>>>>>>> # With READ_PLUS #
> >>>>>>>>> # #
> >>>>>>>>> #######################
> >>>>>>>>>
> >>>>>>>>>
> >>>>>>>>> NFS v4.1:
> >>>>>>>>> Trial
> >>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
> >>>>>>>>> | | 1 | 2 | 3 | 4 | 5 | Average |
> >>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
> >>>>>>>>> | Data | 9.082s | 7.008s | 7.116s | 6.771s | 7.902s | 7.576s |
> >>>>>>>>> | Hole | 5.333s | 5.358s | 5.380s | 5.161s | 5.282s | 5.303s |
> >>>>>>>>> | Mixed | 8.189s | 8.308s | 9.540s | 7.937s | 8.420s | 8.479s |
> >>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
> >>>>>>>>>
> >>>>>>>>>
> >>>>>>>>>
> >>>>>>>>>
> >>>>>>>>> NFS v4.2:
> >>>>>>>>> Trial
> >>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
> >>>>>>>>> | | 1 | 2 | 3 | 4 | 5 | Average |
> >>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
> >>>>>>>>> | Data | 7.033s | 6.829s | 7.025s | 6.873s | 7.134s | 6.979s |
> >>>>>>>>> | Hole | 1.794s | 1.800s | 1.905s | 1.811s | 1.725s | 1.807s |
> >>>>>>>>> | Mixed | 7.590s | 8.777s | 9.423s | 10.366s | 8.024s | 8.836s |
> >>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
> >>>>>>>>>
> >>>>>>>>
> >>>>>>>> So there is a clear win in the 100% hole case here, but otherwise the
> >>>>>>>> statistical fluctuations are dominating the numbers. Can you get us a
> >>>>>>>> little more stats and then perhaps run the results through nfsometer?
> >>>>>>>
> >>>>>>> Also, could you describe the setup (are these still kvm's), and how
> >>>>>>> you're clearing the cache between runs?
> >>>>>>
> >>>>>> These are still KVMs and my server is exporting an xfs filesystem. I clear caches by running "echo 3 > /proc/sys/vm/drop_caches" on the server before every read, and I remount my client after reading each set of three files once.
> >>>>>
> >>>>> I agree that you have to use the 'drop_caches' interface on the
> >>>>> server, but why not just use O_DIRECT on the clients?
> >>>>
> >>>> I've been reading by using cat from my test shell script: `time cat /nfs/file > /dev/null`. I can write something to read files with O_DIRECT if that would be more useful!
> >>>>
> >>>
> >>> 'dd' can do that for you if the appropriate incantations are performed.
> >>>
> >>
> >
> >
> >

2015-03-27 20:55:47

by Anna Schumaker

[permalink] [raw]
Subject: Re: [PATCH v3 3/3] NFSD: Add support for encoding multiple segments

On 03/27/2015 04:54 PM, J. Bruce Fields wrote:
> On Fri, Mar 27, 2015 at 04:46:55PM -0400, Anna Schumaker wrote:
>> On 03/27/2015 04:22 PM, Trond Myklebust wrote:
>>> On Fri, Mar 27, 2015 at 3:04 PM, Anna Schumaker
>>> <[email protected]> wrote:
>>>> I did two separate dd tests with the same 5G file from yesterday, and still using the same virtual machines. First, I ran dd using direct IO for reads:
>>>> dd if=/nfs/file iflag=direct of=/dev/null bs=128K
>>>>
>>>> Mixed file performance was awful, so I reran without direct IO enabled for comparison:
>>>> dd if=/nfs/file iflag=nocache of=/dev/null oflag=nocache bs=128K
>>>>
>>>> bs=128K sets the block size used by dd to the NFS rsize, without this dd will only read 512 bytes at a time and take forever to complete.
>>>>
>>>>
>>>> ##########################
>>>> # #
>>>> # Without READ_PLUS #
>>>> # #
>>>> ##########################
>>>>
>>>>
>>>> NFS v4.1, iflag=direct:
>>>> Trial
>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>> | | 1 | 2 | 3 | 4 | 5 | Average |
>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>> | Data | 11.704s | 11.055s | 11.329s | 11.453s | 10.741s | 11.256s |
>>>> | Hole | 9.839s | 9.326s | 9.381s | 9.430s | 8.875s | 9.370s |
>>>> | Mixed | 19.150s | 19.468s | 18.650s | 18.537s | 19.312s | 19.023s |
>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>
>>>>
>>>> NFS v4.2, iflag=direct:
>>>> Trial
>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>> | | 1 | 2 | 3 | 4 | 5 | Average |
>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>> | Data | 10.927s | 10.885s | 11.114s | 11.283s | 10.371s | 10.916s |
>>>> | Hole | 9.515s | 9.039s | 9.116s | 8.867s | 8.905s | 9.088s |
>>>> | Mixed | 19.149s | 18.656s | 19.400s | 18.834s | 20.041s | 19.216s |
>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>
>>>>
>>>>
>>>>
>>>> NFS v4.1, iflag=nocache oflag=nocache:
>>>> Trial
>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>> | | 1 | 2 | 3 | 4 | 5 | Average |
>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>> | Data | 6.808s | 6.698s | 7.482s | 6.761s | 7.235s | 6.995s |
>>>> | Hole | 5.350s | 5.148s | 5.161s | 5.070s | 5.089s | 5.164s |
>>>> | Mixed | 9.316s | 8.731s | 9.072s | 9.145s | 8.627s | 8.978s |
>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>
>>>>
>>>> NFS v4.2, iflag=nocache oflag=nocache:
>>>> Trial
>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>> | | 1 | 2 | 3 | 4 | 5 | Average |
>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>> | Data | 6.686s | 6.848s | 6.876s | 6.799s | 7.815s | 7.004s |
>>>> | Hole | 5.092s | 5.330s | 5.050s | 5.280s | 5.030s | 5.156s |
>>>> | Mixed | 8.142s | 7.897s | 8.040s | 7.960s | 8.050s | 8.018s |
>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>
>>>>
>>>>
>>>>
>>>>
>>>>
>>>> #######################
>>>> # #
>>>> # With READ_PLUS #
>>>> # #
>>>> #######################
>>>>
>>>>
>>>> NFS v4.1, iflag=direct:
>>>> Trial
>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>> | | 1 | 2 | 3 | 4 | 5 | Average |
>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>> | Data | 9.464s | 10.181s | 10.048s | 9.452s | 10.795s | 9.988s |
>>>> | Hole | 7.954s | 8.486s | 7.762s | 7.969s | 8.299s | 8.094s |
>>>> | Mixed | 19.037s | 18.323s | 18.965s | 18.156s | 19.185s | 18.733s |
>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>
>>>>
>>>> NFS v4.2, iflag=direct:
>>>> Trial
>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>> | | 1 | 2 | 3 | 4 | 5 | Average |
>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>> | Data | 11.923s | 10.026s | 10.222s | 12.387s | 11.431s | 11.198s |
>>>> | Hole | 3.247s | 3.155s | 3.191s | 3.243s | 3.202s | 3.208s |
>>>> | Mixed | 54.677s | 54.697s | 52.978s | 53.704s | 54.054s | 54.022s |
>>>
>>> That's a bit nasty. Any idea what is going on with the Mixed case here?
>>
>> Not offhand, but my first guess would be something to do with extra seeks to find how long each hole and data segment is.
>
> Remind us what "mixed" means? (I think you were alternating, but how
> large is each segment?)

"Mixed" is alternating 4K segments.

>
> --b.
>
>>
>> Anna
>>
>>>
>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>
>>>>
>>>>
>>>>
>>>> NFS v4.1, iflag=nocache oflag=nocache:
>>>> Trial
>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>> | | 1 | 2 | 3 | 4 | 5 | Average |
>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>> | Data | 6.788s | 6.802s | 6.750s | 6.756s | 6.852s | 6.790s |
>>>> | Hole | 5.143s | 5.165s | 5.104s | 5.154s | 5.116s | 5.136s |
>>>> | Mixed | 7.902s | 7.693s | 9.169s | 8.186s | 9.157s | 8.421s |
>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>
>>>>
>>>> NFS v4.2, iflag=nocache oflag=nocache:
>>>> Trial
>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>> | | 1 | 2 | 3 | 4 | 5 | Average |
>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>> | Data | 6.897s | 6.862s | 7.054s | 6.961s | 7.081s | 6.971s |
>>>> | Hole | 1.690s | 1.673s | 1.553s | 1.554s | 1.490s | 1.592s |
>>>> | Mixed | 9.009s | 7.840s | 7.661s | 8.945s | 7.649s | 8.221s |
>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>
>>>>
>>>> On 03/26/2015 12:13 PM, Trond Myklebust wrote:
>>>>> On Thu, Mar 26, 2015 at 12:11 PM, Anna Schumaker
>>>>> <[email protected]> wrote:
>>>>>> On 03/26/2015 12:06 PM, Trond Myklebust wrote:
>>>>>>> On Thu, Mar 26, 2015 at 11:47 AM, Anna Schumaker
>>>>>>> <[email protected]> wrote:
>>>>>>>> On 03/26/2015 11:38 AM, J. Bruce Fields wrote:
>>>>>>>>> On Thu, Mar 26, 2015 at 11:32:25AM -0400, Trond Myklebust wrote:
>>>>>>>>>> On Thu, Mar 26, 2015 at 11:21 AM, Anna Schumaker
>>>>>>>>>> <[email protected]> wrote:
>>>>>>>>>>> Here are my updated numbers! I tested with files 5G in size: one 100% data, one 100% hole, and one alternating between hole and data every 4K. I collected data for both v4.1 and v4.2 with and without the READ_PLUS patches:
>>>>>>>>>>>
>>>>>>>>>>> ##########################
>>>>>>>>>>> # #
>>>>>>>>>>> # Without READ_PLUS #
>>>>>>>>>>> # #
>>>>>>>>>>> ##########################
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>> NFS v4.1:
>>>>>>>>>>> Trial
>>>>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>>>>>> | | 1 | 2 | 3 | 4 | 5 | Average |
>>>>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>>>>>> | Data | 8.723s | 7.243s | 8.252s | 6.997s | 6.980s | 7.639s |
>>>>>>>>>>> | Hole | 5.271s | 5.224s | 5.060s | 4.897s | 5.321s | 5.155s |
>>>>>>>>>>> | Mixed | 8.050s | 10.057s | 7.919s | 8.060s | 9.557s | 8.729s |
>>>>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>> NFS v4.2:
>>>>>>>>>>> Trial
>>>>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>>>>>> | | 1 | 2 | 3 | 4 | 5 | Average |
>>>>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>>>>>> | Data | 6.707s | 7.070s | 6.722s | 6.761s | 6.810s | 6.814s |
>>>>>>>>>>> | Hole | 5.152s | 5.149s | 5.213s | 5.206s | 5.312s | 5.206s |
>>>>>>>>>>> | Mixed | 7.979s | 7.985s | 8.177s | 7.772s | 8.280s | 8.039s |
>>>>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>> #######################
>>>>>>>>>>> # #
>>>>>>>>>>> # With READ_PLUS #
>>>>>>>>>>> # #
>>>>>>>>>>> #######################
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>> NFS v4.1:
>>>>>>>>>>> Trial
>>>>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>>>>>> | | 1 | 2 | 3 | 4 | 5 | Average |
>>>>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>>>>>> | Data | 9.082s | 7.008s | 7.116s | 6.771s | 7.902s | 7.576s |
>>>>>>>>>>> | Hole | 5.333s | 5.358s | 5.380s | 5.161s | 5.282s | 5.303s |
>>>>>>>>>>> | Mixed | 8.189s | 8.308s | 9.540s | 7.937s | 8.420s | 8.479s |
>>>>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>> NFS v4.2:
>>>>>>>>>>> Trial
>>>>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>>>>>> | | 1 | 2 | 3 | 4 | 5 | Average |
>>>>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>>>>>> | Data | 7.033s | 6.829s | 7.025s | 6.873s | 7.134s | 6.979s |
>>>>>>>>>>> | Hole | 1.794s | 1.800s | 1.905s | 1.811s | 1.725s | 1.807s |
>>>>>>>>>>> | Mixed | 7.590s | 8.777s | 9.423s | 10.366s | 8.024s | 8.836s |
>>>>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>> So there is a clear win in the 100% hole case here, but otherwise the
>>>>>>>>>> statistical fluctuations are dominating the numbers. Can you get us a
>>>>>>>>>> little more stats and then perhaps run the results through nfsometer?
>>>>>>>>>
>>>>>>>>> Also, could you describe the setup (are these still kvm's), and how
>>>>>>>>> you're clearing the cache between runs?
>>>>>>>>
>>>>>>>> These are still KVMs and my server is exporting an xfs filesystem. I clear caches by running "echo 3 > /proc/sys/vm/drop_caches" on the server before every read, and I remount my client after reading each set of three files once.
>>>>>>>
>>>>>>> I agree that you have to use the 'drop_caches' interface on the
>>>>>>> server, but why not just use O_DIRECT on the clients?
>>>>>>
>>>>>> I've been reading by using cat from my test shell script: `time cat /nfs/file > /dev/null`. I can write something to read files with O_DIRECT if that would be more useful!
>>>>>>
>>>>>
>>>>> 'dd' can do that for you if the appropriate incantations are performed.
>>>>>
>>>>
>>>
>>>
>>>


2015-03-27 21:08:39

by J. Bruce Fields

[permalink] [raw]
Subject: Re: [PATCH v3 3/3] NFSD: Add support for encoding multiple segments

On Fri, Mar 27, 2015 at 04:55:26PM -0400, Anna Schumaker wrote:
> On 03/27/2015 04:54 PM, J. Bruce Fields wrote:
> > On Fri, Mar 27, 2015 at 04:46:55PM -0400, Anna Schumaker wrote:
> >> On 03/27/2015 04:22 PM, Trond Myklebust wrote:
> >>> On Fri, Mar 27, 2015 at 3:04 PM, Anna Schumaker
> >>> <[email protected]> wrote:
> >>>> I did two separate dd tests with the same 5G file from yesterday, and still using the same virtual machines. First, I ran dd using direct IO for reads:
> >>>> dd if=/nfs/file iflag=direct of=/dev/null bs=128K
> >>>>
> >>>> Mixed file performance was awful, so I reran without direct IO enabled for comparison:
> >>>> dd if=/nfs/file iflag=nocache of=/dev/null oflag=nocache bs=128K
> >>>>
> >>>> bs=128K sets the block size used by dd to the NFS rsize, without this dd will only read 512 bytes at a time and take forever to complete.
> >>>>
> >>>>
> >>>> ##########################
> >>>> # #
> >>>> # Without READ_PLUS #
> >>>> # #
> >>>> ##########################
> >>>>
> >>>>
> >>>> NFS v4.1, iflag=direct:
> >>>> Trial
> >>>> |---------|---------|---------|---------|---------|---------|---------|
> >>>> | | 1 | 2 | 3 | 4 | 5 | Average |
> >>>> |---------|---------|---------|---------|---------|---------|---------|
> >>>> | Data | 11.704s | 11.055s | 11.329s | 11.453s | 10.741s | 11.256s |
> >>>> | Hole | 9.839s | 9.326s | 9.381s | 9.430s | 8.875s | 9.370s |
> >>>> | Mixed | 19.150s | 19.468s | 18.650s | 18.537s | 19.312s | 19.023s |
> >>>> |---------|---------|---------|---------|---------|---------|---------|
> >>>>
> >>>>
> >>>> NFS v4.2, iflag=direct:
> >>>> Trial
> >>>> |---------|---------|---------|---------|---------|---------|---------|
> >>>> | | 1 | 2 | 3 | 4 | 5 | Average |
> >>>> |---------|---------|---------|---------|---------|---------|---------|
> >>>> | Data | 10.927s | 10.885s | 11.114s | 11.283s | 10.371s | 10.916s |
> >>>> | Hole | 9.515s | 9.039s | 9.116s | 8.867s | 8.905s | 9.088s |
> >>>> | Mixed | 19.149s | 18.656s | 19.400s | 18.834s | 20.041s | 19.216s |
> >>>> |---------|---------|---------|---------|---------|---------|---------|
> >>>>
> >>>>
> >>>>
> >>>>
> >>>> NFS v4.1, iflag=nocache oflag=nocache:
> >>>> Trial
> >>>> |---------|---------|---------|---------|---------|---------|---------|
> >>>> | | 1 | 2 | 3 | 4 | 5 | Average |
> >>>> |---------|---------|---------|---------|---------|---------|---------|
> >>>> | Data | 6.808s | 6.698s | 7.482s | 6.761s | 7.235s | 6.995s |
> >>>> | Hole | 5.350s | 5.148s | 5.161s | 5.070s | 5.089s | 5.164s |
> >>>> | Mixed | 9.316s | 8.731s | 9.072s | 9.145s | 8.627s | 8.978s |
> >>>> |---------|---------|---------|---------|---------|---------|---------|
> >>>>
> >>>>
> >>>> NFS v4.2, iflag=nocache oflag=nocache:
> >>>> Trial
> >>>> |---------|---------|---------|---------|---------|---------|---------|
> >>>> | | 1 | 2 | 3 | 4 | 5 | Average |
> >>>> |---------|---------|---------|---------|---------|---------|---------|
> >>>> | Data | 6.686s | 6.848s | 6.876s | 6.799s | 7.815s | 7.004s |
> >>>> | Hole | 5.092s | 5.330s | 5.050s | 5.280s | 5.030s | 5.156s |
> >>>> | Mixed | 8.142s | 7.897s | 8.040s | 7.960s | 8.050s | 8.018s |
> >>>> |---------|---------|---------|---------|---------|---------|---------|
> >>>>
> >>>>
> >>>>
> >>>>
> >>>>
> >>>>
> >>>> #######################
> >>>> # #
> >>>> # With READ_PLUS #
> >>>> # #
> >>>> #######################
> >>>>
> >>>>
> >>>> NFS v4.1, iflag=direct:
> >>>> Trial
> >>>> |---------|---------|---------|---------|---------|---------|---------|
> >>>> | | 1 | 2 | 3 | 4 | 5 | Average |
> >>>> |---------|---------|---------|---------|---------|---------|---------|
> >>>> | Data | 9.464s | 10.181s | 10.048s | 9.452s | 10.795s | 9.988s |
> >>>> | Hole | 7.954s | 8.486s | 7.762s | 7.969s | 8.299s | 8.094s |
> >>>> | Mixed | 19.037s | 18.323s | 18.965s | 18.156s | 19.185s | 18.733s |
> >>>> |---------|---------|---------|---------|---------|---------|---------|
> >>>>
> >>>>
> >>>> NFS v4.2, iflag=direct:
> >>>> Trial
> >>>> |---------|---------|---------|---------|---------|---------|---------|
> >>>> | | 1 | 2 | 3 | 4 | 5 | Average |
> >>>> |---------|---------|---------|---------|---------|---------|---------|
> >>>> | Data | 11.923s | 10.026s | 10.222s | 12.387s | 11.431s | 11.198s |
> >>>> | Hole | 3.247s | 3.155s | 3.191s | 3.243s | 3.202s | 3.208s |
> >>>> | Mixed | 54.677s | 54.697s | 52.978s | 53.704s | 54.054s | 54.022s |
> >>>
> >>> That's a bit nasty. Any idea what is going on with the Mixed case here?
> >>
> >> Not offhand, but my first guess would be something to do with extra seeks to find how long each hole and data segment is.
> >
> > Remind us what "mixed" means? (I think you were alternating, but how
> > large is each segment?)
>
> "Mixed" is alternating 4K segments.

So it's probably doing 128/4 = 32 reads where previously one was
necessary. You could confirm that by looking at the READ counts in
/proc/self/mountstats. With O_DIRECT turned off, maybe that's hidden by
readahead?
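
For reference, here is a minimal sketch of how those counters could be
pulled out. It is not something posted in this thread; it only assumes
the usual /proc/self/mountstats layout, where each mount begins with a
"device ..." line and each per-op line starts with the operation name
followed by a colon.

#include <stdio.h>
#include <string.h>

int main(void)
{
        FILE *f = fopen("/proc/self/mountstats", "r");
        char line[1024];

        if (!f) {
                perror("fopen");
                return 1;
        }
        while (fgets(line, sizeof(line), f)) {
                /* keep the "device ..." headers so each READ line can
                 * be matched to its mount */
                if (strncmp(line, "device ", 7) == 0 ||
                    strstr(line, "READ: "))
                        fputs(line, stdout);
        }
        fclose(f);
        return 0;
}

Running this before and after a single dd pass and comparing the READ
counters (or the corresponding READ_PLUS counter, if the patched client
exposes one) would show how many read RPCs each 128K request turned into.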

--b.

>
> >
> > --b.
> >
> >>
> >> Anna
> >>
> >>>
> >>>> |---------|---------|---------|---------|---------|---------|---------|
> >>>>
> >>>>
> >>>>
> >>>>
> >>>> NFS v4.1, iflag=nocache oflag=nocache:
> >>>> Trial
> >>>> |---------|---------|---------|---------|---------|---------|---------|
> >>>> | | 1 | 2 | 3 | 4 | 5 | Average |
> >>>> |---------|---------|---------|---------|---------|---------|---------|
> >>>> | Data | 6.788s | 6.802s | 6.750s | 6.756s | 6.852s | 6.790s |
> >>>> | Hole | 5.143s | 5.165s | 5.104s | 5.154s | 5.116s | 5.136s |
> >>>> | Mixed | 7.902s | 7.693s | 9.169s | 8.186s | 9.157s | 8.421s |
> >>>> |---------|---------|---------|---------|---------|---------|---------|
> >>>>
> >>>>
> >>>> NFS v4.2, iflag=nocache oflag=nocache:
> >>>> Trial
> >>>> |---------|---------|---------|---------|---------|---------|---------|
> >>>> | | 1 | 2 | 3 | 4 | 5 | Average |
> >>>> |---------|---------|---------|---------|---------|---------|---------|
> >>>> | Data | 6.897s | 6.862s | 7.054s | 6.961s | 7.081s | 6.971s |
> >>>> | Hole | 1.690s | 1.673s | 1.553s | 1.554s | 1.490s | 1.592s |
> >>>> | Mixed | 9.009s | 7.840s | 7.661s | 8.945s | 7.649s | 8.221s |
> >>>> |---------|---------|---------|---------|---------|---------|---------|
> >>>>
> >>>>
> >>>> On 03/26/2015 12:13 PM, Trond Myklebust wrote:
> >>>>> On Thu, Mar 26, 2015 at 12:11 PM, Anna Schumaker
> >>>>> <[email protected]> wrote:
> >>>>>> On 03/26/2015 12:06 PM, Trond Myklebust wrote:
> >>>>>>> On Thu, Mar 26, 2015 at 11:47 AM, Anna Schumaker
> >>>>>>> <[email protected]> wrote:
> >>>>>>>> On 03/26/2015 11:38 AM, J. Bruce Fields wrote:
> >>>>>>>>> On Thu, Mar 26, 2015 at 11:32:25AM -0400, Trond Myklebust wrote:
> >>>>>>>>>> On Thu, Mar 26, 2015 at 11:21 AM, Anna Schumaker
> >>>>>>>>>> <[email protected]> wrote:
> >>>>>>>>>>> Here are my updated numbers! I tested with files 5G in size: one 100% data, one 100% hole, and one alternating between hole and data every 4K. I collected data for both v4.1 and v4.2 with and without the READ_PLUS patches:
> >>>>>>>>>>>
> >>>>>>>>>>> ##########################
> >>>>>>>>>>> # #
> >>>>>>>>>>> # Without READ_PLUS #
> >>>>>>>>>>> # #
> >>>>>>>>>>> ##########################
> >>>>>>>>>>>
> >>>>>>>>>>>
> >>>>>>>>>>> NFS v4.1:
> >>>>>>>>>>> Trial
> >>>>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
> >>>>>>>>>>> | | 1 | 2 | 3 | 4 | 5 | Average |
> >>>>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
> >>>>>>>>>>> | Data | 8.723s | 7.243s | 8.252s | 6.997s | 6.980s | 7.639s |
> >>>>>>>>>>> | Hole | 5.271s | 5.224s | 5.060s | 4.897s | 5.321s | 5.155s |
> >>>>>>>>>>> | Mixed | 8.050s | 10.057s | 7.919s | 8.060s | 9.557s | 8.729s |
> >>>>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
> >>>>>>>>>>>
> >>>>>>>>>>>
> >>>>>>>>>>>
> >>>>>>>>>>>
> >>>>>>>>>>> NFS v4.2:
> >>>>>>>>>>> Trial
> >>>>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
> >>>>>>>>>>> | | 1 | 2 | 3 | 4 | 5 | Average |
> >>>>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
> >>>>>>>>>>> | Data | 6.707s | 7.070s | 6.722s | 6.761s | 6.810s | 6.814s |
> >>>>>>>>>>> | Hole | 5.152s | 5.149s | 5.213s | 5.206s | 5.312s | 5.206s |
> >>>>>>>>>>> | Mixed | 7.979s | 7.985s | 8.177s | 7.772s | 8.280s | 8.039s |
> >>>>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
> >>>>>>>>>>>
> >>>>>>>>>>>
> >>>>>>>>>>>
> >>>>>>>>>>>
> >>>>>>>>>>>
> >>>>>>>>>>> #######################
> >>>>>>>>>>> # #
> >>>>>>>>>>> # With READ_PLUS #
> >>>>>>>>>>> # #
> >>>>>>>>>>> #######################
> >>>>>>>>>>>
> >>>>>>>>>>>
> >>>>>>>>>>> NFS v4.1:
> >>>>>>>>>>> Trial
> >>>>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
> >>>>>>>>>>> | | 1 | 2 | 3 | 4 | 5 | Average |
> >>>>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
> >>>>>>>>>>> | Data | 9.082s | 7.008s | 7.116s | 6.771s | 7.902s | 7.576s |
> >>>>>>>>>>> | Hole | 5.333s | 5.358s | 5.380s | 5.161s | 5.282s | 5.303s |
> >>>>>>>>>>> | Mixed | 8.189s | 8.308s | 9.540s | 7.937s | 8.420s | 8.479s |
> >>>>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
> >>>>>>>>>>>
> >>>>>>>>>>>
> >>>>>>>>>>>
> >>>>>>>>>>>
> >>>>>>>>>>> NFS v4.2:
> >>>>>>>>>>> Trial
> >>>>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
> >>>>>>>>>>> | | 1 | 2 | 3 | 4 | 5 | Average |
> >>>>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
> >>>>>>>>>>> | Data | 7.033s | 6.829s | 7.025s | 6.873s | 7.134s | 6.979s |
> >>>>>>>>>>> | Hole | 1.794s | 1.800s | 1.905s | 1.811s | 1.725s | 1.807s |
> >>>>>>>>>>> | Mixed | 7.590s | 8.777s | 9.423s | 10.366s | 8.024s | 8.836s |
> >>>>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
> >>>>>>>>>>>
> >>>>>>>>>>
> >>>>>>>>>> So there is a clear win in the 100% hole case here, but otherwise the
> >>>>>>>>>> statistical fluctuations are dominating the numbers. Can you get us a
> >>>>>>>>>> little more stats and then perhaps run the results through nfsometer?
> >>>>>>>>>
> >>>>>>>>> Also, could you describe the setup (are these still kvm's), and how
> >>>>>>>>> you're clearing the cache between runs?
> >>>>>>>>
> >>>>>>>> These are still KVMs and my server is exporting an xfs filesystem. I clear caches by running "echo 3 > /proc/sys/vm/drop_caches" on the server before every read, and I remount my client after reading each set of three files once.
> >>>>>>>
> >>>>>>> I agree that you have to use the 'drop_caches' interface on the
> >>>>>>> server, but why not just use O_DIRECT on the clients?
> >>>>>>
> >>>>>> I've been reading by using cat from my test shell script: `time cat /nfs/file > /dev/null`. I can write something to read files with O_DIRECT if that would be more useful!
> >>>>>>
> >>>>>
> >>>>> 'dd' can do that for you if the appropriate incantations are performed.
> >>>>>
> >>>>
> >>>
> >>>
> >>>

2015-03-30 14:06:45

by Christoph Hellwig

[permalink] [raw]
Subject: Re: [PATCH v3 3/3] NFSD: Add support for encoding multiple segments

On Thu, Mar 26, 2015 at 11:32:25AM -0400, Trond Myklebust wrote:
> So there is a clear win in the 100% hole case here, but otherwise the
> statistical fluctuations are dominating the numbers. Can you get us a
> little more stats and then perhaps run the results through nfsometer?

And that's just the uncached reads if I understand the thread correctly.
The cached case isn't uncommon in real life, so regressing it isn't
really an option either.

2015-04-15 19:32:06

by Anna Schumaker

[permalink] [raw]
Subject: Re: [PATCH v3 3/3] NFSD: Add support for encoding multiple segments

I just ran some more tests comparing the direct I/O case across different filesystem types. These tests used three 1G files: 100% data, 100% hole, and a mixed file alternating between 4K data and 4K hole segments. The mixed case is consistently slower than NFS v4.1, and I'm at a loss for anything I could do to make it faster. Here are my numbers:
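
For context, a file with the layout described above can be produced with
something like the sketch below. This is not the script used to generate
these numbers; the output path and fill byte are placeholders, and the 1G
size and 4K data / 4K hole pattern are taken from the description above.

#define _GNU_SOURCE
#include <fcntl.h>
#include <string.h>
#include <unistd.h>
#include <err.h>

#define BLK   4096L
#define FSIZE (1024L * 1024L * 1024L)   /* 1G, as in these tests */

int main(void)
{
        char buf[BLK];
        off_t off;
        int fd;

        /* "/export/mixed" is a placeholder path */
        fd = open("/export/mixed", O_WRONLY | O_CREAT | O_TRUNC, 0644);
        if (fd == -1)
                err(1, "open");
        memset(buf, 0xaa, sizeof(buf));         /* arbitrary non-zero data */

        for (off = 0; off < FSIZE; off += 2 * BLK) {
                if (pwrite(fd, buf, BLK, off) != BLK)   /* 4K of data */
                        err(1, "pwrite");
                /* the following 4K is never written and stays a hole */
        }
        if (ftruncate(fd, FSIZE))       /* keep the trailing 4K as a hole */
                err(1, "ftruncate");
        close(fd);
        return 0;
}

Writing every other 4K block and truncating to the full size leaves the
unwritten blocks as holes on any filesystem that supports sparse files.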

###########
# #
# XFS #
# #
###########


NFS v4.1:
Trial
|---------|---------|---------|---------|---------|---------|---------|
| | 1 | 2 | 3 | 4 | 5 | Average |
|---------|---------|---------|---------|---------|---------|---------|
| Data | 1.883s | 1.808s | 1.781s | 1.685s | 1.591s | 1.746s |
| Hole | 1.815s | 1.635s | 1.682s | 1.698s | 1.653s | 1.697s |
| Mixed | 2.089s | 2.024s | 1.970s | 1.925s | 2.049s | 2.011s |
|---------|---------|---------|---------|---------|---------|---------|


NFS v4.2:
Trial
|---------|---------|---------|---------|---------|---------|---------|
| | 1 | 2 | 3 | 4 | 5 | Average |
|---------|---------|---------|---------|---------|---------|---------|
| Data | 1.849s | 1.879s | 1.852s | 1.799s | 1.781s | 1.832s |
| Hole | 0.668s | 0.600s | 0.611s | 0.619s | 0.617s | 0.623s |
| Mixed | 5.913s | 5.811s | 5.952s | 5.962s | 5.806s | 5.889s |
|---------|---------|---------|---------|---------|---------|---------|





############
# #
# EXT4 #
# #
############


NFS v4.1:
Trial
|---------|---------|---------|---------|---------|---------|---------|
| | 1 | 2 | 3 | 4 | 5 | Average |
|---------|---------|---------|---------|---------|---------|---------|
| Data | 2.637s | 1.823s | 1.792s | 1.816s | 2.000s | 2.014s |
| Hole | 1.734s | 1.743s | 1.709s | 1.761s | 1.871s | 1.764s |
| Mixed | 5.465s | 2.158s | 2.254s | 2.676s | 2.422s | 2.995s |
|---------|---------|---------|---------|---------|---------|---------|


NFS v4.2:
Trial
|---------|---------|---------|---------|---------|---------|---------|
| | 1 | 2 | 3 | 4 | 5 | Average |
|---------|---------|---------|---------|---------|---------|---------|
| Data | 1.934s | 1.783s | 1.800s | 2.010s | 1.982s | 1.902s |
| Hole | 63.568s | 63.423s | 64.671s | 66.190s | 65.985s | 64.767s |
| Mixed | 6.010s | 5.798s | 6.146s | 6.460s | 6.720s | 6.225s |
|---------|---------|---------|---------|---------|---------|---------|





#############
# #
# BTRFS #
# #
#############


NFS v4.1:
Trial
|---------|---------|---------|---------|---------|---------|---------|
| | 1 | 2 | 3 | 4 | 5 | Average |
|---------|---------|---------|---------|---------|---------|---------|
| Data | 2.386s | 1.952s | 1.832s | 1.818s | 1.826s | 1.963s |
| Hole | 1.759s | 1.717s | 1.754s | 1.621s | 1.708s | 1.712s |
| Mixed | 2.889s | 2.272s | 2.778s | 2.277s | 2.255s | 2.494s |
|---------|---------|---------|---------|---------|---------|---------|


NFS v4.2:
Trial
|---------|---------|---------|---------|---------|---------|---------|
| | 1 | 2 | 3 | 4 | 5 | Average |
|---------|---------|---------|---------|---------|---------|---------|
| Data | 2.586s | 1.816s | 2.022s | 1.862s | 1.975s | 2.052s |
| Hole | 0.646s | 0.659s | 0.669s | 0.628s | 0.605s | 0.641s |
| Mixed | 8.555s | 8.553s | 7.904s | 8.567s | 8.286s | 8.373s |
|---------|---------|---------|---------|---------|---------|---------|


On 03/27/2015 05:08 PM, J. Bruce Fields wrote:
> On Fri, Mar 27, 2015 at 04:55:26PM -0400, Anna Schumaker wrote:
>> On 03/27/2015 04:54 PM, J. Bruce Fields wrote:
>>> On Fri, Mar 27, 2015 at 04:46:55PM -0400, Anna Schumaker wrote:
>>>> On 03/27/2015 04:22 PM, Trond Myklebust wrote:
>>>>> On Fri, Mar 27, 2015 at 3:04 PM, Anna Schumaker
>>>>> <[email protected]> wrote:
>>>>>> I did two separate dd tests with the same 5G file from yesterday, and still using the same virtual machines. First, I ran dd using direct IO for reads:
>>>>>> dd if=/nfs/file iflag=direct of=/dev/null bs=128K
>>>>>>
>>>>>> Mixed file performance was awful, so I reran without direct IO enabled for comparison:
>>>>>> dd if=/nfs/file iflag=nocache of=/dev/null oflag=nocache bs=128K
>>>>>>
>>>>>> bs=128K sets the block size used by dd to the NFS rsize, without this dd will only read 512 bytes at a time and take forever to complete.
>>>>>>
>>>>>>
>>>>>> ##########################
>>>>>> # #
>>>>>> # Without READ_PLUS #
>>>>>> # #
>>>>>> ##########################
>>>>>>
>>>>>>
>>>>>> NFS v4.1, iflag=direct:
>>>>>> Trial
>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>> | | 1 | 2 | 3 | 4 | 5 | Average |
>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>> | Data | 11.704s | 11.055s | 11.329s | 11.453s | 10.741s | 11.256s |
>>>>>> | Hole | 9.839s | 9.326s | 9.381s | 9.430s | 8.875s | 9.370s |
>>>>>> | Mixed | 19.150s | 19.468s | 18.650s | 18.537s | 19.312s | 19.023s |
>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>
>>>>>>
>>>>>> NFS v4.2, iflag=direct:
>>>>>> Trial
>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>> | | 1 | 2 | 3 | 4 | 5 | Average |
>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>> | Data | 10.927s | 10.885s | 11.114s | 11.283s | 10.371s | 10.916s |
>>>>>> | Hole | 9.515s | 9.039s | 9.116s | 8.867s | 8.905s | 9.088s |
>>>>>> | Mixed | 19.149s | 18.656s | 19.400s | 18.834s | 20.041s | 19.216s |
>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>
>>>>>>
>>>>>>
>>>>>>
>>>>>> NFS v4.1, iflag=nocache oflag=nocache:
>>>>>> Trial
>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>> | | 1 | 2 | 3 | 4 | 5 | Average |
>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>> | Data | 6.808s | 6.698s | 7.482s | 6.761s | 7.235s | 6.995s |
>>>>>> | Hole | 5.350s | 5.148s | 5.161s | 5.070s | 5.089s | 5.164s |
>>>>>> | Mixed | 9.316s | 8.731s | 9.072s | 9.145s | 8.627s | 8.978s |
>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>
>>>>>>
>>>>>> NFS v4.2, iflag=nocache oflag=nocache:
>>>>>> Trial
>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>> | | 1 | 2 | 3 | 4 | 5 | Average |
>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>> | Data | 6.686s | 6.848s | 6.876s | 6.799s | 7.815s | 7.004s |
>>>>>> | Hole | 5.092s | 5.330s | 5.050s | 5.280s | 5.030s | 5.156s |
>>>>>> | Mixed | 8.142s | 7.897s | 8.040s | 7.960s | 8.050s | 8.018s |
>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>
>>>>>>
>>>>>>
>>>>>>
>>>>>>
>>>>>>
>>>>>> #######################
>>>>>> # #
>>>>>> # With READ_PLUS #
>>>>>> # #
>>>>>> #######################
>>>>>>
>>>>>>
>>>>>> NFS v4.1, iflag=direct:
>>>>>> Trial
>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>> | | 1 | 2 | 3 | 4 | 5 | Average |
>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>> | Data | 9.464s | 10.181s | 10.048s | 9.452s | 10.795s | 9.988s |
>>>>>> | Hole | 7.954s | 8.486s | 7.762s | 7.969s | 8.299s | 8.094s |
>>>>>> | Mixed | 19.037s | 18.323s | 18.965s | 18.156s | 19.185s | 18.733s |
>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>
>>>>>>
>>>>>> NFS v4.2, iflag=direct:
>>>>>> Trial
>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>> | | 1 | 2 | 3 | 4 | 5 | Average |
>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>> | Data | 11.923s | 10.026s | 10.222s | 12.387s | 11.431s | 11.198s |
>>>>>> | Hole | 3.247s | 3.155s | 3.191s | 3.243s | 3.202s | 3.208s |
>>>>>> | Mixed | 54.677s | 54.697s | 52.978s | 53.704s | 54.054s | 54.022s |
>>>>>
>>>>> That's a bit nasty. Any idea what is going on with the Mixed case here?
>>>>
>>>> Not offhand, but my first guess would be something to do with extra seeks to find how long each hole and data segment is.
>>>
>>> Remind us what "mixed" means? (I think you were alternating, but how
>>> large is each segment?)
>>
>> "Mixed" is alternating 4K segments.
>
> So it's probably doing 128/4 = 32 reads where previously one was
> necessary. You could confirm that by looking at the READ counts in
> /proc/self/mountstats. With odirect turned off maybe that's hidden by
> readahead?
>
> --b.
>
>>
>>>
>>> --b.
>>>
>>>>
>>>> Anna
>>>>
>>>>>
>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>
>>>>>>
>>>>>>
>>>>>>
>>>>>> NFS v4.1, iflag=nocache oflag=nocache:
>>>>>> Trial
>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>> | | 1 | 2 | 3 | 4 | 5 | Average |
>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>> | Data | 6.788s | 6.802s | 6.750s | 6.756s | 6.852s | 6.790s |
>>>>>> | Hole | 5.143s | 5.165s | 5.104s | 5.154s | 5.116s | 5.136s |
>>>>>> | Mixed | 7.902s | 7.693s | 9.169s | 8.186s | 9.157s | 8.421s |
>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>
>>>>>>
>>>>>> NFS v4.2, iflag=nocache oflag=nocache:
>>>>>> Trial
>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>> | | 1 | 2 | 3 | 4 | 5 | Average |
>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>> | Data | 6.897s | 6.862s | 7.054s | 6.961s | 7.081s | 6.971s |
>>>>>> | Hole | 1.690s | 1.673s | 1.553s | 1.554s | 1.490s | 1.592s |
>>>>>> | Mixed | 9.009s | 7.840s | 7.661s | 8.945s | 7.649s | 8.221s |
>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>
>>>>>>
>>>>>> On 03/26/2015 12:13 PM, Trond Myklebust wrote:
>>>>>>> On Thu, Mar 26, 2015 at 12:11 PM, Anna Schumaker
>>>>>>> <[email protected]> wrote:
>>>>>>>> On 03/26/2015 12:06 PM, Trond Myklebust wrote:
>>>>>>>>> On Thu, Mar 26, 2015 at 11:47 AM, Anna Schumaker
>>>>>>>>> <[email protected]> wrote:
>>>>>>>>>> On 03/26/2015 11:38 AM, J. Bruce Fields wrote:
>>>>>>>>>>> On Thu, Mar 26, 2015 at 11:32:25AM -0400, Trond Myklebust wrote:
>>>>>>>>>>>> On Thu, Mar 26, 2015 at 11:21 AM, Anna Schumaker
>>>>>>>>>>>> <[email protected]> wrote:
>>>>>>>>>>>>> Here are my updated numbers! I tested with files 5G in size: one 100% data, one 100% hole, and one alternating between hole and data every 4K. I collected data for both v4.1 and v4.2 with and without the READ_PLUS patches:
>>>>>>>>>>>>>
>>>>>>>>>>>>> ##########################
>>>>>>>>>>>>> # #
>>>>>>>>>>>>> # Without READ_PLUS #
>>>>>>>>>>>>> # #
>>>>>>>>>>>>> ##########################
>>>>>>>>>>>>>
>>>>>>>>>>>>>
>>>>>>>>>>>>> NFS v4.1:
>>>>>>>>>>>>> Trial
>>>>>>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>>>>>>>> | | 1 | 2 | 3 | 4 | 5 | Average |
>>>>>>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>>>>>>>> | Data | 8.723s | 7.243s | 8.252s | 6.997s | 6.980s | 7.639s |
>>>>>>>>>>>>> | Hole | 5.271s | 5.224s | 5.060s | 4.897s | 5.321s | 5.155s |
>>>>>>>>>>>>> | Mixed | 8.050s | 10.057s | 7.919s | 8.060s | 9.557s | 8.729s |
>>>>>>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>>>>>>>>
>>>>>>>>>>>>>
>>>>>>>>>>>>>
>>>>>>>>>>>>>
>>>>>>>>>>>>> NFS v4.2:
>>>>>>>>>>>>> Trial
>>>>>>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>>>>>>>> | | 1 | 2 | 3 | 4 | 5 | Average |
>>>>>>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>>>>>>>> | Data | 6.707s | 7.070s | 6.722s | 6.761s | 6.810s | 6.814s |
>>>>>>>>>>>>> | Hole | 5.152s | 5.149s | 5.213s | 5.206s | 5.312s | 5.206s |
>>>>>>>>>>>>> | Mixed | 7.979s | 7.985s | 8.177s | 7.772s | 8.280s | 8.039s |
>>>>>>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>>>>>>>>
>>>>>>>>>>>>>
>>>>>>>>>>>>>
>>>>>>>>>>>>>
>>>>>>>>>>>>>
>>>>>>>>>>>>> #######################
>>>>>>>>>>>>> # #
>>>>>>>>>>>>> # With READ_PLUS #
>>>>>>>>>>>>> # #
>>>>>>>>>>>>> #######################
>>>>>>>>>>>>>
>>>>>>>>>>>>>
>>>>>>>>>>>>> NFS v4.1:
>>>>>>>>>>>>> Trial
>>>>>>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>>>>>>>> | | 1 | 2 | 3 | 4 | 5 | Average |
>>>>>>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>>>>>>>> | Data | 9.082s | 7.008s | 7.116s | 6.771s | 7.902s | 7.576s |
>>>>>>>>>>>>> | Hole | 5.333s | 5.358s | 5.380s | 5.161s | 5.282s | 5.303s |
>>>>>>>>>>>>> | Mixed | 8.189s | 8.308s | 9.540s | 7.937s | 8.420s | 8.479s |
>>>>>>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>>>>>>>>
>>>>>>>>>>>>>
>>>>>>>>>>>>>
>>>>>>>>>>>>>
>>>>>>>>>>>>> NFS v4.2:
>>>>>>>>>>>>> Trial
>>>>>>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>>>>>>>> | | 1 | 2 | 3 | 4 | 5 | Average |
>>>>>>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>>>>>>>> | Data | 7.033s | 6.829s | 7.025s | 6.873s | 7.134s | 6.979s |
>>>>>>>>>>>>> | Hole | 1.794s | 1.800s | 1.905s | 1.811s | 1.725s | 1.807s |
>>>>>>>>>>>>> | Mixed | 7.590s | 8.777s | 9.423s | 10.366s | 8.024s | 8.836s |
>>>>>>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>>>>>>>>
>>>>>>>>>>>>
>>>>>>>>>>>> So there is a clear win in the 100% hole case here, but otherwise the
>>>>>>>>>>>> statistical fluctuations are dominating the numbers. Can you get us a
>>>>>>>>>>>> little more stats and then perhaps run the results through nfsometer?
>>>>>>>>>>>
>>>>>>>>>>> Also, could you describe the setup (are these still kvm's), and how
>>>>>>>>>>> you're clearing the cache between runs?
>>>>>>>>>>
>>>>>>>>>> These are still KVMs and my server is exporting an xfs filesystem. I clear caches by running "echo 3 > /proc/sys/vm/drop_caches" on the server before every read, and I remount my client after reading each set of three files once.
>>>>>>>>>
>>>>>>>>> I agree that you have to use the 'drop_caches' interface on the
>>>>>>>>> server, but why not just use O_DIRECT on the clients?
>>>>>>>>
>>>>>>>> I've been reading by using cat from my test shell script: `time cat /nfs/file > /dev/null`. I can write something to read files with O_DIRECT if that would be more useful!
>>>>>>>>
>>>>>>>
>>>>>>> 'dd' can do that for you if the appropriate incantations are performed.
>>>>>>>
>>>>>>
>>>>>
>>>>>
>>>>>


2015-04-15 19:56:16

by J. Bruce Fields

[permalink] [raw]
Subject: Re: [PATCH v3 3/3] NFSD: Add support for encoding multiple segments

On Wed, Apr 15, 2015 at 03:32:02PM -0400, Anna Schumaker wrote:
> I just ran some more tests comparing the directio case across
> different filesystem types. These tests used three 1G files: 100%
> data, 100% hole, and mixed file with alternating 4k data and hole
> segments. The mixed case seems to be consistently slower compared to
> NFS v4.1, and I'm at a loss for anything I could do to make it faster.
> Here are my numbers:

Have you tried the implementation we discussed that always returns a
single segment covering the whole requested range, by treating holes as
data if necessary when they don't cover the whole range?
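
To make that concrete, here is a rough sketch of the decision such a
scheme implies; classify_range() and the segment names are hypothetical,
not the nfsd code under discussion, and error handling is omitted. The
reply is a single HOLE segment only when a hole covers the entire
requested range (or everything from the offset to EOF); otherwise the
whole range goes back as one DATA segment, with any embedded holes read
as zeroes.

#define _GNU_SOURCE
#include <sys/types.h>
#include <unistd.h>
#include <errno.h>

enum seg_type { SEG_DATA, SEG_HOLE };

/* How a single-segment READ_PLUS reply could describe the byte range
 * [offset, offset + count) of an open file.
 */
static enum seg_type classify_range(int fd, off_t offset, off_t count)
{
        off_t data = lseek(fd, offset, SEEK_DATA);

        if (data == (off_t)-1 && errno == ENXIO)
                return SEG_HOLE;        /* trailing hole (or past EOF) */
        if (data != (off_t)-1 && data >= offset + count)
                return SEG_HOLE;        /* hole covers the whole request */
        return SEG_DATA;                /* data or mixed: send it all as DATA */
}

The client then never sees more than one segment per reply, at the cost
of shipping zeroes over the wire for layouts like the 4K mixed file.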

(Also: I assume it's the same as before, but: when you post test
results, could you repost if necessary:

- what the actual test is
- what the hardware/software setup is on client and server

so that we have reproducible results for posterity's sake.)

Interesting that "Mixed" is a little slower even before READ_PLUS.

And I guess we should really report this to ext4 people, looks like they
may have a bug.

--b.

>
> ###########
> # #
> # XFS #
> # #
> ###########
>
>
> NFS v4.1:
> Trial
> |---------|---------|---------|---------|---------|---------|---------|
> | | 1 | 2 | 3 | 4 | 5 | Average |
> |---------|---------|---------|---------|---------|---------|---------|
> | Data | 1.883s | 1.808s | 1.781s | 1.685s | 1.591s | 1.746s |
> | Hole | 1.815s | 1.635s | 1.682s | 1.698s | 1.653s | 1.697s |
> | Mixed | 2.089s | 2.024s | 1.970s | 1.925s | 2.049s | 2.011s |
> |---------|---------|---------|---------|---------|---------|---------|
>
>
> NFS v4.2:
> Trial
> |---------|---------|---------|---------|---------|---------|---------|
> | | 1 | 2 | 3 | 4 | 5 | Average |
> |---------|---------|---------|---------|---------|---------|---------|
> | Data | 1.849s | 1.879s | 1.852s | 1.799s | 1.781s | 1.832s |
> | Hole | 0.668s | 0.600s | 0.611s | 0.619s | 0.617s | 0.623s |
> | Mixed | 5.913s | 5.811s | 5.952s | 5.962s | 5.806s | 5.889s |
> |---------|---------|---------|---------|---------|---------|---------|
>
>
>
>
>
> ############
> # #
> # EXT4 #
> # #
> ############
>
>
> NFS v4.1:
> Trial
> |---------|---------|---------|---------|---------|---------|---------|
> | | 1 | 2 | 3 | 4 | 5 | Average |
> |---------|---------|---------|---------|---------|---------|---------|
> | Data | 2.637s | 1.823s | 1.792s | 1.816s | 2.000s | 2.014s |
> | Hole | 1.734s | 1.743s | 1.709s | 1.761s | 1.871s | 1.764s |
> | Mixed | 5.465s | 2.158s | 2.254s | 2.676s | 2.422s | 2.995s |
> |---------|---------|---------|---------|---------|---------|---------|
>
>
> NFS v4.2:
> Trial
> |---------|---------|---------|---------|---------|---------|---------|
> | | 1 | 2 | 3 | 4 | 5 | Average |
> |---------|---------|---------|---------|---------|---------|---------|
> | Data | 1.934s | 1.783s | 1.800s | 2.010s | 1.982s | 1.902s |
> | Hole | 63.568s | 63.423s | 64.671s | 66.190s | 65.985s | 64.767s |
> | Mixed | 6.010s | 5.798s | 6.146s | 6.460s | 6.720s | 6.225s |
> |---------|---------|---------|---------|---------|---------|---------|
>
>
>
>
>
> #############
> # #
> # BTRFS #
> # #
> #############
>
>
> NFS v4.1:
> Trial
> |---------|---------|---------|---------|---------|---------|---------|
> | | 1 | 2 | 3 | 4 | 5 | Average |
> |---------|---------|---------|---------|---------|---------|---------|
> | Data | 2.386s | 1.952s | 1.832s | 1.818s | 1.826s | 1.963s |
> | Hole | 1.759s | 1.717s | 1.754s | 1.621s | 1.708s | 1.712s |
> | Mixed | 2.889s | 2.272s | 2.778s | 2.277s | 2.255s | 2.494s |
> |---------|---------|---------|---------|---------|---------|---------|
>
>
> NFS v4.2:
> Trial
> |---------|---------|---------|---------|---------|---------|---------|
> | | 1 | 2 | 3 | 4 | 5 | Average |
> |---------|---------|---------|---------|---------|---------|---------|
> | Data | 2.586s | 1.816s | 2.022s | 1.862s | 1.975s | 2.052s |
> | Hole | 0.646s | 0.659s | 0.669s | 0.628s | 0.605s | 0.641s |
> | Mixed | 8.555s | 8.553s | 7.904s | 8.567s | 8.286s | 8.373s |
> |---------|---------|---------|---------|---------|---------|---------|

2015-04-15 20:00:17

by J. Bruce Fields

[permalink] [raw]
Subject: Re: [PATCH v3 3/3] NFSD: Add support for encoding multiple segments

On Wed, Apr 15, 2015 at 03:56:14PM -0400, J. Bruce Fields wrote:
> On Wed, Apr 15, 2015 at 03:32:02PM -0400, Anna Schumaker wrote:
> > I just ran some more tests comparing the directio case across
> > different filesystem types. These tests used three 1G files: 100%
> > data, 100% hole, and mixed file with alternating 4k data and hole
> > segments. The mixed case seems to be consistently slower compared to
> > NFS v4.1, and I'm at a loss for anything I could do to make it faster.
> > Here are my numbers:
>
> Have you tried the implementation we discussed that always returns a
> single segment covering the whole requested range, by treating holes as
> data if necessary when they don't cover the whole range?
>
> (Also: I assume it's the same as before, but: when you post test
> results, could you repost if necessary:
>
> - what the actual test is
> - what the hardware/software setup is on client and server
>
> so that we have reproduceable results for posterity's sake.)
>
> Interesting that "Mixed" is a little slower even before READ_PLUS.
>
> And I guess we should really report this to ext4 people, looks like they
> may have a bug.

FWIW, this is what I was using to test SEEK_HOLE/SEEK_DATA and map out
holes on files on my local disk. Might be worth checking whether the
ext4 slowdowns are reproducible just with something like this, to rule
out protocol problems.

--b.

#define _GNU_SOURCE
#include <stdio.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include <errno.h>
#include <err.h>

long round_up(long n, long b)
{
        return ((n + b - 1)/b) * b;
}

long round_down(long n, long b)
{
        return (n/b) * b;
}

long hbytes = 0;
long rplusbytes = 0;
long num_holes = 0;

/* Account for one hole: total hole bytes, number of holes, and how many
 * of those bytes fall in 1MB-aligned chunks (i.e. what a sequential
 * reader working in 1MB chunks could skip entirely).
 */
static void do_stats(off_t hole_start, off_t hole_end)
{
        off_t hole_start_up, hole_end_down;

        hole_start_up = round_up(hole_start, 1024*1024);
        hole_end_down = round_down(hole_end, 1024*1024);

        hbytes += hole_end - hole_start;
        if (hole_start < hole_end)
                num_holes++;
        if (hole_start_up < hole_end_down)
                rplusbytes += hole_end_down - hole_start_up;
}

int main(int argc, char *argv[])
{
        off_t hole_start, hole_end;
        int fd;
        char *name;

        /* Map out holes with SEEK_HOLE, SEEK_DATA */
        /* Useful statistics:
         *  - what percentage of file is in holes?
         *  - what percentage of file would be skipped if we read it
         *    sequentially in 1MB chunks?
         */

        if (argc != 2)
                errx(1, "usage: %s <filename>\n", argv[0]);
        name = argv[1];
        fd = open(name, O_RDONLY);
        if (fd == -1)
                err(1, "open");

        hole_end = 0;
        while (1) {
                hole_start = lseek(fd, hole_end, SEEK_HOLE);
                if (hole_start == -1)
                        err(1, "lseek");
                hole_end = lseek(fd, hole_start, SEEK_DATA);
                if (hole_end == -1) {
                        if (errno == ENXIO)
                                break;  /* trailing hole or EOF; counted below */
                        err(1, "lseek");
                }
                do_stats(hole_start, hole_end);
        }
        hole_end = lseek(fd, 0, SEEK_END);
        do_stats(hole_start, hole_end);
        printf("%ld holes\n", num_holes);
        printf("total hole bytes: %ld (%.0f%%)\n", hbytes,
                100 * (float)hbytes/hole_end);
        printf("in aligned 1MB chunks: %ld (%.0f%%)\n", rplusbytes,
                100 * (float)rplusbytes/hole_end);
        return 0;
}

2015-04-15 22:50:29

by Dave Chinner

[permalink] [raw]
Subject: Re: [PATCH v3 3/3] NFSD: Add support for encoding multiple segments

On Wed, Apr 15, 2015 at 04:00:16PM -0400, J. Bruce Fields wrote:
> On Wed, Apr 15, 2015 at 03:56:14PM -0400, J. Bruce Fields wrote:
> > On Wed, Apr 15, 2015 at 03:32:02PM -0400, Anna Schumaker wrote:
> > > I just ran some more tests comparing the directio case across
> > > different filesystem types. These tests used three 1G files: 100%
> > > data, 100% hole, and mixed file with alternating 4k data and hole
> > > segments. The mixed case seems to be consistently slower compared to
> > > NFS v4.1, and I'm at a loss for anything I could do to make it faster.
> > > Here are my numbers:
> >
> > Have you tried the implementation we discussed that always returns a
> > single segment covering the whole requested range, by treating holes as
> > data if necessary when they don't cover the whole range?
> >
> > (Also: I assume it's the same as before, but: when you post test
> > results, could you repost if necessary:
> >
> > - what the actual test is
> > - what the hardware/software setup is on client and server
> >
> > so that we have reproduceable results for posterity's sake.)
> >
> > Interesting that "Mixed" is a little slower even before READ_PLUS.
> >
> > And I guess we should really report this to ext4 people, looks like they
> > may have a bug.
>
> FWIW, this is what I was using to test SEEK_HOLE/SEEK_DATA and map out
> holes on files on my local disk. Might be worth checking whether the
> ext4 slowdowns are reproduceable just with something like this, to rule
> out protocol problems.

Wheel reinvention. :)

$ rm -f /mnt/scratch/bar
$ for i in `seq 20 -2 0`; do
> sudo xfs_io -f -c "pwrite $((i * 8192)) 4096" /mnt/scratch/bar
> done
.....
$ sync
$ sudo xfs_io -c "seek -ar 0" /mnt/scratch/bar
Whence Result
DATA 0
HOLE 4096
DATA 16384
HOLE 20480
DATA 32768
HOLE 36864
DATA 49152
HOLE 53248
DATA 65536
HOLE 69632
DATA 81920
HOLE 86016
DATA 98304
HOLE 102400
DATA 114688
HOLE 118784
DATA 131072
HOLE 135168
DATA 147456
HOLE 151552
DATA 163840
HOLE 167936
$

-Dave.
--
Dave Chinner
[email protected]

2015-04-15 22:57:54

by Dave Chinner

[permalink] [raw]
Subject: Re: [PATCH v3 3/3] NFSD: Add support for encoding multiple segments

On Wed, Apr 15, 2015 at 03:32:02PM -0400, Anna Schumaker wrote:
> I just ran some more tests comparing the directio case across
> different filesystem types. These tests used three 1G files:
> 100% data, 100% hole, and mixed file with alternating 4k data and
> hole segments. The mixed case seems to be consistently slower
> compared to NFS v4.1, and I'm at a loss for anything I could do to
> make it faster. Here are my numbers:
>
> ###########
> # #
> # XFS #
> # #
> ###########
>
>
> NFS v4.1:
> Trial
> |---------|---------|---------|---------|---------|---------|---------|
> | | 1 | 2 | 3 | 4 | 5 | Average |
> |---------|---------|---------|---------|---------|---------|---------|
> | Data | 1.883s | 1.808s | 1.781s | 1.685s | 1.591s | 1.746s |
> | Hole | 1.815s | 1.635s | 1.682s | 1.698s | 1.653s | 1.697s |
> | Mixed | 2.089s | 2.024s | 1.970s | 1.925s | 2.049s | 2.011s |
> |---------|---------|---------|---------|---------|---------|---------|
>
>
> NFS v4.2:
> Trial
> |---------|---------|---------|---------|---------|---------|---------|
> | | 1 | 2 | 3 | 4 | 5 | Average |
> |---------|---------|---------|---------|---------|---------|---------|
> | Data | 1.849s | 1.879s | 1.852s | 1.799s | 1.781s | 1.832s |
> | Hole | 0.668s | 0.600s | 0.611s | 0.619s | 0.617s | 0.623s |
> | Mixed | 5.913s | 5.811s | 5.952s | 5.962s | 5.806s | 5.889s |
> |---------|---------|---------|---------|---------|---------|---------|

What that says to me is that READ_PLUS, in the (worst case) mixed
hole/data workload, is either burning a lot more CPU than we expected
or serialising somewhere (not sure where; everything in XFS should be
taking shared locks on read/seek).

Can you run a perf profile (even just a snapshot from perf top) on
the server so we can see a bit about what is happening on the CPU
for the different workloads?

Cheers,

Dave.
--
Dave Chinner
[email protected]

2015-04-17 22:07:11

by J. Bruce Fields

[permalink] [raw]
Subject: Re: [PATCH v3 3/3] NFSD: Add support for encoding multiple segments

On Thu, Apr 16, 2015 at 08:50:02AM +1000, Dave Chinner wrote:
> On Wed, Apr 15, 2015 at 04:00:16PM -0400, J. Bruce Fields wrote:
> > On Wed, Apr 15, 2015 at 03:56:14PM -0400, J. Bruce Fields wrote:
> > > On Wed, Apr 15, 2015 at 03:32:02PM -0400, Anna Schumaker wrote:
> > > > I just ran some more tests comparing the directio case across
> > > > different filesystem types. These tests used three 1G files: 100%
> > > > data, 100% hole, and mixed file with alternating 4k data and hole
> > > > segments. The mixed case seems to be consistently slower compared to
> > > > NFS v4.1, and I'm at a loss for anything I could do to make it faster.
> > > > Here are my numbers:
> > >
> > > Have you tried the implementation we discussed that always returns a
> > > single segment covering the whole requested range, by treating holes as
> > > data if necessary when they don't cover the whole range?

Uh, sorry, I forgot: I think you're running with the patches that
support full multi-segment READ_PLUS on both sides, so there's not the
issue of multiplying RPCs in this case.

Still, it might be interesting to compare. And it wouldn't hurt to
remind us of these details when you repost this stuff, to keep my
forgetful self from going in circles.

> > > (Also: I assume it's the same as before, but: when you post test
> > > results, could you repost if necessary:
> > >
> > > - what the actual test is
> > > - what the hardware/software setup is on client and server
> > >
> > > so that we have reproduceable results for posterity's sake.)
> > >
> > > Interesting that "Mixed" is a little slower even before READ_PLUS.
> > >
> > > And I guess we should really report this to ext4 people, looks like they
> > > may have a bug.
> >
> > FWIW, this is what I was using to test SEEK_HOLE/SEEK_DATA and map out
> > holes on files on my local disk. Might be worth checking whether the
> > ext4 slowdowns are reproduceable just with something like this, to rule
> > out protocol problems.
>
> Wheel reinvention. :)

xfs_io appears to have a lot of wheels. OK, I'll go read that man page
one of these days.

--b.

>
> $ rm -f /mnt/scratch/bar
> $ for i in `seq 20 -2 0`; do
> > sudo xfs_io -f -c "pwrite $((i * 8192)) 4096" /mnt/scratch/bar
> > done
> .....
> $ sync
> $ sudo xfs_io -c "seek -ar 0" /mnt/scratch/bar
> Whence Result
> DATA 0
> HOLE 4096
> DATA 16384
> HOLE 20480
> DATA 32768
> HOLE 36864
> DATA 49152
> HOLE 53248
> DATA 65536
> HOLE 69632
> DATA 81920
> HOLE 86016
> DATA 98304
> HOLE 102400
> DATA 114688
> HOLE 118784
> DATA 131072
> HOLE 135168
> DATA 147456
> HOLE 151552
> DATA 163840
> HOLE 167936
> $
>
> -Dave.
> --
> Dave Chinner
> [email protected]