2022-09-07 20:04:26

by Anna Schumaker

[permalink] [raw]
Subject: [PATCH v2 0/1] NFSD: Simplify READ_PLUS

From: Anna Schumaker <[email protected]>

When we left off with READ_PLUS, Chuck had suggested reverting the
server to reply with a single NFS4_CONTENT_DATA segment essentially
mimicing how the READ operation behaves. Then, a future sparse read
function can be added and the server modified to support it without
needing to rip out the old READ_PLUS code at the same time.

This patch takes that first step. I was even able to re-use the
nfsd4_encode_readv() and nfsd4_encode_splice_read() functions to
remove some duuplicate code.

- v2:
- Add splice read support
- Address Chuck's style comments
- Reword the commit message

Thanks,
Anna


Anna Schumaker (1):
NFSD: Simplify READ_PLUS

fs/nfsd/nfs4xdr.c | 139 +++++++++++-----------------------------------
1 file changed, 32 insertions(+), 107 deletions(-)

--
2.37.2


2022-09-07 20:04:46

by Anna Schumaker

[permalink] [raw]
Subject: [PATCH v2 1/1] NFSD: Simplify READ_PLUS

From: Anna Schumaker <[email protected]>

Chuck had suggested reverting READ_PLUS so it returns a single DATA
segment covering the requested read range. This prepares the server for
a future "sparse read" function so support can easily be added without
needing to rip out the old READ_PLUS code at the same time.

Signed-off-by: Anna Schumaker <[email protected]>
---
fs/nfsd/nfs4xdr.c | 139 +++++++++++-----------------------------------
1 file changed, 32 insertions(+), 107 deletions(-)

diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 1e9690a061ec..bcc8c385faf2 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -4731,79 +4731,37 @@ nfsd4_encode_offload_status(struct nfsd4_compoundres *resp, __be32 nfserr,

static __be32
nfsd4_encode_read_plus_data(struct nfsd4_compoundres *resp,
- struct nfsd4_read *read,
- unsigned long *maxcount, u32 *eof,
- loff_t *pos)
+ struct nfsd4_read *read)
{
- struct xdr_stream *xdr = resp->xdr;
+ bool splice_ok = test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags);
struct file *file = read->rd_nf->nf_file;
- int starting_len = xdr->buf->len;
- loff_t hole_pos;
- __be32 nfserr;
- __be32 *p, tmp;
- __be64 tmp64;
-
- hole_pos = pos ? *pos : vfs_llseek(file, read->rd_offset, SEEK_HOLE);
- if (hole_pos > read->rd_offset)
- *maxcount = min_t(unsigned long, *maxcount, hole_pos - read->rd_offset);
- *maxcount = min_t(unsigned long, *maxcount, (xdr->buf->buflen - xdr->buf->len));
+ struct xdr_stream *xdr = resp->xdr;
+ unsigned long maxcount;
+ __be32 nfserr, *p;

/* Content type, offset, byte count */
p = xdr_reserve_space(xdr, 4 + 8 + 4);
if (!p)
- return nfserr_resource;
+ return nfserr_io;
+ if (resp->xdr->buf->page_len && splice_ok) {
+ WARN_ON_ONCE(splice_ok);
+ return nfserr_io;
+ }

- read->rd_vlen = xdr_reserve_space_vec(xdr, resp->rqstp->rq_vec, *maxcount);
- if (read->rd_vlen < 0)
- return nfserr_resource;
+ maxcount = min_t(unsigned long, read->rd_length,
+ (xdr->buf->buflen - xdr->buf->len));

- nfserr = nfsd_readv(resp->rqstp, read->rd_fhp, file, read->rd_offset,
- resp->rqstp->rq_vec, read->rd_vlen, maxcount, eof);
+ if (file->f_op->splice_read && splice_ok)
+ nfserr = nfsd4_encode_splice_read(resp, read, file, maxcount);
+ else
+ nfserr = nfsd4_encode_readv(resp, read, file, maxcount);
if (nfserr)
return nfserr;
- xdr_truncate_encode(xdr, starting_len + 16 + xdr_align_size(*maxcount));

- tmp = htonl(NFS4_CONTENT_DATA);
- write_bytes_to_xdr_buf(xdr->buf, starting_len, &tmp, 4);
- tmp64 = cpu_to_be64(read->rd_offset);
- write_bytes_to_xdr_buf(xdr->buf, starting_len + 4, &tmp64, 8);
- tmp = htonl(*maxcount);
- write_bytes_to_xdr_buf(xdr->buf, starting_len + 12, &tmp, 4);
-
- tmp = xdr_zero;
- write_bytes_to_xdr_buf(xdr->buf, starting_len + 16 + *maxcount, &tmp,
- xdr_pad_size(*maxcount));
- return nfs_ok;
-}
-
-static __be32
-nfsd4_encode_read_plus_hole(struct nfsd4_compoundres *resp,
- struct nfsd4_read *read,
- unsigned long *maxcount, u32 *eof)
-{
- struct file *file = read->rd_nf->nf_file;
- loff_t data_pos = vfs_llseek(file, read->rd_offset, SEEK_DATA);
- loff_t f_size = i_size_read(file_inode(file));
- unsigned long count;
- __be32 *p;
-
- if (data_pos == -ENXIO)
- data_pos = f_size;
- else if (data_pos <= read->rd_offset || (data_pos < f_size && data_pos % PAGE_SIZE))
- return nfsd4_encode_read_plus_data(resp, read, maxcount, eof, &f_size);
- count = data_pos - read->rd_offset;
-
- /* Content type, offset, byte count */
- p = xdr_reserve_space(resp->xdr, 4 + 8 + 8);
- if (!p)
- return nfserr_resource;
-
- *p++ = htonl(NFS4_CONTENT_HOLE);
+ *p++ = cpu_to_be32(NFS4_CONTENT_DATA);
p = xdr_encode_hyper(p, read->rd_offset);
- p = xdr_encode_hyper(p, count);
+ *p = cpu_to_be32(read->rd_length);

- *eof = (read->rd_offset + count) >= f_size;
- *maxcount = min_t(unsigned long, count, *maxcount);
return nfs_ok;
}

@@ -4811,69 +4769,36 @@ static __be32
nfsd4_encode_read_plus(struct nfsd4_compoundres *resp, __be32 nfserr,
struct nfsd4_read *read)
{
- unsigned long maxcount, count;
+ struct file *file = read->rd_nf->nf_file;
struct xdr_stream *xdr = resp->xdr;
- struct file *file;
int starting_len = xdr->buf->len;
- int last_segment = xdr->buf->len;
- int segments = 0;
- __be32 *p, tmp;
- bool is_data;
- loff_t pos;
- u32 eof;
+ u32 segments = 0;
+ __be32 *p;

if (nfserr)
return nfserr;
- file = read->rd_nf->nf_file;

/* eof flag, segment count */
p = xdr_reserve_space(xdr, 4 + 4);
if (!p)
- return nfserr_resource;
+ return nfserr_io;
xdr_commit_encode(xdr);

- maxcount = min_t(unsigned long, read->rd_length,
- (xdr->buf->buflen - xdr->buf->len));
- count = maxcount;
-
- eof = read->rd_offset >= i_size_read(file_inode(file));
- if (eof)
+ read->rd_eof = read->rd_offset >= i_size_read(file_inode(file));
+ if (read->rd_eof)
goto out;

- pos = vfs_llseek(file, read->rd_offset, SEEK_HOLE);
- is_data = pos > read->rd_offset;
-
- while (count > 0 && !eof) {
- maxcount = count;
- if (is_data)
- nfserr = nfsd4_encode_read_plus_data(resp, read, &maxcount, &eof,
- segments == 0 ? &pos : NULL);
- else
- nfserr = nfsd4_encode_read_plus_hole(resp, read, &maxcount, &eof);
- if (nfserr)
- goto out;
- count -= maxcount;
- read->rd_offset += maxcount;
- is_data = !is_data;
- last_segment = xdr->buf->len;
- segments++;
- }
-
-out:
- if (nfserr && segments == 0)
+ nfserr = nfsd4_encode_read_plus_data(resp, read);
+ if (nfserr) {
xdr_truncate_encode(xdr, starting_len);
- else {
- if (nfserr) {
- xdr_truncate_encode(xdr, last_segment);
- nfserr = nfs_ok;
- eof = 0;
- }
- tmp = htonl(eof);
- write_bytes_to_xdr_buf(xdr->buf, starting_len, &tmp, 4);
- tmp = htonl(segments);
- write_bytes_to_xdr_buf(xdr->buf, starting_len + 4, &tmp, 4);
+ return nfserr;
}

+ segments++;
+
+out:
+ p = xdr_encode_bool(p, read->rd_eof);
+ *p = cpu_to_be32(segments);
return nfserr;
}

--
2.37.2

2022-09-07 20:43:01

by Chuck Lever III

[permalink] [raw]
Subject: Re: [PATCH v2 1/1] NFSD: Simplify READ_PLUS

Be sure to Cc: Jeff on these. Thanks!


> On Sep 7, 2022, at 3:52 PM, Anna Schumaker <[email protected]> wrote:
>
> From: Anna Schumaker <[email protected]>
>
> Chuck had suggested reverting READ_PLUS so it returns a single DATA
> segment covering the requested read range. This prepares the server for
> a future "sparse read" function so support can easily be added without
> needing to rip out the old READ_PLUS code at the same time.
>
> Signed-off-by: Anna Schumaker <[email protected]>
> ---
> fs/nfsd/nfs4xdr.c | 139 +++++++++++-----------------------------------
> 1 file changed, 32 insertions(+), 107 deletions(-)
>
> diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
> index 1e9690a061ec..bcc8c385faf2 100644
> --- a/fs/nfsd/nfs4xdr.c
> +++ b/fs/nfsd/nfs4xdr.c
> @@ -4731,79 +4731,37 @@ nfsd4_encode_offload_status(struct nfsd4_compoundres *resp, __be32 nfserr,
>
> static __be32
> nfsd4_encode_read_plus_data(struct nfsd4_compoundres *resp,
> - struct nfsd4_read *read,
> - unsigned long *maxcount, u32 *eof,
> - loff_t *pos)
> + struct nfsd4_read *read)
> {
> - struct xdr_stream *xdr = resp->xdr;
> + bool splice_ok = test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags);
> struct file *file = read->rd_nf->nf_file;
> - int starting_len = xdr->buf->len;
> - loff_t hole_pos;
> - __be32 nfserr;
> - __be32 *p, tmp;
> - __be64 tmp64;
> -
> - hole_pos = pos ? *pos : vfs_llseek(file, read->rd_offset, SEEK_HOLE);
> - if (hole_pos > read->rd_offset)
> - *maxcount = min_t(unsigned long, *maxcount, hole_pos - read->rd_offset);
> - *maxcount = min_t(unsigned long, *maxcount, (xdr->buf->buflen - xdr->buf->len));
> + struct xdr_stream *xdr = resp->xdr;
> + unsigned long maxcount;
> + __be32 nfserr, *p;
>
> /* Content type, offset, byte count */
> p = xdr_reserve_space(xdr, 4 + 8 + 4);
> if (!p)
> - return nfserr_resource;
> + return nfserr_io;

Wouldn't nfserr_rep_too_big be a more appropriate status for running
off the end of the send buffer? I'm not 100% sure, but I would expect
that exhausting send buffer space would imply the reply has grown too
large.


> + if (resp->xdr->buf->page_len && splice_ok) {
> + WARN_ON_ONCE(splice_ok);
> + return nfserr_io;
> + }

I wish I understood why this test was needed. It seems to have been
copied and pasted from historic code into nfsd4_encode_read(), and
there have been recent mechanical changes to it, but there's no
comment explaining it there...

In any event, this seems to be checking for a server software bug,
so maybe this should return nfserr_serverfault. Oddly that status
code isn't defined yet.


Do you have some performance results for v2?


> - read->rd_vlen = xdr_reserve_space_vec(xdr, resp->rqstp->rq_vec, *maxcount);
> - if (read->rd_vlen < 0)
> - return nfserr_resource;
> + maxcount = min_t(unsigned long, read->rd_length,
> + (xdr->buf->buflen - xdr->buf->len));
>
> - nfserr = nfsd_readv(resp->rqstp, read->rd_fhp, file, read->rd_offset,
> - resp->rqstp->rq_vec, read->rd_vlen, maxcount, eof);
> + if (file->f_op->splice_read && splice_ok)
> + nfserr = nfsd4_encode_splice_read(resp, read, file, maxcount);
> + else
> + nfserr = nfsd4_encode_readv(resp, read, file, maxcount);
> if (nfserr)
> return nfserr;
> - xdr_truncate_encode(xdr, starting_len + 16 + xdr_align_size(*maxcount));
>
> - tmp = htonl(NFS4_CONTENT_DATA);
> - write_bytes_to_xdr_buf(xdr->buf, starting_len, &tmp, 4);
> - tmp64 = cpu_to_be64(read->rd_offset);
> - write_bytes_to_xdr_buf(xdr->buf, starting_len + 4, &tmp64, 8);
> - tmp = htonl(*maxcount);
> - write_bytes_to_xdr_buf(xdr->buf, starting_len + 12, &tmp, 4);
> -
> - tmp = xdr_zero;
> - write_bytes_to_xdr_buf(xdr->buf, starting_len + 16 + *maxcount, &tmp,
> - xdr_pad_size(*maxcount));
> - return nfs_ok;
> -}
> -
> -static __be32
> -nfsd4_encode_read_plus_hole(struct nfsd4_compoundres *resp,
> - struct nfsd4_read *read,
> - unsigned long *maxcount, u32 *eof)
> -{
> - struct file *file = read->rd_nf->nf_file;
> - loff_t data_pos = vfs_llseek(file, read->rd_offset, SEEK_DATA);
> - loff_t f_size = i_size_read(file_inode(file));
> - unsigned long count;
> - __be32 *p;
> -
> - if (data_pos == -ENXIO)
> - data_pos = f_size;
> - else if (data_pos <= read->rd_offset || (data_pos < f_size && data_pos % PAGE_SIZE))
> - return nfsd4_encode_read_plus_data(resp, read, maxcount, eof, &f_size);
> - count = data_pos - read->rd_offset;
> -
> - /* Content type, offset, byte count */
> - p = xdr_reserve_space(resp->xdr, 4 + 8 + 8);
> - if (!p)
> - return nfserr_resource;
> -
> - *p++ = htonl(NFS4_CONTENT_HOLE);
> + *p++ = cpu_to_be32(NFS4_CONTENT_DATA);
> p = xdr_encode_hyper(p, read->rd_offset);
> - p = xdr_encode_hyper(p, count);
> + *p = cpu_to_be32(read->rd_length);
>
> - *eof = (read->rd_offset + count) >= f_size;
> - *maxcount = min_t(unsigned long, count, *maxcount);
> return nfs_ok;
> }
>
> @@ -4811,69 +4769,36 @@ static __be32
> nfsd4_encode_read_plus(struct nfsd4_compoundres *resp, __be32 nfserr,
> struct nfsd4_read *read)
> {
> - unsigned long maxcount, count;
> + struct file *file = read->rd_nf->nf_file;
> struct xdr_stream *xdr = resp->xdr;
> - struct file *file;
> int starting_len = xdr->buf->len;
> - int last_segment = xdr->buf->len;
> - int segments = 0;
> - __be32 *p, tmp;
> - bool is_data;
> - loff_t pos;
> - u32 eof;
> + u32 segments = 0;
> + __be32 *p;
>
> if (nfserr)
> return nfserr;
> - file = read->rd_nf->nf_file;
>
> /* eof flag, segment count */
> p = xdr_reserve_space(xdr, 4 + 4);
> if (!p)
> - return nfserr_resource;
> + return nfserr_io;
> xdr_commit_encode(xdr);
>
> - maxcount = min_t(unsigned long, read->rd_length,
> - (xdr->buf->buflen - xdr->buf->len));
> - count = maxcount;
> -
> - eof = read->rd_offset >= i_size_read(file_inode(file));
> - if (eof)
> + read->rd_eof = read->rd_offset >= i_size_read(file_inode(file));
> + if (read->rd_eof)
> goto out;
>
> - pos = vfs_llseek(file, read->rd_offset, SEEK_HOLE);
> - is_data = pos > read->rd_offset;
> -
> - while (count > 0 && !eof) {
> - maxcount = count;
> - if (is_data)
> - nfserr = nfsd4_encode_read_plus_data(resp, read, &maxcount, &eof,
> - segments == 0 ? &pos : NULL);
> - else
> - nfserr = nfsd4_encode_read_plus_hole(resp, read, &maxcount, &eof);
> - if (nfserr)
> - goto out;
> - count -= maxcount;
> - read->rd_offset += maxcount;
> - is_data = !is_data;
> - last_segment = xdr->buf->len;
> - segments++;
> - }
> -
> -out:
> - if (nfserr && segments == 0)
> + nfserr = nfsd4_encode_read_plus_data(resp, read);
> + if (nfserr) {
> xdr_truncate_encode(xdr, starting_len);
> - else {
> - if (nfserr) {
> - xdr_truncate_encode(xdr, last_segment);
> - nfserr = nfs_ok;
> - eof = 0;
> - }
> - tmp = htonl(eof);
> - write_bytes_to_xdr_buf(xdr->buf, starting_len, &tmp, 4);
> - tmp = htonl(segments);
> - write_bytes_to_xdr_buf(xdr->buf, starting_len + 4, &tmp, 4);
> + return nfserr;
> }
>
> + segments++;
> +
> +out:
> + p = xdr_encode_bool(p, read->rd_eof);
> + *p = cpu_to_be32(segments);
> return nfserr;
> }
>
> --
> 2.37.2
>

--
Chuck Lever



2022-09-07 20:43:04

by Anna Schumaker

[permalink] [raw]
Subject: Re: [PATCH v2 1/1] NFSD: Simplify READ_PLUS

On Wed, Sep 7, 2022 at 4:29 PM Chuck Lever III <[email protected]> wrote:
>
> Be sure to Cc: Jeff on these. Thanks!
>
>
> > On Sep 7, 2022, at 3:52 PM, Anna Schumaker <[email protected]> wrote:
> >
> > From: Anna Schumaker <[email protected]>
> >
> > Chuck had suggested reverting READ_PLUS so it returns a single DATA
> > segment covering the requested read range. This prepares the server for
> > a future "sparse read" function so support can easily be added without
> > needing to rip out the old READ_PLUS code at the same time.
> >
> > Signed-off-by: Anna Schumaker <[email protected]>
> > ---
> > fs/nfsd/nfs4xdr.c | 139 +++++++++++-----------------------------------
> > 1 file changed, 32 insertions(+), 107 deletions(-)
> >
> > diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
> > index 1e9690a061ec..bcc8c385faf2 100644
> > --- a/fs/nfsd/nfs4xdr.c
> > +++ b/fs/nfsd/nfs4xdr.c
> > @@ -4731,79 +4731,37 @@ nfsd4_encode_offload_status(struct nfsd4_compoundres *resp, __be32 nfserr,
> >
> > static __be32
> > nfsd4_encode_read_plus_data(struct nfsd4_compoundres *resp,
> > - struct nfsd4_read *read,
> > - unsigned long *maxcount, u32 *eof,
> > - loff_t *pos)
> > + struct nfsd4_read *read)
> > {
> > - struct xdr_stream *xdr = resp->xdr;
> > + bool splice_ok = test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags);
> > struct file *file = read->rd_nf->nf_file;
> > - int starting_len = xdr->buf->len;
> > - loff_t hole_pos;
> > - __be32 nfserr;
> > - __be32 *p, tmp;
> > - __be64 tmp64;
> > -
> > - hole_pos = pos ? *pos : vfs_llseek(file, read->rd_offset, SEEK_HOLE);
> > - if (hole_pos > read->rd_offset)
> > - *maxcount = min_t(unsigned long, *maxcount, hole_pos - read->rd_offset);
> > - *maxcount = min_t(unsigned long, *maxcount, (xdr->buf->buflen - xdr->buf->len));
> > + struct xdr_stream *xdr = resp->xdr;
> > + unsigned long maxcount;
> > + __be32 nfserr, *p;
> >
> > /* Content type, offset, byte count */
> > p = xdr_reserve_space(xdr, 4 + 8 + 4);
> > if (!p)
> > - return nfserr_resource;
> > + return nfserr_io;
>
> Wouldn't nfserr_rep_too_big be a more appropriate status for running
> off the end of the send buffer? I'm not 100% sure, but I would expect
> that exhausting send buffer space would imply the reply has grown too
> large.

I can switch it to that, no problem.

>
>
> > + if (resp->xdr->buf->page_len && splice_ok) {
> > + WARN_ON_ONCE(splice_ok);
> > + return nfserr_io;
> > + }
>
> I wish I understood why this test was needed. It seems to have been
> copied and pasted from historic code into nfsd4_encode_read(), and
> there have been recent mechanical changes to it, but there's no
> comment explaining it there...

Yeah, I saw this was in the read code and assumed it was an important
check so I added it here too.
>
> In any event, this seems to be checking for a server software bug,
> so maybe this should return nfserr_serverfault. Oddly that status
> code isn't defined yet.

Do you want me to add that code and return it in this patch?

>
>
> Do you have some performance results for v2?

Not yet, I have it running now so hopefully I'll have something ready
by tomorrow morning.

Anna
>
>
> > - read->rd_vlen = xdr_reserve_space_vec(xdr, resp->rqstp->rq_vec, *maxcount);
> > - if (read->rd_vlen < 0)
> > - return nfserr_resource;
> > + maxcount = min_t(unsigned long, read->rd_length,
> > + (xdr->buf->buflen - xdr->buf->len));
> >
> > - nfserr = nfsd_readv(resp->rqstp, read->rd_fhp, file, read->rd_offset,
> > - resp->rqstp->rq_vec, read->rd_vlen, maxcount, eof);
> > + if (file->f_op->splice_read && splice_ok)
> > + nfserr = nfsd4_encode_splice_read(resp, read, file, maxcount);
> > + else
> > + nfserr = nfsd4_encode_readv(resp, read, file, maxcount);
> > if (nfserr)
> > return nfserr;
> > - xdr_truncate_encode(xdr, starting_len + 16 + xdr_align_size(*maxcount));
> >
> > - tmp = htonl(NFS4_CONTENT_DATA);
> > - write_bytes_to_xdr_buf(xdr->buf, starting_len, &tmp, 4);
> > - tmp64 = cpu_to_be64(read->rd_offset);
> > - write_bytes_to_xdr_buf(xdr->buf, starting_len + 4, &tmp64, 8);
> > - tmp = htonl(*maxcount);
> > - write_bytes_to_xdr_buf(xdr->buf, starting_len + 12, &tmp, 4);
> > -
> > - tmp = xdr_zero;
> > - write_bytes_to_xdr_buf(xdr->buf, starting_len + 16 + *maxcount, &tmp,
> > - xdr_pad_size(*maxcount));
> > - return nfs_ok;
> > -}
> > -
> > -static __be32
> > -nfsd4_encode_read_plus_hole(struct nfsd4_compoundres *resp,
> > - struct nfsd4_read *read,
> > - unsigned long *maxcount, u32 *eof)
> > -{
> > - struct file *file = read->rd_nf->nf_file;
> > - loff_t data_pos = vfs_llseek(file, read->rd_offset, SEEK_DATA);
> > - loff_t f_size = i_size_read(file_inode(file));
> > - unsigned long count;
> > - __be32 *p;
> > -
> > - if (data_pos == -ENXIO)
> > - data_pos = f_size;
> > - else if (data_pos <= read->rd_offset || (data_pos < f_size && data_pos % PAGE_SIZE))
> > - return nfsd4_encode_read_plus_data(resp, read, maxcount, eof, &f_size);
> > - count = data_pos - read->rd_offset;
> > -
> > - /* Content type, offset, byte count */
> > - p = xdr_reserve_space(resp->xdr, 4 + 8 + 8);
> > - if (!p)
> > - return nfserr_resource;
> > -
> > - *p++ = htonl(NFS4_CONTENT_HOLE);
> > + *p++ = cpu_to_be32(NFS4_CONTENT_DATA);
> > p = xdr_encode_hyper(p, read->rd_offset);
> > - p = xdr_encode_hyper(p, count);
> > + *p = cpu_to_be32(read->rd_length);
> >
> > - *eof = (read->rd_offset + count) >= f_size;
> > - *maxcount = min_t(unsigned long, count, *maxcount);
> > return nfs_ok;
> > }
> >
> > @@ -4811,69 +4769,36 @@ static __be32
> > nfsd4_encode_read_plus(struct nfsd4_compoundres *resp, __be32 nfserr,
> > struct nfsd4_read *read)
> > {
> > - unsigned long maxcount, count;
> > + struct file *file = read->rd_nf->nf_file;
> > struct xdr_stream *xdr = resp->xdr;
> > - struct file *file;
> > int starting_len = xdr->buf->len;
> > - int last_segment = xdr->buf->len;
> > - int segments = 0;
> > - __be32 *p, tmp;
> > - bool is_data;
> > - loff_t pos;
> > - u32 eof;
> > + u32 segments = 0;
> > + __be32 *p;
> >
> > if (nfserr)
> > return nfserr;
> > - file = read->rd_nf->nf_file;
> >
> > /* eof flag, segment count */
> > p = xdr_reserve_space(xdr, 4 + 4);
> > if (!p)
> > - return nfserr_resource;
> > + return nfserr_io;
> > xdr_commit_encode(xdr);
> >
> > - maxcount = min_t(unsigned long, read->rd_length,
> > - (xdr->buf->buflen - xdr->buf->len));
> > - count = maxcount;
> > -
> > - eof = read->rd_offset >= i_size_read(file_inode(file));
> > - if (eof)
> > + read->rd_eof = read->rd_offset >= i_size_read(file_inode(file));
> > + if (read->rd_eof)
> > goto out;
> >
> > - pos = vfs_llseek(file, read->rd_offset, SEEK_HOLE);
> > - is_data = pos > read->rd_offset;
> > -
> > - while (count > 0 && !eof) {
> > - maxcount = count;
> > - if (is_data)
> > - nfserr = nfsd4_encode_read_plus_data(resp, read, &maxcount, &eof,
> > - segments == 0 ? &pos : NULL);
> > - else
> > - nfserr = nfsd4_encode_read_plus_hole(resp, read, &maxcount, &eof);
> > - if (nfserr)
> > - goto out;
> > - count -= maxcount;
> > - read->rd_offset += maxcount;
> > - is_data = !is_data;
> > - last_segment = xdr->buf->len;
> > - segments++;
> > - }
> > -
> > -out:
> > - if (nfserr && segments == 0)
> > + nfserr = nfsd4_encode_read_plus_data(resp, read);
> > + if (nfserr) {
> > xdr_truncate_encode(xdr, starting_len);
> > - else {
> > - if (nfserr) {
> > - xdr_truncate_encode(xdr, last_segment);
> > - nfserr = nfs_ok;
> > - eof = 0;
> > - }
> > - tmp = htonl(eof);
> > - write_bytes_to_xdr_buf(xdr->buf, starting_len, &tmp, 4);
> > - tmp = htonl(segments);
> > - write_bytes_to_xdr_buf(xdr->buf, starting_len + 4, &tmp, 4);
> > + return nfserr;
> > }
> >
> > + segments++;
> > +
> > +out:
> > + p = xdr_encode_bool(p, read->rd_eof);
> > + *p = cpu_to_be32(segments);
> > return nfserr;
> > }
> >
> > --
> > 2.37.2
> >
>
> --
> Chuck Lever
>
>
>

2022-09-07 21:07:50

by Chuck Lever III

[permalink] [raw]
Subject: Re: [PATCH v2 1/1] NFSD: Simplify READ_PLUS



> On Sep 7, 2022, at 4:37 PM, Anna Schumaker <[email protected]> wrote:
>
> On Wed, Sep 7, 2022 at 4:29 PM Chuck Lever III <[email protected]> wrote:
>>
>> Be sure to Cc: Jeff on these. Thanks!
>>
>>
>>> On Sep 7, 2022, at 3:52 PM, Anna Schumaker <[email protected]> wrote:
>>>
>>> From: Anna Schumaker <[email protected]>
>>>
>>> Chuck had suggested reverting READ_PLUS so it returns a single DATA
>>> segment covering the requested read range. This prepares the server for
>>> a future "sparse read" function so support can easily be added without
>>> needing to rip out the old READ_PLUS code at the same time.
>>>
>>> Signed-off-by: Anna Schumaker <[email protected]>
>>> ---
>>> fs/nfsd/nfs4xdr.c | 139 +++++++++++-----------------------------------
>>> 1 file changed, 32 insertions(+), 107 deletions(-)
>>>
>>> diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
>>> index 1e9690a061ec..bcc8c385faf2 100644
>>> --- a/fs/nfsd/nfs4xdr.c
>>> +++ b/fs/nfsd/nfs4xdr.c
>>> @@ -4731,79 +4731,37 @@ nfsd4_encode_offload_status(struct nfsd4_compoundres *resp, __be32 nfserr,
>>>
>>> static __be32
>>> nfsd4_encode_read_plus_data(struct nfsd4_compoundres *resp,
>>> - struct nfsd4_read *read,
>>> - unsigned long *maxcount, u32 *eof,
>>> - loff_t *pos)
>>> + struct nfsd4_read *read)
>>> {
>>> - struct xdr_stream *xdr = resp->xdr;
>>> + bool splice_ok = test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags);
>>> struct file *file = read->rd_nf->nf_file;
>>> - int starting_len = xdr->buf->len;
>>> - loff_t hole_pos;
>>> - __be32 nfserr;
>>> - __be32 *p, tmp;
>>> - __be64 tmp64;
>>> -
>>> - hole_pos = pos ? *pos : vfs_llseek(file, read->rd_offset, SEEK_HOLE);
>>> - if (hole_pos > read->rd_offset)
>>> - *maxcount = min_t(unsigned long, *maxcount, hole_pos - read->rd_offset);
>>> - *maxcount = min_t(unsigned long, *maxcount, (xdr->buf->buflen - xdr->buf->len));
>>> + struct xdr_stream *xdr = resp->xdr;
>>> + unsigned long maxcount;
>>> + __be32 nfserr, *p;
>>>
>>> /* Content type, offset, byte count */
>>> p = xdr_reserve_space(xdr, 4 + 8 + 4);
>>> if (!p)
>>> - return nfserr_resource;
>>> + return nfserr_io;
>>
>> Wouldn't nfserr_rep_too_big be a more appropriate status for running
>> off the end of the send buffer? I'm not 100% sure, but I would expect
>> that exhausting send buffer space would imply the reply has grown too
>> large.
>
> I can switch it to that, no problem.

I would like to hear opinions from protocol experts before we go
with that choice.


>>> + if (resp->xdr->buf->page_len && splice_ok) {
>>> + WARN_ON_ONCE(splice_ok);
>>> + return nfserr_io;
>>> + }
>>
>> I wish I understood why this test was needed. It seems to have been
>> copied and pasted from historic code into nfsd4_encode_read(), and
>> there have been recent mechanical changes to it, but there's no
>> comment explaining it there...
>
> Yeah, I saw this was in the read code and assumed it was an important
> check so I added it here too.
>>
>> In any event, this seems to be checking for a server software bug,
>> so maybe this should return nfserr_serverfault. Oddly that status
>> code isn't defined yet.
>
> Do you want me to add that code and return it in this patch?

Sure. Make that a predecessor patch and fix up the code in
nfsd4_encode_read() before using it for READ_PLUS in this patch.

Suggested patch description:

RESOURCE is the wrong status code here because

a) This encoder is shared between NFSv4.0 and NFSv4.1+, and
NFSv4.1+ doesn't support RESOURCE, and
b) That status code might cause the client to retry, in which
case it will get the same failure because this check seems
to be looking for a server-side coding mistake.


>> Do you have some performance results for v2?
>
> Not yet, I have it running now so hopefully I'll have something ready
> by tomorrow morning.

Great, thanks!


> Anna
>>
>>
>>> - read->rd_vlen = xdr_reserve_space_vec(xdr, resp->rqstp->rq_vec, *maxcount);
>>> - if (read->rd_vlen < 0)
>>> - return nfserr_resource;
>>> + maxcount = min_t(unsigned long, read->rd_length,
>>> + (xdr->buf->buflen - xdr->buf->len));
>>>
>>> - nfserr = nfsd_readv(resp->rqstp, read->rd_fhp, file, read->rd_offset,
>>> - resp->rqstp->rq_vec, read->rd_vlen, maxcount, eof);
>>> + if (file->f_op->splice_read && splice_ok)
>>> + nfserr = nfsd4_encode_splice_read(resp, read, file, maxcount);
>>> + else
>>> + nfserr = nfsd4_encode_readv(resp, read, file, maxcount);
>>> if (nfserr)
>>> return nfserr;
>>> - xdr_truncate_encode(xdr, starting_len + 16 + xdr_align_size(*maxcount));
>>>
>>> - tmp = htonl(NFS4_CONTENT_DATA);
>>> - write_bytes_to_xdr_buf(xdr->buf, starting_len, &tmp, 4);
>>> - tmp64 = cpu_to_be64(read->rd_offset);
>>> - write_bytes_to_xdr_buf(xdr->buf, starting_len + 4, &tmp64, 8);
>>> - tmp = htonl(*maxcount);
>>> - write_bytes_to_xdr_buf(xdr->buf, starting_len + 12, &tmp, 4);
>>> -
>>> - tmp = xdr_zero;
>>> - write_bytes_to_xdr_buf(xdr->buf, starting_len + 16 + *maxcount, &tmp,
>>> - xdr_pad_size(*maxcount));
>>> - return nfs_ok;
>>> -}
>>> -
>>> -static __be32
>>> -nfsd4_encode_read_plus_hole(struct nfsd4_compoundres *resp,
>>> - struct nfsd4_read *read,
>>> - unsigned long *maxcount, u32 *eof)
>>> -{
>>> - struct file *file = read->rd_nf->nf_file;
>>> - loff_t data_pos = vfs_llseek(file, read->rd_offset, SEEK_DATA);
>>> - loff_t f_size = i_size_read(file_inode(file));
>>> - unsigned long count;
>>> - __be32 *p;
>>> -
>>> - if (data_pos == -ENXIO)
>>> - data_pos = f_size;
>>> - else if (data_pos <= read->rd_offset || (data_pos < f_size && data_pos % PAGE_SIZE))
>>> - return nfsd4_encode_read_plus_data(resp, read, maxcount, eof, &f_size);
>>> - count = data_pos - read->rd_offset;
>>> -
>>> - /* Content type, offset, byte count */
>>> - p = xdr_reserve_space(resp->xdr, 4 + 8 + 8);
>>> - if (!p)
>>> - return nfserr_resource;
>>> -
>>> - *p++ = htonl(NFS4_CONTENT_HOLE);
>>> + *p++ = cpu_to_be32(NFS4_CONTENT_DATA);
>>> p = xdr_encode_hyper(p, read->rd_offset);
>>> - p = xdr_encode_hyper(p, count);
>>> + *p = cpu_to_be32(read->rd_length);
>>>
>>> - *eof = (read->rd_offset + count) >= f_size;
>>> - *maxcount = min_t(unsigned long, count, *maxcount);
>>> return nfs_ok;
>>> }
>>>
>>> @@ -4811,69 +4769,36 @@ static __be32
>>> nfsd4_encode_read_plus(struct nfsd4_compoundres *resp, __be32 nfserr,
>>> struct nfsd4_read *read)
>>> {
>>> - unsigned long maxcount, count;
>>> + struct file *file = read->rd_nf->nf_file;
>>> struct xdr_stream *xdr = resp->xdr;
>>> - struct file *file;
>>> int starting_len = xdr->buf->len;
>>> - int last_segment = xdr->buf->len;
>>> - int segments = 0;
>>> - __be32 *p, tmp;
>>> - bool is_data;
>>> - loff_t pos;
>>> - u32 eof;
>>> + u32 segments = 0;
>>> + __be32 *p;
>>>
>>> if (nfserr)
>>> return nfserr;
>>> - file = read->rd_nf->nf_file;
>>>
>>> /* eof flag, segment count */
>>> p = xdr_reserve_space(xdr, 4 + 4);
>>> if (!p)
>>> - return nfserr_resource;
>>> + return nfserr_io;
>>> xdr_commit_encode(xdr);
>>>
>>> - maxcount = min_t(unsigned long, read->rd_length,
>>> - (xdr->buf->buflen - xdr->buf->len));
>>> - count = maxcount;
>>> -
>>> - eof = read->rd_offset >= i_size_read(file_inode(file));
>>> - if (eof)
>>> + read->rd_eof = read->rd_offset >= i_size_read(file_inode(file));
>>> + if (read->rd_eof)
>>> goto out;
>>>
>>> - pos = vfs_llseek(file, read->rd_offset, SEEK_HOLE);
>>> - is_data = pos > read->rd_offset;
>>> -
>>> - while (count > 0 && !eof) {
>>> - maxcount = count;
>>> - if (is_data)
>>> - nfserr = nfsd4_encode_read_plus_data(resp, read, &maxcount, &eof,
>>> - segments == 0 ? &pos : NULL);
>>> - else
>>> - nfserr = nfsd4_encode_read_plus_hole(resp, read, &maxcount, &eof);
>>> - if (nfserr)
>>> - goto out;
>>> - count -= maxcount;
>>> - read->rd_offset += maxcount;
>>> - is_data = !is_data;
>>> - last_segment = xdr->buf->len;
>>> - segments++;
>>> - }
>>> -
>>> -out:
>>> - if (nfserr && segments == 0)
>>> + nfserr = nfsd4_encode_read_plus_data(resp, read);
>>> + if (nfserr) {
>>> xdr_truncate_encode(xdr, starting_len);
>>> - else {
>>> - if (nfserr) {
>>> - xdr_truncate_encode(xdr, last_segment);
>>> - nfserr = nfs_ok;
>>> - eof = 0;
>>> - }
>>> - tmp = htonl(eof);
>>> - write_bytes_to_xdr_buf(xdr->buf, starting_len, &tmp, 4);
>>> - tmp = htonl(segments);
>>> - write_bytes_to_xdr_buf(xdr->buf, starting_len + 4, &tmp, 4);
>>> + return nfserr;
>>> }
>>>
>>> + segments++;
>>> +
>>> +out:
>>> + p = xdr_encode_bool(p, read->rd_eof);
>>> + *p = cpu_to_be32(segments);
>>> return nfserr;
>>> }
>>>
>>> --
>>> 2.37.2
>>>
>>
>> --
>> Chuck Lever

--
Chuck Lever



2022-09-07 22:41:43

by Trond Myklebust

[permalink] [raw]
Subject: Re: [PATCH v2 1/1] NFSD: Simplify READ_PLUS

On Wed, 2022-09-07 at 20:51 +0000, Chuck Lever III wrote:
>
>
> > On Sep 7, 2022, at 4:37 PM, Anna Schumaker <[email protected]> wrote:
> >
> > On Wed, Sep 7, 2022 at 4:29 PM Chuck Lever III
> > <[email protected]> wrote:
> > >
> > > Be sure to Cc: Jeff on these. Thanks!
> > >
> > >
> > > > On Sep 7, 2022, at 3:52 PM, Anna Schumaker <[email protected]>
> > > > wrote:
> > > >
> > > > From: Anna Schumaker <[email protected]>
> > > >
> > > > Chuck had suggested reverting READ_PLUS so it returns a single
> > > > DATA
> > > > segment covering the requested read range. This prepares the
> > > > server for
> > > > a future "sparse read" function so support can easily be added
> > > > without
> > > > needing to rip out the old READ_PLUS code at the same time.
> > > >
> > > > Signed-off-by: Anna Schumaker <[email protected]>
> > > > ---
> > > > fs/nfsd/nfs4xdr.c | 139 +++++++++++----------------------------
> > > > -------
> > > > 1 file changed, 32 insertions(+), 107 deletions(-)
> > > >
> > > > diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
> > > > index 1e9690a061ec..bcc8c385faf2 100644
> > > > --- a/fs/nfsd/nfs4xdr.c
> > > > +++ b/fs/nfsd/nfs4xdr.c
> > > > @@ -4731,79 +4731,37 @@ nfsd4_encode_offload_status(struct
> > > > nfsd4_compoundres *resp, __be32 nfserr,
> > > >
> > > > static __be32
> > > > nfsd4_encode_read_plus_data(struct nfsd4_compoundres *resp,
> > > > -                         struct nfsd4_read *read,
> > > > -                         unsigned long *maxcount, u32 *eof,
> > > > -                         loff_t *pos)
> > > > +                         struct nfsd4_read *read)
> > > > {
> > > > -     struct xdr_stream *xdr = resp->xdr;
> > > > +     bool splice_ok = test_bit(RQ_SPLICE_OK, &resp->rqstp-
> > > > >rq_flags);
> > > >      struct file *file = read->rd_nf->nf_file;
> > > > -     int starting_len = xdr->buf->len;
> > > > -     loff_t hole_pos;
> > > > -     __be32 nfserr;
> > > > -     __be32 *p, tmp;
> > > > -     __be64 tmp64;
> > > > -
> > > > -     hole_pos = pos ? *pos : vfs_llseek(file, read->rd_offset,
> > > > SEEK_HOLE);
> > > > -     if (hole_pos > read->rd_offset)
> > > > -             *maxcount = min_t(unsigned long, *maxcount,
> > > > hole_pos - read->rd_offset);
> > > > -     *maxcount = min_t(unsigned long, *maxcount, (xdr->buf-
> > > > >buflen - xdr->buf->len));
> > > > +     struct xdr_stream *xdr = resp->xdr;
> > > > +     unsigned long maxcount;
> > > > +     __be32 nfserr, *p;
> > > >
> > > >      /* Content type, offset, byte count */
> > > >      p = xdr_reserve_space(xdr, 4 + 8 + 4);
> > > >      if (!p)
> > > > -             return nfserr_resource;
> > > > +             return nfserr_io;
> > >
> > > Wouldn't nfserr_rep_too_big be a more appropriate status for
> > > running
> > > off the end of the send buffer? I'm not 100% sure, but I would
> > > expect
> > > that exhausting send buffer space would imply the reply has grown
> > > too
> > > large.
> >
> > I can switch it to that, no problem.
>
> I would like to hear opinions from protocol experts before we go
> with that choice.

I'd agree that NFS4ERR_REP_TOO_BIG is correct if you're not able to
even return a short read. However if you can return a short read, then
that's better than returning an error.

It looks to me as if this function can bit hit in both cases, so
perhaps some care is in order.

>
> > > > +     if (resp->xdr->buf->page_len && splice_ok) {
> > > > +             WARN_ON_ONCE(splice_ok);
> > > > +             return nfserr_io;
> > > > +     }
> > >
> > > I wish I understood why this test was needed. It seems to have
> > > been
> > > copied and pasted from historic code into nfsd4_encode_read(),
> > > and
> > > there have been recent mechanical changes to it, but there's no
> > > comment explaining it there...
> >
> > Yeah, I saw this was in the read code and assumed it was an
> > important
> > check so I added it here too.
> > >
> > > In any event, this seems to be checking for a server software
> > > bug,
> > > so maybe this should return nfserr_serverfault. Oddly that status
> > > code isn't defined yet.
> >
> > Do you want me to add that code and return it in this patch?
>
> Sure. Make that a predecessor patch and fix up the code in
> nfsd4_encode_read() before using it for READ_PLUS in this patch.
>
> Suggested patch description:
>
> RESOURCE is the wrong status code here because
>
> a) This encoder is shared between NFSv4.0 and NFSv4.1+, and
>    NFSv4.1+ doesn't support RESOURCE, and

Is it? I thought it was READ_PLUS only. That's NFSv4.2 or higher.

If the encoder is to be shared with both READ and READ_PLUS, then
perhaps it might be wiser to choose a name other than
nfsd4_encode_read_plus_data()?

> b) That status code might cause the client to retry, in which
>    case it will get the same failure because this check seems
>    to be looking for a server-side coding mistake.
>
>
> > > Do you have some performance results for v2?
> >
> > Not yet, I have it running now so hopefully I'll have something
> > ready
> > by tomorrow morning.
>
> Great, thanks!
>
>
> > Anna
> > >
> > >
> > > > -     read->rd_vlen = xdr_reserve_space_vec(xdr, resp->rqstp-
> > > > >rq_vec, *maxcount);
> > > > -     if (read->rd_vlen < 0)
> > > > -             return nfserr_resource;
> > > > +     maxcount = min_t(unsigned long, read->rd_length,
> > > > +                      (xdr->buf->buflen - xdr->buf->len));
> > > >
> > > > -     nfserr = nfsd_readv(resp->rqstp, read->rd_fhp, file,
> > > > read->rd_offset,
> > > > -                         resp->rqstp->rq_vec, read->rd_vlen,
> > > > maxcount, eof);
> > > > +     if (file->f_op->splice_read && splice_ok)
> > > > +             nfserr = nfsd4_encode_splice_read(resp, read,
> > > > file, maxcount);
> > > > +     else
> > > > +             nfserr = nfsd4_encode_readv(resp, read, file,
> > > > maxcount);
> > > >      if (nfserr)
> > > >              return nfserr;
> > > > -     xdr_truncate_encode(xdr, starting_len + 16 +
> > > > xdr_align_size(*maxcount));
> > > >
> > > > -     tmp = htonl(NFS4_CONTENT_DATA);
> > > > -     write_bytes_to_xdr_buf(xdr->buf, starting_len,     
> > > > &tmp,   4);
> > > > -     tmp64 = cpu_to_be64(read->rd_offset);
> > > > -     write_bytes_to_xdr_buf(xdr->buf, starting_len + 4, 
> > > > &tmp64, 8);
> > > > -     tmp = htonl(*maxcount);
> > > > -     write_bytes_to_xdr_buf(xdr->buf, starting_len + 12,
> > > > &tmp,   4);
> > > > -
> > > > -     tmp = xdr_zero;
> > > > -     write_bytes_to_xdr_buf(xdr->buf, starting_len + 16 +
> > > > *maxcount, &tmp,
> > > > -                            xdr_pad_size(*maxcount));
> > > > -     return nfs_ok;
> > > > -}
> > > > -
> > > > -static __be32
> > > > -nfsd4_encode_read_plus_hole(struct nfsd4_compoundres *resp,
> > > > -                         struct nfsd4_read *read,
> > > > -                         unsigned long *maxcount, u32 *eof)
> > > > -{
> > > > -     struct file *file = read->rd_nf->nf_file;
> > > > -     loff_t data_pos = vfs_llseek(file, read->rd_offset,
> > > > SEEK_DATA);
> > > > -     loff_t f_size = i_size_read(file_inode(file));
> > > > -     unsigned long count;
> > > > -     __be32 *p;
> > > > -
> > > > -     if (data_pos == -ENXIO)
> > > > -             data_pos = f_size;
> > > > -     else if (data_pos <= read->rd_offset || (data_pos <
> > > > f_size && data_pos % PAGE_SIZE))
> > > > -             return nfsd4_encode_read_plus_data(resp, read,
> > > > maxcount, eof, &f_size);
> > > > -     count = data_pos - read->rd_offset;
> > > > -
> > > > -     /* Content type, offset, byte count */
> > > > -     p = xdr_reserve_space(resp->xdr, 4 + 8 + 8);
> > > > -     if (!p)
> > > > -             return nfserr_resource;
> > > > -
> > > > -     *p++ = htonl(NFS4_CONTENT_HOLE);
> > > > +     *p++ = cpu_to_be32(NFS4_CONTENT_DATA);
> > > >      p = xdr_encode_hyper(p, read->rd_offset);
> > > > -     p = xdr_encode_hyper(p, count);
> > > > +     *p = cpu_to_be32(read->rd_length);
> > > >
> > > > -     *eof = (read->rd_offset + count) >= f_size;
> > > > -     *maxcount = min_t(unsigned long, count, *maxcount);
> > > >      return nfs_ok;
> > > > }
> > > >
> > > > @@ -4811,69 +4769,36 @@ static __be32
> > > > nfsd4_encode_read_plus(struct nfsd4_compoundres *resp, __be32
> > > > nfserr,
> > > >                     struct nfsd4_read *read)
> > > > {
> > > > -     unsigned long maxcount, count;
> > > > +     struct file *file = read->rd_nf->nf_file;
> > > >      struct xdr_stream *xdr = resp->xdr;
> > > > -     struct file *file;
> > > >      int starting_len = xdr->buf->len;
> > > > -     int last_segment = xdr->buf->len;
> > > > -     int segments = 0;
> > > > -     __be32 *p, tmp;
> > > > -     bool is_data;
> > > > -     loff_t pos;
> > > > -     u32 eof;
> > > > +     u32 segments = 0;
> > > > +     __be32 *p;
> > > >
> > > >      if (nfserr)
> > > >              return nfserr;
> > > > -     file = read->rd_nf->nf_file;
> > > >
> > > >      /* eof flag, segment count */
> > > >      p = xdr_reserve_space(xdr, 4 + 4);
> > > >      if (!p)
> > > > -             return nfserr_resource;
> > > > +             return nfserr_io;
> > > >      xdr_commit_encode(xdr);
> > > >
> > > > -     maxcount = min_t(unsigned long, read->rd_length,
> > > > -                      (xdr->buf->buflen - xdr->buf->len));
> > > > -     count    = maxcount;
> > > > -
> > > > -     eof = read->rd_offset >= i_size_read(file_inode(file));
> > > > -     if (eof)
> > > > +     read->rd_eof = read->rd_offset >=
> > > > i_size_read(file_inode(file));
> > > > +     if (read->rd_eof)
> > > >              goto out;
> > > >
> > > > -     pos = vfs_llseek(file, read->rd_offset, SEEK_HOLE);
> > > > -     is_data = pos > read->rd_offset;
> > > > -
> > > > -     while (count > 0 && !eof) {
> > > > -             maxcount = count;
> > > > -             if (is_data)
> > > > -                     nfserr =
> > > > nfsd4_encode_read_plus_data(resp, read, &maxcount, &eof,
> > > > -                                             segments == 0 ?
> > > > &pos : NULL);
> > > > -             else
> > > > -                     nfserr =
> > > > nfsd4_encode_read_plus_hole(resp, read, &maxcount, &eof);
> > > > -             if (nfserr)
> > > > -                     goto out;
> > > > -             count -= maxcount;
> > > > -             read->rd_offset += maxcount;
> > > > -             is_data = !is_data;
> > > > -             last_segment = xdr->buf->len;
> > > > -             segments++;
> > > > -     }
> > > > -
> > > > -out:
> > > > -     if (nfserr && segments == 0)
> > > > +     nfserr = nfsd4_encode_read_plus_data(resp, read);
> > > > +     if (nfserr) {
> > > >              xdr_truncate_encode(xdr, starting_len);
> > > > -     else {
> > > > -             if (nfserr) {
> > > > -                     xdr_truncate_encode(xdr, last_segment);
> > > > -                     nfserr = nfs_ok;
> > > > -                     eof = 0;
> > > > -             }
> > > > -             tmp = htonl(eof);
> > > > -             write_bytes_to_xdr_buf(xdr->buf,
> > > > starting_len,     &tmp, 4);
> > > > -             tmp = htonl(segments);
> > > > -             write_bytes_to_xdr_buf(xdr->buf, starting_len +
> > > > 4, &tmp, 4);
> > > > +             return nfserr;
> > > >      }
> > > >
> > > > +     segments++;
> > > > +
> > > > +out:
> > > > +     p = xdr_encode_bool(p, read->rd_eof);
> > > > +     *p = cpu_to_be32(segments);
> > > >      return nfserr;
> > > > }
> > > >
> > > > --
> > > > 2.37.2
> > > >
> > >
> > > --
> > > Chuck Lever
>
> --
> Chuck Lever
>
>
>

--
Trond Myklebust
Linux NFS client maintainer, Hammerspace
[email protected]


2022-09-08 15:52:07

by Chuck Lever III

[permalink] [raw]
Subject: Re: [PATCH v2 1/1] NFSD: Simplify READ_PLUS



> On Sep 7, 2022, at 6:33 PM, Trond Myklebust <[email protected]> wrote:
>
> On Wed, 2022-09-07 at 20:51 +0000, Chuck Lever III wrote:
>>
>>
>>> On Sep 7, 2022, at 4:37 PM, Anna Schumaker <[email protected]> wrote:
>>>
>>> On Wed, Sep 7, 2022 at 4:29 PM Chuck Lever III
>>> <[email protected]> wrote:
>>>>
>>>> Be sure to Cc: Jeff on these. Thanks!
>>>>
>>>>
>>>>> On Sep 7, 2022, at 3:52 PM, Anna Schumaker <[email protected]>
>>>>> wrote:
>>>>>
>>>>> From: Anna Schumaker <[email protected]>
>>>>>
>>>>> Chuck had suggested reverting READ_PLUS so it returns a single
>>>>> DATA
>>>>> segment covering the requested read range. This prepares the
>>>>> server for
>>>>> a future "sparse read" function so support can easily be added
>>>>> without
>>>>> needing to rip out the old READ_PLUS code at the same time.
>>>>>
>>>>> Signed-off-by: Anna Schumaker <[email protected]>
>>>>> ---
>>>>> fs/nfsd/nfs4xdr.c | 139 +++++++++++----------------------------
>>>>> -------
>>>>> 1 file changed, 32 insertions(+), 107 deletions(-)
>>>>>
>>>>> diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
>>>>> index 1e9690a061ec..bcc8c385faf2 100644
>>>>> --- a/fs/nfsd/nfs4xdr.c
>>>>> +++ b/fs/nfsd/nfs4xdr.c
>>>>> @@ -4731,79 +4731,37 @@ nfsd4_encode_offload_status(struct
>>>>> nfsd4_compoundres *resp, __be32 nfserr,
>>>>>
>>>>> static __be32
>>>>> nfsd4_encode_read_plus_data(struct nfsd4_compoundres *resp,
>>>>> - struct nfsd4_read *read,
>>>>> - unsigned long *maxcount, u32 *eof,
>>>>> - loff_t *pos)
>>>>> + struct nfsd4_read *read)
>>>>> {
>>>>> - struct xdr_stream *xdr = resp->xdr;
>>>>> + bool splice_ok = test_bit(RQ_SPLICE_OK, &resp->rqstp-
>>>>>> rq_flags);
>>>>> struct file *file = read->rd_nf->nf_file;
>>>>> - int starting_len = xdr->buf->len;
>>>>> - loff_t hole_pos;
>>>>> - __be32 nfserr;
>>>>> - __be32 *p, tmp;
>>>>> - __be64 tmp64;
>>>>> -
>>>>> - hole_pos = pos ? *pos : vfs_llseek(file, read->rd_offset,
>>>>> SEEK_HOLE);
>>>>> - if (hole_pos > read->rd_offset)
>>>>> - *maxcount = min_t(unsigned long, *maxcount,
>>>>> hole_pos - read->rd_offset);
>>>>> - *maxcount = min_t(unsigned long, *maxcount, (xdr->buf-
>>>>>> buflen - xdr->buf->len));
>>>>> + struct xdr_stream *xdr = resp->xdr;
>>>>> + unsigned long maxcount;
>>>>> + __be32 nfserr, *p;
>>>>>
>>>>> /* Content type, offset, byte count */
>>>>> p = xdr_reserve_space(xdr, 4 + 8 + 4);
>>>>> if (!p)
>>>>> - return nfserr_resource;
>>>>> + return nfserr_io;
>>>>
>>>> Wouldn't nfserr_rep_too_big be a more appropriate status for
>>>> running
>>>> off the end of the send buffer? I'm not 100% sure, but I would
>>>> expect
>>>> that exhausting send buffer space would imply the reply has grown
>>>> too
>>>> large.
>>>
>>> I can switch it to that, no problem.
>>
>> I would like to hear opinions from protocol experts before we go
>> with that choice.
>
> I'd agree that NFS4ERR_REP_TOO_BIG is correct if you're not able to
> even return a short read. However if you can return a short read, then
> that's better than returning an error.

Many thanks for reviewing!

In general, I might want to convert all NFSv4 encoders to return
either RESOURCE or REP_TOO_BIG when xdr_reserve_space() fails. I
can post a patch or two that makes that attempt so the special
cases can be sorted on the mailing list.


> It looks to me as if this function can bit hit in both cases, so
> perhaps some care is in order.

Intriguing idea.

For READ, if xdr_reserve_space() fails, that's all she wrote.
I think all these calls happen before the data payload is encoded.

For READ_PLUS, if xdr_reserve_space() fails, it might be possible
to use xdr_truncate_encode() or simply start over with a limit on
the number of segments to be encoded. We're not really there yet,
as currently we want to return just a single segment at this
point.

I feel the question is whether it's practical or a frequent
enough occurrence to bother with the special cases. Encoding a
READ_PLUS response is already challenging.


>>>>> + if (resp->xdr->buf->page_len && splice_ok) {
>>>>> + WARN_ON_ONCE(splice_ok);
>>>>> + return nfserr_io;
>>>>> + }
>>>>
>>>> I wish I understood why this test was needed. It seems to have
>>>> been
>>>> copied and pasted from historic code into nfsd4_encode_read(),
>>>> and
>>>> there have been recent mechanical changes to it, but there's no
>>>> comment explaining it there...
>>>
>>> Yeah, I saw this was in the read code and assumed it was an
>>> important
>>> check so I added it here too.
>>>>
>>>> In any event, this seems to be checking for a server software
>>>> bug,
>>>> so maybe this should return nfserr_serverfault. Oddly that status
>>>> code isn't defined yet.
>>>
>>> Do you want me to add that code and return it in this patch?
>>
>> Sure. Make that a predecessor patch and fix up the code in
>> nfsd4_encode_read() before using it for READ_PLUS in this patch.
>>
>> Suggested patch description:
>>
>> RESOURCE is the wrong status code here because
>>
>> a) This encoder is shared between NFSv4.0 and NFSv4.1+, and
>> NFSv4.1+ doesn't support RESOURCE, and
>
> Is it? I thought it was READ_PLUS only. That's NFSv4.2 or higher.

For the initial patch that adds nfserr_serverfault, I was speaking
only about nfsd4_encode_read(), which is shared amongst all minor
versions. That's where the page_len && splice_ok test originally
came from. Sorry that wasn't clear.


> If the encoder is to be shared with both READ and READ_PLUS, then
> perhaps it might be wiser to choose a name other than
> nfsd4_encode_read_plus_data()?

Although the encoded streams are very similar, there are still
separate encoders for a READ_PLUS data segment and a full READ
response. The common pieces for both of those operations are
nfsd4_encode_readv() and nfsd4_encode_splice_read().

IIUC nfsd4_encode_read_plus_data() has to deal with encoding the
the NFS4_CONTENT_DATA enumerator -- it handles a single
READ_PLUS segment. nfsd4_encode_read() encodes one complete READ
response. I'm comfortable with the current function names.


>> b) That status code might cause the client to retry, in which
>> case it will get the same failure because this check seems
>> to be looking for a server-side coding mistake.
>>
>>
>>>> Do you have some performance results for v2?
>>>
>>> Not yet, I have it running now so hopefully I'll have something
>>> ready
>>> by tomorrow morning.
>>
>> Great, thanks!
>>
>>
>>> Anna
>>>>
>>>>
>>>>> - read->rd_vlen = xdr_reserve_space_vec(xdr, resp->rqstp-
>>>>>> rq_vec, *maxcount);
>>>>> - if (read->rd_vlen < 0)
>>>>> - return nfserr_resource;
>>>>> + maxcount = min_t(unsigned long, read->rd_length,
>>>>> + (xdr->buf->buflen - xdr->buf->len));
>>>>>
>>>>> - nfserr = nfsd_readv(resp->rqstp, read->rd_fhp, file,
>>>>> read->rd_offset,
>>>>> - resp->rqstp->rq_vec, read->rd_vlen,
>>>>> maxcount, eof);
>>>>> + if (file->f_op->splice_read && splice_ok)
>>>>> + nfserr = nfsd4_encode_splice_read(resp, read,
>>>>> file, maxcount);
>>>>> + else
>>>>> + nfserr = nfsd4_encode_readv(resp, read, file,
>>>>> maxcount);
>>>>> if (nfserr)
>>>>> return nfserr;
>>>>> - xdr_truncate_encode(xdr, starting_len + 16 +
>>>>> xdr_align_size(*maxcount));
>>>>>
>>>>> - tmp = htonl(NFS4_CONTENT_DATA);
>>>>> - write_bytes_to_xdr_buf(xdr->buf, starting_len,
>>>>> &tmp, 4);
>>>>> - tmp64 = cpu_to_be64(read->rd_offset);
>>>>> - write_bytes_to_xdr_buf(xdr->buf, starting_len + 4,
>>>>> &tmp64, 8);
>>>>> - tmp = htonl(*maxcount);
>>>>> - write_bytes_to_xdr_buf(xdr->buf, starting_len + 12,
>>>>> &tmp, 4);
>>>>> -
>>>>> - tmp = xdr_zero;
>>>>> - write_bytes_to_xdr_buf(xdr->buf, starting_len + 16 +
>>>>> *maxcount, &tmp,
>>>>> - xdr_pad_size(*maxcount));
>>>>> - return nfs_ok;
>>>>> -}
>>>>> -
>>>>> -static __be32
>>>>> -nfsd4_encode_read_plus_hole(struct nfsd4_compoundres *resp,
>>>>> - struct nfsd4_read *read,
>>>>> - unsigned long *maxcount, u32 *eof)
>>>>> -{
>>>>> - struct file *file = read->rd_nf->nf_file;
>>>>> - loff_t data_pos = vfs_llseek(file, read->rd_offset,
>>>>> SEEK_DATA);
>>>>> - loff_t f_size = i_size_read(file_inode(file));
>>>>> - unsigned long count;
>>>>> - __be32 *p;
>>>>> -
>>>>> - if (data_pos == -ENXIO)
>>>>> - data_pos = f_size;
>>>>> - else if (data_pos <= read->rd_offset || (data_pos <
>>>>> f_size && data_pos % PAGE_SIZE))
>>>>> - return nfsd4_encode_read_plus_data(resp, read,
>>>>> maxcount, eof, &f_size);
>>>>> - count = data_pos - read->rd_offset;
>>>>> -
>>>>> - /* Content type, offset, byte count */
>>>>> - p = xdr_reserve_space(resp->xdr, 4 + 8 + 8);
>>>>> - if (!p)
>>>>> - return nfserr_resource;
>>>>> -
>>>>> - *p++ = htonl(NFS4_CONTENT_HOLE);
>>>>> + *p++ = cpu_to_be32(NFS4_CONTENT_DATA);
>>>>> p = xdr_encode_hyper(p, read->rd_offset);
>>>>> - p = xdr_encode_hyper(p, count);
>>>>> + *p = cpu_to_be32(read->rd_length);
>>>>>
>>>>> - *eof = (read->rd_offset + count) >= f_size;
>>>>> - *maxcount = min_t(unsigned long, count, *maxcount);
>>>>> return nfs_ok;
>>>>> }
>>>>>
>>>>> @@ -4811,69 +4769,36 @@ static __be32
>>>>> nfsd4_encode_read_plus(struct nfsd4_compoundres *resp, __be32
>>>>> nfserr,
>>>>> struct nfsd4_read *read)
>>>>> {
>>>>> - unsigned long maxcount, count;
>>>>> + struct file *file = read->rd_nf->nf_file;
>>>>> struct xdr_stream *xdr = resp->xdr;
>>>>> - struct file *file;
>>>>> int starting_len = xdr->buf->len;
>>>>> - int last_segment = xdr->buf->len;
>>>>> - int segments = 0;
>>>>> - __be32 *p, tmp;
>>>>> - bool is_data;
>>>>> - loff_t pos;
>>>>> - u32 eof;
>>>>> + u32 segments = 0;
>>>>> + __be32 *p;
>>>>>
>>>>> if (nfserr)
>>>>> return nfserr;
>>>>> - file = read->rd_nf->nf_file;
>>>>>
>>>>> /* eof flag, segment count */
>>>>> p = xdr_reserve_space(xdr, 4 + 4);
>>>>> if (!p)
>>>>> - return nfserr_resource;
>>>>> + return nfserr_io;
>>>>> xdr_commit_encode(xdr);
>>>>>
>>>>> - maxcount = min_t(unsigned long, read->rd_length,
>>>>> - (xdr->buf->buflen - xdr->buf->len));
>>>>> - count = maxcount;
>>>>> -
>>>>> - eof = read->rd_offset >= i_size_read(file_inode(file));
>>>>> - if (eof)
>>>>> + read->rd_eof = read->rd_offset >=
>>>>> i_size_read(file_inode(file));
>>>>> + if (read->rd_eof)
>>>>> goto out;
>>>>>
>>>>> - pos = vfs_llseek(file, read->rd_offset, SEEK_HOLE);
>>>>> - is_data = pos > read->rd_offset;
>>>>> -
>>>>> - while (count > 0 && !eof) {
>>>>> - maxcount = count;
>>>>> - if (is_data)
>>>>> - nfserr =
>>>>> nfsd4_encode_read_plus_data(resp, read, &maxcount, &eof,
>>>>> - segments == 0 ?
>>>>> &pos : NULL);
>>>>> - else
>>>>> - nfserr =
>>>>> nfsd4_encode_read_plus_hole(resp, read, &maxcount, &eof);
>>>>> - if (nfserr)
>>>>> - goto out;
>>>>> - count -= maxcount;
>>>>> - read->rd_offset += maxcount;
>>>>> - is_data = !is_data;
>>>>> - last_segment = xdr->buf->len;
>>>>> - segments++;
>>>>> - }
>>>>> -
>>>>> -out:
>>>>> - if (nfserr && segments == 0)
>>>>> + nfserr = nfsd4_encode_read_plus_data(resp, read);
>>>>> + if (nfserr) {
>>>>> xdr_truncate_encode(xdr, starting_len);
>>>>> - else {
>>>>> - if (nfserr) {
>>>>> - xdr_truncate_encode(xdr, last_segment);
>>>>> - nfserr = nfs_ok;
>>>>> - eof = 0;
>>>>> - }
>>>>> - tmp = htonl(eof);
>>>>> - write_bytes_to_xdr_buf(xdr->buf,
>>>>> starting_len, &tmp, 4);
>>>>> - tmp = htonl(segments);
>>>>> - write_bytes_to_xdr_buf(xdr->buf, starting_len +
>>>>> 4, &tmp, 4);
>>>>> + return nfserr;
>>>>> }
>>>>>
>>>>> + segments++;
>>>>> +
>>>>> +out:
>>>>> + p = xdr_encode_bool(p, read->rd_eof);
>>>>> + *p = cpu_to_be32(segments);
>>>>> return nfserr;
>>>>> }
>>>>>
>>>>> --
>>>>> 2.37.2
>>>>>
>>>>
>>>> --
>>>> Chuck Lever
>>
>> --
>> Chuck Lever
>>
>>
>>
>
> --
> Trond Myklebust
> Linux NFS client maintainer, Hammerspace
> [email protected]

--
Chuck Lever



2022-09-08 16:45:33

by Trond Myklebust

[permalink] [raw]
Subject: Re: [PATCH v2 1/1] NFSD: Simplify READ_PLUS

On Thu, 2022-09-08 at 15:36 +0000, Chuck Lever III wrote:
>
>
> > On Sep 7, 2022, at 6:33 PM, Trond Myklebust
> > <[email protected]> wrote:
> >
> > On Wed, 2022-09-07 at 20:51 +0000, Chuck Lever III wrote:
> > >
> > >
> > > > On Sep 7, 2022, at 4:37 PM, Anna Schumaker <[email protected]>
> > > > wrote:
> > > >
> > > > On Wed, Sep 7, 2022 at 4:29 PM Chuck Lever III
> > > > <[email protected]> wrote:
> > > > >
> > > > > Be sure to Cc: Jeff on these. Thanks!
> > > > >
> > > > >
> > > > > > On Sep 7, 2022, at 3:52 PM, Anna Schumaker
> > > > > > <[email protected]>
> > > > > > wrote:
> > > > > >
> > > > > > From: Anna Schumaker <[email protected]>
> > > > > >
> > > > > > Chuck had suggested reverting READ_PLUS so it returns a
> > > > > > single
> > > > > > DATA
> > > > > > segment covering the requested read range. This prepares
> > > > > > the
> > > > > > server for
> > > > > > a future "sparse read" function so support can easily be
> > > > > > added
> > > > > > without
> > > > > > needing to rip out the old READ_PLUS code at the same time.
> > > > > >
> > > > > > Signed-off-by: Anna Schumaker <[email protected]>
> > > > > > ---
> > > > > > fs/nfsd/nfs4xdr.c | 139 +++++++++++------------------------
> > > > > > ----
> > > > > > -------
> > > > > > 1 file changed, 32 insertions(+), 107 deletions(-)
> > > > > >
> > > > > > diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
> > > > > > index 1e9690a061ec..bcc8c385faf2 100644
> > > > > > --- a/fs/nfsd/nfs4xdr.c
> > > > > > +++ b/fs/nfsd/nfs4xdr.c
> > > > > > @@ -4731,79 +4731,37 @@ nfsd4_encode_offload_status(struct
> > > > > > nfsd4_compoundres *resp, __be32 nfserr,
> > > > > >
> > > > > > static __be32
> > > > > > nfsd4_encode_read_plus_data(struct nfsd4_compoundres *resp,
> > > > > > -                         struct nfsd4_read *read,
> > > > > > -                         unsigned long *maxcount, u32
> > > > > > *eof,
> > > > > > -                         loff_t *pos)
> > > > > > +                         struct nfsd4_read *read)
> > > > > > {
> > > > > > -     struct xdr_stream *xdr = resp->xdr;
> > > > > > +     bool splice_ok = test_bit(RQ_SPLICE_OK, &resp->rqstp-
> > > > > > > rq_flags);
> > > > > >      struct file *file = read->rd_nf->nf_file;
> > > > > > -     int starting_len = xdr->buf->len;
> > > > > > -     loff_t hole_pos;
> > > > > > -     __be32 nfserr;
> > > > > > -     __be32 *p, tmp;
> > > > > > -     __be64 tmp64;
> > > > > > -
> > > > > > -     hole_pos = pos ? *pos : vfs_llseek(file, read-
> > > > > > >rd_offset,
> > > > > > SEEK_HOLE);
> > > > > > -     if (hole_pos > read->rd_offset)
> > > > > > -             *maxcount = min_t(unsigned long, *maxcount,
> > > > > > hole_pos - read->rd_offset);
> > > > > > -     *maxcount = min_t(unsigned long, *maxcount, (xdr-
> > > > > > >buf-
> > > > > > > buflen - xdr->buf->len));
> > > > > > +     struct xdr_stream *xdr = resp->xdr;
> > > > > > +     unsigned long maxcount;
> > > > > > +     __be32 nfserr, *p;
> > > > > >
> > > > > >      /* Content type, offset, byte count */
> > > > > >      p = xdr_reserve_space(xdr, 4 + 8 + 4);
> > > > > >      if (!p)
> > > > > > -             return nfserr_resource;
> > > > > > +             return nfserr_io;
> > > > >
> > > > > Wouldn't nfserr_rep_too_big be a more appropriate status for
> > > > > running
> > > > > off the end of the send buffer? I'm not 100% sure, but I
> > > > > would
> > > > > expect
> > > > > that exhausting send buffer space would imply the reply has
> > > > > grown
> > > > > too
> > > > > large.
> > > >
> > > > I can switch it to that, no problem.
> > >
> > > I would like to hear opinions from protocol experts before we go
> > > with that choice.
> >
> > I'd agree that NFS4ERR_REP_TOO_BIG is correct if you're not able to
> > even return a short read. However if you can return a short read,
> > then
> > that's better than returning an error.
>
> Many thanks for reviewing!
>
> In general, I might want to convert all NFSv4 encoders to return
> either RESOURCE or REP_TOO_BIG when xdr_reserve_space() fails. I
> can post a patch or two that makes that attempt so the special
> cases can be sorted on the mailing list.
>
>
> > It looks to me as if this function can bit hit in both cases, so
> > perhaps some care is in order.
>
> Intriguing idea.
>
> For READ, if xdr_reserve_space() fails, that's all she wrote.
> I think all these calls happen before the data payload is encoded.
>
> For READ_PLUS, if xdr_reserve_space() fails, it might be possible
> to use xdr_truncate_encode() or simply start over with a limit on
> the number of segments to be encoded. We're not really there yet,
> as currently we want to return just a single segment at this
> point.
>
> I feel the question is whether it's practical or a frequent
> enough occurrence to bother with the special cases. Encoding a
> READ_PLUS response is already challenging.

What I'm saying is that you are more likely to hit this issue when you
have a reply with something like "<data><hole><data>"...

If the second "<data>" chunk hits the above xdr_reserve_space() limit
(i.e. the first call to nfsd4_encode_read_plus_data() succeeds as does
the call to nfsd4_encode_read_plus_hole()), then you just want to
return a short read of the form "<data><hole>" instead of returning an
error on the whole READ_PLUS operation.


--
Trond Myklebust
Linux NFS client maintainer, Hammerspace
[email protected]


2022-09-08 16:47:39

by Chuck Lever III

[permalink] [raw]
Subject: Re: [PATCH v2 1/1] NFSD: Simplify READ_PLUS



> On Sep 7, 2022, at 4:37 PM, Anna Schumaker <[email protected]> wrote:
>
> On Wed, Sep 7, 2022 at 4:29 PM Chuck Lever III <[email protected]> wrote:
>>
>> Be sure to Cc: Jeff on these. Thanks!
>>
>>
>>> On Sep 7, 2022, at 3:52 PM, Anna Schumaker <[email protected]> wrote:
>>>
>>> From: Anna Schumaker <[email protected]>
>>>
>>> Chuck had suggested reverting READ_PLUS so it returns a single DATA
>>> segment covering the requested read range. This prepares the server for
>>> a future "sparse read" function so support can easily be added without
>>> needing to rip out the old READ_PLUS code at the same time.
>>>
>>> Signed-off-by: Anna Schumaker <[email protected]>
>>> ---
>>> fs/nfsd/nfs4xdr.c | 139 +++++++++++-----------------------------------
>>> 1 file changed, 32 insertions(+), 107 deletions(-)
>>>
>>> diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
>>> index 1e9690a061ec..bcc8c385faf2 100644
>>> --- a/fs/nfsd/nfs4xdr.c
>>> +++ b/fs/nfsd/nfs4xdr.c
>>> @@ -4731,79 +4731,37 @@ nfsd4_encode_offload_status(struct nfsd4_compoundres *resp, __be32 nfserr,
>>>
>>> static __be32
>>> nfsd4_encode_read_plus_data(struct nfsd4_compoundres *resp,
>>> - struct nfsd4_read *read,
>>> - unsigned long *maxcount, u32 *eof,
>>> - loff_t *pos)
>>> + struct nfsd4_read *read)
>>> {
>>> - struct xdr_stream *xdr = resp->xdr;
>>> + bool splice_ok = test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags);
>>> struct file *file = read->rd_nf->nf_file;
>>> - int starting_len = xdr->buf->len;
>>> - loff_t hole_pos;
>>> - __be32 nfserr;
>>> - __be32 *p, tmp;
>>> - __be64 tmp64;
>>> -
>>> - hole_pos = pos ? *pos : vfs_llseek(file, read->rd_offset, SEEK_HOLE);
>>> - if (hole_pos > read->rd_offset)
>>> - *maxcount = min_t(unsigned long, *maxcount, hole_pos - read->rd_offset);
>>> - *maxcount = min_t(unsigned long, *maxcount, (xdr->buf->buflen - xdr->buf->len));
>>> + struct xdr_stream *xdr = resp->xdr;
>>> + unsigned long maxcount;
>>> + __be32 nfserr, *p;
>>>
>>> /* Content type, offset, byte count */
>>> p = xdr_reserve_space(xdr, 4 + 8 + 4);
>>> if (!p)
>>> - return nfserr_resource;
>>> + return nfserr_io;
>>
>> Wouldn't nfserr_rep_too_big be a more appropriate status for running
>> off the end of the send buffer? I'm not 100% sure, but I would expect
>> that exhausting send buffer space would imply the reply has grown too
>> large.
>
> I can switch it to that, no problem.

OK, never mind. I see that nfsd4_encode_compound() handles the status
code conversion for every encoder, and deals with reply caching too:

5349 if (op->status == nfserr_resource && nfsd4_has_session(&resp->cstate)) {
5350 struct nfsd4_slot *slot = resp->cstate.slot;
5351
5352 if (slot->sl_flags & NFSD4_SLOT_CACHETHIS)
5353 op->status = nfserr_rep_too_big_to_cache;
5354 else
5355 op->status = nfserr_rep_too_big;
5356 }

So returning nfserr_resource from the READ_PLUS encoder when
xdr_reserve_space() fails is copacetic and preferred.

Then, once READ_PLUS can return multiple segments again, it should
deal with send buffer space exhaustion by truncating the reply at
the last properly encoded segment, as Trond suggested.


>>> + if (resp->xdr->buf->page_len && splice_ok) {
>>> + WARN_ON_ONCE(splice_ok);
>>> + return nfserr_io;
>>> + }
>>
>> I wish I understood why this test was needed. It seems to have been
>> copied and pasted from historic code into nfsd4_encode_read(), and
>> there have been recent mechanical changes to it, but there's no
>> comment explaining it there...
>
> Yeah, I saw this was in the read code and assumed it was an important
> check so I added it here too.
>>
>> In any event, this seems to be checking for a server software bug,
>> so maybe this should return nfserr_serverfault. Oddly that status
>> code isn't defined yet.
>
> Do you want me to add that code and return it in this patch?

I would still like to return serverfault if the splice check
fails.


>> Do you have some performance results for v2?
>
> Not yet, I have it running now so hopefully I'll have something ready
> by tomorrow morning.
>
> Anna
>>
>>
>>> - read->rd_vlen = xdr_reserve_space_vec(xdr, resp->rqstp->rq_vec, *maxcount);
>>> - if (read->rd_vlen < 0)
>>> - return nfserr_resource;
>>> + maxcount = min_t(unsigned long, read->rd_length,
>>> + (xdr->buf->buflen - xdr->buf->len));
>>>
>>> - nfserr = nfsd_readv(resp->rqstp, read->rd_fhp, file, read->rd_offset,
>>> - resp->rqstp->rq_vec, read->rd_vlen, maxcount, eof);
>>> + if (file->f_op->splice_read && splice_ok)
>>> + nfserr = nfsd4_encode_splice_read(resp, read, file, maxcount);
>>> + else
>>> + nfserr = nfsd4_encode_readv(resp, read, file, maxcount);
>>> if (nfserr)
>>> return nfserr;
>>> - xdr_truncate_encode(xdr, starting_len + 16 + xdr_align_size(*maxcount));
>>>
>>> - tmp = htonl(NFS4_CONTENT_DATA);
>>> - write_bytes_to_xdr_buf(xdr->buf, starting_len, &tmp, 4);
>>> - tmp64 = cpu_to_be64(read->rd_offset);
>>> - write_bytes_to_xdr_buf(xdr->buf, starting_len + 4, &tmp64, 8);
>>> - tmp = htonl(*maxcount);
>>> - write_bytes_to_xdr_buf(xdr->buf, starting_len + 12, &tmp, 4);
>>> -
>>> - tmp = xdr_zero;
>>> - write_bytes_to_xdr_buf(xdr->buf, starting_len + 16 + *maxcount, &tmp,
>>> - xdr_pad_size(*maxcount));
>>> - return nfs_ok;
>>> -}
>>> -
>>> -static __be32
>>> -nfsd4_encode_read_plus_hole(struct nfsd4_compoundres *resp,
>>> - struct nfsd4_read *read,
>>> - unsigned long *maxcount, u32 *eof)
>>> -{
>>> - struct file *file = read->rd_nf->nf_file;
>>> - loff_t data_pos = vfs_llseek(file, read->rd_offset, SEEK_DATA);
>>> - loff_t f_size = i_size_read(file_inode(file));
>>> - unsigned long count;
>>> - __be32 *p;
>>> -
>>> - if (data_pos == -ENXIO)
>>> - data_pos = f_size;
>>> - else if (data_pos <= read->rd_offset || (data_pos < f_size && data_pos % PAGE_SIZE))
>>> - return nfsd4_encode_read_plus_data(resp, read, maxcount, eof, &f_size);
>>> - count = data_pos - read->rd_offset;
>>> -
>>> - /* Content type, offset, byte count */
>>> - p = xdr_reserve_space(resp->xdr, 4 + 8 + 8);
>>> - if (!p)
>>> - return nfserr_resource;
>>> -
>>> - *p++ = htonl(NFS4_CONTENT_HOLE);
>>> + *p++ = cpu_to_be32(NFS4_CONTENT_DATA);
>>> p = xdr_encode_hyper(p, read->rd_offset);
>>> - p = xdr_encode_hyper(p, count);
>>> + *p = cpu_to_be32(read->rd_length);
>>>
>>> - *eof = (read->rd_offset + count) >= f_size;
>>> - *maxcount = min_t(unsigned long, count, *maxcount);
>>> return nfs_ok;
>>> }
>>>
>>> @@ -4811,69 +4769,36 @@ static __be32
>>> nfsd4_encode_read_plus(struct nfsd4_compoundres *resp, __be32 nfserr,
>>> struct nfsd4_read *read)
>>> {
>>> - unsigned long maxcount, count;
>>> + struct file *file = read->rd_nf->nf_file;
>>> struct xdr_stream *xdr = resp->xdr;
>>> - struct file *file;
>>> int starting_len = xdr->buf->len;
>>> - int last_segment = xdr->buf->len;
>>> - int segments = 0;
>>> - __be32 *p, tmp;
>>> - bool is_data;
>>> - loff_t pos;
>>> - u32 eof;
>>> + u32 segments = 0;
>>> + __be32 *p;
>>>
>>> if (nfserr)
>>> return nfserr;
>>> - file = read->rd_nf->nf_file;
>>>
>>> /* eof flag, segment count */
>>> p = xdr_reserve_space(xdr, 4 + 4);
>>> if (!p)
>>> - return nfserr_resource;
>>> + return nfserr_io;
>>> xdr_commit_encode(xdr);
>>>
>>> - maxcount = min_t(unsigned long, read->rd_length,
>>> - (xdr->buf->buflen - xdr->buf->len));
>>> - count = maxcount;
>>> -
>>> - eof = read->rd_offset >= i_size_read(file_inode(file));
>>> - if (eof)
>>> + read->rd_eof = read->rd_offset >= i_size_read(file_inode(file));
>>> + if (read->rd_eof)
>>> goto out;
>>>
>>> - pos = vfs_llseek(file, read->rd_offset, SEEK_HOLE);
>>> - is_data = pos > read->rd_offset;
>>> -
>>> - while (count > 0 && !eof) {
>>> - maxcount = count;
>>> - if (is_data)
>>> - nfserr = nfsd4_encode_read_plus_data(resp, read, &maxcount, &eof,
>>> - segments == 0 ? &pos : NULL);
>>> - else
>>> - nfserr = nfsd4_encode_read_plus_hole(resp, read, &maxcount, &eof);
>>> - if (nfserr)
>>> - goto out;
>>> - count -= maxcount;
>>> - read->rd_offset += maxcount;
>>> - is_data = !is_data;
>>> - last_segment = xdr->buf->len;
>>> - segments++;
>>> - }
>>> -
>>> -out:
>>> - if (nfserr && segments == 0)
>>> + nfserr = nfsd4_encode_read_plus_data(resp, read);
>>> + if (nfserr) {
>>> xdr_truncate_encode(xdr, starting_len);
>>> - else {
>>> - if (nfserr) {
>>> - xdr_truncate_encode(xdr, last_segment);
>>> - nfserr = nfs_ok;
>>> - eof = 0;
>>> - }
>>> - tmp = htonl(eof);
>>> - write_bytes_to_xdr_buf(xdr->buf, starting_len, &tmp, 4);
>>> - tmp = htonl(segments);
>>> - write_bytes_to_xdr_buf(xdr->buf, starting_len + 4, &tmp, 4);
>>> + return nfserr;
>>> }
>>>
>>> + segments++;
>>> +
>>> +out:
>>> + p = xdr_encode_bool(p, read->rd_eof);
>>> + *p = cpu_to_be32(segments);
>>> return nfserr;
>>> }
>>>
>>> --
>>> 2.37.2
>>>
>>
>> --
>> Chuck Lever

--
Chuck Lever