2022-07-13 19:15:18

by Anna Schumaker

[permalink] [raw]
Subject: [PATCH v2 0/5] NFS: Improvements for the NFSv4.2 READ_PLUS operation

From: Anna Schumaker <[email protected]>

Previously, decoding was a one step process that expanded holes as they
were seen in the buffer. This had a few undesireable side effects:

1) An extra READ_PLUS call was often needed to fetch any data shifted
off the end of the buffer when the last two segments are a HOLE
followed by DATA
2) We shifted the entire remaining buffer for each hole, meaning some
segments would get moved multiple times during one decode pass.

These patches attempt to address this by turning READ_PLUS decoding into
a two-step process. First, we build up a list of each segment returned
by the server, then we walk the list in reverse and move each segment
directly to their target offset. This cuts out all the extra copying,
and means we won't lose any data off of the end of the reply.

The results of my performance testing can be found here:
https://wiki.linux-nfs.org/wiki/index.php/Read_Plus_May_2022

Between these patches and the corresponding server patches, I'm seeing a
several second decrease in the amount of time that a READ_PLUS call
takes to complete even on the worst case test where pages alternate
between hole and data segments.

I also optimistically remove the CONFIG_NFS_V4_2_READ_PLUS Kconfig
option now that the known performance and correctness issues have been
resolved, but I would also be fine with changing it to default to 'y' if
there are objections to entirely dropping the option.

Please note that these patches rely on the xdr_stream_move_subsegment()
function added as the first patch of the corresponding server patches.

Changes in V2:
- Rebase on v5.19-rc6
- Update for renaming xdr_buf_move_segment() -> xdr_buf_move_subsegment()

Thoughts?
Anna


Anna Schumaker (5):
SUNRPC: Add a function for directly setting the xdr page len
SUNRPC: Add a function for zeroing out a portion of an xdr_stream
NFS: Replace the READ_PLUS decoding code
SUNRPC: Remove xdr_align_data() and xdr_expand_hole()
NFS: Remove the CONFIG_NFS_V4_2_READ_PLUS Kconfig option

fs/nfs/Kconfig | 9 --
fs/nfs/nfs42xdr.c | 168 +++++++++++++++++++------------------
fs/nfs/nfs4proc.c | 2 +-
include/linux/sunrpc/xdr.h | 5 +-
net/sunrpc/xdr.c | 111 +++++++++++-------------
5 files changed, 140 insertions(+), 155 deletions(-)

--
2.37.0


2022-07-13 19:15:29

by Anna Schumaker

[permalink] [raw]
Subject: [PATCH v2 1/5] SUNRPC: Add a function for directly setting the xdr page len

From: Anna Schumaker <[email protected]>

We need to do this step during READ_PLUS decoding so that we know pages
are the right length and any extra data has been preserved in the tail.

Signed-off-by: Anna Schumaker <[email protected]>
---
include/linux/sunrpc/xdr.h | 1 +
net/sunrpc/xdr.c | 30 ++++++++++++++++++++++++++++++
2 files changed, 31 insertions(+)

diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h
index 79824fea4529..b8f3011e109b 100644
--- a/include/linux/sunrpc/xdr.h
+++ b/include/linux/sunrpc/xdr.h
@@ -262,6 +262,7 @@ extern __be32 *xdr_inline_decode(struct xdr_stream *xdr, size_t nbytes);
extern unsigned int xdr_read_pages(struct xdr_stream *xdr, unsigned int len);
extern void xdr_enter_page(struct xdr_stream *xdr, unsigned int len);
extern int xdr_process_buf(const struct xdr_buf *buf, unsigned int offset, unsigned int len, int (*actor)(struct scatterlist *, void *), void *data);
+extern void xdr_set_pagelen(struct xdr_stream *, unsigned int len);
extern unsigned int xdr_align_data(struct xdr_stream *, unsigned int offset, unsigned int length);
extern unsigned int xdr_expand_hole(struct xdr_stream *, unsigned int offset, unsigned int length);
extern bool xdr_stream_subsegment(struct xdr_stream *xdr, struct xdr_buf *subbuf,
diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
index ea734b14af0f..9d7c93645f01 100644
--- a/net/sunrpc/xdr.c
+++ b/net/sunrpc/xdr.c
@@ -1518,6 +1518,36 @@ unsigned int xdr_read_pages(struct xdr_stream *xdr, unsigned int len)
}
EXPORT_SYMBOL_GPL(xdr_read_pages);

+/**
+ * xdr_set_pagelen - Sets the length of the XDR pages
+ * @xdr: pointer to xdr_stream struct
+ * @len: new length of the XDR page data
+ *
+ * Either grows or shrinks the length of the xdr pages by setting pagelen to
+ * @len bytes. When shrinking, any extra data is moved into buf->tail, whereas
+ * when growing any data beyond the current pointer is moved into the tail.
+ *
+ * Returns True if the operation was successful, and False otherwise.
+ */
+void xdr_set_pagelen(struct xdr_stream *xdr, unsigned int len)
+{
+ struct xdr_buf *buf = xdr->buf;
+ size_t remaining = xdr_stream_remaining(xdr);
+ size_t base = 0;
+
+ if (len < buf->page_len) {
+ base = buf->page_len - len;
+ xdr_shrink_pagelen(buf, len);
+ } else {
+ xdr_buf_head_shift_right(buf, xdr_stream_pos(xdr),
+ buf->page_len, remaining);
+ if (len > buf->page_len)
+ xdr_buf_try_expand(buf, len - buf->page_len);
+ }
+ xdr_set_tail_base(xdr, base, remaining);
+}
+EXPORT_SYMBOL_GPL(xdr_set_pagelen);
+
unsigned int xdr_align_data(struct xdr_stream *xdr, unsigned int offset,
unsigned int length)
{
--
2.37.0

2022-07-13 19:15:54

by Anna Schumaker

[permalink] [raw]
Subject: [PATCH v2 5/5] NFS: Remove the CONFIG_NFS_V4_2_READ_PLUS Kconfig option

From: Anna Schumaker <[email protected]>

Signed-off-by: Anna Schumaker <[email protected]>
---
fs/nfs/Kconfig | 9 ---------
fs/nfs/nfs4proc.c | 2 +-
2 files changed, 1 insertion(+), 10 deletions(-)

diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
index 14a72224b657..5dcd84ce1c0c 100644
--- a/fs/nfs/Kconfig
+++ b/fs/nfs/Kconfig
@@ -205,12 +205,3 @@ config NFS_DISABLE_UDP_SUPPORT
Choose Y here to disable the use of NFS over UDP. NFS over UDP
on modern networks (1Gb+) can lead to data corruption caused by
fragmentation during high loads.
-
-config NFS_V4_2_READ_PLUS
- bool "NFS: Enable support for the NFSv4.2 READ_PLUS operation"
- depends on NFS_V4_2
- default n
- help
- This is intended for developers only. The READ_PLUS operation has
- been shown to have issues under specific conditions and should not
- be used in production.
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index bb0e84a46d61..fdd6e22a06ed 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -5446,7 +5446,7 @@ static int nfs4_read_done(struct rpc_task *task, struct nfs_pgio_header *hdr)
nfs4_read_done_cb(task, hdr);
}

-#if defined CONFIG_NFS_V4_2 && defined CONFIG_NFS_V4_2_READ_PLUS
+#if defined CONFIG_NFS_V4_2
static void nfs42_read_plus_support(struct nfs_pgio_header *hdr,
struct rpc_message *msg)
{
--
2.37.0

2022-07-13 19:15:57

by Anna Schumaker

[permalink] [raw]
Subject: [PATCH v2 3/5] NFS: Replace the READ_PLUS decoding code

From: Anna Schumaker <[email protected]>

We now take a 2-step process that allows us to place data and hole
segments directly at their final position in the xdr_stream without
needing to do a bunch of redundant copies to expand holes. Due to the
variable lengths of each segment, the xdr metadata might cross page
boundaries which I account for by setting a small scratch buffer so
xdr_inline_decode() won't fail.

Signed-off-by: Anna Schumaker <[email protected]>
---
fs/nfs/nfs42xdr.c | 168 ++++++++++++++++++++++++----------------------
1 file changed, 87 insertions(+), 81 deletions(-)

diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c
index 271e5f92ed01..b56f05113d36 100644
--- a/fs/nfs/nfs42xdr.c
+++ b/fs/nfs/nfs42xdr.c
@@ -1025,73 +1025,84 @@ static int decode_deallocate(struct xdr_stream *xdr, struct nfs42_falloc_res *re
return decode_op_hdr(xdr, OP_DEALLOCATE);
}

-static int decode_read_plus_data(struct xdr_stream *xdr,
- struct nfs_pgio_args *args,
- struct nfs_pgio_res *res)
-{
- uint32_t count, recvd;
+struct read_plus_segment {
+ enum data_content4 type;
uint64_t offset;
+ union {
+ struct {
+ uint64_t length;
+ } hole;
+
+ struct {
+ uint32_t length;
+ unsigned int from;
+ } data;
+ };
+};
+
+static inline uint64_t read_plus_segment_length(struct read_plus_segment *seg)
+{
+ return seg->type == NFS4_CONTENT_DATA ? seg->data.length : seg->hole.length;
+}
+
+static int decode_read_plus_segment(struct xdr_stream *xdr,
+ struct read_plus_segment *seg)
+{
__be32 *p;

- p = xdr_inline_decode(xdr, 8 + 4);
+ p = xdr_inline_decode(xdr, 4);
if (!p)
- return 1;
-
- p = xdr_decode_hyper(p, &offset);
- count = be32_to_cpup(p);
- recvd = xdr_align_data(xdr, res->count, xdr_align_size(count));
- if (recvd > count)
- recvd = count;
- if (res->count + recvd > args->count) {
- if (args->count > res->count)
- res->count += args->count - res->count;
- return 1;
- }
- res->count += recvd;
- if (count > recvd)
- return 1;
+ return -EIO;
+ seg->type = be32_to_cpup(p++);
+
+ p = xdr_inline_decode(xdr, seg->type == NFS4_CONTENT_DATA ? 12 : 16);
+ if (!p)
+ return -EIO;
+ p = xdr_decode_hyper(p, &seg->offset);
+
+ if (seg->type == NFS4_CONTENT_DATA) {
+ struct xdr_buf buf;
+ uint32_t len = be32_to_cpup(p);
+
+ seg->data.length = len;
+ seg->data.from = xdr_stream_pos(xdr);
+
+ if (!xdr_stream_subsegment(xdr, &buf, xdr_align_size(len)))
+ return -EIO;
+ } else if (seg->type == NFS4_CONTENT_HOLE) {
+ xdr_decode_hyper(p, &seg->hole.length);
+ } else
+ return -EINVAL;
return 0;
}

-static int decode_read_plus_hole(struct xdr_stream *xdr,
- struct nfs_pgio_args *args,
- struct nfs_pgio_res *res, uint32_t *eof)
+static int process_read_plus_segment(struct xdr_stream *xdr,
+ struct nfs_pgio_args *args,
+ struct nfs_pgio_res *res,
+ struct read_plus_segment *seg)
{
- uint64_t offset, length, recvd;
- __be32 *p;
+ unsigned long offset = seg->offset;
+ unsigned long length = read_plus_segment_length(seg);
+ unsigned int bufpos;

- p = xdr_inline_decode(xdr, 8 + 8);
- if (!p)
- return 1;
-
- p = xdr_decode_hyper(p, &offset);
- p = xdr_decode_hyper(p, &length);
- if (offset != args->offset + res->count) {
- /* Server returned an out-of-sequence extent */
- if (offset > args->offset + res->count ||
- offset + length < args->offset + res->count) {
- dprintk("NFS: server returned out of sequence extent: "
- "offset/size = %llu/%llu != expected %llu\n",
- (unsigned long long)offset,
- (unsigned long long)length,
- (unsigned long long)(args->offset +
- res->count));
- return 1;
- }
- length -= args->offset + res->count - offset;
+ if (offset + length < args->offset)
+ return 0;
+ else if (offset > args->offset + args->count) {
+ res->eof = 0;
+ return 0;
+ } else if (offset < args->offset) {
+ length -= (args->offset - offset);
+ offset = args->offset;
+ } else if (offset + length > args->offset + args->count) {
+ length = (args->offset + args->count) - offset;
+ res->eof = 0;
}
- if (length + res->count > args->count) {
- *eof = 0;
- if (unlikely(res->count >= args->count))
- return 1;
- length = args->count - res->count;
- }
- recvd = xdr_expand_hole(xdr, res->count, length);
- res->count += recvd;

- if (recvd < length)
- return 1;
- return 0;
+ bufpos = xdr->buf->head[0].iov_len + (offset - args->offset);
+ if (seg->type == NFS4_CONTENT_HOLE)
+ return xdr_stream_zero(xdr, bufpos, length);
+ else
+ return xdr_stream_move_subsegment(xdr, seg->data.from, bufpos, length);
}

static int decode_read_plus(struct xdr_stream *xdr, struct nfs_pgio_res *res)
@@ -1099,8 +1110,10 @@ static int decode_read_plus(struct xdr_stream *xdr, struct nfs_pgio_res *res)
struct nfs_pgio_header *hdr =
container_of(res, struct nfs_pgio_header, res);
struct nfs_pgio_args *args = &hdr->args;
- uint32_t eof, segments, type;
+ uint32_t segments;
+ struct read_plus_segment *segs;
int status, i;
+ char scratch_buf[16];
__be32 *p;

status = decode_op_hdr(xdr, OP_READ_PLUS);
@@ -1112,38 +1125,31 @@ static int decode_read_plus(struct xdr_stream *xdr, struct nfs_pgio_res *res)
return -EIO;

res->count = 0;
- eof = be32_to_cpup(p++);
+ res->eof = be32_to_cpup(p++);
segments = be32_to_cpup(p++);
if (segments == 0)
- goto out;
+ return status;

+ segs = kmalloc_array(segments, sizeof(*segs), GFP_KERNEL);
+ if (!segs)
+ return -ENOMEM;
+
+ xdr_set_scratch_buffer(xdr, &scratch_buf, 32);
+ status = -EIO;
for (i = 0; i < segments; i++) {
- p = xdr_inline_decode(xdr, 4);
- if (!p)
- goto early_out;
-
- type = be32_to_cpup(p++);
- if (type == NFS4_CONTENT_DATA)
- status = decode_read_plus_data(xdr, args, res);
- else if (type == NFS4_CONTENT_HOLE)
- status = decode_read_plus_hole(xdr, args, res, &eof);
- else
- return -EINVAL;
-
+ status = decode_read_plus_segment(xdr, &segs[i]);
if (status < 0)
- return status;
- if (status > 0)
- goto early_out;
+ goto out;
}

+ xdr_set_pagelen(xdr, xdr_align_size(args->count));
+ for (i = segments; i > 0; i--)
+ res->count += process_read_plus_segment(xdr, args, res, &segs[i-1]);
+ status = 0;
+
out:
- res->eof = eof;
- return 0;
-early_out:
- if (unlikely(!i))
- return -EIO;
- res->eof = 0;
- return 0;
+ kfree(segs);
+ return status;
}

static int decode_seek(struct xdr_stream *xdr, struct nfs42_seek_res *res)
--
2.37.0

2022-07-13 19:16:02

by Anna Schumaker

[permalink] [raw]
Subject: [PATCH v2 4/5] SUNRPC: Remove xdr_align_data() and xdr_expand_hole()

From: Anna Schumaker <[email protected]>

These functions are no longer needed now that the NFS client places data
and hole segments directly.

Signed-off-by: Anna Schumaker <[email protected]>
---
include/linux/sunrpc/xdr.h | 2 --
net/sunrpc/xdr.c | 66 --------------------------------------
2 files changed, 68 deletions(-)

diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h
index 4fd208da1a5e..bc21e313e2e3 100644
--- a/include/linux/sunrpc/xdr.h
+++ b/include/linux/sunrpc/xdr.h
@@ -263,8 +263,6 @@ extern unsigned int xdr_read_pages(struct xdr_stream *xdr, unsigned int len);
extern void xdr_enter_page(struct xdr_stream *xdr, unsigned int len);
extern int xdr_process_buf(const struct xdr_buf *buf, unsigned int offset, unsigned int len, int (*actor)(struct scatterlist *, void *), void *data);
extern void xdr_set_pagelen(struct xdr_stream *, unsigned int len);
-extern unsigned int xdr_align_data(struct xdr_stream *, unsigned int offset, unsigned int length);
-extern unsigned int xdr_expand_hole(struct xdr_stream *, unsigned int offset, unsigned int length);
extern bool xdr_stream_subsegment(struct xdr_stream *xdr, struct xdr_buf *subbuf,
unsigned int len);
extern unsigned int xdr_stream_move_subsegment(struct xdr_stream *xdr, unsigned int offset,
diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
index 5c9ba26875cf..6a3781b2f378 100644
--- a/net/sunrpc/xdr.c
+++ b/net/sunrpc/xdr.c
@@ -1548,72 +1548,6 @@ void xdr_set_pagelen(struct xdr_stream *xdr, unsigned int len)
}
EXPORT_SYMBOL_GPL(xdr_set_pagelen);

-unsigned int xdr_align_data(struct xdr_stream *xdr, unsigned int offset,
- unsigned int length)
-{
- struct xdr_buf *buf = xdr->buf;
- unsigned int from, bytes, len;
- unsigned int shift;
-
- xdr_realign_pages(xdr);
- from = xdr_page_pos(xdr);
-
- if (from >= buf->page_len + buf->tail->iov_len)
- return 0;
- if (from + buf->head->iov_len >= buf->len)
- return 0;
-
- len = buf->len - buf->head->iov_len;
-
- /* We only shift data left! */
- if (WARN_ONCE(from < offset, "SUNRPC: misaligned data src=%u dst=%u\n",
- from, offset))
- return 0;
- if (WARN_ONCE(offset > buf->page_len,
- "SUNRPC: buffer overflow. offset=%u, page_len=%u\n",
- offset, buf->page_len))
- return 0;
-
- /* Move page data to the left */
- shift = from - offset;
- xdr_buf_pages_shift_left(buf, from, len, shift);
-
- bytes = xdr_stream_remaining(xdr);
- if (length > bytes)
- length = bytes;
- bytes -= length;
-
- xdr->buf->len -= shift;
- xdr_set_page(xdr, offset + length, bytes);
- return length;
-}
-EXPORT_SYMBOL_GPL(xdr_align_data);
-
-unsigned int xdr_expand_hole(struct xdr_stream *xdr, unsigned int offset,
- unsigned int length)
-{
- struct xdr_buf *buf = xdr->buf;
- unsigned int from, to, shift;
-
- xdr_realign_pages(xdr);
- from = xdr_page_pos(xdr);
- to = xdr_align_size(offset + length);
-
- /* Could the hole be behind us? */
- if (to > from) {
- unsigned int buflen = buf->len - buf->head->iov_len;
- shift = to - from;
- xdr_buf_try_expand(buf, shift);
- xdr_buf_pages_shift_right(buf, from, buflen, shift);
- xdr_set_page(xdr, to, xdr_stream_remaining(xdr));
- } else if (to != from)
- xdr_align_data(xdr, to, 0);
- xdr_buf_pages_zero(buf, offset, length);
-
- return length;
-}
-EXPORT_SYMBOL_GPL(xdr_expand_hole);
-
/**
* xdr_enter_page - decode data from the XDR page
* @xdr: pointer to xdr_stream struct
--
2.37.0