2021-02-03 16:25:33

by Chuck Lever

Subject: [PATCH v2 0/6] RPC/RDMA client fixes

Hi-

Fresh version of the single patch I posted yesterday, now expanded
with additional fixes.


Changes since v1:
- Respond to review comments
- Split "Remove FMR support" into three patches for clarity
- Fix implicit chunk roundup
- Improve Receive completion tracepoints

---

Chuck Lever (6):
xprtrdma: Remove FMR support in rpcrdma_convert_iovs()
xprtrdma: Simplify rpcrdma_convert_kvec() and frwr_map()
xprtrdma: Refactor invocations of offset_in_page()
rpcrdma: Fix comments about reverse-direction operation
xprtrdma: Pad optimization, revisited
rpcrdma: Capture bytes received in Receive completion tracepoints


include/trace/events/rpcrdma.h | 50 +++++++++++++++++++++-
net/sunrpc/xprtrdma/backchannel.c | 4 +-
net/sunrpc/xprtrdma/frwr_ops.c | 12 ++----
net/sunrpc/xprtrdma/rpc_rdma.c | 17 +++-----
net/sunrpc/xprtrdma/svc_rdma_backchannel.c | 4 +-
net/sunrpc/xprtrdma/xprt_rdma.h | 15 ++++---
6 files changed, 68 insertions(+), 34 deletions(-)

--
Chuck Lever


2021-02-03 16:25:44

by Chuck Lever

Subject: [PATCH v2 1/6] xprtrdma: Remove FMR support in rpcrdma_convert_iovs()

Support for FMR was removed by commit ba69cd122ece ("xprtrdma:
Remove support for FMR memory registration") [Dec 2018]. That means
the buffer-splitting behavior of rpcrdma_convert_kvec(), added by
commit 821c791a0bde ("xprtrdma: Segment head and tail XDR buffers
on page boundaries") [Mar 2016], is no longer necessary. FRWR
memory registration handles this case with aplomb.

Signed-off-by: Chuck Lever <[email protected]>
---
net/sunrpc/xprtrdma/rpc_rdma.c | 19 ++++---------------
1 file changed, 4 insertions(+), 15 deletions(-)

diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index 8f5d0cb68360..832765f3ebba 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -204,9 +204,7 @@ rpcrdma_alloc_sparse_pages(struct xdr_buf *buf)
return 0;
}

-/* Split @vec on page boundaries into SGEs. FMR registers pages, not
- * a byte range. Other modes coalesce these SGEs into a single MR
- * when they can.
+/* Convert @vec to a single SGL element.
*
* Returns pointer to next available SGE, and bumps the total number
* of SGEs consumed.
@@ -215,21 +213,12 @@ static struct rpcrdma_mr_seg *
rpcrdma_convert_kvec(struct kvec *vec, struct rpcrdma_mr_seg *seg,
unsigned int *n)
{
- u32 remaining, page_offset;
- char *base;
-
- base = vec->iov_base;
- page_offset = offset_in_page(base);
- remaining = vec->iov_len;
- while (remaining) {
+ if (vec->iov_len) {
seg->mr_page = NULL;
- seg->mr_offset = base;
- seg->mr_len = min_t(u32, PAGE_SIZE - page_offset, remaining);
- remaining -= seg->mr_len;
- base += seg->mr_len;
+ seg->mr_offset = vec->iov_base;
+ seg->mr_len = vec->iov_len;
++seg;
++(*n);
- page_offset = 0;
}
return seg;
}


2021-02-03 16:25:49

by Chuck Lever

Subject: [PATCH v2 2/6] xprtrdma: Simplify rpcrdma_convert_kvec() and frwr_map()

Clean up.

Remove a conditional branch from the SGL set-up loop in frwr_map():
Instead of using either sg_set_page() or sg_set_buf(), initialize
the mr_page field properly when rpcrdma_convert_kvec() converts the
kvec to an SGL entry. frwr_map() can then invoke sg_set_page()
unconditionally.

Signed-off-by: Chuck Lever <[email protected]>
---
net/sunrpc/xprtrdma/frwr_ops.c | 12 ++++--------
net/sunrpc/xprtrdma/rpc_rdma.c | 2 +-
net/sunrpc/xprtrdma/xprt_rdma.h | 9 +++++----
3 files changed, 10 insertions(+), 13 deletions(-)

diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
index baca49fe83af..13a50f77dddb 100644
--- a/net/sunrpc/xprtrdma/frwr_ops.c
+++ b/net/sunrpc/xprtrdma/frwr_ops.c
@@ -306,14 +306,10 @@ struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt,
if (nsegs > ep->re_max_fr_depth)
nsegs = ep->re_max_fr_depth;
for (i = 0; i < nsegs;) {
- if (seg->mr_page)
- sg_set_page(&mr->mr_sg[i],
- seg->mr_page,
- seg->mr_len,
- offset_in_page(seg->mr_offset));
- else
- sg_set_buf(&mr->mr_sg[i], seg->mr_offset,
- seg->mr_len);
+ sg_set_page(&mr->mr_sg[i],
+ seg->mr_page,
+ seg->mr_len,
+ offset_in_page(seg->mr_offset));

++seg;
++i;
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index 832765f3ebba..529adb6ad4db 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -214,7 +214,7 @@ rpcrdma_convert_kvec(struct kvec *vec, struct rpcrdma_mr_seg *seg,
unsigned int *n)
{
if (vec->iov_len) {
- seg->mr_page = NULL;
+ seg->mr_page = virt_to_page(vec->iov_base);
seg->mr_offset = vec->iov_base;
seg->mr_len = vec->iov_len;
++seg;
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index 94b28657aeeb..02971e183989 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -283,10 +283,11 @@ enum {
RPCRDMA_MAX_IOV_SEGS,
};

-struct rpcrdma_mr_seg { /* chunk descriptors */
- u32 mr_len; /* length of chunk or segment */
- struct page *mr_page; /* owning page, if any */
- char *mr_offset; /* kva if no page, else offset */
+/* Arguments for DMA mapping and registration */
+struct rpcrdma_mr_seg {
+ u32 mr_len; /* length of segment */
+ struct page *mr_page; /* underlying struct page */
+ char *mr_offset; /* IN: page offset, OUT: iova */
};

/* The Send SGE array is provisioned to send a maximum size


2021-02-03 16:26:02

by Chuck Lever

Subject: [PATCH v2 4/6] rpcrdma: Fix comments about reverse-direction operation

During the final stages of publication of RFC 8167, reviewers
requested that we use the term "reverse direction" rather than
"backwards direction". Update comments to reflect this preference.

Signed-off-by: Chuck Lever <[email protected]>
---
net/sunrpc/xprtrdma/backchannel.c | 4 ++--
net/sunrpc/xprtrdma/rpc_rdma.c | 6 +-----
net/sunrpc/xprtrdma/svc_rdma_backchannel.c | 4 ++--
net/sunrpc/xprtrdma/xprt_rdma.h | 6 +++---
4 files changed, 8 insertions(+), 12 deletions(-)

diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c
index 946edf2db646..a249837d6a55 100644
--- a/net/sunrpc/xprtrdma/backchannel.c
+++ b/net/sunrpc/xprtrdma/backchannel.c
@@ -2,7 +2,7 @@
/*
* Copyright (c) 2015-2020, Oracle and/or its affiliates.
*
- * Support for backward direction RPCs on RPC/RDMA.
+ * Support for reverse-direction RPCs on RPC/RDMA.
*/

#include <linux/sunrpc/xprt.h>
@@ -208,7 +208,7 @@ static struct rpc_rqst *rpcrdma_bc_rqst_get(struct rpcrdma_xprt *r_xprt)
}

/**
- * rpcrdma_bc_receive_call - Handle a backward direction call
+ * rpcrdma_bc_receive_call - Handle a reverse-direction Call
* @r_xprt: transport receiving the call
* @rep: receive buffer containing the call
*
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index b3e66b8f65ab..f0af89a43efd 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -1153,14 +1153,10 @@ rpcrdma_is_bcall(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep)
*/
p = xdr_inline_decode(xdr, 3 * sizeof(*p));
if (unlikely(!p))
- goto out_short;
+ return true;

rpcrdma_bc_receive_call(r_xprt, rep);
return true;
-
-out_short:
- pr_warn("RPC/RDMA short backward direction call\n");
- return true;
}
#else /* CONFIG_SUNRPC_BACKCHANNEL */
{
diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
index 63f8be974df2..4a1edbb4028e 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
@@ -2,7 +2,7 @@
/*
* Copyright (c) 2015-2018 Oracle. All rights reserved.
*
- * Support for backward direction RPCs on RPC/RDMA (server-side).
+ * Support for reverse-direction RPCs on RPC/RDMA (server-side).
*/

#include <linux/sunrpc/svc_rdma.h>
@@ -59,7 +59,7 @@ void svc_rdma_handle_bc_reply(struct svc_rqst *rqstp,
spin_unlock(&xprt->queue_lock);
}

-/* Send a backwards direction RPC call.
+/* Send a reverse-direction RPC Call.
*
* Caller holds the connection's mutex and has already marshaled
* the RPC/RDMA request.
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index ed1c5444fb9d..fe3be985e239 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -98,9 +98,9 @@ struct rpcrdma_ep {
atomic_t re_completion_ids;
};

-/* Pre-allocate extra Work Requests for handling backward receives
- * and sends. This is a fixed value because the Work Queues are
- * allocated when the forward channel is set up, long before the
+/* Pre-allocate extra Work Requests for handling reverse-direction
+ * Receives and Sends. This is a fixed value because the Work Queues
+ * are allocated when the forward channel is set up, long before the
* backchannel is provisioned. This value is two times
* NFS4_DEF_CB_SLOT_TABLE_SIZE.
*/


2021-02-03 16:26:22

by Chuck Lever

Subject: [PATCH v2 6/6] rpcrdma: Capture bytes received in Receive completion tracepoints

Signed-off-by: Chuck Lever <[email protected]>
---
include/trace/events/rpcrdma.h | 50 ++++++++++++++++++++++++++++++++++++++--
1 file changed, 48 insertions(+), 2 deletions(-)

diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h
index 76e85e16854b..c838e7ac1c2d 100644
--- a/include/trace/events/rpcrdma.h
+++ b/include/trace/events/rpcrdma.h
@@ -60,6 +60,51 @@ DECLARE_EVENT_CLASS(rpcrdma_completion_class,
), \
TP_ARGS(wc, cid))

+DECLARE_EVENT_CLASS(rpcrdma_receive_completion_class,
+ TP_PROTO(
+ const struct ib_wc *wc,
+ const struct rpc_rdma_cid *cid
+ ),
+
+ TP_ARGS(wc, cid),
+
+ TP_STRUCT__entry(
+ __field(u32, cq_id)
+ __field(int, completion_id)
+ __field(u32, received)
+ __field(unsigned long, status)
+ __field(unsigned int, vendor_err)
+ ),
+
+ TP_fast_assign(
+ __entry->cq_id = cid->ci_queue_id;
+ __entry->completion_id = cid->ci_completion_id;
+ __entry->status = wc->status;
+ if (wc->status) {
+ __entry->received = 0;
+ __entry->vendor_err = wc->vendor_err;
+ } else {
+ __entry->received = wc->byte_len;
+ __entry->vendor_err = 0;
+ }
+ ),
+
+ TP_printk("cq.id=%u cid=%d status=%s (%lu/0x%x) received=%u",
+ __entry->cq_id, __entry->completion_id,
+ rdma_show_wc_status(__entry->status),
+ __entry->status, __entry->vendor_err,
+ __entry->received
+ )
+);
+
+#define DEFINE_RECEIVE_COMPLETION_EVENT(name) \
+ DEFINE_EVENT(rpcrdma_receive_completion_class, name, \
+ TP_PROTO( \
+ const struct ib_wc *wc, \
+ const struct rpc_rdma_cid *cid \
+ ), \
+ TP_ARGS(wc, cid))
+
DECLARE_EVENT_CLASS(xprtrdma_reply_class,
TP_PROTO(
const struct rpcrdma_rep *rep
@@ -838,7 +883,8 @@ TRACE_EVENT(xprtrdma_post_linv_err,
** Completion events
**/

-DEFINE_COMPLETION_EVENT(xprtrdma_wc_receive);
+DEFINE_RECEIVE_COMPLETION_EVENT(xprtrdma_wc_receive);
+
DEFINE_COMPLETION_EVENT(xprtrdma_wc_send);
DEFINE_COMPLETION_EVENT(xprtrdma_wc_fastreg);
DEFINE_COMPLETION_EVENT(xprtrdma_wc_li);
@@ -1790,7 +1836,7 @@ TRACE_EVENT(svcrdma_post_recv,
)
);

-DEFINE_COMPLETION_EVENT(svcrdma_wc_receive);
+DEFINE_RECEIVE_COMPLETION_EVENT(svcrdma_wc_receive);

TRACE_EVENT(svcrdma_rq_post_err,
TP_PROTO(
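
For readers skimming the hunk above: the new TP_printk format renders one
line per Receive completion. With purely illustrative values for cq.id, cid,
and the byte count (not taken from any real trace), a successful completion
would print roughly

	cq.id=3 cid=17 status=SUCCESS (0/0x0) received=240

while a failed completion reports received=0 and carries the wc->vendor_err
in the hex field instead.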


2021-02-03 16:26:36

by Chuck Lever

Subject: [PATCH v2 3/6] xprtrdma: Refactor invocations of offset_in_page()

Clean up so that offset_in_page() is invoked less often in the
most common case, which is mapping xdr->pages.

Signed-off-by: Chuck Lever <[email protected]>
---
net/sunrpc/xprtrdma/frwr_ops.c | 8 +++-----
net/sunrpc/xprtrdma/rpc_rdma.c | 4 ++--
net/sunrpc/xprtrdma/xprt_rdma.h | 2 +-
3 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
index 13a50f77dddb..766a1048a48a 100644
--- a/net/sunrpc/xprtrdma/frwr_ops.c
+++ b/net/sunrpc/xprtrdma/frwr_ops.c
@@ -306,16 +306,14 @@ struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt,
if (nsegs > ep->re_max_fr_depth)
nsegs = ep->re_max_fr_depth;
for (i = 0; i < nsegs;) {
- sg_set_page(&mr->mr_sg[i],
- seg->mr_page,
- seg->mr_len,
- offset_in_page(seg->mr_offset));
+ sg_set_page(&mr->mr_sg[i], seg->mr_page,
+ seg->mr_len, seg->mr_offset);

++seg;
++i;
if (ep->re_mrtype == IB_MR_TYPE_SG_GAPS)
continue;
- if ((i < nsegs && offset_in_page(seg->mr_offset)) ||
+ if ((i < nsegs && seg->mr_offset) ||
offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
break;
}
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index 529adb6ad4db..b3e66b8f65ab 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -215,7 +215,7 @@ rpcrdma_convert_kvec(struct kvec *vec, struct rpcrdma_mr_seg *seg,
{
if (vec->iov_len) {
seg->mr_page = virt_to_page(vec->iov_base);
- seg->mr_offset = vec->iov_base;
+ seg->mr_offset = offset_in_page(vec->iov_base);
seg->mr_len = vec->iov_len;
++seg;
++(*n);
@@ -248,7 +248,7 @@ rpcrdma_convert_iovs(struct rpcrdma_xprt *r_xprt, struct xdr_buf *xdrbuf,
page_base = offset_in_page(xdrbuf->page_base);
while (len) {
seg->mr_page = *ppages;
- seg->mr_offset = (char *)page_base;
+ seg->mr_offset = page_base;
seg->mr_len = min_t(u32, PAGE_SIZE - page_base, len);
len -= seg->mr_len;
++ppages;
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index 02971e183989..ed1c5444fb9d 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -287,7 +287,7 @@ enum {
struct rpcrdma_mr_seg {
u32 mr_len; /* length of segment */
struct page *mr_page; /* underlying struct page */
- char *mr_offset; /* IN: page offset, OUT: iova */
+ u64 mr_offset; /* IN: page offset, OUT: iova */
};

/* The Send SGE array is provisioned to send a maximum size


2021-02-03 16:28:42

by Chuck Lever

Subject: [PATCH v2 5/6] xprtrdma: Pad optimization, revisited

The NetApp Linux team discovered that with NFS/RDMA servers that do
not support RFC 8797, the Linux client is forming NFSv4.x WRITE
requests incorrectly.

In this case, the Linux NFS client disables implicit chunk round-up
for odd-length Read and Write chunks. The goal was to support old
servers that needed that padding to be sent explicitly by clients.

In that case, the Linux NFS client included the tail kvec in the Read
chunk, since the tail contains any needed padding. That meant a separate
memory registration was needed for the tail kvec, adding to the cost of
forming such requests. To avoid that cost for a mere 3 bytes of zeroes
that are always ignored by receivers, we try to use implicit round-up
when possible.
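
To make the round-up arithmetic concrete (the payload size here is
purely illustrative): XDR encodes opaque data padded out to a multiple
of 4 bytes, so an odd-length chunk needs 1 to 3 pad bytes; a 4097-byte
WRITE payload, for example, is followed by 3 zero pad bytes on the wire.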

For NFSv4.x, the tail kvec also sometimes contains a trailing
GETATTR operation. The Linux NFS client is unintentionally
including that GETATTR operation in the Read chunk as well as
inline. Fortunately, servers ignore this craziness and go about
their normal business.

The fix is simply to /never/ include the tail kvec when forming a
data payload Read chunk.

Note that since commit 9ed5af268e88 ("SUNRPC: Clean up the handling
of page padding in rpc_prepare_reply_pages()") the NFS client passes
payload data to the transport with the padding in xdr->pages instead
of in the send buffer's tail kvec. So now the Linux NFS client
appends XDR padding to all odd-sized Read chunks. This shouldn't be
a problem because:

- RFC 8166-compliant servers are supposed to work with or without
that XDR padding in Read chunks.

- Since the padding is now in the same memory region as the data
payload, a separate memory registration is not needed. In
addition, the link layer extends data in RDMA Read responses to
4-byte boundaries anyway. Thus there is now no savings when the
padding is not included.

Because older kernels include the payload's XDR padding in the
tail kvec, a fix there will be more complicated. Thus backporting
this patch is not recommended.

Reported-by: Olga Kornievskaia <[email protected]>
Signed-off-by: Chuck Lever <[email protected]>
---
net/sunrpc/xprtrdma/rpc_rdma.c | 5 +----
1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index f0af89a43efd..f1b52f9ab242 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -257,10 +257,7 @@ rpcrdma_convert_iovs(struct rpcrdma_xprt *r_xprt, struct xdr_buf *xdrbuf,
page_base = 0;
}

- /* When encoding a Read chunk, the tail iovec contains an
- * XDR pad and may be omitted.
- */
- if (type == rpcrdma_readch && r_xprt->rx_ep->re_implicit_roundup)
+ if (type == rpcrdma_readch)
goto out;

/* When encoding a Write chunk, some servers need to see an


2021-02-03 18:08:55

by Tom Talpey

Subject: Re: [PATCH v2 1/6] xprtrdma: Remove FMR support in rpcrdma_convert_iovs()

On 2/3/2021 11:23 AM, Chuck Lever wrote:
> Support for FMR was removed by commit ba69cd122ece ("xprtrdma:
> Remove support for FMR memory registration") [Dec 2018]. That means
> the buffer-splitting behavior of rpcrdma_convert_kvec(), added by
> commit 821c791a0bde ("xprtrdma: Segment head and tail XDR buffers
> on page boundaries") [Mar 2016], is no longer necessary. FRWR
> memory registration handles this case with aplomb.
>
> Signed-off-by: Chuck Lever <[email protected]>
> ---
> net/sunrpc/xprtrdma/rpc_rdma.c | 19 ++++---------------
> 1 file changed, 4 insertions(+), 15 deletions(-)
>
> diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
> index 8f5d0cb68360..832765f3ebba 100644
> --- a/net/sunrpc/xprtrdma/rpc_rdma.c
> +++ b/net/sunrpc/xprtrdma/rpc_rdma.c
> @@ -204,9 +204,7 @@ rpcrdma_alloc_sparse_pages(struct xdr_buf *buf)
> return 0;
> }
>
> -/* Split @vec on page boundaries into SGEs. FMR registers pages, not
> - * a byte range. Other modes coalesce these SGEs into a single MR
> - * when they can.
> +/* Convert @vec to a single SGL element.
> *
> * Returns pointer to next available SGE, and bumps the total number
> * of SGEs consumed.
> @@ -215,21 +213,12 @@ static struct rpcrdma_mr_seg *
> rpcrdma_convert_kvec(struct kvec *vec, struct rpcrdma_mr_seg *seg,
> unsigned int *n)
> {
> - u32 remaining, page_offset;
> - char *base;
> -
> - base = vec->iov_base;
> - page_offset = offset_in_page(base);
> - remaining = vec->iov_len;
> - while (remaining) {
> + if (vec->iov_len) {

This is weird. Is it ever possible for a zero-length segment to be
passed? If so, it's obviously wrong to return an uninitialized "seg"
to the caller. I'd suggest simply asserting that iov_len is != 0.

I guess this was an issue in the existing code, but there, it would
only trigger if *all* the segs were zero.

> seg->mr_page = NULL;
> - seg->mr_offset = base;
> - seg->mr_len = min_t(u32, PAGE_SIZE - page_offset, remaining);
> - remaining -= seg->mr_len;
> - base += seg->mr_len;
> + seg->mr_offset = vec->iov_base;

I thought the previous discussion was that this should be offset_in_page
not the (virtual) iov_base.

Tom.

> + seg->mr_len = vec->iov_len;
> ++seg;
> ++(*n);
> - page_offset = 0;
> }
> return seg;
> }
>
>
>

2021-02-03 18:10:03

by Tom Talpey

Subject: Re: [PATCH v2 2/6] xprtrdma: Simplify rpcrdma_convert_kvec() and frwr_map()

Reviewed-By: Tom Talpey <[email protected]>

On 2/3/2021 11:24 AM, Chuck Lever wrote:
> Clean up.
>
> Remove a conditional branch from the SGL set-up loop in frwr_map():
> Instead of using either sg_set_page() or sg_set_buf(), initialize
> the mr_page field properly when rpcrdma_convert_kvec() converts the
> kvec to an SGL entry. frwr_map() can then invoke sg_set_page()
> unconditionally.
>
> Signed-off-by: Chuck Lever <[email protected]>
> ---
> net/sunrpc/xprtrdma/frwr_ops.c | 12 ++++--------
> net/sunrpc/xprtrdma/rpc_rdma.c | 2 +-
> net/sunrpc/xprtrdma/xprt_rdma.h | 9 +++++----
> 3 files changed, 10 insertions(+), 13 deletions(-)
>
> diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
> index baca49fe83af..13a50f77dddb 100644
> --- a/net/sunrpc/xprtrdma/frwr_ops.c
> +++ b/net/sunrpc/xprtrdma/frwr_ops.c
> @@ -306,14 +306,10 @@ struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt,
> if (nsegs > ep->re_max_fr_depth)
> nsegs = ep->re_max_fr_depth;
> for (i = 0; i < nsegs;) {
> - if (seg->mr_page)
> - sg_set_page(&mr->mr_sg[i],
> - seg->mr_page,
> - seg->mr_len,
> - offset_in_page(seg->mr_offset));
> - else
> - sg_set_buf(&mr->mr_sg[i], seg->mr_offset,
> - seg->mr_len);
> + sg_set_page(&mr->mr_sg[i],
> + seg->mr_page,
> + seg->mr_len,
> + offset_in_page(seg->mr_offset));
>
> ++seg;
> ++i;
> diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
> index 832765f3ebba..529adb6ad4db 100644
> --- a/net/sunrpc/xprtrdma/rpc_rdma.c
> +++ b/net/sunrpc/xprtrdma/rpc_rdma.c
> @@ -214,7 +214,7 @@ rpcrdma_convert_kvec(struct kvec *vec, struct rpcrdma_mr_seg *seg,
> unsigned int *n)
> {
> if (vec->iov_len) {
> - seg->mr_page = NULL;
> + seg->mr_page = virt_to_page(vec->iov_base);
> seg->mr_offset = vec->iov_base;
> seg->mr_len = vec->iov_len;
> ++seg;
> diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
> index 94b28657aeeb..02971e183989 100644
> --- a/net/sunrpc/xprtrdma/xprt_rdma.h
> +++ b/net/sunrpc/xprtrdma/xprt_rdma.h
> @@ -283,10 +283,11 @@ enum {
> RPCRDMA_MAX_IOV_SEGS,
> };
>
> -struct rpcrdma_mr_seg { /* chunk descriptors */
> - u32 mr_len; /* length of chunk or segment */
> - struct page *mr_page; /* owning page, if any */
> - char *mr_offset; /* kva if no page, else offset */
> +/* Arguments for DMA mapping and registration */
> +struct rpcrdma_mr_seg {
> + u32 mr_len; /* length of segment */
> + struct page *mr_page; /* underlying struct page */
> + char *mr_offset; /* IN: page offset, OUT: iova */
> };
>
> /* The Send SGE array is provisioned to send a maximum size
>
>
>

2021-02-03 18:11:14

by Chuck Lever

Subject: Re: [PATCH v2 1/6] xprtrdma: Remove FMR support in rpcrdma_convert_iovs()



> On Feb 3, 2021, at 1:06 PM, Tom Talpey <[email protected]> wrote:
>
> On 2/3/2021 11:23 AM, Chuck Lever wrote:
>> Support for FMR was removed by commit ba69cd122ece ("xprtrdma:
>> Remove support for FMR memory registration") [Dec 2018]. That means
>> the buffer-splitting behavior of rpcrdma_convert_kvec(), added by
>> commit 821c791a0bde ("xprtrdma: Segment head and tail XDR buffers
>> on page boundaries") [Mar 2016], is no longer necessary. FRWR
>> memory registration handles this case with aplomb.
>> Signed-off-by: Chuck Lever <[email protected]>
>> ---
>> net/sunrpc/xprtrdma/rpc_rdma.c | 19 ++++---------------
>> 1 file changed, 4 insertions(+), 15 deletions(-)
>> diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
>> index 8f5d0cb68360..832765f3ebba 100644
>> --- a/net/sunrpc/xprtrdma/rpc_rdma.c
>> +++ b/net/sunrpc/xprtrdma/rpc_rdma.c
>> @@ -204,9 +204,7 @@ rpcrdma_alloc_sparse_pages(struct xdr_buf *buf)
>> return 0;
>> }
>> -/* Split @vec on page boundaries into SGEs. FMR registers pages, not
>> - * a byte range. Other modes coalesce these SGEs into a single MR
>> - * when they can.
>> +/* Convert @vec to a single SGL element.
>> *
>> * Returns pointer to next available SGE, and bumps the total number
>> * of SGEs consumed.
>> @@ -215,21 +213,12 @@ static struct rpcrdma_mr_seg *
>> rpcrdma_convert_kvec(struct kvec *vec, struct rpcrdma_mr_seg *seg,
>> unsigned int *n)
>> {
>> - u32 remaining, page_offset;
>> - char *base;
>> -
>> - base = vec->iov_base;
>> - page_offset = offset_in_page(base);
>> - remaining = vec->iov_len;
>> - while (remaining) {
>> + if (vec->iov_len) {
>
> This is weird. Is it ever possible for a zero-length segment to be
> passed? If so, it's obviously wrong to return an uninitialized "seg"
> to the caller. I'd suggest simply asserting that iov_len is != 0.
>
> I guess this was an issue in the existing code, but there, it would
> only trigger if *all* the segs were zero.

It would be a bug if head.iov_len was zero.

tail.iov_len is checked before the call. I think that means this
check is superfluous and can be removed.
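
For reference, a rough sketch of those call sites in rpcrdma_convert_iovs()
(paraphrased, not a verbatim quote of the current tree):

	if (pos == 0)
		seg = rpcrdma_convert_kvec(&xdrbuf->head[0], seg, &n);
	...
	if (xdrbuf->tail[0].iov_len)
		seg = rpcrdma_convert_kvec(&xdrbuf->tail[0], seg, &n);

So the head kvec is converted whenever the chunk starts at the head, and
the tail kvec is already length-checked by the caller.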


>
>> seg->mr_page = NULL;
>> - seg->mr_offset = base;
>> - seg->mr_len = min_t(u32, PAGE_SIZE - page_offset, remaining);
>> - remaining -= seg->mr_len;
>> - base += seg->mr_len;
>> + seg->mr_offset = vec->iov_base;
>
> I thought the previous discussion was that this should be offset_in_page
> not the (virtual) iov_base.

Addressed in 3/6?


>
> Tom.
>
>> + seg->mr_len = vec->iov_len;
>> ++seg;
>> ++(*n);
>> - page_offset = 0;
>> }
>> return seg;
>> }

--
Chuck Lever



2021-02-03 18:11:46

by Tom Talpey

Subject: Re: [PATCH v2 3/6] xprtrdma: Refactor invocations of offset_in_page()

This looks good, but the earlier 1/6 patch depends on the offset_in_page
conversion in rpcrdma_convert_kvec. Won't that complicate any bisection?

Reviewed-By: Tom Talpey <[email protected]>

On 2/3/2021 11:24 AM, Chuck Lever wrote:
> Clean up so that offset_in_page() is invoked less often in the
> most common case, which is mapping xdr->pages.
>
> Signed-off-by: Chuck Lever <[email protected]>
> ---
> net/sunrpc/xprtrdma/frwr_ops.c | 8 +++-----
> net/sunrpc/xprtrdma/rpc_rdma.c | 4 ++--
> net/sunrpc/xprtrdma/xprt_rdma.h | 2 +-
> 3 files changed, 6 insertions(+), 8 deletions(-)
>
> diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
> index 13a50f77dddb..766a1048a48a 100644
> --- a/net/sunrpc/xprtrdma/frwr_ops.c
> +++ b/net/sunrpc/xprtrdma/frwr_ops.c
> @@ -306,16 +306,14 @@ struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt,
> if (nsegs > ep->re_max_fr_depth)
> nsegs = ep->re_max_fr_depth;
> for (i = 0; i < nsegs;) {
> - sg_set_page(&mr->mr_sg[i],
> - seg->mr_page,
> - seg->mr_len,
> - offset_in_page(seg->mr_offset));
> + sg_set_page(&mr->mr_sg[i], seg->mr_page,
> + seg->mr_len, seg->mr_offset);
>
> ++seg;
> ++i;
> if (ep->re_mrtype == IB_MR_TYPE_SG_GAPS)
> continue;
> - if ((i < nsegs && offset_in_page(seg->mr_offset)) ||
> + if ((i < nsegs && seg->mr_offset) ||
> offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
> break;
> }
> diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
> index 529adb6ad4db..b3e66b8f65ab 100644
> --- a/net/sunrpc/xprtrdma/rpc_rdma.c
> +++ b/net/sunrpc/xprtrdma/rpc_rdma.c
> @@ -215,7 +215,7 @@ rpcrdma_convert_kvec(struct kvec *vec, struct rpcrdma_mr_seg *seg,
> {
> if (vec->iov_len) {
> seg->mr_page = virt_to_page(vec->iov_base);
> - seg->mr_offset = vec->iov_base;
> + seg->mr_offset = offset_in_page(vec->iov_base);
> seg->mr_len = vec->iov_len;
> ++seg;
> ++(*n);
> @@ -248,7 +248,7 @@ rpcrdma_convert_iovs(struct rpcrdma_xprt *r_xprt, struct xdr_buf *xdrbuf,
> page_base = offset_in_page(xdrbuf->page_base);
> while (len) {
> seg->mr_page = *ppages;
> - seg->mr_offset = (char *)page_base;
> + seg->mr_offset = page_base;
> seg->mr_len = min_t(u32, PAGE_SIZE - page_base, len);
> len -= seg->mr_len;
> ++ppages;
> diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
> index 02971e183989..ed1c5444fb9d 100644
> --- a/net/sunrpc/xprtrdma/xprt_rdma.h
> +++ b/net/sunrpc/xprtrdma/xprt_rdma.h
> @@ -287,7 +287,7 @@ enum {
> struct rpcrdma_mr_seg {
> u32 mr_len; /* length of segment */
> struct page *mr_page; /* underlying struct page */
> - char *mr_offset; /* IN: page offset, OUT: iova */
> + u64 mr_offset; /* IN: page offset, OUT: iova */
> };
>
> /* The Send SGE array is provisioned to send a maximum size
>
>
>

2021-02-03 18:13:11

by Tom Talpey

Subject: Re: [PATCH v2 4/6] rpcrdma: Fix comments about reverse-direction operation

Reviewed-By: Tom Talpey <[email protected]>

On 2/3/2021 11:24 AM, Chuck Lever wrote:
> During the final stages of publication of RFC 8167, reviewers
> requested that we use the term "reverse direction" rather than
> "backwards direction". Update comments to reflect this preference.
>
> Signed-off-by: Chuck Lever <[email protected]>
> ---
> net/sunrpc/xprtrdma/backchannel.c | 4 ++--
> net/sunrpc/xprtrdma/rpc_rdma.c | 6 +-----
> net/sunrpc/xprtrdma/svc_rdma_backchannel.c | 4 ++--
> net/sunrpc/xprtrdma/xprt_rdma.h | 6 +++---
> 4 files changed, 8 insertions(+), 12 deletions(-)
>
> diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c
> index 946edf2db646..a249837d6a55 100644
> --- a/net/sunrpc/xprtrdma/backchannel.c
> +++ b/net/sunrpc/xprtrdma/backchannel.c
> @@ -2,7 +2,7 @@
> /*
> * Copyright (c) 2015-2020, Oracle and/or its affiliates.
> *
> - * Support for backward direction RPCs on RPC/RDMA.
> + * Support for reverse-direction RPCs on RPC/RDMA.
> */
>
> #include <linux/sunrpc/xprt.h>
> @@ -208,7 +208,7 @@ static struct rpc_rqst *rpcrdma_bc_rqst_get(struct rpcrdma_xprt *r_xprt)
> }
>
> /**
> - * rpcrdma_bc_receive_call - Handle a backward direction call
> + * rpcrdma_bc_receive_call - Handle a reverse-direction Call
> * @r_xprt: transport receiving the call
> * @rep: receive buffer containing the call
> *
> diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
> index b3e66b8f65ab..f0af89a43efd 100644
> --- a/net/sunrpc/xprtrdma/rpc_rdma.c
> +++ b/net/sunrpc/xprtrdma/rpc_rdma.c
> @@ -1153,14 +1153,10 @@ rpcrdma_is_bcall(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep)
> */
> p = xdr_inline_decode(xdr, 3 * sizeof(*p));
> if (unlikely(!p))
> - goto out_short;
> + return true;
>
> rpcrdma_bc_receive_call(r_xprt, rep);
> return true;
> -
> -out_short:
> - pr_warn("RPC/RDMA short backward direction call\n");
> - return true;
> }
> #else /* CONFIG_SUNRPC_BACKCHANNEL */
> {
> diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
> index 63f8be974df2..4a1edbb4028e 100644
> --- a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
> +++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
> @@ -2,7 +2,7 @@
> /*
> * Copyright (c) 2015-2018 Oracle. All rights reserved.
> *
> - * Support for backward direction RPCs on RPC/RDMA (server-side).
> + * Support for reverse-direction RPCs on RPC/RDMA (server-side).
> */
>
> #include <linux/sunrpc/svc_rdma.h>
> @@ -59,7 +59,7 @@ void svc_rdma_handle_bc_reply(struct svc_rqst *rqstp,
> spin_unlock(&xprt->queue_lock);
> }
>
> -/* Send a backwards direction RPC call.
> +/* Send a reverse-direction RPC Call.
> *
> * Caller holds the connection's mutex and has already marshaled
> * the RPC/RDMA request.
> diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
> index ed1c5444fb9d..fe3be985e239 100644
> --- a/net/sunrpc/xprtrdma/xprt_rdma.h
> +++ b/net/sunrpc/xprtrdma/xprt_rdma.h
> @@ -98,9 +98,9 @@ struct rpcrdma_ep {
> atomic_t re_completion_ids;
> };
>
> -/* Pre-allocate extra Work Requests for handling backward receives
> - * and sends. This is a fixed value because the Work Queues are
> - * allocated when the forward channel is set up, long before the
> +/* Pre-allocate extra Work Requests for handling reverse-direction
> + * Receives and Sends. This is a fixed value because the Work Queues
> + * are allocated when the forward channel is set up, long before the
> * backchannel is provisioned. This value is two times
> * NFS4_DEF_CB_SLOT_TABLE_SIZE.
> */
>
>
>

2021-02-03 18:17:54

by Tom Talpey

Subject: Re: [PATCH v2 6/6] rpcrdma: Capture bytes received in Receive completion tracepoints

Acked-By: Tom Talpey <[email protected]>

On 2/3/2021 11:24 AM, Chuck Lever wrote:
> Signed-off-by: Chuck Lever <[email protected]>
> ---
> include/trace/events/rpcrdma.h | 50 ++++++++++++++++++++++++++++++++++++++--
> 1 file changed, 48 insertions(+), 2 deletions(-)
>
> diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h
> index 76e85e16854b..c838e7ac1c2d 100644
> --- a/include/trace/events/rpcrdma.h
> +++ b/include/trace/events/rpcrdma.h
> @@ -60,6 +60,51 @@ DECLARE_EVENT_CLASS(rpcrdma_completion_class,
> ), \
> TP_ARGS(wc, cid))
>
> +DECLARE_EVENT_CLASS(rpcrdma_receive_completion_class,
> + TP_PROTO(
> + const struct ib_wc *wc,
> + const struct rpc_rdma_cid *cid
> + ),
> +
> + TP_ARGS(wc, cid),
> +
> + TP_STRUCT__entry(
> + __field(u32, cq_id)
> + __field(int, completion_id)
> + __field(u32, received)
> + __field(unsigned long, status)
> + __field(unsigned int, vendor_err)
> + ),
> +
> + TP_fast_assign(
> + __entry->cq_id = cid->ci_queue_id;
> + __entry->completion_id = cid->ci_completion_id;
> + __entry->status = wc->status;
> + if (wc->status) {
> + __entry->received = 0;
> + __entry->vendor_err = wc->vendor_err;
> + } else {
> + __entry->received = wc->byte_len;
> + __entry->vendor_err = 0;
> + }
> + ),
> +
> + TP_printk("cq.id=%u cid=%d status=%s (%lu/0x%x) received=%u",
> + __entry->cq_id, __entry->completion_id,
> + rdma_show_wc_status(__entry->status),
> + __entry->status, __entry->vendor_err,
> + __entry->received
> + )
> +);
> +
> +#define DEFINE_RECEIVE_COMPLETION_EVENT(name) \
> + DEFINE_EVENT(rpcrdma_receive_completion_class, name, \
> + TP_PROTO( \
> + const struct ib_wc *wc, \
> + const struct rpc_rdma_cid *cid \
> + ), \
> + TP_ARGS(wc, cid))
> +
> DECLARE_EVENT_CLASS(xprtrdma_reply_class,
> TP_PROTO(
> const struct rpcrdma_rep *rep
> @@ -838,7 +883,8 @@ TRACE_EVENT(xprtrdma_post_linv_err,
> ** Completion events
> **/
>
> -DEFINE_COMPLETION_EVENT(xprtrdma_wc_receive);
> +DEFINE_RECEIVE_COMPLETION_EVENT(xprtrdma_wc_receive);
> +
> DEFINE_COMPLETION_EVENT(xprtrdma_wc_send);
> DEFINE_COMPLETION_EVENT(xprtrdma_wc_fastreg);
> DEFINE_COMPLETION_EVENT(xprtrdma_wc_li);
> @@ -1790,7 +1836,7 @@ TRACE_EVENT(svcrdma_post_recv,
> )
> );
>
> -DEFINE_COMPLETION_EVENT(svcrdma_wc_receive);
> +DEFINE_RECEIVE_COMPLETION_EVENT(svcrdma_wc_receive);
>
> TRACE_EVENT(svcrdma_rq_post_err,
> TP_PROTO(
>
>
>

2021-02-03 18:19:37

by Chuck Lever

Subject: Re: [PATCH v2 3/6] xprtrdma: Refactor invocations of offset_in_page()



> On Feb 3, 2021, at 1:09 PM, Tom Talpey <[email protected]> wrote:
>
> This looks good, but the earlier 1/6 patch depends on the offset_in_page
> conversion in rpcrdma_convert_kvec.

I don't think it does... sg_set_buf() handles the offset_in_page() calculation
in that case.
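
For reference, sg_set_buf() in include/linux/scatterlist.h is essentially
this wrapper (sketch, with the CONFIG_DEBUG_SG sanity check trimmed):

	static inline void sg_set_buf(struct scatterlist *sg, const void *buf,
				      unsigned int buflen)
	{
		sg_set_page(sg, virt_to_page(buf), buflen, offset_in_page(buf));
	}

So with 1/6 alone, the kvec path still goes through sg_set_buf(), which
derives the page and the in-page offset from the kva itself.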


> Won't that complicate any bisection?
>
> Reviewed-By: Tom Talpey <[email protected]>
>
> On 2/3/2021 11:24 AM, Chuck Lever wrote:
>> Clean up so that offset_in_page() is invoked less often in the
>> most common case, which is mapping xdr->pages.
>> Signed-off-by: Chuck Lever <[email protected]>
>> ---
>> net/sunrpc/xprtrdma/frwr_ops.c | 8 +++-----
>> net/sunrpc/xprtrdma/rpc_rdma.c | 4 ++--
>> net/sunrpc/xprtrdma/xprt_rdma.h | 2 +-
>> 3 files changed, 6 insertions(+), 8 deletions(-)
>> diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
>> index 13a50f77dddb..766a1048a48a 100644
>> --- a/net/sunrpc/xprtrdma/frwr_ops.c
>> +++ b/net/sunrpc/xprtrdma/frwr_ops.c
>> @@ -306,16 +306,14 @@ struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt,
>> if (nsegs > ep->re_max_fr_depth)
>> nsegs = ep->re_max_fr_depth;
>> for (i = 0; i < nsegs;) {
>> - sg_set_page(&mr->mr_sg[i],
>> - seg->mr_page,
>> - seg->mr_len,
>> - offset_in_page(seg->mr_offset));
>> + sg_set_page(&mr->mr_sg[i], seg->mr_page,
>> + seg->mr_len, seg->mr_offset);
>> ++seg;
>> ++i;
>> if (ep->re_mrtype == IB_MR_TYPE_SG_GAPS)
>> continue;
>> - if ((i < nsegs && offset_in_page(seg->mr_offset)) ||
>> + if ((i < nsegs && seg->mr_offset) ||
>> offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
>> break;
>> }
>> diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
>> index 529adb6ad4db..b3e66b8f65ab 100644
>> --- a/net/sunrpc/xprtrdma/rpc_rdma.c
>> +++ b/net/sunrpc/xprtrdma/rpc_rdma.c
>> @@ -215,7 +215,7 @@ rpcrdma_convert_kvec(struct kvec *vec, struct rpcrdma_mr_seg *seg,
>> {
>> if (vec->iov_len) {
>> seg->mr_page = virt_to_page(vec->iov_base);
>> - seg->mr_offset = vec->iov_base;
>> + seg->mr_offset = offset_in_page(vec->iov_base);
>> seg->mr_len = vec->iov_len;
>> ++seg;
>> ++(*n);
>> @@ -248,7 +248,7 @@ rpcrdma_convert_iovs(struct rpcrdma_xprt *r_xprt, struct xdr_buf *xdrbuf,
>> page_base = offset_in_page(xdrbuf->page_base);
>> while (len) {
>> seg->mr_page = *ppages;
>> - seg->mr_offset = (char *)page_base;
>> + seg->mr_offset = page_base;
>> seg->mr_len = min_t(u32, PAGE_SIZE - page_base, len);
>> len -= seg->mr_len;
>> ++ppages;
>> diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
>> index 02971e183989..ed1c5444fb9d 100644
>> --- a/net/sunrpc/xprtrdma/xprt_rdma.h
>> +++ b/net/sunrpc/xprtrdma/xprt_rdma.h
>> @@ -287,7 +287,7 @@ enum {
>> struct rpcrdma_mr_seg {
>> u32 mr_len; /* length of segment */
>> struct page *mr_page; /* underlying struct page */
>> - char *mr_offset; /* IN: page offset, OUT: iova */
>> + u64 mr_offset; /* IN: page offset, OUT: iova */
>> };
>> /* The Send SGE array is provisioned to send a maximum size

--
Chuck Lever



2021-02-03 18:21:01

by Tom Talpey

Subject: Re: [PATCH v2 5/6] xprtrdma: Pad optimization, revisited

This is a safe and obviously warranted processing revision.

The changelog is quite an eyeful for a one-liner, and maybe only
makes sense to the truly dedicated reader. But...

Reviewed-By: Tom Talpey <[email protected]>

On 2/3/2021 11:24 AM, Chuck Lever wrote:
> The NetApp Linux team discovered that with NFS/RDMA servers that do
> not support RFC 8797, the Linux client is forming NFSv4.x WRITE
> requests incorrectly.
>
> In this case, the Linux NFS client disables implicit chunk round-up
> for odd-length Read and Write chunks. The goal was to support old
> servers that needed that padding to be sent explicitly by clients.
>
> In that case, the Linux NFS client included the tail kvec in the Read
> chunk, since the tail contains any needed padding. That meant a separate
> memory registration was needed for the tail kvec, adding to the cost of
> forming such requests. To avoid that cost for a mere 3 bytes of zeroes
> that are always ignored by receivers, we try to use implicit round-up
> when possible.
>
> For NFSv4.x, the tail kvec also sometimes contains a trailing
> GETATTR operation. The Linux NFS client is unintentionally
> including that GETATTR operation in the Read chunk as well as
> inline. Fortunately, servers ignore this craziness and go about
> their normal business.
>
> The fix is simply to /never/ include the tail kvec when forming a
> data payload Read chunk.
>
> Note that since commit 9ed5af268e88 ("SUNRPC: Clean up the handling
> of page padding in rpc_prepare_reply_pages()") the NFS client passes
> payload data to the transport with the padding in xdr->pages instead
> of in the send buffer's tail kvec. So now the Linux NFS client
> appends XDR padding to all odd-sized Read chunks. This shouldn't be
> a problem because:
>
> - RFC 8166-compliant servers are supposed to work with or without
> that XDR padding in Read chunks.
>
> - Since the padding is now in the same memory region as the data
> payload, a separate memory registration is not needed. In
> addition, the link layer extends data in RDMA Read responses to
> 4-byte boundaries anyway. Thus there is now no savings when the
> padding is not included.
>
> Because older kernels include the payload's XDR padding in the
> tail kvec, a fix there will be more complicated. Thus backporting
> this patch is not recommended.
>
> Reported-by: Olga Kornievskaia <[email protected]>
> Signed-off-by: Chuck Lever <[email protected]>
> ---
> net/sunrpc/xprtrdma/rpc_rdma.c | 5 +----
> 1 file changed, 1 insertion(+), 4 deletions(-)
>
> diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
> index f0af89a43efd..f1b52f9ab242 100644
> --- a/net/sunrpc/xprtrdma/rpc_rdma.c
> +++ b/net/sunrpc/xprtrdma/rpc_rdma.c
> @@ -257,10 +257,7 @@ rpcrdma_convert_iovs(struct rpcrdma_xprt *r_xprt, struct xdr_buf *xdrbuf,
> page_base = 0;
> }
>
> - /* When encoding a Read chunk, the tail iovec contains an
> - * XDR pad and may be omitted.
> - */
> - if (type == rpcrdma_readch && r_xprt->rx_ep->re_implicit_roundup)
> + if (type == rpcrdma_readch)
> goto out;
>
> /* When encoding a Write chunk, some servers need to see an
>
>
>

2021-02-03 18:21:35

by Tom Talpey

Subject: Re: [PATCH v2 3/6] xprtrdma: Refactor invocations of offset_in_page()

On 2/3/2021 1:11 PM, Chuck Lever wrote:
>
>
>> On Feb 3, 2021, at 1:09 PM, Tom Talpey <[email protected]> wrote:
>>
>> This looks good, but the earlier 1/6 patch depends on the offset_in_page
>> conversion in rpcrdma_convert_kvec.
>
> I don't think it does... sg_set_buf() handles the offset_in_page() calculation
> in that case.

Ah, ok. And offset_in_page can be applied repeatedly, as well.
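
For the record, offset_in_page() is just a mask (per its definition in
include/linux/mm.h), so applying it to a value that is already a page
offset is a no-op:

	#define offset_in_page(p)	((unsigned long)(p) & ~PAGE_MASK)

	/* hence: offset_in_page(offset_in_page(p)) == offset_in_page(p) */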

Tom.

>> Won't that complicate any bisection?
>>
>> Reviewed-By: Tom Talpey <[email protected]>
>>
>> On 2/3/2021 11:24 AM, Chuck Lever wrote:
>>> Clean up so that offset_in_page() is invoked less often in the
>>> most common case, which is mapping xdr->pages.
>>> Signed-off-by: Chuck Lever <[email protected]>
>>> ---
>>> net/sunrpc/xprtrdma/frwr_ops.c | 8 +++-----
>>> net/sunrpc/xprtrdma/rpc_rdma.c | 4 ++--
>>> net/sunrpc/xprtrdma/xprt_rdma.h | 2 +-
>>> 3 files changed, 6 insertions(+), 8 deletions(-)
>>> diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
>>> index 13a50f77dddb..766a1048a48a 100644
>>> --- a/net/sunrpc/xprtrdma/frwr_ops.c
>>> +++ b/net/sunrpc/xprtrdma/frwr_ops.c
>>> @@ -306,16 +306,14 @@ struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt,
>>> if (nsegs > ep->re_max_fr_depth)
>>> nsegs = ep->re_max_fr_depth;
>>> for (i = 0; i < nsegs;) {
>>> - sg_set_page(&mr->mr_sg[i],
>>> - seg->mr_page,
>>> - seg->mr_len,
>>> - offset_in_page(seg->mr_offset));
>>> + sg_set_page(&mr->mr_sg[i], seg->mr_page,
>>> + seg->mr_len, seg->mr_offset);
>>> ++seg;
>>> ++i;
>>> if (ep->re_mrtype == IB_MR_TYPE_SG_GAPS)
>>> continue;
>>> - if ((i < nsegs && offset_in_page(seg->mr_offset)) ||
>>> + if ((i < nsegs && seg->mr_offset) ||
>>> offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
>>> break;
>>> }
>>> diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
>>> index 529adb6ad4db..b3e66b8f65ab 100644
>>> --- a/net/sunrpc/xprtrdma/rpc_rdma.c
>>> +++ b/net/sunrpc/xprtrdma/rpc_rdma.c
>>> @@ -215,7 +215,7 @@ rpcrdma_convert_kvec(struct kvec *vec, struct rpcrdma_mr_seg *seg,
>>> {
>>> if (vec->iov_len) {
>>> seg->mr_page = virt_to_page(vec->iov_base);
>>> - seg->mr_offset = vec->iov_base;
>>> + seg->mr_offset = offset_in_page(vec->iov_base);
>>> seg->mr_len = vec->iov_len;
>>> ++seg;
>>> ++(*n);
>>> @@ -248,7 +248,7 @@ rpcrdma_convert_iovs(struct rpcrdma_xprt *r_xprt, struct xdr_buf *xdrbuf,
>>> page_base = offset_in_page(xdrbuf->page_base);
>>> while (len) {
>>> seg->mr_page = *ppages;
>>> - seg->mr_offset = (char *)page_base;
>>> + seg->mr_offset = page_base;
>>> seg->mr_len = min_t(u32, PAGE_SIZE - page_base, len);
>>> len -= seg->mr_len;
>>> ++ppages;
>>> diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
>>> index 02971e183989..ed1c5444fb9d 100644
>>> --- a/net/sunrpc/xprtrdma/xprt_rdma.h
>>> +++ b/net/sunrpc/xprtrdma/xprt_rdma.h
>>> @@ -287,7 +287,7 @@ enum {
>>> struct rpcrdma_mr_seg {
>>> u32 mr_len; /* length of segment */
>>> struct page *mr_page; /* underlying struct page */
>>> - char *mr_offset; /* IN: page offset, OUT: iova */
>>> + u64 mr_offset; /* IN: page offset, OUT: iova */
>>> };
>>> /* The Send SGE array is provisioned to send a maximum size
>
> --
> Chuck Lever
>
>
>
>