2021-03-29 14:41:20

by Chuck Lever III

[permalink] [raw]
Subject: [PATCH v1 0/6] Restructure how svcrdma handles RDMA Read

Hi-

I'm on a "mission" to reduce the amount of page allocator churn
that happens during server-side RPC call processing.

In the case where svcrdma needs to perform an RDMA Read to pull
Read chunks from a client, currently two calls to svc_rdma_recvfrom
are necessary:

Call 1:

An ingress Receive is processed. If there are Read chunks, the
transport strips the pages out of the passed-in svc_rqst, sets up
the RDMA Read with those pages and posts the Read, then returns
the svc_rqst with missing pages. Return value of zero means no
RPC Call is ready yet.

---

Meanwhile svc has to refill those pages in the svc_rqst to
prepare it for the next incoming RPC.

---

Call 2:

The Read completion has occurred. The transport strips the pages
out of the passed-in svc_rqst and frees them. It copies the
pages from the Read sink buffer into the svc_rqst, and returns
the completed RPC.

---


Instead, let's do the RDMA Reads synchronously with a single call to
svc_rdma_recvfrom(). When svc_rdma_recvfrom() handles an ingress
Receive with Read chunks, set up the RDMA Read sink buffer using the
available pages in the svc_rqst, and wait right there for
completion.

For large Read chunks, this avoids freeing a bunch of pages and then
allocating them again. Fewer trips to the page allocator means less
time with IRQs disabled, and measurably faster turn-around for the
next RPC.

Also, now that XPT_BUSY is cleared immediately when a Receive is
dequeued, the transport can handle a higher rate of incoming RPCs.


---

Chuck Lever (6):
SUNRPC: Export svc_xprt_received()
SUNRPC: Move svc_xprt_received() call sites
svcrdma: Single-stage RDMA Read
svcrdma: Remove sc_read_complete_q
svcrdma: Remove svc_rdma_recv_ctxt::rc_pages and ::rc_arg
svcrdma: Clean up dto_q critical section in svc_rdma_recvfrom()


include/linux/sunrpc/svc_rdma.h | 5 --
net/sunrpc/svc_xprt.c | 7 +-
net/sunrpc/svcsock.c | 9 +-
net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 76 +++-------------
net/sunrpc/xprtrdma/svc_rdma_rw.c | 108 ++++++++---------------
net/sunrpc/xprtrdma/svc_rdma_transport.c | 1 -
6 files changed, 60 insertions(+), 146 deletions(-)

--
Chuck Lever


2021-03-29 14:41:30

by Chuck Lever III

[permalink] [raw]
Subject: [PATCH v1 1/6] SUNRPC: Export svc_xprt_received()

Prepare svc_xprt_received() to be called from transport code instead
of from generic RPC server code.

Signed-off-by: Chuck Lever <[email protected]>
---
include/linux/sunrpc/svc_xprt.h | 1 +
include/trace/events/sunrpc.h | 1 +
net/sunrpc/svc_xprt.c | 13 +++++++++----
3 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h
index 34dacadfe517..571f605bc91e 100644
--- a/include/linux/sunrpc/svc_xprt.h
+++ b/include/linux/sunrpc/svc_xprt.h
@@ -130,6 +130,7 @@ void svc_xprt_init(struct net *, struct svc_xprt_class *, struct svc_xprt *,
int svc_create_xprt(struct svc_serv *, const char *, struct net *,
const int, const unsigned short, int,
const struct cred *);
+void svc_xprt_received(struct svc_xprt *xprt);
void svc_xprt_do_enqueue(struct svc_xprt *xprt);
void svc_xprt_enqueue(struct svc_xprt *xprt);
void svc_xprt_put(struct svc_xprt *xprt);
diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h
index 036eb1f5c133..bda16e9e6ba7 100644
--- a/include/trace/events/sunrpc.h
+++ b/include/trace/events/sunrpc.h
@@ -1781,6 +1781,7 @@ DECLARE_EVENT_CLASS(svc_xprt_event,
), \
TP_ARGS(xprt))

+DEFINE_SVC_XPRT_EVENT(received);
DEFINE_SVC_XPRT_EVENT(no_write_space);
DEFINE_SVC_XPRT_EVENT(close);
DEFINE_SVC_XPRT_EVENT(detach);
diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
index b134fc5f3b8d..9d1374e82e90 100644
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -247,21 +247,25 @@ static struct svc_xprt *__svc_xpo_create(struct svc_xprt_class *xcl,
return xprt;
}

-/*
- * svc_xprt_received conditionally queues the transport for processing
- * by another thread. The caller must hold the XPT_BUSY bit and must
+/**
+ * svc_xprt_received - start next receiver thread
+ * @xprt: controlling transport
+ *
+ * The caller must hold the XPT_BUSY bit and must
* not thereafter touch transport data.
*
* Note: XPT_DATA only gets cleared when a read-attempt finds no (or
* insufficient) data.
*/
-static void svc_xprt_received(struct svc_xprt *xprt)
+void svc_xprt_received(struct svc_xprt *xprt)
{
if (!test_bit(XPT_BUSY, &xprt->xpt_flags)) {
WARN_ONCE(1, "xprt=0x%p already busy!", xprt);
return;
}

+ trace_svc_xprt_received(xprt);
+
/* As soon as we clear busy, the xprt could be closed and
* 'put', so we need a reference to call svc_enqueue_xprt with:
*/
@@ -271,6 +275,7 @@ static void svc_xprt_received(struct svc_xprt *xprt)
xprt->xpt_server->sv_ops->svo_enqueue_xprt(xprt);
svc_xprt_put(xprt);
}
+EXPORT_SYMBOL_GPL(svc_xprt_received);

void svc_add_new_perm_xprt(struct svc_serv *serv, struct svc_xprt *new)
{


2021-03-29 14:41:55

by Chuck Lever III

[permalink] [raw]
Subject: [PATCH v1 6/6] svcrdma: Clean up dto_q critical section in svc_rdma_recvfrom()

This, to me, seems less cluttered and less redundant. I was hoping
it could help reduce lock contention on the dto_q lock by reducing
the size of the critical section, but alas, the only improvement is
readability.

Signed-off-by: Chuck Lever <[email protected]>
---
net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 16 +++++++---------
1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
index 357e3ae01991..1cf0b04b632a 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -794,22 +794,20 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)

rqstp->rq_xprt_ctxt = NULL;

+ ctxt = NULL;
spin_lock(&rdma_xprt->sc_rq_dto_lock);
ctxt = svc_rdma_next_recv_ctxt(&rdma_xprt->sc_rq_dto_q);
- if (!ctxt) {
+ if (ctxt)
+ list_del(&ctxt->rc_list);
+ else
/* No new incoming requests, terminate the loop */
clear_bit(XPT_DATA, &xprt->xpt_flags);
- spin_unlock(&rdma_xprt->sc_rq_dto_lock);
- svc_xprt_received(xprt);
- return 0;
- }
- list_del(&ctxt->rc_list);
spin_unlock(&rdma_xprt->sc_rq_dto_lock);
- percpu_counter_inc(&svcrdma_stat_recv);
-
- /* Start receiving the next incoming message */
svc_xprt_received(xprt);
+ if (!ctxt)
+ return 0;

+ percpu_counter_inc(&svcrdma_stat_recv);
ib_dma_sync_single_for_cpu(rdma_xprt->sc_pd->device,
ctxt->rc_recv_sge.addr, ctxt->rc_byte_len,
DMA_FROM_DEVICE);


2021-03-29 14:41:56

by Chuck Lever III

[permalink] [raw]
Subject: [PATCH v1 5/6] svcrdma: Remove svc_rdma_recv_ctxt::rc_pages and ::rc_arg

These fields are no longer used.

The size of struct svc_rdma_recv_ctxt is now less than 300 bytes on
x86_64, down from 2440 bytes.

Signed-off-by: Chuck Lever <[email protected]>
---
include/linux/sunrpc/svc_rdma.h | 3 ---
net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 5 -----
net/sunrpc/xprtrdma/svc_rdma_rw.c | 12 ------------
3 files changed, 20 deletions(-)

diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h
index b72f75091404..3184465de3a0 100644
--- a/include/linux/sunrpc/svc_rdma.h
+++ b/include/linux/sunrpc/svc_rdma.h
@@ -134,7 +134,6 @@ struct svc_rdma_recv_ctxt {
struct rpc_rdma_cid rc_cid;
struct ib_sge rc_recv_sge;
void *rc_recv_buf;
- struct xdr_buf rc_arg;
struct xdr_stream rc_stream;
bool rc_temp;
u32 rc_byte_len;
@@ -148,8 +147,6 @@ struct svc_rdma_recv_ctxt {
struct svc_rdma_chunk *rc_cur_result_payload;
struct svc_rdma_pcl rc_write_pcl;
struct svc_rdma_pcl rc_reply_pcl;
-
- struct page *rc_pages[RPCSVC_MAXPAGES];
};

struct svc_rdma_send_ctxt {
diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
index 7c7eca07b965..357e3ae01991 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -227,11 +227,6 @@ struct svc_rdma_recv_ctxt *svc_rdma_recv_ctxt_get(struct svcxprt_rdma *rdma)
void svc_rdma_recv_ctxt_put(struct svcxprt_rdma *rdma,
struct svc_rdma_recv_ctxt *ctxt)
{
- unsigned int i;
-
- for (i = 0; i < ctxt->rc_page_count; i++)
- put_page(ctxt->rc_pages[i]);
-
pcl_free(&ctxt->rc_call_pcl);
pcl_free(&ctxt->rc_read_pcl);
pcl_free(&ctxt->rc_write_pcl);
diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c
index 9163ab690288..11a7d4b6209e 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_rw.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c
@@ -1083,18 +1083,6 @@ int svc_rdma_process_read_list(struct svcxprt_rdma *rdma,
struct svc_rdma_chunk_ctxt *cc;
int ret;

- /* The request (with page list) is constructed in
- * head->rc_arg. Pages involved with RDMA Read I/O are
- * transferred there.
- */
- head->rc_arg.head[0] = rqstp->rq_arg.head[0];
- head->rc_arg.tail[0] = rqstp->rq_arg.tail[0];
- head->rc_arg.pages = head->rc_pages;
- head->rc_arg.page_base = 0;
- head->rc_arg.page_len = 0;
- head->rc_arg.len = rqstp->rq_arg.len;
- head->rc_arg.buflen = rqstp->rq_arg.buflen;
-
info = svc_rdma_read_info_alloc(rdma);
if (!info)
return -ENOMEM;


2021-03-29 14:42:12

by Chuck Lever III

[permalink] [raw]
Subject: [PATCH v1 4/6] svcrdma: Remove sc_read_complete_q

Now that svc_rdma_recvfrom() waits for Read completion,
sc_read_complete_q is no longer used.

Signed-off-by: Chuck Lever <[email protected]>
---
include/linux/sunrpc/svc_rdma.h | 2 -
net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 57 +++---------------------------
net/sunrpc/xprtrdma/svc_rdma_transport.c | 1 -
3 files changed, 6 insertions(+), 54 deletions(-)

diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h
index 6e621e1f56b8..b72f75091404 100644
--- a/include/linux/sunrpc/svc_rdma.h
+++ b/include/linux/sunrpc/svc_rdma.h
@@ -106,7 +106,6 @@ struct svcxprt_rdma {

wait_queue_head_t sc_send_wait; /* SQ exhaustion waitlist */
unsigned long sc_flags;
- struct list_head sc_read_complete_q;
struct work_struct sc_work;

struct llist_head sc_recv_ctxts;
@@ -140,7 +139,6 @@ struct svc_rdma_recv_ctxt {
bool rc_temp;
u32 rc_byte_len;
unsigned int rc_page_count;
- unsigned int rc_hdr_count;
u32 rc_inv_rkey;
__be32 rc_msgtype;

diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
index b857a6805e95..7c7eca07b965 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -89,8 +89,7 @@
* svc_rdma_recvfrom call returns.
*
* During the second svc_rdma_recvfrom call, RDMA Read sink pages
- * are transferred from the svc_rdma_recv_ctxt to the second svc_rqst
- * (see rdma_read_complete() below).
+ * are transferred from the svc_rdma_recv_ctxt to the second svc_rqst.
*/

#include <linux/slab.h>
@@ -379,10 +378,6 @@ void svc_rdma_flush_recv_queues(struct svcxprt_rdma *rdma)
{
struct svc_rdma_recv_ctxt *ctxt;

- while ((ctxt = svc_rdma_next_recv_ctxt(&rdma->sc_read_complete_q))) {
- list_del(&ctxt->rc_list);
- svc_rdma_recv_ctxt_put(rdma, ctxt);
- }
while ((ctxt = svc_rdma_next_recv_ctxt(&rdma->sc_rq_dto_q))) {
list_del(&ctxt->rc_list);
svc_rdma_recv_ctxt_put(rdma, ctxt);
@@ -720,35 +715,6 @@ static int svc_rdma_xdr_decode_req(struct xdr_buf *rq_arg,
return -EINVAL;
}

-static void rdma_read_complete(struct svc_rqst *rqstp,
- struct svc_rdma_recv_ctxt *head)
-{
- int page_no;
-
- /* Move Read chunk pages to rqstp so that they will be released
- * when svc_process is done with them.
- */
- for (page_no = 0; page_no < head->rc_page_count; page_no++) {
- put_page(rqstp->rq_pages[page_no]);
- rqstp->rq_pages[page_no] = head->rc_pages[page_no];
- }
- head->rc_page_count = 0;
-
- /* Point rq_arg.pages past header */
- rqstp->rq_arg.pages = &rqstp->rq_pages[head->rc_hdr_count];
- rqstp->rq_arg.page_len = head->rc_arg.page_len;
-
- /* rq_respages starts after the last arg page */
- rqstp->rq_respages = &rqstp->rq_pages[page_no];
- rqstp->rq_next_page = rqstp->rq_respages + 1;
-
- /* Rebuild rq_arg head and tail. */
- rqstp->rq_arg.head[0] = head->rc_arg.head[0];
- rqstp->rq_arg.tail[0] = head->rc_arg.tail[0];
- rqstp->rq_arg.len = head->rc_arg.len;
- rqstp->rq_arg.buflen = head->rc_arg.buflen;
-}
-
static void svc_rdma_send_error(struct svcxprt_rdma *rdma,
struct svc_rdma_recv_ctxt *rctxt,
int status)
@@ -834,13 +800,6 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
rqstp->rq_xprt_ctxt = NULL;

spin_lock(&rdma_xprt->sc_rq_dto_lock);
- ctxt = svc_rdma_next_recv_ctxt(&rdma_xprt->sc_read_complete_q);
- if (ctxt) {
- list_del(&ctxt->rc_list);
- spin_unlock(&rdma_xprt->sc_rq_dto_lock);
- rdma_read_complete(rqstp, ctxt);
- goto complete;
- }
ctxt = svc_rdma_next_recv_ctxt(&rdma_xprt->sc_rq_dto_q);
if (!ctxt) {
/* No new incoming requests, terminate the loop */
@@ -880,21 +839,17 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
svc_rdma_get_inv_rkey(rdma_xprt, ctxt);

if (!pcl_is_empty(&ctxt->rc_read_pcl) ||
- !pcl_is_empty(&ctxt->rc_call_pcl))
- goto out_readlist;
+ !pcl_is_empty(&ctxt->rc_call_pcl)) {
+ ret = svc_rdma_process_read_list(rdma_xprt, rqstp, ctxt);
+ if (ret < 0)
+ goto out_readfail;
+ }

-complete:
rqstp->rq_xprt_ctxt = ctxt;
rqstp->rq_prot = IPPROTO_MAX;
svc_xprt_copy_addrs(rqstp, xprt);
return rqstp->rq_arg.len;

-out_readlist:
- ret = svc_rdma_process_read_list(rdma_xprt, rqstp, ctxt);
- if (ret < 0)
- goto out_readfail;
- goto complete;
-
out_err:
svc_rdma_send_error(rdma_xprt, ctxt, ret);
svc_rdma_recv_ctxt_put(rdma_xprt, ctxt);
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index 3646216211c5..d94b7759ada1 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -136,7 +136,6 @@ static struct svcxprt_rdma *svc_rdma_create_xprt(struct svc_serv *serv,
svc_xprt_init(net, &svc_rdma_class, &cma_xprt->sc_xprt, serv);
INIT_LIST_HEAD(&cma_xprt->sc_accept_q);
INIT_LIST_HEAD(&cma_xprt->sc_rq_dto_q);
- INIT_LIST_HEAD(&cma_xprt->sc_read_complete_q);
INIT_LIST_HEAD(&cma_xprt->sc_send_ctxts);
init_llist_head(&cma_xprt->sc_recv_ctxts);
INIT_LIST_HEAD(&cma_xprt->sc_rw_ctxts);


2021-03-29 14:42:12

by Chuck Lever III

[permalink] [raw]
Subject: [PATCH v1 3/6] svcrdma: Single-stage RDMA Read

Currently the generic RPC server layer calls svc_rdma_recvfrom()
twice to retrieve an RPC message that uses Read chunks. I'm not
exactly sure why this design was chosen originally.

Instead, let's wait for the Read chunk completion inline in the
first call to svc_rdma_recvfrom().

The goal is to eliminate some page allocator churn.
rdma_read_complete() replaces pages in the second svc_rqst by
calling put_page() repeatedly while the upper layer waits for
the request to be constructed, which adds unnecessary round-
trip latency.

Signed-off-by: Chuck Lever <[email protected]>
---
net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 10 +--
net/sunrpc/xprtrdma/svc_rdma_rw.c | 96 +++++++++++--------------------
2 files changed, 39 insertions(+), 67 deletions(-)

diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
index 9cb5a09c4a01..b857a6805e95 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -853,6 +853,9 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
spin_unlock(&rdma_xprt->sc_rq_dto_lock);
percpu_counter_inc(&svcrdma_stat_recv);

+ /* Start receiving the next incoming message */
+ svc_xprt_received(xprt);
+
ib_dma_sync_single_for_cpu(rdma_xprt->sc_pd->device,
ctxt->rc_recv_sge.addr, ctxt->rc_byte_len,
DMA_FROM_DEVICE);
@@ -884,33 +887,28 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
rqstp->rq_xprt_ctxt = ctxt;
rqstp->rq_prot = IPPROTO_MAX;
svc_xprt_copy_addrs(rqstp, xprt);
- svc_xprt_received(xprt);
return rqstp->rq_arg.len;

out_readlist:
ret = svc_rdma_process_read_list(rdma_xprt, rqstp, ctxt);
if (ret < 0)
goto out_readfail;
- svc_xprt_received(xprt);
- return 0;
+ goto complete;

out_err:
svc_rdma_send_error(rdma_xprt, ctxt, ret);
svc_rdma_recv_ctxt_put(rdma_xprt, ctxt);
- svc_xprt_received(xprt);
return 0;

out_readfail:
if (ret == -EINVAL)
svc_rdma_send_error(rdma_xprt, ctxt, ret);
svc_rdma_recv_ctxt_put(rdma_xprt, ctxt);
- svc_xprt_received(xprt);
return ret;

out_backchannel:
svc_rdma_handle_bc_reply(rqstp, ctxt);
out_drop:
svc_rdma_recv_ctxt_put(rdma_xprt, ctxt);
- svc_xprt_received(xprt);
return 0;
}
diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c
index d7054e3a8e33..9163ab690288 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_rw.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c
@@ -150,6 +150,8 @@ struct svc_rdma_chunk_ctxt {
struct svcxprt_rdma *cc_rdma;
struct list_head cc_rwctxts;
int cc_sqecount;
+ enum ib_wc_status cc_status;
+ struct completion cc_done;
};

static void svc_rdma_cc_cid_init(struct svcxprt_rdma *rdma,
@@ -299,29 +301,15 @@ static void svc_rdma_wc_read_done(struct ib_cq *cq, struct ib_wc *wc)
struct svc_rdma_chunk_ctxt *cc =
container_of(cqe, struct svc_rdma_chunk_ctxt, cc_cqe);
struct svcxprt_rdma *rdma = cc->cc_rdma;
- struct svc_rdma_read_info *info =
- container_of(cc, struct svc_rdma_read_info, ri_cc);

trace_svcrdma_wc_read(wc, &cc->cc_cid);

atomic_add(cc->cc_sqecount, &rdma->sc_sq_avail);
wake_up(&rdma->sc_send_wait);

- if (unlikely(wc->status != IB_WC_SUCCESS)) {
- set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags);
- svc_rdma_recv_ctxt_put(rdma, info->ri_readctxt);
- } else {
- spin_lock(&rdma->sc_rq_dto_lock);
- list_add_tail(&info->ri_readctxt->rc_list,
- &rdma->sc_read_complete_q);
- /* Note the unlock pairs with the smp_rmb in svc_xprt_ready: */
- set_bit(XPT_DATA, &rdma->sc_xprt.xpt_flags);
- spin_unlock(&rdma->sc_rq_dto_lock);
-
- svc_xprt_enqueue(&rdma->sc_xprt);
- }
-
- svc_rdma_read_info_free(info);
+ cc->cc_status = wc->status;
+ complete(&cc->cc_done);
+ return;
}

/* This function sleeps when the transport's Send Queue is congested.
@@ -676,8 +664,8 @@ static int svc_rdma_build_read_segment(struct svc_rdma_read_info *info,
struct svc_rdma_recv_ctxt *head = info->ri_readctxt;
struct svc_rdma_chunk_ctxt *cc = &info->ri_cc;
struct svc_rqst *rqstp = info->ri_rqst;
- struct svc_rdma_rw_ctxt *ctxt;
unsigned int sge_no, seg_len, len;
+ struct svc_rdma_rw_ctxt *ctxt;
struct scatterlist *sg;
int ret;

@@ -693,8 +681,8 @@ static int svc_rdma_build_read_segment(struct svc_rdma_read_info *info,
seg_len = min_t(unsigned int, len,
PAGE_SIZE - info->ri_pageoff);

- head->rc_arg.pages[info->ri_pageno] =
- rqstp->rq_pages[info->ri_pageno];
+ /* XXX: ri_pageno and rc_page_count might be exactly the same */
+
if (!info->ri_pageoff)
head->rc_page_count++;

@@ -788,12 +776,10 @@ static int svc_rdma_copy_inline_range(struct svc_rdma_read_info *info,
page_len = min_t(unsigned int, remaining,
PAGE_SIZE - info->ri_pageoff);

- head->rc_arg.pages[info->ri_pageno] =
- rqstp->rq_pages[info->ri_pageno];
if (!info->ri_pageoff)
head->rc_page_count++;

- dst = page_address(head->rc_arg.pages[info->ri_pageno]);
+ dst = page_address(rqstp->rq_pages[info->ri_pageno]);
memcpy(dst + info->ri_pageno, src + offset, page_len);

info->ri_totalbytes += page_len;
@@ -813,7 +799,7 @@ static int svc_rdma_copy_inline_range(struct svc_rdma_read_info *info,
* svc_rdma_read_multiple_chunks - Construct RDMA Reads to pull data item Read chunks
* @info: context for RDMA Reads
*
- * The chunk data lands in head->rc_arg as a series of contiguous pages,
+ * The chunk data lands in rqstp->rq_arg as a series of contiguous pages,
* like an incoming TCP call.
*
* Return values:
@@ -827,8 +813,8 @@ static noinline int svc_rdma_read_multiple_chunks(struct svc_rdma_read_info *inf
{
struct svc_rdma_recv_ctxt *head = info->ri_readctxt;
const struct svc_rdma_pcl *pcl = &head->rc_read_pcl;
+ struct xdr_buf *buf = &info->ri_rqst->rq_arg;
struct svc_rdma_chunk *chunk, *next;
- struct xdr_buf *buf = &head->rc_arg;
unsigned int start, length;
int ret;

@@ -864,9 +850,9 @@ static noinline int svc_rdma_read_multiple_chunks(struct svc_rdma_read_info *inf
buf->len += info->ri_totalbytes;
buf->buflen += info->ri_totalbytes;

- head->rc_hdr_count = 1;
- buf->head[0].iov_base = page_address(head->rc_pages[0]);
+ buf->head[0].iov_base = page_address(info->ri_rqst->rq_pages[0]);
buf->head[0].iov_len = min_t(size_t, PAGE_SIZE, info->ri_totalbytes);
+ buf->pages = &info->ri_rqst->rq_pages[1];
buf->page_len = info->ri_totalbytes - buf->head[0].iov_len;
return 0;
}
@@ -875,9 +861,9 @@ static noinline int svc_rdma_read_multiple_chunks(struct svc_rdma_read_info *inf
* svc_rdma_read_data_item - Construct RDMA Reads to pull data item Read chunks
* @info: context for RDMA Reads
*
- * The chunk data lands in the page list of head->rc_arg.pages.
+ * The chunk data lands in the page list of rqstp->rq_arg.pages.
*
- * Currently NFSD does not look at the head->rc_arg.tail[0] iovec.
+ * Currently NFSD does not look at the rqstp->rq_arg.tail[0] kvec.
* Therefore, XDR round-up of the Read chunk and trailing
* inline content must both be added at the end of the pagelist.
*
@@ -891,7 +877,7 @@ static noinline int svc_rdma_read_multiple_chunks(struct svc_rdma_read_info *inf
static int svc_rdma_read_data_item(struct svc_rdma_read_info *info)
{
struct svc_rdma_recv_ctxt *head = info->ri_readctxt;
- struct xdr_buf *buf = &head->rc_arg;
+ struct xdr_buf *buf = &info->ri_rqst->rq_arg;
struct svc_rdma_chunk *chunk;
unsigned int length;
int ret;
@@ -901,8 +887,6 @@ static int svc_rdma_read_data_item(struct svc_rdma_read_info *info)
if (ret < 0)
goto out;

- head->rc_hdr_count = 0;
-
/* Split the Receive buffer between the head and tail
* buffers at Read chunk's position. XDR roundup of the
* chunk is not included in either the pagelist or in
@@ -921,7 +905,8 @@ static int svc_rdma_read_data_item(struct svc_rdma_read_info *info)
* Currently these chunks always start at page offset 0,
* thus the rounded-up length never crosses a page boundary.
*/
- length = XDR_QUADLEN(info->ri_totalbytes) << 2;
+ buf->pages = &info->ri_rqst->rq_pages[0];
+ length = xdr_align_size(chunk->ch_length);
buf->page_len = length;
buf->len += length;
buf->buflen += length;
@@ -1033,8 +1018,7 @@ static int svc_rdma_read_call_chunk(struct svc_rdma_read_info *info)
* @info: context for RDMA Reads
*
* The start of the data lands in the first page just after the
- * Transport header, and the rest lands in the page list of
- * head->rc_arg.pages.
+ * Transport header, and the rest lands in rqstp->rq_arg.pages.
*
* Assumptions:
* - A PZRC is never sent in an RDMA_MSG message, though it's
@@ -1049,8 +1033,7 @@ static int svc_rdma_read_call_chunk(struct svc_rdma_read_info *info)
*/
static noinline int svc_rdma_read_special(struct svc_rdma_read_info *info)
{
- struct svc_rdma_recv_ctxt *head = info->ri_readctxt;
- struct xdr_buf *buf = &head->rc_arg;
+ struct xdr_buf *buf = &info->ri_rqst->rq_arg;
int ret;

ret = svc_rdma_read_call_chunk(info);
@@ -1060,35 +1043,15 @@ static noinline int svc_rdma_read_special(struct svc_rdma_read_info *info)
buf->len += info->ri_totalbytes;
buf->buflen += info->ri_totalbytes;

- head->rc_hdr_count = 1;
- buf->head[0].iov_base = page_address(head->rc_pages[0]);
+ buf->head[0].iov_base = page_address(info->ri_rqst->rq_pages[0]);
buf->head[0].iov_len = min_t(size_t, PAGE_SIZE, info->ri_totalbytes);
+ buf->pages = &info->ri_rqst->rq_pages[1];
buf->page_len = info->ri_totalbytes - buf->head[0].iov_len;

out:
return ret;
}

-/* Pages under I/O have been copied to head->rc_pages. Ensure they
- * are not released by svc_xprt_release() until the I/O is complete.
- *
- * This has to be done after all Read WRs are constructed to properly
- * handle a page that is part of I/O on behalf of two different RDMA
- * segments.
- *
- * Do this only if I/O has been posted. Otherwise, we do indeed want
- * svc_xprt_release() to clean things up properly.
- */
-static void svc_rdma_save_io_pages(struct svc_rqst *rqstp,
- const unsigned int start,
- const unsigned int num_pages)
-{
- unsigned int i;
-
- for (i = start; i < num_pages + start; i++)
- rqstp->rq_pages[i] = NULL;
-}
-
/**
* svc_rdma_process_read_list - Pull list of Read chunks from the client
* @rdma: controlling RDMA transport
@@ -1153,11 +1116,22 @@ int svc_rdma_process_read_list(struct svcxprt_rdma *rdma,
goto out_err;

trace_svcrdma_post_read_chunk(&cc->cc_cid, cc->cc_sqecount);
+ init_completion(&cc->cc_done);
ret = svc_rdma_post_chunk_ctxt(cc);
if (ret < 0)
goto out_err;
- svc_rdma_save_io_pages(rqstp, 0, head->rc_page_count);
- return 1;
+
+ ret = 1;
+ wait_for_completion(&cc->cc_done);
+ if (cc->cc_status != IB_WC_SUCCESS)
+ ret = -EIO;
+
+ /* rq_respages starts after the last arg page */
+ rqstp->rq_respages = &rqstp->rq_pages[head->rc_page_count];
+ rqstp->rq_next_page = rqstp->rq_respages + 1;
+
+ /* Ensure svc_rdma_recv_ctxt_put() does not try to release pages */
+ head->rc_page_count = 0;

out_err:
svc_rdma_read_info_free(info);


2021-03-31 18:36:58

by Tom Talpey

[permalink] [raw]
Subject: Re: [PATCH v1 3/6] svcrdma: Single-stage RDMA Read

On 3/29/2021 10:40 AM, Chuck Lever wrote:
> Currently the generic RPC server layer calls svc_rdma_recvfrom()
> twice to retrieve an RPC message that uses Read chunks. I'm not
> exactly sure why this design was chosen originally.

I'm not either, but remember the design was written over a decade
ago. I vaguely recall there was some bounce buffering for strange
memreg corner cases. The RDMA stack has improved greatly.

> Instead, let's wait for the Read chunk completion inline in the
> first call to svc_rdma_recvfrom().
>
> The goal is to eliminate some page allocator churn.
> rdma_read_complete() replaces pages in the second svc_rqst by
> calling put_page() repeatedly while the upper layer waits for
> the request to be constructed, which adds unnecessary round-
> trip latency.

Local API round-trip, right? Same wire traffic either way. In fact,
I don't see any Verbs changes too.

Some comments/question below.

> Signed-off-by: Chuck Lever <[email protected]>
> ---
> net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 10 +--
> net/sunrpc/xprtrdma/svc_rdma_rw.c | 96 +++++++++++--------------------
> 2 files changed, 39 insertions(+), 67 deletions(-)
>
> diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
> index 9cb5a09c4a01..b857a6805e95 100644
> --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
> +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
> @@ -853,6 +853,9 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
> spin_unlock(&rdma_xprt->sc_rq_dto_lock);
> percpu_counter_inc(&svcrdma_stat_recv);
>
> + /* Start receiving the next incoming message */

This comment confused me. This call just unblocks the xprt to move
to the next message, it does not necessarily "start". So IIUC, it
might be clearer to state "transport processing complete" or similar.

> + svc_xprt_received(xprt);
> +
> ib_dma_sync_single_for_cpu(rdma_xprt->sc_pd->device,
> ctxt->rc_recv_sge.addr, ctxt->rc_byte_len,
> DMA_FROM_DEVICE);
> @@ -884,33 +887,28 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
> rqstp->rq_xprt_ctxt = ctxt;
> rqstp->rq_prot = IPPROTO_MAX;
> svc_xprt_copy_addrs(rqstp, xprt);
> - svc_xprt_received(xprt);
> return rqstp->rq_arg.len;
>
> out_readlist:
> ret = svc_rdma_process_read_list(rdma_xprt, rqstp, ctxt);
> if (ret < 0)
> goto out_readfail;
> - svc_xprt_received(xprt);
> - return 0;
> + goto complete;
>
> out_err:
> svc_rdma_send_error(rdma_xprt, ctxt, ret);
> svc_rdma_recv_ctxt_put(rdma_xprt, ctxt);
> - svc_xprt_received(xprt);
> return 0;
>
> out_readfail:
> if (ret == -EINVAL)
> svc_rdma_send_error(rdma_xprt, ctxt, ret);
> svc_rdma_recv_ctxt_put(rdma_xprt, ctxt);
> - svc_xprt_received(xprt);
> return ret;
>
> out_backchannel:
> svc_rdma_handle_bc_reply(rqstp, ctxt);
> out_drop:
> svc_rdma_recv_ctxt_put(rdma_xprt, ctxt);
> - svc_xprt_received(xprt);
> return 0;
> }
> diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c
> index d7054e3a8e33..9163ab690288 100644
> --- a/net/sunrpc/xprtrdma/svc_rdma_rw.c
> +++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c
> @@ -150,6 +150,8 @@ struct svc_rdma_chunk_ctxt {
> struct svcxprt_rdma *cc_rdma;
> struct list_head cc_rwctxts;
> int cc_sqecount;
> + enum ib_wc_status cc_status;
> + struct completion cc_done;
> };
>
> static void svc_rdma_cc_cid_init(struct svcxprt_rdma *rdma,
> @@ -299,29 +301,15 @@ static void svc_rdma_wc_read_done(struct ib_cq *cq, struct ib_wc *wc)
> struct svc_rdma_chunk_ctxt *cc =
> container_of(cqe, struct svc_rdma_chunk_ctxt, cc_cqe);
> struct svcxprt_rdma *rdma = cc->cc_rdma;
> - struct svc_rdma_read_info *info =
> - container_of(cc, struct svc_rdma_read_info, ri_cc);
>
> trace_svcrdma_wc_read(wc, &cc->cc_cid);
>
> atomic_add(cc->cc_sqecount, &rdma->sc_sq_avail);
> wake_up(&rdma->sc_send_wait);
>
> - if (unlikely(wc->status != IB_WC_SUCCESS)) {
> - set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags);
> - svc_rdma_recv_ctxt_put(rdma, info->ri_readctxt);
> - } else {
> - spin_lock(&rdma->sc_rq_dto_lock);
> - list_add_tail(&info->ri_readctxt->rc_list,
> - &rdma->sc_read_complete_q);
> - /* Note the unlock pairs with the smp_rmb in svc_xprt_ready: */
> - set_bit(XPT_DATA, &rdma->sc_xprt.xpt_flags);
> - spin_unlock(&rdma->sc_rq_dto_lock);
> -
> - svc_xprt_enqueue(&rdma->sc_xprt);
> - }
> -
> - svc_rdma_read_info_free(info);
> + cc->cc_status = wc->status;
> + complete(&cc->cc_done);
> + return;
> }
>
> /* This function sleeps when the transport's Send Queue is congested.
> @@ -676,8 +664,8 @@ static int svc_rdma_build_read_segment(struct svc_rdma_read_info *info,
> struct svc_rdma_recv_ctxt *head = info->ri_readctxt;
> struct svc_rdma_chunk_ctxt *cc = &info->ri_cc;
> struct svc_rqst *rqstp = info->ri_rqst;
> - struct svc_rdma_rw_ctxt *ctxt;
> unsigned int sge_no, seg_len, len;
> + struct svc_rdma_rw_ctxt *ctxt;
> struct scatterlist *sg;
> int ret;
>
> @@ -693,8 +681,8 @@ static int svc_rdma_build_read_segment(struct svc_rdma_read_info *info,
> seg_len = min_t(unsigned int, len,
> PAGE_SIZE - info->ri_pageoff);
>
> - head->rc_arg.pages[info->ri_pageno] =
> - rqstp->rq_pages[info->ri_pageno];
> + /* XXX: ri_pageno and rc_page_count might be exactly the same */
> +

What is this comment conveying? It looks like a note-to-self that
resulted in deleting the prior line. If the "XXX" notation is
still significant, it needs more detail on what needs to be
fixed in future.

> if (!info->ri_pageoff)
> head->rc_page_count++;
>
> @@ -788,12 +776,10 @@ static int svc_rdma_copy_inline_range(struct svc_rdma_read_info *info,
> page_len = min_t(unsigned int, remaining,
> PAGE_SIZE - info->ri_pageoff);
>
> - head->rc_arg.pages[info->ri_pageno] =
> - rqstp->rq_pages[info->ri_pageno];
> if (!info->ri_pageoff)
> head->rc_page_count++;
>
> - dst = page_address(head->rc_arg.pages[info->ri_pageno]);
> + dst = page_address(rqstp->rq_pages[info->ri_pageno]);
> memcpy(dst + info->ri_pageno, src + offset, page_len);
>
> info->ri_totalbytes += page_len;
> @@ -813,7 +799,7 @@ static int svc_rdma_copy_inline_range(struct svc_rdma_read_info *info,
> * svc_rdma_read_multiple_chunks - Construct RDMA Reads to pull data item Read chunks
> * @info: context for RDMA Reads
> *
> - * The chunk data lands in head->rc_arg as a series of contiguous pages,
> + * The chunk data lands in rqstp->rq_arg as a series of contiguous pages,
> * like an incoming TCP call.
> *
> * Return values:
> @@ -827,8 +813,8 @@ static noinline int svc_rdma_read_multiple_chunks(struct svc_rdma_read_info *inf
> {
> struct svc_rdma_recv_ctxt *head = info->ri_readctxt;
> const struct svc_rdma_pcl *pcl = &head->rc_read_pcl;
> + struct xdr_buf *buf = &info->ri_rqst->rq_arg;
> struct svc_rdma_chunk *chunk, *next;
> - struct xdr_buf *buf = &head->rc_arg;
> unsigned int start, length;
> int ret;
>
> @@ -864,9 +850,9 @@ static noinline int svc_rdma_read_multiple_chunks(struct svc_rdma_read_info *inf
> buf->len += info->ri_totalbytes;
> buf->buflen += info->ri_totalbytes;
>
> - head->rc_hdr_count = 1;
> - buf->head[0].iov_base = page_address(head->rc_pages[0]);
> + buf->head[0].iov_base = page_address(info->ri_rqst->rq_pages[0]);
> buf->head[0].iov_len = min_t(size_t, PAGE_SIZE, info->ri_totalbytes);
> + buf->pages = &info->ri_rqst->rq_pages[1];
> buf->page_len = info->ri_totalbytes - buf->head[0].iov_len;
> return 0;
> }
> @@ -875,9 +861,9 @@ static noinline int svc_rdma_read_multiple_chunks(struct svc_rdma_read_info *inf
> * svc_rdma_read_data_item - Construct RDMA Reads to pull data item Read chunks
> * @info: context for RDMA Reads
> *
> - * The chunk data lands in the page list of head->rc_arg.pages.
> + * The chunk data lands in the page list of rqstp->rq_arg.pages.
> *
> - * Currently NFSD does not look at the head->rc_arg.tail[0] iovec.
> + * Currently NFSD does not look at the rqstp->rq_arg.tail[0] kvec.
> * Therefore, XDR round-up of the Read chunk and trailing
> * inline content must both be added at the end of the pagelist.
> *
> @@ -891,7 +877,7 @@ static noinline int svc_rdma_read_multiple_chunks(struct svc_rdma_read_info *inf
> static int svc_rdma_read_data_item(struct svc_rdma_read_info *info)
> {
> struct svc_rdma_recv_ctxt *head = info->ri_readctxt;
> - struct xdr_buf *buf = &head->rc_arg;
> + struct xdr_buf *buf = &info->ri_rqst->rq_arg;
> struct svc_rdma_chunk *chunk;
> unsigned int length;
> int ret;
> @@ -901,8 +887,6 @@ static int svc_rdma_read_data_item(struct svc_rdma_read_info *info)
> if (ret < 0)
> goto out;
>
> - head->rc_hdr_count = 0;
> -
> /* Split the Receive buffer between the head and tail
> * buffers at Read chunk's position. XDR roundup of the
> * chunk is not included in either the pagelist or in
> @@ -921,7 +905,8 @@ static int svc_rdma_read_data_item(struct svc_rdma_read_info *info)
> * Currently these chunks always start at page offset 0,
> * thus the rounded-up length never crosses a page boundary.
> */
> - length = XDR_QUADLEN(info->ri_totalbytes) << 2;
> + buf->pages = &info->ri_rqst->rq_pages[0];
> + length = xdr_align_size(chunk->ch_length);
> buf->page_len = length;
> buf->len += length;
> buf->buflen += length;
> @@ -1033,8 +1018,7 @@ static int svc_rdma_read_call_chunk(struct svc_rdma_read_info *info)
> * @info: context for RDMA Reads
> *
> * The start of the data lands in the first page just after the
> - * Transport header, and the rest lands in the page list of
> - * head->rc_arg.pages.
> + * Transport header, and the rest lands in rqstp->rq_arg.pages.
> *
> * Assumptions:
> * - A PZRC is never sent in an RDMA_MSG message, though it's
> @@ -1049,8 +1033,7 @@ static int svc_rdma_read_call_chunk(struct svc_rdma_read_info *info)
> */
> static noinline int svc_rdma_read_special(struct svc_rdma_read_info *info)
> {
> - struct svc_rdma_recv_ctxt *head = info->ri_readctxt;
> - struct xdr_buf *buf = &head->rc_arg;
> + struct xdr_buf *buf = &info->ri_rqst->rq_arg;
> int ret;
>
> ret = svc_rdma_read_call_chunk(info);
> @@ -1060,35 +1043,15 @@ static noinline int svc_rdma_read_special(struct svc_rdma_read_info *info)
> buf->len += info->ri_totalbytes;
> buf->buflen += info->ri_totalbytes;
>
> - head->rc_hdr_count = 1;
> - buf->head[0].iov_base = page_address(head->rc_pages[0]);
> + buf->head[0].iov_base = page_address(info->ri_rqst->rq_pages[0]);
> buf->head[0].iov_len = min_t(size_t, PAGE_SIZE, info->ri_totalbytes);
> + buf->pages = &info->ri_rqst->rq_pages[1];
> buf->page_len = info->ri_totalbytes - buf->head[0].iov_len;
>
> out:
> return ret;
> }
>
> -/* Pages under I/O have been copied to head->rc_pages. Ensure they
> - * are not released by svc_xprt_release() until the I/O is complete.
> - *
> - * This has to be done after all Read WRs are constructed to properly
> - * handle a page that is part of I/O on behalf of two different RDMA
> - * segments.
> - *
> - * Do this only if I/O has been posted. Otherwise, we do indeed want
> - * svc_xprt_release() to clean things up properly.
> - */
> -static void svc_rdma_save_io_pages(struct svc_rqst *rqstp,
> - const unsigned int start,
> - const unsigned int num_pages)
> -{
> - unsigned int i;
> -
> - for (i = start; i < num_pages + start; i++)
> - rqstp->rq_pages[i] = NULL;
> -}
> -
> /**
> * svc_rdma_process_read_list - Pull list of Read chunks from the client
> * @rdma: controlling RDMA transport
> @@ -1153,11 +1116,22 @@ int svc_rdma_process_read_list(struct svcxprt_rdma *rdma,
> goto out_err;
>
> trace_svcrdma_post_read_chunk(&cc->cc_cid, cc->cc_sqecount);
> + init_completion(&cc->cc_done);
> ret = svc_rdma_post_chunk_ctxt(cc);
> if (ret < 0)
> goto out_err;
> - svc_rdma_save_io_pages(rqstp, 0, head->rc_page_count);
> - return 1;
> +
> + ret = 1;
> + wait_for_completion(&cc->cc_done);
> + if (cc->cc_status != IB_WC_SUCCESS)
> + ret = -EIO;
> +
> + /* rq_respages starts after the last arg page */
> + rqstp->rq_respages = &rqstp->rq_pages[head->rc_page_count];
> + rqstp->rq_next_page = rqstp->rq_respages + 1;
> +
> + /* Ensure svc_rdma_recv_ctxt_put() does not try to release pages */
> + head->rc_page_count = 0;
>
> out_err:
> svc_rdma_read_info_free(info);
>
>
>

2021-03-31 19:34:43

by Chuck Lever III

[permalink] [raw]
Subject: Re: [PATCH v1 3/6] svcrdma: Single-stage RDMA Read



> On Mar 31, 2021, at 2:34 PM, Tom Talpey <[email protected]> wrote:
>
> On 3/29/2021 10:40 AM, Chuck Lever wrote:
>> Currently the generic RPC server layer calls svc_rdma_recvfrom()
>> twice to retrieve an RPC message that uses Read chunks. I'm not
>> exactly sure why this design was chosen originally.
>
> I'm not either, but remember the design was written over a decade
> ago. I vaguely recall there was some bounce buffering for strange
> memreg corner cases. The RDMA stack has improved greatly.
>
>> Instead, let's wait for the Read chunk completion inline in the
>> first call to svc_rdma_recvfrom().
>> The goal is to eliminate some page allocator churn.
>> rdma_read_complete() replaces pages in the second svc_rqst by
>> calling put_page() repeatedly while the upper layer waits for
>> the request to be constructed, which adds unnecessary round-
>> trip latency.
>
> Local API round-trip, right? Same wire traffic either way. In fact,
> I don't see any Verbs changes too.

The rdma_read_complete() latency is incurred during the second call
to svc_rdma_recvfrom().

That holds up the construction of each message with a Read chunk,
since memory free operations need to complete first before the
second svc_rdma_recvfrom() call can return.

It also holds up the next message in the queue because XPT_BUSY
is held while the Read chunk and memory allocation is being
dealt with.

Therefore it lengthens the NFS WRITE round-trip latency by
inserting memory allocation operations (put_page()) in a hot path.


> Some comments/question below.
>
>> Signed-off-by: Chuck Lever <[email protected]>
>> ---
>> net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 10 +--
>> net/sunrpc/xprtrdma/svc_rdma_rw.c | 96 +++++++++++--------------------
>> 2 files changed, 39 insertions(+), 67 deletions(-)
>> diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
>> index 9cb5a09c4a01..b857a6805e95 100644
>> --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
>> +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
>> @@ -853,6 +853,9 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
>> spin_unlock(&rdma_xprt->sc_rq_dto_lock);
>> percpu_counter_inc(&svcrdma_stat_recv);
>> + /* Start receiving the next incoming message */
>
> This comment confused me. This call just unblocks the xprt to move
> to the next message, it does not necessarily "start". So IIUC, it
> might be clearer to state "transport processing complete" or similar.

How about "/* Unblock the transport for the next receive */" ?


>> + svc_xprt_received(xprt);
>> +
>> ib_dma_sync_single_for_cpu(rdma_xprt->sc_pd->device,
>> ctxt->rc_recv_sge.addr, ctxt->rc_byte_len,
>> DMA_FROM_DEVICE);
>> @@ -884,33 +887,28 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
>> rqstp->rq_xprt_ctxt = ctxt;
>> rqstp->rq_prot = IPPROTO_MAX;
>> svc_xprt_copy_addrs(rqstp, xprt);
>> - svc_xprt_received(xprt);
>> return rqstp->rq_arg.len;
>> out_readlist:
>> ret = svc_rdma_process_read_list(rdma_xprt, rqstp, ctxt);
>> if (ret < 0)
>> goto out_readfail;
>> - svc_xprt_received(xprt);
>> - return 0;
>> + goto complete;
>> out_err:
>> svc_rdma_send_error(rdma_xprt, ctxt, ret);
>> svc_rdma_recv_ctxt_put(rdma_xprt, ctxt);
>> - svc_xprt_received(xprt);
>> return 0;
>> out_readfail:
>> if (ret == -EINVAL)
>> svc_rdma_send_error(rdma_xprt, ctxt, ret);
>> svc_rdma_recv_ctxt_put(rdma_xprt, ctxt);
>> - svc_xprt_received(xprt);
>> return ret;
>> out_backchannel:
>> svc_rdma_handle_bc_reply(rqstp, ctxt);
>> out_drop:
>> svc_rdma_recv_ctxt_put(rdma_xprt, ctxt);
>> - svc_xprt_received(xprt);
>> return 0;
>> }
>> diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c
>> index d7054e3a8e33..9163ab690288 100644
>> --- a/net/sunrpc/xprtrdma/svc_rdma_rw.c
>> +++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c
>> @@ -150,6 +150,8 @@ struct svc_rdma_chunk_ctxt {
>> struct svcxprt_rdma *cc_rdma;
>> struct list_head cc_rwctxts;
>> int cc_sqecount;
>> + enum ib_wc_status cc_status;
>> + struct completion cc_done;
>> };
>> static void svc_rdma_cc_cid_init(struct svcxprt_rdma *rdma,
>> @@ -299,29 +301,15 @@ static void svc_rdma_wc_read_done(struct ib_cq *cq, struct ib_wc *wc)
>> struct svc_rdma_chunk_ctxt *cc =
>> container_of(cqe, struct svc_rdma_chunk_ctxt, cc_cqe);
>> struct svcxprt_rdma *rdma = cc->cc_rdma;
>> - struct svc_rdma_read_info *info =
>> - container_of(cc, struct svc_rdma_read_info, ri_cc);
>> trace_svcrdma_wc_read(wc, &cc->cc_cid);
>> atomic_add(cc->cc_sqecount, &rdma->sc_sq_avail);
>> wake_up(&rdma->sc_send_wait);
>> - if (unlikely(wc->status != IB_WC_SUCCESS)) {
>> - set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags);
>> - svc_rdma_recv_ctxt_put(rdma, info->ri_readctxt);
>> - } else {
>> - spin_lock(&rdma->sc_rq_dto_lock);
>> - list_add_tail(&info->ri_readctxt->rc_list,
>> - &rdma->sc_read_complete_q);
>> - /* Note the unlock pairs with the smp_rmb in svc_xprt_ready: */
>> - set_bit(XPT_DATA, &rdma->sc_xprt.xpt_flags);
>> - spin_unlock(&rdma->sc_rq_dto_lock);
>> -
>> - svc_xprt_enqueue(&rdma->sc_xprt);
>> - }
>> -
>> - svc_rdma_read_info_free(info);
>> + cc->cc_status = wc->status;
>> + complete(&cc->cc_done);
>> + return;
>> }
>> /* This function sleeps when the transport's Send Queue is congested.
>> @@ -676,8 +664,8 @@ static int svc_rdma_build_read_segment(struct svc_rdma_read_info *info,
>> struct svc_rdma_recv_ctxt *head = info->ri_readctxt;
>> struct svc_rdma_chunk_ctxt *cc = &info->ri_cc;
>> struct svc_rqst *rqstp = info->ri_rqst;
>> - struct svc_rdma_rw_ctxt *ctxt;
>> unsigned int sge_no, seg_len, len;
>> + struct svc_rdma_rw_ctxt *ctxt;
>> struct scatterlist *sg;
>> int ret;
>> @@ -693,8 +681,8 @@ static int svc_rdma_build_read_segment(struct svc_rdma_read_info *info,
>> seg_len = min_t(unsigned int, len,
>> PAGE_SIZE - info->ri_pageoff);
>> - head->rc_arg.pages[info->ri_pageno] =
>> - rqstp->rq_pages[info->ri_pageno];
>> + /* XXX: ri_pageno and rc_page_count might be exactly the same */
>> +
>
> What is this comment conveying? It looks like a note-to-self that
> resulted in deleting the prior line.

Yeah, pretty much. I think it is a reminder that one of those
fields is superfluous and can be removed.


> If the "XXX" notation is
> still significant, it needs more detail on what needs to be
> fixed in future.

Or it should be removed before this patch is merged.


>> if (!info->ri_pageoff)
>> head->rc_page_count++;
>> @@ -788,12 +776,10 @@ static int svc_rdma_copy_inline_range(struct svc_rdma_read_info *info,
>> page_len = min_t(unsigned int, remaining,
>> PAGE_SIZE - info->ri_pageoff);
>> - head->rc_arg.pages[info->ri_pageno] =
>> - rqstp->rq_pages[info->ri_pageno];
>> if (!info->ri_pageoff)
>> head->rc_page_count++;
>> - dst = page_address(head->rc_arg.pages[info->ri_pageno]);
>> + dst = page_address(rqstp->rq_pages[info->ri_pageno]);
>> memcpy(dst + info->ri_pageno, src + offset, page_len);
>> info->ri_totalbytes += page_len;
>> @@ -813,7 +799,7 @@ static int svc_rdma_copy_inline_range(struct svc_rdma_read_info *info,
>> * svc_rdma_read_multiple_chunks - Construct RDMA Reads to pull data item Read chunks
>> * @info: context for RDMA Reads
>> *
>> - * The chunk data lands in head->rc_arg as a series of contiguous pages,
>> + * The chunk data lands in rqstp->rq_arg as a series of contiguous pages,
>> * like an incoming TCP call.
>> *
>> * Return values:
>> @@ -827,8 +813,8 @@ static noinline int svc_rdma_read_multiple_chunks(struct svc_rdma_read_info *inf
>> {
>> struct svc_rdma_recv_ctxt *head = info->ri_readctxt;
>> const struct svc_rdma_pcl *pcl = &head->rc_read_pcl;
>> + struct xdr_buf *buf = &info->ri_rqst->rq_arg;
>> struct svc_rdma_chunk *chunk, *next;
>> - struct xdr_buf *buf = &head->rc_arg;
>> unsigned int start, length;
>> int ret;
>> @@ -864,9 +850,9 @@ static noinline int svc_rdma_read_multiple_chunks(struct svc_rdma_read_info *inf
>> buf->len += info->ri_totalbytes;
>> buf->buflen += info->ri_totalbytes;
>> - head->rc_hdr_count = 1;
>> - buf->head[0].iov_base = page_address(head->rc_pages[0]);
>> + buf->head[0].iov_base = page_address(info->ri_rqst->rq_pages[0]);
>> buf->head[0].iov_len = min_t(size_t, PAGE_SIZE, info->ri_totalbytes);
>> + buf->pages = &info->ri_rqst->rq_pages[1];
>> buf->page_len = info->ri_totalbytes - buf->head[0].iov_len;
>> return 0;
>> }
>> @@ -875,9 +861,9 @@ static noinline int svc_rdma_read_multiple_chunks(struct svc_rdma_read_info *inf
>> * svc_rdma_read_data_item - Construct RDMA Reads to pull data item Read chunks
>> * @info: context for RDMA Reads
>> *
>> - * The chunk data lands in the page list of head->rc_arg.pages.
>> + * The chunk data lands in the page list of rqstp->rq_arg.pages.
>> *
>> - * Currently NFSD does not look at the head->rc_arg.tail[0] iovec.
>> + * Currently NFSD does not look at the rqstp->rq_arg.tail[0] kvec.
>> * Therefore, XDR round-up of the Read chunk and trailing
>> * inline content must both be added at the end of the pagelist.
>> *
>> @@ -891,7 +877,7 @@ static noinline int svc_rdma_read_multiple_chunks(struct svc_rdma_read_info *inf
>> static int svc_rdma_read_data_item(struct svc_rdma_read_info *info)
>> {
>> struct svc_rdma_recv_ctxt *head = info->ri_readctxt;
>> - struct xdr_buf *buf = &head->rc_arg;
>> + struct xdr_buf *buf = &info->ri_rqst->rq_arg;
>> struct svc_rdma_chunk *chunk;
>> unsigned int length;
>> int ret;
>> @@ -901,8 +887,6 @@ static int svc_rdma_read_data_item(struct svc_rdma_read_info *info)
>> if (ret < 0)
>> goto out;
>> - head->rc_hdr_count = 0;
>> -
>> /* Split the Receive buffer between the head and tail
>> * buffers at Read chunk's position. XDR roundup of the
>> * chunk is not included in either the pagelist or in
>> @@ -921,7 +905,8 @@ static int svc_rdma_read_data_item(struct svc_rdma_read_info *info)
>> * Currently these chunks always start at page offset 0,
>> * thus the rounded-up length never crosses a page boundary.
>> */
>> - length = XDR_QUADLEN(info->ri_totalbytes) << 2;
>> + buf->pages = &info->ri_rqst->rq_pages[0];
>> + length = xdr_align_size(chunk->ch_length);
>> buf->page_len = length;
>> buf->len += length;
>> buf->buflen += length;
>> @@ -1033,8 +1018,7 @@ static int svc_rdma_read_call_chunk(struct svc_rdma_read_info *info)
>> * @info: context for RDMA Reads
>> *
>> * The start of the data lands in the first page just after the
>> - * Transport header, and the rest lands in the page list of
>> - * head->rc_arg.pages.
>> + * Transport header, and the rest lands in rqstp->rq_arg.pages.
>> *
>> * Assumptions:
>> * - A PZRC is never sent in an RDMA_MSG message, though it's
>> @@ -1049,8 +1033,7 @@ static int svc_rdma_read_call_chunk(struct svc_rdma_read_info *info)
>> */
>> static noinline int svc_rdma_read_special(struct svc_rdma_read_info *info)
>> {
>> - struct svc_rdma_recv_ctxt *head = info->ri_readctxt;
>> - struct xdr_buf *buf = &head->rc_arg;
>> + struct xdr_buf *buf = &info->ri_rqst->rq_arg;
>> int ret;
>> ret = svc_rdma_read_call_chunk(info);
>> @@ -1060,35 +1043,15 @@ static noinline int svc_rdma_read_special(struct svc_rdma_read_info *info)
>> buf->len += info->ri_totalbytes;
>> buf->buflen += info->ri_totalbytes;
>> - head->rc_hdr_count = 1;
>> - buf->head[0].iov_base = page_address(head->rc_pages[0]);
>> + buf->head[0].iov_base = page_address(info->ri_rqst->rq_pages[0]);
>> buf->head[0].iov_len = min_t(size_t, PAGE_SIZE, info->ri_totalbytes);
>> + buf->pages = &info->ri_rqst->rq_pages[1];
>> buf->page_len = info->ri_totalbytes - buf->head[0].iov_len;
>> out:
>> return ret;
>> }
>> -/* Pages under I/O have been copied to head->rc_pages. Ensure they
>> - * are not released by svc_xprt_release() until the I/O is complete.
>> - *
>> - * This has to be done after all Read WRs are constructed to properly
>> - * handle a page that is part of I/O on behalf of two different RDMA
>> - * segments.
>> - *
>> - * Do this only if I/O has been posted. Otherwise, we do indeed want
>> - * svc_xprt_release() to clean things up properly.
>> - */
>> -static void svc_rdma_save_io_pages(struct svc_rqst *rqstp,
>> - const unsigned int start,
>> - const unsigned int num_pages)
>> -{
>> - unsigned int i;
>> -
>> - for (i = start; i < num_pages + start; i++)
>> - rqstp->rq_pages[i] = NULL;
>> -}
>> -
>> /**
>> * svc_rdma_process_read_list - Pull list of Read chunks from the client
>> * @rdma: controlling RDMA transport
>> @@ -1153,11 +1116,22 @@ int svc_rdma_process_read_list(struct svcxprt_rdma *rdma,
>> goto out_err;
>> trace_svcrdma_post_read_chunk(&cc->cc_cid, cc->cc_sqecount);
>> + init_completion(&cc->cc_done);
>> ret = svc_rdma_post_chunk_ctxt(cc);
>> if (ret < 0)
>> goto out_err;
>> - svc_rdma_save_io_pages(rqstp, 0, head->rc_page_count);
>> - return 1;
>> +
>> + ret = 1;
>> + wait_for_completion(&cc->cc_done);
>> + if (cc->cc_status != IB_WC_SUCCESS)
>> + ret = -EIO;
>> +
>> + /* rq_respages starts after the last arg page */
>> + rqstp->rq_respages = &rqstp->rq_pages[head->rc_page_count];
>> + rqstp->rq_next_page = rqstp->rq_respages + 1;
>> +
>> + /* Ensure svc_rdma_recv_ctxt_put() does not try to release pages */
>> + head->rc_page_count = 0;
>> out_err:
>> svc_rdma_read_info_free(info);

--
Chuck Lever



2021-03-31 19:52:23

by Tom Talpey

[permalink] [raw]
Subject: Re: [PATCH v1 3/6] svcrdma: Single-stage RDMA Read

LGTM. With extra info in the patch description and
the comments changed/deleted:

Reviewed-By: Tom Talpey <[email protected]>

On 3/31/2021 3:33 PM, Chuck Lever III wrote:
>
>
>> On Mar 31, 2021, at 2:34 PM, Tom Talpey <[email protected]> wrote:
>>
>> On 3/29/2021 10:40 AM, Chuck Lever wrote:
>>> Currently the generic RPC server layer calls svc_rdma_recvfrom()
>>> twice to retrieve an RPC message that uses Read chunks. I'm not
>>> exactly sure why this design was chosen originally.
>>
>> I'm not either, but remember the design was written over a decade
>> ago. I vaguely recall there was some bounce buffering for strange
>> memreg corner cases. The RDMA stack has improved greatly.
>>
>>> Instead, let's wait for the Read chunk completion inline in the
>>> first call to svc_rdma_recvfrom().
>>> The goal is to eliminate some page allocator churn.
>>> rdma_read_complete() replaces pages in the second svc_rqst by
>>> calling put_page() repeatedly while the upper layer waits for
>>> the request to be constructed, which adds unnecessary round-
>>> trip latency.
>>
>> Local API round-trip, right? Same wire traffic either way. In fact,
>> I don't see any Verbs changes too.
>
> The rdma_read_complete() latency is incurred during the second call
> to svc_rdma_recvfrom().
>
> That holds up the construction of each message with a Read chunk,
> since memory free operations need to complete first before the
> second svc_rdma_recvfrom() call can return.
>
> It also holds up the next message in the queue because XPT_BUSY
> is held while the Read chunk and memory allocation is being
> dealt with.
>
> Therefore it lengthens the NFS WRITE round-trip latency by
> inserting memory allocation operations (put_page()) in a hot path.
>
>
>> Some comments/question below.
>>
>>> Signed-off-by: Chuck Lever <[email protected]>
>>> ---
>>> net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 10 +--
>>> net/sunrpc/xprtrdma/svc_rdma_rw.c | 96 +++++++++++--------------------
>>> 2 files changed, 39 insertions(+), 67 deletions(-)
>>> diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
>>> index 9cb5a09c4a01..b857a6805e95 100644
>>> --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
>>> +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
>>> @@ -853,6 +853,9 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
>>> spin_unlock(&rdma_xprt->sc_rq_dto_lock);
>>> percpu_counter_inc(&svcrdma_stat_recv);
>>> + /* Start receiving the next incoming message */
>>
>> This comment confused me. This call just unblocks the xprt to move
>> to the next message, it does not necessarily "start". So IIUC, it
>> might be clearer to state "transport processing complete" or similar.
>
> How about "/* Unblock the transport for the next receive */" ?
>
>
>>> + svc_xprt_received(xprt);
>>> +
>>> ib_dma_sync_single_for_cpu(rdma_xprt->sc_pd->device,
>>> ctxt->rc_recv_sge.addr, ctxt->rc_byte_len,
>>> DMA_FROM_DEVICE);
>>> @@ -884,33 +887,28 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
>>> rqstp->rq_xprt_ctxt = ctxt;
>>> rqstp->rq_prot = IPPROTO_MAX;
>>> svc_xprt_copy_addrs(rqstp, xprt);
>>> - svc_xprt_received(xprt);
>>> return rqstp->rq_arg.len;
>>> out_readlist:
>>> ret = svc_rdma_process_read_list(rdma_xprt, rqstp, ctxt);
>>> if (ret < 0)
>>> goto out_readfail;
>>> - svc_xprt_received(xprt);
>>> - return 0;
>>> + goto complete;
>>> out_err:
>>> svc_rdma_send_error(rdma_xprt, ctxt, ret);
>>> svc_rdma_recv_ctxt_put(rdma_xprt, ctxt);
>>> - svc_xprt_received(xprt);
>>> return 0;
>>> out_readfail:
>>> if (ret == -EINVAL)
>>> svc_rdma_send_error(rdma_xprt, ctxt, ret);
>>> svc_rdma_recv_ctxt_put(rdma_xprt, ctxt);
>>> - svc_xprt_received(xprt);
>>> return ret;
>>> out_backchannel:
>>> svc_rdma_handle_bc_reply(rqstp, ctxt);
>>> out_drop:
>>> svc_rdma_recv_ctxt_put(rdma_xprt, ctxt);
>>> - svc_xprt_received(xprt);
>>> return 0;
>>> }
>>> diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c
>>> index d7054e3a8e33..9163ab690288 100644
>>> --- a/net/sunrpc/xprtrdma/svc_rdma_rw.c
>>> +++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c
>>> @@ -150,6 +150,8 @@ struct svc_rdma_chunk_ctxt {
>>> struct svcxprt_rdma *cc_rdma;
>>> struct list_head cc_rwctxts;
>>> int cc_sqecount;
>>> + enum ib_wc_status cc_status;
>>> + struct completion cc_done;
>>> };
>>> static void svc_rdma_cc_cid_init(struct svcxprt_rdma *rdma,
>>> @@ -299,29 +301,15 @@ static void svc_rdma_wc_read_done(struct ib_cq *cq, struct ib_wc *wc)
>>> struct svc_rdma_chunk_ctxt *cc =
>>> container_of(cqe, struct svc_rdma_chunk_ctxt, cc_cqe);
>>> struct svcxprt_rdma *rdma = cc->cc_rdma;
>>> - struct svc_rdma_read_info *info =
>>> - container_of(cc, struct svc_rdma_read_info, ri_cc);
>>> trace_svcrdma_wc_read(wc, &cc->cc_cid);
>>> atomic_add(cc->cc_sqecount, &rdma->sc_sq_avail);
>>> wake_up(&rdma->sc_send_wait);
>>> - if (unlikely(wc->status != IB_WC_SUCCESS)) {
>>> - set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags);
>>> - svc_rdma_recv_ctxt_put(rdma, info->ri_readctxt);
>>> - } else {
>>> - spin_lock(&rdma->sc_rq_dto_lock);
>>> - list_add_tail(&info->ri_readctxt->rc_list,
>>> - &rdma->sc_read_complete_q);
>>> - /* Note the unlock pairs with the smp_rmb in svc_xprt_ready: */
>>> - set_bit(XPT_DATA, &rdma->sc_xprt.xpt_flags);
>>> - spin_unlock(&rdma->sc_rq_dto_lock);
>>> -
>>> - svc_xprt_enqueue(&rdma->sc_xprt);
>>> - }
>>> -
>>> - svc_rdma_read_info_free(info);
>>> + cc->cc_status = wc->status;
>>> + complete(&cc->cc_done);
>>> + return;
>>> }
>>> /* This function sleeps when the transport's Send Queue is congested.
>>> @@ -676,8 +664,8 @@ static int svc_rdma_build_read_segment(struct svc_rdma_read_info *info,
>>> struct svc_rdma_recv_ctxt *head = info->ri_readctxt;
>>> struct svc_rdma_chunk_ctxt *cc = &info->ri_cc;
>>> struct svc_rqst *rqstp = info->ri_rqst;
>>> - struct svc_rdma_rw_ctxt *ctxt;
>>> unsigned int sge_no, seg_len, len;
>>> + struct svc_rdma_rw_ctxt *ctxt;
>>> struct scatterlist *sg;
>>> int ret;
>>> @@ -693,8 +681,8 @@ static int svc_rdma_build_read_segment(struct svc_rdma_read_info *info,
>>> seg_len = min_t(unsigned int, len,
>>> PAGE_SIZE - info->ri_pageoff);
>>> - head->rc_arg.pages[info->ri_pageno] =
>>> - rqstp->rq_pages[info->ri_pageno];
>>> + /* XXX: ri_pageno and rc_page_count might be exactly the same */
>>> +
>>
>> What is this comment conveying? It looks like a note-to-self that
>> resulted in deleting the prior line.
>
> Yeah, pretty much. I think it is a reminder that one of those
> fields is superfluous and can be removed.
>
>
>> If the "XXX" notation is
>> still significant, it needs more detail on what needs to be
>> fixed in future.
>
> Or it should be removed before this patch is merged.
>
>
>>> if (!info->ri_pageoff)
>>> head->rc_page_count++;
>>> @@ -788,12 +776,10 @@ static int svc_rdma_copy_inline_range(struct svc_rdma_read_info *info,
>>> page_len = min_t(unsigned int, remaining,
>>> PAGE_SIZE - info->ri_pageoff);
>>> - head->rc_arg.pages[info->ri_pageno] =
>>> - rqstp->rq_pages[info->ri_pageno];
>>> if (!info->ri_pageoff)
>>> head->rc_page_count++;
>>> - dst = page_address(head->rc_arg.pages[info->ri_pageno]);
>>> + dst = page_address(rqstp->rq_pages[info->ri_pageno]);
>>> memcpy(dst + info->ri_pageno, src + offset, page_len);
>>> info->ri_totalbytes += page_len;
>>> @@ -813,7 +799,7 @@ static int svc_rdma_copy_inline_range(struct svc_rdma_read_info *info,
>>> * svc_rdma_read_multiple_chunks - Construct RDMA Reads to pull data item Read chunks
>>> * @info: context for RDMA Reads
>>> *
>>> - * The chunk data lands in head->rc_arg as a series of contiguous pages,
>>> + * The chunk data lands in rqstp->rq_arg as a series of contiguous pages,
>>> * like an incoming TCP call.
>>> *
>>> * Return values:
>>> @@ -827,8 +813,8 @@ static noinline int svc_rdma_read_multiple_chunks(struct svc_rdma_read_info *inf
>>> {
>>> struct svc_rdma_recv_ctxt *head = info->ri_readctxt;
>>> const struct svc_rdma_pcl *pcl = &head->rc_read_pcl;
>>> + struct xdr_buf *buf = &info->ri_rqst->rq_arg;
>>> struct svc_rdma_chunk *chunk, *next;
>>> - struct xdr_buf *buf = &head->rc_arg;
>>> unsigned int start, length;
>>> int ret;
>>> @@ -864,9 +850,9 @@ static noinline int svc_rdma_read_multiple_chunks(struct svc_rdma_read_info *inf
>>> buf->len += info->ri_totalbytes;
>>> buf->buflen += info->ri_totalbytes;
>>> - head->rc_hdr_count = 1;
>>> - buf->head[0].iov_base = page_address(head->rc_pages[0]);
>>> + buf->head[0].iov_base = page_address(info->ri_rqst->rq_pages[0]);
>>> buf->head[0].iov_len = min_t(size_t, PAGE_SIZE, info->ri_totalbytes);
>>> + buf->pages = &info->ri_rqst->rq_pages[1];
>>> buf->page_len = info->ri_totalbytes - buf->head[0].iov_len;
>>> return 0;
>>> }
>>> @@ -875,9 +861,9 @@ static noinline int svc_rdma_read_multiple_chunks(struct svc_rdma_read_info *inf
>>> * svc_rdma_read_data_item - Construct RDMA Reads to pull data item Read chunks
>>> * @info: context for RDMA Reads
>>> *
>>> - * The chunk data lands in the page list of head->rc_arg.pages.
>>> + * The chunk data lands in the page list of rqstp->rq_arg.pages.
>>> *
>>> - * Currently NFSD does not look at the head->rc_arg.tail[0] iovec.
>>> + * Currently NFSD does not look at the rqstp->rq_arg.tail[0] kvec.
>>> * Therefore, XDR round-up of the Read chunk and trailing
>>> * inline content must both be added at the end of the pagelist.
>>> *
>>> @@ -891,7 +877,7 @@ static noinline int svc_rdma_read_multiple_chunks(struct svc_rdma_read_info *inf
>>> static int svc_rdma_read_data_item(struct svc_rdma_read_info *info)
>>> {
>>> struct svc_rdma_recv_ctxt *head = info->ri_readctxt;
>>> - struct xdr_buf *buf = &head->rc_arg;
>>> + struct xdr_buf *buf = &info->ri_rqst->rq_arg;
>>> struct svc_rdma_chunk *chunk;
>>> unsigned int length;
>>> int ret;
>>> @@ -901,8 +887,6 @@ static int svc_rdma_read_data_item(struct svc_rdma_read_info *info)
>>> if (ret < 0)
>>> goto out;
>>> - head->rc_hdr_count = 0;
>>> -
>>> /* Split the Receive buffer between the head and tail
>>> * buffers at Read chunk's position. XDR roundup of the
>>> * chunk is not included in either the pagelist or in
>>> @@ -921,7 +905,8 @@ static int svc_rdma_read_data_item(struct svc_rdma_read_info *info)
>>> * Currently these chunks always start at page offset 0,
>>> * thus the rounded-up length never crosses a page boundary.
>>> */
>>> - length = XDR_QUADLEN(info->ri_totalbytes) << 2;
>>> + buf->pages = &info->ri_rqst->rq_pages[0];
>>> + length = xdr_align_size(chunk->ch_length);
>>> buf->page_len = length;
>>> buf->len += length;
>>> buf->buflen += length;
>>> @@ -1033,8 +1018,7 @@ static int svc_rdma_read_call_chunk(struct svc_rdma_read_info *info)
>>> * @info: context for RDMA Reads
>>> *
>>> * The start of the data lands in the first page just after the
>>> - * Transport header, and the rest lands in the page list of
>>> - * head->rc_arg.pages.
>>> + * Transport header, and the rest lands in rqstp->rq_arg.pages.
>>> *
>>> * Assumptions:
>>> * - A PZRC is never sent in an RDMA_MSG message, though it's
>>> @@ -1049,8 +1033,7 @@ static int svc_rdma_read_call_chunk(struct svc_rdma_read_info *info)
>>> */
>>> static noinline int svc_rdma_read_special(struct svc_rdma_read_info *info)
>>> {
>>> - struct svc_rdma_recv_ctxt *head = info->ri_readctxt;
>>> - struct xdr_buf *buf = &head->rc_arg;
>>> + struct xdr_buf *buf = &info->ri_rqst->rq_arg;
>>> int ret;
>>> ret = svc_rdma_read_call_chunk(info);
>>> @@ -1060,35 +1043,15 @@ static noinline int svc_rdma_read_special(struct svc_rdma_read_info *info)
>>> buf->len += info->ri_totalbytes;
>>> buf->buflen += info->ri_totalbytes;
>>> - head->rc_hdr_count = 1;
>>> - buf->head[0].iov_base = page_address(head->rc_pages[0]);
>>> + buf->head[0].iov_base = page_address(info->ri_rqst->rq_pages[0]);
>>> buf->head[0].iov_len = min_t(size_t, PAGE_SIZE, info->ri_totalbytes);
>>> + buf->pages = &info->ri_rqst->rq_pages[1];
>>> buf->page_len = info->ri_totalbytes - buf->head[0].iov_len;
>>> out:
>>> return ret;
>>> }
>>> -/* Pages under I/O have been copied to head->rc_pages. Ensure they
>>> - * are not released by svc_xprt_release() until the I/O is complete.
>>> - *
>>> - * This has to be done after all Read WRs are constructed to properly
>>> - * handle a page that is part of I/O on behalf of two different RDMA
>>> - * segments.
>>> - *
>>> - * Do this only if I/O has been posted. Otherwise, we do indeed want
>>> - * svc_xprt_release() to clean things up properly.
>>> - */
>>> -static void svc_rdma_save_io_pages(struct svc_rqst *rqstp,
>>> - const unsigned int start,
>>> - const unsigned int num_pages)
>>> -{
>>> - unsigned int i;
>>> -
>>> - for (i = start; i < num_pages + start; i++)
>>> - rqstp->rq_pages[i] = NULL;
>>> -}
>>> -
>>> /**
>>> * svc_rdma_process_read_list - Pull list of Read chunks from the client
>>> * @rdma: controlling RDMA transport
>>> @@ -1153,11 +1116,22 @@ int svc_rdma_process_read_list(struct svcxprt_rdma *rdma,
>>> goto out_err;
>>> trace_svcrdma_post_read_chunk(&cc->cc_cid, cc->cc_sqecount);
>>> + init_completion(&cc->cc_done);
>>> ret = svc_rdma_post_chunk_ctxt(cc);
>>> if (ret < 0)
>>> goto out_err;
>>> - svc_rdma_save_io_pages(rqstp, 0, head->rc_page_count);
>>> - return 1;
>>> +
>>> + ret = 1;
>>> + wait_for_completion(&cc->cc_done);
>>> + if (cc->cc_status != IB_WC_SUCCESS)
>>> + ret = -EIO;
>>> +
>>> + /* rq_respages starts after the last arg page */
>>> + rqstp->rq_respages = &rqstp->rq_pages[head->rc_page_count];
>>> + rqstp->rq_next_page = rqstp->rq_respages + 1;
>>> +
>>> + /* Ensure svc_rdma_recv_ctxt_put() does not try to release pages */
>>> + head->rc_page_count = 0;
>>> out_err:
>>> svc_rdma_read_info_free(info);
>
> --
> Chuck Lever
>
>
>
>