2023-12-11 15:24:10

by Chuck Lever

Subject: [PATCH v1 0/8] More svcrdma improvements

Here are some clean-ups and minor optimizations for svcrdma, along
with two patches (the final two in the series) that reduce the
likelihood of connection loss under heavy workloads.

svcrdma appears to under-allocate Send Queue resources, which can
cause NFSD to drop the connection occasionally. Each drop results
in a burp in the Send pipeline and a loss of throughput.

The series (including these patches) is in the svcrdma-next branch
of:

https://git.kernel.org/pub/scm/linux/kernel/git/cel/linux.git

---

Chuck Lever (8):
svcrdma: De-duplicate completion ID initialization helpers
svcrdma: Optimize svc_rdma_cc_init()
svcrdma: Remove pointer addresses shown in dprintk()
svcrdma: Remove queue-shortening warnings
svcrdma: Clean up comment in svc_rdma_accept()
svcrdma: Reserve an extra WQE for ib_drain_rq()
svcrdma: Use all allocated Send Queue entries
svcrdma: Increase the per-transport rw_ctx count


include/linux/sunrpc/svc_rdma.h | 24 ++++++++
net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 9 +--
net/sunrpc/xprtrdma/svc_rdma_rw.c | 16 ++---
net/sunrpc/xprtrdma/svc_rdma_sendto.c | 9 +--
net/sunrpc/xprtrdma/svc_rdma_transport.c | 75 ++++++++++++++----------
5 files changed, 74 insertions(+), 59 deletions(-)

--
Chuck Lever



2023-12-11 15:24:17

by Chuck Lever

Subject: [PATCH v1 1/8] svcrdma: De-duplicate completion ID initialization helpers

From: Chuck Lever <[email protected]>

Signed-off-by: Chuck Lever <[email protected]>
---
include/linux/sunrpc/svc_rdma.h | 24 ++++++++++++++++++++++++
net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 7 -------
net/sunrpc/xprtrdma/svc_rdma_rw.c | 9 +--------
net/sunrpc/xprtrdma/svc_rdma_sendto.c | 7 -------
4 files changed, 25 insertions(+), 22 deletions(-)

diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h
index 051fefde8d51..46f2ce9f810b 100644
--- a/include/linux/sunrpc/svc_rdma.h
+++ b/include/linux/sunrpc/svc_rdma.h
@@ -134,6 +134,30 @@ enum {

#define RPCSVC_MAXPAYLOAD_RDMA RPCSVC_MAXPAYLOAD

+/**
+ * svc_rdma_recv_cid_init - Initialize a Receive Queue completion ID
+ * @rdma: controlling transport
+ * @cid: completion ID to initialize
+ */
+static inline void svc_rdma_recv_cid_init(struct svcxprt_rdma *rdma,
+ struct rpc_rdma_cid *cid)
+{
+ cid->ci_queue_id = rdma->sc_rq_cq->res.id;
+ cid->ci_completion_id = atomic_inc_return(&rdma->sc_completion_ids);
+}
+
+/**
+ * svc_rdma_send_cid_init - Initialize a Send Queue completion ID
+ * @rdma: controlling transport
+ * @cid: completion ID to initialize
+ */
+static inline void svc_rdma_send_cid_init(struct svcxprt_rdma *rdma,
+ struct rpc_rdma_cid *cid)
+{
+ cid->ci_queue_id = rdma->sc_sq_cq->res.id;
+ cid->ci_completion_id = atomic_inc_return(&rdma->sc_completion_ids);
+}
+
/*
* A chunk context tracks all I/O for moving one Read or Write
* chunk. This is a set of rdma_rw's that handle data movement
diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
index 392a91dc8a99..ac6351e292c5 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -115,13 +115,6 @@ svc_rdma_next_recv_ctxt(struct list_head *list)
rc_list);
}

-static void svc_rdma_recv_cid_init(struct svcxprt_rdma *rdma,
- struct rpc_rdma_cid *cid)
-{
- cid->ci_queue_id = rdma->sc_rq_cq->res.id;
- cid->ci_completion_id = atomic_inc_return(&rdma->sc_completion_ids);
-}
-
static struct svc_rdma_recv_ctxt *
svc_rdma_recv_ctxt_alloc(struct svcxprt_rdma *rdma)
{
diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c
index 4d2db06ccfd2..eab71f3867fa 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_rw.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c
@@ -146,13 +146,6 @@ static int svc_rdma_rw_ctx_init(struct svcxprt_rdma *rdma,
return ret;
}

-static void svc_rdma_cc_cid_init(struct svcxprt_rdma *rdma,
- struct rpc_rdma_cid *cid)
-{
- cid->ci_queue_id = rdma->sc_sq_cq->res.id;
- cid->ci_completion_id = atomic_inc_return(&rdma->sc_completion_ids);
-}
-
/**
* svc_rdma_cc_init - Initialize an svc_rdma_chunk_ctxt
* @rdma: controlling transport instance
@@ -161,7 +154,7 @@ static void svc_rdma_cc_cid_init(struct svcxprt_rdma *rdma,
void svc_rdma_cc_init(struct svcxprt_rdma *rdma,
struct svc_rdma_chunk_ctxt *cc)
{
- svc_rdma_cc_cid_init(rdma, &cc->cc_cid);
+ svc_rdma_send_cid_init(rdma, &cc->cc_cid);

INIT_LIST_HEAD(&cc->cc_rwctxts);
cc->cc_sqecount = 0;
diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
index 9571ed4a74d4..c9585e469ca8 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
@@ -113,13 +113,6 @@

static void svc_rdma_wc_send(struct ib_cq *cq, struct ib_wc *wc);

-static void svc_rdma_send_cid_init(struct svcxprt_rdma *rdma,
- struct rpc_rdma_cid *cid)
-{
- cid->ci_queue_id = rdma->sc_sq_cq->res.id;
- cid->ci_completion_id = atomic_inc_return(&rdma->sc_completion_ids);
-}
-
static struct svc_rdma_send_ctxt *
svc_rdma_send_ctxt_alloc(struct svcxprt_rdma *rdma)
{



2023-12-11 15:24:25

by Chuck Lever

Subject: [PATCH v1 2/8] svcrdma: Optimize svc_rdma_cc_init()

From: Chuck Lever <[email protected]>

The atomic_inc_return() in svc_rdma_send_cid_init() is expensive.

Some svc_rdma_chunk_ctxts now reside in long-lived container
structures. They don't need a fresh completion ID for every I/O
operation.
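
A minimal sketch of the resulting pattern, lifted from the hunks
below with explanatory comments added (it is not additional patch
content). The lazy check is safe only because the chunk_ctxt's
containers are now allocated with kzalloc(), so an unassigned
completion ID reliably reads as zero.

    /* Assign a completion ID only the first time this chunk_ctxt is
     * initialized; later reuses skip the atomic_inc_return().
     */
    void svc_rdma_cc_init(struct svcxprt_rdma *rdma,
                          struct svc_rdma_chunk_ctxt *cc)
    {
            struct rpc_rdma_cid *cid = &cc->cc_cid;

            if (unlikely(!cid->ci_completion_id))
                    svc_rdma_send_cid_init(rdma, cid);

            INIT_LIST_HEAD(&cc->cc_rwctxts);
            cc->cc_sqecount = 0;
    }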

Signed-off-by: Chuck Lever <[email protected]>
---
net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 2 +-
net/sunrpc/xprtrdma/svc_rdma_rw.c | 9 +++++----
net/sunrpc/xprtrdma/svc_rdma_sendto.c | 2 +-
3 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
index ac6351e292c5..38f01652dc6d 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -123,7 +123,7 @@ svc_rdma_recv_ctxt_alloc(struct svcxprt_rdma *rdma)
dma_addr_t addr;
void *buffer;

- ctxt = kmalloc_node(sizeof(*ctxt), GFP_KERNEL, node);
+ ctxt = kzalloc_node(sizeof(*ctxt), GFP_KERNEL, node);
if (!ctxt)
goto fail0;
buffer = kmalloc_node(rdma->sc_max_req_size, GFP_KERNEL, node);
diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c
index eab71f3867fa..ff54bb268b7d 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_rw.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c
@@ -154,7 +154,10 @@ static int svc_rdma_rw_ctx_init(struct svcxprt_rdma *rdma,
void svc_rdma_cc_init(struct svcxprt_rdma *rdma,
struct svc_rdma_chunk_ctxt *cc)
{
- svc_rdma_send_cid_init(rdma, &cc->cc_cid);
+ struct rpc_rdma_cid *cid = &cc->cc_cid;
+
+ if (unlikely(!cid->ci_completion_id))
+ svc_rdma_send_cid_init(rdma, cid);

INIT_LIST_HEAD(&cc->cc_rwctxts);
cc->cc_sqecount = 0;
@@ -221,15 +224,13 @@ svc_rdma_write_info_alloc(struct svcxprt_rdma *rdma,
{
struct svc_rdma_write_info *info;

- info = kmalloc_node(sizeof(*info), GFP_KERNEL,
+ info = kzalloc_node(sizeof(*info), GFP_KERNEL,
ibdev_to_node(rdma->sc_cm_id->device));
if (!info)
return info;

info->wi_rdma = rdma;
info->wi_chunk = chunk;
- info->wi_seg_off = 0;
- info->wi_seg_no = 0;
svc_rdma_cc_init(rdma, &info->wi_cc);
info->wi_cc.cc_cqe.done = svc_rdma_write_done;
return info;
diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
index c9585e469ca8..1a49b7f02041 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
@@ -122,7 +122,7 @@ svc_rdma_send_ctxt_alloc(struct svcxprt_rdma *rdma)
void *buffer;
int i;

- ctxt = kmalloc_node(struct_size(ctxt, sc_sges, rdma->sc_max_send_sges),
+ ctxt = kzalloc_node(struct_size(ctxt, sc_sges, rdma->sc_max_send_sges),
GFP_KERNEL, node);
if (!ctxt)
goto fail0;



2023-12-11 15:24:29

by Chuck Lever

Subject: [PATCH v1 3/8] svcrdma: Remove pointer addresses shown in dprintk()

From: Chuck Lever <[email protected]>

There are a couple of dprintk() call sites in svc_rdma_accept()
that show pointer addresses. These days, displayed pointer addresses
are hashed and thus have little or no diagnostic value, especially
for site administrators.

Signed-off-by: Chuck Lever <[email protected]>
---
net/sunrpc/xprtrdma/svc_rdma_transport.c | 4 +---
1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index 3826da1c15f3..451814eb12b9 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -457,8 +457,6 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
qp_attr.qp_type = IB_QPT_RC;
qp_attr.send_cq = newxprt->sc_sq_cq;
qp_attr.recv_cq = newxprt->sc_rq_cq;
- dprintk("svcrdma: newxprt->sc_cm_id=%p, newxprt->sc_pd=%p\n",
- newxprt->sc_cm_id, newxprt->sc_pd);
dprintk(" cap.max_send_wr = %d, cap.max_recv_wr = %d\n",
qp_attr.cap.max_send_wr, qp_attr.cap.max_recv_wr);
dprintk(" cap.max_send_sge = %d, cap.max_recv_sge = %d\n",
@@ -512,7 +510,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
}

#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
- dprintk("svcrdma: new connection %p accepted:\n", newxprt);
+ dprintk("svcrdma: new connection accepted on device %s:\n", dev->name);
sap = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.src_addr;
dprintk(" local address : %pIS:%u\n", sap, rpc_get_port(sap));
sap = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.dst_addr;



2023-12-11 15:24:31

by Chuck Lever

Subject: [PATCH v1 4/8] svcrdma: Remove queue-shortening warnings

From: Chuck Lever <[email protected]>

These pr_warn() messages won't have much diagnostic value for site
administrators, and since they can't be disabled, they amount to
noise.

What's more, the subsequent rdma_create_qp() call adjusts the Send
Queue size (possibly downward) without warning, making the sizes
reported by these messages inaccurate.

Signed-off-by: Chuck Lever <[email protected]>
---
net/sunrpc/xprtrdma/svc_rdma_transport.c | 7 +------
1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index 451814eb12b9..040d2ef6400c 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -412,8 +412,6 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
rq_depth = newxprt->sc_max_requests + newxprt->sc_max_bc_requests +
newxprt->sc_recv_batch;
if (rq_depth > dev->attrs.max_qp_wr) {
- pr_warn("svcrdma: reducing receive depth to %d\n",
- dev->attrs.max_qp_wr);
rq_depth = dev->attrs.max_qp_wr;
newxprt->sc_recv_batch = 1;
newxprt->sc_max_requests = rq_depth - 2;
@@ -423,11 +421,8 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
ctxts = rdma_rw_mr_factor(dev, newxprt->sc_port_num, RPCSVC_MAXPAGES);
ctxts *= newxprt->sc_max_requests;
newxprt->sc_sq_depth = rq_depth + ctxts;
- if (newxprt->sc_sq_depth > dev->attrs.max_qp_wr) {
- pr_warn("svcrdma: reducing send depth to %d\n",
- dev->attrs.max_qp_wr);
+ if (newxprt->sc_sq_depth > dev->attrs.max_qp_wr)
newxprt->sc_sq_depth = dev->attrs.max_qp_wr;
- }
atomic_set(&newxprt->sc_sq_avail, newxprt->sc_sq_depth);

newxprt->sc_pd = ib_alloc_pd(dev, 0);



2023-12-11 15:24:38

by Chuck Lever

Subject: [PATCH v1 5/8] svcrdma: Clean up comment in svc_rdma_accept()

From: Chuck Lever <[email protected]>

The comment that starts "Qualify ..." applies to only part of the
code paragraph that follows it. Re-arrange the lines so that the
comment makes more sense.

Signed-off-by: Chuck Lever <[email protected]>
---
net/sunrpc/xprtrdma/svc_rdma_transport.c | 17 ++++++++++-------
1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index 040d2ef6400c..8127c711fa3b 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -397,18 +397,22 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
dev = newxprt->sc_cm_id->device;
newxprt->sc_port_num = newxprt->sc_cm_id->port_num;

- /* Qualify the transport resource defaults with the
- * capabilities of this particular device */
+ newxprt->sc_max_req_size = svcrdma_max_req_size;
+ newxprt->sc_max_requests = svcrdma_max_requests;
+ newxprt->sc_max_bc_requests = svcrdma_max_bc_requests;
+ newxprt->sc_recv_batch = RPCRDMA_MAX_RECV_BATCH;
+ newxprt->sc_fc_credits = cpu_to_be32(newxprt->sc_max_requests);
+
+ /* Qualify the transport's resource defaults with the
+ * capabilities of this particular device.
+ */
+
/* Transport header, head iovec, tail iovec */
newxprt->sc_max_send_sges = 3;
/* Add one SGE per page list entry */
newxprt->sc_max_send_sges += (svcrdma_max_req_size / PAGE_SIZE) + 1;
if (newxprt->sc_max_send_sges > dev->attrs.max_send_sge)
newxprt->sc_max_send_sges = dev->attrs.max_send_sge;
- newxprt->sc_max_req_size = svcrdma_max_req_size;
- newxprt->sc_max_requests = svcrdma_max_requests;
- newxprt->sc_max_bc_requests = svcrdma_max_bc_requests;
- newxprt->sc_recv_batch = RPCRDMA_MAX_RECV_BATCH;
rq_depth = newxprt->sc_max_requests + newxprt->sc_max_bc_requests +
newxprt->sc_recv_batch;
if (rq_depth > dev->attrs.max_qp_wr) {
@@ -417,7 +421,6 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
newxprt->sc_max_requests = rq_depth - 2;
newxprt->sc_max_bc_requests = 2;
}
- newxprt->sc_fc_credits = cpu_to_be32(newxprt->sc_max_requests);
ctxts = rdma_rw_mr_factor(dev, newxprt->sc_port_num, RPCSVC_MAXPAGES);
ctxts *= newxprt->sc_max_requests;
newxprt->sc_sq_depth = rq_depth + ctxts;



2023-12-11 15:24:44

by Chuck Lever

Subject: [PATCH v1 6/8] svcrdma: Reserve an extra WQE for ib_drain_rq()

From: Chuck Lever <[email protected]>

Do as other ULPs already do: ensure there is an extra Receive WQE
reserved for the tear-down drain WR. I haven't heard reports of
problems, but it can't hurt.

Note that rq_depth is used to compute the Send Queue depth as well,
so this fix should affect both the SQ and RQ.
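
For context, an illustration (not part of the patch) of why that
slot is needed: ib_drain_rq() works by posting one final sentinel
Receive WR and waiting for its completion, so a Receive Queue that
the transport keeps fully posted has no room for that WR unless a
WQE is reserved up front.

    #include <rdma/ib_verbs.h>

    /* Teardown-side sketch: draining the RQ consumes the Receive
     * WQE reserved by the "+ 1" added to rq_depth in this patch.
     */
    static void example_drain_receive_queue(struct ib_qp *qp)
    {
            ib_drain_rq(qp);    /* posts a sentinel Recv WR and waits */
    }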

Signed-off-by: Chuck Lever <[email protected]>
---
net/sunrpc/xprtrdma/svc_rdma_transport.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index 8127c711fa3b..0541aa54674c 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -414,7 +414,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
if (newxprt->sc_max_send_sges > dev->attrs.max_send_sge)
newxprt->sc_max_send_sges = dev->attrs.max_send_sge;
rq_depth = newxprt->sc_max_requests + newxprt->sc_max_bc_requests +
- newxprt->sc_recv_batch;
+ newxprt->sc_recv_batch + 1 /* drain */;
if (rq_depth > dev->attrs.max_qp_wr) {
rq_depth = dev->attrs.max_qp_wr;
newxprt->sc_recv_batch = 1;



2023-12-11 15:24:56

by Chuck Lever

Subject: [PATCH v1 7/8] svcrdma: Use all allocated Send Queue entries

From: Chuck Lever <[email protected]>

For upper layer protocols that request rw_ctxs, ib_create_qp()
adjusts ib_qp_init_attr::max_send_wr to accommodate the WQEs those
rw_ctxs will consume. See rdma_rw_init_qp() for details.

To actually use those additional WQEs, svc_rdma_accept() needs to
retrieve the corrected SQ depth after calling rdma_create_qp() and
set newxprt->sc_sq_depth and newxprt->sc_sq_avail so that
svc_rdma_send() and svc_rdma_post_chunk_ctxt() can utilize those
WQEs.

The NVMe target driver, for example, already does this properly.
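
A sketch of that generic ULP pattern, using illustrative names (the
svcrdma-specific version is in the hunks below): rdma_rw_init_qp()
raises cap.max_send_wr to cover the requested max_rdma_ctxs, and
rdma_create_qp() hands the final capabilities back in qp_attr, so
the ULP should size its Send Queue accounting from the returned
value rather than from its original request.

    #include <rdma/rdma_cm.h>
    #include <rdma/ib_verbs.h>

    static int example_create_qp(struct rdma_cm_id *cm_id,
                                 struct ib_pd *pd, struct ib_cq *sq_cq,
                                 struct ib_cq *rq_cq, u32 sq_depth,
                                 u32 rq_depth, u32 ctxts,
                                 u32 *actual_sq_depth)
    {
            struct ib_qp_init_attr qp_attr = {
                    .cap.max_send_wr   = sq_depth, /* ULP's own Send WRs */
                    .cap.max_recv_wr   = rq_depth,
                    .cap.max_rdma_ctxs = ctxts,    /* R/W contexts */
                    .cap.max_send_sge  = 1,
                    .cap.max_recv_sge  = 1,
                    .sq_sig_type       = IB_SIGNAL_REQ_WR,
                    .qp_type           = IB_QPT_RC,
                    .send_cq           = sq_cq,
                    .recv_cq           = rq_cq,
            };
            int ret;

            ret = rdma_create_qp(cm_id, pd, &qp_attr);
            if (ret)
                    return ret;

            /* max_send_wr now includes WQEs added for the rw_ctxs */
            *actual_sq_depth = qp_attr.cap.max_send_wr;
            return 0;
    }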

Fixes: 26fb2254dd33 ("svcrdma: Estimate Send Queue depth properly")
Signed-off-by: Chuck Lever <[email protected]>
---
net/sunrpc/xprtrdma/svc_rdma_transport.c | 39 +++++++++++++++++++-----------
1 file changed, 25 insertions(+), 14 deletions(-)

diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index 0541aa54674c..790841864153 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -369,12 +369,12 @@ static struct svc_xprt *svc_rdma_create(struct svc_serv *serv,
*/
static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
{
+ unsigned int ctxts, rq_depth, sq_depth;
struct svcxprt_rdma *listen_rdma;
struct svcxprt_rdma *newxprt = NULL;
struct rdma_conn_param conn_param;
struct rpcrdma_connect_private pmsg;
struct ib_qp_init_attr qp_attr;
- unsigned int ctxts, rq_depth;
struct ib_device *dev;
int ret = 0;
RPC_IFDEBUG(struct sockaddr *sap);
@@ -421,24 +421,32 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
newxprt->sc_max_requests = rq_depth - 2;
newxprt->sc_max_bc_requests = 2;
}
+
ctxts = rdma_rw_mr_factor(dev, newxprt->sc_port_num, RPCSVC_MAXPAGES);
ctxts *= newxprt->sc_max_requests;
- newxprt->sc_sq_depth = rq_depth + ctxts;
- if (newxprt->sc_sq_depth > dev->attrs.max_qp_wr)
- newxprt->sc_sq_depth = dev->attrs.max_qp_wr;
- atomic_set(&newxprt->sc_sq_avail, newxprt->sc_sq_depth);
+
+ sq_depth = newxprt->sc_max_requests + newxprt->sc_max_bc_requests + 1;
+ if (sq_depth > dev->attrs.max_qp_wr)
+ sq_depth = dev->attrs.max_qp_wr;

newxprt->sc_pd = ib_alloc_pd(dev, 0);
if (IS_ERR(newxprt->sc_pd)) {
trace_svcrdma_pd_err(newxprt, PTR_ERR(newxprt->sc_pd));
goto errout;
}
- newxprt->sc_sq_cq = ib_alloc_cq_any(dev, newxprt, newxprt->sc_sq_depth,
+
+ /* The Completion Queue depth is the maximum number of signaled
+ * WRs expected to be in flight. Every Send WR is signaled, and
+ * each rw_ctx has a chain of WRs, but only one WR in each chain
+ * is signaled.
+ */
+ newxprt->sc_sq_cq = ib_alloc_cq_any(dev, newxprt, sq_depth + ctxts,
IB_POLL_WORKQUEUE);
if (IS_ERR(newxprt->sc_sq_cq))
goto errout;
- newxprt->sc_rq_cq =
- ib_alloc_cq_any(dev, newxprt, rq_depth, IB_POLL_WORKQUEUE);
+ /* Every Receive WR is signaled. */
+ newxprt->sc_rq_cq = ib_alloc_cq_any(dev, newxprt, rq_depth,
+ IB_POLL_WORKQUEUE);
if (IS_ERR(newxprt->sc_rq_cq))
goto errout;

@@ -447,7 +455,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
qp_attr.qp_context = &newxprt->sc_xprt;
qp_attr.port_num = newxprt->sc_port_num;
qp_attr.cap.max_rdma_ctxs = ctxts;
- qp_attr.cap.max_send_wr = newxprt->sc_sq_depth - ctxts;
+ qp_attr.cap.max_send_wr = sq_depth;
qp_attr.cap.max_recv_wr = rq_depth;
qp_attr.cap.max_send_sge = newxprt->sc_max_send_sges;
qp_attr.cap.max_recv_sge = 1;
@@ -455,17 +463,20 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
qp_attr.qp_type = IB_QPT_RC;
qp_attr.send_cq = newxprt->sc_sq_cq;
qp_attr.recv_cq = newxprt->sc_rq_cq;
- dprintk(" cap.max_send_wr = %d, cap.max_recv_wr = %d\n",
- qp_attr.cap.max_send_wr, qp_attr.cap.max_recv_wr);
- dprintk(" cap.max_send_sge = %d, cap.max_recv_sge = %d\n",
- qp_attr.cap.max_send_sge, qp_attr.cap.max_recv_sge);
-
ret = rdma_create_qp(newxprt->sc_cm_id, newxprt->sc_pd, &qp_attr);
if (ret) {
trace_svcrdma_qp_err(newxprt, ret);
goto errout;
}
+ dprintk("svcrdma: cap.max_send_wr = %d, cap.max_recv_wr = %d\n",
+ qp_attr.cap.max_send_wr, qp_attr.cap.max_recv_wr);
+ dprintk(" cap.max_send_sge = %d, cap.max_recv_sge = %d\n",
+ qp_attr.cap.max_send_sge, qp_attr.cap.max_recv_sge);
+ dprintk(" send CQ depth = %d, recv CQ depth = %d\n",
+ sq_depth, rq_depth);
newxprt->sc_qp = newxprt->sc_cm_id->qp;
+ newxprt->sc_sq_depth = qp_attr.cap.max_send_wr;
+ atomic_set(&newxprt->sc_sq_avail, qp_attr.cap.max_send_wr);

if (!(dev->attrs.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS))
newxprt->sc_snd_w_inv = false;



2023-12-11 15:24:57

by Chuck Lever

Subject: [PATCH v1 8/8] svcrdma: Increase the per-transport rw_ctx count

From: Chuck Lever <[email protected]>

rdma_rw_mr_factor() returns the smallest number of MRs needed to
move a particular number of pages. svcrdma currently asks for the
number of MRs needed to move RPCSVC_MAXPAGES (a little over one
megabyte), as that is the number of pages in the largest r/wsize
the server supports.

This call assumes that the client's NIC can bundle a full one
megabyte payload in a single rdma_segment. In fact, most NICs cannot
handle a full megabyte with a single rkey / rdma_segment. Clients
will typically split even a single Read chunk into many segments.

The server needs one MR to read each rdma_segment in a Read chunk,
and thus each one needs an rw_ctx.

svcrdma has been vastly underestimating the number of rw_ctxs needed
to handle 64 RPC requests with large Read chunks using small
rdma_segments.

Unfortunately there doesn't seem to be a good way to estimate this
number without knowing the client NIC's capabilities. Even then,
the client RPC/RDMA implementation is still free to split a chunk
into smaller segments (for example, it might be using physical
registration, which needs an rdma_segment per page).

The best we can do for now is choose a number that will guarantee
forward progress in the worst case (one page per segment).
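
To make the gap concrete, a rough worked example (assuming 4KB
pages and the default 1MB maximum payload, so RPCSVC_MAXPAGES is
roughly 260, and the default sc_max_requests of 64):

    old estimate:  rdma_rw_mr_factor() * sc_max_requests
                   = 1 * 64 = 64 rw_ctxs, on a device whose MRs can
                   each cover a full-sized Read chunk
    worst case:    one rdma_segment, and thus one rw_ctx, per page;
                   roughly 260 rw_ctxs for a single RPC request
    new estimate:  3 * RPCSVC_MAXPAGES, roughly 780 rw_ctxs, enough
                   for a few worst-case requests to make progress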

At some later point, we could add some mechanisms to make this
much less of a problem:
- Add a core API to add more rw_ctxs to an already-established QP
- svcrdma could treat rw_ctx exhaustion as a temporary error and
try again
- Limit the number of Reads in flight

Signed-off-by: Chuck Lever <[email protected]>
---
net/sunrpc/xprtrdma/svc_rdma_transport.c | 8 ++++++--
1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index 790841864153..0ceb2817ca4d 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -422,8 +422,12 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
newxprt->sc_max_bc_requests = 2;
}

- ctxts = rdma_rw_mr_factor(dev, newxprt->sc_port_num, RPCSVC_MAXPAGES);
- ctxts *= newxprt->sc_max_requests;
+ /* Arbitrarily estimate the number of rw_ctxs needed for
+ * this transport. This is enough rw_ctxs to make forward
+ * progress even if the client is using one rkey per page
+ * in each Read chunk.
+ */
+ ctxts = 3 * RPCSVC_MAXPAGES;

sq_depth = newxprt->sc_max_requests + newxprt->sc_max_bc_requests + 1;
if (sq_depth > dev->attrs.max_qp_wr)