After folks tried out RPCRDMA_REGISTER support as I requested in
the cover letter of the last version of this series, existing
problems were discovered already in the upstream kernel, starting
with the problem addressed by Steve's LOCAL_WRITE patch from last
week.
Rather than address them, this series simply removes what is now an
obsolete memory registration mode. For some time, HCAs that do not
support FRMR fall back on ALLPHYSICAL mode, leaving REGISTER to
gather bit rot.
In addition I added three patches that implement improvements to
the new split completion handler, based on suggestions from Sagi
Grimberg.
Test and review the "nfs-rdma-client" branch:
git://git.linux-nfs.org/projects/cel/cel-2.6.git
Thanks!
---
Allen Andrews (1):
nfs-rdma: Fix for FMR leaks
Chuck Lever (15):
xprtrdma: Reduce the number of hardway buffer allocations
xprtrdma: Limit work done by completion handler
xprtrmda: Reduce calls to ib_poll_cq() in completion handlers
xprtrmda: Reduce lock contention in completion handlers
xprtrdma: Split the completion queue
xprtrdma: Make rpcrdma_ep_destroy() return void
xprtrdma: Simplify rpcrdma_deregister_external() synopsis
xprtrdma: Add CONFIG setting that can disable ALLPHYSICAL
xprtrdma: mount reports "Invalid mount option" if memreg mode not supported
xprtrdma: Fall back to MTHCAFMR when FRMR is not supported
xprtrdma: Remove REGISTER memory registration mode
xprtrdma: Remove MEMWINDOWS registration modes
xprtrdma: Remove BOUNCEBUFFERS memory registration mode
xprtrdma: RPC/RDMA must invoke xprt_wake_pending_tasks() in process context
xprtrdma: Enable RDMA pad optimization by default
Steve Wise (1):
xprtrdma: mind the device's max fast register page list depth
include/linux/sunrpc/xprtrdma.h | 2
net/sunrpc/Kconfig | 14 +
net/sunrpc/xprtrdma/rpc_rdma.c | 63 +---
net/sunrpc/xprtrdma/transport.c | 34 --
net/sunrpc/xprtrdma/verbs.c | 694 +++++++++++++++------------------------
net/sunrpc/xprtrdma/xprt_rdma.h | 16 +
6 files changed, 316 insertions(+), 507 deletions(-)
--
Chuck Lever
Skip the ib_poll_cq() after re-arming, if the provider knows there
are no additional items waiting. (Have a look at commit ed23a727 for
more details).
Signed-off-by: Chuck Lever <[email protected]>
---
net/sunrpc/xprtrdma/verbs.c | 14 ++++++++++----
1 files changed, 10 insertions(+), 4 deletions(-)
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index ce7b45f..81daf2b 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -192,8 +192,11 @@ rpcrdma_sendcq_upcall(struct ib_cq *cq, void *cq_context)
return;
}
- rc = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
- if (rc) {
+ rc = ib_req_notify_cq(cq,
+ IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS);
+ if (rc == 0)
+ return;
+ if (rc < 0) {
dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
__func__, rc);
return;
@@ -272,8 +275,11 @@ rpcrdma_recvcq_upcall(struct ib_cq *cq, void *cq_context)
return;
}
- rc = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
- if (rc) {
+ rc = ib_req_notify_cq(cq,
+ IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS);
+ if (rc == 0)
+ return;
+ if (rc < 0) {
dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
__func__, rc);
return;
The MEMWINDOWS and MEMWINDOES_ASYNC memory registration modes were
intended as stop-gap modes before the introduction of FRMR. They
are now considered obsolete.
MEMWINDOWS_ASYNC is also considered unsafe because it can leave
client memory registered and exposed for an indeterminant time after
each I/O.
At this point, the MEMWINDOWS modes add needless complexity, so
remove them.
Signed-off-by: Chuck Lever <[email protected]>
Tested-by: Steve Wise <[email protected]>
---
net/sunrpc/xprtrdma/rpc_rdma.c | 34 --------
net/sunrpc/xprtrdma/transport.c | 9 --
net/sunrpc/xprtrdma/verbs.c | 165 +--------------------------------------
net/sunrpc/xprtrdma/xprt_rdma.h | 2
4 files changed, 7 insertions(+), 203 deletions(-)
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index b963e50..e4af26a 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -202,7 +202,6 @@ rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target,
return 0;
do {
- /* bind/register the memory, then build chunk from result. */
int n = rpcrdma_register_external(seg, nsegs,
cur_wchunk != NULL, r_xprt);
if (n <= 0)
@@ -701,16 +700,6 @@ rpcrdma_conn_func(struct rpcrdma_ep *ep)
}
/*
- * This function is called when memory window unbind which we are waiting
- * for completes. Just use rr_func (zeroed by upcall) to signal completion.
- */
-static void
-rpcrdma_unbind_func(struct rpcrdma_rep *rep)
-{
- wake_up(&rep->rr_unbind);
-}
-
-/*
* Called as a tasklet to do req/reply match and complete a request
* Errors must result in the RPC task either being awakened, or
* allowed to timeout, to discover the errors at that time.
@@ -724,7 +713,7 @@ rpcrdma_reply_handler(struct rpcrdma_rep *rep)
struct rpc_xprt *xprt = rep->rr_xprt;
struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
__be32 *iptr;
- int i, rdmalen, status;
+ int rdmalen, status;
/* Check status. If bad, signal disconnect and return rep to pool */
if (rep->rr_len == ~0U) {
@@ -853,27 +842,6 @@ badheader:
break;
}
- /* If using mw bind, start the deregister process now. */
- /* (Note: if mr_free(), cannot perform it here, in tasklet context) */
- if (req->rl_nchunks) switch (r_xprt->rx_ia.ri_memreg_strategy) {
- case RPCRDMA_MEMWINDOWS:
- for (i = 0; req->rl_nchunks-- > 1;)
- i += rpcrdma_deregister_external(
- &req->rl_segments[i], r_xprt, NULL);
- /* Optionally wait (not here) for unbinds to complete */
- rep->rr_func = rpcrdma_unbind_func;
- (void) rpcrdma_deregister_external(&req->rl_segments[i],
- r_xprt, rep);
- break;
- case RPCRDMA_MEMWINDOWS_ASYNC:
- for (i = 0; req->rl_nchunks--;)
- i += rpcrdma_deregister_external(&req->rl_segments[i],
- r_xprt, NULL);
- break;
- default:
- break;
- }
-
dprintk("RPC: %s: xprt_complete_rqst(0x%p, 0x%p, %d)\n",
__func__, xprt, rqst, status);
xprt_complete_rqst(rqst->rq_task, status);
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index b4330b2..4607b62 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -566,9 +566,7 @@ xprt_rdma_free(void *buffer)
__func__, rep, (rep && rep->rr_func) ? " (with waiter)" : "");
/*
- * Finish the deregistration. When using mw bind, this was
- * begun in rpcrdma_reply_handler(). In all other modes, we
- * do it here, in thread context. The process is considered
+ * Finish the deregistration. The process is considered
* complete when the rr_func vector becomes NULL - this
* was put in place during rpcrdma_reply_handler() - the wait
* call below will not block if the dereg is "done". If
@@ -580,11 +578,6 @@ xprt_rdma_free(void *buffer)
&req->rl_segments[i], r_xprt, NULL);
}
- if (rep && wait_event_interruptible(rep->rr_unbind, !rep->rr_func)) {
- rep->rr_func = NULL; /* abandon the callback */
- req->rl_reply = NULL;
- }
-
if (req->rl_iov.length == 0) { /* see allocate above */
struct rpcrdma_req *oreq = (struct rpcrdma_req *)req->rl_buffer;
oreq->rl_reply = req->rl_reply;
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 4a4e4ea..304c7ad 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -152,7 +152,7 @@ void rpcrdma_event_process(struct ib_wc *wc)
dprintk("RPC: %s: event rep %p status %X opcode %X length %u\n",
__func__, rep, wc->status, wc->opcode, wc->byte_len);
- if (!rep) /* send or bind completion that we don't care about */
+ if (!rep) /* send completion that we don't care about */
return;
if (IB_WC_SUCCESS != wc->status) {
@@ -197,8 +197,6 @@ void rpcrdma_event_process(struct ib_wc *wc)
}
atomic_set(&rep->rr_buffer->rb_credits, credits);
}
- /* fall through */
- case IB_WC_BIND_MW:
rpcrdma_schedule_tasklet(rep);
break;
default:
@@ -233,7 +231,7 @@ rpcrdma_cq_poll(struct ib_cq *cq)
/*
* rpcrdma_cq_event_upcall
*
- * This upcall handles recv, send, bind and unbind events.
+ * This upcall handles recv and send events.
* It is reentrant but processes single events in order to maintain
* ordering of receives to keep server credits.
*
@@ -494,16 +492,6 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
}
switch (memreg) {
- case RPCRDMA_MEMWINDOWS:
- case RPCRDMA_MEMWINDOWS_ASYNC:
- if (!(devattr.device_cap_flags & IB_DEVICE_MEM_WINDOW)) {
- dprintk("RPC: %s: MEMWINDOWS registration "
- "specified but not supported by adapter, "
- "using slower RPCRDMA_REGISTER\n",
- __func__);
- memreg = RPCRDMA_REGISTER;
- }
- break;
case RPCRDMA_MTHCAFMR:
if (!ia->ri_id->device->alloc_fmr) {
#if RPCRDMA_PERSISTENT_REGISTRATION
@@ -567,16 +555,13 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
IB_ACCESS_REMOTE_READ;
goto register_setup;
#endif
- case RPCRDMA_MEMWINDOWS_ASYNC:
- case RPCRDMA_MEMWINDOWS:
- mem_priv = IB_ACCESS_LOCAL_WRITE |
- IB_ACCESS_MW_BIND;
- goto register_setup;
case RPCRDMA_MTHCAFMR:
if (ia->ri_have_dma_lkey)
break;
mem_priv = IB_ACCESS_LOCAL_WRITE;
+#if RPCRDMA_PERSISTENT_REGISTRATION
register_setup:
+#endif
ia->ri_bind_mem = ib_get_dma_mr(ia->ri_pd, mem_priv);
if (IS_ERR(ia->ri_bind_mem)) {
printk(KERN_ALERT "%s: ib_get_dma_mr for "
@@ -699,14 +684,6 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
}
break;
}
- case RPCRDMA_MEMWINDOWS_ASYNC:
- case RPCRDMA_MEMWINDOWS:
- /* Add room for mw_binds+unbinds - overkill! */
- ep->rep_attr.cap.max_send_wr++;
- ep->rep_attr.cap.max_send_wr *= (2 * RPCRDMA_MAX_SEGS);
- if (ep->rep_attr.cap.max_send_wr > devattr.max_qp_wr)
- return -EINVAL;
- break;
default:
break;
}
@@ -728,14 +705,6 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
/* set trigger for requesting send completion */
ep->rep_cqinit = ep->rep_attr.cap.max_send_wr/2 /* - 1*/;
- switch (ia->ri_memreg_strategy) {
- case RPCRDMA_MEMWINDOWS_ASYNC:
- case RPCRDMA_MEMWINDOWS:
- ep->rep_cqinit -= RPCRDMA_MAX_SEGS;
- break;
- default:
- break;
- }
if (ep->rep_cqinit <= 2)
ep->rep_cqinit = 0;
INIT_CQCOUNT(ep);
@@ -743,11 +712,6 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
init_waitqueue_head(&ep->rep_connect_wait);
INIT_DELAYED_WORK(&ep->rep_connect_worker, rpcrdma_connect_worker);
- /*
- * Create a single cq for receive dto and mw_bind (only ever
- * care about unbind, really). Send completions are suppressed.
- * Use single threaded tasklet upcalls to maintain ordering.
- */
ep->rep_cq = ib_create_cq(ia->ri_id->device, rpcrdma_cq_event_upcall,
rpcrdma_cq_async_error_upcall, NULL,
ep->rep_attr.cap.max_recv_wr +
@@ -1020,11 +984,6 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
len += (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS *
sizeof(struct rpcrdma_mw);
break;
- case RPCRDMA_MEMWINDOWS_ASYNC:
- case RPCRDMA_MEMWINDOWS:
- len += (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS *
- sizeof(struct rpcrdma_mw);
- break;
default:
break;
}
@@ -1055,11 +1014,6 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
}
p += cdata->padding;
- /*
- * Allocate the fmr's, or mw's for mw_bind chunk registration.
- * We "cycle" the mw's in order to minimize rkey reuse,
- * and also reduce unbind-to-bind collision.
- */
INIT_LIST_HEAD(&buf->rb_mws);
r = (struct rpcrdma_mw *)p;
switch (ia->ri_memreg_strategy) {
@@ -1107,21 +1061,6 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
++r;
}
break;
- case RPCRDMA_MEMWINDOWS_ASYNC:
- case RPCRDMA_MEMWINDOWS:
- /* Allocate one extra request's worth, for full cycling */
- for (i = (buf->rb_max_requests+1) * RPCRDMA_MAX_SEGS; i; i--) {
- r->r.mw = ib_alloc_mw(ia->ri_pd, IB_MW_TYPE_1);
- if (IS_ERR(r->r.mw)) {
- rc = PTR_ERR(r->r.mw);
- dprintk("RPC: %s: ib_alloc_mw"
- " failed %i\n", __func__, rc);
- goto out;
- }
- list_add(&r->mw_list, &buf->rb_mws);
- ++r;
- }
- break;
default:
break;
}
@@ -1170,7 +1109,6 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
memset(rep, 0, sizeof(struct rpcrdma_rep));
buf->rb_recv_bufs[i] = rep;
buf->rb_recv_bufs[i]->rr_buffer = buf;
- init_waitqueue_head(&rep->rr_unbind);
rc = rpcrdma_register_internal(ia, rep->rr_base,
len - offsetof(struct rpcrdma_rep, rr_base),
@@ -1204,7 +1142,6 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
/* clean up in reverse order from create
* 1. recv mr memory (mr free, then kfree)
- * 1a. bind mw memory
* 2. send mr memory (mr free, then kfree)
* 3. padding (if any) [moved to rpcrdma_ep_destroy]
* 4. arrays
@@ -1248,15 +1185,6 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
" failed %i\n",
__func__, rc);
break;
- case RPCRDMA_MEMWINDOWS_ASYNC:
- case RPCRDMA_MEMWINDOWS:
- rc = ib_dealloc_mw(r->r.mw);
- if (rc)
- dprintk("RPC: %s:"
- " ib_dealloc_mw"
- " failed %i\n",
- __func__, rc);
- break;
default:
break;
}
@@ -1331,15 +1259,12 @@ rpcrdma_buffer_put(struct rpcrdma_req *req)
req->rl_niovs = 0;
if (req->rl_reply) {
buffers->rb_recv_bufs[--buffers->rb_recv_index] = req->rl_reply;
- init_waitqueue_head(&req->rl_reply->rr_unbind);
req->rl_reply->rr_func = NULL;
req->rl_reply = NULL;
}
switch (ia->ri_memreg_strategy) {
case RPCRDMA_FRMR:
case RPCRDMA_MTHCAFMR:
- case RPCRDMA_MEMWINDOWS_ASYNC:
- case RPCRDMA_MEMWINDOWS:
/*
* Cycle mw's back in reverse order, and "spin" them.
* This delays and scrambles reuse as much as possible.
@@ -1384,8 +1309,7 @@ rpcrdma_recv_buffer_get(struct rpcrdma_req *req)
/*
* Put reply buffers back into pool when not attached to
- * request. This happens in error conditions, and when
- * aborting unbinds. Pre-decrement counter/array index.
+ * request. This happens in error conditions.
*/
void
rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep)
@@ -1688,74 +1612,6 @@ rpcrdma_deregister_fmr_external(struct rpcrdma_mr_seg *seg,
}
static int
-rpcrdma_register_memwin_external(struct rpcrdma_mr_seg *seg,
- int *nsegs, int writing, struct rpcrdma_ia *ia,
- struct rpcrdma_xprt *r_xprt)
-{
- int mem_priv = (writing ? IB_ACCESS_REMOTE_WRITE :
- IB_ACCESS_REMOTE_READ);
- struct ib_mw_bind param;
- int rc;
-
- *nsegs = 1;
- rpcrdma_map_one(ia, seg, writing);
- param.bind_info.mr = ia->ri_bind_mem;
- param.wr_id = 0ULL; /* no send cookie */
- param.bind_info.addr = seg->mr_dma;
- param.bind_info.length = seg->mr_len;
- param.send_flags = 0;
- param.bind_info.mw_access_flags = mem_priv;
-
- DECR_CQCOUNT(&r_xprt->rx_ep);
- rc = ib_bind_mw(ia->ri_id->qp, seg->mr_chunk.rl_mw->r.mw, ¶m);
- if (rc) {
- dprintk("RPC: %s: failed ib_bind_mw "
- "%u@0x%llx status %i\n",
- __func__, seg->mr_len,
- (unsigned long long)seg->mr_dma, rc);
- rpcrdma_unmap_one(ia, seg);
- } else {
- seg->mr_rkey = seg->mr_chunk.rl_mw->r.mw->rkey;
- seg->mr_base = param.bind_info.addr;
- seg->mr_nsegs = 1;
- }
- return rc;
-}
-
-static int
-rpcrdma_deregister_memwin_external(struct rpcrdma_mr_seg *seg,
- struct rpcrdma_ia *ia,
- struct rpcrdma_xprt *r_xprt, void **r)
-{
- struct ib_mw_bind param;
- LIST_HEAD(l);
- int rc;
-
- BUG_ON(seg->mr_nsegs != 1);
- param.bind_info.mr = ia->ri_bind_mem;
- param.bind_info.addr = 0ULL; /* unbind */
- param.bind_info.length = 0;
- param.bind_info.mw_access_flags = 0;
- if (*r) {
- param.wr_id = (u64) (unsigned long) *r;
- param.send_flags = IB_SEND_SIGNALED;
- INIT_CQCOUNT(&r_xprt->rx_ep);
- } else {
- param.wr_id = 0ULL;
- param.send_flags = 0;
- DECR_CQCOUNT(&r_xprt->rx_ep);
- }
- rc = ib_bind_mw(ia->ri_id->qp, seg->mr_chunk.rl_mw->r.mw, ¶m);
- rpcrdma_unmap_one(ia, seg);
- if (rc)
- dprintk("RPC: %s: failed ib_(un)bind_mw,"
- " status %i\n", __func__, rc);
- else
- *r = NULL; /* will upcall on completion */
- return rc;
-}
-
-static int
rpcrdma_register_default_external(struct rpcrdma_mr_seg *seg,
int *nsegs, int writing, struct rpcrdma_ia *ia)
{
@@ -1845,12 +1701,6 @@ rpcrdma_register_external(struct rpcrdma_mr_seg *seg,
rc = rpcrdma_register_fmr_external(seg, &nsegs, writing, ia);
break;
- /* Registration using memory windows */
- case RPCRDMA_MEMWINDOWS_ASYNC:
- case RPCRDMA_MEMWINDOWS:
- rc = rpcrdma_register_memwin_external(seg, &nsegs, writing, ia, r_xprt);
- break;
-
/* Default registration each time */
default:
rc = rpcrdma_register_default_external(seg, &nsegs, writing, ia);
@@ -1887,11 +1737,6 @@ rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg,
rc = rpcrdma_deregister_fmr_external(seg, ia);
break;
- case RPCRDMA_MEMWINDOWS_ASYNC:
- case RPCRDMA_MEMWINDOWS:
- rc = rpcrdma_deregister_memwin_external(seg, ia, r_xprt, &r);
- break;
-
default:
rc = rpcrdma_deregister_default_external(seg, ia);
break;
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index c620d13..bf08ee0 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -127,7 +127,6 @@ struct rpcrdma_rep {
struct rpc_xprt *rr_xprt; /* needed for request/reply matching */
void (*rr_func)(struct rpcrdma_rep *);/* called by tasklet in softint */
struct list_head rr_list; /* tasklet list */
- wait_queue_head_t rr_unbind; /* optional unbind wait */
struct ib_sge rr_iov; /* for posting */
struct ib_mr *rr_handle; /* handle for mem in rr_iov */
char rr_base[MAX_RPCRDMAHDR]; /* minimal inline receive buffer */
@@ -162,7 +161,6 @@ struct rpcrdma_mr_seg { /* chunk descriptors */
struct ib_mr *rl_mr; /* if registered directly */
struct rpcrdma_mw { /* if registered from region */
union {
- struct ib_mw *mw;
struct ib_fmr *fmr;
struct {
struct ib_fast_reg_page_list *fr_pgl;
On 4/22/2014 9:23 AM, Christoph Hellwig wrote:
> On Mon, Apr 21, 2014 at 06:02:14PM -0400, Chuck Lever wrote:
>> ALLPHYSICAL is not a safe memory registration mode because it
>> permits NFS servers to write anywhere in a client's memory. NFS
>> server bugs could result in client memory being overwritten.
>>
>> This can be useful for embedded systems which do not support more
>> surgical RDMA memory registration and protection methods, or for
>> bring-up of new HCA hardware.
>>
>> However, enterprise Linux distributions have expressed a desire to
>> disable it in production environments.
> It's just as unsafe in embedded devices. I think it should go
For small IOs pattern, ALLPHYSICAL should outperform any registration method
in terms of IOP rate (simply because it doesn't do it).
Generally speaking, deployments that may prefer higher IOP rate in the cost
of a security do exist out there...
Sagi.
An audit of in-kernel RDMA providers that do not support the FRMR
memory registration shows that several of them support MTHCAFMR.
Prefer MTHCAFMR when FRMR is not supported.
If MTHCAFMR is not supported, only then choose ALLPHYSICAL.
Signed-off-by: Chuck Lever <[email protected]>
---
net/sunrpc/xprtrdma/verbs.c | 31 +++++++++++++++----------------
1 files changed, 15 insertions(+), 16 deletions(-)
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 6bb9a07..a352798 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -491,33 +491,32 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
ia->ri_dma_lkey = ia->ri_id->device->local_dma_lkey;
}
- switch (memreg) {
- case RPCRDMA_MTHCAFMR:
- if (!ia->ri_id->device->alloc_fmr) {
- dprintk("RPC: %s: MTHCAFMR registration "
- "specified but not supported by adapter, "
- "using riskier RPCRDMA_ALLPHYSICAL\n",
- __func__);
- memreg = RPCRDMA_ALLPHYSICAL;
- }
- break;
- case RPCRDMA_FRMR:
+ if (memreg == RPCRDMA_FRMR) {
/* Requires both frmr reg and local dma lkey */
if ((devattr.device_cap_flags &
(IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) !=
(IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) {
dprintk("RPC: %s: FRMR registration "
- "specified but not supported by adapter, "
- "using riskier RPCRDMA_ALLPHYSICAL\n",
- __func__);
- memreg = RPCRDMA_ALLPHYSICAL;
+ "not supported by HCA\n", __func__);
+ memreg = RPCRDMA_MTHCAFMR;
} else {
/* Mind the ia limit on FRMR page list depth */
ia->ri_max_frmr_depth = min_t(unsigned int,
RPCRDMA_MAX_DATA_SEGS,
devattr.max_fast_reg_page_list_len);
}
- break;
+ }
+ if (memreg == RPCRDMA_MTHCAFMR) {
+ if (!ia->ri_id->device->alloc_fmr) {
+ dprintk("RPC: %s: MTHCAFMR registration "
+ "not supported by HCA\n", __func__);
+#if RPCRDMA_PERSISTENT_REGISTRATION
+ memreg = RPCRDMA_ALLPHYSICAL;
+#else
+ rc = -EINVAL;
+ goto out2;
+#endif
+ }
}
/*
On 4/22/2014 1:03 AM, Chuck Lever wrote:
> Sagi Grimberg <[email protected]> points out that a steady
> stream of CQ events could starve other work because of the boundless
> loop pooling in rpcrdma_{send,recv}_poll().
>
> Instead of a (potentially infinite) while loop, return after
> collecting a budgeted number of completions.
>
> Note that the total number of WCs that can be handled during one
> upcall is RPCRDMA_WC_BUDGET * 2, since the handler polls once before
> and once after re-enabling completion notifications.
>
> Signed-off-by: Chuck Lever <[email protected]>
> ---
>
> net/sunrpc/xprtrdma/verbs.c | 10 ++++++----
> net/sunrpc/xprtrdma/xprt_rdma.h | 1 +
> 2 files changed, 7 insertions(+), 4 deletions(-)
>
> diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
> index abb8d8d..d46bdee 100644
> --- a/net/sunrpc/xprtrdma/verbs.c
> +++ b/net/sunrpc/xprtrdma/verbs.c
> @@ -165,8 +165,9 @@ static int
> rpcrdma_sendcq_poll(struct ib_cq *cq, struct rpcrdma_ep *ep)
> {
> struct ib_wc *wcs;
> - int count, rc;
> + int budget, count, rc;
>
> + budget = RPCRDMA_WC_BUDGET / RPCRDMA_POLLSIZE;
> do {
> wcs = ep->rep_send_wcs;
>
> @@ -177,7 +178,7 @@ rpcrdma_sendcq_poll(struct ib_cq *cq, struct rpcrdma_ep *ep)
> count = rc;
> while (count-- > 0)
> rpcrdma_sendcq_process_wc(wcs++);
> - } while (rc == RPCRDMA_POLLSIZE);
> + } while (rc == RPCRDMA_POLLSIZE && --budget);
> return 0;
> }
>
> @@ -254,8 +255,9 @@ static int
> rpcrdma_recvcq_poll(struct ib_cq *cq, struct rpcrdma_ep *ep)
> {
> struct ib_wc *wcs;
> - int count, rc;
> + int budget, count, rc;
>
> + budget = RPCRDMA_WC_BUDGET / RPCRDMA_POLLSIZE;
> do {
> wcs = ep->rep_recv_wcs;
>
> @@ -266,7 +268,7 @@ rpcrdma_recvcq_poll(struct ib_cq *cq, struct rpcrdma_ep *ep)
> count = rc;
> while (count-- > 0)
> rpcrdma_recvcq_process_wc(wcs++);
> - } while (rc == RPCRDMA_POLLSIZE);
> + } while (rc == RPCRDMA_POLLSIZE && --budget);
> return 0;
> }
>
> diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
> index cb4c882..0c3b88e 100644
> --- a/net/sunrpc/xprtrdma/xprt_rdma.h
> +++ b/net/sunrpc/xprtrdma/xprt_rdma.h
> @@ -74,6 +74,7 @@ struct rpcrdma_ia {
> * RDMA Endpoint -- one per transport instance
> */
>
> +#define RPCRDMA_WC_BUDGET (128)
Would be nice to be able to configure that (modparam perhaps?)
Other than that, looks OK.
Acked-by: Sagi Grimberg <[email protected]>
Sagi Grimberg <[email protected]> points out that a steady
stream of CQ events could starve other work because of the boundless
loop pooling in rpcrdma_{send,recv}_poll().
Instead of a (potentially infinite) while loop, return after
collecting a budgeted number of completions.
Note that the total number of WCs that can be handled during one
upcall is RPCRDMA_WC_BUDGET * 2, since the handler polls once before
and once after re-enabling completion notifications.
Signed-off-by: Chuck Lever <[email protected]>
---
net/sunrpc/xprtrdma/verbs.c | 10 ++++++----
net/sunrpc/xprtrdma/xprt_rdma.h | 1 +
2 files changed, 7 insertions(+), 4 deletions(-)
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index abb8d8d..d46bdee 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -165,8 +165,9 @@ static int
rpcrdma_sendcq_poll(struct ib_cq *cq, struct rpcrdma_ep *ep)
{
struct ib_wc *wcs;
- int count, rc;
+ int budget, count, rc;
+ budget = RPCRDMA_WC_BUDGET / RPCRDMA_POLLSIZE;
do {
wcs = ep->rep_send_wcs;
@@ -177,7 +178,7 @@ rpcrdma_sendcq_poll(struct ib_cq *cq, struct rpcrdma_ep *ep)
count = rc;
while (count-- > 0)
rpcrdma_sendcq_process_wc(wcs++);
- } while (rc == RPCRDMA_POLLSIZE);
+ } while (rc == RPCRDMA_POLLSIZE && --budget);
return 0;
}
@@ -254,8 +255,9 @@ static int
rpcrdma_recvcq_poll(struct ib_cq *cq, struct rpcrdma_ep *ep)
{
struct ib_wc *wcs;
- int count, rc;
+ int budget, count, rc;
+ budget = RPCRDMA_WC_BUDGET / RPCRDMA_POLLSIZE;
do {
wcs = ep->rep_recv_wcs;
@@ -266,7 +268,7 @@ rpcrdma_recvcq_poll(struct ib_cq *cq, struct rpcrdma_ep *ep)
count = rc;
while (count-- > 0)
rpcrdma_recvcq_process_wc(wcs++);
- } while (rc == RPCRDMA_POLLSIZE);
+ } while (rc == RPCRDMA_POLLSIZE && --budget);
return 0;
}
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index cb4c882..0c3b88e 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -74,6 +74,7 @@ struct rpcrdma_ia {
* RDMA Endpoint -- one per transport instance
*/
+#define RPCRDMA_WC_BUDGET (128)
#define RPCRDMA_POLLSIZE (16)
struct rpcrdma_ep {
From: Allen Andrews <[email protected]>
Two memory region leaks were found during testing:
1. rpcrdma_buffer_create: While allocating RPCRDMA_FRMR's
ib_alloc_fast_reg_mr is called and then ib_alloc_fast_reg_page_list is
called. If ib_alloc_fast_reg_page_list returns an error it bails out of
the routine dropping the last ib_alloc_fast_reg_mr frmr region creating a
memory leak. Added code to dereg the last frmr if
ib_alloc_fast_reg_page_list fails.
2. rpcrdma_buffer_destroy: While cleaning up, the routine will only free
the MR's on the rb_mws list if there are rb_send_bufs present. However, in
rpcrdma_buffer_create while the rb_mws list is being built if one of the MR
allocation requests fail after some MR's have been allocated on the rb_mws
list the routine never gets to create any rb_send_bufs but instead jumps to
the rpcrdma_buffer_destroy routine which will never free the MR's on rb_mws
list because the rb_send_bufs were never created. This leaks all the MR's
on the rb_mws list that were created prior to one of the MR allocations
failing.
Issue(2) was seen during testing. Our adapter had a finite number of MR's
available and we created enough connections to where we saw an MR
allocation failure on our Nth NFS connection request. After the kernel
cleaned up the resources it had allocated for the Nth connection we noticed
that FMR's had been leaked due to the coding error described above.
Issue(1) was seen during a code review while debugging issue(2).
Signed-off-by: Allen Andrews <[email protected]>
Reviewed-by: Chuck Lever <[email protected]>
---
net/sunrpc/xprtrdma/verbs.c | 73 ++++++++++++++++++++++---------------------
1 files changed, 38 insertions(+), 35 deletions(-)
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 55fb09a..8f9704e 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -1081,6 +1081,8 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
dprintk("RPC: %s: "
"ib_alloc_fast_reg_page_list "
"failed %i\n", __func__, rc);
+
+ ib_dereg_mr(r->r.frmr.fr_mr);
goto out;
}
list_add(&r->mw_list, &buf->rb_mws);
@@ -1217,41 +1219,6 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
kfree(buf->rb_recv_bufs[i]);
}
if (buf->rb_send_bufs && buf->rb_send_bufs[i]) {
- while (!list_empty(&buf->rb_mws)) {
- r = list_entry(buf->rb_mws.next,
- struct rpcrdma_mw, mw_list);
- list_del(&r->mw_list);
- switch (ia->ri_memreg_strategy) {
- case RPCRDMA_FRMR:
- rc = ib_dereg_mr(r->r.frmr.fr_mr);
- if (rc)
- dprintk("RPC: %s:"
- " ib_dereg_mr"
- " failed %i\n",
- __func__, rc);
- ib_free_fast_reg_page_list(r->r.frmr.fr_pgl);
- break;
- case RPCRDMA_MTHCAFMR:
- rc = ib_dealloc_fmr(r->r.fmr);
- if (rc)
- dprintk("RPC: %s:"
- " ib_dealloc_fmr"
- " failed %i\n",
- __func__, rc);
- break;
- case RPCRDMA_MEMWINDOWS_ASYNC:
- case RPCRDMA_MEMWINDOWS:
- rc = ib_dealloc_mw(r->r.mw);
- if (rc)
- dprintk("RPC: %s:"
- " ib_dealloc_mw"
- " failed %i\n",
- __func__, rc);
- break;
- default:
- break;
- }
- }
rpcrdma_deregister_internal(ia,
buf->rb_send_bufs[i]->rl_handle,
&buf->rb_send_bufs[i]->rl_iov);
@@ -1259,6 +1226,42 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
}
}
+ while (!list_empty(&buf->rb_mws)) {
+ r = list_entry(buf->rb_mws.next,
+ struct rpcrdma_mw, mw_list);
+ list_del(&r->mw_list);
+ switch (ia->ri_memreg_strategy) {
+ case RPCRDMA_FRMR:
+ rc = ib_dereg_mr(r->r.frmr.fr_mr);
+ if (rc)
+ dprintk("RPC: %s:"
+ " ib_dereg_mr"
+ " failed %i\n",
+ __func__, rc);
+ ib_free_fast_reg_page_list(r->r.frmr.fr_pgl);
+ break;
+ case RPCRDMA_MTHCAFMR:
+ rc = ib_dealloc_fmr(r->r.fmr);
+ if (rc)
+ dprintk("RPC: %s:"
+ " ib_dealloc_fmr"
+ " failed %i\n",
+ __func__, rc);
+ break;
+ case RPCRDMA_MEMWINDOWS_ASYNC:
+ case RPCRDMA_MEMWINDOWS:
+ rc = ib_dealloc_mw(r->r.mw);
+ if (rc)
+ dprintk("RPC: %s:"
+ " ib_dealloc_mw"
+ " failed %i\n",
+ __func__, rc);
+ break;
+ default:
+ break;
+ }
+ }
+
kfree(buf->rb_pool);
}
If the selected memory registration mode is not supported by the
underlying provider/HCA, the NFS mount command reports that there was
an invalid mount option, and fails. This is misleading.
Reporting a problem allocating memory is a lot closer to the truth.
Signed-off-by: Chuck Lever <[email protected]>
---
net/sunrpc/xprtrdma/verbs.c | 4 ++--
1 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index a352798..35dcd10 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -513,7 +513,7 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
#if RPCRDMA_PERSISTENT_REGISTRATION
memreg = RPCRDMA_ALLPHYSICAL;
#else
- rc = -EINVAL;
+ rc = -ENOMEM;
goto out2;
#endif
}
@@ -556,7 +556,7 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
default:
printk(KERN_ERR "%s: invalid memory registration mode %d\n",
__func__, memreg);
- rc = -EINVAL;
+ rc = -ENOMEM;
goto out2;
}
dprintk("RPC: %s: memory registration strategy is %d\n",
On Apr 21, 2014, at 6:01 PM, Chuck Lever <[email protected]> wrote:
> Section 4 of RFC 5667 (NFS/RDMA) says:
>
>> The server MUST ignore any Read list for other NFS procedures,
>> as well as additional Read list entries beyond the first in the
>> list.
>
> Our XDR code adds a zero pad at the end of NFS WRITEs and SYMLINKs
> whose content is not a multiple of 4 octets long. xprtrdma treats
> the tail buffer containing the zero pad as a separate read chunk,
> which the server ignores.
>
> Enable the pad optimization so our NFS client avoids sending zeroes
> the server is just going to ignore.
Looks like the Linux NFS/RDMA server is not spec compliant here.
When pad optimization is enabled, the Linux NFS/RDMA server returns
GARBAGEARGS for WRITEs whose length does not align with an XDR_QUAD.
Dropping this one for now.
> Signed-off-by: Chuck Lever <[email protected]>
> ---
>
> net/sunrpc/xprtrdma/transport.c | 2 +-
> 1 files changed, 1 insertions(+), 1 deletions(-)
>
> diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
> index 1eb9c46..f94a6c4 100644
> --- a/net/sunrpc/xprtrdma/transport.c
> +++ b/net/sunrpc/xprtrdma/transport.c
> @@ -73,7 +73,7 @@ static unsigned int xprt_rdma_max_inline_read = RPCRDMA_DEF_INLINE;
> static unsigned int xprt_rdma_max_inline_write = RPCRDMA_DEF_INLINE;
> static unsigned int xprt_rdma_inline_write_padding;
> static unsigned int xprt_rdma_memreg_strategy = RPCRDMA_FRMR;
> - int xprt_rdma_pad_optimize = 0;
> +int xprt_rdma_pad_optimize = 1;
>
> #ifdef RPC_DEBUG
>
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
--
Chuck Lever
chuck[dot]lever[at]oracle[dot]com
On Mon, Apr 21, 2014 at 06:02:14PM -0400, Chuck Lever wrote:
> ALLPHYSICAL is not a safe memory registration mode because it
> permits NFS servers to write anywhere in a client's memory. NFS
> server bugs could result in client memory being overwritten.
>
> This can be useful for embedded systems which do not support more
> surgical RDMA memory registration and protection methods, or for
> bring-up of new HCA hardware.
>
> However, enterprise Linux distributions have expressed a desire to
> disable it in production environments.
It's just as unsafe in embedded devices. I think it should go
All kernel RDMA providers except amso1100 support either MTHCAFMR
or FRMR, both of which are faster than REGISTER. amso1100 can
continue to use ALLPHYSICAL.
The only other ULP consumer in the kernel that uses the reg_phys_mr
verb is Lustre.
Signed-off-by: Chuck Lever <[email protected]>
---
net/sunrpc/xprtrdma/rpc_rdma.c | 3 -
net/sunrpc/xprtrdma/verbs.c | 90 ++--------------------------------------
2 files changed, 5 insertions(+), 88 deletions(-)
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index e4af26a..a38efda 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -479,8 +479,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
* on receive. Therefore, we request a reply chunk
* for non-writes wherever feasible and efficient.
*/
- if (wtype == rpcrdma_noch &&
- r_xprt->rx_ia.ri_memreg_strategy > RPCRDMA_REGISTER)
+ if (wtype == rpcrdma_noch)
wtype = rpcrdma_replych;
}
}
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 304c7ad..6bb9a07 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -494,19 +494,11 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
switch (memreg) {
case RPCRDMA_MTHCAFMR:
if (!ia->ri_id->device->alloc_fmr) {
-#if RPCRDMA_PERSISTENT_REGISTRATION
dprintk("RPC: %s: MTHCAFMR registration "
"specified but not supported by adapter, "
"using riskier RPCRDMA_ALLPHYSICAL\n",
__func__);
memreg = RPCRDMA_ALLPHYSICAL;
-#else
- dprintk("RPC: %s: MTHCAFMR registration "
- "specified but not supported by adapter, "
- "using slower RPCRDMA_REGISTER\n",
- __func__);
- memreg = RPCRDMA_REGISTER;
-#endif
}
break;
case RPCRDMA_FRMR:
@@ -514,19 +506,11 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
if ((devattr.device_cap_flags &
(IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) !=
(IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) {
-#if RPCRDMA_PERSISTENT_REGISTRATION
dprintk("RPC: %s: FRMR registration "
"specified but not supported by adapter, "
"using riskier RPCRDMA_ALLPHYSICAL\n",
__func__);
memreg = RPCRDMA_ALLPHYSICAL;
-#else
- dprintk("RPC: %s: FRMR registration "
- "specified but not supported by adapter, "
- "using slower RPCRDMA_REGISTER\n",
- __func__);
- memreg = RPCRDMA_REGISTER;
-#endif
} else {
/* Mind the ia limit on FRMR page list depth */
ia->ri_max_frmr_depth = min_t(unsigned int,
@@ -545,7 +529,6 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
* adapter.
*/
switch (memreg) {
- case RPCRDMA_REGISTER:
case RPCRDMA_FRMR:
break;
#if RPCRDMA_PERSISTENT_REGISTRATION
@@ -565,11 +548,10 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
ia->ri_bind_mem = ib_get_dma_mr(ia->ri_pd, mem_priv);
if (IS_ERR(ia->ri_bind_mem)) {
printk(KERN_ALERT "%s: ib_get_dma_mr for "
- "phys register failed with %lX\n\t"
- "Will continue with degraded performance\n",
+ "phys register failed with %lX\n",
__func__, PTR_ERR(ia->ri_bind_mem));
- memreg = RPCRDMA_REGISTER;
- ia->ri_bind_mem = NULL;
+ rc = -ENOMEM;
+ goto out2;
}
break;
default:
@@ -1611,67 +1593,6 @@ rpcrdma_deregister_fmr_external(struct rpcrdma_mr_seg *seg,
return rc;
}
-static int
-rpcrdma_register_default_external(struct rpcrdma_mr_seg *seg,
- int *nsegs, int writing, struct rpcrdma_ia *ia)
-{
- int mem_priv = (writing ? IB_ACCESS_REMOTE_WRITE :
- IB_ACCESS_REMOTE_READ);
- struct rpcrdma_mr_seg *seg1 = seg;
- struct ib_phys_buf ipb[RPCRDMA_MAX_DATA_SEGS];
- int len, i, rc = 0;
-
- if (*nsegs > RPCRDMA_MAX_DATA_SEGS)
- *nsegs = RPCRDMA_MAX_DATA_SEGS;
- for (len = 0, i = 0; i < *nsegs;) {
- rpcrdma_map_one(ia, seg, writing);
- ipb[i].addr = seg->mr_dma;
- ipb[i].size = seg->mr_len;
- len += seg->mr_len;
- ++seg;
- ++i;
- /* Check for holes */
- if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
- offset_in_page((seg-1)->mr_offset+(seg-1)->mr_len))
- break;
- }
- seg1->mr_base = seg1->mr_dma;
- seg1->mr_chunk.rl_mr = ib_reg_phys_mr(ia->ri_pd,
- ipb, i, mem_priv, &seg1->mr_base);
- if (IS_ERR(seg1->mr_chunk.rl_mr)) {
- rc = PTR_ERR(seg1->mr_chunk.rl_mr);
- dprintk("RPC: %s: failed ib_reg_phys_mr "
- "%u@0x%llx (%d)... status %i\n",
- __func__, len,
- (unsigned long long)seg1->mr_dma, i, rc);
- while (i--)
- rpcrdma_unmap_one(ia, --seg);
- } else {
- seg1->mr_rkey = seg1->mr_chunk.rl_mr->rkey;
- seg1->mr_nsegs = i;
- seg1->mr_len = len;
- }
- *nsegs = i;
- return rc;
-}
-
-static int
-rpcrdma_deregister_default_external(struct rpcrdma_mr_seg *seg,
- struct rpcrdma_ia *ia)
-{
- struct rpcrdma_mr_seg *seg1 = seg;
- int rc;
-
- rc = ib_dereg_mr(seg1->mr_chunk.rl_mr);
- seg1->mr_chunk.rl_mr = NULL;
- while (seg1->mr_nsegs--)
- rpcrdma_unmap_one(ia, seg++);
- if (rc)
- dprintk("RPC: %s: failed ib_dereg_mr,"
- " status %i\n", __func__, rc);
- return rc;
-}
-
int
rpcrdma_register_external(struct rpcrdma_mr_seg *seg,
int nsegs, int writing, struct rpcrdma_xprt *r_xprt)
@@ -1701,10 +1622,8 @@ rpcrdma_register_external(struct rpcrdma_mr_seg *seg,
rc = rpcrdma_register_fmr_external(seg, &nsegs, writing, ia);
break;
- /* Default registration each time */
default:
- rc = rpcrdma_register_default_external(seg, &nsegs, writing, ia);
- break;
+ return -1;
}
if (rc)
return -1;
@@ -1738,7 +1657,6 @@ rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg,
break;
default:
- rc = rpcrdma_deregister_default_external(seg, ia);
break;
}
if (r) {
ALLPHYSICAL is not a safe memory registration mode because it
permits NFS servers to write anywhere in a client's memory. NFS
server bugs could result in client memory being overwritten.
This can be useful for embedded systems which do not support more
surgical RDMA memory registration and protection methods, or for
bring-up of new HCA hardware.
However, enterprise Linux distributions have expressed a desire to
disable it in production environments.
Signed-off-by: Chuck Lever <[email protected]>
---
include/linux/sunrpc/xprtrdma.h | 2 --
net/sunrpc/Kconfig | 14 ++++++++++++++
net/sunrpc/xprtrdma/verbs.c | 10 +++++-----
3 files changed, 19 insertions(+), 7 deletions(-)
diff --git a/include/linux/sunrpc/xprtrdma.h b/include/linux/sunrpc/xprtrdma.h
index c2f04e1..64a0a0a 100644
--- a/include/linux/sunrpc/xprtrdma.h
+++ b/include/linux/sunrpc/xprtrdma.h
@@ -62,8 +62,6 @@
#define RPCRDMA_INLINE_PAD_THRESH (512)/* payload threshold to pad (bytes) */
/* memory registration strategies */
-#define RPCRDMA_PERSISTENT_REGISTRATION (1)
-
enum rpcrdma_memreg {
RPCRDMA_BOUNCEBUFFERS = 0,
RPCRDMA_REGISTER,
diff --git a/net/sunrpc/Kconfig b/net/sunrpc/Kconfig
index 0754d0f..c9a736e 100644
--- a/net/sunrpc/Kconfig
+++ b/net/sunrpc/Kconfig
@@ -58,6 +58,20 @@ config SUNRPC_XPRT_RDMA_CLIENT
If unsure, say N.
+config SUNRPC_XPRT_RDMA_CLIENT_ALLPHYSICAL
+ bool "Enable ALLPHYSICAL memory registration mode"
+ depends on SUNRPC_XPRT_RDMA_CLIENT
+ default y
+ help
+ This option enables support for the ALLPHYSICAL memory
+ registration mode.
+
+ This mode is very fast but not safe because it registers
+ and exposes all of local memory. This could allow an
+ NFS server bug to corrupt client memory.
+
+ If unsure, say Y.
+
config SUNRPC_XPRT_RDMA_SERVER
tristate "RPC over RDMA Server Support"
depends on SUNRPC && INFINIBAND && INFINIBAND_ADDR_TRANS
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 35dcd10..9b71896 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -510,7 +510,7 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
if (!ia->ri_id->device->alloc_fmr) {
dprintk("RPC: %s: MTHCAFMR registration "
"not supported by HCA\n", __func__);
-#if RPCRDMA_PERSISTENT_REGISTRATION
+#ifdef CONFIG_SUNRPC_XPRT_RDMA_CLIENT_ALLPHYSICAL
memreg = RPCRDMA_ALLPHYSICAL;
#else
rc = -ENOMEM;
@@ -530,7 +530,7 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
switch (memreg) {
case RPCRDMA_FRMR:
break;
-#if RPCRDMA_PERSISTENT_REGISTRATION
+#ifdef CONFIG_SUNRPC_XPRT_RDMA_CLIENT_ALLPHYSICAL
case RPCRDMA_ALLPHYSICAL:
mem_priv = IB_ACCESS_LOCAL_WRITE |
IB_ACCESS_REMOTE_WRITE |
@@ -541,7 +541,7 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
if (ia->ri_have_dma_lkey)
break;
mem_priv = IB_ACCESS_LOCAL_WRITE;
-#if RPCRDMA_PERSISTENT_REGISTRATION
+#ifdef CONFIG_SUNRPC_XPRT_RDMA_CLIENT_ALLPHYSICAL
register_setup:
#endif
ia->ri_bind_mem = ib_get_dma_mr(ia->ri_pd, mem_priv);
@@ -1601,7 +1601,7 @@ rpcrdma_register_external(struct rpcrdma_mr_seg *seg,
switch (ia->ri_memreg_strategy) {
-#if RPCRDMA_PERSISTENT_REGISTRATION
+#ifdef CONFIG_SUNRPC_XPRT_RDMA_CLIENT_ALLPHYSICAL
case RPCRDMA_ALLPHYSICAL:
rpcrdma_map_one(ia, seg, writing);
seg->mr_rkey = ia->ri_bind_mem->rkey;
@@ -1639,7 +1639,7 @@ rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg,
switch (ia->ri_memreg_strategy) {
-#if RPCRDMA_PERSISTENT_REGISTRATION
+#ifdef CONFIG_SUNRPC_XPRT_RDMA_CLIENT_ALLPHYSICAL
case RPCRDMA_ALLPHYSICAL:
BUG_ON(nsegs != 1);
rpcrdma_unmap_one(ia, seg);
From: Steve Wise <[email protected]>
Some rdma devices don't support a fast register page list depth of
at least RPCRDMA_MAX_DATA_SEGS. So xprtrdma needs to chunk its fast
register regions according to the minimum of the device max supported
depth or RPCRDMA_MAX_DATA_SEGS.
Signed-off-by: Steve Wise <[email protected]>
Reviewed-by: Chuck Lever <[email protected]>
---
net/sunrpc/xprtrdma/rpc_rdma.c | 4 ---
net/sunrpc/xprtrdma/verbs.c | 47 +++++++++++++++++++++++++++++----------
net/sunrpc/xprtrdma/xprt_rdma.h | 1 +
3 files changed, 36 insertions(+), 16 deletions(-)
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index 96ead52..400aa1b 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -248,10 +248,6 @@ rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target,
/* success. all failures return above */
req->rl_nchunks = nchunks;
- BUG_ON(nchunks == 0);
- BUG_ON((r_xprt->rx_ia.ri_memreg_strategy == RPCRDMA_FRMR)
- && (nchunks > 3));
-
/*
* finish off header. If write, marshal discrim and nchunks.
*/
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 9372656..55fb09a 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -539,6 +539,11 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
__func__);
memreg = RPCRDMA_REGISTER;
#endif
+ } else {
+ /* Mind the ia limit on FRMR page list depth */
+ ia->ri_max_frmr_depth = min_t(unsigned int,
+ RPCRDMA_MAX_DATA_SEGS,
+ devattr.max_fast_reg_page_list_len);
}
break;
}
@@ -659,24 +664,42 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
ep->rep_attr.srq = NULL;
ep->rep_attr.cap.max_send_wr = cdata->max_requests;
switch (ia->ri_memreg_strategy) {
- case RPCRDMA_FRMR:
+ case RPCRDMA_FRMR: {
+ int depth = 7;
+
/* Add room for frmr register and invalidate WRs.
* 1. FRMR reg WR for head
* 2. FRMR invalidate WR for head
- * 3. FRMR reg WR for pagelist
- * 4. FRMR invalidate WR for pagelist
+ * 3. N FRMR reg WRs for pagelist
+ * 4. N FRMR invalidate WRs for pagelist
* 5. FRMR reg WR for tail
* 6. FRMR invalidate WR for tail
* 7. The RDMA_SEND WR
*/
- ep->rep_attr.cap.max_send_wr *= 7;
+
+ /* Calculate N if the device max FRMR depth is smaller than
+ * RPCRDMA_MAX_DATA_SEGS.
+ */
+ if (ia->ri_max_frmr_depth < RPCRDMA_MAX_DATA_SEGS) {
+ int delta = RPCRDMA_MAX_DATA_SEGS -
+ ia->ri_max_frmr_depth;
+
+ do {
+ depth += 2; /* FRMR reg + invalidate */
+ delta -= ia->ri_max_frmr_depth;
+ } while (delta > 0);
+
+ }
+ ep->rep_attr.cap.max_send_wr *= depth;
if (ep->rep_attr.cap.max_send_wr > devattr.max_qp_wr) {
- cdata->max_requests = devattr.max_qp_wr / 7;
+ cdata->max_requests = devattr.max_qp_wr / depth;
if (!cdata->max_requests)
return -EINVAL;
- ep->rep_attr.cap.max_send_wr = cdata->max_requests * 7;
+ ep->rep_attr.cap.max_send_wr = cdata->max_requests *
+ depth;
}
break;
+ }
case RPCRDMA_MEMWINDOWS_ASYNC:
case RPCRDMA_MEMWINDOWS:
/* Add room for mw_binds+unbinds - overkill! */
@@ -1043,16 +1066,16 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
case RPCRDMA_FRMR:
for (i = buf->rb_max_requests * RPCRDMA_MAX_SEGS; i; i--) {
r->r.frmr.fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd,
- RPCRDMA_MAX_SEGS);
+ ia->ri_max_frmr_depth);
if (IS_ERR(r->r.frmr.fr_mr)) {
rc = PTR_ERR(r->r.frmr.fr_mr);
dprintk("RPC: %s: ib_alloc_fast_reg_mr"
" failed %i\n", __func__, rc);
goto out;
}
- r->r.frmr.fr_pgl =
- ib_alloc_fast_reg_page_list(ia->ri_id->device,
- RPCRDMA_MAX_SEGS);
+ r->r.frmr.fr_pgl = ib_alloc_fast_reg_page_list(
+ ia->ri_id->device,
+ ia->ri_max_frmr_depth);
if (IS_ERR(r->r.frmr.fr_pgl)) {
rc = PTR_ERR(r->r.frmr.fr_pgl);
dprintk("RPC: %s: "
@@ -1498,8 +1521,8 @@ rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg,
seg1->mr_offset -= pageoff; /* start of page */
seg1->mr_len += pageoff;
len = -pageoff;
- if (*nsegs > RPCRDMA_MAX_DATA_SEGS)
- *nsegs = RPCRDMA_MAX_DATA_SEGS;
+ if (*nsegs > ia->ri_max_frmr_depth)
+ *nsegs = ia->ri_max_frmr_depth;
for (page_no = i = 0; i < *nsegs;) {
rpcrdma_map_one(ia, seg, writing);
pa = seg->mr_dma;
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index cc1445d..98340a3 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -66,6 +66,7 @@ struct rpcrdma_ia {
struct completion ri_done;
int ri_async_rc;
enum rpcrdma_memreg ri_memreg_strategy;
+ unsigned int ri_max_frmr_depth;
};
/*
Section 4 of RFC 5667 (NFS/RDMA) says:
> The server MUST ignore any Read list for other NFS procedures,
> as well as additional Read list entries beyond the first in the
> list.
Our XDR code adds a zero pad at the end of NFS WRITEs and SYMLINKs
whose content is not a multiple of 4 octets long. xprtrdma treats
the tail buffer containing the zero pad as a separate read chunk,
which the server ignores.
Enable the pad optimization so our NFS client avoids sending zeroes
the server is just going to ignore.
Signed-off-by: Chuck Lever <[email protected]>
---
net/sunrpc/xprtrdma/transport.c | 2 +-
1 files changed, 1 insertions(+), 1 deletions(-)
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index 1eb9c46..f94a6c4 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -73,7 +73,7 @@ static unsigned int xprt_rdma_max_inline_read = RPCRDMA_DEF_INLINE;
static unsigned int xprt_rdma_max_inline_write = RPCRDMA_DEF_INLINE;
static unsigned int xprt_rdma_inline_write_padding;
static unsigned int xprt_rdma_memreg_strategy = RPCRDMA_FRMR;
- int xprt_rdma_pad_optimize = 0;
+int xprt_rdma_pad_optimize = 1;
#ifdef RPC_DEBUG
Clean up: rpcrdma_ep_destroy() returns a value that is used
only to print a debugging message. rpcrdma_ep_destroy() already
prints debugging messages in all error cases.
Make rpcrdma_ep_destroy() return void instead.
Signed-off-by: Chuck Lever <[email protected]>
Tested-by: Steve Wise <[email protected]>
---
net/sunrpc/xprtrdma/transport.c | 8 ++------
net/sunrpc/xprtrdma/verbs.c | 7 +------
net/sunrpc/xprtrdma/xprt_rdma.h | 2 +-
3 files changed, 4 insertions(+), 13 deletions(-)
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index 6a47193..44004eb 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -229,7 +229,6 @@ static void
xprt_rdma_destroy(struct rpc_xprt *xprt)
{
struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
- int rc;
dprintk("RPC: %s: called\n", __func__);
@@ -238,10 +237,7 @@ xprt_rdma_destroy(struct rpc_xprt *xprt)
xprt_clear_connected(xprt);
rpcrdma_buffer_destroy(&r_xprt->rx_buf);
- rc = rpcrdma_ep_destroy(&r_xprt->rx_ep, &r_xprt->rx_ia);
- if (rc)
- dprintk("RPC: %s: rpcrdma_ep_destroy returned %i\n",
- __func__, rc);
+ rpcrdma_ep_destroy(&r_xprt->rx_ep, &r_xprt->rx_ia);
rpcrdma_ia_close(&r_xprt->rx_ia);
xprt_rdma_free_addresses(xprt);
@@ -391,7 +387,7 @@ out4:
xprt_rdma_free_addresses(xprt);
rc = -EINVAL;
out3:
- (void) rpcrdma_ep_destroy(new_ep, &new_xprt->rx_ia);
+ rpcrdma_ep_destroy(new_ep, &new_xprt->rx_ia);
out2:
rpcrdma_ia_close(&new_xprt->rx_ia);
out1:
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 7a8a1bf..a60bf15 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -748,11 +748,8 @@ out1:
* Disconnect and destroy endpoint. After this, the only
* valid operations on the ep are to free it (if dynamically
* allocated) or re-create it.
- *
- * The caller's error handling must be sure to not leak the endpoint
- * if this function fails.
*/
-int
+void
rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
{
int rc;
@@ -782,8 +779,6 @@ rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
if (rc)
dprintk("RPC: %s: ib_destroy_cq returned %i\n",
__func__, rc);
-
- return rc;
}
/*
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index 3f44d6a..362a19d 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -301,7 +301,7 @@ void rpcrdma_ia_close(struct rpcrdma_ia *);
*/
int rpcrdma_ep_create(struct rpcrdma_ep *, struct rpcrdma_ia *,
struct rpcrdma_create_data_internal *);
-int rpcrdma_ep_destroy(struct rpcrdma_ep *, struct rpcrdma_ia *);
+void rpcrdma_ep_destroy(struct rpcrdma_ep *, struct rpcrdma_ia *);
int rpcrdma_ep_connect(struct rpcrdma_ep *, struct rpcrdma_ia *);
int rpcrdma_ep_disconnect(struct rpcrdma_ep *, struct rpcrdma_ia *);
Change the completion handlers to grab up to 16 items per
ib_poll_cq() call. No extra ib_poll_cq() is needed if fewer than 16
items are returned.
Signed-off-by: Chuck Lever <[email protected]>
---
net/sunrpc/xprtrdma/verbs.c | 56 ++++++++++++++++++++++++++-------------
net/sunrpc/xprtrdma/xprt_rdma.h | 4 +++
2 files changed, 42 insertions(+), 18 deletions(-)
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 81daf2b..abb8d8d 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -162,14 +162,23 @@ rpcrdma_sendcq_process_wc(struct ib_wc *wc)
}
static int
-rpcrdma_sendcq_poll(struct ib_cq *cq)
+rpcrdma_sendcq_poll(struct ib_cq *cq, struct rpcrdma_ep *ep)
{
- struct ib_wc wc;
- int rc;
+ struct ib_wc *wcs;
+ int count, rc;
- while ((rc = ib_poll_cq(cq, 1, &wc)) == 1)
- rpcrdma_sendcq_process_wc(&wc);
- return rc;
+ do {
+ wcs = ep->rep_send_wcs;
+
+ rc = ib_poll_cq(cq, RPCRDMA_POLLSIZE, wcs);
+ if (rc <= 0)
+ return rc;
+
+ count = rc;
+ while (count-- > 0)
+ rpcrdma_sendcq_process_wc(wcs++);
+ } while (rc == RPCRDMA_POLLSIZE);
+ return 0;
}
/*
@@ -183,9 +192,10 @@ rpcrdma_sendcq_poll(struct ib_cq *cq)
static void
rpcrdma_sendcq_upcall(struct ib_cq *cq, void *cq_context)
{
+ struct rpcrdma_ep *ep = (struct rpcrdma_ep *)cq_context;
int rc;
- rc = rpcrdma_sendcq_poll(cq);
+ rc = rpcrdma_sendcq_poll(cq, ep);
if (rc) {
dprintk("RPC: %s: ib_poll_cq failed: %i\n",
__func__, rc);
@@ -202,7 +212,7 @@ rpcrdma_sendcq_upcall(struct ib_cq *cq, void *cq_context)
return;
}
- rpcrdma_sendcq_poll(cq);
+ rpcrdma_sendcq_poll(cq, ep);
}
static void
@@ -241,14 +251,23 @@ out_schedule:
}
static int
-rpcrdma_recvcq_poll(struct ib_cq *cq)
+rpcrdma_recvcq_poll(struct ib_cq *cq, struct rpcrdma_ep *ep)
{
- struct ib_wc wc;
- int rc;
+ struct ib_wc *wcs;
+ int count, rc;
- while ((rc = ib_poll_cq(cq, 1, &wc)) == 1)
- rpcrdma_recvcq_process_wc(&wc);
- return rc;
+ do {
+ wcs = ep->rep_recv_wcs;
+
+ rc = ib_poll_cq(cq, RPCRDMA_POLLSIZE, wcs);
+ if (rc <= 0)
+ return rc;
+
+ count = rc;
+ while (count-- > 0)
+ rpcrdma_recvcq_process_wc(wcs++);
+ } while (rc == RPCRDMA_POLLSIZE);
+ return 0;
}
/*
@@ -266,9 +285,10 @@ rpcrdma_recvcq_poll(struct ib_cq *cq)
static void
rpcrdma_recvcq_upcall(struct ib_cq *cq, void *cq_context)
{
+ struct rpcrdma_ep *ep = (struct rpcrdma_ep *)cq_context;
int rc;
- rc = rpcrdma_recvcq_poll(cq);
+ rc = rpcrdma_recvcq_poll(cq, ep);
if (rc) {
dprintk("RPC: %s: ib_poll_cq failed: %i\n",
__func__, rc);
@@ -285,7 +305,7 @@ rpcrdma_recvcq_upcall(struct ib_cq *cq, void *cq_context)
return;
}
- rpcrdma_recvcq_poll(cq);
+ rpcrdma_recvcq_poll(cq, ep);
}
#ifdef RPC_DEBUG
@@ -721,7 +741,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
INIT_DELAYED_WORK(&ep->rep_connect_worker, rpcrdma_connect_worker);
sendcq = ib_create_cq(ia->ri_id->device, rpcrdma_sendcq_upcall,
- rpcrdma_cq_async_error_upcall, NULL,
+ rpcrdma_cq_async_error_upcall, ep,
ep->rep_attr.cap.max_send_wr + 1, 0);
if (IS_ERR(sendcq)) {
rc = PTR_ERR(sendcq);
@@ -738,7 +758,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
}
recvcq = ib_create_cq(ia->ri_id->device, rpcrdma_recvcq_upcall,
- rpcrdma_cq_async_error_upcall, NULL,
+ rpcrdma_cq_async_error_upcall, ep,
ep->rep_attr.cap.max_recv_wr + 1, 0);
if (IS_ERR(recvcq)) {
rc = PTR_ERR(recvcq);
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index 334ab6e..cb4c882 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -74,6 +74,8 @@ struct rpcrdma_ia {
* RDMA Endpoint -- one per transport instance
*/
+#define RPCRDMA_POLLSIZE (16)
+
struct rpcrdma_ep {
atomic_t rep_cqcount;
int rep_cqinit;
@@ -88,6 +90,8 @@ struct rpcrdma_ep {
struct rdma_conn_param rep_remote_cma;
struct sockaddr_storage rep_remote_addr;
struct delayed_work rep_connect_worker;
+ struct ib_wc rep_send_wcs[RPCRDMA_POLLSIZE];
+ struct ib_wc rep_recv_wcs[RPCRDMA_POLLSIZE];
};
#define INIT_CQCOUNT(ep) atomic_set(&(ep)->rep_cqcount, (ep)->rep_cqinit)
While marshaling an RPC/RDMA request, the inline_{rsize,wsize}
settings determine whether an inline request is used, or whether
read or write chunks lists are built. The current default value of
these settings is 1024. Any RPC request smaller than 1024 bytes is
sent to the NFS server completely inline.
rpcrdma_buffer_create() allocates and pre-registers a set of RPC
buffers for each transport instance, also based on the inline rsize
and wsize settings.
RPC/RDMA requests and replies are built in these buffers. However,
if an RPC/RDMA request is expected to be larger than 1024, a buffer
has to be allocated and registered for that RPC, and deregistered
and released when the RPC is complete. This is known has a
"hardway allocation."
Since the introduction of NFSv4, the size of RPC requests has become
larger, and hardway allocations are thus more frequent. Hardway
allocations are significant overhead, and they waste the existing
RPC buffers pre-allocated by rpcrdma_buffer_create().
We'd like fewer hardway allocations.
Increasing the size of the pre-registered buffers is the most direct
way to do this. However, a blanket increase of the inline thresholds
has interoperability consequences.
On my 64-bit system, rpcrdma_buffer_create() requests roughly 7000
bytes for each RPC request buffer, using kmalloc(). Due to internal
fragmentation, this wastes nearly 1200 bytes because kmalloc()
already returns an 8192-byte piece of memory for a 7000-byte
allocation request, though the extra space remains unused.
So let's round up the size of the pre-allocated buffers, and make
use of the unused space in the kmalloc'd memory.
This change reduces the amount of hardway allocated memory for an
NFSv4 general connectathon run from 1322092 to 9472 bytes (99%).
Signed-off-by: Chuck Lever <[email protected]>
Tested-by: Steve Wise <[email protected]>
---
net/sunrpc/xprtrdma/verbs.c | 25 +++++++++++++------------
1 files changed, 13 insertions(+), 12 deletions(-)
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index d46bdee..945aac3 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -50,6 +50,7 @@
#include <linux/interrupt.h>
#include <linux/pci.h> /* for Tavor hack below */
#include <linux/slab.h>
+#include <asm/bitops.h>
#include "xprt_rdma.h"
@@ -1005,7 +1006,7 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
struct rpcrdma_ia *ia, struct rpcrdma_create_data_internal *cdata)
{
char *p;
- size_t len;
+ size_t len, rlen, wlen;
int i, rc;
struct rpcrdma_mw *r;
@@ -1120,16 +1121,16 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
* Allocate/init the request/reply buffers. Doing this
* using kmalloc for now -- one for each buf.
*/
+ wlen = 1 << fls(cdata->inline_wsize + sizeof(struct rpcrdma_req));
+ rlen = 1 << fls(cdata->inline_rsize + sizeof(struct rpcrdma_rep));
+ dprintk("RPC: %s: wlen = %zu, rlen = %zu\n",
+ __func__, wlen, rlen);
+
for (i = 0; i < buf->rb_max_requests; i++) {
struct rpcrdma_req *req;
struct rpcrdma_rep *rep;
- len = cdata->inline_wsize + sizeof(struct rpcrdma_req);
- /* RPC layer requests *double* size + 1K RPC_SLACK_SPACE! */
- /* Typical ~2400b, so rounding up saves work later */
- if (len < 4096)
- len = 4096;
- req = kmalloc(len, GFP_KERNEL);
+ req = kmalloc(wlen, GFP_KERNEL);
if (req == NULL) {
dprintk("RPC: %s: request buffer %d alloc"
" failed\n", __func__, i);
@@ -1141,16 +1142,16 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
buf->rb_send_bufs[i]->rl_buffer = buf;
rc = rpcrdma_register_internal(ia, req->rl_base,
- len - offsetof(struct rpcrdma_req, rl_base),
+ wlen - offsetof(struct rpcrdma_req, rl_base),
&buf->rb_send_bufs[i]->rl_handle,
&buf->rb_send_bufs[i]->rl_iov);
if (rc)
goto out;
- buf->rb_send_bufs[i]->rl_size = len-sizeof(struct rpcrdma_req);
+ buf->rb_send_bufs[i]->rl_size = wlen -
+ sizeof(struct rpcrdma_req);
- len = cdata->inline_rsize + sizeof(struct rpcrdma_rep);
- rep = kmalloc(len, GFP_KERNEL);
+ rep = kmalloc(rlen, GFP_KERNEL);
if (rep == NULL) {
dprintk("RPC: %s: reply buffer %d alloc failed\n",
__func__, i);
@@ -1162,7 +1163,7 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
buf->rb_recv_bufs[i]->rr_buffer = buf;
rc = rpcrdma_register_internal(ia, rep->rr_base,
- len - offsetof(struct rpcrdma_rep, rr_base),
+ rlen - offsetof(struct rpcrdma_rep, rr_base),
&buf->rb_recv_bufs[i]->rr_handle,
&buf->rb_recv_bufs[i]->rr_iov);
if (rc)
Clean up: This memory registration mode is slow and was never
meant for use in production environments. Remove it to reduce
implementation complexity.
Signed-off-by: Chuck Lever <[email protected]>
Tested-by: Steve Wise <[email protected]>
---
net/sunrpc/xprtrdma/rpc_rdma.c | 8 --------
net/sunrpc/xprtrdma/transport.c | 13 -------------
net/sunrpc/xprtrdma/verbs.c | 5 +----
3 files changed, 1 insertions(+), 25 deletions(-)
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index c296468..b963e50 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -439,14 +439,6 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
wtype = rpcrdma_noch;
BUG_ON(rtype != rpcrdma_noch && wtype != rpcrdma_noch);
- if (r_xprt->rx_ia.ri_memreg_strategy == RPCRDMA_BOUNCEBUFFERS &&
- (rtype != rpcrdma_noch || wtype != rpcrdma_noch)) {
- /* forced to "pure inline"? */
- dprintk("RPC: %s: too much data (%d/%d) for inline\n",
- __func__, rqst->rq_rcv_buf.len, rqst->rq_snd_buf.len);
- return -1;
- }
-
hdrlen = 28; /*sizeof *headerp;*/
padlen = 0;
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index f94a6c4..b4330b2 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -503,18 +503,6 @@ xprt_rdma_allocate(struct rpc_task *task, size_t size)
* If the allocation or registration fails, the RPC framework
* will (doggedly) retry.
*/
- if (rpcx_to_rdmax(xprt)->rx_ia.ri_memreg_strategy ==
- RPCRDMA_BOUNCEBUFFERS) {
- /* forced to "pure inline" */
- dprintk("RPC: %s: too much data (%zd) for inline "
- "(r/w max %d/%d)\n", __func__, size,
- rpcx_to_rdmad(xprt).inline_rsize,
- rpcx_to_rdmad(xprt).inline_wsize);
- size = req->rl_size;
- rpc_exit(task, -EIO); /* fail the operation */
- rpcx_to_rdmax(xprt)->rx_stats.failed_marshal_count++;
- goto out;
- }
if (task->tk_flags & RPC_TASK_SWAPPER)
nreq = kmalloc(sizeof *req + size, GFP_ATOMIC);
else
@@ -543,7 +531,6 @@ xprt_rdma_allocate(struct rpc_task *task, size_t size)
req = nreq;
}
dprintk("RPC: %s: size %zd, request 0x%p\n", __func__, size, req);
-out:
req->rl_connect_cookie = 0; /* our reserved value */
return req->rl_xdr_buf;
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 9cb88f3..4a4e4ea 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -557,7 +557,6 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
* adapter.
*/
switch (memreg) {
- case RPCRDMA_BOUNCEBUFFERS:
case RPCRDMA_REGISTER:
case RPCRDMA_FRMR:
break;
@@ -778,9 +777,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
/* Client offers RDMA Read but does not initiate */
ep->rep_remote_cma.initiator_depth = 0;
- if (ia->ri_memreg_strategy == RPCRDMA_BOUNCEBUFFERS)
- ep->rep_remote_cma.responder_resources = 0;
- else if (devattr.max_qp_rd_atom > 32) /* arbitrary but <= 255 */
+ if (devattr.max_qp_rd_atom > 32) /* arbitrary but <= 255 */
ep->rep_remote_cma.responder_resources = 32;
else
ep->rep_remote_cma.responder_resources = devattr.max_qp_rd_atom;
Clean up: All remaining callers of rpcrdma_deregister_external()
pass NULL as the last argument, so remove that argument.
Signed-off-by: Chuck Lever <[email protected]>
Tested-by: Steve Wise <[email protected]>
---
net/sunrpc/xprtrdma/rpc_rdma.c | 2 +-
net/sunrpc/xprtrdma/transport.c | 2 +-
net/sunrpc/xprtrdma/verbs.c | 8 +-------
net/sunrpc/xprtrdma/xprt_rdma.h | 2 +-
4 files changed, 4 insertions(+), 10 deletions(-)
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index a38efda..315417d 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -273,7 +273,7 @@ rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target,
out:
for (pos = 0; nchunks--;)
pos += rpcrdma_deregister_external(
- &req->rl_segments[pos], r_xprt, NULL);
+ &req->rl_segments[pos], r_xprt);
return 0;
}
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index 4607b62..6a47193 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -575,7 +575,7 @@ xprt_rdma_free(void *buffer)
for (i = 0; req->rl_nchunks;) {
--req->rl_nchunks;
i += rpcrdma_deregister_external(
- &req->rl_segments[i], r_xprt, NULL);
+ &req->rl_segments[i], r_xprt);
}
if (req->rl_iov.length == 0) { /* see allocate above */
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 9b71896..7a8a1bf 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -1632,7 +1632,7 @@ rpcrdma_register_external(struct rpcrdma_mr_seg *seg,
int
rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg,
- struct rpcrdma_xprt *r_xprt, void *r)
+ struct rpcrdma_xprt *r_xprt)
{
struct rpcrdma_ia *ia = &r_xprt->rx_ia;
int nsegs = seg->mr_nsegs, rc;
@@ -1658,12 +1658,6 @@ rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg,
default:
break;
}
- if (r) {
- struct rpcrdma_rep *rep = r;
- void (*func)(struct rpcrdma_rep *) = rep->rr_func;
- rep->rr_func = NULL;
- func(rep); /* dereg done, callback now */
- }
return nsegs;
}
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index bf08ee0..3f44d6a 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -331,7 +331,7 @@ int rpcrdma_deregister_internal(struct rpcrdma_ia *,
int rpcrdma_register_external(struct rpcrdma_mr_seg *,
int, int, struct rpcrdma_xprt *);
int rpcrdma_deregister_external(struct rpcrdma_mr_seg *,
- struct rpcrdma_xprt *, void *);
+ struct rpcrdma_xprt *);
/*
* RPC/RDMA connection management calls - xprtrdma/rpc_rdma.c
The current CQ handler uses the ib_wc.opcode field to distinguish
between event types. However, the contents of that field are not
reliable if the completion status is not IB_WC_SUCCESS.
When an error completion occurs on a send event, the CQ handler
schedules a tasklet with something that is not a struct rpcrdma_rep.
This is never correct behavior, and sometimes it results in a panic.
To resolve this issue, split the completion queue into a send CQ and
a receive CQ. The send CQ handler now handles only struct rpcrdma_mw
wr_id's, and the receive CQ handler now handles only struct
rpcrdma_rep wr_id's.
Fix suggested by Shirley Ma <[email protected]>
Reported-by: Rafael Reiter <[email protected]>
Fixes: 5c635e09cec0feeeb310968e51dad01040244851
BugLink: https://bugzilla.kernel.org/show_bug.cgi?id=73211
Signed-off-by: Chuck Lever <[email protected]>
Tested-by: Klemens Senn <[email protected]>
Tested-by: Steve Wise <[email protected]>
---
net/sunrpc/xprtrdma/verbs.c | 228 +++++++++++++++++++++++----------------
net/sunrpc/xprtrdma/xprt_rdma.h | 1
2 files changed, 137 insertions(+), 92 deletions(-)
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index a60bf15..ce7b45f 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -142,96 +142,115 @@ rpcrdma_cq_async_error_upcall(struct ib_event *event, void *context)
}
}
-static inline
-void rpcrdma_event_process(struct ib_wc *wc)
+static void
+rpcrdma_sendcq_process_wc(struct ib_wc *wc)
{
- struct rpcrdma_mw *frmr;
- struct rpcrdma_rep *rep =
- (struct rpcrdma_rep *)(unsigned long) wc->wr_id;
+ struct rpcrdma_mw *frmr = (struct rpcrdma_mw *)(unsigned long)wc->wr_id;
- dprintk("RPC: %s: event rep %p status %X opcode %X length %u\n",
- __func__, rep, wc->status, wc->opcode, wc->byte_len);
+ dprintk("RPC: %s: frmr %p status %X opcode %d\n",
+ __func__, frmr, wc->status, wc->opcode);
- if (!rep) /* send completion that we don't care about */
+ if (wc->wr_id == 0ULL)
return;
-
- if (IB_WC_SUCCESS != wc->status) {
- dprintk("RPC: %s: WC opcode %d status %X, connection lost\n",
- __func__, wc->opcode, wc->status);
- rep->rr_len = ~0U;
- if (wc->opcode != IB_WC_FAST_REG_MR && wc->opcode != IB_WC_LOCAL_INV)
- rpcrdma_schedule_tasklet(rep);
+ if (wc->status != IB_WC_SUCCESS)
return;
- }
- switch (wc->opcode) {
- case IB_WC_FAST_REG_MR:
- frmr = (struct rpcrdma_mw *)(unsigned long)wc->wr_id;
+ if (wc->opcode == IB_WC_FAST_REG_MR)
frmr->r.frmr.state = FRMR_IS_VALID;
- break;
- case IB_WC_LOCAL_INV:
- frmr = (struct rpcrdma_mw *)(unsigned long)wc->wr_id;
+ else if (wc->opcode == IB_WC_LOCAL_INV)
frmr->r.frmr.state = FRMR_IS_INVALID;
- break;
- case IB_WC_RECV:
- rep->rr_len = wc->byte_len;
- ib_dma_sync_single_for_cpu(
- rdmab_to_ia(rep->rr_buffer)->ri_id->device,
- rep->rr_iov.addr, rep->rr_len, DMA_FROM_DEVICE);
- /* Keep (only) the most recent credits, after check validity */
- if (rep->rr_len >= 16) {
- struct rpcrdma_msg *p =
- (struct rpcrdma_msg *) rep->rr_base;
- unsigned int credits = ntohl(p->rm_credit);
- if (credits == 0) {
- dprintk("RPC: %s: server"
- " dropped credits to 0!\n", __func__);
- /* don't deadlock */
- credits = 1;
- } else if (credits > rep->rr_buffer->rb_max_requests) {
- dprintk("RPC: %s: server"
- " over-crediting: %d (%d)\n",
- __func__, credits,
- rep->rr_buffer->rb_max_requests);
- credits = rep->rr_buffer->rb_max_requests;
- }
- atomic_set(&rep->rr_buffer->rb_credits, credits);
- }
- rpcrdma_schedule_tasklet(rep);
- break;
- default:
- dprintk("RPC: %s: unexpected WC event %X\n",
- __func__, wc->opcode);
- break;
- }
}
-static inline int
-rpcrdma_cq_poll(struct ib_cq *cq)
+static int
+rpcrdma_sendcq_poll(struct ib_cq *cq)
{
struct ib_wc wc;
int rc;
- for (;;) {
- rc = ib_poll_cq(cq, 1, &wc);
- if (rc < 0) {
- dprintk("RPC: %s: ib_poll_cq failed %i\n",
- __func__, rc);
- return rc;
- }
- if (rc == 0)
- break;
+ while ((rc = ib_poll_cq(cq, 1, &wc)) == 1)
+ rpcrdma_sendcq_process_wc(&wc);
+ return rc;
+}
- rpcrdma_event_process(&wc);
+/*
+ * Handle send, fast_reg_mr, and local_inv completions.
+ *
+ * Send events are typically suppressed and thus do not result
+ * in an upcall. Occasionally one is signaled, however. This
+ * prevents the provider's completion queue from wrapping and
+ * losing a completion.
+ */
+static void
+rpcrdma_sendcq_upcall(struct ib_cq *cq, void *cq_context)
+{
+ int rc;
+
+ rc = rpcrdma_sendcq_poll(cq);
+ if (rc) {
+ dprintk("RPC: %s: ib_poll_cq failed: %i\n",
+ __func__, rc);
+ return;
}
- return 0;
+ rc = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
+ if (rc) {
+ dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
+ __func__, rc);
+ return;
+ }
+
+ rpcrdma_sendcq_poll(cq);
+}
+
+static void
+rpcrdma_recvcq_process_wc(struct ib_wc *wc)
+{
+ struct rpcrdma_rep *rep =
+ (struct rpcrdma_rep *)(unsigned long)wc->wr_id;
+
+ dprintk("RPC: %s: rep %p status %X opcode %X length %u\n",
+ __func__, rep, wc->status, wc->opcode, wc->byte_len);
+
+ if (wc->status != IB_WC_SUCCESS) {
+ rep->rr_len = ~0U;
+ goto out_schedule;
+ }
+ if (wc->opcode != IB_WC_RECV)
+ return;
+
+ rep->rr_len = wc->byte_len;
+ ib_dma_sync_single_for_cpu(rdmab_to_ia(rep->rr_buffer)->ri_id->device,
+ rep->rr_iov.addr, rep->rr_len, DMA_FROM_DEVICE);
+
+ if (rep->rr_len >= 16) {
+ struct rpcrdma_msg *p = (struct rpcrdma_msg *)rep->rr_base;
+ unsigned int credits = ntohl(p->rm_credit);
+
+ if (credits == 0)
+ credits = 1; /* don't deadlock */
+ else if (credits > rep->rr_buffer->rb_max_requests)
+ credits = rep->rr_buffer->rb_max_requests;
+ atomic_set(&rep->rr_buffer->rb_credits, credits);
+ }
+
+out_schedule:
+ rpcrdma_schedule_tasklet(rep);
+}
+
+static int
+rpcrdma_recvcq_poll(struct ib_cq *cq)
+{
+ struct ib_wc wc;
+ int rc;
+
+ while ((rc = ib_poll_cq(cq, 1, &wc)) == 1)
+ rpcrdma_recvcq_process_wc(&wc);
+ return rc;
}
/*
- * rpcrdma_cq_event_upcall
+ * Handle receive completions.
*
- * This upcall handles recv and send events.
* It is reentrant but processes single events in order to maintain
* ordering of receives to keep server credits.
*
@@ -240,26 +259,27 @@ rpcrdma_cq_poll(struct ib_cq *cq)
* connection shutdown. That is, the structures required for
* the completion of the reply handler must remain intact until
* all memory has been reclaimed.
- *
- * Note that send events are suppressed and do not result in an upcall.
*/
static void
-rpcrdma_cq_event_upcall(struct ib_cq *cq, void *context)
+rpcrdma_recvcq_upcall(struct ib_cq *cq, void *cq_context)
{
int rc;
- rc = rpcrdma_cq_poll(cq);
- if (rc)
+ rc = rpcrdma_recvcq_poll(cq);
+ if (rc) {
+ dprintk("RPC: %s: ib_poll_cq failed: %i\n",
+ __func__, rc);
return;
+ }
rc = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
if (rc) {
- dprintk("RPC: %s: ib_req_notify_cq failed %i\n",
+ dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
__func__, rc);
return;
}
- rpcrdma_cq_poll(cq);
+ rpcrdma_recvcq_poll(cq);
}
#ifdef RPC_DEBUG
@@ -610,6 +630,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
struct rpcrdma_create_data_internal *cdata)
{
struct ib_device_attr devattr;
+ struct ib_cq *sendcq, *recvcq;
int rc, err;
rc = ib_query_device(ia->ri_id->device, &devattr);
@@ -685,7 +706,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
ep->rep_attr.cap.max_recv_sge);
/* set trigger for requesting send completion */
- ep->rep_cqinit = ep->rep_attr.cap.max_send_wr/2 /* - 1*/;
+ ep->rep_cqinit = ep->rep_attr.cap.max_send_wr/2 - 1;
if (ep->rep_cqinit <= 2)
ep->rep_cqinit = 0;
INIT_CQCOUNT(ep);
@@ -693,26 +714,43 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
init_waitqueue_head(&ep->rep_connect_wait);
INIT_DELAYED_WORK(&ep->rep_connect_worker, rpcrdma_connect_worker);
- ep->rep_cq = ib_create_cq(ia->ri_id->device, rpcrdma_cq_event_upcall,
+ sendcq = ib_create_cq(ia->ri_id->device, rpcrdma_sendcq_upcall,
rpcrdma_cq_async_error_upcall, NULL,
- ep->rep_attr.cap.max_recv_wr +
ep->rep_attr.cap.max_send_wr + 1, 0);
- if (IS_ERR(ep->rep_cq)) {
- rc = PTR_ERR(ep->rep_cq);
- dprintk("RPC: %s: ib_create_cq failed: %i\n",
+ if (IS_ERR(sendcq)) {
+ rc = PTR_ERR(sendcq);
+ dprintk("RPC: %s: failed to create send CQ: %i\n",
__func__, rc);
goto out1;
}
- rc = ib_req_notify_cq(ep->rep_cq, IB_CQ_NEXT_COMP);
+ rc = ib_req_notify_cq(sendcq, IB_CQ_NEXT_COMP);
if (rc) {
dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
__func__, rc);
goto out2;
}
- ep->rep_attr.send_cq = ep->rep_cq;
- ep->rep_attr.recv_cq = ep->rep_cq;
+ recvcq = ib_create_cq(ia->ri_id->device, rpcrdma_recvcq_upcall,
+ rpcrdma_cq_async_error_upcall, NULL,
+ ep->rep_attr.cap.max_recv_wr + 1, 0);
+ if (IS_ERR(recvcq)) {
+ rc = PTR_ERR(recvcq);
+ dprintk("RPC: %s: failed to create recv CQ: %i\n",
+ __func__, rc);
+ goto out2;
+ }
+
+ rc = ib_req_notify_cq(recvcq, IB_CQ_NEXT_COMP);
+ if (rc) {
+ dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
+ __func__, rc);
+ ib_destroy_cq(recvcq);
+ goto out2;
+ }
+
+ ep->rep_attr.send_cq = sendcq;
+ ep->rep_attr.recv_cq = recvcq;
/* Initialize cma parameters */
@@ -734,7 +772,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
return 0;
out2:
- err = ib_destroy_cq(ep->rep_cq);
+ err = ib_destroy_cq(sendcq);
if (err)
dprintk("RPC: %s: ib_destroy_cq returned %i\n",
__func__, err);
@@ -774,8 +812,14 @@ rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
ep->rep_pad_mr = NULL;
}
- rpcrdma_clean_cq(ep->rep_cq);
- rc = ib_destroy_cq(ep->rep_cq);
+ rpcrdma_clean_cq(ep->rep_attr.recv_cq);
+ rc = ib_destroy_cq(ep->rep_attr.recv_cq);
+ if (rc)
+ dprintk("RPC: %s: ib_destroy_cq returned %i\n",
+ __func__, rc);
+
+ rpcrdma_clean_cq(ep->rep_attr.send_cq);
+ rc = ib_destroy_cq(ep->rep_attr.send_cq);
if (rc)
dprintk("RPC: %s: ib_destroy_cq returned %i\n",
__func__, rc);
@@ -798,7 +842,9 @@ retry:
if (rc && rc != -ENOTCONN)
dprintk("RPC: %s: rpcrdma_ep_disconnect"
" status %i\n", __func__, rc);
- rpcrdma_clean_cq(ep->rep_cq);
+
+ rpcrdma_clean_cq(ep->rep_attr.recv_cq);
+ rpcrdma_clean_cq(ep->rep_attr.send_cq);
xprt = container_of(ia, struct rpcrdma_xprt, rx_ia);
id = rpcrdma_create_id(xprt, ia,
@@ -907,7 +953,8 @@ rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
{
int rc;
- rpcrdma_clean_cq(ep->rep_cq);
+ rpcrdma_clean_cq(ep->rep_attr.recv_cq);
+ rpcrdma_clean_cq(ep->rep_attr.send_cq);
rc = rdma_disconnect(ia->ri_id);
if (!rc) {
/* returns without wait if not connected */
@@ -1727,7 +1774,6 @@ rpcrdma_ep_post_recv(struct rpcrdma_ia *ia,
ib_dma_sync_single_for_cpu(ia->ri_id->device,
rep->rr_iov.addr, rep->rr_iov.length, DMA_BIDIRECTIONAL);
- DECR_CQCOUNT(ep);
rc = ib_post_recv(ia->ri_id->qp, &recv_wr, &recv_wr_fail);
if (rc)
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index 362a19d..334ab6e 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -79,7 +79,6 @@ struct rpcrdma_ep {
int rep_cqinit;
int rep_connected;
struct rpcrdma_ia *rep_ia;
- struct ib_cq *rep_cq;
struct ib_qp_init_attr rep_attr;
wait_queue_head_t rep_connect_wait;
struct ib_sge rep_pad; /* holds zeroed pad */
An IB provider can invoke rpcrdma_conn_func() in an IRQ context,
thus rpcrdma_conn_func() cannot be allowed to directly invoke
generic RPC functions like xprt_wake_pending_tasks().
Signed-off-by: Chuck Lever <[email protected]>
Tested-by: Steve Wise <[email protected]>
---
net/sunrpc/xprtrdma/rpc_rdma.c | 22 +++++++++++++++-------
net/sunrpc/xprtrdma/verbs.c | 3 +++
net/sunrpc/xprtrdma/xprt_rdma.h | 3 +++
3 files changed, 21 insertions(+), 7 deletions(-)
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index 400aa1b..c296468 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -676,15 +676,11 @@ rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad)
rqst->rq_private_buf = rqst->rq_rcv_buf;
}
-/*
- * This function is called when an async event is posted to
- * the connection which changes the connection state. All it
- * does at this point is mark the connection up/down, the rpc
- * timers do the rest.
- */
void
-rpcrdma_conn_func(struct rpcrdma_ep *ep)
+rpcrdma_connect_worker(struct work_struct *work)
{
+ struct rpcrdma_ep *ep =
+ container_of(work, struct rpcrdma_ep, rep_connect_worker.work);
struct rpc_xprt *xprt = ep->rep_xprt;
spin_lock_bh(&xprt->transport_lock);
@@ -701,6 +697,18 @@ rpcrdma_conn_func(struct rpcrdma_ep *ep)
}
/*
+ * This function is called when an async event is posted to
+ * the connection which changes the connection state. All it
+ * does at this point is mark the connection up/down, the rpc
+ * timers do the rest.
+ */
+void
+rpcrdma_conn_func(struct rpcrdma_ep *ep)
+{
+ schedule_delayed_work(&ep->rep_connect_worker, 0);
+}
+
+/*
* This function is called when memory window unbind which we are waiting
* for completes. Just use rr_func (zeroed by upcall) to signal completion.
*/
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 8f9704e..9cb88f3 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -742,6 +742,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
INIT_CQCOUNT(ep);
ep->rep_ia = ia;
init_waitqueue_head(&ep->rep_connect_wait);
+ INIT_DELAYED_WORK(&ep->rep_connect_worker, rpcrdma_connect_worker);
/*
* Create a single cq for receive dto and mw_bind (only ever
@@ -817,6 +818,8 @@ rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
dprintk("RPC: %s: entering, connected is %d\n",
__func__, ep->rep_connected);
+ cancel_delayed_work_sync(&ep->rep_connect_worker);
+
if (ia->ri_id->qp) {
rc = rpcrdma_ep_disconnect(ep, ia);
if (rc)
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index 98340a3..c620d13 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -43,6 +43,7 @@
#include <linux/wait.h> /* wait_queue_head_t, etc */
#include <linux/spinlock.h> /* spinlock_t, etc */
#include <linux/atomic.h> /* atomic_t, etc */
+#include <linux/workqueue.h> /* struct work_struct */
#include <rdma/rdma_cm.h> /* RDMA connection api */
#include <rdma/ib_verbs.h> /* RDMA verbs api */
@@ -87,6 +88,7 @@ struct rpcrdma_ep {
struct rpc_xprt *rep_xprt; /* for rep_func */
struct rdma_conn_param rep_remote_cma;
struct sockaddr_storage rep_remote_addr;
+ struct delayed_work rep_connect_worker;
};
#define INIT_CQCOUNT(ep) atomic_set(&(ep)->rep_cqcount, (ep)->rep_cqinit)
@@ -336,6 +338,7 @@ int rpcrdma_deregister_external(struct rpcrdma_mr_seg *,
/*
* RPC/RDMA connection management calls - xprtrdma/rpc_rdma.c
*/
+void rpcrdma_connect_worker(struct work_struct *);
void rpcrdma_conn_func(struct rpcrdma_ep *);
void rpcrdma_reply_handler(struct rpcrdma_rep *);