Hi-
Not sure I've posted these yet. I've been working on some error
injection features and while testing them, I found a few bugs in
the NFS/RDMA client.
---
Chuck Lever (5):
xprtrdma: Disconnect after an ib_post_send() immediate error
xprtrdma: Put rpcrdma_reps before waking the tear-down completion
xprtrdma: Add xprtrdma_post_recvs_err() tracepoint
xprtrdma: Add an xprtrdma_post_send_err tracepoint
xprtrdma: Eliminate rpcrdma_post_sends()
include/trace/events/rpcrdma.h | 74 ++++++++++++++++++++++++++++---
net/sunrpc/xprtrdma/backchannel.c | 2 +-
net/sunrpc/xprtrdma/frwr_ops.c | 14 +++++-
net/sunrpc/xprtrdma/transport.c | 2 +-
net/sunrpc/xprtrdma/verbs.c | 28 +++---------
net/sunrpc/xprtrdma/xprt_rdma.h | 2 +-
6 files changed, 90 insertions(+), 32 deletions(-)
--
Chuck Lever
In the vast majority of cases, rc=0. Don't record that in the
post_recvs tracepoint. Instead, add a separate tracepoint that can
be left enabled all the time to capture the very rare immediate
errors returned by ib_post_recv().
Signed-off-by: Chuck Lever <[email protected]>
---
include/trace/events/rpcrdma.h | 41 +++++++++++++++++++++++++++++++++-------
net/sunrpc/xprtrdma/verbs.c | 3 ++-
2 files changed, 36 insertions(+), 8 deletions(-)
diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h
index bd55908c1bef..d65a84bd040c 100644
--- a/include/trace/events/rpcrdma.h
+++ b/include/trace/events/rpcrdma.h
@@ -818,16 +818,14 @@ TRACE_EVENT(xprtrdma_post_recv,
TRACE_EVENT(xprtrdma_post_recvs,
TP_PROTO(
const struct rpcrdma_xprt *r_xprt,
- unsigned int count,
- int status
+ unsigned int count
),
- TP_ARGS(r_xprt, count, status),
+ TP_ARGS(r_xprt, count),
TP_STRUCT__entry(
__field(u32, cq_id)
__field(unsigned int, count)
- __field(int, status)
__field(int, posted)
__string(addr, rpcrdma_addrstr(r_xprt))
__string(port, rpcrdma_portstr(r_xprt))
@@ -838,15 +836,44 @@ TRACE_EVENT(xprtrdma_post_recvs,
__entry->cq_id = ep->re_attr.recv_cq->res.id;
__entry->count = count;
- __entry->status = status;
__entry->posted = ep->re_receive_count;
__assign_str(addr, rpcrdma_addrstr(r_xprt));
__assign_str(port, rpcrdma_portstr(r_xprt));
),
- TP_printk("peer=[%s]:%s cq.id=%d %u new recvs, %d active (rc %d)",
+ TP_printk("peer=[%s]:%s cq.id=%d %u new recvs, %d active",
+ __get_str(addr), __get_str(port), __entry->cq_id,
+ __entry->count, __entry->posted
+ )
+);
+
+TRACE_EVENT(xprtrdma_post_recvs_err,
+ TP_PROTO(
+ const struct rpcrdma_xprt *r_xprt,
+ int status
+ ),
+
+ TP_ARGS(r_xprt, status),
+
+ TP_STRUCT__entry(
+ __field(u32, cq_id)
+ __field(int, status)
+ __string(addr, rpcrdma_addrstr(r_xprt))
+ __string(port, rpcrdma_portstr(r_xprt))
+ ),
+
+ TP_fast_assign(
+ const struct rpcrdma_ep *ep = r_xprt->rx_ep;
+
+ __entry->cq_id = ep->re_attr.recv_cq->res.id;
+ __entry->status = status;
+ __assign_str(addr, rpcrdma_addrstr(r_xprt));
+ __assign_str(port, rpcrdma_portstr(r_xprt));
+ ),
+
+ TP_printk("peer=[%s]:%s cq.id=%d rc=%d",
__get_str(addr), __get_str(port), __entry->cq_id,
- __entry->count, __entry->posted, __entry->status
+ __entry->status
)
);
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 016f10a781b4..1e9041c022b6 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -1417,6 +1417,7 @@ void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, int needed, bool temp)
rc = ib_post_recv(ep->re_id->qp, wr,
(const struct ib_recv_wr **)&bad_wr);
if (rc) {
+ trace_xprtrdma_post_recvs_err(r_xprt, rc);
for (wr = bad_wr; wr;) {
struct rpcrdma_rep *rep;
@@ -1430,7 +1431,7 @@ void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, int needed, bool temp)
complete(&ep->re_done);
out:
- trace_xprtrdma_post_recvs(r_xprt, count, rc);
+ trace_xprtrdma_post_recvs(r_xprt, count);
ep->re_receive_count += count;
return;
}
Ensure the tear-down completion is awoken only /after/ we've stopped
fiddling with rpcrdma_rep objects in rpcrdma_post_recvs().
Fixes: 15788d1d1077 ("xprtrdma: Do not refresh Receive Queue while it is draining")
Signed-off-by: Chuck Lever <[email protected]>
---
net/sunrpc/xprtrdma/verbs.c | 10 +++++-----
1 file changed, 5 insertions(+), 5 deletions(-)
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index c1797ea19418..016f10a781b4 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -1416,11 +1416,6 @@ void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, int needed, bool temp)
rc = ib_post_recv(ep->re_id->qp, wr,
(const struct ib_recv_wr **)&bad_wr);
- if (atomic_dec_return(&ep->re_receiving) > 0)
- complete(&ep->re_done);
-
-out:
- trace_xprtrdma_post_recvs(r_xprt, count, rc);
if (rc) {
for (wr = bad_wr; wr;) {
struct rpcrdma_rep *rep;
@@ -1431,6 +1426,11 @@ void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, int needed, bool temp)
--count;
}
}
+ if (atomic_dec_return(&ep->re_receiving) > 0)
+ complete(&ep->re_done);
+
+out:
+ trace_xprtrdma_post_recvs(r_xprt, count, rc);
ep->re_receive_count += count;
return;
}
ib_post_send() does not disconnect the QP when it returns an
immediate error. Thus, the code that posts LocalInv has to
explicitly disconnect after an immediate error. This is just
like the frwr_send() callers handle it.
If a disconnect isn't done here, the transport deadlocks.
Signed-off-by: Chuck Lever <[email protected]>
---
net/sunrpc/xprtrdma/frwr_ops.c | 8 ++++++++
net/sunrpc/xprtrdma/verbs.c | 2 +-
net/sunrpc/xprtrdma/xprt_rdma.h | 1 +
3 files changed, 10 insertions(+), 1 deletion(-)
diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
index 229fcc9a9064..754c5dffe127 100644
--- a/net/sunrpc/xprtrdma/frwr_ops.c
+++ b/net/sunrpc/xprtrdma/frwr_ops.c
@@ -557,6 +557,10 @@ void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
/* On error, the MRs get destroyed once the QP has drained. */
trace_xprtrdma_post_linv_err(req, rc);
+
+ /* Force a connection loss to ensure complete recovery.
+ */
+ rpcrdma_force_disconnect(ep);
}
/**
@@ -653,4 +657,8 @@ void frwr_unmap_async(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
* retransmission.
*/
rpcrdma_unpin_rqst(req->rl_reply);
+
+ /* Force a connection loss to ensure complete recovery.
+ */
+ rpcrdma_force_disconnect(ep);
}
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 649c23518ec0..c1797ea19418 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -124,7 +124,7 @@ static void rpcrdma_xprt_drain(struct rpcrdma_xprt *r_xprt)
* connection is closed or lost. (The important thing is it needs
* to be invoked "at least" once).
*/
-static void rpcrdma_force_disconnect(struct rpcrdma_ep *ep)
+void rpcrdma_force_disconnect(struct rpcrdma_ep *ep)
{
if (atomic_add_unless(&ep->re_force_disconnect, 1, 1))
xprt_force_disconnect(ep->re_xprt);
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index 5d231d94e944..927e20a2c04e 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -454,6 +454,7 @@ extern unsigned int xprt_rdma_memreg_strategy;
/*
* Endpoint calls - xprtrdma/verbs.c
*/
+void rpcrdma_force_disconnect(struct rpcrdma_ep *ep);
void rpcrdma_flush_disconnect(struct rpcrdma_xprt *r_xprt, struct ib_wc *wc);
int rpcrdma_xprt_connect(struct rpcrdma_xprt *r_xprt);
void rpcrdma_xprt_disconnect(struct rpcrdma_xprt *r_xprt);
Clean up.
Now that there is only one registration mode, there is only one
target "post_send" method: frwr_send(). rpcrdma_post_sends() no
longer adds much value, especially since all of its call sites
ignore the return code value except to check if it's non-zero.
Just have them call frwr_send() directly instead.
Signed-off-by: Chuck Lever <[email protected]>
---
net/sunrpc/xprtrdma/backchannel.c | 2 +-
net/sunrpc/xprtrdma/transport.c | 2 +-
net/sunrpc/xprtrdma/verbs.c | 15 ---------------
net/sunrpc/xprtrdma/xprt_rdma.h | 1 -
4 files changed, 2 insertions(+), 18 deletions(-)
diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c
index 1151efd09b27..17f174d6ea3b 100644
--- a/net/sunrpc/xprtrdma/backchannel.c
+++ b/net/sunrpc/xprtrdma/backchannel.c
@@ -115,7 +115,7 @@ int xprt_rdma_bc_send_reply(struct rpc_rqst *rqst)
if (rc < 0)
goto failed_marshal;
- if (rpcrdma_post_sends(r_xprt, req))
+ if (frwr_send(r_xprt, req))
goto drop_connection;
return 0;
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index 9c2ffc67c0fd..a463400ed5a3 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -661,7 +661,7 @@ xprt_rdma_send_request(struct rpc_rqst *rqst)
goto drop_connection;
rqst->rq_xtime = ktime_get();
- if (rpcrdma_post_sends(r_xprt, req))
+ if (frwr_send(r_xprt, req))
goto drop_connection;
rqst->rq_xmit_bytes_sent += rqst->rq_snd_buf.len;
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 1e9041c022b6..aaec3c9be8db 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -1349,21 +1349,6 @@ static void rpcrdma_regbuf_free(struct rpcrdma_regbuf *rb)
kfree(rb);
}
-/**
- * rpcrdma_post_sends - Post WRs to a transport's Send Queue
- * @r_xprt: controlling transport instance
- * @req: rpcrdma_req containing the Send WR to post
- *
- * Returns 0 if the post was successful, otherwise -ENOTCONN
- * is returned.
- */
-int rpcrdma_post_sends(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
-{
- if (frwr_send(r_xprt, req))
- return -ENOTCONN;
- return 0;
-}
-
/**
* rpcrdma_post_recvs - Refill the Receive Queue
* @r_xprt: controlling transport instance
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index 927e20a2c04e..d91f54eae00b 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -459,7 +459,6 @@ void rpcrdma_flush_disconnect(struct rpcrdma_xprt *r_xprt, struct ib_wc *wc);
int rpcrdma_xprt_connect(struct rpcrdma_xprt *r_xprt);
void rpcrdma_xprt_disconnect(struct rpcrdma_xprt *r_xprt);
-int rpcrdma_post_sends(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req);
void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, int needed, bool temp);
/*
Unlike xprtrdma_post_send(), this one can be left enabled all the
time, and should almost never fire. But we do want to know about
immediate errors when they happen.
Note that there is already a similar post_linv_err tracepoint.
Signed-off-by: Chuck Lever <[email protected]>
---
include/trace/events/rpcrdma.h | 33 +++++++++++++++++++++++++++++++++
net/sunrpc/xprtrdma/frwr_ops.c | 6 +++++-
2 files changed, 38 insertions(+), 1 deletion(-)
diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h
index d65a84bd040c..de4195499592 100644
--- a/include/trace/events/rpcrdma.h
+++ b/include/trace/events/rpcrdma.h
@@ -793,6 +793,39 @@ TRACE_EVENT(xprtrdma_post_send,
)
);
+TRACE_EVENT(xprtrdma_post_send_err,
+ TP_PROTO(
+ const struct rpcrdma_xprt *r_xprt,
+ const struct rpcrdma_req *req,
+ int rc
+ ),
+
+ TP_ARGS(r_xprt, req, rc),
+
+ TP_STRUCT__entry(
+ __field(u32, cq_id)
+ __field(unsigned int, task_id)
+ __field(unsigned int, client_id)
+ __field(int, rc)
+ ),
+
+ TP_fast_assign(
+ const struct rpc_rqst *rqst = &req->rl_slot;
+ const struct rpcrdma_ep *ep = r_xprt->rx_ep;
+
+ __entry->cq_id = ep ? ep->re_attr.recv_cq->res.id : 0;
+ __entry->task_id = rqst->rq_task->tk_pid;
+ __entry->client_id = rqst->rq_task->tk_client ?
+ rqst->rq_task->tk_client->cl_clid : -1;
+ __entry->rc = rc;
+ ),
+
+ TP_printk("task:%u@%u cq.id=%u rc=%d",
+ __entry->task_id, __entry->client_id,
+ __entry->cq_id, __entry->rc
+ )
+);
+
TRACE_EVENT(xprtrdma_post_recv,
TP_PROTO(
const struct rpcrdma_rep *rep
diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
index 754c5dffe127..f700b34a5bfd 100644
--- a/net/sunrpc/xprtrdma/frwr_ops.c
+++ b/net/sunrpc/xprtrdma/frwr_ops.c
@@ -394,6 +394,7 @@ int frwr_send(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
struct rpcrdma_ep *ep = r_xprt->rx_ep;
struct rpcrdma_mr *mr;
unsigned int num_wrs;
+ int ret;
num_wrs = 1;
post_wr = send_wr;
@@ -420,7 +421,10 @@ int frwr_send(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
}
trace_xprtrdma_post_send(req);
- return ib_post_send(ep->re_id->qp, post_wr, NULL);
+ ret = ib_post_send(ep->re_id->qp, post_wr, NULL);
+ if (ret)
+ trace_xprtrdma_post_send_err(r_xprt, req, ret);
+ return ret;
}
/**
Hi Chuck,
On Mon, Aug 2, 2021 at 2:45 PM Chuck Lever <[email protected]> wrote:
>
> Hi-
>
> Not sure I've posted these yet. I've been working on some error
> injection features and while testing them, I found a few bugs in
> the NFS/RDMA client.
Thanks! I've added these to my linux-next branch for 5.14.
Anna
>
> ---
>
> Chuck Lever (5):
> xprtrdma: Disconnect after an ib_post_send() immediate error
> xprtrdma: Put rpcrdma_reps before waking the tear-down completion
> xprtrdma: Add xprtrdma_post_recvs_err() tracepoint
> xprtrdma: Add an xprtrdma_post_send_err tracepoint
> xprtrdma: Eliminate rpcrdma_post_sends()
>
>
> include/trace/events/rpcrdma.h | 74 ++++++++++++++++++++++++++++---
> net/sunrpc/xprtrdma/backchannel.c | 2 +-
> net/sunrpc/xprtrdma/frwr_ops.c | 14 +++++-
> net/sunrpc/xprtrdma/transport.c | 2 +-
> net/sunrpc/xprtrdma/verbs.c | 28 +++---------
> net/sunrpc/xprtrdma/xprt_rdma.h | 2 +-
> 6 files changed, 90 insertions(+), 32 deletions(-)
>
> --
> Chuck Lever
>