Hi Anna-
These patches harden xprtrdma's reply handler code path, particu-
larly XDR decoding of incoming transport headers. I didn't copy
linux-rdma on this series since it is strictly XDR-related.
Please have a look at these and let me know if they are not
acceptable for v4.14.
---
Chuck Lever (7):
xprtrdma: Add xdr_init_decode to rpcrdma_reply_handler()
xprtrdma: Harden backchannel call decoding
xprtrdma: Refactor rpcrdma_reply_handler()
xprtrdma: Replace rpcrdma_count_chunks()
xprtrdma: Remove opcode check in Receive completion handler
xprtrdma: Remove rpcrdma_rep::rr_len
xprtrdma: Clean up XDR decoding in rpcrdma_update_granted_credits()
include/linux/sunrpc/xdr.h | 13 +
net/sunrpc/xprtrdma/backchannel.c | 40 +---
net/sunrpc/xprtrdma/rpc_rdma.c | 422 +++++++++++++++++++++++--------------
net/sunrpc/xprtrdma/verbs.c | 21 +-
net/sunrpc/xprtrdma/xprt_rdma.h | 13 +
5 files changed, 304 insertions(+), 205 deletions(-)
--
Chuck Lever
Transport header decoding deals with untrusted input data, therefore
decoding this header needs to be hardened.
Adopt the same infrastructure that is used when XDR decoding NFS
replies. This is slightly more CPU-intensive than the replaced code,
but we're not adding new atomics, locking, or context switches. The
cost is manageable.
Start by initializing an xdr_stream in rpcrdma_reply_handler().
Signed-off-by: Chuck Lever <[email protected]>
---
net/sunrpc/xprtrdma/rpc_rdma.c | 37 +++++++++++++++++++++++--------------
net/sunrpc/xprtrdma/verbs.c | 3 +++
net/sunrpc/xprtrdma/xprt_rdma.h | 8 ++++++++
3 files changed, 34 insertions(+), 14 deletions(-)
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index ca4d6e4..24f58c7 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -993,10 +993,11 @@ static bool rpcrdma_results_inline(struct rpcrdma_xprt *r_xprt,
struct rpcrdma_xprt *r_xprt = rep->rr_rxprt;
struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
struct rpc_xprt *xprt = &r_xprt->rx_xprt;
+ struct xdr_stream *xdr = &rep->rr_stream;
struct rpcrdma_msg *headerp;
struct rpcrdma_req *req;
struct rpc_rqst *rqst;
- __be32 *iptr;
+ __be32 *iptr, *p, xid, vers, proc;
int rdmalen, status, rmerr;
unsigned long cwnd;
struct list_head mws;
@@ -1005,8 +1006,18 @@ static bool rpcrdma_results_inline(struct rpcrdma_xprt *r_xprt,
if (rep->rr_len == RPCRDMA_BAD_LEN)
goto out_badstatus;
- if (rep->rr_len < RPCRDMA_HDRLEN_ERR)
+
+ xdr_init_decode(xdr, &rep->rr_hdrbuf,
+ rep->rr_hdrbuf.head[0].iov_base);
+
+ /* Fixed transport header fields */
+ p = xdr_inline_decode(xdr, 4 * sizeof(*p));
+ if (unlikely(!p))
goto out_shortreply;
+ xid = *p++;
+ vers = *p++;
+ p++; /* credits */
+ proc = *p++;
headerp = rdmab_to_msg(rep->rr_rdmabuf);
#if defined(CONFIG_SUNRPC_BACKCHANNEL)
@@ -1018,8 +1029,7 @@ static bool rpcrdma_results_inline(struct rpcrdma_xprt *r_xprt,
* get context for handling any incoming chunks.
*/
spin_lock(&buf->rb_lock);
- req = rpcrdma_lookup_req_locked(&r_xprt->rx_buf,
- headerp->rm_xid);
+ req = rpcrdma_lookup_req_locked(&r_xprt->rx_buf, xid);
if (!req)
goto out_nomatch;
if (req->rl_reply)
@@ -1035,7 +1045,7 @@ static bool rpcrdma_results_inline(struct rpcrdma_xprt *r_xprt,
spin_unlock(&buf->rb_lock);
dprintk("RPC: %s: reply %p completes request %p (xid 0x%08x)\n",
- __func__, rep, req, be32_to_cpu(headerp->rm_xid));
+ __func__, rep, req, be32_to_cpu(xid));
/* Invalidate and unmap the data payloads before waking the
* waiting application. This guarantees the memory regions
@@ -1052,16 +1062,16 @@ static bool rpcrdma_results_inline(struct rpcrdma_xprt *r_xprt,
* the rep, rqst, and rq_task pointers remain stable.
*/
spin_lock_bh(&xprt->transport_lock);
- rqst = xprt_lookup_rqst(xprt, headerp->rm_xid);
+ rqst = xprt_lookup_rqst(xprt, xid);
if (!rqst)
goto out_norqst;
xprt->reestablish_timeout = 0;
- if (headerp->rm_vers != rpcrdma_version)
+ if (vers != rpcrdma_version)
goto out_badversion;
/* check for expected message types */
/* The order of some of these tests is important. */
- switch (headerp->rm_type) {
+ switch (proc) {
case rdma_msg:
/* never expect read chunks */
/* never expect reply chunks (two ways to check) */
@@ -1123,7 +1133,7 @@ static bool rpcrdma_results_inline(struct rpcrdma_xprt *r_xprt,
default:
dprintk("RPC: %5u %s: invalid rpcrdma reply (type %u)\n",
rqst->rq_task->tk_pid, __func__,
- be32_to_cpu(headerp->rm_type));
+ be32_to_cpu(proc));
status = -EIO;
r_xprt->rx_stats.bad_reply_count++;
break;
@@ -1161,7 +1171,7 @@ static bool rpcrdma_results_inline(struct rpcrdma_xprt *r_xprt,
*/
out_badversion:
dprintk("RPC: %s: invalid version %d\n",
- __func__, be32_to_cpu(headerp->rm_vers));
+ __func__, be32_to_cpu(vers));
status = -EIO;
r_xprt->rx_stats.bad_reply_count++;
goto out;
@@ -1204,16 +1214,15 @@ static bool rpcrdma_results_inline(struct rpcrdma_xprt *r_xprt,
out_nomatch:
spin_unlock(&buf->rb_lock);
- dprintk("RPC: %s: no match for incoming xid 0x%08x len %d\n",
- __func__, be32_to_cpu(headerp->rm_xid),
- rep->rr_len);
+ dprintk("RPC: %s: no match for incoming xid 0x%08x\n",
+ __func__, be32_to_cpu(xid));
goto repost;
out_duplicate:
spin_unlock(&buf->rb_lock);
dprintk("RPC: %s: "
"duplicate reply %p to RPC request %p: xid 0x%08x\n",
- __func__, rep, req, be32_to_cpu(headerp->rm_xid));
+ __func__, rep, req, be32_to_cpu(xid));
/* If no pending RPC transaction was matched, post a replacement
* receive buffer before returning.
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index e4171f2..f1b1c37 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -180,6 +180,7 @@
__func__, rep, wc->byte_len);
rep->rr_len = wc->byte_len;
+ rpcrdma_set_xdrlen(&rep->rr_hdrbuf, wc->byte_len);
rep->rr_wc_flags = wc->wc_flags;
rep->rr_inv_rkey = wc->ex.invalidate_rkey;
@@ -974,6 +975,8 @@ struct rpcrdma_rep *
rc = PTR_ERR(rep->rr_rdmabuf);
goto out_free;
}
+ xdr_buf_init(&rep->rr_hdrbuf, rep->rr_rdmabuf->rg_base,
+ rdmab_length(rep->rr_rdmabuf));
rep->rr_cqe.done = rpcrdma_wc_receive;
rep->rr_rxprt = r_xprt;
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index b282d3f..13556ae 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -223,6 +223,8 @@ struct rpcrdma_rep {
u32 rr_inv_rkey;
struct rpcrdma_xprt *rr_rxprt;
struct work_struct rr_work;
+ struct xdr_buf rr_hdrbuf;
+ struct xdr_stream rr_stream;
struct list_head rr_list;
struct ib_recv_wr rr_recv_wr;
struct rpcrdma_regbuf *rr_rdmabuf;
@@ -642,6 +644,12 @@ bool rpcrdma_prepare_send_sges(struct rpcrdma_ia *, struct rpcrdma_req *,
void rpcrdma_set_max_header_sizes(struct rpcrdma_xprt *);
void rpcrdma_reply_handler(struct work_struct *work);
+static inline void rpcrdma_set_xdrlen(struct xdr_buf *xdr, size_t len)
+{
+ xdr->head[0].iov_len = len;
+ xdr->len = len;
+}
+
/* RPC/RDMA module init - xprtrdma/transport.c
*/
extern unsigned int xprt_rdma_max_inline_read;
Refactor the reply handler's transport header decoding logic to make
it easier to understand and update.
Convert some of the handler to use xdr_streams, which will enable
stricter validation of input data and enable the eventual addition
of support for new combinations of chunks, such as "Write + Reply"
or "PZRC + normal Read".
Signed-off-by: Chuck Lever <[email protected]>
---
net/sunrpc/xprtrdma/rpc_rdma.c | 212 +++++++++++++++++++++++++---------------
1 file changed, 130 insertions(+), 82 deletions(-)
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index 9b5ab59..56f2277 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -1004,6 +1004,124 @@ static bool rpcrdma_results_inline(struct rpcrdma_xprt *r_xprt,
}
#endif /* CONFIG_SUNRPC_BACKCHANNEL */
+static int
+rpcrdma_decode_msg(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep,
+ struct rpc_rqst *rqst)
+{
+ struct xdr_stream *xdr = &rep->rr_stream;
+ int rdmalen, status;
+ __be32 *p;
+
+ p = xdr_inline_decode(xdr, 2 * sizeof(*p));
+ if (unlikely(!p))
+ return -EIO;
+
+ /* never expect read list */
+ if (unlikely(*p++ != xdr_zero))
+ return -EIO;
+
+ /* Write list */
+ if (*p != xdr_zero) {
+ char *base = rep->rr_hdrbuf.head[0].iov_base;
+
+ p++;
+ rdmalen = rpcrdma_count_chunks(rep, 1, &p);
+ if (rdmalen < 0 || *p++ != xdr_zero)
+ return -EIO;
+
+ rep->rr_len -= (char *)p - base;
+ status = rep->rr_len + rdmalen;
+ r_xprt->rx_stats.total_rdma_reply += rdmalen;
+
+ /* special case - last segment may omit padding */
+ rdmalen &= 3;
+ if (rdmalen) {
+ rdmalen = 4 - rdmalen;
+ status += rdmalen;
+ }
+ } else {
+ p = xdr_inline_decode(xdr, sizeof(*p));
+ if (!p)
+ return -EIO;
+
+ /* never expect reply chunk */
+ if (*p++ != xdr_zero)
+ return -EIO;
+ rdmalen = 0;
+ rep->rr_len -= RPCRDMA_HDRLEN_MIN;
+ status = rep->rr_len;
+ }
+
+ r_xprt->rx_stats.fixup_copy_count +=
+ rpcrdma_inline_fixup(rqst, (char *)p, rep->rr_len,
+ rdmalen);
+
+ return status;
+}
+
+static noinline int
+rpcrdma_decode_nomsg(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep)
+{
+ struct xdr_stream *xdr = &rep->rr_stream;
+ int rdmalen;
+ __be32 *p;
+
+ p = xdr_inline_decode(xdr, 3 * sizeof(*p));
+ if (unlikely(!p))
+ return -EIO;
+
+ /* never expect Read chunks */
+ if (unlikely(*p++ != xdr_zero))
+ return -EIO;
+ /* never expect Write chunks */
+ if (unlikely(*p++ != xdr_zero))
+ return -EIO;
+ /* always expect a Reply chunk */
+ if (unlikely(*p++ == xdr_zero))
+ return -EIO;
+
+ rdmalen = rpcrdma_count_chunks(rep, 0, &p);
+ if (rdmalen < 0)
+ return -EIO;
+ r_xprt->rx_stats.total_rdma_reply += rdmalen;
+
+ /* Reply chunk buffer already is the reply vector - no fixup. */
+ return rdmalen;
+}
+
+static noinline int
+rpcrdma_decode_error(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep,
+ struct rpc_rqst *rqst)
+{
+ struct xdr_stream *xdr = &rep->rr_stream;
+ __be32 *p;
+
+ p = xdr_inline_decode(xdr, sizeof(*p));
+ if (unlikely(!p))
+ return -EIO;
+
+ switch (*p) {
+ case err_vers:
+ p = xdr_inline_decode(xdr, 2 * sizeof(*p));
+ if (!p)
+ break;
+ dprintk("RPC: %5u: %s: server reports version error (%u-%u)\n",
+ rqst->rq_task->tk_pid, __func__,
+ be32_to_cpup(p), be32_to_cpu(*(p + 1)));
+ break;
+ case err_chunk:
+ dprintk("RPC: %5u: %s: server reports header decoding error\n",
+ rqst->rq_task->tk_pid, __func__);
+ break;
+ default:
+ dprintk("RPC: %5u: %s: server reports unrecognized error %d\n",
+ rqst->rq_task->tk_pid, __func__, be32_to_cpup(p));
+ }
+
+ r_xprt->rx_stats.bad_reply_count++;
+ return -EREMOTEIO;
+}
+
/* Process received RPC/RDMA messages.
*
* Errors must result in the RPC task either being awakened, or
@@ -1018,13 +1136,12 @@ static bool rpcrdma_results_inline(struct rpcrdma_xprt *r_xprt,
struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
struct rpc_xprt *xprt = &r_xprt->rx_xprt;
struct xdr_stream *xdr = &rep->rr_stream;
- struct rpcrdma_msg *headerp;
struct rpcrdma_req *req;
struct rpc_rqst *rqst;
- __be32 *iptr, *p, xid, vers, proc;
- int rdmalen, status, rmerr;
+ __be32 *p, xid, vers, proc;
unsigned long cwnd;
struct list_head mws;
+ int status;
dprintk("RPC: %s: incoming rep %p\n", __func__, rep);
@@ -1043,7 +1160,6 @@ static bool rpcrdma_results_inline(struct rpcrdma_xprt *r_xprt,
p++; /* credits */
proc = *p++;
- headerp = rdmab_to_msg(rep->rr_rdmabuf);
if (rpcrdma_is_bcall(r_xprt, rep, xid, proc))
return;
@@ -1091,75 +1207,21 @@ static bool rpcrdma_results_inline(struct rpcrdma_xprt *r_xprt,
if (vers != rpcrdma_version)
goto out_badversion;
- /* check for expected message types */
- /* The order of some of these tests is important. */
switch (proc) {
case rdma_msg:
- /* never expect read chunks */
- /* never expect reply chunks (two ways to check) */
- if (headerp->rm_body.rm_chunks[0] != xdr_zero ||
- (headerp->rm_body.rm_chunks[1] == xdr_zero &&
- headerp->rm_body.rm_chunks[2] != xdr_zero))
- goto badheader;
- if (headerp->rm_body.rm_chunks[1] != xdr_zero) {
- /* count any expected write chunks in read reply */
- /* start at write chunk array count */
- iptr = &headerp->rm_body.rm_chunks[2];
- rdmalen = rpcrdma_count_chunks(rep, 1, &iptr);
- /* check for validity, and no reply chunk after */
- if (rdmalen < 0 || *iptr++ != xdr_zero)
- goto badheader;
- rep->rr_len -=
- ((unsigned char *)iptr - (unsigned char *)headerp);
- status = rep->rr_len + rdmalen;
- r_xprt->rx_stats.total_rdma_reply += rdmalen;
- /* special case - last chunk may omit padding */
- if (rdmalen &= 3) {
- rdmalen = 4 - rdmalen;
- status += rdmalen;
- }
- } else {
- /* else ordinary inline */
- rdmalen = 0;
- iptr = (__be32 *)((unsigned char *)headerp +
- RPCRDMA_HDRLEN_MIN);
- rep->rr_len -= RPCRDMA_HDRLEN_MIN;
- status = rep->rr_len;
- }
-
- r_xprt->rx_stats.fixup_copy_count +=
- rpcrdma_inline_fixup(rqst, (char *)iptr, rep->rr_len,
- rdmalen);
+ status = rpcrdma_decode_msg(r_xprt, rep, rqst);
break;
-
case rdma_nomsg:
- /* never expect read or write chunks, always reply chunks */
- if (headerp->rm_body.rm_chunks[0] != xdr_zero ||
- headerp->rm_body.rm_chunks[1] != xdr_zero ||
- headerp->rm_body.rm_chunks[2] != xdr_one)
- goto badheader;
- iptr = (__be32 *)((unsigned char *)headerp +
- RPCRDMA_HDRLEN_MIN);
- rdmalen = rpcrdma_count_chunks(rep, 0, &iptr);
- if (rdmalen < 0)
- goto badheader;
- r_xprt->rx_stats.total_rdma_reply += rdmalen;
- /* Reply chunk buffer already is the reply vector - no fixup. */
- status = rdmalen;
+ status = rpcrdma_decode_nomsg(r_xprt, rep);
break;
-
case rdma_error:
- goto out_rdmaerr;
-
-badheader:
+ status = rpcrdma_decode_error(r_xprt, rep, rqst);
+ break;
default:
- dprintk("RPC: %5u %s: invalid rpcrdma reply (type %u)\n",
- rqst->rq_task->tk_pid, __func__,
- be32_to_cpu(proc));
status = -EIO;
- r_xprt->rx_stats.bad_reply_count++;
- break;
}
+ if (status < 0)
+ goto out_badheader;
out:
cwnd = xprt->cwnd;
@@ -1192,25 +1254,11 @@ static bool rpcrdma_results_inline(struct rpcrdma_xprt *r_xprt,
r_xprt->rx_stats.bad_reply_count++;
goto out;
-out_rdmaerr:
- rmerr = be32_to_cpu(headerp->rm_body.rm_error.rm_err);
- switch (rmerr) {
- case ERR_VERS:
- pr_err("%s: server reports header version error (%u-%u)\n",
- __func__,
- be32_to_cpu(headerp->rm_body.rm_error.rm_vers_low),
- be32_to_cpu(headerp->rm_body.rm_error.rm_vers_high));
- break;
- case ERR_CHUNK:
- pr_err("%s: server reports header decoding error\n",
- __func__);
- break;
- default:
- pr_err("%s: server reports unknown error %d\n",
- __func__, rmerr);
- }
- status = -EREMOTEIO;
+out_badheader:
+ dprintk("RPC: %5u %s: invalid rpcrdma reply (type %u)\n",
+ rqst->rq_task->tk_pid, __func__, be32_to_cpu(proc));
r_xprt->rx_stats.bad_reply_count++;
+ status = -EIO;
goto out;
/* The req was still available, but by the time the transport_lock
Signed-off-by: Chuck Lever <[email protected]>
---
include/linux/sunrpc/xdr.h | 13 ++++++++
net/sunrpc/xprtrdma/backchannel.c | 40 ++++++--------------------
net/sunrpc/xprtrdma/rpc_rdma.c | 58 ++++++++++++++++++++++++-------------
3 files changed, 59 insertions(+), 52 deletions(-)
diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h
index 261b48a..86b59e3 100644
--- a/include/linux/sunrpc/xdr.h
+++ b/include/linux/sunrpc/xdr.h
@@ -239,6 +239,19 @@ extern void xdr_init_decode_pages(struct xdr_stream *xdr, struct xdr_buf *buf,
extern void xdr_enter_page(struct xdr_stream *xdr, unsigned int len);
extern int xdr_process_buf(struct xdr_buf *buf, unsigned int offset, unsigned int len, int (*actor)(struct scatterlist *, void *), void *data);
+/**
+ * xdr_stream_remaining - Return the number of bytes remaining in the stream
+ * @xdr: pointer to struct xdr_stream
+ *
+ * Return value:
+ * Number of bytes remaining in @xdr before xdr->end
+ */
+static inline size_t
+xdr_stream_remaining(const struct xdr_stream *xdr)
+{
+ return xdr->nwords << 2;
+}
+
ssize_t xdr_stream_decode_string_dup(struct xdr_stream *xdr, char **str,
size_t maxlen, gfp_t gfp_flags);
/**
diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c
index 03f6b58..183a103 100644
--- a/net/sunrpc/xprtrdma/backchannel.c
+++ b/net/sunrpc/xprtrdma/backchannel.c
@@ -271,9 +271,6 @@ void xprt_rdma_bc_free_rqst(struct rpc_rqst *rqst)
* @xprt: transport receiving the call
* @rep: receive buffer containing the call
*
- * Called in the RPC reply handler, which runs in a tasklet.
- * Be quick about it.
- *
* Operational assumptions:
* o Backchannel credits are ignored, just as the NFS server
* forechannel currently does
@@ -284,7 +281,6 @@ void rpcrdma_bc_receive_call(struct rpcrdma_xprt *r_xprt,
struct rpcrdma_rep *rep)
{
struct rpc_xprt *xprt = &r_xprt->rx_xprt;
- struct rpcrdma_msg *headerp;
struct svc_serv *bc_serv;
struct rpcrdma_req *req;
struct rpc_rqst *rqst;
@@ -292,24 +288,15 @@ void rpcrdma_bc_receive_call(struct rpcrdma_xprt *r_xprt,
size_t size;
__be32 *p;
- headerp = rdmab_to_msg(rep->rr_rdmabuf);
+ p = xdr_inline_decode(&rep->rr_stream, 0);
+ size = xdr_stream_remaining(&rep->rr_stream);
+
#ifdef RPCRDMA_BACKCHANNEL_DEBUG
pr_info("RPC: %s: callback XID %08x, length=%u\n",
- __func__, be32_to_cpu(headerp->rm_xid), rep->rr_len);
- pr_info("RPC: %s: %*ph\n", __func__, rep->rr_len, headerp);
+ __func__, be32_to_cpup(p), size);
+ pr_info("RPC: %s: %*ph\n", __func__, size, p);
#endif
- /* Sanity check:
- * Need at least enough bytes for RPC/RDMA header, as code
- * here references the header fields by array offset. Also,
- * backward calls are always inline, so ensure there
- * are some bytes beyond the RPC/RDMA header.
- */
- if (rep->rr_len < RPCRDMA_HDRLEN_MIN + 24)
- goto out_short;
- p = (__be32 *)((unsigned char *)headerp + RPCRDMA_HDRLEN_MIN);
- size = rep->rr_len - RPCRDMA_HDRLEN_MIN;
-
/* Grab a free bc rqst */
spin_lock(&xprt->bc_pa_lock);
if (list_empty(&xprt->bc_pa_list)) {
@@ -325,7 +312,7 @@ void rpcrdma_bc_receive_call(struct rpcrdma_xprt *r_xprt,
/* Prepare rqst */
rqst->rq_reply_bytes_recvd = 0;
rqst->rq_bytes_sent = 0;
- rqst->rq_xid = headerp->rm_xid;
+ rqst->rq_xid = *p;
rqst->rq_private_buf.len = size;
set_bit(RPC_BC_PA_IN_USE, &rqst->rq_bc_pa_state);
@@ -337,9 +324,9 @@ void rpcrdma_bc_receive_call(struct rpcrdma_xprt *r_xprt,
buf->len = size;
/* The receive buffer has to be hooked to the rpcrdma_req
- * so that it can be reposted after the server is done
- * parsing it but just before sending the backward
- * direction reply.
+ * so that it is not released while the req is pointing
+ * to its buffer, and so that it can be reposted after
+ * the Upper Layer is done decoding it.
*/
req = rpcr_to_rdmar(rqst);
dprintk("RPC: %s: attaching rep %p to req %p\n",
@@ -367,13 +354,4 @@ void rpcrdma_bc_receive_call(struct rpcrdma_xprt *r_xprt,
* when the connection is re-established.
*/
return;
-
-out_short:
- pr_warn("RPC/RDMA short backward direction call\n");
-
- if (rpcrdma_ep_post_recv(&r_xprt->rx_ia, rep))
- xprt_disconnect_done(xprt);
- else
- pr_warn("RPC: %s: reposting rep %p\n",
- __func__, rep);
}
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index 24f58c7..9b5ab59 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -949,35 +949,59 @@ static bool rpcrdma_results_inline(struct rpcrdma_xprt *r_xprt,
}
}
-#if defined(CONFIG_SUNRPC_BACKCHANNEL)
/* By convention, backchannel calls arrive via rdma_msg type
* messages, and never populate the chunk lists. This makes
* the RPC/RDMA header small and fixed in size, so it is
* straightforward to check the RPC header's direction field.
*/
static bool
-rpcrdma_is_bcall(struct rpcrdma_msg *headerp)
+rpcrdma_is_bcall(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep,
+ __be32 xid, __be32 proc)
+#if defined(CONFIG_SUNRPC_BACKCHANNEL)
{
- __be32 *p = (__be32 *)headerp;
+ struct xdr_stream *xdr = &rep->rr_stream;
+ __be32 *p;
- if (headerp->rm_type != rdma_msg)
+ if (proc != rdma_msg)
return false;
- if (headerp->rm_body.rm_chunks[0] != xdr_zero)
+
+ /* Peek at stream contents without advancing. */
+ p = xdr_inline_decode(xdr, 0);
+
+ /* Chunk lists */
+ if (*p++ != xdr_zero)
return false;
- if (headerp->rm_body.rm_chunks[1] != xdr_zero)
+ if (*p++ != xdr_zero)
return false;
- if (headerp->rm_body.rm_chunks[2] != xdr_zero)
+ if (*p++ != xdr_zero)
return false;
- /* sanity */
- if (p[7] != headerp->rm_xid)
+ /* RPC header */
+ if (*p++ != xid)
return false;
- /* call direction */
- if (p[8] != cpu_to_be32(RPC_CALL))
+ if (*p != cpu_to_be32(RPC_CALL))
return false;
+ /* Now that we are sure this is a backchannel call,
+ * advance to the RPC header.
+ */
+ p = xdr_inline_decode(xdr, 3 * sizeof(*p));
+ if (unlikely(!p))
+ goto out_short;
+
+ rpcrdma_bc_receive_call(r_xprt, rep);
+ return true;
+
+out_short:
+ pr_warn("RPC/RDMA short backward direction call\n");
+ if (rpcrdma_ep_post_recv(&r_xprt->rx_ia, rep))
+ xprt_disconnect_done(&r_xprt->rx_xprt);
return true;
}
+#else /* CONFIG_SUNRPC_BACKCHANNEL */
+{
+ return false;
+}
#endif /* CONFIG_SUNRPC_BACKCHANNEL */
/* Process received RPC/RDMA messages.
@@ -1020,10 +1044,8 @@ static bool rpcrdma_results_inline(struct rpcrdma_xprt *r_xprt,
proc = *p++;
headerp = rdmab_to_msg(rep->rr_rdmabuf);
-#if defined(CONFIG_SUNRPC_BACKCHANNEL)
- if (rpcrdma_is_bcall(headerp))
- goto out_bcall;
-#endif
+ if (rpcrdma_is_bcall(r_xprt, rep, xid, proc))
+ return;
/* Match incoming rpcrdma_rep to an rpcrdma_req to
* get context for handling any incoming chunks.
@@ -1159,12 +1181,6 @@ static bool rpcrdma_results_inline(struct rpcrdma_xprt *r_xprt,
}
return;
-#if defined(CONFIG_SUNRPC_BACKCHANNEL)
-out_bcall:
- rpcrdma_bc_receive_call(r_xprt, rep);
- return;
-#endif
-
/* If the incoming reply terminated a pending RPC, the next
* RPC call will post a replacement receive buffer as it is
* being marshaled.
This field is no longer used outside the Receive completion handler.
Signed-off-by: Chuck Lever <[email protected]>
---
net/sunrpc/xprtrdma/rpc_rdma.c | 2 +-
net/sunrpc/xprtrdma/verbs.c | 11 ++++-------
net/sunrpc/xprtrdma/xprt_rdma.h | 3 ---
3 files changed, 5 insertions(+), 11 deletions(-)
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index e422c0f..6219861 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -1178,7 +1178,7 @@ static int decode_reply_chunk(struct xdr_stream *xdr, u32 *length)
dprintk("RPC: %s: incoming rep %p\n", __func__, rep);
- if (rep->rr_len == RPCRDMA_BAD_LEN)
+ if (rep->rr_hdrbuf.head[0].iov_len == 0)
goto out_badstatus;
xdr_init_decode(xdr, &rep->rr_hdrbuf,
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 74dbba8..5d36c06 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -143,9 +143,6 @@
struct rpcrdma_buffer *buffer = &rep->rr_rxprt->rx_buf;
u32 credits;
- if (rep->rr_len < RPCRDMA_HDRLEN_ERR)
- return;
-
credits = be32_to_cpu(rmsgp->rm_credit);
if (credits == 0)
credits = 1; /* don't deadlock */
@@ -176,16 +173,16 @@
dprintk("RPC: %s: rep %p opcode 'recv', length %u: success\n",
__func__, rep, wc->byte_len);
- rep->rr_len = wc->byte_len;
rpcrdma_set_xdrlen(&rep->rr_hdrbuf, wc->byte_len);
rep->rr_wc_flags = wc->wc_flags;
rep->rr_inv_rkey = wc->ex.invalidate_rkey;
ib_dma_sync_single_for_cpu(rdmab_device(rep->rr_rdmabuf),
rdmab_addr(rep->rr_rdmabuf),
- rep->rr_len, DMA_FROM_DEVICE);
+ wc->byte_len, DMA_FROM_DEVICE);
- rpcrdma_update_granted_credits(rep);
+ if (wc->byte_len >= RPCRDMA_HDRLEN_ERR)
+ rpcrdma_update_granted_credits(rep);
out_schedule:
queue_work(rpcrdma_receive_wq, &rep->rr_work);
@@ -196,7 +193,7 @@
pr_err("rpcrdma: Recv: %s (%u/0x%x)\n",
ib_wc_status_msg(wc->status),
wc->status, wc->vendor_err);
- rep->rr_len = RPCRDMA_BAD_LEN;
+ rpcrdma_set_xdrlen(&rep->rr_hdrbuf, 0);
goto out_schedule;
}
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index 13556ae..d4a897a 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -218,7 +218,6 @@ enum {
struct rpcrdma_rep {
struct ib_cqe rr_cqe;
- unsigned int rr_len;
int rr_wc_flags;
u32 rr_inv_rkey;
struct rpcrdma_xprt *rr_rxprt;
@@ -230,8 +229,6 @@ struct rpcrdma_rep {
struct rpcrdma_regbuf *rr_rdmabuf;
};
-#define RPCRDMA_BAD_LEN (~0U)
-
/*
* struct rpcrdma_mw - external memory region metadata
*
Clean up: The opcode check is no longer necessary, because since
commit 2fa8f88d8892 ("xprtrdma: Use new CQ API for RPC-over-RDMA
client send CQs"), this completion handler is invoked only for
RECV work requests.
Signed-off-by: Chuck Lever <[email protected]>
---
net/sunrpc/xprtrdma/verbs.c | 3 ---
1 file changed, 3 deletions(-)
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index f1b1c37..74dbba8 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -173,9 +173,6 @@
goto out_fail;
/* status == SUCCESS means all fields in wc are trustworthy */
- if (wc->opcode != IB_WC_RECV)
- return;
-
dprintk("RPC: %s: rep %p opcode 'recv', length %u: success\n",
__func__, rep, wc->byte_len);
Clean up: Replace C-structure based XDR decoding for consistency
with other areas.
struct rpcrdma_rep is rearranged slightly so that the relevant fields
are in cache when the Receive completion handler is invoked.
Signed-off-by: Chuck Lever <[email protected]>
---
net/sunrpc/xprtrdma/verbs.c | 4 ++--
net/sunrpc/xprtrdma/xprt_rdma.h | 2 +-
2 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 5d36c06..c78fb27 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -139,11 +139,11 @@
static void
rpcrdma_update_granted_credits(struct rpcrdma_rep *rep)
{
- struct rpcrdma_msg *rmsgp = rdmab_to_msg(rep->rr_rdmabuf);
struct rpcrdma_buffer *buffer = &rep->rr_rxprt->rx_buf;
+ __be32 *p = rep->rr_rdmabuf->rg_base;
u32 credits;
- credits = be32_to_cpu(rmsgp->rm_credit);
+ credits = be32_to_cpup(p + 2);
if (credits == 0)
credits = 1; /* don't deadlock */
else if (credits > buffer->rb_max_requests)
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index d4a897a..52e73ea 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -220,13 +220,13 @@ struct rpcrdma_rep {
struct ib_cqe rr_cqe;
int rr_wc_flags;
u32 rr_inv_rkey;
+ struct rpcrdma_regbuf *rr_rdmabuf;
struct rpcrdma_xprt *rr_rxprt;
struct work_struct rr_work;
struct xdr_buf rr_hdrbuf;
struct xdr_stream rr_stream;
struct list_head rr_list;
struct ib_recv_wr rr_recv_wr;
- struct rpcrdma_regbuf *rr_rdmabuf;
};
/*
Clean up chunk list decoding by using the xdr_stream set up in
rpcrdma_reply_handler. This hardens decoding by checking for buffer
overflow at every step while unmarshaling variable-length XDR
objects.
Signed-off-by: Chuck Lever <[email protected]>
---
net/sunrpc/xprtrdma/rpc_rdma.c | 221 +++++++++++++++++++++++-----------------
1 file changed, 127 insertions(+), 94 deletions(-)
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index 56f2277..e422c0f 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -792,48 +792,6 @@ static bool rpcrdma_results_inline(struct rpcrdma_xprt *r_xprt,
return PTR_ERR(iptr);
}
-/*
- * Chase down a received write or reply chunklist to get length
- * RDMA'd by server. See map at rpcrdma_create_chunks()! :-)
- */
-static int
-rpcrdma_count_chunks(struct rpcrdma_rep *rep, int wrchunk, __be32 **iptrp)
-{
- unsigned int i, total_len;
- struct rpcrdma_write_chunk *cur_wchunk;
- char *base = (char *)rdmab_to_msg(rep->rr_rdmabuf);
-
- i = be32_to_cpu(**iptrp);
- cur_wchunk = (struct rpcrdma_write_chunk *) (*iptrp + 1);
- total_len = 0;
- while (i--) {
- struct rpcrdma_segment *seg = &cur_wchunk->wc_target;
- ifdebug(FACILITY) {
- u64 off;
- xdr_decode_hyper((__be32 *)&seg->rs_offset, &off);
- dprintk("RPC: %s: chunk %d@0x%016llx:0x%08x\n",
- __func__,
- be32_to_cpu(seg->rs_length),
- (unsigned long long)off,
- be32_to_cpu(seg->rs_handle));
- }
- total_len += be32_to_cpu(seg->rs_length);
- ++cur_wchunk;
- }
- /* check and adjust for properly terminated write chunk */
- if (wrchunk) {
- __be32 *w = (__be32 *) cur_wchunk;
- if (*w++ != xdr_zero)
- return -1;
- cur_wchunk = (struct rpcrdma_write_chunk *) w;
- }
- if ((char *)cur_wchunk > base + rep->rr_len)
- return -1;
-
- *iptrp = (__be32 *) cur_wchunk;
- return total_len;
-}
-
/**
* rpcrdma_inline_fixup - Scatter inline received data into rqst's iovecs
* @rqst: controlling RPC request
@@ -1004,89 +962,164 @@ static bool rpcrdma_results_inline(struct rpcrdma_xprt *r_xprt,
}
#endif /* CONFIG_SUNRPC_BACKCHANNEL */
-static int
-rpcrdma_decode_msg(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep,
- struct rpc_rqst *rqst)
+static int decode_rdma_segment(struct xdr_stream *xdr, u32 *length)
{
- struct xdr_stream *xdr = &rep->rr_stream;
- int rdmalen, status;
__be32 *p;
- p = xdr_inline_decode(xdr, 2 * sizeof(*p));
+ p = xdr_inline_decode(xdr, 4 * sizeof(*p));
if (unlikely(!p))
return -EIO;
- /* never expect read list */
- if (unlikely(*p++ != xdr_zero))
- return -EIO;
+ ifdebug(FACILITY) {
+ u64 offset;
+ u32 handle;
- /* Write list */
- if (*p != xdr_zero) {
- char *base = rep->rr_hdrbuf.head[0].iov_base;
+ handle = be32_to_cpup(p++);
+ *length = be32_to_cpup(p++);
+ xdr_decode_hyper(p, &offset);
+ dprintk("RPC: %s: segment %u@0x%016llx:0x%08x\n",
+ __func__, *length, (unsigned long long)offset,
+ handle);
+ } else {
+ *length = be32_to_cpup(p + 1);
+ }
- p++;
- rdmalen = rpcrdma_count_chunks(rep, 1, &p);
- if (rdmalen < 0 || *p++ != xdr_zero)
+ return 0;
+}
+
+static int decode_write_chunk(struct xdr_stream *xdr, u32 *length)
+{
+ u32 segcount, seglength;
+ __be32 *p;
+
+ p = xdr_inline_decode(xdr, sizeof(*p));
+ if (unlikely(!p))
+ return -EIO;
+
+ *length = 0;
+ segcount = be32_to_cpup(p);
+ while (segcount--) {
+ if (decode_rdma_segment(xdr, &seglength))
return -EIO;
+ *length += seglength;
+ }
- rep->rr_len -= (char *)p - base;
- status = rep->rr_len + rdmalen;
- r_xprt->rx_stats.total_rdma_reply += rdmalen;
+ dprintk("RPC: %s: segcount=%u, %u bytes\n",
+ __func__, be32_to_cpup(p), *length);
+ return 0;
+}
- /* special case - last segment may omit padding */
- rdmalen &= 3;
- if (rdmalen) {
- rdmalen = 4 - rdmalen;
- status += rdmalen;
- }
- } else {
+/* In RPC-over-RDMA Version One replies, a Read list is never
+ * expected. This decoder is a stub that returns an error if
+ * a Read list is present.
+ */
+static int decode_read_list(struct xdr_stream *xdr)
+{
+ __be32 *p;
+
+ p = xdr_inline_decode(xdr, sizeof(*p));
+ if (unlikely(!p))
+ return -EIO;
+ if (unlikely(*p != xdr_zero))
+ return -EIO;
+ return 0;
+}
+
+/* Supports only one Write chunk in the Write list
+ */
+static int decode_write_list(struct xdr_stream *xdr, u32 *length)
+{
+ u32 chunklen;
+ bool first;
+ __be32 *p;
+
+ *length = 0;
+ first = true;
+ do {
p = xdr_inline_decode(xdr, sizeof(*p));
- if (!p)
+ if (unlikely(!p))
+ return -EIO;
+ if (*p == xdr_zero)
+ break;
+ if (!first)
return -EIO;
- /* never expect reply chunk */
- if (*p++ != xdr_zero)
+ if (decode_write_chunk(xdr, &chunklen))
return -EIO;
- rdmalen = 0;
- rep->rr_len -= RPCRDMA_HDRLEN_MIN;
- status = rep->rr_len;
- }
+ *length += chunklen;
+ first = false;
+ } while (true);
+ return 0;
+}
+
+static int decode_reply_chunk(struct xdr_stream *xdr, u32 *length)
+{
+ __be32 *p;
+
+ p = xdr_inline_decode(xdr, sizeof(*p));
+ if (unlikely(!p))
+ return -EIO;
+
+ *length = 0;
+ if (*p != xdr_zero)
+ if (decode_write_chunk(xdr, length))
+ return -EIO;
+ return 0;
+}
+static int
+rpcrdma_decode_msg(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep,
+ struct rpc_rqst *rqst)
+{
+ struct xdr_stream *xdr = &rep->rr_stream;
+ u32 writelist, replychunk, rpclen;
+ char *base;
+
+ /* Decode the chunk lists */
+ if (decode_read_list(xdr))
+ return -EIO;
+ if (decode_write_list(xdr, &writelist))
+ return -EIO;
+ if (decode_reply_chunk(xdr, &replychunk))
+ return -EIO;
+
+ /* RDMA_MSG sanity checks */
+ if (unlikely(replychunk))
+ return -EIO;
+
+ /* Build the RPC reply's Payload stream in rqst->rq_rcv_buf */
+ base = (char *)xdr_inline_decode(xdr, 0);
+ rpclen = xdr_stream_remaining(xdr);
r_xprt->rx_stats.fixup_copy_count +=
- rpcrdma_inline_fixup(rqst, (char *)p, rep->rr_len,
- rdmalen);
+ rpcrdma_inline_fixup(rqst, base, rpclen, writelist & 3);
- return status;
+ r_xprt->rx_stats.total_rdma_reply += writelist;
+ return rpclen + xdr_align_size(writelist);
}
static noinline int
rpcrdma_decode_nomsg(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep)
{
struct xdr_stream *xdr = &rep->rr_stream;
- int rdmalen;
- __be32 *p;
+ u32 writelist, replychunk;
- p = xdr_inline_decode(xdr, 3 * sizeof(*p));
- if (unlikely(!p))
+ /* Decode the chunk lists */
+ if (decode_read_list(xdr))
return -EIO;
-
- /* never expect Read chunks */
- if (unlikely(*p++ != xdr_zero))
+ if (decode_write_list(xdr, &writelist))
return -EIO;
- /* never expect Write chunks */
- if (unlikely(*p++ != xdr_zero))
- return -EIO;
- /* always expect a Reply chunk */
- if (unlikely(*p++ == xdr_zero))
+ if (decode_reply_chunk(xdr, &replychunk))
return -EIO;
- rdmalen = rpcrdma_count_chunks(rep, 0, &p);
- if (rdmalen < 0)
+ /* RDMA_NOMSG sanity checks */
+ if (unlikely(writelist))
+ return -EIO;
+ if (unlikely(!replychunk))
return -EIO;
- r_xprt->rx_stats.total_rdma_reply += rdmalen;
- /* Reply chunk buffer already is the reply vector - no fixup. */
- return rdmalen;
+ /* Reply chunk buffer already is the reply vector */
+ r_xprt->rx_stats.total_rdma_reply += replychunk;
+ return replychunk;
}
static noinline int
On 08/03/2017 02:29 PM, Chuck Lever wrote:
> Hi Anna-
Hi Chuck,
>
> These patches harden xprtrdma's reply handler code path, particu-
> larly XDR decoding of incoming transport headers. I didn't copy
> linux-rdma on this series since it is strictly XDR-related.
>
> Please have a look at these and let me know if they are not
> acceptable for v4.14.
Patches look okay to me. Thanks!
Anna
>
> ---
>
> Chuck Lever (7):
> xprtrdma: Add xdr_init_decode to rpcrdma_reply_handler()
> xprtrdma: Harden backchannel call decoding
> xprtrdma: Refactor rpcrdma_reply_handler()
> xprtrdma: Replace rpcrdma_count_chunks()
> xprtrdma: Remove opcode check in Receive completion handler
> xprtrdma: Remove rpcrdma_rep::rr_len
> xprtrdma: Clean up XDR decoding in rpcrdma_update_granted_credits()
>
>
> include/linux/sunrpc/xdr.h | 13 +
> net/sunrpc/xprtrdma/backchannel.c | 40 +---
> net/sunrpc/xprtrdma/rpc_rdma.c | 422 +++++++++++++++++++++++--------------
> net/sunrpc/xprtrdma/verbs.c | 21 +-
> net/sunrpc/xprtrdma/xprt_rdma.h | 13 +
> 5 files changed, 304 insertions(+), 205 deletions(-)
>
> --
> Chuck Lever
>