2016-04-11 20:10:10

by Chuck Lever

Subject: [PATCH v1 00/18] NFS/RDMA client patches for 4.7

One larger change: Attempt to fence memory regions after a signal
interrupts a synchronous RPC. This should prevent a server from
writing a reply into a client's memory after the memory has been
released.

In addition, the following changes and fixes:

- Use new ib_drain_qp() API
- Advertise max size of NFSv4.1 callbacks on RPC/RDMA
- Prevent overflowing the server's receive buffers
- Read list + Reply chunk (to support krb5i and krb5p on RPC/RDMA)
- Detect connection loss sooner

---

Chuck Lever (18):
sunrpc: Advertise maximum backchannel payload size
xprtrdma: Bound the inline threshold values
xprtrdma: Limit number of RDMA segments in RPC-over-RDMA headers
xprtrdma: Prevent inline overflow
xprtrdma: Avoid using Write list for small NFS READ requests
xprtrdma: Update comments in rpcrdma_marshal_req()
xprtrdma: Allow Read list and Reply chunk simultaneously
xprtrdma: Remove rpcrdma_create_chunks()
xprtrdma: Use core ib_drain_qp() API
xprtrdma: Rename rpcrdma_frwr::sg and sg_nents
xprtrdma: Save I/O direction in struct rpcrdma_frwr
xprtrdma: Reset MRs in frwr_op_unmap_sync()
xprtrdma: Refactor the FRWR recovery worker
xprtrdma: Move fr_xprt and fr_worker to struct rpcrdma_mw
xprtrdma: Refactor __fmr_dma_unmap()
xprtrdma: Add ro_unmap_safe memreg method
xprtrdma: Remove ro_unmap() from all registration modes
xprtrdma: Faster server reboot recovery


fs/nfs/nfs4proc.c | 10 -
include/linux/sunrpc/clnt.h | 1
include/linux/sunrpc/xprt.h | 1
include/linux/sunrpc/xprtrdma.h | 4
net/sunrpc/clnt.c | 17 +
net/sunrpc/xprtrdma/backchannel.c | 16 +
net/sunrpc/xprtrdma/fmr_ops.c | 134 +++++++--
net/sunrpc/xprtrdma/frwr_ops.c | 214 ++++++++-------
net/sunrpc/xprtrdma/physical_ops.c | 39 ++-
net/sunrpc/xprtrdma/rpc_rdma.c | 517 ++++++++++++++++++++++--------------
net/sunrpc/xprtrdma/transport.c | 16 +
net/sunrpc/xprtrdma/verbs.c | 91 ++----
net/sunrpc/xprtrdma/xprt_rdma.h | 42 ++-
net/sunrpc/xprtsock.c | 6
14 files changed, 674 insertions(+), 434 deletions(-)

--
Chuck Lever


2016-04-11 20:10:19

by Chuck Lever

Subject: [PATCH v1 01/18] sunrpc: Advertise maximum backchannel payload size

RPC-over-RDMA transports have a limit on how large a backward
direction (backchannel) RPC message can be. Ensure that the NFSv4.x
CREATE_SESSION operation advertises this limit to servers.
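
As a rough illustration (not part of the patch): xprt_rdma_bc_maxpayload()
below reports the smaller of the connection's inline read and write sizes,
minus the fixed RPC-over-RDMA header. Assuming the default 1024-byte inline
thresholds and an RPCRDMA_HDRLEN_MIN of 28 bytes (7 XDR words), the
advertised backchannel maximum works out to

    min(1024, 1024) - 28 = 996 bytes

so an RPC/RDMA mount would request bc_attrs.max_rqst_sz and max_resp_sz of
about 996 bytes at CREATE_SESSION time, while the TCP transport continues
to report PAGE_SIZE.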

Signed-off-by: Chuck Lever <[email protected]>
---
fs/nfs/nfs4proc.c | 10 ++++++----
include/linux/sunrpc/clnt.h | 1 +
include/linux/sunrpc/xprt.h | 1 +
net/sunrpc/clnt.c | 17 +++++++++++++++++
net/sunrpc/xprtrdma/backchannel.c | 16 ++++++++++++++++
net/sunrpc/xprtrdma/transport.c | 1 +
net/sunrpc/xprtrdma/xprt_rdma.h | 1 +
net/sunrpc/xprtsock.c | 6 ++++++
8 files changed, 49 insertions(+), 4 deletions(-)

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 327b8c3..01bef06 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -7351,9 +7351,11 @@ int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo)
* always set csa_cachethis to FALSE because the current implementation
* of the back channel DRC only supports caching the CB_SEQUENCE operation.
*/
-static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args)
+static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args,
+ struct rpc_clnt *clnt)
{
unsigned int max_rqst_sz, max_resp_sz;
+ unsigned int max_bc_payload = rpc_max_bc_payload(clnt);

max_rqst_sz = NFS_MAX_FILE_IO_SIZE + nfs41_maxwrite_overhead;
max_resp_sz = NFS_MAX_FILE_IO_SIZE + nfs41_maxread_overhead;
@@ -7371,8 +7373,8 @@ static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args)
args->fc_attrs.max_ops, args->fc_attrs.max_reqs);

/* Back channel attributes */
- args->bc_attrs.max_rqst_sz = PAGE_SIZE;
- args->bc_attrs.max_resp_sz = PAGE_SIZE;
+ args->bc_attrs.max_rqst_sz = max_bc_payload;
+ args->bc_attrs.max_resp_sz = max_bc_payload;
args->bc_attrs.max_resp_sz_cached = 0;
args->bc_attrs.max_ops = NFS4_MAX_BACK_CHANNEL_OPS;
args->bc_attrs.max_reqs = NFS41_BC_MAX_CALLBACKS;
@@ -7476,7 +7478,7 @@ static int _nfs4_proc_create_session(struct nfs_client *clp,
};
int status;

- nfs4_init_channel_attrs(&args);
+ nfs4_init_channel_attrs(&args, clp->cl_rpcclient);
args.flags = (SESSION4_PERSIST | SESSION4_BACK_CHAN);

status = rpc_call_sync(session->clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT);
diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h
index 9a7ddba..19c659d 100644
--- a/include/linux/sunrpc/clnt.h
+++ b/include/linux/sunrpc/clnt.h
@@ -176,6 +176,7 @@ void rpc_setbufsize(struct rpc_clnt *, unsigned int, unsigned int);
int rpc_protocol(struct rpc_clnt *);
struct net * rpc_net_ns(struct rpc_clnt *);
size_t rpc_max_payload(struct rpc_clnt *);
+size_t rpc_max_bc_payload(struct rpc_clnt *);
unsigned long rpc_get_timeout(struct rpc_clnt *clnt);
void rpc_force_rebind(struct rpc_clnt *);
size_t rpc_peeraddr(struct rpc_clnt *, struct sockaddr *, size_t);
diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h
index fb0d212..5aa3834 100644
--- a/include/linux/sunrpc/xprt.h
+++ b/include/linux/sunrpc/xprt.h
@@ -142,6 +142,7 @@ struct rpc_xprt_ops {
int (*bc_setup)(struct rpc_xprt *xprt,
unsigned int min_reqs);
int (*bc_up)(struct svc_serv *serv, struct net *net);
+ size_t (*bc_maxpayload)(struct rpc_xprt *xprt);
void (*bc_free_rqst)(struct rpc_rqst *rqst);
void (*bc_destroy)(struct rpc_xprt *xprt,
unsigned int max_reqs);
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 7e0c9bf..06b4df9 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -1414,6 +1414,23 @@ size_t rpc_max_payload(struct rpc_clnt *clnt)
EXPORT_SYMBOL_GPL(rpc_max_payload);

/**
+ * rpc_max_bc_payload - Get maximum backchannel payload size, in bytes
+ * @clnt: RPC client to query
+ */
+size_t rpc_max_bc_payload(struct rpc_clnt *clnt)
+{
+ struct rpc_xprt *xprt;
+ size_t ret;
+
+ rcu_read_lock();
+ xprt = rcu_dereference(clnt->cl_xprt);
+ ret = xprt->ops->bc_maxpayload(xprt);
+ rcu_read_unlock();
+ return ret;
+}
+EXPORT_SYMBOL_GPL(rpc_max_bc_payload);
+
+/**
* rpc_get_timeout - Get timeout for transport in units of HZ
* @clnt: RPC client to query
*/
diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c
index 2dcd764..87762d9 100644
--- a/net/sunrpc/xprtrdma/backchannel.c
+++ b/net/sunrpc/xprtrdma/backchannel.c
@@ -192,6 +192,22 @@ int xprt_rdma_bc_up(struct svc_serv *serv, struct net *net)
}

/**
+ * xprt_rdma_bc_maxpayload - Return maximum backchannel message size
+ * @xprt: transport
+ *
+ * Returns maximum size, in bytes, of a backchannel message
+ */
+size_t xprt_rdma_bc_maxpayload(struct rpc_xprt *xprt)
+{
+ struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
+ struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data;
+ size_t maxmsg;
+
+ maxmsg = min_t(unsigned int, cdata->inline_rsize, cdata->inline_wsize);
+ return maxmsg - RPCRDMA_HDRLEN_MIN;
+}
+
+/**
* rpcrdma_bc_marshal_reply - Send backwards direction reply
* @rqst: buffer containing RPC reply data
*
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index b1b009f..9954342 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -707,6 +707,7 @@ static struct rpc_xprt_ops xprt_rdma_procs = {
#if defined(CONFIG_SUNRPC_BACKCHANNEL)
.bc_setup = xprt_rdma_bc_setup,
.bc_up = xprt_rdma_bc_up,
+ .bc_maxpayload = xprt_rdma_bc_maxpayload,
.bc_free_rqst = xprt_rdma_bc_free_rqst,
.bc_destroy = xprt_rdma_bc_destroy,
#endif
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index 2ebc743..7723e5f 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -534,6 +534,7 @@ void xprt_rdma_cleanup(void);
#if defined(CONFIG_SUNRPC_BACKCHANNEL)
int xprt_rdma_bc_setup(struct rpc_xprt *, unsigned int);
int xprt_rdma_bc_up(struct svc_serv *, struct net *);
+size_t xprt_rdma_bc_maxpayload(struct rpc_xprt *);
int rpcrdma_bc_post_recv(struct rpcrdma_xprt *, unsigned int);
void rpcrdma_bc_receive_call(struct rpcrdma_xprt *, struct rpcrdma_rep *);
int rpcrdma_bc_marshal_reply(struct rpc_rqst *);
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 65e7595..f1faf6b 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -1365,6 +1365,11 @@ static int xs_tcp_bc_up(struct svc_serv *serv, struct net *net)
return ret;
return 0;
}
+
+static size_t xs_tcp_bc_maxpayload(struct rpc_xprt *xprt)
+{
+ return PAGE_SIZE;
+}
#else
static inline int _xs_tcp_read_data(struct rpc_xprt *xprt,
struct xdr_skb_reader *desc)
@@ -2660,6 +2665,7 @@ static struct rpc_xprt_ops xs_tcp_ops = {
#ifdef CONFIG_SUNRPC_BACKCHANNEL
.bc_setup = xprt_setup_bc,
.bc_up = xs_tcp_bc_up,
+ .bc_maxpayload = xs_tcp_bc_maxpayload,
.bc_free_rqst = xprt_free_bc_rqst,
.bc_destroy = xprt_destroy_bc,
#endif


2016-04-11 20:10:27

by Chuck Lever

Subject: [PATCH v1 02/18] xprtrdma: Bound the inline threshold values

Currently the sysctls that control the inline threshold accept any
value.

Small values only make the transport run slower. The default 1KB
setting is as low as is reasonable. And the logic that decides how
to divide a Send buffer between the RPC-over-RDMA header and the RPC
message assumes (but does not check) that the lower bound is not
crazy (say, 57 bytes).

Send and receive buffers share a page with some control information.
Values larger than about 3KB can't be supported, currently.

Signed-off-by: Chuck Lever <[email protected]>
---
include/linux/sunrpc/xprtrdma.h | 4 +++-
net/sunrpc/xprtrdma/transport.c | 6 ++++++
2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/include/linux/sunrpc/xprtrdma.h b/include/linux/sunrpc/xprtrdma.h
index 767190b..39267dc 100644
--- a/include/linux/sunrpc/xprtrdma.h
+++ b/include/linux/sunrpc/xprtrdma.h
@@ -52,7 +52,9 @@
#define RPCRDMA_DEF_SLOT_TABLE (128U)
#define RPCRDMA_MAX_SLOT_TABLE (256U)

-#define RPCRDMA_DEF_INLINE (1024) /* default inline max */
+#define RPCRDMA_MIN_INLINE (1024) /* min inline thresh */
+#define RPCRDMA_DEF_INLINE (1024) /* default inline thresh */
+#define RPCRDMA_MAX_INLINE (3068) /* max inline thresh */

/* Memory registration strategies, by number.
* This is part of a kernel / user space API. Do not remove. */
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index 9954342..16595ff 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -73,6 +73,8 @@ static unsigned int xprt_rdma_memreg_strategy = RPCRDMA_FRMR;

static unsigned int min_slot_table_size = RPCRDMA_MIN_SLOT_TABLE;
static unsigned int max_slot_table_size = RPCRDMA_MAX_SLOT_TABLE;
+static unsigned int min_inline_size = RPCRDMA_MIN_INLINE;
+static unsigned int max_inline_size = RPCRDMA_MAX_INLINE;
static unsigned int zero;
static unsigned int max_padding = PAGE_SIZE;
static unsigned int min_memreg = RPCRDMA_BOUNCEBUFFERS;
@@ -96,6 +98,8 @@ static struct ctl_table xr_tunables_table[] = {
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec,
+ .extra1 = &min_inline_size,
+ .extra2 = &max_inline_size,
},
{
.procname = "rdma_max_inline_write",
@@ -103,6 +107,8 @@ static struct ctl_table xr_tunables_table[] = {
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec,
+ .extra1 = &min_inline_size,
+ .extra2 = &max_inline_size,
},
{
.procname = "rdma_inline_write_padding",


2016-04-11 20:10:36

by Chuck Lever

Subject: [PATCH v1 03/18] xprtrdma: Limit number of RDMA segments in RPC-over-RDMA headers

Send buffer space is shared between the RPC-over-RDMA header and
an RPC message. A large RPC-over-RDMA header means less space is
available for the associated RPC message, which then has to be
moved via an RDMA Read or Write.

As more segments are added to the chunk lists, the header increases
in size. Typical modern hardware needs only a few segments to
convey the maximum payload size, but some devices and registration
modes may need many segments to convey the same payload. Sometimes
so many are needed that the remaining space in the Send buffer is
not enough for the RPC message. Sending such a message usually
fails.

To ensure a transport can always make forward progress, cap the
number of RDMA segments allowed in chunk lists. This prevents
less-capable devices and memory registration modes from consuming
a large portion of the Send buffer, at the cost of reducing the
maximum data payload that can be conveyed with such devices.

For now I choose an arbitrary maximum of 8 RDMA segments. This
allows a maximum size RPC-over-RDMA header to fit nicely in the
current 1024 byte inline threshold with over 700 bytes remaining
for an inline RPC message.

The current maximum data payload of NFS READ or WRITE requests is
one megabyte. To convey that payload on a client with 4KB pages,
each chunk segment would need to handle 32 or more data pages. This
is well within the capabilities of FMR. For physical registration,
the maximum payload size on platforms with 4KB pages is reduced to
32KB.

For FRWR, a device's maximum page list depth would need to be at
least 34 to support the maximum 1MB payload. A device with a smaller
maximum page list depth means the maximum data payload is reduced
when using that device.
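
To make the arithmetic concrete, here is a rough sketch (not part of the
patch; it simply restates the comment added to xprt_rdma.h below, which
assumes a 24-byte read segment and about 24 bytes of fixed header fields):

    Read list:  (8 + 2 head/tail) segments * 24 bytes + 4  = 244 bytes
    Fixed header fields:                                   ~ 24 bytes
    Worst-case RPC-over-RDMA header:                       ~268 bytes
    Left over in a 1024-byte inline buffer:                ~756 bytes

And on the payload side, with 4KB pages and the 8-segment cap:

    FMR:       8 segments x 32+ pages x 4KB  = 1MB payload
    physical:  8 segments x  1 page  x 4KB   = 32KB payload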

Signed-off-by: Chuck Lever <[email protected]>
---
net/sunrpc/xprtrdma/fmr_ops.c | 2 +-
net/sunrpc/xprtrdma/frwr_ops.c | 2 +-
net/sunrpc/xprtrdma/physical_ops.c | 2 +-
net/sunrpc/xprtrdma/verbs.c | 22 ----------------------
net/sunrpc/xprtrdma/xprt_rdma.h | 21 ++++++++++++++++++++-
5 files changed, 23 insertions(+), 26 deletions(-)

diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c
index b289e10..4aeb104 100644
--- a/net/sunrpc/xprtrdma/fmr_ops.c
+++ b/net/sunrpc/xprtrdma/fmr_ops.c
@@ -48,7 +48,7 @@ static size_t
fmr_op_maxpages(struct rpcrdma_xprt *r_xprt)
{
return min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS,
- rpcrdma_max_segments(r_xprt) * RPCRDMA_MAX_FMR_SGES);
+ RPCRDMA_MAX_HDR_SEGS * RPCRDMA_MAX_FMR_SGES);
}

static int
diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
index c250924..2f37598 100644
--- a/net/sunrpc/xprtrdma/frwr_ops.c
+++ b/net/sunrpc/xprtrdma/frwr_ops.c
@@ -243,7 +243,7 @@ frwr_op_maxpages(struct rpcrdma_xprt *r_xprt)
struct rpcrdma_ia *ia = &r_xprt->rx_ia;

return min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS,
- rpcrdma_max_segments(r_xprt) * ia->ri_max_frmr_depth);
+ RPCRDMA_MAX_HDR_SEGS * ia->ri_max_frmr_depth);
}

static void
diff --git a/net/sunrpc/xprtrdma/physical_ops.c b/net/sunrpc/xprtrdma/physical_ops.c
index 481b9b6..e16ed54 100644
--- a/net/sunrpc/xprtrdma/physical_ops.c
+++ b/net/sunrpc/xprtrdma/physical_ops.c
@@ -47,7 +47,7 @@ static size_t
physical_op_maxpages(struct rpcrdma_xprt *r_xprt)
{
return min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS,
- rpcrdma_max_segments(r_xprt));
+ RPCRDMA_MAX_HDR_SEGS);
}

static int
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index f5ed9f9..9f8d6c1 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -1271,25 +1271,3 @@ out_rc:
rpcrdma_recv_buffer_put(rep);
return rc;
}
-
-/* How many chunk list items fit within our inline buffers?
- */
-unsigned int
-rpcrdma_max_segments(struct rpcrdma_xprt *r_xprt)
-{
- struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data;
- int bytes, segments;
-
- bytes = min_t(unsigned int, cdata->inline_wsize, cdata->inline_rsize);
- bytes -= RPCRDMA_HDRLEN_MIN;
- if (bytes < sizeof(struct rpcrdma_segment) * 2) {
- pr_warn("RPC: %s: inline threshold too small\n",
- __func__);
- return 0;
- }
-
- segments = 1 << (fls(bytes / sizeof(struct rpcrdma_segment)) - 1);
- dprintk("RPC: %s: max chunk list size = %d segments\n",
- __func__, segments);
- return segments;
-}
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index 7723e5f..0028748 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -144,6 +144,26 @@ rdmab_to_msg(struct rpcrdma_regbuf *rb)

#define RPCRDMA_DEF_GFP (GFP_NOIO | __GFP_NOWARN)

+/* To ensure a transport can always make forward progress,
+ * the number of RDMA segments allowed in header chunk lists
+ * is capped at 8. This prevents less-capable devices and
+ * memory registrations from overrunning the Send buffer
+ * while building chunk lists.
+ *
+ * Elements of the Read list take up more room than the
+ * Write list or Reply chunk. 8 read segments means the Read
+ * list (or Write list or Reply chunk) cannot consume more
+ * than
+ *
+ * ((8 + 2) * read segment size) + 1 XDR words, or 244 bytes.
+ *
+ * And the fixed part of the header is another 24 bytes.
+ *
+ * The smallest inline threshold is 1024 bytes, ensuring that
+ * at least 750 bytes are available for RPC messages.
+ */
+#define RPCRDMA_MAX_HDR_SEGS (8)
+
/*
* struct rpcrdma_rep -- this structure encapsulates state required to recv
* and complete a reply, asychronously. It needs several pieces of
@@ -456,7 +476,6 @@ struct rpcrdma_regbuf *rpcrdma_alloc_regbuf(struct rpcrdma_ia *,
void rpcrdma_free_regbuf(struct rpcrdma_ia *,
struct rpcrdma_regbuf *);

-unsigned int rpcrdma_max_segments(struct rpcrdma_xprt *);
int rpcrdma_ep_post_extra_recv(struct rpcrdma_xprt *, unsigned int);

int frwr_alloc_recovery_wq(void);


2016-04-11 20:10:44

by Chuck Lever

Subject: [PATCH v1 04/18] xprtrdma: Prevent inline overflow

When deciding whether to send a Call inline, rpcrdma_marshal_req
doesn't take into account header bytes consumed by chunk lists.
This results in Call messages on the wire that are sometimes larger
than the inline threshold.

Likewise, when a Write list or Reply chunk is in play, the server
has to emit an RDMA Send that includes a larger-than-minimal
RPC-over-RDMA header.

The actual size of a Call message cannot be estimated until after
the chunk lists have been registered. Thus the size of each
RPC-over-RDMA header can be estimated only after chunks are
registered; but the decision to register chunks is based on the size
of that header. Chicken, meet egg.

The best a client can do is estimate header size based on the
largest header that might occur, and then ensure that inline content
is always smaller than that.
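
As a rough worked example (not part of the patch; numbers assume the
default 1024-byte inline threshold and a device whose worst-case Call
header is about 268 bytes, per the comment added in the previous patch):

    ri_max_inline_write ~= 1024 - 268 = ~756 bytes

On such a device an RPC Call with an 800-byte send buffer used to pass
the old check (RPCRDMA_HDRLEN_MIN + 800 <= 1024) and go out inline, even
though the marshaled header could grow well past the minimum; with this
patch the same Call is moved via a Read chunk instead.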

Signed-off-by: Chuck Lever <[email protected]>
---
net/sunrpc/xprtrdma/fmr_ops.c | 3 +
net/sunrpc/xprtrdma/frwr_ops.c | 3 +
net/sunrpc/xprtrdma/physical_ops.c | 5 ++
net/sunrpc/xprtrdma/rpc_rdma.c | 85 ++++++++++++++++++++++++++++++++----
net/sunrpc/xprtrdma/xprt_rdma.h | 5 ++
5 files changed, 90 insertions(+), 11 deletions(-)

diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c
index 4aeb104..009694b 100644
--- a/net/sunrpc/xprtrdma/fmr_ops.c
+++ b/net/sunrpc/xprtrdma/fmr_ops.c
@@ -39,6 +39,9 @@ static int
fmr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep,
struct rpcrdma_create_data_internal *cdata)
{
+ rpcrdma_set_max_header_sizes(ia, cdata, max_t(unsigned int, 1,
+ RPCRDMA_MAX_DATA_SEGS /
+ RPCRDMA_MAX_FMR_SGES));
return 0;
}

diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
index 2f37598..41e02e7 100644
--- a/net/sunrpc/xprtrdma/frwr_ops.c
+++ b/net/sunrpc/xprtrdma/frwr_ops.c
@@ -231,6 +231,9 @@ frwr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep,
depth;
}

+ rpcrdma_set_max_header_sizes(ia, cdata, max_t(unsigned int, 1,
+ RPCRDMA_MAX_DATA_SEGS /
+ ia->ri_max_frmr_depth));
return 0;
}

diff --git a/net/sunrpc/xprtrdma/physical_ops.c b/net/sunrpc/xprtrdma/physical_ops.c
index e16ed54..2dc6ec2 100644
--- a/net/sunrpc/xprtrdma/physical_ops.c
+++ b/net/sunrpc/xprtrdma/physical_ops.c
@@ -36,8 +36,11 @@ physical_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep,
__func__, PTR_ERR(mr));
return -ENOMEM;
}
-
ia->ri_dma_mr = mr;
+
+ rpcrdma_set_max_header_sizes(ia, cdata, min_t(unsigned int,
+ RPCRDMA_MAX_DATA_SEGS,
+ RPCRDMA_MAX_HDR_SEGS));
return 0;
}

diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index 888823b..205b81b 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -61,7 +61,6 @@ enum rpcrdma_chunktype {
rpcrdma_replych
};

-#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
static const char transfertypes[][12] = {
"pure inline", /* no chunks */
" read chunk", /* some argument via rdma read */
@@ -69,18 +68,72 @@ static const char transfertypes[][12] = {
"write chunk", /* some result via rdma write */
"reply chunk" /* entire reply via rdma write */
};
-#endif
+
+/* Returns size of largest RPC-over-RDMA header in a Call message
+ *
+ * The client marshals only one chunk list per Call message.
+ * The largest list is the Read list.
+ */
+static unsigned int rpcrdma_max_call_header_size(unsigned int maxsegs)
+{
+ unsigned int size;
+
+ /* Fixed header fields and list discriminators */
+ size = RPCRDMA_HDRLEN_MIN;
+
+ /* Maximum Read list size */
+ maxsegs += 2; /* segment for head and tail buffers */
+ size = maxsegs * sizeof(struct rpcrdma_read_chunk);
+
+ dprintk("RPC: %s: max call header size = %u\n",
+ __func__, size);
+ return size;
+}
+
+/* Returns size of largest RPC-over-RDMA header in a Reply message
+ *
+ * There is only one Write list or one Reply chunk per Reply
+ * message. The larger list is the Write list.
+ */
+static unsigned int rpcrdma_max_reply_header_size(unsigned int maxsegs)
+{
+ unsigned int size;
+
+ /* Fixed header fields and list discriminators */
+ size = RPCRDMA_HDRLEN_MIN;
+
+ /* Maximum Write list size */
+ maxsegs += 2; /* segment for head and tail buffers */
+ size = sizeof(__be32); /* segment count */
+ size += maxsegs * sizeof(struct rpcrdma_segment);
+ size += sizeof(__be32); /* list discriminator */
+
+ dprintk("RPC: %s: max reply header size = %u\n",
+ __func__, size);
+ return size;
+}
+
+void rpcrdma_set_max_header_sizes(struct rpcrdma_ia *ia,
+ struct rpcrdma_create_data_internal *cdata,
+ unsigned int maxsegs)
+{
+ ia->ri_max_inline_write = cdata->inline_wsize -
+ rpcrdma_max_call_header_size(maxsegs);
+ ia->ri_max_inline_read = cdata->inline_rsize -
+ rpcrdma_max_reply_header_size(maxsegs);
+}

/* The client can send a request inline as long as the RPCRDMA header
* plus the RPC call fit under the transport's inline limit. If the
* combined call message size exceeds that limit, the client must use
* the read chunk list for this operation.
*/
-static bool rpcrdma_args_inline(struct rpc_rqst *rqst)
+static bool rpcrdma_args_inline(struct rpcrdma_xprt *r_xprt,
+ struct rpc_rqst *rqst)
{
- unsigned int callsize = RPCRDMA_HDRLEN_MIN + rqst->rq_snd_buf.len;
+ struct rpcrdma_ia *ia = &r_xprt->rx_ia;

- return callsize <= RPCRDMA_INLINE_WRITE_THRESHOLD(rqst);
+ return rqst->rq_snd_buf.len <= ia->ri_max_inline_write;
}

/* The client can't know how large the actual reply will be. Thus it
@@ -89,11 +142,12 @@ static bool rpcrdma_args_inline(struct rpc_rqst *rqst)
* limit, the client must provide a write list or a reply chunk for
* this request.
*/
-static bool rpcrdma_results_inline(struct rpc_rqst *rqst)
+static bool rpcrdma_results_inline(struct rpcrdma_xprt *r_xprt,
+ struct rpc_rqst *rqst)
{
- unsigned int repsize = RPCRDMA_HDRLEN_MIN + rqst->rq_rcv_buf.buflen;
+ struct rpcrdma_ia *ia = &r_xprt->rx_ia;

- return repsize <= RPCRDMA_INLINE_READ_THRESHOLD(rqst);
+ return rqst->rq_rcv_buf.buflen <= ia->ri_max_inline_read;
}

static int
@@ -492,7 +546,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
*/
if (rqst->rq_rcv_buf.flags & XDRBUF_READ)
wtype = rpcrdma_writech;
- else if (rpcrdma_results_inline(rqst))
+ else if (rpcrdma_results_inline(r_xprt, rqst))
wtype = rpcrdma_noch;
else
wtype = rpcrdma_replych;
@@ -511,7 +565,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
* that both has a data payload, and whose non-data arguments
* by themselves are larger than the inline threshold.
*/
- if (rpcrdma_args_inline(rqst)) {
+ if (rpcrdma_args_inline(r_xprt, rqst)) {
rtype = rpcrdma_noch;
} else if (rqst->rq_snd_buf.flags & XDRBUF_WRITE) {
rtype = rpcrdma_readch;
@@ -561,6 +615,9 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
if (hdrlen < 0)
return hdrlen;

+ if (hdrlen + rpclen > RPCRDMA_INLINE_WRITE_THRESHOLD(rqst))
+ goto out_overflow;
+
dprintk("RPC: %s: %s: hdrlen %zd rpclen %zd"
" headerp 0x%p base 0x%p lkey 0x%x\n",
__func__, transfertypes[wtype], hdrlen, rpclen,
@@ -587,6 +644,14 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)

req->rl_niovs = 2;
return 0;
+
+out_overflow:
+ pr_err("rpcrdma: send overflow: hdrlen %zd rpclen %zu %s\n",
+ hdrlen, rpclen, transfertypes[wtype]);
+ /* Terminate this RPC. Chunks registered above will be
+ * released by xprt_release -> xprt_rmda_free .
+ */
+ return -EIO;
}

/*
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index 0028748..4349e03 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -73,6 +73,8 @@ struct rpcrdma_ia {
struct completion ri_done;
int ri_async_rc;
unsigned int ri_max_frmr_depth;
+ unsigned int ri_max_inline_write;
+ unsigned int ri_max_inline_read;
struct ib_qp_attr ri_qp_attr;
struct ib_qp_init_attr ri_qp_init_attr;
};
@@ -538,6 +540,9 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *);
* RPC/RDMA protocol calls - xprtrdma/rpc_rdma.c
*/
int rpcrdma_marshal_req(struct rpc_rqst *);
+void rpcrdma_set_max_header_sizes(struct rpcrdma_ia *,
+ struct rpcrdma_create_data_internal *,
+ unsigned int);

/* RPC/RDMA module init - xprtrdma/transport.c
*/


2016-04-11 20:10:52

by Chuck Lever

Subject: [PATCH v1 05/18] xprtrdma: Avoid using Write list for small NFS READ requests

Avoid the latency and interrupt overhead of registering a Write
chunk when handling NFS READ requests of a few hundred bytes or
less.

This change does not interoperate with Linux NFS/RDMA servers
that do not have commit 9d11b51ce7c1 ('svcrdma: Fix send_reply()
scatter/gather set-up').

$ git describe 9d11b51ce7c1
v4.2-rc3-10-g9d11b51

Signed-off-by: Chuck Lever <[email protected]>
---
net/sunrpc/xprtrdma/rpc_rdma.c | 9 +++++----
1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index 205b81b..0105e65 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -539,15 +539,16 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
/*
* Chunks needed for results?
*
- * o Read ops return data as write chunk(s), header as inline.
* o If the expected result is under the inline threshold, all ops
* return as inline.
+ * o Large read ops return data as write chunk(s), header as
+ * inline.
* o Large non-read ops return as a single reply chunk.
*/
- if (rqst->rq_rcv_buf.flags & XDRBUF_READ)
- wtype = rpcrdma_writech;
- else if (rpcrdma_results_inline(r_xprt, rqst))
+ if (rpcrdma_results_inline(r_xprt, rqst))
wtype = rpcrdma_noch;
+ else if (rqst->rq_rcv_buf.flags & XDRBUF_READ)
+ wtype = rpcrdma_writech;
else
wtype = rpcrdma_replych;



2016-04-11 20:11:01

by Chuck Lever

Subject: [PATCH v1 06/18] xprtrdma: Update comments in rpcrdma_marshal_req()

Update documenting comments to reflect code changes over the past
year.

Signed-off-by: Chuck Lever <[email protected]>
---
net/sunrpc/xprtrdma/rpc_rdma.c | 18 ++++--------------
1 file changed, 4 insertions(+), 14 deletions(-)

diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index 0105e65..c7c9bbb 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -494,13 +494,10 @@ static void rpcrdma_inline_pullup(struct rpc_rqst *rqst)
* Marshal a request: the primary job of this routine is to choose
* the transfer modes. See comments below.
*
- * Uses multiple RDMA IOVs for a request:
- * [0] -- RPC RDMA header, which uses memory from the *start* of the
- * preregistered buffer that already holds the RPC data in
- * its middle.
- * [1] -- the RPC header/data, marshaled by RPC and the NFS protocol.
- * [2] -- optional padding.
- * [3] -- if padded, header only in [1] and data here.
+ * Prepares up to two IOVs per Call message:
+ *
+ * [0] -- RPC RDMA header
+ * [1] -- the RPC header/data
*
* Returns zero on success, otherwise a negative errno.
*/
@@ -624,13 +621,6 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
__func__, transfertypes[wtype], hdrlen, rpclen,
headerp, base, rdmab_lkey(req->rl_rdmabuf));

- /*
- * initialize send_iov's - normally only two: rdma chunk header and
- * single preregistered RPC header buffer, but if padding is present,
- * then use a preregistered (and zeroed) pad buffer between the RPC
- * header and any write data. In all non-rdma cases, any following
- * data has been copied into the RPC header buffer.
- */
req->rl_send_iov[0].addr = rdmab_addr(req->rl_rdmabuf);
req->rl_send_iov[0].length = hdrlen;
req->rl_send_iov[0].lkey = rdmab_lkey(req->rl_rdmabuf);


2016-04-11 20:11:09

by Chuck Lever

Subject: [PATCH v1 07/18] xprtrdma: Allow Read list and Reply chunk simultaneously

rpcrdma_marshal_req() makes a simplifying assumption: that NFS
operations with large Call messages have small Reply messages, and
vice versa. Therefore with RPC-over-RDMA, only one chunk type is
ever needed for each Call/Reply pair: if one direction needs
chunks, the other direction always fits inline.

In fact, this assumption is asserted in the code:

if (rtype != rpcrdma_noch && wtype != rpcrdma_noch) {
dprintk("RPC: %s: cannot marshal multiple chunk lists\n",
__func__);
return -EIO;
}

But RPCSEC_GSS breaks this assumption. Because krb5i and krb5p
perform data transformation on RPC messages before they are
transmitted, direct data placement techniques cannot be used;
RPC messages must be sent via a Long Call in both directions.
All such Calls are sent with a Position Zero Read chunk, and all
such replies are handled with a Reply chunk. The client must
therefore provide every Call/Reply pair with both a Read list and
a Reply chunk.

Without any special security in effect, NFSv4 WRITEs may now also
use the Read list and provide a Reply chunk. The marshal_req
logic was preventing that, meaning an NFSv4 WRITE with a large
payload that included a GETATTR result larger than the inline
threshold would fail.

The code that encodes each chunk list is now completely contained in
its own function. There is some code duplication, but the trade-off
is that the overall logic should be more clear.

Note that all three chunk lists now share the rl_segments array.
Some additional per-req accounting is necessary to track this
usage. For the same reasons that the above simplifying assumption
has held true for so long, I don't expect more array elements are
needed at this time.
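
As an illustrative sketch (not part of the patch), a krb5i or krb5p Call
marshaled by the new encoders would carry chunk lists laid out roughly as
follows, using the HLOO shorthand from the comments below:

    Read list:    1 - P(=0) HLOO - 0      (one Position Zero Read chunk)
    Write list:   0                       (not present)
    Reply chunk:  1 - N - HLOO ... HLOO   (entire reply via RDMA Write)

rpcrdma_encode_read_list(), rpcrdma_encode_write_list() and
rpcrdma_encode_reply_chunk() run in that order, each consuming entries
from the shared rl_segments array and advancing rl_nextseg.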

Signed-off-by: Chuck Lever <[email protected]>
---
net/sunrpc/xprtrdma/rpc_rdma.c | 327 ++++++++++++++++++++++++++++++++-------
net/sunrpc/xprtrdma/xprt_rdma.h | 1
2 files changed, 269 insertions(+), 59 deletions(-)

diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index c7c9bbb..e80f43d 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -62,17 +62,17 @@ enum rpcrdma_chunktype {
};

static const char transfertypes[][12] = {
- "pure inline", /* no chunks */
- " read chunk", /* some argument via rdma read */
- "*read chunk", /* entire request via rdma read */
- "write chunk", /* some result via rdma write */
+ "inline", /* no chunks */
+ "read list", /* some argument via rdma read */
+ "*read list", /* entire request via rdma read */
+ "write list", /* some result via rdma write */
"reply chunk" /* entire reply via rdma write */
};

/* Returns size of largest RPC-over-RDMA header in a Call message
*
- * The client marshals only one chunk list per Call message.
- * The largest list is the Read list.
+ * The largest Call header contains a full-size Read list and a
+ * minimal Reply chunk.
*/
static unsigned int rpcrdma_max_call_header_size(unsigned int maxsegs)
{
@@ -85,6 +85,11 @@ static unsigned int rpcrdma_max_call_header_size(unsigned int maxsegs)
maxsegs += 2; /* segment for head and tail buffers */
size = maxsegs * sizeof(struct rpcrdma_read_chunk);

+ /* Minimal Read chunk size */
+ size += sizeof(__be32); /* segment count */
+ size += sizeof(struct rpcrdma_segment);
+ size += sizeof(__be32); /* list discriminator */
+
dprintk("RPC: %s: max call header size = %u\n",
__func__, size);
return size;
@@ -431,6 +436,209 @@ out:
return n;
}

+static inline __be32 *
+xdr_encode_rdma_segment(__be32 *iptr, struct rpcrdma_mr_seg *seg)
+{
+ *iptr++ = cpu_to_be32(seg->mr_rkey);
+ *iptr++ = cpu_to_be32(seg->mr_len);
+ return xdr_encode_hyper(iptr, seg->mr_base);
+}
+
+/* XDR-encode the Read list. Supports encoding a list of read
+ * segments that belong to a single read chunk.
+ *
+ * Encoding key for single-list chunks (HLOO = Handle32 Length32 Offset64):
+ *
+ * Read chunklist (a linked list):
+ * N elements, position P (same P for all chunks of same arg!):
+ * 1 - PHLOO - 1 - PHLOO - ... - 1 - PHLOO - 0
+ *
+ * Returns a pointer to the XDR word in the RDMA header following
+ * the end of the Read list, or an error pointer.
+ */
+static __be32 *
+rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt,
+ struct rpcrdma_req *req, struct rpc_rqst *rqst,
+ __be32 *iptr, enum rpcrdma_chunktype rtype)
+{
+ struct rpcrdma_mr_seg *seg = req->rl_nextseg;
+ unsigned int pos;
+ int n, nsegs;
+
+ if (rtype == rpcrdma_noch) {
+ *iptr++ = xdr_zero; /* item not present */
+ return iptr;
+ }
+
+ pos = rqst->rq_snd_buf.head[0].iov_len;
+ if (rtype == rpcrdma_areadch)
+ pos = 0;
+ nsegs = rpcrdma_convert_iovs(&rqst->rq_snd_buf, pos, rtype, seg,
+ RPCRDMA_MAX_SEGS - req->rl_nchunks);
+ if (nsegs < 0)
+ return ERR_PTR(nsegs);
+
+ do {
+ n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, false);
+ if (n <= 0)
+ return ERR_PTR(n);
+
+ *iptr++ = xdr_one; /* item present */
+
+ /* All read segments in this chunk
+ * have the same "position".
+ */
+ *iptr++ = cpu_to_be32(pos);
+ iptr = xdr_encode_rdma_segment(iptr, seg);
+
+ dprintk("RPC: %5u %s: read segment pos %u "
+ "%d@0x%016llx:0x%08x (%s)\n",
+ rqst->rq_task->tk_pid, __func__, pos,
+ seg->mr_len, (unsigned long long)seg->mr_base,
+ seg->mr_rkey, n < nsegs ? "more" : "last");
+
+ r_xprt->rx_stats.read_chunk_count++;
+ req->rl_nchunks++;
+ seg += n;
+ nsegs -= n;
+ } while (nsegs);
+ req->rl_nextseg = seg;
+
+ /* Finish Read list */
+ *iptr++ = xdr_zero; /* Next item not present */
+ return iptr;
+}
+
+/* XDR-encode the Write list. Supports encoding a list containing
+ * one array of plain segments that belong to a single write chunk.
+ *
+ * Encoding key for single-list chunks (HLOO = Handle32 Length32 Offset64):
+ *
+ * Write chunklist (a list of (one) counted array):
+ * N elements:
+ * 1 - N - HLOO - HLOO - ... - HLOO - 0
+ *
+ * Returns a pointer to the XDR word in the RDMA header following
+ * the end of the Write list, or an error pointer.
+ */
+static __be32 *
+rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
+ struct rpc_rqst *rqst, __be32 *iptr,
+ enum rpcrdma_chunktype wtype)
+{
+ struct rpcrdma_mr_seg *seg = req->rl_nextseg;
+ int n, nsegs, nchunks;
+ __be32 *segcount;
+
+ if (wtype != rpcrdma_writech) {
+ *iptr++ = xdr_zero; /* no Write list present */
+ return iptr;
+ }
+
+ nsegs = rpcrdma_convert_iovs(&rqst->rq_rcv_buf,
+ rqst->rq_rcv_buf.head[0].iov_len,
+ wtype, seg,
+ RPCRDMA_MAX_SEGS - req->rl_nchunks);
+ if (nsegs < 0)
+ return ERR_PTR(nsegs);
+
+ *iptr++ = xdr_one; /* Write list present */
+ segcount = iptr++; /* save location of segment count */
+
+ nchunks = 0;
+ do {
+ n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, true);
+ if (n <= 0)
+ return ERR_PTR(n);
+
+ iptr = xdr_encode_rdma_segment(iptr, seg);
+
+ dprintk("RPC: %5u %s: write segment "
+ "%d@0x016%llx:0x%08x (%s)\n",
+ rqst->rq_task->tk_pid, __func__,
+ seg->mr_len, (unsigned long long)seg->mr_base,
+ seg->mr_rkey, n < nsegs ? "more" : "last");
+
+ r_xprt->rx_stats.write_chunk_count++;
+ r_xprt->rx_stats.total_rdma_request += seg->mr_len;
+ req->rl_nchunks++;
+ nchunks++;
+ seg += n;
+ nsegs -= n;
+ } while (nsegs);
+ req->rl_nextseg = seg;
+
+ /* Update count of segments in this Write chunk */
+ *segcount = cpu_to_be32(nchunks);
+
+ /* Finish Write list */
+ *iptr++ = xdr_zero; /* Next item not present */
+ return iptr;
+}
+
+/* XDR-encode the Reply chunk. Supports encoding an array of plain
+ * segments that belong to a single write (reply) chunk.
+ *
+ * Encoding key for single-list chunks (HLOO = Handle32 Length32 Offset64):
+ *
+ * Reply chunk (a counted array):
+ * N elements:
+ * 1 - N - HLOO - HLOO - ... - HLOO
+ *
+ * Returns a pointer to the XDR word in the RDMA header following
+ * the end of the Reply chunk, or an error pointer.
+ */
+static __be32 *
+rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt,
+ struct rpcrdma_req *req, struct rpc_rqst *rqst,
+ __be32 *iptr, enum rpcrdma_chunktype wtype)
+{
+ struct rpcrdma_mr_seg *seg = req->rl_nextseg;
+ int n, nsegs, nchunks;
+ __be32 *segcount;
+
+ if (wtype != rpcrdma_replych) {
+ *iptr++ = xdr_zero; /* no Reply chunk present */
+ return iptr;
+ }
+
+ nsegs = rpcrdma_convert_iovs(&rqst->rq_rcv_buf, 0, wtype, seg,
+ RPCRDMA_MAX_SEGS - req->rl_nchunks);
+ if (nsegs < 0)
+ return ERR_PTR(nsegs);
+
+ *iptr++ = xdr_one; /* Reply chunk present */
+ segcount = iptr++; /* save location of segment count */
+
+ nchunks = 0;
+ do {
+ n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, true);
+ if (n <= 0)
+ return ERR_PTR(n);
+
+ iptr = xdr_encode_rdma_segment(iptr, seg);
+
+ dprintk("RPC: %5u %s: reply segment "
+ "%d@0x%016llx:0x%08x (%s)\n",
+ rqst->rq_task->tk_pid, __func__,
+ seg->mr_len, (unsigned long long)seg->mr_base,
+ seg->mr_rkey, n < nsegs ? "more" : "last");
+
+ r_xprt->rx_stats.reply_chunk_count++;
+ r_xprt->rx_stats.total_rdma_request += seg->mr_len;
+ req->rl_nchunks++;
+ nchunks++;
+ seg += n;
+ nsegs -= n;
+ } while (nsegs);
+ req->rl_nextseg = seg;
+
+ /* Update count of segments in the Reply chunk */
+ *segcount = cpu_to_be32(nchunks);
+
+ return iptr;
+}
+
/*
* Copy write data inline.
* This function is used for "small" requests. Data which is passed
@@ -508,24 +716,18 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
struct rpc_xprt *xprt = rqst->rq_xprt;
struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
- char *base;
- size_t rpclen;
- ssize_t hdrlen;
enum rpcrdma_chunktype rtype, wtype;
struct rpcrdma_msg *headerp;
+ unsigned int pos;
+ ssize_t hdrlen;
+ size_t rpclen;
+ __be32 *iptr;

#if defined(CONFIG_SUNRPC_BACKCHANNEL)
if (test_bit(RPC_BC_PA_IN_USE, &rqst->rq_bc_pa_state))
return rpcrdma_bc_marshal_reply(rqst);
#endif

- /*
- * rpclen gets amount of data in first buffer, which is the
- * pre-registered buffer.
- */
- base = rqst->rq_svec[0].iov_base;
- rpclen = rqst->rq_svec[0].iov_len;
-
headerp = rdmab_to_msg(req->rl_rdmabuf);
/* don't byte-swap XID, it's already done in request */
headerp->rm_xid = rqst->rq_xid;
@@ -565,8 +767,12 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
*/
if (rpcrdma_args_inline(r_xprt, rqst)) {
rtype = rpcrdma_noch;
+ rpcrdma_inline_pullup(rqst);
+ rpclen = rqst->rq_svec[0].iov_len;
} else if (rqst->rq_snd_buf.flags & XDRBUF_WRITE) {
rtype = rpcrdma_readch;
+ rpclen = rqst->rq_svec[0].iov_len;
+ rpclen += rpcrdma_tail_pullup(&rqst->rq_snd_buf);
} else {
r_xprt->rx_stats.nomsg_call_count++;
headerp->rm_type = htonl(RDMA_NOMSG);
@@ -574,52 +780,49 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
rpclen = 0;
}

- /* The following simplification is not true forever */
- if (rtype != rpcrdma_noch && wtype == rpcrdma_replych)
- wtype = rpcrdma_noch;
- if (rtype != rpcrdma_noch && wtype != rpcrdma_noch) {
- dprintk("RPC: %s: cannot marshal multiple chunk lists\n",
- __func__);
- return -EIO;
- }
-
- hdrlen = RPCRDMA_HDRLEN_MIN;
-
- /*
- * Pull up any extra send data into the preregistered buffer.
- * When padding is in use and applies to the transfer, insert
- * it and change the message type.
+ /* This implementation supports the following combinations
+ * of chunk lists in one RPC-over-RDMA Call message:
+ *
+ * - Read list
+ * - Write list
+ * - Reply chunk
+ * - Read list + Reply chunk
+ *
+ * It might not yet support the following combinations:
+ *
+ * - Read list + Write list
+ *
+ * It does not support the following combinations:
+ *
+ * - Write list + Reply chunk
+ * - Read list + Write list + Reply chunk
+ *
+ * This implementation supports only a single chunk in each
+ * Read or Write list. Thus for example the client cannot
+ * send a Call message with a Position Zero Read chunk and a
+ * regular Read chunk at the same time.
*/
- if (rtype == rpcrdma_noch) {
-
- rpcrdma_inline_pullup(rqst);
-
- headerp->rm_body.rm_nochunks.rm_empty[0] = xdr_zero;
- headerp->rm_body.rm_nochunks.rm_empty[1] = xdr_zero;
- headerp->rm_body.rm_nochunks.rm_empty[2] = xdr_zero;
- /* new length after pullup */
- rpclen = rqst->rq_svec[0].iov_len;
- } else if (rtype == rpcrdma_readch)
- rpclen += rpcrdma_tail_pullup(&rqst->rq_snd_buf);
- if (rtype != rpcrdma_noch) {
- hdrlen = rpcrdma_create_chunks(rqst, &rqst->rq_snd_buf,
- headerp, rtype);
- wtype = rtype; /* simplify dprintk */
-
- } else if (wtype != rpcrdma_noch) {
- hdrlen = rpcrdma_create_chunks(rqst, &rqst->rq_rcv_buf,
- headerp, wtype);
- }
- if (hdrlen < 0)
- return hdrlen;
+ req->rl_nchunks = 0;
+ req->rl_nextseg = req->rl_segments;
+ iptr = headerp->rm_body.rm_chunks;
+ iptr = rpcrdma_encode_read_list(r_xprt, req, rqst, iptr, rtype);
+ if (IS_ERR(iptr))
+ goto out_unmap;
+ iptr = rpcrdma_encode_write_list(r_xprt, req, rqst, iptr, wtype);
+ if (IS_ERR(iptr))
+ goto out_unmap;
+ iptr = rpcrdma_encode_reply_chunk(r_xprt, req, rqst, iptr, wtype);
+ if (IS_ERR(iptr))
+ goto out_unmap;
+ hdrlen = (unsigned char *)iptr - (unsigned char *)headerp;

if (hdrlen + rpclen > RPCRDMA_INLINE_WRITE_THRESHOLD(rqst))
goto out_overflow;

- dprintk("RPC: %s: %s: hdrlen %zd rpclen %zd"
- " headerp 0x%p base 0x%p lkey 0x%x\n",
- __func__, transfertypes[wtype], hdrlen, rpclen,
- headerp, base, rdmab_lkey(req->rl_rdmabuf));
+ dprintk("RPC: %5u %s: %s/%s: hdrlen %zd rpclen %zd\n",
+ rqst->rq_task->tk_pid, __func__,
+ transfertypes[rtype], transfertypes[wtype],
+ hdrlen, rpclen);

req->rl_send_iov[0].addr = rdmab_addr(req->rl_rdmabuf);
req->rl_send_iov[0].length = hdrlen;
@@ -637,12 +840,18 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
return 0;

out_overflow:
- pr_err("rpcrdma: send overflow: hdrlen %zd rpclen %zu %s\n",
- hdrlen, rpclen, transfertypes[wtype]);
+ pr_err("rpcrdma: send overflow: hdrlen %zd rpclen %zu %s/%s\n",
+ hdrlen, rpclen, transfertypes[rtype], transfertypes[wtype]);
/* Terminate this RPC. Chunks registered above will be
* released by xprt_release -> xprt_rmda_free .
*/
return -EIO;
+
+out_unmap:
+ for (pos = 0; req->rl_nchunks--;)
+ pos += r_xprt->rx_ia.ri_ops->ro_unmap(r_xprt,
+ &req->rl_segments[pos]);
+ return PTR_ERR(iptr);
}

/*
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index 4349e03..d44e6e3 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -298,6 +298,7 @@ struct rpcrdma_req {
struct rpcrdma_regbuf *rl_rdmabuf;
struct rpcrdma_regbuf *rl_sendbuf;
struct rpcrdma_mr_seg rl_segments[RPCRDMA_MAX_SEGS];
+ struct rpcrdma_mr_seg *rl_nextseg;

struct ib_cqe rl_cqe;
struct list_head rl_all;


2016-04-11 20:11:17

by Chuck Lever

Subject: [PATCH v1 08/18] xprtrdma: Remove rpcrdma_create_chunks()

rpcrdma_create_chunks() has been replaced, and can be removed.

Signed-off-by: Chuck Lever <[email protected]>
---
net/sunrpc/xprtrdma/rpc_rdma.c | 151 ----------------------------------------
1 file changed, 151 deletions(-)

diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index e80f43d..9ebaf79 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -285,157 +285,6 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos,
return n;
}

-/*
- * Create read/write chunk lists, and reply chunks, for RDMA
- *
- * Assume check against THRESHOLD has been done, and chunks are required.
- * Assume only encoding one list entry for read|write chunks. The NFSv3
- * protocol is simple enough to allow this as it only has a single "bulk
- * result" in each procedure - complicated NFSv4 COMPOUNDs are not. (The
- * RDMA/Sessions NFSv4 proposal addresses this for future v4 revs.)
- *
- * When used for a single reply chunk (which is a special write
- * chunk used for the entire reply, rather than just the data), it
- * is used primarily for READDIR and READLINK which would otherwise
- * be severely size-limited by a small rdma inline read max. The server
- * response will come back as an RDMA Write, followed by a message
- * of type RDMA_NOMSG carrying the xid and length. As a result, reply
- * chunks do not provide data alignment, however they do not require
- * "fixup" (moving the response to the upper layer buffer) either.
- *
- * Encoding key for single-list chunks (HLOO = Handle32 Length32 Offset64):
- *
- * Read chunklist (a linked list):
- * N elements, position P (same P for all chunks of same arg!):
- * 1 - PHLOO - 1 - PHLOO - ... - 1 - PHLOO - 0
- *
- * Write chunklist (a list of (one) counted array):
- * N elements:
- * 1 - N - HLOO - HLOO - ... - HLOO - 0
- *
- * Reply chunk (a counted array):
- * N elements:
- * 1 - N - HLOO - HLOO - ... - HLOO
- *
- * Returns positive RPC/RDMA header size, or negative errno.
- */
-
-static ssize_t
-rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target,
- struct rpcrdma_msg *headerp, enum rpcrdma_chunktype type)
-{
- struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
- struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt);
- int n, nsegs, nchunks = 0;
- unsigned int pos;
- struct rpcrdma_mr_seg *seg = req->rl_segments;
- struct rpcrdma_read_chunk *cur_rchunk = NULL;
- struct rpcrdma_write_array *warray = NULL;
- struct rpcrdma_write_chunk *cur_wchunk = NULL;
- __be32 *iptr = headerp->rm_body.rm_chunks;
- int (*map)(struct rpcrdma_xprt *, struct rpcrdma_mr_seg *, int, bool);
-
- if (type == rpcrdma_readch || type == rpcrdma_areadch) {
- /* a read chunk - server will RDMA Read our memory */
- cur_rchunk = (struct rpcrdma_read_chunk *) iptr;
- } else {
- /* a write or reply chunk - server will RDMA Write our memory */
- *iptr++ = xdr_zero; /* encode a NULL read chunk list */
- if (type == rpcrdma_replych)
- *iptr++ = xdr_zero; /* a NULL write chunk list */
- warray = (struct rpcrdma_write_array *) iptr;
- cur_wchunk = (struct rpcrdma_write_chunk *) (warray + 1);
- }
-
- if (type == rpcrdma_replych || type == rpcrdma_areadch)
- pos = 0;
- else
- pos = target->head[0].iov_len;
-
- nsegs = rpcrdma_convert_iovs(target, pos, type, seg, RPCRDMA_MAX_SEGS);
- if (nsegs < 0)
- return nsegs;
-
- map = r_xprt->rx_ia.ri_ops->ro_map;
- do {
- n = map(r_xprt, seg, nsegs, cur_wchunk != NULL);
- if (n <= 0)
- goto out;
- if (cur_rchunk) { /* read */
- cur_rchunk->rc_discrim = xdr_one;
- /* all read chunks have the same "position" */
- cur_rchunk->rc_position = cpu_to_be32(pos);
- cur_rchunk->rc_target.rs_handle =
- cpu_to_be32(seg->mr_rkey);
- cur_rchunk->rc_target.rs_length =
- cpu_to_be32(seg->mr_len);
- xdr_encode_hyper(
- (__be32 *)&cur_rchunk->rc_target.rs_offset,
- seg->mr_base);
- dprintk("RPC: %s: read chunk "
- "elem %d@0x%llx:0x%x pos %u (%s)\n", __func__,
- seg->mr_len, (unsigned long long)seg->mr_base,
- seg->mr_rkey, pos, n < nsegs ? "more" : "last");
- cur_rchunk++;
- r_xprt->rx_stats.read_chunk_count++;
- } else { /* write/reply */
- cur_wchunk->wc_target.rs_handle =
- cpu_to_be32(seg->mr_rkey);
- cur_wchunk->wc_target.rs_length =
- cpu_to_be32(seg->mr_len);
- xdr_encode_hyper(
- (__be32 *)&cur_wchunk->wc_target.rs_offset,
- seg->mr_base);
- dprintk("RPC: %s: %s chunk "
- "elem %d@0x%llx:0x%x (%s)\n", __func__,
- (type == rpcrdma_replych) ? "reply" : "write",
- seg->mr_len, (unsigned long long)seg->mr_base,
- seg->mr_rkey, n < nsegs ? "more" : "last");
- cur_wchunk++;
- if (type == rpcrdma_replych)
- r_xprt->rx_stats.reply_chunk_count++;
- else
- r_xprt->rx_stats.write_chunk_count++;
- r_xprt->rx_stats.total_rdma_request += seg->mr_len;
- }
- nchunks++;
- seg += n;
- nsegs -= n;
- } while (nsegs);
-
- /* success. all failures return above */
- req->rl_nchunks = nchunks;
-
- /*
- * finish off header. If write, marshal discrim and nchunks.
- */
- if (cur_rchunk) {
- iptr = (__be32 *) cur_rchunk;
- *iptr++ = xdr_zero; /* finish the read chunk list */
- *iptr++ = xdr_zero; /* encode a NULL write chunk list */
- *iptr++ = xdr_zero; /* encode a NULL reply chunk */
- } else {
- warray->wc_discrim = xdr_one;
- warray->wc_nchunks = cpu_to_be32(nchunks);
- iptr = (__be32 *) cur_wchunk;
- if (type == rpcrdma_writech) {
- *iptr++ = xdr_zero; /* finish the write chunk list */
- *iptr++ = xdr_zero; /* encode a NULL reply chunk */
- }
- }
-
- /*
- * Return header size.
- */
- return (unsigned char *)iptr - (unsigned char *)headerp;
-
-out:
- for (pos = 0; nchunks--;)
- pos += r_xprt->rx_ia.ri_ops->ro_unmap(r_xprt,
- &req->rl_segments[pos]);
- return n;
-}
-
static inline __be32 *
xdr_encode_rdma_segment(__be32 *iptr, struct rpcrdma_mr_seg *seg)
{


2016-04-11 20:11:28

by Chuck Lever

Subject: [PATCH v1 09/18] xprtrdma: Use core ib_drain_qp() API

Clean up: Replace rpcrdma_flush_cqs() and rpcrdma_clean_cq() with
the new ib_drain_qp() API.

Signed-off-by: Chuck Lever <[email protected]>
---
net/sunrpc/xprtrdma/verbs.c | 41 ++++++-----------------------------------
1 file changed, 6 insertions(+), 35 deletions(-)

diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 9f8d6c1..b7a5bc1 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -203,15 +203,6 @@ out_fail:
goto out_schedule;
}

-static void
-rpcrdma_flush_cqs(struct rpcrdma_ep *ep)
-{
- struct ib_wc wc;
-
- while (ib_poll_cq(ep->rep_attr.recv_cq, 1, &wc) > 0)
- rpcrdma_receive_wc(NULL, &wc);
-}
-
static int
rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
{
@@ -374,23 +365,6 @@ out:
}

/*
- * Drain any cq, prior to teardown.
- */
-static void
-rpcrdma_clean_cq(struct ib_cq *cq)
-{
- struct ib_wc wc;
- int count = 0;
-
- while (1 == ib_poll_cq(cq, 1, &wc))
- ++count;
-
- if (count)
- dprintk("RPC: %s: flushed %d events (last 0x%x)\n",
- __func__, count, wc.opcode);
-}
-
-/*
* Exported functions.
*/

@@ -515,7 +489,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
__func__);
return -ENOMEM;
}
- max_qp_wr = ia->ri_device->attrs.max_qp_wr - RPCRDMA_BACKWARD_WRS;
+ max_qp_wr = ia->ri_device->attrs.max_qp_wr - RPCRDMA_BACKWARD_WRS - 1;

/* check provider's send/recv wr limits */
if (cdata->max_requests > max_qp_wr)
@@ -526,11 +500,13 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
ep->rep_attr.srq = NULL;
ep->rep_attr.cap.max_send_wr = cdata->max_requests;
ep->rep_attr.cap.max_send_wr += RPCRDMA_BACKWARD_WRS;
+ ep->rep_attr.cap.max_send_wr += 1; /* drain cqe */
rc = ia->ri_ops->ro_open(ia, ep, cdata);
if (rc)
return rc;
ep->rep_attr.cap.max_recv_wr = cdata->max_requests;
ep->rep_attr.cap.max_recv_wr += RPCRDMA_BACKWARD_WRS;
+ ep->rep_attr.cap.max_recv_wr += 1; /* drain cqe */
ep->rep_attr.cap.max_send_sge = RPCRDMA_MAX_IOVS;
ep->rep_attr.cap.max_recv_sge = 1;
ep->rep_attr.cap.max_inline_data = 0;
@@ -622,13 +598,8 @@ rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)

cancel_delayed_work_sync(&ep->rep_connect_worker);

- if (ia->ri_id->qp)
- rpcrdma_ep_disconnect(ep, ia);
-
- rpcrdma_clean_cq(ep->rep_attr.recv_cq);
- rpcrdma_clean_cq(ep->rep_attr.send_cq);
-
if (ia->ri_id->qp) {
+ rpcrdma_ep_disconnect(ep, ia);
rdma_destroy_qp(ia->ri_id);
ia->ri_id->qp = NULL;
}
@@ -659,7 +630,6 @@ retry:
dprintk("RPC: %s: reconnecting...\n", __func__);

rpcrdma_ep_disconnect(ep, ia);
- rpcrdma_flush_cqs(ep);

xprt = container_of(ia, struct rpcrdma_xprt, rx_ia);
id = rpcrdma_create_id(xprt, ia,
@@ -785,7 +755,6 @@ rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
{
int rc;

- rpcrdma_flush_cqs(ep);
rc = rdma_disconnect(ia->ri_id);
if (!rc) {
/* returns without wait if not connected */
@@ -797,6 +766,8 @@ rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
dprintk("RPC: %s: rdma_disconnect %i\n", __func__, rc);
ep->rep_connected = rc;
}
+
+ ib_drain_qp(ia->ri_id->qp);
}

struct rpcrdma_req *


2016-04-11 20:11:34

by Chuck Lever

Subject: [PATCH v1 10/18] xprtrdma: Rename rpcrdma_frwr::sg and sg_nents

Clean up: Follow the same naming convention as the other fields in
struct rpcrdma_frwr.

Signed-off-by: Chuck Lever <[email protected]>
---
net/sunrpc/xprtrdma/frwr_ops.c | 36 ++++++++++++++++++------------------
net/sunrpc/xprtrdma/xprt_rdma.h | 4 ++--
2 files changed, 20 insertions(+), 20 deletions(-)

diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
index 41e02e7..7c22e9e 100644
--- a/net/sunrpc/xprtrdma/frwr_ops.c
+++ b/net/sunrpc/xprtrdma/frwr_ops.c
@@ -152,11 +152,11 @@ __frwr_init(struct rpcrdma_mw *r, struct ib_pd *pd, struct ib_device *device,
if (IS_ERR(f->fr_mr))
goto out_mr_err;

- f->sg = kcalloc(depth, sizeof(*f->sg), GFP_KERNEL);
- if (!f->sg)
+ f->fr_sg = kcalloc(depth, sizeof(*f->fr_sg), GFP_KERNEL);
+ if (!f->fr_sg)
goto out_list_err;

- sg_init_table(f->sg, depth);
+ sg_init_table(f->fr_sg, depth);

init_completion(&f->fr_linv_done);

@@ -185,7 +185,7 @@ __frwr_release(struct rpcrdma_mw *r)
if (rc)
dprintk("RPC: %s: ib_dereg_mr status %i\n",
__func__, rc);
- kfree(r->frmr.sg);
+ kfree(r->frmr.fr_sg);
}

static int
@@ -399,12 +399,12 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,

for (i = 0; i < nsegs;) {
if (seg->mr_page)
- sg_set_page(&frmr->sg[i],
+ sg_set_page(&frmr->fr_sg[i],
seg->mr_page,
seg->mr_len,
offset_in_page(seg->mr_offset));
else
- sg_set_buf(&frmr->sg[i], seg->mr_offset,
+ sg_set_buf(&frmr->fr_sg[i], seg->mr_offset,
seg->mr_len);

++seg;
@@ -415,25 +415,25 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
break;
}
- frmr->sg_nents = i;
+ frmr->fr_nents = i;

- dma_nents = ib_dma_map_sg(device, frmr->sg, frmr->sg_nents, direction);
+ dma_nents = ib_dma_map_sg(device, frmr->fr_sg, frmr->fr_nents, direction);
if (!dma_nents) {
pr_err("RPC: %s: failed to dma map sg %p sg_nents %u\n",
- __func__, frmr->sg, frmr->sg_nents);
+ __func__, frmr->fr_sg, frmr->fr_nents);
return -ENOMEM;
}

- n = ib_map_mr_sg(mr, frmr->sg, frmr->sg_nents, PAGE_SIZE);
- if (unlikely(n != frmr->sg_nents)) {
+ n = ib_map_mr_sg(mr, frmr->fr_sg, frmr->fr_nents, PAGE_SIZE);
+ if (unlikely(n != frmr->fr_nents)) {
pr_err("RPC: %s: failed to map mr %p (%u/%u)\n",
- __func__, frmr->fr_mr, n, frmr->sg_nents);
+ __func__, frmr->fr_mr, n, frmr->fr_nents);
rc = n < 0 ? n : -EINVAL;
goto out_senderr;
}

dprintk("RPC: %s: Using frmr %p to map %u segments (%u bytes)\n",
- __func__, mw, frmr->sg_nents, mr->length);
+ __func__, mw, frmr->fr_nents, mr->length);

key = (u8)(mr->rkey & 0x000000FF);
ib_update_fast_reg_key(mr, ++key);
@@ -459,14 +459,14 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
seg1->rl_mw = mw;
seg1->mr_rkey = mr->rkey;
seg1->mr_base = mr->iova;
- seg1->mr_nsegs = frmr->sg_nents;
+ seg1->mr_nsegs = frmr->fr_nents;
seg1->mr_len = mr->length;

- return frmr->sg_nents;
+ return frmr->fr_nents;

out_senderr:
dprintk("RPC: %s: ib_post_send status %i\n", __func__, rc);
- ib_dma_unmap_sg(device, frmr->sg, dma_nents, direction);
+ ib_dma_unmap_sg(device, frmr->fr_sg, dma_nents, direction);
__frwr_queue_recovery(mw);
return rc;
}
@@ -500,7 +500,7 @@ __frwr_dma_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,

seg->rl_mw = NULL;

- ib_dma_unmap_sg(device, f->sg, f->sg_nents, seg->mr_dir);
+ ib_dma_unmap_sg(device, f->fr_sg, f->fr_nents, seg->mr_dir);

if (!rc)
rpcrdma_put_mw(r_xprt, mw);
@@ -611,7 +611,7 @@ frwr_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg)
invalidate_wr->ex.invalidate_rkey = frmr->fr_mr->rkey;
DECR_CQCOUNT(&r_xprt->rx_ep);

- ib_dma_unmap_sg(ia->ri_device, frmr->sg, frmr->sg_nents, seg1->mr_dir);
+ ib_dma_unmap_sg(ia->ri_device, frmr->fr_sg, frmr->fr_nents, seg1->mr_dir);
read_lock(&ia->ri_qplock);
rc = ib_post_send(ia->ri_id->qp, invalidate_wr, &bad_wr);
read_unlock(&ia->ri_qplock);
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index d44e6e3..abfce80 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -220,8 +220,8 @@ enum rpcrdma_frmr_state {
};

struct rpcrdma_frmr {
- struct scatterlist *sg;
- int sg_nents;
+ struct scatterlist *fr_sg;
+ int fr_nents;
struct ib_mr *fr_mr;
struct ib_cqe fr_cqe;
enum rpcrdma_frmr_state fr_state;


2016-04-11 20:11:42

by Chuck Lever

[permalink] [raw]
Subject: [PATCH v1 11/18] xprtrdma: Save I/O direction in struct rpcrdma_frwr

Move the I/O direction field from rpcrdma_mr_seg into the
rpcrdma_frmr. This makes it possible to perform the DMA unmapping
long after the rpcrdma_mr_seg has been released and re-used.

Signed-off-by: Chuck Lever <[email protected]>
---
net/sunrpc/xprtrdma/frwr_ops.c | 6 +++---
net/sunrpc/xprtrdma/xprt_rdma.h | 1 +
2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
index 7c22e9e..e1e6ac1 100644
--- a/net/sunrpc/xprtrdma/frwr_ops.c
+++ b/net/sunrpc/xprtrdma/frwr_ops.c
@@ -416,6 +416,7 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
break;
}
frmr->fr_nents = i;
+ frmr->fr_dir = direction;

dma_nents = ib_dma_map_sg(device, frmr->fr_sg, frmr->fr_nents, direction);
if (!dma_nents) {
@@ -455,7 +456,6 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
if (rc)
goto out_senderr;

- seg1->mr_dir = direction;
seg1->rl_mw = mw;
seg1->mr_rkey = mr->rkey;
seg1->mr_base = mr->iova;
@@ -500,7 +500,7 @@ __frwr_dma_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,

seg->rl_mw = NULL;

- ib_dma_unmap_sg(device, f->fr_sg, f->fr_nents, seg->mr_dir);
+ ib_dma_unmap_sg(device, f->fr_sg, f->fr_nents, f->fr_dir);

if (!rc)
rpcrdma_put_mw(r_xprt, mw);
@@ -611,7 +611,7 @@ frwr_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg)
invalidate_wr->ex.invalidate_rkey = frmr->fr_mr->rkey;
DECR_CQCOUNT(&r_xprt->rx_ep);

- ib_dma_unmap_sg(ia->ri_device, frmr->fr_sg, frmr->fr_nents, seg1->mr_dir);
+ ib_dma_unmap_sg(ia->ri_device, frmr->fr_sg, frmr->fr_nents, frmr->fr_dir);
read_lock(&ia->ri_qplock);
rc = ib_post_send(ia->ri_id->qp, invalidate_wr, &bad_wr);
read_unlock(&ia->ri_qplock);
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index abfce80..0b3aa54 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -222,6 +222,7 @@ enum rpcrdma_frmr_state {
struct rpcrdma_frmr {
struct scatterlist *fr_sg;
int fr_nents;
+ enum dma_data_direction fr_dir;
struct ib_mr *fr_mr;
struct ib_cqe fr_cqe;
enum rpcrdma_frmr_state fr_state;


2016-04-11 20:11:51

by Chuck Lever

[permalink] [raw]
Subject: [PATCH v1 12/18] xprtrdma: Reset MRs in frwr_op_unmap_sync()

frwr_op_unmap_sync() is now invoked in a workqueue context, the same
as __frwr_queue_recovery(). There's no need to defer MR reset if
posting LOCAL_INV WRs fails.

This means that even when ib_post_send() fails (which should occur
very rarely) the invalidation and DMA unmapping steps are still done
in the correct order.
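
As an aside for reviewers, the ordering this patch enforces can be
sketched as a standalone illustration (every helper below is a
stand-in, not the kernel code; the real logic is in the diff that
follows):

	/* Sketch only: invalidate or reset each MR first, then DMA
	 * unmap, then return the MW to the free list.
	 */
	#include <stdio.h>
	#include <stdbool.h>

	static bool post_local_inv_list(void)      { return true; }
	static void wait_for_linv_completion(void) { }
	static void reset_mr(int i)  { printf("reset MR %d\n", i); }
	static void dma_unmap(int i) { printf("DMA unmap chunk %d\n", i); }
	static void put_mw(int i)    { printf("release MW %d\n", i); }

	int main(void)
	{
		int nchunks = 3;
		int i;

		if (post_local_inv_list()) {
			/* rkeys are invalid once the LOCAL_INV WRs complete */
			wait_for_linv_completion();
		} else {
			/* Posting failed: reset the MRs in place instead,
			 * so the old rkeys still cannot be used ...
			 */
			for (i = 0; i < nchunks; i++)
				reset_mr(i);
		}

		/* ... and only then unmap and release each chunk */
		for (i = 0; i < nchunks; i++) {
			dma_unmap(i);
			put_mw(i);
		}
		return 0;
	}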

Signed-off-by: Chuck Lever <[email protected]>
---
net/sunrpc/xprtrdma/frwr_ops.c | 98 ++++++++++++++++++++++++----------------
1 file changed, 60 insertions(+), 38 deletions(-)

diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
index e1e6ac1..ce245dc 100644
--- a/net/sunrpc/xprtrdma/frwr_ops.c
+++ b/net/sunrpc/xprtrdma/frwr_ops.c
@@ -98,6 +98,32 @@ frwr_destroy_recovery_wq(void)
destroy_workqueue(wq);
}

+static int
+__frwr_reset_mr(struct rpcrdma_ia *ia, struct rpcrdma_mw *r)
+{
+ struct rpcrdma_frmr *f = &r->frmr;
+ int rc;
+
+ rc = ib_dereg_mr(f->fr_mr);
+ if (rc) {
+ pr_warn("rpcrdma: ib_dereg_mr status %d, frwr %p orphaned\n",
+ rc, r);
+ return rc;
+ }
+
+ f->fr_mr = ib_alloc_mr(ia->ri_pd, IB_MR_TYPE_MEM_REG,
+ ia->ri_max_frmr_depth);
+ if (IS_ERR(f->fr_mr)) {
+ pr_warn("rpcrdma: ib_alloc_mr status %ld, frwr %p orphaned\n",
+ PTR_ERR(f->fr_mr), r);
+ return PTR_ERR(f->fr_mr);
+ }
+
+ dprintk("RPC: %s: recovered FRMR %p\n", __func__, r);
+ f->fr_state = FRMR_IS_INVALID;
+ return 0;
+}
+
/* Deferred reset of a single FRMR. Generate a fresh rkey by
* replacing the MR.
*
@@ -111,24 +137,15 @@ __frwr_recovery_worker(struct work_struct *work)
struct rpcrdma_mw *r = container_of(work, struct rpcrdma_mw,
frmr.fr_work);
struct rpcrdma_xprt *r_xprt = r->frmr.fr_xprt;
- unsigned int depth = r_xprt->rx_ia.ri_max_frmr_depth;
- struct ib_pd *pd = r_xprt->rx_ia.ri_pd;
-
- if (ib_dereg_mr(r->frmr.fr_mr))
- goto out_fail;
+ struct rpcrdma_ia *ia = &r_xprt->rx_ia;
+ int rc;

- r->frmr.fr_mr = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, depth);
- if (IS_ERR(r->frmr.fr_mr))
- goto out_fail;
+ rc = __frwr_reset_mr(ia, r);
+ if (rc)
+ return;

- dprintk("RPC: %s: recovered FRMR %p\n", __func__, r);
- r->frmr.fr_state = FRMR_IS_INVALID;
rpcrdma_put_mw(r_xprt, r);
return;
-
-out_fail:
- pr_warn("RPC: %s: FRMR %p unrecovered\n",
- __func__, r);
}

/* A broken MR was discovered in a context that can't sleep.
@@ -490,24 +507,6 @@ __frwr_prepare_linv_wr(struct rpcrdma_mr_seg *seg)
return invalidate_wr;
}

-static void
-__frwr_dma_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
- int rc)
-{
- struct ib_device *device = r_xprt->rx_ia.ri_device;
- struct rpcrdma_mw *mw = seg->rl_mw;
- struct rpcrdma_frmr *f = &mw->frmr;
-
- seg->rl_mw = NULL;
-
- ib_dma_unmap_sg(device, f->fr_sg, f->fr_nents, f->fr_dir);
-
- if (!rc)
- rpcrdma_put_mw(r_xprt, mw);
- else
- __frwr_queue_recovery(mw);
-}
-
/* Invalidate all memory regions that were registered for "req".
*
* Sleeps until it is safe for the host CPU to access the
@@ -521,6 +520,7 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
struct rpcrdma_mr_seg *seg;
unsigned int i, nchunks;
struct rpcrdma_frmr *f;
+ struct rpcrdma_mw *mw;
int rc;

dprintk("RPC: %s: req %p\n", __func__, req);
@@ -561,11 +561,8 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
* unless ri_id->qp is a valid pointer.
*/
rc = ib_post_send(ia->ri_id->qp, invalidate_wrs, &bad_wr);
- if (rc) {
- pr_warn("%s: ib_post_send failed %i\n", __func__, rc);
- rdma_disconnect(ia->ri_id);
- goto unmap;
- }
+ if (rc)
+ goto reset_mrs;

wait_for_completion(&f->fr_linv_done);

@@ -575,14 +572,39 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
unmap:
for (i = 0, nchunks = req->rl_nchunks; nchunks; nchunks--) {
seg = &req->rl_segments[i];
+ mw = seg->rl_mw;
+ seg->rl_mw = NULL;

- __frwr_dma_unmap(r_xprt, seg, rc);
+ ib_dma_unmap_sg(ia->ri_device, f->fr_sg, f->fr_nents,
+ f->fr_dir);
+ rpcrdma_put_mw(r_xprt, mw);

i += seg->mr_nsegs;
seg->mr_nsegs = 0;
}

req->rl_nchunks = 0;
+ return;
+
+reset_mrs:
+ pr_warn("%s: ib_post_send failed %i\n", __func__, rc);
+
+ /* Find and reset the MRs in the LOCAL_INV WRs that did not
+ * get posted. This is synchronous, and slow.
+ */
+ for (i = 0, nchunks = req->rl_nchunks; nchunks; nchunks--) {
+ seg = &req->rl_segments[i];
+ mw = seg->rl_mw;
+ f = &mw->frmr;
+
+ if (mw->frmr.fr_mr->rkey == bad_wr->ex.invalidate_rkey) {
+ __frwr_reset_mr(ia, mw);
+ bad_wr = bad_wr->next;
+ }
+
+ i += seg->mr_nsegs;
+ }
+ goto unmap;
}

/* Post a LOCAL_INV Work Request to prevent further remote access


2016-04-11 20:11:53

by Chuck Lever

[permalink] [raw]
Subject: Re: [PATCH v1 00/18] NFS/RDMA client patches for 4.7


> On Apr 11, 2016, at 4:10 PM, Chuck Lever <[email protected]> wrote:
>
> One larger change: Attempt to fence memory regions after a signal
> interrupts a synchronous RPC. This should prevent a server from
> writing a reply into a client's memory after the memory has been
> released.
>
> In addition, the following changes and fixes:
>
> - Use new ib_drain_qp() API
> - Advertise max size of NFSv4.1 callbacks on RPC/RDMA
> - Prevent overflowing the server's receive buffers
> - Read list + Reply chunk (to support krb5i and krbp on RPC/RDMA)
> - Detect connection loss sooner

Forgot to add:

Available in the "nfs-rdma-for-4.7" topic branch of this git repo:

git://git.linux-nfs.org/projects/cel/cel-2.6.git

Or for browsing:

http://git.linux-nfs.org/?p=cel/cel-2.6.git;a=log;h=refs/heads/nfs-rdma-for-4.7


> ---
>
> Chuck Lever (18):
> sunrpc: Advertise maximum backchannel payload size
> xprtrdma: Bound the inline threshold values
> xprtrdma: Limit number of RDMA segments in RPC-over-RDMA headers
> xprtrdma: Prevent inline overflow
> xprtrdma: Avoid using Write list for small NFS READ requests
> xprtrdma: Update comments in rpcrdma_marshal_req()
> xprtrdma: Allow Read list and Reply chunk simultaneously
> xprtrdma: Remove rpcrdma_create_chunks()
> xprtrdma: Use core ib_drain_qp() API
> xprtrdma: Rename rpcrdma_frwr::sg and sg_nents
> xprtrdma: Save I/O direction in struct rpcrdma_frwr
> xprtrdma: Reset MRs in frwr_op_unmap_sync()
> xprtrdma: Refactor the FRWR recovery worker
> xprtrdma: Move fr_xprt and fr_worker to struct rpcrdma_mw
> xprtrdma: Refactor __fmr_dma_unmap()
> xprtrdma: Add ro_unmap_safe memreg method
> xprtrdma: Remove ro_unmap() from all registration modes
> xprtrdma: Faster server reboot recovery
>
>
> fs/nfs/nfs4proc.c | 10 -
> include/linux/sunrpc/clnt.h | 1
> include/linux/sunrpc/xprt.h | 1
> include/linux/sunrpc/xprtrdma.h | 4
> net/sunrpc/clnt.c | 17 +
> net/sunrpc/xprtrdma/backchannel.c | 16 +
> net/sunrpc/xprtrdma/fmr_ops.c | 134 +++++++--
> net/sunrpc/xprtrdma/frwr_ops.c | 214 ++++++++-------
> net/sunrpc/xprtrdma/physical_ops.c | 39 ++-
> net/sunrpc/xprtrdma/rpc_rdma.c | 517 ++++++++++++++++++++++--------------
> net/sunrpc/xprtrdma/transport.c | 16 +
> net/sunrpc/xprtrdma/verbs.c | 91 ++----
> net/sunrpc/xprtrdma/xprt_rdma.h | 42 ++-
> net/sunrpc/xprtsock.c | 6
> 14 files changed, 674 insertions(+), 434 deletions(-)
>
> --
> Chuck Lever

--
Chuck Lever




2016-04-11 20:11:59

by Chuck Lever

[permalink] [raw]
Subject: [PATCH v1 13/18] xprtrdma: Refactor the FRWR recovery worker

Maintain the order of invalidation and DMA unmapping when doing
a background MR reset.

Signed-off-by: Chuck Lever <[email protected]>
---
net/sunrpc/xprtrdma/frwr_ops.c | 25 ++++++++++++++++---------
1 file changed, 16 insertions(+), 9 deletions(-)

diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
index ce245dc..4e0a5c1 100644
--- a/net/sunrpc/xprtrdma/frwr_ops.c
+++ b/net/sunrpc/xprtrdma/frwr_ops.c
@@ -124,6 +124,21 @@ __frwr_reset_mr(struct rpcrdma_ia *ia, struct rpcrdma_mw *r)
return 0;
}

+static void
+__frwr_reset_and_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mw *mw)
+{
+ struct rpcrdma_ia *ia = &r_xprt->rx_ia;
+ struct rpcrdma_frmr *f = &mw->frmr;
+ int rc;
+
+ rc = __frwr_reset_mr(ia, mw);
+ ib_dma_unmap_sg(ia->ri_device, f->fr_sg, f->fr_nents, f->fr_dir);
+ if (rc)
+ return;
+
+ rpcrdma_put_mw(r_xprt, mw);
+}
+
/* Deferred reset of a single FRMR. Generate a fresh rkey by
* replacing the MR.
*
@@ -136,15 +151,8 @@ __frwr_recovery_worker(struct work_struct *work)
{
struct rpcrdma_mw *r = container_of(work, struct rpcrdma_mw,
frmr.fr_work);
- struct rpcrdma_xprt *r_xprt = r->frmr.fr_xprt;
- struct rpcrdma_ia *ia = &r_xprt->rx_ia;
- int rc;
-
- rc = __frwr_reset_mr(ia, r);
- if (rc)
- return;

- rpcrdma_put_mw(r_xprt, r);
+ __frwr_reset_and_unmap(r->frmr.fr_xprt, r);
return;
}

@@ -483,7 +491,6 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,

out_senderr:
dprintk("RPC: %s: ib_post_send status %i\n", __func__, rc);
- ib_dma_unmap_sg(device, frmr->fr_sg, dma_nents, direction);
__frwr_queue_recovery(mw);
return rc;
}


2016-04-11 20:12:08

by Chuck Lever

[permalink] [raw]
Subject: [PATCH v1 14/18] xprtrdma: Move fr_xprt and fr_worker to struct rpcrdma_mw

In a subsequent patch, the fr_xprt and fr_worker fields will be
needed by another memory registration mode. Move them into the
generic rpcrdma_mw structure that wraps struct rpcrdma_frmr.

Signed-off-by: Chuck Lever <[email protected]>
---
net/sunrpc/xprtrdma/frwr_ops.c | 10 +++++-----
net/sunrpc/xprtrdma/xprt_rdma.h | 4 ++--
2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
index 4e0a5c1..1251a1d 100644
--- a/net/sunrpc/xprtrdma/frwr_ops.c
+++ b/net/sunrpc/xprtrdma/frwr_ops.c
@@ -150,9 +150,9 @@ static void
__frwr_recovery_worker(struct work_struct *work)
{
struct rpcrdma_mw *r = container_of(work, struct rpcrdma_mw,
- frmr.fr_work);
+ mw_work);

- __frwr_reset_and_unmap(r->frmr.fr_xprt, r);
+ __frwr_reset_and_unmap(r->mw_xprt, r);
return;
}

@@ -162,8 +162,8 @@ __frwr_recovery_worker(struct work_struct *work)
static void
__frwr_queue_recovery(struct rpcrdma_mw *r)
{
- INIT_WORK(&r->frmr.fr_work, __frwr_recovery_worker);
- queue_work(frwr_recovery_wq, &r->frmr.fr_work);
+ INIT_WORK(&r->mw_work, __frwr_recovery_worker);
+ queue_work(frwr_recovery_wq, &r->mw_work);
}

static int
@@ -378,9 +378,9 @@ frwr_op_init(struct rpcrdma_xprt *r_xprt)
return rc;
}

+ r->mw_xprt = r_xprt;
list_add(&r->mw_list, &buf->rb_mws);
list_add(&r->mw_all, &buf->rb_all);
- r->frmr.fr_xprt = r_xprt;
}

return 0;
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index 0b3aa54..61d5cce 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -227,8 +227,6 @@ struct rpcrdma_frmr {
struct ib_cqe fr_cqe;
enum rpcrdma_frmr_state fr_state;
struct completion fr_linv_done;
- struct work_struct fr_work;
- struct rpcrdma_xprt *fr_xprt;
union {
struct ib_reg_wr fr_regwr;
struct ib_send_wr fr_invwr;
@@ -245,6 +243,8 @@ struct rpcrdma_mw {
struct rpcrdma_fmr fmr;
struct rpcrdma_frmr frmr;
};
+ struct work_struct mw_work;
+ struct rpcrdma_xprt *mw_xprt;
struct list_head mw_list;
struct list_head mw_all;
};


2016-04-11 20:12:16

by Chuck Lever

[permalink] [raw]
Subject: [PATCH v1 15/18] xprtrdma: Refactor __fmr_dma_unmap()

Separate the DMA unmap operation from freeing the MW. In a
subsequent patch they will not always be done at the same time,
and they are not related operations (except by order; freeing
the MW must be the last step during invalidation).

Signed-off-by: Chuck Lever <[email protected]>
---
net/sunrpc/xprtrdma/fmr_ops.c | 7 ++-----
1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c
index 009694b..9d50f3a 100644
--- a/net/sunrpc/xprtrdma/fmr_ops.c
+++ b/net/sunrpc/xprtrdma/fmr_ops.c
@@ -186,15 +186,10 @@ static void
__fmr_dma_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg)
{
struct ib_device *device = r_xprt->rx_ia.ri_device;
- struct rpcrdma_mw *mw = seg->rl_mw;
int nsegs = seg->mr_nsegs;

- seg->rl_mw = NULL;
-
while (nsegs--)
rpcrdma_unmap_one(device, seg++);
-
- rpcrdma_put_mw(r_xprt, mw);
}

/* Invalidate all memory regions that were registered for "req".
@@ -237,9 +232,11 @@ fmr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
seg = &req->rl_segments[i];

__fmr_dma_unmap(r_xprt, seg);
+ rpcrdma_put_mw(r_xprt, seg->rl_mw);

i += seg->mr_nsegs;
seg->mr_nsegs = 0;
+ seg->rl_mw = NULL;
}

req->rl_nchunks = 0;


2016-04-11 20:12:24

by Chuck Lever

[permalink] [raw]
Subject: [PATCH v1 16/18] xprtrdma: Add ro_unmap_safe memreg method

There needs to be a safe method of releasing registered memory
resources when an RPC terminates. Safe can mean a number of things:

+ Doesn't have to sleep

+ Doesn't rely on having a QP in RTS

ro_unmap_safe will be that safe method. It can be used in cases
where synchronous memory invalidation can deadlock, or needs to have
an active QP.

The important case is fencing an RPC's memory regions after it is
signaled (^C) and before it exits. If this is not done, there is a
window where the server can write an RPC reply into memory that the
client has released and re-used for some other purpose.

Note that this is a full solution for FRWR, but FMR and physical
still have some gaps where a particularly bad server can wreak
some havoc on the client. These gaps are not made worse by this
patch and are expected to be exceptionally rare and timing-based.
They are noted in documenting comments.
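
To illustrate the calling convention (a sketch only: RPC_IS_ASYNC is
the real macro, everything else here is a stand-in for the code in
the diff below):

	/* The boolean argument to ro_unmap_safe says whether the caller
	 * may sleep: xprt_rdma_free() passes !RPC_IS_ASYNC(task), and
	 * the marshaling error path always passes false. Stubs only.
	 */
	#include <stdio.h>
	#include <stdbool.h>

	static void invalidate_and_unmap_now(void)
	{
		printf("sync: reset + DMA unmap inline\n");
	}

	static void queue_mr_recovery(void)
	{
		printf("async: defer to the recovery worker\n");
	}

	static void ro_unmap_safe(bool can_sleep)
	{
		if (can_sleep)
			invalidate_and_unmap_now();  /* cf. __frwr_reset_and_unmap() */
		else
			queue_mr_recovery();         /* cf. __frwr_queue_recovery() */
	}

	int main(void)
	{
		ro_unmap_safe(true);   /* xprt_rdma_free(), synchronous RPC task */
		ro_unmap_safe(false);  /* rpcrdma_marshal_req() error path */
		return 0;
	}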

Signed-off-by: Chuck Lever <[email protected]>
---
net/sunrpc/xprtrdma/fmr_ops.c | 105 +++++++++++++++++++++++++++++++++---
net/sunrpc/xprtrdma/frwr_ops.c | 27 +++++++++
net/sunrpc/xprtrdma/physical_ops.c | 20 +++++++
net/sunrpc/xprtrdma/rpc_rdma.c | 5 --
net/sunrpc/xprtrdma/transport.c | 9 +--
net/sunrpc/xprtrdma/xprt_rdma.h | 3 +
6 files changed, 150 insertions(+), 19 deletions(-)

diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c
index 9d50f3a..a658dcf 100644
--- a/net/sunrpc/xprtrdma/fmr_ops.c
+++ b/net/sunrpc/xprtrdma/fmr_ops.c
@@ -35,6 +35,64 @@
/* Maximum scatter/gather per FMR */
#define RPCRDMA_MAX_FMR_SGES (64)

+static struct workqueue_struct *fmr_recovery_wq;
+
+#define FMR_RECOVERY_WQ_FLAGS (WQ_UNBOUND)
+
+int
+fmr_alloc_recovery_wq(void)
+{
+ fmr_recovery_wq = alloc_workqueue("fmr_recovery", WQ_UNBOUND, 0);
+ return !fmr_recovery_wq ? -ENOMEM : 0;
+}
+
+void
+fmr_destroy_recovery_wq(void)
+{
+ struct workqueue_struct *wq;
+
+ if (!fmr_recovery_wq)
+ return;
+
+ wq = fmr_recovery_wq;
+ fmr_recovery_wq = NULL;
+ destroy_workqueue(wq);
+}
+
+static int
+__fmr_unmap(struct rpcrdma_mw *mw)
+{
+ LIST_HEAD(l);
+
+ list_add(&mw->fmr.fmr->list, &l);
+ return ib_unmap_fmr(&l);
+}
+
+/* Deferred reset of a single FMR. Generate a fresh rkey by
+ * replacing the MR. There's no recovery if this fails.
+ */
+static void
+__fmr_recovery_worker(struct work_struct *work)
+{
+ struct rpcrdma_mw *mw = container_of(work, struct rpcrdma_mw,
+ mw_work);
+ struct rpcrdma_xprt *r_xprt = mw->mw_xprt;
+
+ __fmr_unmap(mw);
+ rpcrdma_put_mw(r_xprt, mw);
+ return;
+}
+
+/* A broken MR was discovered in a context that can't sleep.
+ * Defer recovery to the recovery worker.
+ */
+static void
+__fmr_queue_recovery(struct rpcrdma_mw *mw)
+{
+ INIT_WORK(&mw->mw_work, __fmr_recovery_worker);
+ queue_work(fmr_recovery_wq, &mw->mw_work);
+}
+
static int
fmr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep,
struct rpcrdma_create_data_internal *cdata)
@@ -92,6 +150,7 @@ fmr_op_init(struct rpcrdma_xprt *r_xprt)
if (IS_ERR(r->fmr.fmr))
goto out_fmr_err;

+ r->mw_xprt = r_xprt;
list_add(&r->mw_list, &buf->rb_mws);
list_add(&r->mw_all, &buf->rb_all);
}
@@ -107,15 +166,6 @@ out:
return rc;
}

-static int
-__fmr_unmap(struct rpcrdma_mw *r)
-{
- LIST_HEAD(l);
-
- list_add(&r->fmr.fmr->list, &l);
- return ib_unmap_fmr(&l);
-}
-
/* Use the ib_map_phys_fmr() verb to register a memory region
* for remote access via RDMA READ or RDMA WRITE.
*/
@@ -242,6 +292,42 @@ fmr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
req->rl_nchunks = 0;
}

+/* Use a slow, safe mechanism to invalidate all memory regions
+ * that were registered for "req".
+ *
+ * In the asynchronous case, DMA unmapping occurs first here
+ * because the rpcrdma_mr_seg is released immediately after this
+ * call. Its contents won't be available in __fmr_dma_unmap later.
+ * FIXME.
+ */
+static void
+fmr_op_unmap_safe(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
+ bool sync)
+{
+ struct rpcrdma_mr_seg *seg;
+ struct rpcrdma_mw *mw;
+ unsigned int i;
+
+ for (i = 0; req->rl_nchunks; req->rl_nchunks--) {
+ seg = &req->rl_segments[i];
+ mw = seg->rl_mw;
+
+ if (sync) {
+ /* ORDER */
+ __fmr_unmap(mw);
+ __fmr_dma_unmap(r_xprt, seg);
+ rpcrdma_put_mw(r_xprt, mw);
+ } else {
+ __fmr_dma_unmap(r_xprt, seg);
+ __fmr_queue_recovery(mw);
+ }
+
+ i += seg->mr_nsegs;
+ seg->mr_nsegs = 0;
+ seg->rl_mw = NULL;
+ }
+}
+
/* Use the ib_unmap_fmr() verb to prevent further remote
* access via RDMA READ or RDMA WRITE.
*/
@@ -295,6 +381,7 @@ fmr_op_destroy(struct rpcrdma_buffer *buf)
const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops = {
.ro_map = fmr_op_map,
.ro_unmap_sync = fmr_op_unmap_sync,
+ .ro_unmap_safe = fmr_op_unmap_safe,
.ro_unmap = fmr_op_unmap,
.ro_open = fmr_op_open,
.ro_maxpages = fmr_op_maxpages,
diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
index 1251a1d..79ba323 100644
--- a/net/sunrpc/xprtrdma/frwr_ops.c
+++ b/net/sunrpc/xprtrdma/frwr_ops.c
@@ -614,6 +614,32 @@ reset_mrs:
goto unmap;
}

+/* Use a slow, safe mechanism to invalidate all memory regions
+ * that were registered for "req".
+ */
+static void
+frwr_op_unmap_safe(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
+ bool sync)
+{
+ struct rpcrdma_mr_seg *seg;
+ struct rpcrdma_mw *mw;
+ unsigned int i;
+
+ for (i = 0; req->rl_nchunks; req->rl_nchunks--) {
+ seg = &req->rl_segments[i];
+ mw = seg->rl_mw;
+
+ if (sync)
+ __frwr_reset_and_unmap(r_xprt, mw);
+ else
+ __frwr_queue_recovery(mw);
+
+ i += seg->mr_nsegs;
+ seg->mr_nsegs = 0;
+ seg->rl_mw = NULL;
+ }
+}
+
/* Post a LOCAL_INV Work Request to prevent further remote access
* via RDMA READ or RDMA WRITE.
*/
@@ -675,6 +701,7 @@ frwr_op_destroy(struct rpcrdma_buffer *buf)
const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops = {
.ro_map = frwr_op_map,
.ro_unmap_sync = frwr_op_unmap_sync,
+ .ro_unmap_safe = frwr_op_unmap_safe,
.ro_unmap = frwr_op_unmap,
.ro_open = frwr_op_open,
.ro_maxpages = frwr_op_maxpages,
diff --git a/net/sunrpc/xprtrdma/physical_ops.c b/net/sunrpc/xprtrdma/physical_ops.c
index 2dc6ec2..95ef3a7 100644
--- a/net/sunrpc/xprtrdma/physical_ops.c
+++ b/net/sunrpc/xprtrdma/physical_ops.c
@@ -97,6 +97,25 @@ physical_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
rpcrdma_unmap_one(device, &req->rl_segments[i++]);
}

+/* Use a slow, safe mechanism to invalidate all memory regions
+ * that were registered for "req".
+ *
+ * For physical memory registration, there is no good way to
+ * fence a single MR that has been advertised to the server. The
+ * client has already handed the server an R_key that cannot be
+ * invalidated and is shared by all MRs on this connection.
+ * Tearing down the PD might be the only safe choice, but it's
+ * not clear that a freshly acquired DMA R_key would be different
+ * than the one used by the PD that was just destroyed.
+ * FIXME.
+ */
+static void
+physical_op_unmap_safe(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
+ bool sync)
+{
+ physical_op_unmap_sync(r_xprt, req);
+}
+
static void
physical_op_destroy(struct rpcrdma_buffer *buf)
{
@@ -105,6 +124,7 @@ physical_op_destroy(struct rpcrdma_buffer *buf)
const struct rpcrdma_memreg_ops rpcrdma_physical_memreg_ops = {
.ro_map = physical_op_map,
.ro_unmap_sync = physical_op_unmap_sync,
+ .ro_unmap_safe = physical_op_unmap_safe,
.ro_unmap = physical_op_unmap,
.ro_open = physical_op_open,
.ro_maxpages = physical_op_maxpages,
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index 9ebaf79..35a8109 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -567,7 +567,6 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
enum rpcrdma_chunktype rtype, wtype;
struct rpcrdma_msg *headerp;
- unsigned int pos;
ssize_t hdrlen;
size_t rpclen;
__be32 *iptr;
@@ -697,9 +696,7 @@ out_overflow:
return -EIO;

out_unmap:
- for (pos = 0; req->rl_nchunks--;)
- pos += r_xprt->rx_ia.ri_ops->ro_unmap(r_xprt,
- &req->rl_segments[pos]);
+ r_xprt->rx_ia.ri_ops->ro_unmap_safe(r_xprt, req, false);
return PTR_ERR(iptr);
}

diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index 16595ff..99d2e5b 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -514,6 +514,7 @@ xprt_rdma_allocate(struct rpc_task *task, size_t size)
out:
dprintk("RPC: %s: size %zd, request 0x%p\n", __func__, size, req);
req->rl_connect_cookie = 0; /* our reserved value */
+ req->rl_task = task;
return req->rl_sendbuf->rg_base;

out_rdmabuf:
@@ -570,7 +571,6 @@ xprt_rdma_free(void *buffer)
struct rpcrdma_req *req;
struct rpcrdma_xprt *r_xprt;
struct rpcrdma_regbuf *rb;
- int i;

if (buffer == NULL)
return;
@@ -584,11 +584,8 @@ xprt_rdma_free(void *buffer)

dprintk("RPC: %s: called on 0x%p\n", __func__, req->rl_reply);

- for (i = 0; req->rl_nchunks;) {
- --req->rl_nchunks;
- i += r_xprt->rx_ia.ri_ops->ro_unmap(r_xprt,
- &req->rl_segments[i]);
- }
+ r_xprt->rx_ia.ri_ops->ro_unmap_safe(r_xprt, req,
+ !RPC_IS_ASYNC(req->rl_task));

rpcrdma_buffer_put(req);
}
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index 61d5cce..04a7058 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -293,6 +293,7 @@ struct rpcrdma_req {
unsigned int rl_niovs;
unsigned int rl_nchunks;
unsigned int rl_connect_cookie;
+ struct rpc_task *rl_task;
struct rpcrdma_buffer *rl_buffer;
struct rpcrdma_rep *rl_reply;/* holder for reply buffer */
struct ib_sge rl_send_iov[RPCRDMA_MAX_IOVS];
@@ -398,6 +399,8 @@ struct rpcrdma_memreg_ops {
struct rpcrdma_req *);
int (*ro_unmap)(struct rpcrdma_xprt *,
struct rpcrdma_mr_seg *);
+ void (*ro_unmap_safe)(struct rpcrdma_xprt *,
+ struct rpcrdma_req *, bool);
int (*ro_open)(struct rpcrdma_ia *,
struct rpcrdma_ep *,
struct rpcrdma_create_data_internal *);


2016-04-11 20:12:33

by Chuck Lever

[permalink] [raw]
Subject: [PATCH v1 17/18] xprtrdma: Remove ro_unmap() from all registration modes

Clean up: The ro_unmap method is no longer used.

Signed-off-by: Chuck Lever <[email protected]>
---
net/sunrpc/xprtrdma/fmr_ops.c | 31 --------------------------
net/sunrpc/xprtrdma/frwr_ops.c | 43 ------------------------------------
net/sunrpc/xprtrdma/physical_ops.c | 12 ----------
net/sunrpc/xprtrdma/xprt_rdma.h | 2 --
4 files changed, 88 deletions(-)

diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c
index a658dcf..6326ebe 100644
--- a/net/sunrpc/xprtrdma/fmr_ops.c
+++ b/net/sunrpc/xprtrdma/fmr_ops.c
@@ -328,36 +328,6 @@ fmr_op_unmap_safe(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
}
}

-/* Use the ib_unmap_fmr() verb to prevent further remote
- * access via RDMA READ or RDMA WRITE.
- */
-static int
-fmr_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg)
-{
- struct rpcrdma_ia *ia = &r_xprt->rx_ia;
- struct rpcrdma_mr_seg *seg1 = seg;
- struct rpcrdma_mw *mw = seg1->rl_mw;
- int rc, nsegs = seg->mr_nsegs;
-
- dprintk("RPC: %s: FMR %p\n", __func__, mw);
-
- seg1->rl_mw = NULL;
- while (seg1->mr_nsegs--)
- rpcrdma_unmap_one(ia->ri_device, seg++);
- rc = __fmr_unmap(mw);
- if (rc)
- goto out_err;
- rpcrdma_put_mw(r_xprt, mw);
- return nsegs;
-
-out_err:
- /* The FMR is abandoned, but remains in rb_all. fmr_op_destroy
- * will attempt to release it when the transport is destroyed.
- */
- dprintk("RPC: %s: ib_unmap_fmr status %i\n", __func__, rc);
- return nsegs;
-}
-
static void
fmr_op_destroy(struct rpcrdma_buffer *buf)
{
@@ -382,7 +352,6 @@ const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops = {
.ro_map = fmr_op_map,
.ro_unmap_sync = fmr_op_unmap_sync,
.ro_unmap_safe = fmr_op_unmap_safe,
- .ro_unmap = fmr_op_unmap,
.ro_open = fmr_op_open,
.ro_maxpages = fmr_op_maxpages,
.ro_init = fmr_op_init,
diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
index 79ba323..a192b91 100644
--- a/net/sunrpc/xprtrdma/frwr_ops.c
+++ b/net/sunrpc/xprtrdma/frwr_ops.c
@@ -640,48 +640,6 @@ frwr_op_unmap_safe(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
}
}

-/* Post a LOCAL_INV Work Request to prevent further remote access
- * via RDMA READ or RDMA WRITE.
- */
-static int
-frwr_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg)
-{
- struct rpcrdma_mr_seg *seg1 = seg;
- struct rpcrdma_ia *ia = &r_xprt->rx_ia;
- struct rpcrdma_mw *mw = seg1->rl_mw;
- struct rpcrdma_frmr *frmr = &mw->frmr;
- struct ib_send_wr *invalidate_wr, *bad_wr;
- int rc, nsegs = seg->mr_nsegs;
-
- dprintk("RPC: %s: FRMR %p\n", __func__, mw);
-
- seg1->rl_mw = NULL;
- frmr->fr_state = FRMR_IS_INVALID;
- invalidate_wr = &mw->frmr.fr_invwr;
-
- memset(invalidate_wr, 0, sizeof(*invalidate_wr));
- frmr->fr_cqe.done = frwr_wc_localinv;
- invalidate_wr->wr_cqe = &frmr->fr_cqe;
- invalidate_wr->opcode = IB_WR_LOCAL_INV;
- invalidate_wr->ex.invalidate_rkey = frmr->fr_mr->rkey;
- DECR_CQCOUNT(&r_xprt->rx_ep);
-
- ib_dma_unmap_sg(ia->ri_device, frmr->fr_sg, frmr->fr_nents, frmr->fr_dir);
- read_lock(&ia->ri_qplock);
- rc = ib_post_send(ia->ri_id->qp, invalidate_wr, &bad_wr);
- read_unlock(&ia->ri_qplock);
- if (rc)
- goto out_err;
-
- rpcrdma_put_mw(r_xprt, mw);
- return nsegs;
-
-out_err:
- dprintk("RPC: %s: ib_post_send status %i\n", __func__, rc);
- __frwr_queue_recovery(mw);
- return nsegs;
-}
-
static void
frwr_op_destroy(struct rpcrdma_buffer *buf)
{
@@ -702,7 +660,6 @@ const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops = {
.ro_map = frwr_op_map,
.ro_unmap_sync = frwr_op_unmap_sync,
.ro_unmap_safe = frwr_op_unmap_safe,
- .ro_unmap = frwr_op_unmap,
.ro_open = frwr_op_open,
.ro_maxpages = frwr_op_maxpages,
.ro_init = frwr_op_init,
diff --git a/net/sunrpc/xprtrdma/physical_ops.c b/net/sunrpc/xprtrdma/physical_ops.c
index 95ef3a7..3750596 100644
--- a/net/sunrpc/xprtrdma/physical_ops.c
+++ b/net/sunrpc/xprtrdma/physical_ops.c
@@ -74,17 +74,6 @@ physical_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
return 1;
}

-/* Unmap a memory region, but leave it registered.
- */
-static int
-physical_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg)
-{
- struct rpcrdma_ia *ia = &r_xprt->rx_ia;
-
- rpcrdma_unmap_one(ia->ri_device, seg);
- return 1;
-}
-
/* DMA unmap all memory regions that were mapped for "req".
*/
static void
@@ -125,7 +114,6 @@ const struct rpcrdma_memreg_ops rpcrdma_physical_memreg_ops = {
.ro_map = physical_op_map,
.ro_unmap_sync = physical_op_unmap_sync,
.ro_unmap_safe = physical_op_unmap_safe,
- .ro_unmap = physical_op_unmap,
.ro_open = physical_op_open,
.ro_maxpages = physical_op_maxpages,
.ro_init = physical_op_init,
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index 04a7058..24bbea4 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -397,8 +397,6 @@ struct rpcrdma_memreg_ops {
struct rpcrdma_mr_seg *, int, bool);
void (*ro_unmap_sync)(struct rpcrdma_xprt *,
struct rpcrdma_req *);
- int (*ro_unmap)(struct rpcrdma_xprt *,
- struct rpcrdma_mr_seg *);
void (*ro_unmap_safe)(struct rpcrdma_xprt *,
struct rpcrdma_req *, bool);
int (*ro_open)(struct rpcrdma_ia *,


2016-04-11 20:12:41

by Chuck Lever

[permalink] [raw]
Subject: [PATCH v1 18/18] xprtrdma: Faster server reboot recovery

In a cluster failover scenario, it is desirable for the client to
attempt to reconnect quickly, as an alternate NFS server is already
waiting to take over for the down server. The client can't see that
a server IP address has moved to a new server until the existing
connection is gone.

For fabrics and devices where it is meaningful, set an upper bound
on the amount of time before it is determined that a connection is
no longer valid. This allows the RPC client to detect connection
loss in a timely manner, then perform a fresh resolution of the
server GUID in case it has changed (cluster failover).
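
For context only (background arithmetic, not part of the patch): the
IBTA local ACK timeout is 4.096 usec * 2^timeout, the same formula the
dprintk hunk below uses, so the detection latency can be estimated
roughly as follows. The timeout exponent is just an example value:

	/* Rough estimate of how long a dead connection can go
	 * unnoticed: about (retry_count + 1) send attempts, each
	 * waiting out the local ACK timeout. Illustrative only.
	 */
	#include <stdio.h>

	int main(void)
	{
		unsigned int timeout_exp = 19;  /* example ib_qp_attr.timeout */
		unsigned int retry_count = 6;   /* value chosen by this patch */
		double ack_timeout = 4096e-9 * (double)(1ULL << timeout_exp);

		printf("per-attempt ACK timeout: ~%.1f seconds\n", ack_timeout);
		printf("worst case before loss is reported: ~%.1f seconds\n",
		       (retry_count + 1) * ack_timeout);
		return 0;
	}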

Signed-off-by: Chuck Lever <[email protected]>
---
net/sunrpc/xprtrdma/verbs.c | 28 ++++++++++++++++++++++++----
1 file changed, 24 insertions(+), 4 deletions(-)

diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index b7a5bc1..5cc57fb 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -211,9 +211,10 @@ rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
struct rpcrdma_ep *ep = &xprt->rx_ep;
#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
struct sockaddr *sap = (struct sockaddr *)&ep->rep_remote_addr;
-#endif
+ u64 timeout;
struct ib_qp_attr *attr = &ia->ri_qp_attr;
struct ib_qp_init_attr *iattr = &ia->ri_qp_init_attr;
+#endif
int connstate = 0;

switch (event->event) {
@@ -235,14 +236,23 @@ rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
complete(&ia->ri_done);
break;
case RDMA_CM_EVENT_ESTABLISHED:
- connstate = 1;
+#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
+ memset(attr, 0, sizeof(*attr));
ib_query_qp(ia->ri_id->qp, attr,
- IB_QP_MAX_QP_RD_ATOMIC | IB_QP_MAX_DEST_RD_ATOMIC,
+ IB_QP_MAX_QP_RD_ATOMIC |
+ IB_QP_MAX_DEST_RD_ATOMIC |
+ IB_QP_TIMEOUT,
iattr);
dprintk("RPC: %s: %d responder resources"
" (%d initiator)\n",
__func__, attr->max_dest_rd_atomic,
attr->max_rd_atomic);
+ timeout = 4096 * (1ULL << attr->timeout);
+ do_div(timeout, NSEC_PER_SEC);
+ dprintk("RPC: %s: retry timeout: %llu seconds\n",
+ __func__, timeout);
+#endif
+ connstate = 1;
goto connected;
case RDMA_CM_EVENT_CONNECT_ERROR:
connstate = -ENOTCONN;
@@ -554,6 +564,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
ep->rep_attr.recv_cq = recvcq;

/* Initialize cma parameters */
+ memset(&ep->rep_remote_cma, 0, sizeof(ep->rep_remote_cma));

/* RPC/RDMA does not use private data */
ep->rep_remote_cma.private_data = NULL;
@@ -567,7 +578,16 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
ep->rep_remote_cma.responder_resources =
ia->ri_device->attrs.max_qp_rd_atom;

- ep->rep_remote_cma.retry_count = 7;
+ /* Limit transport retries so client can detect server
+ * GID changes quickly. RPC layer handles re-establishing
+ * transport connection and retransmission.
+ */
+ ep->rep_remote_cma.retry_count = 6;
+
+ /* RPC-over-RDMA handles its own flow control. In addition,
+ * make all RNR NAKs visible so we know that RPC-over-RDMA
+ * flow control is working correctly (no NAKs should be seen).
+ */
ep->rep_remote_cma.flow_control = 0;
ep->rep_remote_cma.rnr_retry_count = 0;



2016-04-11 20:34:58

by Steve Wise

[permalink] [raw]
Subject: RE: [PATCH v1 05/18] xprtrdma: Avoid using Write list for small NFS READ requests

> Avoid the latency and interrupt overhead of registering a Write
> chunk when handling NFS READ requests of a few hundred bytes or
> less.
>
> This change does not interoperate with Linux NFS/RDMA servers
> that do not have commit 9d11b51ce7c1 ('svcrdma: Fix send_reply()
> scatter/gather set-up').
>
> $ git describe 9d11b51ce7c1
> v4.2-rc3-10-g9d11b51
>

Hey Chuck,

Is there any way to avoid breaking interoperability with older servers?

Steve



2016-04-11 20:38:50

by Chuck Lever

[permalink] [raw]
Subject: Re: [PATCH v1 05/18] xprtrdma: Avoid using Write list for small NFS READ requests


> On Apr 11, 2016, at 4:35 PM, Steve Wise <[email protected]> wrote:
>
>> Avoid the latency and interrupt overhead of registering a Write
>> chunk when handling NFS READ requests of a few hundred bytes or
>> less.
>>
>> This change does not interoperate with Linux NFS/RDMA servers
>> that do not have commit 9d11b51ce7c1 ('svcrdma: Fix send_reply()
>> scatter/gather set-up').
>>
>> $ git describe 9d11b51ce7c1
>> v4.2-rc3-10-g9d11b51
>>
>
> Hey Chuck,
>
> Is there any way to avoid breaking interoperability with older servers?

No, pre-4.2 Linux servers have a bug that can be fixed by
applying the above commit, which shouldn't be difficult to
backport.

Clients have always been allowed to send NFS READ this way.

--
Chuck Lever




2016-04-12 05:15:54

by Leon Romanovsky

[permalink] [raw]
Subject: Re: [PATCH v1 09/18] xprtrdma: Use core ib_drain_qp() API

On Mon, Apr 11, 2016 at 04:11:23PM -0400, Chuck Lever wrote:
> Clean up: Replace rpcrdma_flush_cqs() and rpcrdma_clean_cqs() with
> the new ib_drain_qp() API.
>
> Signed-off-by: Chuck Lever <[email protected]>

Thanks
Reviewed-By: Leon Romanovsky <[email protected]>



2016-04-12 14:15:35

by Christoph Hellwig

[permalink] [raw]
Subject: Re: [PATCH v1 05/18] xprtrdma: Avoid using Write list for small NFS READ requests

On Mon, Apr 11, 2016 at 04:38:44PM -0400, Chuck Lever wrote:
> No, pre-4.2 Linux servers have a bug that can be fixed by
> applying the above commit, which shouldn't be difficult to
> backport.
>
> Clients have always been allowed to send NFS READ this way.

But the clients might be out in the wild. I think we'll need at
least a mount option to work around them.

2016-04-12 14:49:58

by Chuck Lever

[permalink] [raw]
Subject: Re: [PATCH v1 05/18] xprtrdma: Avoid using Write list for small NFS READ requests


> On Apr 12, 2016, at 10:15 AM, Christoph Hellwig <[email protected]> wrote:
>
> On Mon, Apr 11, 2016 at 04:38:44PM -0400, Chuck Lever wrote:
>> No, pre-4.2 Linux servers have a bug that can be fixed by
>> applying the above commit, which shouldn't be difficult to
>> backport.
>>
>> Clients have always been allowed to send NFS READ this way.
>
> But the clients might be out in the wild. I think we'll need at
> least a mount option to work around them.

Huh?

I'm talking about non-Linux clients. RFC 5666 allows
this behavior, it always has. The Linux server before
9d11b51ce7c1 is broken, full stop.

Our policy is that we do not work around broken NFS
servers on our client. The exception is existing bugs
in the Linux server, where we may work around them
temporarily until an upstream fix is available. The
client change is held off for one or two kernel
releases after the server is fixed.

I've made several changes like this already over the
past two years, and I fail to see why this one is
any different.

In addition, there is always NFS/TCP on IPoIB to fall
back on.

Can you explain your concern more fully?

--
Chuck Lever




2016-04-12 17:01:24

by Christoph Hellwig

[permalink] [raw]
Subject: Re: [PATCH v1 05/18] xprtrdma: Avoid using Write list for small NFS READ requests

On Tue, Apr 12, 2016 at 10:49:24AM -0400, Chuck Lever wrote:
> >> No, pre-4.2 Linux servers have a bug that can be fixed by
> >> applying the above commit, which shouldn't be difficult to
> >> backport.
> >>
> >> Clients have always been allowed to send NFS READ this way.
> >
> > But the clients might be out in the wild. I think we'll need at
> > least a mount option to work around them.
>
> Huh?
>
> I'm talking about non-Linux clients. RFC 5666 allows
> this behavior, it always has. The Linux server before
> 9d11b51ce7c1 is broken, full stop.

You are deliberately breaking existing and fairly recent clients.
I see why we want to do this, but there needs to be an option so
that things keep working. The idea of upgrading your client from
4.6 to 4.7 and NFS simply not working anymore, with the developer
knowing it and not providing a workaround, is simply not an option.

2016-04-12 18:05:03

by Anna Schumaker

[permalink] [raw]
Subject: Re: [PATCH v1 02/18] xprtrdma: Bound the inline threshold values

On 04/11/2016 04:10 PM, Chuck Lever wrote:
> Currently the sysctls that allow setting the inline threshold allow
> any value to be set.
>
> Small values only make the transport run slower. The default 1KB
> setting is as low as is reasonable. And the logic that decides how
> to divide a Send buffer buffer between RPC-over-RDMA header and RPC
^^^^^^^^^^^^^
I'm guessing you don't mean a buffer of buffers? :)

> message assumes (but does not check) that the lower bound is not
> crazy (say, 57 bytes).
>
> Send and receive buffers share a page with some control information.
> Values larger than about 3KB can't be supported, currently.
>
> Signed-off-by: Chuck Lever <[email protected]>
> ---
> include/linux/sunrpc/xprtrdma.h | 4 +++-
> net/sunrpc/xprtrdma/transport.c | 6 ++++++
> 2 files changed, 9 insertions(+), 1 deletion(-)
>
> diff --git a/include/linux/sunrpc/xprtrdma.h b/include/linux/sunrpc/xprtrdma.h
> index 767190b..39267dc 100644
> --- a/include/linux/sunrpc/xprtrdma.h
> +++ b/include/linux/sunrpc/xprtrdma.h
> @@ -52,7 +52,9 @@
> #define RPCRDMA_DEF_SLOT_TABLE (128U)
> #define RPCRDMA_MAX_SLOT_TABLE (256U)
>
> -#define RPCRDMA_DEF_INLINE (1024) /* default inline max */
> +#define RPCRDMA_MIN_INLINE (1024) /* min inline thresh */
> +#define RPCRDMA_DEF_INLINE (1024) /* default inline thresh */
> +#define RPCRDMA_MAX_INLINE (3068) /* max inline thresh */

It looks like RPCRDMA_DEF_INLINE is used to set xprt_rdma_max_inline_{read,write}. Are we keeping DEF_INLINE around so we can set these values separately from the minimum and maximum amounts?

Thanks,
Anna

>
> /* Memory registration strategies, by number.
> * This is part of a kernel / user space API. Do not remove. */
> diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
> index 9954342..16595ff 100644
> --- a/net/sunrpc/xprtrdma/transport.c
> +++ b/net/sunrpc/xprtrdma/transport.c
> @@ -73,6 +73,8 @@ static unsigned int xprt_rdma_memreg_strategy = RPCRDMA_FRMR;
>
> static unsigned int min_slot_table_size = RPCRDMA_MIN_SLOT_TABLE;
> static unsigned int max_slot_table_size = RPCRDMA_MAX_SLOT_TABLE;
> +static unsigned int min_inline_size = RPCRDMA_MIN_INLINE;
> +static unsigned int max_inline_size = RPCRDMA_MAX_INLINE;
> static unsigned int zero;
> static unsigned int max_padding = PAGE_SIZE;
> static unsigned int min_memreg = RPCRDMA_BOUNCEBUFFERS;
> @@ -96,6 +98,8 @@ static struct ctl_table xr_tunables_table[] = {
> .maxlen = sizeof(unsigned int),
> .mode = 0644,
> .proc_handler = proc_dointvec,
> + .extra1 = &min_inline_size,
> + .extra2 = &max_inline_size,
> },
> {
> .procname = "rdma_max_inline_write",
> @@ -103,6 +107,8 @@ static struct ctl_table xr_tunables_table[] = {
> .maxlen = sizeof(unsigned int),
> .mode = 0644,
> .proc_handler = proc_dointvec,
> + .extra1 = &min_inline_size,
> + .extra2 = &max_inline_size,
> },
> {
> .procname = "rdma_inline_write_padding",
>
>


2016-04-12 18:08:54

by Chuck Lever

[permalink] [raw]
Subject: Re: [PATCH v1 05/18] xprtrdma: Avoid using Write list for small NFS READ requests


> On Apr 12, 2016, at 1:01 PM, Christoph Hellwig <[email protected]> wrote:
>
> On Tue, Apr 12, 2016 at 10:49:24AM -0400, Chuck Lever wrote:
>>>> No, pre-4.2 Linux servers have a bug that can be fixed by
>>>> applying the above commit, which shouldn't be difficult to
>>>> backport.
>>>>
>>>> Clients have always been allowed to send NFS READ this way.
>>>
>>> But the clients might be out in the wild. I think we'll need at
>>> least a mount option to work around them.
>>
>> Huh?
>>
>> I'm talking about non-Linux clients. RFC 5666 allows
>> this behavior, it always has. The Linux server before
>> 9d11b51ce7c1 is broken, full stop.
>
> You are deliberately breaking existing and fairly recent clients.

How am I breaking existing clients? Before 4.7, our
client works with a broken Linux NFS server and it
works with a fixed one.

_New_ clients won't work with broken servers.


> I see why we want to do this, but there needs to be an option so
> that things keep working. The idea of upgrading your client from
> 4.6 to 4.7 and NFS simply not working anymore, with the developer
> knowing it and not providing a workaround, is simply not an option.

I don't understand why this approach was reasonable in
the past, but is now suddenly not workable.

We have a workaround: Use NFS/TCP on IPoIB.


--
Chuck Lever




2016-04-12 19:12:31

by Chuck Lever

[permalink] [raw]
Subject: Re: [PATCH v1 02/18] xprtrdma: Bound the inline threshold values


> On Apr 12, 2016, at 2:04 PM, Anna Schumaker <[email protected]> wrote:
>
> On 04/11/2016 04:10 PM, Chuck Lever wrote:
>> Currently the sysctls that allow setting the inline threshold allow
>> any value to be set.
>>
>> Small values only make the transport run slower. The default 1KB
>> setting is as low as is reasonable. And the logic that decides how
>> to divide a Send buffer buffer between RPC-over-RDMA header and RPC
> ^^^^^^^^^^^^^
> I'm guessing you don't mean a buffer of buffers? :)
>
>> message assumes (but does not check) that the lower bound is not
>> crazy (say, 57 bytes).
>>
>> Send and receive buffers share a page with some control information.
>> Values larger than about 3KB can't be supported, currently.
>>
>> Signed-off-by: Chuck Lever <[email protected]>
>> ---
>> include/linux/sunrpc/xprtrdma.h | 4 +++-
>> net/sunrpc/xprtrdma/transport.c | 6 ++++++
>> 2 files changed, 9 insertions(+), 1 deletion(-)
>>
>> diff --git a/include/linux/sunrpc/xprtrdma.h b/include/linux/sunrpc/xprtrdma.h
>> index 767190b..39267dc 100644
>> --- a/include/linux/sunrpc/xprtrdma.h
>> +++ b/include/linux/sunrpc/xprtrdma.h
>> @@ -52,7 +52,9 @@
>> #define RPCRDMA_DEF_SLOT_TABLE (128U)
>> #define RPCRDMA_MAX_SLOT_TABLE (256U)
>>
>> -#define RPCRDMA_DEF_INLINE (1024) /* default inline max */
>> +#define RPCRDMA_MIN_INLINE (1024) /* min inline thresh */
>> +#define RPCRDMA_DEF_INLINE (1024) /* default inline thresh */
>> +#define RPCRDMA_MAX_INLINE (3068) /* max inline thresh */
>
> It looks like RPCRDMA_DEF_INLINE is used to set xprt_rdma_max_inline_{read,write}. Are we keeping DEF_INLINE around so we can set these values separately from the minimum and maximum amounts?

Yes, the default and the minimum happen to be the same
at the moment, but that may not always be the case.


> Thanks,
> Anna
>
>>
>> /* Memory registration strategies, by number.
>> * This is part of a kernel / user space API. Do not remove. */
>> diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
>> index 9954342..16595ff 100644
>> --- a/net/sunrpc/xprtrdma/transport.c
>> +++ b/net/sunrpc/xprtrdma/transport.c
>> @@ -73,6 +73,8 @@ static unsigned int xprt_rdma_memreg_strategy = RPCRDMA_FRMR;
>>
>> static unsigned int min_slot_table_size = RPCRDMA_MIN_SLOT_TABLE;
>> static unsigned int max_slot_table_size = RPCRDMA_MAX_SLOT_TABLE;
>> +static unsigned int min_inline_size = RPCRDMA_MIN_INLINE;
>> +static unsigned int max_inline_size = RPCRDMA_MAX_INLINE;
>> static unsigned int zero;
>> static unsigned int max_padding = PAGE_SIZE;
>> static unsigned int min_memreg = RPCRDMA_BOUNCEBUFFERS;
>> @@ -96,6 +98,8 @@ static struct ctl_table xr_tunables_table[] = {
>> .maxlen = sizeof(unsigned int),
>> .mode = 0644,
>> .proc_handler = proc_dointvec,
>> + .extra1 = &min_inline_size,
>> + .extra2 = &max_inline_size,
>> },
>> {
>> .procname = "rdma_max_inline_write",
>> @@ -103,6 +107,8 @@ static struct ctl_table xr_tunables_table[] = {
>> .maxlen = sizeof(unsigned int),
>> .mode = 0644,
>> .proc_handler = proc_dointvec,
>> + .extra1 = &min_inline_size,
>> + .extra2 = &max_inline_size,
>> },
>> {
>> .procname = "rdma_inline_write_padding",
>>
>>
>

--
Chuck Lever