Several important client-side performance and scalability
improvements are made in this series, proposed for the 4.3
kernel, including:
- Increase maximum RDMA credits to 128
- Increase maximum NFS r/wsize to one megabyte
- Prefer inline rather than reply chunk replies
And these fixes:
- Send NFSv4 WRITE compounds correctly
- Support RDMA_NOMSG calls
- Fix large NFS symlink operations
- Get inline threshold accounting right
Also available in the "nfs-rdma-for-4.3" topic branch of this git repo:
git://git.linux-nfs.org/projects/cel/cel-2.6.git
Or for browsing:
http://git.linux-nfs.org/?p=cel/cel-2.6.git;a=log;h=refs/heads/nfs-rdma-for-4.3
---
Chuck Lever (12):
xprtrdma: Make xprt_setup_rdma() agnostic to family of server address
xprtrdma: Raise maximum payload size to one megabyte
xprtrdma: Increase default credit limit
xprtrdma: Remove last ib_reg_phys_mr() call site
xprtrdma: Account for RPC/RDMA header size when deciding to inline
xprtrdma: Always provide a write list when sending NFS READ
xprtrdma: Don't provide a reply chunk when expecting a short reply
xprtrdma: Fix XDR tail buffer marshalling
xprtrdma: Prepare rpcrdma_ep_post() for RDMA_NOMSG calls
xprtrdma: Fix large NFS SYMLINK calls
xprtrdma: Clean up xprt_rdma_print_stats()
xprtrdma: Count RDMA_NOMSG type calls
fs/nfs/nfs3xdr.c | 1
fs/nfs/nfs4xdr.c | 1
include/linux/sunrpc/xprtrdma.h | 2 -
net/sunrpc/xprtrdma/rpc_rdma.c | 129 +++++++++++++++++++++++++++------------
net/sunrpc/xprtrdma/transport.c | 77 +++++++++++------------
net/sunrpc/xprtrdma/verbs.c | 129 ++++++++++-----------------------------
net/sunrpc/xprtrdma/xprt_rdma.h | 23 ++++---
7 files changed, 171 insertions(+), 191 deletions(-)
--
Chuck Lever
In particular, recognize when an IPv6 connection is bound.
Signed-off-by: Chuck Lever <[email protected]>
---
net/sunrpc/xprtrdma/transport.c | 28 +++++++++++-----------------
1 file changed, 11 insertions(+), 17 deletions(-)
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index 680f888..d737300 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -175,10 +175,8 @@ xprt_rdma_format_addresses6(struct rpc_xprt *xprt, struct sockaddr *sap)
}
static void
-xprt_rdma_format_addresses(struct rpc_xprt *xprt)
+xprt_rdma_format_addresses(struct rpc_xprt *xprt, struct sockaddr *sap)
{
- struct sockaddr *sap = (struct sockaddr *)
- &rpcx_to_rdmad(xprt).addr;
char buf[128];
switch (sap->sa_family) {
@@ -302,7 +300,7 @@ xprt_setup_rdma(struct xprt_create *args)
struct rpc_xprt *xprt;
struct rpcrdma_xprt *new_xprt;
struct rpcrdma_ep *new_ep;
- struct sockaddr_in *sin;
+ struct sockaddr *sap;
int rc;
if (args->addrlen > sizeof(xprt->addr)) {
@@ -333,26 +331,20 @@ xprt_setup_rdma(struct xprt_create *args)
* Set up RDMA-specific connect data.
*/
- /* Put server RDMA address in local cdata */
- memcpy(&cdata.addr, args->dstaddr, args->addrlen);
+ sap = (struct sockaddr *)&cdata.addr;
+ memcpy(sap, args->dstaddr, args->addrlen);
/* Ensure xprt->addr holds valid server TCP (not RDMA)
* address, for any side protocols which peek at it */
xprt->prot = IPPROTO_TCP;
xprt->addrlen = args->addrlen;
- memcpy(&xprt->addr, &cdata.addr, xprt->addrlen);
+ memcpy(&xprt->addr, sap, xprt->addrlen);
- sin = (struct sockaddr_in *)&cdata.addr;
- if (ntohs(sin->sin_port) != 0)
+ if (rpc_get_port(sap))
xprt_set_bound(xprt);
- dprintk("RPC: %s: %pI4:%u\n",
- __func__, &sin->sin_addr.s_addr, ntohs(sin->sin_port));
-
- /* Set max requests */
cdata.max_requests = xprt->max_reqs;
- /* Set some length limits */
cdata.rsize = RPCRDMA_MAX_SEGS * PAGE_SIZE; /* RDMA write max */
cdata.wsize = RPCRDMA_MAX_SEGS * PAGE_SIZE; /* RDMA read max */
@@ -375,8 +367,7 @@ xprt_setup_rdma(struct xprt_create *args)
new_xprt = rpcx_to_rdmax(xprt);
- rc = rpcrdma_ia_open(new_xprt, (struct sockaddr *) &cdata.addr,
- xprt_rdma_memreg_strategy);
+ rc = rpcrdma_ia_open(new_xprt, sap, xprt_rdma_memreg_strategy);
if (rc)
goto out1;
@@ -409,7 +400,7 @@ xprt_setup_rdma(struct xprt_create *args)
INIT_DELAYED_WORK(&new_xprt->rx_connect_worker,
xprt_rdma_connect_worker);
- xprt_rdma_format_addresses(xprt);
+ xprt_rdma_format_addresses(xprt, sap);
xprt->max_payload = new_xprt->rx_ia.ri_ops->ro_maxpages(new_xprt);
if (xprt->max_payload == 0)
goto out4;
@@ -420,6 +411,9 @@ xprt_setup_rdma(struct xprt_create *args)
if (!try_module_get(THIS_MODULE))
goto out4;
+ dprintk("RPC: %s: %s:%s\n", __func__,
+ xprt->address_strings[RPC_DISPLAY_ADDR],
+ xprt->address_strings[RPC_DISPLAY_PORT]);
return xprt;
out4:
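
For reference, the family-agnostic port test above relies on a generic
sunrpc helper rather than casting to sockaddr_in. Below is a minimal
sketch of such a helper; it illustrates the idea only, and the in-tree
rpc_get_port() may differ in detail.

/* Sketch only: return the port in host byte order for either address
 * family, or zero if the address is unbound. Assumes <linux/in.h> and
 * <linux/in6.h> are available.
 */
static unsigned int example_get_port(const struct sockaddr *sap)
{
	switch (sap->sa_family) {
	case AF_INET:
		return ntohs(((const struct sockaddr_in *)sap)->sin_port);
	case AF_INET6:
		return ntohs(((const struct sockaddr_in6 *)sap)->sin6_port);
	}
	return 0;
}

This is what lets "if (rpc_get_port(sap)) xprt_set_bound(xprt);" work
for both IPv4 and IPv6 connections, which is the point of the patch.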
The point of larger rsize and wsize is to reduce the per-byte cost
of memory registration and deregistration. Modern HCAs can typically
handle a megabyte or more with a single registration operation.
Signed-off-by: Chuck Lever <[email protected]>
---
net/sunrpc/xprtrdma/xprt_rdma.h | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index f49dd8b..abee472 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -165,8 +165,7 @@ rdmab_to_msg(struct rpcrdma_regbuf *rb)
* struct rpcrdma_buffer. N is the max number of outstanding requests.
*/
-/* temporary static scatter/gather max */
-#define RPCRDMA_MAX_DATA_SEGS (64) /* max scatter/gather */
+#define RPCRDMA_MAX_DATA_SEGS ((1 * 1024 * 1024) / PAGE_SIZE)
#define RPCRDMA_MAX_SEGS (RPCRDMA_MAX_DATA_SEGS + 2) /* head+tail = 2 */
struct rpcrdma_buffer;
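
To make the new limit concrete, here is the arithmetic for a 4 KiB page
size (illustrative only; the real value is computed from PAGE_SIZE at
build time):

/* Illustration, assuming PAGE_SIZE == 4096: */
#define EXAMPLE_MAX_DATA_SEGS	((1 * 1024 * 1024) / 4096)	/* 256 data segments */
#define EXAMPLE_MAX_SEGS	(EXAMPLE_MAX_DATA_SEGS + 2)	/* 258, incl. head + tail */
/* 256 segments x 4096 bytes = 1 MiB, matching the new maximum r/wsize. */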
In preparation for similar increases on NFS/RDMA servers, bump the
advertised credit limit for RPC/RDMA to 128. This allocates some
extra resources, but the client will continue to allow only the
number of RPCs in flight that the server requests via its advertised
credit limit.
Signed-off-by: Chuck Lever <[email protected]>
---
include/linux/sunrpc/xprtrdma.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/include/linux/sunrpc/xprtrdma.h b/include/linux/sunrpc/xprtrdma.h
index b176130..b7b279b 100644
--- a/include/linux/sunrpc/xprtrdma.h
+++ b/include/linux/sunrpc/xprtrdma.h
@@ -49,7 +49,7 @@
* a single chunk type per message is supported currently.
*/
#define RPCRDMA_MIN_SLOT_TABLE (2U)
-#define RPCRDMA_DEF_SLOT_TABLE (32U)
+#define RPCRDMA_DEF_SLOT_TABLE (128U)
#define RPCRDMA_MAX_SLOT_TABLE (256U)
#define RPCRDMA_DEF_INLINE (1024) /* default inline max */
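
As the description says, the larger slot table only raises the ceiling;
the server's advertised credit limit still governs how many RPCs are
actually in flight. A rough illustration (not the transport's real
code) of the effective limit:

/* Illustration only: concurrency is bounded by both the local slot
 * table and the credit limit advertised by the server in its most
 * recent RPC/RDMA reply header.
 */
static unsigned int example_effective_credits(unsigned int advertised)
{
	unsigned int local_max = 128;	/* RPCRDMA_DEF_SLOT_TABLE after this patch */

	return advertised < local_max ? advertised : local_max;
}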
All HCA providers have an ib_get_dma_mr() verb. Thus
rpcrdma_ia_open() will either grab the device's local_dma_lkey if one
is available, or it will call ib_get_dma_mr() which is a 100%
guaranteed fallback. There is never any need to use the
ib_reg_phys_mr() code path in rpcrdma_register_internal(), so it can
be removed.
The remaining logic in rpcrdma_{de}register_internal() is folded
into rpcrdma_{alloc,free}_regbuf().
Signed-off-by: Chuck Lever <[email protected]>
---
net/sunrpc/xprtrdma/verbs.c | 102 ++++++++-------------------------------
net/sunrpc/xprtrdma/xprt_rdma.h | 1
2 files changed, 21 insertions(+), 82 deletions(-)
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 891c4ed..cdf5220 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -1229,75 +1229,6 @@ rpcrdma_mapping_error(struct rpcrdma_mr_seg *seg)
(unsigned long long)seg->mr_dma, seg->mr_dmalen);
}
-static int
-rpcrdma_register_internal(struct rpcrdma_ia *ia, void *va, int len,
- struct ib_mr **mrp, struct ib_sge *iov)
-{
- struct ib_phys_buf ipb;
- struct ib_mr *mr;
- int rc;
-
- /*
- * All memory passed here was kmalloc'ed, therefore phys-contiguous.
- */
- iov->addr = ib_dma_map_single(ia->ri_device,
- va, len, DMA_BIDIRECTIONAL);
- if (ib_dma_mapping_error(ia->ri_device, iov->addr))
- return -ENOMEM;
-
- iov->length = len;
-
- if (ia->ri_have_dma_lkey) {
- *mrp = NULL;
- iov->lkey = ia->ri_dma_lkey;
- return 0;
- } else if (ia->ri_bind_mem != NULL) {
- *mrp = NULL;
- iov->lkey = ia->ri_bind_mem->lkey;
- return 0;
- }
-
- ipb.addr = iov->addr;
- ipb.size = iov->length;
- mr = ib_reg_phys_mr(ia->ri_pd, &ipb, 1,
- IB_ACCESS_LOCAL_WRITE, &iov->addr);
-
- dprintk("RPC: %s: phys convert: 0x%llx "
- "registered 0x%llx length %d\n",
- __func__, (unsigned long long)ipb.addr,
- (unsigned long long)iov->addr, len);
-
- if (IS_ERR(mr)) {
- *mrp = NULL;
- rc = PTR_ERR(mr);
- dprintk("RPC: %s: failed with %i\n", __func__, rc);
- } else {
- *mrp = mr;
- iov->lkey = mr->lkey;
- rc = 0;
- }
-
- return rc;
-}
-
-static int
-rpcrdma_deregister_internal(struct rpcrdma_ia *ia,
- struct ib_mr *mr, struct ib_sge *iov)
-{
- int rc;
-
- ib_dma_unmap_single(ia->ri_device,
- iov->addr, iov->length, DMA_BIDIRECTIONAL);
-
- if (NULL == mr)
- return 0;
-
- rc = ib_dereg_mr(mr);
- if (rc)
- dprintk("RPC: %s: ib_dereg_mr failed %i\n", __func__, rc);
- return rc;
-}
-
/**
* rpcrdma_alloc_regbuf - kmalloc and register memory for SEND/RECV buffers
* @ia: controlling rpcrdma_ia
@@ -1317,26 +1248,30 @@ struct rpcrdma_regbuf *
rpcrdma_alloc_regbuf(struct rpcrdma_ia *ia, size_t size, gfp_t flags)
{
struct rpcrdma_regbuf *rb;
- int rc;
+ struct ib_sge *iov;
- rc = -ENOMEM;
rb = kmalloc(sizeof(*rb) + size, flags);
if (rb == NULL)
goto out;
- rb->rg_size = size;
- rb->rg_owner = NULL;
- rc = rpcrdma_register_internal(ia, rb->rg_base, size,
- &rb->rg_mr, &rb->rg_iov);
- if (rc)
+ iov = &rb->rg_iov;
+ iov->addr = ib_dma_map_single(ia->ri_device,
+ (void *)rb->rg_base, size,
+ DMA_BIDIRECTIONAL);
+ if (ib_dma_mapping_error(ia->ri_device, iov->addr))
goto out_free;
+ iov->length = size;
+ iov->lkey = ia->ri_have_dma_lkey ?
+ ia->ri_dma_lkey : ia->ri_bind_mem->lkey;
+ rb->rg_size = size;
+ rb->rg_owner = NULL;
return rb;
out_free:
kfree(rb);
out:
- return ERR_PTR(rc);
+ return ERR_PTR(-ENOMEM);
}
/**
@@ -1347,10 +1282,15 @@ out:
void
rpcrdma_free_regbuf(struct rpcrdma_ia *ia, struct rpcrdma_regbuf *rb)
{
- if (rb) {
- rpcrdma_deregister_internal(ia, rb->rg_mr, &rb->rg_iov);
- kfree(rb);
- }
+ struct ib_sge *iov;
+
+ if (!rb)
+ return;
+
+ iov = &rb->rg_iov;
+ ib_dma_unmap_single(ia->ri_device,
+ iov->addr, iov->length, DMA_BIDIRECTIONAL);
+ kfree(rb);
}
/*
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index abee472..ce4e79e 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -119,7 +119,6 @@ struct rpcrdma_ep {
struct rpcrdma_regbuf {
size_t rg_size;
struct rpcrdma_req *rg_owner;
- struct ib_mr *rg_mr;
struct ib_sge rg_iov;
__be32 rg_base[0] __attribute__ ((aligned(256)));
};
When marshaling RPC/RDMA requests, ensure the combined size of
RPC/RDMA header and RPC header do not exceed the inline threshold.
Endpoints typically reject RPC/RDMA messages that exceed the size
of their receive buffers.
Signed-off-by: Chuck Lever <[email protected]>
---
net/sunrpc/xprtrdma/rpc_rdma.c | 29 +++++++++++++++++++++++++++--
1 file changed, 27 insertions(+), 2 deletions(-)
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index 84ea37d..8cf9402 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -71,6 +71,31 @@ static const char transfertypes[][12] = {
};
#endif
+/* The client can send a request inline as long as the RPCRDMA header
+ * plus the RPC call fit under the transport's inline limit. If the
+ * combined call message size exceeds that limit, the client must use
+ * the read chunk list for this operation.
+ */
+static bool rpcrdma_args_inline(struct rpc_rqst *rqst)
+{
+ unsigned int callsize = RPCRDMA_HDRLEN_MIN + rqst->rq_snd_buf.len;
+
+ return callsize <= RPCRDMA_INLINE_WRITE_THRESHOLD(rqst);
+}
+
+/* The client can’t know how large the actual reply will be. Thus it
+ * plans for the largest possible reply for that particular ULP
+ * operation. If the maximum combined reply message size exceeds that
+ * limit, the client must provide a write list or a reply chunk for
+ * this request.
+ */
+static bool rpcrdma_results_inline(struct rpc_rqst *rqst)
+{
+ unsigned int repsize = RPCRDMA_HDRLEN_MIN + rqst->rq_rcv_buf.buflen;
+
+ return repsize <= RPCRDMA_INLINE_READ_THRESHOLD(rqst);
+}
+
/*
* Chunk assembly from upper layer xdr_buf.
*
@@ -418,7 +443,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
* a READ, then use write chunks to separate the file data
* into pages; otherwise use reply chunks.
*/
- if (rqst->rq_rcv_buf.buflen <= RPCRDMA_INLINE_READ_THRESHOLD(rqst))
+ if (rpcrdma_results_inline(rqst))
wtype = rpcrdma_noch;
else if (rqst->rq_rcv_buf.page_len == 0)
wtype = rpcrdma_replych;
@@ -441,7 +466,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
* implies the op is a write.
* TBD check NFSv4 setacl
*/
- if (rqst->rq_snd_buf.len <= RPCRDMA_INLINE_WRITE_THRESHOLD(rqst))
+ if (rpcrdma_args_inline(rqst))
rtype = rpcrdma_noch;
else if (rqst->rq_snd_buf.page_len == 0)
rtype = rpcrdma_areadch;
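
A worked example of the new accounting, assuming the default 1024-byte
inline threshold and a 28-byte minimal RPC/RDMA header (the exact
header size is an assumption here):

/* Worked example (assumed values):
 *
 *	28 (RPC/RDMA header) + rq_snd_buf.len <= 1024
 *	=> rq_snd_buf.len <= 996
 *
 * A call message of 1000 bytes would have passed the old test
 * (1000 <= 1024) but is now correctly forced to use a read chunk
 * list, since 28 + 1000 exceeds the receiver's inline buffer.
 */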
The client has been setting up a reply chunk for NFS READs that are
smaller than the inline threshold. This is not efficient: both the
server and client CPUs have to copy the reply's data payload into
and out of the memory region that is then transferred via RDMA.
Using the write list, the data payload is moved by the device and no
extra data copying is necessary.
Signed-off-by: Chuck Lever <[email protected]>
---
net/sunrpc/xprtrdma/rpc_rdma.c | 21 ++++-----------------
1 file changed, 4 insertions(+), 17 deletions(-)
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index 8cf9402..e569da4 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -427,28 +427,15 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
/*
* Chunks needed for results?
*
+ * o Read ops return data as write chunk(s), header as inline.
* o If the expected result is under the inline threshold, all ops
* return as inline (but see later).
* o Large non-read ops return as a single reply chunk.
- * o Large read ops return data as write chunk(s), header as inline.
- *
- * Note: the NFS code sending down multiple result segments implies
- * the op is one of read, readdir[plus], readlink or NFSv4 getacl.
- */
-
- /*
- * This code can handle read chunks, write chunks OR reply
- * chunks -- only one type. If the request is too big to fit
- * inline, then we will choose read chunks. If the request is
- * a READ, then use write chunks to separate the file data
- * into pages; otherwise use reply chunks.
*/
- if (rpcrdma_results_inline(rqst))
- wtype = rpcrdma_noch;
- else if (rqst->rq_rcv_buf.page_len == 0)
- wtype = rpcrdma_replych;
- else if (rqst->rq_rcv_buf.flags & XDRBUF_READ)
+ if (rqst->rq_rcv_buf.flags & XDRBUF_READ)
wtype = rpcrdma_writech;
+ else if (rpcrdma_results_inline(rqst))
+ wtype = rpcrdma_noch;
else
wtype = rpcrdma_replych;
Currently Linux always offers a reply chunk, even for small replies
(unless a read or write list is needed for the RPC operation).
A comment in rpcrdma_marshal_req() reads:
> Currently we try to not actually use read inline.
> Reply chunks have the desirable property that
> they land, packed, directly in the target buffers
> without headers, so they require no fixup. The
> additional RDMA Write op sends the same amount
> of data, streams on-the-wire and adds no overhead
> on receive. Therefore, we request a reply chunk
> for non-writes wherever feasible and efficient.
This considers only the network bandwidth cost of sending the RPC
reply. For replies which are only a few dozen bytes, this is
typically not a good trade-off.
If the server chooses to return the reply inline:
- The client has registered and invalidated a memory region to
catch the reply, which is then not used
If the server chooses to use the reply chunk:
- The server sends a few bytes using a heavyweight RDMA WRITE
operation. The entire RPC reply is conveyed in two RDMA
operations (WRITE_ONLY, SEND) instead of one.
Note that both the server and client have to prepare or copy the
reply data anyway to construct these replies. There's no benefit to
using an RDMA transfer since the host CPU has to be involved.
Signed-off-by: Chuck Lever <[email protected]>
---
net/sunrpc/xprtrdma/rpc_rdma.c | 14 +-------------
1 file changed, 1 insertion(+), 13 deletions(-)
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index e569da4..8ac1448c 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -429,7 +429,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
*
* o Read ops return data as write chunk(s), header as inline.
* o If the expected result is under the inline threshold, all ops
- * return as inline (but see later).
+ * return as inline.
* o Large non-read ops return as a single reply chunk.
*/
if (rqst->rq_rcv_buf.flags & XDRBUF_READ)
@@ -503,18 +503,6 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
headerp->rm_body.rm_nochunks.rm_empty[2] = xdr_zero;
/* new length after pullup */
rpclen = rqst->rq_svec[0].iov_len;
- /*
- * Currently we try to not actually use read inline.
- * Reply chunks have the desirable property that
- * they land, packed, directly in the target buffers
- * without headers, so they require no fixup. The
- * additional RDMA Write op sends the same amount
- * of data, streams on-the-wire and adds no overhead
- * on receive. Therefore, we request a reply chunk
- * for non-writes wherever feasible and efficient.
- */
- if (wtype == rpcrdma_noch)
- wtype = rpcrdma_replych;
}
}
Currently xprtrdma appends an extra chunk element to the RPC/RDMA
read chunk list of each NFSv4 WRITE compound. The extra element
contains the final GETATTR operation in the compound.
The result is an extra RDMA READ operation to transfer a very short
piece of each NFS WRITE compound (typically 16 bytes). This is
inefficient.
It is also incorrect. Although RFC 5667 is not precise about when
a read list may be used with an NFSv4 COMPOUND, the intent is that
only data arguments not touched by the NFS client are to be sent
using RDMA READ or WRITE. The NFS client constructs GETATTR
arguments itself, and therefore is required to send the trailing
GETATTR operation as additional inline content, not as a data
payload.
NB: This change is not backwards compatible. Some older servers do
not accept inline content following the read list. The Linux NFS
server should handle this content correctly as of commit
a97c331f9aa9 ("svcrdma: Handle additional inline content").
Signed-off-by: Chuck Lever <[email protected]>
---
net/sunrpc/xprtrdma/rpc_rdma.c | 43 +++++++++++++++++++++++++++++++++++++++-
1 file changed, 42 insertions(+), 1 deletion(-)
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index 8ac1448c..cb05233 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -96,6 +96,42 @@ static bool rpcrdma_results_inline(struct rpc_rqst *rqst)
return repsize <= RPCRDMA_INLINE_READ_THRESHOLD(rqst);
}
+static int
+rpcrdma_tail_pullup(struct xdr_buf *buf)
+{
+ size_t tlen = buf->tail[0].iov_len;
+ size_t skip = tlen & 3;
+
+ /* Do not include the tail if it is only an XDR pad */
+ if (tlen < 4)
+ return 0;
+
+ /* xdr_write_pages() adds a pad at the beginning of the tail
+ * if the content in "buf->pages" is unaligned. Force the
+ * tail's actual content to land at the next XDR position
+ * after the head instead.
+ */
+ if (skip) {
+ unsigned char *src, *dst;
+ unsigned int count;
+
+ src = buf->tail[0].iov_base;
+ dst = buf->head[0].iov_base;
+ dst += buf->head[0].iov_len;
+
+ src += skip;
+ tlen -= skip;
+
+ dprintk("RPC: %s: skip=%zu, memmove(%p, %p, %zu)\n",
+ __func__, skip, dst, src, tlen);
+
+ for (count = tlen; count; count--)
+ *dst++ = *src++;
+ }
+
+ return tlen;
+}
+
/*
* Chunk assembly from upper layer xdr_buf.
*
@@ -147,6 +183,10 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos,
if (len && n == nsegs)
return -EIO;
+ /* When encoding the read list, the tail is always sent inline */
+ if (type == rpcrdma_readch)
+ return n;
+
if (xdrbuf->tail[0].iov_len) {
/* the rpcrdma protocol allows us to omit any trailing
* xdr pad bytes, saving the server an RDMA operation. */
@@ -504,7 +544,8 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
/* new length after pullup */
rpclen = rqst->rq_svec[0].iov_len;
}
- }
+ } else if (rtype == rpcrdma_readch)
+ rpclen += rpcrdma_tail_pullup(&rqst->rq_snd_buf);
if (rtype != rpcrdma_noch) {
hdrlen = rpcrdma_create_chunks(rqst, &rqst->rq_snd_buf,
Only the RPC/RDMA header is sent when making an RDMA_NOMSG call.
That header resides in the first element of the iovec array
passed to rpcrdma_ep_post().
Instead of special casing the iovec element with the pad, just
sync all the elements in the send iovec. Syncing the zero pad is
not strictly necessary, but the pad is rarely if ever used these
days, and the extra cost in that case is small.
Signed-off-by: Chuck Lever <[email protected]>
---
net/sunrpc/xprtrdma/rpc_rdma.c | 4 ++++
net/sunrpc/xprtrdma/verbs.c | 27 +++++++++++----------------
net/sunrpc/xprtrdma/xprt_rdma.h | 18 ++++++++++--------
3 files changed, 25 insertions(+), 24 deletions(-)
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index cb05233..2e721f2 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -575,6 +575,10 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
req->rl_send_iov[0].length = hdrlen;
req->rl_send_iov[0].lkey = rdmab_lkey(req->rl_rdmabuf);
+ req->rl_niovs = 1;
+ if (rtype == rpcrdma_areadch)
+ return 0;
+
req->rl_send_iov[1].addr = rdmab_addr(req->rl_sendbuf);
req->rl_send_iov[1].length = rpclen;
req->rl_send_iov[1].lkey = rdmab_lkey(req->rl_sendbuf);
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index cdf5220..9199436 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -651,7 +651,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
if (rc)
return rc;
ep->rep_attr.cap.max_recv_wr = cdata->max_requests;
- ep->rep_attr.cap.max_send_sge = (cdata->padding ? 4 : 2);
+ ep->rep_attr.cap.max_send_sge = RPCRDMA_MAX_IOVS;
ep->rep_attr.cap.max_recv_sge = 1;
ep->rep_attr.cap.max_inline_data = 0;
ep->rep_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
@@ -1303,9 +1303,11 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia,
struct rpcrdma_ep *ep,
struct rpcrdma_req *req)
{
+ struct ib_device *device = ia->ri_device;
struct ib_send_wr send_wr, *send_wr_fail;
struct rpcrdma_rep *rep = req->rl_reply;
- int rc;
+ struct ib_sge *iov = req->rl_send_iov;
+ int i, rc;
if (rep) {
rc = rpcrdma_ep_post_recv(ia, ep, rep);
@@ -1316,22 +1318,15 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia,
send_wr.next = NULL;
send_wr.wr_id = RPCRDMA_IGNORE_COMPLETION;
- send_wr.sg_list = req->rl_send_iov;
+ send_wr.sg_list = iov;
send_wr.num_sge = req->rl_niovs;
send_wr.opcode = IB_WR_SEND;
- if (send_wr.num_sge == 4) /* no need to sync any pad (constant) */
- ib_dma_sync_single_for_device(ia->ri_device,
- req->rl_send_iov[3].addr,
- req->rl_send_iov[3].length,
- DMA_TO_DEVICE);
- ib_dma_sync_single_for_device(ia->ri_device,
- req->rl_send_iov[1].addr,
- req->rl_send_iov[1].length,
- DMA_TO_DEVICE);
- ib_dma_sync_single_for_device(ia->ri_device,
- req->rl_send_iov[0].addr,
- req->rl_send_iov[0].length,
- DMA_TO_DEVICE);
+
+ for (i = 0; i < send_wr.num_sge; i++)
+ ib_dma_sync_single_for_device(device, iov[i].addr,
+ iov[i].length, DMA_TO_DEVICE);
+ dprintk("RPC: %s: posting %d s/g entries\n",
+ __func__, send_wr.num_sge);
if (DECR_CQCOUNT(ep) > 0)
send_wr.send_flags = 0;
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index ce4e79e..90da480 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -256,16 +256,18 @@ struct rpcrdma_mr_seg { /* chunk descriptors */
char *mr_offset; /* kva if no page, else offset */
};
+#define RPCRDMA_MAX_IOVS (4)
+
struct rpcrdma_req {
- unsigned int rl_niovs; /* 0, 2 or 4 */
- unsigned int rl_nchunks; /* non-zero if chunks */
- unsigned int rl_connect_cookie; /* retry detection */
- struct rpcrdma_buffer *rl_buffer; /* home base for this structure */
+ unsigned int rl_niovs;
+ unsigned int rl_nchunks;
+ unsigned int rl_connect_cookie;
+ struct rpcrdma_buffer *rl_buffer;
struct rpcrdma_rep *rl_reply;/* holder for reply buffer */
- struct ib_sge rl_send_iov[4]; /* for active requests */
- struct rpcrdma_regbuf *rl_rdmabuf;
- struct rpcrdma_regbuf *rl_sendbuf;
- struct rpcrdma_mr_seg rl_segments[RPCRDMA_MAX_SEGS];
+ struct ib_sge rl_send_iov[RPCRDMA_MAX_IOVS];
+ struct rpcrdma_regbuf *rl_rdmabuf;
+ struct rpcrdma_regbuf *rl_sendbuf;
+ struct rpcrdma_mr_seg rl_segments[RPCRDMA_MAX_SEGS];
};
static inline struct rpcrdma_req *
Repair how rpcrdma_marshal_req() chooses which RDMA message type
to use for large non-WRITE operations so that it picks RDMA_NOMSG
in the correct situations, and sets up the marshaling logic to
SEND only the RPC/RDMA header.
Large NFSv2 SYMLINK requests now use RDMA_NOMSG calls. The Linux NFS
server XDR decoder for NFSv2 SYMLINK does not handle having the
pathname argument arrive in a separate buffer. The decoder could be
fixed, but this is simpler and RDMA_NOMSG can be used in a variety
of other situations.
Ensure that the Linux client continues to use "RDMA_MSG + read
list" when sending large NFSv3 SYMLINK requests, which is more
efficient than using RDMA_NOMSG.
Large NFSv4 CREATE(NF4LNK) requests are changed to use "RDMA_MSG +
read list" just like NFSv3 (see Section 5 of RFC 5667). Before,
these did not work at all.
Signed-off-by: Chuck Lever <[email protected]>
---
fs/nfs/nfs3xdr.c | 1 +
fs/nfs/nfs4xdr.c | 1 +
net/sunrpc/xprtrdma/rpc_rdma.c | 21 ++++++++++++---------
3 files changed, 14 insertions(+), 9 deletions(-)
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index 9b04c2e..267126d 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -1103,6 +1103,7 @@ static void nfs3_xdr_enc_symlink3args(struct rpc_rqst *req,
{
encode_diropargs3(xdr, args->fromfh, args->fromname, args->fromlen);
encode_symlinkdata3(xdr, args);
+ xdr->buf->flags |= XDRBUF_WRITE;
}
/*
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 558cd65d..03a20ec 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -1155,6 +1155,7 @@ static void encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg *
p = reserve_space(xdr, 4);
*p = cpu_to_be32(create->u.symlink.len);
xdr_write_pages(xdr, create->u.symlink.pages, 0, create->u.symlink.len);
+ xdr->buf->flags |= XDRBUF_WRITE;
break;
case NF4BLK: case NF4CHR:
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index 2e721f2..64fc4b4 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -484,21 +484,24 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
*
* o If the total request is under the inline threshold, all ops
* are sent as inline.
- * o Large non-write ops are sent with the entire message as a
- * single read chunk (protocol 0-position special case).
* o Large write ops transmit data as read chunk(s), header as
* inline.
+ * o Large non-write ops are sent with the entire message as a
+ * single read chunk (protocol 0-position special case).
*
- * Note: the NFS code sending down multiple argument segments
- * implies the op is a write.
- * TBD check NFSv4 setacl
+ * This assumes that the upper layer does not present a request
+ * that both has a data payload, and whose non-data arguments
+ * by themselves are larger than the inline threshold.
*/
- if (rpcrdma_args_inline(rqst))
+ if (rpcrdma_args_inline(rqst)) {
rtype = rpcrdma_noch;
- else if (rqst->rq_snd_buf.page_len == 0)
- rtype = rpcrdma_areadch;
- else
+ } else if (rqst->rq_snd_buf.flags & XDRBUF_WRITE) {
rtype = rpcrdma_readch;
+ } else {
+ headerp->rm_type = htonl(RDMA_NOMSG);
+ rtype = rpcrdma_areadch;
+ rpclen = 0;
+ }
/* The following simplification is not true forever */
if (rtype != rpcrdma_noch && wtype == rpcrdma_replych)
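
Putting the commit description and the new tests together, the request
marshaling choices now work out roughly as follows (an illustrative
summary, not code from the patch):

/* Illustrative summary of the new rtype selection:
 *
 *	call fits inline		-> rpcrdma_noch (RDMA_MSG, no chunks)
 *	large call, XDRBUF_WRITE set	-> rpcrdma_readch (RDMA_MSG + read list;
 *					   NFSv3 SYMLINK, NFSv4 CREATE(NF4LNK),
 *					   NFS WRITE data payloads)
 *	large call, no data payload	-> rpcrdma_areadch (RDMA_NOMSG; the whole
 *					   message moves as a position-zero read
 *					   chunk, e.g. large NFSv2 SYMLINK)
 */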
checkpatch.pl complained about the seq_printf() format string split
across lines and the use of %Lu.
Signed-off-by: Chuck Lever <[email protected]>
---
net/sunrpc/xprtrdma/transport.c | 48 +++++++++++++++++++--------------------
1 file changed, 23 insertions(+), 25 deletions(-)
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index d737300..0985b2b 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -647,31 +647,29 @@ static void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq)
if (xprt_connected(xprt))
idle_time = (long)(jiffies - xprt->last_used) / HZ;
- seq_printf(seq,
- "\txprt:\trdma %u %lu %lu %lu %ld %lu %lu %lu %Lu %Lu "
- "%lu %lu %lu %Lu %Lu %Lu %Lu %lu %lu %lu\n",
-
- 0, /* need a local port? */
- xprt->stat.bind_count,
- xprt->stat.connect_count,
- xprt->stat.connect_time,
- idle_time,
- xprt->stat.sends,
- xprt->stat.recvs,
- xprt->stat.bad_xids,
- xprt->stat.req_u,
- xprt->stat.bklog_u,
-
- r_xprt->rx_stats.read_chunk_count,
- r_xprt->rx_stats.write_chunk_count,
- r_xprt->rx_stats.reply_chunk_count,
- r_xprt->rx_stats.total_rdma_request,
- r_xprt->rx_stats.total_rdma_reply,
- r_xprt->rx_stats.pullup_copy_count,
- r_xprt->rx_stats.fixup_copy_count,
- r_xprt->rx_stats.hardway_register_count,
- r_xprt->rx_stats.failed_marshal_count,
- r_xprt->rx_stats.bad_reply_count);
+ seq_puts(seq, "\txprt:\trdma ");
+ seq_printf(seq, "%u %lu %lu %lu %ld %lu %lu %lu %llu %llu ",
+ 0, /* need a local port? */
+ xprt->stat.bind_count,
+ xprt->stat.connect_count,
+ xprt->stat.connect_time,
+ idle_time,
+ xprt->stat.sends,
+ xprt->stat.recvs,
+ xprt->stat.bad_xids,
+ xprt->stat.req_u,
+ xprt->stat.bklog_u);
+ seq_printf(seq, "%lu %lu %lu %llu %llu %llu %llu %lu %lu %lu\n",
+ r_xprt->rx_stats.read_chunk_count,
+ r_xprt->rx_stats.write_chunk_count,
+ r_xprt->rx_stats.reply_chunk_count,
+ r_xprt->rx_stats.total_rdma_request,
+ r_xprt->rx_stats.total_rdma_reply,
+ r_xprt->rx_stats.pullup_copy_count,
+ r_xprt->rx_stats.fixup_copy_count,
+ r_xprt->rx_stats.hardway_register_count,
+ r_xprt->rx_stats.failed_marshal_count,
+ r_xprt->rx_stats.bad_reply_count);
}
static int
RDMA_NOMSG type calls are less efficient than RDMA_MSG. Count NOMSG
calls so administrators can tell if they happen to be used more than
expected.
Signed-off-by: Chuck Lever <[email protected]>
---
net/sunrpc/xprtrdma/rpc_rdma.c | 1 +
net/sunrpc/xprtrdma/transport.c | 5 +++--
net/sunrpc/xprtrdma/xprt_rdma.h | 1 +
3 files changed, 5 insertions(+), 2 deletions(-)
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index 64fc4b4..7c8808d 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -498,6 +498,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
} else if (rqst->rq_snd_buf.flags & XDRBUF_WRITE) {
rtype = rpcrdma_readch;
} else {
+ r_xprt->rx_stats.nomsg_call_count++;
headerp->rm_type = htonl(RDMA_NOMSG);
rtype = rpcrdma_areadch;
rpclen = 0;
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index 0985b2b..64443eb 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -659,7 +659,7 @@ static void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq)
xprt->stat.bad_xids,
xprt->stat.req_u,
xprt->stat.bklog_u);
- seq_printf(seq, "%lu %lu %lu %llu %llu %llu %llu %lu %lu %lu\n",
+ seq_printf(seq, "%lu %lu %lu %llu %llu %llu %llu %lu %lu %lu %lu\n",
r_xprt->rx_stats.read_chunk_count,
r_xprt->rx_stats.write_chunk_count,
r_xprt->rx_stats.reply_chunk_count,
@@ -669,7 +669,8 @@ static void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq)
r_xprt->rx_stats.fixup_copy_count,
r_xprt->rx_stats.hardway_register_count,
r_xprt->rx_stats.failed_marshal_count,
- r_xprt->rx_stats.bad_reply_count);
+ r_xprt->rx_stats.bad_reply_count,
+ r_xprt->rx_stats.nomsg_call_count);
}
static int
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index 90da480..566c60b 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -342,6 +342,7 @@ struct rpcrdma_stats {
unsigned long hardway_register_count;
unsigned long failed_marshal_count;
unsigned long bad_reply_count;
+ unsigned long nomsg_call_count;
};
/*
Looks good
Reviewed-By: Devesh Sharma <[email protected]>
On Fri, Jul 10, 2015 at 2:11 AM, Chuck Lever <[email protected]> wrote:
> The point of larger rsize and wsize is to reduce the per-byte cost
> of memory registration and deregistration. Modern HCAs can typically
> handle a megabyte or more with a single registration operation.
>
Increasing the default slot table entries will increase the MR
requirements per mount.
Currently, with 32 as the default, the client ends up allocating 2178
FRMRs (ref: kernel 4.1-rc4) for a single mount. With 128, the FRMR
requirement at startup would be 8448.
8K+ MRs per mount just for start-up, I am a little doubtful about this
change. We can always release-note that "for better performance
increase the slot table entries by echo 128 >
/proc/sys/sunrpc/rdma_slot_table_entries"
-Regards
Devesh
On Fri, Jul 10, 2015 at 2:12 AM, Chuck Lever <[email protected]> wrote:
> In preparation for similar increases on NFS/RDMA servers, bump the
> advertised credit limit for RPC/RDMA to 128. This allocates some
> extra resources, but the client will continue to allow only the
> number of RPCs in flight that the server requests via its advertised
> credit limit.
>
Looks good.
Reviewed-By: Devesh Sharma <[email protected]>
On Fri, Jul 10, 2015 at 2:12 AM, Chuck Lever <[email protected]> wrote:
> All HCA providers have an ib_get_dma_mr() verb. Thus
> rpcrdma_ia_open() will either grab the device's local_dma_lkey if one
> is available, or it will call ib_get_dma_mr() which is a 100%
> guaranteed fallback. There is never any need to use the
> ib_reg_phys_mr() code path in rpcrdma_register_internal(), so it can
> be removed.
>
> The remaining logic in rpcrdma_{de}register_internal() is folded
> into rpcrdma_{alloc,free}_regbuf().
>
Looks good
Reviewed-By: Devesh Sharma <[email protected]>
On Fri, Jul 10, 2015 at 2:12 AM, Chuck Lever <[email protected]> wrote:
> When marshaling RPC/RDMA requests, ensure the combined size of
> RPC/RDMA header and RPC header do not exceed the inline threshold.
> Endpoints typically reject RPC/RDMA messages that exceed the size
> of their receive buffers.
>
Looks good
Reviewed-By: Devesh Sharma <[email protected]>
On Fri, Jul 10, 2015 at 2:12 AM, Chuck Lever <[email protected]> wrote:
> The client has been setting up a reply chunk for NFS READs that are
> smaller than the inline threshold. This is not efficient: both the
> server and client CPUs have to copy the reply's data payload into
> and out of the memory region that is then transferred via RDMA.
>
> Using the write list, the data payload is moved by the device and no
> extra data copying is necessary.
>
we need to honor the max limits of device by checking
dev_attr.max_sge? a vendor may not support 4 sges.
On Fri, Jul 10, 2015 at 2:13 AM, Chuck Lever <[email protected]> wrote:
> Only the RPC/RDMA header is sent when making an RDMA_NOMSG call.
> That header resides in the first element of the iovec array
> passed to rpcrdma_ep_post().
>
> Instead of special casing the iovec element with the pad, just
> sync all the elements in the send iovec. Syncing the zero pad is
> not strictly necessary, but the pad is rarely if ever used these
> days, and the extra cost in that case is small.
>
On 7/10/2015 7:29 AM, Devesh Sharma wrote:
> we need to honor the max limits of device by checking
> dev_attr.max_sge? a vendor may not support 4 sges.
iWARP requires a minimum of 4 send SGEs (draft-hilland-verbs 8.1.3.2)
An RI MUST support at least four Scatter/Gather Elements per
Scatter/Gather List when the Scatter/Gather List refers to the Data
Source of a Send Operation Type or the Data Sink of a Receive
Operation. An RI is NOT REQUIRED to support more than one
Scatter/Gather Element per Scatter/Gather List when the
Scatter/Gather List refers to the Data Source of an RDMA Write.
I'm not certain if IB and RoCE state a similar minimum requirement,
but it seems a very bad idea to have fewer.
On Fri, Jul 10, 2015 at 6:28 PM, Tom Talpey <[email protected]> wrote:
> On 7/10/2015 7:29 AM, Devesh Sharma wrote:
>>
>> we need to honor the max limits of device by checking
>> dev_attr.max_sge? a vendor may not support 4 sges.
>
>
> iWARP requires a minimum of 4 send SGEs (draft-hilland-verbs 8.1.3.2)
>
> An RI MUST support at least four Scatter/Gather Elements per
> Scatter/Gather List when the Scatter/Gather List refers to the Data
> Source of a Send Operation Type or the Data Sink of a Receive
> Operation. An RI is NOT REQUIRED to support more than one
> Scatter/Gather Element per Scatter/Gather List when the
> Scatter/Gather List refers to the Data Source of an RDMA Write.
>
> I'm not certain if IB and RoCE state a similar minimum requirement,
> but it seems a very bad idea to have fewer.
To my knowledge the IBTA spec does not pose any such minimum requirement.
RoCE also does not impose any minimum requirement. I think it is fine if
xprtrdma honors the device limits, thus covering iWARP devices, because
all iWARP devices would support a minimum of 4.
Chuck would correct me if xprtrdma does have any minimum requirements.
>
On Jul 10, 2015, at 6:45 AM, Devesh Sharma <[email protected]> wrote:
> Increasing the default slot table entries will increase the MR
> requirements per mount.
Yes, but:
> Currently, with 32 as the default, the client ends up allocating 2178
> FRMRs (ref: kernel 4.1-rc4) for a single mount. With 128, the FRMR
> requirement at startup would be 8448.
Commit 40c6ed0c8a7f ("xprtrdma: Reduce per-transport MR allocation")
is supposed to address this. This commit is in 4.1.
The number of MRs per credit is now 256 divided by the HCA's
max_fast_reg_page_list_len. See frwr_op_open().
For mlx4 the number of MRs per credit is just 1, for example.
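
To put numbers on that (illustrative arithmetic only, using the formula
above and assuming at least one MR per credit):

/* Example arithmetic, not measured values:
 *
 *	MRs per credit ~= 256 / max_fast_reg_page_list_len, minimum 1
 *
 *	device reporting 256+ pages per fast-reg list: 1 MR/credit
 *		-> 128 credits ~ 128 FRMRs per transport
 *	device reporting 64 pages per fast-reg list:   4 MRs/credit
 *		-> 128 credits ~ 512 FRMRs per transport
 */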
> 8K+ MRs per mount just for start-up, I am a little doubtful about this
> change. We can always release-note that "for better performance
> increase the slot table entries by echo 128 >
> /proc/sys/sunrpc/rdma_slot_table_entries"
>
> -Regards
> Devesh
>
--
Chuck Lever
Yes, we are covered here; my reference was 4.1-rc4, and that series
was pulled in 4.1-rc7.
I will update my test-bench and re-validate the numbers.
-Regards
On Fri, Jul 10, 2015 at 8:03 PM, Chuck Lever <[email protected]> wrote:
>
> On Jul 10, 2015, at 6:45 AM, Devesh Sharma <[email protected]> wrote:
>
>> Increasing the default slot table entries will increase the MR
>> requirements per mount.
>
> Yes, but:
>
>> Currently, with 32 as the default, the client ends up allocating 2178
>> FRMRs (ref: kernel 4.1-rc4) for a single mount. With 128, the FRMR
>> requirement at startup would be 8448.
>
> Commit 40c6ed0c8a7f ("xprtrdma: Reduce per-transport MR allocation”)
> is supposed to address this. This commit is in 4.1.
>
> The number of MRs per credit is now 256 divided by the HCA’s
> max_fast_reg_page_list_len. See frwr_op_open().
>
> For mlx4 the number of MRs per credit is just 1, for example.
>
>
>> 8K+ MRs per mount just for start-up, I am a little doubtful about this
>> change. We can always release-note that "for better performance
>> increase the slot table entries by echo 128 >
>> /proc/sys/sunrpc/rdma_slot_table_entries"
>>
>> -Regards
>> Devesh
>>
On Jul 10, 2015, at 10:11 AM, Devesh Sharma <[email protected]> wrote:
> On Fri, Jul 10, 2015 at 6:28 PM, Tom Talpey <[email protected]> wrote:
>> On 7/10/2015 7:29 AM, Devesh Sharma wrote:
>>>
>>> we need to honor the max limits of device by checking
>>> dev_attr.max_sge? a vendor may not support 4 sges.
>>
>>
>> iWARP requires a minimum of 4 send SGEs (draft-hilland-verbs 8.1.3.2)
>>
>> An RI MUST support at least four Scatter/Gather Elements per
>> Scatter/Gather List when the Scatter/Gather List refers to the Data
>> Source of a Send Operation Type or the Data Sink of a Receive
>> Operation. An RI is NOT REQUIRED to support more than one
>> Scatter/Gather Element per Scatter/Gather List when the
>> Scatter/Gather List refers to the Data Source of an RDMA Write.
>>
>> I'm not certain if IB and RoCE state a similar minimum requirement,
>> but it seems a very bad idea to have fewer.
>
> To my knowledge, the IBTA spec does not impose any such minimum
> requirement, and neither does RoCE. I think it's fine if xprtrdma
> honors the device limits; that covers iWARP devices, because all
> iWARP devices support a minimum of 4.
>
> Chuck can correct me if xprtrdma does have any minimum requirements.
At least 2 SGEs are required for normal operation. The code in
rpcrdma_marshal_req() sets up an iovec for the RPC/RDMA header
and one for the RPC message itself. These are used in
rpcrdma_ep_post() to SEND each request.
Four SGEs are needed only for RDMA_MSGP type requests, but so far
I have never seen one of these used. I wonder if that logic can
be removed.
It is certainly possible to examine the device’s max_sge field
in rpcrdma_ep_create() and fail transport creation if the
device’s max_sge is less than RPC_MAX_IOVS.
--
Chuck Lever
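A minimal sketch of the max_sge check described above, assuming the
4.1-era rpcrdma_ia layout; RPCRDMA_REQUIRED_SGES is a hypothetical name
for whatever minimum xprtrdma settles on:

	/* Sketch only: fail transport creation in rpcrdma_ep_create() when
	 * the device cannot supply enough send SGEs. ia->ri_devattr is
	 * assumed to hold the cached ib_device_attr; RPCRDMA_REQUIRED_SGES
	 * is a placeholder constant.
	 */
	if (ia->ri_devattr.max_sge < RPCRDMA_REQUIRED_SGES) {
		dprintk("RPC:       %s: device supports only %d send SGEs\n",
			__func__, ia->ri_devattr.max_sge);
		return -ENODEV;
	}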
Hi Chuck,
On 07/09/2015 04:41 PM, Chuck Lever wrote:
> The point of larger rsize and wsize is to reduce the per-byte cost
> of memory registration and deregistration. Modern HCAs can typically
> handle a megabyte or more with a single registration operation.
>
> Signed-off-by: Chuck Lever <[email protected]>
> ---
> net/sunrpc/xprtrdma/xprt_rdma.h | 3 +--
> 1 file changed, 1 insertion(+), 2 deletions(-)
>
> diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
> index f49dd8b..abee472 100644
> --- a/net/sunrpc/xprtrdma/xprt_rdma.h
> +++ b/net/sunrpc/xprtrdma/xprt_rdma.h
> @@ -165,8 +165,7 @@ rdmab_to_msg(struct rpcrdma_regbuf *rb)
> * struct rpcrdma_buffer. N is the max number of outstanding requests.
> */
>
> -/* temporary static scatter/gather max */
> -#define RPCRDMA_MAX_DATA_SEGS (64) /* max scatter/gather */
> +#define RPCRDMA_MAX_DATA_SEGS ((1 * 1024 * 1024) / PAGE_SIZE)
^^^
What is the significance of the 1 here?
Thanks,
Anna
> #define RPCRDMA_MAX_SEGS (RPCRDMA_MAX_DATA_SEGS + 2) /* head+tail = 2 */
>
> struct rpcrdma_buffer;
>
On Jul 10, 2015, at 3:21 PM, Anna Schumaker <[email protected]> wrote:
> Hi Chuck,
>
> On 07/09/2015 04:41 PM, Chuck Lever wrote:
>> The point of larger rsize and wsize is to reduce the per-byte cost
>> of memory registration and deregistration. Modern HCAs can typically
>> handle a megabyte or more with a single registration operation.
>>
>> Signed-off-by: Chuck Lever <[email protected]>
>> ---
>> net/sunrpc/xprtrdma/xprt_rdma.h | 3 +--
>> 1 file changed, 1 insertion(+), 2 deletions(-)
>>
>> diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
>> index f49dd8b..abee472 100644
>> --- a/net/sunrpc/xprtrdma/xprt_rdma.h
>> +++ b/net/sunrpc/xprtrdma/xprt_rdma.h
>> @@ -165,8 +165,7 @@ rdmab_to_msg(struct rpcrdma_regbuf *rb)
>> * struct rpcrdma_buffer. N is the max number of outstanding requests.
>> */
>>
>> -/* temporary static scatter/gather max */
>> -#define RPCRDMA_MAX_DATA_SEGS (64) /* max scatter/gather */
>> +#define RPCRDMA_MAX_DATA_SEGS ((1 * 1024 * 1024) / PAGE_SIZE)
> ^^^
> What is the significance of the 1 here?
This echoes the definition of RPCSVC_MAXPAYLOAD.
“1” documents “one” megabyte, I assume.
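For anyone doing the arithmetic, a worked example assuming the common
4KB PAGE_SIZE:

	/* With 4KB pages:
	 *   RPCRDMA_MAX_DATA_SEGS = (1 * 1024 * 1024) / 4096 = 256
	 *   RPCRDMA_MAX_SEGS      = 256 + 2 = 258   (head + tail)
	 * i.e. enough scatter/gather segments to describe a one-megabyte
	 * payload.
	 */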
> Thanks,
> Anna
>
>> #define RPCRDMA_MAX_SEGS (RPCRDMA_MAX_DATA_SEGS + 2) /* head+tail = 2 */
>>
>> struct rpcrdma_buffer;
>>
--
Chuck Lever
On 07/10/2015 03:33 PM, Chuck Lever wrote:
>
> On Jul 10, 2015, at 3:21 PM, Anna Schumaker <[email protected]> wrote:
>
>> Hi Chuck,
>>
>> On 07/09/2015 04:41 PM, Chuck Lever wrote:
>>> The point of larger rsize and wsize is to reduce the per-byte cost
>>> of memory registration and deregistration. Modern HCAs can typically
>>> handle a megabyte or more with a single registration operation.
>>>
>>> Signed-off-by: Chuck Lever <[email protected]>
>>> ---
>>> net/sunrpc/xprtrdma/xprt_rdma.h | 3 +--
>>> 1 file changed, 1 insertion(+), 2 deletions(-)
>>>
>>> diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
>>> index f49dd8b..abee472 100644
>>> --- a/net/sunrpc/xprtrdma/xprt_rdma.h
>>> +++ b/net/sunrpc/xprtrdma/xprt_rdma.h
>>> @@ -165,8 +165,7 @@ rdmab_to_msg(struct rpcrdma_regbuf *rb)
>>> * struct rpcrdma_buffer. N is the max number of outstanding requests.
>>> */
>>>
>>> -/* temporary static scatter/gather max */
>>> -#define RPCRDMA_MAX_DATA_SEGS (64) /* max scatter/gather */
>>> +#define RPCRDMA_MAX_DATA_SEGS ((1 * 1024 * 1024) / PAGE_SIZE)
>> ^^^
>> What is the significance of the 1 here?
>
> This echoes the definition of RPCSVC_MAXPAYLOAD.
>
> “1” documents “one” megabyte, I assume.
Makes sense. Thanks!
>
>
>> Thanks,
>> Anna
>>
>>> #define RPCRDMA_MAX_SEGS (RPCRDMA_MAX_DATA_SEGS + 2) /* head+tail = 2 */
>>>
>>> struct rpcrdma_buffer;
>>>
>
> --
> Chuck Lever
>
>
>
Hi Chuck,
On 07/09/2015 04:42 PM, Chuck Lever wrote:
> When marshaling RPC/RDMA requests, ensure the combined size of
> RPC/RDMA header and RPC header do not exceed the inline threshold.
> Endpoints typically reject RPC/RDMA messages that exceed the size
> of their receive buffers.
>
> Signed-off-by: Chuck Lever <[email protected]>
> ---
> net/sunrpc/xprtrdma/rpc_rdma.c | 29 +++++++++++++++++++++++++++--
> 1 file changed, 27 insertions(+), 2 deletions(-)
>
> diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
> index 84ea37d..8cf9402 100644
> --- a/net/sunrpc/xprtrdma/rpc_rdma.c
> +++ b/net/sunrpc/xprtrdma/rpc_rdma.c
> @@ -71,6 +71,31 @@ static const char transfertypes[][12] = {
> };
> #endif
>
> +/* The client can send a request inline as long as the RPCRDMA header
> + * plus the RPC call fit under the transport's inline limit. If the
> + * combined call message size exceeds that limit, the client must use
> + * the read chunk list for this operation.
> + */
> +static bool rpcrdma_args_inline(struct rpc_rqst *rqst)
> +{
> + unsigned int callsize = RPCRDMA_HDRLEN_MIN + rqst->rq_snd_buf.len;
> +
> + return callsize <= RPCRDMA_INLINE_WRITE_THRESHOLD(rqst);
> +}
> +
> +/* The client can’t know how large the actual reply will be. Thus it
^^^^^
This is showing up as "can<80><99>t" in git-show, leading me to think that the apostrophe was replaced with a unicode-apostrophe. Google might have made the switch, but can you double check the patch on your side just in case?
Thanks,
Anna
> + * plans for the largest possible reply for that particular ULP
> + * operation. If the maximum combined reply message size exceeds that
> + * limit, the client must provide a write list or a reply chunk for
> + * this request.
> + */
> +static bool rpcrdma_results_inline(struct rpc_rqst *rqst)
> +{
> + unsigned int repsize = RPCRDMA_HDRLEN_MIN + rqst->rq_rcv_buf.buflen;
> +
> + return repsize <= RPCRDMA_INLINE_READ_THRESHOLD(rqst);
> +}
> +
> /*
> * Chunk assembly from upper layer xdr_buf.
> *
> @@ -418,7 +443,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
> * a READ, then use write chunks to separate the file data
> * into pages; otherwise use reply chunks.
> */
> - if (rqst->rq_rcv_buf.buflen <= RPCRDMA_INLINE_READ_THRESHOLD(rqst))
> + if (rpcrdma_results_inline(rqst))
> wtype = rpcrdma_noch;
> else if (rqst->rq_rcv_buf.page_len == 0)
> wtype = rpcrdma_replych;
> @@ -441,7 +466,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
> * implies the op is a write.
> * TBD check NFSv4 setacl
> */
> - if (rqst->rq_snd_buf.len <= RPCRDMA_INLINE_WRITE_THRESHOLD(rqst))
> + if (rpcrdma_args_inline(rqst))
> rtype = rpcrdma_noch;
> else if (rqst->rq_snd_buf.page_len == 0)
> rtype = rpcrdma_areadch;
>
On Jul 10, 2015, at 4:08 PM, Anna Schumaker <[email protected]> wrote:
> Hi Chuck,
>
> On 07/09/2015 04:42 PM, Chuck Lever wrote:
>> When marshaling RPC/RDMA requests, ensure the combined size of
>> RPC/RDMA header and RPC header do not exceed the inline threshold.
>> Endpoints typically reject RPC/RDMA messages that exceed the size
>> of their receive buffers.
>>
>> Signed-off-by: Chuck Lever <[email protected]>
>> ---
>> net/sunrpc/xprtrdma/rpc_rdma.c | 29 +++++++++++++++++++++++++++--
>> 1 file changed, 27 insertions(+), 2 deletions(-)
>>
>> diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
>> index 84ea37d..8cf9402 100644
>> --- a/net/sunrpc/xprtrdma/rpc_rdma.c
>> +++ b/net/sunrpc/xprtrdma/rpc_rdma.c
>> @@ -71,6 +71,31 @@ static const char transfertypes[][12] = {
>> };
>> #endif
>>
>> +/* The client can send a request inline as long as the RPCRDMA header
>> + * plus the RPC call fit under the transport's inline limit. If the
>> + * combined call message size exceeds that limit, the client must use
>> + * the read chunk list for this operation.
>> + */
>> +static bool rpcrdma_args_inline(struct rpc_rqst *rqst)
>> +{
>> + unsigned int callsize = RPCRDMA_HDRLEN_MIN + rqst->rq_snd_buf.len;
>> +
>> + return callsize <= RPCRDMA_INLINE_WRITE_THRESHOLD(rqst);
>> +}
>> +
>> +/* The client can?t know how large the actual reply will be. Thus it
> ^^^^^
> This is showing up as "can<80><99>t" in git-show, leading me to think that the apostrophe was replaced with a unicode-apostrophe. Google might have made the switch, but can you double check the patch on your side just in case?
Fixed.
>
> Thanks,
> Anna
>
>> + * plans for the largest possible reply for that particular ULP
>> + * operation. If the maximum combined reply message size exceeds that
>> + * limit, the client must provide a write list or a reply chunk for
>> + * this request.
>> + */
>> +static bool rpcrdma_results_inline(struct rpc_rqst *rqst)
>> +{
>> + unsigned int repsize = RPCRDMA_HDRLEN_MIN + rqst->rq_rcv_buf.buflen;
>> +
>> + return repsize <= RPCRDMA_INLINE_READ_THRESHOLD(rqst);
>> +}
>> +
>> /*
>> * Chunk assembly from upper layer xdr_buf.
>> *
>> @@ -418,7 +443,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
>> * a READ, then use write chunks to separate the file data
>> * into pages; otherwise use reply chunks.
>> */
>> - if (rqst->rq_rcv_buf.buflen <= RPCRDMA_INLINE_READ_THRESHOLD(rqst))
>> + if (rpcrdma_results_inline(rqst))
>> wtype = rpcrdma_noch;
>> else if (rqst->rq_rcv_buf.page_len == 0)
>> wtype = rpcrdma_replych;
>> @@ -441,7 +466,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
>> * implies the op is a write.
>> * TBD check NFSv4 setacl
>> */
>> - if (rqst->rq_snd_buf.len <= RPCRDMA_INLINE_WRITE_THRESHOLD(rqst))
>> + if (rpcrdma_args_inline(rqst))
>> rtype = rpcrdma_noch;
>> else if (rqst->rq_snd_buf.page_len == 0)
>> rtype = rpcrdma_areadch;
>>
--
Chuck Lever
Hi Chuck,
On 07/09/2015 04:43 PM, Chuck Lever wrote:
> Only the RPC/RDMA header is sent when making an RDMA_NOMSG call.
> That header resides in the first element of the iovec array
> passed to rpcrdma_ep_post().
>
> Instead of special casing the iovec element with the pad, just
> sync all the elements in the send iovec. Syncing the zero pad is
> not strictly necessary, but the pad is rarely if ever used these
> days, and the extra cost in that case is small.
>
> Signed-off-by: Chuck Lever <[email protected]>
> diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
> index cdf5220..9199436 100644
> --- a/net/sunrpc/xprtrdma/verbs.c
> +++ b/net/sunrpc/xprtrdma/verbs.c
> @@ -1316,22 +1318,15 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia,
>
> send_wr.next = NULL;
> send_wr.wr_id = RPCRDMA_IGNORE_COMPLETION;
> - send_wr.sg_list = req->rl_send_iov;
> + send_wr.sg_list = iov;
> send_wr.num_sge = req->rl_niovs;
> send_wr.opcode = IB_WR_SEND;
> - if (send_wr.num_sge == 4) /* no need to sync any pad (constant) */
> - ib_dma_sync_single_for_device(ia->ri_device,
> - req->rl_send_iov[3].addr,
> - req->rl_send_iov[3].length,
> - DMA_TO_DEVICE);
> - ib_dma_sync_single_for_device(ia->ri_device,
> - req->rl_send_iov[1].addr,
> - req->rl_send_iov[1].length,
> - DMA_TO_DEVICE);
> - ib_dma_sync_single_for_device(ia->ri_device,
> - req->rl_send_iov[0].addr,
> - req->rl_send_iov[0].length,
> - DMA_TO_DEVICE);
> +
> + for (i = 0; i < send_wr.num_sge; i++)
> + ib_dma_sync_single_for_device(device, iov[i].addr,
> + iov[i].length, DMA_TO_DEVICE);
Two questions here:
1) Is syncing order important? The original code went 3 - 1 - 0, but now we're going 0 - 1 - 2 - 3.
2) Is iov[2] the zero pad you mentioned in your commit message? If not, do you know why it wasn't already syncing?
Thanks,
Anna
> + dprintk("RPC: %s: posting %d s/g entries\n",
> + __func__, send_wr.num_sge);
>
> if (DECR_CQCOUNT(ep) > 0)
> send_wr.send_flags = 0;
> On Jul 10, 2015, at 4:43 PM, Anna Schumaker <[email protected]> wrote:
>
> Hi Chuck,
>
> On 07/09/2015 04:43 PM, Chuck Lever wrote:
>> Only the RPC/RDMA header is sent when making an RDMA_NOMSG call.
>> That header resides in the first element of the iovec array
>> passed to rpcrdma_ep_post().
>>
>> Instead of special casing the iovec element with the pad, just
>> sync all the elements in the send iovec. Syncing the zero pad is
>> not strictly necessary, but the pad is rarely if ever used these
>> days, and the extra cost in that case is small.
>>
>> Signed-off-by: Chuck Lever <[email protected]>
>> diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
>> index cdf5220..9199436 100644
>> --- a/net/sunrpc/xprtrdma/verbs.c
>> +++ b/net/sunrpc/xprtrdma/verbs.c
>> @@ -1316,22 +1318,15 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia,
>>
>> send_wr.next = NULL;
>> send_wr.wr_id = RPCRDMA_IGNORE_COMPLETION;
>> - send_wr.sg_list = req->rl_send_iov;
>> + send_wr.sg_list = iov;
>> send_wr.num_sge = req->rl_niovs;
>> send_wr.opcode = IB_WR_SEND;
>> - if (send_wr.num_sge == 4) /* no need to sync any pad (constant) */
>> - ib_dma_sync_single_for_device(ia->ri_device,
>> - req->rl_send_iov[3].addr,
>> - req->rl_send_iov[3].length,
>> - DMA_TO_DEVICE);
>> - ib_dma_sync_single_for_device(ia->ri_device,
>> - req->rl_send_iov[1].addr,
>> - req->rl_send_iov[1].length,
>> - DMA_TO_DEVICE);
>> - ib_dma_sync_single_for_device(ia->ri_device,
>> - req->rl_send_iov[0].addr,
>> - req->rl_send_iov[0].length,
>> - DMA_TO_DEVICE);
>> +
>> + for (i = 0; i < send_wr.num_sge; i++)
>> + ib_dma_sync_single_for_device(device, iov[i].addr,
>> + iov[i].length, DMA_TO_DEVICE);
>
> Two questions here:
> 1) Is syncing order important? The original code went 3 - 1 - 0, but now we're going 0 - 1 - 2 - 3.
No, the syncing order isn’t important. What’s important is that
the sge’s be synced _before_ the SEND Work Request is posted so
that the HCA can see the freshly marshaled RPC.
> 2) Is iov[2] the zero pad you mentioned in your commit message?
Yes, it is.
> If not, do you know why it wasn't already syncing?
The pad always contains zeroes. Syncing it is optional, AFAICT.
v2 of this series will have some changes in this area, and the
zero pad will be gone entirely.
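A condensed sketch of the ordering requirement, using the variable names
from the quoted diff (error handling omitted, so treat it as illustrative
rather than the final code):

	/* Sync every send SGE for the device first, then post the SEND;
	 * the order of the syncs among themselves does not matter.
	 */
	for (i = 0; i < send_wr.num_sge; i++)
		ib_dma_sync_single_for_device(device, iov[i].addr,
					      iov[i].length, DMA_TO_DEVICE);
	rc = ib_post_send(ia->ri_id->qp, &send_wr, &send_wr_fail);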
> Thanks,
> Anna
>
>> + dprintk("RPC: %s: posting %d s/g entries\n",
>> + __func__, send_wr.num_sge);
>>
>> if (DECR_CQCOUNT(ep) > 0)
>> send_wr.send_flags = 0;
>
--
Chuck Lever
[email protected]
On Fri, Jul 10, 2015 at 10:53:40AM -0400, Chuck Lever wrote:
> It is certainly possible to examine the device’s max_sge field
> in rpcrdma_ep_create() and fail transport creation if the
> device’s max_sge is less than RPC_MAX_IOVS.
I just want to draw Sagi's attention to this problem, when considering
my thoughts on an alternate API for posting.
This thread is about NFS needing 4 sges for a (rare?) message type.
The API direction I suggested addresses this kind of issue as well,
because under the covers the driver/core can spin up a temporary MR
for this case and thus support an infinite s/g list for all posts.
Jason
On Thu, Jul 09, 2015 at 04:42:18PM -0400, Chuck Lever wrote:
> All HCA providers have an ib_get_dma_mr() verb. Thus
> rpcrdma_ia_open() will either grab the device's local_dma_key if one
> is available, or it will call ib_get_dma_mr() which is a 100%
> guaranteed fallback. There is never any need to use the
> ib_reg_phys_mr() code path in rpcrdma_register_internal(), so it can
> be removed.
Can you also send a follow-on to remove ib_reg_phys_mr entirely from
all core (= non-staging) code?
> On Jul 11, 2015, at 6:34 AM, Christoph Hellwig <[email protected]> wrote:
>
> On Thu, Jul 09, 2015 at 04:42:18PM -0400, Chuck Lever wrote:
>> All HCA providers have an ib_get_dma_mr() verb. Thus
>> rpcrdma_ia_open() will either grab the device's local_dma_key if one
>> is available, or it will call ib_get_dma_mr() which is a 100%
>> guaranteed fallback. There is never any need to use the
>> ib_reg_phys_mr() code path in rpcrdma_register_internal(), so it can
>> be removed.
>
> Can you also send a follow-on to remove ib_reg_phys_mr entirely from
> all core (= non-staging) code?
I would prefer to run this by Doug Oucharek first, as a courtesy.
Unless I’m mistaken, the Lustre client is supposed to be on its
way into the kernel, not on its way out.
Anyone have Doug’s e-mail address?
--
Chuck Lever
[email protected]
On Sat, Jul 11, 2015 at 02:50:58PM -0400, Chuck Lever wrote:
> I would prefer to run this by Doug Oucharek first, as a courtesy.
> Unless I’m mistaken, the Lustre client is supposed to be on its
> way into the kernel, not on its way out.
The only way a mess like lustre got into the kernel tree at all is
because the rules allow ignoring the staging tree for API changes,
otherwise kernel development wouldn't scale. Lustre isn't really
on the way in, it's been in the staging tree forever and has made
zero progress towards actually being maintainable code.
On 7/9/2015 11:41 PM, Chuck Lever wrote:
> The point of larger rsize and wsize is to reduce the per-byte cost
> of memory registration and deregistration. Modern HCAs can typically
> handle a megabyte or more with a single registration operation.
Reviewed-By: Sagi Grimberg <[email protected]>
On 7/9/2015 11:42 PM, Chuck Lever wrote:
> In preparation for similar increases on NFS/RDMA servers, bump the
> advertised credit limit for RPC/RDMA to 128. This allocates some
> extra resources, but the client will continue to allow only the
> number of RPCs in flight that the server requests via its advertised
> credit limit.
>
> Signed-off-by: Chuck Lever <[email protected]>
Looks good,
Reviewed-By: Sagi Grimberg <[email protected]>
On 7/9/2015 11:42 PM, Chuck Lever wrote:
> All HCA providers have an ib_get_dma_mr() verb. Thus
> rpcrdma_ia_open() will either grab the device's local_dma_key if one
> is available, or it will call ib_get_dma_mr() which is a 100%
> guaranteed fallback. There is never any need to use the
> ib_reg_phys_mr() code path in rpcrdma_register_internal(), so it can
> be removed.
>
> The remaining logic in rpcrdma_{de}register_internal() is folded
> into rpcrdma_{alloc,free}_regbuf().
>
> Signed-off-by: Chuck Lever <[email protected]>
Like,
Reviewed-By: Sagi Grimberg <[email protected]>
On 7/9/2015 11:42 PM, Chuck Lever wrote:
> When marshaling RPC/RDMA requests, ensure the combined size of
> RPC/RDMA header and RPC header do not exceed the inline threshold.
> Endpoints typically reject RPC/RDMA messages that exceed the size
> of their receive buffers.
Did this solve a bug? Because it seems like it does.
Maybe it would be a good idea to describe the bug.
>
> Signed-off-by: Chuck Lever <[email protected]>
> ---
> net/sunrpc/xprtrdma/rpc_rdma.c | 29 +++++++++++++++++++++++++++--
> 1 file changed, 27 insertions(+), 2 deletions(-)
>
> diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
> index 84ea37d..8cf9402 100644
> --- a/net/sunrpc/xprtrdma/rpc_rdma.c
> +++ b/net/sunrpc/xprtrdma/rpc_rdma.c
> @@ -71,6 +71,31 @@ static const char transfertypes[][12] = {
> };
> #endif
>
> +/* The client can send a request inline as long as the RPCRDMA header
> + * plus the RPC call fit under the transport's inline limit. If the
> + * combined call message size exceeds that limit, the client must use
> + * the read chunk list for this operation.
> + */
> +static bool rpcrdma_args_inline(struct rpc_rqst *rqst)
maybe static inline?
> +{
> + unsigned int callsize = RPCRDMA_HDRLEN_MIN + rqst->rq_snd_buf.len;
> +
> + return callsize <= RPCRDMA_INLINE_WRITE_THRESHOLD(rqst);
> +}
> +
> +/* The client can’t know how large the actual reply will be. Thus it
> + * plans for the largest possible reply for that particular ULP
> + * operation. If the maximum combined reply message size exceeds that
> + * limit, the client must provide a write list or a reply chunk for
> + * this request.
> + */
> +static bool rpcrdma_results_inline(struct rpc_rqst *rqst)
> +{
> + unsigned int repsize = RPCRDMA_HDRLEN_MIN + rqst->rq_rcv_buf.buflen;
> +
> + return repsize <= RPCRDMA_INLINE_READ_THRESHOLD(rqst);
> +}
> +
> /*
> * Chunk assembly from upper layer xdr_buf.
> *
> @@ -418,7 +443,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
> * a READ, then use write chunks to separate the file data
> * into pages; otherwise use reply chunks.
> */
> - if (rqst->rq_rcv_buf.buflen <= RPCRDMA_INLINE_READ_THRESHOLD(rqst))
> + if (rpcrdma_results_inline(rqst))
> wtype = rpcrdma_noch;
> else if (rqst->rq_rcv_buf.page_len == 0)
> wtype = rpcrdma_replych;
> @@ -441,7 +466,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
> * implies the op is a write.
> * TBD check NFSv4 setacl
> */
> - if (rqst->rq_snd_buf.len <= RPCRDMA_INLINE_WRITE_THRESHOLD(rqst))
> + if (rpcrdma_args_inline(rqst))
> rtype = rpcrdma_noch;
> else if (rqst->rq_snd_buf.page_len == 0)
> rtype = rpcrdma_areadch;
>
On 7/9/2015 11:42 PM, Chuck Lever wrote:
> The client has been setting up a reply chunk for NFS READs that are
> smaller than the inline threshold. This is not efficient: both the
> server and client CPUs have to copy the reply's data payload into
> and out of the memory region that is then transferred via RDMA.
>
> Using the write list, the data payload is moved by the device and no
> extra data copying is necessary.
>
> Signed-off-by: Chuck Lever <[email protected]>
Reviewed-By: Sagi Grimberg <[email protected]>
On 7/9/2015 11:42 PM, Chuck Lever wrote:
> Currently Linux always offers a reply chunk, even for small replies
> (unless a read or write list is needed for the RPC operation).
>
> A comment in rpcrdma_marshal_req() reads:
>
>> Currently we try to not actually use read inline.
>> Reply chunks have the desirable property that
>> they land, packed, directly in the target buffers
>> without headers, so they require no fixup. The
>> additional RDMA Write op sends the same amount
>> of data, streams on-the-wire and adds no overhead
>> on receive. Therefore, we request a reply chunk
>> for non-writes wherever feasible and efficient.
>
> This considers only the network bandwidth cost of sending the RPC
> reply. For replies which are only a few dozen bytes, this is
> typically not a good trade-off.
>
> If the server chooses to return the reply inline:
>
> - The client has registered and invalidated a memory region to
> catch the reply, which is then not used
>
> If the server chooses to use the reply chunk:
>
> - The server sends a few bytes using a heavyweight RDMA WRITE
>    operation. The entire RPC reply is conveyed in two RDMA
> operations (WRITE_ONLY, SEND) instead of one.
Pipelined WRITE+SEND operations are hardly an overhead compared to
copying chunks of data.
>
> Note that both the server and client have to prepare or copy the
> reply data anyway to construct these replies. There's no benefit to
> using an RDMA transfer since the host CPU has to be involved.
I think that preparation (posting 1 or 2 WQEs) and copying
chunks of data of say 8K-16K might be different.
I understand that you probably see better performance scaling. But this
might be HW dependent. Also, this might backfire on you if your
configuration is one-to-many. Then, data copy CPU cycles might become
more expensive.
I don't really know what is better, but just thought I'd present
another side to this.
Hi Sagi-
On Jul 12, 2015, at 10:37 AM, Sagi Grimberg <[email protected]> wrote:
> On 7/9/2015 11:42 PM, Chuck Lever wrote:
>> When marshaling RPC/RDMA requests, ensure the combined size of
>> RPC/RDMA header and RPC header do not exceed the inline threshold.
>> Endpoints typically reject RPC/RDMA messages that exceed the size
>> of their receive buffers.
>
> Did this solve a bug? Because it seems like it does.
> Maybe it would be a good idea to describe the bug.
There’s no bugzilla for this, as no issue has been encountered
in the field so far. It’s hard to trigger and servers are
forgiving.
I added some text in the patch description.
>> Signed-off-by: Chuck Lever <[email protected]>
>> ---
>> net/sunrpc/xprtrdma/rpc_rdma.c | 29 +++++++++++++++++++++++++++--
>> 1 file changed, 27 insertions(+), 2 deletions(-)
>>
>> diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
>> index 84ea37d..8cf9402 100644
>> --- a/net/sunrpc/xprtrdma/rpc_rdma.c
>> +++ b/net/sunrpc/xprtrdma/rpc_rdma.c
>> @@ -71,6 +71,31 @@ static const char transfertypes[][12] = {
>> };
>> #endif
>>
>> +/* The client can send a request inline as long as the RPCRDMA header
>> + * plus the RPC call fit under the transport's inline limit. If the
>> + * combined call message size exceeds that limit, the client must use
>> + * the read chunk list for this operation.
>> + */
>> +static bool rpcrdma_args_inline(struct rpc_rqst *rqst)
>
> maybe static inline?
The final paragraph of Chapter 15 of Documentation/CodingStyle
suggests “static inline” is undesirable here. I think gcc makes
the correct inlining choice here by itself.
>> +{
>> + unsigned int callsize = RPCRDMA_HDRLEN_MIN + rqst->rq_snd_buf.len;
>> +
>> + return callsize <= RPCRDMA_INLINE_WRITE_THRESHOLD(rqst);
>> +}
>> +
>> +/* The client can?t know how large the actual reply will be. Thus it
>> + * plans for the largest possible reply for that particular ULP
>> + * operation. If the maximum combined reply message size exceeds that
>> + * limit, the client must provide a write list or a reply chunk for
>> + * this request.
>> + */
>> +static bool rpcrdma_results_inline(struct rpc_rqst *rqst)
>> +{
>> + unsigned int repsize = RPCRDMA_HDRLEN_MIN + rqst->rq_rcv_buf.buflen;
>> +
>> + return repsize <= RPCRDMA_INLINE_READ_THRESHOLD(rqst);
>> +}
>> +
>> /*
>> * Chunk assembly from upper layer xdr_buf.
>> *
>> @@ -418,7 +443,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
>> * a READ, then use write chunks to separate the file data
>> * into pages; otherwise use reply chunks.
>> */
>> - if (rqst->rq_rcv_buf.buflen <= RPCRDMA_INLINE_READ_THRESHOLD(rqst))
>> + if (rpcrdma_results_inline(rqst))
>> wtype = rpcrdma_noch;
>> else if (rqst->rq_rcv_buf.page_len == 0)
>> wtype = rpcrdma_replych;
>> @@ -441,7 +466,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
>> * implies the op is a write.
>> * TBD check NFSv4 setacl
>> */
>> - if (rqst->rq_snd_buf.len <= RPCRDMA_INLINE_WRITE_THRESHOLD(rqst))
>> + if (rpcrdma_args_inline(rqst))
>> rtype = rpcrdma_noch;
>> else if (rqst->rq_snd_buf.page_len == 0)
>> rtype = rpcrdma_areadch;
>>
--
Chuck Lever
Hi Sagi-
On Jul 12, 2015, at 10:58 AM, Sagi Grimberg <[email protected]> wrote:
> On 7/9/2015 11:42 PM, Chuck Lever wrote:
>> Currently Linux always offers a reply chunk, even for small replies
>> (unless a read or write list is needed for the RPC operation).
>>
>> A comment in rpcrdma_marshal_req() reads:
>>
>>> Currently we try to not actually use read inline.
>>> Reply chunks have the desirable property that
>>> they land, packed, directly in the target buffers
>>> without headers, so they require no fixup. The
>>> additional RDMA Write op sends the same amount
>>> of data, streams on-the-wire and adds no overhead
>>> on receive. Therefore, we request a reply chunk
>>> for non-writes wherever feasible and efficient.
>>
>> This considers only the network bandwidth cost of sending the RPC
>> reply. For replies which are only a few dozen bytes, this is
>> typically not a good trade-off.
>>
>> If the server chooses to return the reply inline:
>>
>> - The client has registered and invalidated a memory region to
>> catch the reply, which is then not used
>>
>> If the server chooses to use the reply chunk:
>>
>> - The server sends a few bytes using a heavyweight RDMA WRITE
>>    operation. The entire RPC reply is conveyed in two RDMA
>> operations (WRITE_ONLY, SEND) instead of one.
>
> Pipelined WRITE+SEND operations are hardly an overhead compared to
> copying chunks of data.
>
>>
>> Note that both the server and client have to prepare or copy the
>> reply data anyway to construct these replies. There's no benefit to
>> using an RDMA transfer since the host CPU has to be involved.
>
> I think that preparation (posting 1 or 2 WQEs) and copying
> chunks of data of say 8K-16K might be different.
Two points that are probably not clear from my patch description:
1. This patch affects only replies (usually much) smaller than the
client’s inline threshold (1KB). Anything larger will continue
to use RDMA transfer.
2. These replies are constructed in the RPC buffer by the server,
and parsed in the receive buffer by the client. They are not
simple data copies on either endpoint.
Think NFS GETATTR: the server is gathering metadata from multiple
sources, and XDR encoding it in the reply send buffer. The data
is not copied; it is manipulated before the SEND.
The client then XDR decodes the received stream and scatters the
decoded results into multiple in-memory data structures.
Because XDR encoding/decoding is involved, there really is no
benefit to an RDMA transfer for these replies.
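As a rough, representative size comparison (the numbers are illustrative,
not measured):

	/* A small metadata reply, e.g. GETATTR:
	 *   RPC/RDMA header (RPCRDMA_HDRLEN_MIN):  a few dozen bytes
	 *   RPC reply header + XDR-encoded result: a few hundred bytes
	 * The total stays well under the 1KB inline threshold, so a single
	 * SEND carries it. Offering a reply chunk instead adds an MR
	 * registration and invalidation on the client plus an RDMA WRITE
	 * on the server, with no data-movement benefit.
	 */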
> I understand that you probably see better performance scaling. But this
> might be HW dependent. Also, this might backfire on you if your
> configuration is one-to-many. Then, data copy CPU cycles might become
> more expensive.
>
> I don't really know what is better, but just thought I'd present
> another side to this.
Thanks for your review!
--
Chuck Lever
On 7/12/2015 9:38 PM, Chuck Lever wrote:
> Hi Sagi-
>
>
> On Jul 12, 2015, at 10:58 AM, Sagi Grimberg <[email protected]> wrote:
>
>> On 7/9/2015 11:42 PM, Chuck Lever wrote:
>>> Currently Linux always offers a reply chunk, even for small replies
>>> (unless a read or write list is needed for the RPC operation).
>>>
>>> A comment in rpcrdma_marshal_req() reads:
>>>
>>>> Currently we try to not actually use read inline.
>>>> Reply chunks have the desirable property that
>>>> they land, packed, directly in the target buffers
>>>> without headers, so they require no fixup. The
>>>> additional RDMA Write op sends the same amount
>>>> of data, streams on-the-wire and adds no overhead
>>>> on receive. Therefore, we request a reply chunk
>>>> for non-writes wherever feasible and efficient.
>>>
>>> This considers only the network bandwidth cost of sending the RPC
>>> reply. For replies which are only a few dozen bytes, this is
>>> typically not a good trade-off.
>>>
>>> If the server chooses to return the reply inline:
>>>
>>> - The client has registered and invalidated a memory region to
>>> catch the reply, which is then not used
>>>
>>> If the server chooses to use the reply chunk:
>>>
>>> - The server sends a few bytes using a heavyweight RDMA WRITE
>>>    operation. The entire RPC reply is conveyed in two RDMA
>>> operations (WRITE_ONLY, SEND) instead of one.
>>
>> Pipelined WRITE+SEND operations are hardly an overhead compared to
>> copying chunks of data.
>>
>>>
>>> Note that both the server and client have to prepare or copy the
>>> reply data anyway to construct these replies. There's no benefit to
>>> using an RDMA transfer since the host CPU has to be involved.
>>
>> I think that preparation (posting 1 or 2 WQEs) and copying
>> chunks of data of say 8K-16K might be different.
>
> Two points that are probably not clear from my patch description:
>
> 1. This patch affects only replies (usually much) smaller than the
> client’s inline threshold (1KB). Anything larger will continue
> to use RDMA transfer.
>
> 2. These replies are constructed in the RPC buffer by the server,
> and parsed in the receive buffer by the client. They are not
> simple data copies on either endpoint.
>
> Think NFS GETATTR: the server is gathering metadata from multiple
> sources, and XDR encoding it in the reply send buffer. The data
> is not copied; it is manipulated before the SEND.
>
> The client then XDR decodes the received stream and scatters the
> decoded results into multiple in-memory data structures.
>
> Because XDR encoding/decoding is involved, there really is no
> benefit to an RDMA transfer for these replies.
I see. Thanks for the clarification.
Reviewed-By: Sagi Grimberg <[email protected]>
Hey Chuck,
On 07/09/2015 04:43 PM, Chuck Lever wrote:
> Repair how rpcrdma_marshal_req() chooses which RDMA message type
> to use for large non-WRITE operations so that it picks RDMA_NOMSG
> in the correct situations, and sets up the marshaling logic to
> SEND only the RPC/RDMA header.
>
> Large NFSv2 SYMLINK requests now use RDMA_NOMSG calls. The Linux NFS
> server XDR decoder for NFSv2 SYMLINK does not handle having the
> pathname argument arrive in a separate buffer. The decoder could be
> fixed, but this is simpler and RDMA_NOMSG can be used in a variety
> of other situations.
>
> Ensure that the Linux client continues to use "RDMA_MSG + read
> list" when sending large NFSv3 SYMLINK requests, which is more
> efficient than using RDMA_NOMSG.
>
> Large NFSv4 CREATE(NF4LNK) requests are changed to use "RDMA_MSG +
> read list" just like NFSv3 (see Section 5 of RFC 5667). Before,
> these did not work at all.
>
> Signed-off-by: Chuck Lever <[email protected]>
> ---
> fs/nfs/nfs3xdr.c | 1 +
> fs/nfs/nfs4xdr.c | 1 +
> net/sunrpc/xprtrdma/rpc_rdma.c | 21 ++++++++++++---------
> 3 files changed, 14 insertions(+), 9 deletions(-)
It might be better to split this into separate patches for nfs and sunrpc, since Trond might want to accept the nfs changes separately.
Anna
>
> diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
> index 9b04c2e..267126d 100644
> --- a/fs/nfs/nfs3xdr.c
> +++ b/fs/nfs/nfs3xdr.c
> @@ -1103,6 +1103,7 @@ static void nfs3_xdr_enc_symlink3args(struct rpc_rqst *req,
> {
> encode_diropargs3(xdr, args->fromfh, args->fromname, args->fromlen);
> encode_symlinkdata3(xdr, args);
> + xdr->buf->flags |= XDRBUF_WRITE;
> }
>
> /*
> diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
> index 558cd65d..03a20ec 100644
> --- a/fs/nfs/nfs4xdr.c
> +++ b/fs/nfs/nfs4xdr.c
> @@ -1155,6 +1155,7 @@ static void encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg *
> p = reserve_space(xdr, 4);
> *p = cpu_to_be32(create->u.symlink.len);
> xdr_write_pages(xdr, create->u.symlink.pages, 0, create->u.symlink.len);
> + xdr->buf->flags |= XDRBUF_WRITE;
> break;
>
> case NF4BLK: case NF4CHR:
> diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
> index 2e721f2..64fc4b4 100644
> --- a/net/sunrpc/xprtrdma/rpc_rdma.c
> +++ b/net/sunrpc/xprtrdma/rpc_rdma.c
> @@ -484,21 +484,24 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
> *
> * o If the total request is under the inline threshold, all ops
> * are sent as inline.
> - * o Large non-write ops are sent with the entire message as a
> - * single read chunk (protocol 0-position special case).
> * o Large write ops transmit data as read chunk(s), header as
> * inline.
> + * o Large non-write ops are sent with the entire message as a
> + * single read chunk (protocol 0-position special case).
> *
> - * Note: the NFS code sending down multiple argument segments
> - * implies the op is a write.
> - * TBD check NFSv4 setacl
> + * This assumes that the upper layer does not present a request
> + * that both has a data payload, and whose non-data arguments
> + * by themselves are larger than the inline threshold.
> */
> - if (rpcrdma_args_inline(rqst))
> + if (rpcrdma_args_inline(rqst)) {
> rtype = rpcrdma_noch;
> - else if (rqst->rq_snd_buf.page_len == 0)
> - rtype = rpcrdma_areadch;
> - else
> + } else if (rqst->rq_snd_buf.flags & XDRBUF_WRITE) {
> rtype = rpcrdma_readch;
> + } else {
> + headerp->rm_type = htonl(RDMA_NOMSG);
> + rtype = rpcrdma_areadch;
> + rpclen = 0;
> + }
>
> /* The following simplification is not true forever */
> if (rtype != rpcrdma_noch && wtype == rpcrdma_replych)
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
>
On Jul 14, 2015, at 12:01 PM, Anna Schumaker <[email protected]> wrote:
> Hey Chuck,
>
> On 07/09/2015 04:43 PM, Chuck Lever wrote:
>> Repair how rpcrdma_marshal_req() chooses which RDMA message type
>> to use for large non-WRITE operations so that it picks RDMA_NOMSG
>> in the correct situations, and sets up the marshaling logic to
>> SEND only the RPC/RDMA header.
>>
>> Large NFSv2 SYMLINK requests now use RDMA_NOMSG calls. The Linux NFS
>> server XDR decoder for NFSv2 SYMLINK does not handle having the
>> pathname argument arrive in a separate buffer. The decoder could be
>> fixed, but this is simpler and RDMA_NOMSG can be used in a variety
>> of other situations.
>>
>> Ensure that the Linux client continues to use "RDMA_MSG + read
>> list" when sending large NFSv3 SYMLINK requests, which is more
>> efficient than using RDMA_NOMSG.
>>
>> Large NFSv4 CREATE(NF4LNK) requests are changed to use "RDMA_MSG +
>> read list" just like NFSv3 (see Section 5 of RFC 5667). Before,
>> these did not work at all.
>>
>> Signed-off-by: Chuck Lever <[email protected]>
>> ---
>> fs/nfs/nfs3xdr.c | 1 +
>> fs/nfs/nfs4xdr.c | 1 +
>> net/sunrpc/xprtrdma/rpc_rdma.c | 21 ++++++++++++---------
>> 3 files changed, 14 insertions(+), 9 deletions(-)
>
> It might be better to split this into separate patches for nfs and sunrpc, since Trond might want to accept the nfs changes separately.
Originally I had the fs/nfs/*xdr.c changes split into separate patches.
But I’ve convinced myself that it is better to bundle the NFS changes
with the rpc_rdma.c changes here.
If the fs/nfs/*xdr.c changes come before the rpc_rdma.c changes, then
the XDRBUF_WRITE flag is being set in the NFS XDR encoders, but never
used anywhere. Which is safe, but somewhat nonsensical.
If the fs/nfs/*xdr.c changes come after the rpc_rdma.c changes, then
the client will send NFSv3 SYMLINK and NFSv4 CREATE(NF4LNK) differently
for a moment when just the rpc_rdma.c change is applied.
Together in a single commit, these are a single indivisible change that
stands up well to bisect, and is easy for distributions to cherry-pick.
It doesn’t make sense to apply the NFS and RPC/RDMA changes separately.
So, it’s up to you and Trond. I will change it if he prefers it broken
out. I think the most benign way to split them is to merge the
fs/nfs/*xdr.c changes first, but let’s hear from Trond.
> Anna
>
>>
>> diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
>> index 9b04c2e..267126d 100644
>> --- a/fs/nfs/nfs3xdr.c
>> +++ b/fs/nfs/nfs3xdr.c
>> @@ -1103,6 +1103,7 @@ static void nfs3_xdr_enc_symlink3args(struct rpc_rqst *req,
>> {
>> encode_diropargs3(xdr, args->fromfh, args->fromname, args->fromlen);
>> encode_symlinkdata3(xdr, args);
>> + xdr->buf->flags |= XDRBUF_WRITE;
>> }
>>
>> /*
>> diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
>> index 558cd65d..03a20ec 100644
>> --- a/fs/nfs/nfs4xdr.c
>> +++ b/fs/nfs/nfs4xdr.c
>> @@ -1155,6 +1155,7 @@ static void encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg *
>> p = reserve_space(xdr, 4);
>> *p = cpu_to_be32(create->u.symlink.len);
>> xdr_write_pages(xdr, create->u.symlink.pages, 0, create->u.symlink.len);
>> + xdr->buf->flags |= XDRBUF_WRITE;
>> break;
>>
>> case NF4BLK: case NF4CHR:
>> diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
>> index 2e721f2..64fc4b4 100644
>> --- a/net/sunrpc/xprtrdma/rpc_rdma.c
>> +++ b/net/sunrpc/xprtrdma/rpc_rdma.c
>> @@ -484,21 +484,24 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
>> *
>> * o If the total request is under the inline threshold, all ops
>> * are sent as inline.
>> - * o Large non-write ops are sent with the entire message as a
>> - * single read chunk (protocol 0-position special case).
>> * o Large write ops transmit data as read chunk(s), header as
>> * inline.
>> + * o Large non-write ops are sent with the entire message as a
>> + * single read chunk (protocol 0-position special case).
>> *
>> - * Note: the NFS code sending down multiple argument segments
>> - * implies the op is a write.
>> - * TBD check NFSv4 setacl
>> + * This assumes that the upper layer does not present a request
>> + * that both has a data payload, and whose non-data arguments
>> + * by themselves are larger than the inline threshold.
>> */
>> - if (rpcrdma_args_inline(rqst))
>> + if (rpcrdma_args_inline(rqst)) {
>> rtype = rpcrdma_noch;
>> - else if (rqst->rq_snd_buf.page_len == 0)
>> - rtype = rpcrdma_areadch;
>> - else
>> + } else if (rqst->rq_snd_buf.flags & XDRBUF_WRITE) {
>> rtype = rpcrdma_readch;
>> + } else {
>> + headerp->rm_type = htonl(RDMA_NOMSG);
>> + rtype = rpcrdma_areadch;
>> + rpclen = 0;
>> + }
>>
>> /* The following simplification is not true forever */
>> if (rtype != rpcrdma_noch && wtype == rpcrdma_replych)
>>
--
Chuck Lever