This series, proposed for the 4.3 kernel, makes several important
client-side performance and scalability improvements, including:
- Increase maximum RPC/RDMA credits to 128
- Increase maximum NFS/RDMA r/wsize to one megabyte
- Prefer inline replies rather than reply chunks
And these fixes:
- Send NFSv4 WRITE compounds correctly
- Support RDMA_NOMSG calls
- Remove support for RDMA_MSGP calls
- Fix large NFS symlink operations
- Get inline threshold accounting right
Also available in the "nfs-rdma-for-4.3" topic branch of this git repo:
git://git.linux-nfs.org/projects/cel/cel-2.6.git
Or for browsing:
http://git.linux-nfs.org/?p=cel/cel-2.6.git;a=log;h=refs/heads/nfs-rdma-for-4.3
Changes since v2:
- Rebased on Linux v4.2-rc3
- Corrected RPCRDMA_MAX_IOVS macro
- Included patch to remove core ib_reg_phys_mr() API
Changes since v1:
- Rebased on Linux v4.2-rc2
- PHYSICAL registration, being insecure, must now be explicitly selected
- Further clean-ups were done because ib_reg_phys_mr() is gone
- Support for RDMA_MSGP type calls has been removed
- Some patch descriptions have been clarified
---
Chuck Lever (15):
xprtrdma: Make xprt_setup_rdma() agnostic to family of server address
xprtrdma: Raise maximum payload size to one megabyte
xprtrdma: Increase default credit limit
xprtrdma: Don't fall back to PHYSICAL memory registration
xprtrdma: Remove last ib_reg_phys_mr() call site
xprtrdma: Clean up rpcrdma_ia_open()
xprtrdma: Remove logic that constructs RDMA_MSGP type calls
xprtrdma: Account for RPC/RDMA header size when deciding to inline
xprtrdma: Always provide a write list when sending NFS READ
xprtrdma: Don't provide a reply chunk when expecting a short reply
xprtrdma: Fix XDR tail buffer marshalling
xprtrdma: Fix large NFS SYMLINK calls
xprtrdma: Clean up xprt_rdma_print_stats()
xprtrdma: Count RDMA_NOMSG type calls
core: Remove the ib_reg_phys_mr() and ib_rereg_phys_mr() verbs
drivers/infiniband/core/verbs.c | 67 ------------
fs/nfs/nfs3xdr.c | 1 +
fs/nfs/nfs4xdr.c | 4 +
include/linux/sunrpc/xprtrdma.h | 2 +-
include/rdma/ib_verbs.h | 46 --------
net/sunrpc/xprtrdma/fmr_ops.c | 19 +++
net/sunrpc/xprtrdma/frwr_ops.c | 5 +
net/sunrpc/xprtrdma/physical_ops.c | 25 ++++-
net/sunrpc/xprtrdma/rpc_rdma.c | 197 ++++++++++++++++++------------------
net/sunrpc/xprtrdma/transport.c | 77 ++++++--------
net/sunrpc/xprtrdma/verbs.c | 197 +++++++++---------------------------
net/sunrpc/xprtrdma/xprt_rdma.h | 27 ++---
12 files changed, 252 insertions(+), 415 deletions(-)
--
Chuck Lever
In particular, recognize when an IPv6 connection is bound.
Signed-off-by: Chuck Lever <[email protected]>
Tested-by: Devesh Sharma <[email protected]>
---
net/sunrpc/xprtrdma/transport.c | 28 +++++++++++-----------------
1 file changed, 11 insertions(+), 17 deletions(-)
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index 680f888..d737300 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -175,10 +175,8 @@ xprt_rdma_format_addresses6(struct rpc_xprt *xprt, struct sockaddr *sap)
}
static void
-xprt_rdma_format_addresses(struct rpc_xprt *xprt)
+xprt_rdma_format_addresses(struct rpc_xprt *xprt, struct sockaddr *sap)
{
- struct sockaddr *sap = (struct sockaddr *)
- &rpcx_to_rdmad(xprt).addr;
char buf[128];
switch (sap->sa_family) {
@@ -302,7 +300,7 @@ xprt_setup_rdma(struct xprt_create *args)
struct rpc_xprt *xprt;
struct rpcrdma_xprt *new_xprt;
struct rpcrdma_ep *new_ep;
- struct sockaddr_in *sin;
+ struct sockaddr *sap;
int rc;
if (args->addrlen > sizeof(xprt->addr)) {
@@ -333,26 +331,20 @@ xprt_setup_rdma(struct xprt_create *args)
* Set up RDMA-specific connect data.
*/
- /* Put server RDMA address in local cdata */
- memcpy(&cdata.addr, args->dstaddr, args->addrlen);
+ sap = (struct sockaddr *)&cdata.addr;
+ memcpy(sap, args->dstaddr, args->addrlen);
/* Ensure xprt->addr holds valid server TCP (not RDMA)
* address, for any side protocols which peek at it */
xprt->prot = IPPROTO_TCP;
xprt->addrlen = args->addrlen;
- memcpy(&xprt->addr, &cdata.addr, xprt->addrlen);
+ memcpy(&xprt->addr, sap, xprt->addrlen);
- sin = (struct sockaddr_in *)&cdata.addr;
- if (ntohs(sin->sin_port) != 0)
+ if (rpc_get_port(sap))
xprt_set_bound(xprt);
- dprintk("RPC: %s: %pI4:%u\n",
- __func__, &sin->sin_addr.s_addr, ntohs(sin->sin_port));
-
- /* Set max requests */
cdata.max_requests = xprt->max_reqs;
- /* Set some length limits */
cdata.rsize = RPCRDMA_MAX_SEGS * PAGE_SIZE; /* RDMA write max */
cdata.wsize = RPCRDMA_MAX_SEGS * PAGE_SIZE; /* RDMA read max */
@@ -375,8 +367,7 @@ xprt_setup_rdma(struct xprt_create *args)
new_xprt = rpcx_to_rdmax(xprt);
- rc = rpcrdma_ia_open(new_xprt, (struct sockaddr *) &cdata.addr,
- xprt_rdma_memreg_strategy);
+ rc = rpcrdma_ia_open(new_xprt, sap, xprt_rdma_memreg_strategy);
if (rc)
goto out1;
@@ -409,7 +400,7 @@ xprt_setup_rdma(struct xprt_create *args)
INIT_DELAYED_WORK(&new_xprt->rx_connect_worker,
xprt_rdma_connect_worker);
- xprt_rdma_format_addresses(xprt);
+ xprt_rdma_format_addresses(xprt, sap);
xprt->max_payload = new_xprt->rx_ia.ri_ops->ro_maxpages(new_xprt);
if (xprt->max_payload == 0)
goto out4;
@@ -420,6 +411,9 @@ xprt_setup_rdma(struct xprt_create *args)
if (!try_module_get(THIS_MODULE))
goto out4;
+ dprintk("RPC: %s: %s:%s\n", __func__,
+ xprt->address_strings[RPC_DISPLAY_ADDR],
+ xprt->address_strings[RPC_DISPLAY_PORT]);
return xprt;
out4:
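For reference, the rpc_get_port() helper used above handles both
address families, which is what lets this path recognize a bound IPv6
connection. A minimal sketch of the equivalent logic (the real helper
lives in include/linux/sunrpc/addr.h; example_get_port is a
hypothetical name, includes omitted):

static inline unsigned short
example_get_port(const struct sockaddr *sap)
{
	switch (sap->sa_family) {
	case AF_INET:
		return ntohs(((const struct sockaddr_in *)sap)->sin_port);
	case AF_INET6:
		return ntohs(((const struct sockaddr_in6 *)sap)->sin6_port);
	}
	return 0;
}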
The point of larger rsize and wsize is to reduce the per-byte cost
of memory registration and deregistration. Modern HCAs can typically
handle a megabyte or more with a single registration operation.
Signed-off-by: Chuck Lever <[email protected]>
Reviewed-by: Devesh Sharma <[email protected]>
Reviewed-by: Sagi Grimberg <[email protected]>
Tested-by: Devesh Sharma <[email protected]>
---
net/sunrpc/xprtrdma/xprt_rdma.h | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index f49dd8b..abee472 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -165,8 +165,7 @@ rdmab_to_msg(struct rpcrdma_regbuf *rb)
* struct rpcrdma_buffer. N is the max number of outstanding requests.
*/
-/* temporary static scatter/gather max */
-#define RPCRDMA_MAX_DATA_SEGS (64) /* max scatter/gather */
+#define RPCRDMA_MAX_DATA_SEGS ((1 * 1024 * 1024) / PAGE_SIZE)
#define RPCRDMA_MAX_SEGS (RPCRDMA_MAX_DATA_SEGS + 2) /* head+tail = 2 */
struct rpcrdma_buffer;
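A quick worked example of the new macro, assuming 4KB pages (the
values scale with PAGE_SIZE):

/* With PAGE_SIZE = 4096:
 *   RPCRDMA_MAX_DATA_SEGS = (1 * 1024 * 1024) / 4096 = 256
 *   RPCRDMA_MAX_SEGS      = 256 + 2 = 258 (data segs + head + tail)
 * which is enough to describe a one-megabyte payload.
 */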
In preparation for similar increases on NFS/RDMA servers, bump the
advertised credit limit for RPC/RDMA to 128. This allocates some
extra resources, but the client will continue to allow only the
number of RPCs in flight that the server requests via its advertised
credit limit.
Signed-off-by: Chuck Lever <[email protected]>
Reviewed-by: Sagi Grimberg <[email protected]>
Tested-by: Devesh Sharma <[email protected]>
---
include/linux/sunrpc/xprtrdma.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/include/linux/sunrpc/xprtrdma.h b/include/linux/sunrpc/xprtrdma.h
index b176130..b7b279b 100644
--- a/include/linux/sunrpc/xprtrdma.h
+++ b/include/linux/sunrpc/xprtrdma.h
@@ -49,7 +49,7 @@
* a single chunk type per message is supported currently.
*/
#define RPCRDMA_MIN_SLOT_TABLE (2U)
-#define RPCRDMA_DEF_SLOT_TABLE (32U)
+#define RPCRDMA_DEF_SLOT_TABLE (128U)
#define RPCRDMA_MAX_SLOT_TABLE (256U)
#define RPCRDMA_DEF_INLINE (1024) /* default inline max */
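To illustrate the flow-control point: raising the default only
enlarges the client's allocation; in-flight RPCs are still capped by
whatever the server advertises. A minimal sketch of the invariant
(effective_credits is a hypothetical helper, not part of the patch):

static unsigned int
effective_credits(unsigned int client_slots, unsigned int server_advertised)
{
	/* the client never sends more RPCs than the server grants */
	return client_slots < server_advertised ?
		client_slots : server_advertised;
}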
PHYSICAL memory registration uses a single rkey for all of the
client's memory, and is thus insecure. It is still useful in some
cases for testing.
Retain the ability to select PHYSICAL memory registration capability
via /proc/sys/sunrpc/rdma_memreg_strategy, but don't fall back to it
if the HCA does not support FRWR or FMR.
This means amso1100 no longer works out of the box with NFS/RDMA.
When using amso1100 HCAs, set the memreg_strategy sysctl to 6
(echo 6 > /proc/sys/sunrpc/rdma_memreg_strategy) before performing
NFS/RDMA mounts.
Signed-off-by: Chuck Lever <[email protected]>
Tested-by: Devesh Sharma <[email protected]>
---
net/sunrpc/xprtrdma/verbs.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 891c4ed..1065808 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -539,7 +539,7 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
if (!ia->ri_device->alloc_fmr) {
dprintk("RPC: %s: MTHCAFMR registration "
"not supported by HCA\n", __func__);
- memreg = RPCRDMA_ALLPHYSICAL;
+ goto out3;
}
}
All HCA providers have an ib_get_dma_mr() verb. Thus
rpcrdma_ia_open() will either grab the device's local_dma_lkey if one
is available, or it will call ib_get_dma_mr(), which is a 100%
guaranteed fallback.
There is never any need to use the ib_reg_phys_mr() code path in
rpcrdma_register_internal(), so it can be removed. The remaining
logic in rpcrdma_{de}register_internal() is folded into
rpcrdma_{alloc,free}_regbuf().
This is clean-up only. No behavior change is expected.
Signed-off-by: Chuck Lever <[email protected]>
Reviewed-by: Devesh Sharma <[email protected]>
Reviewed-by: Sagi Grimberg <[email protected]>
Tested-by: Devesh Sharma <[email protected]>
---
net/sunrpc/xprtrdma/verbs.c | 102 ++++++++-------------------------------
net/sunrpc/xprtrdma/xprt_rdma.h | 1 -
2 files changed, 21 insertions(+), 82 deletions(-)
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 1065808..da184f9 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -1229,75 +1229,6 @@ rpcrdma_mapping_error(struct rpcrdma_mr_seg *seg)
(unsigned long long)seg->mr_dma, seg->mr_dmalen);
}
-static int
-rpcrdma_register_internal(struct rpcrdma_ia *ia, void *va, int len,
- struct ib_mr **mrp, struct ib_sge *iov)
-{
- struct ib_phys_buf ipb;
- struct ib_mr *mr;
- int rc;
-
- /*
- * All memory passed here was kmalloc'ed, therefore phys-contiguous.
- */
- iov->addr = ib_dma_map_single(ia->ri_device,
- va, len, DMA_BIDIRECTIONAL);
- if (ib_dma_mapping_error(ia->ri_device, iov->addr))
- return -ENOMEM;
-
- iov->length = len;
-
- if (ia->ri_have_dma_lkey) {
- *mrp = NULL;
- iov->lkey = ia->ri_dma_lkey;
- return 0;
- } else if (ia->ri_bind_mem != NULL) {
- *mrp = NULL;
- iov->lkey = ia->ri_bind_mem->lkey;
- return 0;
- }
-
- ipb.addr = iov->addr;
- ipb.size = iov->length;
- mr = ib_reg_phys_mr(ia->ri_pd, &ipb, 1,
- IB_ACCESS_LOCAL_WRITE, &iov->addr);
-
- dprintk("RPC: %s: phys convert: 0x%llx "
- "registered 0x%llx length %d\n",
- __func__, (unsigned long long)ipb.addr,
- (unsigned long long)iov->addr, len);
-
- if (IS_ERR(mr)) {
- *mrp = NULL;
- rc = PTR_ERR(mr);
- dprintk("RPC: %s: failed with %i\n", __func__, rc);
- } else {
- *mrp = mr;
- iov->lkey = mr->lkey;
- rc = 0;
- }
-
- return rc;
-}
-
-static int
-rpcrdma_deregister_internal(struct rpcrdma_ia *ia,
- struct ib_mr *mr, struct ib_sge *iov)
-{
- int rc;
-
- ib_dma_unmap_single(ia->ri_device,
- iov->addr, iov->length, DMA_BIDIRECTIONAL);
-
- if (NULL == mr)
- return 0;
-
- rc = ib_dereg_mr(mr);
- if (rc)
- dprintk("RPC: %s: ib_dereg_mr failed %i\n", __func__, rc);
- return rc;
-}
-
/**
* rpcrdma_alloc_regbuf - kmalloc and register memory for SEND/RECV buffers
* @ia: controlling rpcrdma_ia
@@ -1317,26 +1248,30 @@ struct rpcrdma_regbuf *
rpcrdma_alloc_regbuf(struct rpcrdma_ia *ia, size_t size, gfp_t flags)
{
struct rpcrdma_regbuf *rb;
- int rc;
+ struct ib_sge *iov;
- rc = -ENOMEM;
rb = kmalloc(sizeof(*rb) + size, flags);
if (rb == NULL)
goto out;
- rb->rg_size = size;
- rb->rg_owner = NULL;
- rc = rpcrdma_register_internal(ia, rb->rg_base, size,
- &rb->rg_mr, &rb->rg_iov);
- if (rc)
+ iov = &rb->rg_iov;
+ iov->addr = ib_dma_map_single(ia->ri_device,
+ (void *)rb->rg_base, size,
+ DMA_BIDIRECTIONAL);
+ if (ib_dma_mapping_error(ia->ri_device, iov->addr))
goto out_free;
+ iov->length = size;
+ iov->lkey = ia->ri_have_dma_lkey ?
+ ia->ri_dma_lkey : ia->ri_bind_mem->lkey;
+ rb->rg_size = size;
+ rb->rg_owner = NULL;
return rb;
out_free:
kfree(rb);
out:
- return ERR_PTR(rc);
+ return ERR_PTR(-ENOMEM);
}
/**
@@ -1347,10 +1282,15 @@ out:
void
rpcrdma_free_regbuf(struct rpcrdma_ia *ia, struct rpcrdma_regbuf *rb)
{
- if (rb) {
- rpcrdma_deregister_internal(ia, rb->rg_mr, &rb->rg_iov);
- kfree(rb);
- }
+ struct ib_sge *iov;
+
+ if (!rb)
+ return;
+
+ iov = &rb->rg_iov;
+ ib_dma_unmap_single(ia->ri_device,
+ iov->addr, iov->length, DMA_BIDIRECTIONAL);
+ kfree(rb);
}
/*
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index abee472..ce4e79e 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -119,7 +119,6 @@ struct rpcrdma_ep {
struct rpcrdma_regbuf {
size_t rg_size;
struct rpcrdma_req *rg_owner;
- struct ib_mr *rg_mr;
struct ib_sge rg_iov;
__be32 rg_base[0] __attribute__ ((aligned(256)));
};
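The regbuf lifecycle that results from this patch, condensed from the
hunks above:

/* alloc: kmalloc(sizeof(*rb) + size);
 *        ib_dma_map_single(rg_base, size, DMA_BIDIRECTIONAL);
 *        rg_iov.lkey = DMA lkey (or the DMA MR's lkey)
 * free:  ib_dma_unmap_single(rg_iov.addr, rg_iov.length, ...);
 *        kfree(rb);
 * No per-buffer MR is ever registered, so rg_mr can go away.
 */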
Untangle the end of rpcrdma_ia_open() by moving DMA MR set-up, which
is different for each registration method, to the .ro_open functions.
This is refactoring only. No behavior change is expected.
Signed-off-by: Chuck Lever <[email protected]>
Tested-by: Devesh Sharma <[email protected]>
---
net/sunrpc/xprtrdma/fmr_ops.c | 19 +++++++++++
net/sunrpc/xprtrdma/frwr_ops.c | 5 +++
net/sunrpc/xprtrdma/physical_ops.c | 25 ++++++++++++++-
net/sunrpc/xprtrdma/verbs.c | 60 +++++++++++-------------------------
net/sunrpc/xprtrdma/xprt_rdma.h | 3 +-
5 files changed, 67 insertions(+), 45 deletions(-)
diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c
index f1e8daf..cb25c89 100644
--- a/net/sunrpc/xprtrdma/fmr_ops.c
+++ b/net/sunrpc/xprtrdma/fmr_ops.c
@@ -39,6 +39,25 @@ static int
fmr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep,
struct rpcrdma_create_data_internal *cdata)
{
+ struct ib_device_attr *devattr = &ia->ri_devattr;
+ struct ib_mr *mr;
+
+ /* Obtain an lkey to use for the regbufs, which are
+ * protected from remote access.
+ */
+ if (devattr->device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY) {
+ ia->ri_dma_lkey = ia->ri_device->local_dma_lkey;
+ } else {
+ mr = ib_get_dma_mr(ia->ri_pd, IB_ACCESS_LOCAL_WRITE);
+ if (IS_ERR(mr)) {
+ pr_err("%s: ib_get_dma_mr failed with %lX\n",
+ __func__, PTR_ERR(mr));
+ return -ENOMEM;
+ }
+ ia->ri_dma_mr = mr;
+ ia->ri_dma_lkey = ia->ri_dma_mr->lkey;
+ }
+
return 0;
}
diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
index 04ea914..63f282e 100644
--- a/net/sunrpc/xprtrdma/frwr_ops.c
+++ b/net/sunrpc/xprtrdma/frwr_ops.c
@@ -189,6 +189,11 @@ frwr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep,
struct ib_device_attr *devattr = &ia->ri_devattr;
int depth, delta;
+ /* Obtain an lkey to use for the regbufs, which are
+ * protected from remote access.
+ */
+ ia->ri_dma_lkey = ia->ri_device->local_dma_lkey;
+
ia->ri_max_frmr_depth =
min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS,
devattr->max_fast_reg_page_list_len);
diff --git a/net/sunrpc/xprtrdma/physical_ops.c b/net/sunrpc/xprtrdma/physical_ops.c
index 41985d0..72cf8b1 100644
--- a/net/sunrpc/xprtrdma/physical_ops.c
+++ b/net/sunrpc/xprtrdma/physical_ops.c
@@ -23,6 +23,29 @@ static int
physical_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep,
struct rpcrdma_create_data_internal *cdata)
{
+ struct ib_device_attr *devattr = &ia->ri_devattr;
+ struct ib_mr *mr;
+
+ /* Obtain an rkey to use for RPC data payloads.
+ */
+ mr = ib_get_dma_mr(ia->ri_pd,
+ IB_ACCESS_LOCAL_WRITE |
+ IB_ACCESS_REMOTE_WRITE |
+ IB_ACCESS_REMOTE_READ);
+ if (IS_ERR(mr)) {
+ pr_err("%s: ib_get_dma_mr failed with %lX\n",
+ __func__, PTR_ERR(mr));
+ return -ENOMEM;
+ }
+ ia->ri_dma_mr = mr;
+
+ /* Obtain an lkey to use for regbufs.
+ */
+ if (devattr->device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY)
+ ia->ri_dma_lkey = ia->ri_device->local_dma_lkey;
+ else
+ ia->ri_dma_lkey = ia->ri_dma_mr->lkey;
+
return 0;
}
@@ -51,7 +74,7 @@ physical_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
struct rpcrdma_ia *ia = &r_xprt->rx_ia;
rpcrdma_map_one(ia->ri_device, seg, rpcrdma_data_dir(writing));
- seg->mr_rkey = ia->ri_bind_mem->rkey;
+ seg->mr_rkey = ia->ri_dma_mr->rkey;
seg->mr_base = seg->mr_dma;
seg->mr_nsegs = 1;
return 1;
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index da184f9..8516d98 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -493,9 +493,11 @@ rpcrdma_clean_cq(struct ib_cq *cq)
int
rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
{
- int rc, mem_priv;
struct rpcrdma_ia *ia = &xprt->rx_ia;
struct ib_device_attr *devattr = &ia->ri_devattr;
+ int rc;
+
+ ia->ri_dma_mr = NULL;
ia->ri_id = rpcrdma_create_id(xprt, ia, addr);
if (IS_ERR(ia->ri_id)) {
@@ -519,11 +521,6 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
goto out3;
}
- if (devattr->device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY) {
- ia->ri_have_dma_lkey = 1;
- ia->ri_dma_lkey = ia->ri_device->local_dma_lkey;
- }
-
if (memreg == RPCRDMA_FRMR) {
/* Requires both frmr reg and local dma lkey */
if (((devattr->device_cap_flags &
@@ -543,38 +540,15 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
}
}
- /*
- * Optionally obtain an underlying physical identity mapping in
- * order to do a memory window-based bind. This base registration
- * is protected from remote access - that is enabled only by binding
- * for the specific bytes targeted during each RPC operation, and
- * revoked after the corresponding completion similar to a storage
- * adapter.
- */
switch (memreg) {
case RPCRDMA_FRMR:
ia->ri_ops = &rpcrdma_frwr_memreg_ops;
break;
case RPCRDMA_ALLPHYSICAL:
ia->ri_ops = &rpcrdma_physical_memreg_ops;
- mem_priv = IB_ACCESS_LOCAL_WRITE |
- IB_ACCESS_REMOTE_WRITE |
- IB_ACCESS_REMOTE_READ;
- goto register_setup;
+ break;
case RPCRDMA_MTHCAFMR:
ia->ri_ops = &rpcrdma_fmr_memreg_ops;
- if (ia->ri_have_dma_lkey)
- break;
- mem_priv = IB_ACCESS_LOCAL_WRITE;
- register_setup:
- ia->ri_bind_mem = ib_get_dma_mr(ia->ri_pd, mem_priv);
- if (IS_ERR(ia->ri_bind_mem)) {
- printk(KERN_ALERT "%s: ib_get_dma_mr for "
- "phys register failed with %lX\n",
- __func__, PTR_ERR(ia->ri_bind_mem));
- rc = -ENOMEM;
- goto out3;
- }
break;
default:
printk(KERN_ERR "RPC: Unsupported memory "
@@ -606,15 +580,7 @@ out1:
void
rpcrdma_ia_close(struct rpcrdma_ia *ia)
{
- int rc;
-
dprintk("RPC: %s: entering\n", __func__);
- if (ia->ri_bind_mem != NULL) {
- rc = ib_dereg_mr(ia->ri_bind_mem);
- dprintk("RPC: %s: ib_dereg_mr returned %i\n",
- __func__, rc);
- }
-
if (ia->ri_id != NULL && !IS_ERR(ia->ri_id)) {
if (ia->ri_id->qp)
rdma_destroy_qp(ia->ri_id);
@@ -661,8 +627,10 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
if (cdata->padding) {
ep->rep_padbuf = rpcrdma_alloc_regbuf(ia, cdata->padding,
GFP_KERNEL);
- if (IS_ERR(ep->rep_padbuf))
- return PTR_ERR(ep->rep_padbuf);
+ if (IS_ERR(ep->rep_padbuf)) {
+ rc = PTR_ERR(ep->rep_padbuf);
+ goto out0;
+ }
} else
ep->rep_padbuf = NULL;
@@ -749,6 +717,9 @@ out2:
__func__, err);
out1:
rpcrdma_free_regbuf(ia, ep->rep_padbuf);
+out0:
+ if (ia->ri_dma_mr)
+ ib_dereg_mr(ia->ri_dma_mr);
return rc;
}
@@ -788,6 +759,12 @@ rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
if (rc)
dprintk("RPC: %s: ib_destroy_cq returned %i\n",
__func__, rc);
+
+ if (ia->ri_dma_mr) {
+ rc = ib_dereg_mr(ia->ri_dma_mr);
+ dprintk("RPC: %s: ib_dereg_mr returned %i\n",
+ __func__, rc);
+ }
}
/*
@@ -1262,8 +1239,7 @@ rpcrdma_alloc_regbuf(struct rpcrdma_ia *ia, size_t size, gfp_t flags)
goto out_free;
iov->length = size;
- iov->lkey = ia->ri_have_dma_lkey ?
- ia->ri_dma_lkey : ia->ri_bind_mem->lkey;
+ iov->lkey = ia->ri_dma_lkey;
rb->rg_size = size;
rb->rg_owner = NULL;
return rb;
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index ce4e79e..8219011 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -65,9 +65,8 @@ struct rpcrdma_ia {
struct ib_device *ri_device;
struct rdma_cm_id *ri_id;
struct ib_pd *ri_pd;
- struct ib_mr *ri_bind_mem;
+ struct ib_mr *ri_dma_mr;
u32 ri_dma_lkey;
- int ri_have_dma_lkey;
struct completion ri_done;
int ri_async_rc;
unsigned int ri_max_frmr_depth;
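Summarizing where each registration mode now obtains its lkey and
DMA MR (as implemented by the three .ro_open hunks above):

/* FRWR:     ri_dma_lkey = device->local_dma_lkey (required anyway)
 * FMR:      local_dma_lkey if the device offers one; otherwise an
 *           ib_get_dma_mr(LOCAL_WRITE) MR supplies the lkey
 * PHYSICAL: ib_get_dma_mr(LOCAL_WRITE | REMOTE_WRITE | REMOTE_READ)
 *           supplies both the payload rkey and the lkey
 * The DMA MR, if one was created, is released in rpcrdma_ep_destroy().
 */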
RDMA_MSGP type calls insert a zero pad in the middle of the RPC
message to align the RPC request's data payload to the server's
alignment preferences. A server can then "page flip" the payload
into place to avoid a data copy in certain circumstances. However:
1. The client has to have a priori knowledge of the server's
preferred alignment
2. Requests eligible for RDMA_MSGP are requests that are small
enough to have been sent inline, and convey a data payload
at the _end_ of the RPC message
Today, item 1 is handled with a sysctl: a global setting that is
copied during mount. Linux does not support the Connection
Configuration Protocol (CCP) for querying the server's preferences
(RFC 5666, Section 6).
A small-ish NFSv3 WRITE might use RDMA_MSGP, but no NFSv4
compound fits item 2.
Thus the Linux client currently leaves RDMA_MSGP disabled. The
Linux server handles RDMA_MSGP, but does not use any special
page flipping, so it confers no benefit.
Clean up the marshaling code by removing the logic that constructs
RDMA_MSGP type calls. This also reduces the maximum send iovec size
from four to just two elements.
/proc/sys/sunrpc/rdma_inline_write_padding is a kernel API, and
thus is left in place.
Signed-off-by: Chuck Lever <[email protected]>
Tested-by: Devesh Sharma <[email protected]>
---
net/sunrpc/xprtrdma/rpc_rdma.c | 92 ++++++++++-----------------------------
net/sunrpc/xprtrdma/verbs.c | 47 +++++++-------------
net/sunrpc/xprtrdma/xprt_rdma.h | 19 ++++----
3 files changed, 51 insertions(+), 107 deletions(-)
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index 84ea37d..8e9c564 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -297,8 +297,7 @@ out:
* pre-registered memory buffer for this request. For small amounts
* of data, this is efficient. The cutoff value is tunable.
*/
-static int
-rpcrdma_inline_pullup(struct rpc_rqst *rqst, int pad)
+static void rpcrdma_inline_pullup(struct rpc_rqst *rqst)
{
int i, npages, curlen;
int copy_len;
@@ -310,16 +309,9 @@ rpcrdma_inline_pullup(struct rpc_rqst *rqst, int pad)
destp = rqst->rq_svec[0].iov_base;
curlen = rqst->rq_svec[0].iov_len;
destp += curlen;
- /*
- * Do optional padding where it makes sense. Alignment of write
- * payload can help the server, if our setting is accurate.
- */
- pad -= (curlen + 36/*sizeof(struct rpcrdma_msg_padded)*/);
- if (pad < 0 || rqst->rq_slen - curlen < RPCRDMA_INLINE_PAD_THRESH)
- pad = 0; /* don't pad this request */
- dprintk("RPC: %s: pad %d destp 0x%p len %d hdrlen %d\n",
- __func__, pad, destp, rqst->rq_slen, curlen);
+ dprintk("RPC: %s: destp 0x%p len %d hdrlen %d\n",
+ __func__, destp, rqst->rq_slen, curlen);
copy_len = rqst->rq_snd_buf.page_len;
@@ -355,7 +347,6 @@ rpcrdma_inline_pullup(struct rpc_rqst *rqst, int pad)
page_base = 0;
}
/* header now contains entire send message */
- return pad;
}
/*
@@ -380,7 +371,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
char *base;
- size_t rpclen, padlen;
+ size_t rpclen;
ssize_t hdrlen;
enum rpcrdma_chunktype rtype, wtype;
struct rpcrdma_msg *headerp;
@@ -458,7 +449,6 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
}
hdrlen = RPCRDMA_HDRLEN_MIN;
- padlen = 0;
/*
* Pull up any extra send data into the preregistered buffer.
@@ -467,43 +457,24 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
*/
if (rtype == rpcrdma_noch) {
- padlen = rpcrdma_inline_pullup(rqst,
- RPCRDMA_INLINE_PAD_VALUE(rqst));
-
- if (padlen) {
- headerp->rm_type = rdma_msgp;
- headerp->rm_body.rm_padded.rm_align =
- cpu_to_be32(RPCRDMA_INLINE_PAD_VALUE(rqst));
- headerp->rm_body.rm_padded.rm_thresh =
- cpu_to_be32(RPCRDMA_INLINE_PAD_THRESH);
- headerp->rm_body.rm_padded.rm_pempty[0] = xdr_zero;
- headerp->rm_body.rm_padded.rm_pempty[1] = xdr_zero;
- headerp->rm_body.rm_padded.rm_pempty[2] = xdr_zero;
- hdrlen += 2 * sizeof(u32); /* extra words in padhdr */
- if (wtype != rpcrdma_noch) {
- dprintk("RPC: %s: invalid chunk list\n",
- __func__);
- return -EIO;
- }
- } else {
- headerp->rm_body.rm_nochunks.rm_empty[0] = xdr_zero;
- headerp->rm_body.rm_nochunks.rm_empty[1] = xdr_zero;
- headerp->rm_body.rm_nochunks.rm_empty[2] = xdr_zero;
- /* new length after pullup */
- rpclen = rqst->rq_svec[0].iov_len;
- /*
- * Currently we try to not actually use read inline.
- * Reply chunks have the desirable property that
- * they land, packed, directly in the target buffers
- * without headers, so they require no fixup. The
- * additional RDMA Write op sends the same amount
- * of data, streams on-the-wire and adds no overhead
- * on receive. Therefore, we request a reply chunk
- * for non-writes wherever feasible and efficient.
- */
- if (wtype == rpcrdma_noch)
- wtype = rpcrdma_replych;
- }
+ rpcrdma_inline_pullup(rqst);
+
+ headerp->rm_body.rm_nochunks.rm_empty[0] = xdr_zero;
+ headerp->rm_body.rm_nochunks.rm_empty[1] = xdr_zero;
+ headerp->rm_body.rm_nochunks.rm_empty[2] = xdr_zero;
+ /* new length after pullup */
+ rpclen = rqst->rq_svec[0].iov_len;
+ /* Currently we try to not actually use read inline.
+ * Reply chunks have the desirable property that
+ * they land, packed, directly in the target buffers
+ * without headers, so they require no fixup. The
+ * additional RDMA Write op sends the same amount
+ * of data, streams on-the-wire and adds no overhead
+ * on receive. Therefore, we request a reply chunk
+ * for non-writes wherever feasible and efficient.
+ */
+ if (wtype == rpcrdma_noch)
+ wtype = rpcrdma_replych;
}
if (rtype != rpcrdma_noch) {
@@ -518,9 +489,9 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
if (hdrlen < 0)
return hdrlen;
- dprintk("RPC: %s: %s: hdrlen %zd rpclen %zd padlen %zd"
+ dprintk("RPC: %s: %s: hdrlen %zd rpclen %zd"
" headerp 0x%p base 0x%p lkey 0x%x\n",
- __func__, transfertypes[wtype], hdrlen, rpclen, padlen,
+ __func__, transfertypes[wtype], hdrlen, rpclen,
headerp, base, rdmab_lkey(req->rl_rdmabuf));
/*
@@ -539,21 +510,6 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
req->rl_send_iov[1].lkey = rdmab_lkey(req->rl_sendbuf);
req->rl_niovs = 2;
-
- if (padlen) {
- struct rpcrdma_ep *ep = &r_xprt->rx_ep;
-
- req->rl_send_iov[2].addr = rdmab_addr(ep->rep_padbuf);
- req->rl_send_iov[2].length = padlen;
- req->rl_send_iov[2].lkey = rdmab_lkey(ep->rep_padbuf);
-
- req->rl_send_iov[3].addr = req->rl_send_iov[1].addr + rpclen;
- req->rl_send_iov[3].length = rqst->rq_slen - rpclen;
- req->rl_send_iov[3].lkey = rdmab_lkey(req->rl_sendbuf);
-
- req->rl_niovs = 4;
- }
-
return 0;
}
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 8516d98..b4d4f63 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -605,6 +605,12 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
struct ib_cq_init_attr cq_attr = {};
int rc, err;
+ if (devattr->max_sge < RPCRDMA_MAX_IOVS) {
+ dprintk("RPC: %s: insufficient sge's available\n",
+ __func__);
+ return -ENOMEM;
+ }
+
/* check provider's send/recv wr limits */
if (cdata->max_requests > devattr->max_qp_wr)
cdata->max_requests = devattr->max_qp_wr;
@@ -617,23 +623,13 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
if (rc)
return rc;
ep->rep_attr.cap.max_recv_wr = cdata->max_requests;
- ep->rep_attr.cap.max_send_sge = (cdata->padding ? 4 : 2);
+ ep->rep_attr.cap.max_send_sge = RPCRDMA_MAX_IOVS;
ep->rep_attr.cap.max_recv_sge = 1;
ep->rep_attr.cap.max_inline_data = 0;
ep->rep_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
ep->rep_attr.qp_type = IB_QPT_RC;
ep->rep_attr.port_num = ~0;
- if (cdata->padding) {
- ep->rep_padbuf = rpcrdma_alloc_regbuf(ia, cdata->padding,
- GFP_KERNEL);
- if (IS_ERR(ep->rep_padbuf)) {
- rc = PTR_ERR(ep->rep_padbuf);
- goto out0;
- }
- } else
- ep->rep_padbuf = NULL;
-
dprintk("RPC: %s: requested max: dtos: send %d recv %d; "
"iovs: send %d recv %d\n",
__func__,
@@ -716,8 +712,6 @@ out2:
dprintk("RPC: %s: ib_destroy_cq returned %i\n",
__func__, err);
out1:
- rpcrdma_free_regbuf(ia, ep->rep_padbuf);
-out0:
if (ia->ri_dma_mr)
ib_dereg_mr(ia->ri_dma_mr);
return rc;
@@ -746,8 +740,6 @@ rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
ia->ri_id->qp = NULL;
}
- rpcrdma_free_regbuf(ia, ep->rep_padbuf);
-
rpcrdma_clean_cq(ep->rep_attr.recv_cq);
rc = ib_destroy_cq(ep->rep_attr.recv_cq);
if (rc)
@@ -1279,9 +1271,11 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia,
struct rpcrdma_ep *ep,
struct rpcrdma_req *req)
{
+ struct ib_device *device = ia->ri_device;
struct ib_send_wr send_wr, *send_wr_fail;
struct rpcrdma_rep *rep = req->rl_reply;
- int rc;
+ struct ib_sge *iov = req->rl_send_iov;
+ int i, rc;
if (rep) {
rc = rpcrdma_ep_post_recv(ia, ep, rep);
@@ -1292,22 +1286,15 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia,
send_wr.next = NULL;
send_wr.wr_id = RPCRDMA_IGNORE_COMPLETION;
- send_wr.sg_list = req->rl_send_iov;
+ send_wr.sg_list = iov;
send_wr.num_sge = req->rl_niovs;
send_wr.opcode = IB_WR_SEND;
- if (send_wr.num_sge == 4) /* no need to sync any pad (constant) */
- ib_dma_sync_single_for_device(ia->ri_device,
- req->rl_send_iov[3].addr,
- req->rl_send_iov[3].length,
- DMA_TO_DEVICE);
- ib_dma_sync_single_for_device(ia->ri_device,
- req->rl_send_iov[1].addr,
- req->rl_send_iov[1].length,
- DMA_TO_DEVICE);
- ib_dma_sync_single_for_device(ia->ri_device,
- req->rl_send_iov[0].addr,
- req->rl_send_iov[0].length,
- DMA_TO_DEVICE);
+
+ for (i = 0; i < send_wr.num_sge; i++)
+ ib_dma_sync_single_for_device(device, iov[i].addr,
+ iov[i].length, DMA_TO_DEVICE);
+ dprintk("RPC: %s: posting %d s/g entries\n",
+ __func__, send_wr.num_sge);
if (DECR_CQCOUNT(ep) > 0)
send_wr.send_flags = 0;
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index 8219011..8422c09 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -88,7 +88,6 @@ struct rpcrdma_ep {
int rep_connected;
struct ib_qp_init_attr rep_attr;
wait_queue_head_t rep_connect_wait;
- struct rpcrdma_regbuf *rep_padbuf;
struct rdma_conn_param rep_remote_cma;
struct sockaddr_storage rep_remote_addr;
struct delayed_work rep_connect_worker;
@@ -255,16 +254,18 @@ struct rpcrdma_mr_seg { /* chunk descriptors */
char *mr_offset; /* kva if no page, else offset */
};
+#define RPCRDMA_MAX_IOVS (2)
+
struct rpcrdma_req {
- unsigned int rl_niovs; /* 0, 2 or 4 */
- unsigned int rl_nchunks; /* non-zero if chunks */
- unsigned int rl_connect_cookie; /* retry detection */
- struct rpcrdma_buffer *rl_buffer; /* home base for this structure */
+ unsigned int rl_niovs;
+ unsigned int rl_nchunks;
+ unsigned int rl_connect_cookie;
+ struct rpcrdma_buffer *rl_buffer;
struct rpcrdma_rep *rl_reply;/* holder for reply buffer */
- struct ib_sge rl_send_iov[4]; /* for active requests */
- struct rpcrdma_regbuf *rl_rdmabuf;
- struct rpcrdma_regbuf *rl_sendbuf;
- struct rpcrdma_mr_seg rl_segments[RPCRDMA_MAX_SEGS];
+ struct ib_sge rl_send_iov[RPCRDMA_MAX_IOVS];
+ struct rpcrdma_regbuf *rl_rdmabuf;
+ struct rpcrdma_regbuf *rl_sendbuf;
+ struct rpcrdma_mr_seg rl_segments[RPCRDMA_MAX_SEGS];
};
static inline struct rpcrdma_req *
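With RDMA_MSGP gone, a Send needs at most two SGEs. The resulting
layout, as marshaled in rpc_rdma.c above:

/* rl_send_iov[0]: RPC/RDMA header  (rl_rdmabuf)
 * rl_send_iov[1]: RPC call message (rl_sendbuf)
 * hence RPCRDMA_MAX_IOVS == 2 and the new devattr->max_sge check
 * in rpcrdma_ep_create().
 */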
When the size of the RPC message is near the inline threshold (1KB),
the client could send messages that were a few bytes too large.
When marshaling RPC/RDMA requests, ensure the combined size of
the RPC/RDMA header and the RPC header does not exceed the inline
threshold.
Endpoints typically reject RPC/RDMA messages that exceed the size
of their receive buffers.
The two server implementations I test with (Linux and Solaris) use
receive buffers that are larger than the client's inline threshold.
Thus, so far, this has been benign, observed only by code inspection.
Signed-off-by: Chuck Lever <[email protected]>
Reviewed-by: Devesh Sharma <[email protected]>
Tested-by: Devesh Sharma <[email protected]>
---
net/sunrpc/xprtrdma/rpc_rdma.c | 29 +++++++++++++++++++++++++++--
1 file changed, 27 insertions(+), 2 deletions(-)
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index 8e9c564..950b654 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -71,6 +71,31 @@ static const char transfertypes[][12] = {
};
#endif
+/* The client can send a request inline as long as the RPCRDMA header
+ * plus the RPC call fit under the transport's inline limit. If the
+ * combined call message size exceeds that limit, the client must use
+ * the read chunk list for this operation.
+ */
+static bool rpcrdma_args_inline(struct rpc_rqst *rqst)
+{
+ unsigned int callsize = RPCRDMA_HDRLEN_MIN + rqst->rq_snd_buf.len;
+
+ return callsize <= RPCRDMA_INLINE_WRITE_THRESHOLD(rqst);
+}
+
+/* The client can't know how large the actual reply will be. Thus it
+ * plans for the largest possible reply for that particular ULP
+ * operation. If the maximum combined reply message size exceeds that
+ * limit, the client must provide a write list or a reply chunk for
+ * this request.
+ */
+static bool rpcrdma_results_inline(struct rpc_rqst *rqst)
+{
+ unsigned int repsize = RPCRDMA_HDRLEN_MIN + rqst->rq_rcv_buf.buflen;
+
+ return repsize <= RPCRDMA_INLINE_READ_THRESHOLD(rqst);
+}
+
/*
* Chunk assembly from upper layer xdr_buf.
*
@@ -409,7 +434,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
* a READ, then use write chunks to separate the file data
* into pages; otherwise use reply chunks.
*/
- if (rqst->rq_rcv_buf.buflen <= RPCRDMA_INLINE_READ_THRESHOLD(rqst))
+ if (rpcrdma_results_inline(rqst))
wtype = rpcrdma_noch;
else if (rqst->rq_rcv_buf.page_len == 0)
wtype = rpcrdma_replych;
@@ -432,7 +457,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
* implies the op is a write.
* TBD check NFSv4 setacl
*/
- if (rqst->rq_snd_buf.len <= RPCRDMA_INLINE_WRITE_THRESHOLD(rqst))
+ if (rpcrdma_args_inline(rqst))
rtype = rpcrdma_noch;
else if (rqst->rq_snd_buf.page_len == 0)
rtype = rpcrdma_areadch;
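A worked example of the new check, assuming the default 1024-byte
inline threshold and a 28-byte minimal RPC/RDMA header
(RPCRDMA_HDRLEN_MIN):

/* rq_snd_buf.len = 1000:
 *   callsize = 28 + 1000 = 1028 > 1024
 * The old test (1000 <= 1024) would have sent this call inline and
 * overrun a 1024-byte receive buffer; rpcrdma_args_inline() now
 * forces it onto the read chunk path.
 */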
The client has been setting up a reply chunk for NFS READs that are
smaller than the inline threshold. This is not efficient: both the
server and client CPUs have to copy the reply's data payload into
and out of the memory region that is then transferred via RDMA.
Using the write list, the data payload is moved by the device and no
extra data copying is necessary.
Signed-off-by: Chuck Lever <[email protected]>
Reviewed-by: Devesh Sharma <[email protected]>
Reviewed-by: Sagi Grimberg <[email protected]>
Tested-by: Devesh Sharma <[email protected]>
---
net/sunrpc/xprtrdma/rpc_rdma.c | 21 ++++-----------------
1 file changed, 4 insertions(+), 17 deletions(-)
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index 950b654..e7cf976 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -418,28 +418,15 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
/*
* Chunks needed for results?
*
+ * o Read ops return data as write chunk(s), header as inline.
* o If the expected result is under the inline threshold, all ops
* return as inline (but see later).
* o Large non-read ops return as a single reply chunk.
- * o Large read ops return data as write chunk(s), header as inline.
- *
- * Note: the NFS code sending down multiple result segments implies
- * the op is one of read, readdir[plus], readlink or NFSv4 getacl.
- */
-
- /*
- * This code can handle read chunks, write chunks OR reply
- * chunks -- only one type. If the request is too big to fit
- * inline, then we will choose read chunks. If the request is
- * a READ, then use write chunks to separate the file data
- * into pages; otherwise use reply chunks.
*/
- if (rpcrdma_results_inline(rqst))
- wtype = rpcrdma_noch;
- else if (rqst->rq_rcv_buf.page_len == 0)
- wtype = rpcrdma_replych;
- else if (rqst->rq_rcv_buf.flags & XDRBUF_READ)
+ if (rqst->rq_rcv_buf.flags & XDRBUF_READ)
wtype = rpcrdma_writech;
+ else if (rpcrdma_results_inline(rqst))
+ wtype = rpcrdma_noch;
else
wtype = rpcrdma_replych;
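Restating the decision order that results from this patch:

/* XDRBUF_READ set (READ-class op) -> rpcrdma_writech (write list)
 * reply fits inline               -> rpcrdma_noch
 * otherwise                       -> rpcrdma_replych (reply chunk)
 */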
Currently Linux always offers a reply chunk, even when the reply
can be sent inline (i.e., is smaller than 1KB).
On the client, registering a memory region can be expensive. A
server may choose not to use the reply chunk, wasting the cost of
the registration.
This is a change only for RPC replies smaller than 1KB, which the
server constructs in the RPC reply send buffer. Because the elements
of the reply must be XDR encoded, a copy-free data transfer has no
benefit in this case.
Signed-off-by: Chuck Lever <[email protected]>
Reviewed-by: Sagi Grimberg <[email protected]>
Tested-by: Devesh Sharma <[email protected]>
---
net/sunrpc/xprtrdma/rpc_rdma.c | 13 +------------
1 file changed, 1 insertion(+), 12 deletions(-)
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index e7cf976..62150ae 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -420,7 +420,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
*
* o Read ops return data as write chunk(s), header as inline.
* o If the expected result is under the inline threshold, all ops
- * return as inline (but see later).
+ * return as inline.
* o Large non-read ops return as a single reply chunk.
*/
if (rqst->rq_rcv_buf.flags & XDRBUF_READ)
@@ -476,17 +476,6 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
headerp->rm_body.rm_nochunks.rm_empty[2] = xdr_zero;
/* new length after pullup */
rpclen = rqst->rq_svec[0].iov_len;
- /* Currently we try to not actually use read inline.
- * Reply chunks have the desirable property that
- * they land, packed, directly in the target buffers
- * without headers, so they require no fixup. The
- * additional RDMA Write op sends the same amount
- * of data, streams on-the-wire and adds no overhead
- * on receive. Therefore, we request a reply chunk
- * for non-writes wherever feasible and efficient.
- */
- if (wtype == rpcrdma_noch)
- wtype = rpcrdma_replych;
}
if (rtype != rpcrdma_noch) {
Currently xprtrdma appends an extra chunk element to the RPC/RDMA
read chunk list of each NFSv4 WRITE compound. The extra element
contains the final GETATTR operation in the compound.
The result is an extra RDMA READ operation to transfer a very short
piece of each NFS WRITE compound (typically 16 bytes). This is
inefficient.
It is also incorrect. Although RFC 5667 is not precise about when
using a read list with NFSv4 COMPOUND is allowed, the intent is that
only data arguments not touched by NFS (ie, read and write payloads)
are to be sent using RDMA READ or WRITE. The NFS client constructs
GETATTR arguments itself, and therefore is required to send the
trailing GETATTR operation as additional inline content, not as a
data payload.
NB: This change is not backwards compatible. Some older servers do
not accept inline content following the read list. The Linux NFS
server should handle this content correctly as of commit
a97c331f9aa9 ("svcrdma: Handle additional inline content").
Signed-off-by: Chuck Lever <[email protected]>
Tested-by: Devesh Sharma <[email protected]>
---
net/sunrpc/xprtrdma/rpc_rdma.c | 44 ++++++++++++++++++++++++++++++++++++++--
1 file changed, 42 insertions(+), 2 deletions(-)
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index 62150ae..1dd48f2 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -96,6 +96,42 @@ static bool rpcrdma_results_inline(struct rpc_rqst *rqst)
return repsize <= RPCRDMA_INLINE_READ_THRESHOLD(rqst);
}
+static int
+rpcrdma_tail_pullup(struct xdr_buf *buf)
+{
+ size_t tlen = buf->tail[0].iov_len;
+ size_t skip = tlen & 3;
+
+ /* Do not include the tail if it is only an XDR pad */
+ if (tlen < 4)
+ return 0;
+
+ /* xdr_write_pages() adds a pad at the beginning of the tail
+ * if the content in "buf->pages" is unaligned. Force the
+ * tail's actual content to land at the next XDR position
+ * after the head instead.
+ */
+ if (skip) {
+ unsigned char *src, *dst;
+ unsigned int count;
+
+ src = buf->tail[0].iov_base;
+ dst = buf->head[0].iov_base;
+ dst += buf->head[0].iov_len;
+
+ src += skip;
+ tlen -= skip;
+
+ dprintk("RPC: %s: skip=%zu, memmove(%p, %p, %zu)\n",
+ __func__, skip, dst, src, tlen);
+
+ for (count = tlen; count; count--)
+ *dst++ = *src++;
+ }
+
+ return tlen;
+}
+
/*
* Chunk assembly from upper layer xdr_buf.
*
@@ -147,6 +183,10 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos,
if (len && n == nsegs)
return -EIO;
+ /* When encoding the read list, the tail is always sent inline */
+ if (type == rpcrdma_readch)
+ return n;
+
if (xdrbuf->tail[0].iov_len) {
/* the rpcrdma protocol allows us to omit any trailing
* xdr pad bytes, saving the server an RDMA operation. */
@@ -476,8 +516,8 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
headerp->rm_body.rm_nochunks.rm_empty[2] = xdr_zero;
/* new length after pullup */
rpclen = rqst->rq_svec[0].iov_len;
- }
-
+ } else if (rtype == rpcrdma_readch)
+ rpclen += rpcrdma_tail_pullup(&rqst->rq_snd_buf);
if (rtype != rpcrdma_noch) {
hdrlen = rpcrdma_create_chunks(rqst, &rqst->rq_snd_buf,
headerp, rtype);
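A short worked example of rpcrdma_tail_pullup(), assuming an
unaligned page payload:

/* Say page_len leaves a 2-byte XDR pad at the start of the tail and
 * the tail's real content is 8 bytes, so tlen = 10:
 *   skip = 10 & 3 = 2
 * The 8 content bytes are copied to follow the head, the pad is
 * dropped, and the function returns 8, which is added to rpclen.
 */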
Repair how rpcrdma_marshal_req() chooses which RDMA message type
to use for large non-WRITE operations so that it picks RDMA_NOMSG
in the correct situations, and sets up the marshaling logic to
SEND only the RPC/RDMA header.
Large NFSv2 SYMLINK requests now use RDMA_NOMSG calls. The Linux NFS
server XDR decoder for NFSv2 SYMLINK does not handle having the
pathname argument arrive in a separate buffer. The decoder could be
fixed, but this is simpler and RDMA_NOMSG can be used in a variety
of other situations.
Ensure that the Linux client continues to use "RDMA_MSG + read
list" when sending large NFSv3 SYMLINK requests, which is more
efficient than using RDMA_NOMSG.
Large NFSv4 CREATE(NF4LNK) requests are changed to use "RDMA_MSG +
read list" just like NFSv3 (see Section 5 of RFC 5667). Before,
these did not work at all.
Signed-off-by: Chuck Lever <[email protected]>
Tested-by: Devesh Sharma <[email protected]>
---
fs/nfs/nfs3xdr.c | 1 +
fs/nfs/nfs4xdr.c | 4 +++-
net/sunrpc/xprtrdma/rpc_rdma.c | 25 ++++++++++++++++---------
3 files changed, 20 insertions(+), 10 deletions(-)
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index 9b04c2e..267126d 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -1103,6 +1103,7 @@ static void nfs3_xdr_enc_symlink3args(struct rpc_rqst *req,
{
encode_diropargs3(xdr, args->fromfh, args->fromname, args->fromlen);
encode_symlinkdata3(xdr, args);
+ xdr->buf->flags |= XDRBUF_WRITE;
}
/*
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 558cd65d..c42459e 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -1154,7 +1154,9 @@ static void encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg *
case NF4LNK:
p = reserve_space(xdr, 4);
*p = cpu_to_be32(create->u.symlink.len);
- xdr_write_pages(xdr, create->u.symlink.pages, 0, create->u.symlink.len);
+ xdr_write_pages(xdr, create->u.symlink.pages, 0,
+ create->u.symlink.len);
+ xdr->buf->flags |= XDRBUF_WRITE;
break;
case NF4BLK: case NF4CHR:
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index 1dd48f2..2721586 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -475,21 +475,24 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
*
* o If the total request is under the inline threshold, all ops
* are sent as inline.
- * o Large non-write ops are sent with the entire message as a
- * single read chunk (protocol 0-position special case).
* o Large write ops transmit data as read chunk(s), header as
* inline.
+ * o Large non-write ops are sent with the entire message as a
+ * single read chunk (protocol 0-position special case).
*
- * Note: the NFS code sending down multiple argument segments
- * implies the op is a write.
- * TBD check NFSv4 setacl
+ * This assumes that the upper layer does not present a request
+ * that both has a data payload, and whose non-data arguments
+ * by themselves are larger than the inline threshold.
*/
- if (rpcrdma_args_inline(rqst))
+ if (rpcrdma_args_inline(rqst)) {
rtype = rpcrdma_noch;
- else if (rqst->rq_snd_buf.page_len == 0)
- rtype = rpcrdma_areadch;
- else
+ } else if (rqst->rq_snd_buf.flags & XDRBUF_WRITE) {
rtype = rpcrdma_readch;
+ } else {
+ headerp->rm_type = htonl(RDMA_NOMSG);
+ rtype = rpcrdma_areadch;
+ rpclen = 0;
+ }
/* The following simplification is not true forever */
if (rtype != rpcrdma_noch && wtype == rpcrdma_replych)
@@ -546,6 +549,10 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
req->rl_send_iov[0].length = hdrlen;
req->rl_send_iov[0].lkey = rdmab_lkey(req->rl_rdmabuf);
+ req->rl_niovs = 1;
+ if (rtype == rpcrdma_areadch)
+ return 0;
+
req->rl_send_iov[1].addr = rdmab_addr(req->rl_sendbuf);
req->rl_send_iov[1].length = rpclen;
req->rl_send_iov[1].lkey = rdmab_lkey(req->rl_sendbuf);
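In brief, the wire behavior for the new RDMA_NOMSG case:

/* - the Send carries only the RPC/RDMA header (rl_niovs == 1)
 * - the entire RPC call, RPC header included, is described by a
 *   position-zero read chunk that the server fetches via RDMA Read
 * - rpclen is zeroed so nothing from rl_sendbuf goes inline
 */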
checkpatch.pl complained about the seq_printf() format string split
across lines and the use of %Lu.
Signed-off-by: Chuck Lever <[email protected]>
Tested-by: Devesh Sharma <[email protected]>
---
net/sunrpc/xprtrdma/transport.c | 48 +++++++++++++++++++--------------------
1 file changed, 23 insertions(+), 25 deletions(-)
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index d737300..0985b2b 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -647,31 +647,29 @@ static void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq)
if (xprt_connected(xprt))
idle_time = (long)(jiffies - xprt->last_used) / HZ;
- seq_printf(seq,
- "\txprt:\trdma %u %lu %lu %lu %ld %lu %lu %lu %Lu %Lu "
- "%lu %lu %lu %Lu %Lu %Lu %Lu %lu %lu %lu\n",
-
- 0, /* need a local port? */
- xprt->stat.bind_count,
- xprt->stat.connect_count,
- xprt->stat.connect_time,
- idle_time,
- xprt->stat.sends,
- xprt->stat.recvs,
- xprt->stat.bad_xids,
- xprt->stat.req_u,
- xprt->stat.bklog_u,
-
- r_xprt->rx_stats.read_chunk_count,
- r_xprt->rx_stats.write_chunk_count,
- r_xprt->rx_stats.reply_chunk_count,
- r_xprt->rx_stats.total_rdma_request,
- r_xprt->rx_stats.total_rdma_reply,
- r_xprt->rx_stats.pullup_copy_count,
- r_xprt->rx_stats.fixup_copy_count,
- r_xprt->rx_stats.hardway_register_count,
- r_xprt->rx_stats.failed_marshal_count,
- r_xprt->rx_stats.bad_reply_count);
+ seq_puts(seq, "\txprt:\trdma ");
+ seq_printf(seq, "%u %lu %lu %lu %ld %lu %lu %lu %llu %llu ",
+ 0, /* need a local port? */
+ xprt->stat.bind_count,
+ xprt->stat.connect_count,
+ xprt->stat.connect_time,
+ idle_time,
+ xprt->stat.sends,
+ xprt->stat.recvs,
+ xprt->stat.bad_xids,
+ xprt->stat.req_u,
+ xprt->stat.bklog_u);
+ seq_printf(seq, "%lu %lu %lu %llu %llu %llu %llu %lu %lu %lu\n",
+ r_xprt->rx_stats.read_chunk_count,
+ r_xprt->rx_stats.write_chunk_count,
+ r_xprt->rx_stats.reply_chunk_count,
+ r_xprt->rx_stats.total_rdma_request,
+ r_xprt->rx_stats.total_rdma_reply,
+ r_xprt->rx_stats.pullup_copy_count,
+ r_xprt->rx_stats.fixup_copy_count,
+ r_xprt->rx_stats.hardway_register_count,
+ r_xprt->rx_stats.failed_marshal_count,
+ r_xprt->rx_stats.bad_reply_count);
}
static int
RDMA_NOMSG type calls are less efficient than RDMA_MSG. Count NOMSG
calls so administrators can tell if they happen to be used more than
expected.
Signed-off-by: Chuck Lever <[email protected]>
Tested-by: Devesh Sharma <[email protected]>
---
net/sunrpc/xprtrdma/rpc_rdma.c | 1 +
net/sunrpc/xprtrdma/transport.c | 5 +++--
net/sunrpc/xprtrdma/xprt_rdma.h | 1 +
3 files changed, 5 insertions(+), 2 deletions(-)
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index 2721586..bc8bd65 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -489,6 +489,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
} else if (rqst->rq_snd_buf.flags & XDRBUF_WRITE) {
rtype = rpcrdma_readch;
} else {
+ r_xprt->rx_stats.nomsg_call_count++;
headerp->rm_type = htonl(RDMA_NOMSG);
rtype = rpcrdma_areadch;
rpclen = 0;
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index 0985b2b..64443eb 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -659,7 +659,7 @@ static void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq)
xprt->stat.bad_xids,
xprt->stat.req_u,
xprt->stat.bklog_u);
- seq_printf(seq, "%lu %lu %lu %llu %llu %llu %llu %lu %lu %lu\n",
+ seq_printf(seq, "%lu %lu %lu %llu %llu %llu %llu %lu %lu %lu %lu\n",
r_xprt->rx_stats.read_chunk_count,
r_xprt->rx_stats.write_chunk_count,
r_xprt->rx_stats.reply_chunk_count,
@@ -669,7 +669,8 @@ static void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq)
r_xprt->rx_stats.fixup_copy_count,
r_xprt->rx_stats.hardway_register_count,
r_xprt->rx_stats.failed_marshal_count,
- r_xprt->rx_stats.bad_reply_count);
+ r_xprt->rx_stats.bad_reply_count,
+ r_xprt->rx_stats.nomsg_call_count);
}
static int
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index 8422c09..d252457 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -340,6 +340,7 @@ struct rpcrdma_stats {
unsigned long hardway_register_count;
unsigned long failed_marshal_count;
unsigned long bad_reply_count;
+ unsigned long nomsg_call_count;
};
/*
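For administrators: xprt_rdma_print_stats() feeds the "xprt: rdma"
line in /proc/self/mountstats, so the new counter appears there as
the final field (per the seq_printf calls above):

/* ... hardway_register_count failed_marshal_count
 *     bad_reply_count nomsg_call_count
 */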
The verbs are obsolete. The ib_rereg_phys_mr() verb is not used by
kernel ULPs, and the last ib_reg_phys_mr() call site in the kernel
tree has now been removed.
Two staging tree call sites remain in the Lustre client. The Lustre
team has been notified of the deprecation of reg_phys_mr.
Signed-off-by: Chuck Lever <[email protected]>
Acked-by: Doug Ledford <[email protected]>
---
drivers/infiniband/core/verbs.c | 67 ---------------------------------------
include/rdma/ib_verbs.h | 46 ---------------------------
2 files changed, 113 deletions(-)
diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c
index bac3fb4..30eb245 100644
--- a/drivers/infiniband/core/verbs.c
+++ b/drivers/infiniband/core/verbs.c
@@ -1144,73 +1144,6 @@ struct ib_mr *ib_get_dma_mr(struct ib_pd *pd, int mr_access_flags)
}
EXPORT_SYMBOL(ib_get_dma_mr);
-struct ib_mr *ib_reg_phys_mr(struct ib_pd *pd,
- struct ib_phys_buf *phys_buf_array,
- int num_phys_buf,
- int mr_access_flags,
- u64 *iova_start)
-{
- struct ib_mr *mr;
- int err;
-
- err = ib_check_mr_access(mr_access_flags);
- if (err)
- return ERR_PTR(err);
-
- if (!pd->device->reg_phys_mr)
- return ERR_PTR(-ENOSYS);
-
- mr = pd->device->reg_phys_mr(pd, phys_buf_array, num_phys_buf,
- mr_access_flags, iova_start);
-
- if (!IS_ERR(mr)) {
- mr->device = pd->device;
- mr->pd = pd;
- mr->uobject = NULL;
- atomic_inc(&pd->usecnt);
- atomic_set(&mr->usecnt, 0);
- }
-
- return mr;
-}
-EXPORT_SYMBOL(ib_reg_phys_mr);
-
-int ib_rereg_phys_mr(struct ib_mr *mr,
- int mr_rereg_mask,
- struct ib_pd *pd,
- struct ib_phys_buf *phys_buf_array,
- int num_phys_buf,
- int mr_access_flags,
- u64 *iova_start)
-{
- struct ib_pd *old_pd;
- int ret;
-
- ret = ib_check_mr_access(mr_access_flags);
- if (ret)
- return ret;
-
- if (!mr->device->rereg_phys_mr)
- return -ENOSYS;
-
- if (atomic_read(&mr->usecnt))
- return -EBUSY;
-
- old_pd = mr->pd;
-
- ret = mr->device->rereg_phys_mr(mr, mr_rereg_mask, pd,
- phys_buf_array, num_phys_buf,
- mr_access_flags, iova_start);
-
- if (!ret && (mr_rereg_mask & IB_MR_REREG_PD)) {
- atomic_dec(&old_pd->usecnt);
- atomic_inc(&pd->usecnt);
- }
-
- return ret;
-}
-EXPORT_SYMBOL(ib_rereg_phys_mr);
-
int ib_query_mr(struct ib_mr *mr, struct ib_mr_attr *mr_attr)
{
return mr->device->query_mr ?
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index b0f898e..43c1cf0 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -2760,52 +2760,6 @@ static inline void ib_dma_free_coherent(struct ib_device *dev,
}
/**
- * ib_reg_phys_mr - Prepares a virtually addressed memory region for use
- * by an HCA.
- * @pd: The protection domain associated assigned to the registered region.
- * @phys_buf_array: Specifies a list of physical buffers to use in the
- * memory region.
- * @num_phys_buf: Specifies the size of the phys_buf_array.
- * @mr_access_flags: Specifies the memory access rights.
- * @iova_start: The offset of the region's starting I/O virtual address.
- */
-struct ib_mr *ib_reg_phys_mr(struct ib_pd *pd,
- struct ib_phys_buf *phys_buf_array,
- int num_phys_buf,
- int mr_access_flags,
- u64 *iova_start);
-
-/**
- * ib_rereg_phys_mr - Modifies the attributes of an existing memory region.
- * Conceptually, this call performs the functions deregister memory region
- * followed by register physical memory region. Where possible,
- * resources are reused instead of deallocated and reallocated.
- * @mr: The memory region to modify.
- * @mr_rereg_mask: A bit-mask used to indicate which of the following
- * properties of the memory region are being modified.
- * @pd: If %IB_MR_REREG_PD is set in mr_rereg_mask, this field specifies
- * the new protection domain to associated with the memory region,
- * otherwise, this parameter is ignored.
- * @phys_buf_array: If %IB_MR_REREG_TRANS is set in mr_rereg_mask, this
- * field specifies a list of physical buffers to use in the new
- * translation, otherwise, this parameter is ignored.
- * @num_phys_buf: If %IB_MR_REREG_TRANS is set in mr_rereg_mask, this
- * field specifies the size of the phys_buf_array, otherwise, this
- * parameter is ignored.
- * @mr_access_flags: If %IB_MR_REREG_ACCESS is set in mr_rereg_mask, this
- * field specifies the new memory access rights, otherwise, this
- * parameter is ignored.
- * @iova_start: The offset of the region's starting I/O virtual address.
- */
-int ib_rereg_phys_mr(struct ib_mr *mr,
- int mr_rereg_mask,
- struct ib_pd *pd,
- struct ib_phys_buf *phys_buf_array,
- int num_phys_buf,
- int mr_access_flags,
- u64 *iova_start);
-
-/**
* ib_query_mr - Retrieves information about a specific memory region.
* @mr: The memory region to retrieve information about.
* @mr_attr: The attributes of the specified memory region.
On 7/20/2015 12:03 PM, Chuck Lever wrote:
> All HCA providers have an ib_get_dma_mr() verb. Thus
> rpcrdma_ia_open() will either grab the device's local_dma_lkey if one
> is available, or it will call ib_get_dma_mr() which is a 100%
> guaranteed fallback.
I recall that in the past, some providers did not support mapping
all of the machine's potential physical memory with a single dma_mr.
If an rnic did/does not support 44-ish bits of length per region,
for example.
Have you verified that all current providers do in fact support the
necessary physical address range for this, and that the requirement
is stated in the verbs going forward?
>
> There is never any need to use the ib_reg_phys_mr() code path in
> rpcrdma_register_internal(), so it can be removed. The remaining
> logic in rpcrdma_{de}register_internal() is folded into
> rpcrdma_{alloc,free}_regbuf().
>
> This is clean up only. No behavior change is expected.
>
> Signed-off-by: Chuck Lever <[email protected]>
> Reviewed-by: Devesh Sharma <[email protected]>
> Reviewed-By: Sagi Grimberg <[email protected]>
> Tested-by: Devesh Sharma <[email protected]>
> ---
> net/sunrpc/xprtrdma/verbs.c | 102 ++++++++-------------------------------
> net/sunrpc/xprtrdma/xprt_rdma.h | 1
> 2 files changed, 21 insertions(+), 82 deletions(-)
>
> diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
> index 1065808..da184f9 100644
> --- a/net/sunrpc/xprtrdma/verbs.c
> +++ b/net/sunrpc/xprtrdma/verbs.c
> @@ -1229,75 +1229,6 @@ rpcrdma_mapping_error(struct rpcrdma_mr_seg *seg)
> (unsigned long long)seg->mr_dma, seg->mr_dmalen);
> }
>
> -static int
> -rpcrdma_register_internal(struct rpcrdma_ia *ia, void *va, int len,
> - struct ib_mr **mrp, struct ib_sge *iov)
> -{
> - struct ib_phys_buf ipb;
> - struct ib_mr *mr;
> - int rc;
> -
> - /*
> - * All memory passed here was kmalloc'ed, therefore phys-contiguous.
> - */
> - iov->addr = ib_dma_map_single(ia->ri_device,
> - va, len, DMA_BIDIRECTIONAL);
> - if (ib_dma_mapping_error(ia->ri_device, iov->addr))
> - return -ENOMEM;
> -
> - iov->length = len;
> -
> - if (ia->ri_have_dma_lkey) {
> - *mrp = NULL;
> - iov->lkey = ia->ri_dma_lkey;
> - return 0;
> - } else if (ia->ri_bind_mem != NULL) {
> - *mrp = NULL;
> - iov->lkey = ia->ri_bind_mem->lkey;
> - return 0;
> - }
> -
> - ipb.addr = iov->addr;
> - ipb.size = iov->length;
> - mr = ib_reg_phys_mr(ia->ri_pd, &ipb, 1,
> - IB_ACCESS_LOCAL_WRITE, &iov->addr);
> -
> - dprintk("RPC: %s: phys convert: 0x%llx "
> - "registered 0x%llx length %d\n",
> - __func__, (unsigned long long)ipb.addr,
> - (unsigned long long)iov->addr, len);
> -
> - if (IS_ERR(mr)) {
> - *mrp = NULL;
> - rc = PTR_ERR(mr);
> - dprintk("RPC: %s: failed with %i\n", __func__, rc);
> - } else {
> - *mrp = mr;
> - iov->lkey = mr->lkey;
> - rc = 0;
> - }
> -
> - return rc;
> -}
> -
> -static int
> -rpcrdma_deregister_internal(struct rpcrdma_ia *ia,
> - struct ib_mr *mr, struct ib_sge *iov)
> -{
> - int rc;
> -
> - ib_dma_unmap_single(ia->ri_device,
> - iov->addr, iov->length, DMA_BIDIRECTIONAL);
> -
> - if (NULL == mr)
> - return 0;
> -
> - rc = ib_dereg_mr(mr);
> - if (rc)
> - dprintk("RPC: %s: ib_dereg_mr failed %i\n", __func__, rc);
> - return rc;
> -}
> -
> /**
> * rpcrdma_alloc_regbuf - kmalloc and register memory for SEND/RECV buffers
> * @ia: controlling rpcrdma_ia
> @@ -1317,26 +1248,30 @@ struct rpcrdma_regbuf *
> rpcrdma_alloc_regbuf(struct rpcrdma_ia *ia, size_t size, gfp_t flags)
> {
> struct rpcrdma_regbuf *rb;
> - int rc;
> + struct ib_sge *iov;
>
> - rc = -ENOMEM;
> rb = kmalloc(sizeof(*rb) + size, flags);
> if (rb == NULL)
> goto out;
>
> - rb->rg_size = size;
> - rb->rg_owner = NULL;
> - rc = rpcrdma_register_internal(ia, rb->rg_base, size,
> - &rb->rg_mr, &rb->rg_iov);
> - if (rc)
> + iov = &rb->rg_iov;
> + iov->addr = ib_dma_map_single(ia->ri_device,
> + (void *)rb->rg_base, size,
> + DMA_BIDIRECTIONAL);
> + if (ib_dma_mapping_error(ia->ri_device, iov->addr))
> goto out_free;
>
> + iov->length = size;
> + iov->lkey = ia->ri_have_dma_lkey ?
> + ia->ri_dma_lkey : ia->ri_bind_mem->lkey;
> + rb->rg_size = size;
> + rb->rg_owner = NULL;
> return rb;
>
> out_free:
> kfree(rb);
> out:
> - return ERR_PTR(rc);
> + return ERR_PTR(-ENOMEM);
> }
>
> /**
> @@ -1347,10 +1282,15 @@ out:
> void
> rpcrdma_free_regbuf(struct rpcrdma_ia *ia, struct rpcrdma_regbuf *rb)
> {
> - if (rb) {
> - rpcrdma_deregister_internal(ia, rb->rg_mr, &rb->rg_iov);
> - kfree(rb);
> - }
> + struct ib_sge *iov;
> +
> + if (!rb)
> + return;
> +
> + iov = &rb->rg_iov;
> + ib_dma_unmap_single(ia->ri_device,
> + iov->addr, iov->length, DMA_BIDIRECTIONAL);
> + kfree(rb);
> }
>
> /*
> diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
> index abee472..ce4e79e 100644
> --- a/net/sunrpc/xprtrdma/xprt_rdma.h
> +++ b/net/sunrpc/xprtrdma/xprt_rdma.h
> @@ -119,7 +119,6 @@ struct rpcrdma_ep {
> struct rpcrdma_regbuf {
> size_t rg_size;
> struct rpcrdma_req *rg_owner;
> - struct ib_mr *rg_mr;
> struct ib_sge rg_iov;
> __be32 rg_base[0] __attribute__ ((aligned(256)));
> };
>
Hi Tom-
On Jul 20, 2015, at 4:34 PM, Tom Talpey <[email protected]> wrote:
> On 7/20/2015 12:03 PM, Chuck Lever wrote:
>> All HCA providers have an ib_get_dma_mr() verb. Thus
>> rpcrdma_ia_open() will either grab the device's local_dma_key if one
>> is available, or it will call ib_get_dma_mr() which is a 100%
>> guaranteed fallback.
>
> I recall that in the past, some providers did not support mapping
> all of the machine's potential physical memory with a single dma_mr.
> If an rnic did/does not support 44-ish bits of length per region,
> for example.
The buffers affected by this change are small, so I'm confident that
restriction would not be a problem here.
What might break with such restricted hardware is ALLPHYSICAL on
large-memory systems. ALLPHYSICAL does rely on a single DMA MR that
covers all of the NFS client's memory.
That would be a problem both before and after this patch, as far as
I can tell.
> Have you verified that all current providers do in fact support the
> necessary physical address range for this, and that the requirement
> is stated in the verbs going forward?
--
Chuck Lever
On Mon, Jul 20, 2015 at 01:34:16PM -0700, Tom Talpey wrote:
> On 7/20/2015 12:03 PM, Chuck Lever wrote:
> >All HCA providers have an ib_get_dma_mr() verb. Thus
> >rpcrdma_ia_open() will either grab the device's local_dma_key if one
> >is available, or it will call ib_get_dma_mr() which is a 100%
> >guaranteed fallback.
>
> I recall that in the past, some providers did not support mapping
> all of the machine's potential physical memory with a single dma_mr.
> If an rnic did/does not support 44-ish bits of length per region,
> for example.
Looks like you are right, but the standard in kernel is to require
ib_get_dma_mr, if the HCA can't do that, then it cannot be used on a
big memory machine with kernel ULPs.
Looking deeper, both amso1100 and cxgb3 seem limited to 32 bits of
physical memory, and silently break all kernel ULPs if they are used
on a modern machine with > 4G.
Is that right Steve?
Based on that, should we remove the cxgb3 driver as well? Or at least
can you fix it up to at least fail get_dma_mr if there is too much
ram?
Everything else looked Ok.
Jason
> -----Original Message-----
> From: [email protected] [mailto:[email protected]] On Behalf Of Jason Gunthorpe
> Sent: Monday, July 20, 2015 4:06 PM
> To: Tom Talpey; Steve Wise
> Cc: Chuck Lever; [email protected]; [email protected]
> Subject: Re: [PATCH v3 05/15] xprtrdma: Remove last ib_reg_phys_mr() call site
>
> On Mon, Jul 20, 2015 at 01:34:16PM -0700, Tom Talpey wrote:
> > On 7/20/2015 12:03 PM, Chuck Lever wrote:
> > >All HCA providers have an ib_get_dma_mr() verb. Thus
> > >rpcrdma_ia_open() will either grab the device's local_dma_key if one
> > >is available, or it will call ib_get_dma_mr() which is a 100%
> > >guaranteed fallback.
> >
> > I recall that in the past, some providers did not support mapping
> > all of the machine's potential physical memory with a single dma_mr.
> > If an rnic did/does not support 44-ish bits of length per region,
> > for example.
>
> Looks like you are right, but the standard in kernel is to require
> ib_get_dma_mr, if the HCA can't do that, then it cannot be used on a
> big memory machine with kernel ULPs.
>
> Looking deeper, both amso1100 and cxgb3 seem limited to 32 bits of
> physical memory, and silently break all kernel ULPs if they are used
> on a modern machine with > 4G.
>
> Is that right Steve?
>
Yes.
> Based on that, should we remove the cxgb3 driver as well? Or at least
> can you fix it up to at least fail get_dma_mr if there is too much
> ram?
>
I would like to keep cxgb3 around. I can add code to fail if the memory is > 32b. Do you know how I get the amount of available
ram?
What is the process again to remove a driver? I can remove amso...
> Everything else looked Ok.
>
> Jason
> > Based on that, should we remove the cxgb3 driver as well? Or at least
> > can you fix it up to at least fail get_dma_mr if there is too much
> > ram?
> >
>
> I would like to keep cxgb3 around. I can add code to fail if the memory is > 32b. Do you know how I get the amount of available
> ram?
>
Something like this?
@@ -736,6 +736,9 @@ static struct ib_mr *iwch_get_dma_mr(struct ib_pd *pd, int acc)
/*
* T3 only supports 32 bits of size.
*/
+ if ((totalram_pages << PAGE_SHIFT) > 0xffffffff)
+ return -ENOTSUPP;
+
bl.size = 0xffffffff;
bl.addr = 0;
kva = 0;
> -----Original Message-----
> From: Steve Wise [mailto:[email protected]]
> Sent: Monday, July 20, 2015 4:34 PM
> To: 'Jason Gunthorpe'; 'Tom Talpey'
> Cc: 'Chuck Lever'; '[email protected]'; '[email protected]'
> Subject: RE: [PATCH v3 05/15] xprtrdma: Remove last ib_reg_phys_mr() call site
>
>
> >
> > > Based on that, should we remove the cxgb3 driver as well? Or at least
> > > can you fix it up to at least fail get_dma_mr if there is too much
> > > ram?
> > >
> >
> > I would like to keep cxgb3 around. I can add code to fail if the memory is > 32b. Do you know how I get the amount of
> > available ram?
> >
>
> Something like this?
>
> @@ -736,6 +736,9 @@ static struct ib_mr *iwch_get_dma_mr(struct ib_pd *pd, int acc)
> /*
> * T3 only supports 32 bits of size.
> */
> + if ((totalram_pages << PAGE_SHIFT) > 0xffffffff)
> + return -ENOTSUPP;
oops: ERR_PTR(-ENOTSUPP);
> +
> bl.size = 0xffffffff;
> bl.addr = 0;
> kva = 0;
>
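So the corrected hunk would be roughly (untested sketch; same context
as the quoted code, and note the shift assumes a 64-bit unsigned long):

	/*
	 * T3 only supports 32 bits of size.
	 */
	if ((totalram_pages << PAGE_SHIFT) > 0xffffffff)
		return ERR_PTR(-ENOTSUPP);

	bl.size = 0xffffffff;
	bl.addr = 0;
	kva = 0;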
On 7/20/2015 1:55 PM, Chuck Lever wrote:
> On Jul 20, 2015, at 4:34 PM, Tom Talpey <[email protected]> wrote:
>
>> On 7/20/2015 12:03 PM, Chuck Lever wrote:
>>> All HCA providers have an ib_get_dma_mr() verb. Thus
>>> rpcrdma_ia_open() will either grab the device's local_dma_key if one
>>> is available, or it will call ib_get_dma_mr() which is a 100%
>>> guaranteed fallback.
>>
>> I recall that in the past, some providers did not support mapping
>> all of the machine's potential physical memory with a single dma_mr.
>> If an rnic did/does not support 44-ish bits of length per region,
>> for example.
>
> The buffers affected by this change are small, so I'm confident that
> restriction would not be a problem here.
It's not about the buffer size, it's about the region. Because the
get_dma_mr does not specify a base address and length, the rnic must
basically attempt to map a base of zero and a length of the largest
physical offset.
This is not the case with the previous phys_reg_mr, which specified
the exact phys page range.
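Concretely, the contrast is (a sketch only; pd, phys and len assumed
declared, verb signatures as in today's ib_verbs.h):

	/* ib_get_dma_mr() takes no range: the device must be able
	 * to cover all of physical memory with one region.
	 */
	struct ib_mr *mr = ib_get_dma_mr(pd, IB_ACCESS_LOCAL_WRITE);

	/* ib_reg_phys_mr() names the exact physical range, so a
	 * device with a 32-bit length limit can still succeed.
	 */
	struct ib_phys_buf ipb = { .addr = phys, .size = len };
	u64 iova = phys;
	mr = ib_reg_phys_mr(pd, &ipb, 1, IB_ACCESS_LOCAL_WRITE, &iova);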
> What might break with such restricted hardware is ALLPHYSICAL on
> large-memory systems. ALLPHYSICAL does rely on a single DMA MR that
> covers all of the NFS client's memory.
Yes, indeed it always did, but that mode was not intended for general
use.
On 7/20/2015 2:16 PM, Steve Wise wrote:
>
>
>> -----Original Message-----
>> From: [email protected] [mailto:[email protected]] On Behalf Of Jason Gunthorpe
>> Sent: Monday, July 20, 2015 4:06 PM
>> To: Tom Talpey; Steve Wise
>> Cc: Chuck Lever; [email protected]; [email protected]
>> Subject: Re: [PATCH v3 05/15] xprtrdma: Remove last ib_reg_phys_mr() call site
>>
>> On Mon, Jul 20, 2015 at 01:34:16PM -0700, Tom Talpey wrote:
>>> On 7/20/2015 12:03 PM, Chuck Lever wrote:
>>>> All HCA providers have an ib_get_dma_mr() verb. Thus
>>>> rpcrdma_ia_open() will either grab the device's local_dma_key if one
>>>> is available, or it will call ib_get_dma_mr() which is a 100%
>>>> guaranteed fallback.
>>>
>>> I recall that in the past, some providers did not support mapping
>>> all of the machine's potential physical memory with a single dma_mr.
>>> If an rnic did/does not support 44-ish bits of length per region,
>>> for example.
>>
>> Looks like you are right, but the standard in kernel is to require
>> ib_get_dma_mr, if the HCA can't do that, then it cannot be used on a
>> big memory machine with kernel ULPs.
>>
>> Looking deeper, both amso1100 and cxgb3 seem limited to 32 bits of
>> physical memory, and silently break all kernel ULPs if they are used
>> on a modern machine with > 4G.
>>
>> Is that right Steve?
>>
>
> Yes.
>
>> Based on that, should we remove the cxgb3 driver as well? Or at least
>> can you fix it up to at least fail get_dma_mr if there is too much
>> ram?
>>
>
> I would like to keep cxgb3 around. I can add code to fail if the memory is > 32b. Do you know how I get the amount of available
> ram?
A) are you sure it's an unsigned length, i.e. is it really 31 bits?
B) why bother to check? Are machines with <4GB interesting, and worth
supporting a special optimization?
On Mon, Jul 20, 2015 at 04:37:15PM -0500, Steve Wise wrote:
>
>
> > From: Steve Wise [mailto:[email protected]]
> > Sent: Monday, July 20, 2015 4:34 PM
> > To: 'Jason Gunthorpe'; 'Tom Talpey'
> > Cc: 'Chuck Lever'; '[email protected]'; '[email protected]'
> > Subject: RE: [PATCH v3 05/15] xprtrdma: Remove last ib_reg_phys_mr() call site
> >
> >
> > >
> > > > Based on that, should we remove the cxgb3 driver as well? Or at least
> > > > can you fix it up to at least fail get_dma_mr if there is too much
> > > > ram?
> > > >
> > >
> > > I would like to keep cxgb3 around. I can add code to fail if the memory is > 32b. Do you know how I get the amount of
> available ram?
> > >
> >
> > Something like this?
> >
> > @@ -736,6 +736,9 @@ static struct ib_mr *iwch_get_dma_mr(struct ib_pd *pd, int acc)
> > /*
> > * T3 only supports 32 bits of size.
> > */
> > + if ((totalram_pages << PAGE_SHIFT) > 0xffffffff)
> > + return -ENOTSUPP;
>
> oops: ERR_PTR(-ENOTSUPP);
>
> > +
> > bl.size = 0xffffffff;
> > bl.addr = 0;
> > kva = 0;
Yah, that seems much better. With the patch set I am working on, this
will mean all ULPs will fail to create a kernel PD on cxgb3 if the
above triggers. If our error handling works, that should just make it
unusable from kernel space, which is what we need to see.
To remove the Ammasso driver, move it to Staging. I don't know the
exact details, sorry. Hopefully Doug will be able to help you do that.
Maybe add some kind of one-time print 'cxgb3 is not usable on
machines with >4G of RAM' as well?
Jason
On Mon, Jul 20, 2015 at 03:04:21PM -0700, Tom Talpey wrote:
> B) why bother to check? Are machines with <4GB interesting, and worth
> supporting a special optimization?
Mainline drivers shouldn't silently malfunction.
Jason
On Jul 20, 2015, at 5:55 PM, Tom Talpey <[email protected]> wrote:
> On 7/20/2015 1:55 PM, Chuck Lever wrote:
>> On Jul 20, 2015, at 4:34 PM, Tom Talpey <[email protected]> wrote:
>>
>>> On 7/20/2015 12:03 PM, Chuck Lever wrote:
>>>> All HCA providers have an ib_get_dma_mr() verb. Thus
>>>> rpcrdma_ia_open() will either grab the device's local_dma_key if one
>>>> is available, or it will call ib_get_dma_mr() which is a 100%
>>>> guaranteed fallback.
>>>
>>> I recall that in the past, some providers did not support mapping
>>> all of the machine's potential physical memory with a single dma_mr.
>>> If an rnic did/does not support 44-ish bits of length per region,
>>> for example.
>>
>> The buffers affected by this change are small, so I'm confident that
>> restriction would not be a problem here.
>
> It's not about the buffer size, it's about the region. Because the
> get_dma_mr does not specify a base address and length, the rnic must
> basically attempt to map a base of zero and a length of the largest
> physical offset.
Understood now, but:
> This is not the case with the previous phys_reg_mr, which specified
> the exact phys page range.
rpcrdma_ia_open() fails immediately if IB_DEVICE_LOCAL_DMA_LKEY
is not asserted _and_ ib_get_dma_mr() fails. We never get to the
logic in rpcrdma_alloc_regbuf() in that case, so ib_reg_phys_mr()
would still never be invoked. That really is dead code.
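For reference, the open-time gate is shaped like this (a simplified
sketch of the logic in rpcrdma_ia_open(), not the verbatim code):

	if (devattr->device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY) {
		ia->ri_have_dma_lkey = 1;
		ia->ri_dma_lkey = ia->ri_device->local_dma_lkey;
	} else {
		/* FMR and PHYSICAL fall back to a DMA MR's lkey */
		ia->ri_bind_mem = ib_get_dma_mr(ia->ri_pd,
						IB_ACCESS_LOCAL_WRITE);
		if (IS_ERR(ia->ri_bind_mem))
			return PTR_ERR(ia->ri_bind_mem); /* open fails */
	}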
If you prefer, I can adjust the patch description to remove the
"100% guaranteed fallback" line.
--
Chuck Lever
On 7/20/2015 3:17 PM, Jason Gunthorpe wrote:
> On Mon, Jul 20, 2015 at 03:04:21PM -0700, Tom Talpey wrote:
>> B) why bother to check? Are machines with <4GB interesting, and worth
>> supporting a special optimization?
>
> mainline drivers shouldn't silently malfunction.
I meant: why bother to check? Just always return failure. It's more
honest, and robust. Someone might test with 2GB, conclude it worked,
then deploy with 8GB and boom.
Also, what about hotplug RAM? Same situation.
On Mon, Jul 20, 2015 at 03:03:11PM -0400, Chuck Lever wrote:
> + iov->length = size;
> + iov->lkey = ia->ri_have_dma_lkey ?
> + ia->ri_dma_lkey : ia->ri_bind_mem->lkey;
> + rb->rg_size = size;
> + rb->rg_owner = NULL;
> return rb;
There is something odd looking about this..
ri_bind_mem is only setup in the RPCRDMA_ALLPHYSICAL and
RPCRDMA_MTHCAFMR cases.
RPCRDMA_FRMR doesn't set it up.
So this code in rpcrdma_alloc_regbuf is never called for the FRMR
case?
If yes, then, how is FRMR working? There is absolutely no reason to
use FRMR to register local send buffers, just use the global all
memory lkey...
If no, then that is an oops?
Jason
On 7/20/2015 3:21 PM, Chuck Lever wrote:
>
> On Jul 20, 2015, at 5:55 PM, Tom Talpey <[email protected]> wrote:
>
>> On 7/20/2015 1:55 PM, Chuck Lever wrote:
>>> On Jul 20, 2015, at 4:34 PM, Tom Talpey <[email protected]> wrote:
>>>
>>>> On 7/20/2015 12:03 PM, Chuck Lever wrote:
>>>>> All HCA providers have an ib_get_dma_mr() verb. Thus
>>>>> rpcrdma_ia_open() will either grab the device's local_dma_key if one
>>>>> is available, or it will call ib_get_dma_mr() which is a 100%
>>>>> guaranteed fallback.
>>>>
>>>> I recall that in the past, some providers did not support mapping
>>>> all of the machine's potential physical memory with a single dma_mr.
>>>> If an rnic did/does not support 44-ish bits of length per region,
>>>> for example.
>>>
>>> The buffers affected by this change are small, so I'm confident that
>>> restriction would not be a problem here.
>>
>> It's not about the buffer size, it's about the region. Because the
>> get_dma_mr does not specify a base address and length, the rnic must
>> basically attempt to map a base of zero and a length of the largest
>> physical offset.
>
> Understood now, but:
>
>
>> This is not the case with the previous phys_reg_mr, which specified
>> the exact phys page range.
>
> rpcrdma_ia_open() fails immediately if IB_DEVICE_LOCAL_DMA_LKEY
> is not asserted _and_ ib_get_dma_mr() fails. We never get to the
> logic in rpcrdma_alloc_regbuf() in that case, so ib_reg_phys_mr()
> would still never be invoked. That really is dead code.
>
> If you prefer, I can adjust the patch description to remove the
> "100% guaranteed fallback" line.
Ok, good. And yes, "100%" sounds like a risky statement.
On Jul 20, 2015, at 6:26 PM, Jason Gunthorpe <[email protected]> wrote:
> On Mon, Jul 20, 2015 at 03:03:11PM -0400, Chuck Lever wrote:
>> + iov->length = size;
>> + iov->lkey = ia->ri_have_dma_lkey ?
>> + ia->ri_dma_lkey : ia->ri_bind_mem->lkey;
>> + rb->rg_size = size;
>> + rb->rg_owner = NULL;
>> return rb;
>
> There is something odd looking about this..
>
> ri_bind_mem is only setup in the RPCRDMA_ALLPHYSICAL and
> RPCRDMA_MTHCAFMR cases.
>
> RPCRDMA_FRMR doesn't set it up.
>
> So this code in rpcrdma_alloc_regbuf is never called for the FRMR
> case?
>
> If yes, then, how is FRMR working? There is absolutely no reason to
> use FRMR to register local send buffers, just use the global all
> memory lkey...
>
> If no, then that is an oops?
I've tested this code, no oops.
FRWR always uses the DMA lkey. xprtrdma does not use FRWR if
IB_DEVICE_LOCAL_DMA_LKEY is not asserted.
FMR and PHYSICAL use the DMA lkey if it exists, otherwise they
use an lkey from ib_get_dma_mr.
Look in 06/15, this is cleaned up.
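Put another way, the choice amounts to this (a hypothetical helper,
not in the series, using the rpcrdma_ia fields shown above):

	static u32 rpcrdma_local_lkey(struct rpcrdma_ia *ia)
	{
		if (ia->ri_have_dma_lkey)	/* FRWR always lands here */
			return ia->ri_dma_lkey;
		return ia->ri_bind_mem->lkey;	/* FMR/PHYSICAL fallback */
	}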
--
Chuck Lever
> -----Original Message-----
> From: Tom Talpey [mailto:[email protected]]
> Sent: Monday, July 20, 2015 5:04 PM
> To: Steve Wise; 'Jason Gunthorpe'
> Cc: 'Chuck Lever'; [email protected]; [email protected]
> Subject: Re: [PATCH v3 05/15] xprtrdma: Remove last ib_reg_phys_mr() call site
>
> On 7/20/2015 2:16 PM, Steve Wise wrote:
> >
> >
> >> -----Original Message-----
> >> From: [email protected] [mailto:[email protected]] On Behalf Of Jason Gunthorpe
> >> Sent: Monday, July 20, 2015 4:06 PM
> >> To: Tom Talpey; Steve Wise
> >> Cc: Chuck Lever; [email protected]; [email protected]
> >> Subject: Re: [PATCH v3 05/15] xprtrdma: Remove last ib_reg_phys_mr() call site
> >>
> >> On Mon, Jul 20, 2015 at 01:34:16PM -0700, Tom Talpey wrote:
> >>> On 7/20/2015 12:03 PM, Chuck Lever wrote:
> >>>> All HCA providers have an ib_get_dma_mr() verb. Thus
> >>>> rpcrdma_ia_open() will either grab the device's local_dma_key if one
> >>>> is available, or it will call ib_get_dma_mr() which is a 100%
> >>>> guaranteed fallback.
> >>>
> >>> I recall that in the past, some providers did not support mapping
> >>> all of the machine's potential physical memory with a single dma_mr.
> >>> If an rnic did/does not support 44-ish bits of length per region,
> >>> for example.
> >>
> >> Looks like you are right, but the standard in kernel is to require
> >> ib_get_dma_mr, if the HCA can't do that, then it cannot be used on a
> >> big memory machine with kernel ULPs.
> >>
> >> Looking deeper, both amso1100 and cxgb3 seem limited to 32 bits of
> >> physical memory, and silently break all kernel ULPs if they are used
> >> on a modern machine with > 4G.
> >>
> >> Is that right Steve?
> >>
> >
> > Yes.
> >
> >> Based on that, should we remove the cxgb3 driver as well? Or at least
> >> can you fix it up to at least fail get_dma_mr if there is too much
> >> ram?
> >>
> >
> > I would like to keep cxgb3 around. I can add code to fail if the memory is > 32b. Do you know how I get the amount of
> > available
> > ram?
>
> A) are you sure it's an unsigned length, i.e. is it really 31 bits?
>
yes.
> B) why bother to check? Are machines with <4GB interesting, and worth
> supporting a special optimization?
No, but cxgb3 is still interesting to user applications, and perhaps NFSRDMA using FRMRs.
On Mon, Jul 20, 2015 at 06:31:11PM -0400, Chuck Lever wrote:
>
> On Jul 20, 2015, at 6:26 PM, Jason Gunthorpe <[email protected]> wrote:
>
> > On Mon, Jul 20, 2015 at 03:03:11PM -0400, Chuck Lever wrote:
> >> + iov->length = size;
> >> + iov->lkey = ia->ri_have_dma_lkey ?
> >> + ia->ri_dma_lkey : ia->ri_bind_mem->lkey;
> >> + rb->rg_size = size;
> >> + rb->rg_owner = NULL;
> >> return rb;
> >
> > There is something odd looking about this..
> >
> > ri_bind_mem is only setup in the RPCRDMA_ALLPHYSICAL and
> > RPCRDMA_MTHCAFMR cases.
> >
> > RPCRDMA_FRMR doesn't set it up.
> >
> > So this code in rpcrdma_alloc_regbuf is never called for the FRMR
> > case?
> >
> > If yes, then, how is FRMR working? There is absolutely no reason to
> > use FRMR to register local send buffers, just use the global all
> > memory lkey...
> >
> > If no, then that is an oops?
>
> I’ve tested this code, no oops.
>
> FRWR always uses the DMA lkey. xprtrdma does not use FRWR if
> IB_DEVICE_LOCAL_DMA_LKEY is not asserted.
Ah, I see. Ok.
Is there a reason to link FRWR and LOCAL_DMA_LKEY together? Just use
the code you have for fmr with frwr to get the lkey, probably move it
to rpcrdma_ia_open .. Physical MR should create a 2nd MR dedicated for
rkey use.
That will work really well with the series I'm working on:
https://github.com/jgunthorpe/linux/tree/remove-ib_get_dma_mr
To just drop ib_get_dma_mr entirely.
Jason
On Mon, Jul 20, 2015 at 05:41:27PM -0500, Steve Wise wrote:
> > B) why bother to check? Are machines with <4GB interesting, and worth
> > supporting a special optimization?
>
> No, but cxgb3 is still interesting to user applications, and perhaps NFSRDMA using FRMRs.
Doesn't look like the NFS client will work. It requires an all
physical memory lkey for SEND and RECV buffers..
Jason
> -----Original Message-----
> From: Jason Gunthorpe [mailto:[email protected]]
> Sent: Monday, July 20, 2015 5:14 PM
> To: Steve Wise
> Cc: 'Tom Talpey'; 'Chuck Lever'; [email protected]; [email protected]
> Subject: Re: [PATCH v3 05/15] xprtrdma: Remove last ib_reg_phys_mr() call site
>
> On Mon, Jul 20, 2015 at 04:37:15PM -0500, Steve Wise wrote:
> >
> >
> > > From: Steve Wise [mailto:[email protected]]
> > > Sent: Monday, July 20, 2015 4:34 PM
> > > To: 'Jason Gunthorpe'; 'Tom Talpey'
> > > Cc: 'Chuck Lever'; '[email protected]'; '[email protected]'
> > > Subject: RE: [PATCH v3 05/15] xprtrdma: Remove last ib_reg_phys_mr() call site
> > >
> > >
> > > >
> > > > > Based on that, should we remove the cxgb3 driver as well? Or at least
> > > > > can you fix it up to at least fail get_dma_mr if there is too much
> > > > > ram?
> > > > >
> > > >
> > > > I would like to keep cxgb3 around. I can add code to fail if the memory is > 32b. Do you know how I get the amount of
> > available ram?
> > > >
> > >
> > > Something like this?
> > >
> > > @@ -736,6 +736,9 @@ static struct ib_mr *iwch_get_dma_mr(struct ib_pd *pd, int acc)
> > > /*
> > > * T3 only supports 32 bits of size.
> > > */
> > > + if ((totalram_pages << PAGE_SHIFT) > 0xffffffff)
> > > + return -ENOTSUPP;
> >
> > oops: ERR_PTR(-ENOTSUPP);
> >
> > > +
> > > bl.size = 0xffffffff;
> > > bl.addr = 0;
> > > kva = 0;
>
> Yah, that seems much better. With the patch set I am working on, this
> will mean all ULPs will fail to create a kernel PD on cxgb3 if the
> above triggers. If our error handling works, that should just make it
> unusable from kernel space, which is what we need to see.
>
What about user PDs? We need that to work for cxgb3.
> To remove the Ammasso driver, move it to Staging. I don't know the
> exact details, sorry. Hopefully Doug will be able to help you do that.
>
ok thanks
> Maybe add some kind of one-time print 'cxgb3 is not usable on
> machines with >4G of RAM' as well?
>
Could do.
> Jason
On Mon, Jul 20, 2015 at 05:43:34PM -0500, Steve Wise wrote:
> > Yah, that seems much better. With the patch set I am working on, this
> > will mean all ULPs will fail to create a kernel PD on cxgb3 if the
> > above triggers. If our error handling works, that should just make it
> > unusable from kernel space, which is what we need to see.
> What about user PDs? We need that to work for cxgb3.
Only in-kernel uses call ib_alloc_pd
The user flow calls ib_uverbs_alloc_pd which calls the driver
directly.
Jason
> -----Original Message-----
> From: Jason Gunthorpe [mailto:[email protected]]
> Sent: Monday, July 20, 2015 5:54 PM
> To: Steve Wise
> Cc: 'Tom Talpey'; 'Chuck Lever'; [email protected]; [email protected]
> Subject: Re: [PATCH v3 05/15] xprtrdma: Remove last ib_reg_phys_mr() call site
>
> On Mon, Jul 20, 2015 at 05:43:34PM -0500, Steve Wise wrote:
> > > Yah, that seems much better. With the patch set I am working on, this
> > > will mean all ULPs will fail to create a kernel PD on cxgb3 if the
> > > above triggers. If our error handling works, that should just make it
> > > unusable from kernel space, which is what we need to see.
>
> > What about user PDs? We need that to work for cxgb3.
>
> Only in-kernel uses call ib_alloc_pd
>
> The user flow calls ib_uverbs_alloc_pd which calls the driver
> directly.
>
Ok good. Thanks!
On Jul 20, 2015, at 6:41 PM, Jason Gunthorpe <[email protected]> wrote:
> On Mon, Jul 20, 2015 at 06:31:11PM -0400, Chuck Lever wrote:
>>
>> On Jul 20, 2015, at 6:26 PM, Jason Gunthorpe <[email protected]> wrote:
>>
>>> On Mon, Jul 20, 2015 at 03:03:11PM -0400, Chuck Lever wrote:
>>>> + iov->length = size;
>>>> + iov->lkey = ia->ri_have_dma_lkey ?
>>>> + ia->ri_dma_lkey : ia->ri_bind_mem->lkey;
>>>> + rb->rg_size = size;
>>>> + rb->rg_owner = NULL;
>>>> return rb;
>>>
>>> There is something odd looking about this..
>>>
>>> ri_bind_mem is only setup in the RPCRDMA_ALLPHYSICAL and
>>> RPCRDMA_MTHCAFMR cases.
>>>
>>> RPCRDMA_FRMR doesn't set it up.
>>>
>>> So this code in rpcrdma_alloc_regbuf is never called for the FRMR
>>> case?
>>>
>>> If yes, then, how is FRMR working? There is absolutely no reason to
>>> use FRMR to register local send buffers, just use the global all
>>> memory lkey...
>>>
>>> If no, then that is an oops?
>>
>> I've tested this code, no oops.
>>
>> FRWR always uses the DMA lkey. xprtrdma does not use FRWR if
>> IB_DEVICE_LOCAL_DMA_LKEY is not asserted.
>
> Ah, I see. Ok.
>
> Is there a reason to link FRWR and LOCAL_DMA_LKEY together?
Tom would know.
> Just use
> the code you have for fmr with frwr to get the lkey, probably move it
> to rpcrdma_ia_open .. Physical MR should create a 2nd MR dedicated for
> rkey use.
>
> That will work really well with the series I'm working on:
>
> https://github.com/jgunthorpe/linux/tree/remove-ib_get_dma_mr
>
> To just drop ib_get_dma_mr entirely.
Yes, I sorted this out so it would be obvious how to deal with
xprtrdma when the time comes. All three registration modes can
just use pd->local_dma_lkey for the SEND/RECV buffers.
PHYSICAL would still need ib_get_dma_mr() (or something equivalent)
for MRs needing remote access, agreed.
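At that point the regbuf path could shrink to something like this (a
sketch against your tree, once pd->local_dma_lkey is available):

	iov->addr = ib_dma_map_single(ia->ri_device, (void *)rb->rg_base,
				      size, DMA_BIDIRECTIONAL);
	if (ib_dma_mapping_error(ia->ri_device, iov->addr))
		goto out_free;
	iov->length = size;
	iov->lkey = ia->ri_pd->local_dma_lkey;	/* same lkey for all modes */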
--
Chuck Lever
On 7/20/2015 4:36 PM, Chuck Lever wrote:
>
> On Jul 20, 2015, at 6:41 PM, Jason Gunthorpe <[email protected]> wrote:
>
>> On Mon, Jul 20, 2015 at 06:31:11PM -0400, Chuck Lever wrote:
>>>
>>> On Jul 20, 2015, at 6:26 PM, Jason Gunthorpe <[email protected]> wrote:
>>>
>>>> On Mon, Jul 20, 2015 at 03:03:11PM -0400, Chuck Lever wrote:
>>>>> + iov->length = size;
>>>>> + iov->lkey = ia->ri_have_dma_lkey ?
>>>>> + ia->ri_dma_lkey : ia->ri_bind_mem->lkey;
>>>>> + rb->rg_size = size;
>>>>> + rb->rg_owner = NULL;
>>>>> return rb;
>>>>
>>>> There is something odd looking about this..
>>>>
>>>> ri_bind_mem is only setup in the RPCRDMA_ALLPHYSICAL and
>>>> RPCRDMA_MTHCAFMR cases.
>>>>
>>>> RPCRDMA_FRMR doesn't set it up.
>>>>
>>>> So this code in rpcrdma_alloc_regbuf is never called for the FRMR
>>>> case?
>>>>
>>>> If yes, then, how is FRMR working? There is absolutely no reason to
>>>> use FRMR to register local send buffers, just use the global all
>>>> memory lkey...
>>>>
>>>> If no, then that is an oops?
>>>
>>> I've tested this code, no oops.
>>>
>>> FRWR always uses the DMA lkey. xprtrdma does not use FRWR if
>>> IB_DEVICE_LOCAL_DMA_LKEY is not asserted.
>>
>> Ah, I see. Ok.
>>
>> Is there a reason to link FRWR and LOCAL_DMA_LKEY together?
>
> Tom would know.
Tom Talpey or Tom Tucker? I don't think I used a dma_lkey at all
in the original work, but there were some changes later.
I thought not all rnics supported a dma_lkey. So requiring it
seems like a bad idea, to me.
>> Just use
>> the code you have for fmr with frwr to get the lkey, probably move it
>> to rpcrdma_ia_open .. Physical MR should create a 2nd MR dedicated for
>> rkey use.
But FMR is not supported by modern mlx5 and cxgb4?
On 7/20/2015 3:41 PM, Steve Wise wrote:
>
>
>> -----Original Message-----
>> From: Tom Talpey [mailto:[email protected]]
>> Sent: Monday, July 20, 2015 5:04 PM
>> To: Steve Wise; 'Jason Gunthorpe'
>> Cc: 'Chuck Lever'; [email protected]; [email protected]
>> Subject: Re: [PATCH v3 05/15] xprtrdma: Remove last ib_reg_phys_mr() call site
>>
>> On 7/20/2015 2:16 PM, Steve Wise wrote:
>>>
>>>
>>>> -----Original Message-----
>>>> From: [email protected] [mailto:[email protected]] On Behalf Of Jason Gunthorpe
>>>> Sent: Monday, July 20, 2015 4:06 PM
>>>> To: Tom Talpey; Steve Wise
>>>> Cc: Chuck Lever; [email protected]; [email protected]
>>>> Subject: Re: [PATCH v3 05/15] xprtrdma: Remove last ib_reg_phys_mr() call site
>>>>
>>>> On Mon, Jul 20, 2015 at 01:34:16PM -0700, Tom Talpey wrote:
>>>>> On 7/20/2015 12:03 PM, Chuck Lever wrote:
>>>>>> All HCA providers have an ib_get_dma_mr() verb. Thus
>>>>>> rpcrdma_ia_open() will either grab the device's local_dma_key if one
>>>>>> is available, or it will call ib_get_dma_mr() which is a 100%
>>>>>> guaranteed fallback.
>>>>>
>>>>> I recall that in the past, some providers did not support mapping
>>>>> all of the machine's potential physical memory with a single dma_mr.
>>>>> If an rnic did/does not support 44-ish bits of length per region,
>>>>> for example.
>>>>
>>>> Looks like you are right, but the standard in kernel is to require
>>>> ib_get_dma_mr, if the HCA can't do that, then it cannot be used on a
>>>> big memory machine with kernel ULPs.
>>>>
>>>> Looking deeper, both amso1100 and cxgb3 seem limited to 32 bits of
>>>> physical memory, and silently break all kernel ULPs if they are used
>>>> on a modern machine with > 4G.
>>>>
>>>> Is that right Steve?
>>>>
>>>
>>> Yes.
>>>
>>>> Based on that, should we remove the cxgb3 driver as well? Or at least
>>>> can you fix it up to at least fail get_dma_mr if there is too much
>>>> ram?
>>>>
>>>
>>> I would like to keep cxgb3 around. I can add code to fail if the memory is > 32b. Do you know how I get the amount of
>>> available
>>> ram?
>>
>> A) are you sure it's an unsigned length, i.e. is it really 31 bits?
>>
>
> yes.
>
>> B) why bother to check? Are machines with <4GB interesting, and worth
>> supporting a special optimization?
>
> No, but cxgb3 is still interesting to user applications, and perhaps NFSRDMA using FRMRs.
I'm obviously not making myself clear. I am suggesting that cxgb3 fail
the ib_get_dma_mr() verb, regardless of installed memory.
I am not suggesting it fail to load, or fail other memreg requests. It
should work normally in all other respects.
On Jul 20, 2015, at 8:11 PM, Tom Talpey <[email protected]> wrote:
> On 7/20/2015 4:36 PM, Chuck Lever wrote:
>>
>> On Jul 20, 2015, at 6:41 PM, Jason Gunthorpe <[email protected]> wrote:
>>
>>> On Mon, Jul 20, 2015 at 06:31:11PM -0400, Chuck Lever wrote:
>>>>
>>>> On Jul 20, 2015, at 6:26 PM, Jason Gunthorpe <[email protected]> wrote:
>>>>
>>>>> On Mon, Jul 20, 2015 at 03:03:11PM -0400, Chuck Lever wrote:
>>>>>> + iov->length = size;
>>>>>> + iov->lkey = ia->ri_have_dma_lkey ?
>>>>>> + ia->ri_dma_lkey : ia->ri_bind_mem->lkey;
>>>>>> + rb->rg_size = size;
>>>>>> + rb->rg_owner = NULL;
>>>>>> return rb;
>>>>>
>>>>> There is something odd looking about this..
>>>>>
>>>>> ri_bind_mem is only setup in the RPCRDMA_ALLPHYSICAL and
>>>>> RPCRDMA_MTHCAFMR cases.
>>>>>
>>>>> RPCRDMA_FRMR doesn't set it up.
>>>>>
>>>>> So this code in rpcrdma_alloc_regbuf is never called for the FRMR
>>>>> case?
>>>>>
>>>>> If yes, then, how is FRMR working? There is absolutely no reason to
>>>>> use FRMR to register local send buffers, just use the global all
>>>>> memory lkey...
>>>>>
>>>>> If no, then that is an oops?
>>>>
>>>> I've tested this code, no oops.
>>>>
>>>> FRWR always uses the DMA lkey. xprtrdma does not use FRWR if
>>>> IB_DEVICE_LOCAL_DMA_LKEY is not asserted.
>>>
>>> Ah, I see. Ok.
>>>
>>> Is there a reason to link FRWR and LOCAL_DMA_LKEY together?
>>
>> Tom would know.
>
> Tom Talpey or Tom Tucker? I don't think I used a dma_lkey at all
> in the original work, but there were some changes later.
The commit which adds FRWR support to xprtrdma introduced a check in
rpcrdma_ia_open:
+ case RPCRDMA_FRMR:
+ /* Requires both frmr reg and local dma lkey */
+ if ((devattr.device_cap_flags &
+ (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) !=
+ (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) {
This seems to be the only place the local DMA lkey requirement is
expressed or enforced. But yeah, the local DMA lkey doesn't seem
to be used anywhere in the FRWR-specific parts of the code.
--
Chuck Lever
On 7/20/2015 5:34 PM, Chuck Lever wrote:
>
> On Jul 20, 2015, at 8:11 PM, Tom Talpey <[email protected]> wrote:
>
>> On 7/20/2015 4:36 PM, Chuck Lever wrote:
>>>
>>> On Jul 20, 2015, at 6:41 PM, Jason Gunthorpe <[email protected]> wrote:
>>>
>>>> On Mon, Jul 20, 2015 at 06:31:11PM -0400, Chuck Lever wrote:
>>>>>
>>>>> On Jul 20, 2015, at 6:26 PM, Jason Gunthorpe <[email protected]> wrote:
>>>>>
>>>>>> On Mon, Jul 20, 2015 at 03:03:11PM -0400, Chuck Lever wrote:
>>>>>>> + iov->length = size;
>>>>>>> + iov->lkey = ia->ri_have_dma_lkey ?
>>>>>>> + ia->ri_dma_lkey : ia->ri_bind_mem->lkey;
>>>>>>> + rb->rg_size = size;
>>>>>>> + rb->rg_owner = NULL;
>>>>>>> return rb;
>>>>>>
>>>>>> There is something odd looking about this..
>>>>>>
>>>>>> ri_bind_mem is only setup in the RPCRDMA_ALLPHYSICAL and
>>>>>> RPCRDMA_MTHCAFMR cases.
>>>>>>
>>>>>> RPCRDMA_FRMR doesn't set it up.
>>>>>>
>>>>>> So this code in rpcrdma_alloc_regbuf is never called for the FRMR
>>>>>> case?
>>>>>>
>>>>>> If yes, then, how is FRMR working? There is absolutely no reason to
>>>>>> use FRMR to register local send buffers, just use the global all
>>>>>> memory lkey...
>>>>>>
>>>>>> If no, then that is an oops?
>>>>>
>>>>> I've tested this code, no oops.
>>>>>
>>>>> FRWR always uses the DMA lkey. xprtrdma does not use FRWR if
>>>>> IB_DEVICE_LOCAL_DMA_LKEY is not asserted.
>>>>
>>>> Ah, I see. Ok.
>>>>
>>>> Is there a reason to link FRWR and LOCAL_DMA_LKEY together?
>>>
>>> Tom would know.
>>
>> Tom Talpey or Tom Tucker? I don't think I used a dma_lkey at all
>> in the original work, but there were some changes later.
>
> The commit which adds FRWR support to xprtrdma introduced a check in
> rpcrdma_ia_open:
>
> + case RPCRDMA_FRMR:
> + /* Requires both frmr reg and local dma lkey */
> + if ((devattr.device_cap_flags &
> + (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) !=
> + (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) {
>
> This seems to be the only place the local DMA lkey requirement is
> expressed or enforced. But yeah, the local DMA lkey doesn't seem
> to be used anywhere in the FRWR-specific parts of the code.
I now vaguely remember someone telling me that both attributes were
required, but a) I may have misunderstood and b) I'm sure something
has changed. FRMR was brand new at the time.
> -----Original Message-----
> From: [email protected] [mailto:[email protected]] On Behalf Of Tom Talpey
> Sent: Monday, July 20, 2015 7:15 PM
> To: Steve Wise; 'Jason Gunthorpe'
> Cc: 'Chuck Lever'; [email protected]; [email protected]
> Subject: Re: [PATCH v3 05/15] xprtrdma: Remove last ib_reg_phys_mr() call site
>
> On 7/20/2015 3:41 PM, Steve Wise wrote:
> >
> >
> >> -----Original Message-----
> >> From: Tom Talpey [mailto:[email protected]]
> >> Sent: Monday, July 20, 2015 5:04 PM
> >> To: Steve Wise; 'Jason Gunthorpe'
> >> Cc: 'Chuck Lever'; [email protected]; [email protected]
> >> Subject: Re: [PATCH v3 05/15] xprtrdma: Remove last ib_reg_phys_mr() call site
> >>
> >> On 7/20/2015 2:16 PM, Steve Wise wrote:
> >>>
> >>>
> >>>> -----Original Message-----
> >>>> From: [email protected] [mailto:[email protected]] On Behalf Of Jason Gunthorpe
> >>>> Sent: Monday, July 20, 2015 4:06 PM
> >>>> To: Tom Talpey; Steve Wise
> >>>> Cc: Chuck Lever; [email protected]; [email protected]
> >>>> Subject: Re: [PATCH v3 05/15] xprtrdma: Remove last ib_reg_phys_mr() call site
> >>>>
> >>>> On Mon, Jul 20, 2015 at 01:34:16PM -0700, Tom Talpey wrote:
> >>>>> On 7/20/2015 12:03 PM, Chuck Lever wrote:
> >>>>>> All HCA providers have an ib_get_dma_mr() verb. Thus
> >>>>>> rpcrdma_ia_open() will either grab the device's local_dma_key if one
> >>>>>> is available, or it will call ib_get_dma_mr() which is a 100%
> >>>>>> guaranteed fallback.
> >>>>>
> >>>>> I recall that in the past, some providers did not support mapping
> >>>>> all of the machine's potential physical memory with a single dma_mr.
> >>>>> If an rnic did/does not support 44-ish bits of length per region,
> >>>>> for example.
> >>>>
> >>>> Looks like you are right, but the standard in kernel is to require
> >>>> ib_get_dma_mr, if the HCA can't do that, then it cannot be used on a
> >>>> big memory machine with kernel ULPs.
> >>>>
> >>>> Looking deeper, both amso1100 and cxgb3 seem limited to 32 bits of
> >>>> physical memory, and silently break all kernel ULPs if they are used
> >>>> on a modern machine with > 4G.
> >>>>
> >>>> Is that right Steve?
> >>>>
> >>>
> >>> Yes.
> >>>
> >>>> Based on that, should we remove the cxgb3 driver as well? Or at least
> >>>> can you fix it up to at least fail get_dma_mr if there is too much
> >>>> ram?
> >>>>
> >>>
> >>> I would like to keep cxgb3 around. I can add code to fail if the memory is > 32b. Do you know how I get the amount of
> >>> available
> >>> ram?
> >>
> >> A) are you sure it's an unsigned length, i.e. is it really 31 bits?
> >>
> >
> > yes.
> >
> >> B) why bother to check? Are machines with <4GB interesting, and worth
> >> supporting a special optimization?
> >
> > No, but cxgb3 is still interesting to user applications, and perhaps NFSRDMA using FRMRs.
>
> I'm obviously not making myself clear. I am suggesting that cxgb3 fail
> the ib_get_dma_mr() verb, regardless of installed memory.
>
> I am not suggesting it fail to load, or fail other memreg requests. It
> should work normally in all other respects.
Even with its limitation, doesn't it have utility for someone using cxgb3 in an embedded 32b environment?
Hey Chuck,
Just a heads up that this patch should go through the infiniband tree, and not NFS :).
Thanks,
Anna
On 07/20/2015 03:04 PM, Chuck Lever wrote:
> The verbs are obsolete. The ib_rereg_phys_mr() verb is not used by
> kernel ULPs, and the last ib_reg_phys_mr() call site in the kernel
> tree has now been removed.
>
> Two staging tree call sites remain in the Lustre client. The Lustre
> team has been notified of the deprecation of reg_phys_mr.
>
> Signed-off-by: Chuck Lever <[email protected]>
> Acked-by: Doug Ledford <[email protected]>
> ---
> drivers/infiniband/core/verbs.c | 67 ---------------------------------------
> include/rdma/ib_verbs.h | 46 ---------------------------
> 2 files changed, 113 deletions(-)
>
> diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c
> index bac3fb4..30eb245 100644
> --- a/drivers/infiniband/core/verbs.c
> +++ b/drivers/infiniband/core/verbs.c
> @@ -1144,73 +1144,6 @@ struct ib_mr *ib_get_dma_mr(struct ib_pd *pd, int mr_access_flags)
> }
> EXPORT_SYMBOL(ib_get_dma_mr);
>
> -struct ib_mr *ib_reg_phys_mr(struct ib_pd *pd,
> - struct ib_phys_buf *phys_buf_array,
> - int num_phys_buf,
> - int mr_access_flags,
> - u64 *iova_start)
> -{
> - struct ib_mr *mr;
> - int err;
> -
> - err = ib_check_mr_access(mr_access_flags);
> - if (err)
> - return ERR_PTR(err);
> -
> - if (!pd->device->reg_phys_mr)
> - return ERR_PTR(-ENOSYS);
> -
> - mr = pd->device->reg_phys_mr(pd, phys_buf_array, num_phys_buf,
> - mr_access_flags, iova_start);
> -
> - if (!IS_ERR(mr)) {
> - mr->device = pd->device;
> - mr->pd = pd;
> - mr->uobject = NULL;
> - atomic_inc(&pd->usecnt);
> - atomic_set(&mr->usecnt, 0);
> - }
> -
> - return mr;
> -}
> -EXPORT_SYMBOL(ib_reg_phys_mr);
> -
> -int ib_rereg_phys_mr(struct ib_mr *mr,
> - int mr_rereg_mask,
> - struct ib_pd *pd,
> - struct ib_phys_buf *phys_buf_array,
> - int num_phys_buf,
> - int mr_access_flags,
> - u64 *iova_start)
> -{
> - struct ib_pd *old_pd;
> - int ret;
> -
> - ret = ib_check_mr_access(mr_access_flags);
> - if (ret)
> - return ret;
> -
> - if (!mr->device->rereg_phys_mr)
> - return -ENOSYS;
> -
> - if (atomic_read(&mr->usecnt))
> - return -EBUSY;
> -
> - old_pd = mr->pd;
> -
> - ret = mr->device->rereg_phys_mr(mr, mr_rereg_mask, pd,
> - phys_buf_array, num_phys_buf,
> - mr_access_flags, iova_start);
> -
> - if (!ret && (mr_rereg_mask & IB_MR_REREG_PD)) {
> - atomic_dec(&old_pd->usecnt);
> - atomic_inc(&pd->usecnt);
> - }
> -
> - return ret;
> -}
> -EXPORT_SYMBOL(ib_rereg_phys_mr);
> -
> int ib_query_mr(struct ib_mr *mr, struct ib_mr_attr *mr_attr)
> {
> return mr->device->query_mr ?
> diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
> index b0f898e..43c1cf0 100644
> --- a/include/rdma/ib_verbs.h
> +++ b/include/rdma/ib_verbs.h
> @@ -2760,52 +2760,6 @@ static inline void ib_dma_free_coherent(struct ib_device *dev,
> }
>
> /**
> - * ib_reg_phys_mr - Prepares a virtually addressed memory region for use
> - * by an HCA.
> - * @pd: The protection domain associated assigned to the registered region.
> - * @phys_buf_array: Specifies a list of physical buffers to use in the
> - * memory region.
> - * @num_phys_buf: Specifies the size of the phys_buf_array.
> - * @mr_access_flags: Specifies the memory access rights.
> - * @iova_start: The offset of the region's starting I/O virtual address.
> - */
> -struct ib_mr *ib_reg_phys_mr(struct ib_pd *pd,
> - struct ib_phys_buf *phys_buf_array,
> - int num_phys_buf,
> - int mr_access_flags,
> - u64 *iova_start);
> -
> -/**
> - * ib_rereg_phys_mr - Modifies the attributes of an existing memory region.
> - * Conceptually, this call performs the functions deregister memory region
> - * followed by register physical memory region. Where possible,
> - * resources are reused instead of deallocated and reallocated.
> - * @mr: The memory region to modify.
> - * @mr_rereg_mask: A bit-mask used to indicate which of the following
> - * properties of the memory region are being modified.
> - * @pd: If %IB_MR_REREG_PD is set in mr_rereg_mask, this field specifies
> - * the new protection domain to associated with the memory region,
> - * otherwise, this parameter is ignored.
> - * @phys_buf_array: If %IB_MR_REREG_TRANS is set in mr_rereg_mask, this
> - * field specifies a list of physical buffers to use in the new
> - * translation, otherwise, this parameter is ignored.
> - * @num_phys_buf: If %IB_MR_REREG_TRANS is set in mr_rereg_mask, this
> - * field specifies the size of the phys_buf_array, otherwise, this
> - * parameter is ignored.
> - * @mr_access_flags: If %IB_MR_REREG_ACCESS is set in mr_rereg_mask, this
> - * field specifies the new memory access rights, otherwise, this
> - * parameter is ignored.
> - * @iova_start: The offset of the region's starting I/O virtual address.
> - */
> -int ib_rereg_phys_mr(struct ib_mr *mr,
> - int mr_rereg_mask,
> - struct ib_pd *pd,
> - struct ib_phys_buf *phys_buf_array,
> - int num_phys_buf,
> - int mr_access_flags,
> - u64 *iova_start);
> -
> -/**
> * ib_query_mr - Retrieves information about a specific memory region.
> * @mr: The memory region to retrieve information about.
> * @mr_attr: The attributes of the specified memory region.
>
Hi Anna-
On Jul 21, 2015, at 4:08 PM, Anna Schumaker <[email protected]> wrote:
> Hey Chuck,
>
> Just a heads up that this patch should go through the infiniband tree, and not NFS :).
The IB maintainer has ACK'd it, so it can go through NFS. It depends
on changes in an earlier patch in this series.
--
Chuck Lever
On 07/21/2015 04:16 PM, Chuck Lever wrote:
> Hi Anna-
>
>
> On Jul 21, 2015, at 4:08 PM, Anna Schumaker <[email protected]> wrote:
>
>> Hey Chuck,
>>
>> Just a heads up that this patch should go through the infiniband tree, and not NFS :).
>
> The IB maintainer has ACK’d it, so it can go through NFS. It depends
> on changes in an earlier patch in this series.
Fair enough! I didn't realize Doug was the IB maintainer.
Anna
On 7/21/2015 7:33 AM, Steve Wise wrote:
>
>
>> -----Original Message-----
>> From: [email protected] [mailto:[email protected]] On Behalf Of Tom Talpey
>> Sent: Monday, July 20, 2015 7:15 PM
>> To: Steve Wise; 'Jason Gunthorpe'
>> Cc: 'Chuck Lever'; [email protected]; [email protected]
>> Subject: Re: [PATCH v3 05/15] xprtrdma: Remove last ib_reg_phys_mr() call site
>>
>> On 7/20/2015 3:41 PM, Steve Wise wrote:
>>>
>>>
>>>> -----Original Message-----
>>>> From: Tom Talpey [mailto:[email protected]]
>>>> Sent: Monday, July 20, 2015 5:04 PM
>>>> To: Steve Wise; 'Jason Gunthorpe'
>>>> Cc: 'Chuck Lever'; [email protected]; [email protected]
>>>> Subject: Re: [PATCH v3 05/15] xprtrdma: Remove last ib_reg_phys_mr() call site
>>>>
>>>> On 7/20/2015 2:16 PM, Steve Wise wrote:
>>>>>
>>>>>
>>>>>> -----Original Message-----
>>>>>> From: [email protected] [mailto:[email protected]] On Behalf Of Jason Gunthorpe
>>>>>> Sent: Monday, July 20, 2015 4:06 PM
>>>>>> To: Tom Talpey; Steve Wise
>>>>>> Cc: Chuck Lever; [email protected]; [email protected]
>>>>>> Subject: Re: [PATCH v3 05/15] xprtrdma: Remove last ib_reg_phys_mr() call site
>>>>>>
>>>>>> On Mon, Jul 20, 2015 at 01:34:16PM -0700, Tom Talpey wrote:
>>>>>>> On 7/20/2015 12:03 PM, Chuck Lever wrote:
>>>>>>>> All HCA providers have an ib_get_dma_mr() verb. Thus
>>>>>>>> rpcrdma_ia_open() will either grab the device's local_dma_key if one
>>>>>>>> is available, or it will call ib_get_dma_mr() which is a 100%
>>>>>>>> guaranteed fallback.
>>>>>>>
>>>>>>> I recall that in the past, some providers did not support mapping
>>>>>>> all of the machine's potential physical memory with a single dma_mr.
>>>>>>> If an rnic did/does not support 44-ish bits of length per region,
>>>>>>> for example.
>>>>>>
>>>>>> Looks like you are right, but the standard in kernel is to require
>>>>>> ib_get_dma_mr, if the HCA can't do that, then it cannot be used on a
>>>>>> big memory machine with kernel ULPs.
>>>>>>
>>>>>> Looking deeper, both amso1100 and cxgb3 seem limited to 32 bits of
>>>>>> physical memory, and silently break all kernel ULPs if they are used
>>>>>> on a modern machine with > 4G.
>>>>>>
>>>>>> Is that right Steve?
>>>>>>
>>>>>
>>>>> Yes.
>>>>>
>>>>>> Based on that, should we remove the cxgb3 driver as well? Or at least
>>>>>> can you fix it up to at least fail get_dma_mr if there is too much
>>>>>> ram?
>>>>>>
>>>>>
>>>>> I would like to keep cxgb3 around. I can add code to fail if the
>>>>> memory is > 32b. Do you know how I get the amount of available ram?
>>>>
>>>> A) are you sure it's an unsigned length, i.e. is it really 31 bits?
>>>>
>>>
>>> yes.
>>>
>>>> B) why bother to check? Are machines with <4GB interesting, and worth
>>>> supporting a special optimization?
>>>
>>> No, but cxgb3 is still interesting to user applications, and perhaps NFSRDMA using FRMRs.
>>
>> I'm obviously not making myself clear. I am suggesting that cxgb3 fail
>> the ib_get_dma_mr() verb, regardless of installed memory.
>>
>> I am not suggesting it fail to load, or fail other memreg requests. It
>> should work normally in all other respects.
>
> Even with its limitation, doesn't it have utility for someone using cxgb3 in an embedded 32b environment?
Sure, do you mean making it conditional on #if sizeof(physaddr) <= 32?
That would make sense I guess.
> -----Original Message-----
> From: Tom Talpey [mailto:[email protected]]
> Sent: Tuesday, July 21, 2015 3:47 PM
> To: Steve Wise; 'Jason Gunthorpe'
> Cc: 'Chuck Lever'; [email protected]; [email protected]
> Subject: Re: [PATCH v3 05/15] xprtrdma: Remove last ib_reg_phys_mr() call site
>
> On 7/21/2015 7:33 AM, Steve Wise wrote:
> > Even with its limitation, doesn't it have utility for someone using cxgb3 in an embedded 32b environment?
>
> Sure, do you mean making it conditional on #if sizeof(physaddr) <= 32?
> That would make sense I guess.
No, a runtime check. x64 platforms will work too if the mem size takes <= 32b to describe.
> > > Even with its limitation, doesn't it have utility for someone using cxgb3 in an embedded 32b environment?
> >
> > Sure, do you mean making it conditional on #if sizeof(physaddr) <= 32?
> > That would make sense I guess.
>
> No, a runtime check. x64 platforms will work too if the mem size takes <= 32b to describe.
Jason/Doug, do you think it should allow dma mr allocation iff totalram_pages < 4GB? Or do a check on sizeof(totalram_pages) and
only allow dma mrs if it is <= 4?
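For concreteness, a minimal sketch of the first option, a runtime gate at
the top of the provider's get_dma_mr entry point, might look like this;
the use of totalram_pages and the error code are illustrative
assumptions, not Steve's eventual patch:

	u64 ram_bytes = (u64)totalram_pages << PAGE_SHIFT;

	/* Sketch: refuse an all-physical DMA MR when installed RAM
	 * cannot be described by the 32-bit lengths T3 supports.
	 */
	if (ram_bytes > 0xffffffffULL)
		return ERR_PTR(-EOPNOTSUPP);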
On 7/20/2015 5:42 PM, Jason Gunthorpe wrote:
> On Mon, Jul 20, 2015 at 05:41:27PM -0500, Steve Wise wrote:
>>> B) why bother to check? Are machines with <4GB interesting, and worth
>>> supporting a special optimization?
>> No, but cxgb3 is still interesting to user applications, and perhaps NFSRDMA using FRMRs.
> Doesn't look like the NFS client will work. It requires an all
> physical memory lkey for SEND and RECV buffers..
>
> Jason
Looks like cxgb3 supports LOCAL_DMA_LKEY and MEM_MGT_EXTENSIONS so dma
mrs aren't required for NFSRDMA:
t4:~/linux-2.6/drivers/infiniband/hw/cxgb3 # grep IB_DEVICE_ iwch_provider.c
strlcpy(dev->ibdev.name, "cxgb3_%d", IB_DEVICE_NAME_MAX);
dev->device_cap_flags = IB_DEVICE_LOCAL_DMA_LKEY |
IB_DEVICE_MEM_WINDOW |
IB_DEVICE_MEM_MGT_EXTENSIONS;
So cxgb3 can still support NFSRDMA and user verbs w/o get_dma_mr(). I'll
submit a patch soon to only support get_dma_mr() if unsigned long is 4
bytes...
Steve.
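A ULP can key off those capability bits and avoid ib_get_dma_mr()
entirely. Roughly, assuming the device pointer and the lkey and use_frwr
variables belong to the caller (this paraphrases what the xprtrdma
patches in this series do rather than quoting them):

	struct ib_device_attr attr;

	if (ib_query_device(device, &attr) == 0) {
		/* Local DMA lkey covers SEND/RECV buffers */
		if (attr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY)
			lkey = device->local_dma_lkey;
		/* FRWR handles registration of RPC data payloads */
		if (attr.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS)
			use_frwr = true;
	}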
On Tue, Jul 21, 2015 at 05:41:22PM -0500, Steve Wise wrote:
> On 7/20/2015 5:42 PM, Jason Gunthorpe wrote:
> >On Mon, Jul 20, 2015 at 05:41:27PM -0500, Steve Wise wrote:
> >>>B) why bother to check? Are machines with <4GB interesting, and worth
> >>>supporting a special optimization?
> >>No, but cxgb3 is still interesting to user applications, and perhaps NFSRDMA using FRMRs.
> >Doesn't look like the NFS client will work. It requires an all
> >physical memory lkey for SEND and RECV buffers..
> >
> >Jason
>
> Looks like cxgb3 supports LOCAL_DMA_LKEY and MEM_MGT_EXTENSIONS so dma mrs
> aren't required for NFSRDMA:
>
> t4:~/linux-2.6/drivers/infiniband/hw/cxgb3 # grep IB_DEVICE_ iwch_provider.c
> strlcpy(dev->ibdev.name, "cxgb3_%d", IB_DEVICE_NAME_MAX);
> dev->device_cap_flags = IB_DEVICE_LOCAL_DMA_LKEY |
> IB_DEVICE_MEM_WINDOW |
> IB_DEVICE_MEM_MGT_EXTENSIONS;
Neat. Is the dma_mask set properly (I don't see any set at all)?
> So cxgb3 can still support NFSRDMA and user verbs w/o get_dma_mr(). I'll
> submit a patch soon to only support get_dma_mr() if unsigned long is 4
> bytes...
So, NFS and RDS seem to be the only iWarp compatible ULPs?
NFS has worked, and will continue to work with the global lkey.
RDS looks like it relies on an insecure all physical rkey, so it won't
work until that is fixed.
So, I'd just use sizeof(phys_addr_t) > 4 as the test. The only people
that could be impacted are RDS users using distro kernels on machines
with less than 4G of ram. I somehow doubt there are any of those...
Jason
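Note that sizeof() cannot be evaluated in a preprocessor #if, which is
why the test has to live in C code; the compiler still folds it to a
constant, so the gate costs nothing at runtime. A sketch, with the error
code assumed:

	if (sizeof(phys_addr_t) > 4)
		return ERR_PTR(-EOPNOTSUPP);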
> -----Original Message-----
> From: [email protected] [mailto:[email protected]] On Behalf Of Jason Gunthorpe
> Sent: Tuesday, July 21, 2015 5:55 PM
> To: Steve Wise
> Cc: 'Tom Talpey'; 'Chuck Lever'; [email protected]; [email protected]
> Subject: Re: [PATCH v3 05/15] xprtrdma: Remove last ib_reg_phys_mr() call site
>
> Neat. Is the dma_mask set properly (I don't see any set at all)?
>
iw_cxgb3 isn't a PCI driver. It sits on top of cxgb3 which is the pci device driver and calls pci_set_dma_mask().
On Mon, Jul 20, 2015 at 03:02:33PM -0400, Chuck Lever wrote:
> In particular, recognize when an IPv6 connection is bound.
>
> Signed-off-by: Chuck Lever <[email protected]>
> Tested-by: Devesh Sharma <[email protected]>
Looks good,
Reviewed-by: Christoph Hellwig <[email protected]>
On Mon, Jul 20, 2015 at 03:03:02PM -0400, Chuck Lever wrote:
> PHYSICAL memory registration uses a single rkey for all of the
> client's memory, thus is insecure. It is still useful in some cases
> for testing.
>
> Retain the ability to select PHYSICAL memory registration capability
> via /proc/sys/sunrpc/rdma_memreg_strategy, but don't fall back to it
> if the HCA does not support FRWR or FMR.
>
> This means amso1100 no longer works out of the box with NFS/RDMA.
> When using amso1100 HCAs, set the memreg_strategy sysctl to 6 before
> performing NFS/RDMA mounts.
Looks good,
Reviewed-by: Christoph Hellwig <[email protected]>
Jason has patches that provide a local_dma_lkey in the PD that is always
available. Do you need this cleanup for the next merge window? If not
it might be worth postponing it to avoid merge conflicts, especially
as I assume the NFS changes will go in through Trond.
On Mon, Jul 20, 2015 at 03:03:20PM -0400, Chuck Lever wrote:
> Untangle the end of rpcrdma_ia_open() by moving DMA MR set-up, which
> is different for each registration method, to the .ro_open functions.
>
> This is refactoring only. No behavior change is expected.
>
> Signed-off-by: Chuck Lever <[email protected]>
> Tested-by: Devesh Sharma <[email protected]>
> ---
> net/sunrpc/xprtrdma/fmr_ops.c | 19 +++++++++++
> net/sunrpc/xprtrdma/frwr_ops.c | 5 +++
> net/sunrpc/xprtrdma/physical_ops.c | 25 ++++++++++++++-
> net/sunrpc/xprtrdma/verbs.c | 60 +++++++++++-------------------------
> net/sunrpc/xprtrdma/xprt_rdma.h | 3 +-
> 5 files changed, 67 insertions(+), 45 deletions(-)
>
> diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c
> index f1e8daf..cb25c89 100644
> --- a/net/sunrpc/xprtrdma/fmr_ops.c
> +++ b/net/sunrpc/xprtrdma/fmr_ops.c
> @@ -39,6 +39,25 @@ static int
> fmr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep,
> struct rpcrdma_create_data_internal *cdata)
> {
> + struct ib_device_attr *devattr = &ia->ri_devattr;
> + struct ib_mr *mr;
> +
> + /* Obtain an lkey to use for the regbufs, which are
> + * protected from remote access.
> + */
> + if (devattr->device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY) {
> + ia->ri_dma_lkey = ia->ri_device->local_dma_lkey;
> + } else {
> + mr = ib_get_dma_mr(ia->ri_pd, IB_ACCESS_LOCAL_WRITE);
> + if (IS_ERR(mr)) {
> + pr_err("%s: ib_get_dma_mr failed with %lX\n",
> + __func__, PTR_ERR(mr));
> + return -ENOMEM;
> + }
> + ia->ri_dma_mr = mr;
> + ia->ri_dma_lkey = ia->ri_dma_mr->lkey;
> + }
> +
> return 0;
> }
>
> diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
> index 04ea914..63f282e 100644
> --- a/net/sunrpc/xprtrdma/frwr_ops.c
> +++ b/net/sunrpc/xprtrdma/frwr_ops.c
> @@ -189,6 +189,11 @@ frwr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep,
> struct ib_device_attr *devattr = &ia->ri_devattr;
> int depth, delta;
>
> + /* Obtain an lkey to use for the regbufs, which are
> + * protected from remote access.
> + */
> + ia->ri_dma_lkey = ia->ri_device->local_dma_lkey;
> +
> ia->ri_max_frmr_depth =
> min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS,
> devattr->max_fast_reg_page_list_len);
> diff --git a/net/sunrpc/xprtrdma/physical_ops.c b/net/sunrpc/xprtrdma/physical_ops.c
> index 41985d0..72cf8b1 100644
> --- a/net/sunrpc/xprtrdma/physical_ops.c
> +++ b/net/sunrpc/xprtrdma/physical_ops.c
> @@ -23,6 +23,29 @@ static int
> physical_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep,
> struct rpcrdma_create_data_internal *cdata)
> {
> + struct ib_device_attr *devattr = &ia->ri_devattr;
> + struct ib_mr *mr;
> +
> + /* Obtain an rkey to use for RPC data payloads.
> + */
> + mr = ib_get_dma_mr(ia->ri_pd,
> + IB_ACCESS_LOCAL_WRITE |
> + IB_ACCESS_REMOTE_WRITE |
> + IB_ACCESS_REMOTE_READ);
> + if (IS_ERR(mr)) {
> + pr_err("%s: ib_get_dma_mr failed with %lX\n",
> + __func__, PTR_ERR(mr));
> + return -ENOMEM;
> + }
> + ia->ri_dma_mr = mr;
> +
> + /* Obtain an lkey to use for regbufs.
> + */
> + if (devattr->device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY)
> + ia->ri_dma_lkey = ia->ri_device->local_dma_lkey;
> + else
> + ia->ri_dma_lkey = ia->ri_dma_mr->lkey;
> +
> return 0;
> }
>
> @@ -51,7 +74,7 @@ physical_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
> struct rpcrdma_ia *ia = &r_xprt->rx_ia;
>
> rpcrdma_map_one(ia->ri_device, seg, rpcrdma_data_dir(writing));
> - seg->mr_rkey = ia->ri_bind_mem->rkey;
> + seg->mr_rkey = ia->ri_dma_mr->rkey;
> seg->mr_base = seg->mr_dma;
> seg->mr_nsegs = 1;
> return 1;
> diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
> index da184f9..8516d98 100644
> --- a/net/sunrpc/xprtrdma/verbs.c
> +++ b/net/sunrpc/xprtrdma/verbs.c
> @@ -493,9 +493,11 @@ rpcrdma_clean_cq(struct ib_cq *cq)
> int
> rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
> {
> - int rc, mem_priv;
> struct rpcrdma_ia *ia = &xprt->rx_ia;
> struct ib_device_attr *devattr = &ia->ri_devattr;
> + int rc;
> +
> + ia->ri_dma_mr = NULL;
>
> ia->ri_id = rpcrdma_create_id(xprt, ia, addr);
> if (IS_ERR(ia->ri_id)) {
> @@ -519,11 +521,6 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
> goto out3;
> }
>
> - if (devattr->device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY) {
> - ia->ri_have_dma_lkey = 1;
> - ia->ri_dma_lkey = ia->ri_device->local_dma_lkey;
> - }
> -
> if (memreg == RPCRDMA_FRMR) {
> /* Requires both frmr reg and local dma lkey */
> if (((devattr->device_cap_flags &
> @@ -543,38 +540,15 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
> }
> }
>
> - /*
> - * Optionally obtain an underlying physical identity mapping in
> - * order to do a memory window-based bind. This base registration
> - * is protected from remote access - that is enabled only by binding
> - * for the specific bytes targeted during each RPC operation, and
> - * revoked after the corresponding completion similar to a storage
> - * adapter.
> - */
> switch (memreg) {
> case RPCRDMA_FRMR:
> ia->ri_ops = &rpcrdma_frwr_memreg_ops;
> break;
> case RPCRDMA_ALLPHYSICAL:
> ia->ri_ops = &rpcrdma_physical_memreg_ops;
> - mem_priv = IB_ACCESS_LOCAL_WRITE |
> - IB_ACCESS_REMOTE_WRITE |
> - IB_ACCESS_REMOTE_READ;
> - goto register_setup;
> + break;
> case RPCRDMA_MTHCAFMR:
> ia->ri_ops = &rpcrdma_fmr_memreg_ops;
> - if (ia->ri_have_dma_lkey)
> - break;
> - mem_priv = IB_ACCESS_LOCAL_WRITE;
> - register_setup:
> - ia->ri_bind_mem = ib_get_dma_mr(ia->ri_pd, mem_priv);
> - if (IS_ERR(ia->ri_bind_mem)) {
> - printk(KERN_ALERT "%s: ib_get_dma_mr for "
> - "phys register failed with %lX\n",
> - __func__, PTR_ERR(ia->ri_bind_mem));
> - rc = -ENOMEM;
> - goto out3;
> - }
> break;
> default:
> printk(KERN_ERR "RPC: Unsupported memory "
> @@ -606,15 +580,7 @@ out1:
> void
> rpcrdma_ia_close(struct rpcrdma_ia *ia)
> {
> - int rc;
> -
> dprintk("RPC: %s: entering\n", __func__);
> - if (ia->ri_bind_mem != NULL) {
> - rc = ib_dereg_mr(ia->ri_bind_mem);
> - dprintk("RPC: %s: ib_dereg_mr returned %i\n",
> - __func__, rc);
> - }
> -
> if (ia->ri_id != NULL && !IS_ERR(ia->ri_id)) {
> if (ia->ri_id->qp)
> rdma_destroy_qp(ia->ri_id);
> @@ -661,8 +627,10 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
> if (cdata->padding) {
> ep->rep_padbuf = rpcrdma_alloc_regbuf(ia, cdata->padding,
> GFP_KERNEL);
> - if (IS_ERR(ep->rep_padbuf))
> - return PTR_ERR(ep->rep_padbuf);
> + if (IS_ERR(ep->rep_padbuf)) {
> + rc = PTR_ERR(ep->rep_padbuf);
> + goto out0;
> + }
> } else
> ep->rep_padbuf = NULL;
>
> @@ -749,6 +717,9 @@ out2:
> __func__, err);
> out1:
> rpcrdma_free_regbuf(ia, ep->rep_padbuf);
> +out0:
> + if (ia->ri_dma_mr)
> + ib_dereg_mr(ia->ri_dma_mr);
> return rc;
> }
>
> @@ -788,6 +759,12 @@ rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
> if (rc)
> dprintk("RPC: %s: ib_destroy_cq returned %i\n",
> __func__, rc);
> +
> + if (ia->ri_dma_mr) {
> + rc = ib_dereg_mr(ia->ri_dma_mr);
> + dprintk("RPC: %s: ib_dereg_mr returned %i\n",
> + __func__, rc);
> + }
> }
>
> /*
> @@ -1262,8 +1239,7 @@ rpcrdma_alloc_regbuf(struct rpcrdma_ia *ia, size_t size, gfp_t flags)
> goto out_free;
>
> iov->length = size;
> - iov->lkey = ia->ri_have_dma_lkey ?
> - ia->ri_dma_lkey : ia->ri_bind_mem->lkey;
> + iov->lkey = ia->ri_dma_lkey;
> rb->rg_size = size;
> rb->rg_owner = NULL;
> return rb;
> diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
> index ce4e79e..8219011 100644
> --- a/net/sunrpc/xprtrdma/xprt_rdma.h
> +++ b/net/sunrpc/xprtrdma/xprt_rdma.h
> @@ -65,9 +65,8 @@ struct rpcrdma_ia {
> struct ib_device *ri_device;
> struct rdma_cm_id *ri_id;
> struct ib_pd *ri_pd;
> - struct ib_mr *ri_bind_mem;
> + struct ib_mr *ri_dma_mr;
> u32 ri_dma_lkey;
> - int ri_have_dma_lkey;
> struct completion ri_done;
> int ri_async_rc;
> unsigned int ri_max_frmr_depth;
>
---end quoted text---
Hi Christoph-
On Jul 26, 2015, at 12:53 PM, Christoph Hellwig <[email protected]> wrote:
> Jason has patches that provide a local_dma_lkey in the PD that is always
> available. Do you need this clean up for the next merge window? If not
> it might be worth to postponed it to avoid merge conflicts, specially
> as I assume the NFS changes will go in through Trond.
No, this patch is not strictly needed in 4.3, but my read of
Jason's series is that he does not touch xprtrdma. I don't
believe there will be a merge conflict.
The goal of this patch is to move xprtrdma forward so it will
be straightforward to use pd->local_dma_lkey for RPC send and
receive buffers. That's a change that can be added after both
this patch and Jason's series are merged.
I prefer keeping this patch separate, because that makes it
simpler to review and test this refactor. I don't see a reason
to delay it, but I can do that if it is needed.
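The eventual simplification would be small; assuming Jason's series adds
a local_dma_lkey field to every PD, the lkey logic in each .ro_open
method collapses to something like:

	ia->ri_dma_lkey = ia->ri_pd->local_dma_lkey;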
--
Chuck Lever
On Sun, Jul 26, 2015 at 02:21:23PM -0400, Chuck Lever wrote:
> No, this patch is not strictly needed in 4.3, but my read of
> Jason's series is that he does not touch xprtrdma. I don't
> believe there will be a merge conflict.
You're right, Jason didn't touch xprtrdma. Sorry for the noise.