2008-08-13 16:18:33

by Tom Tucker

Subject: [PATCH,RFC 02/02] xprtrdma: Update the RPC memory registration to use FRMR


Use FRMR when registering client memory if the memory registration
strategy is FRMR.

Signed-off-by: Tom Tucker <[email protected]>

---
net/sunrpc/xprtrdma/verbs.c | 296 ++++++++++++++++++++++++++++++++++++++-----
1 files changed, 263 insertions(+), 33 deletions(-)

diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 8ea283e..ed401f9 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -423,6 +423,7 @@ rpcrdma_clean_cq(struct ib_cq *cq)
int
rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
{
+ struct ib_device_attr devattr;
int rc;
struct rpcrdma_ia *ia = &xprt->rx_ia;

@@ -443,6 +444,49 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
}

/*
+ * Query the device to determine if the requested memory
+ * registration strategy is supported. If it isn't, set the
+ * strategy to a globally supported model.
+ */
+ rc = ib_query_device(ia->ri_id->device, &devattr);
+ if (rc) {
+ dprintk("RPC: %s: ib_query_device failed %d\n",
+ __func__, rc);
+ goto out2;
+ }
+ switch (memreg) {
+ case RPCRDMA_MEMWINDOWS:
+ case RPCRDMA_MEMWINDOWS_ASYNC:
+ if (!(devattr.device_cap_flags & IB_DEVICE_MEM_WINDOW)) {
+ dprintk("RPC: %s: MEMWINDOWS specified but not "
+ "supported, using RPCRDMA_ALLPHYSICAL",
+ __func__);
+ memreg = RPCRDMA_ALLPHYSICAL;
+ }
+ break;
+ case RPCRDMA_MTHCAFMR:
+ if (!ia->ri_id->device->alloc_fmr) {
+ dprintk("RPC: %s: MTHCAFMR specified but not "
+ "supported, using RPCRDMA_ALLPHYSICAL",
+ __func__);
+ memreg = RPCRDMA_ALLPHYSICAL;
+ }
+ break;
+ case RPCRDMA_FASTREG:
+ /* Requires both fast reg and global dma lkey */
+ if ((0 ==
+ (devattr.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS)) ||
+ (0 == (devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY))) {
+ dprintk("RPC: %s: FASTREG specified but not "
+ "supported, using RPCRDMA_ALLPHYSICAL",
+ __func__);
+ memreg = RPCRDMA_ALLPHYSICAL;
+ }
+ break;
+ }
+ dprintk("RPC: memory registration strategy is %d\n", memreg);
+
+ /*
* Optionally obtain an underlying physical identity mapping in
* order to do a memory window-based bind. This base registration
* is protected from remote access - that is enabled only by binding
@@ -450,7 +494,7 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
* revoked after the corresponding completion similar to a storage
* adapter.
*/
- if (memreg > RPCRDMA_REGISTER) {
+ if ((memreg > RPCRDMA_REGISTER) && (memreg != RPCRDMA_FASTREG)) {
int mem_priv = IB_ACCESS_LOCAL_WRITE;
switch (memreg) {
#if RPCRDMA_PERSISTENT_REGISTRATION
@@ -475,7 +519,10 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
memreg = RPCRDMA_REGISTER;
ia->ri_bind_mem = NULL;
}
+ ia->ri_dma_lkey = ia->ri_bind_mem->lkey;
}
+ if (memreg == RPCRDMA_FASTREG)
+ ia->ri_dma_lkey = ia->ri_id->device->local_dma_lkey;

/* Else will do memory reg/dereg for each chunk */
ia->ri_memreg_strategy = memreg;
@@ -541,6 +588,12 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
ep->rep_attr.srq = NULL;
ep->rep_attr.cap.max_send_wr = cdata->max_requests;
switch (ia->ri_memreg_strategy) {
+ case RPCRDMA_FASTREG:
+ /* Add room for fast reg and invalidate */
+ ep->rep_attr.cap.max_send_wr *= 3;
+ if (ep->rep_attr.cap.max_send_wr > devattr.max_qp_wr)
+ return -EINVAL;
+ break;
case RPCRDMA_MEMWINDOWS_ASYNC:
case RPCRDMA_MEMWINDOWS:
/* Add room for mw_binds+unbinds - overkill! */
@@ -623,6 +676,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
break;
case RPCRDMA_MTHCAFMR:
case RPCRDMA_REGISTER:
+ case RPCRDMA_FASTREG:
ep->rep_remote_cma.responder_resources = cdata->max_requests *
(RPCRDMA_MAX_DATA_SEGS / 8);
break;
@@ -866,6 +920,7 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,

buf->rb_max_requests = cdata->max_requests;
spin_lock_init(&buf->rb_lock);
+ spin_lock_init(&buf->rb_frs_lock);
atomic_set(&buf->rb_credits, 1);

/* Need to allocate:
@@ -874,6 +929,7 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
* 3. array of struct rpcrdma_rep for replies
* 4. padding, if any
* 5. mw's, if any
+ * 6. frmr's, if any
* Send/recv buffers in req/rep need to be registered
*/

@@ -881,6 +937,10 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
(sizeof(struct rpcrdma_req *) + sizeof(struct rpcrdma_rep *));
len += cdata->padding;
switch (ia->ri_memreg_strategy) {
+ case RPCRDMA_FASTREG:
+ len += (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS *
+ sizeof(struct rpcrdma_frmr);
+ break;
case RPCRDMA_MTHCAFMR:
/* TBD we are perhaps overallocating here */
len += (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS *
@@ -895,7 +955,7 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
break;
}

- /* allocate 1, 4 and 5 in one shot */
+ /* allocate 1, 4, 5 and 6 in one shot */
p = kzalloc(len, GFP_KERNEL);
if (p == NULL) {
dprintk("RPC: %s: req_t/rep_t/pad kzalloc(%zd) failed\n",
@@ -927,7 +987,38 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
* and also reduce unbind-to-bind collision.
*/
INIT_LIST_HEAD(&buf->rb_mws);
+ INIT_LIST_HEAD(&buf->rb_frs);
switch (ia->ri_memreg_strategy) {
+ case RPCRDMA_FASTREG:
+ {
+ struct rpcrdma_frmr *fr = (struct rpcrdma_frmr *)p;
+ for (i = (buf->rb_max_requests+1) * RPCRDMA_MAX_SEGS; i; i--) {
+ fr->fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd,
+ RPCRDMA_MAX_SEGS);
+ if (IS_ERR(fr->fr_mr)) {
+ rc = PTR_ERR(fr->fr_mr);
+ printk("RPC: %s: ib_alloc_fast_reg_mr"
+ " failed %i\n", __func__, rc);
+ goto out;
+ }
+ fr->fr_pgl =
+ ib_alloc_fast_reg_page_list(ia->ri_id->device,
+ RPCRDMA_MAX_SEGS);
+ if (IS_ERR(fr->fr_pgl)) {
+ rc = PTR_ERR(fr->fr_pgl);
+ printk("RPC: %s: "
+ "ib_alloc_fast_reg_page_list "
+ "failed %i\n", __func__, rc);
+ goto out;
+ }
+ INIT_LIST_HEAD(&fr->fr_list);
+ list_add(&fr->fr_list, &buf->rb_frs);
+ dprintk("RPC: %s alloc fmr %p pgl %p\n", __func__,
+ fr->fr_mr, fr->fr_pgl);
+ ++fr;
+ }
+ }
+ break;
case RPCRDMA_MTHCAFMR:
{
struct rpcrdma_mw *r = (struct rpcrdma_mw *)p;
@@ -1056,6 +1147,49 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
*/
dprintk("RPC: %s: entering\n", __func__);

+ while (!list_empty(&buf->rb_frs)) {
+ struct rpcrdma_frmr *fr =
+ list_entry(buf->rb_frs.next,
+ struct rpcrdma_frmr, fr_list);
+ list_del(&fr->fr_list);
+ rc = ib_dereg_mr(fr->fr_mr);
+ if (rc)
+ dprintk("RPC: %s:"
+ " ib_dereg_mr"
+ " failed %i\n",
+ __func__, rc);
+ ib_free_fast_reg_page_list(fr->fr_pgl);
+ }
+
+ while (!list_empty(&buf->rb_mws)) {
+ struct rpcrdma_mw *r;
+ switch (ia->ri_memreg_strategy) {
+ case RPCRDMA_MTHCAFMR:
+ r = list_entry(buf->rb_mws.next,
+ struct rpcrdma_mw, mw_list);
+ list_del(&r->mw_list);
+ rc = ib_dealloc_fmr(r->r.fmr);
+ if (rc)
+ dprintk("RPC: %s:"
+ " ib_dealloc_fmr"
+ " failed %i\n",
+ __func__, rc);
+ break;
+ case RPCRDMA_MEMWINDOWS_ASYNC:
+ case RPCRDMA_MEMWINDOWS:
+ r = list_entry(buf->rb_mws.next,
+ struct rpcrdma_mw, mw_list);
+ list_del(&r->mw_list);
+ rc = ib_dealloc_mw(r->r.mw);
+ if (rc)
+ dprintk("RPC: %s: ib_dealloc_mw "
+ "failed %i\n", __func__, rc);
+ break;
+ default:
+ break;
+ }
+ }
+
for (i = 0; i < buf->rb_max_requests; i++) {
if (buf->rb_recv_bufs && buf->rb_recv_bufs[i]) {
rpcrdma_deregister_internal(ia,
@@ -1064,33 +1198,6 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
kfree(buf->rb_recv_bufs[i]);
}
if (buf->rb_send_bufs && buf->rb_send_bufs[i]) {
- while (!list_empty(&buf->rb_mws)) {
- struct rpcrdma_mw *r;
- r = list_entry(buf->rb_mws.next,
- struct rpcrdma_mw, mw_list);
- list_del(&r->mw_list);
- switch (ia->ri_memreg_strategy) {
- case RPCRDMA_MTHCAFMR:
- rc = ib_dealloc_fmr(r->r.fmr);
- if (rc)
- dprintk("RPC: %s:"
- " ib_dealloc_fmr"
- " failed %i\n",
- __func__, rc);
- break;
- case RPCRDMA_MEMWINDOWS_ASYNC:
- case RPCRDMA_MEMWINDOWS:
- rc = ib_dealloc_mw(r->r.mw);
- if (rc)
- dprintk("RPC: %s:"
- " ib_dealloc_mw"
- " failed %i\n",
- __func__, rc);
- break;
- default:
- break;
- }
- }
rpcrdma_deregister_internal(ia,
buf->rb_send_bufs[i]->rl_handle,
&buf->rb_send_bufs[i]->rl_iov);
@@ -1115,6 +1222,7 @@ rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
{
struct rpcrdma_req *req;
unsigned long flags;
+ int i;

spin_lock_irqsave(&buffers->rb_lock, flags);
if (buffers->rb_send_index == buffers->rb_max_requests) {
@@ -1134,8 +1242,11 @@ rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
}
buffers->rb_send_bufs[buffers->rb_send_index++] = NULL;
+ for (i = 0; i < RPCRDMA_MAX_SEGS; i++)
+ req->rl_segments[i].mr_chunk.rl_fr = NULL;
+
if (!list_empty(&buffers->rb_mws)) {
- int i = RPCRDMA_MAX_SEGS - 1;
+ i = RPCRDMA_MAX_SEGS - 1;
do {
struct rpcrdma_mw *r;
r = list_entry(buffers->rb_mws.next,
@@ -1148,6 +1259,31 @@ rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
return req;
}

+static void
+rpcrdma_free_frmr(struct rpcrdma_buffer *buf, struct rpcrdma_frmr *fr_mr)
+{
+ unsigned long flags;
+ spin_lock_irqsave(&buf->rb_frs_lock, flags);
+ list_add(&fr_mr->fr_list, &buf->rb_frs);
+ spin_unlock_irqrestore(&buf->rb_frs_lock, flags);
+}
+
+static struct rpcrdma_frmr *
+rpcrdma_alloc_frmr(struct rpcrdma_buffer *buf)
+{
+ unsigned long flags;
+ struct rpcrdma_frmr *fr_mr = NULL;
+
+ spin_lock_irqsave(&buf->rb_frs_lock, flags);
+ if (!list_empty(&buf->rb_frs)) {
+ fr_mr = list_entry(buf->rb_frs.next,
+ struct rpcrdma_frmr, fr_list);
+ list_del_init(&fr_mr->fr_list);
+ }
+ spin_unlock_irqrestore(&buf->rb_frs_lock, flags);
+ return fr_mr;
+}
+
/*
* Put request/reply buffers back into pool.
* Pre-decrement counter/array index.
@@ -1252,9 +1388,10 @@ rpcrdma_register_internal(struct rpcrdma_ia *ia, void *va, int len,
va, len, DMA_BIDIRECTIONAL);
iov->length = len;

- if (ia->ri_bind_mem != NULL) {
+ if (RPCRDMA_FASTREG == ia->ri_memreg_strategy ||
+ ia->ri_bind_mem) {
*mrp = NULL;
- iov->lkey = ia->ri_bind_mem->lkey;
+ iov->lkey = ia->ri_dma_lkey;
return 0;
}

@@ -1302,6 +1439,43 @@ rpcrdma_deregister_internal(struct rpcrdma_ia *ia,
/*
* Wrappers for chunk registration, shared by read/write chunk code.
*/
+static int
+rpcrdma_fastreg_seg(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *mr,
+ int nsegs, u32 access)
+{
+ struct ib_send_wr invalidate_wr, fastreg_wr, *bad_wr;
+ u8 key;
+ u32 rkey = mr->mr_chunk.rl_fr->fr_mr->rkey;
+ int ret;
+
+ /* Prepare INVALIDATE WR */
+ memset(&invalidate_wr, 0, sizeof invalidate_wr);
+ invalidate_wr.opcode = IB_WR_LOCAL_INV;
+ invalidate_wr.send_flags = IB_SEND_SIGNALED;
+ invalidate_wr.ex.invalidate_rkey = rkey;
+ invalidate_wr.next = &fastreg_wr;
+
+ /* Bump the key */
+ key = (u8)(mr->mr_chunk.rl_fr->fr_mr->rkey & 0x000000FF);
+ ib_update_fast_reg_key(mr->mr_chunk.rl_fr->fr_mr, ++key);
+
+ /* Prepare FASTREG WR */
+ memset(&fastreg_wr, 0, sizeof fastreg_wr);
+ fastreg_wr.opcode = IB_WR_FAST_REG_MR;
+ fastreg_wr.send_flags = IB_SEND_SIGNALED;
+ fastreg_wr.wr.fast_reg.iova_start = (unsigned long)mr->mr_dma;
+ fastreg_wr.wr.fast_reg.page_list = mr->mr_chunk.rl_fr->fr_pgl;
+ fastreg_wr.wr.fast_reg.page_list_len = nsegs;
+ fastreg_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
+ fastreg_wr.wr.fast_reg.length = nsegs << PAGE_SHIFT;
+ fastreg_wr.wr.fast_reg.access_flags = access;
+ fastreg_wr.wr.fast_reg.rkey = mr->mr_chunk.rl_fr->fr_mr->rkey;
+ ret = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr);
+ dprintk("RPC: %s fast reg rkey %08x kva %llx map_len "
+ "%d page_list_len %d ret %d\n", __func__,
+ rkey, mr->mr_dma, nsegs << PAGE_SHIFT, nsegs, ret);
+ return ret;
+}

static void
rpcrdma_map_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg, int writing)
@@ -1353,6 +1527,53 @@ rpcrdma_register_external(struct rpcrdma_mr_seg *seg,
#endif

/* Registration using fast memory registration */
+ case RPCRDMA_FASTREG:
+ {
+ int len, pageoff = offset_in_page(seg->mr_offset);
+
+ seg1->mr_chunk.rl_fr = rpcrdma_alloc_frmr(&r_xprt->rx_buf);
+ if (!seg1->mr_chunk.rl_fr) {
+ printk("RPC: Failed to allocate frmr\n");
+ rc = -ENOMEM;
+ break;
+ }
+ seg1->mr_offset -= pageoff; /* start of page */
+ seg1->mr_len += pageoff;
+ len = -pageoff;
+ if (nsegs > RPCRDMA_MAX_DATA_SEGS)
+ nsegs = RPCRDMA_MAX_DATA_SEGS;
+ for (i = 0; i < nsegs;) {
+ rpcrdma_map_one(ia, seg, writing);
+ seg1->mr_chunk.rl_fr->fr_pgl->page_list[i] = seg->mr_dma;
+ len += seg->mr_len;
+ ++seg;
+ ++i;
+ /* Check for holes */
+ if ((i < nsegs && offset_in_page(seg->mr_offset)) ||
+ offset_in_page((seg-1)->mr_offset+(seg-1)->mr_len))
+ break;
+ }
+ nsegs = i;
+ dprintk("RPC: %s: Using fmr %p to map %d segments\n",
+ __func__, seg1->mr_chunk.rl_fr, nsegs);
+ rc = rpcrdma_fastreg_seg(ia, seg1, nsegs, mem_priv);
+ if (rc) {
+ printk("RPC: %s: failed ib_map_phys_fmr "
+ "%u@0x%llx+%i (%d)... status %i\n", __func__,
+ len, (unsigned long long)seg1->mr_dma,
+ pageoff, nsegs, rc);
+ while (nsegs--)
+ rpcrdma_unmap_one(ia, --seg);
+ } else {
+ seg1->mr_rkey = seg1->mr_chunk.rl_fr->fr_mr->rkey;
+ seg1->mr_base = seg1->mr_dma + pageoff;
+ seg1->mr_nsegs = nsegs;
+ seg1->mr_len = len;
+ }
+ }
+ break;
+
+ /* Registration using MTHCA FMR */
case RPCRDMA_MTHCAFMR:
{
u64 physaddrs[RPCRDMA_MAX_DATA_SEGS];
@@ -1486,6 +1707,16 @@ rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg,
break;
#endif

+ case RPCRDMA_FASTREG:
+ while (seg1->mr_nsegs--) {
+ if (seg1->mr_chunk.rl_fr) {
+ rpcrdma_free_frmr(&r_xprt->rx_buf, seg1->mr_chunk.rl_fr);
+ seg1->mr_chunk.rl_fr = NULL;
+ }
+ rpcrdma_unmap_one(ia, seg++);
+ }
+ break;
+
case RPCRDMA_MTHCAFMR:
{
LIST_HEAD(l);
@@ -1590,7 +1821,6 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia,
INIT_CQCOUNT(ep);
send_wr.send_flags = IB_SEND_SIGNALED;
}
-
rc = ib_post_send(ia->ri_id->qp, &send_wr, &send_wr_fail);
if (rc)
dprintk("RPC: %s: ib_post_send returned %i\n", __func__,


2008-08-13 17:41:00

by Trond Myklebust

Subject: Re: [PATCH,RFC 02/02] xprtrdma: Update the RPC memory registration to use FRMR

On Wed, 2008-08-13 at 11:18 -0500, Tom Tucker wrote:
> Use FRMR when registering client memory if the memory registration
> strategy is FRMR.
>
> Signed-off-by: Tom Tucker <[email protected]>
>
> [...]
>
> INIT_LIST_HEAD(&buf->rb_mws);
> + INIT_LIST_HEAD(&buf->rb_frs);
> switch (ia->ri_memreg_strategy) {
> + case RPCRDMA_FASTREG:
> + {

Can we please get rid of this kind of ugliness whereby we insert blocks
just in order to set up a temporary variable. I know Chuck is fond of
it, but as far as I'm concerned it just screws up readability of the
code.
If you feel you need to isolate a variable to a particular block of
code, then make a function, and that's particularly true of these huge
switch statements that are trying to do 100 entirely unrelated things in
one function.

> + struct rpcrdma_frmr *fr = (struct rpcrdma_frmr *)p;
> + for (i = (buf->rb_max_requests+1) * RPCRDMA_MAX_SEGS; i; i--) {
> + fr->fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd,
> + RPCRDMA_MAX_SEGS);
> + if (IS_ERR(fr->fr_mr)) {
> + rc = PTR_ERR(fr->fr_mr);
> + printk("RPC: %s: ib_alloc_fast_reg_mr"
> + " failed %i\n", __func__, rc);
> + goto out;
> + }
> + fr->fr_pgl =
> + ib_alloc_fast_reg_page_list(ia->ri_id->device,
> + RPCRDMA_MAX_SEGS);
> + if (IS_ERR(fr->fr_pgl)) {
> + rc = PTR_ERR(fr->fr_pgl);
> + printk("RPC: %s: "
> + "ib_alloc_fast_reg_page_list "
> + "failed %i\n", __func__, rc);
> + goto out;
> + }
> + INIT_LIST_HEAD(&fr->fr_list);
> + list_add(&fr->fr_list, &buf->rb_frs);
> + dprintk("RPC: %s alloc fmr %p pgl %p\n", __func__,
> + fr->fr_mr, fr->fr_pgl);
> + ++fr;
> + }
> + }
^urgh
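
A minimal sketch of the factoring being suggested here, assuming a
hypothetical helper named rpcrdma_init_frmrs(); the helper's name and
signature are illustrative only, not part of the patch, and the dprintk
and error unwinding are elided:

static int
rpcrdma_init_frmrs(struct rpcrdma_buffer *buf, struct rpcrdma_ia *ia,
		   char *p)
{
	/* the block-scoped cursor now lives in its own function */
	struct rpcrdma_frmr *fr = (struct rpcrdma_frmr *)p;
	int i;

	for (i = (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS; i; i--) {
		fr->fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd, RPCRDMA_MAX_SEGS);
		if (IS_ERR(fr->fr_mr))
			return PTR_ERR(fr->fr_mr);
		fr->fr_pgl = ib_alloc_fast_reg_page_list(ia->ri_id->device,
							 RPCRDMA_MAX_SEGS);
		if (IS_ERR(fr->fr_pgl))
			return PTR_ERR(fr->fr_pgl);
		INIT_LIST_HEAD(&fr->fr_list);
		list_add(&fr->fr_list, &buf->rb_frs);
		++fr;
	}
	return 0;
}

so that the corresponding case in rpcrdma_buffer_create() collapses to:

	case RPCRDMA_FASTREG:
		rc = rpcrdma_init_frmrs(buf, ia, p);
		if (rc)
			goto out;
		break;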

> [...]
> /* Registration using fast memory registration */
> + case RPCRDMA_FASTREG:
> + {
> + int len, pageoff = offset_in_page(seg->mr_offset);

Ditto...

> [...]


2008-08-13 22:31:15

by Chuck Lever

Subject: Re: [PATCH,RFC 02/02] xprtrdma: Update the RPC memory registration to use FRMR

On Aug 13, 2008, at 1:40 PM, Trond Myklebust wrote:
> On Wed, 2008-08-13 at 11:18 -0500, Tom Tucker wrote:
>> [...]
>> INIT_LIST_HEAD(&buf->rb_mws);
>> + INIT_LIST_HEAD(&buf->rb_frs);
>> switch (ia->ri_memreg_strategy) {
>> + case RPCRDMA_FASTREG:
>> + {
>
> Can we please get rid of this kind of ugliness whereby we insert
> blocks
> just in order to set up a temporary variable. I know Chuck is fond of
> it, but as far as I'm concerned it just screws up readability of the
> code.
> If you feel you need to isolate a variable to a particular block of
> code, then make a function, and that's particularly true of these huge
> switch statements that are trying to do 100 entirely unrelated
> things in
> one function.

Never fear, I have stopped the practice, and prefer to use a separate
function now. I agree that these are not very readable. The double
bracket at the end of such structures makes my eyes cross.

There are probably still some instances in my queued patch series, but
I've tried to cull them out before posting them.

--
Chuck Lever
chuck[dot]lever[at]oracle[dot]com

2008-08-13 22:35:25

by Tom Tucker

Subject: Re: [PATCH,RFC 02/02] xprtrdma: Update the RPC memory registration to use FRMR

Chuck Lever wrote:
> On Aug 13, 2008, at 1:40 PM, Trond Myklebust wrote:
>> On Wed, 2008-08-13 at 11:18 -0500, Tom Tucker wrote:
>>> [...]
>>> INIT_LIST_HEAD(&buf->rb_mws);
>>> + INIT_LIST_HEAD(&buf->rb_frs);
>>> switch (ia->ri_memreg_strategy) {
>>> + case RPCRDMA_FASTREG:
>>> + {
>>
>> Can we please get rid of this kind of ugliness whereby we insert blocks
>> just in order to set up a temporary variable. I know Chuck is fond of
>> it, but as far as I'm concerned it just screws up readability of the
>> code.
>> If you feel you need to isolate a variable to a particular block of
>> code, then make a function, and that's particularly true of these huge
>> switch statements that are trying to do 100 entirely unrelated things in
>> one function.
>
> Never fear, I have stopped the practice, and prefer to use a separate
> function now. I agree that these are not very readable. The double
> bracket at the end of such structures makes my eyes cross.
>
> There are probably still some instances in my queued patch series, but
> I've tried to cull them out before posting them.
>

I just copied the coding style of the function. So the proposal here is
to refactor this function? If yes, then I'll make it happen.

Tom



2008-08-13 22:40:14

by Trond Myklebust

Subject: Re: [PATCH,RFC 02/02] xprtrdma: Update the RPC memory registration to use FRMR

On Wed, 2008-08-13 at 17:35 -0500, Tom Tucker wrote:

> I just copied the coding style of the function. So the proposal here is
> to refactor this function? If yes, then I'll make it happen.

Please do, or please at least make sure that you don't repeat the
mistakes of the past when adding in new code.

Cheers
Trond