2022-09-23 13:08:18

by Chuck Lever III

[permalink] [raw]
Subject: [PATCH v1 0/6] xprtrdma deadlock avoidance

These patches attempt to reduce the possibility of a deadlock when
direct reclaim drives resource allocation in xprtrdma. Looking for
comments and review.

---

Chuck Lever (6):
svcrdma: Clean up RPCRDMA_DEF_GFP
xprtrdma: Clean up synopsis of rpcrdma_req_create()
xprtrdma: Clean up synopsis of rpcrdma_regbuf_alloc()
xprtrdma: MR-related memory allocation should be allowed to fail
xprtrdma: Memory allocation should be allowed to fail during connect
xprtrdma: Prevent memory allocations from driving a reclaim


net/sunrpc/xprtrdma/backchannel.c | 2 +-
net/sunrpc/xprtrdma/frwr_ops.c | 17 ++++-----
net/sunrpc/xprtrdma/svc_rdma_backchannel.c | 4 +--
net/sunrpc/xprtrdma/verbs.c | 42 +++++++++++-----------
net/sunrpc/xprtrdma/xprt_rdma.h | 10 ++++--
5 files changed, 38 insertions(+), 37 deletions(-)

--
Chuck Lever


2022-09-23 13:08:18

by Chuck Lever III

[permalink] [raw]
Subject: [PATCH v1 1/6] svcrdma: Clean up RPCRDMA_DEF_GFP

xprt_rdma_bc_allocate() is now the only user of RPCRDMA_DEF_GFP.
Replace that macro with the raw flags.

Signed-off-by: Chuck Lever <[email protected]>
---
net/sunrpc/xprtrdma/svc_rdma_backchannel.c | 4 ++--
net/sunrpc/xprtrdma/xprt_rdma.h | 2 --
2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
index 85c8cdda98b1..aa2227a7e552 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
@@ -119,12 +119,12 @@ xprt_rdma_bc_allocate(struct rpc_task *task)
return -EINVAL;
}

- page = alloc_page(RPCRDMA_DEF_GFP);
+ page = alloc_page(GFP_NOIO | __GFP_NOWARN);
if (!page)
return -ENOMEM;
rqst->rq_buffer = page_address(page);

- rqst->rq_rbuffer = kmalloc(rqst->rq_rcvsize, RPCRDMA_DEF_GFP);
+ rqst->rq_rbuffer = kmalloc(rqst->rq_rcvsize, GFP_NOIO | __GFP_NOWARN);
if (!rqst->rq_rbuffer) {
put_page(page);
return -ENOMEM;
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index c79f92eeda76..84b685c45555 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -149,8 +149,6 @@ static inline void *rdmab_data(const struct rpcrdma_regbuf *rb)
return rb->rg_data;
}

-#define RPCRDMA_DEF_GFP (GFP_NOIO | __GFP_NOWARN)
-
/* To ensure a transport can always make forward progress,
* the number of RDMA segments allowed in header chunk lists
* is capped at 16. This prevents less-capable devices from


2022-09-23 13:08:19

by Chuck Lever III

[permalink] [raw]
Subject: [PATCH v1 4/6] xprtrdma: MR-related memory allocation should be allowed to fail

xprtrdma always drives a retry of MR allocation if it should fail.
It should be safe to not use GFP_KERNEL for this purpose rather
than sleeping in the memory allocator.

In theory, if these weaker allocations are attempted first, memory
exhaustion is likely to cause xprtrdma to fail fast and not then
invoke the RDMA core APIs, which still might use GFP_KERNEL.

Also note that rpc_task_gfp_mask() always sets __GFP_NORETRY and
__GFP_NOWARN when an RPC-related allocation is being done in a
worker thread. MR allocation is already always done in worker
threads.

Signed-off-by: Chuck Lever <[email protected]>
---
net/sunrpc/xprtrdma/frwr_ops.c | 17 +++++++----------
net/sunrpc/xprtrdma/verbs.c | 5 ++++-
net/sunrpc/xprtrdma/xprt_rdma.h | 6 ++++++
3 files changed, 17 insertions(+), 11 deletions(-)

diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
index de0bdb6b729f..ce55361a822f 100644
--- a/net/sunrpc/xprtrdma/frwr_ops.c
+++ b/net/sunrpc/xprtrdma/frwr_ops.c
@@ -126,14 +126,15 @@ int frwr_mr_init(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr *mr)
struct ib_mr *frmr;
int rc;

+ sg = kcalloc_node(depth, sizeof(*sg), XPRTRDMA_GFP_FLAGS,
+ ibdev_to_node(ep->re_id->device));
+ if (!sg)
+ return -ENOMEM;
+
frmr = ib_alloc_mr(ep->re_pd, ep->re_mrtype, depth);
if (IS_ERR(frmr))
goto out_mr_err;

- sg = kmalloc_array(depth, sizeof(*sg), GFP_KERNEL);
- if (!sg)
- goto out_list_err;
-
mr->mr_xprt = r_xprt;
mr->mr_ibmr = frmr;
mr->mr_device = NULL;
@@ -146,13 +147,9 @@ int frwr_mr_init(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr *mr)
return 0;

out_mr_err:
- rc = PTR_ERR(frmr);
+ kfree(sg);
trace_xprtrdma_frwr_alloc(mr, rc);
- return rc;
-
-out_list_err:
- ib_dereg_mr(frmr);
- return -ENOMEM;
+ return PTR_ERR(frmr);
}

/**
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 8fb10fc72f69..4a7b87e9e47c 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -739,13 +739,16 @@ rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt)
{
struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
struct rpcrdma_ep *ep = r_xprt->rx_ep;
+ struct ib_device *device = ep->re_id->device;
unsigned int count;

+ /* Try to allocate enough to perform one full-sized I/O */
for (count = 0; count < ep->re_max_rdma_segs; count++) {
struct rpcrdma_mr *mr;
int rc;

- mr = kzalloc(sizeof(*mr), GFP_KERNEL);
+ mr = kzalloc_node(sizeof(*mr), XPRTRDMA_GFP_FLAGS,
+ ibdev_to_node(device));
if (!mr)
break;

diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index 227dce50cc4b..5e5ff6784ef5 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -149,6 +149,12 @@ static inline void *rdmab_data(const struct rpcrdma_regbuf *rb)
return rb->rg_data;
}

+/* Do not use emergency memory reserves, and fail quickly if memory
+ * cannot be allocated easily. These flags may be used wherever there
+ * is robust logic to handle a failure to allocate.
+ */
+#define XPRTRDMA_GFP_FLAGS (__GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN)
+
/* To ensure a transport can always make forward progress,
* the number of RDMA segments allowed in header chunk lists
* is capped at 16. This prevents less-capable devices from


2022-09-23 13:08:24

by Chuck Lever III

[permalink] [raw]
Subject: [PATCH v1 6/6] xprtrdma: Prevent memory allocations from driving a reclaim

Many memory allocations that xprtrdma does can fail safely. Let's
use this fact to avoid some potential deadlocks: Replace GFP_KERNEL
with GFP flags that do not try hard to acquire memory.

Signed-off-by: Chuck Lever <[email protected]>
---
net/sunrpc/xprtrdma/verbs.c | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 7ca58cb65e27..44b87e4274b4 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -811,7 +811,7 @@ struct rpcrdma_req *rpcrdma_req_create(struct rpcrdma_xprt *r_xprt,
struct rpcrdma_buffer *buffer = &r_xprt->rx_buf;
struct rpcrdma_req *req;

- req = kzalloc(sizeof(*req), GFP_KERNEL);
+ req = kzalloc(sizeof(*req), XPRTRDMA_GFP_FLAGS);
if (req == NULL)
goto out1;

@@ -926,7 +926,7 @@ struct rpcrdma_rep *rpcrdma_rep_create(struct rpcrdma_xprt *r_xprt,
struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
struct rpcrdma_rep *rep;

- rep = kzalloc(sizeof(*rep), GFP_KERNEL);
+ rep = kzalloc(sizeof(*rep), XPRTRDMA_GFP_FLAGS);
if (rep == NULL)
goto out;

@@ -1236,10 +1236,10 @@ rpcrdma_regbuf_alloc(size_t size, enum dma_data_direction direction)
{
struct rpcrdma_regbuf *rb;

- rb = kmalloc(sizeof(*rb), GFP_KERNEL);
+ rb = kmalloc(sizeof(*rb), XPRTRDMA_GFP_FLAGS);
if (!rb)
return NULL;
- rb->rg_data = kmalloc(size, GFP_KERNEL);
+ rb->rg_data = kmalloc(size, XPRTRDMA_GFP_FLAGS);
if (!rb->rg_data) {
kfree(rb);
return NULL;