2024-04-29 15:28:43

by Chuck Lever

Subject: [RFC PATCH 0/4] NFS: Fix another 'check_flush_dependency' splat

From: Chuck Lever <[email protected]>

Avoid getting work queue splats in the system journal by moving
client-side RPC/RDMA transport tear-down into a background process.

I've done some testing of this series and am now looking for review
comments.

Chuck Lever (4):
xprtrdma: Remove temp allocation of rpcrdma_rep objects
xprtrdma: Clean up synopsis of frwr_mr_unmap()
xprtrdma: Delay releasing connection hardware resources
xprtrdma: Move MRs to struct rpcrdma_ep

net/sunrpc/xprtrdma/frwr_ops.c | 13 ++-
net/sunrpc/xprtrdma/rpc_rdma.c | 3 +-
net/sunrpc/xprtrdma/transport.c | 20 +++-
net/sunrpc/xprtrdma/verbs.c | 173 ++++++++++++++++----------------
net/sunrpc/xprtrdma/xprt_rdma.h | 21 ++--
5 files changed, 125 insertions(+), 105 deletions(-)


base-commit: e67572cd2204894179d89bd7b984072f19313b03
--
2.44.0



2024-04-29 15:28:51

by Chuck Lever

Subject: [RFC PATCH 2/4] xprtrdma: Clean up synopsis of frwr_mr_unmap()

From: Chuck Lever <[email protected]>

Commit 7a03aeb66c41 ("xprtrdma: Micro-optimize MR DMA-unmapping")
removed the last use of the @r_xprt parameter in this function, but
neglected to remove the parameter itself.

Signed-off-by: Chuck Lever <[email protected]>
---
net/sunrpc/xprtrdma/frwr_ops.c | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
index ffbf99894970..6e508708d06d 100644
--- a/net/sunrpc/xprtrdma/frwr_ops.c
+++ b/net/sunrpc/xprtrdma/frwr_ops.c
@@ -54,7 +54,7 @@ static void frwr_cid_init(struct rpcrdma_ep *ep,
cid->ci_completion_id = mr->mr_ibmr->res.id;
}

-static void frwr_mr_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr *mr)
+static void frwr_mr_unmap(struct rpcrdma_mr *mr)
{
if (mr->mr_device) {
trace_xprtrdma_mr_unmap(mr);
@@ -73,7 +73,7 @@ void frwr_mr_release(struct rpcrdma_mr *mr)
{
int rc;

- frwr_mr_unmap(mr->mr_xprt, mr);
+ frwr_mr_unmap(mr);

rc = ib_dereg_mr(mr->mr_ibmr);
if (rc)
@@ -84,7 +84,7 @@ void frwr_mr_release(struct rpcrdma_mr *mr)

static void frwr_mr_put(struct rpcrdma_mr *mr)
{
- frwr_mr_unmap(mr->mr_xprt, mr);
+ frwr_mr_unmap(mr);

/* The MR is returned to the req's MR free list instead
* of to the xprt's MR free list. No spinlock is needed.
--
2.44.0


2024-04-29 15:29:12

by Chuck Lever

Subject: [RFC PATCH 4/4] xprtrdma: Move MRs to struct rpcrdma_ep

From: Chuck Lever <[email protected]>

MRs are a connection-specific hardware resource; thus they should be
anchored in the EP and released along with the EP's other hardware
resources. Releasing them there also guarantees that every MR is
deregistered while its PD is still valid.
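
In outline (a condensed sketch of the diffs below, not complete
code), the MR pools move from struct rpcrdma_buffer into struct
rpcrdma_ep under a new per-EP lock:

	struct rpcrdma_ep {
		...
		spinlock_t		re_mr_lock;
		struct list_head	re_mrs;		/* free MRs */
		struct list_head	re_all_mrs;	/* every allocated MR */
		struct work_struct	re_refresh_worker;
		...
	};

With this arrangement, rpcrdma_ep_destroy(), running in the
delayed-release context added by the previous patch, calls
rpcrdma_mrs_destroy(ep) before it deallocates the PD.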

Closes: https://bugzilla.kernel.org/show_bug.cgi?id=218704
Signed-off-by: Chuck Lever <[email protected]>
---
net/sunrpc/xprtrdma/frwr_ops.c | 7 ++--
net/sunrpc/xprtrdma/verbs.c | 70 ++++++++++++++++-----------------
net/sunrpc/xprtrdma/xprt_rdma.h | 13 +++---
3 files changed, 45 insertions(+), 45 deletions(-)

diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
index 6e508708d06d..7e918753eec4 100644
--- a/net/sunrpc/xprtrdma/frwr_ops.c
+++ b/net/sunrpc/xprtrdma/frwr_ops.c
@@ -112,15 +112,14 @@ void frwr_reset(struct rpcrdma_req *req)

/**
* frwr_mr_init - Initialize one MR
- * @r_xprt: controlling transport instance
+ * @ep: controlling endpoint instance
* @mr: generic MR to prepare for FRWR
*
* Returns zero if successful. Otherwise a negative errno
* is returned.
*/
-int frwr_mr_init(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr *mr)
+int frwr_mr_init(struct rpcrdma_ep *ep, struct rpcrdma_mr *mr)
{
- struct rpcrdma_ep *ep = r_xprt->rx_ep;
unsigned int depth = ep->re_max_fr_depth;
struct scatterlist *sg;
struct ib_mr *frmr;
@@ -134,7 +133,7 @@ int frwr_mr_init(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr *mr)
if (IS_ERR(frmr))
goto out_mr_err;

- mr->mr_xprt = r_xprt;
+ mr->mr_ep = ep;
mr->mr_ibmr = frmr;
mr->mr_device = NULL;
INIT_LIST_HEAD(&mr->mr_list);
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index f1e4a28325fa..2578d9e77056 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -71,7 +71,8 @@ static int rpcrdma_reqs_setup(struct rpcrdma_xprt *r_xprt);
static void rpcrdma_reqs_reset(struct rpcrdma_xprt *r_xprt);
static void rpcrdma_reps_unmap(struct rpcrdma_xprt *r_xprt);
static void rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt);
-static void rpcrdma_mrs_destroy(struct rpcrdma_xprt *r_xprt);
+static void rpcrdma_mrs_destroy(struct rpcrdma_ep *ep);
+static void rpcrdma_mr_refresh_worker(struct work_struct *work);
static void rpcrdma_ep_get(struct rpcrdma_ep *ep);
static void rpcrdma_ep_put(struct rpcrdma_ep *ep);
static struct rpcrdma_regbuf *
@@ -337,6 +338,8 @@ static void rpcrdma_ep_destroy(struct work_struct *work)
{
struct rpcrdma_ep *ep = container_of(work, struct rpcrdma_ep, re_worker);

+ rpcrdma_mrs_destroy(ep);
+
if (ep->re_id->qp) {
rdma_destroy_qp(ep->re_id);
ep->re_id->qp = NULL;
@@ -393,6 +396,11 @@ static int rpcrdma_ep_create(struct rpcrdma_xprt *r_xprt)
ep->re_xprt = &r_xprt->rx_xprt;
kref_init(&ep->re_kref);

+ spin_lock_init(&ep->re_mr_lock);
+ INIT_WORK(&ep->re_refresh_worker, rpcrdma_mr_refresh_worker);
+ INIT_LIST_HEAD(&ep->re_mrs);
+ INIT_LIST_HEAD(&ep->re_all_mrs);
+
id = rpcrdma_create_id(r_xprt, ep);
if (IS_ERR(id)) {
kfree(ep);
@@ -575,7 +583,6 @@ void rpcrdma_xprt_disconnect(struct rpcrdma_xprt *r_xprt)
rpcrdma_xprt_drain(r_xprt);
rpcrdma_reps_unmap(r_xprt);
rpcrdma_reqs_reset(r_xprt);
- rpcrdma_mrs_destroy(r_xprt);
rpcrdma_sendctxs_destroy(r_xprt);

r_xprt->rx_ep = NULL;
@@ -749,7 +756,6 @@ static void rpcrdma_sendctx_put_locked(struct rpcrdma_xprt *r_xprt,
static void
rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt)
{
- struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
struct rpcrdma_ep *ep = r_xprt->rx_ep;
struct ib_device *device = ep->re_id->device;
unsigned int count;
@@ -764,16 +770,16 @@ rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt)
if (!mr)
break;

- rc = frwr_mr_init(r_xprt, mr);
+ rc = frwr_mr_init(ep, mr);
if (rc) {
kfree(mr);
break;
}

- spin_lock(&buf->rb_lock);
- rpcrdma_mr_push(mr, &buf->rb_mrs);
- list_add(&mr->mr_all, &buf->rb_all_mrs);
- spin_unlock(&buf->rb_lock);
+ spin_lock(&ep->re_mr_lock);
+ rpcrdma_mr_push(mr, &ep->re_mrs);
+ list_add(&mr->mr_all, &ep->re_all_mrs);
+ spin_unlock(&ep->re_mr_lock);
}

r_xprt->rx_stats.mrs_allocated += count;
@@ -783,10 +789,11 @@ rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt)
static void
rpcrdma_mr_refresh_worker(struct work_struct *work)
{
- struct rpcrdma_buffer *buf = container_of(work, struct rpcrdma_buffer,
- rb_refresh_worker);
- struct rpcrdma_xprt *r_xprt = container_of(buf, struct rpcrdma_xprt,
- rx_buf);
+ struct rpcrdma_ep *ep = container_of(work, struct rpcrdma_ep,
+ re_refresh_worker);
+ struct rpcrdma_xprt *r_xprt = container_of(ep->re_xprt,
+ struct rpcrdma_xprt,
+ rx_xprt);

rpcrdma_mrs_create(r_xprt);
xprt_write_space(&r_xprt->rx_xprt);
@@ -799,7 +806,6 @@ rpcrdma_mr_refresh_worker(struct work_struct *work)
*/
void rpcrdma_mrs_refresh(struct rpcrdma_xprt *r_xprt)
{
- struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
struct rpcrdma_ep *ep = r_xprt->rx_ep;

/* If there is no underlying connection, it's no use
@@ -807,7 +813,7 @@ void rpcrdma_mrs_refresh(struct rpcrdma_xprt *r_xprt)
*/
if (ep->re_connect_status != 1)
return;
- queue_work(system_highpri_wq, &buf->rb_refresh_worker);
+ queue_work(system_highpri_wq, &ep->re_refresh_worker);
}

/**
@@ -1044,9 +1050,6 @@ int rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt)

buf->rb_bc_srv_max_requests = 0;
spin_lock_init(&buf->rb_lock);
- INIT_LIST_HEAD(&buf->rb_mrs);
- INIT_LIST_HEAD(&buf->rb_all_mrs);
- INIT_WORK(&buf->rb_refresh_worker, rpcrdma_mr_refresh_worker);

INIT_LIST_HEAD(&buf->rb_send_bufs);
INIT_LIST_HEAD(&buf->rb_allreqs);
@@ -1085,11 +1088,11 @@ void rpcrdma_req_destroy(struct rpcrdma_req *req)
list_del(&req->rl_all);

while ((mr = rpcrdma_mr_pop(&req->rl_free_mrs))) {
- struct rpcrdma_buffer *buf = &mr->mr_xprt->rx_buf;
+ struct rpcrdma_ep *ep = mr->mr_ep;

- spin_lock(&buf->rb_lock);
+ spin_lock(&ep->re_mr_lock);
list_del(&mr->mr_all);
- spin_unlock(&buf->rb_lock);
+ spin_unlock(&ep->re_mr_lock);

frwr_mr_release(mr);
}
@@ -1102,31 +1105,28 @@ void rpcrdma_req_destroy(struct rpcrdma_req *req)

/**
* rpcrdma_mrs_destroy - Release all of a transport's MRs
- * @r_xprt: controlling transport instance
+ * @ep: controlling endpoint instance
*
- * Relies on caller holding the transport send lock to protect
- * removing mr->mr_list from req->rl_free_mrs safely.
*/
-static void rpcrdma_mrs_destroy(struct rpcrdma_xprt *r_xprt)
+static void rpcrdma_mrs_destroy(struct rpcrdma_ep *ep)
{
- struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
struct rpcrdma_mr *mr;

- cancel_work_sync(&buf->rb_refresh_worker);
+ cancel_work_sync(&ep->re_refresh_worker);

- spin_lock(&buf->rb_lock);
- while ((mr = list_first_entry_or_null(&buf->rb_all_mrs,
+ spin_lock(&ep->re_mr_lock);
+ while ((mr = list_first_entry_or_null(&ep->re_all_mrs,
struct rpcrdma_mr,
mr_all)) != NULL) {
list_del(&mr->mr_list);
list_del(&mr->mr_all);
- spin_unlock(&buf->rb_lock);
+ spin_unlock(&ep->re_mr_lock);

frwr_mr_release(mr);

- spin_lock(&buf->rb_lock);
+ spin_lock(&ep->re_mr_lock);
}
- spin_unlock(&buf->rb_lock);
+ spin_unlock(&ep->re_mr_lock);
}

/**
@@ -1162,12 +1162,12 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
struct rpcrdma_mr *
rpcrdma_mr_get(struct rpcrdma_xprt *r_xprt)
{
- struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
+ struct rpcrdma_ep *ep = r_xprt->rx_ep;
struct rpcrdma_mr *mr;

- spin_lock(&buf->rb_lock);
- mr = rpcrdma_mr_pop(&buf->rb_mrs);
- spin_unlock(&buf->rb_lock);
+ spin_lock(&ep->re_mr_lock);
+ mr = rpcrdma_mr_pop(&ep->re_mrs);
+ spin_unlock(&ep->re_mr_lock);
return mr;
}

diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index 048d2e329384..ce703b6e3b86 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -96,6 +96,11 @@ struct rpcrdma_ep {
unsigned int re_inline_send; /* negotiated */
unsigned int re_inline_recv; /* negotiated */

+ spinlock_t re_mr_lock;
+ struct list_head re_mrs;
+ struct list_head re_all_mrs;
+ struct work_struct re_refresh_worker;
+
atomic_t re_completion_ids;

char re_write_pad[XDR_UNIT];
@@ -253,7 +258,7 @@ struct rpcrdma_mr {
struct ib_reg_wr mr_regwr;
struct ib_send_wr mr_invwr;
};
- struct rpcrdma_xprt *mr_xprt;
+ struct rpcrdma_ep *mr_ep;
u32 mr_handle;
u32 mr_length;
u64 mr_offset;
@@ -365,7 +370,6 @@ rpcrdma_mr_pop(struct list_head *list)
struct rpcrdma_buffer {
spinlock_t rb_lock;
struct list_head rb_send_bufs;
- struct list_head rb_mrs;

unsigned long rb_sc_head;
unsigned long rb_sc_tail;
@@ -373,7 +377,6 @@ struct rpcrdma_buffer {
struct rpcrdma_sendctx **rb_sc_ctxs;

struct list_head rb_allreqs;
- struct list_head rb_all_mrs;
struct list_head rb_all_reps;

struct llist_head rb_free_reps;
@@ -383,8 +386,6 @@ struct rpcrdma_buffer {

u32 rb_bc_srv_max_requests;
u32 rb_bc_max_requests;
-
- struct work_struct rb_refresh_worker;
};

/*
@@ -533,7 +534,7 @@ rpcrdma_data_dir(bool writing)
*/
void frwr_reset(struct rpcrdma_req *req);
int frwr_query_device(struct rpcrdma_ep *ep, const struct ib_device *device);
-int frwr_mr_init(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr *mr);
+int frwr_mr_init(struct rpcrdma_ep *ep, struct rpcrdma_mr *mr);
void frwr_mr_release(struct rpcrdma_mr *mr);
struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt,
struct rpcrdma_mr_seg *seg,
--
2.44.0


2024-04-29 15:58:11

by Chuck Lever

Subject: [RFC PATCH 1/4] xprtrdma: Remove temp allocation of rpcrdma_rep objects

From: Chuck Lever <[email protected]>

The original code was designed so that most calls to
rpcrdma_rep_create() would occur on the NUMA node that the device
preferred. There are a few cases where that's not possible, so
those reps were marked as temporary and destroyed after one use.

However, the device (and thus its preferred node) is already
available in rpcrdma_rep_create(), so allocate every rep on that
node directly and retire the temporary-rep mechanism.
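
In outline (a condensed sketch of the hunks below, not complete
code), each rep's buffers are now allocated on that node:

	struct ib_device *device = ep->re_id->device;

	/* ibdev_to_node() reports the NUMA node closest to @device */
	rep->rr_rdmabuf = rpcrdma_regbuf_alloc_node(ep->re_inline_recv,
						    DMA_FROM_DEVICE,
						    ibdev_to_node(device));

rpcrdma_regbuf_alloc_node() simply passes the node through to
kmalloc_node() for both the regbuf and its data buffer.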

Signed-off-by: Chuck Lever <[email protected]>
---
net/sunrpc/xprtrdma/rpc_rdma.c | 3 +-
net/sunrpc/xprtrdma/verbs.c | 57 ++++++++++++++-------------------
net/sunrpc/xprtrdma/xprt_rdma.h | 3 +-
3 files changed, 26 insertions(+), 37 deletions(-)

diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index 190a4de239c8..1478c41c7e9d 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -1471,8 +1471,7 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *rep)
credits = 1; /* don't deadlock */
else if (credits > r_xprt->rx_ep->re_max_requests)
credits = r_xprt->rx_ep->re_max_requests;
- rpcrdma_post_recvs(r_xprt, credits + (buf->rb_bc_srv_max_requests << 1),
- false);
+ rpcrdma_post_recvs(r_xprt, credits + (buf->rb_bc_srv_max_requests << 1));
if (buf->rb_credits != credits)
rpcrdma_update_cwnd(r_xprt, credits);

diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 4f8d7efa469f..c6d9d94c28ba 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -69,13 +69,15 @@ static void rpcrdma_sendctx_put_locked(struct rpcrdma_xprt *r_xprt,
struct rpcrdma_sendctx *sc);
static int rpcrdma_reqs_setup(struct rpcrdma_xprt *r_xprt);
static void rpcrdma_reqs_reset(struct rpcrdma_xprt *r_xprt);
-static void rpcrdma_rep_destroy(struct rpcrdma_rep *rep);
static void rpcrdma_reps_unmap(struct rpcrdma_xprt *r_xprt);
static void rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt);
static void rpcrdma_mrs_destroy(struct rpcrdma_xprt *r_xprt);
static void rpcrdma_ep_get(struct rpcrdma_ep *ep);
static int rpcrdma_ep_put(struct rpcrdma_ep *ep);
static struct rpcrdma_regbuf *
+rpcrdma_regbuf_alloc_node(size_t size, enum dma_data_direction direction,
+ int node);
+static struct rpcrdma_regbuf *
rpcrdma_regbuf_alloc(size_t size, enum dma_data_direction direction);
static void rpcrdma_regbuf_dma_unmap(struct rpcrdma_regbuf *rb);
static void rpcrdma_regbuf_free(struct rpcrdma_regbuf *rb);
@@ -501,7 +503,7 @@ int rpcrdma_xprt_connect(struct rpcrdma_xprt *r_xprt)
* outstanding Receives.
*/
rpcrdma_ep_get(ep);
- rpcrdma_post_recvs(r_xprt, 1, true);
+ rpcrdma_post_recvs(r_xprt, 1);

rc = rdma_connect(ep->re_id, &ep->re_remote_cma);
if (rc)
@@ -920,18 +922,20 @@ static void rpcrdma_reqs_reset(struct rpcrdma_xprt *r_xprt)
}

static noinline
-struct rpcrdma_rep *rpcrdma_rep_create(struct rpcrdma_xprt *r_xprt,
- bool temp)
+struct rpcrdma_rep *rpcrdma_rep_create(struct rpcrdma_xprt *r_xprt)
{
struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
+ struct rpcrdma_ep *ep = r_xprt->rx_ep;
+ struct ib_device *device = ep->re_id->device;
struct rpcrdma_rep *rep;

rep = kzalloc(sizeof(*rep), XPRTRDMA_GFP_FLAGS);
if (rep == NULL)
goto out;

- rep->rr_rdmabuf = rpcrdma_regbuf_alloc(r_xprt->rx_ep->re_inline_recv,
- DMA_FROM_DEVICE);
+ rep->rr_rdmabuf = rpcrdma_regbuf_alloc_node(ep->re_inline_recv,
+ DMA_FROM_DEVICE,
+ ibdev_to_node(device));
if (!rep->rr_rdmabuf)
goto out_free;

@@ -946,7 +950,6 @@ struct rpcrdma_rep *rpcrdma_rep_create(struct rpcrdma_xprt *r_xprt,
rep->rr_recv_wr.wr_cqe = &rep->rr_cqe;
rep->rr_recv_wr.sg_list = &rep->rr_rdmabuf->rg_iov;
rep->rr_recv_wr.num_sge = 1;
- rep->rr_temp = temp;

spin_lock(&buf->rb_lock);
list_add(&rep->rr_all, &buf->rb_all_reps);
@@ -965,17 +968,6 @@ static void rpcrdma_rep_free(struct rpcrdma_rep *rep)
kfree(rep);
}

-static void rpcrdma_rep_destroy(struct rpcrdma_rep *rep)
-{
- struct rpcrdma_buffer *buf = &rep->rr_rxprt->rx_buf;
-
- spin_lock(&buf->rb_lock);
- list_del(&rep->rr_all);
- spin_unlock(&buf->rb_lock);
-
- rpcrdma_rep_free(rep);
-}
-
static struct rpcrdma_rep *rpcrdma_rep_get_locked(struct rpcrdma_buffer *buf)
{
struct llist_node *node;
@@ -1007,10 +999,8 @@ static void rpcrdma_reps_unmap(struct rpcrdma_xprt *r_xprt)
struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
struct rpcrdma_rep *rep;

- list_for_each_entry(rep, &buf->rb_all_reps, rr_all) {
+ list_for_each_entry(rep, &buf->rb_all_reps, rr_all)
rpcrdma_regbuf_dma_unmap(rep->rr_rdmabuf);
- rep->rr_temp = true; /* Mark this rep for destruction */
- }
}

static void rpcrdma_reps_destroy(struct rpcrdma_buffer *buf)
@@ -1227,14 +1217,15 @@ void rpcrdma_buffer_put(struct rpcrdma_buffer *buffers, struct rpcrdma_req *req)
* or Replies they may be registered externally via frwr_map.
*/
static struct rpcrdma_regbuf *
-rpcrdma_regbuf_alloc(size_t size, enum dma_data_direction direction)
+rpcrdma_regbuf_alloc_node(size_t size, enum dma_data_direction direction,
+ int node)
{
struct rpcrdma_regbuf *rb;

- rb = kmalloc(sizeof(*rb), XPRTRDMA_GFP_FLAGS);
+ rb = kmalloc_node(sizeof(*rb), XPRTRDMA_GFP_FLAGS, node);
if (!rb)
return NULL;
- rb->rg_data = kmalloc(size, XPRTRDMA_GFP_FLAGS);
+ rb->rg_data = kmalloc_node(size, XPRTRDMA_GFP_FLAGS, node);
if (!rb->rg_data) {
kfree(rb);
return NULL;
@@ -1246,6 +1237,12 @@ rpcrdma_regbuf_alloc(size_t size, enum dma_data_direction direction)
return rb;
}

+static struct rpcrdma_regbuf *
+rpcrdma_regbuf_alloc(size_t size, enum dma_data_direction direction)
+{
+ return rpcrdma_regbuf_alloc_node(size, direction, NUMA_NO_NODE);
+}
+
/**
* rpcrdma_regbuf_realloc - re-allocate a SEND/RECV buffer
* @rb: regbuf to reallocate
@@ -1323,10 +1320,9 @@ static void rpcrdma_regbuf_free(struct rpcrdma_regbuf *rb)
* rpcrdma_post_recvs - Refill the Receive Queue
* @r_xprt: controlling transport instance
* @needed: current credit grant
- * @temp: mark Receive buffers to be deleted after one use
*
*/
-void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, int needed, bool temp)
+void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, int needed)
{
struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
struct rpcrdma_ep *ep = r_xprt->rx_ep;
@@ -1340,8 +1336,7 @@ void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, int needed, bool temp)
if (likely(ep->re_receive_count > needed))
goto out;
needed -= ep->re_receive_count;
- if (!temp)
- needed += RPCRDMA_MAX_RECV_BATCH;
+ needed += RPCRDMA_MAX_RECV_BATCH;

if (atomic_inc_return(&ep->re_receiving) > 1)
goto out;
@@ -1350,12 +1345,8 @@ void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, int needed, bool temp)
wr = NULL;
while (needed) {
rep = rpcrdma_rep_get_locked(buf);
- if (rep && rep->rr_temp) {
- rpcrdma_rep_destroy(rep);
- continue;
- }
if (!rep)
- rep = rpcrdma_rep_create(r_xprt, temp);
+ rep = rpcrdma_rep_create(r_xprt);
if (!rep)
break;
if (!rpcrdma_regbuf_dma_map(r_xprt, rep->rr_rdmabuf)) {
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index da409450dfc0..08bda29ed953 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -198,7 +198,6 @@ struct rpcrdma_rep {
__be32 rr_proc;
int rr_wc_flags;
u32 rr_inv_rkey;
- bool rr_temp;
struct rpcrdma_regbuf *rr_rdmabuf;
struct rpcrdma_xprt *rr_rxprt;
struct rpc_rqst *rr_rqst;
@@ -466,7 +465,7 @@ void rpcrdma_flush_disconnect(struct rpcrdma_xprt *r_xprt, struct ib_wc *wc);
int rpcrdma_xprt_connect(struct rpcrdma_xprt *r_xprt);
void rpcrdma_xprt_disconnect(struct rpcrdma_xprt *r_xprt);

-void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, int needed, bool temp);
+void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, int needed);

/*
* Buffer calls - xprtrdma/verbs.c
--
2.44.0


2024-04-29 16:03:40

by Chuck Lever

Subject: [RFC PATCH 3/4] xprtrdma: Delay releasing connection hardware resources

From: Chuck Lever <[email protected]>

xprtiod_workqueue is a MEM_RECLAIM-enabled workqueue, but the RDMA
core API functions are not memory reclaim-safe. Commit 6b1eb3b22272
("SUNRPC: Replace the use of the xprtiod WQ in rpcrdma") partially
addressed this problem.

That commit handled the connect path, but not the disconnect path.
Thus a transport disconnect sometimes results in this splat:

workqueue: WQ_MEM_RECLAIM xprtiod:xprt_autoclose [sunrpc] is flushing !WQ_MEM_RECLAIM events_highpri:rpcrdma_mr_refresh_worker [rpcrdma]
WARNING: CPU: 1 PID: 20378 at kernel/workqueue.c:3728 check_flush_dependency+0x101/0x120

? check_flush_dependency+0x101/0x120
? report_bug+0x175/0x1a0
? handle_bug+0x44/0x90
? exc_invalid_op+0x1c/0x70
? asm_exc_invalid_op+0x1f/0x30
? __pfx_rpcrdma_mr_refresh_worker+0x10/0x10 [rpcrdma aefd3d1b298311368fa14fa93ae5fb3818c3aeac]
? check_flush_dependency+0x101/0x120
__flush_work.isra.0+0x20a/0x290
__cancel_work_sync+0x129/0x1c0
cancel_work_sync+0x14/0x20
rpcrdma_xprt_disconnect+0x229/0x3f0 [rpcrdma aefd3d1b298311368fa14fa93ae5fb3818c3aeac]
xprt_rdma_close+0x16/0x40 [rpcrdma aefd3d1b298311368fa14fa93ae5fb3818c3aeac]
xprt_autoclose+0x63/0x110 [sunrpc a04d701bce94b5a8fb541cafbe1a489d6b1ab5b3]
process_one_work+0x19e/0x3f0
worker_thread+0x340/0x510
? __pfx_worker_thread+0x10/0x10
kthread+0xf7/0x130
? __pfx_kthread+0x10/0x10
ret_from_fork+0x41/0x60
? __pfx_kthread+0x10/0x10
ret_from_fork_asm+0x1a/0x30

Create a context, invoked during transport disconnect, in which it
is safe to release resources that are not memory reclaim-safe. In
effect, an rpcrdma_ep is now released completely asynchronously.

Subsequent patches will move the release of transport resources into
this new context.
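
In outline (condensed from the diff below), the final kref_put()
no longer touches RDMA resources directly; it only schedules the
real tear-down on the new unbound, !WQ_MEM_RECLAIM workqueue:

	static void rpcrdma_ep_release(struct kref *kref)
	{
		struct rpcrdma_ep *ep = container_of(kref, struct rpcrdma_ep,
						     re_kref);

		INIT_WORK(&ep->re_worker, rpcrdma_ep_destroy);
		queue_work(rpcrdma_release_wq, &ep->re_worker);
	}

rpcrdma_ep_destroy() then runs where no MEM_RECLAIM workqueue will
ever flush it, so the RDMA core calls it makes (rdma_destroy_qp(),
ib_dealloc_pd(), rdma_destroy_id(), and so on) are safe.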

Link: https://bugzilla.kernel.org/show_bug.cgi?id=218704
Signed-off-by: Chuck Lever <[email protected]>
---
net/sunrpc/xprtrdma/transport.c | 20 +++++++++++++-
net/sunrpc/xprtrdma/verbs.c | 46 ++++++++++++++++++++-------------
net/sunrpc/xprtrdma/xprt_rdma.h | 5 +++-
3 files changed, 51 insertions(+), 20 deletions(-)

diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index 29b0562d62e7..237d78c1ec54 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -761,8 +761,12 @@ static struct xprt_class xprt_rdma = {
.netid = { "rdma", "rdma6", "" },
};

+struct workqueue_struct *rpcrdma_release_wq __read_mostly;
+
void xprt_rdma_cleanup(void)
{
+ struct workqueue_struct *wq;
+
#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
if (sunrpc_table_header) {
unregister_sysctl_table(sunrpc_table_header);
@@ -772,18 +776,32 @@ void xprt_rdma_cleanup(void)

xprt_unregister_transport(&xprt_rdma);
xprt_unregister_transport(&xprt_rdma_bc);
+
+ wq = rpcrdma_release_wq;
+ rpcrdma_release_wq = NULL;
+ destroy_workqueue(wq);
}

int xprt_rdma_init(void)
{
+ struct workqueue_struct *wq;
int rc;

+ /* provision a WQ that is always unbound and !mem_reclaim */
+ wq = alloc_workqueue("rpcrdma_release", WQ_UNBOUND, 0);
+ if (!wq)
+ return -ENOMEM;
+ rpcrdma_release_wq = wq;
+
rc = xprt_register_transport(&xprt_rdma);
- if (rc)
+ if (rc) {
+ destroy_workqueue(wq);
return rc;
+ }

rc = xprt_register_transport(&xprt_rdma_bc);
if (rc) {
+ destroy_workqueue(wq);
xprt_unregister_transport(&xprt_rdma);
return rc;
}
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index c6d9d94c28ba..f1e4a28325fa 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -73,7 +73,7 @@ static void rpcrdma_reps_unmap(struct rpcrdma_xprt *r_xprt);
static void rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt);
static void rpcrdma_mrs_destroy(struct rpcrdma_xprt *r_xprt);
static void rpcrdma_ep_get(struct rpcrdma_ep *ep);
-static int rpcrdma_ep_put(struct rpcrdma_ep *ep);
+static void rpcrdma_ep_put(struct rpcrdma_ep *ep);
static struct rpcrdma_regbuf *
rpcrdma_regbuf_alloc_node(size_t size, enum dma_data_direction direction,
int node);
@@ -234,15 +234,15 @@ rpcrdma_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event)
case RDMA_CM_EVENT_ROUTE_RESOLVED:
ep->re_async_rc = 0;
complete(&ep->re_done);
- return 0;
+ break;
case RDMA_CM_EVENT_ADDR_ERROR:
ep->re_async_rc = -EPROTO;
complete(&ep->re_done);
- return 0;
+ break;
case RDMA_CM_EVENT_ROUTE_ERROR:
ep->re_async_rc = -ENETUNREACH;
complete(&ep->re_done);
- return 0;
+ break;
case RDMA_CM_EVENT_DEVICE_REMOVAL:
pr_info("rpcrdma: removing device %s for %pISpc\n",
ep->re_id->device->name, sap);
@@ -269,12 +269,13 @@ rpcrdma_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event)
ep->re_connect_status = -ENOTCONN;
wake_connect_worker:
wake_up_all(&ep->re_connect_wait);
- return 0;
+ break;
case RDMA_CM_EVENT_DISCONNECTED:
ep->re_connect_status = -ECONNABORTED;
disconnected:
rpcrdma_force_disconnect(ep);
- return rpcrdma_ep_put(ep);
+ rpcrdma_ep_put(ep);
+ fallthrough;
default:
break;
}
@@ -328,9 +329,13 @@ static struct rdma_cm_id *rpcrdma_create_id(struct rpcrdma_xprt *r_xprt,
return ERR_PTR(rc);
}

-static void rpcrdma_ep_destroy(struct kref *kref)
+/* Delayed release of a connection's hardware resources. Releasing
+ * RDMA hardware resources is done in a !MEM_RECLAIM context because
+ * the RDMA core API functions are generally not reclaim-safe.
+ */
+static void rpcrdma_ep_destroy(struct work_struct *work)
{
- struct rpcrdma_ep *ep = container_of(kref, struct rpcrdma_ep, re_kref);
+ struct rpcrdma_ep *ep = container_of(work, struct rpcrdma_ep, re_worker);

if (ep->re_id->qp) {
rdma_destroy_qp(ep->re_id);
@@ -348,22 +353,30 @@ static void rpcrdma_ep_destroy(struct kref *kref)
ib_dealloc_pd(ep->re_pd);
ep->re_pd = NULL;

+ if (ep->re_id)
+ rdma_destroy_id(ep->re_id);
+ ep->re_id = NULL;
+
kfree(ep);
module_put(THIS_MODULE);
}

+static void rpcrdma_ep_release(struct kref *kref)
+{
+ struct rpcrdma_ep *ep = container_of(kref, struct rpcrdma_ep, re_kref);
+
+ INIT_WORK(&ep->re_worker, rpcrdma_ep_destroy);
+ queue_work(rpcrdma_release_wq, &ep->re_worker);
+}
+
static noinline void rpcrdma_ep_get(struct rpcrdma_ep *ep)
{
kref_get(&ep->re_kref);
}

-/* Returns:
- * %0 if @ep still has a positive kref count, or
- * %1 if @ep was destroyed successfully.
- */
-static noinline int rpcrdma_ep_put(struct rpcrdma_ep *ep)
+static noinline void rpcrdma_ep_put(struct rpcrdma_ep *ep)
{
- return kref_put(&ep->re_kref, rpcrdma_ep_destroy);
+ kref_put(&ep->re_kref, rpcrdma_ep_release);
}

static int rpcrdma_ep_create(struct rpcrdma_xprt *r_xprt)
@@ -475,7 +488,6 @@ static int rpcrdma_ep_create(struct rpcrdma_xprt *r_xprt)

out_destroy:
rpcrdma_ep_put(ep);
- rdma_destroy_id(id);
return rc;
}

@@ -566,10 +578,8 @@ void rpcrdma_xprt_disconnect(struct rpcrdma_xprt *r_xprt)
rpcrdma_mrs_destroy(r_xprt);
rpcrdma_sendctxs_destroy(r_xprt);

- if (rpcrdma_ep_put(ep))
- rdma_destroy_id(id);
-
r_xprt->rx_ep = NULL;
+ rpcrdma_ep_put(ep);
}

/* Fixed-size circular FIFO queue. This implementation is wait-free and
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index 08bda29ed953..048d2e329384 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -70,7 +70,6 @@
*/
struct rpcrdma_mr;
struct rpcrdma_ep {
- struct kref re_kref;
struct rdma_cm_id *re_id;
struct ib_pd *re_pd;
unsigned int re_max_rdma_segs;
@@ -100,6 +99,9 @@ struct rpcrdma_ep {
atomic_t re_completion_ids;

char re_write_pad[XDR_UNIT];
+
+ struct kref re_kref;
+ struct work_struct re_worker;
};

/* Pre-allocate extra Work Requests for handling reverse-direction
@@ -583,6 +585,7 @@ void xprt_rdma_format_addresses(struct rpc_xprt *xprt, struct sockaddr *sap);
void xprt_rdma_free_addresses(struct rpc_xprt *xprt);
void xprt_rdma_close(struct rpc_xprt *xprt);
void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq);
+extern struct workqueue_struct *rpcrdma_release_wq;
int xprt_rdma_init(void);
void xprt_rdma_cleanup(void);

--
2.44.0


2024-04-30 07:27:04

by Zhu Yanjun

Subject: Re: [RFC PATCH 0/4] NFS: Fix another 'check_flush_dependency' splat

On 29.04.24 17:25, [email protected] wrote:
> From: Chuck Lever <[email protected]>
>
> Avoid getting work queue splats in the system journal by moving
> client-side RPC/RDMA transport tear-down into a background process.
>
> I've done some testing of this series, now looking for review
> comments.

How can I test nfs && rdma? Can you provide some steps or tools?
I am interested in nfs && rdma.

Thanks,
Zhu Yanjun

>
> Chuck Lever (4):
> xprtrdma: Remove temp allocation of rpcrdma_rep objects
> xprtrdma: Clean up synopsis of frwr_mr_unmap()
> xprtrdma: Delay releasing connection hardware resources
> xprtrdma: Move MRs to struct rpcrdma_ep
>
> net/sunrpc/xprtrdma/frwr_ops.c | 13 ++-
> net/sunrpc/xprtrdma/rpc_rdma.c | 3 +-
> net/sunrpc/xprtrdma/transport.c | 20 +++-
> net/sunrpc/xprtrdma/verbs.c | 173 ++++++++++++++++----------------
> net/sunrpc/xprtrdma/xprt_rdma.h | 21 ++--
> 5 files changed, 125 insertions(+), 105 deletions(-)
>
>
> base-commit: e67572cd2204894179d89bd7b984072f19313b03


2024-04-30 13:42:46

by Chuck Lever III

Subject: Re: [RFC PATCH 0/4] NFS: Fix another 'check_flush_dependency' splat



> On Apr 30, 2024, at 3:26 AM, Zhu Yanjun <[email protected]> wrote:
>
> On 29.04.24 17:25, [email protected] wrote:
>> From: Chuck Lever <[email protected]>
>> Avoid getting work queue splats in the system journal by moving
>> client-side RPC/RDMA transport tear-down into a background process.
>> I've done some testing of this series, now looking for review
>> comments.
>
> How to make tests with nfs && rdma? Can you provide some steps or tools?

We are building NFS tests into kdevops:

https://github.com/linux-kdevops/kdevops.git

and there is a config option to use soft iWARP instead of TCP.

kdevops includes workflows for fstests, Mora's nfstest, the
git regression suite, and ltp, all of which we use regularly
to test the Linux NFS client and server implementations.


> I am interested in nfs && rdma.
>
> Thanks,
> Zhu Yanjun
>
>> Chuck Lever (4):
>> xprtrdma: Remove temp allocation of rpcrdma_rep objects
>> xprtrdma: Clean up synopsis of frwr_mr_unmap()
>> xprtrdma: Delay releasing connection hardware resources
>> xprtrdma: Move MRs to struct rpcrdma_ep
>> net/sunrpc/xprtrdma/frwr_ops.c | 13 ++-
>> net/sunrpc/xprtrdma/rpc_rdma.c | 3 +-
>> net/sunrpc/xprtrdma/transport.c | 20 +++-
>> net/sunrpc/xprtrdma/verbs.c | 173 ++++++++++++++++----------------
>> net/sunrpc/xprtrdma/xprt_rdma.h | 21 ++--
>> 5 files changed, 125 insertions(+), 105 deletions(-)
>> base-commit: e67572cd2204894179d89bd7b984072f19313b03
>

--
Chuck Lever


2024-04-30 13:58:39

by Zhu Yanjun

Subject: Re: [RFC PATCH 0/4] NFS: Fix another 'check_flush_dependency' splat


On 30.04.24 15:42, Chuck Lever III wrote:
>
>> On Apr 30, 2024, at 3:26 AM, Zhu Yanjun <[email protected]> wrote:
>>
>> On 29.04.24 17:25, [email protected] wrote:
>>> From: Chuck Lever <[email protected]>
>>> Avoid getting work queue splats in the system journal by moving
>>> client-side RPC/RDMA transport tear-down into a background process.
>>> I've done some testing of this series, now looking for review
>>> comments.
>> How to make tests with nfs && rdma? Can you provide some steps or tools?
> We are building NFS tests into kdevops:
>
> https://github.com/linux-kdevops/kdevops.git
>
> and there is a config option to use soft iWARP instead of TCP.

Thanks a lot, that is interesting. Have you run tests with RXE instead
of iWARP?

If yes, does nfs work well with RXE? I am just curious about nfs && RXE.

Normally nfs works over TCP. Here, nfs will use RDMA instead of TCP.

The most popular RDMA implementation is RoCEv2, which is based on the
UDP protocol.

So I am curious whether NFS can work well with RXE (the RoCEv2
emulation driver).

If a user wants to run nfs on production hosts, it is quite possible
that nfs will end up working over RoCEv2 (UDP).

Best Regards,

Zhu Yanjun

>
> kdevops includes workflows for fstests, Mora's nfstest, the
> git regression suite, and ltp, all of which we use regularly
> to test the Linux NFS client and server implementations.
>
>
>> I am interested in nfs && rdma.
>>
>> Thanks,
>> Zhu Yanjun
>>
>>> Chuck Lever (4):
>>> xprtrdma: Remove temp allocation of rpcrdma_rep objects
>>> xprtrdma: Clean up synopsis of frwr_mr_unmap()
>>> xprtrdma: Delay releasing connection hardware resources
>>> xprtrdma: Move MRs to struct rpcrdma_ep
>>> net/sunrpc/xprtrdma/frwr_ops.c | 13 ++-
>>> net/sunrpc/xprtrdma/rpc_rdma.c | 3 +-
>>> net/sunrpc/xprtrdma/transport.c | 20 +++-
>>> net/sunrpc/xprtrdma/verbs.c | 173 ++++++++++++++++----------------
>>> net/sunrpc/xprtrdma/xprt_rdma.h | 21 ++--
>>> 5 files changed, 125 insertions(+), 105 deletions(-)
>>> base-commit: e67572cd2204894179d89bd7b984072f19313b03
> --
> Chuck Lever
>
>
--
Best Regards,
Yanjun.Zhu


2024-04-30 14:25:54

by Chuck Lever III

Subject: Re: [RFC PATCH 0/4] NFS: Fix another 'check_flush_dependency' splat



> On Apr 30, 2024, at 9:58 AM, Zhu Yanjun <[email protected]> wrote:
>
>
> On 30.04.24 15:42, Chuck Lever III wrote:
>>
>>> On Apr 30, 2024, at 3:26 AM, Zhu Yanjun <[email protected]> wrote:
>>>
>>> On 29.04.24 17:25, [email protected] wrote:
>>>> From: Chuck Lever <[email protected]>
>>>> Avoid getting work queue splats in the system journal by moving
>>>> client-side RPC/RDMA transport tear-down into a background process.
>>>> I've done some testing of this series, now looking for review
>>>> comments.
>>> How to make tests with nfs && rdma? Can you provide some steps or tools?
>> We are building NFS tests into kdevops:
>>
>> https://github.com/linux-kdevops/kdevops.git
>>
>> and there is a config option to use soft iWARP instead of TCP.
>
> Thanks a lot. It is interesting. Have you made tests with RXE instead of iWARP?
>
> If yes, does nfs work well with RXE? I am just curious with nfs && RXE.
>
> Normally nfs works with TCP. Now nfs will use RDMA instead of TCP.
>
> The popular RDMA implementation is RoCEv2 which is based on UDP protocol.
>
> So I am curious if NFS can work well with RXE (RoCEv2 emulation driver) or not.
>
> If the user wants to use nfs in his production hosts, it is possible that nfs will work with RoCEv2 (UDP).

Yes, NFS/RDMA works with rxe and even with rxe mixed with
hardware RoCE. Someone else will have to step in and say
whether it works "well" since I don't use rxe, only CX-5
and newer on 100GbE.

Generally we use siw because our testing environment varies
between all systems on a single local network or hypervisor,
all the way up to NFS/RDMA on VPN and WAN. The rxe driver
doesn't support operation over tunnels, currently.

It is possible to add rxe as a second option in kdevops,
but siw has worked for our purposes so far, and the NFS
test matrix is already enormous.


> Best Regards,
>
> Zhu Yanjun
>
>> kdevops includes workflows for fstests, Mora's nfstest, the
>> git regression suite, and ltp, all of which we use regularly
>> to test the Linux NFS client and server implementations.
>>
>>
>>> I am interested in nfs && rdma.
>>>
>>> Thanks,
>>> Zhu Yanjun
>>>
>>>> Chuck Lever (4):
>>>> xprtrdma: Remove temp allocation of rpcrdma_rep objects
>>>> xprtrdma: Clean up synopsis of frwr_mr_unmap()
>>>> xprtrdma: Delay releasing connection hardware resources
>>>> xprtrdma: Move MRs to struct rpcrdma_ep
>>>> net/sunrpc/xprtrdma/frwr_ops.c | 13 ++-
>>>> net/sunrpc/xprtrdma/rpc_rdma.c | 3 +-
>>>> net/sunrpc/xprtrdma/transport.c | 20 +++-
>>>> net/sunrpc/xprtrdma/verbs.c | 173 ++++++++++++++++----------------
>>>> net/sunrpc/xprtrdma/xprt_rdma.h | 21 ++--
>>>> 5 files changed, 125 insertions(+), 105 deletions(-)
>>>> base-commit: e67572cd2204894179d89bd7b984072f19313b03
>> --
>> Chuck Lever
>>
>>
> --
> Best Regards,
> Yanjun.Zhu


--
Chuck Lever


2024-04-30 14:52:48

by Zhu Yanjun

Subject: Re: [RFC PATCH 0/4] NFS: Fix another 'check_flush_dependency' splat

On 30.04.24 16:13, Chuck Lever III wrote:
>
>
>> On Apr 30, 2024, at 9:58 AM, Zhu Yanjun <[email protected]> wrote:
>>
>>
>> On 30.04.24 15:42, Chuck Lever III wrote:
>>>
>>>> On Apr 30, 2024, at 3:26 AM, Zhu Yanjun <[email protected]> wrote:
>>>>
>>>> On 29.04.24 17:25, [email protected] wrote:
>>>>> From: Chuck Lever <[email protected]>
>>>>> Avoid getting work queue splats in the system journal by moving
>>>>> client-side RPC/RDMA transport tear-down into a background process.
>>>>> I've done some testing of this series, now looking for review
>>>>> comments.
>>>> How to make tests with nfs && rdma? Can you provide some steps or tools?
>>> We are building NFS tests into kdevops:
>>>
>>> https://github.com/linux-kdevops/kdevops.git
>>>
>>> and there is a config option to use soft iWARP instead of TCP.
>>
>> Thanks a lot. It is interesting. Have you made tests with RXE instead of iWARP?
>>
>> If yes, does nfs work well with RXE? I am just curious with nfs && RXE.
>>
>> Normally nfs works with TCP. Now nfs will use RDMA instead of TCP.
>>
>> The popular RDMA implementation is RoCEv2 which is based on UDP protocol.
>>
>> So I am curious if NFS can work well with RXE (RoCEv2 emulation driver) or not.
>>
>> If the user wants to use nfs in his production hosts, it is possible that nfs will work with RoCEv2 (UDP).
>
> Yes, NFS/RDMA works with rxe and even with rxe mixed with
> hardware RoCE. Someone else will have to step in and say
> whether it works "well" since I don't use rxe, only CX-5
> and newer on 100GbE.
>
> Generally we use siw because our testing environment varies
> between all systems on a single local network or hypervisor,
> all the way up to NFS/RDMA on VPN and WAN. The rxe driver
> doesn't support operation over tunnels, currently.

Thanks a lot. "The rxe driver doesn't support operation over tunnels,
currently." Do you mean that rxe can not work well with tun/tap device?

>
> It is possible to add rxe as a second option in kdevops,
> but siw has worked for our purposes so far, and the NFS
> test matrix is already enormous.

Thanks. If rxe can be added as a second option in kdevops, I will run
tests with kdevops to check whether rxe works well in future kernel
versions.

Best Regards,
Zhu Yanjun

>
>
>> Best Regards,
>>
>> Zhu Yanjun
>>
>>> kdevops includes workflows for fstests, Mora's nfstest, the
>>> git regression suite, and ltp, all of which we use regularly
>>> to test the Linux NFS client and server implementations.
>>>
>>>
>>>> I am interested in nfs && rdma.
>>>>
>>>> Thanks,
>>>> Zhu Yanjun
>>>>
>>>>> Chuck Lever (4):
>>>>> xprtrdma: Remove temp allocation of rpcrdma_rep objects
>>>>> xprtrdma: Clean up synopsis of frwr_mr_unmap()
>>>>> xprtrdma: Delay releasing connection hardware resources
>>>>> xprtrdma: Move MRs to struct rpcrdma_ep
>>>>> net/sunrpc/xprtrdma/frwr_ops.c | 13 ++-
>>>>> net/sunrpc/xprtrdma/rpc_rdma.c | 3 +-
>>>>> net/sunrpc/xprtrdma/transport.c | 20 +++-
>>>>> net/sunrpc/xprtrdma/verbs.c | 173 ++++++++++++++++----------------
>>>>> net/sunrpc/xprtrdma/xprt_rdma.h | 21 ++--
>>>>> 5 files changed, 125 insertions(+), 105 deletions(-)
>>>>> base-commit: e67572cd2204894179d89bd7b984072f19313b03
>>> --
>>> Chuck Lever
>>>
>>>
>> --
>> Best Regards,
>> Yanjun.Zhu
>
>
> --
> Chuck Lever
>
>


2024-04-30 15:02:00

by Chuck Lever III

Subject: Re: [RFC PATCH 0/4] NFS: Fix another 'check_flush_dependency' splat



> On Apr 30, 2024, at 10:45 AM, Zhu Yanjun <[email protected]> wrote:
>
> On 30.04.24 16:13, Chuck Lever III wrote:
>>> On Apr 30, 2024, at 9:58 AM, Zhu Yanjun <[email protected]> wrote:
>>>
>>>
>>> On 30.04.24 15:42, Chuck Lever III wrote:
>>>>
>>>>> On Apr 30, 2024, at 3:26 AM, Zhu Yanjun <[email protected]> wrote:
>>>>>
>>>>> On 29.04.24 17:25, [email protected] wrote:
>>>>>> From: Chuck Lever <[email protected]>
>>>>>> Avoid getting work queue splats in the system journal by moving
>>>>>> client-side RPC/RDMA transport tear-down into a background process.
>>>>>> I've done some testing of this series, now looking for review
>>>>>> comments.
>>>>> How to make tests with nfs && rdma? Can you provide some steps or tools?
>>>> We are building NFS tests into kdevops:
>>>>
>>>> https://github.com/linux-kdevops/kdevops.git
>>>>
>>>> and there is a config option to use soft iWARP instead of TCP.
>>>
>>> Thanks a lot. It is interesting. Have you made tests with RXE instead of iWARP?
>>>
>>> If yes, does nfs work well with RXE? I am just curious with nfs && RXE.
>>>
>>> Normally nfs works with TCP. Now nfs will use RDMA instead of TCP.
>>>
>>> The popular RDMA implementation is RoCEv2 which is based on UDP protocol.
>>>
>>> So I am curious if NFS can work well with RXE (RoCEv2 emulation driver) or not.
>>>
>>> If the user wants to use nfs in his production hosts, it is possible that nfs will work with RoCEv2 (UDP).
>> Yes, NFS/RDMA works with rxe and even with rxe mixed with
>> hardware RoCE. Someone else will have to step in and say
>> whether it works "well" since I don't use rxe, only CX-5
>> and newer on 100GbE.
>> Generally we use siw because our testing environment varies
>> between all systems on a single local network or hypervisor,
>> all the way up to NFS/RDMA on VPN and WAN. The rxe driver
>> doesn't support operation over tunnels, currently.
>
> Thanks a lot. "The rxe driver doesn't support operation over tunnels, currently." Do you mean that rxe can not work well with tun/tap device?

No, rxe cannot be configured to use tunnel devices, AFAIK.


>> It is possible to add rxe as a second option in kdevops,
>> but siw has worked for our purposes so far, and the NFS
>> test matrix is already enormous.
>
> Thanks. If rxe can be as a second option in kdevops, I will make tests with kdevops to check rxe work well or not in the future kernel version.

No new tests are necessary. The only thing missing right
now is the ability to set up rxe devices on all the test
systems.


> Best Regards,
> Zhu Yanjun
>
>>> Best Regards,
>>>
>>> Zhu Yanjun
>>>
>>>> kdevops includes workflows for fstests, Mora's nfstest, the
>>>> git regression suite, and ltp, all of which we use regularly
>>>> to test the Linux NFS client and server implementations.
>>>>
>>>>
>>>>> I am interested in nfs && rdma.
>>>>>
>>>>> Thanks,
>>>>> Zhu Yanjun
>>>>>
>>>>>> Chuck Lever (4):
>>>>>> xprtrdma: Remove temp allocation of rpcrdma_rep objects
>>>>>> xprtrdma: Clean up synopsis of frwr_mr_unmap()
>>>>>> xprtrdma: Delay releasing connection hardware resources
>>>>>> xprtrdma: Move MRs to struct rpcrdma_ep
>>>>>> net/sunrpc/xprtrdma/frwr_ops.c | 13 ++-
>>>>>> net/sunrpc/xprtrdma/rpc_rdma.c | 3 +-
>>>>>> net/sunrpc/xprtrdma/transport.c | 20 +++-
>>>>>> net/sunrpc/xprtrdma/verbs.c | 173 ++++++++++++++++----------------
>>>>>> net/sunrpc/xprtrdma/xprt_rdma.h | 21 ++--
>>>>>> 5 files changed, 125 insertions(+), 105 deletions(-)
>>>>>> base-commit: e67572cd2204894179d89bd7b984072f19313b03
>>>> --
>>>> Chuck Lever
>>>>
>>>>
>>> --
>>> Best Regards,
>>> Yanjun.Zhu
>> --
>> Chuck Lever


--
Chuck Lever


2024-04-30 15:20:36

by Zhu Yanjun

Subject: Re: [RFC PATCH 0/4] NFS: Fix another 'check_flush_dependency' splat


On 30.04.24 16:52, Chuck Lever III wrote:
>
>> On Apr 30, 2024, at 10:45 AM, Zhu Yanjun <[email protected]> wrote:
>>
>> On 30.04.24 16:13, Chuck Lever III wrote:
>>>> On Apr 30, 2024, at 9:58 AM, Zhu Yanjun <[email protected]> wrote:
>>>>
>>>>
>>>> On 30.04.24 15:42, Chuck Lever III wrote:
>>>>>> On Apr 30, 2024, at 3:26 AM, Zhu Yanjun <[email protected]> wrote:
>>>>>>
>>>>>> On 29.04.24 17:25, [email protected] wrote:
>>>>>>> From: Chuck Lever <[email protected]>
>>>>>>> Avoid getting work queue splats in the system journal by moving
>>>>>>> client-side RPC/RDMA transport tear-down into a background process.
>>>>>>> I've done some testing of this series, now looking for review
>>>>>>> comments.
>>>>>> How to make tests with nfs && rdma? Can you provide some steps or tools?
>>>>> We are building NFS tests into kdevops:
>>>>>
>>>>> https://github.com/linux-kdevops/kdevops.git
>>>>>
>>>>> and there is a config option to use soft iWARP instead of TCP.
>>>> Thanks a lot. It is interesting. Have you made tests with RXE instead of iWARP?
>>>>
>>>> If yes, does nfs work well with RXE? I am just curious with nfs && RXE.
>>>>
>>>> Normally nfs works with TCP. Now nfs will use RDMA instead of TCP.
>>>>
>>>> The popular RDMA implementation is RoCEv2 which is based on UDP protocol.
>>>>
>>>> So I am curious if NFS can work well with RXE (RoCEv2 emulation driver) or not.
>>>>
>>>> If the user wants to use nfs in his production hosts, it is possible that nfs will work with RoCEv2 (UDP).
>>> Yes, NFS/RDMA works with rxe and even with rxe mixed with
>>> hardware RoCE. Someone else will have to step in and say
>>> whether it works "well" since I don't use rxe, only CX-5
>>> and newer on 100GbE.
>>> Generally we use siw because our testing environment varies
>>> between all systems on a single local network or hypervisor,
>>> all the way up to NFS/RDMA on VPN and WAN. The rxe driver
>>> doesn't support operation over tunnels, currently.
>> Thanks a lot. "The rxe driver doesn't support operation over tunnels, currently." Do you mean that rxe can not work well with tun/tap device?
> No, rxe cannot be configured to use tunnel devices, AFAIK.
>
>
>>> It is possible to add rxe as a second option in kdevops,
>>> but siw has worked for our purposes so far, and the NFS
>>> test matrix is already enormous.
>> Thanks. If rxe can be as a second option in kdevops, I will make tests with kdevops to check rxe work well or not in the future kernel version.
> No new tests are necessary. The only thing missing right
> now is the ability to set up rxe devices on all the test
> systems.

Got it. Thanks.

Zhu Yanjun

>
>
>> Best Regards,
>> Zhu Yanjun
>>
>>>> Best Regards,
>>>>
>>>> Zhu Yanjun
>>>>
>>>>> kdevops includes workflows for fstests, Mora's nfstest, the
>>>>> git regression suite, and ltp, all of which we use regularly
>>>>> to test the Linux NFS client and server implementations.
>>>>>
>>>>>
>>>>>> I am interested in nfs && rdma.
>>>>>>
>>>>>> Thanks,
>>>>>> Zhu Yanjun
>>>>>>
>>>>>>> Chuck Lever (4):
>>>>>>> xprtrdma: Remove temp allocation of rpcrdma_rep objects
>>>>>>> xprtrdma: Clean up synopsis of frwr_mr_unmap()
>>>>>>> xprtrdma: Delay releasing connection hardware resources
>>>>>>> xprtrdma: Move MRs to struct rpcrdma_ep
>>>>>>> net/sunrpc/xprtrdma/frwr_ops.c | 13 ++-
>>>>>>> net/sunrpc/xprtrdma/rpc_rdma.c | 3 +-
>>>>>>> net/sunrpc/xprtrdma/transport.c | 20 +++-
>>>>>>> net/sunrpc/xprtrdma/verbs.c | 173 ++++++++++++++++----------------
>>>>>>> net/sunrpc/xprtrdma/xprt_rdma.h | 21 ++--
>>>>>>> 5 files changed, 125 insertions(+), 105 deletions(-)
>>>>>>> base-commit: e67572cd2204894179d89bd7b984072f19313b03
>>>>> --
>>>>> Chuck Lever
>>>>>
>>>>>
>>>> --
>>>> Best Regards,
>>>> Yanjun.Zhu
>>> --
>>> Chuck Lever
>
> --
> Chuck Lever
>
>
--
Best Regards,
Yanjun.Zhu


2024-06-02 18:14:34

by Zhu Yanjun

Subject: Re: [RFC PATCH 0/4] NFS: Fix another 'check_flush_dependency' splat

On Sun, Jun 2, 2024 at 5:40 PM Chuck Lever III <[email protected]> wrote:
>
>
> > On Apr 30, 2024, at 10:45 AM, Zhu Yanjun <[email protected]> wrote:
> >
> > On 30.04.24 16:13, Chuck Lever III wrote:
> >> It is possible to add rxe as a second option in kdevops,
> >> but siw has worked for our purposes so far, and the NFS
> >> test matrix is already enormous.
> >
> > Thanks. If rxe can be as a second option in kdevops, I will make tests with kdevops to check rxe work well or not in the future kernel version.
>
> As per our recent discussion, I have added rxe as a second
> software RDMA option in kdevops. Proof of concept:

Thanks a lot. I am very glad to know that rxe is now a second
software RDMA option in kdevops.
I also checked the commit related to this feature. It is quite
complicated and large. I hope rxe works well in kdevops,
so that I can also use kdevops to verify rxe and the RDMA subsystem.
Thanks a lot for your efforts.

>
> https://github.com/chucklever/kdevops/tree/add-rxe-support
>
> But basic rping testing is not working (with 6.10-rc1 kernels)
> in this set-up. It's missing something...

Just now I ran tests with the latest rdma-core (rping is included in
rdma-core) and a 6.10-rc1 kernel. rping works well.

rping is the usual basic tool for verifying whether rxe works. If
rping does not work, I normally do the following:
1. rping -s -a 127.0.0.1
rping -c -a 127.0.0.1 -C 3 -d -v
This verifies whether rxe is configured correctly.
2. ping -c 3 server_ip on the client host.
This verifies whether the client host can reach the server host.
3. rping -s -a server_ip
rping -c -a server_ip -C 3 -d -v
If this step fails:
1) shut down the firewall
2) run tcpdump -ni xxxx to capture the UDP packets
The above steps can normally isolate errors on the rxe client or
server. I hope they help to find the errors here.

Zhu Yanjun

>
> --
> Chuck Lever
>
>

2024-06-03 16:54:32

by Zhu Yanjun

Subject: Re: [RFC PATCH 0/4] NFS: Fix another 'check_flush_dependency' splat

On Mon, Jun 3, 2024 at 5:59 PM Chuck Lever III <[email protected]> wrote:
>
>
>
> > On Jun 2, 2024, at 2:14 PM, Zhu Yanjun <[email protected]> wrote:
> >
> > On Sun, Jun 2, 2024 at 5:40 PM Chuck Lever III <[email protected]> wrote:
> >>
> >>
> >>> On Apr 30, 2024, at 10:45 AM, Zhu Yanjun <[email protected]> wrote:
> >>>
> >>> On 30.04.24 16:13, Chuck Lever III wrote:
> >>>> It is possible to add rxe as a second option in kdevops,
> >>>> but siw has worked for our purposes so far, and the NFS
> >>>> test matrix is already enormous.
> >>>
> >>> Thanks. If rxe can be as a second option in kdevops, I will make tests with kdevops to check rxe work well or not in the future kernel version.
> >>
> >> As per our recent discussion, I have added rxe as a second
> >> software RDMA option in kdevops. Proof of concept:
> >
> > Thanks a lot. I am very glad to know that rxe is treated as a second
> > software RDMA option in kdeops.
> > And I also checked the commit related with this feature. It is very
> > complicated and huge.
>
> I split this into four smaller patches, HTH.
>
>
> > I hope rxe can work well in kdeops.
> > So I can also use kdeops to verify rxe and rdma subsystems. Thanks a
> > lot your efforts.
> >
> >>
> >> https://github.com/chucklever/kdevops/tree/add-rxe-support
> >>
> >> But basic rping testing is not working (with 6.10-rc1 kernels)
> >> in this set-up. It's missing something...
> >
> > Just now I made tests with the latest rdma-core (rping is included in
> > rdma-core) and 6.10-rc1 kernels. rping can work well.
> >
> > Normally rping works as a basic tool to verify if rxe works well or
> > not. If rping can not work well, normally I will do the followings:
> > 1. rping -s -a 127.0.0.1
> > rping -c -a 127.0.0.1 -C 3 -d -v
> > This will verify whether rxe is configured correctly or not.
>
> I don't have rxe set up on loopback, so I substituted the host's
> configured Ethernet IP.
>
> The tests works on the NFS server, but the rping client hangs
> on the NFS client (both running v6.10-rc1).
>
> I rebooted in to the Fedora 39 stock kernel, and the rping tests
> pass.
>
> However, when I try to run fstests with NFS/RDMA using rxe, the
> client kernel reports a soft CPU lock-up, and top shows this:
>
> 115 root 20 0 0 0 0 R 99.3 0.0 1:03.50 kworker/u8:5+rxe_wq

rxe_wq was introduced by commit 9b4b7c1f9f54 ("RDMA/rxe: Add
workqueue support for rxe tasks"), which was merged as of
v6.4-rc2-1-g9b4b7c1f9f54.

The Fedora 39 stock kernel is 6.5, so some commit between 6.5 and
6.10 may have introduced this problem.

>
> So I think this is enough to show that the Ansible parts of this
> change are working as expected. I can push this to kdevops now
> if there are no objections, and someone (maybe you, maybe me) can
> sort out the rxe specific issues later.

Thanks. Once I can reproduce this problem on my local host, I will be
very glad to delve into it. It may take me a long time, though, since
I do not have a good host on which to deploy kdevops.

To be honest, "git bisect" can probably find the commit that
introduced this problem. If you can find the commit, we can fix it
very quickly^_^

Thanks,
Zhu Yanjun

>
>
> > 2. ping -c 3 server_ip on client host.
> > This will verify whether the client host can connect to the server
> > host or not.
> > 3. rping -s -a server_ip
> > rping -c -a server_ip -C 3 -d -v
> > 1) shutdown firewall
> > 2) tcpdump -ni xxxx to capture udp packets
> > Normally the above steps can find out the errors in rxe client/server.
> > Hope the above can help to find out the errors.
> >
> > Zhu Yanjun
> >
> >>
> >> --
> >> Chuck Lever
> >>
> >>
>
> --
> Chuck Lever
>
>