From: Chuck Lever <[email protected]>
Avoid FastReg operations getting MW_BIND_ERR after a reconnect.
rpcrdma_reqs_reset() is called on transport tear-down to get each
rpcrdma_req back into a clean state.
MRs on req->rl_registered are waiting for a FastReg, are already
registered, or are waiting for invalidation. If the transport is
being torn down when reqs_reset() is called, the matching LocalInv
might never be posted. That leaves these MR registered /and/ on
req->rl_free_mrs, where they can be re-used for the next
connection.
Since xprtrdma does not keep specific track of the MR state, it's
not possible to know what state these MRs are in, so the only safe
thing to do is release them immediately.
Fixes: 5de55ce951a1 ("xprtrdma: Release in-flight MRs on disconnect")
Signed-off-by: Chuck Lever <[email protected]>
---
net/sunrpc/xprtrdma/frwr_ops.c | 3 ++-
net/sunrpc/xprtrdma/verbs.c | 16 +++++++++++++++-
2 files changed, 17 insertions(+), 2 deletions(-)
diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
index ffbf99894970..47f33bb7bff8 100644
--- a/net/sunrpc/xprtrdma/frwr_ops.c
+++ b/net/sunrpc/xprtrdma/frwr_ops.c
@@ -92,7 +92,8 @@ static void frwr_mr_put(struct rpcrdma_mr *mr)
rpcrdma_mr_push(mr, &mr->mr_req->rl_free_mrs);
}
-/* frwr_reset - Place MRs back on the free list
+/**
+ * frwr_reset - Place MRs back on @req's free list
* @req: request to reset
*
* Used after a failed marshal. For FRWR, this means the MRs
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 432557a553e7..a0b071089e15 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -897,6 +897,8 @@ static int rpcrdma_reqs_setup(struct rpcrdma_xprt *r_xprt)
static void rpcrdma_req_reset(struct rpcrdma_req *req)
{
+ struct rpcrdma_mr *mr;
+
/* Credits are valid for only one connection */
req->rl_slot.rq_cong = 0;
@@ -906,7 +908,19 @@ static void rpcrdma_req_reset(struct rpcrdma_req *req)
rpcrdma_regbuf_dma_unmap(req->rl_sendbuf);
rpcrdma_regbuf_dma_unmap(req->rl_recvbuf);
- frwr_reset(req);
+ /* The verbs consumer can't know the state of an MR on the
+ * req->rl_registered list unless a successful completion
+ * has occurred, so they cannot be re-used.
+ */
+ while ((mr = rpcrdma_mr_pop(&req->rl_registered))) {
+ struct rpcrdma_buffer *buf = &mr->mr_xprt->rx_buf;
+
+ spin_lock(&buf->rb_lock);
+ list_del(&mr->mr_all);
+ spin_unlock(&buf->rb_lock);
+
+ frwr_mr_release(mr);
+ }
}
/* ASSUMPTION: the rb_allreqs list is stable for the duration,
--
2.45.1
From: Chuck Lever <[email protected]>
Commit e87a911fed07 ("nvme-rdma: use ib_client API to detect device
removal") explains the benefits of handling device removal outside
of the CM event handler.
Sketch in an IB device removal notification mechanism that can be
used by both the client and server side RPC-over-RDMA transport
implementations.
Suggested-by: Sagi Grimberg <[email protected]>
Signed-off-by: Chuck Lever <[email protected]>
---
include/linux/sunrpc/rdma_rn.h | 27 +++++
include/trace/events/rpcrdma.h | 34 ++++++
net/sunrpc/xprtrdma/Makefile | 2 +-
net/sunrpc/xprtrdma/ib_client.c | 181 ++++++++++++++++++++++++++++++++
net/sunrpc/xprtrdma/module.c | 18 +++-
5 files changed, 258 insertions(+), 4 deletions(-)
create mode 100644 include/linux/sunrpc/rdma_rn.h
create mode 100644 net/sunrpc/xprtrdma/ib_client.c
diff --git a/include/linux/sunrpc/rdma_rn.h b/include/linux/sunrpc/rdma_rn.h
new file mode 100644
index 000000000000..7d032ca057af
--- /dev/null
+++ b/include/linux/sunrpc/rdma_rn.h
@@ -0,0 +1,27 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * * Copyright (c) 2024, Oracle and/or its affiliates.
+ */
+
+#ifndef _LINUX_SUNRPC_RDMA_RN_H
+#define _LINUX_SUNRPC_RDMA_RN_H
+
+#include <rdma/ib_verbs.h>
+
+/**
+ * rpcrdma_notification - request removal notification
+ */
+struct rpcrdma_notification {
+ void (*rn_done)(struct rpcrdma_notification *rn);
+ u32 rn_index;
+};
+
+int rpcrdma_rn_register(struct ib_device *device,
+ struct rpcrdma_notification *rn,
+ void (*done)(struct rpcrdma_notification *rn));
+void rpcrdma_rn_unregister(struct ib_device *device,
+ struct rpcrdma_notification *rn);
+int rpcrdma_ib_client_register(void);
+void rpcrdma_ib_client_unregister(void);
+
+#endif /* _LINUX_SUNRPC_RDMA_RN_H */
diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h
index 14392652273a..ecdaf088219d 100644
--- a/include/trace/events/rpcrdma.h
+++ b/include/trace/events/rpcrdma.h
@@ -2220,6 +2220,40 @@ TRACE_EVENT(svcrdma_sq_post_err,
)
);
+DECLARE_EVENT_CLASS(rpcrdma_client_device_class,
+ TP_PROTO(
+ const struct ib_device *device
+ ),
+
+ TP_ARGS(device),
+
+ TP_STRUCT__entry(
+ __string(name, device->name)
+ ),
+
+ TP_fast_assign(
+ __assign_str(name);
+ ),
+
+ TP_printk("device=%s",
+ __get_str(name)
+ )
+);
+
+#define DEFINE_CLIENT_DEVICE_EVENT(name) \
+ DEFINE_EVENT(rpcrdma_client_device_class, name, \
+ TP_PROTO( \
+ const struct ib_device *device \
+ ), \
+ TP_ARGS(device) \
+ )
+
+DEFINE_CLIENT_DEVICE_EVENT(rpcrdma_client_completion);
+DEFINE_CLIENT_DEVICE_EVENT(rpcrdma_client_add_one);
+DEFINE_CLIENT_DEVICE_EVENT(rpcrdma_client_remove_one);
+DEFINE_CLIENT_DEVICE_EVENT(rpcrdma_client_wait_on);
+DEFINE_CLIENT_DEVICE_EVENT(rpcrdma_client_remove_one_done);
+
#endif /* _TRACE_RPCRDMA_H */
#include <trace/define_trace.h>
diff --git a/net/sunrpc/xprtrdma/Makefile b/net/sunrpc/xprtrdma/Makefile
index 55b21bae866d..3232aa23cdb4 100644
--- a/net/sunrpc/xprtrdma/Makefile
+++ b/net/sunrpc/xprtrdma/Makefile
@@ -1,7 +1,7 @@
# SPDX-License-Identifier: GPL-2.0
obj-$(CONFIG_SUNRPC_XPRT_RDMA) += rpcrdma.o
-rpcrdma-y := transport.o rpc_rdma.o verbs.o frwr_ops.o \
+rpcrdma-y := transport.o rpc_rdma.o verbs.o frwr_ops.o ib_client.o \
svc_rdma.o svc_rdma_backchannel.o svc_rdma_transport.o \
svc_rdma_sendto.o svc_rdma_recvfrom.o svc_rdma_rw.o \
svc_rdma_pcl.o module.o
diff --git a/net/sunrpc/xprtrdma/ib_client.c b/net/sunrpc/xprtrdma/ib_client.c
new file mode 100644
index 000000000000..a938c19c3490
--- /dev/null
+++ b/net/sunrpc/xprtrdma/ib_client.c
@@ -0,0 +1,181 @@
+// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
+/*
+ * Copyright (c) 2024 Oracle. All rights reserved.
+ */
+
+/* #include <linux/module.h>
+#include <linux/slab.h> */
+#include <linux/xarray.h>
+#include <linux/types.h>
+#include <linux/kref.h>
+#include <linux/completion.h>
+
+#include <linux/sunrpc/svc_rdma.h>
+#include <linux/sunrpc/rdma_rn.h>
+
+#include "xprt_rdma.h"
+#include <trace/events/rpcrdma.h>
+
+/* Per-ib_device private data for rpcrdma */
+struct rpcrdma_device {
+ struct kref rd_kref;
+ unsigned long rd_flags;
+ struct ib_device *rd_device;
+ struct xarray rd_xa;
+ struct completion rd_done;
+};
+
+#define RPCRDMA_RD_F_REMOVING (0)
+
+static struct ib_client rpcrdma_ib_client;
+
+/*
+ * Listeners have no associated device, so we never register them.
+ * Note that ib_get_client_data() does not check if @device is
+ * NULL for us.
+ */
+static struct rpcrdma_device *rpcrdma_get_client_data(struct ib_device *device)
+{
+ if (!device)
+ return NULL;
+ return ib_get_client_data(device, &rpcrdma_ib_client);
+}
+
+/**
+ * rpcrdma_rn_register - register to get device removal notifications
+ * @device: device to monitor
+ * @rn: notification object that wishes to be notified
+ * @done: callback to notify caller of device removal
+ *
+ * Returns zero on success. The callback in rn_done is guaranteed
+ * to be invoked when the device is removed, unless this notification
+ * is unregistered first.
+ *
+ * On failure, a negative errno is returned.
+ */
+int rpcrdma_rn_register(struct ib_device *device,
+ struct rpcrdma_notification *rn,
+ void (*done)(struct rpcrdma_notification *rn))
+{
+ struct rpcrdma_device *rd = rpcrdma_get_client_data(device);
+
+ if (!rd || test_bit(RPCRDMA_RD_F_REMOVING, &rd->rd_flags))
+ return -ENETUNREACH;
+
+ kref_get(&rd->rd_kref);
+ if (xa_alloc(&rd->rd_xa, &rn->rn_index, rn, xa_limit_32b, GFP_KERNEL) < 0)
+ return -ENOMEM;
+ rn->rn_done = done;
+ return 0;
+}
+
+static void rpcrdma_rn_release(struct kref *kref)
+{
+ struct rpcrdma_device *rd = container_of(kref, struct rpcrdma_device,
+ rd_kref);
+
+ trace_rpcrdma_client_completion(rd->rd_device);
+ complete(&rd->rd_done);
+}
+
+/**
+ * rpcrdma_rn_unregister - stop device removal notifications
+ * @device: monitored device
+ * @rn: notification object that no longer wishes to be notified
+ */
+void rpcrdma_rn_unregister(struct ib_device *device,
+ struct rpcrdma_notification *rn)
+{
+ struct rpcrdma_device *rd = rpcrdma_get_client_data(device);
+
+ if (!rd)
+ return;
+
+ xa_erase(&rd->rd_xa, rn->rn_index);
+ kref_put(&rd->rd_kref, rpcrdma_rn_release);
+}
+
+/**
+ * rpcrdma_add_one - ib_client device insertion callback
+ * @device: device about to be inserted
+ *
+ * Returns zero on success. xprtrdma private data has been allocated
+ * for this device. On failure, a negative errno is returned.
+ */
+static int rpcrdma_add_one(struct ib_device *device)
+{
+ struct rpcrdma_device *rd;
+
+ rd = kzalloc(sizeof(*rd), GFP_KERNEL);
+ if (!rd)
+ return -ENOMEM;
+
+ kref_init(&rd->rd_kref);
+ xa_init_flags(&rd->rd_xa, XA_FLAGS_ALLOC1);
+ rd->rd_device = device;
+ init_completion(&rd->rd_done);
+ ib_set_client_data(device, &rpcrdma_ib_client, rd);
+
+ trace_rpcrdma_client_add_one(device);
+ return 0;
+}
+
+/**
+ * rpcrdma_remove_one - ib_client device removal callback
+ * @device: device about to be removed
+ * @client_data: this module's private per-device data
+ *
+ * Upon return, all transports associated with @device have divested
+ * themselves from IB hardware resources.
+ */
+static void rpcrdma_remove_one(struct ib_device *device,
+ void *client_data)
+{
+ struct rpcrdma_device *rd = client_data;
+ struct rpcrdma_notification *rn;
+ unsigned long index;
+
+ trace_rpcrdma_client_remove_one(device);
+
+ set_bit(RPCRDMA_RD_F_REMOVING, &rd->rd_flags);
+ xa_for_each(&rd->rd_xa, index, rn)
+ rn->rn_done(rn);
+
+ /*
+ * Wait only if there are still outstanding notification
+ * registrants for this device.
+ */
+ if (!refcount_dec_and_test(&rd->rd_kref.refcount)) {
+ trace_rpcrdma_client_wait_on(device);
+ wait_for_completion(&rd->rd_done);
+ }
+
+ trace_rpcrdma_client_remove_one_done(device);
+ kfree(rd);
+}
+
+static struct ib_client rpcrdma_ib_client = {
+ .name = "rpcrdma",
+ .add = rpcrdma_add_one,
+ .remove = rpcrdma_remove_one,
+};
+
+/**
+ * rpcrdma_ib_client_unregister - unregister ib_client for xprtrdma
+ *
+ * cel: watch for orphaned rpcrdma_device objects on module unload
+ */
+void rpcrdma_ib_client_unregister(void)
+{
+ ib_unregister_client(&rpcrdma_ib_client);
+}
+
+/**
+ * rpcrdma_ib_client_register - register ib_client for rpcrdma
+ *
+ * Returns zero on success, or a negative errno.
+ */
+int rpcrdma_ib_client_register(void)
+{
+ return ib_register_client(&rpcrdma_ib_client);
+}
diff --git a/net/sunrpc/xprtrdma/module.c b/net/sunrpc/xprtrdma/module.c
index 45c5b41ac8dc..697f571d4c01 100644
--- a/net/sunrpc/xprtrdma/module.c
+++ b/net/sunrpc/xprtrdma/module.c
@@ -11,6 +11,7 @@
#include <linux/module.h>
#include <linux/init.h>
#include <linux/sunrpc/svc_rdma.h>
+#include <linux/sunrpc/rdma_rn.h>
#include <asm/swab.h>
@@ -30,21 +31,32 @@ static void __exit rpc_rdma_cleanup(void)
{
xprt_rdma_cleanup();
svc_rdma_cleanup();
+ rpcrdma_ib_client_unregister();
}
static int __init rpc_rdma_init(void)
{
int rc;
+ rc = rpcrdma_ib_client_register();
+ if (rc)
+ goto out_rc;
+
rc = svc_rdma_init();
if (rc)
- goto out;
+ goto out_ib_client;
rc = xprt_rdma_init();
if (rc)
- svc_rdma_cleanup();
+ goto out_svc_rdma;
-out:
+ return 0;
+
+out_svc_rdma:
+ svc_rdma_cleanup();
+out_ib_client:
+ rpcrdma_ib_client_unregister();
+out_rc:
return rc;
}
--
2.45.1
From: Chuck Lever <[email protected]>
Wait for all disconnects to complete to ensure the transport has
divested all of its hardware resources before the underlying RDMA
device can be removed.
Signed-off-by: Chuck Lever <[email protected]>
---
include/trace/events/rpcrdma.h | 23 +++++++++++++++++++++++
net/sunrpc/xprtrdma/verbs.c | 23 ++++++++++++++---------
net/sunrpc/xprtrdma/xprt_rdma.h | 2 ++
3 files changed, 39 insertions(+), 9 deletions(-)
diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h
index ecdaf088219d..ba2d6a0e41cc 100644
--- a/include/trace/events/rpcrdma.h
+++ b/include/trace/events/rpcrdma.h
@@ -669,6 +669,29 @@ TRACE_EVENT(xprtrdma_inline_thresh,
DEFINE_CONN_EVENT(connect);
DEFINE_CONN_EVENT(disconnect);
+TRACE_EVENT(xprtrdma_device_removal,
+ TP_PROTO(
+ const struct rdma_cm_id *id
+ ),
+
+ TP_ARGS(id),
+
+ TP_STRUCT__entry(
+ __string(name, id->device->name)
+ __array(unsigned char, addr, sizeof(struct sockaddr_in6))
+ ),
+
+ TP_fast_assign(
+ __assign_str(name);
+ memcpy(__entry->addr, &id->route.addr.dst_addr,
+ sizeof(struct sockaddr_in6));
+ ),
+
+ TP_printk("device %s to be removed, disconnecting %pISpc\n",
+ __get_str(name), __entry->addr
+ )
+);
+
DEFINE_RXPRT_EVENT(xprtrdma_op_inject_dsc);
TRACE_EVENT(xprtrdma_op_connect,
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index a0b071089e15..04558c99e9f4 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -222,7 +222,6 @@ static void rpcrdma_update_cm_private(struct rpcrdma_ep *ep,
static int
rpcrdma_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event)
{
- struct sockaddr *sap = (struct sockaddr *)&id->route.addr.dst_addr;
struct rpcrdma_ep *ep = id->context;
might_sleep();
@@ -241,14 +240,6 @@ rpcrdma_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event)
ep->re_async_rc = -ENETUNREACH;
complete(&ep->re_done);
return 0;
- case RDMA_CM_EVENT_DEVICE_REMOVAL:
- pr_info("rpcrdma: removing device %s for %pISpc\n",
- ep->re_id->device->name, sap);
- switch (xchg(&ep->re_connect_status, -ENODEV)) {
- case 0: goto wake_connect_worker;
- case 1: goto disconnected;
- }
- return 0;
case RDMA_CM_EVENT_ADDR_CHANGE:
ep->re_connect_status = -ENODEV;
goto disconnected;
@@ -284,6 +275,14 @@ rpcrdma_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event)
return 0;
}
+static void rpcrdma_ep_removal_done(struct rpcrdma_notification *rn)
+{
+ struct rpcrdma_ep *ep = container_of(rn, struct rpcrdma_ep, re_rn);
+
+ trace_xprtrdma_device_removal(ep->re_id);
+ xprt_force_disconnect(ep->re_xprt);
+}
+
static struct rdma_cm_id *rpcrdma_create_id(struct rpcrdma_xprt *r_xprt,
struct rpcrdma_ep *ep)
{
@@ -323,6 +322,10 @@ static struct rdma_cm_id *rpcrdma_create_id(struct rpcrdma_xprt *r_xprt,
if (rc)
goto out;
+ rc = rpcrdma_rn_register(id->device, &ep->re_rn, rpcrdma_ep_removal_done);
+ if (rc)
+ goto out;
+
return id;
out:
@@ -350,6 +353,8 @@ static void rpcrdma_ep_destroy(struct kref *kref)
ib_dealloc_pd(ep->re_pd);
ep->re_pd = NULL;
+ rpcrdma_rn_unregister(ep->re_id->device, &ep->re_rn);
+
kfree(ep);
module_put(THIS_MODULE);
}
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index da409450dfc0..341725c66ec8 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -56,6 +56,7 @@
#include <linux/sunrpc/rpc_rdma_cid.h> /* completion IDs */
#include <linux/sunrpc/rpc_rdma.h> /* RPC/RDMA protocol */
#include <linux/sunrpc/xprtrdma.h> /* xprt parameters */
+#include <linux/sunrpc/rdma_rn.h> /* removal notifications */
#define RDMA_RESOLVE_TIMEOUT (5000) /* 5 seconds */
#define RDMA_CONNECT_RETRY_MAX (2) /* retries if no listener backlog */
@@ -92,6 +93,7 @@ struct rpcrdma_ep {
struct rpcrdma_connect_private
re_cm_private;
struct rdma_conn_param re_remote_cma;
+ struct rpcrdma_notification re_rn;
int re_receive_count;
unsigned int re_max_requests; /* depends on device */
unsigned int re_inline_send; /* negotiated */
--
2.45.1
From: Chuck Lever <[email protected]>
Commit 7a03aeb66c41 ("xprtrdma: Micro-optimize MR DMA-unmapping")
removed the last use of the @r_xprt parameter in this function, but
neglected to remove the parameter itself.
Signed-off-by: Chuck Lever <[email protected]>
---
net/sunrpc/xprtrdma/frwr_ops.c | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
index 47f33bb7bff8..31434aeb8e29 100644
--- a/net/sunrpc/xprtrdma/frwr_ops.c
+++ b/net/sunrpc/xprtrdma/frwr_ops.c
@@ -54,7 +54,7 @@ static void frwr_cid_init(struct rpcrdma_ep *ep,
cid->ci_completion_id = mr->mr_ibmr->res.id;
}
-static void frwr_mr_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr *mr)
+static void frwr_mr_unmap(struct rpcrdma_mr *mr)
{
if (mr->mr_device) {
trace_xprtrdma_mr_unmap(mr);
@@ -73,7 +73,7 @@ void frwr_mr_release(struct rpcrdma_mr *mr)
{
int rc;
- frwr_mr_unmap(mr->mr_xprt, mr);
+ frwr_mr_unmap(mr);
rc = ib_dereg_mr(mr->mr_ibmr);
if (rc)
@@ -84,7 +84,7 @@ void frwr_mr_release(struct rpcrdma_mr *mr)
static void frwr_mr_put(struct rpcrdma_mr *mr)
{
- frwr_mr_unmap(mr->mr_xprt, mr);
+ frwr_mr_unmap(mr);
/* The MR is returned to the req's MR free list instead
* of to the xprt's MR free list. No spinlock is needed.
--
2.45.1
From: Chuck Lever <[email protected]>
The original code was designed so that most calls to
rpcrdma_rep_create() would occur on the NUMA node that the device
preferred. There are a few cases where that's not possible, so
those reps are marked as temporary.
However, we have the device (and its preferred node) already in
rpcrdma_rep_create(), so let's use that to guarantee the memory
is allocated from the correct node.
Signed-off-by: Chuck Lever <[email protected]>
---
net/sunrpc/xprtrdma/rpc_rdma.c | 3 +-
net/sunrpc/xprtrdma/verbs.c | 57 ++++++++++++++-------------------
net/sunrpc/xprtrdma/xprt_rdma.h | 3 +-
3 files changed, 26 insertions(+), 37 deletions(-)
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index 190a4de239c8..1478c41c7e9d 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -1471,8 +1471,7 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *rep)
credits = 1; /* don't deadlock */
else if (credits > r_xprt->rx_ep->re_max_requests)
credits = r_xprt->rx_ep->re_max_requests;
- rpcrdma_post_recvs(r_xprt, credits + (buf->rb_bc_srv_max_requests << 1),
- false);
+ rpcrdma_post_recvs(r_xprt, credits + (buf->rb_bc_srv_max_requests << 1));
if (buf->rb_credits != credits)
rpcrdma_update_cwnd(r_xprt, credits);
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 04558c99e9f4..110933745e5d 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -69,13 +69,15 @@ static void rpcrdma_sendctx_put_locked(struct rpcrdma_xprt *r_xprt,
struct rpcrdma_sendctx *sc);
static int rpcrdma_reqs_setup(struct rpcrdma_xprt *r_xprt);
static void rpcrdma_reqs_reset(struct rpcrdma_xprt *r_xprt);
-static void rpcrdma_rep_destroy(struct rpcrdma_rep *rep);
static void rpcrdma_reps_unmap(struct rpcrdma_xprt *r_xprt);
static void rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt);
static void rpcrdma_mrs_destroy(struct rpcrdma_xprt *r_xprt);
static void rpcrdma_ep_get(struct rpcrdma_ep *ep);
static int rpcrdma_ep_put(struct rpcrdma_ep *ep);
static struct rpcrdma_regbuf *
+rpcrdma_regbuf_alloc_node(size_t size, enum dma_data_direction direction,
+ int node);
+static struct rpcrdma_regbuf *
rpcrdma_regbuf_alloc(size_t size, enum dma_data_direction direction);
static void rpcrdma_regbuf_dma_unmap(struct rpcrdma_regbuf *rb);
static void rpcrdma_regbuf_free(struct rpcrdma_regbuf *rb);
@@ -510,7 +512,7 @@ int rpcrdma_xprt_connect(struct rpcrdma_xprt *r_xprt)
* outstanding Receives.
*/
rpcrdma_ep_get(ep);
- rpcrdma_post_recvs(r_xprt, 1, true);
+ rpcrdma_post_recvs(r_xprt, 1);
rc = rdma_connect(ep->re_id, &ep->re_remote_cma);
if (rc)
@@ -943,18 +945,20 @@ static void rpcrdma_reqs_reset(struct rpcrdma_xprt *r_xprt)
}
static noinline
-struct rpcrdma_rep *rpcrdma_rep_create(struct rpcrdma_xprt *r_xprt,
- bool temp)
+struct rpcrdma_rep *rpcrdma_rep_create(struct rpcrdma_xprt *r_xprt)
{
struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
+ struct rpcrdma_ep *ep = r_xprt->rx_ep;
+ struct ib_device *device = ep->re_id->device;
struct rpcrdma_rep *rep;
rep = kzalloc(sizeof(*rep), XPRTRDMA_GFP_FLAGS);
if (rep == NULL)
goto out;
- rep->rr_rdmabuf = rpcrdma_regbuf_alloc(r_xprt->rx_ep->re_inline_recv,
- DMA_FROM_DEVICE);
+ rep->rr_rdmabuf = rpcrdma_regbuf_alloc_node(ep->re_inline_recv,
+ DMA_FROM_DEVICE,
+ ibdev_to_node(device));
if (!rep->rr_rdmabuf)
goto out_free;
@@ -969,7 +973,6 @@ struct rpcrdma_rep *rpcrdma_rep_create(struct rpcrdma_xprt *r_xprt,
rep->rr_recv_wr.wr_cqe = &rep->rr_cqe;
rep->rr_recv_wr.sg_list = &rep->rr_rdmabuf->rg_iov;
rep->rr_recv_wr.num_sge = 1;
- rep->rr_temp = temp;
spin_lock(&buf->rb_lock);
list_add(&rep->rr_all, &buf->rb_all_reps);
@@ -988,17 +991,6 @@ static void rpcrdma_rep_free(struct rpcrdma_rep *rep)
kfree(rep);
}
-static void rpcrdma_rep_destroy(struct rpcrdma_rep *rep)
-{
- struct rpcrdma_buffer *buf = &rep->rr_rxprt->rx_buf;
-
- spin_lock(&buf->rb_lock);
- list_del(&rep->rr_all);
- spin_unlock(&buf->rb_lock);
-
- rpcrdma_rep_free(rep);
-}
-
static struct rpcrdma_rep *rpcrdma_rep_get_locked(struct rpcrdma_buffer *buf)
{
struct llist_node *node;
@@ -1030,10 +1022,8 @@ static void rpcrdma_reps_unmap(struct rpcrdma_xprt *r_xprt)
struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
struct rpcrdma_rep *rep;
- list_for_each_entry(rep, &buf->rb_all_reps, rr_all) {
+ list_for_each_entry(rep, &buf->rb_all_reps, rr_all)
rpcrdma_regbuf_dma_unmap(rep->rr_rdmabuf);
- rep->rr_temp = true; /* Mark this rep for destruction */
- }
}
static void rpcrdma_reps_destroy(struct rpcrdma_buffer *buf)
@@ -1250,14 +1240,15 @@ void rpcrdma_buffer_put(struct rpcrdma_buffer *buffers, struct rpcrdma_req *req)
* or Replies they may be registered externally via frwr_map.
*/
static struct rpcrdma_regbuf *
-rpcrdma_regbuf_alloc(size_t size, enum dma_data_direction direction)
+rpcrdma_regbuf_alloc_node(size_t size, enum dma_data_direction direction,
+ int node)
{
struct rpcrdma_regbuf *rb;
- rb = kmalloc(sizeof(*rb), XPRTRDMA_GFP_FLAGS);
+ rb = kmalloc_node(sizeof(*rb), XPRTRDMA_GFP_FLAGS, node);
if (!rb)
return NULL;
- rb->rg_data = kmalloc(size, XPRTRDMA_GFP_FLAGS);
+ rb->rg_data = kmalloc_node(size, XPRTRDMA_GFP_FLAGS, node);
if (!rb->rg_data) {
kfree(rb);
return NULL;
@@ -1269,6 +1260,12 @@ rpcrdma_regbuf_alloc(size_t size, enum dma_data_direction direction)
return rb;
}
+static struct rpcrdma_regbuf *
+rpcrdma_regbuf_alloc(size_t size, enum dma_data_direction direction)
+{
+ return rpcrdma_regbuf_alloc_node(size, direction, NUMA_NO_NODE);
+}
+
/**
* rpcrdma_regbuf_realloc - re-allocate a SEND/RECV buffer
* @rb: regbuf to reallocate
@@ -1346,10 +1343,9 @@ static void rpcrdma_regbuf_free(struct rpcrdma_regbuf *rb)
* rpcrdma_post_recvs - Refill the Receive Queue
* @r_xprt: controlling transport instance
* @needed: current credit grant
- * @temp: mark Receive buffers to be deleted after one use
*
*/
-void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, int needed, bool temp)
+void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, int needed)
{
struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
struct rpcrdma_ep *ep = r_xprt->rx_ep;
@@ -1363,8 +1359,7 @@ void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, int needed, bool temp)
if (likely(ep->re_receive_count > needed))
goto out;
needed -= ep->re_receive_count;
- if (!temp)
- needed += RPCRDMA_MAX_RECV_BATCH;
+ needed += RPCRDMA_MAX_RECV_BATCH;
if (atomic_inc_return(&ep->re_receiving) > 1)
goto out;
@@ -1373,12 +1368,8 @@ void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, int needed, bool temp)
wr = NULL;
while (needed) {
rep = rpcrdma_rep_get_locked(buf);
- if (rep && rep->rr_temp) {
- rpcrdma_rep_destroy(rep);
- continue;
- }
if (!rep)
- rep = rpcrdma_rep_create(r_xprt, temp);
+ rep = rpcrdma_rep_create(r_xprt);
if (!rep)
break;
if (!rpcrdma_regbuf_dma_map(r_xprt, rep->rr_rdmabuf)) {
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index 341725c66ec8..8147d2b41494 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -200,7 +200,6 @@ struct rpcrdma_rep {
__be32 rr_proc;
int rr_wc_flags;
u32 rr_inv_rkey;
- bool rr_temp;
struct rpcrdma_regbuf *rr_rdmabuf;
struct rpcrdma_xprt *rr_rxprt;
struct rpc_rqst *rr_rqst;
@@ -468,7 +467,7 @@ void rpcrdma_flush_disconnect(struct rpcrdma_xprt *r_xprt, struct ib_wc *wc);
int rpcrdma_xprt_connect(struct rpcrdma_xprt *r_xprt);
void rpcrdma_xprt_disconnect(struct rpcrdma_xprt *r_xprt);
-void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, int needed, bool temp);
+void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, int needed);
/*
* Buffer calls - xprtrdma/verbs.c
--
2.45.1
On 04/06/2024 22:45, [email protected] wrote:
> From: Chuck Lever <[email protected]>
>
> Avoid FastReg operations getting MW_BIND_ERR after a reconnect.
>
> rpcrdma_reqs_reset() is called on transport tear-down to get each
> rpcrdma_req back into a clean state.
>
> MRs on req->rl_registered are waiting for a FastReg, are already
> registered, or are waiting for invalidation. If the transport is
> being torn down when reqs_reset() is called, the matching LocalInv
> might never be posted. That leaves these MR registered /and/ on
> req->rl_free_mrs, where they can be re-used for the next
> connection.
>
> Since xprtrdma does not keep specific track of the MR state, it's
> not possible to know what state these MRs are in, so the only safe
> thing to do is release them immediately.
>
> Fixes: 5de55ce951a1 ("xprtrdma: Release in-flight MRs on disconnect")
> Signed-off-by: Chuck Lever <[email protected]>
> ---
> net/sunrpc/xprtrdma/frwr_ops.c | 3 ++-
> net/sunrpc/xprtrdma/verbs.c | 16 +++++++++++++++-
> 2 files changed, 17 insertions(+), 2 deletions(-)
>
> diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
> index ffbf99894970..47f33bb7bff8 100644
> --- a/net/sunrpc/xprtrdma/frwr_ops.c
> +++ b/net/sunrpc/xprtrdma/frwr_ops.c
> @@ -92,7 +92,8 @@ static void frwr_mr_put(struct rpcrdma_mr *mr)
> rpcrdma_mr_push(mr, &mr->mr_req->rl_free_mrs);
> }
>
> -/* frwr_reset - Place MRs back on the free list
> +/**
> + * frwr_reset - Place MRs back on @req's free list
> * @req: request to reset
> *
> * Used after a failed marshal. For FRWR, this means the MRs
> diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
> index 432557a553e7..a0b071089e15 100644
> --- a/net/sunrpc/xprtrdma/verbs.c
> +++ b/net/sunrpc/xprtrdma/verbs.c
> @@ -897,6 +897,8 @@ static int rpcrdma_reqs_setup(struct rpcrdma_xprt *r_xprt)
>
> static void rpcrdma_req_reset(struct rpcrdma_req *req)
> {
> + struct rpcrdma_mr *mr;
> +
> /* Credits are valid for only one connection */
> req->rl_slot.rq_cong = 0;
>
> @@ -906,7 +908,19 @@ static void rpcrdma_req_reset(struct rpcrdma_req *req)
> rpcrdma_regbuf_dma_unmap(req->rl_sendbuf);
> rpcrdma_regbuf_dma_unmap(req->rl_recvbuf);
>
> - frwr_reset(req);
> + /* The verbs consumer can't know the state of an MR on the
> + * req->rl_registered list unless a successful completion
> + * has occurred, so they cannot be re-used.
> + */
Theoretically it would be possible to know by inspecting the FLUSH error
completions
and see which of those was a registration wr (meaning the registration
was not ever
executed).
But I guess its simpler to just assume that it isn't reusable.
> + while ((mr = rpcrdma_mr_pop(&req->rl_registered))) {
> + struct rpcrdma_buffer *buf = &mr->mr_xprt->rx_buf;
> +
> + spin_lock(&buf->rb_lock);
> + list_del(&mr->mr_all);
> + spin_unlock(&buf->rb_lock);
> +
> + frwr_mr_release(mr);
> + }
> }
>
> /* ASSUMPTION: the rb_allreqs list is stable for the duration,
Looks good,
Reviewed-by: Sagi Grimberg <[email protected]>
On 04/06/2024 22:45, [email protected] wrote:
> From: Chuck Lever <[email protected]>
>
> The original code was designed so that most calls to
> rpcrdma_rep_create() would occur on the NUMA node that the device
> preferred. There are a few cases where that's not possible, so
> those reps are marked as temporary.
>
> However, we have the device (and its preferred node) already in
> rpcrdma_rep_create(), so let's use that to guarantee the memory
> is allocated from the correct node.
Reviewed-by: Sagi Grimberg <[email protected]>
CC Dan.
Reviewed-by: Sagi Grimberg <[email protected]>
On 04/06/2024 22:45, [email protected] wrote:
> From: Chuck Lever <[email protected]>
>
> Wait for all disconnects to complete to ensure the transport has
> divested all of its hardware resources before the underlying RDMA
> device can be removed.
>
> Signed-off-by: Chuck Lever <[email protected]>
> ---
> include/trace/events/rpcrdma.h | 23 +++++++++++++++++++++++
> net/sunrpc/xprtrdma/verbs.c | 23 ++++++++++++++---------
> net/sunrpc/xprtrdma/xprt_rdma.h | 2 ++
> 3 files changed, 39 insertions(+), 9 deletions(-)
>
> diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h
> index ecdaf088219d..ba2d6a0e41cc 100644
> --- a/include/trace/events/rpcrdma.h
> +++ b/include/trace/events/rpcrdma.h
> @@ -669,6 +669,29 @@ TRACE_EVENT(xprtrdma_inline_thresh,
> DEFINE_CONN_EVENT(connect);
> DEFINE_CONN_EVENT(disconnect);
>
> +TRACE_EVENT(xprtrdma_device_removal,
> + TP_PROTO(
> + const struct rdma_cm_id *id
> + ),
> +
> + TP_ARGS(id),
> +
> + TP_STRUCT__entry(
> + __string(name, id->device->name)
> + __array(unsigned char, addr, sizeof(struct sockaddr_in6))
> + ),
> +
> + TP_fast_assign(
> + __assign_str(name);
> + memcpy(__entry->addr, &id->route.addr.dst_addr,
> + sizeof(struct sockaddr_in6));
> + ),
> +
> + TP_printk("device %s to be removed, disconnecting %pISpc\n",
> + __get_str(name), __entry->addr
> + )
> +);
> +
> DEFINE_RXPRT_EVENT(xprtrdma_op_inject_dsc);
>
> TRACE_EVENT(xprtrdma_op_connect,
> diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
> index a0b071089e15..04558c99e9f4 100644
> --- a/net/sunrpc/xprtrdma/verbs.c
> +++ b/net/sunrpc/xprtrdma/verbs.c
> @@ -222,7 +222,6 @@ static void rpcrdma_update_cm_private(struct rpcrdma_ep *ep,
> static int
> rpcrdma_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event)
> {
> - struct sockaddr *sap = (struct sockaddr *)&id->route.addr.dst_addr;
> struct rpcrdma_ep *ep = id->context;
>
> might_sleep();
> @@ -241,14 +240,6 @@ rpcrdma_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event)
> ep->re_async_rc = -ENETUNREACH;
> complete(&ep->re_done);
> return 0;
> - case RDMA_CM_EVENT_DEVICE_REMOVAL:
> - pr_info("rpcrdma: removing device %s for %pISpc\n",
> - ep->re_id->device->name, sap);
> - switch (xchg(&ep->re_connect_status, -ENODEV)) {
> - case 0: goto wake_connect_worker;
> - case 1: goto disconnected;
> - }
> - return 0;
> case RDMA_CM_EVENT_ADDR_CHANGE:
> ep->re_connect_status = -ENODEV;
> goto disconnected;
> @@ -284,6 +275,14 @@ rpcrdma_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event)
> return 0;
> }
>
> +static void rpcrdma_ep_removal_done(struct rpcrdma_notification *rn)
> +{
> + struct rpcrdma_ep *ep = container_of(rn, struct rpcrdma_ep, re_rn);
> +
> + trace_xprtrdma_device_removal(ep->re_id);
> + xprt_force_disconnect(ep->re_xprt);
> +}
> +
> static struct rdma_cm_id *rpcrdma_create_id(struct rpcrdma_xprt *r_xprt,
> struct rpcrdma_ep *ep)
> {
> @@ -323,6 +322,10 @@ static struct rdma_cm_id *rpcrdma_create_id(struct rpcrdma_xprt *r_xprt,
> if (rc)
> goto out;
>
> + rc = rpcrdma_rn_register(id->device, &ep->re_rn, rpcrdma_ep_removal_done);
> + if (rc)
> + goto out;
> +
> return id;
>
> out:
> @@ -350,6 +353,8 @@ static void rpcrdma_ep_destroy(struct kref *kref)
> ib_dealloc_pd(ep->re_pd);
> ep->re_pd = NULL;
>
> + rpcrdma_rn_unregister(ep->re_id->device, &ep->re_rn);
> +
> kfree(ep);
> module_put(THIS_MODULE);
> }
> diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
> index da409450dfc0..341725c66ec8 100644
> --- a/net/sunrpc/xprtrdma/xprt_rdma.h
> +++ b/net/sunrpc/xprtrdma/xprt_rdma.h
> @@ -56,6 +56,7 @@
> #include <linux/sunrpc/rpc_rdma_cid.h> /* completion IDs */
> #include <linux/sunrpc/rpc_rdma.h> /* RPC/RDMA protocol */
> #include <linux/sunrpc/xprtrdma.h> /* xprt parameters */
> +#include <linux/sunrpc/rdma_rn.h> /* removal notifications */
>
> #define RDMA_RESOLVE_TIMEOUT (5000) /* 5 seconds */
> #define RDMA_CONNECT_RETRY_MAX (2) /* retries if no listener backlog */
> @@ -92,6 +93,7 @@ struct rpcrdma_ep {
> struct rpcrdma_connect_private
> re_cm_private;
> struct rdma_conn_param re_remote_cma;
> + struct rpcrdma_notification re_rn;
> int re_receive_count;
> unsigned int re_max_requests; /* depends on device */
> unsigned int re_inline_send; /* negotiated */
Reviewed-by: Sagi Grimberg <[email protected]>