2023-06-05 13:23:16

by Chuck Lever

Subject: [PATCH v1 0/4] NUMA memory optimizations for NFS/RDMA server

Ensure that memory for various purposes is allocated on the NUMA node
closest to the underlying device. Deriving the node ID from the device
itself (its struct device::numa_node, via ibdev_to_node()) ensures each
allocation is requested from the correct node, rather than from
whichever node the allocating thread happens to be running on.
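
For reference, the pattern repeated throughout the series is to
resolve the device's node once and hand it to the node-aware
allocator. A minimal sketch of the idea (not one of the actual
hunks below):

	/* Prefer memory on the RDMA device's NUMA node. */
	int node = ibdev_to_node(rdma->sc_cm_id->device);
	struct svc_rdma_recv_ctxt *ctxt;

	ctxt = kmalloc_node(sizeof(*ctxt), GFP_KERNEL, node);
	if (!ctxt)
		return NULL;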

---

Chuck Lever (4):
svcrdma: Allocate new transports on device's NUMA node
svcrdma: Clean up allocation of svc_rdma_recv_ctxt
svcrdma: Clean up allocation of svc_rdma_send_ctxt
svcrdma: Clean up allocation of svc_rdma_rw_ctxt


include/linux/sunrpc/svc_rdma.h | 1 -
net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 18 +++++++-----------
net/sunrpc/xprtrdma/svc_rdma_rw.c | 10 ++++++----
net/sunrpc/xprtrdma/svc_rdma_sendto.c | 9 ++++-----
net/sunrpc/xprtrdma/svc_rdma_transport.c | 18 +++++++++---------
5 files changed, 26 insertions(+), 30 deletions(-)

--
Chuck Lever



2023-06-05 13:23:42

by Chuck Lever

Subject: [PATCH v1 3/4] svcrdma: Clean up allocation of svc_rdma_send_ctxt

From: Chuck Lever <[email protected]>

The physical device's favored NUMA node ID is available when
allocating a send_ctxt. Use that value instead of relying on the
assumption that the allocating thread happens to be running on a
node close to the device.

Signed-off-by: Chuck Lever <[email protected]>
---
net/sunrpc/xprtrdma/svc_rdma_sendto.c | 9 ++++-----
1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
index 22a871e6fe4d..a35d1e055b1a 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
@@ -123,18 +123,17 @@ static void svc_rdma_send_cid_init(struct svcxprt_rdma *rdma,
static struct svc_rdma_send_ctxt *
svc_rdma_send_ctxt_alloc(struct svcxprt_rdma *rdma)
{
+ int node = ibdev_to_node(rdma->sc_cm_id->device);
struct svc_rdma_send_ctxt *ctxt;
dma_addr_t addr;
void *buffer;
- size_t size;
int i;

- size = sizeof(*ctxt);
- size += rdma->sc_max_send_sges * sizeof(struct ib_sge);
- ctxt = kmalloc(size, GFP_KERNEL);
+ ctxt = kmalloc_node(struct_size(ctxt, sc_sges, rdma->sc_max_send_sges),
+ GFP_KERNEL, node);
if (!ctxt)
goto fail0;
- buffer = kmalloc(rdma->sc_max_req_size, GFP_KERNEL);
+ buffer = kmalloc_node(rdma->sc_max_req_size, GFP_KERNEL, node);
if (!buffer)
goto fail1;
addr = ib_dma_map_single(rdma->sc_pd->device, buffer,



2023-06-05 13:24:22

by Chuck Lever

Subject: [PATCH v1 2/4] svcrdma: Clean up allocation of svc_rdma_recv_ctxt

From: Chuck Lever <[email protected]>

The physical device's favored NUMA node ID is available when
allocating a recv_ctxt. Use that value instead of relying on the
assumption that the allocating thread happens to be running on a
node close to the device.

This cleanup eliminates the hack of destroying recv_ctxts that
were not created by the receive CQ thread -- recv_ctxts are now
always allocated on a "good" node.

Signed-off-by: Chuck Lever <[email protected]>
---
include/linux/sunrpc/svc_rdma.h | 1 -
net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 18 +++++++-----------
2 files changed, 7 insertions(+), 12 deletions(-)

diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h
index fbc4bd423b35..a0f3ea357977 100644
--- a/include/linux/sunrpc/svc_rdma.h
+++ b/include/linux/sunrpc/svc_rdma.h
@@ -135,7 +135,6 @@ struct svc_rdma_recv_ctxt {
struct ib_sge rc_recv_sge;
void *rc_recv_buf;
struct xdr_stream rc_stream;
- bool rc_temp;
u32 rc_byte_len;
unsigned int rc_page_count;
u32 rc_inv_rkey;
diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
index a22fe7587fa6..46a719ba4917 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -125,14 +125,15 @@ static void svc_rdma_recv_cid_init(struct svcxprt_rdma *rdma,
static struct svc_rdma_recv_ctxt *
svc_rdma_recv_ctxt_alloc(struct svcxprt_rdma *rdma)
{
+ int node = ibdev_to_node(rdma->sc_cm_id->device);
struct svc_rdma_recv_ctxt *ctxt;
dma_addr_t addr;
void *buffer;

- ctxt = kmalloc(sizeof(*ctxt), GFP_KERNEL);
+ ctxt = kmalloc_node(sizeof(*ctxt), GFP_KERNEL, node);
if (!ctxt)
goto fail0;
- buffer = kmalloc(rdma->sc_max_req_size, GFP_KERNEL);
+ buffer = kmalloc_node(rdma->sc_max_req_size, GFP_KERNEL, node);
if (!buffer)
goto fail1;
addr = ib_dma_map_single(rdma->sc_pd->device, buffer,
@@ -155,7 +156,6 @@ svc_rdma_recv_ctxt_alloc(struct svcxprt_rdma *rdma)
ctxt->rc_recv_sge.length = rdma->sc_max_req_size;
ctxt->rc_recv_sge.lkey = rdma->sc_pd->local_dma_lkey;
ctxt->rc_recv_buf = buffer;
- ctxt->rc_temp = false;
return ctxt;

fail2:
@@ -232,10 +232,7 @@ void svc_rdma_recv_ctxt_put(struct svcxprt_rdma *rdma,
pcl_free(&ctxt->rc_write_pcl);
pcl_free(&ctxt->rc_reply_pcl);

- if (!ctxt->rc_temp)
- llist_add(&ctxt->rc_node, &rdma->sc_recv_ctxts);
- else
- svc_rdma_recv_ctxt_destroy(rdma, ctxt);
+ llist_add(&ctxt->rc_node, &rdma->sc_recv_ctxts);
}

/**
@@ -258,7 +255,7 @@ void svc_rdma_release_ctxt(struct svc_xprt *xprt, void *vctxt)
}

static bool svc_rdma_refresh_recvs(struct svcxprt_rdma *rdma,
- unsigned int wanted, bool temp)
+ unsigned int wanted)
{
const struct ib_recv_wr *bad_wr = NULL;
struct svc_rdma_recv_ctxt *ctxt;
@@ -275,7 +272,6 @@ static bool svc_rdma_refresh_recvs(struct svcxprt_rdma *rdma,
break;

trace_svcrdma_post_recv(ctxt);
- ctxt->rc_temp = temp;
ctxt->rc_recv_wr.next = recv_chain;
recv_chain = &ctxt->rc_recv_wr;
rdma->sc_pending_recvs++;
@@ -309,7 +305,7 @@ static bool svc_rdma_refresh_recvs(struct svcxprt_rdma *rdma,
*/
bool svc_rdma_post_recvs(struct svcxprt_rdma *rdma)
{
- return svc_rdma_refresh_recvs(rdma, rdma->sc_max_requests, true);
+ return svc_rdma_refresh_recvs(rdma, rdma->sc_max_requests);
}

/**
@@ -343,7 +339,7 @@ static void svc_rdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc)
* client reconnects.
*/
if (rdma->sc_pending_recvs < rdma->sc_max_requests)
- if (!svc_rdma_refresh_recvs(rdma, rdma->sc_recv_batch, false))
+ if (!svc_rdma_refresh_recvs(rdma, rdma->sc_recv_batch))
goto dropped;

/* All wc fields are now known to be valid */



2023-06-05 13:27:11

by Chuck Lever

Subject: [PATCH v1 4/4] svcrdma: Clean up allocation of svc_rdma_rw_ctxt

From: Chuck Lever <[email protected]>

The physical device's favored NUMA node ID is available when
allocating a rw_ctxt. Use that value instead of relying on the
assumption that the allocating thread happens to be running on a
node close to the device.

Signed-off-by: Chuck Lever <[email protected]>
---
net/sunrpc/xprtrdma/svc_rdma_rw.c | 10 ++++++----
1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c
index 11cf7c646644..068c365e7812 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_rw.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c
@@ -62,8 +62,8 @@ svc_rdma_get_rw_ctxt(struct svcxprt_rdma *rdma, unsigned int sges)
if (node) {
ctxt = llist_entry(node, struct svc_rdma_rw_ctxt, rw_node);
} else {
- ctxt = kmalloc(struct_size(ctxt, rw_first_sgl, SG_CHUNK_SIZE),
- GFP_KERNEL);
+ ctxt = kmalloc_node(struct_size(ctxt, rw_first_sgl, SG_CHUNK_SIZE),
+ GFP_KERNEL, ibdev_to_node(rdma->sc_cm_id->device));
if (!ctxt)
goto out_noctx;

@@ -234,7 +234,8 @@ svc_rdma_write_info_alloc(struct svcxprt_rdma *rdma,
{
struct svc_rdma_write_info *info;

- info = kmalloc(sizeof(*info), GFP_KERNEL);
+ info = kmalloc_node(sizeof(*info), GFP_KERNEL,
+ ibdev_to_node(rdma->sc_cm_id->device));
if (!info)
return info;

@@ -304,7 +305,8 @@ svc_rdma_read_info_alloc(struct svcxprt_rdma *rdma)
{
struct svc_rdma_read_info *info;

- info = kmalloc(sizeof(*info), GFP_KERNEL);
+ info = kmalloc_node(sizeof(*info), GFP_KERNEL,
+ ibdev_to_node(rdma->sc_cm_id->device));
if (!info)
return info;




2023-06-05 13:34:23

by Chuck Lever

Subject: [PATCH v1 1/4] svcrdma: Allocate new transports on device's NUMA node

From: Chuck Lever <[email protected]>

The physical device's NUMA node ID is available when allocating an
svc_xprt for an incoming connection. Use that value to ensure the
svc_xprt structure is allocated on the NUMA node closest to the
device.

Signed-off-by: Chuck Lever <[email protected]>
---
net/sunrpc/xprtrdma/svc_rdma_transport.c | 18 +++++++++---------
1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index ca04f7a6a085..2abd895046ee 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -64,7 +64,7 @@
#define RPCDBG_FACILITY RPCDBG_SVCXPRT

static struct svcxprt_rdma *svc_rdma_create_xprt(struct svc_serv *serv,
- struct net *net);
+ struct net *net, int node);
static struct svc_xprt *svc_rdma_create(struct svc_serv *serv,
struct net *net,
struct sockaddr *sa, int salen,
@@ -123,14 +123,14 @@ static void qp_event_handler(struct ib_event *event, void *context)
}

static struct svcxprt_rdma *svc_rdma_create_xprt(struct svc_serv *serv,
- struct net *net)
+ struct net *net, int node)
{
- struct svcxprt_rdma *cma_xprt = kzalloc(sizeof *cma_xprt, GFP_KERNEL);
+ struct svcxprt_rdma *cma_xprt;

- if (!cma_xprt) {
- dprintk("svcrdma: failed to create new transport\n");
+ cma_xprt = kzalloc_node(sizeof(*cma_xprt), GFP_KERNEL, node);
+ if (!cma_xprt)
return NULL;
- }
+
svc_xprt_init(net, &svc_rdma_class, &cma_xprt->sc_xprt, serv);
INIT_LIST_HEAD(&cma_xprt->sc_accept_q);
INIT_LIST_HEAD(&cma_xprt->sc_rq_dto_q);
@@ -193,9 +193,9 @@ static void handle_connect_req(struct rdma_cm_id *new_cma_id,
struct svcxprt_rdma *newxprt;
struct sockaddr *sa;

- /* Create a new transport */
newxprt = svc_rdma_create_xprt(listen_xprt->sc_xprt.xpt_server,
- listen_xprt->sc_xprt.xpt_net);
+ listen_xprt->sc_xprt.xpt_net,
+ ibdev_to_node(new_cma_id->device));
if (!newxprt)
return;
newxprt->sc_cm_id = new_cma_id;
@@ -304,7 +304,7 @@ static struct svc_xprt *svc_rdma_create(struct svc_serv *serv,

if (sa->sa_family != AF_INET && sa->sa_family != AF_INET6)
return ERR_PTR(-EAFNOSUPPORT);
- cma_xprt = svc_rdma_create_xprt(serv, net);
+ cma_xprt = svc_rdma_create_xprt(serv, net, NUMA_NO_NODE);
if (!cma_xprt)
return ERR_PTR(-ENOMEM);
set_bit(XPT_LISTENER, &cma_xprt->sc_xprt.xpt_flags);



2023-06-09 20:47:52

by Tom Talpey

Subject: Re: [PATCH v1 1/4] svcrdma: Allocate new transports on device's NUMA node

On 6/5/2023 9:11 AM, Chuck Lever wrote:
> From: Chuck Lever <[email protected]>
>
> The physical device's NUMA node ID is available when allocating an
> svc_xprt for an incoming connection. Use that value to ensure the
> svc_xprt structure is allocated on the NUMA node closest to the
> device.

How is "closest" determined from the device's perspective? While I agree
that allocating DMA-able control structures on the same CPU socket is
good for signaling latency, it may or may not be the best choice for
interrupt processing. And, some architectures push the NUMA envelope
pretty far, such as presenting memory-only NUMA nodes, or highly
asymmetrical processor cores. How are those accounted for?

Would it make more sense to push this affinity down to the CQ level,
associating them with selected cores? One obvious strategy might be
to affinitize the send and receive cq's to different cores, and
carefully place send-side and receive-side contexts on separate
affinitized cachelines, to avoid pingponging.

I'm not against the idea, just wondering if it goes far enough. Do
you have numbers, and if so, on what platforms?

Tom.

> Signed-off-by: Chuck Lever <[email protected]>
> ---
> net/sunrpc/xprtrdma/svc_rdma_transport.c | 18 +++++++++---------
> 1 file changed, 9 insertions(+), 9 deletions(-)
>
> diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
> index ca04f7a6a085..2abd895046ee 100644
> --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
> +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
> @@ -64,7 +64,7 @@
> #define RPCDBG_FACILITY RPCDBG_SVCXPRT
>
> static struct svcxprt_rdma *svc_rdma_create_xprt(struct svc_serv *serv,
> - struct net *net);
> + struct net *net, int node);
> static struct svc_xprt *svc_rdma_create(struct svc_serv *serv,
> struct net *net,
> struct sockaddr *sa, int salen,
> @@ -123,14 +123,14 @@ static void qp_event_handler(struct ib_event *event, void *context)
> }
>
> static struct svcxprt_rdma *svc_rdma_create_xprt(struct svc_serv *serv,
> - struct net *net)
> + struct net *net, int node)
> {
> - struct svcxprt_rdma *cma_xprt = kzalloc(sizeof *cma_xprt, GFP_KERNEL);
> + struct svcxprt_rdma *cma_xprt;
>
> - if (!cma_xprt) {
> - dprintk("svcrdma: failed to create new transport\n");
> + cma_xprt = kzalloc_node(sizeof(*cma_xprt), GFP_KERNEL, node);
> + if (!cma_xprt)
> return NULL;
> - }
> +
> svc_xprt_init(net, &svc_rdma_class, &cma_xprt->sc_xprt, serv);
> INIT_LIST_HEAD(&cma_xprt->sc_accept_q);
> INIT_LIST_HEAD(&cma_xprt->sc_rq_dto_q);
> @@ -193,9 +193,9 @@ static void handle_connect_req(struct rdma_cm_id *new_cma_id,
> struct svcxprt_rdma *newxprt;
> struct sockaddr *sa;
>
> - /* Create a new transport */
> newxprt = svc_rdma_create_xprt(listen_xprt->sc_xprt.xpt_server,
> - listen_xprt->sc_xprt.xpt_net);
> + listen_xprt->sc_xprt.xpt_net,
> + ibdev_to_node(new_cma_id->device));
> if (!newxprt)
> return;
> newxprt->sc_cm_id = new_cma_id;
> @@ -304,7 +304,7 @@ static struct svc_xprt *svc_rdma_create(struct svc_serv *serv,
>
> if (sa->sa_family != AF_INET && sa->sa_family != AF_INET6)
> return ERR_PTR(-EAFNOSUPPORT);
> - cma_xprt = svc_rdma_create_xprt(serv, net);
> + cma_xprt = svc_rdma_create_xprt(serv, net, NUMA_NO_NODE);
> if (!cma_xprt)
> return ERR_PTR(-ENOMEM);
> set_bit(XPT_LISTENER, &cma_xprt->sc_xprt.xpt_flags);
>
>
>

2023-06-09 21:19:00

by Chuck Lever

Subject: Re: [PATCH v1 1/4] svcrdma: Allocate new transports on device's NUMA node



> On Jun 9, 2023, at 4:40 PM, Tom Talpey <[email protected]> wrote:
>
> On 6/5/2023 9:11 AM, Chuck Lever wrote:
>> From: Chuck Lever <[email protected]>
>> The physical device's NUMA node ID is available when allocating an
>> svc_xprt for an incoming connection. Use that value to ensure the
>> svc_xprt structure is allocated on the NUMA node closest to the
>> device.
>
> How is "closest" determined from the device's perspective?

That's up to the device driver and network core.
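
ibdev_to_node() simply reports the NUMA node the driver recorded for
the device's parent (typically the PCI function). Roughly:

	static inline int ibdev_to_node(struct ib_device *ibdev)
	{
		struct device *parent = ibdev->dev.parent;

		if (!parent)
			return NUMA_NO_NODE;
		return dev_to_node(parent);
	}

So "closest" is whatever node the platform and the device driver have
assigned to the adapter; these patches just stop second-guessing it.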


> While I agree
> that allocating DMA-able control structures on the same CPU socket is
> good for signaling latency, it may or may not be the best choice for
> interrupt processing. And, some architectures push the NUMA envelope
> pretty far, such as presenting memory-only NUMA nodes, or highly
> asymmetrical processor cores. How are those accounted for?

My understanding is devices are never attached to such nodes.
Anyway, I don't intend this patch to address that situation.


> Would it make more sense to push this affinity down to the CQ level,
> associating them with selected cores? One obvious strategy might be
> to affinitize the send and receive cq's to different cores, and
> carefully place send-side and receive-side contexts on separate
> affinitized cachelines, to avoid pingponging.

That is already done. Each CQ is handled on one core, and both
NFS's and NFSD's transport setup code is careful to allocate
the Receive CQ and Send CQ on different cores from each other
when more than one core is available.
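
Concretely, the accept path creates the two CQs back to back and lets
the core spread them over the device's completion vectors; a rough
sketch of what svc_rdma_accept() does (not an exact quote of the
current code):

	/* ib_alloc_cq_any() picks a completion vector for us, so
	 * consecutive calls tend to land on different vectors/cores.
	 */
	newxprt->sc_sq_cq = ib_alloc_cq_any(dev, newxprt,
					    newxprt->sc_sq_depth,
					    IB_POLL_WORKQUEUE);
	if (IS_ERR(newxprt->sc_sq_cq))
		goto errout;
	newxprt->sc_rq_cq = ib_alloc_cq_any(dev, newxprt, rq_depth,
					    IB_POLL_WORKQUEUE);
	if (IS_ERR(newxprt->sc_rq_cq))
		goto errout;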

It's up to the device driver and the system administrator to
ensure that the IRQs are affined to cores on the same node as
the device.


> I'm not against the idea, just wondering if it goes far enough. Do
> you have numbers, and if so, on what platforms?

I have a two-node system here, configured such that each node
gets its own nfsd thread pool. The difference in performance
is a change in the latency distribution curve... it's narrower
when memory participating in DMA lives on the same node as the
device, which lowers the average completion latency.

But the send and receive ctxts (in the subsequent patches) are
already allocated on the same node as the CQ most of the time.

It's patch 1/4 that ensures that the xprt is also on that node:
that's done because the completion handlers access a bunch of
fields in svcxprt_rdma.

I think the main purpose of these patches is documentary; but
I do see a small uptick in performance with 1/4.


> Tom.

--
Chuck Lever