2023-06-06 17:42:58

by Chuck Lever

Subject: [PATCH v1 0/4] svcrdma: Go back to releasing pages-under-I/O

Return to the behavior of releasing reply buffer pages as part of
sending an RPC Reply over RDMA. I measured a performance improvement
(documented in 4/4). Because this matches the page-release behavior
of the socket transports, it should also let us share a little more
code between transports as MSG_SPLICE_PAGES rolls out.

---

Chuck Lever (4):
SUNRPC: Revert cc93ce9529a6
SUNRPC: Revert 579900670ac7
SUNRPC: Revert 2a1e4f21d841
SUNRPC: Optimize page release in svc_rdma_sendto()


include/linux/sunrpc/svc_rdma.h | 4 +-
net/sunrpc/xprtrdma/svc_rdma_backchannel.c | 8 +---
net/sunrpc/xprtrdma/svc_rdma_sendto.c | 53 ++++++++++++++--------
3 files changed, 38 insertions(+), 27 deletions(-)

--
Chuck Lever



2023-06-06 17:42:58

by Chuck Lever

Subject: [PATCH v1 1/4] SUNRPC: Revert cc93ce9529a6

From: Chuck Lever <[email protected]>

Prerequisite for releasing pages in the send completion handler.

Signed-off-by: Chuck Lever <[email protected]>
---
net/sunrpc/xprtrdma/svc_rdma_sendto.c | 5 -----
1 file changed, 5 deletions(-)

diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
index a35d1e055b1a..8e7ccef74207 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
@@ -975,11 +975,6 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
         ret = svc_rdma_send_reply_msg(rdma, sctxt, rctxt, rqstp);
         if (ret < 0)
                 goto put_ctxt;
-
-        /* Prevent svc_xprt_release() from releasing the page backing
-         * rq_res.head[0].iov_base. It's no longer being accessed by
-         * the I/O device. */
-        rqstp->rq_respages++;
         return 0;
 
 reply_chunk:



2023-06-06 17:44:14

by Chuck Lever

Subject: [PATCH v1 3/4] SUNRPC: Revert 2a1e4f21d841

From: Chuck Lever <[email protected]>

Prerequisite for releasing pages in the send completion handler.

Signed-off-by: Chuck Lever <[email protected]>
---
include/linux/sunrpc/svc_rdma.h | 1 -
net/sunrpc/xprtrdma/svc_rdma_backchannel.c | 8 +-------
net/sunrpc/xprtrdma/svc_rdma_sendto.c | 27 ++++++++++++---------------
3 files changed, 13 insertions(+), 23 deletions(-)

diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h
index 8e654da55170..a5ee0af2a310 100644
--- a/include/linux/sunrpc/svc_rdma.h
+++ b/include/linux/sunrpc/svc_rdma.h
@@ -154,7 +154,6 @@ struct svc_rdma_send_ctxt {
 
         struct ib_send_wr        sc_send_wr;
         struct ib_cqe            sc_cqe;
-        struct completion        sc_done;
         struct xdr_buf           sc_hdrbuf;
         struct xdr_stream        sc_stream;
         void                     *sc_xprt_buf;
diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
index aa2227a7e552..7420a2c990c7 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
@@ -93,13 +93,7 @@ static int svc_rdma_bc_sendto(struct svcxprt_rdma *rdma,
          */
         get_page(virt_to_page(rqst->rq_buffer));
         sctxt->sc_send_wr.opcode = IB_WR_SEND;
-        ret = svc_rdma_send(rdma, sctxt);
-        if (ret < 0)
-                return ret;
-
-        ret = wait_for_completion_killable(&sctxt->sc_done);
-        svc_rdma_send_ctxt_put(rdma, sctxt);
-        return ret;
+        return svc_rdma_send(rdma, sctxt);
 }
 
 /* Server-side transport endpoint wants a whole page for its send
diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
index 4c62bc41ea40..1ae4236d04a3 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
@@ -147,7 +147,6 @@ svc_rdma_send_ctxt_alloc(struct svcxprt_rdma *rdma)
         ctxt->sc_send_wr.wr_cqe = &ctxt->sc_cqe;
         ctxt->sc_send_wr.sg_list = ctxt->sc_sges;
         ctxt->sc_send_wr.send_flags = IB_SEND_SIGNALED;
-        init_completion(&ctxt->sc_done);
         ctxt->sc_cqe.done = svc_rdma_wc_send;
         ctxt->sc_xprt_buf = buffer;
         xdr_buf_init(&ctxt->sc_hdrbuf, ctxt->sc_xprt_buf,
@@ -286,12 +285,12 @@ static void svc_rdma_wc_send(struct ib_cq *cq, struct ib_wc *wc)
                 container_of(cqe, struct svc_rdma_send_ctxt, sc_cqe);
 
         svc_rdma_wake_send_waiters(rdma, 1);
-        complete(&ctxt->sc_done);
 
         if (unlikely(wc->status != IB_WC_SUCCESS))
                 goto flushed;
 
         trace_svcrdma_wc_send(wc, &ctxt->sc_cid);
+        svc_rdma_send_ctxt_put(rdma, ctxt);
         return;
 
 flushed:
@@ -299,6 +298,7 @@ static void svc_rdma_wc_send(struct ib_cq *cq, struct ib_wc *wc)
                 trace_svcrdma_wc_send_err(wc, &ctxt->sc_cid);
         else
                 trace_svcrdma_wc_send_flush(wc, &ctxt->sc_cid);
+        svc_rdma_send_ctxt_put(rdma, ctxt);
         svc_xprt_deferred_close(&rdma->sc_xprt);
 }
 
@@ -315,8 +315,6 @@ int svc_rdma_send(struct svcxprt_rdma *rdma, struct svc_rdma_send_ctxt *ctxt)
         struct ib_send_wr *wr = &ctxt->sc_send_wr;
         int ret;
 
-        reinit_completion(&ctxt->sc_done);
-
         /* Sync the transport header buffer */
         ib_dma_sync_single_for_device(rdma->sc_pd->device,
                                       wr->sg_list[0].addr,
@@ -808,8 +806,8 @@ int svc_rdma_map_reply_msg(struct svcxprt_rdma *rdma,
  * svc_rdma_sendto returns. Transfer pages under I/O to the ctxt
  * so they are released by the Send completion handler.
  */
-static inline void svc_rdma_save_io_pages(struct svc_rqst *rqstp,
-                                          struct svc_rdma_send_ctxt *ctxt)
+static void svc_rdma_save_io_pages(struct svc_rqst *rqstp,
+                                   struct svc_rdma_send_ctxt *ctxt)
 {
         int i, pages = rqstp->rq_next_page - rqstp->rq_respages;
 
@@ -852,6 +850,8 @@ static int svc_rdma_send_reply_msg(struct svcxprt_rdma *rdma,
         if (ret < 0)
                 return ret;
 
+        svc_rdma_save_io_pages(rqstp, sctxt);
+
         if (rctxt->rc_inv_rkey) {
                 sctxt->sc_send_wr.opcode = IB_WR_SEND_WITH_INV;
                 sctxt->sc_send_wr.ex.invalidate_rkey = rctxt->rc_inv_rkey;
@@ -859,13 +859,7 @@ static int svc_rdma_send_reply_msg(struct svcxprt_rdma *rdma,
                 sctxt->sc_send_wr.opcode = IB_WR_SEND;
         }
 
-        ret = svc_rdma_send(rdma, sctxt);
-        if (ret < 0)
-                return ret;
-
-        ret = wait_for_completion_killable(&sctxt->sc_done);
-        svc_rdma_send_ctxt_put(rdma, sctxt);
-        return ret;
+        return svc_rdma_send(rdma, sctxt);
 }
 
 /**
@@ -931,8 +925,7 @@ void svc_rdma_send_error_msg(struct svcxprt_rdma *rdma,
         sctxt->sc_sges[0].length = sctxt->sc_hdrbuf.len;
         if (svc_rdma_send(rdma, sctxt))
                 goto put_ctxt;
-
-        wait_for_completion_killable(&sctxt->sc_done);
+        return;
 
 put_ctxt:
         svc_rdma_send_ctxt_put(rdma, sctxt);
@@ -1006,6 +999,10 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
         if (ret != -E2BIG && ret != -EINVAL)
                 goto put_ctxt;
 
+        /* Send completion releases payload pages that were part
+         * of previously posted RDMA Writes.
+         */
+        svc_rdma_save_io_pages(rqstp, sctxt);
         svc_rdma_send_error_msg(rdma, sctxt, rctxt, ret);
         return 0;

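For context on the svc_rdma_save_io_pages() calls added above: as the
comment over that helper says, it moves the pages still under I/O
from the svc_rqst to the send ctxt so that svc_xprt_release() does
not free them while the Send is posted. Roughly (a sketch using the
existing field names, not necessarily the exact upstream body):

#include <linux/sunrpc/svc.h>
#include <linux/sunrpc/svc_rdma.h>

static void sketch_save_io_pages(struct svc_rqst *rqstp,
                                 struct svc_rdma_send_ctxt *ctxt)
{
        int i, pages = rqstp->rq_next_page - rqstp->rq_respages;

        /* Hand each page under I/O to the ctxt; the pages are
         * dropped later by svc_rdma_send_ctxt_put() once the Send
         * completes.
         */
        ctxt->sc_page_count += pages;
        for (i = 0; i < pages; i++) {
                ctxt->sc_pages[i] = rqstp->rq_respages[i];
                rqstp->rq_respages[i] = NULL;
        }

        /* Leave nothing behind for svc_xprt_release() to free */
        rqstp->rq_next_page = rqstp->rq_respages;
}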



2023-06-06 17:44:44

by Chuck Lever

Subject: [PATCH v1 4/4] SUNRPC: Optimize page release in svc_rdma_sendto()

From: Chuck Lever <[email protected]>

Now that we have bulk page allocation and release APIs, it's more
efficient to use those than it is for nfsd threads to wait for send
completions. Previous patches have eliminated the calls to
wait_for_completion() and complete(), in order to avoid scheduler
overhead.

Now release pages-under-I/O in the send completion handler using
the efficient bulk release API.
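
As a minimal sketch of the two release patterns (the helper names
here are hypothetical; the real change is the one-line swap inside
svc_rdma_send_ctxt_put() shown in the diff below):

#include <linux/mm.h>           /* put_page() */
#include <linux/pagemap.h>      /* release_pages() */

/* Illustration only: drop the references on an array of pages. */
static void release_one_at_a_time(struct page **pages, int nr)
{
        int i;

        /* one refcount operation per page */
        for (i = 0; i < nr; i++)
                put_page(pages[i]);
}

static void release_in_bulk(struct page **pages, int nr)
{
        /* batches the refcount and LRU work across the whole array */
        if (nr)
                release_pages(pages, nr);
}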

I've measured a 7% reduction in cumulative CPU utilization in
svc_rdma_sendto(), svc_rdma_wc_send(), and svc_xprt_release(). In
particular, using release_pages() instead of complete() cuts the
time per svc_rdma_wc_send() call by two-thirds. This helps improve
scalability because svc_rdma_wc_send() is single-threaded per
connection.

Signed-off-by: Chuck Lever <[email protected]>
---
net/sunrpc/xprtrdma/svc_rdma_sendto.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
index 1ae4236d04a3..24228f3611e8 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
@@ -236,8 +236,8 @@ void svc_rdma_send_ctxt_put(struct svcxprt_rdma *rdma,
         struct ib_device *device = rdma->sc_cm_id->device;
         unsigned int i;
 
-        for (i = 0; i < ctxt->sc_page_count; ++i)
-                put_page(ctxt->sc_pages[i]);
+        if (ctxt->sc_page_count)
+                release_pages(ctxt->sc_pages, ctxt->sc_page_count);
 
         /* The first SGE contains the transport header, which
          * remains mapped until @ctxt is destroyed.



2023-06-09 20:44:28

by Tom Talpey

Subject: Re: [PATCH v1 4/4] SUNRPC: Optimize page release in svc_rdma_sendto()

On 6/6/2023 1:33 PM, Chuck Lever wrote:
> From: Chuck Lever <[email protected]>
>
> Now that we have bulk page allocation and release APIs, it's more
> efficient to use those than it is for nfsd threads to wait for send
> completions. Previous patches have eliminated the calls to
> wait_for_completion() and complete(), in order to avoid scheduler
> overhead.

Makes sense to me. Have you considered changing the send cq arming
to avoid completion notifications, and simply polling completions
from the cq at future convenient/efficient times?
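
For illustration, a rough sketch of that model using the core verbs
ib_poll_cq() and ib_req_notify_cq(). This is not how svcrdma drives
its CQs today, and the helper name is made up:

#include <rdma/ib_verbs.h>

/* Sketch only: drain a send CQ by polling, without re-arming it, so
 * the provider raises no completion interrupts between drains.
 */
static void example_drain_send_cq(struct ib_cq *cq)
{
        struct ib_wc wc;

        while (ib_poll_cq(cq, 1, &wc) > 0) {
                /* reap the resources tied to wc.wr_cqe here */
        }

        /* A caller that wants to sleep until the next completion
         * would re-arm with ib_req_notify_cq(cq, IB_CQ_NEXT_COMP).
         */
}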

Reviewed-by: Tom Talpey <[email protected]>

Tom.


2023-06-09 21:13:01

by Chuck Lever III

Subject: Re: [PATCH v1 4/4] SUNRPC: Optimize page release in svc_rdma_sendto()



> On Jun 9, 2023, at 4:31 PM, Tom Talpey <[email protected]> wrote:
>
> On 6/6/2023 1:33 PM, Chuck Lever wrote:
>> From: Chuck Lever <[email protected]>
>> Now that we have bulk page allocation and release APIs, it's more
>> efficient to use those than it is for nfsd threads to wait for send
>> completions. Previous patches have eliminated the calls to
>> wait_for_completion() and complete(), in order to avoid scheduler
>> overhead.
>
> Makes sense to me. Have you considered changing the send cq arming
> to avoid completion notifications and simply poll them from the cq
> at future convenient/efficient times?

That doesn't work for Read completions: those need to wake a new
nfsd as soon as they complete.

Send and Write completions don't need timely handling unless the
SQ is out of SQEs. It would be nice to move that processing out
of the CQ handler.

Otherwise, no, I haven't considered leaving the Send CQ disarmed.
I assumed that, at least on mlx5 hardware, there is some interrupt
mitigation that helps in this regard.



--
Chuck Lever



2023-06-09 22:05:21

by Tom Talpey

Subject: Re: [PATCH v1 4/4] SUNRPC: Optimize page release in svc_rdma_sendto()

On 6/9/2023 4:49 PM, Chuck Lever III wrote:
>
>
>> On Jun 9, 2023, at 4:31 PM, Tom Talpey <[email protected]> wrote:
>>
>> On 6/6/2023 1:33 PM, Chuck Lever wrote:
>>> From: Chuck Lever <[email protected]>
>>> Now that we have bulk page allocation and release APIs, it's more
>>> efficient to use those than it is for nfsd threads to wait for send
>>> completions. Previous patches have eliminated the calls to
>>> wait_for_completion() and complete(), in order to avoid scheduler
>>> overhead.
>>
>> Makes sense to me. Have you considered changing the send cq arming
>> to avoid completion notifications and simply poll them from the cq
>> at future convenient/efficient times?
>
> That doesn't work for Read completions: those need to wake a new
> nfsd as soon as they complete.
>
> Send and Write completion don't need timely handling unless the
> SQ is out of SQEs. It would be nice to move that processing out
> of the CQ handler.

Yes, of course. That's my point: the receive cq and send cq have
very different requirements, even more so with this change.

> Otherwise, no, I haven't considered leaving the Send CQ disarmed.
> I assumed that, at least on mlx5 hardware, there is some interrupt
> mitigation that helps in this regard.

You definitely do not want interrupt mitigation; it will kill IOPS.
Say the hardware pauses 10us before delivering an interrupt, in case
another comes in shortly after. Ok, in a single-threaded workload you
now have a max of 100K IOPS: with one I/O outstanding at a time, a
10us delay per completion caps you at 1/10us = 100,000 per second.

Interrupt management needs to be done from the upper layer. Which is
what cq arming is all about.

Food for thought.

Tom.
