LinuxLists.cc - [PATCH net-next v2 10/17] nvme: Use sendmsg(MSG_SPLICE

2023-06-17 12:21:07

Subject: [PATCH net-next v2 10/17] nvme: Use sendmsg(MSG_SPLICE_PAGES) rather then sendpage

When transmitting data, call down into TCP using a single sendmsg with
MSG_SPLICE_PAGES to indicate that content should be spliced rather than
performing several sendmsg and sendpage calls to transmit header, data
pages and trailer.

Signed-off-by: David Howells <[email protected]>
cc: Keith Busch <[email protected]>
cc: Jens Axboe <[email protected]>
cc: Christoph Hellwig <[email protected]>
cc: Sagi Grimberg <[email protected]>
cc: Chaitanya Kulkarni <[email protected]>
cc: "David S. Miller" <[email protected]>
cc: Eric Dumazet <[email protected]>
cc: Jakub Kicinski <[email protected]>
cc: Paolo Abeni <[email protected]>
cc: Jens Axboe <[email protected]>
cc: Matthew Wilcox <[email protected]>
cc: [email protected]
cc: [email protected]
---

Notes:
ver #2)
- Wrap lines at 80.

drivers/nvme/host/tcp.c | 46 ++++++++++++++++++++-------------------
drivers/nvme/target/tcp.c | 46 ++++++++++++++++++++++++---------------
2 files changed, 53 insertions(+), 39 deletions(-)

diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index bf0230442d57..6f31cdbb696a 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -997,25 +997,25 @@ static int nvme_tcp_try_send_data(struct nvme_tcp_request *req)
u32 h2cdata_left = req->h2cdata_left;

while (true) {
+ struct bio_vec bvec;
+ struct msghdr msg = {
+ .msg_flags = MSG_DONTWAIT | MSG_SPLICE_PAGES,
+ };
struct page *page = nvme_tcp_req_cur_page(req);
size_t offset = nvme_tcp_req_cur_offset(req);
size_t len = nvme_tcp_req_cur_length(req);
bool last = nvme_tcp_pdu_last_send(req, len);
int req_data_sent = req->data_sent;
- int ret, flags = MSG_DONTWAIT;
+ int ret;

if (last && !queue->data_digest && !nvme_tcp_queue_more(queue))
- flags |= MSG_EOR;
+ msg.msg_flags |= MSG_EOR;
else
- flags |= MSG_MORE | MSG_SENDPAGE_NOTLAST;
+ msg.msg_flags |= MSG_MORE;

- if (sendpage_ok(page)) {
- ret = kernel_sendpage(queue->sock, page, offset, len,
- flags);
- } else {
- ret = sock_no_sendpage(queue->sock, page, offset, len,
- flags);
- }
+ bvec_set_page(&bvec, page, len, offset);
+ iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, len);
+ ret = sock_sendmsg(queue->sock, &msg);
if (ret <= 0)
return ret;

@@ -1054,22 +1054,24 @@ static int nvme_tcp_try_send_cmd_pdu(struct nvme_tcp_request *req)
{
struct nvme_tcp_queue *queue = req->queue;
struct nvme_tcp_cmd_pdu *pdu = nvme_tcp_req_cmd_pdu(req);
+ struct bio_vec bvec;
+ struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_SPLICE_PAGES, };
bool inline_data = nvme_tcp_has_inline_data(req);
u8 hdgst = nvme_tcp_hdgst_len(queue);
int len = sizeof(*pdu) + hdgst - req->offset;
- int flags = MSG_DONTWAIT;
int ret;

if (inline_data || nvme_tcp_queue_more(queue))
- flags |= MSG_MORE | MSG_SENDPAGE_NOTLAST;
+ msg.msg_flags |= MSG_MORE;
else
- flags |= MSG_EOR;
+ msg.msg_flags |= MSG_EOR;

if (queue->hdr_digest && !req->offset)
nvme_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));

- ret = kernel_sendpage(queue->sock, virt_to_page(pdu),
- offset_in_page(pdu) + req->offset, len, flags);
+ bvec_set_virt(&bvec, (void *)pdu + req->offset, len);
+ iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, len);
+ ret = sock_sendmsg(queue->sock, &msg);
if (unlikely(ret <= 0))
return ret;

@@ -1093,6 +1095,8 @@ static int nvme_tcp_try_send_data_pdu(struct nvme_tcp_request *req)
{
struct nvme_tcp_queue *queue = req->queue;
struct nvme_tcp_data_pdu *pdu = nvme_tcp_req_data_pdu(req);
+ struct bio_vec bvec;
+ struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_MORE, };
u8 hdgst = nvme_tcp_hdgst_len(queue);
int len = sizeof(*pdu) - req->offset + hdgst;
int ret;
@@ -1101,13 +1105,11 @@ static int nvme_tcp_try_send_data_pdu(struct nvme_tcp_request *req)
nvme_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));

if (!req->h2cdata_left)
- ret = kernel_sendpage(queue->sock, virt_to_page(pdu),
- offset_in_page(pdu) + req->offset, len,
- MSG_DONTWAIT | MSG_MORE | MSG_SENDPAGE_NOTLAST);
- else
- ret = sock_no_sendpage(queue->sock, virt_to_page(pdu),
- offset_in_page(pdu) + req->offset, len,
- MSG_DONTWAIT | MSG_MORE);
+ msg.msg_flags |= MSG_SPLICE_PAGES;
+
+ bvec_set_virt(&bvec, (void *)pdu + req->offset, len);
+ iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, len);
+ ret = sock_sendmsg(queue->sock, &msg);
if (unlikely(ret <= 0))
return ret;

diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c
index ed98df72c76b..868aa4de2e4c 100644
--- a/drivers/nvme/target/tcp.c
+++ b/drivers/nvme/target/tcp.c
@@ -576,13 +576,17 @@ static void nvmet_tcp_execute_request(struct nvmet_tcp_cmd *cmd)

static int nvmet_try_send_data_pdu(struct nvmet_tcp_cmd *cmd)
{
+ struct msghdr msg = {
+ .msg_flags = MSG_DONTWAIT | MSG_MORE | MSG_SPLICE_PAGES,
+ };
+ struct bio_vec bvec;
u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);
int left = sizeof(*cmd->data_pdu) - cmd->offset + hdgst;
int ret;

- ret = kernel_sendpage(cmd->queue->sock, virt_to_page(cmd->data_pdu),
- offset_in_page(cmd->data_pdu) + cmd->offset,
- left, MSG_DONTWAIT | MSG_MORE | MSG_SENDPAGE_NOTLAST);
+ bvec_set_virt(&bvec, (void *)cmd->data_pdu + cmd->offset, left);
+ iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, left);
+ ret = sock_sendmsg(cmd->queue->sock, &msg);
if (ret <= 0)
return ret;

@@ -603,17 +607,21 @@ static int nvmet_try_send_data(struct nvmet_tcp_cmd *cmd, bool last_in_batch)
int ret;

while (cmd->cur_sg) {
+ struct msghdr msg = {
+ .msg_flags = MSG_DONTWAIT | MSG_SPLICE_PAGES,
+ };
struct page *page = sg_page(cmd->cur_sg);
+ struct bio_vec bvec;
u32 left = cmd->cur_sg->length - cmd->offset;
- int flags = MSG_DONTWAIT;

if ((!last_in_batch && cmd->queue->send_list_len) ||
cmd->wbytes_done + left < cmd->req.transfer_len ||
queue->data_digest || !queue->nvme_sq.sqhd_disabled)
- flags |= MSG_MORE | MSG_SENDPAGE_NOTLAST;
+ msg.msg_flags |= MSG_MORE;

- ret = kernel_sendpage(cmd->queue->sock, page, cmd->offset,
- left, flags);
+ bvec_set_page(&bvec, page, left, cmd->offset);
+ iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, left);
+ ret = sock_sendmsg(cmd->queue->sock, &msg);
if (ret <= 0)
return ret;

@@ -649,18 +657,20 @@ static int nvmet_try_send_data(struct nvmet_tcp_cmd *cmd, bool last_in_batch)
static int nvmet_try_send_response(struct nvmet_tcp_cmd *cmd,
bool last_in_batch)
{
+ struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_SPLICE_PAGES, };
+ struct bio_vec bvec;
u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);
int left = sizeof(*cmd->rsp_pdu) - cmd->offset + hdgst;
- int flags = MSG_DONTWAIT;
int ret;

if (!last_in_batch && cmd->queue->send_list_len)
- flags |= MSG_MORE | MSG_SENDPAGE_NOTLAST;
+ msg.msg_flags |= MSG_MORE;
else
- flags |= MSG_EOR;
+ msg.msg_flags |= MSG_EOR;

- ret = kernel_sendpage(cmd->queue->sock, virt_to_page(cmd->rsp_pdu),
- offset_in_page(cmd->rsp_pdu) + cmd->offset, left, flags);
+ bvec_set_virt(&bvec, (void *)cmd->rsp_pdu + cmd->offset, left);
+ iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, left);
+ ret = sock_sendmsg(cmd->queue->sock, &msg);
if (ret <= 0)
return ret;
cmd->offset += ret;
@@ -677,18 +687,20 @@ static int nvmet_try_send_response(struct nvmet_tcp_cmd *cmd,

static int nvmet_try_send_r2t(struct nvmet_tcp_cmd *cmd, bool last_in_batch)
{
+ struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_SPLICE_PAGES, };
+ struct bio_vec bvec;
u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);
int left = sizeof(*cmd->r2t_pdu) - cmd->offset + hdgst;
- int flags = MSG_DONTWAIT;
int ret;

if (!last_in_batch && cmd->queue->send_list_len)
- flags |= MSG_MORE | MSG_SENDPAGE_NOTLAST;
+ msg.msg_flags |= MSG_MORE;
else
- flags |= MSG_EOR;
+ msg.msg_flags |= MSG_EOR;

- ret = kernel_sendpage(cmd->queue->sock, virt_to_page(cmd->r2t_pdu),
- offset_in_page(cmd->r2t_pdu) + cmd->offset, left, flags);
+ bvec_set_virt(&bvec, (void *)cmd->r2t_pdu + cmd->offset, left);
+ iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, left);
+ ret = sock_sendmsg(cmd->queue->sock, &msg);
if (ret <= 0)
return ret;
cmd->offset += ret;

2023-06-18 16:57:19

by Willem de Bruijn

[permalink] [raw]

Subject: RE: [PATCH net-next v2 10/17] nvme: Use sendmsg(MSG_SPLICE_PAGES) rather then sendpage

David Howells wrote:
> When transmitting data, call down into TCP using a single sendmsg with
> MSG_SPLICE_PAGES to indicate that content should be spliced rather than
> performing several sendmsg and sendpage calls to transmit header, data
> pages and trailer.
>
> Signed-off-by: David Howells <[email protected]>
> cc: Keith Busch <[email protected]>
> cc: Jens Axboe <[email protected]>
> cc: Christoph Hellwig <[email protected]>
> cc: Sagi Grimberg <[email protected]>
> cc: Chaitanya Kulkarni <[email protected]>
> cc: "David S. Miller" <[email protected]>
> cc: Eric Dumazet <[email protected]>
> cc: Jakub Kicinski <[email protected]>
> cc: Paolo Abeni <[email protected]>
> cc: Jens Axboe <[email protected]>
> cc: Matthew Wilcox <[email protected]>
> cc: [email protected]
> cc: [email protected]
> ---
>
> Notes:
> ver #2)
> - Wrap lines at 80.
>
> drivers/nvme/host/tcp.c | 46 ++++++++++++++++++++-------------------
> drivers/nvme/target/tcp.c | 46 ++++++++++++++++++++++++---------------
> 2 files changed, 53 insertions(+), 39 deletions(-)
>
> diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
> index bf0230442d57..6f31cdbb696a 100644
> --- a/drivers/nvme/host/tcp.c
> +++ b/drivers/nvme/host/tcp.c
> @@ -997,25 +997,25 @@ static int nvme_tcp_try_send_data(struct nvme_tcp_request *req)
> u32 h2cdata_left = req->h2cdata_left;
>
> while (true) {
> + struct bio_vec bvec;
> + struct msghdr msg = {
> + .msg_flags = MSG_DONTWAIT | MSG_SPLICE_PAGES,
> + };
> struct page *page = nvme_tcp_req_cur_page(req);
> size_t offset = nvme_tcp_req_cur_offset(req);
> size_t len = nvme_tcp_req_cur_length(req);
> bool last = nvme_tcp_pdu_last_send(req, len);
> int req_data_sent = req->data_sent;
> - int ret, flags = MSG_DONTWAIT;
> + int ret;
>
> if (last && !queue->data_digest && !nvme_tcp_queue_more(queue))
> - flags |= MSG_EOR;
> + msg.msg_flags |= MSG_EOR;
> else
> - flags |= MSG_MORE | MSG_SENDPAGE_NOTLAST;
> + msg.msg_flags |= MSG_MORE;
>
> - if (sendpage_ok(page)) {
> - ret = kernel_sendpage(queue->sock, page, offset, len,
> - flags);
> - } else {
> - ret = sock_no_sendpage(queue->sock, page, offset, len,
> - flags);
> - }
> + bvec_set_page(&bvec, page, len, offset);
> + iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, len);
> + ret = sock_sendmsg(queue->sock, &msg);
> if (ret <= 0)
> return ret;
>
> @@ -1054,22 +1054,24 @@ static int nvme_tcp_try_send_cmd_pdu(struct nvme_tcp_request *req)
> {
> struct nvme_tcp_queue *queue = req->queue;
> struct nvme_tcp_cmd_pdu *pdu = nvme_tcp_req_cmd_pdu(req);
> + struct bio_vec bvec;
> + struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_SPLICE_PAGES, };
> bool inline_data = nvme_tcp_has_inline_data(req);
> u8 hdgst = nvme_tcp_hdgst_len(queue);
> int len = sizeof(*pdu) + hdgst - req->offset;
> - int flags = MSG_DONTWAIT;
> int ret;
>
> if (inline_data || nvme_tcp_queue_more(queue))
> - flags |= MSG_MORE | MSG_SENDPAGE_NOTLAST;
> + msg.msg_flags |= MSG_MORE;
> else
> - flags |= MSG_EOR;
> + msg.msg_flags |= MSG_EOR;
>
> if (queue->hdr_digest && !req->offset)
> nvme_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
>
> - ret = kernel_sendpage(queue->sock, virt_to_page(pdu),
> - offset_in_page(pdu) + req->offset, len, flags);
> + bvec_set_virt(&bvec, (void *)pdu + req->offset, len);
> + iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, len);
> + ret = sock_sendmsg(queue->sock, &msg);
> if (unlikely(ret <= 0))
> return ret;
>

struct bio_vec bvec;
struct msghdr msg = { .msg_flags = MSG_SPLICE_PAGES | ... };

..

bvec_set_virt
iov_iter_bvec
sock_sendmsg

is a frequent pattern. Does it make sense to define a wrapper? Same for bvec_set_page.

2023-06-18 17:45:11

by David Howells

[permalink] [raw]

Subject: Re: [PATCH net-next v2 10/17] nvme: Use sendmsg(MSG_SPLICE_PAGES) rather then sendpage

Willem de Bruijn <[email protected]> wrote:

> struct bio_vec bvec;
> struct msghdr msg = { .msg_flags = MSG_SPLICE_PAGES | ... };
>
> ..
>
> bvec_set_virt
> iov_iter_bvec
> sock_sendmsg
>
> is a frequent pattern. Does it make sense to define a wrapper? Same for bvec_set_page.

I dunno. I'm trying to move towards aggregating multiple pages in a bvec
before calling sendmsg if possible rather than doing it one page at a time,
but it's easier and more obvious in some places than others.

David

2023-06-19 08:33:49

by Sagi Grimberg

[permalink] [raw]

Subject: Re: [PATCH net-next v2 10/17] nvme: Use sendmsg(MSG_SPLICE_PAGES) rather then sendpage

>> struct bio_vec bvec;
>> struct msghdr msg = { .msg_flags = MSG_SPLICE_PAGES | ... };
>>
>> ..
>>
>> bvec_set_virt
>> iov_iter_bvec
>> sock_sendmsg
>>
>> is a frequent pattern. Does it make sense to define a wrapper? Same for bvec_set_page.
>
> I dunno. I'm trying to move towards aggregating multiple pages in a bvec
> before calling sendmsg if possible rather than doing it one page at a time,
> but it's easier and more obvious in some places than others.

That would be great to do, but nvme needs to calculate a data digest
and doing that in a separate scan of the payload is not very cache
friendly...

There is also the fact that the payload may be sent in portions
asynchronously driven by how the controller wants to accept them,
so there is some complexity there.

But worth looking at for sure.

The patch looks good to me, taking it to run some tests
(from sendpage-3-frag branch in your kernel.org tree correct?)

For now, you can add:
Reviewed-by: Sagi Grimberg <[email protected]>

2023-06-19 10:02:48

by David Howells

[permalink] [raw]

Subject: Re: [PATCH net-next v2 10/17] nvme: Use sendmsg(MSG_SPLICE_PAGES) rather then sendpage

Sagi Grimberg <[email protected]> wrote:

> The patch looks good to me, taking it to run some tests
> (from sendpage-3-frag branch in your kernel.org tree correct?)

Yep, but you'll patches 1 also and patch 2 might help with seeing what's going
on.

David

2023-06-19 12:25:51

by Willem de Bruijn

[permalink] [raw]

Subject: Re: [PATCH net-next v2 10/17] nvme: Use sendmsg(MSG_SPLICE_PAGES) rather then sendpage

David Howells wrote:
> Willem de Bruijn <[email protected]> wrote:
>
> > struct bio_vec bvec;
> > struct msghdr msg = { .msg_flags = MSG_SPLICE_PAGES | ... };
> >
> > ..
> >
> > bvec_set_virt
> > iov_iter_bvec
> > sock_sendmsg
> >
> > is a frequent pattern. Does it make sense to define a wrapper? Same for bvec_set_page.
>
> I dunno. I'm trying to move towards aggregating multiple pages in a bvec
> before calling sendmsg if possible rather than doing it one page at a time,
> but it's easier and more obvious in some places than others.

Ok. I just wanted to have your opinion. Fine to leave as is.

Acked-by: Willem de Bruijn <[email protected]>

2023-06-20 13:16:11

by Sagi Grimberg

[permalink] [raw]

Subject: Re: [PATCH net-next v2 10/17] nvme: Use sendmsg(MSG_SPLICE_PAGES) rather then sendpage

>>>      struct bio_vec bvec;
>>>      struct msghdr msg = { .msg_flags = MSG_SPLICE_PAGES | ... };
>>>
>>>      ..
>>>
>>>      bvec_set_virt
>>>      iov_iter_bvec
>>>      sock_sendmsg
>>>
>>> is a frequent pattern. Does it make sense to define a wrapper? Same
>>> for bvec_set_page.
>>
>> I dunno. I'm trying to move towards aggregating multiple pages in a bvec
>> before calling sendmsg if possible rather than doing it one page at a
>> time,
>> but it's easier and more obvious in some places than others.
>
> That would be great to do, but nvme needs to calculate a data digest
> and doing that in a separate scan of the payload is not very cache
> friendly...
>
> There is also the fact that the payload may be sent in portions
> asynchronously driven by how the controller wants to accept them,
> so there is some complexity there.
>
> But worth looking at for sure.
>
> The patch looks good to me, taking it to run some tests
> (from sendpage-3-frag branch in your kernel.org tree correct?)
>
> For now, you can add:
> Reviewed-by: Sagi Grimberg <[email protected]>

Patches seem to hold up.
Tested-by: Sagi Grimberg <[email protected]>

However if possible, can you please split nvme/host and nvme/target
changes? We try to separate host side and target side changes in the
same patch.