Here's the second tranche of patches towards providing an internal
MSG_SPLICE_PAGES sendmsg flag that is intended to replace the ->sendpage()
op with calls to sendmsg(). MSG_SPLICE_PAGES is a hint telling the protocol
that it should splice the pages supplied if it can and copy them if not.
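As a rough sketch of the calling convention (both conversions below follow
this pattern; page, offset and size stand for whatever the caller has to
hand):

	struct bio_vec bvec;
	struct msghdr msg = { .msg_flags = MSG_SPLICE_PAGES, };
	int ret;

	/* Hand the page itself to the protocol; it may take a ref on it
	 * and splice it in, or it may copy the contents instead.
	 */
	bvec_set_page(&bvec, page, size, offset);
	iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, size);
	ret = sock_sendmsg(sock, &msg);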
This set consists of the following parts:
(1) Implement MSG_SPLICE_PAGES support in Chelsio TLS and make
chtls_sendpage() just a wrapper around sendmsg().
(2) Implement MSG_SPLICE_PAGES support in AF_KCM and make kcm_sendpage()
just a wrapper around sendmsg().
I've pushed the patches here also:
https://git.kernel.org/pub/scm/linux/kernel/git/dhowells/linux-fs.git/log/?h=sendpage-2
David
Link: https://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next.git/commit/?id=51c78a4d532efe9543a4df019ff405f05c6157f6 # part 1
David Howells (4):
chelsio: Support MSG_SPLICE_PAGES
chelsio: Convert chtls_sendpage() to use MSG_SPLICE_PAGES
kcm: Support MSG_SPLICE_PAGES
kcm: Convert kcm_sendpage() to use MSG_SPLICE_PAGES
.../chelsio/inline_crypto/chtls/chtls_io.c | 121 ++--------
net/kcm/kcmsock.c | 216 +++++-------------
2 files changed, 74 insertions(+), 263 deletions(-)
Convert kcm_sendpage() to use sendmsg() with MSG_SPLICE_PAGES rather than
directly splicing in the pages itself.
This allows ->sendpage() to be replaced by something that can handle
multiple multipage folios in a single transaction.
Signed-off-by: David Howells <[email protected]>
cc: Tom Herbert <[email protected]>
cc: Tom Herbert <[email protected]>
cc: Jakub Kicinski <[email protected]>
cc: Eric Dumazet <[email protected]>
cc: "David S. Miller" <[email protected]>
cc: Paolo Abeni <[email protected]>
cc: Jens Axboe <[email protected]>
cc: Matthew Wilcox <[email protected]>
cc: [email protected]
---
net/kcm/kcmsock.c | 161 ++++++----------------------------------------
1 file changed, 18 insertions(+), 143 deletions(-)
diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c
index 411726d830c0..f6e0e017e3cc 100644
--- a/net/kcm/kcmsock.c
+++ b/net/kcm/kcmsock.c
@@ -761,149 +761,6 @@ static void kcm_push(struct kcm_sock *kcm)
kcm_write_msgs(kcm);
}
-static ssize_t kcm_sendpage(struct socket *sock, struct page *page,
- int offset, size_t size, int flags)
-
-{
- struct sock *sk = sock->sk;
- struct kcm_sock *kcm = kcm_sk(sk);
- struct sk_buff *skb = NULL, *head = NULL;
- long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
- bool eor;
- int err = 0;
- int i;
-
- if (flags & MSG_SENDPAGE_NOTLAST)
- flags |= MSG_MORE;
-
- /* No MSG_EOR from splice, only look at MSG_MORE */
- eor = !(flags & MSG_MORE);
-
- lock_sock(sk);
-
- sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
-
- err = -EPIPE;
- if (sk->sk_err)
- goto out_error;
-
- if (kcm->seq_skb) {
- /* Previously opened message */
- head = kcm->seq_skb;
- skb = kcm_tx_msg(head)->last_skb;
- i = skb_shinfo(skb)->nr_frags;
-
- if (skb_can_coalesce(skb, i, page, offset)) {
- skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], size);
- skb_shinfo(skb)->flags |= SKBFL_SHARED_FRAG;
- goto coalesced;
- }
-
- if (i >= MAX_SKB_FRAGS) {
- struct sk_buff *tskb;
-
- tskb = alloc_skb(0, sk->sk_allocation);
- while (!tskb) {
- kcm_push(kcm);
- err = sk_stream_wait_memory(sk, &timeo);
- if (err)
- goto out_error;
- }
-
- if (head == skb)
- skb_shinfo(head)->frag_list = tskb;
- else
- skb->next = tskb;
-
- skb = tskb;
- skb->ip_summed = CHECKSUM_UNNECESSARY;
- i = 0;
- }
- } else {
- /* Call the sk_stream functions to manage the sndbuf mem. */
- if (!sk_stream_memory_free(sk)) {
- kcm_push(kcm);
- set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
- err = sk_stream_wait_memory(sk, &timeo);
- if (err)
- goto out_error;
- }
-
- head = alloc_skb(0, sk->sk_allocation);
- while (!head) {
- kcm_push(kcm);
- err = sk_stream_wait_memory(sk, &timeo);
- if (err)
- goto out_error;
- }
-
- skb = head;
- i = 0;
- }
-
- get_page(page);
- skb_fill_page_desc_noacc(skb, i, page, offset, size);
- skb_shinfo(skb)->flags |= SKBFL_SHARED_FRAG;
-
-coalesced:
- skb->len += size;
- skb->data_len += size;
- skb->truesize += size;
- sk->sk_wmem_queued += size;
- sk_mem_charge(sk, size);
-
- if (head != skb) {
- head->len += size;
- head->data_len += size;
- head->truesize += size;
- }
-
- if (eor) {
- bool not_busy = skb_queue_empty(&sk->sk_write_queue);
-
- /* Message complete, queue it on send buffer */
- __skb_queue_tail(&sk->sk_write_queue, head);
- kcm->seq_skb = NULL;
- KCM_STATS_INCR(kcm->stats.tx_msgs);
-
- if (flags & MSG_BATCH) {
- kcm->tx_wait_more = true;
- } else if (kcm->tx_wait_more || not_busy) {
- err = kcm_write_msgs(kcm);
- if (err < 0) {
- /* We got a hard error in write_msgs but have
- * already queued this message. Report an error
- * in the socket, but don't affect return value
- * from sendmsg
- */
- pr_warn("KCM: Hard failure on kcm_write_msgs\n");
- report_csk_error(&kcm->sk, -err);
- }
- }
- } else {
- /* Message not complete, save state */
- kcm->seq_skb = head;
- kcm_tx_msg(head)->last_skb = skb;
- }
-
- KCM_STATS_ADD(kcm->stats.tx_bytes, size);
-
- release_sock(sk);
- return size;
-
-out_error:
- kcm_push(kcm);
-
- err = sk_stream_error(sk, flags, err);
-
- /* make sure we wake any epoll edge trigger waiter */
- if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 && err == -EAGAIN))
- sk->sk_write_space(sk);
-
- release_sock(sk);
- return err;
-}
-
static int kcm_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
{
struct sock *sk = sock->sk;
@@ -1109,6 +966,24 @@ static int kcm_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
return err;
}
+static ssize_t kcm_sendpage(struct socket *sock, struct page *page,
+ int offset, size_t size, int flags)
+
+{
+ struct bio_vec bvec;
+ struct msghdr msg = { .msg_flags = flags | MSG_SPLICE_PAGES, };
+
+ if (flags & MSG_SENDPAGE_NOTLAST)
+ msg.msg_flags |= MSG_MORE;
+
+ if (flags & MSG_OOB)
+ return -EOPNOTSUPP;
+
+ bvec_set_page(&bvec, page, size, offset);
+ iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, size);
+ return kcm_sendmsg(sock, &msg, size);
+}
+
static int kcm_recvmsg(struct socket *sock, struct msghdr *msg,
size_t len, int flags)
{
Make AF_KCM sendmsg() support MSG_SPLICE_PAGES. This causes pages to be
spliced from the source iterator if possible.
This allows ->sendpage() to be replaced by something that can handle
multiple multipage folios in a single transaction.
Signed-off-by: David Howells <[email protected]>
cc: Tom Herbert <[email protected]>
cc: Tom Herbert <[email protected]>
cc: Jakub Kicinski <[email protected]>
cc: Eric Dumazet <[email protected]>
cc: "David S. Miller" <[email protected]>
cc: Paolo Abeni <[email protected]>
cc: Jens Axboe <[email protected]>
cc: Matthew Wilcox <[email protected]>
cc: [email protected]
---
net/kcm/kcmsock.c | 55 ++++++++++++++++++++++++++++++++---------------
1 file changed, 38 insertions(+), 17 deletions(-)
diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c
index cfe828bd7fc6..411726d830c0 100644
--- a/net/kcm/kcmsock.c
+++ b/net/kcm/kcmsock.c
@@ -989,29 +989,50 @@ static int kcm_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
merge = false;
}
- copy = min_t(int, msg_data_left(msg),
- pfrag->size - pfrag->offset);
+ if (msg->msg_flags & MSG_SPLICE_PAGES) {
+ copy = msg_data_left(msg);
+ if (!sk_wmem_schedule(sk, copy))
+ goto wait_for_memory;
- if (!sk_wmem_schedule(sk, copy))
- goto wait_for_memory;
+ err = skb_splice_from_iter(skb, &msg->msg_iter, copy,
+ sk->sk_allocation);
+ if (err < 0) {
+ if (err == -EMSGSIZE)
+ goto wait_for_memory;
+ goto out_error;
+ }
- err = skb_copy_to_page_nocache(sk, &msg->msg_iter, skb,
- pfrag->page,
- pfrag->offset,
- copy);
- if (err)
- goto out_error;
+ skb_shinfo(skb)->flags |= SKBFL_SHARED_FRAG;
+ sk_wmem_queued_add(sk, copy);
+ sk_mem_charge(sk, copy);
- /* Update the skb. */
- if (merge) {
- skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
+ if (head != skb)
+ head->truesize += copy;
} else {
- skb_fill_page_desc(skb, i, pfrag->page,
- pfrag->offset, copy);
- get_page(pfrag->page);
+ copy = min_t(int, msg_data_left(msg),
+ pfrag->size - pfrag->offset);
+ if (!sk_wmem_schedule(sk, copy))
+ goto wait_for_memory;
+
+ err = skb_copy_to_page_nocache(sk, &msg->msg_iter, skb,
+ pfrag->page,
+ pfrag->offset,
+ copy);
+ if (err)
+ goto out_error;
+
+ /* Update the skb. */
+ if (merge) {
+ skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
+ } else {
+ skb_fill_page_desc(skb, i, pfrag->page,
+ pfrag->offset, copy);
+ get_page(pfrag->page);
+ }
+
+ pfrag->offset += copy;
}
- pfrag->offset += copy;
copied += copy;
if (head != skb) {
head->len += copy;
Make Chelsio's TLS offload sendmsg() support MSG_SPLICE_PAGES, splicing in
pages from the source iterator if possible and copying the data in
otherwise.
This allows ->sendpage() to be replaced by something that can handle
multiple multipage folios in a single transaction.
Signed-off-by: David Howells <[email protected]>
cc: Ayush Sawal <[email protected]>
cc: "David S. Miller" <[email protected]>
cc: Eric Dumazet <[email protected]>
cc: Jakub Kicinski <[email protected]>
cc: Paolo Abeni <[email protected]>
cc: Jens Axboe <[email protected]>
cc: Matthew Wilcox <[email protected]>
cc: [email protected]
---
.../ethernet/chelsio/inline_crypto/chtls/chtls_io.c | 12 +++++++++++-
1 file changed, 11 insertions(+), 1 deletion(-)
diff --git a/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_io.c b/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_io.c
index ae6b17b96bf1..1d08386ac916 100644
--- a/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_io.c
+++ b/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_io.c
@@ -1092,7 +1092,17 @@ int chtls_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
if (copy > size)
copy = size;
- if (skb_tailroom(skb) > 0) {
+ if (msg->msg_flags & MSG_SPLICE_PAGES) {
+ err = skb_splice_from_iter(skb, &msg->msg_iter, copy,
+ sk->sk_allocation);
+ if (err < 0) {
+ if (err == -EMSGSIZE)
+ goto new_buf;
+ goto do_fault;
+ }
+ copy = err;
+ sk_wmem_queued_add(sk, copy);
+ } else if (skb_tailroom(skb) > 0) {
copy = min(copy, skb_tailroom(skb));
if (is_tls_tx(csk))
copy = min_t(int, copy, csk->tlshws.txleft);
Convert chtls_sendpage() to use sendmsg() with MSG_SPLICE_PAGES rather than
directly splicing in the pages itself.
This allows ->sendpage() to be replaced by something that can handle
multiple multipage folios in a single transaction.
Signed-off-by: David Howells <[email protected]>
cc: Ayush Sawal <[email protected]>
cc: "David S. Miller" <[email protected]>
cc: Eric Dumazet <[email protected]>
cc: Jakub Kicinski <[email protected]>
cc: Paolo Abeni <[email protected]>
cc: Jens Axboe <[email protected]>
cc: Matthew Wilcox <[email protected]>
cc: [email protected]
---
.../chelsio/inline_crypto/chtls/chtls_io.c | 109 ++----------------
1 file changed, 7 insertions(+), 102 deletions(-)
diff --git a/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_io.c b/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_io.c
index 1d08386ac916..65efd20ec796 100644
--- a/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_io.c
+++ b/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_io.c
@@ -1240,110 +1240,15 @@ int chtls_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
int chtls_sendpage(struct sock *sk, struct page *page,
int offset, size_t size, int flags)
{
- struct chtls_sock *csk;
- struct chtls_dev *cdev;
- int mss, err, copied;
- struct tcp_sock *tp;
- long timeo;
-
- tp = tcp_sk(sk);
- copied = 0;
- csk = rcu_dereference_sk_user_data(sk);
- cdev = csk->cdev;
- lock_sock(sk);
- timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
+ struct bio_vec bvec;
+ struct msghdr msg = { .msg_flags = flags | MSG_SPLICE_PAGES, };
- err = sk_stream_wait_connect(sk, &timeo);
- if (!sk_in_state(sk, TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
- err != 0)
- goto out_err;
-
- mss = csk->mss;
- csk_set_flag(csk, CSK_TX_MORE_DATA);
-
- while (size > 0) {
- struct sk_buff *skb = skb_peek_tail(&csk->txq);
- int copy, i;
-
- if (!skb || (ULP_SKB_CB(skb)->flags & ULPCB_FLAG_NO_APPEND) ||
- (copy = mss - skb->len) <= 0) {
-new_buf:
- if (!csk_mem_free(cdev, sk))
- goto wait_for_sndbuf;
+ if (flags & MSG_SENDPAGE_NOTLAST)
+ msg.msg_flags |= MSG_MORE;
- if (is_tls_tx(csk)) {
- skb = get_record_skb(sk,
- select_size(sk, size,
- flags,
- TX_TLSHDR_LEN),
- true);
- } else {
- skb = get_tx_skb(sk, 0);
- }
- if (!skb)
- goto wait_for_memory;
- copy = mss;
- }
- if (copy > size)
- copy = size;
-
- i = skb_shinfo(skb)->nr_frags;
- if (skb_can_coalesce(skb, i, page, offset)) {
- skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
- } else if (i < MAX_SKB_FRAGS) {
- get_page(page);
- skb_fill_page_desc(skb, i, page, offset, copy);
- } else {
- tx_skb_finalize(skb);
- push_frames_if_head(sk);
- goto new_buf;
- }
-
- skb->len += copy;
- if (skb->len == mss)
- tx_skb_finalize(skb);
- skb->data_len += copy;
- skb->truesize += copy;
- sk->sk_wmem_queued += copy;
- tp->write_seq += copy;
- copied += copy;
- offset += copy;
- size -= copy;
-
- if (corked(tp, flags) &&
- (sk_stream_wspace(sk) < sk_stream_min_wspace(sk)))
- ULP_SKB_CB(skb)->flags |= ULPCB_FLAG_NO_APPEND;
-
- if (!size)
- break;
-
- if (unlikely(ULP_SKB_CB(skb)->flags & ULPCB_FLAG_NO_APPEND))
- push_frames_if_head(sk);
- continue;
-wait_for_sndbuf:
- set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
-wait_for_memory:
- err = csk_wait_memory(cdev, sk, &timeo);
- if (err)
- goto do_error;
- }
-out:
- csk_reset_flag(csk, CSK_TX_MORE_DATA);
- if (copied)
- chtls_tcp_push(sk, flags);
-done:
- release_sock(sk);
- return copied;
-
-do_error:
- if (copied)
- goto out;
-
-out_err:
- if (csk_conn_inline(csk))
- csk_reset_flag(csk, CSK_TX_MORE_DATA);
- copied = sk_stream_error(sk, flags, err);
- goto done;
+ bvec_set_page(&bvec, page, size, offset);
+ iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, size);
+ return chtls_sendmsg(sk, &msg, size);
}
static void chtls_select_window(struct sock *sk)
On Wed, May 24, 2023 at 03:49:21PM +0100, David Howells wrote:
> Convert chtls_sendpage() to use sendmsg() with MSG_SPLICE_PAGES rather than
> directly splicing in the pages itself.
>
> This allows ->sendpage() to be replaced by something that can handle
> multiple multipage folios in a single transaction.
>
> Signed-off-by: David Howells <[email protected]>
> cc: Ayush Sawal <[email protected]>
> cc: "David S. Miller" <[email protected]>
> cc: Eric Dumazet <[email protected]>
> cc: Jakub Kicinski <[email protected]>
> cc: Paolo Abeni <[email protected]>
> cc: Jens Axboe <[email protected]>
> cc: Matthew Wilcox <[email protected]>
> cc: [email protected]
> ---
> .../chelsio/inline_crypto/chtls/chtls_io.c | 109 ++----------------
> 1 file changed, 7 insertions(+), 102 deletions(-)
>
> diff --git a/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_io.c b/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_io.c
> index 1d08386ac916..65efd20ec796 100644
> --- a/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_io.c
> +++ b/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_io.c
> @@ -1240,110 +1240,15 @@ int chtls_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
> int chtls_sendpage(struct sock *sk, struct page *page,
> int offset, size_t size, int flags)
> {
> - struct chtls_sock *csk;
> - struct chtls_dev *cdev;
> - int mss, err, copied;
> - struct tcp_sock *tp;
> - long timeo;
> -
> - tp = tcp_sk(sk);
> - copied = 0;
> - csk = rcu_dereference_sk_user_data(sk);
> - cdev = csk->cdev;
> - lock_sock(sk);
> - timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
> + struct bio_vec bvec;
> + struct msghdr msg = { .msg_flags = flags | MSG_SPLICE_PAGES, };
>
Hi David,
a minor nit, in case you need to repost this series for some other reason.
Please use reverse Xmas tree - longest line to shortest - order for
Networking code. I understand this file doesn't adhere to that, and
we probably don't want churn due to addressing it throughout this file.
But my preference is to move towards this standard, or at least not away
from it.
So in this case:
struct msghdr msg = { .msg_flags = flags | MSG_SPLICE_PAGES, };
struct bio_vec bvec;
This tool can be useful:
https://github.com/ecree-solarflare/xmastree
On Thu, May 25, 2023 at 09:41:46AM +0100, David Howells wrote:
> Simon Horman <[email protected]> wrote:
>
> > a minor nit, in case you need to repost this series for some other reason.
>
> Note that the aim is to delete chtls_sendpage() entirely at some point soon.
Excellent.
Simon Horman <[email protected]> wrote:
> a minor nit, in case you need to repost this series for some other reason.
Note that the aim is to delete chtls_sendpage() entirely at some point soon.
David
On Wed, 24 May 2023 15:49:22 +0100 David Howells wrote:
> + err = skb_splice_from_iter(skb, &msg->msg_iter, copy,
> + sk->sk_allocation);
> + if (err < 0) {
> + if (err == -EMSGSIZE)
> + goto wait_for_memory;
> + goto out_error;
> + }
>
should there be a:
copy = err;
or:
copy -= msg_data_left(msg);
or some such here? Can we safely assume that skb_splice_from_iter() will
copy all or nothing?
> - err = skb_copy_to_page_nocache(sk, &msg->msg_iter, skb,
> - pfrag->page,
> - pfrag->offset,
> - copy);
> - if (err)
> - goto out_error;
> + skb_shinfo(skb)->flags |= SKBFL_SHARED_FRAG;
Jakub Kicinski <[email protected]> wrote:
> On Wed, 24 May 2023 15:49:22 +0100 David Howells wrote:
> > + err = skb_splice_from_iter(skb, &msg->msg_iter, copy,
> > + sk->sk_allocation);
> > + if (err < 0) {
> > + if (err == -EMSGSIZE)
> > + goto wait_for_memory;
> > + goto out_error;
> > + }
> >
>
> should there be a:
>
> copy = err;
> or:
> copy -= msg_data_left(msg);
>
> or some such here? Can we safely assume that skb_splice_from_iter() will
> copy all or nothing?
Yeah. Good point. I didn't add one because the normal operation code doesn't
do that - but I guess that's because it's all-or-nothing.
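The splice branch should probably pick up the return value, mirroring what
the chtls patch already does - something like:

	err = skb_splice_from_iter(skb, &msg->msg_iter, copy,
				   sk->sk_allocation);
	if (err < 0) {
		if (err == -EMSGSIZE)
			goto wait_for_memory;
		goto out_error;
	}
	copy = err;	/* Use the number of bytes actually spliced */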
David