2023-11-06 02:46:42

by Mina Almasry

[permalink] [raw]
Subject: [RFC PATCH v3 10/12] tcp: RX path for devmem TCP

In tcp_recvmsg_locked(), detect if the skb being received by the user
is a devmem skb. In this case - if the user provided the MSG_SOCK_DEVMEM
flag - pass it to tcp_recvmsg_devmem() for custom handling.

tcp_recvmsg_devmem() copies any data in the skb header to the linear
buffer, and returns a cmsg to the user indicating the number of bytes
returned in the linear buffer.

tcp_recvmsg_devmem() then loops over the unaccessible devmem skb frags,
and returns to the user a cmsg_devmem indicating the location of the
data in the dmabuf device memory. cmsg_devmem contains this information:

1. the offset into the dmabuf where the payload starts. 'frag_offset'.
2. the size of the frag. 'frag_size'.
3. an opaque token 'frag_token' to return to the kernel when the buffer
is to be released.

The pages awaiting freeing are stored in the newly added
sk->sk_user_pages, and each page passed to userspace is get_page()'d.
This reference is dropped once the userspace indicates that it is
done reading this page. All pages are released when the socket is
destroyed.

Signed-off-by: Willem de Bruijn <[email protected]>
Signed-off-by: Kaiyuan Zhang <[email protected]>
Signed-off-by: Mina Almasry <[email protected]>

---

RFC v3:
- Fixed issue with put_cmsg() failing silently.

---
include/linux/socket.h | 1 +
include/net/page_pool/helpers.h | 9 ++
include/net/sock.h | 2 +
include/uapi/asm-generic/socket.h | 5 +
include/uapi/linux/uio.h | 6 +
net/ipv4/tcp.c | 189 +++++++++++++++++++++++++++++-
net/ipv4/tcp_ipv4.c | 7 ++
7 files changed, 214 insertions(+), 5 deletions(-)

diff --git a/include/linux/socket.h b/include/linux/socket.h
index cfcb7e2c3813..fe2b9e2081bb 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -326,6 +326,7 @@ struct ucred {
* plain text and require encryption
*/

+#define MSG_SOCK_DEVMEM 0x2000000 /* Receive devmem skbs as cmsg */
#define MSG_ZEROCOPY 0x4000000 /* Use user data in kernel path */
#define MSG_SPLICE_PAGES 0x8000000 /* Splice the pages from the iterator in sendmsg() */
#define MSG_FASTOPEN 0x20000000 /* Send data in TCP SYN */
diff --git a/include/net/page_pool/helpers.h b/include/net/page_pool/helpers.h
index 08f1a2cc70d2..95f4d579cbc4 100644
--- a/include/net/page_pool/helpers.h
+++ b/include/net/page_pool/helpers.h
@@ -106,6 +106,15 @@ page_pool_iov_dma_addr(const struct page_pool_iov *ppiov)
((dma_addr_t)page_pool_iov_idx(ppiov) << PAGE_SHIFT);
}

+static inline unsigned long
+page_pool_iov_virtual_addr(const struct page_pool_iov *ppiov)
+{
+ struct dmabuf_genpool_chunk_owner *owner = page_pool_iov_owner(ppiov);
+
+ return owner->base_virtual +
+ ((unsigned long)page_pool_iov_idx(ppiov) << PAGE_SHIFT);
+}
+
static inline struct netdev_dmabuf_binding *
page_pool_iov_binding(const struct page_pool_iov *ppiov)
{
diff --git a/include/net/sock.h b/include/net/sock.h
index 242590308d64..986d9da6e062 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -353,6 +353,7 @@ struct sk_filter;
* @sk_txtime_unused: unused txtime flags
* @ns_tracker: tracker for netns reference
* @sk_bind2_node: bind node in the bhash2 table
+ * @sk_user_pages: xarray of pages the user is holding a reference on.
*/
struct sock {
/*
@@ -545,6 +546,7 @@ struct sock {
struct rcu_head sk_rcu;
netns_tracker ns_tracker;
struct hlist_node sk_bind2_node;
+ struct xarray sk_user_pages;
};

enum sk_pacing {
diff --git a/include/uapi/asm-generic/socket.h b/include/uapi/asm-generic/socket.h
index 8ce8a39a1e5f..aacb97f16b78 100644
--- a/include/uapi/asm-generic/socket.h
+++ b/include/uapi/asm-generic/socket.h
@@ -135,6 +135,11 @@
#define SO_PASSPIDFD 76
#define SO_PEERPIDFD 77

+#define SO_DEVMEM_HEADER 98
+#define SCM_DEVMEM_HEADER SO_DEVMEM_HEADER
+#define SO_DEVMEM_OFFSET 99
+#define SCM_DEVMEM_OFFSET SO_DEVMEM_OFFSET
+
#if !defined(__KERNEL__)

#if __BITS_PER_LONG == 64 || (defined(__x86_64__) && defined(__ILP32__))
diff --git a/include/uapi/linux/uio.h b/include/uapi/linux/uio.h
index 059b1a9147f4..ae94763b1963 100644
--- a/include/uapi/linux/uio.h
+++ b/include/uapi/linux/uio.h
@@ -20,6 +20,12 @@ struct iovec
__kernel_size_t iov_len; /* Must be size_t (1003.1g) */
};

+struct cmsg_devmem {
+ __u64 frag_offset;
+ __u32 frag_size;
+ __u32 frag_token;
+};
+
/*
* UIO_MAXIOV shall be at least 16 1003.1g (5.4.1.1)
*/
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 5c6fed52ed0e..fd7f6d7e7671 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -461,6 +461,7 @@ void tcp_init_sock(struct sock *sk)

set_bit(SOCK_SUPPORT_ZC, &sk->sk_socket->flags);
sk_sockets_allocated_inc(sk);
+ xa_init_flags(&sk->sk_user_pages, XA_FLAGS_ALLOC1);
}
EXPORT_SYMBOL(tcp_init_sock);

@@ -2301,6 +2302,154 @@ static int tcp_inq_hint(struct sock *sk)
return inq;
}

+/* On error, returns the -errno. On success, returns number of bytes sent to the
+ * user. May not consume all of @remaining_len.
+ */
+static int tcp_recvmsg_devmem(const struct sock *sk, const struct sk_buff *skb,
+ unsigned int offset, struct msghdr *msg,
+ int remaining_len)
+{
+ struct cmsg_devmem cmsg_devmem = { 0 };
+ unsigned int start;
+ int i, copy, n;
+ int sent = 0;
+ int err = 0;
+
+ do {
+ start = skb_headlen(skb);
+
+ if (!skb_frags_not_readable(skb)) {
+ err = -ENODEV;
+ goto out;
+ }
+
+ /* Copy header. */
+ copy = start - offset;
+ if (copy > 0) {
+ copy = min(copy, remaining_len);
+
+ n = copy_to_iter(skb->data + offset, copy,
+ &msg->msg_iter);
+ if (n != copy) {
+ err = -EFAULT;
+ goto out;
+ }
+
+ offset += copy;
+ remaining_len -= copy;
+
+ /* First a cmsg_devmem for # bytes copied to user
+ * buffer.
+ */
+ memset(&cmsg_devmem, 0, sizeof(cmsg_devmem));
+ cmsg_devmem.frag_size = copy;
+ err = put_cmsg(msg, SOL_SOCKET, SO_DEVMEM_HEADER,
+ sizeof(cmsg_devmem), &cmsg_devmem);
+ if (err || msg->msg_flags & MSG_CTRUNC) {
+ msg->msg_flags &= ~MSG_CTRUNC;
+ if (!err)
+ err = -ETOOSMALL;
+ goto out;
+ }
+
+ sent += copy;
+
+ if (remaining_len == 0)
+ goto out;
+ }
+
+ /* after that, send information of devmem pages through a
+ * sequence of cmsg
+ */
+ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+ const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+ struct page_pool_iov *ppiov;
+ u64 frag_offset;
+ u32 user_token;
+ int end;
+
+ /* skb_frags_not_readable() should indicate that ALL the
+ * frags in this skb are unreadable page_pool_iovs.
+ * We're checking for that flag above, but also check
+ * individual pages here. If the tcp stack is not
+ * setting skb->devmem correctly, we still don't want to
+ * crash here when accessing pgmap or priv below.
+ */
+ if (!skb_frag_page_pool_iov(frag)) {
+ net_err_ratelimited("Found non-devmem skb with page_pool_iov");
+ err = -ENODEV;
+ goto out;
+ }
+
+ ppiov = skb_frag_page_pool_iov(frag);
+ end = start + skb_frag_size(frag);
+ copy = end - offset;
+
+ if (copy > 0) {
+ copy = min(copy, remaining_len);
+
+ frag_offset = page_pool_iov_virtual_addr(ppiov) +
+ skb_frag_off(frag) + offset -
+ start;
+ cmsg_devmem.frag_offset = frag_offset;
+ cmsg_devmem.frag_size = copy;
+ err = xa_alloc((struct xarray *)&sk->sk_user_pages,
+ &user_token, frag->bv_page,
+ xa_limit_31b, GFP_KERNEL);
+ if (err)
+ goto out;
+
+ cmsg_devmem.frag_token = user_token;
+
+ offset += copy;
+ remaining_len -= copy;
+
+ err = put_cmsg(msg, SOL_SOCKET,
+ SO_DEVMEM_OFFSET,
+ sizeof(cmsg_devmem),
+ &cmsg_devmem);
+ if (err || msg->msg_flags & MSG_CTRUNC) {
+ msg->msg_flags &= ~MSG_CTRUNC;
+ xa_erase((struct xarray *)&sk->sk_user_pages,
+ user_token);
+ if (!err)
+ err = -ETOOSMALL;
+ goto out;
+ }
+
+ page_pool_iov_get_many(ppiov, 1);
+
+ sent += copy;
+
+ if (remaining_len == 0)
+ goto out;
+ }
+ start = end;
+ }
+
+ if (!remaining_len)
+ goto out;
+
+ /* if remaining_len is not satisfied yet, we need to go to the
+ * next frag in the frag_list to satisfy remaining_len.
+ */
+ skb = skb_shinfo(skb)->frag_list ?: skb->next;
+
+ offset = offset - start;
+ } while (skb);
+
+ if (remaining_len) {
+ err = -EFAULT;
+ goto out;
+ }
+
+out:
+ if (!sent)
+ sent = err;
+
+ return sent;
+}
+
/*
* This routine copies from a sock struct into the user buffer.
*
@@ -2314,6 +2463,7 @@ static int tcp_recvmsg_locked(struct sock *sk, struct msghdr *msg, size_t len,
int *cmsg_flags)
{
struct tcp_sock *tp = tcp_sk(sk);
+ int last_copied_devmem = -1; /* uninitialized */
int copied = 0;
u32 peek_seq;
u32 *seq;
@@ -2491,15 +2641,44 @@ static int tcp_recvmsg_locked(struct sock *sk, struct msghdr *msg, size_t len,
}

if (!(flags & MSG_TRUNC)) {
- err = skb_copy_datagram_msg(skb, offset, msg, used);
- if (err) {
- /* Exception. Bailout! */
- if (!copied)
- copied = -EFAULT;
+ if (last_copied_devmem != -1 &&
+ last_copied_devmem != skb->devmem)
break;
+
+ if (!skb->devmem) {
+ err = skb_copy_datagram_msg(skb, offset, msg,
+ used);
+ if (err) {
+ /* Exception. Bailout! */
+ if (!copied)
+ copied = -EFAULT;
+ break;
+ }
+ } else {
+ if (!(flags & MSG_SOCK_DEVMEM)) {
+ /* skb->devmem skbs can only be received
+ * with the MSG_SOCK_DEVMEM flag.
+ */
+ if (!copied)
+ copied = -EFAULT;
+
+ break;
+ }
+
+ err = tcp_recvmsg_devmem(sk, skb, offset, msg,
+ used);
+ if (err <= 0) {
+ if (!copied)
+ copied = -EFAULT;
+
+ break;
+ }
+ used = err;
}
}

+ last_copied_devmem = skb->devmem;
+
WRITE_ONCE(*seq, *seq + used);
copied += used;
len -= used;
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 7583d4e34c8c..4cc8be892f05 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2299,6 +2299,13 @@ static int tcp_v4_init_sock(struct sock *sk)
void tcp_v4_destroy_sock(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
+ struct page *page;
+ unsigned long index;
+
+ xa_for_each(&sk->sk_user_pages, index, page)
+ page_pool_page_put_many(page, 1);
+
+ xa_destroy(&sk->sk_user_pages);

trace_tcp_destroy_sock(sk);

--
2.42.0.869.gea05f2083d-goog


2023-11-06 18:44:47

by Stanislav Fomichev

[permalink] [raw]
Subject: Re: [RFC PATCH v3 10/12] tcp: RX path for devmem TCP

On 11/05, Mina Almasry wrote:
> In tcp_recvmsg_locked(), detect if the skb being received by the user
> is a devmem skb. In this case - if the user provided the MSG_SOCK_DEVMEM
> flag - pass it to tcp_recvmsg_devmem() for custom handling.
>
> tcp_recvmsg_devmem() copies any data in the skb header to the linear
> buffer, and returns a cmsg to the user indicating the number of bytes
> returned in the linear buffer.
>
> tcp_recvmsg_devmem() then loops over the unaccessible devmem skb frags,
> and returns to the user a cmsg_devmem indicating the location of the
> data in the dmabuf device memory. cmsg_devmem contains this information:
>
> 1. the offset into the dmabuf where the payload starts. 'frag_offset'.
> 2. the size of the frag. 'frag_size'.
> 3. an opaque token 'frag_token' to return to the kernel when the buffer
> is to be released.
>
> The pages awaiting freeing are stored in the newly added
> sk->sk_user_pages, and each page passed to userspace is get_page()'d.
> This reference is dropped once the userspace indicates that it is
> done reading this page. All pages are released when the socket is
> destroyed.
>
> Signed-off-by: Willem de Bruijn <[email protected]>
> Signed-off-by: Kaiyuan Zhang <[email protected]>
> Signed-off-by: Mina Almasry <[email protected]>
>
> ---
>
> RFC v3:
> - Fixed issue with put_cmsg() failing silently.
>
> ---
> include/linux/socket.h | 1 +
> include/net/page_pool/helpers.h | 9 ++
> include/net/sock.h | 2 +
> include/uapi/asm-generic/socket.h | 5 +
> include/uapi/linux/uio.h | 6 +
> net/ipv4/tcp.c | 189 +++++++++++++++++++++++++++++-
> net/ipv4/tcp_ipv4.c | 7 ++
> 7 files changed, 214 insertions(+), 5 deletions(-)
>
> diff --git a/include/linux/socket.h b/include/linux/socket.h
> index cfcb7e2c3813..fe2b9e2081bb 100644
> --- a/include/linux/socket.h
> +++ b/include/linux/socket.h
> @@ -326,6 +326,7 @@ struct ucred {
> * plain text and require encryption
> */
>
> +#define MSG_SOCK_DEVMEM 0x2000000 /* Receive devmem skbs as cmsg */

Sharing the feedback that I've been providing internally on the public list:

IMHO, we need a better UAPI to receive the tokens and give them back to
the kernel. CMSG + setsockopt(SO_DEVMEM_DONTNEED) get the job done,
but look dated and hacky :-(

We should either do some kind of user/kernel shared memory queue to
receive/return the tokens (similar to what Jonathan was doing in his
proposal?) or bite the bullet and switch to io_uring.

I was also suggesting to do it via netlink initially, but it's probably
a bit slow for these purpose, idk.

2023-11-06 19:29:57

by Mina Almasry

[permalink] [raw]
Subject: Re: [RFC PATCH v3 10/12] tcp: RX path for devmem TCP

On Mon, Nov 6, 2023 at 10:44 AM Stanislav Fomichev <[email protected]> wrote:
>
> On 11/05, Mina Almasry wrote:
> > In tcp_recvmsg_locked(), detect if the skb being received by the user
> > is a devmem skb. In this case - if the user provided the MSG_SOCK_DEVMEM
> > flag - pass it to tcp_recvmsg_devmem() for custom handling.
> >
> > tcp_recvmsg_devmem() copies any data in the skb header to the linear
> > buffer, and returns a cmsg to the user indicating the number of bytes
> > returned in the linear buffer.
> >
> > tcp_recvmsg_devmem() then loops over the unaccessible devmem skb frags,
> > and returns to the user a cmsg_devmem indicating the location of the
> > data in the dmabuf device memory. cmsg_devmem contains this information:
> >
> > 1. the offset into the dmabuf where the payload starts. 'frag_offset'.
> > 2. the size of the frag. 'frag_size'.
> > 3. an opaque token 'frag_token' to return to the kernel when the buffer
> > is to be released.
> >
> > The pages awaiting freeing are stored in the newly added
> > sk->sk_user_pages, and each page passed to userspace is get_page()'d.
> > This reference is dropped once the userspace indicates that it is
> > done reading this page. All pages are released when the socket is
> > destroyed.
> >
> > Signed-off-by: Willem de Bruijn <[email protected]>
> > Signed-off-by: Kaiyuan Zhang <[email protected]>
> > Signed-off-by: Mina Almasry <[email protected]>
> >
> > ---
> >
> > RFC v3:
> > - Fixed issue with put_cmsg() failing silently.
> >
> > ---
> > include/linux/socket.h | 1 +
> > include/net/page_pool/helpers.h | 9 ++
> > include/net/sock.h | 2 +
> > include/uapi/asm-generic/socket.h | 5 +
> > include/uapi/linux/uio.h | 6 +
> > net/ipv4/tcp.c | 189 +++++++++++++++++++++++++++++-
> > net/ipv4/tcp_ipv4.c | 7 ++
> > 7 files changed, 214 insertions(+), 5 deletions(-)
> >
> > diff --git a/include/linux/socket.h b/include/linux/socket.h
> > index cfcb7e2c3813..fe2b9e2081bb 100644
> > --- a/include/linux/socket.h
> > +++ b/include/linux/socket.h
> > @@ -326,6 +326,7 @@ struct ucred {
> > * plain text and require encryption
> > */
> >
> > +#define MSG_SOCK_DEVMEM 0x2000000 /* Receive devmem skbs as cmsg */
>
> Sharing the feedback that I've been providing internally on the public list:
>

There may have been a miscommunication. I don't recall hearing this
specific feedback from you, at least in the last few months. Sorry if
it seemed like I'm ignoring feedback :)

> IMHO, we need a better UAPI to receive the tokens and give them back to
> the kernel. CMSG + setsockopt(SO_DEVMEM_DONTNEED) get the job done,
> but look dated and hacky :-(
>
> We should either do some kind of user/kernel shared memory queue to
> receive/return the tokens (similar to what Jonathan was doing in his
> proposal?)

I'll take a look at Jonathan's proposal, sorry, I'm not immediately
familiar but I wanted to respond :-) But is the suggestion here to
build a new kernel-user communication channel primitive for the
purpose of passing the information in the devmem cmsg? IMHO that seems
like an overkill. Why add 100-200 lines of code to the kernel to add
something that can already be done with existing primitives? I don't
see anything concretely wrong with cmsg & setsockopt approach, and if
we switch to something I'd prefer to switch to an existing primitive
for simplicity?

The only other existing primitive to pass data outside of the linear
buffer is the MSG_ERRQUEUE that is used for zerocopy. Is that
preferred? Any other suggestions or existing primitives I'm not aware
of?

> or bite the bullet and switch to io_uring.
>

IMO io_uring & socket support are orthogonal, and one doesn't preclude
the other. As you know we like to use sockets and I believe there are
issues with io_uring adoption at Google that I'm not familiar with
(and could be wrong). I'm interested in exploring io_uring support as
a follow up but I think David Wei will be interested in io_uring
support as well anyway.

> I was also suggesting to do it via netlink initially, but it's probably
> a bit slow for these purpose, idk.

Yeah, I hear netlink is reserved for control paths and is
inappropriate for data path, but I'll let folks correct me if wrong.

--
Thanks,
Mina

2023-11-06 21:15:13

by Willem de Bruijn

[permalink] [raw]
Subject: Re: [RFC PATCH v3 10/12] tcp: RX path for devmem TCP

> > IMHO, we need a better UAPI to receive the tokens and give them back to
> > the kernel. CMSG + setsockopt(SO_DEVMEM_DONTNEED) get the job done,
> > but look dated and hacky :-(
> >
> > We should either do some kind of user/kernel shared memory queue to
> > receive/return the tokens (similar to what Jonathan was doing in his
> > proposal?)
>
> I'll take a look at Jonathan's proposal, sorry, I'm not immediately
> familiar but I wanted to respond :-) But is the suggestion here to
> build a new kernel-user communication channel primitive for the
> purpose of passing the information in the devmem cmsg? IMHO that seems
> like an overkill. Why add 100-200 lines of code to the kernel to add
> something that can already be done with existing primitives? I don't
> see anything concretely wrong with cmsg & setsockopt approach, and if
> we switch to something I'd prefer to switch to an existing primitive
> for simplicity?
>
> The only other existing primitive to pass data outside of the linear
> buffer is the MSG_ERRQUEUE that is used for zerocopy. Is that
> preferred? Any other suggestions or existing primitives I'm not aware
> of?
>
> > or bite the bullet and switch to io_uring.
> >
>
> IMO io_uring & socket support are orthogonal, and one doesn't preclude
> the other. As you know we like to use sockets and I believe there are
> issues with io_uring adoption at Google that I'm not familiar with
> (and could be wrong). I'm interested in exploring io_uring support as
> a follow up but I think David Wei will be interested in io_uring
> support as well anyway.

I also disagree that we need to replace a standard socket interface
with something "faster", in quotes.

This interface is not the bottleneck to the target workload.

Replacing the synchronous sockets interface with something more
performant for workloads where it is, is an orthogonal challenge.
However we do that, I think that traditional sockets should continue
to be supported.

The feature may already even work with io_uring, as both recvmsg with
cmsg and setsockopt have io_uring support now.

2023-11-06 21:18:35

by Stanislav Fomichev

[permalink] [raw]
Subject: Re: [RFC PATCH v3 10/12] tcp: RX path for devmem TCP

On 11/06, Mina Almasry wrote:
> On Mon, Nov 6, 2023 at 10:44 AM Stanislav Fomichev <[email protected]> wrote:
> >
> > On 11/05, Mina Almasry wrote:
> > > In tcp_recvmsg_locked(), detect if the skb being received by the user
> > > is a devmem skb. In this case - if the user provided the MSG_SOCK_DEVMEM
> > > flag - pass it to tcp_recvmsg_devmem() for custom handling.
> > >
> > > tcp_recvmsg_devmem() copies any data in the skb header to the linear
> > > buffer, and returns a cmsg to the user indicating the number of bytes
> > > returned in the linear buffer.
> > >
> > > tcp_recvmsg_devmem() then loops over the unaccessible devmem skb frags,
> > > and returns to the user a cmsg_devmem indicating the location of the
> > > data in the dmabuf device memory. cmsg_devmem contains this information:
> > >
> > > 1. the offset into the dmabuf where the payload starts. 'frag_offset'.
> > > 2. the size of the frag. 'frag_size'.
> > > 3. an opaque token 'frag_token' to return to the kernel when the buffer
> > > is to be released.
> > >
> > > The pages awaiting freeing are stored in the newly added
> > > sk->sk_user_pages, and each page passed to userspace is get_page()'d.
> > > This reference is dropped once the userspace indicates that it is
> > > done reading this page. All pages are released when the socket is
> > > destroyed.
> > >
> > > Signed-off-by: Willem de Bruijn <[email protected]>
> > > Signed-off-by: Kaiyuan Zhang <[email protected]>
> > > Signed-off-by: Mina Almasry <[email protected]>
> > >
> > > ---
> > >
> > > RFC v3:
> > > - Fixed issue with put_cmsg() failing silently.
> > >
> > > ---
> > > include/linux/socket.h | 1 +
> > > include/net/page_pool/helpers.h | 9 ++
> > > include/net/sock.h | 2 +
> > > include/uapi/asm-generic/socket.h | 5 +
> > > include/uapi/linux/uio.h | 6 +
> > > net/ipv4/tcp.c | 189 +++++++++++++++++++++++++++++-
> > > net/ipv4/tcp_ipv4.c | 7 ++
> > > 7 files changed, 214 insertions(+), 5 deletions(-)
> > >
> > > diff --git a/include/linux/socket.h b/include/linux/socket.h
> > > index cfcb7e2c3813..fe2b9e2081bb 100644
> > > --- a/include/linux/socket.h
> > > +++ b/include/linux/socket.h
> > > @@ -326,6 +326,7 @@ struct ucred {
> > > * plain text and require encryption
> > > */
> > >
> > > +#define MSG_SOCK_DEVMEM 0x2000000 /* Receive devmem skbs as cmsg */
> >
> > Sharing the feedback that I've been providing internally on the public list:
> >
>
> There may have been a miscommunication. I don't recall hearing this
> specific feedback from you, at least in the last few months. Sorry if
> it seemed like I'm ignoring feedback :)

No worries, there was a thread long time ago about this whole token
interface and whether it should support out-of-order refills, etc.

> > IMHO, we need a better UAPI to receive the tokens and give them back to
> > the kernel. CMSG + setsockopt(SO_DEVMEM_DONTNEED) get the job done,
> > but look dated and hacky :-(
> >
> > We should either do some kind of user/kernel shared memory queue to
> > receive/return the tokens (similar to what Jonathan was doing in his
> > proposal?)
>
> I'll take a look at Jonathan's proposal, sorry, I'm not immediately
> familiar but I wanted to respond :-) But is the suggestion here to
> build a new kernel-user communication channel primitive for the
> purpose of passing the information in the devmem cmsg? IMHO that seems
> like an overkill. Why add 100-200 lines of code to the kernel to add
> something that can already be done with existing primitives? I don't
> see anything concretely wrong with cmsg & setsockopt approach, and if
> we switch to something I'd prefer to switch to an existing primitive
> for simplicity?
>
> The only other existing primitive to pass data outside of the linear
> buffer is the MSG_ERRQUEUE that is used for zerocopy. Is that
> preferred? Any other suggestions or existing primitives I'm not aware
> of?

I guess I'm just wondering whether other people have any suggestions
here. Not sure Jonathan's way was better, but we fundamentally
have two queues between the kernel and the userspace:
- userspace receiving tokens (recvmsg + magical flag)
- userspace refilling tokens (setsockopt + magical flag)

So having some kind of shared memory producer-consumer queue feels natural.
And using 'classic' socket api here feels like a stretch, idk.

But maybe I'm overthinking and overcomplicating :-)

> > or bite the bullet and switch to io_uring.
> >
>
> IMO io_uring & socket support are orthogonal, and one doesn't preclude
> the other. As you know we like to use sockets and I believe there are
> issues with io_uring adoption at Google that I'm not familiar with
> (and could be wrong). I'm interested in exploring io_uring support as
> a follow up but I think David Wei will be interested in io_uring
> support as well anyway.

Ack, might be one more reason on our side to adopt iouring :-p

> > I was also suggesting to do it via netlink initially, but it's probably
> > a bit slow for these purpose, idk.
>
> Yeah, I hear netlink is reserved for control paths and is
> inappropriate for data path, but I'll let folks correct me if wrong.
>
> --
> Thanks,
> Mina

2023-11-06 22:34:50

by Stanislav Fomichev

[permalink] [raw]
Subject: Re: [RFC PATCH v3 10/12] tcp: RX path for devmem TCP

On 11/06, Willem de Bruijn wrote:
> > > IMHO, we need a better UAPI to receive the tokens and give them back to
> > > the kernel. CMSG + setsockopt(SO_DEVMEM_DONTNEED) get the job done,
> > > but look dated and hacky :-(
> > >
> > > We should either do some kind of user/kernel shared memory queue to
> > > receive/return the tokens (similar to what Jonathan was doing in his
> > > proposal?)
> >
> > I'll take a look at Jonathan's proposal, sorry, I'm not immediately
> > familiar but I wanted to respond :-) But is the suggestion here to
> > build a new kernel-user communication channel primitive for the
> > purpose of passing the information in the devmem cmsg? IMHO that seems
> > like an overkill. Why add 100-200 lines of code to the kernel to add
> > something that can already be done with existing primitives? I don't
> > see anything concretely wrong with cmsg & setsockopt approach, and if
> > we switch to something I'd prefer to switch to an existing primitive
> > for simplicity?
> >
> > The only other existing primitive to pass data outside of the linear
> > buffer is the MSG_ERRQUEUE that is used for zerocopy. Is that
> > preferred? Any other suggestions or existing primitives I'm not aware
> > of?
> >
> > > or bite the bullet and switch to io_uring.
> > >
> >
> > IMO io_uring & socket support are orthogonal, and one doesn't preclude
> > the other. As you know we like to use sockets and I believe there are
> > issues with io_uring adoption at Google that I'm not familiar with
> > (and could be wrong). I'm interested in exploring io_uring support as
> > a follow up but I think David Wei will be interested in io_uring
> > support as well anyway.
>
> I also disagree that we need to replace a standard socket interface
> with something "faster", in quotes.
>
> This interface is not the bottleneck to the target workload.
>
> Replacing the synchronous sockets interface with something more
> performant for workloads where it is, is an orthogonal challenge.
> However we do that, I think that traditional sockets should continue
> to be supported.
>
> The feature may already even work with io_uring, as both recvmsg with
> cmsg and setsockopt have io_uring support now.

I'm not really concerned with faster. I would prefer something cleaner :-)

Or maybe we should just have it documented. With some kind of path
towards beautiful world where we can create dynamic queues..

2023-11-06 22:56:21

by Willem de Bruijn

[permalink] [raw]
Subject: Re: [RFC PATCH v3 10/12] tcp: RX path for devmem TCP

On Mon, Nov 6, 2023 at 2:34 PM Stanislav Fomichev <[email protected]> wrote:
>
> On 11/06, Willem de Bruijn wrote:
> > > > IMHO, we need a better UAPI to receive the tokens and give them back to
> > > > the kernel. CMSG + setsockopt(SO_DEVMEM_DONTNEED) get the job done,
> > > > but look dated and hacky :-(
> > > >
> > > > We should either do some kind of user/kernel shared memory queue to
> > > > receive/return the tokens (similar to what Jonathan was doing in his
> > > > proposal?)
> > >
> > > I'll take a look at Jonathan's proposal, sorry, I'm not immediately
> > > familiar but I wanted to respond :-) But is the suggestion here to
> > > build a new kernel-user communication channel primitive for the
> > > purpose of passing the information in the devmem cmsg? IMHO that seems
> > > like an overkill. Why add 100-200 lines of code to the kernel to add
> > > something that can already be done with existing primitives? I don't
> > > see anything concretely wrong with cmsg & setsockopt approach, and if
> > > we switch to something I'd prefer to switch to an existing primitive
> > > for simplicity?
> > >
> > > The only other existing primitive to pass data outside of the linear
> > > buffer is the MSG_ERRQUEUE that is used for zerocopy. Is that
> > > preferred? Any other suggestions or existing primitives I'm not aware
> > > of?
> > >
> > > > or bite the bullet and switch to io_uring.
> > > >
> > >
> > > IMO io_uring & socket support are orthogonal, and one doesn't preclude
> > > the other. As you know we like to use sockets and I believe there are
> > > issues with io_uring adoption at Google that I'm not familiar with
> > > (and could be wrong). I'm interested in exploring io_uring support as
> > > a follow up but I think David Wei will be interested in io_uring
> > > support as well anyway.
> >
> > I also disagree that we need to replace a standard socket interface
> > with something "faster", in quotes.
> >
> > This interface is not the bottleneck to the target workload.
> >
> > Replacing the synchronous sockets interface with something more
> > performant for workloads where it is, is an orthogonal challenge.
> > However we do that, I think that traditional sockets should continue
> > to be supported.
> >
> > The feature may already even work with io_uring, as both recvmsg with
> > cmsg and setsockopt have io_uring support now.
>
> I'm not really concerned with faster. I would prefer something cleaner :-)
>
> Or maybe we should just have it documented. With some kind of path
> towards beautiful world where we can create dynamic queues..

I suppose we just disagree on the elegance of the API.

The concise notification API returns tokens as a range for
compression, encoding as two 32-bit unsigned integers start + length.
It allows for even further batching by returning multiple such ranges
in a single call.

This is analogous to the MSG_ZEROCOPY notification mechanism from
kernel to user.

The synchronous socket syscall interface can be replaced by something
asynchronous like io_uring. This already works today? Whatever
asynchronous ring-based API would be selected, io_uring or otherwise,
I think the concise notification encoding would remain as is.

Since this is an operation on a socket, I find a setsockopt the
fitting interface.

2023-11-06 23:32:59

by Stanislav Fomichev

[permalink] [raw]
Subject: Re: [RFC PATCH v3 10/12] tcp: RX path for devmem TCP

On Mon, Nov 6, 2023 at 2:56 PM Willem de Bruijn
<[email protected]> wrote:
>
> On Mon, Nov 6, 2023 at 2:34 PM Stanislav Fomichev <[email protected]> wrote:
> >
> > On 11/06, Willem de Bruijn wrote:
> > > > > IMHO, we need a better UAPI to receive the tokens and give them back to
> > > > > the kernel. CMSG + setsockopt(SO_DEVMEM_DONTNEED) get the job done,
> > > > > but look dated and hacky :-(
> > > > >
> > > > > We should either do some kind of user/kernel shared memory queue to
> > > > > receive/return the tokens (similar to what Jonathan was doing in his
> > > > > proposal?)
> > > >
> > > > I'll take a look at Jonathan's proposal, sorry, I'm not immediately
> > > > familiar but I wanted to respond :-) But is the suggestion here to
> > > > build a new kernel-user communication channel primitive for the
> > > > purpose of passing the information in the devmem cmsg? IMHO that seems
> > > > like an overkill. Why add 100-200 lines of code to the kernel to add
> > > > something that can already be done with existing primitives? I don't
> > > > see anything concretely wrong with cmsg & setsockopt approach, and if
> > > > we switch to something I'd prefer to switch to an existing primitive
> > > > for simplicity?
> > > >
> > > > The only other existing primitive to pass data outside of the linear
> > > > buffer is the MSG_ERRQUEUE that is used for zerocopy. Is that
> > > > preferred? Any other suggestions or existing primitives I'm not aware
> > > > of?
> > > >
> > > > > or bite the bullet and switch to io_uring.
> > > > >
> > > >
> > > > IMO io_uring & socket support are orthogonal, and one doesn't preclude
> > > > the other. As you know we like to use sockets and I believe there are
> > > > issues with io_uring adoption at Google that I'm not familiar with
> > > > (and could be wrong). I'm interested in exploring io_uring support as
> > > > a follow up but I think David Wei will be interested in io_uring
> > > > support as well anyway.
> > >
> > > I also disagree that we need to replace a standard socket interface
> > > with something "faster", in quotes.
> > >
> > > This interface is not the bottleneck to the target workload.
> > >
> > > Replacing the synchronous sockets interface with something more
> > > performant for workloads where it is, is an orthogonal challenge.
> > > However we do that, I think that traditional sockets should continue
> > > to be supported.
> > >
> > > The feature may already even work with io_uring, as both recvmsg with
> > > cmsg and setsockopt have io_uring support now.
> >
> > I'm not really concerned with faster. I would prefer something cleaner :-)
> >
> > Or maybe we should just have it documented. With some kind of path
> > towards beautiful world where we can create dynamic queues..
>
> I suppose we just disagree on the elegance of the API.

Yeah, I might be overly sensitive to the apis that use get/setsockopt
for something more involved than setting a flag.
Probably because I know that bpf will (unnecessarily) trigger on these :-D
I had to implement that bpf "bypass" (or fastpath) for
TCP_ZEROCOPY_RECEIVE and it looks like this token recycle might also
benefit from something similar.

> The concise notification API returns tokens as a range for
> compression, encoding as two 32-bit unsigned integers start + length.
> It allows for even further batching by returning multiple such ranges
> in a single call.

Tangential: should tokens be u64? Otherwise we can't have more than
4gb unacknowledged. Or that's a reasonable constraint?


> This is analogous to the MSG_ZEROCOPY notification mechanism from
> kernel to user.
>
> The synchronous socket syscall interface can be replaced by something
> asynchronous like io_uring. This already works today? Whatever
> asynchronous ring-based API would be selected, io_uring or otherwise,
> I think the concise notification encoding would remain as is.
>
> Since this is an operation on a socket, I find a setsockopt the
> fitting interface.

2023-11-06 23:56:21

by David Ahern

[permalink] [raw]
Subject: Re: [RFC PATCH v3 10/12] tcp: RX path for devmem TCP

On 11/6/23 4:32 PM, Stanislav Fomichev wrote:
>> The concise notification API returns tokens as a range for
>> compression, encoding as two 32-bit unsigned integers start + length.
>> It allows for even further batching by returning multiple such ranges
>> in a single call.
>
> Tangential: should tokens be u64? Otherwise we can't have more than
> 4gb unacknowledged. Or that's a reasonable constraint?
>

Was thinking the same and with bits reserved for a dmabuf id to allow
multiple dmabufs in a single rx queue (future extension, but build the
capability in now). e.g., something like a 37b offset (128GB dmabuf
size), 19b length (large GRO), 8b dmabuf id (lots of dmabufs to a queue).

2023-11-07 00:03:44

by Willem de Bruijn

[permalink] [raw]
Subject: Re: [RFC PATCH v3 10/12] tcp: RX path for devmem TCP

On Mon, Nov 6, 2023 at 3:55 PM David Ahern <[email protected]> wrote:
>
> On 11/6/23 4:32 PM, Stanislav Fomichev wrote:
> >> The concise notification API returns tokens as a range for
> >> compression, encoding as two 32-bit unsigned integers start + length.
> >> It allows for even further batching by returning multiple such ranges
> >> in a single call.
> >
> > Tangential: should tokens be u64? Otherwise we can't have more than
> > 4gb unacknowledged. Or that's a reasonable constraint?
> >
>
> Was thinking the same and with bits reserved for a dmabuf id to allow
> multiple dmabufs in a single rx queue (future extension, but build the
> capability in now). e.g., something like a 37b offset (128GB dmabuf
> size), 19b length (large GRO), 8b dmabuf id (lots of dmabufs to a queue).

Agreed. Converting to 64b now sounds like a good forward looking revision.

2023-12-08 20:17:06

by Pavel Begunkov

[permalink] [raw]
Subject: Re: [RFC PATCH v3 10/12] tcp: RX path for devmem TCP

On 11/6/23 22:34, Stanislav Fomichev wrote:
> On 11/06, Willem de Bruijn wrote:
>>>> IMHO, we need a better UAPI to receive the tokens and give them back to
>>>> the kernel. CMSG + setsockopt(SO_DEVMEM_DONTNEED) get the job done,
>>>> but look dated and hacky :-(
>>>>
>>>> We should either do some kind of user/kernel shared memory queue to
>>>> receive/return the tokens (similar to what Jonathan was doing in his
>>>> proposal?)

Oops, missed the discussion.
IMHO shared rings are more elegant here. With that the app -> kernel
buffer return path doesn't need to setsockopt(), which will have to
figure out how to return buffers to pp efficiently, and then potentially
some sync on the pp allocation side. It just grabs entries from the ring
in the napi context on allocation when necessary.
But then you basically get the io_uring zc rx... just saying

>>> I'll take a look at Jonathan's proposal, sorry, I'm not immediately
>>> familiar but I wanted to respond :-) But is the suggestion here to
>>> build a new kernel-user communication channel primitive for the
>>> purpose of passing the information in the devmem cmsg? IMHO that seems
>>> like an overkill. Why add 100-200 lines of code to the kernel to add
>>> something that can already be done with existing primitives? I don't
>>> see anything concretely wrong with cmsg & setsockopt approach, and if
>>> we switch to something I'd prefer to switch to an existing primitive
>>> for simplicity?
>>>
>>> The only other existing primitive to pass data outside of the linear
>>> buffer is the MSG_ERRQUEUE that is used for zerocopy. Is that
>>> preferred? Any other suggestions or existing primitives I'm not aware
>>> of?
>>>
>>>> or bite the bullet and switch to io_uring.
>>>>
>>>
>>> IMO io_uring & socket support are orthogonal, and one doesn't preclude
>>> the other.

They don't preclude each other, but I wouldn't say they're orthogonal.
Similar approaches, some different details. FWIW, we'll be posting a
next iteration on top of the pp providers patches soon.

>>> As you know we like to use sockets and I believe there are
>>> issues with io_uring adoption at Google that I'm not familiar with
>>> (and could be wrong). I'm interested in exploring io_uring support as
>>> a follow up but I think David Wei will be interested in io_uring
>>> support as well anyway.

Well, not exactly support of devmem, but true, we definitely want
to have io_uring zerocopy, considering all the api differences.
(at the same time not duplicating net bits).

>> I also disagree that we need to replace a standard socket interface
>> with something "faster", in quotes.
>>
>> This interface is not the bottleneck to the target workload.
>>
>> Replacing the synchronous sockets interface with something more
>> performant for workloads where it is, is an orthogonal challenge.
>> However we do that, I think that traditional sockets should continue
>> to be supported.
>>
>> The feature may already even work with io_uring, as both recvmsg with
>> cmsg and setsockopt have io_uring support now.

It should, in theory, but the api wouldn't suit io_uring, internals
wouldn't be properly optimised, and we can't use it with some
important features like multishot recv because of cmsg.

> I'm not really concerned with faster. I would prefer something cleaner :-)
>
> Or maybe we should just have it documented. With some kind of path
> towards beautiful world where we can create dynamic queues..



--
Pavel Begunkov

2023-12-08 20:21:30

by Pavel Begunkov

[permalink] [raw]
Subject: Re: [RFC PATCH v3 10/12] tcp: RX path for devmem TCP

On 11/9/23 16:07, Edward Cree wrote:
> On 09/11/2023 02:39, Mina Almasry wrote:
>> On Wed, Nov 8, 2023 at 7:36 AM Edward Cree <[email protected]> wrote:
>>> If not then surely the way to return a memory area
>>> in an io_uring idiom is just to post a new read sqe ('RX descriptor')
>>> pointing into it, rather than explicitly returning it with setsockopt.
>>
>> We're interested in using this with regular TCP sockets, not
>> necessarily io_uring.
> Fair. I just wanted to push against the suggestion upthread that "oh,
> since io_uring supports setsockopt() we can just ignore it and it'll
> all magically work later" (paraphrased).

IMHO, that'd be horrible, but that why there are io_uring zc rx
patches, and we'll be sending an update soon

https://lore.kernel.org/all/[email protected]/


> If you can keep the "allocate buffers out of a devmem region" and "post
> RX descriptors built on those buffers" APIs separate (inside the
> kernel; obviously both triggered by a single call to the setsockopt()
> uAPI) that'll likely make things simpler for the io_uring interface I
> describe, which will only want the latter.
> PS: Here's a crazy idea that I haven't thought through at all: what if
> you allow device memory to be mmap()ed into process address space
> (obviously with none of r/w/x because it's unreachable), so that your
> various uAPIs can just operate on pointers (e.g. the setsockopt
> becomes the madvise it's named after; recvmsg just uses or populates
> the iovec rather than needing a cmsg). Then if future devices have
> their memory CXL accessible that can potentially be enabled with no
> change to the uAPI (userland just starts being able to access the
> region without faulting).
> And you can maybe add a semantic flag to recvmsg saying "if you don't
> use all the buffers in my iovec, keep hold of the rest of them for
> future incoming traffic, and if I post new buffers with my next
> recvmsg, add those to the tail of the RXQ rather than replacing the
> ones you've got". That way you can still have the "userland
> directly fills the RX ring" behaviour even with TCP sockets.
>

--
Pavel Begunkov

2023-12-08 20:36:08

by Pavel Begunkov

[permalink] [raw]
Subject: Re: [RFC PATCH v3 10/12] tcp: RX path for devmem TCP

On 11/6/23 22:55, Willem de Bruijn wrote:
> On Mon, Nov 6, 2023 at 2:34 PM Stanislav Fomichev <[email protected]> wrote:
>>
>> On 11/06, Willem de Bruijn wrote:
>>>>> IMHO, we need a better UAPI to receive the tokens and give them back to
>>>>> the kernel. CMSG + setsockopt(SO_DEVMEM_DONTNEED) get the job done,
>>>>> but look dated and hacky :-(
>>>>>
>>>>> We should either do some kind of user/kernel shared memory queue to
>>>>> receive/return the tokens (similar to what Jonathan was doing in his
>>>>> proposal?)
>>>>
>>>> I'll take a look at Jonathan's proposal, sorry, I'm not immediately
>>>> familiar but I wanted to respond :-) But is the suggestion here to
>>>> build a new kernel-user communication channel primitive for the
>>>> purpose of passing the information in the devmem cmsg? IMHO that seems
>>>> like an overkill. Why add 100-200 lines of code to the kernel to add
>>>> something that can already be done with existing primitives? I don't
>>>> see anything concretely wrong with cmsg & setsockopt approach, and if
>>>> we switch to something I'd prefer to switch to an existing primitive
>>>> for simplicity?
>>>>
>>>> The only other existing primitive to pass data outside of the linear
>>>> buffer is the MSG_ERRQUEUE that is used for zerocopy. Is that
>>>> preferred? Any other suggestions or existing primitives I'm not aware
>>>> of?
>>>>
>>>>> or bite the bullet and switch to io_uring.
>>>>>
>>>>
>>>> IMO io_uring & socket support are orthogonal, and one doesn't preclude
>>>> the other. As you know we like to use sockets and I believe there are
>>>> issues with io_uring adoption at Google that I'm not familiar with
>>>> (and could be wrong). I'm interested in exploring io_uring support as
>>>> a follow up but I think David Wei will be interested in io_uring
>>>> support as well anyway.
>>>
>>> I also disagree that we need to replace a standard socket interface
>>> with something "faster", in quotes.
>>>
>>> This interface is not the bottleneck to the target workload.
>>>
>>> Replacing the synchronous sockets interface with something more
>>> performant for workloads where it is, is an orthogonal challenge.
>>> However we do that, I think that traditional sockets should continue
>>> to be supported.
>>>
>>> The feature may already even work with io_uring, as both recvmsg with
>>> cmsg and setsockopt have io_uring support now.
>>
>> I'm not really concerned with faster. I would prefer something cleaner :-)
>>
>> Or maybe we should just have it documented. With some kind of path
>> towards beautiful world where we can create dynamic queues..
>
> I suppose we just disagree on the elegance of the API.
>
> The concise notification API returns tokens as a range for
> compression, encoding as two 32-bit unsigned integers start + length.
> It allows for even further batching by returning multiple such ranges
> in a single call.

FWIW, nothing prevents io_uring from compressing ranges. The io_uring
zc RFC returns {offset, size} as well, though at the moment the would
lie in the same page.

> This is analogous to the MSG_ZEROCOPY notification mechanism from
> kernel to user.
>
> The synchronous socket syscall interface can be replaced by something
> asynchronous like io_uring. This already works today? Whatever

If you mean async io_uring recv, it does work. In short, internally
it polls the socket and then calls sock_recvmsg(). There is also a
feature that would make it return back to polling after sock_recvmsg()
and loop like this.

> asynchronous ring-based API would be selected, io_uring or otherwise,
> I think the concise notification encoding would remain as is.
>
> Since this is an operation on a socket, I find a setsockopt the
> fitting interface.

--
Pavel Begunkov