There are a couple of tricks we can do with io_uring to improve ubuf_info
refcounting. First, we amortise reference grabbing and then give the
references away to the network layer, which is implemented in patches 8
and 11. We also don't need additional pinning for TCP, which is removed
by patch 7.
Patches 1-4 are needed because otherwise we're out of space in
io_notif_data, and reusing ->desc or some other field of ubuf_info would
be ugly. They'll also facilitate further ideas like adding a simpler
notification model for UDP.
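For reference, the scheme the io_uring side ends up with looks roughly
like this (a condensed sketch of what patches 8-11 implement; locking
and error handling omitted):

	/* at notif allocation: grab a batch of refs upfront */
	refcount_set(&nd->uarg.refcnt, IO_NOTIF_REF_CACHE_NR + 1);
	nd->cached_refs = IO_NOTIF_REF_CACHE_NR;

	/* per send: offer one of the cached refs to the network layer */
	ubuf->flags |= UARGFL_GIFT_REF;
	ret = sock_sendmsg(sock, &msg);
	if (!(ubuf->flags & UARGFL_GIFT_REF))
		io_notif_consume_ref(notif);	/* taken; may refill cache */

	/* at flush: drop the master ref and all unused cached refs */
	if (refcount_sub_and_test(nd->cached_refs + 1, &nd->uarg.refcnt))
		io_notif_complete(notif);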
The liburing/examples/io_uring-sendzc benchmark, run on a branch
containing this patchset and some more patches [1], showed a ~1.6% QPS
improvement for UDP (dummy dev) and ~1% for TCP (localhost + hacks
enabling zc).
I didn't specifically test xen and vhost, and I'm not sure how to;
I'd love some help with that.
[1] https://github.com/isilence/linux/tree/net/zc-ref-optimisation
Pavel Begunkov (11):
net: introduce struct ubuf_info_msgzc
xen/netback: use struct ubuf_info_msgzc
vhost/net: use struct ubuf_info_msgzc
net: shrink struct ubuf_info
net: rename ubuf_info's flags
net: add flags for controlling ubuf_info
net/tcp: optimise tcp ubuf refcounting
net: let callers provide ->msg_ubuf refs
io_uring/notif: add helper for flushing refs
io_uring/notif: mark notifs with UARGFL_CALLER_PINNED
io_uring/notif: add ubuf_info ref caching
drivers/net/xen-netback/common.h | 2 +-
drivers/net/xen-netback/interface.c | 4 +--
drivers/net/xen-netback/netback.c | 7 +++---
drivers/vhost/net.c | 17 +++++++------
include/linux/skbuff.h | 35 +++++++++++++++++++++++---
io_uring/net.c | 8 +++++-
io_uring/notif.c | 21 ++++++++++------
io_uring/notif.h | 22 +++++++++++++++-
net/core/skbuff.c | 39 ++++++++++++++++-------------
net/ipv4/ip_output.c | 3 ++-
net/ipv4/tcp.c | 11 +++++---
net/ipv6/ip6_output.c | 3 ++-
12 files changed, 123 insertions(+), 49 deletions(-)
--
2.37.0
struct ubuf_info will be changed; use ubuf_info_msgzc instead.
Signed-off-by: Pavel Begunkov <[email protected]>
---
drivers/net/xen-netback/common.h | 2 +-
drivers/net/xen-netback/interface.c | 4 ++--
drivers/net/xen-netback/netback.c | 7 ++++---
3 files changed, 7 insertions(+), 6 deletions(-)
diff --git a/drivers/net/xen-netback/common.h b/drivers/net/xen-netback/common.h
index 8174d7b2966c..1545cbee77a4 100644
--- a/drivers/net/xen-netback/common.h
+++ b/drivers/net/xen-netback/common.h
@@ -62,7 +62,7 @@ struct pending_tx_info {
* ubuf_to_vif is a helper which finds the struct xenvif from a pointer
* to this field.
*/
- struct ubuf_info callback_struct;
+ struct ubuf_info_msgzc callback_struct;
};
#define XEN_NETIF_TX_RING_SIZE __CONST_RING_SIZE(xen_netif_tx, XEN_PAGE_SIZE)
diff --git a/drivers/net/xen-netback/interface.c b/drivers/net/xen-netback/interface.c
index fb32ae82d9b0..e579ecd40b74 100644
--- a/drivers/net/xen-netback/interface.c
+++ b/drivers/net/xen-netback/interface.c
@@ -591,8 +591,8 @@ int xenvif_init_queue(struct xenvif_queue *queue)
}
for (i = 0; i < MAX_PENDING_REQS; i++) {
- queue->pending_tx_info[i].callback_struct = (struct ubuf_info)
- { .callback = xenvif_zerocopy_callback,
+ queue->pending_tx_info[i].callback_struct = (struct ubuf_info_msgzc)
+ { { .callback = xenvif_zerocopy_callback },
{ { .ctx = NULL,
.desc = i } } };
queue->grant_tx_handle[i] = NETBACK_INVALID_HANDLE;
diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c
index a256695fc89e..3d2081bbbc86 100644
--- a/drivers/net/xen-netback/netback.c
+++ b/drivers/net/xen-netback/netback.c
@@ -133,7 +133,7 @@ static inline unsigned long idx_to_kaddr(struct xenvif_queue *queue,
/* Find the containing VIF's structure from a pointer in pending_tx_info array
*/
-static inline struct xenvif_queue *ubuf_to_queue(const struct ubuf_info *ubuf)
+static inline struct xenvif_queue *ubuf_to_queue(const struct ubuf_info_msgzc *ubuf)
{
u16 pending_idx = ubuf->desc;
struct pending_tx_info *temp =
@@ -1228,11 +1228,12 @@ static int xenvif_tx_submit(struct xenvif_queue *queue)
return work_done;
}
-void xenvif_zerocopy_callback(struct sk_buff *skb, struct ubuf_info *ubuf,
+void xenvif_zerocopy_callback(struct sk_buff *skb, struct ubuf_info *ubuf_base,
bool zerocopy_success)
{
unsigned long flags;
pending_ring_idx_t index;
+ struct ubuf_info_msgzc *ubuf = uarg_to_msgzc(ubuf_base);
struct xenvif_queue *queue = ubuf_to_queue(ubuf);
/* This is the only place where we grab this lock, to protect callbacks
@@ -1241,7 +1242,7 @@ void xenvif_zerocopy_callback(struct sk_buff *skb, struct ubuf_info *ubuf,
spin_lock_irqsave(&queue->callback_lock, flags);
do {
u16 pending_idx = ubuf->desc;
- ubuf = (struct ubuf_info *) ubuf->ctx;
+ ubuf = (struct ubuf_info_msgzc *) ubuf->ctx;
BUG_ON(queue->dealloc_prod - queue->dealloc_cons >=
MAX_PENDING_REQS);
index = pending_index(queue->dealloc_prod);
--
2.37.0
ubuf_info::flags contains SKBFL_* flags that we copy into skbs; rename
the field to stress that it holds skb flags.
Signed-off-by: Pavel Begunkov <[email protected]>
---
include/linux/skbuff.h | 4 ++--
io_uring/notif.c | 2 +-
net/core/skbuff.c | 2 +-
3 files changed, 4 insertions(+), 4 deletions(-)
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index afd7400d7f62..e749b5d3868d 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -534,7 +534,7 @@ struct ubuf_info {
void (*callback)(struct sk_buff *, struct ubuf_info *,
bool zerocopy_success);
refcount_t refcnt;
- u8 flags;
+ u8 skb_flags;
};
struct ubuf_info_msgzc {
@@ -1664,7 +1664,7 @@ static inline void net_zcopy_get(struct ubuf_info *uarg)
static inline void skb_zcopy_init(struct sk_buff *skb, struct ubuf_info *uarg)
{
skb_shinfo(skb)->destructor_arg = uarg;
- skb_shinfo(skb)->flags |= uarg->flags;
+ skb_shinfo(skb)->flags |= uarg->skb_flags;
}
static inline void skb_zcopy_set(struct sk_buff *skb, struct ubuf_info *uarg,
diff --git a/io_uring/notif.c b/io_uring/notif.c
index b5f989dff9de..97cb4a7e8849 100644
--- a/io_uring/notif.c
+++ b/io_uring/notif.c
@@ -65,7 +65,7 @@ struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx,
nd = io_notif_to_data(notif);
nd->account_pages = 0;
- nd->uarg.flags = SKBFL_ZEROCOPY_FRAG | SKBFL_DONT_ORPHAN;
+ nd->uarg.skb_flags = SKBFL_ZEROCOPY_FRAG | SKBFL_DONT_ORPHAN;
nd->uarg.callback = io_uring_tx_zerocopy_callback;
/* master ref owned by io_notif_slot, will be dropped on flush */
refcount_set(&nd->uarg.refcnt, 1);
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index b047a773acd7..40bb84986800 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -1206,7 +1206,7 @@ static struct ubuf_info *msg_zerocopy_alloc(struct sock *sk, size_t size)
uarg->len = 1;
uarg->bytelen = size;
uarg->zerocopy = 1;
- uarg->ubuf.flags = SKBFL_ZEROCOPY_FRAG | SKBFL_DONT_ORPHAN;
+ uarg->ubuf.skb_flags = SKBFL_ZEROCOPY_FRAG | SKBFL_DONT_ORPHAN;
refcount_set(&uarg->ubuf.refcnt, 1);
sock_hold(sk);
--
2.37.0
We can benefit from a smaller struct ubuf_info, so leave only the
mandatory fields and let users decide how they want to extend it.
Convert MSG_ZEROCOPY to struct ubuf_info_msgzc and remove the now
duplicated fields.
This reduces the size from 48 bytes to just 16.
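For illustration, on a typical 64-bit build the remaining fields pack
roughly as follows (a sketch; exact padding is compiler/ABI dependent):

	struct ubuf_info {
		void (*callback)(struct sk_buff *, struct ubuf_info *,
				 bool zerocopy_success);	/* 8 bytes */
		refcount_t refcnt;				/* 4 bytes */
		u8 flags;					/* 1 byte  */
	};	/* 13 bytes of fields, padded up to 16 */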
Signed-off-by: Pavel Begunkov <[email protected]>
---
include/linux/skbuff.h | 22 ++++------------------
net/core/skbuff.c | 38 +++++++++++++++++++++-----------------
net/ipv4/ip_output.c | 2 +-
net/ipv4/tcp.c | 2 +-
net/ipv6/ip6_output.c | 2 +-
5 files changed, 28 insertions(+), 38 deletions(-)
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index f8ac3678dab8..afd7400d7f62 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -533,25 +533,8 @@ enum {
struct ubuf_info {
void (*callback)(struct sk_buff *, struct ubuf_info *,
bool zerocopy_success);
- union {
- struct {
- unsigned long desc;
- void *ctx;
- };
- struct {
- u32 id;
- u16 len;
- u16 zerocopy:1;
- u32 bytelen;
- };
- };
refcount_t refcnt;
u8 flags;
-
- struct mmpin {
- struct user_struct *user;
- unsigned int num_pg;
- } mmp;
};
struct ubuf_info_msgzc {
@@ -570,7 +553,10 @@ struct ubuf_info_msgzc {
};
};
- struct mmpin mmp;
+ struct mmpin {
+ struct user_struct *user;
+ unsigned int num_pg;
+ } mmp;
};
#define skb_uarg(SKB) ((struct ubuf_info *)(skb_shinfo(SKB)->destructor_arg))
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 974bbbbe7138..b047a773acd7 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -1183,7 +1183,7 @@ EXPORT_SYMBOL_GPL(mm_unaccount_pinned_pages);
static struct ubuf_info *msg_zerocopy_alloc(struct sock *sk, size_t size)
{
- struct ubuf_info *uarg;
+ struct ubuf_info_msgzc *uarg;
struct sk_buff *skb;
WARN_ON_ONCE(!in_task());
@@ -1201,19 +1201,19 @@ static struct ubuf_info *msg_zerocopy_alloc(struct sock *sk, size_t size)
return NULL;
}
- uarg->callback = msg_zerocopy_callback;
+ uarg->ubuf.callback = msg_zerocopy_callback;
uarg->id = ((u32)atomic_inc_return(&sk->sk_zckey)) - 1;
uarg->len = 1;
uarg->bytelen = size;
uarg->zerocopy = 1;
- uarg->flags = SKBFL_ZEROCOPY_FRAG | SKBFL_DONT_ORPHAN;
- refcount_set(&uarg->refcnt, 1);
+ uarg->ubuf.flags = SKBFL_ZEROCOPY_FRAG | SKBFL_DONT_ORPHAN;
+ refcount_set(&uarg->ubuf.refcnt, 1);
sock_hold(sk);
- return uarg;
+ return &uarg->ubuf;
}
-static inline struct sk_buff *skb_from_uarg(struct ubuf_info *uarg)
+static inline struct sk_buff *skb_from_uarg(struct ubuf_info_msgzc *uarg)
{
return container_of((void *)uarg, struct sk_buff, cb);
}
@@ -1222,6 +1222,7 @@ struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size,
struct ubuf_info *uarg)
{
if (uarg) {
+ struct ubuf_info_msgzc *uarg_zc;
const u32 byte_limit = 1 << 19; /* limit to a few TSO */
u32 bytelen, next;
@@ -1237,8 +1238,9 @@ struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size,
return NULL;
}
- bytelen = uarg->bytelen + size;
- if (uarg->len == USHRT_MAX - 1 || bytelen > byte_limit) {
+ uarg_zc = uarg_to_msgzc(uarg);
+ bytelen = uarg_zc->bytelen + size;
+ if (uarg_zc->len == USHRT_MAX - 1 || bytelen > byte_limit) {
/* TCP can create new skb to attach new uarg */
if (sk->sk_type == SOCK_STREAM)
goto new_alloc;
@@ -1246,11 +1248,11 @@ struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size,
}
next = (u32)atomic_read(&sk->sk_zckey);
- if ((u32)(uarg->id + uarg->len) == next) {
- if (mm_account_pinned_pages(&uarg->mmp, size))
+ if ((u32)(uarg_zc->id + uarg_zc->len) == next) {
+ if (mm_account_pinned_pages(&uarg_zc->mmp, size))
return NULL;
- uarg->len++;
- uarg->bytelen = bytelen;
+ uarg_zc->len++;
+ uarg_zc->bytelen = bytelen;
atomic_set(&sk->sk_zckey, ++next);
/* no extra ref when appending to datagram (MSG_MORE) */
@@ -1286,7 +1288,7 @@ static bool skb_zerocopy_notify_extend(struct sk_buff *skb, u32 lo, u16 len)
return true;
}
-static void __msg_zerocopy_callback(struct ubuf_info *uarg)
+static void __msg_zerocopy_callback(struct ubuf_info_msgzc *uarg)
{
struct sk_buff *tail, *skb = skb_from_uarg(uarg);
struct sock_exterr_skb *serr;
@@ -1339,19 +1341,21 @@ static void __msg_zerocopy_callback(struct ubuf_info *uarg)
void msg_zerocopy_callback(struct sk_buff *skb, struct ubuf_info *uarg,
bool success)
{
- uarg->zerocopy = uarg->zerocopy & success;
+ struct ubuf_info_msgzc *uarg_zc = uarg_to_msgzc(uarg);
+
+ uarg_zc->zerocopy = uarg_zc->zerocopy & success;
if (refcount_dec_and_test(&uarg->refcnt))
- __msg_zerocopy_callback(uarg);
+ __msg_zerocopy_callback(uarg_zc);
}
EXPORT_SYMBOL_GPL(msg_zerocopy_callback);
void msg_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref)
{
- struct sock *sk = skb_from_uarg(uarg)->sk;
+ struct sock *sk = skb_from_uarg(uarg_to_msgzc(uarg))->sk;
atomic_dec(&sk->sk_zckey);
- uarg->len--;
+ uarg_to_msgzc(uarg)->len--;
if (have_uref)
msg_zerocopy_callback(NULL, uarg, true);
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index d7bd1daf022b..546897a4b4fa 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -1043,7 +1043,7 @@ static int __ip_append_data(struct sock *sk,
paged = true;
zc = true;
} else {
- uarg->zerocopy = 0;
+ uarg_to_msgzc(uarg)->zerocopy = 0;
skb_zcopy_set(skb, uarg, &extra_uref);
}
}
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 970e9a2cca4a..3152da8f4763 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1239,7 +1239,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
}
zc = sk->sk_route_caps & NETIF_F_SG;
if (!zc)
- uarg->zerocopy = 0;
+ uarg_to_msgzc(uarg)->zerocopy = 0;
}
}
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 897ca4f9b791..6d4f01a0cf6e 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -1568,7 +1568,7 @@ static int __ip6_append_data(struct sock *sk,
paged = true;
zc = true;
} else {
- uarg->zerocopy = 0;
+ uarg_to_msgzc(uarg)->zerocopy = 0;
skb_zcopy_set(skb, uarg, &extra_uref);
}
}
--
2.37.0
ubuf_info already has skb_flags, which augment the skbs it is attached
to. Also add flags controlling the ubuf_info itself, mainly to hint
about various referencing aspects of it; the flags will be introduced
in later patches.
Signed-off-by: Pavel Begunkov <[email protected]>
---
include/linux/skbuff.h | 1 +
io_uring/notif.c | 1 +
net/core/skbuff.c | 1 +
3 files changed, 3 insertions(+)
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index e749b5d3868d..2b2e0020030b 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -535,6 +535,7 @@ struct ubuf_info {
bool zerocopy_success);
refcount_t refcnt;
u8 skb_flags;
+ u8 flags;
};
struct ubuf_info_msgzc {
diff --git a/io_uring/notif.c b/io_uring/notif.c
index 97cb4a7e8849..a2ba1e35a59f 100644
--- a/io_uring/notif.c
+++ b/io_uring/notif.c
@@ -66,6 +66,7 @@ struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx,
nd = io_notif_to_data(notif);
nd->account_pages = 0;
nd->uarg.skb_flags = SKBFL_ZEROCOPY_FRAG | SKBFL_DONT_ORPHAN;
+ nd->uarg.flags = 0;
nd->uarg.callback = io_uring_tx_zerocopy_callback;
/* master ref owned by io_notif_slot, will be dropped on flush */
refcount_set(&nd->uarg.refcnt, 1);
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 40bb84986800..7e102373482c 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -1207,6 +1207,7 @@ static struct ubuf_info *msg_zerocopy_alloc(struct sock *sk, size_t size)
uarg->bytelen = size;
uarg->zerocopy = 1;
uarg->ubuf.skb_flags = SKBFL_ZEROCOPY_FRAG | SKBFL_DONT_ORPHAN;
+ uarg->ubuf.flags = 0;
refcount_set(&uarg->ubuf.refcnt, 1);
sock_hold(sk);
--
2.37.0
Add UARGFL_CALLER_PINNED, letting protocols know that the caller holds
a reference to the ubuf_info, so no additional refcounting is needed to
keep it alive. With that, TCP can save a refcount get/put pair per send
when used with ->msg_ubuf.
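A caller that guarantees the uarg lifetime would opt in roughly like
this (an illustrative sketch; get_my_pinned_uarg() is hypothetical,
io_uring does the equivalent in a later patch):

	/* uarg is guaranteed to outlive the sendmsg call */
	struct ubuf_info *uarg = get_my_pinned_uarg();

	uarg->flags |= UARGFL_CALLER_PINNED;
	msg.msg_ubuf = uarg;
	ret = sock_sendmsg(sock, &msg);
	/* TCP skipped the per-send net_zcopy_get()/net_zcopy_put() */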
Signed-off-by: Pavel Begunkov <[email protected]>
---
include/linux/skbuff.h | 7 +++++++
net/ipv4/tcp.c | 9 ++++++---
2 files changed, 13 insertions(+), 3 deletions(-)
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 2b2e0020030b..45fe7f0648d0 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -522,6 +522,13 @@ enum {
#define SKBFL_ALL_ZEROCOPY (SKBFL_ZEROCOPY_FRAG | SKBFL_PURE_ZEROCOPY | \
SKBFL_DONT_ORPHAN | SKBFL_MANAGED_FRAG_REFS)
+enum {
+ /* The caller holds a reference during the submission so the ubuf won't
+ * be freed until we return.
+ */
+ UARGFL_CALLER_PINNED = BIT(0),
+};
+
/*
* The callback notifies userspace to release buffers when skb DMA is done in
* lower device, the skb last reference should be 0 when calling this.
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 3152da8f4763..4925107de57d 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1229,7 +1229,8 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
if (msg->msg_ubuf) {
uarg = msg->msg_ubuf;
- net_zcopy_get(uarg);
+ if (!(uarg->flags & UARGFL_CALLER_PINNED))
+ net_zcopy_get(uarg);
zc = sk->sk_route_caps & NETIF_F_SG;
} else if (sock_flag(sk, SOCK_ZEROCOPY)) {
uarg = msg_zerocopy_realloc(sk, size, skb_zcopy(skb));
@@ -1455,7 +1456,8 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
tcp_push(sk, flags, mss_now, tp->nonagle, size_goal);
}
out_nopush:
- net_zcopy_put(uarg);
+ if (uarg && !(uarg->flags & UARGFL_CALLER_PINNED))
+ net_zcopy_put(uarg);
return copied + copied_syn;
do_error:
@@ -1464,7 +1466,8 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
if (copied + copied_syn)
goto out;
out_err:
- net_zcopy_put_abort(uarg, true);
+ if (uarg && !(uarg->flags & UARGFL_CALLER_PINNED))
+ net_zcopy_put_abort(uarg, true);
err = sk_stream_error(sk, flags, err);
/* make sure we wake any epoll edge trigger waiter */
if (unlikely(tcp_rtx_and_write_queues_empty(sk) && err == -EAGAIN)) {
--
2.37.0
Some msg_ubuf providers like io_uring can implement elaborate ubuf_info
reference batching and caching, so it's beneficial to let the network
layer optionally steal some of the cached refs.
Add UARGFL_GIFT_REF: if set, the caller has at least one extra
reference that it can gift away. If the network layer decides to take
the ref, it should clear the flag.
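The resulting contract looks roughly like this (a sketch; the
__ip_append_data()/__ip6_append_data() hunks below are the real
consumers):

	/* caller side: offer a reference it already holds */
	uarg->flags |= UARGFL_GIFT_REF;
	sock_sendmsg(sock, &msg);
	ref_was_taken = !(uarg->flags & UARGFL_GIFT_REF);

	/* network side: take the gifted ref instead of grabbing a new one */
	extra_uref = net_zcopy_get_gift_ref(uarg);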
Signed-off-by: Pavel Begunkov <[email protected]>
---
include/linux/skbuff.h | 14 ++++++++++++++
net/ipv4/ip_output.c | 1 +
net/ipv6/ip6_output.c | 1 +
3 files changed, 16 insertions(+)
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 45fe7f0648d0..972ec676e222 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -527,6 +527,11 @@ enum {
* be freed until we return.
*/
UARGFL_CALLER_PINNED = BIT(0),
+
+ /* The caller can gift one ubuf reference. The flag should be cleared
+ * when the reference is taken.
+ */
+ UARGFL_GIFT_REF = BIT(1),
};
/*
@@ -1709,6 +1714,15 @@ static inline void net_zcopy_put(struct ubuf_info *uarg)
uarg->callback(NULL, uarg, true);
}
+static inline bool net_zcopy_get_gift_ref(struct ubuf_info *uarg)
+{
+ bool has_ref;
+
+ has_ref = uarg->flags & UARGFL_GIFT_REF;
+ uarg->flags &= ~UARGFL_GIFT_REF;
+ return has_ref;
+}
+
static inline void net_zcopy_put_abort(struct ubuf_info *uarg, bool have_uref)
{
if (uarg) {
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 546897a4b4fa..9d42b6dd6b78 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -1032,6 +1032,7 @@ static int __ip_append_data(struct sock *sk,
paged = true;
zc = true;
uarg = msg->msg_ubuf;
+ extra_uref = net_zcopy_get_gift_ref(uarg);
}
} else if (sock_flag(sk, SOCK_ZEROCOPY)) {
uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb));
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 6d4f01a0cf6e..8d8a8bbdb8df 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -1557,6 +1557,7 @@ static int __ip6_append_data(struct sock *sk,
paged = true;
zc = true;
uarg = msg->msg_ubuf;
+ extra_uref = net_zcopy_get_gift_ref(uarg);
}
} else if (sock_flag(sk, SOCK_ZEROCOPY)) {
uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb));
--
2.37.0
Add a helper for dropping notification references during flush. It's a
preparation patch: currently there is only one master ref, but we're
going to add ref caching.
Signed-off-by: Pavel Begunkov <[email protected]>
---
io_uring/notif.c | 14 +++++++++-----
1 file changed, 9 insertions(+), 5 deletions(-)
diff --git a/io_uring/notif.c b/io_uring/notif.c
index a2ba1e35a59f..5661681b3b44 100644
--- a/io_uring/notif.c
+++ b/io_uring/notif.c
@@ -73,6 +73,13 @@ struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx,
return notif;
}
+static inline bool io_notif_drop_refs(struct io_notif_data *nd)
+{
+ int refs = 1;
+
+ return refcount_sub_and_test(refs, &nd->uarg.refcnt);
+}
+
void io_notif_slot_flush(struct io_notif_slot *slot)
__must_hold(&ctx->uring_lock)
{
@@ -81,8 +88,7 @@ void io_notif_slot_flush(struct io_notif_slot *slot)
slot->notif = NULL;
- /* drop slot's master ref */
- if (refcount_dec_and_test(&nd->uarg.refcnt))
+ if (io_notif_drop_refs(nd))
io_notif_complete(notif);
}
@@ -97,13 +103,11 @@ __cold int io_notif_unregister(struct io_ring_ctx *ctx)
for (i = 0; i < ctx->nr_notif_slots; i++) {
struct io_notif_slot *slot = &ctx->notif_slots[i];
struct io_kiocb *notif = slot->notif;
- struct io_notif_data *nd;
if (!notif)
continue;
- nd = io_kiocb_to_cmd(notif);
slot->notif = NULL;
- if (!refcount_dec_and_test(&nd->uarg.refcnt))
+ if (!io_notif_drop_refs(io_kiocb_to_cmd(notif)))
continue;
notif->io_task_work.func = __io_notif_complete_tw;
io_req_task_work_add(notif);
--
2.37.0
Cache some active notifier references on the io_uring side and take
them in batches, so the amortised cost is low. These references can
then be given away to the network layer using UARGFL_GIFT_REF.
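With IO_NOTIF_REF_CACHE_NR = 64 the accounting works out roughly as
follows (an illustrative trace, not from a real run; the network's
callbacks put their gifted refs asynchronously):

	alloc:			refcnt = 65, cached_refs = 64
	gifted send:		cached_refs = 63 (refcnt untouched here)
	... 63 more sends:	cached_refs = 0, cache is refilled:
				refcnt += 64, cached_refs = 64
	flush:			refcnt -= cached_refs + 1 (cache + master ref)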
Signed-off-by: Pavel Begunkov <[email protected]>
---
io_uring/net.c | 8 +++++++-
io_uring/notif.c | 6 ++++--
io_uring/notif.h | 22 +++++++++++++++++++++-
3 files changed, 32 insertions(+), 4 deletions(-)
diff --git a/io_uring/net.c b/io_uring/net.c
index e6fc9748fbd2..bdaf9b10bd1b 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -949,6 +949,7 @@ int io_sendzc(struct io_kiocb *req, unsigned int issue_flags)
struct io_sendzc *zc = io_kiocb_to_cmd(req);
struct io_notif_slot *notif_slot;
struct io_kiocb *notif;
+ struct ubuf_info *ubuf;
struct msghdr msg;
struct iovec iov;
struct socket *sock;
@@ -1007,10 +1008,15 @@ int io_sendzc(struct io_kiocb *req, unsigned int issue_flags)
min_ret = iov_iter_count(&msg.msg_iter);
msg.msg_flags = msg_flags;
- msg.msg_ubuf = &io_notif_to_data(notif)->uarg;
msg.sg_from_iter = io_sg_from_iter;
+ msg.msg_ubuf = ubuf = &io_notif_to_data(notif)->uarg;
+ ubuf->flags |= UARGFL_GIFT_REF;
ret = sock_sendmsg(sock, &msg);
+ /* check if the send consumed an additional ref */
+ if (likely(!(ubuf->flags & UARGFL_GIFT_REF)))
+ io_notif_consume_ref(notif);
+
if (unlikely(ret < min_ret)) {
if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
return -EAGAIN;
diff --git a/io_uring/notif.c b/io_uring/notif.c
index dd346ea67580..73bbda5de07d 100644
--- a/io_uring/notif.c
+++ b/io_uring/notif.c
@@ -68,15 +68,17 @@ struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx,
nd->uarg.skb_flags = SKBFL_ZEROCOPY_FRAG | SKBFL_DONT_ORPHAN;
nd->uarg.flags = UARGFL_CALLER_PINNED;
nd->uarg.callback = io_uring_tx_zerocopy_callback;
+ nd->cached_refs = IO_NOTIF_REF_CACHE_NR;
/* master ref owned by io_notif_slot, will be dropped on flush */
- refcount_set(&nd->uarg.refcnt, 1);
+ refcount_set(&nd->uarg.refcnt, IO_NOTIF_REF_CACHE_NR + 1);
return notif;
}
static inline bool io_notif_drop_refs(struct io_notif_data *nd)
{
- int refs = 1;
+ int refs = nd->cached_refs + 1;
+ nd->cached_refs = 0;
return refcount_sub_and_test(refs, &nd->uarg.refcnt);
}
diff --git a/io_uring/notif.h b/io_uring/notif.h
index 0819304d7e00..2a263055a53b 100644
--- a/io_uring/notif.h
+++ b/io_uring/notif.h
@@ -9,11 +9,14 @@
#define IO_NOTIF_SPLICE_BATCH 32
#define IORING_MAX_NOTIF_SLOTS (1U << 10)
+#define IO_NOTIF_REF_CACHE_NR 64
struct io_notif_data {
struct file *file;
- struct ubuf_info uarg;
unsigned long account_pages;
+ /* extra uarg->refcnt refs */
+ int cached_refs;
+ struct ubuf_info uarg;
};
struct io_notif_slot {
@@ -88,3 +91,20 @@ static inline int io_notif_account_mem(struct io_kiocb *notif, unsigned len)
}
return 0;
}
+
+static inline void io_notif_consume_ref(struct io_kiocb *notif)
+ __must_hold(&ctx->uring_lock)
+{
+ struct io_notif_data *nd = io_notif_to_data(notif);
+
+ nd->cached_refs--;
+
+ /*
+ * Issue sends without looking at notif->cached_refs first, so we
+ * always have to have at least one ref cached
+ */
+ if (unlikely(!nd->cached_refs)) {
+ refcount_add(IO_NOTIF_REF_CACHE_NR, &nd->uarg.refcnt);
+ nd->cached_refs += IO_NOTIF_REF_CACHE_NR;
+ }
+}
--
2.37.0
We always keep references to active notifications and drop them only
when we flush, so they're always pinned during sock_sendmsg() and we
can set UARGFL_CALLER_PINNED.
Signed-off-by: Pavel Begunkov <[email protected]>
---
io_uring/notif.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/io_uring/notif.c b/io_uring/notif.c
index 5661681b3b44..dd346ea67580 100644
--- a/io_uring/notif.c
+++ b/io_uring/notif.c
@@ -66,7 +66,7 @@ struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx,
nd = io_notif_to_data(notif);
nd->account_pages = 0;
nd->uarg.skb_flags = SKBFL_ZEROCOPY_FRAG | SKBFL_DONT_ORPHAN;
- nd->uarg.flags = 0;
+ nd->uarg.flags = UARGFL_CALLER_PINNED;
nd->uarg.callback = io_uring_tx_zerocopy_callback;
/* master ref owned by io_notif_slot, will be dropped on flush */
refcount_set(&nd->uarg.refcnt, 1);
--
2.37.0
struct ubuf_info will be changed; use ubuf_info_msgzc instead.
Signed-off-by: Pavel Begunkov <[email protected]>
---
drivers/vhost/net.c | 17 +++++++++--------
1 file changed, 9 insertions(+), 8 deletions(-)
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 68e4ecd1cc0e..9b616536dd9e 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -118,7 +118,7 @@ struct vhost_net_virtqueue {
/* Number of XDP frames batched */
int batched_xdp;
/* an array of userspace buffers info */
- struct ubuf_info *ubuf_info;
+ struct ubuf_info_msgzc *ubuf_info;
/* Reference counting for outstanding ubufs.
* Protected by vq mutex. Writers must also take device mutex. */
struct vhost_net_ubuf_ref *ubufs;
@@ -288,7 +288,7 @@ static int vhost_net_set_ubuf_info(struct vhost_net *n)
n->vqs[i].ubuf_info =
kmalloc_array(UIO_MAXIOV,
sizeof(*n->vqs[i].ubuf_info),
- GFP_KERNEL);
+ GFP_KERNEL | __GFP_ZERO);
if (!n->vqs[i].ubuf_info)
goto err;
}
@@ -382,8 +382,9 @@ static void vhost_zerocopy_signal_used(struct vhost_net *net,
}
static void vhost_zerocopy_callback(struct sk_buff *skb,
- struct ubuf_info *ubuf, bool success)
+ struct ubuf_info *ubuf_base, bool success)
{
+ struct ubuf_info_msgzc *ubuf = uarg_to_msgzc(ubuf_base);
struct vhost_net_ubuf_ref *ubufs = ubuf->ctx;
struct vhost_virtqueue *vq = ubufs->vq;
int cnt;
@@ -871,7 +872,7 @@ static void handle_tx_zerocopy(struct vhost_net *net, struct socket *sock)
size_t len, total_len = 0;
int err;
struct vhost_net_ubuf_ref *ubufs;
- struct ubuf_info *ubuf;
+ struct ubuf_info_msgzc *ubuf;
bool zcopy_used;
int sent_pkts = 0;
@@ -907,14 +908,14 @@ static void handle_tx_zerocopy(struct vhost_net *net, struct socket *sock)
ubuf = nvq->ubuf_info + nvq->upend_idx;
vq->heads[nvq->upend_idx].id = cpu_to_vhost32(vq, head);
vq->heads[nvq->upend_idx].len = VHOST_DMA_IN_PROGRESS;
- ubuf->callback = vhost_zerocopy_callback;
ubuf->ctx = nvq->ubufs;
ubuf->desc = nvq->upend_idx;
- ubuf->flags = SKBFL_ZEROCOPY_FRAG;
- refcount_set(&ubuf->refcnt, 1);
+ ubuf->ubuf.callback = vhost_zerocopy_callback;
+ ubuf->ubuf.flags = SKBFL_ZEROCOPY_FRAG;
+ refcount_set(&ubuf->ubuf.refcnt, 1);
msg.msg_control = &ctl;
ctl.type = TUN_MSG_UBUF;
- ctl.ptr = ubuf;
+ ctl.ptr = &ubuf->ubuf;
msg.msg_controllen = sizeof(ctl);
ubufs = nvq->ubufs;
atomic_inc(&ubufs->refcount);
--
2.37.0
We're going to split struct ubuf_info, leaving only the mandatory
fields in it; users are free to extend it. Add struct ubuf_info_msgzc,
which will be an extended version for MSG_ZEROCOPY and some other
users. For now it duplicates the fields of struct ubuf_info, and the
duplication will be removed in a couple of patches.
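The intended extension pattern is the usual embed-and-container_of
idiom, e.g. (a sketch with a hypothetical user):

	struct my_zc_ctx {			/* hypothetical */
		struct ubuf_info ubuf;		/* embedded base */
		void *private_state;
	};

	static void my_zc_callback(struct sk_buff *skb,
				   struct ubuf_info *uarg, bool success)
	{
		struct my_zc_ctx *ctx = container_of(uarg, struct my_zc_ctx, ubuf);

		/* ctx->private_state is now accessible */
	}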
Signed-off-by: Pavel Begunkov <[email protected]>
---
include/linux/skbuff.h | 21 +++++++++++++++++++++
1 file changed, 21 insertions(+)
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index ca8afa382bf2..f8ac3678dab8 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -554,7 +554,28 @@ struct ubuf_info {
} mmp;
};
+struct ubuf_info_msgzc {
+ struct ubuf_info ubuf;
+
+ union {
+ struct {
+ unsigned long desc;
+ void *ctx;
+ };
+ struct {
+ u32 id;
+ u16 len;
+ u16 zerocopy:1;
+ u32 bytelen;
+ };
+ };
+
+ struct mmpin mmp;
+};
+
#define skb_uarg(SKB) ((struct ubuf_info *)(skb_shinfo(SKB)->destructor_arg))
+#define uarg_to_msgzc(ubuf_ptr) container_of((ubuf_ptr), struct ubuf_info_msgzc, \
+ ubuf)
int mm_account_pinned_pages(struct mmpin *mmp, size_t size);
void mm_unaccount_pinned_pages(struct mmpin *mmp);
--
2.37.0