Shed some weight from udp/ipv6. Zerocopy benchmarks over dummy showed
~5% tx/s improvement, should be similar for small payload non-zc
cases.
The performance comes from killing 4 atomics and a couple of big struct
memcpy/memset. 1/10 removes a pair of atomics on dst refcounting for
cork->skb setup, 9/10 saves another pair on cork init. 5/10 and 8/10
kill extra 88B memset and memcpy respectively.
v2: add a comment about setting dst early in ip6_setup_cork()
drop non-udp patches for now
add patch 10
Pavel Begunkov (10):
ipv6: optimise dst refcounting on skb init
udp6: shuffle up->pending AF_INET bits
ipv6: remove daddr temp buffer in __ip6_make_skb
ipv6: clean up cork setup/release
ipv6: don't zero inet_cork_full::fl after use
ipv6: pass full cork into __ip6_append_data()
udp6: pass flow in ip6_make_skb together with cork
udp6: don't make extra copies of iflow
ipv6: optimise dst refcounting on cork init
ipv6: partially inline ipv6_fixup_options
include/net/ipv6.h | 14 ++++--
net/ipv6/exthdrs.c | 8 ++--
net/ipv6/ip6_output.c | 99 ++++++++++++++++++++++------------------
net/ipv6/udp.c | 103 ++++++++++++++++++++----------------------
4 files changed, 118 insertions(+), 106 deletions(-)
--
2.34.1
Corked AF_INET for ipv6 socket doesn't appear to be the hottest case,
so move it out of the common path under up->pending check to remove
overhead.
Signed-off-by: Pavel Begunkov <[email protected]>
---
net/ipv6/udp.c | 5 ++---
1 file changed, 2 insertions(+), 3 deletions(-)
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 528b81ef19c9..e221a6957b1f 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -1363,9 +1363,6 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
}
}
- if (up->pending == AF_INET)
- return udp_sendmsg(sk, msg, len);
-
/* Rough check on arithmetic overflow,
better check is made in ip6_append_data().
*/
@@ -1374,6 +1371,8 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
getfrag = is_udplite ? udplite_getfrag : ip_generic_getfrag;
if (up->pending) {
+ if (up->pending == AF_INET)
+ return udp_sendmsg(sk, msg, len);
/*
* There are pending frames.
* The socket lock must be held while it's corked.
--
2.34.1
__ip6_make_skb() gets a cork->dst ref, hands it over to skb and shortly
after puts cork->dst. Save two atomics by stealing it without extra
referencing, ip6_cork_release() handles NULL cork->dst.
Signed-off-by: Pavel Begunkov <[email protected]>
---
net/ipv6/ip6_output.c | 11 ++++++++++-
1 file changed, 10 insertions(+), 1 deletion(-)
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 2995f8d89e7e..14d607ccfeea 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -1807,6 +1807,15 @@ int ip6_append_data(struct sock *sk,
}
EXPORT_SYMBOL_GPL(ip6_append_data);
+static void ip6_cork_steal_dst(struct sk_buff *skb, struct inet_cork_full *cork)
+{
+ struct dst_entry *dst = cork->base.dst;
+
+ cork->base.dst = NULL;
+ cork->base.flags &= ~IPCORK_ALLFRAG;
+ skb_dst_set(skb, dst);
+}
+
static void ip6_cork_release(struct inet_cork_full *cork,
struct inet6_cork *v6_cork)
{
@@ -1889,7 +1898,7 @@ struct sk_buff *__ip6_make_skb(struct sock *sk,
skb->tstamp = cork->base.transmit_time;
- skb_dst_set(skb, dst_clone(&rt->dst));
+ ip6_cork_steal_dst(skb, cork);
IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
if (proto == IPPROTO_ICMPV6) {
struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
--
2.34.1
Clean up ip6_setup_cork() and ip6_cork_release() adding a local variable
for v6_cork->opt. It's a preparation patch for further changes.
Signed-off-by: Pavel Begunkov <[email protected]>
---
net/ipv6/ip6_output.c | 44 +++++++++++++++++++++----------------------
1 file changed, 21 insertions(+), 23 deletions(-)
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 4acd577d5ec5..88349e49717a 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -1354,7 +1354,7 @@ static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
{
struct ipv6_pinfo *np = inet6_sk(sk);
unsigned int mtu;
- struct ipv6_txoptions *opt = ipc6->opt;
+ struct ipv6_txoptions *nopt, *opt = ipc6->opt;
/*
* setup for corking
@@ -1363,32 +1363,28 @@ static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
if (WARN_ON(v6_cork->opt))
return -EINVAL;
- v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
- if (unlikely(!v6_cork->opt))
+ nopt = v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
+ if (unlikely(!nopt))
return -ENOBUFS;
- v6_cork->opt->tot_len = sizeof(*opt);
- v6_cork->opt->opt_flen = opt->opt_flen;
- v6_cork->opt->opt_nflen = opt->opt_nflen;
+ nopt->tot_len = sizeof(*opt);
+ nopt->opt_flen = opt->opt_flen;
+ nopt->opt_nflen = opt->opt_nflen;
- v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
- sk->sk_allocation);
- if (opt->dst0opt && !v6_cork->opt->dst0opt)
+ nopt->dst0opt = ip6_opt_dup(opt->dst0opt, sk->sk_allocation);
+ if (opt->dst0opt && !nopt->dst0opt)
return -ENOBUFS;
- v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
- sk->sk_allocation);
- if (opt->dst1opt && !v6_cork->opt->dst1opt)
+ nopt->dst1opt = ip6_opt_dup(opt->dst1opt, sk->sk_allocation);
+ if (opt->dst1opt && !nopt->dst1opt)
return -ENOBUFS;
- v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
- sk->sk_allocation);
- if (opt->hopopt && !v6_cork->opt->hopopt)
+ nopt->hopopt = ip6_opt_dup(opt->hopopt, sk->sk_allocation);
+ if (opt->hopopt && !nopt->hopopt)
return -ENOBUFS;
- v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
- sk->sk_allocation);
- if (opt->srcrt && !v6_cork->opt->srcrt)
+ nopt->srcrt = ip6_rthdr_dup(opt->srcrt, sk->sk_allocation);
+ if (opt->srcrt && !nopt->srcrt)
return -ENOBUFS;
/* need source address above miyazawa*/
@@ -1820,11 +1816,13 @@ static void ip6_cork_release(struct inet_cork_full *cork,
struct inet6_cork *v6_cork)
{
if (v6_cork->opt) {
- kfree(v6_cork->opt->dst0opt);
- kfree(v6_cork->opt->dst1opt);
- kfree(v6_cork->opt->hopopt);
- kfree(v6_cork->opt->srcrt);
- kfree(v6_cork->opt);
+ struct ipv6_txoptions *opt = v6_cork->opt;
+
+ kfree(opt->dst0opt);
+ kfree(opt->dst1opt);
+ kfree(opt->hopopt);
+ kfree(opt->srcrt);
+ kfree(opt);
v6_cork->opt = NULL;
}
--
2.34.1
Convert a struct inet_cork argument in __ip6_append_data() to struct
inet_cork_full. As one struct contains the other, inet_cork can still
be accessed via the ->base field. It's a preparation patch making further
changes a bit cleaner.
Signed-off-by: Pavel Begunkov <[email protected]>
---
net/ipv6/ip6_output.c | 7 ++++---
1 file changed, 4 insertions(+), 3 deletions(-)
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index b8fdda9ac797..62da09819750 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -1424,7 +1424,7 @@ static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
static int __ip6_append_data(struct sock *sk,
struct flowi6 *fl6,
struct sk_buff_head *queue,
- struct inet_cork *cork,
+ struct inet_cork_full *cork_full,
struct inet6_cork *v6_cork,
struct page_frag *pfrag,
int getfrag(void *from, char *to, int offset,
@@ -1433,6 +1433,7 @@ static int __ip6_append_data(struct sock *sk,
unsigned int flags, struct ipcm6_cookie *ipc6)
{
struct sk_buff *skb, *skb_prev = NULL;
+ struct inet_cork *cork = &cork_full->base;
unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
struct ubuf_info *uarg = NULL;
int exthdrlen = 0;
@@ -1797,7 +1798,7 @@ int ip6_append_data(struct sock *sk,
transhdrlen = 0;
}
- return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
+ return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork,
&np->cork, sk_page_frag(sk), getfrag,
from, length, transhdrlen, flags, ipc6);
}
@@ -1993,7 +1994,7 @@ struct sk_buff *ip6_make_skb(struct sock *sk,
if (ipc6->dontfrag < 0)
ipc6->dontfrag = inet6_sk(sk)->dontfrag;
- err = __ip6_append_data(sk, fl6, &queue, &cork->base, &v6_cork,
+ err = __ip6_append_data(sk, fl6, &queue, cork, &v6_cork,
&current->task_frag, getfrag, from,
length + exthdrlen, transhdrlen + exthdrlen,
flags, ipc6);
--
2.34.1
Another preparation patch. inet_cork_full already contains a field for
the flow (cork->fl), so we can avoid passing a separate struct flowi6 into
__ip6_append_data() and ip6_make_skb(), and use the flow stored in
inet_cork_full. Make sure callers set cork->fl, i.e. we init it in
ip6_append_data() and before calling ip6_make_skb().
Signed-off-by: Pavel Begunkov <[email protected]>
---
include/net/ipv6.h | 2 +-
net/ipv6/ip6_output.c | 20 +++++++++-----------
net/ipv6/udp.c | 4 +++-
3 files changed, 13 insertions(+), 13 deletions(-)
diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index 3afcb128e064..5e0b56d66724 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -1020,7 +1020,7 @@ struct sk_buff *ip6_make_skb(struct sock *sk,
int getfrag(void *from, char *to, int offset,
int len, int odd, struct sk_buff *skb),
void *from, int length, int transhdrlen,
- struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
+ struct ipcm6_cookie *ipc6,
struct rt6_info *rt, unsigned int flags,
struct inet_cork_full *cork);
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 62da09819750..0cc490f2cfbf 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -1350,7 +1350,7 @@ static void ip6_append_data_mtu(unsigned int *mtu,
static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
- struct rt6_info *rt, struct flowi6 *fl6)
+ struct rt6_info *rt)
{
struct ipv6_pinfo *np = inet6_sk(sk);
unsigned int mtu;
@@ -1391,7 +1391,6 @@ static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
}
dst_hold(&rt->dst);
cork->base.dst = &rt->dst;
- cork->fl.u.ip6 = *fl6;
v6_cork->hop_limit = ipc6->hlimit;
v6_cork->tclass = ipc6->tclass;
if (rt->dst.flags & DST_XFRM_TUNNEL)
@@ -1422,7 +1421,6 @@ static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
}
static int __ip6_append_data(struct sock *sk,
- struct flowi6 *fl6,
struct sk_buff_head *queue,
struct inet_cork_full *cork_full,
struct inet6_cork *v6_cork,
@@ -1434,6 +1432,7 @@ static int __ip6_append_data(struct sock *sk,
{
struct sk_buff *skb, *skb_prev = NULL;
struct inet_cork *cork = &cork_full->base;
+ struct flowi6 *fl6 = &cork_full->fl.u.ip6;
unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
struct ubuf_info *uarg = NULL;
int exthdrlen = 0;
@@ -1786,19 +1785,19 @@ int ip6_append_data(struct sock *sk,
* setup for corking
*/
err = ip6_setup_cork(sk, &inet->cork, &np->cork,
- ipc6, rt, fl6);
+ ipc6, rt);
if (err)
return err;
+ inet->cork.fl.u.ip6 = *fl6;
exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
length += exthdrlen;
transhdrlen += exthdrlen;
} else {
- fl6 = &inet->cork.fl.u.ip6;
transhdrlen = 0;
}
- return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork,
+ return __ip6_append_data(sk, &sk->sk_write_queue, &inet->cork,
&np->cork, sk_page_frag(sk), getfrag,
from, length, transhdrlen, flags, ipc6);
}
@@ -1967,9 +1966,8 @@ struct sk_buff *ip6_make_skb(struct sock *sk,
int getfrag(void *from, char *to, int offset,
int len, int odd, struct sk_buff *skb),
void *from, int length, int transhdrlen,
- struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
- struct rt6_info *rt, unsigned int flags,
- struct inet_cork_full *cork)
+ struct ipcm6_cookie *ipc6, struct rt6_info *rt,
+ unsigned int flags, struct inet_cork_full *cork)
{
struct inet6_cork v6_cork;
struct sk_buff_head queue;
@@ -1986,7 +1984,7 @@ struct sk_buff *ip6_make_skb(struct sock *sk,
cork->base.opt = NULL;
cork->base.dst = NULL;
v6_cork.opt = NULL;
- err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt, fl6);
+ err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt);
if (err) {
ip6_cork_release(cork, &v6_cork);
return ERR_PTR(err);
@@ -1994,7 +1992,7 @@ struct sk_buff *ip6_make_skb(struct sock *sk,
if (ipc6->dontfrag < 0)
ipc6->dontfrag = inet6_sk(sk)->dontfrag;
- err = __ip6_append_data(sk, fl6, &queue, cork, &v6_cork,
+ err = __ip6_append_data(sk, &queue, cork, &v6_cork,
&current->task_frag, getfrag, from,
length + exthdrlen, transhdrlen + exthdrlen,
flags, ipc6);
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 3af1eea739a8..44b7ca9bd78e 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -1533,9 +1533,11 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
struct inet_cork_full cork;
struct sk_buff *skb;
+ cork.fl.u.ip6 = fl6;
+
skb = ip6_make_skb(sk, getfrag, msg, ulen,
sizeof(struct udphdr), &ipc6,
- &fl6, (struct rt6_info *)dst,
+ (struct rt6_info *)dst,
msg->msg_flags, &cork);
err = PTR_ERR(skb);
if (!IS_ERR_OR_NULL(skb))
--
2.34.1
It doesn't appear there is any reason for ip6_cork_release() to zero
cork->fl, it'll be fully filled on next initialisation. This 88-byte
memset accounts for 0.3-0.5% of total CPU cycles.
It's also needed in following patches and allows removing an extra flow
copy in udp_v6_push_pending_frames().
Signed-off-by: Pavel Begunkov <[email protected]>
---
net/ipv6/ip6_output.c | 1 -
net/ipv6/udp.c | 10 ++--------
2 files changed, 2 insertions(+), 9 deletions(-)
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 88349e49717a..b8fdda9ac797 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -1831,7 +1831,6 @@ static void ip6_cork_release(struct inet_cork_full *cork,
cork->base.dst = NULL;
cork->base.flags &= ~IPCORK_ALLFRAG;
}
- memset(&cork->fl, 0, sizeof(cork->fl));
}
struct sk_buff *__ip6_make_skb(struct sock *sk,
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index e221a6957b1f..3af1eea739a8 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -1266,23 +1266,17 @@ static int udp_v6_push_pending_frames(struct sock *sk)
{
struct sk_buff *skb;
struct udp_sock *up = udp_sk(sk);
- struct flowi6 fl6;
int err = 0;
if (up->pending == AF_INET)
return udp_push_pending_frames(sk);
- /* ip6_finish_skb will release the cork, so make a copy of
- * fl6 here.
- */
- fl6 = inet_sk(sk)->cork.fl.u.ip6;
-
skb = ip6_finish_skb(sk);
if (!skb)
goto out;
- err = udp_v6_send_skb(skb, &fl6, &inet_sk(sk)->cork.base);
-
+ err = udp_v6_send_skb(skb, &inet_sk(sk)->cork.fl.u.ip6,
+ &inet_sk(sk)->cork.base);
out:
up->len = 0;
up->pending = 0;
--
2.34.1
ipv6_push_nfrag_opts() doesn't change the passed daddr, and so
__ip6_make_skb() doesn't actually need to keep an on-stack copy of
fl6->daddr. Initially set final_dst to point at fl6->daddr;
ipv6_push_nfrag_opts() will override it if needed. This gets rid of the
extra copies.
Signed-off-by: Pavel Begunkov <[email protected]>
---
net/ipv6/ip6_output.c | 7 +++----
1 file changed, 3 insertions(+), 4 deletions(-)
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 14d607ccfeea..4acd577d5ec5 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -1843,7 +1843,7 @@ struct sk_buff *__ip6_make_skb(struct sock *sk,
{
struct sk_buff *skb, *tmp_skb;
struct sk_buff **tail_skb;
- struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
+ struct in6_addr *final_dst;
struct ipv6_pinfo *np = inet6_sk(sk);
struct net *net = sock_net(sk);
struct ipv6hdr *hdr;
@@ -1873,9 +1873,9 @@ struct sk_buff *__ip6_make_skb(struct sock *sk,
/* Allow local fragmentation. */
skb->ignore_df = ip6_sk_ignore_df(sk);
-
- *final_dst = fl6->daddr;
__skb_pull(skb, skb_network_header_len(skb));
+
+ final_dst = &fl6->daddr;
if (opt && opt->opt_flen)
ipv6_push_frag_opts(skb, opt, &proto);
if (opt && opt->opt_nflen)
@@ -1895,7 +1895,6 @@ struct sk_buff *__ip6_make_skb(struct sock *sk,
skb->priority = sk->sk_priority;
skb->mark = cork->base.mark;
-
skb->tstamp = cork->base.transmit_time;
ip6_cork_steal_dst(skb, cork);
--
2.34.1
udpv6_sendmsg() first initialises an on-stack 88B struct flowi6 and then
copies it into cork, which is expensive. Avoid the copy in corkless case
by initialising on-stack cork->fl directly.
The main part is a couple of lines under !corkreq check. The rest
converts fl6 variable to be a pointer.
Signed-off-by: Pavel Begunkov <[email protected]>
---
net/ipv6/udp.c | 85 +++++++++++++++++++++++++-------------------------
1 file changed, 42 insertions(+), 43 deletions(-)
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 44b7ca9bd78e..cfcf08c3df4d 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -1294,7 +1294,8 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
struct ipv6_txoptions *opt = NULL;
struct ipv6_txoptions *opt_to_free = NULL;
struct ip6_flowlabel *flowlabel = NULL;
- struct flowi6 fl6;
+ struct inet_cork_full cork;
+ struct flowi6 *fl6 = &cork.fl.u.ip6;
struct dst_entry *dst;
struct ipcm6_cookie ipc6;
int addr_len = msg->msg_namelen;
@@ -1384,19 +1385,19 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
}
ulen += sizeof(struct udphdr);
- memset(&fl6, 0, sizeof(fl6));
+ memset(fl6, 0, sizeof(*fl6));
if (sin6) {
if (sin6->sin6_port == 0)
return -EINVAL;
- fl6.fl6_dport = sin6->sin6_port;
+ fl6->fl6_dport = sin6->sin6_port;
daddr = &sin6->sin6_addr;
if (np->sndflow) {
- fl6.flowlabel = sin6->sin6_flowinfo&IPV6_FLOWINFO_MASK;
- if (fl6.flowlabel&IPV6_FLOWLABEL_MASK) {
- flowlabel = fl6_sock_lookup(sk, fl6.flowlabel);
+ fl6->flowlabel = sin6->sin6_flowinfo&IPV6_FLOWINFO_MASK;
+ if (fl6->flowlabel & IPV6_FLOWLABEL_MASK) {
+ flowlabel = fl6_sock_lookup(sk, fl6->flowlabel);
if (IS_ERR(flowlabel))
return -EINVAL;
}
@@ -1413,24 +1414,24 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
if (addr_len >= sizeof(struct sockaddr_in6) &&
sin6->sin6_scope_id &&
__ipv6_addr_needs_scope_id(__ipv6_addr_type(daddr)))
- fl6.flowi6_oif = sin6->sin6_scope_id;
+ fl6->flowi6_oif = sin6->sin6_scope_id;
} else {
if (sk->sk_state != TCP_ESTABLISHED)
return -EDESTADDRREQ;
- fl6.fl6_dport = inet->inet_dport;
+ fl6->fl6_dport = inet->inet_dport;
daddr = &sk->sk_v6_daddr;
- fl6.flowlabel = np->flow_label;
+ fl6->flowlabel = np->flow_label;
connected = true;
}
- if (!fl6.flowi6_oif)
- fl6.flowi6_oif = sk->sk_bound_dev_if;
+ if (!fl6->flowi6_oif)
+ fl6->flowi6_oif = sk->sk_bound_dev_if;
- if (!fl6.flowi6_oif)
- fl6.flowi6_oif = np->sticky_pktinfo.ipi6_ifindex;
+ if (!fl6->flowi6_oif)
+ fl6->flowi6_oif = np->sticky_pktinfo.ipi6_ifindex;
- fl6.flowi6_uid = sk->sk_uid;
+ fl6->flowi6_uid = sk->sk_uid;
if (msg->msg_controllen) {
opt = &opt_space;
@@ -1440,14 +1441,14 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
err = udp_cmsg_send(sk, msg, &ipc6.gso_size);
if (err > 0)
- err = ip6_datagram_send_ctl(sock_net(sk), sk, msg, &fl6,
+ err = ip6_datagram_send_ctl(sock_net(sk), sk, msg, fl6,
&ipc6);
if (err < 0) {
fl6_sock_release(flowlabel);
return err;
}
- if ((fl6.flowlabel&IPV6_FLOWLABEL_MASK) && !flowlabel) {
- flowlabel = fl6_sock_lookup(sk, fl6.flowlabel);
+ if ((fl6->flowlabel&IPV6_FLOWLABEL_MASK) && !flowlabel) {
+ flowlabel = fl6_sock_lookup(sk, fl6->flowlabel);
if (IS_ERR(flowlabel))
return -EINVAL;
}
@@ -1464,16 +1465,17 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
opt = ipv6_fixup_options(&opt_space, opt);
ipc6.opt = opt;
- fl6.flowi6_proto = sk->sk_protocol;
- fl6.flowi6_mark = ipc6.sockc.mark;
- fl6.daddr = *daddr;
- if (ipv6_addr_any(&fl6.saddr) && !ipv6_addr_any(&np->saddr))
- fl6.saddr = np->saddr;
- fl6.fl6_sport = inet->inet_sport;
+ fl6->flowi6_proto = sk->sk_protocol;
+ fl6->flowi6_mark = ipc6.sockc.mark;
+ fl6->daddr = *daddr;
+ if (ipv6_addr_any(&fl6->saddr) && !ipv6_addr_any(&np->saddr))
+ fl6->saddr = np->saddr;
+ fl6->fl6_sport = inet->inet_sport;
if (cgroup_bpf_enabled(CGROUP_UDP6_SENDMSG) && !connected) {
err = BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk,
- (struct sockaddr *)sin6, &fl6.saddr);
+ (struct sockaddr *)sin6,
+ &fl6->saddr);
if (err)
goto out_no_dst;
if (sin6) {
@@ -1489,32 +1491,32 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
err = -EINVAL;
goto out_no_dst;
}
- fl6.fl6_dport = sin6->sin6_port;
- fl6.daddr = sin6->sin6_addr;
+ fl6->fl6_dport = sin6->sin6_port;
+ fl6->daddr = sin6->sin6_addr;
}
}
- if (ipv6_addr_any(&fl6.daddr))
- fl6.daddr.s6_addr[15] = 0x1; /* :: means loopback (BSD'ism) */
+ if (ipv6_addr_any(&fl6->daddr))
+ fl6->daddr.s6_addr[15] = 0x1; /* :: means loopback (BSD'ism) */
- final_p = fl6_update_dst(&fl6, opt, &final);
+ final_p = fl6_update_dst(fl6, opt, &final);
if (final_p)
connected = false;
- if (!fl6.flowi6_oif && ipv6_addr_is_multicast(&fl6.daddr)) {
- fl6.flowi6_oif = np->mcast_oif;
+ if (!fl6->flowi6_oif && ipv6_addr_is_multicast(&fl6->daddr)) {
+ fl6->flowi6_oif = np->mcast_oif;
connected = false;
- } else if (!fl6.flowi6_oif)
- fl6.flowi6_oif = np->ucast_oif;
+ } else if (!fl6->flowi6_oif)
+ fl6->flowi6_oif = np->ucast_oif;
- security_sk_classify_flow(sk, flowi6_to_flowi_common(&fl6));
+ security_sk_classify_flow(sk, flowi6_to_flowi_common(fl6));
if (ipc6.tclass < 0)
ipc6.tclass = np->tclass;
- fl6.flowlabel = ip6_make_flowinfo(ipc6.tclass, fl6.flowlabel);
+ fl6->flowlabel = ip6_make_flowinfo(ipc6.tclass, fl6->flowlabel);
- dst = ip6_sk_dst_lookup_flow(sk, &fl6, final_p, connected);
+ dst = ip6_sk_dst_lookup_flow(sk, fl6, final_p, connected);
if (IS_ERR(dst)) {
err = PTR_ERR(dst);
dst = NULL;
@@ -1522,7 +1524,7 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
}
if (ipc6.hlimit < 0)
- ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst);
+ ipc6.hlimit = ip6_sk_dst_hoplimit(np, fl6, dst);
if (msg->msg_flags&MSG_CONFIRM)
goto do_confirm;
@@ -1530,18 +1532,15 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
/* Lockless fast path for the non-corking case */
if (!corkreq) {
- struct inet_cork_full cork;
struct sk_buff *skb;
- cork.fl.u.ip6 = fl6;
-
skb = ip6_make_skb(sk, getfrag, msg, ulen,
sizeof(struct udphdr), &ipc6,
(struct rt6_info *)dst,
msg->msg_flags, &cork);
err = PTR_ERR(skb);
if (!IS_ERR_OR_NULL(skb))
- err = udp_v6_send_skb(skb, &fl6, &cork.base);
+ err = udp_v6_send_skb(skb, fl6, &cork.base);
goto out;
}
@@ -1563,7 +1562,7 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
ipc6.dontfrag = np->dontfrag;
up->len += ulen;
err = ip6_append_data(sk, getfrag, msg, ulen, sizeof(struct udphdr),
- &ipc6, &fl6, (struct rt6_info *)dst,
+ &ipc6, fl6, (struct rt6_info *)dst,
corkreq ? msg->msg_flags|MSG_MORE : msg->msg_flags);
if (err)
udp_v6_flush_pending_frames(sk);
@@ -1598,7 +1597,7 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
do_confirm:
if (msg->msg_flags & MSG_PROBE)
- dst_confirm_neigh(dst, &fl6.daddr);
+ dst_confirm_neigh(dst, &fl6->daddr);
if (!(msg->msg_flags&MSG_PROBE) || len)
goto back_from_confirm;
err = 0;
--
2.34.1
udpv6_sendmsg() doesn't need dst after calling ip6_make_skb(), so
instead of taking an additional reference inside ip6_setup_cork()
and releasing the initial one afterwards, we can hand over a reference
into ip6_make_skb() saving two atomics. The only other user of
ip6_setup_cork() is ip6_append_data() and it requires an extra
dst_hold().
Signed-off-by: Pavel Begunkov <[email protected]>
---
net/ipv6/ip6_output.c | 13 +++++++++----
net/ipv6/udp.c | 3 ++-
2 files changed, 11 insertions(+), 5 deletions(-)
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 0cc490f2cfbf..0c6c971ce0a5 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -1356,6 +1356,11 @@ static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
unsigned int mtu;
struct ipv6_txoptions *nopt, *opt = ipc6->opt;
+ /* callers pass dst together with a reference, set it first so
+ * ip6_cork_release() can put it down even in case of an error.
+ */
+ cork->base.dst = &rt->dst;
+
/*
* setup for corking
*/
@@ -1389,8 +1394,6 @@ static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
/* need source address above miyazawa*/
}
- dst_hold(&rt->dst);
- cork->base.dst = &rt->dst;
v6_cork->hop_limit = ipc6->hlimit;
v6_cork->tclass = ipc6->tclass;
if (rt->dst.flags & DST_XFRM_TUNNEL)
@@ -1784,6 +1787,7 @@ int ip6_append_data(struct sock *sk,
/*
* setup for corking
*/
+ dst_hold(&rt->dst);
err = ip6_setup_cork(sk, &inet->cork, &np->cork,
ipc6, rt);
if (err)
@@ -1974,15 +1978,16 @@ struct sk_buff *ip6_make_skb(struct sock *sk,
int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
int err;
- if (flags & MSG_PROBE)
+ if (flags & MSG_PROBE) {
+ dst_release(&rt->dst);
return NULL;
+ }
__skb_queue_head_init(&queue);
cork->base.flags = 0;
cork->base.addr = 0;
cork->base.opt = NULL;
- cork->base.dst = NULL;
v6_cork.opt = NULL;
err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt);
if (err) {
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index cfcf08c3df4d..c6872596b408 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -1541,7 +1541,8 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
err = PTR_ERR(skb);
if (!IS_ERR_OR_NULL(skb))
err = udp_v6_send_skb(skb, fl6, &cork.base);
- goto out;
+ /* ip6_make_skb steals dst reference */
+ goto out_no_dst;
}
lock_sock(sk);
--
2.34.1
Inline a part of ipv6_fixup_options() to avoid extra overhead on
function call if opt is NULL.
Signed-off-by: Pavel Begunkov <[email protected]>
---
include/net/ipv6.h | 12 ++++++++++--
net/ipv6/exthdrs.c | 8 ++++----
2 files changed, 14 insertions(+), 6 deletions(-)
diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index 5e0b56d66724..082f30256f59 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -437,8 +437,16 @@ struct ipv6_txoptions *ipv6_renew_options(struct sock *sk,
struct ipv6_txoptions *opt,
int newtype,
struct ipv6_opt_hdr *newopt);
-struct ipv6_txoptions *ipv6_fixup_options(struct ipv6_txoptions *opt_space,
- struct ipv6_txoptions *opt);
+struct ipv6_txoptions *__ipv6_fixup_options(struct ipv6_txoptions *opt_space,
+ struct ipv6_txoptions *opt);
+
+static inline struct ipv6_txoptions *
+ipv6_fixup_options(struct ipv6_txoptions *opt_space, struct ipv6_txoptions *opt)
+{
+ if (!opt)
+ return NULL;
+ return __ipv6_fixup_options(opt_space, opt);
+}
bool ipv6_opt_accepted(const struct sock *sk, const struct sk_buff *skb,
const struct inet6_skb_parm *opt);
diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c
index 77e34aec7e82..658d5eabaf7e 100644
--- a/net/ipv6/exthdrs.c
+++ b/net/ipv6/exthdrs.c
@@ -1344,14 +1344,14 @@ ipv6_renew_options(struct sock *sk, struct ipv6_txoptions *opt,
return opt2;
}
-struct ipv6_txoptions *ipv6_fixup_options(struct ipv6_txoptions *opt_space,
- struct ipv6_txoptions *opt)
+struct ipv6_txoptions *__ipv6_fixup_options(struct ipv6_txoptions *opt_space,
+ struct ipv6_txoptions *opt)
{
/*
* ignore the dest before srcrt unless srcrt is being included.
* --yoshfuji
*/
- if (opt && opt->dst0opt && !opt->srcrt) {
+ if (opt->dst0opt && !opt->srcrt) {
if (opt_space != opt) {
memcpy(opt_space, opt, sizeof(*opt_space));
opt = opt_space;
@@ -1362,7 +1362,7 @@ struct ipv6_txoptions *ipv6_fixup_options(struct ipv6_txoptions *opt_space,
return opt;
}
-EXPORT_SYMBOL_GPL(ipv6_fixup_options);
+EXPORT_SYMBOL_GPL(__ipv6_fixup_options);
/**
* fl6_update_dst - update flowi destination address with info given
--
2.34.1
On Wed, Jan 26, 2022 at 7:36 PM Pavel Begunkov <[email protected]> wrote:
>
> Shed some weight from udp/ipv6. Zerocopy benchmarks over dummy showed
> ~5% tx/s improvement, should be similar for small payload non-zc
> cases.
>
> The performance comes from killing 4 atomics and a couple of big struct
> memcpy/memset. 1/10 removes a pair of atomics on dst refcounting for
> cork->skb setup, 9/10 saves another pair on cork init. 5/10 and 8/10
> kill extra 88B memset and memcpy respectively.
>
> v2: add a comment about setting dst early in ip6_setup_cork()
> drop non-udp patches for now
> add patch 10
>
> Pavel Begunkov (10):
> ipv6: optimise dst refcounting on skb init
> udp6: shuffle up->pending AF_INET bits
> ipv6: remove daddr temp buffer in __ip6_make_skb
> ipv6: clean up cork setup/release
> ipv6: don't zero inet_cork_full::fl after use
> ipv6: pass full cork into __ip6_append_data()
> udp6: pass flow in ip6_make_skb together with cork
> udp6: don't make extra copies of iflow
> ipv6: optimise dst refcounting on cork init
> ipv6: partially inline ipv6_fixup_options
>
> include/net/ipv6.h | 14 ++++--
> net/ipv6/exthdrs.c | 8 ++--
> net/ipv6/ip6_output.c | 99 ++++++++++++++++++++++------------------
> net/ipv6/udp.c | 103 ++++++++++++++++++++----------------------
> 4 files changed, 118 insertions(+), 106 deletions(-)
For the series:
Reviewed-by: Willem de Bruijn <[email protected]>
Iterative review vs v1, where I only had one small comment, which was
addressed. NB: Due to some subject line changes, it wasn't immediately
clear to me that this was just a range-diff over the first 10 patches
in both series.
Hello:
This series was applied to netdev/net-next.git (master)
by Jakub Kicinski <[email protected]>:
On Thu, 27 Jan 2022 00:36:21 +0000 you wrote:
> Shed some weight from udp/ipv6. Zerocopy benchmarks over dummy showed
> ~5% tx/s improvement, should be similar for small payload non-zc
> cases.
>
> The performance comes from killing 4 atomics and a couple of big struct
> memcpy/memset. 1/10 removes a pair of atomics on dst refcounting for
> cork->skb setup, 9/10 saves another pair on cork init. 5/10 and 8/10
> kill extra 88B memset and memcpy respectively.
>
> [...]
Here is the summary with links:
- [net-next,v2,01/10] ipv6: optimise dst refcounting on skb init
https://git.kernel.org/netdev/net-next/c/cd3c74807736
- [net-next,v2,02/10] udp6: shuffle up->pending AF_INET bits
https://git.kernel.org/netdev/net-next/c/406c4a0af010
- [net-next,v2,03/10] ipv6: remove daddr temp buffer in __ip6_make_skb
https://git.kernel.org/netdev/net-next/c/b60d4e58c615
- [net-next,v2,04/10] ipv6: clean up cork setup/release
https://git.kernel.org/netdev/net-next/c/d656b2ea5fa7
- [net-next,v2,05/10] ipv6: don't zero inet_cork_full::fl after use
https://git.kernel.org/netdev/net-next/c/940ea00b0646
- [net-next,v2,06/10] ipv6: pass full cork into __ip6_append_data()
https://git.kernel.org/netdev/net-next/c/f3b46a3e8c40
- [net-next,v2,07/10] udp6: pass flow in ip6_make_skb together with cork
https://git.kernel.org/netdev/net-next/c/f37a4cc6bb0b
- [net-next,v2,08/10] udp6: don't make extra copies of iflow
https://git.kernel.org/netdev/net-next/c/5298953e742d
- [net-next,v2,09/10] ipv6: optimise dst refcounting on cork init
https://git.kernel.org/netdev/net-next/c/40ac240c2e06
- [net-next,v2,10/10] ipv6: partially inline ipv6_fixup_options
https://git.kernel.org/netdev/net-next/c/31ed2261e88f
You are awesome, thank you!
--
Deet-doot-dot, I am a bot.
https://korg.docs.kernel.org/patchwork/pwbot.html