Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id DB746C678D5 for ; Tue, 7 Mar 2023 12:58:19 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S230009AbjCGM6S (ORCPT ); Tue, 7 Mar 2023 07:58:18 -0500 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:55218 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S229913AbjCGM56 (ORCPT ); Tue, 7 Mar 2023 07:57:58 -0500 Received: from galois.linutronix.de (Galois.linutronix.de [IPv6:2a0a:51c0:0:12e:550::1]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 0B20B76F71; Tue, 7 Mar 2023 04:57:48 -0800 (PST) Message-ID: <20230307125538.989175656@linutronix.de> DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=linutronix.de; s=2020; t=1678193866; h=from:from:reply-to:subject:subject:date:date:message-id:message-id: to:to:cc:cc:mime-version:mime-version:content-type:content-type: references:references; bh=4Doa8MeF8eRNEXVS0R6+XvKtxdSgyI58na1gVzLY2sA=; b=eHggAehHejK7m2paUFpR33AS4mxhv4SeJwgkYWjpD8/ChqBp6nd5oHgCGle8Oto2jt1dZM X4s7+PWDMnjA8zTU5QYo2Mi7zwl1wjVwzuLX2IhbohMFGsDUj7yinJul0xMI1o0fZbyWcR mwckqJ2vHJicnQv9/keDeFYK+UmYtZg5fD0GXhDbfpYXPjzAN2hXECTeXte3zc4wGapIZG FVgBeiNUUtVQ0UvA36y/88FJj0+JscXxEshKR3xARiIIJZQqgwjTLElaCXlTZR81Iq+4HO lyP4Sk5b7K7TeKebJ0YSjOq/FmQf26LUYu0oaXj87vdHndJwCGe6G0CMo9JEYQ== DKIM-Signature: v=1; a=ed25519-sha256; c=relaxed/relaxed; d=linutronix.de; s=2020e; t=1678193866; h=from:from:reply-to:subject:subject:date:date:message-id:message-id: to:to:cc:cc:mime-version:mime-version:content-type:content-type: references:references; bh=4Doa8MeF8eRNEXVS0R6+XvKtxdSgyI58na1gVzLY2sA=; b=Zulwbqlyq5ib9YomdtRadcSvyqCUMnxCAkNKDG9GQIHnJh19Odb28i7U6NBto8MVQ0nXRb 54gh7cH8UTx8nDCw== From: Thomas Gleixner To: LKML Cc: Linus Torvalds , x86@kernel.org, Wangyang Guo , Arjan van De Ven , "David S. Miller" , Eric Dumazet , Jakub Kicinski , Paolo Abeni , netdev@vger.kernel.org, Will Deacon , Peter Zijlstra , Boqun Feng , Mark Rutland , Marc Zyngier , Arjan Van De Ven Subject: [patch V2 4/4] net: dst: Switch to rcuref_t reference counting References: <20230307125358.772287565@linutronix.de> MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Date: Tue, 7 Mar 2023 13:57:46 +0100 (CET) Precedence: bulk List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Under high contention dst_entry::__refcnt becomes a significant bottleneck. atomic_inc_not_zero() is implemented with a cmpxchg() loop, which goes into high retry rates on contention. Switch the reference count to rcuref_t which results in a significant performance gain. The gain depends on the micro-architecture and the number of concurrent operations and has been measured in the range of +25% to +130% with a localhost memtier/memcached benchmark which amplifies the problem massively. Running the memtier/memcached benchmark over a real (1Gb) network connection the conversion on top of the false sharing fix for struct dst_entry::__refcnt results in a total gain in the 2%-5% range over the upstream baseline. Reported-by: Wangyang Guo Reported-by: Arjan Van De Ven Signed-off-by: Thomas Gleixner Cc: "David S. Miller" Cc: Eric Dumazet Cc: Jakub Kicinski Cc: Paolo Abeni Cc: netdev@vger.kernel.org --- include/net/dst.h | 9 +++++---- include/net/sock.h | 2 +- net/bridge/br_nf_core.c | 2 +- net/core/dst.c | 26 +++++--------------------- net/core/rtnetlink.c | 2 +- net/ipv6/route.c | 6 +++--- net/netfilter/ipvs/ip_vs_xmit.c | 4 ++-- 7 files changed, 18 insertions(+), 33 deletions(-) --- a/include/net/dst.h +++ b/include/net/dst.h @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -65,7 +66,7 @@ struct dst_entry { * input/output/ops or performance tanks badly */ #ifdef CONFIG_64BIT - atomic_t __refcnt; /* 64-bit offset 64 */ + rcuref_t __refcnt; /* 64-bit offset 64 */ #endif int __use; unsigned long lastuse; @@ -75,7 +76,7 @@ struct dst_entry { __u32 tclassid; #ifndef CONFIG_64BIT struct lwtunnel_state *lwtstate; - atomic_t __refcnt; /* 32-bit offset 64 */ + rcuref_t __refcnt; /* 32-bit offset 64 */ #endif netdevice_tracker dev_tracker; @@ -241,7 +242,7 @@ static inline void dst_hold(struct dst_e * the placement of __refcnt in struct dst_entry */ BUILD_BUG_ON(offsetof(struct dst_entry, __refcnt) & 63); - WARN_ON(atomic_inc_not_zero(&dst->__refcnt) == 0); + WARN_ON(!rcuref_get(&dst->__refcnt)); } static inline void dst_use_noref(struct dst_entry *dst, unsigned long time) @@ -305,7 +306,7 @@ static inline void skb_dst_copy(struct s */ static inline bool dst_hold_safe(struct dst_entry *dst) { - return atomic_inc_not_zero(&dst->__refcnt); + return rcuref_get(&dst->__refcnt); } /** --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2131,7 +2131,7 @@ sk_dst_get(struct sock *sk) rcu_read_lock(); dst = rcu_dereference(sk->sk_dst_cache); - if (dst && !atomic_inc_not_zero(&dst->__refcnt)) + if (dst && !rcuref_get(&dst->__refcnt)) dst = NULL; rcu_read_unlock(); return dst; --- a/net/bridge/br_nf_core.c +++ b/net/bridge/br_nf_core.c @@ -73,7 +73,7 @@ void br_netfilter_rtable_init(struct net { struct rtable *rt = &br->fake_rtable; - atomic_set(&rt->dst.__refcnt, 1); + rcuref_init(&rt->dst.__refcnt, 1); rt->dst.dev = br->dev; dst_init_metrics(&rt->dst, br_dst_default_metrics, true); rt->dst.flags = DST_NOXFRM | DST_FAKE_RTABLE; --- a/net/core/dst.c +++ b/net/core/dst.c @@ -66,7 +66,7 @@ void dst_init(struct dst_entry *dst, str dst->tclassid = 0; #endif dst->lwtstate = NULL; - atomic_set(&dst->__refcnt, initial_ref); + rcuref_init(&dst->__refcnt, initial_ref); dst->__use = 0; dst->lastuse = jiffies; dst->flags = flags; @@ -162,31 +162,15 @@ EXPORT_SYMBOL(dst_dev_put); void dst_release(struct dst_entry *dst) { - if (dst) { - int newrefcnt; - - newrefcnt = atomic_dec_return(&dst->__refcnt); - if (WARN_ONCE(newrefcnt < 0, "dst_release underflow")) - net_warn_ratelimited("%s: dst:%p refcnt:%d\n", - __func__, dst, newrefcnt); - if (!newrefcnt) - call_rcu_hurry(&dst->rcu_head, dst_destroy_rcu); - } + if (dst && rcuref_put(&dst->__refcnt)) + call_rcu_hurry(&dst->rcu_head, dst_destroy_rcu); } EXPORT_SYMBOL(dst_release); void dst_release_immediate(struct dst_entry *dst) { - if (dst) { - int newrefcnt; - - newrefcnt = atomic_dec_return(&dst->__refcnt); - if (WARN_ONCE(newrefcnt < 0, "dst_release_immediate underflow")) - net_warn_ratelimited("%s: dst:%p refcnt:%d\n", - __func__, dst, newrefcnt); - if (!newrefcnt) - dst_destroy(dst); - } + if (dst && rcuref_put(&dst->__refcnt)) + dst_destroy(dst); } EXPORT_SYMBOL(dst_release_immediate); --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -840,7 +840,7 @@ int rtnl_put_cacheinfo(struct sk_buff *s if (dst) { ci.rta_lastuse = jiffies_delta_to_clock_t(jiffies - dst->lastuse); ci.rta_used = dst->__use; - ci.rta_clntref = atomic_read(&dst->__refcnt); + ci.rta_clntref = rcuref_read(&dst->__refcnt); } if (expires) { unsigned long clock; --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -293,7 +293,7 @@ static const struct fib6_info fib6_null_ static const struct rt6_info ip6_null_entry_template = { .dst = { - .__refcnt = ATOMIC_INIT(1), + .__refcnt = RCUREF_INIT(1), .__use = 1, .obsolete = DST_OBSOLETE_FORCE_CHK, .error = -ENETUNREACH, @@ -307,7 +307,7 @@ static const struct rt6_info ip6_null_en static const struct rt6_info ip6_prohibit_entry_template = { .dst = { - .__refcnt = ATOMIC_INIT(1), + .__refcnt = RCUREF_INIT(1), .__use = 1, .obsolete = DST_OBSOLETE_FORCE_CHK, .error = -EACCES, @@ -319,7 +319,7 @@ static const struct rt6_info ip6_prohibi static const struct rt6_info ip6_blk_hole_entry_template = { .dst = { - .__refcnt = ATOMIC_INIT(1), + .__refcnt = RCUREF_INIT(1), .__use = 1, .obsolete = DST_OBSOLETE_FORCE_CHK, .error = -EINVAL, --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -339,7 +339,7 @@ static int spin_unlock_bh(&dest->dst_lock); IP_VS_DBG(10, "new dst %pI4, src %pI4, refcnt=%d\n", &dest->addr.ip, &dest_dst->dst_saddr.ip, - atomic_read(&rt->dst.__refcnt)); + rcuref_read(&rt->dst.__refcnt)); } if (ret_saddr) *ret_saddr = dest_dst->dst_saddr.ip; @@ -507,7 +507,7 @@ static int spin_unlock_bh(&dest->dst_lock); IP_VS_DBG(10, "new dst %pI6, src %pI6, refcnt=%d\n", &dest->addr.in6, &dest_dst->dst_saddr.in6, - atomic_read(&rt->dst.__refcnt)); + rcuref_read(&rt->dst.__refcnt)); } if (ret_saddr) *ret_saddr = dest_dst->dst_saddr.in6;