Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1753545AbaG2LmG (ORCPT ); Tue, 29 Jul 2014 07:42:06 -0400 Received: from casper.infradead.org ([85.118.1.10]:35915 "EHLO casper.infradead.org" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1753466AbaG2Llk (ORCPT ); Tue, 29 Jul 2014 07:41:40 -0400 From: Thomas Graf To: davem@davemloft.net, netdev@vger.kernel.org Cc: linux-kernel@vger.kernel.org, kaber@trash.net, paulmck@linux.vnet.ibm.com, josh@joshtriplett.org, challa@noironetworks.com, walpole@cs.pdx.edu, dev@openvswitch.org Subject: [PATCH 2/2] netlink: Convert netlink_lookup() to use RCU protected hash table Date: Tue, 29 Jul 2014 13:41:33 +0200 Message-Id: X-Mailer: git-send-email 1.9.3 In-Reply-To: References: In-Reply-To: References: X-SRS-Rewrite: SMTP reverse-path rewritten from by casper.infradead.org See http://www.infradead.org/rpr.html Sender: linux-kernel-owner@vger.kernel.org List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Heavy Netlink users such as Open vSwitch spend a considerable amount of time in netlink_lookup() due to the read-lock on nl_table_lock. Use of RCU relieves the lock contention. Makes use of the new resizable hash table to avoid locking on the lookup. The hash table will grow if entries exceeds 75% of table size up to a total table size of 64K. It will automatically shrink if usage falls below 50%. Also splits nl_table_lock into a separate spinlock to protect hash table mutations. This avoids a possible deadlock when the hash table growing waits on RCU readers to complete via synchronize_rcu() while readers holding RCU read lock are waiting on the nl_table_lock() to be released to lock the table for broadcasting. Before: 9.16% kpktgend_0 [openvswitch] [k] masked_flow_lookup 6.42% kpktgend_0 [pktgen] [k] mod_cur_headers 6.26% kpktgend_0 [pktgen] [k] pktgen_thread_worker 6.23% kpktgend_0 [kernel.kallsyms] [k] memset 4.79% kpktgend_0 [kernel.kallsyms] [k] netlink_lookup 4.37% kpktgend_0 [kernel.kallsyms] [k] memcpy 3.60% kpktgend_0 [openvswitch] [k] ovs_flow_extract 2.69% kpktgend_0 [kernel.kallsyms] [k] jhash2 After: 15.26% kpktgend_0 [openvswitch] [k] masked_flow_lookup 8.12% kpktgend_0 [pktgen] [k] pktgen_thread_worker 7.92% kpktgend_0 [pktgen] [k] mod_cur_headers 5.11% kpktgend_0 [kernel.kallsyms] [k] memset 4.11% kpktgend_0 [openvswitch] [k] ovs_flow_extract 4.06% kpktgend_0 [kernel.kallsyms] [k] _raw_spin_lock 3.90% kpktgend_0 [kernel.kallsyms] [k] jhash2 [...] 0.67% kpktgend_0 [kernel.kallsyms] [k] netlink_lookup Signed-off-by: Thomas Graf --- net/netlink/af_netlink.c | 285 ++++++++++++++++++----------------------------- net/netlink/af_netlink.h | 18 +-- net/netlink/diag.c | 11 +- 3 files changed, 119 insertions(+), 195 deletions(-) diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 1b38f7f..7f44468 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -58,7 +58,9 @@ #include #include #include +#include #include +#include #include #include @@ -100,6 +102,18 @@ static atomic_t nl_table_users = ATOMIC_INIT(0); #define nl_deref_protected(X) rcu_dereference_protected(X, lockdep_is_held(&nl_table_lock)); +/* Protects netlink socket hash table mutations */ +DEFINE_SPINLOCK(nl_sk_hash_lock); + +static int lockdep_nl_sk_hash_is_held(void) +{ +#ifdef CONFIG_LOCKDEP + return (debug_locks) ? lockdep_is_held(&nl_sk_hash_lock) : 1; +#else + return 1; +#endif +} + static ATOMIC_NOTIFIER_HEAD(netlink_chain); static DEFINE_SPINLOCK(netlink_tap_lock); @@ -110,11 +124,6 @@ static inline u32 netlink_group_mask(u32 group) return group ? 1 << (group - 1) : 0; } -static inline struct hlist_head *nl_portid_hashfn(struct nl_portid_hash *hash, u32 portid) -{ - return &hash->table[jhash_1word(portid, hash->rnd) & hash->mask]; -} - int netlink_add_tap(struct netlink_tap *nt) { if (unlikely(nt->dev->type != ARPHRD_NETLINK)) @@ -983,105 +992,48 @@ netlink_unlock_table(void) wake_up(&nl_table_wait); } -static bool netlink_compare(struct net *net, struct sock *sk) -{ - return net_eq(sock_net(sk), net); -} - -static struct sock *netlink_lookup(struct net *net, int protocol, u32 portid) +struct netlink_compare_arg { - struct netlink_table *table = &nl_table[protocol]; - struct nl_portid_hash *hash = &table->hash; - struct hlist_head *head; - struct sock *sk; - - read_lock(&nl_table_lock); - head = nl_portid_hashfn(hash, portid); - sk_for_each(sk, head) { - if (table->compare(net, sk) && - (nlk_sk(sk)->portid == portid)) { - sock_hold(sk); - goto found; - } - } - sk = NULL; -found: - read_unlock(&nl_table_lock); - return sk; -} + struct net *net; + u32 portid; +}; -static struct hlist_head *nl_portid_hash_zalloc(size_t size) +static bool netlink_compare(void *ptr, void *arg) { - if (size <= PAGE_SIZE) - return kzalloc(size, GFP_ATOMIC); - else - return (struct hlist_head *) - __get_free_pages(GFP_ATOMIC | __GFP_ZERO, - get_order(size)); -} + struct netlink_compare_arg *x = arg; + struct sock *sk = ptr; -static void nl_portid_hash_free(struct hlist_head *table, size_t size) -{ - if (size <= PAGE_SIZE) - kfree(table); - else - free_pages((unsigned long)table, get_order(size)); + return nlk_sk(sk)->portid == x->portid && + net_eq(sock_net(sk), x->net); } -static int nl_portid_hash_rehash(struct nl_portid_hash *hash, int grow) +static struct sock *__netlink_lookup(struct netlink_table *table, u32 portid, + struct net *net) { - unsigned int omask, mask, shift; - size_t osize, size; - struct hlist_head *otable, *table; - int i; - - omask = mask = hash->mask; - osize = size = (mask + 1) * sizeof(*table); - shift = hash->shift; - - if (grow) { - if (++shift > hash->max_shift) - return 0; - mask = mask * 2 + 1; - size *= 2; - } + struct netlink_compare_arg arg = { + .net = net, + .portid = portid, + }; + u32 hash; - table = nl_portid_hash_zalloc(size); - if (!table) - return 0; + hash = rhashtable_hashfn(&table->hash, &portid, sizeof(portid)); - otable = hash->table; - hash->table = table; - hash->mask = mask; - hash->shift = shift; - get_random_bytes(&hash->rnd, sizeof(hash->rnd)); - - for (i = 0; i <= omask; i++) { - struct sock *sk; - struct hlist_node *tmp; - - sk_for_each_safe(sk, tmp, &otable[i]) - __sk_add_node(sk, nl_portid_hashfn(hash, nlk_sk(sk)->portid)); - } - - nl_portid_hash_free(otable, osize); - hash->rehash_time = jiffies + 10 * 60 * HZ; - return 1; + return rhashtable_lookup_compare(&table->hash, hash, + &netlink_compare, &arg); } -static inline int nl_portid_hash_dilute(struct nl_portid_hash *hash, int len) +static struct sock *netlink_lookup(struct net *net, int protocol, u32 portid) { - int avg = hash->entries >> hash->shift; - - if (unlikely(avg > 1) && nl_portid_hash_rehash(hash, 1)) - return 1; + struct netlink_table *table = &nl_table[protocol]; + struct sock *sk; - if (unlikely(len > avg) && time_after(jiffies, hash->rehash_time)) { - nl_portid_hash_rehash(hash, 0); - return 1; - } + rcu_read_lock(); + sk = __netlink_lookup(table, portid, net); + if (sk) + sock_hold(sk); + rcu_read_unlock(); - return 0; + return sk; } static const struct proto_ops netlink_ops; @@ -1113,22 +1065,10 @@ netlink_update_listeners(struct sock *sk) static int netlink_insert(struct sock *sk, struct net *net, u32 portid) { struct netlink_table *table = &nl_table[sk->sk_protocol]; - struct nl_portid_hash *hash = &table->hash; - struct hlist_head *head; int err = -EADDRINUSE; - struct sock *osk; - int len; - netlink_table_grab(); - head = nl_portid_hashfn(hash, portid); - len = 0; - sk_for_each(osk, head) { - if (table->compare(net, osk) && - (nlk_sk(osk)->portid == portid)) - break; - len++; - } - if (osk) + spin_lock(&nl_sk_hash_lock); + if (__netlink_lookup(table, portid, net)) goto err; err = -EBUSY; @@ -1136,26 +1076,31 @@ static int netlink_insert(struct sock *sk, struct net *net, u32 portid) goto err; err = -ENOMEM; - if (BITS_PER_LONG > 32 && unlikely(hash->entries >= UINT_MAX)) + if (BITS_PER_LONG > 32 && unlikely(table->hash.nelems >= UINT_MAX)) goto err; - if (len && nl_portid_hash_dilute(hash, len)) - head = nl_portid_hashfn(hash, portid); - hash->entries++; nlk_sk(sk)->portid = portid; - sk_add_node(sk, head); - err = 0; + sock_hold(sk); + err = rhashtable_insert(&table->hash, &nlk_sk(sk)->node, GFP_ATOMIC); err: - netlink_table_ungrab(); + spin_unlock(&nl_sk_hash_lock); return err; } static void netlink_remove(struct sock *sk) { + struct netlink_table *table; + + spin_lock(&nl_sk_hash_lock); + table = &nl_table[sk->sk_protocol]; + if (rhashtable_remove(&table->hash, &nlk_sk(sk)->node, GFP_ATOMIC)) { + WARN_ON(atomic_read(&sk->sk_refcnt) == 1); + __sock_put(sk); + } + spin_unlock(&nl_sk_hash_lock); + netlink_table_grab(); - if (sk_del_node_init(sk)) - nl_table[sk->sk_protocol].hash.entries--; if (nlk_sk(sk)->subscriptions) __sk_del_bind_node(sk); netlink_table_ungrab(); @@ -1311,6 +1256,9 @@ static int netlink_release(struct socket *sock) } netlink_table_ungrab(); + /* Wait for readers to complete */ + synchronize_net(); + kfree(nlk->groups); nlk->groups = NULL; @@ -1326,30 +1274,22 @@ static int netlink_autobind(struct socket *sock) struct sock *sk = sock->sk; struct net *net = sock_net(sk); struct netlink_table *table = &nl_table[sk->sk_protocol]; - struct nl_portid_hash *hash = &table->hash; - struct hlist_head *head; - struct sock *osk; s32 portid = task_tgid_vnr(current); int err; static s32 rover = -4097; retry: cond_resched(); - netlink_table_grab(); - head = nl_portid_hashfn(hash, portid); - sk_for_each(osk, head) { - if (!table->compare(net, osk)) - continue; - if (nlk_sk(osk)->portid == portid) { - /* Bind collision, search negative portid values. */ - portid = rover--; - if (rover > -4097) - rover = -4097; - netlink_table_ungrab(); - goto retry; - } + rcu_read_lock(); + if (__netlink_lookup(table, portid, net)) { + /* Bind collision, search negative portid values. */ + portid = rover--; + if (rover > -4097) + rover = -4097; + rcu_read_unlock(); + goto retry; } - netlink_table_ungrab(); + rcu_read_unlock(); err = netlink_insert(sk, net, portid); if (err == -EADDRINUSE) @@ -2953,14 +2893,18 @@ static struct sock *netlink_seq_socket_idx(struct seq_file *seq, loff_t pos) { struct nl_seq_iter *iter = seq->private; int i, j; + struct netlink_sock *nlk; struct sock *s; loff_t off = 0; for (i = 0; i < MAX_LINKS; i++) { - struct nl_portid_hash *hash = &nl_table[i].hash; + struct rhashtable *ht = &nl_table[i].hash; + const struct bucket_table *tbl = rht_dereference(ht->tbl, ht); + + for (j = 0; j < tbl->size; j++) { + rht_for_each_entry_rcu(nlk, tbl->buckets[j], node) { + s = (struct sock *)nlk; - for (j = 0; j <= hash->mask; j++) { - sk_for_each(s, &hash->table[j]) { if (sock_net(s) != seq_file_net(seq)) continue; if (off == pos) { @@ -2976,15 +2920,14 @@ static struct sock *netlink_seq_socket_idx(struct seq_file *seq, loff_t pos) } static void *netlink_seq_start(struct seq_file *seq, loff_t *pos) - __acquires(nl_table_lock) { - read_lock(&nl_table_lock); + rcu_read_lock(); return *pos ? netlink_seq_socket_idx(seq, *pos - 1) : SEQ_START_TOKEN; } static void *netlink_seq_next(struct seq_file *seq, void *v, loff_t *pos) { - struct sock *s; + struct netlink_sock *nlk; struct nl_seq_iter *iter; struct net *net; int i, j; @@ -2996,28 +2939,26 @@ static void *netlink_seq_next(struct seq_file *seq, void *v, loff_t *pos) net = seq_file_net(seq); iter = seq->private; - s = v; - do { - s = sk_next(s); - } while (s && !nl_table[s->sk_protocol].compare(net, s)); - if (s) - return s; + nlk = v; + + rht_for_each_entry_rcu(nlk, nlk->node.next, node) + if (net_eq(sock_net((struct sock *)nlk), net)) + return nlk; i = iter->link; j = iter->hash_idx + 1; do { - struct nl_portid_hash *hash = &nl_table[i].hash; - - for (; j <= hash->mask; j++) { - s = sk_head(&hash->table[j]); + struct rhashtable *ht = &nl_table[i].hash; + const struct bucket_table *tbl = rht_dereference(ht->tbl, ht); - while (s && !nl_table[s->sk_protocol].compare(net, s)) - s = sk_next(s); - if (s) { - iter->link = i; - iter->hash_idx = j; - return s; + for (; j <= tbl->size; j++) { + rht_for_each_entry_rcu(nlk, tbl->buckets[j], node) { + if (net_eq(sock_net((struct sock *)nlk), net)) { + iter->link = i; + iter->hash_idx = j; + return nlk; + } } } @@ -3028,9 +2969,8 @@ static void *netlink_seq_next(struct seq_file *seq, void *v, loff_t *pos) } static void netlink_seq_stop(struct seq_file *seq, void *v) - __releases(nl_table_lock) { - read_unlock(&nl_table_lock); + rcu_read_unlock(); } @@ -3168,9 +3108,17 @@ static struct pernet_operations __net_initdata netlink_net_ops = { static int __init netlink_proto_init(void) { int i; - unsigned long limit; - unsigned int order; int err = proto_register(&netlink_proto, 0); + struct rhashtable_params ht_params = { + .head_offset = offsetof(struct netlink_sock, node), + .key_offset = offsetof(struct netlink_sock, portid), + .key_len = sizeof(u32), /* portid */ + .hashfn = arch_fast_hash, + .max_shift = 16, /* 64K */ + .grow_decision = rht_grow_above_75, + .shrink_decision = rht_shrink_below_50, + .mutex_is_held = lockdep_nl_sk_hash_is_held, + }; if (err != 0) goto out; @@ -3181,32 +3129,13 @@ static int __init netlink_proto_init(void) if (!nl_table) goto panic; - if (totalram_pages >= (128 * 1024)) - limit = totalram_pages >> (21 - PAGE_SHIFT); - else - limit = totalram_pages >> (23 - PAGE_SHIFT); - - order = get_bitmask_order(limit) - 1 + PAGE_SHIFT; - limit = (1UL << order) / sizeof(struct hlist_head); - order = get_bitmask_order(min(limit, (unsigned long)UINT_MAX)) - 1; - for (i = 0; i < MAX_LINKS; i++) { - struct nl_portid_hash *hash = &nl_table[i].hash; - - hash->table = nl_portid_hash_zalloc(1 * sizeof(*hash->table)); - if (!hash->table) { - while (i-- > 0) - nl_portid_hash_free(nl_table[i].hash.table, - 1 * sizeof(*hash->table)); + if (rhashtable_init(&nl_table[i].hash, &ht_params) < 0) { + while (--i > 0) + rhashtable_destroy(&nl_table[i].hash); kfree(nl_table); goto panic; } - hash->max_shift = order; - hash->shift = 0; - hash->mask = 0; - hash->rehash_time = jiffies; - - nl_table[i].compare = netlink_compare; } INIT_LIST_HEAD(&netlink_tap_all); diff --git a/net/netlink/af_netlink.h b/net/netlink/af_netlink.h index 0b59d44..60f631f 100644 --- a/net/netlink/af_netlink.h +++ b/net/netlink/af_netlink.h @@ -1,6 +1,7 @@ #ifndef _AF_NETLINK_H #define _AF_NETLINK_H +#include #include #define NLGRPSZ(x) (ALIGN(x, sizeof(unsigned long) * 8) / 8) @@ -47,6 +48,8 @@ struct netlink_sock { struct netlink_ring tx_ring; atomic_t mapped; #endif /* CONFIG_NETLINK_MMAP */ + + struct rhash_head node; }; static inline struct netlink_sock *nlk_sk(struct sock *sk) @@ -54,21 +57,8 @@ static inline struct netlink_sock *nlk_sk(struct sock *sk) return container_of(sk, struct netlink_sock, sk); } -struct nl_portid_hash { - struct hlist_head *table; - unsigned long rehash_time; - - unsigned int mask; - unsigned int shift; - - unsigned int entries; - unsigned int max_shift; - - u32 rnd; -}; - struct netlink_table { - struct nl_portid_hash hash; + struct rhashtable hash; struct hlist_head mc_list; struct listeners __rcu *listeners; unsigned int flags; diff --git a/net/netlink/diag.c b/net/netlink/diag.c index 1af29624..7588f34 100644 --- a/net/netlink/diag.c +++ b/net/netlink/diag.c @@ -4,6 +4,7 @@ #include #include #include +#include #include "af_netlink.h" @@ -101,16 +102,20 @@ static int __netlink_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, int protocol, int s_num) { struct netlink_table *tbl = &nl_table[protocol]; - struct nl_portid_hash *hash = &tbl->hash; + struct rhashtable *ht = &tbl->hash; + const struct bucket_table *htbl = rht_dereference(ht->tbl, ht); struct net *net = sock_net(skb->sk); struct netlink_diag_req *req; + struct netlink_sock *nlsk; struct sock *sk; int ret = 0, num = 0, i; req = nlmsg_data(cb->nlh); - for (i = 0; i <= hash->mask; i++) { - sk_for_each(sk, &hash->table[i]) { + for (i = 0; i <= htbl->size; i++) { + rht_for_each_entry(nlsk, htbl->buckets[i], ht, node) { + sk = (struct sock *)nlsk; + if (!net_eq(sock_net(sk), net)) continue; if (num < s_num) { -- 1.9.3 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/