Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1756284AbZDPUus (ORCPT ); Thu, 16 Apr 2009 16:50:48 -0400 Received: (majordomo@vger.kernel.org) by vger.kernel.org id S1754410AbZDPUuL (ORCPT ); Thu, 16 Apr 2009 16:50:11 -0400 Received: from mail.vyatta.com ([76.74.103.46]:55491 "EHLO mail.vyatta.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1753575AbZDPUuG (ORCPT ); Thu, 16 Apr 2009 16:50:06 -0400 Date: Thu, 16 Apr 2009 13:49:56 -0700 From: Stephen Hemminger To: Eric Dumazet , paulmck@linux.vnet.ibm.com, Patrick McHardy , David Miller , Linus Torvalds Cc: jeff.chua.linux@gmail.com, paulus@samba.org, mingo@elte.hu, laijs@cn.fujitsu.com, jengelh@medozas.de, r000n@r000n.net, linux-kernel@vger.kernel.org, netfilter-devel@vger.kernel.org, netdev@vger.kernel.org, benh@kernel.crashing.org Subject: [PATCH[] netfilter: use per-cpu reader-writer lock (v0.7) Message-ID: <20090416134956.6c1f0087@nehalam> In-Reply-To: <49E77BF6.1080206@cosmosbay.com> References: <20090415135526.2afc4d18@nehalam> <49E64C91.5020708@cosmosbay.com> <20090415.164811.19905145.davem@davemloft.net> <20090415170111.6e1ca264@nehalam> <20090415174551.529d241c@nehalam> <49E6BBA9.2030701@cosmosbay.com> <49E7384B.5020707@trash.net> <20090416144748.GB6924@linux.vnet.ibm.com> <49E75876.10509@cosmosbay.com> <20090416175850.GH6924@linux.vnet.ibm.com> <49E77BF6.1080206@cosmosbay.com> Organization: Vyatta X-Mailer: Claws Mail 3.6.1 (GTK+ 2.16.1; x86_64-pc-linux-gnu) Mime-Version: 1.0 Content-Type: text/plain; charset=US-ASCII Content-Transfer-Encoding: 7bit Sender: linux-kernel-owner@vger.kernel.org List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Content-Length: 19947 Lines: 729 This version of x_tables (ip/ip6/arp) locking uses a per-cpu rwlock that can be nested. It is sort of like earlier brwlock (fast reader, slow writer). The locking is isolated so future improvements can concentrate on measuring/optimizing xt_table_info_lock. I tried other versions based on recursive spin locks and sequence counters and for me, the risk of inventing own locking primitives not worth it at this time. The idea for this came from an earlier version done by Eric Dumazet. Locking is done per-cpu, the fast path locks on the current cpu and updates counters. This reduces the contention of a single reader lock (in 2.6.29) without the delay of synchronize_net() (in 2.6.30-rc2). The mutex that was added for 2.6.30 in xt_table is unnecessary since there already is a mutex for xt[af].mutex that is held. Lockdep reports bogus warnings on this, so using raw_write_lock might be necessary. Signed-off-by: Stephen Hemminger valid_hooks & (1 << hook)); - - rcu_read_lock_bh(); - private = rcu_dereference(table->private); - table_base = rcu_dereference(private->entries[smp_processor_id()]); + xt_table_info_lock(); + private = table->private; + table_base = private->entries[smp_processor_id()]; e = get_entry(table_base, private->hook_entry[hook]); @@ -436,8 +435,7 @@ ipt_do_table(struct sk_buff *skb, e = (void *)e + e->next_offset; } } while (!hotdrop); - - rcu_read_unlock_bh(); + xt_table_info_unlock(); #ifdef DEBUG_ALLOW_ALL return NF_ACCEPT; @@ -918,60 +916,6 @@ get_counters(const struct xt_table_info counters, &i); } - -} - -/* We're lazy, and add to the first CPU; overflow works its fey magic - * and everything is OK. */ -static int -add_counter_to_entry(struct ipt_entry *e, - const struct xt_counters addme[], - unsigned int *i) -{ - ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt); - - (*i)++; - return 0; -} - -/* Take values from counters and add them back onto the current cpu */ -static void put_counters(struct xt_table_info *t, - const struct xt_counters counters[]) -{ - unsigned int i, cpu; - - local_bh_disable(); - cpu = smp_processor_id(); - i = 0; - IPT_ENTRY_ITERATE(t->entries[cpu], - t->size, - add_counter_to_entry, - counters, - &i); - local_bh_enable(); -} - - -static inline int -zero_entry_counter(struct ipt_entry *e, void *arg) -{ - e->counters.bcnt = 0; - e->counters.pcnt = 0; - return 0; -} - -static void -clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info) -{ - unsigned int cpu; - const void *loc_cpu_entry = info->entries[raw_smp_processor_id()]; - - memcpy(newinfo, info, offsetof(struct xt_table_info, entries)); - for_each_possible_cpu(cpu) { - memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size); - IPT_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size, - zero_entry_counter, NULL); - } } static struct xt_counters * alloc_counters(struct xt_table *table) @@ -979,7 +923,6 @@ static struct xt_counters * alloc_counte unsigned int countersize; struct xt_counters *counters; struct xt_table_info *private = table->private; - struct xt_table_info *info; /* We need atomic snapshot of counters: rest doesn't change (other than comefrom, which userspace doesn't care @@ -988,30 +931,13 @@ static struct xt_counters * alloc_counte counters = vmalloc_node(countersize, numa_node_id()); if (counters == NULL) - goto nomem; - - info = xt_alloc_table_info(private->size); - if (!info) - goto free_counters; - - clone_counters(info, private); + return ERR_PTR(-ENOMEM); - mutex_lock(&table->lock); - xt_table_entry_swap_rcu(private, info); - synchronize_net(); /* Wait until smoke has cleared */ - - get_counters(info, counters); - put_counters(private, counters); - mutex_unlock(&table->lock); - - xt_free_table_info(info); + xt_table_info_lock_all(); + get_counters(private, counters); + xt_table_info_unlock_all(); return counters; - - free_counters: - vfree(counters); - nomem: - return ERR_PTR(-ENOMEM); } static int @@ -1377,6 +1303,18 @@ do_replace(struct net *net, void __user return ret; } +/* We're lazy, and add to the first CPU; overflow works its fey magic + * and everything is OK. */ +static int +add_counter_to_entry(struct ipt_entry *e, + const struct xt_counters addme[], + unsigned int *i) +{ + ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt); + + (*i)++; + return 0; +} static int do_add_counters(struct net *net, void __user *user, unsigned int len, int compat) @@ -1437,25 +1375,23 @@ do_add_counters(struct net *net, void __ goto free; } - mutex_lock(&t->lock); + xt_table_info_lock_all(); private = t->private; if (private->number != num_counters) { ret = -EINVAL; goto unlock_up_free; } - preempt_disable(); i = 0; /* Choose the copy that is on our node */ - loc_cpu_entry = private->entries[raw_smp_processor_id()]; + loc_cpu_entry = private->entries[smp_processor_id()]; IPT_ENTRY_ITERATE(loc_cpu_entry, private->size, add_counter_to_entry, paddc, &i); - preempt_enable(); unlock_up_free: - mutex_unlock(&t->lock); + xt_table_info_unlock_all(); xt_table_unlock(t); module_put(t->me); free: --- a/net/netfilter/x_tables.c 2009-04-16 13:40:57.174740286 -0700 +++ b/net/netfilter/x_tables.c 2009-04-16 13:40:58.880757376 -0700 @@ -625,20 +625,6 @@ void xt_free_table_info(struct xt_table_ } EXPORT_SYMBOL(xt_free_table_info); -void xt_table_entry_swap_rcu(struct xt_table_info *oldinfo, - struct xt_table_info *newinfo) -{ - unsigned int cpu; - - for_each_possible_cpu(cpu) { - void *p = oldinfo->entries[cpu]; - rcu_assign_pointer(oldinfo->entries[cpu], newinfo->entries[cpu]); - newinfo->entries[cpu] = p; - } - -} -EXPORT_SYMBOL_GPL(xt_table_entry_swap_rcu); - /* Find table by name, grabs mutex & ref. Returns ERR_PTR() on error. */ struct xt_table *xt_find_table_lock(struct net *net, u_int8_t af, const char *name) @@ -676,6 +662,40 @@ void xt_compat_unlock(u_int8_t af) EXPORT_SYMBOL_GPL(xt_compat_unlock); #endif +DEFINE_PER_CPU(rwlock_t, xt_info_locks); +EXPORT_PER_CPU_SYMBOL_GPL(xt_info_locks); + +/** + * xt_table_info_lock_all - lock xt table info for update + * + * Locks out all readers, and blocks bottom half + */ +void xt_table_info_lock_all(void) +{ + int i; + + local_bh_disable(); + for_each_possible_cpu(i) + write_lock(&per_cpu(xt_info_locks, i)); + +} +EXPORT_SYMBOL_GPL(xt_table_info_lock_all); + +/** + * xt_table_info_unlock_all - lock xt table info for update + * + * Unlocks all readers, and unblocks bottom half + */ +void xt_table_info_unlock_all(void) +{ + int i; + + for_each_possible_cpu(i) + write_unlock(&per_cpu(xt_info_locks, i)); + local_bh_enable(); +} +EXPORT_SYMBOL_GPL(xt_table_info_unlock_all); + struct xt_table_info * xt_replace_table(struct xt_table *table, unsigned int num_counters, @@ -685,22 +705,21 @@ xt_replace_table(struct xt_table *table, struct xt_table_info *oldinfo, *private; /* Do the substitution. */ - mutex_lock(&table->lock); + xt_table_info_lock_all(); private = table->private; /* Check inside lock: is the old number correct? */ if (num_counters != private->number) { duprintf("num_counters != table->private->number (%u/%u)\n", num_counters, private->number); - mutex_unlock(&table->lock); + xt_table_info_unlock_all(); *error = -EAGAIN; return NULL; } oldinfo = private; - rcu_assign_pointer(table->private, newinfo); - newinfo->initial_entries = oldinfo->initial_entries; - mutex_unlock(&table->lock); + table->private = newinfo; + newinfo->initial_entries = private->initial_entries; + xt_table_info_unlock_all(); - synchronize_net(); return oldinfo; } EXPORT_SYMBOL_GPL(xt_replace_table); @@ -734,7 +753,6 @@ struct xt_table *xt_register_table(struc /* Simplifies replace_table code. */ table->private = bootstrap; - mutex_init(&table->lock); if (!xt_replace_table(table, 0, newinfo, &ret)) goto unlock; @@ -1149,6 +1167,9 @@ static int __init xt_init(void) { int i, rv; + for_each_possible_cpu(i) + rwlock_init(&per_cpu(xt_info_locks, i)); + xt = kmalloc(sizeof(struct xt_af) * NFPROTO_NUMPROTO, GFP_KERNEL); if (!xt) return -ENOMEM; --- a/net/ipv6/netfilter/ip6_tables.c 2009-04-16 13:40:57.205741715 -0700 +++ b/net/ipv6/netfilter/ip6_tables.c 2009-04-16 13:40:58.882756612 -0700 @@ -365,9 +365,9 @@ ip6t_do_table(struct sk_buff *skb, IP_NF_ASSERT(table->valid_hooks & (1 << hook)); - rcu_read_lock_bh(); - private = rcu_dereference(table->private); - table_base = rcu_dereference(private->entries[smp_processor_id()]); + xt_table_info_lock(); + private = table->private; + table_base = private->entries[smp_processor_id()]; e = get_entry(table_base, private->hook_entry[hook]); @@ -466,7 +466,7 @@ ip6t_do_table(struct sk_buff *skb, #ifdef CONFIG_NETFILTER_DEBUG ((struct ip6t_entry *)table_base)->comefrom = NETFILTER_LINK_POISON; #endif - rcu_read_unlock_bh(); + xt_table_info_unlock(); #ifdef DEBUG_ALLOW_ALL return NF_ACCEPT; @@ -949,64 +949,11 @@ get_counters(const struct xt_table_info } } -/* We're lazy, and add to the first CPU; overflow works its fey magic - * and everything is OK. */ -static int -add_counter_to_entry(struct ip6t_entry *e, - const struct xt_counters addme[], - unsigned int *i) -{ - ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt); - - (*i)++; - return 0; -} - -/* Take values from counters and add them back onto the current cpu */ -static void put_counters(struct xt_table_info *t, - const struct xt_counters counters[]) -{ - unsigned int i, cpu; - - local_bh_disable(); - cpu = smp_processor_id(); - i = 0; - IP6T_ENTRY_ITERATE(t->entries[cpu], - t->size, - add_counter_to_entry, - counters, - &i); - local_bh_enable(); -} - -static inline int -zero_entry_counter(struct ip6t_entry *e, void *arg) -{ - e->counters.bcnt = 0; - e->counters.pcnt = 0; - return 0; -} - -static void -clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info) -{ - unsigned int cpu; - const void *loc_cpu_entry = info->entries[raw_smp_processor_id()]; - - memcpy(newinfo, info, offsetof(struct xt_table_info, entries)); - for_each_possible_cpu(cpu) { - memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size); - IP6T_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size, - zero_entry_counter, NULL); - } -} - static struct xt_counters *alloc_counters(struct xt_table *table) { unsigned int countersize; struct xt_counters *counters; struct xt_table_info *private = table->private; - struct xt_table_info *info; /* We need atomic snapshot of counters: rest doesn't change (other than comefrom, which userspace doesn't care @@ -1015,30 +962,13 @@ static struct xt_counters *alloc_counter counters = vmalloc_node(countersize, numa_node_id()); if (counters == NULL) - goto nomem; - - info = xt_alloc_table_info(private->size); - if (!info) - goto free_counters; + return ERR_PTR(-ENOMEM); - clone_counters(info, private); - - mutex_lock(&table->lock); - xt_table_entry_swap_rcu(private, info); - synchronize_net(); /* Wait until smoke has cleared */ - - get_counters(info, counters); - put_counters(private, counters); - mutex_unlock(&table->lock); - - xt_free_table_info(info); + xt_table_info_lock_all(); + get_counters(private, counters); + xt_table_info_unlock_all(); return counters; - - free_counters: - vfree(counters); - nomem: - return ERR_PTR(-ENOMEM); } static int @@ -1405,6 +1335,19 @@ do_replace(struct net *net, void __user return ret; } +/* We're lazy, and add to the first CPU; overflow works its fey magic + * and everything is OK. */ +static int +add_counter_to_entry(struct ip6t_entry *e, + const struct xt_counters addme[], + unsigned int *i) +{ + ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt); + + (*i)++; + return 0; +} + static int do_add_counters(struct net *net, void __user *user, unsigned int len, int compat) @@ -1465,25 +1408,24 @@ do_add_counters(struct net *net, void __ goto free; } - mutex_lock(&t->lock); + xt_table_info_lock_all(); private = t->private; if (private->number != num_counters) { ret = -EINVAL; goto unlock_up_free; } - preempt_disable(); i = 0; /* Choose the copy that is on our node */ - loc_cpu_entry = private->entries[raw_smp_processor_id()]; + loc_cpu_entry = private->entries[smp_processor_id()]; IP6T_ENTRY_ITERATE(loc_cpu_entry, private->size, add_counter_to_entry, paddc, &i); - preempt_enable(); + unlock_up_free: - mutex_unlock(&t->lock); + xt_table_info_unlock_all(); xt_table_unlock(t); module_put(t->me); free: --- a/net/ipv4/netfilter/arp_tables.c 2009-04-16 13:40:57.226788977 -0700 +++ b/net/ipv4/netfilter/arp_tables.c 2009-04-16 13:40:58.897816063 -0700 @@ -253,9 +253,9 @@ unsigned int arpt_do_table(struct sk_buf indev = in ? in->name : nulldevname; outdev = out ? out->name : nulldevname; - rcu_read_lock_bh(); - private = rcu_dereference(table->private); - table_base = rcu_dereference(private->entries[smp_processor_id()]); + xt_table_info_lock(); + private = table->private; + table_base = private->entries[smp_processor_id()]; e = get_entry(table_base, private->hook_entry[hook]); back = get_entry(table_base, private->underflow[hook]); @@ -273,6 +273,7 @@ unsigned int arpt_do_table(struct sk_buf hdr_len = sizeof(*arp) + (2 * sizeof(struct in_addr)) + (2 * skb->dev->addr_len); + ADD_COUNTER(e->counters, hdr_len, 1); t = arpt_get_target(e); @@ -328,8 +329,7 @@ unsigned int arpt_do_table(struct sk_buf e = (void *)e + e->next_offset; } } while (!hotdrop); - - rcu_read_unlock_bh(); + xt_table_info_unlock(); if (hotdrop) return NF_DROP; @@ -734,65 +734,11 @@ static void get_counters(const struct xt } } - -/* We're lazy, and add to the first CPU; overflow works its fey magic - * and everything is OK. */ -static int -add_counter_to_entry(struct arpt_entry *e, - const struct xt_counters addme[], - unsigned int *i) -{ - ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt); - - (*i)++; - return 0; -} - -/* Take values from counters and add them back onto the current cpu */ -static void put_counters(struct xt_table_info *t, - const struct xt_counters counters[]) -{ - unsigned int i, cpu; - - local_bh_disable(); - cpu = smp_processor_id(); - i = 0; - ARPT_ENTRY_ITERATE(t->entries[cpu], - t->size, - add_counter_to_entry, - counters, - &i); - local_bh_enable(); -} - -static inline int -zero_entry_counter(struct arpt_entry *e, void *arg) -{ - e->counters.bcnt = 0; - e->counters.pcnt = 0; - return 0; -} - -static void -clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info) -{ - unsigned int cpu; - const void *loc_cpu_entry = info->entries[raw_smp_processor_id()]; - - memcpy(newinfo, info, offsetof(struct xt_table_info, entries)); - for_each_possible_cpu(cpu) { - memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size); - ARPT_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size, - zero_entry_counter, NULL); - } -} - static struct xt_counters *alloc_counters(struct xt_table *table) { unsigned int countersize; struct xt_counters *counters; struct xt_table_info *private = table->private; - struct xt_table_info *info; /* We need atomic snapshot of counters: rest doesn't change * (other than comefrom, which userspace doesn't care @@ -802,30 +748,13 @@ static struct xt_counters *alloc_counter counters = vmalloc_node(countersize, numa_node_id()); if (counters == NULL) - goto nomem; - - info = xt_alloc_table_info(private->size); - if (!info) - goto free_counters; + return ERR_PTR(-ENOMEM); - clone_counters(info, private); - - mutex_lock(&table->lock); - xt_table_entry_swap_rcu(private, info); - synchronize_net(); /* Wait until smoke has cleared */ - - get_counters(info, counters); - put_counters(private, counters); - mutex_unlock(&table->lock); - - xt_free_table_info(info); + xt_table_info_lock_all(); + get_counters(private, counters); + xt_table_info_unlock_all(); return counters; - - free_counters: - vfree(counters); - nomem: - return ERR_PTR(-ENOMEM); } static int copy_entries_to_user(unsigned int total_size, @@ -1165,6 +1094,19 @@ static int do_replace(struct net *net, v return ret; } +/* We're lazy, and add to the first CPU; overflow works its fey magic + * and everything is OK. */ +static int +add_counter_to_entry(struct arpt_entry *e, + const struct xt_counters addme[], + unsigned int *i) +{ + ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt); + + (*i)++; + return 0; +} + static int do_add_counters(struct net *net, void __user *user, unsigned int len, int compat) { @@ -1224,14 +1166,13 @@ static int do_add_counters(struct net *n goto free; } - mutex_lock(&t->lock); + xt_table_info_lock_all(); private = t->private; if (private->number != num_counters) { ret = -EINVAL; goto unlock_up_free; } - preempt_disable(); i = 0; /* Choose the copy that is on our node */ loc_cpu_entry = private->entries[smp_processor_id()]; @@ -1240,10 +1181,9 @@ static int do_add_counters(struct net *n add_counter_to_entry, paddc, &i); - preempt_enable(); - unlock_up_free: - mutex_unlock(&t->lock); + unlock_up_free: + xt_table_info_unlock_all(); xt_table_unlock(t); module_put(t->me); free: -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/