2010-08-17 07:16:46

by Changli Gao

Subject: [PATCH] netfilter: save the hash of the tuple in the original direction for later use

Since we don't change the tuple in the original direction, we can save its
hash in ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev for __nf_conntrack_confirm()
to use.

__hash_conntrack() is split into two steps: ____hash_conntrack() is used
to get the raw hash, and __hash_bucket() is used to get the bucket id.

In the SYN-flood case, early_drop() doesn't need to recompute the hash.

Signed-off-by: Changli Gao <[email protected]>
---
net/netfilter/nf_conntrack_core.c | 117 ++++++++++++++++++++++++++------------
1 file changed, 82 insertions(+), 35 deletions(-)
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index df3eedb..947f0a9 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -65,14 +65,23 @@ EXPORT_SYMBOL_GPL(nf_conntrack_max);
DEFINE_PER_CPU(struct nf_conn, nf_conntrack_untracked);
EXPORT_PER_CPU_SYMBOL(nf_conntrack_untracked);

-static int nf_conntrack_hash_rnd_initted;
-static unsigned int nf_conntrack_hash_rnd;
-
-static u_int32_t __hash_conntrack(const struct nf_conntrack_tuple *tuple,
- u16 zone, unsigned int size, unsigned int rnd)
+static u32 ____hash_conntrack(const struct nf_conntrack_tuple *tuple, u16 zone)
{
unsigned int n;
u_int32_t h;
+ static int rnd_initted;
+ static unsigned int rnd;
+ static DEFINE_SPINLOCK(rnd_lock);
+
+ if (unlikely(!rnd_initted)) {
+ spin_lock_bh(&rnd_lock);
+ if (!rnd_initted) {
+ get_random_bytes(&rnd, sizeof(rnd));
+ wmb();
+ rnd_initted = 1;
+ }
+ spin_unlock_bh(&rnd_lock);
+ }

/* The direction must be ignored, so we hash everything up to the
* destination ports (which is a multiple of 4) and treat the last
@@ -83,14 +92,29 @@ static u_int32_t __hash_conntrack(const struct nf_conntrack_tuple *tuple,
zone ^ rnd ^ (((__force __u16)tuple->dst.u.all << 16) |
tuple->dst.protonum));

- return ((u64)h * size) >> 32;
+ return h;
+}
+
+static u32 __hash_bucket(u32 __hash, unsigned int size)
+{
+ return ((u64)__hash * size) >> 32;
+}
+
+static u32 hash_bucket(u32 __hash, const struct net *net)
+{
+ return __hash_bucket(__hash, net->ct.htable_size);
+}
+
+static u_int32_t __hash_conntrack(const struct nf_conntrack_tuple *tuple,
+ u16 zone, unsigned int size)
+{
+ return __hash_bucket(____hash_conntrack(tuple, zone), size);
}

static inline u_int32_t hash_conntrack(const struct net *net, u16 zone,
const struct nf_conntrack_tuple *tuple)
{
- return __hash_conntrack(tuple, zone, net->ct.htable_size,
- nf_conntrack_hash_rnd);
+ return __hash_conntrack(tuple, zone, net->ct.htable_size);
}

bool
@@ -292,13 +316,13 @@ static void death_by_timeout(unsigned long ul_conntrack)
* OR
* - Caller must lock nf_conntrack_lock before calling this function
*/
-struct nf_conntrack_tuple_hash *
-__nf_conntrack_find(struct net *net, u16 zone,
- const struct nf_conntrack_tuple *tuple)
+static struct nf_conntrack_tuple_hash *
+____nf_conntrack_find(struct net *net, u16 zone,
+ const struct nf_conntrack_tuple *tuple, u32 __hash)
{
struct nf_conntrack_tuple_hash *h;
struct hlist_nulls_node *n;
- unsigned int hash = hash_conntrack(net, zone, tuple);
+ unsigned int hash = hash_bucket(__hash, net);

/* Disable BHs the entire time since we normally need to disable them
* at least once for the stats anyway.
@@ -327,19 +351,27 @@ begin:

return NULL;
}
+
+struct nf_conntrack_tuple_hash *
+__nf_conntrack_find(struct net *net, u16 zone,
+ const struct nf_conntrack_tuple *tuple)
+{
+ return ____nf_conntrack_find(net, zone, tuple,
+ ____hash_conntrack(tuple, zone));
+}
EXPORT_SYMBOL_GPL(__nf_conntrack_find);

/* Find a connection corresponding to a tuple. */
-struct nf_conntrack_tuple_hash *
-nf_conntrack_find_get(struct net *net, u16 zone,
- const struct nf_conntrack_tuple *tuple)
+static struct nf_conntrack_tuple_hash *
+__nf_conntrack_find_get(struct net *net, u16 zone,
+ const struct nf_conntrack_tuple *tuple, u32 __hash)
{
struct nf_conntrack_tuple_hash *h;
struct nf_conn *ct;

rcu_read_lock();
begin:
- h = __nf_conntrack_find(net, zone, tuple);
+ h = ____nf_conntrack_find(net, zone, tuple, __hash);
if (h) {
ct = nf_ct_tuplehash_to_ctrack(h);
if (unlikely(nf_ct_is_dying(ct) ||
@@ -357,6 +389,14 @@ begin:

return h;
}
+
+struct nf_conntrack_tuple_hash *
+nf_conntrack_find_get(struct net *net, u16 zone,
+ const struct nf_conntrack_tuple *tuple)
+{
+ return __nf_conntrack_find_get(net, zone, tuple,
+ ____hash_conntrack(tuple, zone));
+}
EXPORT_SYMBOL_GPL(nf_conntrack_find_get);

static void __nf_conntrack_hash_insert(struct nf_conn *ct,
@@ -409,7 +449,8 @@ __nf_conntrack_confirm(struct sk_buff *skb)
return NF_ACCEPT;

zone = nf_ct_zone(ct);
- hash = hash_conntrack(net, zone, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
+ /* reuse the __hash saved before */
+ hash = hash_bucket(*(unsigned long *)&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev, net);
repl_hash = hash_conntrack(net, zone, &ct->tuplehash[IP_CT_DIR_REPLY].tuple);

/* We're not in hash table, and we refuse to set up related
@@ -567,25 +608,20 @@ static noinline int early_drop(struct net *net, unsigned int hash)
return dropped;
}

-struct nf_conn *nf_conntrack_alloc(struct net *net, u16 zone,
- const struct nf_conntrack_tuple *orig,
- const struct nf_conntrack_tuple *repl,
- gfp_t gfp)
+static struct nf_conn *
+__nf_conntrack_alloc(struct net *net, u16 zone,
+ const struct nf_conntrack_tuple *orig,
+ const struct nf_conntrack_tuple *repl,
+ gfp_t gfp, u32 __hash)
{
struct nf_conn *ct;

- if (unlikely(!nf_conntrack_hash_rnd_initted)) {
- get_random_bytes(&nf_conntrack_hash_rnd,
- sizeof(nf_conntrack_hash_rnd));
- nf_conntrack_hash_rnd_initted = 1;
- }
-
/* We don't want any race condition at early drop stage */
atomic_inc(&net->ct.count);

if (nf_conntrack_max &&
unlikely(atomic_read(&net->ct.count) > nf_conntrack_max)) {
- unsigned int hash = hash_conntrack(net, zone, orig);
+ unsigned int hash = hash_bucket(__hash, net);
if (!early_drop(net, hash)) {
atomic_dec(&net->ct.count);
if (net_ratelimit())
@@ -616,7 +652,8 @@ struct nf_conn *nf_conntrack_alloc(struct net *net, u16 zone,
ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig;
ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode.pprev = NULL;
ct->tuplehash[IP_CT_DIR_REPLY].tuple = *repl;
- ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev = NULL;
+ /* save __hash for reusing when confirming */
+ *(unsigned long *)(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev) = __hash;
/* Don't set timer yet: wait for confirmation */
setup_timer(&ct->timeout, death_by_timeout, (unsigned long)ct);
write_pnet(&ct->ct_net, net);
@@ -643,6 +680,14 @@ out_free:
return ERR_PTR(-ENOMEM);
#endif
}
+
+struct nf_conn *nf_conntrack_alloc(struct net *net, u16 zone,
+ const struct nf_conntrack_tuple *orig,
+ const struct nf_conntrack_tuple *repl,
+ gfp_t gfp)
+{
+ return __nf_conntrack_alloc(net, zone, orig, repl, gfp, 0);
+}
EXPORT_SYMBOL_GPL(nf_conntrack_alloc);

void nf_conntrack_free(struct nf_conn *ct)
@@ -664,7 +709,7 @@ init_conntrack(struct net *net, struct nf_conn *tmpl,
struct nf_conntrack_l3proto *l3proto,
struct nf_conntrack_l4proto *l4proto,
struct sk_buff *skb,
- unsigned int dataoff)
+ unsigned int dataoff, u32 __hash)
{
struct nf_conn *ct;
struct nf_conn_help *help;
@@ -678,7 +723,8 @@ init_conntrack(struct net *net, struct nf_conn *tmpl,
return NULL;
}

- ct = nf_conntrack_alloc(net, zone, tuple, &repl_tuple, GFP_ATOMIC);
+ ct = __nf_conntrack_alloc(net, zone, tuple, &repl_tuple, GFP_ATOMIC,
+ __hash);
if (IS_ERR(ct)) {
pr_debug("Can't allocate conntrack.\n");
return (struct nf_conntrack_tuple_hash *)ct;
@@ -755,6 +801,7 @@ resolve_normal_ct(struct net *net, struct nf_conn *tmpl,
struct nf_conntrack_tuple_hash *h;
struct nf_conn *ct;
u16 zone = tmpl ? nf_ct_zone(tmpl) : NF_CT_DEFAULT_ZONE;
+ u32 __hash;

if (!nf_ct_get_tuple(skb, skb_network_offset(skb),
dataoff, l3num, protonum, &tuple, l3proto,
@@ -764,10 +811,11 @@ resolve_normal_ct(struct net *net, struct nf_conn *tmpl,
}

/* look for tuple match */
- h = nf_conntrack_find_get(net, zone, &tuple);
+ __hash = ____hash_conntrack(&tuple, zone);
+ h = __nf_conntrack_find_get(net, zone, &tuple, __hash);
if (!h) {
h = init_conntrack(net, tmpl, &tuple, l3proto, l4proto,
- skb, dataoff);
+ skb, dataoff, __hash);
if (!h)
return NULL;
if (IS_ERR(h))
@@ -1307,8 +1355,7 @@ int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp)
ct = nf_ct_tuplehash_to_ctrack(h);
hlist_nulls_del_rcu(&h->hnnode);
bucket = __hash_conntrack(&h->tuple, nf_ct_zone(ct),
- hashsize,
- nf_conntrack_hash_rnd);
+ hashsize);
hlist_nulls_add_head_rcu(&h->hnnode, &hash[bucket]);
}
}


2010-08-17 08:30:24

by Eric Dumazet

Subject: Re: [PATCH] netfilter: save the hash of the tuple in the original direction for later use

On Tuesday 17 August 2010 at 15:16 +0800, Changli Gao wrote:
> Since we don't change the tuple in the original direction, we can save its
> hash in ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev for __nf_conntrack_confirm()
> to use.
>
> __hash_conntrack() is split into two steps: ____hash_conntrack() is used
> to get the raw hash, and __hash_bucket() is used to get the bucket id.
>
> In the SYN-flood case, early_drop() doesn't need to recompute the hash.
>
> Signed-off-by: Changli Gao <[email protected]>
> ---
> net/netfilter/nf_conntrack_core.c | 117 ++++++++++++++++++++++++++------------
> 1 file changed, 82 insertions(+), 35 deletions(-)
> diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
> index df3eedb..947f0a9 100644
> --- a/net/netfilter/nf_conntrack_core.c
> +++ b/net/netfilter/nf_conntrack_core.c
> @@ -65,14 +65,23 @@ EXPORT_SYMBOL_GPL(nf_conntrack_max);
> DEFINE_PER_CPU(struct nf_conn, nf_conntrack_untracked);
> EXPORT_PER_CPU_SYMBOL(nf_conntrack_untracked);
>
> -static int nf_conntrack_hash_rnd_initted;
> -static unsigned int nf_conntrack_hash_rnd;
> -
> -static u_int32_t __hash_conntrack(const struct nf_conntrack_tuple *tuple,
> - u16 zone, unsigned int size, unsigned int rnd)
> +static u32 ____hash_conntrack(const struct nf_conntrack_tuple *tuple, u16 zone)
> {
> unsigned int n;
> u_int32_t h;
> + static int rnd_initted;
> + static unsigned int rnd;
> + static DEFINE_SPINLOCK(rnd_lock);
> +
> + if (unlikely(!rnd_initted)) {
> + spin_lock_bh(&rnd_lock);
> + if (!rnd_initted) {
> + get_random_bytes(&rnd, sizeof(rnd));
> + wmb();
> + rnd_initted = 1;
> + }
> + spin_unlock_bh(&rnd_lock);
> + }
>

Three variables ?

static atomic_t rnd __read_mostly;

if (unlikely(!atomic_read(&rnd))) {
        unsigned int val;

        get_random_bytes(&val, sizeof(val));
        if (!val)
                val = 1;
        atomic_cmpxchg(&rnd, 0, val);
}
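
For reference, a minimal sketch (illustrative only, not part of the submitted
patch) of how this single-variable initialisation could be folded into
____hash_conntrack(). The jhash2() tail is the one from the patch above; the
computation of n is written from memory of the surrounding code and is not
shown in the hunk, so treat it as an assumption:

static u32 ____hash_conntrack(const struct nf_conntrack_tuple *tuple, u16 zone)
{
        static atomic_t rnd __read_mostly;      /* 0 means "not seeded yet" */
        unsigned int n;

        if (unlikely(!atomic_read(&rnd))) {
                unsigned int val;

                get_random_bytes(&val, sizeof(val));
                if (!val)
                        val = 1;                /* never store 0, or we would re-seed forever */
                atomic_cmpxchg(&rnd, 0, val);   /* first writer wins; later ones are no-ops */
        }

        /* The direction must be ignored, so we hash everything up to the
         * destination ports (which is a multiple of 4) and treat the last
         * three bytes manually, as before. */
        n = (sizeof(tuple->src) + sizeof(tuple->dst.u3)) / sizeof(u32);
        return jhash2((u32 *)tuple, n,
                      zone ^ (u32)atomic_read(&rnd) ^
                      (((__force __u16)tuple->dst.u.all << 16) |
                       tuple->dst.protonum));
}

Whether that second atomic_read() should instead be cached in a local is
exactly the question discussed in the follow-ups below.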

2010-08-17 08:47:10

by Changli Gao

Subject: Re: [PATCH] netfilter: save the hash of the tuple in the original direction for later use

On Tue, Aug 17, 2010 at 4:30 PM, Eric Dumazet <[email protected]> wrote:
>
> Three variables ?
>
> static atomic_t rnd __read_mostly;
>
> if (unlikely(!atomic_read(&rnd))) {
>         unsigned int val;
>
>         get_random_bytes(&val, sizeof(val));
>         if (!val)
>                 val = 1;
>         atomic_cmpxchg(&rnd, 0, val);
> }
>


Good idea. However, atomic_t is a volatile variable, and it prevents ILP,
so I think it may hurt the likely case. cmpxchg() is an atomic operation,
so is the code below better?

static unsigned long rnd __read_mostly;

if (unlikely(!rnd)) {
        unsigned long val;

        get_random_bytes(&val, sizeof(val));
        if (!val)
                val = 1;
        cmpxchg(&rnd, 0, val);
}

Thanks.

--
Regards,
Changli Gao([email protected])

2010-08-17 09:02:02

by Eric Dumazet

Subject: Re: [PATCH] netfilter: save the hash of the tuple in the original direction for later use

On Tuesday 17 August 2010 at 16:46 +0800, Changli Gao wrote:
> On Tue, Aug 17, 2010 at 4:30 PM, Eric Dumazet <[email protected]> wrote:
> >
> > Three variables ?
> >
> > static atomic_t rnd __read_mostly;
> >
> > if (unlikely(!atomic_read(&rnd))) {
> >         unsigned int val;
> >
> >         get_random_bytes(&val, sizeof(val));
> >         if (!val)
> >                 val = 1;
> >         atomic_cmpxchg(&rnd, 0, val);
> > }
> >
>
>
> Good idea. However, atomic_t is a volatile variable, and it prevents ILP,
> so I think it may hurt the likely case. cmpxchg() is an atomic operation,
> so is the code below better?
>

I am not sure what you mean by ILP.

On x86, reading the same memory location twice is faster than reading it,
storing it in a temporary variable (on the stack), and then reading it back
from the stack.

> static unsigned long rnd __read_mostly;
>
> if (unlikely(!rnd)) {
>         unsigned long val;
>
>         get_random_bytes(&val, sizeof(val));
>         if (!val)
>                 val = 1;
>         cmpxchg(&rnd, 0, val);
> }
>
> Thanks.
>

I am not sure we must use a long (we really only need 4 bytes), and the last
time I tried to use cmpxchg(), I was told it was not available on
all arches.

But seeing it used in kernel/pid.c, maybe that's not true anymore (that is,
__HAVE_ARCH_CMPXCHG is always defined to 1).

Since it's a recent change (in kernel/pid.c), I would wait a bit and see
if an arch maintainer complains ;)

2010-08-17 09:32:49

by Changli Gao

Subject: Re: [PATCH] netfilter: save the hash of the tuple in the original direction for later use

On Tue, Aug 17, 2010 at 5:01 PM, Eric Dumazet <[email protected]> wrote:
On Tuesday 17 August 2010 at 16:46 +0800, Changli Gao wrote:
>> On Tue, Aug 17, 2010 at 4:30 PM, Eric Dumazet <[email protected]> wrote:
>> >
>> > Three variables ?
>> >
>> > static atomic_t rnd __read_mostly;
>> >
>> > if (unlikely(!atomic_read(&rnd))) {
>> >         unsigned int val;
>> >
>> >         get_random_bytes(&val, sizeof(val));
>> >         if (!val)
>> >                 val = 1;
>> >         atomic_cmpxchg(&rnd, 0, val);
>> > }
>> >
>>
>>
>> Good idea. However, atomic_t is a volatile variable, and it prevents ILP,
>> so I think it may hurt the likely case. cmpxchg() is an atomic operation,
>> so is the code below better?
>>
>
> I am not sure what you mean by ILP.

I mean http://en.wikipedia.org/wiki/Instruction_level_parallelism

and volatile suppresses compiler optimizations.

>
> On x86, reading the same memory location twice is faster than reading it,
> storing it in a temporary variable (on the stack), and then reading it back
> from the stack.

Thanks for this info. But the compiler may store it in a register. In
fact, we can't control the behavior of the compiler in C.

>
>> static unsigned long rnd __read_mostly;
>>
>> if (unlikely(!rnd)) {
>>         unsigned long val;
>>
>>         get_random_bytes(&val, sizeof(val));
>>         if (!val)
>>                 val = 1;
>>         cmpxchg(&rnd, 0, val);
>> }
>>
>> Thanks.
>>
>
> I am not sure we must use a long (we really only need 4 bytes),

Yea, 4 bytes is enough.

> and the last
> time I tried to use cmpxchg(), I was told it was not available on
> all arches.
>
> But seeing it used in kernel/pid.c, maybe that's not true anymore (that is,
> __HAVE_ARCH_CMPXCHG is always defined to 1).
>
> Since it's a recent change (in kernel/pid.c), I would wait a bit and see
> if an arch maintainer complains ;)
>

Thanks. I saw it. There are two definitions of cmpxchg(), in these files:

include/asm-generic/cmpxchg.h

include/asm-generic/system.h

static inline unsigned long __cmpxchg(volatile unsigned long *m,
                                      unsigned long old, unsigned long new)
{
        unsigned long retval;
        unsigned long flags;

        local_irq_save(flags);
        retval = *m;
        if (retval == old)
                *m = new;
        local_irq_restore(flags);
        return retval;
}

#define cmpxchg(ptr, o, n) \
        ((__typeof__(*(ptr))) __cmpxchg((unsigned long *)(ptr), \
                                        (unsigned long)(o), \
                                        (unsigned long)(n)))

This one seems buggy when used with variables that are not unsigned long. And
in kernel/pid.c, the last_pid variable is an int.
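
To make the concern concrete, here is a contrived userspace illustration
(illustrative only, not kernel code; fake_cmpxchg and the struct are made up
for the example). The generic macro casts the int pointer to unsigned long *,
so on a 64-bit machine the emulated compare-and-swap reads and writes 8 bytes
even though the caller only owns 4:

#include <stdio.h>

/* Simplified stand-in for the asm-generic __cmpxchg() quoted above,
 * minus the irq disabling, which is irrelevant to the width problem. */
static unsigned long fake_cmpxchg(unsigned long *m, unsigned long old,
                                  unsigned long new)
{
        unsigned long retval = *m;      /* 8-byte load on a 64-bit machine */

        if (retval == old)
                *m = new;               /* 8-byte store */
        return retval;
}

int main(void)
{
        /* An int followed by unrelated data, the way a plain int variable
         * like last_pid may be followed by something else in memory. */
        struct { int val; int neighbour; } s = { 0, 42 };

        /* What cmpxchg(&s.val, 0, 1) expands to with the generic macro:
         * the int pointer is cast to unsigned long *. */
        fake_cmpxchg((unsigned long *)&s.val, 0, 1);

        /* On a 64-bit machine the comparison sees s.neighbour's bytes as
         * part of the old value, so the swap spuriously fails even though
         * s.val == 0; had it succeeded, the 8-byte store would also have
         * overwritten s.neighbour. */
        printf("val=%d neighbour=%d\n", s.val, s.neighbour);
        return 0;
}

On a 32-bit arch unsigned long is 4 bytes and the cast is harmless, which is
probably why this goes unnoticed.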

--
Regards,
Changli Gao([email protected])

2010-08-19 04:00:07

by Changli Gao

Subject: Re: [PATCH] netfilter: save the hash of the tuple in the original direction for later use

On Tue, Aug 17, 2010 at 5:01 PM, Eric Dumazet <[email protected]> wrote:
>
> I am not sure we must use a long (we really only need 4 bytes), and the last
> time I tried to use cmpxchg(), I was told it was not available on
> all arches.
>
> But seeing it used in kernel/pid.c, maybe that's not true anymore (that is,
> __HAVE_ARCH_CMPXCHG is always defined to 1).
>
> Since it's a recent change (in kernel/pid.c), I would wait a bit and see
> if an arch maintainer complains ;)
>
>

I searched the code and found that ext4, btrfs, lockdep, perf and trace
all use cmpxchg(). And after this patch series
(http://linux.derkeiler.com/Mailing-Lists/Kernel/2007-08/msg05032.html)
was merged, maybe cmpxchg() is generic now. Is there an SMP processor that
doesn't support cmpxchg()?

--
Regards,
Changli Gao([email protected])

2010-08-19 12:35:55

by Mathieu Desnoyers

Subject: Re: [PATCH] netfilter: save the hash of the tuple in the original direction for later use

* Changli Gao ([email protected]) wrote:
> On Tue, Aug 17, 2010 at 5:01 PM, Eric Dumazet <[email protected]> wrote:
> >
> > I am not sure we must use a long (we really only need 4 bytes), and the last
> > time I tried to use cmpxchg(), I was told it was not available on
> > all arches.
> >
> > But seeing it used in kernel/pid.c, maybe that's not true anymore (that is,
> > __HAVE_ARCH_CMPXCHG is always defined to 1).
> >
> > Since it's a recent change (in kernel/pid.c), I would wait a bit and see
> > if an arch maintainer complains ;)
> >
> >
>
> I searched the code and found that ext4, btrfs, lockdep, perf and trace
> all use cmpxchg(). And after this patch series
> (http://linux.derkeiler.com/Mailing-Lists/Kernel/2007-08/msg05032.html)
> was merged, maybe cmpxchg() is generic now. Is there an SMP processor that
> doesn't support cmpxchg()?

I made sure it was available on all architectures with that patchset,
with the intended goal of using it (cmpxchg, cmpxchg_local for 32/64-bit
and cmpxchg64, cmpxchg64_local for 64-bit) in arch-agnostic tracer code.

I added a cmpxchg emulated with irqs off for uniprocessor-only
architectures that did not have CAS support. sparc32 moved to a
hashed locked scheme for its atomic operations long ago, which makes the
full 32 bits available.

So, all in all, I'd be surprised if an architecture would lack
cmpxchg() today. It might be possible that a new architecture that just
came in would not have taken care of deploying cmpxchg, but that should
be easily fixable.

Please see Documentation/atomic_ops.txt for details.

Thanks,

Mathieu

>
> --
> Regards,
> Changli Gao([email protected])

--
Mathieu Desnoyers
Operating System Efficiency R&D Consultant
EfficiOS Inc.
http://www.efficios.com

2010-08-19 15:31:57

by Eric Dumazet

Subject: Re: [PATCH] netfilter: save the hash of the tuple in the original direction for later use

On Thursday 19 August 2010 at 08:35 -0400, Mathieu Desnoyers wrote:

> I made sure it was available on all architectures with that patchset,
> with the intended goal of using it (cmpxchg, cmpxchg_local for 32/64-bit
> and cmpxchg64, cmpxchg64_local for 64-bit) in arch-agnostic tracer code.
>
> I added a cmpxchg emulated with irqs off for uniprocessor-only
> architectures that did not have CAS support. sparc32 moved to a
> hashed locked scheme for its atomic operations long ago, which makes the
> full 32 bits available.
>
> So, all in all, I'd be surprised if an architecture would lack
> cmpxchg() today. It might be possible that a new architecture that just
> came in would not have taken care of deploying cmpxchg, but that should
> be easily fixable.

That's pretty cool, thanks for the clarification!