Let's look at destroy_conntrack:
hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
...
nf_conntrack_free(ct)
kmem_cache_free(net->ct.nf_conntrack_cachep, ct);
net->ct.nf_conntrack_cachep is created with SLAB_DESTROY_BY_RCU.
The hash is protected by RCU, so readers look up conntracks without
locks.
A conntrack is removed from the hash, but at this moment a few readers
can still be using it. Then this conntrack is released and another
thread creates a conntrack at the same address and with an equal tuple.
After this a reader starts to validate the conntrack:
* It's not dying, because a new conntrack was created.
* nf_ct_tuple_equal() returns true.
But this conntrack is not fully initialized yet, so it cannot be used by two
threads concurrently. In this case a BUG_ON may be triggered from
nf_nat_setup_info().
Florian Westphal suggested checking the confirmed bit too. I think that's
right.
task 1                        task 2                        task 3
nf_conntrack_find_get
 ____nf_conntrack_find
                              destroy_conntrack
                               hlist_nulls_del_rcu
                               nf_conntrack_free
                               kmem_cache_free
                                                            __nf_conntrack_alloc
                                                             kmem_cache_alloc
                                                             memset(&ct->tuplehash[IP_CT_DIR_MAX],
 if (nf_ct_is_dying(ct))
 if (!nf_ct_tuple_equal()
I'm not sure that I have ever seen this race condition in real life.
Currently we are investigating a bug which is reproduced on a few nodes.
In our case one conntrack is initialized by a few tasks concurrently;
we don't have any other explanation for this.
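The BUG at nf_nat_core.c:322 in the oops below is, as far as I can tell, the
sanity check near the top of nf_nat_setup_info() in kernels of this vintage;
it is shown here only as an illustration and is not part of this patch:

	unsigned int
	nf_nat_setup_info(struct nf_conn *ct,
			  const struct nf_nat_range *range,
			  enum nf_nat_manip_type maniptype)
	{
		...
		/* a second task racing through NAT setup on the same,
		 * not yet confirmed conntrack trips this check */
		BUG_ON(nf_nat_initialized(ct, maniptype));
		...
	}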
<2>[46267.083061] kernel BUG at net/ipv4/netfilter/nf_nat_core.c:322!
...
<4>[46267.083951] RIP: 0010:[<ffffffffa01e00a4>] [<ffffffffa01e00a4>] nf_nat_setup_info+0x564/0x590 [nf_nat]
...
<4>[46267.085549] Call Trace:
<4>[46267.085622] [<ffffffffa023421b>] alloc_null_binding+0x5b/0xa0 [iptable_nat]
<4>[46267.085697] [<ffffffffa02342bc>] nf_nat_rule_find+0x5c/0x80 [iptable_nat]
<4>[46267.085770] [<ffffffffa0234521>] nf_nat_fn+0x111/0x260 [iptable_nat]
<4>[46267.085843] [<ffffffffa0234798>] nf_nat_out+0x48/0xd0 [iptable_nat]
<4>[46267.085919] [<ffffffff814841b9>] nf_iterate+0x69/0xb0
<4>[46267.085991] [<ffffffff81494e70>] ? ip_finish_output+0x0/0x2f0
<4>[46267.086063] [<ffffffff81484374>] nf_hook_slow+0x74/0x110
<4>[46267.086133] [<ffffffff81494e70>] ? ip_finish_output+0x0/0x2f0
<4>[46267.086207] [<ffffffff814b5890>] ? dst_output+0x0/0x20
<4>[46267.086277] [<ffffffff81495204>] ip_output+0xa4/0xc0
<4>[46267.086346] [<ffffffff814b65a4>] raw_sendmsg+0x8b4/0x910
<4>[46267.086419] [<ffffffff814c10fa>] inet_sendmsg+0x4a/0xb0
<4>[46267.086491] [<ffffffff814459aa>] ? sock_update_classid+0x3a/0x50
<4>[46267.086562] [<ffffffff81444d67>] sock_sendmsg+0x117/0x140
<4>[46267.086638] [<ffffffff8151997b>] ? _spin_unlock_bh+0x1b/0x20
<4>[46267.086712] [<ffffffff8109d370>] ? autoremove_wake_function+0x0/0x40
<4>[46267.086785] [<ffffffff81495e80>] ? do_ip_setsockopt+0x90/0xd80
<4>[46267.086858] [<ffffffff8100be0e>] ? call_function_interrupt+0xe/0x20
<4>[46267.086936] [<ffffffff8118cb10>] ? ub_slab_ptr+0x20/0x90
<4>[46267.087006] [<ffffffff8118cb10>] ? ub_slab_ptr+0x20/0x90
<4>[46267.087081] [<ffffffff8118f2e8>] ? kmem_cache_alloc+0xd8/0x1e0
<4>[46267.087151] [<ffffffff81445599>] sys_sendto+0x139/0x190
<4>[46267.087229] [<ffffffff81448c0d>] ? sock_setsockopt+0x16d/0x6f0
<4>[46267.087303] [<ffffffff810efa47>] ? audit_syscall_entry+0x1d7/0x200
<4>[46267.087378] [<ffffffff810ef795>] ? __audit_syscall_exit+0x265/0x290
<4>[46267.087454] [<ffffffff81474885>] ? compat_sys_setsockopt+0x75/0x210
<4>[46267.087531] [<ffffffff81474b5f>] compat_sys_socketcall+0x13f/0x210
<4>[46267.087607] [<ffffffff8104dea3>] ia32_sysret+0x0/0x5
<4>[46267.087676] Code: 91 20 e2 01 75 29 48 89 de 4c 89 f7 e8 56 fa ff ff 85 c0 0f 84 68 fc ff ff 0f b6 4d c6 41 8b 45 00 e9 4d fb ff ff e8 7c 19 e9 e0 <0f> 0b eb fe f6 05 17 91 20 e2 80 74 ce 80 3d 5f 2e 00 00 00 74
<1>[46267.088023] RIP [<ffffffffa01e00a4>] nf_nat_setup_info+0x564/0x590
Cc: Florian Westphal <[email protected]>
Cc: Pablo Neira Ayuso <[email protected]>
Cc: Patrick McHardy <[email protected]>
Cc: Jozsef Kadlecsik <[email protected]>
Cc: "David S. Miller" <[email protected]>
Cc: Cyrill Gorcunov <[email protected]>
Signed-off-by: Andrey Vagin <[email protected]>
---
net/netfilter/nf_conntrack_core.c | 6 +++++-
1 file changed, 5 insertions(+), 1 deletion(-)
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 43549eb..7a34bb2 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -387,8 +387,12 @@ begin:
!atomic_inc_not_zero(&ct->ct_general.use)))
h = NULL;
else {
+ /* A conntrack can be recreated with the equal tuple,
+ * so we need to check that the conntrack is initialized
+ */
if (unlikely(!nf_ct_tuple_equal(tuple, &h->tuple) ||
- nf_ct_zone(ct) != zone)) {
+ nf_ct_zone(ct) != zone) ||
+ !nf_ct_is_confirmed(ct)) {
nf_ct_put(ct);
goto begin;
}
--
1.8.4.2
On 01/07/2014 02:31 PM, Andrey Vagin wrote:
> Lets look at destroy_conntrack:
>
> hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
> ...
> nf_conntrack_free(ct)
> kmem_cache_free(net->ct.nf_conntrack_cachep, ct);
>
> net->ct.nf_conntrack_cachep is created with SLAB_DESTROY_BY_RCU.
>
> The hash is protected by rcu, so readers look up conntracks without
> locks.
> A conntrack is removed from the hash, but in this moment a few readers
> still can use the conntrack. Then this conntrack is released and another
> thread creates conntrack with the same address and the equal tuple.
> After this a reader starts to validate the conntrack:
> * It's not dying, because a new conntrack was created
> * nf_ct_tuple_equal() returns true.
>
> But this conntrack is not initialized yet, so it can not be used by two
> threads concurrently. In this case BUG_ON may be triggered from
> nf_nat_setup_info().
>
> Florian Westphal suggested to check the confirm bit too. I think it's
> right.
>
> task 1 task 2 task 3
> nf_conntrack_find_get
> ____nf_conntrack_find
> destroy_conntrack
> hlist_nulls_del_rcu
> nf_conntrack_free
> kmem_cache_free
> __nf_conntrack_alloc
> kmem_cache_alloc
> memset(&ct->tuplehash[IP_CT_DIR_MAX],
> if (nf_ct_is_dying(ct))
> if (!nf_ct_tuple_equal()
>
> I'm not sure, that I have ever seen this race condition in a real life.
> Currently we are investigating a bug, which is reproduced on a few node.
> In our case one conntrack is initialized from a few tasks concurrently,
> we don't have any other explanation for this.
I would add: this strange traffic is generated by botnet clients (/etc/atdd and /boot/.IptabLes).
Several threads send a lot of identical TCP packets via a RAW socket.
https://bugzilla.openvz.org/show_bug.cgi?id=2851
> <2>[46267.083061] kernel BUG at net/ipv4/netfilter/nf_nat_core.c:322!
> ...
> <4>[46267.083951] RIP: 0010:[<ffffffffa01e00a4>] [<ffffffffa01e00a4>] nf_nat_setup_info+0x564/0x590 [nf_nat]
> ...
> <4>[46267.085549] Call Trace:
> <4>[46267.085622] [<ffffffffa023421b>] alloc_null_binding+0x5b/0xa0 [iptable_nat]
> <4>[46267.085697] [<ffffffffa02342bc>] nf_nat_rule_find+0x5c/0x80 [iptable_nat]
> <4>[46267.085770] [<ffffffffa0234521>] nf_nat_fn+0x111/0x260 [iptable_nat]
> <4>[46267.085843] [<ffffffffa0234798>] nf_nat_out+0x48/0xd0 [iptable_nat]
> <4>[46267.085919] [<ffffffff814841b9>] nf_iterate+0x69/0xb0
> <4>[46267.085991] [<ffffffff81494e70>] ? ip_finish_output+0x0/0x2f0
> <4>[46267.086063] [<ffffffff81484374>] nf_hook_slow+0x74/0x110
> <4>[46267.086133] [<ffffffff81494e70>] ? ip_finish_output+0x0/0x2f0
> <4>[46267.086207] [<ffffffff814b5890>] ? dst_output+0x0/0x20
> <4>[46267.086277] [<ffffffff81495204>] ip_output+0xa4/0xc0
> <4>[46267.086346] [<ffffffff814b65a4>] raw_sendmsg+0x8b4/0x910
> <4>[46267.086419] [<ffffffff814c10fa>] inet_sendmsg+0x4a/0xb0
> <4>[46267.086491] [<ffffffff814459aa>] ? sock_update_classid+0x3a/0x50
> <4>[46267.086562] [<ffffffff81444d67>] sock_sendmsg+0x117/0x140
> <4>[46267.086638] [<ffffffff8151997b>] ? _spin_unlock_bh+0x1b/0x20
> <4>[46267.086712] [<ffffffff8109d370>] ? autoremove_wake_function+0x0/0x40
> <4>[46267.086785] [<ffffffff81495e80>] ? do_ip_setsockopt+0x90/0xd80
> <4>[46267.086858] [<ffffffff8100be0e>] ? call_function_interrupt+0xe/0x20
> <4>[46267.086936] [<ffffffff8118cb10>] ? ub_slab_ptr+0x20/0x90
> <4>[46267.087006] [<ffffffff8118cb10>] ? ub_slab_ptr+0x20/0x90
> <4>[46267.087081] [<ffffffff8118f2e8>] ? kmem_cache_alloc+0xd8/0x1e0
> <4>[46267.087151] [<ffffffff81445599>] sys_sendto+0x139/0x190
> <4>[46267.087229] [<ffffffff81448c0d>] ? sock_setsockopt+0x16d/0x6f0
> <4>[46267.087303] [<ffffffff810efa47>] ? audit_syscall_entry+0x1d7/0x200
> <4>[46267.087378] [<ffffffff810ef795>] ? __audit_syscall_exit+0x265/0x290
> <4>[46267.087454] [<ffffffff81474885>] ? compat_sys_setsockopt+0x75/0x210
> <4>[46267.087531] [<ffffffff81474b5f>] compat_sys_socketcall+0x13f/0x210
> <4>[46267.087607] [<ffffffff8104dea3>] ia32_sysret+0x0/0x5
> <4>[46267.087676] Code: 91 20 e2 01 75 29 48 89 de 4c 89 f7 e8 56 fa ff ff 85 c0 0f 84 68 fc ff ff 0f b6 4d c6 41 8b 45 00 e9 4d fb ff ff e8 7c 19 e9 e0 <0f> 0b eb fe f6 05 17 91 20 e2 80 74 ce 80 3d 5f 2e 00 00 00 74
> <1>[46267.088023] RIP [<ffffffffa01e00a4>] nf_nat_setup_info+0x564/0x590
>
> Cc: Florian Westphal <[email protected]>
> Cc: Pablo Neira Ayuso <[email protected]>
> Cc: Patrick McHardy <[email protected]>
> Cc: Jozsef Kadlecsik <[email protected]>
> Cc: "David S. Miller" <[email protected]>
> Cc: Cyrill Gorcunov <[email protected]>
> Signed-off-by: Andrey Vagin <[email protected]>
> ---
> net/netfilter/nf_conntrack_core.c | 6 +++++-
> 1 file changed, 5 insertions(+), 1 deletion(-)
>
> diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
> index 43549eb..7a34bb2 100644
> --- a/net/netfilter/nf_conntrack_core.c
> +++ b/net/netfilter/nf_conntrack_core.c
> @@ -387,8 +387,12 @@ begin:
> !atomic_inc_not_zero(&ct->ct_general.use)))
> h = NULL;
> else {
> + /* A conntrack can be recreated with the equal tuple,
> + * so we need to check that the conntrack is initialized
> + */
> if (unlikely(!nf_ct_tuple_equal(tuple, &h->tuple) ||
> - nf_ct_zone(ct) != zone)) {
> + nf_ct_zone(ct) != zone) ||
> + !nf_ct_is_confirmed(ct)) {
> nf_ct_put(ct);
> goto begin;
> }
>
On Tue, 2014-01-07 at 14:31 +0400, Andrey Vagin wrote:
> Lets look at destroy_conntrack:
>
> hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
> ...
> nf_conntrack_free(ct)
> kmem_cache_free(net->ct.nf_conntrack_cachep, ct);
>
> net->ct.nf_conntrack_cachep is created with SLAB_DESTROY_BY_RCU.
>
> The hash is protected by rcu, so readers look up conntracks without
> locks.
> A conntrack is removed from the hash, but in this moment a few readers
> still can use the conntrack. Then this conntrack is released and another
> thread creates conntrack with the same address and the equal tuple.
> After this a reader starts to validate the conntrack:
> * It's not dying, because a new conntrack was created
> * nf_ct_tuple_equal() returns true.
>
> But this conntrack is not initialized yet, so it can not be used by two
> threads concurrently. In this case BUG_ON may be triggered from
> nf_nat_setup_info().
>
> Florian Westphal suggested to check the confirm bit too. I think it's
> right.
>
> task 1 task 2 task 3
> nf_conntrack_find_get
> ____nf_conntrack_find
> destroy_conntrack
> hlist_nulls_del_rcu
> nf_conntrack_free
> kmem_cache_free
> __nf_conntrack_alloc
> kmem_cache_alloc
> memset(&ct->tuplehash[IP_CT_DIR_MAX],
> if (nf_ct_is_dying(ct))
> if (!nf_ct_tuple_equal()
>
> I'm not sure, that I have ever seen this race condition in a real life.
> Currently we are investigating a bug, which is reproduced on a few node.
> In our case one conntrack is initialized from a few tasks concurrently,
> we don't have any other explanation for this.
>
> Cc: Florian Westphal <[email protected]>
> Cc: Pablo Neira Ayuso <[email protected]>
> Cc: Patrick McHardy <[email protected]>
> Cc: Jozsef Kadlecsik <[email protected]>
> Cc: "David S. Miller" <[email protected]>
> Cc: Cyrill Gorcunov <[email protected]>
> Signed-off-by: Andrey Vagin <[email protected]>
> ---
> net/netfilter/nf_conntrack_core.c | 6 +++++-
> 1 file changed, 5 insertions(+), 1 deletion(-)
>
> diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
> index 43549eb..7a34bb2 100644
> --- a/net/netfilter/nf_conntrack_core.c
> +++ b/net/netfilter/nf_conntrack_core.c
> @@ -387,8 +387,12 @@ begin:
> !atomic_inc_not_zero(&ct->ct_general.use)))
> h = NULL;
> else {
> + /* A conntrack can be recreated with the equal tuple,
> + * so we need to check that the conntrack is initialized
> + */
> if (unlikely(!nf_ct_tuple_equal(tuple, &h->tuple) ||
> - nf_ct_zone(ct) != zone)) {
> + nf_ct_zone(ct) != zone) ||
> + !nf_ct_is_confirmed(ct)) {
> nf_ct_put(ct);
> goto begin;
> }
I do not think this is the right way to fix this problem (if said
problem is confirmed)
Remember the rule about SLAB_DESTROY_BY_RCU :
When a struct is freed, then reused, it's important to set its refcnt
(from 0 to 1) only when the structure is fully ready for use.
If a lookup finds a structure which is not yet set up, the
atomic_inc_not_zero() will fail.
Take a look at sk_clone_lock() and Documentation/RCU/rculist_nulls.txt
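The lookup discipline described there boils down to the following (a minimal
sketch with generic names; lockless_lookup(), put_ref() and obj->key are
placeholders, not the conntrack ones):

	rcu_read_lock();
begin:
	obj = lockless_lookup(key);
	if (obj) {
		/* refcnt 0 means the object is being freed or reused */
		if (!atomic_inc_not_zero(&obj->refcnt))
			goto begin;
		/* the slab may have recycled the object for another key
		 * while we were taking the reference, so re-check the key
		 * and drop the reference + retry on mismatch */
		if (obj->key != key) {
			put_ref(obj);
			goto begin;
		}
	}
	rcu_read_unlock();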
Eric Dumazet <[email protected]> wrote:
> > diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
> > index 43549eb..7a34bb2 100644
> > --- a/net/netfilter/nf_conntrack_core.c
> > +++ b/net/netfilter/nf_conntrack_core.c
> > @@ -387,8 +387,12 @@ begin:
> > !atomic_inc_not_zero(&ct->ct_general.use)))
> > h = NULL;
> > else {
> > + /* A conntrack can be recreated with the equal tuple,
> > + * so we need to check that the conntrack is initialized
> > + */
> > if (unlikely(!nf_ct_tuple_equal(tuple, &h->tuple) ||
> > - nf_ct_zone(ct) != zone)) {
> > + nf_ct_zone(ct) != zone) ||
> > + !nf_ct_is_confirmed(ct)) {
> > nf_ct_put(ct);
> > goto begin;
> > }
>
> I do not think this is the right way to fix this problem (if said
> problem is confirmed)
>
> Remember the rule about SLAB_DESTROY_BY_RCU :
>
> When a struct is freed, then reused, its important to set the its refcnt
> (from 0 to 1) only when the structure is fully ready for use.
>
> If a lookup finds a structure which is not yet setup, the
> atomic_inc_not_zero() will fail.
Indeed. But, the structure itself might be ready (or rather,
can be ready since the allocation side will set the refcount to one
after doing the initial work, such as zapping old ->status flags and
setting tuple information).
The problem is with nat extension area stored in the ct->ext area.
This extension area is preallocated but the snat/dnat action
information is only set up after the ct (or rather, the skb that grabbed
a reference to the nf_conn entry) traverses nat pre/postrouting.
This will also set up a null-binding when no matching SNAT/DNAT/MASQUERADE
rule existed.
The manipulations of the skb->nfct->ext nat area are performed without
a lock. Concurrent access is supposedly impossible as the conntrack
should not (yet) be in the hash table.
The confirmed bit is set right before we insert the conntrack into
the hash table (after we traversed rules, ct is ready to be
'published').
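Roughly, the publish step looks like this (condensed from
__nf_conntrack_confirm() as I read it, so treat it as a sketch rather than a
verbatim quote):

	/* timer runs relative to confirmation time */
	ct->timeout.expires += jiffies;
	add_timer(&ct->timeout);
	atomic_inc(&ct->ct_general.use);
	ct->status |= IPS_CONFIRMED;
	/* hash insertion must come after setting the CONFIRMED bit,
	 * since lockless readers may see the entry right away */
	__nf_conntrack_hash_insert(ct, hash, repl_hash);
	spin_unlock_bh(&nf_conntrack_lock);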
i.e. when the confirmed bit is NOT set we should not be 'seeing' the nf_conn
struct when we perform the lookup, as it should still be sitting on the
'unconfirmed' list, being invisible to readers.
Does that explanation make sense to you?
Thanks for looking into this.
Let's look at destroy_conntrack:
hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
...
nf_conntrack_free(ct)
kmem_cache_free(net->ct.nf_conntrack_cachep, ct);
net->ct.nf_conntrack_cachep is created with SLAB_DESTROY_BY_RCU.
The hash is protected by RCU, so readers look up conntracks without
locks.
A conntrack is removed from the hash, but at this moment a few readers
can still be using it. Then this conntrack is released and another
thread creates a conntrack at the same address and with an equal tuple.
After this a reader starts to validate the conntrack:
* It's not dying, because a new conntrack was created.
* nf_ct_tuple_equal() returns true.
But this conntrack is not fully initialized yet, so it cannot be used by two
threads concurrently. In this case a BUG_ON may be triggered from
nf_nat_setup_info().
Florian Westphal suggested checking the confirmed bit too. I think that's
right.
task 1                        task 2                        task 3
nf_conntrack_find_get
 ____nf_conntrack_find
                              destroy_conntrack
                               hlist_nulls_del_rcu
                               nf_conntrack_free
                               kmem_cache_free
                                                            __nf_conntrack_alloc
                                                             kmem_cache_alloc
                                                             memset(&ct->tuplehash[IP_CT_DIR_MAX],
 if (nf_ct_is_dying(ct))
 if (!nf_ct_tuple_equal()
I'm not sure that I have ever seen this race condition in real life.
Currently we are investigating a bug which is reproduced on a few nodes.
In our case one conntrack is initialized by a few tasks concurrently;
we don't have any other explanation for this.
<2>[46267.083061] kernel BUG at net/ipv4/netfilter/nf_nat_core.c:322!
...
<4>[46267.083951] RIP: 0010:[<ffffffffa01e00a4>] [<ffffffffa01e00a4>] nf_nat_setup_info+0x564/0x590 [nf_nat]
...
<4>[46267.085549] Call Trace:
<4>[46267.085622] [<ffffffffa023421b>] alloc_null_binding+0x5b/0xa0 [iptable_nat]
<4>[46267.085697] [<ffffffffa02342bc>] nf_nat_rule_find+0x5c/0x80 [iptable_nat]
<4>[46267.085770] [<ffffffffa0234521>] nf_nat_fn+0x111/0x260 [iptable_nat]
<4>[46267.085843] [<ffffffffa0234798>] nf_nat_out+0x48/0xd0 [iptable_nat]
<4>[46267.085919] [<ffffffff814841b9>] nf_iterate+0x69/0xb0
<4>[46267.085991] [<ffffffff81494e70>] ? ip_finish_output+0x0/0x2f0
<4>[46267.086063] [<ffffffff81484374>] nf_hook_slow+0x74/0x110
<4>[46267.086133] [<ffffffff81494e70>] ? ip_finish_output+0x0/0x2f0
<4>[46267.086207] [<ffffffff814b5890>] ? dst_output+0x0/0x20
<4>[46267.086277] [<ffffffff81495204>] ip_output+0xa4/0xc0
<4>[46267.086346] [<ffffffff814b65a4>] raw_sendmsg+0x8b4/0x910
<4>[46267.086419] [<ffffffff814c10fa>] inet_sendmsg+0x4a/0xb0
<4>[46267.086491] [<ffffffff814459aa>] ? sock_update_classid+0x3a/0x50
<4>[46267.086562] [<ffffffff81444d67>] sock_sendmsg+0x117/0x140
<4>[46267.086638] [<ffffffff8151997b>] ? _spin_unlock_bh+0x1b/0x20
<4>[46267.086712] [<ffffffff8109d370>] ? autoremove_wake_function+0x0/0x40
<4>[46267.086785] [<ffffffff81495e80>] ? do_ip_setsockopt+0x90/0xd80
<4>[46267.086858] [<ffffffff8100be0e>] ? call_function_interrupt+0xe/0x20
<4>[46267.086936] [<ffffffff8118cb10>] ? ub_slab_ptr+0x20/0x90
<4>[46267.087006] [<ffffffff8118cb10>] ? ub_slab_ptr+0x20/0x90
<4>[46267.087081] [<ffffffff8118f2e8>] ? kmem_cache_alloc+0xd8/0x1e0
<4>[46267.087151] [<ffffffff81445599>] sys_sendto+0x139/0x190
<4>[46267.087229] [<ffffffff81448c0d>] ? sock_setsockopt+0x16d/0x6f0
<4>[46267.087303] [<ffffffff810efa47>] ? audit_syscall_entry+0x1d7/0x200
<4>[46267.087378] [<ffffffff810ef795>] ? __audit_syscall_exit+0x265/0x290
<4>[46267.087454] [<ffffffff81474885>] ? compat_sys_setsockopt+0x75/0x210
<4>[46267.087531] [<ffffffff81474b5f>] compat_sys_socketcall+0x13f/0x210
<4>[46267.087607] [<ffffffff8104dea3>] ia32_sysret+0x0/0x5
<4>[46267.087676] Code: 91 20 e2 01 75 29 48 89 de 4c 89 f7 e8 56 fa ff ff 85 c0 0f 84 68 fc ff ff 0f b6 4d c6 41 8b 45 00 e9 4d fb ff ff e8 7c 19 e9 e0 <0f> 0b eb fe f6 05 17 91 20 e2 80 74 ce 80 3d 5f 2e 00 00 00 74
<1>[46267.088023] RIP [<ffffffffa01e00a4>] nf_nat_setup_info+0x564/0x590
v2: move nf_ct_is_confirmed into the unlikely() annotation
Cc: Florian Westphal <[email protected]>
Cc: Pablo Neira Ayuso <[email protected]>
Cc: Patrick McHardy <[email protected]>
Cc: Jozsef Kadlecsik <[email protected]>
Cc: "David S. Miller" <[email protected]>
Cc: Cyrill Gorcunov <[email protected]>
Signed-off-by: Andrey Vagin <[email protected]>
---
net/netfilter/nf_conntrack_core.c | 6 +++++-
1 file changed, 5 insertions(+), 1 deletion(-)
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 43549eb..403f634 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -387,8 +387,12 @@ begin:
!atomic_inc_not_zero(&ct->ct_general.use)))
h = NULL;
else {
+ /* A conntrack can be recreated with the equal tuple,
+ * so we need to check that the conntrack is initialized
+ */
if (unlikely(!nf_ct_tuple_equal(tuple, &h->tuple) ||
- nf_ct_zone(ct) != zone)) {
+ nf_ct_zone(ct) != zone ||
+ !nf_ct_is_confirmed(ct))) {
nf_ct_put(ct);
goto begin;
}
--
1.8.4.2
On Tue, 2014-01-07 at 16:25 +0100, Florian Westphal wrote:
> Eric Dumazet <[email protected]> wrote:
> > > diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
> > > index 43549eb..7a34bb2 100644
> > > --- a/net/netfilter/nf_conntrack_core.c
> > > +++ b/net/netfilter/nf_conntrack_core.c
> > > @@ -387,8 +387,12 @@ begin:
> > > !atomic_inc_not_zero(&ct->ct_general.use)))
> > > h = NULL;
> > > else {
> > > + /* A conntrack can be recreated with the equal tuple,
> > > + * so we need to check that the conntrack is initialized
> > > + */
> > > if (unlikely(!nf_ct_tuple_equal(tuple, &h->tuple) ||
> > > - nf_ct_zone(ct) != zone)) {
> > > + nf_ct_zone(ct) != zone) ||
> > > + !nf_ct_is_confirmed(ct)) {
> > > nf_ct_put(ct);
> > > goto begin;
> > > }
> >
> > I do not think this is the right way to fix this problem (if said
> > problem is confirmed)
> >
> > Remember the rule about SLAB_DESTROY_BY_RCU :
> >
> > When a struct is freed, then reused, its important to set the its refcnt
> > (from 0 to 1) only when the structure is fully ready for use.
> >
> > If a lookup finds a structure which is not yet setup, the
> > atomic_inc_not_zero() will fail.
>
> Indeed. But, the structure itself might be ready (or rather,
> can be ready since the allocation side will set the refcount to one
> after doing the initial work, such as zapping old ->status flags and
> setting tuple information).
>
> The problem is with nat extension area stored in the ct->ext area.
> This extension area is preallocated but the snat/dnat action
> information is only set up after the ct (or rather, the skb that grabbed
> a reference to the nf_conn entry) traverses nat pre/postrouting.
>
> This will also set up a null-binding when no matching SNAT/DNAT/MASQERUADE
> rule existed.
>
> The manipulations of the skb->nfct->ext nat area are performed without
> a lock. Concurrent access is supposedly impossible as the conntrack
> should not (yet) be in the hash table.
>
> The confirmed bit is set right before we insert the conntrack into
> the hash table (after we traversed rules, ct is ready to be
> 'published').
>
> i.e. when the confirmed bit is NOT set we should not be 'seeing' the nf_conn
> struct when we perform the lookup, as it should still be sitting on the
> 'unconfirmed' list, being invisible to readers.
>
> Does that explanation make sense to you?
>
> Thanks for looking into this.
Still, this patch adds a loop. And maybe an infinite one if the confirmed
bit is set from a context that was interrupted by this one.
If you need to test the confirmed bit, then you also need to test it
before taking the refcount.
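For concreteness, that amounts to folding the confirmed-bit test into the key
comparison used on both sides of the refcount, e.g. a helper along these lines
(a sketch of the idea, essentially what the later v3 patch in this thread
does):

	static inline bool
	nf_ct_key_equal(struct nf_conntrack_tuple_hash *h,
			const struct nf_conntrack_tuple *tuple, u16 zone)
	{
		struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);

		/* same predicate before and after atomic_inc_not_zero() */
		return nf_ct_tuple_equal(tuple, &h->tuple) &&
		       nf_ct_zone(ct) == zone &&
		       nf_ct_is_confirmed(ct);
	}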
On Wed, 2014-01-08 at 17:17 +0400, Andrey Vagin wrote:
> Lets look at destroy_conntrack:
>
> hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
> ...
> nf_conntrack_free(ct)
> kmem_cache_free(net->ct.nf_conntrack_cachep, ct);
>
> net->ct.nf_conntrack_cachep is created with SLAB_DESTROY_BY_RCU.
>
> The hash is protected by rcu, so readers look up conntracks without
> locks.
> A conntrack is removed from the hash, but in this moment a few readers
> still can use the conntrack. Then this conntrack is released and another
> thread creates conntrack with the same address and the equal tuple.
> After this a reader starts to validate the conntrack:
> * It's not dying, because a new conntrack was created
> * nf_ct_tuple_equal() returns true.
>
> But this conntrack is not initialized yet, so it can not be used by two
> threads concurrently. In this case BUG_ON may be triggered from
> nf_nat_setup_info().
>
> Florian Westphal suggested to check the confirm bit too. I think it's
> right.
>
> task 1 task 2 task 3
> nf_conntrack_find_get
> ____nf_conntrack_find
> destroy_conntrack
> hlist_nulls_del_rcu
> nf_conntrack_free
> kmem_cache_free
> __nf_conntrack_alloc
> kmem_cache_alloc
> memset(&ct->tuplehash[IP_CT_DIR_MAX],
> if (nf_ct_is_dying(ct))
> if (!nf_ct_tuple_equal()
>
> I'm not sure, that I have ever seen this race condition in a real life.
> Currently we are investigating a bug, which is reproduced on a few node.
> In our case one conntrack is initialized from a few tasks concurrently,
> we don't have any other explanation for this.
>
> <2>[46267.083061] kernel BUG at net/ipv4/netfilter/nf_nat_core.c:322!
> ...
> <4>[46267.083951] RIP: 0010:[<ffffffffa01e00a4>] [<ffffffffa01e00a4>] nf_nat_setup_info+0x564/0x590 [nf_nat]
> ...
> <4>[46267.085549] Call Trace:
> <4>[46267.085622] [<ffffffffa023421b>] alloc_null_binding+0x5b/0xa0 [iptable_nat]
> <4>[46267.085697] [<ffffffffa02342bc>] nf_nat_rule_find+0x5c/0x80 [iptable_nat]
> <4>[46267.085770] [<ffffffffa0234521>] nf_nat_fn+0x111/0x260 [iptable_nat]
> <4>[46267.085843] [<ffffffffa0234798>] nf_nat_out+0x48/0xd0 [iptable_nat]
> <4>[46267.085919] [<ffffffff814841b9>] nf_iterate+0x69/0xb0
> <4>[46267.085991] [<ffffffff81494e70>] ? ip_finish_output+0x0/0x2f0
> <4>[46267.086063] [<ffffffff81484374>] nf_hook_slow+0x74/0x110
> <4>[46267.086133] [<ffffffff81494e70>] ? ip_finish_output+0x0/0x2f0
> <4>[46267.086207] [<ffffffff814b5890>] ? dst_output+0x0/0x20
> <4>[46267.086277] [<ffffffff81495204>] ip_output+0xa4/0xc0
> <4>[46267.086346] [<ffffffff814b65a4>] raw_sendmsg+0x8b4/0x910
> <4>[46267.086419] [<ffffffff814c10fa>] inet_sendmsg+0x4a/0xb0
> <4>[46267.086491] [<ffffffff814459aa>] ? sock_update_classid+0x3a/0x50
> <4>[46267.086562] [<ffffffff81444d67>] sock_sendmsg+0x117/0x140
> <4>[46267.086638] [<ffffffff8151997b>] ? _spin_unlock_bh+0x1b/0x20
> <4>[46267.086712] [<ffffffff8109d370>] ? autoremove_wake_function+0x0/0x40
> <4>[46267.086785] [<ffffffff81495e80>] ? do_ip_setsockopt+0x90/0xd80
> <4>[46267.086858] [<ffffffff8100be0e>] ? call_function_interrupt+0xe/0x20
> <4>[46267.086936] [<ffffffff8118cb10>] ? ub_slab_ptr+0x20/0x90
> <4>[46267.087006] [<ffffffff8118cb10>] ? ub_slab_ptr+0x20/0x90
> <4>[46267.087081] [<ffffffff8118f2e8>] ? kmem_cache_alloc+0xd8/0x1e0
> <4>[46267.087151] [<ffffffff81445599>] sys_sendto+0x139/0x190
> <4>[46267.087229] [<ffffffff81448c0d>] ? sock_setsockopt+0x16d/0x6f0
> <4>[46267.087303] [<ffffffff810efa47>] ? audit_syscall_entry+0x1d7/0x200
> <4>[46267.087378] [<ffffffff810ef795>] ? __audit_syscall_exit+0x265/0x290
> <4>[46267.087454] [<ffffffff81474885>] ? compat_sys_setsockopt+0x75/0x210
> <4>[46267.087531] [<ffffffff81474b5f>] compat_sys_socketcall+0x13f/0x210
> <4>[46267.087607] [<ffffffff8104dea3>] ia32_sysret+0x0/0x5
> <4>[46267.087676] Code: 91 20 e2 01 75 29 48 89 de 4c 89 f7 e8 56 fa ff ff 85 c0 0f 84 68 fc ff ff 0f b6 4d c6 41 8b 45 00 e9 4d fb ff ff e8 7c 19 e9 e0 <0f> 0b eb fe f6 05 17 91 20 e2 80 74 ce 80 3d 5f 2e 00 00 00 74
> <1>[46267.088023] RIP [<ffffffffa01e00a4>] nf_nat_setup_info+0x564/0x590
>
> v2: move nf_ct_is_confirmed into the unlikely() annotation
>
> Cc: Florian Westphal <[email protected]>
> Cc: Pablo Neira Ayuso <[email protected]>
> Cc: Patrick McHardy <[email protected]>
> Cc: Jozsef Kadlecsik <[email protected]>
> Cc: "David S. Miller" <[email protected]>
> Cc: Cyrill Gorcunov <[email protected]>
> Signed-off-by: Andrey Vagin <[email protected]>
> ---
> net/netfilter/nf_conntrack_core.c | 6 +++++-
> 1 file changed, 5 insertions(+), 1 deletion(-)
>
> diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
> index 43549eb..403f634 100644
> --- a/net/netfilter/nf_conntrack_core.c
> +++ b/net/netfilter/nf_conntrack_core.c
> @@ -387,8 +387,12 @@ begin:
> !atomic_inc_not_zero(&ct->ct_general.use)))
> h = NULL;
> else {
> + /* A conntrack can be recreated with the equal tuple,
> + * so we need to check that the conntrack is initialized
> + */
> if (unlikely(!nf_ct_tuple_equal(tuple, &h->tuple) ||
> - nf_ct_zone(ct) != zone)) {
> + nf_ct_zone(ct) != zone ||
> + !nf_ct_is_confirmed(ct))) {
> nf_ct_put(ct);
> goto begin;
> }
I am still not convinced of this being the right fix.
The key we test after taking the refcount should be the same key that we
test before taking the refcount, otherwise we might add a loop here.
Your patch did not change ____nf_conntrack_find(), so I find it
confusing.
Eric Dumazet <[email protected]> wrote:
> > This will also set up a null-binding when no matching SNAT/DNAT/MASQERUADE
> > rule existed.
> >
> > The manipulations of the skb->nfct->ext nat area are performed without
> > a lock. Concurrent access is supposedly impossible as the conntrack
> > should not (yet) be in the hash table.
> >
> > The confirmed bit is set right before we insert the conntrack into
> > the hash table (after we traversed rules, ct is ready to be
> > 'published').
> >
> > i.e. when the confirmed bit is NOT set we should not be 'seeing' the nf_conn
> > struct when we perform the lookup, as it should still be sitting on the
> > 'unconfirmed' list, being invisible to readers.
> >
> > Does that explanation make sense to you?
> >
> > Thanks for looking into this.
>
> Still, this patch adds a loop. And maybe an infinite one if confirmed
> bit is set from an context that was interrupted by this one.
Hmm. There should be at most one retry.
The confirmed bit should always be set here.
If it isn't then this conntrack shouldn't be in the hash table, i.e.
when we re-try we should find the same conntrack again with the bit set.
Assuming the other cpu got interrupted after setting the confirmed
bit but before inserting it into the hash table, then our re-try
should not be able to find a matching entry.
Maybe I am missing something, but I don't see how we could (upon
retry) find the very same entry again with the bit still not set.
> If you need to test the confirmed bit, then you also need to test it
> before taking the refcount.
I don't think that would make sense, because it should always be
set (inserting conntrack into hash table without confirmed set is
illegal, and it is never unset again).
[ when allocating a new conntrack, ct->status is zeroed, which also
clears the flag. This happens just before we set the new object's
refcount to 1 ]
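I.e. the allocation side looks roughly like this (condensed from
__nf_conntrack_alloc() from memory, so a sketch only):

	ct = kmem_cache_alloc(net->ct.nf_conntrack_cachep, gfp);
	...
	/* zeroes everything between the tuplehash and ->proto,
	 * including ->status, so IPS_CONFIRMED is cleared */
	memset(&ct->tuplehash[IP_CT_DIR_MAX], 0,
	       offsetof(struct nf_conn, proto) -
	       offsetof(struct nf_conn, tuplehash[IP_CT_DIR_MAX]));
	...
	/* changes to the lookup keys must be visible before the object
	 * becomes 'live' for a concurrent atomic_inc_not_zero() */
	smp_wmb();
	atomic_set(&ct->ct_general.use, 1);
	return ct;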
On Wed, 2014-01-08 at 15:04 +0100, Florian Westphal wrote:
> Eric Dumazet <[email protected]> wrote:
> > > This will also set up a null-binding when no matching SNAT/DNAT/MASQERUADE
> > > rule existed.
> > >
> > > The manipulations of the skb->nfct->ext nat area are performed without
> > > a lock. Concurrent access is supposedly impossible as the conntrack
> > > should not (yet) be in the hash table.
> > >
> > > The confirmed bit is set right before we insert the conntrack into
> > > the hash table (after we traversed rules, ct is ready to be
> > > 'published').
> > >
> > > i.e. when the confirmed bit is NOT set we should not be 'seeing' the nf_conn
> > > struct when we perform the lookup, as it should still be sitting on the
> > > 'unconfirmed' list, being invisible to readers.
> > >
> > > Does that explanation make sense to you?
> > >
> > > Thanks for looking into this.
> >
> > Still, this patch adds a loop. And maybe an infinite one if confirmed
> > bit is set from an context that was interrupted by this one.
>
> Hmm. There should be at most one retry.
>
> The confirmed bit should always be set here.
So why are you testing it ?
> If it isn't then this conntrack shouldn't be in the hash table, i.e.
> when we re-try we should find the same conntrack again with the bit set.
Where is this guaranteed ? The invariant is the refcnt being taken.
>
> Asuming the other cpu git interrupted after setting confirmed
> bit but before inserting it into the hash table, then our re-try
> should not be able find a matching entry.
>
> Maybe I am missing something, but I don't see how we could (upon
> retry) find the very same entry again with the bit still not set.
>
> > If you need to test the confirmed bit, then you also need to test it
> > before taking the refcount.
>
> I don't think that would make sense, because it should always be
> set (inserting conntrack into hash table without confirmed set is
> illegal, and it is never unset again).
>
> [ when allocating a new conntrack, ct->status is zeroed, which also
> clears the flag. This happens just before we set the new objects
> refcount to 1 ]
I did this RCU conversion, so I think I know what I am talking about.
The entry should not be put into the hash table (or refcnt set to 1)
if it's not ready. It is that simple.
We need to address this problem without adding an extra test and
possible loop for the lookups.
Whole point of RCU is to make lookups fast, by possibly making the
writers doing all the needed logic.
Eric Dumazet <[email protected]> wrote:
> > The confirmed bit should always be set here.
>
> So why are you testing it ?
To detect ct object recycling when tuple is identical.
This is my understanding of how we can end up with two
cpus thinking they have exclusive ownership of the same ct:
A cpu0: starts lookup: find ct for tuple t
B cpu1: starts lookup: find ct for tuple t
C cpu0: finds ct c for tuple t, no refcnt taken yet
cpu1: finds ct c for tuple t, no refcnt taken yet
cpu2: destroys ct c, removes from hash table, calls ->destroy function
D cpu0: tries to increment refcnt; fails since its 0: lookup ends
E cpu0: allocates a new ct object since no acceptable ct was found for t
F cpu0: allocator gives us just-freed ct c
G cpu0: initialises ct, sets refcnt to 1
H cpu0: adds extensions, ct object is put on unconfirmed list and
assigned to skb->nfct
I cpu0: skb continues through network stack
J cpu1: tries to increment refcnt, ok
K cpu1: checks if ct matches requested tuple t: it does
L cpu0: sets refcnt conntrack tuple, allocates extensions, etc.
cpu1: sets skb->nfct to ct, skb continues through network stack
-> both cpu0 and cpu1 reference a ct object that was not in hash table
cpu0 and cpu1 will then race, for example in
net/ipv4/netfilter/iptable_nat.c:nf_nat_rule_find():
if (!nf_nat_initialized(ct, HOOK2MANIP(hooknum)))
ret = alloc_null_binding(ct, hooknum);
[ It's possible that I misunderstand and that there is something that
prevents this from happening. Usually it's the 'tuple equal' test
that is performed post-atomic-inc-not-zero that detects the recycling,
so step K above would fail ]
The idea of the 'confirmed bit test' is that when it's not set then the
conntrack was recycled and should not be used before the cpu that
currently 'owns' that entry has put it into the hash table again.
> I did this RCU conversion, so I think I know what I am talking about.
Yes, I know. If you have any suggestions on how to fix it, I'd be very
interested to hear about them.
> The entry should not be put into hash table (or refcnt set to 1),
> if its not ready. It is that simple.
I understand what you're saying, but I don't see how we can do it.
I think the assumption that we have a refcnt on skb->nfct is everywhere.
If I understand you correctly we'd have to differentiate between
'can be used fully (e.g. nf_nat_setup_info done for both directions)'
and 'a new conntrack was created (extensions may not be ready yet)'.
But currently in both cases the ct is assigned to the skb, and in both
cases a refcnt is taken. I am out of ideas, except perhaps using
ct->lock to serialize the nat setup (but I don't like that since
I'm not sure that the nat race is the only one).
> We need to address this problem without adding an extra test and
> possible loop for the lookups.
Agreed. I don't like the extra test either.
Many thanks for looking into this Eric!
Florian Westphal <[email protected]> wrote:
> Eric Dumazet <[email protected]> wrote:
> > > The confirmed bit should always be set here.
> >
> > So why are you testing it ?
>
> To detect ct object recycling when tuple is identical.
>
> This is my understanding of how we can end up with two
> cpus thinking they have exclusive ownership of the same ct:
>
> A cpu0: starts lookup: find ct for tuple t
> B cpu1: starts lookup: find ct for tuple t
> C cpu0: finds ct c for tuple t, no refcnt taken yet
> cpu1: finds ct c for tuple t, no refcnt taken yet
> cpu2: destroys ct c, removes from hash table, calls ->destroy function
> D cpu0: tries to increment refcnt; fails since its 0: lookup ends
> E cpu0: allocates a new ct object since no acceptable ct was found for t
> F cpu0: allocator gives us just-freed ct c
> G cpu0: initialises ct, sets refcnt to 1
> H cpu0: adds extensions, ct object is put on unconfirmed list and
> assigned to skb->nfct
> I cpu0: skb continues through network stack
> J cpu1: tries to increment refcnt, ok
> K cpu1: checks if ct matches requested tuple t: it does
> L cpu0: sets refcnt conntrack tuple, allocates extensions, etc.
^^^^
> cpu1: sets skb->nfct to ct, skb continues through network stack
Sorry for that brain fart. This should only say:
L cpu1: sets skb->nfct to ct, skb continues...
On Tue, Jan 07, 2014 at 07:08:25AM -0800, Eric Dumazet wrote:
> On Tue, 2014-01-07 at 14:31 +0400, Andrey Vagin wrote:
> > Lets look at destroy_conntrack:
> >
> > hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
> > ...
> > nf_conntrack_free(ct)
> > kmem_cache_free(net->ct.nf_conntrack_cachep, ct);
> >
> > net->ct.nf_conntrack_cachep is created with SLAB_DESTROY_BY_RCU.
> >
> > The hash is protected by rcu, so readers look up conntracks without
> > locks.
> > A conntrack is removed from the hash, but in this moment a few readers
> > still can use the conntrack. Then this conntrack is released and another
> > thread creates conntrack with the same address and the equal tuple.
> > After this a reader starts to validate the conntrack:
> > * It's not dying, because a new conntrack was created
> > * nf_ct_tuple_equal() returns true.
> >
> > But this conntrack is not initialized yet, so it can not be used by two
> > threads concurrently. In this case BUG_ON may be triggered from
> > nf_nat_setup_info().
> >
> > Florian Westphal suggested to check the confirm bit too. I think it's
> > right.
> >
> > task 1 task 2 task 3
> > nf_conntrack_find_get
> > ____nf_conntrack_find
> > destroy_conntrack
> > hlist_nulls_del_rcu
> > nf_conntrack_free
> > kmem_cache_free
> > __nf_conntrack_alloc
> > kmem_cache_alloc
> > memset(&ct->tuplehash[IP_CT_DIR_MAX],
> > if (nf_ct_is_dying(ct))
> > if (!nf_ct_tuple_equal()
> >
> > I'm not sure, that I have ever seen this race condition in a real life.
> > Currently we are investigating a bug, which is reproduced on a few node.
> > In our case one conntrack is initialized from a few tasks concurrently,
> > we don't have any other explanation for this.
>
> >
> > Cc: Florian Westphal <[email protected]>
> > Cc: Pablo Neira Ayuso <[email protected]>
> > Cc: Patrick McHardy <[email protected]>
> > Cc: Jozsef Kadlecsik <[email protected]>
> > Cc: "David S. Miller" <[email protected]>
> > Cc: Cyrill Gorcunov <[email protected]>
> > Signed-off-by: Andrey Vagin <[email protected]>
> > ---
> > net/netfilter/nf_conntrack_core.c | 6 +++++-
> > 1 file changed, 5 insertions(+), 1 deletion(-)
> >
> > diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
> > index 43549eb..7a34bb2 100644
> > --- a/net/netfilter/nf_conntrack_core.c
> > +++ b/net/netfilter/nf_conntrack_core.c
> > @@ -387,8 +387,12 @@ begin:
> > !atomic_inc_not_zero(&ct->ct_general.use)))
> > h = NULL;
> > else {
> > + /* A conntrack can be recreated with the equal tuple,
> > + * so we need to check that the conntrack is initialized
> > + */
> > if (unlikely(!nf_ct_tuple_equal(tuple, &h->tuple) ||
> > - nf_ct_zone(ct) != zone)) {
> > + nf_ct_zone(ct) != zone) ||
> > + !nf_ct_is_confirmed(ct)) {
> > nf_ct_put(ct);
> > goto begin;
> > }
>
> I do not think this is the right way to fix this problem (if said
> problem is confirmed)
>
> Remember the rule about SLAB_DESTROY_BY_RCU :
>
> When a struct is freed, then reused, its important to set the its refcnt
> (from 0 to 1) only when the structure is fully ready for use.
>
> If a lookup finds a structure which is not yet setup, the
> atomic_inc_not_zero() will fail.
>
> Take a look at sk_clone_lock() and Documentation/RCU/rculist_nulls.txt
>
I have one more question. Looks like I found another problem.
init_conntrack:
	hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
				 &net->ct.unconfirmed);

__nf_conntrack_hash_insert:
	hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
				 &net->ct.hash[hash]);
We use one hlist_nulls_node to add a conntrack into two different lists.
As far as I understand, it's unacceptable in case of
SLAB_DESTROY_BY_RCU.
Let's imagine that we have two threads. The first one enumerates objects
from a first list, the second one recreates an object and inserts it into a
second list. If the first thread is standing on that object at this moment, it
can read "next" when the object is already in the second list, so it will
continue to enumerate objects from the second list. That is not what we want,
is it?
cpu1                                    cpu2
hlist_nulls_for_each_entry_rcu(ct)
                                        destroy_conntrack
                                        kmem_cache_free
                                        init_conntrack
                                        hlist_nulls_add_head_rcu
ct = ct->next
On Thu, 2014-01-09 at 09:24 +0400, Andrew Vagin wrote:
> I have one more question. Looks like I found another problem.
>
> init_conntrack:
> hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
> &net->ct.unconfirmed);
>
> __nf_conntrack_hash_insert:
> hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
> &net->ct.hash[hash]);
>
> We use one hlist_nulls_node to add a conntrack into two different lists.
> As far as I understand, it's unacceptable in case of
> SLAB_DESTROY_BY_RCU.
I guess you missed :
net/netfilter/nf_conntrack_core.c:1598: INIT_HLIST_NULLS_HEAD(&net->ct.unconfirmed, UNCONFIRMED_NULLS_VAL);
>
> Lets imagine that we have two threads. The first one enumerates objects
> from a first list, the second one recreates an object and insert it in a
> second list. If a first thread in this moment stays on the object, it
> can read "next", when it's in the second list, so it will continue
> to enumerate objects from the second list. It is not what we want, isn't
> it?
>
> cpu1 cpu2
> hlist_nulls_for_each_entry_rcu(ct)
> destroy_conntrack
> kmem_cache_free
>
> init_conntrack
> hlist_nulls_add_head_rcu
> ct = ct->next
>
This will be fine.
I think we even have a counter to count the number of occurrences of this
rare event. (I personally have never read a non-zero search_restart value.)
NF_CT_STAT_INC(net, search_restart);
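For reference, the tail of ____nf_conntrack_find() handles exactly this case
(a trimmed sketch, from memory):

begin:
	hlist_nulls_for_each_entry_rcu(h, n, &net->ct.hash[bucket], hnnode) {
		if (nf_ct_tuple_equal(tuple, &h->tuple) &&
		    nf_ct_zone(nf_ct_tuplehash_to_ctrack(h)) == zone) {
			NF_CT_STAT_INC(net, found);
			return h;
		}
	}
	/* if the nulls value we got at the end of this lookup is not the
	 * expected one for this bucket, we probably met an item that was
	 * moved to another chain, so we must restart the lookup */
	if (get_nulls_value(n) != bucket) {
		NF_CT_STAT_INC(net, search_restart);
		goto begin;
	}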
On Tue, Jan 07, 2014 at 04:25:20PM +0100, Florian Westphal wrote:
> Eric Dumazet <[email protected]> wrote:
> > > diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
> > > index 43549eb..7a34bb2 100644
> > > --- a/net/netfilter/nf_conntrack_core.c
> > > +++ b/net/netfilter/nf_conntrack_core.c
> > > @@ -387,8 +387,12 @@ begin:
> > > !atomic_inc_not_zero(&ct->ct_general.use)))
> > > h = NULL;
> > > else {
> > > + /* A conntrack can be recreated with the equal tuple,
> > > + * so we need to check that the conntrack is initialized
> > > + */
> > > if (unlikely(!nf_ct_tuple_equal(tuple, &h->tuple) ||
> > > - nf_ct_zone(ct) != zone)) {
> > > + nf_ct_zone(ct) != zone) ||
> > > + !nf_ct_is_confirmed(ct)) {
> > > nf_ct_put(ct);
> > > goto begin;
> > > }
> >
> > I do not think this is the right way to fix this problem (if said
> > problem is confirmed)
> >
> > Remember the rule about SLAB_DESTROY_BY_RCU :
> >
> > When a struct is freed, then reused, its important to set the its refcnt
> > (from 0 to 1) only when the structure is fully ready for use.
> >
> > If a lookup finds a structure which is not yet setup, the
> > atomic_inc_not_zero() will fail.
>
> Indeed. But, the structure itself might be ready (or rather,
> can be ready since the allocation side will set the refcount to one
> after doing the initial work, such as zapping old ->status flags and
> setting tuple information).
>
> The problem is with nat extension area stored in the ct->ext area.
> This extension area is preallocated but the snat/dnat action
> information is only set up after the ct (or rather, the skb that grabbed
> a reference to the nf_conn entry) traverses nat pre/postrouting.
>
> This will also set up a null-binding when no matching SNAT/DNAT/MASQERUADE
> rule existed.
>
> The manipulations of the skb->nfct->ext nat area are performed without
> a lock. Concurrent access is supposedly impossible as the conntrack
> should not (yet) be in the hash table.
>
> The confirmed bit is set right before we insert the conntrack into
> the hash table (after we traversed rules, ct is ready to be
> 'published').
Can we allocate a conntrack with zero ct_general.use and increment it for
the first time just before inserting the conntrack into the hash table?
When a conntrack is allocated it is attached exclusively to one skb.
It must be destroyed with the skb if it has not been confirmed, so we
don't need the refcnt at this stage.
I found only one place where the reference counter of an unconfirmed
conntrack can be incremented. It's ctnetlink_dump_table(). Probably we
can find a way to fix it.
>
> i.e. when the confirmed bit is NOT set we should not be 'seeing' the nf_conn
> struct when we perform the lookup, as it should still be sitting on the
> 'unconfirmed' list, being invisible to readers.
>
> Does that explanation make sense to you?
>
> Thanks for looking into this.
Andrew Vagin <[email protected]> wrote:
> Can we allocate conntrack with zero ct_general.use and increment it at
> the first time before inserting the conntrack into the hash table?
> When conntrack is allocated it is attached exclusively to one skb.
> It must be destroyed with skb, if it has not been confirmed, so we
> don't need refcnt on this stage.
>
> I found only one place, where a reference counter of unconfirmed
> conntract can incremented. It's ctnetlink_dump_table().
What about skb_clone, etc? They will also increment the refcnt
if a conntrack entry is attached to the skb.
On Thu, Jan 09, 2014 at 09:56:22PM +0100, Florian Westphal wrote:
> Andrew Vagin <[email protected]> wrote:
> > Can we allocate conntrack with zero ct_general.use and increment it at
> > the first time before inserting the conntrack into the hash table?
> > When conntrack is allocated it is attached exclusively to one skb.
> > It must be destroyed with skb, if it has not been confirmed, so we
> > don't need refcnt on this stage.
> >
> > I found only one place, where a reference counter of unconfirmed
> > conntract can incremented. It's ctnetlink_dump_table().
>
> What about skb_clone, etc? They will also increment the refcnt
> if a conntrack entry is attached to the skb.
We can not attach an unconfirmed conntrack to a few skb, because
nf_nat_setup_info can be executed concurrently for the same conntrack.
How do we avoid this race condition for cloned skb-s?
Andrew Vagin <[email protected]> wrote:
> On Thu, Jan 09, 2014 at 09:56:22PM +0100, Florian Westphal wrote:
> > Andrew Vagin <[email protected]> wrote:
> > > Can we allocate conntrack with zero ct_general.use and increment it at
> > > the first time before inserting the conntrack into the hash table?
> > > When conntrack is allocated it is attached exclusively to one skb.
> > > It must be destroyed with skb, if it has not been confirmed, so we
> > > don't need refcnt on this stage.
> > >
> > > I found only one place, where a reference counter of unconfirmed
> > > conntract can incremented. It's ctnetlink_dump_table().
> >
> > What about skb_clone, etc? They will also increment the refcnt
> > if a conntrack entry is attached to the skb.
>
> We can not attach an unconfirmed conntrack to a few skb, because
s/few/new/?
> nf_nat_setup_info can be executed concurrently for the same conntrack.
>
> How do we avoid this race condition for cloned skb-s?
Simple, the assumption is that only one cpu owns the nfct, so it does
not matter if the skb is cloned in between, as there are no parallel
users.
The only possibility (that I know of) to violate this is to create
a bridge, enable call-iptables sysctl, add -j NFQUEUE rules and then
wait for packets that need to be forwarded to several recipients,
e.g. multicast traffic.
see
http://marc.info/?l=netfilter-devel&m=131471083501656&w=2
or search 'netfilter: nat: work around shared nfct struct in bridge
case'
2014/1/9 Eric Dumazet <[email protected]>:
> On Thu, 2014-01-09 at 09:24 +0400, Andrew Vagin wrote:
>
>> I have one more question. Looks like I found another problem.
>>
>> init_conntrack:
>> hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
>> &net->ct.unconfirmed);
>>
>> __nf_conntrack_hash_insert:
>> hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
>> &net->ct.hash[hash]);
>>
>> We use one hlist_nulls_node to add a conntrack into two different lists.
>> As far as I understand, it's unacceptable in case of
>> SLAB_DESTROY_BY_RCU.
>
> I guess you missed :
>
> net/netfilter/nf_conntrack_core.c:1598: INIT_HLIST_NULLS_HEAD(&net->ct.unconfirmed, UNCONFIRMED_NULLS_VAL);
But we can look up something suitable and return this value, even though it
will be unconfirmed. Ok, I see. This situation is fixed by this patch
too.
I don't understand the result of your discussion with Florian. A
conntrack has two states: it can be used, and it is initialized.
The sign of the first state is a non-zero refcnt and the sign of the
second state is the confirmed bit.
In the first state the conntrack is attached to one skb and inserted in the
unconfirmed list. In this state the use count can be incremented in
ctnetlink_dump_list() and skb_clone().
In the second state the conntrack may be attached to a few skbs and
inserted into net->ct.hash.
I have read all the emails again and I can't understand when this patch
doesn't work. Maybe you could give a sequence of actions? Thanks.
>
>
>>
>> Lets imagine that we have two threads. The first one enumerates objects
>> from a first list, the second one recreates an object and insert it in a
>> second list. If a first thread in this moment stays on the object, it
>> can read "next", when it's in the second list, so it will continue
>> to enumerate objects from the second list. It is not what we want, isn't
>> it?
>>
>> cpu1 cpu2
>> hlist_nulls_for_each_entry_rcu(ct)
>> destroy_conntrack
>> kmem_cache_free
>>
>> init_conntrack
>> hlist_nulls_add_head_rcu
>> ct = ct->next
>>
>
> This will be fine.
>
> I think we even have a counter to count number of occurrence of this
> rare event. (I personally never read a non null search_restart value)
>
> NF_CT_STAT_INC(net, search_restart);
Thank you for the explanation.
Let's look at destroy_conntrack:
hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
...
nf_conntrack_free(ct)
kmem_cache_free(net->ct.nf_conntrack_cachep, ct);
net->ct.nf_conntrack_cachep is created with SLAB_DESTROY_BY_RCU.
The hash is protected by RCU, so readers look up conntracks without
locks.
A conntrack is removed from the hash, but at this moment a few readers
can still be using it. Then this conntrack is released and another
thread creates a conntrack at the same address and with an equal tuple.
After this a reader starts to validate the conntrack:
* It's not dying, because a new conntrack was created.
* nf_ct_tuple_equal() returns true.
But this conntrack is not fully initialized yet, so it cannot be used by two
threads concurrently. In this case a BUG_ON may be triggered from
nf_nat_setup_info().
Florian Westphal suggested checking the confirmed bit too. I think that's
right.
task 1                        task 2                        task 3
nf_conntrack_find_get
 ____nf_conntrack_find
                              destroy_conntrack
                               hlist_nulls_del_rcu
                               nf_conntrack_free
                               kmem_cache_free
                                                            __nf_conntrack_alloc
                                                             kmem_cache_alloc
                                                             memset(&ct->tuplehash[IP_CT_DIR_MAX],
 if (nf_ct_is_dying(ct))
 if (!nf_ct_tuple_equal()
I'm not sure that I have ever seen this race condition in real life.
Currently we are investigating a bug which is reproduced on a few nodes.
In our case one conntrack is initialized by a few tasks concurrently;
we don't have any other explanation for this.
<2>[46267.083061] kernel BUG at net/ipv4/netfilter/nf_nat_core.c:322!
...
<4>[46267.083951] RIP: 0010:[<ffffffffa01e00a4>] [<ffffffffa01e00a4>] nf_nat_setup_info+0x564/0x590 [nf_nat]
...
<4>[46267.085549] Call Trace:
<4>[46267.085622] [<ffffffffa023421b>] alloc_null_binding+0x5b/0xa0 [iptable_nat]
<4>[46267.085697] [<ffffffffa02342bc>] nf_nat_rule_find+0x5c/0x80 [iptable_nat]
<4>[46267.085770] [<ffffffffa0234521>] nf_nat_fn+0x111/0x260 [iptable_nat]
<4>[46267.085843] [<ffffffffa0234798>] nf_nat_out+0x48/0xd0 [iptable_nat]
<4>[46267.085919] [<ffffffff814841b9>] nf_iterate+0x69/0xb0
<4>[46267.085991] [<ffffffff81494e70>] ? ip_finish_output+0x0/0x2f0
<4>[46267.086063] [<ffffffff81484374>] nf_hook_slow+0x74/0x110
<4>[46267.086133] [<ffffffff81494e70>] ? ip_finish_output+0x0/0x2f0
<4>[46267.086207] [<ffffffff814b5890>] ? dst_output+0x0/0x20
<4>[46267.086277] [<ffffffff81495204>] ip_output+0xa4/0xc0
<4>[46267.086346] [<ffffffff814b65a4>] raw_sendmsg+0x8b4/0x910
<4>[46267.086419] [<ffffffff814c10fa>] inet_sendmsg+0x4a/0xb0
<4>[46267.086491] [<ffffffff814459aa>] ? sock_update_classid+0x3a/0x50
<4>[46267.086562] [<ffffffff81444d67>] sock_sendmsg+0x117/0x140
<4>[46267.086638] [<ffffffff8151997b>] ? _spin_unlock_bh+0x1b/0x20
<4>[46267.086712] [<ffffffff8109d370>] ? autoremove_wake_function+0x0/0x40
<4>[46267.086785] [<ffffffff81495e80>] ? do_ip_setsockopt+0x90/0xd80
<4>[46267.086858] [<ffffffff8100be0e>] ? call_function_interrupt+0xe/0x20
<4>[46267.086936] [<ffffffff8118cb10>] ? ub_slab_ptr+0x20/0x90
<4>[46267.087006] [<ffffffff8118cb10>] ? ub_slab_ptr+0x20/0x90
<4>[46267.087081] [<ffffffff8118f2e8>] ? kmem_cache_alloc+0xd8/0x1e0
<4>[46267.087151] [<ffffffff81445599>] sys_sendto+0x139/0x190
<4>[46267.087229] [<ffffffff81448c0d>] ? sock_setsockopt+0x16d/0x6f0
<4>[46267.087303] [<ffffffff810efa47>] ? audit_syscall_entry+0x1d7/0x200
<4>[46267.087378] [<ffffffff810ef795>] ? __audit_syscall_exit+0x265/0x290
<4>[46267.087454] [<ffffffff81474885>] ? compat_sys_setsockopt+0x75/0x210
<4>[46267.087531] [<ffffffff81474b5f>] compat_sys_socketcall+0x13f/0x210
<4>[46267.087607] [<ffffffff8104dea3>] ia32_sysret+0x0/0x5
<4>[46267.087676] Code: 91 20 e2 01 75 29 48 89 de 4c 89 f7 e8 56 fa ff ff 85 c0 0f 84 68 fc ff ff 0f b6 4d c6 41 8b 45 00 e9 4d fb ff ff e8 7c 19 e9 e0 <0f> 0b eb fe f6 05 17 91 20 e2 80 74 ce 80 3d 5f 2e 00 00 00 74
<1>[46267.088023] RIP [<ffffffffa01e00a4>] nf_nat_setup_info+0x564/0x590
v2: move nf_ct_is_confirmed into the unlikely() annotation
v3: Eric suggested fixing the refcnt so that it stays zero until the
conntrack is added into the hash, but we can't find a way to do that.
Another way is to interpret the confirmed bit as part of the search key
and check it in ____nf_conntrack_find() too.
Cc: Eric Dumazet <[email protected]>
Cc: Florian Westphal <[email protected]>
Cc: Pablo Neira Ayuso <[email protected]>
Cc: Patrick McHardy <[email protected]>
Cc: Jozsef Kadlecsik <[email protected]>
Cc: "David S. Miller" <[email protected]>
Cc: Cyrill Gorcunov <[email protected]>
Signed-off-by: Andrey Vagin <[email protected]>
---
net/netfilter/nf_conntrack_core.c | 21 +++++++++++++++++----
1 file changed, 17 insertions(+), 4 deletions(-)
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 43549eb..af6ad2e 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -318,6 +318,21 @@ static void death_by_timeout(unsigned long ul_conntrack)
nf_ct_delete((struct nf_conn *)ul_conntrack, 0, 0);
}
+static inline bool
+nf_ct_key_equal(struct nf_conntrack_tuple_hash *h,
+ const struct nf_conntrack_tuple *tuple,
+ u16 zone)
+{
+ struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
+
+ /* A conntrack can be recreated with the equal tuple,
+ * so we need to check that the conntrack is confirmed
+ */
+ return nf_ct_tuple_equal(tuple, &h->tuple) &&
+ nf_ct_zone(ct) == zone &&
+ nf_ct_is_confirmed(ct);
+}
+
/*
* Warning :
* - Caller must take a reference on returned object
@@ -339,8 +354,7 @@ ____nf_conntrack_find(struct net *net, u16 zone,
local_bh_disable();
begin:
hlist_nulls_for_each_entry_rcu(h, n, &net->ct.hash[bucket], hnnode) {
- if (nf_ct_tuple_equal(tuple, &h->tuple) &&
- nf_ct_zone(nf_ct_tuplehash_to_ctrack(h)) == zone) {
+ if (nf_ct_key_equal(h, tuple, zone)) {
NF_CT_STAT_INC(net, found);
local_bh_enable();
return h;
@@ -387,8 +401,7 @@ begin:
!atomic_inc_not_zero(&ct->ct_general.use)))
h = NULL;
else {
- if (unlikely(!nf_ct_tuple_equal(tuple, &h->tuple) ||
- nf_ct_zone(ct) != zone)) {
+ if (unlikely(!nf_ct_key_equal(h, tuple, zone))) {
nf_ct_put(ct);
goto begin;
}
--
1.8.4.2
On Sun, 2014-01-12 at 21:50 +0400, Andrey Vagin wrote:
> Lets look at destroy_conntrack:
>
> hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
> ...
> nf_conntrack_free(ct)
> kmem_cache_free(net->ct.nf_conntrack_cachep, ct);
>
> net->ct.nf_conntrack_cachep is created with SLAB_DESTROY_BY_RCU.
>
> The hash is protected by rcu, so readers look up conntracks without
> locks.
> A conntrack is removed from the hash, but in this moment a few readers
> still can use the conntrack. Then this conntrack is released and another
> thread creates conntrack with the same address and the equal tuple.
> After this a reader starts to validate the conntrack:
> * It's not dying, because a new conntrack was created
> * nf_ct_tuple_equal() returns true.
...
> v2: move nf_ct_is_confirmed into the unlikely() annotation
> v3: Eric suggested to fix refcnt, so that it becomes zero before adding
> in a hash, but we can't find a way how to do that. Another way is to
> interpret the confirm bit as part of a search key and check it in
> ____nf_conntrack_find() too.
>
> Cc: Eric Dumazet <[email protected]>
> Cc: Florian Westphal <[email protected]>
> Cc: Pablo Neira Ayuso <[email protected]>
> Cc: Patrick McHardy <[email protected]>
> Cc: Jozsef Kadlecsik <[email protected]>
> Cc: "David S. Miller" <[email protected]>
> Cc: Cyrill Gorcunov <[email protected]>
> Signed-off-by: Andrey Vagin <[email protected]>
> ---
Acked-by: Eric Dumazet <[email protected]>
Thanks Andrey !
On Sun, Jan 12, 2014 at 12:21:14PM -0800, Eric Dumazet wrote:
> On Sun, 2014-01-12 at 21:50 +0400, Andrey Vagin wrote:
> > Lets look at destroy_conntrack:
> >
> > hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
> > ...
> > nf_conntrack_free(ct)
> > kmem_cache_free(net->ct.nf_conntrack_cachep, ct);
> >
> > net->ct.nf_conntrack_cachep is created with SLAB_DESTROY_BY_RCU.
> >
> > The hash is protected by rcu, so readers look up conntracks without
> > locks.
> > A conntrack is removed from the hash, but in this moment a few readers
> > still can use the conntrack. Then this conntrack is released and another
> > thread creates conntrack with the same address and the equal tuple.
> > After this a reader starts to validate the conntrack:
> > * It's not dying, because a new conntrack was created
> > * nf_ct_tuple_equal() returns true.
> ...
>
>
> > v2: move nf_ct_is_confirmed into the unlikely() annotation
> > v3: Eric suggested to fix refcnt, so that it becomes zero before adding
> > in a hash, but we can't find a way how to do that. Another way is to
> > interpret the confirm bit as part of a search key and check it in
> > ____nf_conntrack_find() too.
> >
> > Cc: Eric Dumazet <[email protected]>
> > Cc: Florian Westphal <[email protected]>
> > Cc: Pablo Neira Ayuso <[email protected]>
> > Cc: Patrick McHardy <[email protected]>
> > Cc: Jozsef Kadlecsik <[email protected]>
> > Cc: "David S. Miller" <[email protected]>
> > Cc: Cyrill Gorcunov <[email protected]>
> > Signed-off-by: Andrey Vagin <[email protected]>
> > ---
>
> Acked-by: Eric Dumazet <[email protected]>
>
> Thanks Andrey !
>
Eh, looks like this patch is incomplete too :(
I think we can't set a reference counter for objects which are allocated
from a SLAB_DESTROY_BY_RCU cache. Look at the following scenario.
cpu1                                    cpu2
ct = ____nf_conntrack_find()
                                        destroy_conntrack
atomic_inc_not_zero(ct)
                                        __nf_conntrack_alloc
                                        atomic_set(&ct->ct_general.use, 1);
if (!nf_ct_key_equal(h, tuple, zone))
        nf_ct_put(ct);
                destroy_conntrack(ct) !!!!
                                        /* continues to use the conntrack */
Did I miss something again?
I think __nf_conntrack_alloc must use atomic_inc instead of
atomic_set. And we must be sure that the first object from a new page is
zeroed.
I am bringing this up because, after this patch, a bug was triggered from
another place:
<2>[67096.759353] kernel BUG at net/netfilter/nf_conntrack_core.c:211!
<4>[67096.759371] invalid opcode: 0000 [#1] SMP
<4>[67096.759385] last sysfs file: /sys/devices/virtual/block/md0/md/metadata_version
<4>[67096.759414] CPU 2
<4>[67096.759422] Modules linked in: xt_comment sch_sfq cls_fw sch_htb pio_nfs pio_direct pfmt_raw pfmt_ploop1 ploop simfs xt_string xt_hashlimit xt_recent xt_length xt_hl xt_tcpmss xt_TCPMSS xt_multiport xt_limit xt_dscp vzevent coretemp cpufreq_ondemand acpi_cpufreq freq_table mperf 8021q garp stp llc ipt_REJECT iptable_filter iptable_mangle xt_NOTRACK iptable_raw iptable_nat ip_tables ip6t_REJECT nf_conntrack_ipv6 nf_defrag_ipv6 xt_state ip6table_filter ip6table_raw xt_MARK ip6table_mangle ip6_tables ext4 jbd2 tun ip_gre ipip vzethdev vznetdev vzrst nf_nat nf_conntrack_ipv4 nf_defrag_ipv4 ipv6 vzcpt nf_conntrack vzdquota vzmon vziolimit vzdev tunnel4 nfsd nfs lockd fscache auth_rpcgss nfs_acl sunrpc tpm_tis tpm tpm_bios microcode serio_raw i2c_i801 sg iTCO_wdt iTCO_vendor_support e1000e ext3 jbd mbcache raid1 sd_mod crc_t10dif ata_piix ahci pata_acpi ata_generic i915 drm_kms_helper drm i2c_algo_bit i2c_core video output dm_mirror dm_region_hash dm_log dm_mod [last unloaded: scsi_wait_scan]
<4>[67096.759801]
<4>[67096.759837] Pid: 498649, comm: atdd veid: 666 Tainted: G C --------------- 2.6.32-042stab084.18 #1 042stab084_18 /DQ45CB
<4>[67096.759932] RIP: 0010:[<ffffffffa03d99ac>] [<ffffffffa03d99ac>] destroy_conntrack+0x15c/0x190 [nf_conntrack]
<4>[67096.760032] RSP: 0000:ffff88001ae378b8 EFLAGS: 00010246
<4>[67096.760075] RAX: 0000000000000000 RBX: ffff8801a57ac928 RCX: 0000000000065000
<4>[67096.760123] RDX: 000000000000f603 RSI: 0000000000000006 RDI: ffff8801a57ac928
<4>[67096.760174] RBP: ffff88001ae378d8 R08: 0000000000000002 R09: ffff8802373b06e0
<4>[67096.760221] R10: 0000000000000001 R11: 0000000000000000 R12: ffff88023928c080
<4>[67096.760255] R13: ffff880237e8c000 R14: 0000000000000002 R15: 0000000000000002
<4>[67096.760255] FS: 0000000000000000(0000) GS:ffff880028300000(0063) knlGS:00000000b63afbb0
<4>[67096.760255] CS: 0010 DS: 002b ES: 002b CR0: 000000008005003b
<4>[67096.760255] CR2: 00000000b74f44c0 CR3: 00000000b89c6000 CR4: 00000000000007e0
<4>[67096.760255] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
<4>[67096.760255] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
<4>[67096.760255] Process atdd (pid: 498649, veid: 666, threadinfo ffff88001ae36000, task ffff88001deaa980)
<4>[67096.760255] Stack:
<4>[67096.760255] ffff88001ae378e8 ffff88001ae37988 ffff88023928c080 0000000000000003
<4>[67096.760255] <d> ffff88001ae378e8 ffffffff814844a7 ffff88001ae37908 ffffffffa03d9bb5
<4>[67096.760255] <d> ffff88012dcae580 ffff88023928c080 ffff88001ae379e8 ffffffffa03d9fb2
<4>[67096.760255] Call Trace:
<4>[67096.760255] [<ffffffff814844a7>] nf_conntrack_destroy+0x17/0x30
<4>[67096.760255] [<ffffffffa03d9bb5>] nf_conntrack_find_get+0x85/0x130 [nf_conntrack]
<4>[67096.760255] [<ffffffffa03d9fb2>] nf_conntrack_in+0x352/0xb60 [nf_conntrack]
<4>[67096.760255] [<ffffffffa048c771>] ipv4_conntrack_local+0x51/0x60 [nf_conntrack_ipv4]
<4>[67096.760255] [<ffffffff81484419>] nf_iterate+0x69/0xb0
<4>[67096.760255] [<ffffffff814b5b00>] ? dst_output+0x0/0x20
<4>[67096.760255] [<ffffffff814845d4>] nf_hook_slow+0x74/0x110
<4>[67096.760255] [<ffffffff814b5b00>] ? dst_output+0x0/0x20
<4>[67096.760255] [<ffffffff814b66d5>] raw_sendmsg+0x775/0x910
<4>[67096.760255] [<ffffffff8104c5a8>] ? flush_tlb_others_ipi+0x128/0x130
<4>[67096.760255] [<ffffffff8100bc4e>] ? apic_timer_interrupt+0xe/0x20
<4>[67096.760255] [<ffffffff8100bc4e>] ? apic_timer_interrupt+0xe/0x20
<4>[67096.760255] [<ffffffff814c136a>] inet_sendmsg+0x4a/0xb0
<4>[67096.760255] [<ffffffff81444e93>] ? sock_sendmsg+0x13/0x140
<4>[67096.760255] [<ffffffff81444f97>] sock_sendmsg+0x117/0x140
<4>[67096.760255] [<ffffffff8102e299>] ? native_smp_send_reschedule+0x49/0x60
<4>[67096.760255] [<ffffffff81519beb>] ? _spin_unlock_bh+0x1b/0x20
<4>[67096.760255] [<ffffffff8109d930>] ? autoremove_wake_function+0x0/0x40
<4>[67096.760255] [<ffffffff814960f0>] ? do_ip_setsockopt+0x90/0xd80
<4>[67096.760255] [<ffffffff8100bc4e>] ? apic_timer_interrupt+0xe/0x20
<4>[67096.760255] [<ffffffff8100bc4e>] ? apic_timer_interrupt+0xe/0x20
<4>[67096.760255] [<ffffffff814457c9>] sys_sendto+0x139/0x190
<4>[67096.760255] [<ffffffff810efa77>] ? audit_syscall_entry+0x1d7/0x200
<4>[67096.760255] [<ffffffff810ef7c5>] ? __audit_syscall_exit+0x265/0x290
<4>[67096.760255] [<ffffffff81474daf>] compat_sys_socketcall+0x13f/0x210
<4>[67096.760255] [<ffffffff8104dea3>] ia32_sysret+0x0/0x5
<4>[67096.760255] Code: 0b ab 0a e1 eb b7 f6 05 34 f8 00 e2 20 74 b7 80 3d f0 b0 00 00 00 74 ae 48 89 de 48 c7 c7 20 16 3e a0 31 c0 e8 05 ca 13 e1 eb 9b <0f> 0b eb fe f6 05 0b f8 00 e2 20 0f 84 db fe ff ff 80 3d eb b0
<1>[67096.760255] RIP [<ffffffffa03d99ac>] destroy_conntrack+0x15c/0x190 [nf_conntrack]
<4>[67096.760255] RSP <ffff88001ae378b8>
>
> Eh, looks like this path is incomplete too:(
>
> I think we can't set a reference counter for objects which is allocated
> from a SLAB_DESTROY_BY_RCU cache. Look at the following backtrace.
>
> cpu1 cpu2
> ct = ____nf_conntrack_find()
> destroy_conntrack
> atomic_inc_not_zero(ct)
ct->ct_general.use is zero after destroy_conntrack(). Sorry for the noise.
On Tue, 2014-01-14 at 14:51 +0400, Andrew Vagin wrote:
> I think __nf_conntrack_alloc must use atomic_inc instead of
> atomic_set. And we must be sure, that the first object from a new page is
> zeroized.
No, you cannot do that, and we do not need to.
If a new page is allocated, then you have the guarantee that nobody can be
using it yet; its content can be totally random.
Only 'living' objects, the ones that were previously inserted in the
hash table, can be found, and their refcnt must be accurate.
A freed object has refcnt == 0, that's the golden rule.
When the page is freed (after the RCU grace period), nobody cares about the
refcnt anymore.
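To make the rule concrete, here is a minimal sketch of the lookup pattern that
SLAB_DESTROY_BY_RCU relies on (illustrative only; obj_hash_lookup(),
obj_key_equal() and obj_put() are made-up helpers standing in for the
conntrack-specific ones):

/* Sketch of a lockless lookup over a SLAB_DESTROY_BY_RCU cache. */
struct obj *obj_find_get(const struct key *key)
{
        struct obj *obj;

        rcu_read_lock();
begin:
        obj = obj_hash_lookup(key);    /* RCU hash walk, no locks */
        if (obj) {
                /* A freed object has refcnt == 0, so this refuses to
                 * resurrect memory that has already been released.
                 */
                if (!atomic_inc_not_zero(&obj->refcnt)) {
                        obj = NULL;
                } else if (!obj_key_equal(obj, key)) {
                        /* The slot was freed and recycled for another key
                         * between the hash walk and the refcount bump:
                         * drop the reference and restart the search.
                         */
                        obj_put(obj);
                        goto begin;
                }
        }
        rcu_read_unlock();
        return obj;
}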
----
Eric and Florian, could you look at this patch? Once you say it looks
good, I will ask the user to validate it.
I can't do these steps in the other order, because the bug is reproduced on
a real host with real users. Thanks.
----
nf_conntrack_free() can't be called for a conntrack with a non-zero ref-counter,
because it can race with nf_conntrack_find_get().
A conntrack slab is created with SLAB_DESTROY_BY_RCU. A non-zero
ref-counter says that this conntrack is in use right now. So if we release a
conntrack with a non-zero counter, we break this assumption.
CPU1                                      CPU2
____nf_conntrack_find()
                                          nf_ct_put()
                                           destroy_conntrack()
                                          ...
                                          init_conntrack
                                           __nf_conntrack_alloc (set use = 1)
atomic_inc_not_zero(&ct->use) (use = 2)
                                           if (!l4proto->new(ct, skb, dataoff, timeouts))
                                            nf_conntrack_free(ct); (use = 2 !!!)
                                          ...
                                          __nf_conntrack_alloc (set use = 1)
if (!nf_ct_key_equal(h, tuple, zone))
  nf_ct_put(ct); (use = 0)
   destroy_conntrack()
                                          /* continue to work with CT */
After applying the patch "[PATCH] netfilter: nf_conntrack: fix RCU race
in nf_conntrack_find_get (v3)", another bug was triggered in
destroy_conntrack():
<4>[67096.759334] ------------[ cut here ]------------
<2>[67096.759353] kernel BUG at net/netfilter/nf_conntrack_core.c:211!
...
<4>[67096.759837] Pid: 498649, comm: atdd veid: 666 Tainted: G C --------------- 2.6.32-042stab084.18 #1 042stab084_18 /DQ45CB
<4>[67096.759932] RIP: 0010:[<ffffffffa03d99ac>] [<ffffffffa03d99ac>] destroy_conntrack+0x15c/0x190 [nf_conntrack]
<4>[67096.760255] Call Trace:
<4>[67096.760255] [<ffffffff814844a7>] nf_conntrack_destroy+0x17/0x30
<4>[67096.760255] [<ffffffffa03d9bb5>] nf_conntrack_find_get+0x85/0x130 [nf_conntrack]
<4>[67096.760255] [<ffffffffa03d9fb2>] nf_conntrack_in+0x352/0xb60 [nf_conntrack]
<4>[67096.760255] [<ffffffffa048c771>] ipv4_conntrack_local+0x51/0x60 [nf_conntrack_ipv4]
<4>[67096.760255] [<ffffffff81484419>] nf_iterate+0x69/0xb0
<4>[67096.760255] [<ffffffff814b5b00>] ? dst_output+0x0/0x20
<4>[67096.760255] [<ffffffff814845d4>] nf_hook_slow+0x74/0x110
<4>[67096.760255] [<ffffffff814b5b00>] ? dst_output+0x0/0x20
<4>[67096.760255] [<ffffffff814b66d5>] raw_sendmsg+0x775/0x910
<4>[67096.760255] [<ffffffff8104c5a8>] ? flush_tlb_others_ipi+0x128/0x130
<4>[67096.760255] [<ffffffff8100bc4e>] ? apic_timer_interrupt+0xe/0x20
<4>[67096.760255] [<ffffffff8100bc4e>] ? apic_timer_interrupt+0xe/0x20
<4>[67096.760255] [<ffffffff814c136a>] inet_sendmsg+0x4a/0xb0
<4>[67096.760255] [<ffffffff81444e93>] ? sock_sendmsg+0x13/0x140
<4>[67096.760255] [<ffffffff81444f97>] sock_sendmsg+0x117/0x140
<4>[67096.760255] [<ffffffff8102e299>] ? native_smp_send_reschedule+0x49/0x60
<4>[67096.760255] [<ffffffff81519beb>] ? _spin_unlock_bh+0x1b/0x20
<4>[67096.760255] [<ffffffff8109d930>] ? autoremove_wake_function+0x0/0x40
<4>[67096.760255] [<ffffffff814960f0>] ? do_ip_setsockopt+0x90/0xd80
<4>[67096.760255] [<ffffffff8100bc4e>] ? apic_timer_interrupt+0xe/0x20
<4>[67096.760255] [<ffffffff8100bc4e>] ? apic_timer_interrupt+0xe/0x20
<4>[67096.760255] [<ffffffff814457c9>] sys_sendto+0x139/0x190
<4>[67096.760255] [<ffffffff810efa77>] ? audit_syscall_entry+0x1d7/0x200
<4>[67096.760255] [<ffffffff810ef7c5>] ? __audit_syscall_exit+0x265/0x290
<4>[67096.760255] [<ffffffff81474daf>] compat_sys_socketcall+0x13f/0x210
<4>[67096.760255] [<ffffffff8104dea3>] ia32_sysret+0x0/0x5
Cc: Eric Dumazet <[email protected]>
Cc: Florian Westphal <[email protected]>
Cc: Cyrill Gorcunov <[email protected]>
Cc: Vasiliy Averin <[email protected]>
Signed-off-by: Andrey Vagin <[email protected]>
---
include/net/netfilter/nf_conntrack.h | 1 -
net/netfilter/nf_conntrack_core.c | 18 +++++++++++-------
net/netfilter/nf_conntrack_netlink.c | 2 +-
net/netfilter/nf_synproxy_core.c | 4 ++--
net/netfilter/xt_CT.c | 2 +-
5 files changed, 15 insertions(+), 12 deletions(-)
diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h
index 01ea6ee..d338316 100644
--- a/include/net/netfilter/nf_conntrack.h
+++ b/include/net/netfilter/nf_conntrack.h
@@ -243,7 +243,6 @@ void nf_ct_untracked_status_or(unsigned long bits);
void nf_ct_iterate_cleanup(struct net *net,
int (*iter)(struct nf_conn *i, void *data),
void *data, u32 portid, int report);
-void nf_conntrack_free(struct nf_conn *ct);
struct nf_conn *nf_conntrack_alloc(struct net *net, u16 zone,
const struct nf_conntrack_tuple *orig,
const struct nf_conntrack_tuple *repl,
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index b56e53b..c38cc74 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -198,6 +198,8 @@ clean_from_lists(struct nf_conn *ct)
nf_ct_remove_expectations(ct);
}
+static void nf_conntrack_free(struct nf_conn *ct);
+
static void
destroy_conntrack(struct nf_conntrack *nfct)
{
@@ -226,9 +228,8 @@ destroy_conntrack(struct nf_conntrack *nfct)
* too. */
nf_ct_remove_expectations(ct);
- /* We overload first tuple to link into unconfirmed or dying list.*/
- BUG_ON(hlist_nulls_unhashed(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode));
- hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
+ if (!hlist_nulls_unhashed(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode))
+ hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
NF_CT_STAT_INC(net, delete);
spin_unlock_bh(&nf_conntrack_lock);
@@ -772,18 +773,21 @@ struct nf_conn *nf_conntrack_alloc(struct net *net, u16 zone,
}
EXPORT_SYMBOL_GPL(nf_conntrack_alloc);
-void nf_conntrack_free(struct nf_conn *ct)
+static void nf_conntrack_free(struct nf_conn *ct)
{
struct net *net = nf_ct_net(ct);
+ /* A freed object has refcnt == 0, thats
+ * the golden rule for SLAB_DESTROY_BY_RCU
+ */
+ NF_CT_ASSERT(atomic_read(&ct->ct_general.use) == 0);
+
nf_ct_ext_destroy(ct);
nf_ct_ext_free(ct);
kmem_cache_free(net->ct.nf_conntrack_cachep, ct);
smp_mb__before_atomic_dec();
atomic_dec(&net->ct.count);
}
-EXPORT_SYMBOL_GPL(nf_conntrack_free);
-
/* Allocate a new conntrack: we return -ENOMEM if classification
failed due to stress. Otherwise it really is unclassifiable. */
@@ -835,7 +839,7 @@ init_conntrack(struct net *net, struct nf_conn *tmpl,
}
if (!l4proto->new(ct, skb, dataoff, timeouts)) {
- nf_conntrack_free(ct);
+ nf_ct_put(ct);
pr_debug("init conntrack: can't track with proto module\n");
return NULL;
}
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index 3e91ad3..fadd0f3 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -1732,7 +1732,7 @@ ctnetlink_create_conntrack(struct net *net, u16 zone,
err2:
rcu_read_unlock();
err1:
- nf_conntrack_free(ct);
+ nf_ct_put(ct);
return ERR_PTR(err);
}
diff --git a/net/netfilter/nf_synproxy_core.c b/net/netfilter/nf_synproxy_core.c
index 9858e3e..d12234c 100644
--- a/net/netfilter/nf_synproxy_core.c
+++ b/net/netfilter/nf_synproxy_core.c
@@ -381,7 +381,7 @@ static int __net_init synproxy_net_init(struct net *net)
err3:
free_percpu(snet->stats);
err2:
- nf_conntrack_free(ct);
+ nf_ct_put(ct);
err1:
return err;
}
@@ -390,7 +390,7 @@ static void __net_exit synproxy_net_exit(struct net *net)
{
struct synproxy_net *snet = synproxy_pernet(net);
- nf_conntrack_free(snet->tmpl);
+ nf_ct_put(snet->tmpl);
synproxy_proc_exit(net);
free_percpu(snet->stats);
}
diff --git a/net/netfilter/xt_CT.c b/net/netfilter/xt_CT.c
index da35ac0..da4edfe 100644
--- a/net/netfilter/xt_CT.c
+++ b/net/netfilter/xt_CT.c
@@ -237,7 +237,7 @@ out:
return 0;
err3:
- nf_conntrack_free(ct);
+ nf_ct_put(ct);
err2:
nf_ct_l3proto_module_put(par->family);
err1:
--
1.8.4.2
On Tue, Jan 14, 2014 at 09:35:48PM +0400, Andrey Vagin wrote:
> ----
> Eric and Florian, could you look at this patch. When you say,
> that it looks good, I will ask the user to validate it.
> I can't reorder these actions, because it's reproduced on a real host
> with real users. Thanks.
> ----
>
> nf_conntrack_free can't be called for a conntract with non-zero ref-counter,
> because it can race with nf_conntrack_find_get().
>
> A conntrack slab is created with SLAB_DESTROY_BY_RCU. Non-zero
> ref-conunter says that this conntrack is used now. So when we release a
> conntrack with non-zero counter, we break this assumption.
>
> CPU1 CPU2
> ____nf_conntrack_find()
> nf_ct_put()
> destroy_conntrack()
> ...
> init_conntrack
> __nf_conntrack_alloc (set use = 1)
> atomic_inc_not_zero(&ct->use) (use = 2)
> if (!l4proto->new(ct, skb, dataoff, timeouts))
> nf_conntrack_free(ct); (use = 2 !!!)
> ...
> __nf_conntrack_alloc (set use = 1)
> if (!nf_ct_key_equal(h, tuple, zone))
> nf_ct_put(ct); (use = 0)
> destroy_conntrack()
> /* continue to work with CT */
Unless I missed something obvious, this looks like an entirely possible
scenario. Thanks!
Andrey Vagin <[email protected]> wrote:
> ----
> Eric and Florian, could you look at this patch. When you say,
> that it looks good, I will ask the user to validate it.
> I can't reorder these actions, because it's reproduced on a real host
> with real users. Thanks.
> ----
>
> nf_conntrack_free can't be called for a conntract with non-zero ref-counter,
> because it can race with nf_conntrack_find_get().
Indeed.
> A conntrack slab is created with SLAB_DESTROY_BY_RCU. Non-zero
> ref-conunter says that this conntrack is used now. So when we release a
> conntrack with non-zero counter, we break this assumption.
>
> CPU1 CPU2
> ____nf_conntrack_find()
> nf_ct_put()
> destroy_conntrack()
> ...
> init_conntrack
> __nf_conntrack_alloc (set use = 1)
> atomic_inc_not_zero(&ct->use) (use = 2)
> if (!l4proto->new(ct, skb, dataoff, timeouts))
> nf_conntrack_free(ct); (use = 2 !!!)
> ...
Yes, I think this sequence is possible; we must not use nf_conntrack_free here.
> - /* We overload first tuple to link into unconfirmed or dying list.*/
> - BUG_ON(hlist_nulls_unhashed(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode));
> - hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
> + if (!hlist_nulls_unhashed(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode))
> + hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
This is the only thing that I don't like about this patch. Currently
all the conntracks in the system are always put on a list (unconfirmed, hash,
or dying list) before they're supposed to be visible/handled via the refcnt
system.
I think it would be nice if we could keep it that way.
If all else fails, we could probably introduce a 'larval' dummy list
similar to the one used by template conntracks?
On Tue, Jan 14, 2014 at 07:53:29PM +0100, Florian Westphal wrote:
> Andrey Vagin <[email protected]> wrote:
> > ----
> > Eric and Florian, could you look at this patch. When you say,
> > that it looks good, I will ask the user to validate it.
> > I can't reorder these actions, because it's reproduced on a real host
> > with real users. Thanks.
> > ----
> >
> > nf_conntrack_free can't be called for a conntract with non-zero ref-counter,
> > because it can race with nf_conntrack_find_get().
>
> Indeed.
>
> > A conntrack slab is created with SLAB_DESTROY_BY_RCU. Non-zero
> > ref-conunter says that this conntrack is used now. So when we release a
> > conntrack with non-zero counter, we break this assumption.
> >
> > CPU1 CPU2
> > ____nf_conntrack_find()
> > nf_ct_put()
> > destroy_conntrack()
> > ...
> > init_conntrack
> > __nf_conntrack_alloc (set use = 1)
> > atomic_inc_not_zero(&ct->use) (use = 2)
> > if (!l4proto->new(ct, skb, dataoff, timeouts))
> > nf_conntrack_free(ct); (use = 2 !!!)
> > ...
>
> Yes, I think this sequence is possible; we must not use nf_conntrack_free here.
>
> > - /* We overload first tuple to link into unconfirmed or dying list.*/
> > - BUG_ON(hlist_nulls_unhashed(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode));
> > - hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
> > + if (!hlist_nulls_unhashed(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode))
> > + hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
>
> This is the only thing that I don't like about this patch. Currently
> all the conntracks in the system are always put on a list before they're
> supposed to be visible/handled via refcnt system (unconfirmed, hash, or
> dying list).
>
> I think it would be nice if we could keep it that way.
> If everything fails we could proably intoduce a 'larval' dummy list
> similar to the one used by template conntracks?
I'm not sure that this is required. Could you elaborate on when this can
be useful?
Right now I only see overhead, because we would need to take
nf_conntrack_lock to add the conntrack to a list.
Thanks,
Andrey
Andrew Vagin <[email protected]> wrote:
> > I think it would be nice if we could keep it that way.
> > If everything fails we could proably intoduce a 'larval' dummy list
> > similar to the one used by template conntracks?
>
> I'm not sure, that this is required. Could you elaborate when this can
> be useful?
You can dump the lists via ctnetlink. It's meant as a debugging aid in
case one suspects refcnt leaks.
Granted, in this situation there should be no leak since we put the newly
allocated entry in the error case.
> Now I see only overhead, because we need to take the nf_conntrack_lock
> lock to add conntrack in a list.
True. I don't have any strong preference; I guess I'd just do the insertion
into the unconfirmed list when we know we cannot track, to keep the "unhashed"
bug trap in the destroy function.
Pablo, any preference?
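For concreteness, the alternative Florian mentions would make init_conntrack()'s
error path look roughly like this (a sketch only, not a tested patch; it reuses
the unconfirmed-list insertion and locking that already exist in
nf_conntrack_core.c):

        if (!l4proto->new(ct, skb, dataoff, timeouts)) {
                /* Link the conntrack into the unconfirmed list before dropping
                 * the reference, so destroy_conntrack() can keep its
                 * "must be hashed on some list" BUG_ON() trap.
                 */
                spin_lock_bh(&nf_conntrack_lock);
                hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
                                         &net->ct.unconfirmed);
                spin_unlock_bh(&nf_conntrack_lock);
                nf_ct_put(ct);
                pr_debug("init conntrack: can't track with proto module\n");
                return NULL;
        }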
On Tue, Jan 14, 2014 at 09:35:48PM +0400, Andrey Vagin wrote:
> ----
> Eric and Florian, could you look at this patch. When you say,
> that it looks good, I will ask the user to validate it.
> I can't reorder these actions, because it's reproduced on a real host
> with real users. Thanks.
We haven't received any new reports from users in the last week, so these
patches fix the problem.
> ----
>
> nf_conntrack_free can't be called for a conntract with non-zero ref-counter,
> because it can race with nf_conntrack_find_get().
>
> A conntrack slab is created with SLAB_DESTROY_BY_RCU. Non-zero
> ref-conunter says that this conntrack is used now. So when we release a
> conntrack with non-zero counter, we break this assumption.
>
> CPU1 CPU2
> ____nf_conntrack_find()
> nf_ct_put()
> destroy_conntrack()
> ...
> init_conntrack
> __nf_conntrack_alloc (set use = 1)
> atomic_inc_not_zero(&ct->use) (use = 2)
> if (!l4proto->new(ct, skb, dataoff, timeouts))
> nf_conntrack_free(ct); (use = 2 !!!)
> ...
> __nf_conntrack_alloc (set use = 1)
> if (!nf_ct_key_equal(h, tuple, zone))
> nf_ct_put(ct); (use = 0)
> destroy_conntrack()
> /* continue to work with CT */
>
> After applying the path "[PATCH] netfilter: nf_conntrack: fix RCU race
> in nf_conntrack_find_get (v3)" another bug was triggered in
> destroy_conntrack():
> <4>[67096.759334] ------------[ cut here ]------------
> <2>[67096.759353] kernel BUG at net/netfilter/nf_conntrack_core.c:211!
> ...
> <4>[67096.759837] Pid: 498649, comm: atdd veid: 666 Tainted: G C --------------- 2.6.32-042stab084.18 #1 042stab084_18 /DQ45CB
> <4>[67096.759932] RIP: 0010:[<ffffffffa03d99ac>] [<ffffffffa03d99ac>] destroy_conntrack+0x15c/0x190 [nf_conntrack]
> <4>[67096.760255] Call Trace:
> <4>[67096.760255] [<ffffffff814844a7>] nf_conntrack_destroy+0x17/0x30
> <4>[67096.760255] [<ffffffffa03d9bb5>] nf_conntrack_find_get+0x85/0x130 [nf_conntrack]
> <4>[67096.760255] [<ffffffffa03d9fb2>] nf_conntrack_in+0x352/0xb60 [nf_conntrack]
> <4>[67096.760255] [<ffffffffa048c771>] ipv4_conntrack_local+0x51/0x60 [nf_conntrack_ipv4]
> <4>[67096.760255] [<ffffffff81484419>] nf_iterate+0x69/0xb0
> <4>[67096.760255] [<ffffffff814b5b00>] ? dst_output+0x0/0x20
> <4>[67096.760255] [<ffffffff814845d4>] nf_hook_slow+0x74/0x110
> <4>[67096.760255] [<ffffffff814b5b00>] ? dst_output+0x0/0x20
> <4>[67096.760255] [<ffffffff814b66d5>] raw_sendmsg+0x775/0x910
> <4>[67096.760255] [<ffffffff8104c5a8>] ? flush_tlb_others_ipi+0x128/0x130
> <4>[67096.760255] [<ffffffff8100bc4e>] ? apic_timer_interrupt+0xe/0x20
> <4>[67096.760255] [<ffffffff8100bc4e>] ? apic_timer_interrupt+0xe/0x20
> <4>[67096.760255] [<ffffffff814c136a>] inet_sendmsg+0x4a/0xb0
> <4>[67096.760255] [<ffffffff81444e93>] ? sock_sendmsg+0x13/0x140
> <4>[67096.760255] [<ffffffff81444f97>] sock_sendmsg+0x117/0x140
> <4>[67096.760255] [<ffffffff8102e299>] ? native_smp_send_reschedule+0x49/0x60
> <4>[67096.760255] [<ffffffff81519beb>] ? _spin_unlock_bh+0x1b/0x20
> <4>[67096.760255] [<ffffffff8109d930>] ? autoremove_wake_function+0x0/0x40
> <4>[67096.760255] [<ffffffff814960f0>] ? do_ip_setsockopt+0x90/0xd80
> <4>[67096.760255] [<ffffffff8100bc4e>] ? apic_timer_interrupt+0xe/0x20
> <4>[67096.760255] [<ffffffff8100bc4e>] ? apic_timer_interrupt+0xe/0x20
> <4>[67096.760255] [<ffffffff814457c9>] sys_sendto+0x139/0x190
> <4>[67096.760255] [<ffffffff810efa77>] ? audit_syscall_entry+0x1d7/0x200
> <4>[67096.760255] [<ffffffff810ef7c5>] ? __audit_syscall_exit+0x265/0x290
> <4>[67096.760255] [<ffffffff81474daf>] compat_sys_socketcall+0x13f/0x210
> <4>[67096.760255] [<ffffffff8104dea3>] ia32_sysret+0x0/0x5
>
> Cc: Eric Dumazet <[email protected]>
> Cc: Florian Westphal <[email protected]>
> Cc: Cyrill Gorcunov <[email protected]>
> Cc: Vasiliy Averin <[email protected]>
> Signed-off-by: Andrey Vagin <[email protected]>
> ---
> include/net/netfilter/nf_conntrack.h | 1 -
> net/netfilter/nf_conntrack_core.c | 18 +++++++++++-------
> net/netfilter/nf_conntrack_netlink.c | 2 +-
> net/netfilter/nf_synproxy_core.c | 4 ++--
> net/netfilter/xt_CT.c | 2 +-
> 5 files changed, 15 insertions(+), 12 deletions(-)
>
> diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h
> index 01ea6ee..d338316 100644
> --- a/include/net/netfilter/nf_conntrack.h
> +++ b/include/net/netfilter/nf_conntrack.h
> @@ -243,7 +243,6 @@ void nf_ct_untracked_status_or(unsigned long bits);
> void nf_ct_iterate_cleanup(struct net *net,
> int (*iter)(struct nf_conn *i, void *data),
> void *data, u32 portid, int report);
> -void nf_conntrack_free(struct nf_conn *ct);
> struct nf_conn *nf_conntrack_alloc(struct net *net, u16 zone,
> const struct nf_conntrack_tuple *orig,
> const struct nf_conntrack_tuple *repl,
> diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
> index b56e53b..c38cc74 100644
> --- a/net/netfilter/nf_conntrack_core.c
> +++ b/net/netfilter/nf_conntrack_core.c
> @@ -198,6 +198,8 @@ clean_from_lists(struct nf_conn *ct)
> nf_ct_remove_expectations(ct);
> }
>
> +static void nf_conntrack_free(struct nf_conn *ct);
> +
> static void
> destroy_conntrack(struct nf_conntrack *nfct)
> {
> @@ -226,9 +228,8 @@ destroy_conntrack(struct nf_conntrack *nfct)
> * too. */
> nf_ct_remove_expectations(ct);
>
> - /* We overload first tuple to link into unconfirmed or dying list.*/
> - BUG_ON(hlist_nulls_unhashed(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode));
> - hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
> + if (!hlist_nulls_unhashed(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode))
> + hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
>
> NF_CT_STAT_INC(net, delete);
> spin_unlock_bh(&nf_conntrack_lock);
> @@ -772,18 +773,21 @@ struct nf_conn *nf_conntrack_alloc(struct net *net, u16 zone,
> }
> EXPORT_SYMBOL_GPL(nf_conntrack_alloc);
>
> -void nf_conntrack_free(struct nf_conn *ct)
> +static void nf_conntrack_free(struct nf_conn *ct)
> {
> struct net *net = nf_ct_net(ct);
>
> + /* A freed object has refcnt == 0, thats
> + * the golden rule for SLAB_DESTROY_BY_RCU
> + */
> + NF_CT_ASSERT(atomic_read(&ct->ct_general.use) == 0);
> +
> nf_ct_ext_destroy(ct);
> nf_ct_ext_free(ct);
> kmem_cache_free(net->ct.nf_conntrack_cachep, ct);
> smp_mb__before_atomic_dec();
> atomic_dec(&net->ct.count);
> }
> -EXPORT_SYMBOL_GPL(nf_conntrack_free);
> -
>
> /* Allocate a new conntrack: we return -ENOMEM if classification
> failed due to stress. Otherwise it really is unclassifiable. */
> @@ -835,7 +839,7 @@ init_conntrack(struct net *net, struct nf_conn *tmpl,
> }
>
> if (!l4proto->new(ct, skb, dataoff, timeouts)) {
> - nf_conntrack_free(ct);
> + nf_ct_put(ct);
> pr_debug("init conntrack: can't track with proto module\n");
> return NULL;
> }
> diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
> index 3e91ad3..fadd0f3 100644
> --- a/net/netfilter/nf_conntrack_netlink.c
> +++ b/net/netfilter/nf_conntrack_netlink.c
> @@ -1732,7 +1732,7 @@ ctnetlink_create_conntrack(struct net *net, u16 zone,
> err2:
> rcu_read_unlock();
> err1:
> - nf_conntrack_free(ct);
> + nf_ct_put(ct);
> return ERR_PTR(err);
> }
>
> diff --git a/net/netfilter/nf_synproxy_core.c b/net/netfilter/nf_synproxy_core.c
> index 9858e3e..d12234c 100644
> --- a/net/netfilter/nf_synproxy_core.c
> +++ b/net/netfilter/nf_synproxy_core.c
> @@ -381,7 +381,7 @@ static int __net_init synproxy_net_init(struct net *net)
> err3:
> free_percpu(snet->stats);
> err2:
> - nf_conntrack_free(ct);
> + nf_ct_put(ct);
> err1:
> return err;
> }
> @@ -390,7 +390,7 @@ static void __net_exit synproxy_net_exit(struct net *net)
> {
> struct synproxy_net *snet = synproxy_pernet(net);
>
> - nf_conntrack_free(snet->tmpl);
> + nf_ct_put(snet->tmpl);
> synproxy_proc_exit(net);
> free_percpu(snet->stats);
> }
> diff --git a/net/netfilter/xt_CT.c b/net/netfilter/xt_CT.c
> index da35ac0..da4edfe 100644
> --- a/net/netfilter/xt_CT.c
> +++ b/net/netfilter/xt_CT.c
> @@ -237,7 +237,7 @@ out:
> return 0;
>
> err3:
> - nf_conntrack_free(ct);
> + nf_ct_put(ct);
> err2:
> nf_ct_l3proto_module_put(par->family);
> err1:
> --
> 1.8.4.2
>
On Sun, Jan 12, 2014 at 12:21:14PM -0800, Eric Dumazet wrote:
> On Sun, 2014-01-12 at 21:50 +0400, Andrey Vagin wrote:
> > Lets look at destroy_conntrack:
> >
> > hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
> > ...
> > nf_conntrack_free(ct)
> > kmem_cache_free(net->ct.nf_conntrack_cachep, ct);
> >
> > net->ct.nf_conntrack_cachep is created with SLAB_DESTROY_BY_RCU.
> >
> > The hash is protected by rcu, so readers look up conntracks without
> > locks.
> > A conntrack is removed from the hash, but in this moment a few readers
> > still can use the conntrack. Then this conntrack is released and another
> > thread creates conntrack with the same address and the equal tuple.
> > After this a reader starts to validate the conntrack:
> > * It's not dying, because a new conntrack was created
> > * nf_ct_tuple_equal() returns true.
> ...
>
>
> > v2: move nf_ct_is_confirmed into the unlikely() annotation
> > v3: Eric suggested to fix refcnt, so that it becomes zero before adding
> > in a hash, but we can't find a way how to do that. Another way is to
> > interpret the confirm bit as part of a search key and check it in
> > ____nf_conntrack_find() too.
> >
> > Cc: Eric Dumazet <[email protected]>
> > Cc: Florian Westphal <[email protected]>
> > Cc: Pablo Neira Ayuso <[email protected]>
> > Cc: Patrick McHardy <[email protected]>
> > Cc: Jozsef Kadlecsik <[email protected]>
> > Cc: "David S. Miller" <[email protected]>
> > Cc: Cyrill Gorcunov <[email protected]>
> > Signed-off-by: Andrey Vagin <[email protected]>
> > ---
>
> Acked-by: Eric Dumazet <[email protected]>
Applied, thanks everyone!
On Thu, Jan 16, 2014 at 10:23:01AM +0100, Florian Westphal wrote:
> Andrew Vagin <[email protected]> wrote:
> > > I think it would be nice if we could keep it that way.
> > > If everything fails we could proably intoduce a 'larval' dummy list
> > > similar to the one used by template conntracks?
> >
> > I'm not sure, that this is required. Could you elaborate when this can
> > be useful?
>
> You can dump the lists via ctnetlink. Its meant as a debugging aid in
> case one suspects refcnt leaks.
>
> Granted, in this situation there should be no leak since we put the newly
> allocated entry in the error case.
>
> > Now I see only overhead, because we need to take the nf_conntrack_lock
> > lock to add conntrack in a list.
>
> True. I don't have any preference, I guess I'd just do the insertion into the
> unconfirmed list when we know we cannot track to keep the "unhashed"
> bug trap in the destroy function.
>
> Pablo, any preference?
I think we can initially set the refcount to zero and bump it once the
conntrack gets into any of the lists, so Eric's golden rule also stands for
conntracks that are released via nf_conntrack_free() without ever being
inserted into any list.
My idea was to use the dying list to detect possible runtime leaks (i.e. a
missing nf_ct_put somewhere), not simple leaks in the initialization path;
as you said, it would add too much overhead to catch those with the dying
list, so we can skip them.
Please, let me know if you find any issue with this approach.
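Put differently, the refcount lifecycle this approach gives is roughly the
following (a condensed sketch; the actual changes are in the patch quoted in
the reply below):

/* __nf_conntrack_alloc(): the object starts with no reference at all */
atomic_set(&ct->ct_general.use, 0);

/* init_conntrack() / nf_conntrack_tmpl_insert(): the first reference is
 * taken only when the conntrack is linked into a list
 */
nf_conntrack_get(&ct->ct_general);
hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
                         &net->ct.unconfirmed);

/* nf_conntrack_free(): anything released before being put on a list is
 * therefore still at zero, matching the SLAB_DESTROY_BY_RCU rule
 */
NF_CT_ASSERT(atomic_read(&ct->ct_general.use) == 0);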
On Mon, Feb 03, 2014 at 12:30:46AM +0100, Pablo Neira Ayuso wrote:
> On Thu, Jan 16, 2014 at 10:23:01AM +0100, Florian Westphal wrote:
> > Andrew Vagin <[email protected]> wrote:
> > > > I think it would be nice if we could keep it that way.
> > > > If everything fails we could proably intoduce a 'larval' dummy list
> > > > similar to the one used by template conntracks?
> > >
> > > I'm not sure, that this is required. Could you elaborate when this can
> > > be useful?
> >
> > You can dump the lists via ctnetlink. Its meant as a debugging aid in
> > case one suspects refcnt leaks.
> >
> > Granted, in this situation there should be no leak since we put the newly
> > allocated entry in the error case.
> >
> > > Now I see only overhead, because we need to take the nf_conntrack_lock
> > > lock to add conntrack in a list.
> >
> > True. I don't have any preference, I guess I'd just do the insertion into the
> > unconfirmed list when we know we cannot track to keep the "unhashed"
> > bug trap in the destroy function.
> >
> > Pablo, any preference?
>
> I think we can initially set to zero the refcount and bump it once it
> gets into any of the lists, so Eric's golden rule also stands for
> conntracks that are released without being inserted in any list via
> nf_conntrack_free().
>
> My idea was to use dying list to detect possible runtime leaks (ie.
> missing nf_ct_put somewhere), not simple leaks the initialization
> path, as you said, it would add too much overhead to catch them with
> the dying list, so we can skip those.
>
> Please, let me know if you find any issue with this approach.
Hello Pablo,
I don't see any problem with this approach and I like it.
Thank you for the patch.
> diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h
> index 01ea6ee..b2ac624 100644
> --- a/include/net/netfilter/nf_conntrack.h
> +++ b/include/net/netfilter/nf_conntrack.h
> @@ -284,6 +284,8 @@ extern unsigned int nf_conntrack_max;
> extern unsigned int nf_conntrack_hash_rnd;
> void init_nf_conntrack_hash_rnd(void);
>
> +void nf_conntrack_tmpl_insert(struct net *net, struct nf_conn *tmpl);
> +
> #define NF_CT_STAT_INC(net, count) __this_cpu_inc((net)->ct.stat->count)
> #define NF_CT_STAT_INC_ATOMIC(net, count) this_cpu_inc((net)->ct.stat->count)
>
> diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
> index 4d1fb5d..bd5ec5a 100644
> --- a/net/netfilter/nf_conntrack_core.c
> +++ b/net/netfilter/nf_conntrack_core.c
> @@ -448,7 +448,8 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct)
> goto out;
>
> add_timer(&ct->timeout);
> - nf_conntrack_get(&ct->ct_general);
> + /* The caller holds a reference to this object */
> + atomic_set(&ct->ct_general.use, 2);
> __nf_conntrack_hash_insert(ct, hash, repl_hash);
> NF_CT_STAT_INC(net, insert);
> spin_unlock_bh(&nf_conntrack_lock);
> @@ -462,6 +463,21 @@ out:
> }
> EXPORT_SYMBOL_GPL(nf_conntrack_hash_check_insert);
>
> +/* deletion from this larval template list happens via nf_ct_put() */
> +void nf_conntrack_tmpl_insert(struct net *net, struct nf_conn *tmpl)
> +{
> + __set_bit(IPS_TEMPLATE_BIT, &tmpl->status);
> + __set_bit(IPS_CONFIRMED_BIT, &tmpl->status);
> + nf_conntrack_get(&tmpl->ct_general);
> +
> + spin_lock_bh(&nf_conntrack_lock);
> + /* Overload tuple linked list to put us in template list. */
> + hlist_nulls_add_head_rcu(&tmpl->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
> + &net->ct.tmpl);
> + spin_unlock_bh(&nf_conntrack_lock);
> +}
> +EXPORT_SYMBOL_GPL(nf_conntrack_tmpl_insert);
> +
> /* Confirm a connection given skb; places it in hash table */
> int
> __nf_conntrack_confirm(struct sk_buff *skb)
> @@ -733,11 +749,11 @@ __nf_conntrack_alloc(struct net *net, u16 zone,
> nf_ct_zone->id = zone;
> }
> #endif
> - /*
> - * changes to lookup keys must be done before setting refcnt to 1
> + /* Because we use RCU lookups, we set ct_general.use to zero before
> + * this is inserted in any list.
> */
> smp_wmb();
> - atomic_set(&ct->ct_general.use, 1);
> + atomic_set(&ct->ct_general.use, 0);
> return ct;
>
> #ifdef CONFIG_NF_CONNTRACK_ZONES
> @@ -761,6 +777,11 @@ void nf_conntrack_free(struct nf_conn *ct)
> {
> struct net *net = nf_ct_net(ct);
>
> + /* A freed object has refcnt == 0, thats
> + * the golden rule for SLAB_DESTROY_BY_RCU
> + */
> + NF_CT_ASSERT(atomic_read(&ct->ct_general.use) == 0);
> +
> nf_ct_ext_destroy(ct);
> nf_ct_ext_free(ct);
> kmem_cache_free(net->ct.nf_conntrack_cachep, ct);
> @@ -856,6 +877,9 @@ init_conntrack(struct net *net, struct nf_conn *tmpl,
> NF_CT_STAT_INC(net, new);
> }
>
> + /* Now it is inserted into the hashes, bump refcount */
> + nf_conntrack_get(&ct->ct_general);
> +
> /* Overload tuple linked list to put us in unconfirmed list. */
> hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
> &net->ct.unconfirmed);
> diff --git a/net/netfilter/nf_synproxy_core.c b/net/netfilter/nf_synproxy_core.c
> index 9858e3e..52e20c9 100644
> --- a/net/netfilter/nf_synproxy_core.c
> +++ b/net/netfilter/nf_synproxy_core.c
> @@ -363,9 +363,8 @@ static int __net_init synproxy_net_init(struct net *net)
> goto err2;
> if (!nfct_synproxy_ext_add(ct))
> goto err2;
> - __set_bit(IPS_TEMPLATE_BIT, &ct->status);
> - __set_bit(IPS_CONFIRMED_BIT, &ct->status);
>
> + nf_conntrack_tmpl_insert(net, ct);
> snet->tmpl = ct;
>
> snet->stats = alloc_percpu(struct synproxy_stats);
> @@ -390,7 +389,7 @@ static void __net_exit synproxy_net_exit(struct net *net)
> {
> struct synproxy_net *snet = synproxy_pernet(net);
>
> - nf_conntrack_free(snet->tmpl);
> + nf_ct_put(snet->tmpl);
> synproxy_proc_exit(net);
> free_percpu(snet->stats);
> }
> diff --git a/net/netfilter/xt_CT.c b/net/netfilter/xt_CT.c
> index 5929be6..75747ae 100644
> --- a/net/netfilter/xt_CT.c
> +++ b/net/netfilter/xt_CT.c
> @@ -228,12 +228,7 @@ static int xt_ct_tg_check(const struct xt_tgchk_param *par,
> goto err3;
> }
>
> - __set_bit(IPS_TEMPLATE_BIT, &ct->status);
> - __set_bit(IPS_CONFIRMED_BIT, &ct->status);
> -
> - /* Overload tuple linked list to put us in template list. */
> - hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
> - &par->net->ct.tmpl);
> + nf_conntrack_tmpl_insert(par->net, ct);
> out:
> info->ct = ct;
> return 0;
On Mon, 2014-02-03 at 00:30 +0100, Pablo Neira Ayuso wrote:
> */
> smp_wmb();
> - atomic_set(&ct->ct_general.use, 1);
> + atomic_set(&ct->ct_general.use, 0);
> return ct;
Hi Pablo !
I think your patch is the way to go, but it might need some extra care
with memory barriers.
I believe the smp_wmb() here is no longer needed.
If it's newly allocated memory, no other users can access ct;
if it's a recycled ct, its content is already 0 anyway.
After your patch, nf_conntrack_get(&tmpl->ct_general) should increment
an already non-zero refcnt, so no memory barrier is needed.
But one smp_wmb() is needed right before this point :
/* The caller holds a reference to this object */
atomic_set(&ct->ct_general.use, 2);
Thanks !
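For clarity, the placement Eric is describing in nf_conntrack_hash_check_insert()
would look roughly like this (a sketch based on the hunk quoted above, not the
final patch):

        add_timer(&ct->timeout);

        /* Make all prior stores to the conntrack (tuples, timeout, extensions)
         * visible before the non-zero refcount and the hash insertion make it
         * reachable to CPUs doing lockless RCU lookups.
         */
        smp_wmb();
        /* The caller holds a reference to this object */
        atomic_set(&ct->ct_general.use, 2);
        __nf_conntrack_hash_insert(ct, hash, repl_hash);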