Repetitive use of vmap/vunmap on the same address triggers this bug.
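The triggering pattern is roughly the loop below (hypothetical reproducer
sketch for illustration only, not the LTTng code; the page and iteration
counts are arbitrary):

/*
 * Map and unmap the same page in a tight loop. vunmap() frees the
 * vmap_area through the lazy purge path, which hands the structure to
 * call_rcu(); repeating this quickly is what exposes the problem.
 */
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/vmalloc.h>

static int __init vmap_loop_init(void)
{
	struct page *page = alloc_page(GFP_KERNEL);
	void *addr;
	int i;

	if (!page)
		return -ENOMEM;

	for (i = 0; i < 10000; i++) {
		addr = vmap(&page, 1, VM_MAP, PAGE_KERNEL);
		if (!addr)
			break;
		vunmap(addr);
	}
	__free_page(page);
	return 0;
}

static void __exit vmap_loop_exit(void)
{
}

module_init(vmap_loop_init);
module_exit(vmap_loop_exit);
MODULE_LICENSE("GPL");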
Simplest fix: directly kfree the data structure rather than doing it lazily.
Impact:
(-) slight slowdown on vunmap
(+) stop messing up rcu callback lists ;)
Caught it with DEBUG_RCU_HEAD, running LTTng armall/disarmall in loops.
Immediate values (with breakpoint-ipi scheme) are still using vunmap.
------------[ cut here ]------------
WARNING: at kernel/rcutree.c:1199 __call_rcu+0x181/0x190()
Hardware name: X7DAL
Modules linked in: loop ltt_statedump ltt_userspace_event ipc_tr]
Pid: 4527, comm: ltt-armall Not tainted 2.6.30.9-trace #30
Call Trace:
[<ffffffff8027b181>] ? __call_rcu+0x181/0x190
[<ffffffff8027b181>] ? __call_rcu+0x181/0x190
[<ffffffff8023a6a9>] ? warn_slowpath_common+0x79/0xd0
[<ffffffff802b4890>] ? rcu_free_va+0x0/0x10
[<ffffffff8027b181>] ? __call_rcu+0x181/0x190
[<ffffffff802b5298>] ? __purge_vmap_area_lazy+0x1a8/0x1e0
[<ffffffff802b59d4>] ? free_unmap_vmap_area_noflush+0x74/0x80
[<ffffffff802b5a0e>] ? remove_vm_area+0x2e/0x80
[<ffffffff802b5b35>] ? __vunmap+0x45/0xf0
[<ffffffffa002e826>] ? ltt_statedump_start+0x7b6/0x820 [ltt_sta]
[<ffffffff8068978a>] ? arch_imv_update+0x16a/0x2f0
[<ffffffff8027dd13>] ? imv_update_range+0x53/0xa0
[<ffffffff8026235b>] ? _module_imv_update+0x4b/0x60
[<ffffffff80262385>] ? module_imv_update+0x15/0x30
[<ffffffff80280049>] ? marker_probe_register+0x149/0xb90
[<ffffffff8040c8e0>] ? ltt_vtrace+0x0/0x8c0
[<ffffffff80409330>] ? ltt_marker_connect+0xd0/0x150
[<ffffffff8040f8dc>] ? marker_enable_write+0xec/0x120
[<ffffffff802c6cb8>] ? __dentry_open+0x268/0x350
[<ffffffff80280b2c>] ? marker_probe_cb+0x9c/0x170
[<ffffffff80280b2c>] ? marker_probe_cb+0x9c/0x170
[<ffffffff802c973b>] ? vfs_write+0xcb/0x170
[<ffffffff802c9984>] ? sys_write+0x64/0x130
[<ffffffff8020beeb>] ? system_call_fastpath+0x16/0x1b
---[ end trace ef92443e716fa199 ]---
------------[ cut here ]------------
Signed-off-by: Mathieu Desnoyers <[email protected]>
CC: [email protected]
CC: [email protected]
CC: "Paul E. McKenney" <[email protected]>
CC: [email protected]
CC: [email protected]
---
mm/vmalloc.c | 18 ++----------------
1 file changed, 2 insertions(+), 16 deletions(-)
Index: linux-2.6-lttng/mm/vmalloc.c
===================================================================
--- linux-2.6-lttng.orig/mm/vmalloc.c 2009-10-06 09:37:04.000000000 -0400
+++ linux-2.6-lttng/mm/vmalloc.c 2009-10-06 09:37:56.000000000 -0400
@@ -417,13 +417,6 @@ overflow:
return va;
}
-static void rcu_free_va(struct rcu_head *head)
-{
- struct vmap_area *va = container_of(head, struct vmap_area, rcu_head);
-
- kfree(va);
-}
-
static void __free_vmap_area(struct vmap_area *va)
{
BUG_ON(RB_EMPTY_NODE(&va->rb_node));
@@ -431,7 +424,7 @@ static void __free_vmap_area(struct vmap
RB_CLEAR_NODE(&va->rb_node);
list_del_rcu(&va->list);
- call_rcu(&va->rcu_head, rcu_free_va);
+ kfree(va);
}
/*
@@ -757,13 +750,6 @@ static struct vmap_block *new_vmap_block
return vb;
}
-static void rcu_free_vb(struct rcu_head *head)
-{
- struct vmap_block *vb = container_of(head, struct vmap_block, rcu_head);
-
- kfree(vb);
-}
-
static void free_vmap_block(struct vmap_block *vb)
{
struct vmap_block *tmp;
@@ -778,7 +764,7 @@ static void free_vmap_block(struct vmap_
BUG_ON(tmp != vb);
free_unmap_vmap_area_noflush(vb->va);
- call_rcu(&vb->rcu_head, rcu_free_vb);
+ kfree(vb);
}
static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
--
Mathieu Desnoyers
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68
* Mathieu Desnoyers ([email protected]) wrote:
> Repetitive use of vmap/vunmap on the same address triggers this bug.
>
> Simplest fix: directly kfree the data structure rather than doing it lazily.
>
I assumed that the call_rcu() in the vunmap() path is only used to postpone
the kfree() out of the hot path, but I would like confirmation (or refutation)
of that. I fear the delayed reclamation might be there for a reason. Are there
RCU read sides depending on this behavior?
If yes, then this patch is wrong, and we could change it to:
synchronize_rcu();
kfree(....);
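For illustration only, a sketch of what that alternative would look like in
__free_vmap_area() (leaving aside whether the calling context allows
sleeping in synchronize_rcu()):

static void __free_vmap_area(struct vmap_area *va)
{
	BUG_ON(RB_EMPTY_NODE(&va->rb_node));
	rb_erase(&va->rb_node, &vmap_area_root);
	RB_CLEAR_NODE(&va->rb_node);
	list_del_rcu(&va->list);

	/* Wait for any RCU readers still walking the vmap area list. */
	synchronize_rcu();
	kfree(va);
}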
Mathieu
> [...]
--
Mathieu Desnoyers
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68
On Tue, 6 Oct 2009, Mathieu Desnoyers wrote:
> Simplest fix: directly kfree the data structure rather than doing it lazily.
The delay is necessary as far as I can tell for performance reasons. But
Nick was the last one fiddling around with the subsystem as far as I
remember. CCing him. Maybe he has a minute to think about a fix that
preserves performance.
* Christoph Lameter ([email protected]) wrote:
> On Tue, 6 Oct 2009, Mathieu Desnoyers wrote:
>
> > Simplest fix: directly kfree the data structure rather than doing it lazily.
>
> The delay is necessary as far as I can tell for performance reasons. But
> Nick was the last one fiddling around with the subsystem as far as I
> remember. CCing him. Maybe he has a minute to think about a fix that
> preserves performance.
>
Thanks,
More info on the problem:
I added a check in alloc_vmap_area(), right after the allocation:
va = kmalloc_node(sizeof(struct vmap_area),
gfp_mask & GFP_RECLAIM_MASK, node);
WARN_ON(va->rcu_head.debug == LIST_POISON1);
(memory allocation debugging is off in my config)
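In context, the check sits right after the allocation (sketch for
illustration; rcu_head.debug is a field added by the out-of-tree call_rcu
debug patch, not a mainline field):

	va = kmalloc_node(sizeof(struct vmap_area),
			gfp_mask & GFP_RECLAIM_MASK, node);
	if (unlikely(!va))
		return ERR_PTR(-ENOMEM);

	/*
	 * Warn if the debug marker set by the call_rcu debug patch is still
	 * present, i.e. this memory was handed back by the allocator before
	 * its RCU callback (rcu_free_va, which does the kfree) executed.
	 */
	WARN_ON(va->rcu_head.debug == LIST_POISON1);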
I used this along with my new call_rcu debug patch to check whether the
memory is reallocated before the callback (which does the kfree) is
executed. It triggers this:
WARNING: at mm/vmalloc.c:343 alloc_vmap_area+0x305/0x340()
Hardware name: X7DAL
Modules linked in: loop ltt_statedump ltt_userspace_event ipc_tr]
Pid: 5584, comm: ltt-disarmall Not tainted 2.6.30.9-trace #41
Call Trace:
[<ffffffff802b8665>] ? alloc_vmap_area+0x305/0x340
[<ffffffff802b8665>] ? alloc_vmap_area+0x305/0x340
[<ffffffff8023cf29>] ? warn_slowpath_common+0x79/0xd0
[<ffffffff802b8665>] ? alloc_vmap_area+0x305/0x340
[<ffffffff80237670>] ? default_wake_function+0x0/0x10
[<ffffffff802b8769>] ? __get_vm_area_node+0xc9/0x1d0
[<ffffffff805ae965>] ? sys_getsockopt+0x85/0x160
[<ffffffff8040f160>] ? ltt_vtrace+0x0/0x8d0
[<ffffffff802b88dd>] ? get_vm_area_caller+0x2d/0x40
[<ffffffff8068bc3e>] ? arch_imv_update+0x10e/0x2f0
[<ffffffff802b93aa>] ? vmap+0x4a/0x80
[<ffffffff8068bc3e>] ? arch_imv_update+0x10e/0x2f0
[<ffffffff80283e6d>] ? get_tracepoint+0x25d/0x290
[<ffffffff80280b93>] ? imv_update_range+0x53/0xa0
[<ffffffff80284821>] ? tracepoint_update_probes+0x21/0x30
[<ffffffff802848b3>] ? tracepoint_probe_update_all+0x83/0x100
[<ffffffff80282ac1>] ? marker_update_probes+0x21/0x40
[<ffffffff8040f160>] ? ltt_vtrace+0x0/0x8d0
[<ffffffff80282d5d>] ? marker_probe_unregister+0xad/0x130
[<ffffffff8040b9dc>] ? ltt_marker_disconnect+0xdc/0x120
[<ffffffff804121d2>] ? marker_enable_write+0x112/0x120
[<ffffffff80688dd0>] ? _spin_unlock+0x10/0x30
[<ffffffff802e6b36>] ? mnt_drop_write+0x76/0x180
[<ffffffff802d96dd>] ? do_filp_open+0x2cd/0xa00
[<ffffffff804377c1>] ? tty_write+0x221/0x270
[<ffffffff802cc6cb>] ? vfs_write+0xcb/0x170
[<ffffffff802cc914>] ? sys_write+0x64/0x130
[<ffffffff8020beab>] ? system_call_fastpath+0x16/0x1b
---[ end trace 7ef506680d7a9e26 ]---
Could it be that:
static void __free_vmap_area(struct vmap_area *va)
{
BUG_ON(RB_EMPTY_NODE(&va->rb_node));
rb_erase(&va->rb_node, &vmap_area_root);
RB_CLEAR_NODE(&va->rb_node);
list_del_rcu(&va->list);
call_rcu(&va->rcu_head, rcu_free_va);
}
(especially the removal from the rb tree and the vmap_area list)
allows the node to be reallocated by kmalloc_node before its reclamation
(which only happens later, in rcu_free_va)?
Mathieu
--
Mathieu Desnoyers
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68