I thought we had sorted all the AUX issues, though interestingly this is
on a core2 system.
this is:
static void __rb_free_aux(struct ring_buffer *rb)
{
/*
* Should never happen, the last reference should be dropped from
* perf_mmap_close() path, which first stops aux transactions (which
* in turn are the atomic holders of aux_refcount) and then does the
* last rb_free_aux().
*/
WARN_ON_ONCE(in_atomic());
[87078.464463] WARNING: CPU: 1 PID: 19400 at kernel/events/ring_buffer.c:546 __rb_free_aux+0x40/0xe8
[87078.464466] CPU: 1 PID: 19400 Comm: perf_fuzzer Tainted: G W 4.8.0+ #209
[87078.464467] Hardware name: AOpen DE7000/nMCP7ALPx-DE R1.06 Oct.19.2012, BIOS 080015 10/19/2012
[87078.464468] ffff88011fc85b00c ffffffff812bc679c 0000000000000000c 0000000000000000c
[87078.464469] ffff88011fc85b40c ffffffff8104e0c8c 000002221fc85b98c ffff880119bf2700c
[87078.464470] ffff880119bf2700c 0000000000000000c 0000000000000001c 0000000000006108c
[87078.464470] Call Trace:
[87078.464471] <NMI> [<ffffffff812bc679>] dump_stack+0x4d/0x63
[87078.464472] [<ffffffff8104e0c8>] __warn+0xca/0xe5
[87078.464473] [<ffffffff8104e19f>] warn_slowpath_null+0x1d/0x1f
[87078.464473] [<ffffffff810fa055>] __rb_free_aux+0x40/0xe8
[87078.464474] [<ffffffff810fab48>] rb_free_aux+0x18/0x1a
[87078.464475] [<ffffffff810fad83>] perf_aux_output_end+0xca/0xd9
[87078.464475] [<ffffffff81009890>] intel_bts_interrupt+0xc4/0x11f
[87078.464476] [<ffffffff8100868c>] intel_pmu_handle_irq+0x75/0x3db
[87078.464477] [<ffffffff8158e910>] ? reboot_interrupt+0x90/0x90
[87078.464478] [<ffffffff810f93db>] ? perf_trace_run_bpf_submit+0x7b/0x8c
[87078.464479] [<ffffffff810f93db>] ? perf_trace_run_bpf_submit+0x7b/0x8c
[87078.464479] [<ffffffff81035a30>] ? __default_send_IPI_shortcut+0x39/0x40
[87078.464480] [<ffffffff81002ecd>] perf_event_nmi_handler+0x2c/0x45
[87078.464481] [<ffffffff810e72e7>] ? irq_work_queue+0x61/0x6d
[87078.464482] [<ffffffff8101ee54>] ? native_sched_clock+0x1f/0x3a
[87078.464482] [<ffffffff8101ee54>] ? native_sched_clock+0x1f/0x3a
[87078.464483] [<ffffffff81002ecd>] ? perf_event_nmi_handler+0x2c/0x45
[87078.464484] [<ffffffff8101a9a7>] nmi_handle+0x54/0xfa
[87078.464485] [<ffffffff81036903>] ? nmi_raise_cpu_backtrace+0x1d/0x1d
[87078.464485] [<ffffffff8101ac14>] default_do_nmi+0x47/0xe0
[87078.464486] [<ffffffff8101ad45>] do_nmi+0x98/0x10b
[87078.464487] [<ffffffff8158fd27>] end_repeat_nmi+0x1a/0x1e
[87078.464488] [<ffffffff81073dbc>] ? can_migrate_task+0x11c/0x11c
[87078.464488] [<ffffffff81073dbc>] ? can_migrate_task+0x11c/0x11c
[87078.464489] [<ffffffff81073dbc>] ? can_migrate_task+0x11c/0x11c
[87078.464490] <EOE> <IRQ> [<ffffffff81076439>] ? enqueue_task_fair+0x3f5/0x46b
[87078.464491] [<ffffffff8106dcaa>] enqueue_task+0x4f/0x54
[87078.464491] [<ffffffff8106dcaa>] ? enqueue_task+0x4f/0x54
[87078.464492] [<ffffffff8106e1e9>] activate_task+0x2c/0x2e
[87078.464493] [<ffffffff8106e3f0>] ttwu_do_activate+0x43/0x75
[87078.464494] [<ffffffff8106f1dd>] try_to_wake_up+0x1da/0x204
[87078.464494] [<ffffffff8106f21c>] wake_up_process+0x15/0x17
[87078.464495] [<ffffffff8106022e>] wake_up_worker+0x28/0x2a
[87078.464496] [<ffffffff81060835>] insert_work+0x65/0x6e
[87078.464496] [<ffffffff81060d29>] __queue_work+0x2c8/0x302
[87078.464497] [<ffffffff81074589>] ? update_cfs_rq_load_avg+0x310/0x349
[87078.464498] [<ffffffff8106157b>] queue_work_on+0x38/0x47
[87078.464499] [<ffffffff8147d077>] dbs_irq_work+0x23/0x25
[87078.464499] [<ffffffff810e7234>] irq_work_run_list+0x43/0x61
[87078.464500] [<ffffffff810f091a>] ? perf_event_sysfs_show+0x22/0x22
[87078.464501] [<ffffffff810e726a>] irq_work_run+0x18/0x34
[87078.464502] [<ffffffff810ac24c>] flush_smp_call_function_queue+0xd0/0x134
[87078.464502] [<ffffffff810ac957>] generic_smp_call_function_single_interrupt+0x13/0x4c
[87078.464503] [<ffffffff81032a09>] smp_call_function_single_interrupt+0x13/0x21
[87078.464504] [<ffffffff8158f04f>] call_function_single_interrupt+0x7f/0x90
[87078.464505] [<ffffffff81052951>] ? __do_softirq+0x7d/0x265
[87078.464506] [<ffffffff810a7ab6>] ? tick_program_event+0x62/0x69
[87078.464506] [<ffffffff81052cce>] irq_exit+0x52/0x93
[87078.464507] [<ffffffff81034bd6>] smp_apic_timer_interrupt+0x2e/0x39
[87078.464508] [<ffffffff8158e98f>] apic_timer_interrupt+0x7f/0x90
[87078.464508] <EOI> 4---[ end trace 26fda82430e3fa2e ]---
Vince Weaver <[email protected]> writes:
> I thought we had sorted all the AUX issues, though interestingly this is
> on a core2 system.
>
> this is:
>
> static void __rb_free_aux(struct ring_buffer *rb)
> {
>
> /*
> * Should never happen, the last reference should be dropped from
> * perf_mmap_close() path, which first stops aux transactions (which
> * in turn are the atomic holders of aux_refcount) and then does the
> * last rb_free_aux().
> */
> WARN_ON_ONCE(in_atomic());
>
>
> [87078.464463] WARNING: CPU: 1 PID: 19400 at kernel/events/ring_buffer.c:546 __rb_free_aux+0x40/0xe8
> [87078.464466] CPU: 1 PID: 19400 Comm: perf_fuzzer Tainted: G W 4.8.0+ #209
> [87078.464467] Hardware name: AOpen DE7000/nMCP7ALPx-DE R1.06 Oct.19.2012, BIOS 080015 10/19/2012
> [87078.464468] ffff88011fc85b00c ffffffff812bc679c 0000000000000000c 0000000000000000c
> [87078.464469] ffff88011fc85b40c ffffffff8104e0c8c 000002221fc85b98c ffff880119bf2700c
> [87078.464470] ffff880119bf2700c 0000000000000000c 0000000000000001c 0000000000006108c
> [87078.464470] Call Trace:
> [87078.464471] <NMI> [<ffffffff812bc679>] dump_stack+0x4d/0x63
> [87078.464472] [<ffffffff8104e0c8>] __warn+0xca/0xe5
> [87078.464473] [<ffffffff8104e19f>] warn_slowpath_null+0x1d/0x1f
> [87078.464473] [<ffffffff810fa055>] __rb_free_aux+0x40/0xe8
> [87078.464474] [<ffffffff810fab48>] rb_free_aux+0x18/0x1a
> [87078.464475] [<ffffffff810fad83>] perf_aux_output_end+0xca/0xd9
> [87078.464475] [<ffffffff81009890>] intel_bts_interrupt+0xc4/0x11f
> [87078.464476] [<ffffffff8100868c>] intel_pmu_handle_irq+0x75/0x3db
Yeah, this really shouldn't be happening. How reproducible is this? Any
clues that may help me reproduce it?
Meanwhile I'll stare at the code some more.
Regards,
--
Alex
On Thu, 10 Nov 2016, Alexander Shishkin wrote:
> Yeah, this really shouldn't be happening. How reproducible is this? Any
> clues that may help me reproduce it?
I'll see if I can reproduce it.
I'm currently stuck on most of my machines as current git since
4.9-rc1 won't boot due to the kbuild/modversions regression.
Vince
On Thu, 10 Nov 2016, Alexander Shishkin wrote:
> Yeah, this really shouldn't be happening. How reproducible is this? Any
> clues that may help me reproduce it?
I hit this again on the skylake machine, but it takes a while (many days
of fuzzing).
Also maybe related, it hits this WARN_ONCE in perf_mmap_close() first
/* this has to be the last one */
rb_free_aux(rb);
WARN_ON_ONCE(atomic_read(&rb->aux_refcount));
[122026.776273] ------------[ cut here ]------------
[122026.785852] WARNING: CPU: 0 PID: 6027 at kernel/events/core.c:5004 perf_mmap_close+0x45c/0x470
[122026.974275] CPU: 0 PID: 6027 Comm: perf_fuzzer Tainted: G L 4.9.0-rc5+ #11
[122026.989250] Hardware name: LENOVO 10FY0017US/SKYBAY, BIOS FWKT53A 06/06/2016
[122027.002786] ffff8801e8117d50 ffffffffa9ebe748 0000000000000000 0000000000000000
[122027.017321] ffff8801e8117d98 ffffffffa9ab6ee7 ffff8801f134e040 0000138ce8117de8
[122027.031891] ffff8801dd805200 ffff8801ebe62580 ffff8801eec68610 ffff8801ebe62808
[122027.046386] Call Trace:
[122027.051269] [<ffffffffa9ebe748>] dump_stack+0x63/0x8b
[122027.061116] [<ffffffffa9ab6ee7>] __warn+0x117/0x140
[122027.070598] [<ffffffffa9ab701d>] warn_slowpath_null+0x1d/0x20
[122027.081720] [<ffffffffa9c2e71c>] perf_mmap_close+0x45c/0x470
[122027.092646] [<ffffffffa9c229c0>] ? perf_iterate_ctx+0x280/0x280
[122027.307118] [<ffffffffa9c984a4>] remove_vma+0x54/0xc0
[122027.516151] [<ffffffffa9c9c617>] do_munmap+0x367/0x650
[122027.722320] [<ffffffffa9c9d17f>] SyS_munmap+0x5f/0x80
[122027.929219] [<ffffffffa9c9d120>] ? vm_munmap+0x70/0x70
[122028.136622] [<ffffffffa9a053f0>] do_syscall_64+0xe0/0x270
[122028.345615] [<ffffffffaa344e2f>] entry_SYSCALL64_slow_path+0x25/0x25
[122028.558691] ---[ end trace 806838d55bf2c098 ]---
followed a while later by
[150064.587195] ------------[ cut here ]------------
[150064.594626] WARNING: CPU: 0 PID: 0 at kernel/events/ring_buffer.c:546 __rb_free_aux+0x1cb/0x1d0
[150064.709271] CPU: 0 PID: 0 Comm: swapper/0 Tainted: G W L 4.9.0-rc5+ #11
[150064.719631] Hardware name: LENOVO 10FY0017US/SKYBAY, BIOS FWKT53A 06/06/2016
[150064.729680] ffff8801f5a07ce0 ffffffffa9ebe748 0000000000000000 0000000000000000
[150064.740027] ffff8801f5a07d28 ffffffffa9ab6ee7 ffffffffaa812540 00000222de0dca00
[150064.750380] ffff8801e50ed340 ffff8801f5a14740 ffff8801e50ed340 0000000000000000
[150064.760683] Call Trace:
[150064.765834] <IRQ> [150064.767909] [<ffffffffa9ebe748>] dump_stack+0x63/0x8b
[150064.775798] [<ffffffffa9ab6ee7>] __warn+0x117/0x140
[150064.783473] [<ffffffffa9ab701d>] warn_slowpath_null+0x1d/0x20
[150064.792064] [<ffffffffa9c31c6b>] __rb_free_aux+0x1cb/0x1d0
[150064.800362] [<ffffffffa9c331b8>] rb_free_aux+0x18/0x20
[150064.808266] [<ffffffffa9c3367a>] perf_aux_output_end+0x19a/0x220
[150064.817015] [<ffffffffa9a136d4>] bts_event_stop+0x104/0x1d0
[150064.825268] [<ffffffffa9a137b3>] bts_event_del+0x13/0x20
[150064.833160] [<ffffffffa9c26794>] event_sched_out.isra.91+0x1c4/0x5d0
[150064.842060] [<ffffffffa9a0725c>] ? x86_pmu_disable+0x6c/0x80
[150064.850208] [<ffffffffa9c26d9e>] group_sched_out+0x6e/0x110
[150064.858221] [<ffffffffa9c26f80>] __perf_event_disable+0x140/0x1b0
[150064.866696] [<ffffffffa9c1c79f>] event_function+0x10f/0x1e0
[150064.874605] [<ffffffffa9c26e40>] ? group_sched_out+0x110/0x110
[150064.882767] [<ffffffffa9c1c690>] ? cpu_clock_event_read+0x10/0x10
[150064.891144] [<ffffffffa9c1e880>] ? perf_cgroup_attach+0x70/0x70
[150064.899332] [<ffffffffa9c1e8f6>] remote_function+0x76/0xa0
[150064.907068] [<ffffffffa9b78973>] flush_smp_call_function_queue+0xb3/0x1d0
[150064.916149] [<ffffffffa9b79773>] generic_smp_call_function_single_interrupt+0x13/0x60
[150064.926282] [<ffffffffa9a748b0>] smp_call_function_single_interrupt+0x40/0x50
[150064.935716] [<ffffffffaa346f92>] call_function_single_interrupt+0x82/0x90
[150064.944835] <EOI> [150064.946918] [<ffffffffa9b213ea>] ? sched_idle_set_state+0x2a/0x40
[150064.955373] [<ffffffffaa1348a8>] ? cpuidle_enter_state+0x158/0x320
[150064.963900] [<ffffffffaa134ac7>] cpuidle_enter+0x17/0x20
[150064.971550] [<ffffffffa9b2138d>] call_cpuidle+0x4d/0x80
[150064.979131] [<ffffffffa9b21687>] cpu_startup_entry+0x1b7/0x290
[150064.987287] [<ffffffffaa33aca7>] rest_init+0x87/0x90
[150064.994610] [<ffffffffaa9613ad>] start_kernel+0x59c/0x5bf
[150065.002351] [<ffffffffaa960120>] ? early_idt_handler_array+0x120/0x120
[150065.011260] [<ffffffffaa9604b2>] x86_64_start_reservations+0x24/0x26
[150065.019993] [<ffffffffaa960600>] x86_64_start_kernel+0x14c/0x16f
[150065.028338] ---[ end trace 806838d55bf2c099 ]---