2015-08-03 21:08:46

by Dave Jones

[permalink] [raw]
Subject: 4.2-rc5 rcu stalls.

[ 2120.854974] INFO: rcu_preempt detected stalls on CPUs/tasks:
[ 2120.855128] Tasks blocked on level-0 rcu_node (CPUs 0-3): P1497
[ 2120.855263] (detected by 2, t=65002 jiffies, g=78835, c=78834, q=0)
[ 2120.855403] trinity-watchdo R running task 14336 1497 1496 0x00080000
[ 2120.855563] ffff8804b94e3c88 ffffffffa17fa0b0 ffff8805010a1b40 ffff8804f58b51c0
[ 2120.855728] ffff8805010a1b40 ffff8804b94e3c78 0000000000000000 ffff8804b94e4000
[ 2120.855893] 0000000000000001 0000000000000001 0000000000000002 ffff8804b94e3ca8
[ 2120.856062] Call Trace:
[ 2120.856116] [<ffffffffa17fa0b0>] ? preempt_schedule_irq+0x40/0xa0
[ 2120.856252] [<ffffffffa17fa0b6>] preempt_schedule_irq+0x46/0xa0
[ 2120.856387] [<ffffffffa1801129>] retint_kernel+0x1b/0x2d
[ 2120.856507] [<ffffffffa10ad5e1>] ? get_parent_ip+0x11/0x50
[ 2120.856627] [<ffffffffa10ad5e1>] ? get_parent_ip+0x11/0x50
[ 2120.856752] [<ffffffffa10d675e>] ? lock_release+0x33e/0x700
[ 2120.856878] [<ffffffffa108be29>] ? group_send_sig_info+0x79/0x120
[ 2120.857015] [<ffffffffa10ad5e1>] ? get_parent_ip+0x11/0x50
[ 2120.857137] [<ffffffffa108c07b>] ? kill_pid_info+0x9b/0x150
[ 2120.857259] [<ffffffffa108c098>] kill_pid_info+0xb8/0x150
[ 2120.857380] [<ffffffffa108c000>] ? kill_pid_info+0x20/0x150
[ 2120.857508] [<ffffffffa108c244>] SYSC_kill+0xf4/0x2b0
[ 2120.857620] [<ffffffffa108c1ed>] ? SYSC_kill+0x9d/0x2b0
[ 2120.857736] [<ffffffffa10d354b>] ? trace_hardirqs_on_caller+0x14b/0x1e0
[ 2120.857882] [<ffffffffa10d35ed>] ? trace_hardirqs_on+0xd/0x10
[ 2120.858012] [<ffffffffa1192ed3>] ? context_tracking_user_exit+0x13/0x20
[ 2120.858161] [<ffffffffa1012db7>] ? syscall_trace_enter_phase1+0xf7/0x150
[ 2120.858314] [<ffffffffa1001017>] ? trace_hardirqs_on_thunk+0x17/0x19
[ 2120.858456] [<ffffffffa108e64e>] SyS_kill+0xe/0x10
[ 2120.858563] [<ffffffffa1800597>] entry_SYSCALL_64_fastpath+0x12/0x6f
[ 2120.858706] trinity-watchdo R running task 14336 1497 1496 0x00080000
[ 2120.858863] ffff8804b94e3c88 ffffffffa17fa0b0 ffff8805010a1b40 ffff8804f58b51c0
[ 2120.859028] ffff8805010a1b40 ffff8804b94e3c78 0000000000000000 ffff8804b94e4000
[ 2120.859194] 0000000000000001 0000000000000001 0000000000000002 ffff8804b94e3ca8
[ 2120.859358] Call Trace:
[ 2120.859410] [<ffffffffa17fa0b0>] ? preempt_schedule_irq+0x40/0xa0
[ 2120.859545] [<ffffffffa17fa0b6>] preempt_schedule_irq+0x46/0xa0
[ 2120.859676] [<ffffffffa1801129>] retint_kernel+0x1b/0x2d
[ 2120.859794] [<ffffffffa10ad5e1>] ? get_parent_ip+0x11/0x50
[ 2120.859916] [<ffffffffa10ad5e1>] ? get_parent_ip+0x11/0x50
[ 2120.860037] [<ffffffffa10d675e>] ? lock_release+0x33e/0x700
[ 2120.860163] [<ffffffffa108be29>] ? group_send_sig_info+0x79/0x120
[ 2120.860297] [<ffffffffa10ad5e1>] ? get_parent_ip+0x11/0x50
[ 2120.860422] [<ffffffffa108c07b>] ? kill_pid_info+0x9b/0x150
[ 2120.860548] [<ffffffffa108c098>] kill_pid_info+0xb8/0x150
[ 2120.860670] [<ffffffffa108c000>] ? kill_pid_info+0x20/0x150
[ 2120.860793] [<ffffffffa108c244>] SYSC_kill+0xf4/0x2b0
[ 2120.860906] [<ffffffffa108c1ed>] ? SYSC_kill+0x9d/0x2b0
[ 2120.861022] [<ffffffffa10d354b>] ? trace_hardirqs_on_caller+0x14b/0x1e0
[ 2120.861172] [<ffffffffa10d35ed>] ? trace_hardirqs_on+0xd/0x10
[ 2120.861301] [<ffffffffa1192ed3>] ? context_tracking_user_exit+0x13/0x20
[ 2120.861449] [<ffffffffa1012db7>] ? syscall_trace_enter_phase1+0xf7/0x150
[ 2120.866641] [<ffffffffa1001017>] ? trace_hardirqs_on_thunk+0x17/0x19
[ 2120.871850] [<ffffffffa108e64e>] SyS_kill+0xe/0x10
[ 2120.877038] [<ffffffffa1800597>] entry_SYSCALL_64_fastpath+0x12/0x6f
[ 2316.020180] INFO: rcu_preempt detected stalls on CPUs/tasks:
[ 2316.025395] Tasks blocked on level-0 rcu_node (CPUs 0-3): P1497
[ 2316.030662] (detected by 2, t=260007 jiffies, g=78835, c=78834, q=0)
[ 2316.035890] trinity-watchdo R running task 14336 1497 1496 0x00080000
[ 2316.041153] ffff8804b94e3c88 ffffffffa17fa0b0 ffff8805010a1b40 ffff8800cd2db680
[ 2316.046552] ffff8805010a1b40 ffff8804b94e3c78 0000000000000000 ffff8804b94e4000
[ 2316.052039] 0000000000000001 0000000000000001 0000000000000002 ffff8804b94e3ca8
[ 2316.057560] Call Trace:
[ 2316.063066] [<ffffffffa17fa0b0>] ? preempt_schedule_irq+0x40/0xa0
[ 2316.068594] [<ffffffffa17fa0b6>] preempt_schedule_irq+0x46/0xa0
[ 2316.074034] [<ffffffffa1801129>] retint_kernel+0x1b/0x2d
[ 2316.079419] [<ffffffffa10ad5e1>] ? get_parent_ip+0x11/0x50
[ 2316.084778] [<ffffffffa10ad5e1>] ? get_parent_ip+0x11/0x50
[ 2316.090041] [<ffffffffa10d675e>] ? lock_release+0x33e/0x700
[ 2316.095219] [<ffffffffa108be29>] ? group_send_sig_info+0x79/0x120
[ 2316.100359] [<ffffffffa10ad5e1>] ? get_parent_ip+0x11/0x50
[ 2316.105489] [<ffffffffa108c07b>] ? kill_pid_info+0x9b/0x150
[ 2316.110584] [<ffffffffa108c098>] kill_pid_info+0xb8/0x150
[ 2316.115594] [<ffffffffa108c000>] ? kill_pid_info+0x20/0x150
[ 2316.120568] [<ffffffffa108c244>] SYSC_kill+0xf4/0x2b0
[ 2316.125476] [<ffffffffa108c1ed>] ? SYSC_kill+0x9d/0x2b0
[ 2316.130391] [<ffffffffa10d354b>] ? trace_hardirqs_on_caller+0x14b/0x1e0
[ 2316.135348] [<ffffffffa10d35ed>] ? trace_hardirqs_on+0xd/0x10
[ 2316.140351] [<ffffffffa1192ed3>] ? context_tracking_user_exit+0x13/0x20
[ 2316.145399] [<ffffffffa1012db7>] ? syscall_trace_enter_phase1+0xf7/0x150
[ 2316.150413] [<ffffffffa1001017>] ? trace_hardirqs_on_thunk+0x17/0x19
[ 2316.155461] [<ffffffffa108e64e>] SyS_kill+0xe/0x10
[ 2316.160499] [<ffffffffa1800597>] entry_SYSCALL_64_fastpath+0x12/0x6f
[ 2316.165609] trinity-watchdo R running task 14336 1497 1496 0x00080000
[ 2316.170851] ffff8804b94e3c88 ffffffffa17fa0b0 ffff8805010a1b40 ffff8800cd2db680
[ 2316.176131] ffff8805010a1b40 ffff8804b94e3c78 0000000000000000 ffff8804b94e4000
[ 2316.181443] 0000000000000001 0000000000000001 0000000000000002 ffff8804b94e3ca8
[ 2316.186735] Call Trace:
[ 2316.192021] [<ffffffffa17fa0b0>] ? preempt_schedule_irq+0x40/0xa0
[ 2316.197426] [<ffffffffa17fa0b6>] preempt_schedule_irq+0x46/0xa0
[ 2316.202818] [<ffffffffa1801129>] retint_kernel+0x1b/0x2d
[ 2316.208181] [<ffffffffa10ad5e1>] ? get_parent_ip+0x11/0x50
[ 2316.213541] [<ffffffffa10ad5e1>] ? get_parent_ip+0x11/0x50
[ 2316.218823] [<ffffffffa10d675e>] ? lock_release+0x33e/0x700
[ 2316.224042] [<ffffffffa108be29>] ? group_send_sig_info+0x79/0x120
[ 2316.229215] [<ffffffffa10ad5e1>] ? get_parent_ip+0x11/0x50
[ 2316.234346] [<ffffffffa108c07b>] ? kill_pid_info+0x9b/0x150
[ 2316.239491] [<ffffffffa108c098>] kill_pid_info+0xb8/0x150
[ 2316.244538] [<ffffffffa108c000>] ? kill_pid_info+0x20/0x150
[ 2316.249525] [<ffffffffa108c244>] SYSC_kill+0xf4/0x2b0
[ 2316.254503] [<ffffffffa108c1ed>] ? SYSC_kill+0x9d/0x2b0
[ 2316.259492] [<ffffffffa10d354b>] ? trace_hardirqs_on_caller+0x14b/0x1e0
[ 2316.264565] [<ffffffffa10d35ed>] ? trace_hardirqs_on+0xd/0x10
[ 2316.269594] [<ffffffffa1192ed3>] ? context_tracking_user_exit+0x13/0x20
[ 2316.274681] [<ffffffffa1012db7>] ? syscall_trace_enter_phase1+0xf7/0x150
[ 2316.279772] [<ffffffffa1001017>] ? trace_hardirqs_on_thunk+0x17/0x19
[ 2316.284873] [<ffffffffa108e64e>] SyS_kill+0xe/0x10
[ 2316.289981] [<ffffffffa1800597>] entry_SYSCALL_64_fastpath+0x12/0x6f

some time later..

[66859.981825] INFO: rcu_preempt detected stalls on CPUs/tasks:
[66859.987582] Tasks blocked on level-0 rcu_node (CPUs 0-3): P20837
[66859.993375] (detected by 2, t=65013 jiffies, g=5129960, c=5129959, q=0)
[66859.999146] trinity-c255 R running task 13728 20837 20581 0x00080002
[66860.004980] ffff8804d1417cb8 ffffffffa17fa0b0 ffff8804e4dab680 ffff8804ba2951c0
[66860.010831] ffff8804e4dab680 ffff8804d1417ca8 0000000000000000 ffff8804d1418000
[66860.016666] ffff88045f49a440 ffff8805047a0620 ffff880480fdf528 ffff8804d1417cd8
[66860.022423] Call Trace:
[66860.028116] [<ffffffffa17fa0b0>] ? preempt_schedule_irq+0x40/0xa0
[66860.033912] [<ffffffffa17fa0b6>] preempt_schedule_irq+0x46/0xa0
[66860.039663] [<ffffffffa1801129>] retint_kernel+0x1b/0x2d
[66860.045352] [<ffffffffa10d6299>] ? lock_acquire+0xd9/0x260
[66860.051044] [<ffffffffa10ed3b1>] ? rcu_is_watching+0x1/0x60
[66860.056725] [<ffffffffa1212abc>] ? dput+0x10c/0x390
[66860.062377] [<ffffffffa12129d9>] ? dput+0x29/0x390
[66860.068103] [<ffffffffa11fb6f1>] __fput+0x181/0x200
[66860.073797] [<ffffffffa11fb576>] ? __fput+0x6/0x200
[66860.079444] [<ffffffffa11fb7be>] ____fput+0xe/0x10
[66860.085070] [<ffffffffa109dfdd>] task_work_run+0x8d/0xc0
[66860.090699] [<ffffffffa107cef6>] do_exit+0x416/0xc60
[66860.096304] [<ffffffffa1192ed3>] ? context_tracking_user_exit+0x13/0x20
[66860.101978] [<ffffffffa107ec64>] do_group_exit+0x54/0xe0
[66860.107684] [<ffffffffa107ed04>] SyS_exit_group+0x14/0x20
[66860.113360] [<ffffffffa1800597>] entry_SYSCALL_64_fastpath+0x12/0x6f


2015-08-03 21:37:29

by Paul E. McKenney

[permalink] [raw]
Subject: Re: 4.2-rc5 rcu stalls.

On Mon, Aug 03, 2015 at 05:08:35PM -0400, Dave Jones wrote:
> Content analysis details: (-2.9 points, 5.0 required)
>
> pts rule name description
> ---- ---------------------- --------------------------------------------------
> -1.0 ALL_TRUSTED Passed through trusted hosts only via SMTP
> -1.9 BAYES_00 BODY: Bayes spam probability is 0 to 1%
> [score: 0.0000]
> X-Authenticated-User: [email protected]
> X-ZLA-Header: unknown; 0
> X-ZLA-DetailInfo: BA=6.00003574; NDR=6.00000001; ZLA=6.00000002; ZF=6.00000004; ZB=6.00041153; ZH=6.00102541; ZP=6.00079473; ZU=6.00000001; UDB=6.00244655; UTC=2015-08-03 21:08:45
> x-cbid: 15080321-8236-0000-0000-00000DB1C0C1
> X-IBM-ISS-SpamDetectors: Score=0.405233; BY=0; FL=0; FP=0; FZ=0; HX=0; KW=0;
> PH=0; RB=0; SC=0.405233; ST=0; TS=0; UL=0; ISC=
> X-IBM-ISS-DetailInfo: BY=3.00004248; HX=3.00000236; KW=3.00000007;
> PH=3.00000003; SC=3.00000115; SDB=6.00568899; UDB=6.00244655; UTC=2015-08-03
> 21:08:47
> X-TM-AS-MML: disable
>
> [ 2120.854974] INFO: rcu_preempt detected stalls on CPUs/tasks:
> [ 2120.855128] Tasks blocked on level-0 rcu_node (CPUs 0-3): P1497
> [ 2120.855263] (detected by 2, t=65002 jiffies, g=78835, c=78834, q=0)
> [ 2120.855403] trinity-watchdo R running task 14336 1497 1496 0x00080000
> [ 2120.855563] ffff8804b94e3c88 ffffffffa17fa0b0 ffff8805010a1b40 ffff8804f58b51c0
> [ 2120.855728] ffff8805010a1b40 ffff8804b94e3c78 0000000000000000 ffff8804b94e4000
> [ 2120.855893] 0000000000000001 0000000000000001 0000000000000002 ffff8804b94e3ca8
> [ 2120.856062] Call Trace:
> [ 2120.856116] [<ffffffffa17fa0b0>] ? preempt_schedule_irq+0x40/0xa0
> [ 2120.856252] [<ffffffffa17fa0b6>] preempt_schedule_irq+0x46/0xa0

Sasha Levin (CCed) was having roughly similar stalls, and found that
reverting b30f0e3ffedf (sched/preempt: Optimize preemption operations
on __schedule() callers) made the stalls go away. Does that help in
your case?

Thanx, Paul

> [ 2120.856387] [<ffffffffa1801129>] retint_kernel+0x1b/0x2d
> [ 2120.856507] [<ffffffffa10ad5e1>] ? get_parent_ip+0x11/0x50
> [ 2120.856627] [<ffffffffa10ad5e1>] ? get_parent_ip+0x11/0x50
> [ 2120.856752] [<ffffffffa10d675e>] ? lock_release+0x33e/0x700
> [ 2120.856878] [<ffffffffa108be29>] ? group_send_sig_info+0x79/0x120
> [ 2120.857015] [<ffffffffa10ad5e1>] ? get_parent_ip+0x11/0x50
> [ 2120.857137] [<ffffffffa108c07b>] ? kill_pid_info+0x9b/0x150
> [ 2120.857259] [<ffffffffa108c098>] kill_pid_info+0xb8/0x150
> [ 2120.857380] [<ffffffffa108c000>] ? kill_pid_info+0x20/0x150
> [ 2120.857508] [<ffffffffa108c244>] SYSC_kill+0xf4/0x2b0
> [ 2120.857620] [<ffffffffa108c1ed>] ? SYSC_kill+0x9d/0x2b0
> [ 2120.857736] [<ffffffffa10d354b>] ? trace_hardirqs_on_caller+0x14b/0x1e0
> [ 2120.857882] [<ffffffffa10d35ed>] ? trace_hardirqs_on+0xd/0x10
> [ 2120.858012] [<ffffffffa1192ed3>] ? context_tracking_user_exit+0x13/0x20
> [ 2120.858161] [<ffffffffa1012db7>] ? syscall_trace_enter_phase1+0xf7/0x150
> [ 2120.858314] [<ffffffffa1001017>] ? trace_hardirqs_on_thunk+0x17/0x19
> [ 2120.858456] [<ffffffffa108e64e>] SyS_kill+0xe/0x10
> [ 2120.858563] [<ffffffffa1800597>] entry_SYSCALL_64_fastpath+0x12/0x6f
> [ 2120.858706] trinity-watchdo R running task 14336 1497 1496 0x00080000
> [ 2120.858863] ffff8804b94e3c88 ffffffffa17fa0b0 ffff8805010a1b40 ffff8804f58b51c0
> [ 2120.859028] ffff8805010a1b40 ffff8804b94e3c78 0000000000000000 ffff8804b94e4000
> [ 2120.859194] 0000000000000001 0000000000000001 0000000000000002 ffff8804b94e3ca8
> [ 2120.859358] Call Trace:
> [ 2120.859410] [<ffffffffa17fa0b0>] ? preempt_schedule_irq+0x40/0xa0
> [ 2120.859545] [<ffffffffa17fa0b6>] preempt_schedule_irq+0x46/0xa0
> [ 2120.859676] [<ffffffffa1801129>] retint_kernel+0x1b/0x2d
> [ 2120.859794] [<ffffffffa10ad5e1>] ? get_parent_ip+0x11/0x50
> [ 2120.859916] [<ffffffffa10ad5e1>] ? get_parent_ip+0x11/0x50
> [ 2120.860037] [<ffffffffa10d675e>] ? lock_release+0x33e/0x700
> [ 2120.860163] [<ffffffffa108be29>] ? group_send_sig_info+0x79/0x120
> [ 2120.860297] [<ffffffffa10ad5e1>] ? get_parent_ip+0x11/0x50
> [ 2120.860422] [<ffffffffa108c07b>] ? kill_pid_info+0x9b/0x150
> [ 2120.860548] [<ffffffffa108c098>] kill_pid_info+0xb8/0x150
> [ 2120.860670] [<ffffffffa108c000>] ? kill_pid_info+0x20/0x150
> [ 2120.860793] [<ffffffffa108c244>] SYSC_kill+0xf4/0x2b0
> [ 2120.860906] [<ffffffffa108c1ed>] ? SYSC_kill+0x9d/0x2b0
> [ 2120.861022] [<ffffffffa10d354b>] ? trace_hardirqs_on_caller+0x14b/0x1e0
> [ 2120.861172] [<ffffffffa10d35ed>] ? trace_hardirqs_on+0xd/0x10
> [ 2120.861301] [<ffffffffa1192ed3>] ? context_tracking_user_exit+0x13/0x20
> [ 2120.861449] [<ffffffffa1012db7>] ? syscall_trace_enter_phase1+0xf7/0x150
> [ 2120.866641] [<ffffffffa1001017>] ? trace_hardirqs_on_thunk+0x17/0x19
> [ 2120.871850] [<ffffffffa108e64e>] SyS_kill+0xe/0x10
> [ 2120.877038] [<ffffffffa1800597>] entry_SYSCALL_64_fastpath+0x12/0x6f
> [ 2316.020180] INFO: rcu_preempt detected stalls on CPUs/tasks:
> [ 2316.025395] Tasks blocked on level-0 rcu_node (CPUs 0-3): P1497
> [ 2316.030662] (detected by 2, t=260007 jiffies, g=78835, c=78834, q=0)
> [ 2316.035890] trinity-watchdo R running task 14336 1497 1496 0x00080000
> [ 2316.041153] ffff8804b94e3c88 ffffffffa17fa0b0 ffff8805010a1b40 ffff8800cd2db680
> [ 2316.046552] ffff8805010a1b40 ffff8804b94e3c78 0000000000000000 ffff8804b94e4000
> [ 2316.052039] 0000000000000001 0000000000000001 0000000000000002 ffff8804b94e3ca8
> [ 2316.057560] Call Trace:
> [ 2316.063066] [<ffffffffa17fa0b0>] ? preempt_schedule_irq+0x40/0xa0
> [ 2316.068594] [<ffffffffa17fa0b6>] preempt_schedule_irq+0x46/0xa0
> [ 2316.074034] [<ffffffffa1801129>] retint_kernel+0x1b/0x2d
> [ 2316.079419] [<ffffffffa10ad5e1>] ? get_parent_ip+0x11/0x50
> [ 2316.084778] [<ffffffffa10ad5e1>] ? get_parent_ip+0x11/0x50
> [ 2316.090041] [<ffffffffa10d675e>] ? lock_release+0x33e/0x700
> [ 2316.095219] [<ffffffffa108be29>] ? group_send_sig_info+0x79/0x120
> [ 2316.100359] [<ffffffffa10ad5e1>] ? get_parent_ip+0x11/0x50
> [ 2316.105489] [<ffffffffa108c07b>] ? kill_pid_info+0x9b/0x150
> [ 2316.110584] [<ffffffffa108c098>] kill_pid_info+0xb8/0x150
> [ 2316.115594] [<ffffffffa108c000>] ? kill_pid_info+0x20/0x150
> [ 2316.120568] [<ffffffffa108c244>] SYSC_kill+0xf4/0x2b0
> [ 2316.125476] [<ffffffffa108c1ed>] ? SYSC_kill+0x9d/0x2b0
> [ 2316.130391] [<ffffffffa10d354b>] ? trace_hardirqs_on_caller+0x14b/0x1e0
> [ 2316.135348] [<ffffffffa10d35ed>] ? trace_hardirqs_on+0xd/0x10
> [ 2316.140351] [<ffffffffa1192ed3>] ? context_tracking_user_exit+0x13/0x20
> [ 2316.145399] [<ffffffffa1012db7>] ? syscall_trace_enter_phase1+0xf7/0x150
> [ 2316.150413] [<ffffffffa1001017>] ? trace_hardirqs_on_thunk+0x17/0x19
> [ 2316.155461] [<ffffffffa108e64e>] SyS_kill+0xe/0x10
> [ 2316.160499] [<ffffffffa1800597>] entry_SYSCALL_64_fastpath+0x12/0x6f
> [ 2316.165609] trinity-watchdo R running task 14336 1497 1496 0x00080000
> [ 2316.170851] ffff8804b94e3c88 ffffffffa17fa0b0 ffff8805010a1b40 ffff8800cd2db680
> [ 2316.176131] ffff8805010a1b40 ffff8804b94e3c78 0000000000000000 ffff8804b94e4000
> [ 2316.181443] 0000000000000001 0000000000000001 0000000000000002 ffff8804b94e3ca8
> [ 2316.186735] Call Trace:
> [ 2316.192021] [<ffffffffa17fa0b0>] ? preempt_schedule_irq+0x40/0xa0
> [ 2316.197426] [<ffffffffa17fa0b6>] preempt_schedule_irq+0x46/0xa0
> [ 2316.202818] [<ffffffffa1801129>] retint_kernel+0x1b/0x2d
> [ 2316.208181] [<ffffffffa10ad5e1>] ? get_parent_ip+0x11/0x50
> [ 2316.213541] [<ffffffffa10ad5e1>] ? get_parent_ip+0x11/0x50
> [ 2316.218823] [<ffffffffa10d675e>] ? lock_release+0x33e/0x700
> [ 2316.224042] [<ffffffffa108be29>] ? group_send_sig_info+0x79/0x120
> [ 2316.229215] [<ffffffffa10ad5e1>] ? get_parent_ip+0x11/0x50
> [ 2316.234346] [<ffffffffa108c07b>] ? kill_pid_info+0x9b/0x150
> [ 2316.239491] [<ffffffffa108c098>] kill_pid_info+0xb8/0x150
> [ 2316.244538] [<ffffffffa108c000>] ? kill_pid_info+0x20/0x150
> [ 2316.249525] [<ffffffffa108c244>] SYSC_kill+0xf4/0x2b0
> [ 2316.254503] [<ffffffffa108c1ed>] ? SYSC_kill+0x9d/0x2b0
> [ 2316.259492] [<ffffffffa10d354b>] ? trace_hardirqs_on_caller+0x14b/0x1e0
> [ 2316.264565] [<ffffffffa10d35ed>] ? trace_hardirqs_on+0xd/0x10
> [ 2316.269594] [<ffffffffa1192ed3>] ? context_tracking_user_exit+0x13/0x20
> [ 2316.274681] [<ffffffffa1012db7>] ? syscall_trace_enter_phase1+0xf7/0x150
> [ 2316.279772] [<ffffffffa1001017>] ? trace_hardirqs_on_thunk+0x17/0x19
> [ 2316.284873] [<ffffffffa108e64e>] SyS_kill+0xe/0x10
> [ 2316.289981] [<ffffffffa1800597>] entry_SYSCALL_64_fastpath+0x12/0x6f
>
> some time later..
>
> [66859.981825] INFO: rcu_preempt detected stalls on CPUs/tasks:
> [66859.987582] Tasks blocked on level-0 rcu_node (CPUs 0-3): P20837
> [66859.993375] (detected by 2, t=65013 jiffies, g=5129960, c=5129959, q=0)
> [66859.999146] trinity-c255 R running task 13728 20837 20581 0x00080002
> [66860.004980] ffff8804d1417cb8 ffffffffa17fa0b0 ffff8804e4dab680 ffff8804ba2951c0
> [66860.010831] ffff8804e4dab680 ffff8804d1417ca8 0000000000000000 ffff8804d1418000
> [66860.016666] ffff88045f49a440 ffff8805047a0620 ffff880480fdf528 ffff8804d1417cd8
> [66860.022423] Call Trace:
> [66860.028116] [<ffffffffa17fa0b0>] ? preempt_schedule_irq+0x40/0xa0
> [66860.033912] [<ffffffffa17fa0b6>] preempt_schedule_irq+0x46/0xa0
> [66860.039663] [<ffffffffa1801129>] retint_kernel+0x1b/0x2d
> [66860.045352] [<ffffffffa10d6299>] ? lock_acquire+0xd9/0x260
> [66860.051044] [<ffffffffa10ed3b1>] ? rcu_is_watching+0x1/0x60
> [66860.056725] [<ffffffffa1212abc>] ? dput+0x10c/0x390
> [66860.062377] [<ffffffffa12129d9>] ? dput+0x29/0x390
> [66860.068103] [<ffffffffa11fb6f1>] __fput+0x181/0x200
> [66860.073797] [<ffffffffa11fb576>] ? __fput+0x6/0x200
> [66860.079444] [<ffffffffa11fb7be>] ____fput+0xe/0x10
> [66860.085070] [<ffffffffa109dfdd>] task_work_run+0x8d/0xc0
> [66860.090699] [<ffffffffa107cef6>] do_exit+0x416/0xc60
> [66860.096304] [<ffffffffa1192ed3>] ? context_tracking_user_exit+0x13/0x20
> [66860.101978] [<ffffffffa107ec64>] do_group_exit+0x54/0xe0
> [66860.107684] [<ffffffffa107ed04>] SyS_exit_group+0x14/0x20
> [66860.113360] [<ffffffffa1800597>] entry_SYSCALL_64_fastpath+0x12/0x6f
>

2015-08-03 21:55:46

by Dave Jones

[permalink] [raw]
Subject: Re: 4.2-rc5 rcu stalls.

On Mon, Aug 03, 2015 at 02:37:23PM -0700, Paul E. McKenney wrote:
> On Mon, Aug 03, 2015 at 05:08:35PM -0400, Dave Jones wrote:
> > [ 2120.855128] Tasks blocked on level-0 rcu_node (CPUs 0-3): P1497
> > [ 2120.855263] (detected by 2, t=65002 jiffies, g=78835, c=78834, q=0)
> > [ 2120.855403] trinity-watchdo R running task 14336 1497 1496 0x00080000
> > [ 2120.855563] ffff8804b94e3c88 ffffffffa17fa0b0 ffff8805010a1b40 ffff8804f58b51c0
> > [ 2120.855728] ffff8805010a1b40 ffff8804b94e3c78 0000000000000000 ffff8804b94e4000
> > [ 2120.855893] 0000000000000001 0000000000000001 0000000000000002 ffff8804b94e3ca8
> > [ 2120.856062] Call Trace:
> > [ 2120.856116] [<ffffffffa17fa0b0>] ? preempt_schedule_irq+0x40/0xa0
> > [ 2120.856252] [<ffffffffa17fa0b6>] preempt_schedule_irq+0x46/0xa0
>
> Sasha Levin (CCed) was having roughly similar stalls, and found that
> reverting b30f0e3ffedf (sched/preempt: Optimize preemption operations
> on __schedule() callers) made the stalls go away. Does that help in
> your case?

Ugh, that doesn't revert cleanly. Got something handy ?

The curious thing is why I've only just started seeing these.
I've had that machine for a month now, so it's been fuzzing since ~4.1,
and until yesterday I'd not seen this at all.

Dave

2015-08-03 22:04:06

by Paul E. McKenney

[permalink] [raw]
Subject: Re: 4.2-rc5 rcu stalls.

On Mon, Aug 03, 2015 at 05:55:35PM -0400, Dave Jones wrote:
> Content analysis details: (-2.9 points, 5.0 required)
>
> pts rule name description
> ---- ---------------------- --------------------------------------------------
> -1.0 ALL_TRUSTED Passed through trusted hosts only via SMTP
> -1.9 BAYES_00 BODY: Bayes spam probability is 0 to 1%
> [score: 0.0000]
> X-Authenticated-User: [email protected]
> X-ZLA-Header: unknown; 0
> X-ZLA-DetailInfo: BA=6.00003574; NDR=6.00000001; ZLA=6.00000002; ZF=6.00000004; ZB=6.00041153; ZH=6.00102541; ZP=6.00079473; ZU=6.00000001; UDB=6.00244665; UTC=2015-08-03 21:55:44
> x-cbid: 15080321-0049-0000-0000-0000023893EC
> X-IBM-ISS-SpamDetectors: Score=0.399202; BY=0; FL=0; FP=0; FZ=0; HX=0; KW=0;
> PH=0; RB=0; SC=0.399202; ST=0; TS=0; UL=0; ISC=
> X-IBM-ISS-DetailInfo: BY=3.00004248; HX=3.00000236; KW=3.00000007;
> PH=3.00000003; SC=3.00000115; SDB=6.00568916; UDB=6.00244665; UTC=2015-08-03
> 21:55:45
> X-TM-AS-MML: disable
>
> On Mon, Aug 03, 2015 at 02:37:23PM -0700, Paul E. McKenney wrote:
> > On Mon, Aug 03, 2015 at 05:08:35PM -0400, Dave Jones wrote:
> > > [ 2120.855128] Tasks blocked on level-0 rcu_node (CPUs 0-3): P1497
> > > [ 2120.855263] (detected by 2, t=65002 jiffies, g=78835, c=78834, q=0)
> > > [ 2120.855403] trinity-watchdo R running task 14336 1497 1496 0x00080000
> > > [ 2120.855563] ffff8804b94e3c88 ffffffffa17fa0b0 ffff8805010a1b40 ffff8804f58b51c0
> > > [ 2120.855728] ffff8805010a1b40 ffff8804b94e3c78 0000000000000000 ffff8804b94e4000
> > > [ 2120.855893] 0000000000000001 0000000000000001 0000000000000002 ffff8804b94e3ca8
> > > [ 2120.856062] Call Trace:
> > > [ 2120.856116] [<ffffffffa17fa0b0>] ? preempt_schedule_irq+0x40/0xa0
> > > [ 2120.856252] [<ffffffffa17fa0b6>] preempt_schedule_irq+0x46/0xa0
> >
> > Sasha Levin (CCed) was having roughly similar stalls, and found that
> > reverting b30f0e3ffedf (sched/preempt: Optimize preemption operations
> > on __schedule() callers) made the stalls go away. Does that help in
> > your case?
>
> Ugh, that doesn't revert cleanly. Got something handy ?

I do not, but perhaps either Sasha or Frederic do.

> The curious thing is why I've only just started seeing these.
> I've had that machine for a month now, so it's been fuzzing since ~4.1,
> and until yesterday I'd not seen this at all.

It could well be a new problem. I just saw the preempt_schedule_irq()
and was reminded of the problem that Sasha found. ;-)

Thanx, Paul

2015-08-04 04:55:26

by Sasha Levin

[permalink] [raw]
Subject: Re: 4.2-rc5 rcu stalls.

On 08/03/2015 06:03 PM, Paul E. McKenney wrote:
>> > Ugh, that doesn't revert cleanly. Got something handy ?
> I do not, but perhaps either Sasha or Frederic do.

I've attached a revert courtesy of Peter.


Thanks,
Sasha


Attachments:
revert.patch (3.87 kB)

2015-08-05 00:13:11

by Dave Jones

[permalink] [raw]
Subject: Re: 4.2-rc5 rcu stalls.

On Tue, Aug 04, 2015 at 12:54:35AM -0400, Sasha Levin wrote:
> On 08/03/2015 06:03 PM, Paul E. McKenney wrote:
> >> > Ugh, that doesn't revert cleanly. Got something handy ?
> > I do not, but perhaps either Sasha or Frederic do.
>
> I've attached a revert courtesy of Peter.

Thanks. At first I thought this was doing the trick, but then I hit this again.


[23643.545873] INFO: rcu_preempt detected stalls on CPUs/tasks:
[23643.546031] Tasks blocked on level-0 rcu_node (CPUs 0-3): P31722
[23643.546173] (detected by 3, t=65002 jiffies, g=2256887, c=2256886, q=0)
[23643.546326] trinity-watchdo R running task 14336 31722 31721 0x00080000
[23643.546488] ffff8804fcfe7cc8 000000000000ded0 0000000000000002 ffff8804f58bb680
[23643.546661] ffff8800ce4951c0 ffff8804fcfe7cb8 ffff8804fcfe8000 ffff8804f6552608
[23643.546830] 0000000000000009 ffff8804fcfe7e88 0000000000000009 ffff8804fcfe7ce8
[23643.547001] Call Trace:
[23643.547058] [<ffffffff887fa2b2>] preempt_schedule_common+0x22/0x40
[23643.547201] [<ffffffff887fa2ef>] preempt_schedule+0x1f/0x30
[23643.547329] [<ffffffff88001058>] ___preempt_schedule+0x12/0x14
[23643.547465] [<ffffffff8808b76d>] ? do_send_sig_info+0x5d/0x80
[23643.547599] [<ffffffff887fff32>] ? _raw_spin_unlock_irqrestore+0x42/0x70
[23643.547753] [<ffffffff887fff50>] ? _raw_spin_unlock_irqrestore+0x60/0x70
[23643.547910] [<ffffffff8808b76d>] do_send_sig_info+0x5d/0x80
[23643.548039] [<ffffffff8808be62>] group_send_sig_info+0xb2/0x120
[23643.548175] [<ffffffff8808bdb5>] ? group_send_sig_info+0x5/0x120
[23643.548314] [<ffffffff880ea62f>] ? rcu_read_lock_held+0x4f/0x60
[23643.548451] [<ffffffff8808c05f>] kill_pid_info+0x7f/0x150
[23643.548576] [<ffffffff8808c000>] ? kill_pid_info+0x20/0x150
[23643.548705] [<ffffffff8808c244>] SYSC_kill+0xf4/0x2b0
[23643.548821] [<ffffffff8808c1ed>] ? SYSC_kill+0x9d/0x2b0
[23643.548942] [<ffffffff880d35cb>] ? trace_hardirqs_on_caller+0x14b/0x1e0
[23643.549097] [<ffffffff880d366d>] ? trace_hardirqs_on+0xd/0x10
[23643.549231] [<ffffffff88192f63>] ? context_tracking_user_exit+0x13/0x20
[23643.549387] [<ffffffff88012c47>] ? syscall_trace_enter_phase1+0xf7/0x150
[23643.549540] [<ffffffff88001017>] ? trace_hardirqs_on_thunk+0x17/0x19
[23643.549687] [<ffffffff8808e64e>] SyS_kill+0xe/0x10
[23643.549799] [<ffffffff88800997>] entry_SYSCALL_64_fastpath+0x12/0x6f
[23643.549946] trinity-watchdo R running task 14336 31722 31721 0x00080000
[23643.550106] ffff8804fcfe7cc8 000000000000ded0 0000000000000002 ffff8804f58bb680
[23643.550276] ffff8800ce4951c0 ffff8804fcfe7cb8 ffff8804fcfe8000 ffff8804f6552608
[23643.550446] 0000000000000009 ffff8804fcfe7e88 0000000000000009 ffff8804fcfe7ce8
[23643.550615] Call Trace:
[23643.550668] [<ffffffff887fa2b2>] preempt_schedule_common+0x22/0x40
[23643.550809] [<ffffffff887fa2ef>] preempt_schedule+0x1f/0x30
[23643.550935] [<ffffffff88001058>] ___preempt_schedule+0x12/0x14
[23643.551072] [<ffffffff8808b76d>] ? do_send_sig_info+0x5d/0x80
[23643.551204] [<ffffffff887fff32>] ? _raw_spin_unlock_irqrestore+0x42/0x70
[23643.551358] [<ffffffff887fff50>] ? _raw_spin_unlock_irqrestore+0x60/0x70
[23643.551515] [<ffffffff8808b76d>] do_send_sig_info+0x5d/0x80
[23643.551642] [<ffffffff8808be62>] group_send_sig_info+0xb2/0x120
[23643.551779] [<ffffffff8808bdb5>] ? group_send_sig_info+0x5/0x120
[23643.551915] [<ffffffff880ea62f>] ? rcu_read_lock_held+0x4f/0x60
[23643.557757] [<ffffffff8808c05f>] kill_pid_info+0x7f/0x150
[23643.563613] [<ffffffff8808c000>] ? kill_pid_info+0x20/0x150
[23643.569450] [<ffffffff8808c244>] SYSC_kill+0xf4/0x2b0
[23643.575270] [<ffffffff8808c1ed>] ? SYSC_kill+0x9d/0x2b0
[23643.581025] [<ffffffff880d35cb>] ? trace_hardirqs_on_caller+0x14b/0x1e0
[23643.586867] [<ffffffff880d366d>] ? trace_hardirqs_on+0xd/0x10
[23643.592620] [<ffffffff88192f63>] ? context_tracking_user_exit+0x13/0x20
[23643.598317] [<ffffffff88012c47>] ? syscall_trace_enter_phase1+0xf7/0x150
[23643.603903] [<ffffffff88001017>] ? trace_hardirqs_on_thunk+0x17/0x19
[23643.609404] [<ffffffff8808e64e>] SyS_kill+0xe/0x10
[23643.614788] [<ffffffff88800997>] entry_SYSCALL_64_fastpath+0x12/0x6f

Dave

2015-08-05 12:38:04

by Frederic Weisbecker

[permalink] [raw]
Subject: Re: 4.2-rc5 rcu stalls.

On Tue, Aug 04, 2015 at 08:12:50PM -0400, Dave Jones wrote:
> On Tue, Aug 04, 2015 at 12:54:35AM -0400, Sasha Levin wrote:
> > On 08/03/2015 06:03 PM, Paul E. McKenney wrote:
> > >> > Ugh, that doesn't revert cleanly. Got something handy ?
> > > I do not, but perhaps either Sasha or Frederic do.
> >
> > I've attached a revert courtesy of Peter.
>
> Thanks. At first I thought this was doing the trick, but then I hit this again.
>
>
> [23643.545873] INFO: rcu_preempt detected stalls on CPUs/tasks:
> [23643.546031] Tasks blocked on level-0 rcu_node (CPUs 0-3): P31722
> [23643.546173] (detected by 3, t=65002 jiffies, g=2256887, c=2256886, q=0)
> [23643.546326] trinity-watchdo R running task 14336 31722 31721 0x00080000
> [23643.546488] ffff8804fcfe7cc8 000000000000ded0 0000000000000002 ffff8804f58bb680
> [23643.546661] ffff8800ce4951c0 ffff8804fcfe7cb8 ffff8804fcfe8000 ffff8804f6552608
> [23643.546830] 0000000000000009 ffff8804fcfe7e88 0000000000000009 ffff8804fcfe7ce8
> [23643.547001] Call Trace:
> [23643.547058] [<ffffffff887fa2b2>] preempt_schedule_common+0x22/0x40
> [23643.547201] [<ffffffff887fa2ef>] preempt_schedule+0x1f/0x30
> [23643.547329] [<ffffffff88001058>] ___preempt_schedule+0x12/0x14
> [23643.547465] [<ffffffff8808b76d>] ? do_send_sig_info+0x5d/0x80
> [23643.547599] [<ffffffff887fff32>] ? _raw_spin_unlock_irqrestore+0x42/0x70
> [23643.547753] [<ffffffff887fff50>] ? _raw_spin_unlock_irqrestore+0x60/0x70
> [23643.547910] [<ffffffff8808b76d>] do_send_sig_info+0x5d/0x80
> [23643.548039] [<ffffffff8808be62>] group_send_sig_info+0xb2/0x120
> [23643.548175] [<ffffffff8808bdb5>] ? group_send_sig_info+0x5/0x120
> [23643.548314] [<ffffffff880ea62f>] ? rcu_read_lock_held+0x4f/0x60
> [23643.548451] [<ffffffff8808c05f>] kill_pid_info+0x7f/0x150
> [23643.548576] [<ffffffff8808c000>] ? kill_pid_info+0x20/0x150
> [23643.548705] [<ffffffff8808c244>] SYSC_kill+0xf4/0x2b0
> [23643.548821] [<ffffffff8808c1ed>] ? SYSC_kill+0x9d/0x2b0
> [23643.548942] [<ffffffff880d35cb>] ? trace_hardirqs_on_caller+0x14b/0x1e0
> [23643.549097] [<ffffffff880d366d>] ? trace_hardirqs_on+0xd/0x10
> [23643.549231] [<ffffffff88192f63>] ? context_tracking_user_exit+0x13/0x20
> [23643.549387] [<ffffffff88012c47>] ? syscall_trace_enter_phase1+0xf7/0x150
> [23643.549540] [<ffffffff88001017>] ? trace_hardirqs_on_thunk+0x17/0x19
> [23643.549687] [<ffffffff8808e64e>] SyS_kill+0xe/0x10
> [23643.549799] [<ffffffff88800997>] entry_SYSCALL_64_fastpath+0x12/0x6f

If it still happens after Sasha's revert, which basically revert all the offending
patches related to preempt lately, then the reason might be elsewhere.

How hard was it to reproduce? I see 23000 secs in your dmesg logs which is around 6 hours.

Also did you just launch trinity? no specific options?
I'll try to reproduce that.

Thanks.

2015-08-05 13:19:34

by Dave Jones

[permalink] [raw]
Subject: Re: 4.2-rc5 rcu stalls.

On Wed, Aug 05, 2015 at 02:37:59PM +0200, Frederic Weisbecker wrote:
> On Tue, Aug 04, 2015 at 08:12:50PM -0400, Dave Jones wrote:
> > On Tue, Aug 04, 2015 at 12:54:35AM -0400, Sasha Levin wrote:
> > > On 08/03/2015 06:03 PM, Paul E. McKenney wrote:
> > > >> > Ugh, that doesn't revert cleanly. Got something handy ?
> > > > I do not, but perhaps either Sasha or Frederic do.
> > >
> > > I've attached a revert courtesy of Peter.
> >
> > Thanks. At first I thought this was doing the trick, but then I hit this again.
> >
> >
> > [23643.545873] INFO: rcu_preempt detected stalls on CPUs/tasks:
>
> If it still happens after Sasha's revert, which basically revert all the offending
> patches related to preempt lately, then the reason might be elsewhere.
>
> How hard was it to reproduce? I see 23000 secs in your dmesg logs which is around 6 hours.

yeah. That's why I thought it had fixed it up until that point.
My subsequent overnight run hit a different bug (that unpinning an unpinned lock bug in the scheduler)
so I haven't had it happen since.

> Also did you just launch trinity? no specific options?

basically

while [ 1 ];
do
trinity -N 1000000 -q -l off -C256 -a64 -x fsync -x fdatasync -x syncfs -x sync -P INET --enable-fds=sockets
sudo ipcrm -a
done

(The ipcrm thing is needed for long runs or eventually you oom, because trinity lacks the cleanup smarts)

Dave

2015-08-05 14:38:18

by Frederic Weisbecker

[permalink] [raw]
Subject: Re: 4.2-rc5 rcu stalls.

On Wed, Aug 05, 2015 at 09:18:57AM -0400, Dave Jones wrote:
> On Wed, Aug 05, 2015 at 02:37:59PM +0200, Frederic Weisbecker wrote:
> > On Tue, Aug 04, 2015 at 08:12:50PM -0400, Dave Jones wrote:
> > > On Tue, Aug 04, 2015 at 12:54:35AM -0400, Sasha Levin wrote:
> > > > On 08/03/2015 06:03 PM, Paul E. McKenney wrote:
> > > > >> > Ugh, that doesn't revert cleanly. Got something handy ?
> > > > > I do not, but perhaps either Sasha or Frederic do.
> > > >
> > > > I've attached a revert courtesy of Peter.
> > >
> > > Thanks. At first I thought this was doing the trick, but then I hit this again.
> > >
> > >
> > > [23643.545873] INFO: rcu_preempt detected stalls on CPUs/tasks:
> >
> > If it still happens after Sasha's revert, which basically revert all the offending
> > patches related to preempt lately, then the reason might be elsewhere.
> >
> > How hard was it to reproduce? I see 23000 secs in your dmesg logs which is around 6 hours.
>
> yeah. That's why I thought it had fixed it up until that point.
> My subsequent overnight run hit a different bug (that unpinning an unpinned lock bug in the scheduler)
> so I haven't had it happen since.
>
> > Also did you just launch trinity? no specific options?
>
> basically
>
> while [ 1 ];
> do
> trinity -N 1000000 -q -l off -C256 -a64 -x fsync -x fdatasync -x syncfs -x sync -P INET --enable-fds=sockets
> sudo ipcrm -a
> done
>
> (The ipcrm thing is needed for long runs or eventually you oom, because trinity lacks the cleanup smarts)

Ok, can I run that safely on my testbox without it eating some of my files or should
I use some special purposed guest?

Thanks!

2015-08-05 14:47:10

by Dave Jones

[permalink] [raw]
Subject: Re: 4.2-rc5 rcu stalls.

On Wed, Aug 05, 2015 at 04:38:14PM +0200, Frederic Weisbecker wrote:

> > > Also did you just launch trinity? no specific options?
> >
> > basically
> >
> > while [ 1 ];
> > do
> > trinity -N 1000000 -q -l off -C256 -a64 -x fsync -x fdatasync -x syncfs -x sync -P INET --enable-fds=sockets
> > sudo ipcrm -a
> > done
> >
> > (The ipcrm thing is needed for long runs or eventually you oom, because trinity lacks the cleanup smarts)
>
> Ok, can I run that safely on my testbox without it eating some of my files or should
> I use some special purposed guest?

I wouldn't run it on anything with data I cared about (even nfs mounts)
While there are some safeguards, there might be some cases I've not thought about.

Dave

2015-08-06 04:15:27

by Dave Jones

[permalink] [raw]
Subject: Re: 4.2-rc5 rcu stalls.

On Wed, Aug 05, 2015 at 02:37:59PM +0200, Frederic Weisbecker wrote:
> On Tue, Aug 04, 2015 at 08:12:50PM -0400, Dave Jones wrote:
> > On Tue, Aug 04, 2015 at 12:54:35AM -0400, Sasha Levin wrote:
> > > On 08/03/2015 06:03 PM, Paul E. McKenney wrote:
> > > >> > Ugh, that doesn't revert cleanly. Got something handy ?
> > > > I do not, but perhaps either Sasha or Frederic do.
> > >
> > > I've attached a revert courtesy of Peter.
> >
> > Thanks. At first I thought this was doing the trick, but then I hit this again.
> >
> >
> > [23643.545873] INFO: rcu_preempt detected stalls on CPUs/tasks:
> > [23643.546031] Tasks blocked on level-0 rcu_node (CPUs 0-3): P31722
> > [23643.546173] (detected by 3, t=65002 jiffies, g=2256887, c=2256886, q=0)
> > [23643.546326] trinity-watchdo R running task 14336 31722 31721 0x00080000
> > [23643.546488] ffff8804fcfe7cc8 000000000000ded0 0000000000000002 ffff8804f58bb680
> > [23643.546661] ffff8800ce4951c0 ffff8804fcfe7cb8 ffff8804fcfe8000 ffff8804f6552608
> > [23643.546830] 0000000000000009 ffff8804fcfe7e88 0000000000000009 ffff8804fcfe7ce8
> > [23643.547001] Call Trace:
> > [23643.547058] [<ffffffff887fa2b2>] preempt_schedule_common+0x22/0x40
> > [23643.547201] [<ffffffff887fa2ef>] preempt_schedule+0x1f/0x30
> > [23643.547329] [<ffffffff88001058>] ___preempt_schedule+0x12/0x14
> > [23643.547465] [<ffffffff8808b76d>] ? do_send_sig_info+0x5d/0x80
> > [23643.547599] [<ffffffff887fff32>] ? _raw_spin_unlock_irqrestore+0x42/0x70
> > [23643.547753] [<ffffffff887fff50>] ? _raw_spin_unlock_irqrestore+0x60/0x70
> > [23643.547910] [<ffffffff8808b76d>] do_send_sig_info+0x5d/0x80
> > [23643.548039] [<ffffffff8808be62>] group_send_sig_info+0xb2/0x120
> > [23643.548175] [<ffffffff8808bdb5>] ? group_send_sig_info+0x5/0x120
> > [23643.548314] [<ffffffff880ea62f>] ? rcu_read_lock_held+0x4f/0x60
> > [23643.548451] [<ffffffff8808c05f>] kill_pid_info+0x7f/0x150
> > [23643.548576] [<ffffffff8808c000>] ? kill_pid_info+0x20/0x150
> > [23643.548705] [<ffffffff8808c244>] SYSC_kill+0xf4/0x2b0
> > [23643.548821] [<ffffffff8808c1ed>] ? SYSC_kill+0x9d/0x2b0
> > [23643.548942] [<ffffffff880d35cb>] ? trace_hardirqs_on_caller+0x14b/0x1e0
> > [23643.549097] [<ffffffff880d366d>] ? trace_hardirqs_on+0xd/0x10
> > [23643.549231] [<ffffffff88192f63>] ? context_tracking_user_exit+0x13/0x20
> > [23643.549387] [<ffffffff88012c47>] ? syscall_trace_enter_phase1+0xf7/0x150
> > [23643.549540] [<ffffffff88001017>] ? trace_hardirqs_on_thunk+0x17/0x19
> > [23643.549687] [<ffffffff8808e64e>] SyS_kill+0xe/0x10
> > [23643.549799] [<ffffffff88800997>] entry_SYSCALL_64_fastpath+0x12/0x6f
>
> If it still happens after Sasha's revert, which basically revert all the offending
> patches related to preempt lately, then the reason might be elsewhere.
>
> How hard was it to reproduce? I see 23000 secs in your dmesg logs which is around 6 hours.

Interestingly, it happened again, but only after 7 hours.
I've yet to trigger it in a shorter timeframe. Frustrating.

[28190.798758] INFO: rcu_preempt detected stalls on CPUs/tasks:
[28190.798914] Tasks blocked on level-0 rcu_node (CPUs 0-3): P32189
[28190.799054] (detected by 1, t=65002 jiffies, g=2137396, c=2137395, q=0)
[28190.799203] trinity-c224 R running task 13856 32189 31964 0x00080000
[28190.799362] ffff8804f2323da8 ffffffffa67fa4d1 ffff8804fe170000 ffff8804b66db680
[28190.799531] ffff8804fe170000 ffff8804f2323d98 0000000000000000 ffff8804f2324000
[28190.799699] 0000000000000002 0000000000000000 0000000000000000 ffff8804f2323dc8
[28190.799866] Call Trace:
[28190.799921] [<ffffffffa67fa4d1>] ? preempt_schedule_irq+0x41/0xa0
[28190.800058] [<ffffffffa67fa4d7>] preempt_schedule_irq+0x47/0xa0
[28190.800191] [<ffffffffa6801529>] retint_kernel+0x1b/0x2d
[28190.800312] [<ffffffffa60d6319>] ? lock_acquire+0xd9/0x260
[28190.800438] [<ffffffffa609d295>] ? __task_pid_nr_ns+0x5/0x190
[28190.800568] [<ffffffffa680153b>] ? retint_kernel+0x2d/0x2d
[28190.800691] [<ffffffffa609d2d2>] __task_pid_nr_ns+0x42/0x190
[28190.800820] [<ffffffffa609d295>] ? __task_pid_nr_ns+0x5/0x190
[28190.800950] [<ffffffffa6091f0b>] sys_gettid+0x1b/0x20
[28190.801064] [<ffffffffa6800997>] entry_SYSCALL_64_fastpath+0x12/0x6f
[28190.801208] trinity-c224 R running task 13856 32189 31964 0x00080000
[28190.801365] ffff8804f2323da8 ffffffffa67fa4d1 ffff8804fe170000 ffff8804b66db680
[28190.801533] ffff8804fe170000 ffff8804f2323d98 0000000000000000 ffff8804f2324000
[28190.801702] 0000000000000002 0000000000000000 0000000000000000 ffff8804f2323dc8
[28190.801870] Call Trace:
[28190.801923] [<ffffffffa67fa4d1>] ? preempt_schedule_irq+0x41/0xa0
[28190.802060] [<ffffffffa67fa4d7>] preempt_schedule_irq+0x47/0xa0
[28190.802193] [<ffffffffa6801529>] retint_kernel+0x1b/0x2d
[28190.802313] [<ffffffffa60d6319>] ? lock_acquire+0xd9/0x260
[28190.802436] [<ffffffffa609d295>] ? __task_pid_nr_ns+0x5/0x190
[28190.802565] [<ffffffffa680153b>] ? retint_kernel+0x2d/0x2d
[28190.802688] [<ffffffffa609d2d2>] __task_pid_nr_ns+0x42/0x190
[28190.802815] [<ffffffffa609d295>] ? __task_pid_nr_ns+0x5/0x190
[28190.802945] [<ffffffffa6091f0b>] sys_gettid+0x1b/0x20
[28190.803058] [<ffffffffa6800997>] entry_SYSCALL_64_fastpath+0x12/0x6f
[29929.492752] INFO: rcu_preempt detected stalls on CPUs/tasks:
[29929.492906] Tasks blocked on level-0 rcu_node (CPUs 0-3): P289
[29929.493039] (detected by 0, t=65002 jiffies, g=2141006, c=2141005, q=0)
[29929.493188] systemd-journal R running task 12464 289 1 0x00080000
[29929.493347] ffff8804ff2bbae8 ffffffffa67fa4d1 ffff880501f81b40 ffff880503d43680
[29929.493515] ffff880501f81b40 ffff8804ff2bbad8 0000000000000000 ffff8804ff2bc000
[29929.493683] ffff8800d3e9f118 ffff8800d3e9eb40 0000000000000056 ffff8804ff2bbb08
[29929.493853] Call Trace:
[29929.493909] [<ffffffffa67fa4d1>] ? preempt_schedule_irq+0x41/0xa0
[29929.494046] [<ffffffffa67fa4d7>] preempt_schedule_irq+0x47/0xa0
[29929.494181] [<ffffffffa6801529>] retint_kernel+0x1b/0x2d
[29929.494304] [<ffffffffa67f9929>] ? __schedule+0x439/0xb20
[29929.494430] [<ffffffffa6001058>] ? ___preempt_schedule+0x12/0x14
[29929.494568] [<ffffffffa6001058>] ? ___preempt_schedule+0x12/0x14
[29929.494709] [<ffffffffa66b8b11>] ? sock_def_readable+0x161/0x190
[29929.501118] [<ffffffffa60ed468>] ? rcu_is_watching+0x38/0x60
[29929.507566] [<ffffffffa60ed481>] ? rcu_is_watching+0x51/0x60
[29929.513987] [<ffffffffa66b8b11>] sock_def_readable+0x161/0x190
[29929.520344] [<ffffffffa66b89b5>] ? sock_def_readable+0x5/0x190
[29929.526678] [<ffffffffa67ffe85>] ? _raw_spin_unlock+0x35/0x60
[29929.532988] [<ffffffffa67986c9>] unix_dgram_sendmsg+0x4f9/0x570
[29929.539184] [<ffffffffa66b509b>] ___sys_sendmsg+0x30b/0x320
[29929.545270] [<ffffffffa60cfe7e>] ? put_lock_stats.isra.29+0xe/0x30
[29929.551331] [<ffffffffa638a137>] ? debug_smp_processor_id+0x17/0x20
[29929.557285] [<ffffffffa60cfe7e>] ? put_lock_stats.isra.29+0xe/0x30
[29929.563204] [<ffffffffa60ad681>] ? get_parent_ip+0x11/0x50
[29929.569047] [<ffffffffa60ad813>] ? preempt_count_sub+0xa3/0xf0
[29929.574796] [<ffffffffa621a626>] ? __fget_light+0x66/0x90
[29929.580555] [<ffffffffa6192d53>] ? context_tracking_exit+0x43/0x240
[29929.586253] [<ffffffffa66b5712>] __sys_sendmsg+0x42/0x80
[29929.591843] [<ffffffffa66b5762>] SyS_sendmsg+0x12/0x20
[29929.597385] [<ffffffffa6800997>] entry_SYSCALL_64_fastpath+0x12/0x6f