The recursion below is observed in a rare scenario where __schedule()
takes the rq lock while, at around the same time, the task's affinity is
being changed. The BPF function tracing sched_switch calls
migrate_enable(), which checks for an affinity change (cpus_ptr !=
cpus_mask) and lands in __set_cpus_allowed_ptr(), which tries to acquire
the rq lock again, causing the recursion bug.

Fix the issue by switching to preempt_disable()/preempt_enable() for
non-RT kernels.
-010 |spin_bug(lock = ???, msg = ???)
-011 |debug_spin_lock_before(inline)
-011 |do_raw_spin_lock(lock = 0xFFFFFF89323BB600)
-012 |_raw_spin_lock(inline)
-012 |raw_spin_rq_lock_nested(inline)
-012 |raw_spin_rq_lock(inline)
-012 |task_rq_lock(p = 0xFFFFFF88CFF1DA00, rf = 0xFFFFFFC03707BBE8)
-013 |__set_cpus_allowed_ptr(inline)
-013 |migrate_enable()
-014 |trace_call_bpf(call = ?, ctx = 0xFFFFFFFDEF954600)
-015 |perf_trace_run_bpf_submit(inline)
-015 |perf_trace_sched_switch(__data = 0xFFFFFFE82CF0BCB8, preempt = FALSE, prev = ?, next = ?)
-016 |__traceiter_sched_switch(inline)
-016 |trace_sched_switch(inline)
-016 |__schedule(sched_mode = ?)
-017 |schedule()
-018 |arch_local_save_flags(inline)
-018 |arch_irqs_disabled(inline)
-018 |__raw_spin_lock_irq(inline)
-018 |_raw_spin_lock_irq(inline)
-018 |worker_thread(__worker = 0xFFFFFF88CE251300)
-019 |kthread(_create = 0xFFFFFF88730A5A80)
-020 |ret_from_fork(asm)
Signed-off-by: Satya Durga Srinivasu Prabhala <[email protected]>
---
kernel/sched/core.c | 8 ++++++++
1 file changed, 8 insertions(+)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index bfa7452ca92e..e254e9227341 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2223,6 +2223,7 @@ static void migrate_disable_switch(struct rq *rq, struct task_struct *p)
 
 void migrate_disable(void)
 {
+#ifdef CONFIG_PREEMPT_RT
 	struct task_struct *p = current;
 
 	if (p->migration_disabled) {
@@ -2234,11 +2235,15 @@ void migrate_disable(void)
 	this_rq()->nr_pinned++;
 	p->migration_disabled = 1;
 	preempt_enable();
+#else
+	preempt_disable();
+#endif
 }
 EXPORT_SYMBOL_GPL(migrate_disable);
 
 void migrate_enable(void)
 {
+#ifdef CONFIG_PREEMPT_RT
 	struct task_struct *p = current;
 
 	if (p->migration_disabled > 1) {
@@ -2265,6 +2270,9 @@ void migrate_enable(void)
 	p->migration_disabled = 0;
 	this_rq()->nr_pinned--;
 	preempt_enable();
+#else
+	preempt_enable();
+#endif
 }
 EXPORT_SYMBOL_GPL(migrate_enable);
 
--
2.36.1
Hi Satya
On 06/24/22 00:42, Satya Durga Srinivasu Prabhala wrote:
> Below recursion is observed in a rare scenario where __schedule()
> takes rq lock, at around same time task's affinity is being changed,
> bpf function for tracing sched_switch calls migrate_enabled(),
> checks for affinity change (cpus_ptr != cpus_mask) lands into
> __set_cpus_allowed_ptr which tries acquire rq lock and causing the
> recursion bug.
>
> Fix the issue by switching to preempt_enable/disable() for non-RT
> Kernels.
Interesting bug. Thanks for the report. Unfortunately I can't see this being
a fix as it just limits the bug visibility to PREEMPT_RT kernels, but won't fix
anything, no? ie: Kernels compiled with PREEMPT_RT will still hit this failure.
I'm curious how the race with set affinity is happening. I would have thought
user space would get blocked as __schedule() will hold the rq lock.
Do you have more details on that?
Thanks
--
Qais Yousef
>
> -010 |spin_bug(lock = ???, msg = ???)
> -011 |debug_spin_lock_before(inline)
> -011 |do_raw_spin_lock(lock = 0xFFFFFF89323BB600)
> -012 |_raw_spin_lock(inline)
> -012 |raw_spin_rq_lock_nested(inline)
> -012 |raw_spin_rq_lock(inline)
> -012 |task_rq_lock(p = 0xFFFFFF88CFF1DA00, rf = 0xFFFFFFC03707BBE8)
> -013 |__set_cpus_allowed_ptr(inline)
> -013 |migrate_enable()
> -014 |trace_call_bpf(call = ?, ctx = 0xFFFFFFFDEF954600)
> -015 |perf_trace_run_bpf_submit(inline)
> -015 |perf_trace_sched_switch(__data = 0xFFFFFFE82CF0BCB8, preempt = FALSE, prev = ?, next = ?)
> -016 |__traceiter_sched_switch(inline)
> -016 |trace_sched_switch(inline)
> -016 |__schedule(sched_mode = ?)
> -017 |schedule()
> -018 |arch_local_save_flags(inline)
> -018 |arch_irqs_disabled(inline)
> -018 |__raw_spin_lock_irq(inline)
> -018 |_raw_spin_lock_irq(inline)
> -018 |worker_thread(__worker = 0xFFFFFF88CE251300)
> -019 |kthread(_create = 0xFFFFFF88730A5A80)
> -020 |ret_from_fork(asm)
>
> Signed-off-by: Satya Durga Srinivasu Prabhala <[email protected]>
> ---
> kernel/sched/core.c | 8 ++++++++
> 1 file changed, 8 insertions(+)
>
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index bfa7452ca92e..e254e9227341 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -2223,6 +2223,7 @@ static void migrate_disable_switch(struct rq *rq, struct task_struct *p)
>
> void migrate_disable(void)
> {
> +#ifdef CONFIG_PREEMPT_RT
> struct task_struct *p = current;
>
> if (p->migration_disabled) {
> @@ -2234,11 +2235,15 @@ void migrate_disable(void)
> this_rq()->nr_pinned++;
> p->migration_disabled = 1;
> preempt_enable();
> +#else
> + preempt_disable();
> +#endif
> }
> EXPORT_SYMBOL_GPL(migrate_disable);
>
> void migrate_enable(void)
> {
> +#ifdef CONFIG_PREEMPT_RT
> struct task_struct *p = current;
>
> if (p->migration_disabled > 1) {
> @@ -2265,6 +2270,9 @@ void migrate_enable(void)
> p->migration_disabled = 0;
> this_rq()->nr_pinned--;
> preempt_enable();
> +#else
> + preempt_enable();
> +#endif
> }
> EXPORT_SYMBOL_GPL(migrate_enable);
>
> --
> 2.36.1
>
On Fri, Jun 24, 2022 at 12:42:40AM -0700, Satya Durga Srinivasu Prabhala wrote:
> Below recursion is observed in a rare scenario where __schedule()
> takes rq lock, at around same time task's affinity is being changed,
> bpf function for tracing sched_switch calls migrate_enabled(),
> checks for affinity change (cpus_ptr != cpus_mask) lands into
> __set_cpus_allowed_ptr which tries acquire rq lock and causing the
> recursion bug.
>
> Fix the issue by switching to preempt_enable/disable() for non-RT
> Kernels.
>
> -010 |spin_bug(lock = ???, msg = ???)
> -011 |debug_spin_lock_before(inline)
> -011 |do_raw_spin_lock(lock = 0xFFFFFF89323BB600)
> -012 |_raw_spin_lock(inline)
> -012 |raw_spin_rq_lock_nested(inline)
> -012 |raw_spin_rq_lock(inline)
> -012 |task_rq_lock(p = 0xFFFFFF88CFF1DA00, rf = 0xFFFFFFC03707BBE8)
> -013 |__set_cpus_allowed_ptr(inline)
> -013 |migrate_enable()
> -014 |trace_call_bpf(call = ?, ctx = 0xFFFFFFFDEF954600)
> -015 |perf_trace_run_bpf_submit(inline)
> -015 |perf_trace_sched_switch(__data = 0xFFFFFFE82CF0BCB8, preempt = FALSE, prev = ?, next = ?)
> -016 |__traceiter_sched_switch(inline)
> -016 |trace_sched_switch(inline)
trace_sched_switch() disables preemption.
So how is this a fix?
-- Steve
> -016 |__schedule(sched_mode = ?)
> -017 |schedule()
> -018 |arch_local_save_flags(inline)
> -018 |arch_irqs_disabled(inline)
> -018 |__raw_spin_lock_irq(inline)
> -018 |_raw_spin_lock_irq(inline)
> -018 |worker_thread(__worker = 0xFFFFFF88CE251300)
> -019 |kthread(_create = 0xFFFFFF88730A5A80)
> -020 |ret_from_fork(asm)
>
> Signed-off-by: Satya Durga Srinivasu Prabhala <[email protected]>
> ---
> kernel/sched/core.c | 8 ++++++++
> 1 file changed, 8 insertions(+)
>
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index bfa7452ca92e..e254e9227341 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -2223,6 +2223,7 @@ static void migrate_disable_switch(struct rq *rq, struct task_struct *p)
>
> void migrate_disable(void)
> {
> +#ifdef CONFIG_PREEMPT_RT
> struct task_struct *p = current;
>
> if (p->migration_disabled) {
> @@ -2234,11 +2235,15 @@ void migrate_disable(void)
> this_rq()->nr_pinned++;
> p->migration_disabled = 1;
> preempt_enable();
> +#else
> + preempt_disable();
> +#endif
> }
> EXPORT_SYMBOL_GPL(migrate_disable);
>
> void migrate_enable(void)
> {
> +#ifdef CONFIG_PREEMPT_RT
> struct task_struct *p = current;
>
> if (p->migration_disabled > 1) {
> @@ -2265,6 +2270,9 @@ void migrate_enable(void)
> p->migration_disabled = 0;
> this_rq()->nr_pinned--;
> preempt_enable();
> +#else
> + preempt_enable();
> +#endif
> }
> EXPORT_SYMBOL_GPL(migrate_enable);
>
> --
> 2.36.1
On Thu, Jun 30, 2022 at 10:53:10PM +0100, Qais Yousef wrote:
> Hi Satya
>
> On 06/24/22 00:42, Satya Durga Srinivasu Prabhala wrote:
> > Below recursion is observed in a rare scenario where __schedule()
> > takes rq lock, at around same time task's affinity is being changed,
> > bpf function for tracing sched_switch calls migrate_enabled(),
> > checks for affinity change (cpus_ptr != cpus_mask) lands into
> > __set_cpus_allowed_ptr which tries acquire rq lock and causing the
> > recursion bug.
> >
> > Fix the issue by switching to preempt_enable/disable() for non-RT
> > Kernels.
>
> Interesting bug. Thanks for the report. Unfortunately I can't see this being
> a fix as it just limits the bug visibility to PREEMPT_RT kernels, but won't fix
> anything, no? ie: Kernels compiled with PREEMPT_RT will still hit this failure.
Worse, there's !RT stuff that grew to rely on the preemptible migrate
disable stuff, so this actively breaks things.
> I'm curious how the race with set affinity is happening. I would have thought
> user space would get blocked as __schedule() will hold the rq lock.
>
> Do you have more details on that?
Yeah, I'm not seeing how this works either, in order for
migrate_enable() to actually call __set_cpus_allowed_ptr(), it needs to
have done migrate_disable() *before* schedule, schedule() will then have
to call migrate_disable_switch(), and *then* migrate_enable() does this.
However, if things are nicely balanced (as they should be), then
trace_call_bpf() using migrate_disable()/migrate_enable() should never
hit this path.
If, OTOH, migrate_disable() was called prior to schedule() and we did do
migrate_disable_switch(), then it should be impossible for the
tracepoint/bpf stuff to reach p->migration_disabled == 0.
> > -010 |spin_bug(lock = ???, msg = ???)
> > -011 |debug_spin_lock_before(inline)
> > -011 |do_raw_spin_lock(lock = 0xFFFFFF89323BB600)
> > -012 |_raw_spin_lock(inline)
> > -012 |raw_spin_rq_lock_nested(inline)
> > -012 |raw_spin_rq_lock(inline)
> > -012 |task_rq_lock(p = 0xFFFFFF88CFF1DA00, rf = 0xFFFFFFC03707BBE8)
> > -013 |__set_cpus_allowed_ptr(inline)
> > -013 |migrate_enable()
> > -014 |trace_call_bpf(call = ?, ctx = 0xFFFFFFFDEF954600)
> > -015 |perf_trace_run_bpf_submit(inline)
> > -015 |perf_trace_sched_switch(__data = 0xFFFFFFE82CF0BCB8, preempt = FALSE, prev = ?, next = ?)
> > -016 |__traceiter_sched_switch(inline)
> > -016 |trace_sched_switch(inline)
> > -016 |__schedule(sched_mode = ?)
> > -017 |schedule()
> > -018 |arch_local_save_flags(inline)
> > -018 |arch_irqs_disabled(inline)
> > -018 |__raw_spin_lock_irq(inline)
> > -018 |_raw_spin_lock_irq(inline)
> > -018 |worker_thread(__worker = 0xFFFFFF88CE251300)
> > -019 |kthread(_create = 0xFFFFFF88730A5A80)
> > -020 |ret_from_fork(asm)
This doesn't clarify much. Please explain how things got to be
unbalanced, don't ever just make a problem disappear like this without
understanding what the root cause is, that'll just get your reputation
sullied.
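For reference, the mainline logic being described here looks roughly like the
sketch below (paraphrased and trimmed from kernel/sched/core.c around v5.15;
treat it as an illustration, not an exact copy of any tree):

	/* Paraphrased sketch of migrate_enable(), trimmed for brevity. */
	void migrate_enable(void)
	{
		struct task_struct *p = current;

		if (p->migration_disabled > 1) {
			/* Nested disable; just drop the count. */
			p->migration_disabled--;
			return;
		}

		preempt_disable();
		/*
		 * cpus_ptr only differs from &p->cpus_mask if
		 * migrate_disable_switch() pinned the task to
		 * cpumask_of(rq->cpu), which __schedule() does only for a
		 * task switched out while migration-disabled.  Only then
		 * does this path take the rq lock via
		 * __set_cpus_allowed_ptr().
		 */
		if (p->cpus_ptr != &p->cpus_mask)
			__set_cpus_allowed_ptr(p, &p->cpus_mask, SCA_MIGRATE_ENABLE);
		barrier();
		p->migration_disabled = 0;
		this_rq()->nr_pinned--;
		preempt_enable();
	}

Since trace_call_bpf() opens and closes its migrate_disable()/migrate_enable()
pair within a single tracepoint invocation, with no schedule() in between,
a balanced pair should never see cpus_ptr redirected - which is why the
backtrace implies the count got unbalanced somewhere.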
On 07/01/22 10:33, Peter Zijlstra wrote:
> On Thu, Jun 30, 2022 at 10:53:10PM +0100, Qais Yousef wrote:
> > Hi Satya
> >
> > On 06/24/22 00:42, Satya Durga Srinivasu Prabhala wrote:
> > > Below recursion is observed in a rare scenario where __schedule()
> > > takes rq lock, at around same time task's affinity is being changed,
> > > bpf function for tracing sched_switch calls migrate_enabled(),
> > > checks for affinity change (cpus_ptr != cpus_mask) lands into
> > > __set_cpus_allowed_ptr which tries acquire rq lock and causing the
> > > recursion bug.
> > >
> > > Fix the issue by switching to preempt_enable/disable() for non-RT
> > > Kernels.
> >
> > Interesting bug. Thanks for the report. Unfortunately I can't see this being
> > a fix as it just limits the bug visibility to PREEMPT_RT kernels, but won't fix
> > anything, no? ie: Kernels compiled with PREEMPT_RT will still hit this failure.
>
> Worse, there's !RT stuff that grew to rely on the preemptible migrate
> disable stuff, so this actively breaks things.
>
> > I'm curious how the race with set affinity is happening. I would have thought
> > user space would get blocked as __schedule() will hold the rq lock.
> >
> > Do you have more details on that?
>
> Yeah, I'm not seeing how this works either, in order for
> migrate_enable() to actually call __set_cpus_allowed_ptr(), it needs to
> have done migrate_disable() *before* schedule, schedule() will then have
> to call migrate_disable_swich(), and *then* migrate_enable() does this.
>
> However, if things are nicely balanced (as they should be), then
> trace_call_bpf() using migrate_disable()/migrate_enable() should never
> hit this path.
>
> If, OTOH, migrate_disable() was called prior to schedule() and we did do
> migrate_disable_switch(), then it should be impossible for the
> tracepoint/bpf stuff to reach p->migration_disabled == 0.
I think it's worth confirming which kernel Satya is on too. If it's GKI, then
it's worth checking first whether this is actually reproducible on/applicable to mainline.
Cheers
--
Qais Yousef
On Thu, 30 Jun 2022 18:37:04 -0400
Steven Rostedt <[email protected]> wrote:
> >
> > Fix the issue by switching to preempt_enable/disable() for non-RT
> > Kernels.
> >
> > -010 |spin_bug(lock = ???, msg = ???)
> > -011 |debug_spin_lock_before(inline)
> > -011 |do_raw_spin_lock(lock = 0xFFFFFF89323BB600)
> > -012 |_raw_spin_lock(inline)
> > -012 |raw_spin_rq_lock_nested(inline)
> > -012 |raw_spin_rq_lock(inline)
> > -012 |task_rq_lock(p = 0xFFFFFF88CFF1DA00, rf = 0xFFFFFFC03707BBE8)
> > -013 |__set_cpus_allowed_ptr(inline)
> > -013 |migrate_enable()
> > -014 |trace_call_bpf(call = ?, ctx = 0xFFFFFFFDEF954600)
> > -015 |perf_trace_run_bpf_submit(inline)
> > -015 |perf_trace_sched_switch(__data = 0xFFFFFFE82CF0BCB8, preempt = FALSE, prev = ?, next = ?)
> > -016 |__traceiter_sched_switch(inline)
> > -016 |trace_sched_switch(inline)
>
> trace_sched_switch() disables preemption.
>
> So how is this a fix?
Let me rephrase my question.
As trace_sched_switch() disables preemption, why is trace_call_bpf()
calling migrate_disable()?
Looks like you could modify the code to include a __bpf_prog_run_array()
that skips the migrate_disable(). You even have a "cant_sleep()" call in
trace_call_bpf().
-- Steve
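A purely hypothetical sketch of that suggestion is below; __bpf_prog_run_array()
does not exist upstream, and the plumbing here is invented for illustration
(the real bpf_prog_run_array() additionally sets up a bpf_run_ctx, omitted):

	/*
	 * Hypothetical, illustration only: a prog-array runner for callers
	 * such as trace_call_bpf() that already run with preemption disabled
	 * (cant_sleep()), so the per-call migrate_disable()/migrate_enable()
	 * pair can be skipped.
	 */
	static __always_inline u32
	__bpf_prog_run_array(const struct bpf_prog_array *array,
			     const void *ctx, bpf_prog_run_fn run_prog)
	{
		const struct bpf_prog_array_item *item;
		const struct bpf_prog *prog;
		u32 ret = 1;

		if (unlikely(!array))
			return ret;

		/* Caller guarantees preemption (and thus migration) is off. */
		item = &array->items[0];
		while ((prog = READ_ONCE(item->prog))) {
			ret &= run_prog(prog, ctx);
			item++;
		}
		return ret;
	}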
On 01/07/2022 13:48, Qais Yousef wrote:
> On 07/01/22 10:33, Peter Zijlstra wrote:
>> On Thu, Jun 30, 2022 at 10:53:10PM +0100, Qais Yousef wrote:
>>> Hi Satya
>>>
>>> On 06/24/22 00:42, Satya Durga Srinivasu Prabhala wrote:
>>>> Below recursion is observed in a rare scenario where __schedule()
>>>> takes rq lock, at around same time task's affinity is being changed,
>>>> bpf function for tracing sched_switch calls migrate_enabled(),
>>>> checks for affinity change (cpus_ptr != cpus_mask) lands into
>>>> __set_cpus_allowed_ptr which tries acquire rq lock and causing the
>>>> recursion bug.
>>>>
>>>> Fix the issue by switching to preempt_enable/disable() for non-RT
>>>> Kernels.
>>>
>>> Interesting bug. Thanks for the report. Unfortunately I can't see this being
>>> a fix as it just limits the bug visibility to PREEMPT_RT kernels, but won't fix
>>> anything, no? ie: Kernels compiled with PREEMPT_RT will still hit this failure.
>>
>> Worse, there's !RT stuff that grew to rely on the preemptible migrate
>> disable stuff, so this actively breaks things.
>>
>>> I'm curious how the race with set affinity is happening. I would have thought
>>> user space would get blocked as __schedule() will hold the rq lock.
>>>
>>> Do you have more details on that?
>>
>> Yeah, I'm not seeing how this works either, in order for
>> migrate_enable() to actually call __set_cpus_allowed_ptr(), it needs to
>> have done migrate_disable() *before* schedule, schedule() will then have
>> to call migrate_disable_swich(), and *then* migrate_enable() does this.
>>
>> However, if things are nicely balanced (as they should be), then
>> trace_call_bpf() using migrate_disable()/migrate_enable() should never
>> hit this path.
>>
>> If, OTOH, migrate_disable() was called prior to schedule() and we did do
>> migrate_disable_switch(), then it should be impossible for the
>> tracepoint/bpf stuff to reach p->migration_disabled == 0.
>
> I think it's worth to confirm which kernel Satya is on too. If it's GKI, then
> worth checking first this is actually reproducible on/applicable to mainline.
Satya, do you still have these lines from your spin_dump() output showing
current, the kernel version and the hardware? Or a way to recreate this?
I couldn't provoke it so far.
...
[ 212.196452] BUG: spinlock recursion on CPU#4, bpftrace/1662
^^^^^^^^^^^^^
[ 212.196473] lock: 0xffff00097ef7f500, .magic: dead4ead, .owner: bpftrace/1662, .owner_cpu: 4
[ 212.196500] CPU: 4 PID: 1662 Comm: bpftrace Not tainted 5.19.0-rc2-00018-gb7ce5b6b4622-dirty #96
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[ 212.196513] Hardware name: ARM Juno development board (r0) (DT)
^^^^^^^^^^^^^^^^^^^^^^^^^^
[ 212.196520] Call trace:
...
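For context, the check that fires in this splat is the owner comparison in the
spinlock debugging code, roughly (paraphrased from kernel/locking/spinlock_debug.c;
the exact form may vary by version):

	/* Runs before taking the lock when CONFIG_DEBUG_SPINLOCK is set. */
	static inline void debug_spin_lock_before(raw_spinlock_t *lock)
	{
		SPIN_BUG_ON(lock->magic != SPINLOCK_MAGIC, lock, "bad magic");
		/* "spinlock recursion": the current task already owns this lock. */
		SPIN_BUG_ON(lock->owner == current, lock, "recursion");
		SPIN_BUG_ON(lock->owner_cpu == raw_smp_processor_id(),
								lock, "cpu recursion");
	}

The .owner and .owner_cpu fields in the dump therefore show that the rq lock is
being re-acquired by the very task/CPU that already holds it from __schedule().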
On 6/30/22 2:53 PM, Qais Yousef wrote:
> Hi Satya
>
> On 06/24/22 00:42, Satya Durga Srinivasu Prabhala wrote:
>> Below recursion is observed in a rare scenario where __schedule()
>> takes rq lock, at around same time task's affinity is being changed,
>> bpf function for tracing sched_switch calls migrate_enabled(),
>> checks for affinity change (cpus_ptr != cpus_mask) lands into
>> __set_cpus_allowed_ptr which tries acquire rq lock and causing the
>> recursion bug.
>>
>> Fix the issue by switching to preempt_enable/disable() for non-RT
>> Kernels.
> Interesting bug. Thanks for the report. Unfortunately I can't see this being
> a fix as it just limits the bug visibility to PREEMPT_RT kernels, but won't fix
> anything, no? ie: Kernels compiled with PREEMPT_RT will still hit this failure.
Thanks for taking a look. That's right.
> I'm curious how the race with set affinity is happening. I would have thought
> user space would get blocked as __schedule() will hold the rq lock.
>
> Do you have more details on that?
I'm still trying to figure out how we even end up with such a race. There's no
easy way to repro the issue; I'm checking internally with some debug logs.
>
> Thanks
>
> --
> Qais Yousef
>
>> -010 |spin_bug(lock = ???, msg = ???)
>> -011 |debug_spin_lock_before(inline)
>> -011 |do_raw_spin_lock(lock = 0xFFFFFF89323BB600)
>> -012 |_raw_spin_lock(inline)
>> -012 |raw_spin_rq_lock_nested(inline)
>> -012 |raw_spin_rq_lock(inline)
>> -012 |task_rq_lock(p = 0xFFFFFF88CFF1DA00, rf = 0xFFFFFFC03707BBE8)
>> -013 |__set_cpus_allowed_ptr(inline)
>> -013 |migrate_enable()
>> -014 |trace_call_bpf(call = ?, ctx = 0xFFFFFFFDEF954600)
>> -015 |perf_trace_run_bpf_submit(inline)
>> -015 |perf_trace_sched_switch(__data = 0xFFFFFFE82CF0BCB8, preempt = FALSE, prev = ?, next = ?)
>> -016 |__traceiter_sched_switch(inline)
>> -016 |trace_sched_switch(inline)
>> -016 |__schedule(sched_mode = ?)
>> -017 |schedule()
>> -018 |arch_local_save_flags(inline)
>> -018 |arch_irqs_disabled(inline)
>> -018 |__raw_spin_lock_irq(inline)
>> -018 |_raw_spin_lock_irq(inline)
>> -018 |worker_thread(__worker = 0xFFFFFF88CE251300)
>> -019 |kthread(_create = 0xFFFFFF88730A5A80)
>> -020 |ret_from_fork(asm)
>>
>> Signed-off-by: Satya Durga Srinivasu Prabhala <[email protected]>
>> ---
>> kernel/sched/core.c | 8 ++++++++
>> 1 file changed, 8 insertions(+)
>>
>> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
>> index bfa7452ca92e..e254e9227341 100644
>> --- a/kernel/sched/core.c
>> +++ b/kernel/sched/core.c
>> @@ -2223,6 +2223,7 @@ static void migrate_disable_switch(struct rq *rq, struct task_struct *p)
>>
>> void migrate_disable(void)
>> {
>> +#ifdef CONFIG_PREEMPT_RT
>> struct task_struct *p = current;
>>
>> if (p->migration_disabled) {
>> @@ -2234,11 +2235,15 @@ void migrate_disable(void)
>> this_rq()->nr_pinned++;
>> p->migration_disabled = 1;
>> preempt_enable();
>> +#else
>> + preempt_disable();
>> +#endif
>> }
>> EXPORT_SYMBOL_GPL(migrate_disable);
>>
>> void migrate_enable(void)
>> {
>> +#ifdef CONFIG_PREEMPT_RT
>> struct task_struct *p = current;
>>
>> if (p->migration_disabled > 1) {
>> @@ -2265,6 +2270,9 @@ void migrate_enable(void)
>> p->migration_disabled = 0;
>> this_rq()->nr_pinned--;
>> preempt_enable();
>> +#else
>> + preempt_enable();
>> +#endif
>> }
>> EXPORT_SYMBOL_GPL(migrate_enable);
>>
>> --
>> 2.36.1
>>
Please, burn outlook and use a real MUA, this is unreadable garbage.
On 7/4/22 1:32 AM, Dietmar Eggemann wrote:
> On 01/07/2022 13:48, Qais Yousef wrote:
>> On 07/01/22 10:33, Peter Zijlstra wrote:
>>> On Thu, Jun 30, 2022 at 10:53:10PM +0100, Qais Yousef wrote:
>>>> Hi Satya
>>>>
>>>> On 06/24/22 00:42, Satya Durga Srinivasu Prabhala wrote:
>>>>> Below recursion is observed in a rare scenario where __schedule()
>>>>> takes rq lock, at around same time task's affinity is being changed,
>>>>> bpf function for tracing sched_switch calls migrate_enabled(),
>>>>> checks for affinity change (cpus_ptr != cpus_mask) lands into
>>>>> __set_cpus_allowed_ptr which tries acquire rq lock and causing the
>>>>> recursion bug.
>>>>>
>>>>> Fix the issue by switching to preempt_enable/disable() for non-RT
>>>>> Kernels.
>>>> Interesting bug. Thanks for the report. Unfortunately I can't see this being
>>>> a fix as it just limits the bug visibility to PREEMPT_RT kernels, but won't fix
>>>> anything, no? ie: Kernels compiled with PREEMPT_RT will still hit this failure.
>>> Worse, there's !RT stuff that grew to rely on the preemptible migrate
>>> disable stuff, so this actively breaks things.
>>>
>>>> I'm curious how the race with set affinity is happening. I would have thought
>>>> user space would get blocked as __schedule() will hold the rq lock.
>>>>
>>>> Do you have more details on that?
>>> Yeah, I'm not seeing how this works either, in order for
>>> migrate_enable() to actually call __set_cpus_allowed_ptr(), it needs to
>>> have done migrate_disable() *before* schedule, schedule() will then have
>>> to call migrate_disable_swich(), and *then* migrate_enable() does this.
>>>
>>> However, if things are nicely balanced (as they should be), then
>>> trace_call_bpf() using migrate_disable()/migrate_enable() should never
>>> hit this path.
>>>
>>> If, OTOH, migrate_disable() was called prior to schedule() and we did do
>>> migrate_disable_switch(), then it should be impossible for the
>>> tracepoint/bpf stuff to reach p->migration_disabled == 0.
>> I think it's worth to confirm which kernel Satya is on too. If it's GKI, then
>> worth checking first this is actually reproducible on/applicable to mainline.
> Satya, do you still have these lines from your spin_dump() output showing
> current, the kernel version and the hardware? Or a way to recreate this?
> I couldn't provoke it so far.
>
> ...
> [ 212.196452] BUG: spinlock recursion on CPU#4, bpftrace/1662
> ^^^^^^^^^^^^^
> [ 212.196473] lock: 0xffff00097ef7f500, .magic: dead4ead, .owner: bpftrace/1662, .owner_cpu: 4
> [ 212.196500] CPU: 4 PID: 1662 Comm: bpftrace Not tainted 5.19.0-rc2-00018-gb7ce5b6b4622-dirty #96
> ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
> [ 212.196513] Hardware name: ARM Juno development board (r0) (DT)
> ^^^^^^^^^^^^^^^^^^^^^^^^^^
> [ 212.196520] Call trace:
> ...
Thanks for cross-checking. Below are the output lines from spin_dump().
I'm on a 5.15 GKI kernel.
[ 7447.326924] BUG: spinlock recursion on CPU#6, kworker/6:9/738
[ 7447.333615] lock: 0xffffff89321d8600, .magic: dead4ead, .owner: kworker/6:9/738, .owner_cpu: 6
I'm trying to get the issue reproduced with some additional debug logs, but
haven't found an easy way so far. Still cross-checking.
On 7/1/22 8:54 AM, Steven Rostedt wrote:
> On Thu, 30 Jun 2022 18:37:04 -0400
> Steven Rostedt <[email protected]> wrote:
>
>>> Fix the issue by switching to preempt_enable/disable() for non-RT
>>> Kernels.
>>>
>>> -010 |spin_bug(lock = ???, msg = ???)
>>> -011 |debug_spin_lock_before(inline)
>>> -011 |do_raw_spin_lock(lock = 0xFFFFFF89323BB600)
>>> -012 |_raw_spin_lock(inline)
>>> -012 |raw_spin_rq_lock_nested(inline)
>>> -012 |raw_spin_rq_lock(inline)
>>> -012 |task_rq_lock(p = 0xFFFFFF88CFF1DA00, rf = 0xFFFFFFC03707BBE8)
>>> -013 |__set_cpus_allowed_ptr(inline)
>>> -013 |migrate_enable()
>>> -014 |trace_call_bpf(call = ?, ctx = 0xFFFFFFFDEF954600)
>>> -015 |perf_trace_run_bpf_submit(inline)
>>> -015 |perf_trace_sched_switch(__data = 0xFFFFFFE82CF0BCB8, preempt = FALSE, prev = ?, next = ?)
>>> -016 |__traceiter_sched_switch(inline)
>>> -016 |trace_sched_switch(inline)
>> trace_sched_switch() disables preemption.
>>
>> So how is this a fix?
> Let me rephrase my question.
>
> As trace_sched_switch() disables preemption, why is trace_call_bpf()
> calling migrate_disable()?
I'm not sure why we have migrate_disable()/migrate_enable() there. I will need
to cross-check further.
>
> Looks like you could modify the code to include a __bpf_prog_run_array()
> that skips the migrate_disable(). You even have a "cant_sleep()" call in
> trace_call_bpf().
Thanks for the inputs. I still need to cross-check and find a better way to
repro the issue before I can comment further. I will cross-check your
suggestion.
>
> -- Steve
On 7/1/22 4:48 AM, Qais Yousef wrote:
> On 07/01/22 10:33, Peter Zijlstra wrote:
>> On Thu, Jun 30, 2022 at 10:53:10PM +0100, Qais Yousef wrote:
>>> Hi Satya
>>>
>>> On 06/24/22 00:42, Satya Durga Srinivasu Prabhala wrote:
>>>> Below recursion is observed in a rare scenario where __schedule()
>>>> takes rq lock, at around same time task's affinity is being changed,
>>>> bpf function for tracing sched_switch calls migrate_enabled(),
>>>> checks for affinity change (cpus_ptr != cpus_mask) lands into
>>>> __set_cpus_allowed_ptr which tries acquire rq lock and causing the
>>>> recursion bug.
>>>>
>>>> Fix the issue by switching to preempt_enable/disable() for non-RT
>>>> Kernels.
>>> Interesting bug. Thanks for the report. Unfortunately I can't see this being
>>> a fix as it just limits the bug visibility to PREEMPT_RT kernels, but won't fix
>>> anything, no? ie: Kernels compiled with PREEMPT_RT will still hit this failure.
>> Worse, there's !RT stuff that grew to rely on the preemptible migrate
>> disable stuff, so this actively breaks things.
>>
>>> I'm curious how the race with set affinity is happening. I would have thought
>>> user space would get blocked as __schedule() will hold the rq lock.
>>>
>>> Do you have more details on that?
>> Yeah, I'm not seeing how this works either, in order for
>> migrate_enable() to actually call __set_cpus_allowed_ptr(), it needs to
>> have done migrate_disable() *before* schedule, schedule() will then have
>> to call migrate_disable_swich(), and *then* migrate_enable() does this.
>>
>> However, if things are nicely balanced (as they should be), then
>> trace_call_bpf() using migrate_disable()/migrate_enable() should never
>> hit this path.
>>
>> If, OTOH, migrate_disable() was called prior to schedule() and we did do
>> migrate_disable_switch(), then it should be impossible for the
>> tracepoint/bpf stuff to reach p->migration_disabled == 0.
> I think it's worth to confirm which kernel Satya is on too. If it's GKI, then
> worth checking first this is actually reproducible on/applicable to mainline.
We are seeing the issue on a 5.15 GKI kernel. On older kernels, like 5.10,
migrate_disable()/migrate_enable() end up calling preempt_disable()/preempt_enable().
I will cross-check further based on the comments and inputs I received so far.
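(For reference, that older behaviour came from the !PREEMPT_RT fallback
definitions in include/linux/preempt.h, before migrate_disable()/migrate_enable()
were reworked to be independent of RT around v5.11; a rough sketch, so check
the exact tree:)

	/* Rough sketch of the pre-v5.11 !PREEMPT_RT fallback; paraphrased. */
	static __always_inline void migrate_disable(void)
	{
		preempt_disable();
	}

	static __always_inline void migrate_enable(void)
	{
		preempt_enable();
	}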
>
> Cheers
>
> --
> Qais Yousef
On 6/30/22 3:37 PM, Steven Rostedt wrote:
> On Fri, Jun 24, 2022 at 12:42:40AM -0700, Satya Durga Srinivasu Prabhala wrote:
>> Below recursion is observed in a rare scenario where __schedule()
>> takes rq lock, at around same time task's affinity is being changed,
>> bpf function for tracing sched_switch calls migrate_enabled(),
>> checks for affinity change (cpus_ptr != cpus_mask) lands into
>> __set_cpus_allowed_ptr which tries acquire rq lock and causing the
>> recursion bug.
>>
>> Fix the issue by switching to preempt_enable/disable() for non-RT
>> Kernels.
>>
>> -010 |spin_bug(lock = ???, msg = ???)
>> -011 |debug_spin_lock_before(inline)
>> -011 |do_raw_spin_lock(lock = 0xFFFFFF89323BB600)
>> -012 |_raw_spin_lock(inline)
>> -012 |raw_spin_rq_lock_nested(inline)
>> -012 |raw_spin_rq_lock(inline)
>> -012 |task_rq_lock(p = 0xFFFFFF88CFF1DA00, rf = 0xFFFFFFC03707BBE8)
>> -013 |__set_cpus_allowed_ptr(inline)
>> -013 |migrate_enable()
>> -014 |trace_call_bpf(call = ?, ctx = 0xFFFFFFFDEF954600)
>> -015 |perf_trace_run_bpf_submit(inline)
>> -015 |perf_trace_sched_switch(__data = 0xFFFFFFE82CF0BCB8, preempt = FALSE, prev = ?, next = ?)
>> -016 |__traceiter_sched_switch(inline)
>> -016 |trace_sched_switch(inline)
> trace_sched_switch() disables preemption.
>
> So how is this a fix?
Thanks for your time and comments.
I was looking more at non-RT kernels, where switching to
preempt_disable()/preempt_enable() helps as it's just an increment/decrement of
the preempt count. I agree this isn't the right fix.
I'm still looking for an easy way to repro the issue. Will cross-check further
and get back.
>
> -- Steve
>
>> -016 |__schedule(sched_mode = ?)
>> -017 |schedule()
>> -018 |arch_local_save_flags(inline)
>> -018 |arch_irqs_disabled(inline)
>> -018 |__raw_spin_lock_irq(inline)
>> -018 |_raw_spin_lock_irq(inline)
>> -018 |worker_thread(__worker = 0xFFFFFF88CE251300)
>> -019 |kthread(_create = 0xFFFFFF88730A5A80)
>> -020 |ret_from_fork(asm)
>>
>> Signed-off-by: Satya Durga Srinivasu Prabhala <[email protected]>
>> ---
>> kernel/sched/core.c | 8 ++++++++
>> 1 file changed, 8 insertions(+)
>>
>> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
>> index bfa7452ca92e..e254e9227341 100644
>> --- a/kernel/sched/core.c
>> +++ b/kernel/sched/core.c
>> @@ -2223,6 +2223,7 @@ static void migrate_disable_switch(struct rq *rq, struct task_struct *p)
>>
>> void migrate_disable(void)
>> {
>> +#ifdef CONFIG_PREEMPT_RT
>> struct task_struct *p = current;
>>
>> if (p->migration_disabled) {
>> @@ -2234,11 +2235,15 @@ void migrate_disable(void)
>> this_rq()->nr_pinned++;
>> p->migration_disabled = 1;
>> preempt_enable();
>> +#else
>> + preempt_disable();
>> +#endif
>> }
>> EXPORT_SYMBOL_GPL(migrate_disable);
>>
>> void migrate_enable(void)
>> {
>> +#ifdef CONFIG_PREEMPT_RT
>> struct task_struct *p = current;
>>
>> if (p->migration_disabled > 1) {
>> @@ -2265,6 +2270,9 @@ void migrate_enable(void)
>> p->migration_disabled = 0;
>> this_rq()->nr_pinned--;
>> preempt_enable();
>> +#else
>> + preempt_enable();
>> +#endif
>> }
>> EXPORT_SYMBOL_GPL(migrate_enable);
>>
>> --
>> 2.36.1
On 7/5/22 12:24 AM, Peter Zijlstra wrote:
>
> Please, burn outlook and use a real MUA, this is unreadable garbage.
Yes, I shall. Sorry about that.
On 7/1/22 1:33 AM, Peter Zijlstra wrote:
> On Thu, Jun 30, 2022 at 10:53:10PM +0100, Qais Yousef wrote:
>> Hi Satya
>>
>> On 06/24/22 00:42, Satya Durga Srinivasu Prabhala wrote:
>>> Below recursion is observed in a rare scenario where __schedule()
>>> takes rq lock, at around same time task's affinity is being changed,
>>> bpf function for tracing sched_switch calls migrate_enabled(),
>>> checks for affinity change (cpus_ptr != cpus_mask) lands into
>>> __set_cpus_allowed_ptr which tries acquire rq lock and causing the
>>> recursion bug.
>>>
>>> Fix the issue by switching to preempt_enable/disable() for non-RT
>>> Kernels.
>> Interesting bug. Thanks for the report. Unfortunately I can't see this being
>> a fix as it just limits the bug visibility to PREEMPT_RT kernels, but won't fix
>> anything, no? ie: Kernels compiled with PREEMPT_RT will still hit this failure.
> Worse, there's !RT stuff that grew to rely on the preemptible migrate
> disable stuff, so this actively breaks things.
Sorry about that. I'm cross-checking further for ways to repro the issue easily.
>> I'm curious how the race with set affinity is happening. I would have thought
>> user space would get blocked as __schedule() will hold the rq lock.
>>
>> Do you have more details on that?
> Yeah, I'm not seeing how this works either, in order for
> migrate_enable() to actually call __set_cpus_allowed_ptr(), it needs to
> have done migrate_disable() *before* schedule, schedule() will then have
> to call migrate_disable_swich(), and *then* migrate_enable() does this.
>
> However, if things are nicely balanced (as they should be), then
> trace_call_bpf() using migrate_disable()/migrate_enable() should never
> hit this path.
>
> If, OTOH, migrate_disable() was called prior to schedule() and we did do
> migrate_disable_switch(), then it should be impossible for the
> tracepoint/bpf stuff to reach p->migration_disabled == 0.
>
Thanks for the explanation. Will cross-check further on these points.
>>> -010 |spin_bug(lock = ???, msg = ???)
>>> -011 |debug_spin_lock_before(inline)
>>> -011 |do_raw_spin_lock(lock = 0xFFFFFF89323BB600)
>>> -012 |_raw_spin_lock(inline)
>>> -012 |raw_spin_rq_lock_nested(inline)
>>> -012 |raw_spin_rq_lock(inline)
>>> -012 |task_rq_lock(p = 0xFFFFFF88CFF1DA00, rf = 0xFFFFFFC03707BBE8)
>>> -013 |__set_cpus_allowed_ptr(inline)
>>> -013 |migrate_enable()
>>> -014 |trace_call_bpf(call = ?, ctx = 0xFFFFFFFDEF954600)
>>> -015 |perf_trace_run_bpf_submit(inline)
>>> -015 |perf_trace_sched_switch(__data = 0xFFFFFFE82CF0BCB8, preempt = FALSE, prev = ?, next = ?)
>>> -016 |__traceiter_sched_switch(inline)
>>> -016 |trace_sched_switch(inline)
>>> -016 |__schedule(sched_mode = ?)
>>> -017 |schedule()
>>> -018 |arch_local_save_flags(inline)
>>> -018 |arch_irqs_disabled(inline)
>>> -018 |__raw_spin_lock_irq(inline)
>>> -018 |_raw_spin_lock_irq(inline)
>>> -018 |worker_thread(__worker = 0xFFFFFF88CE251300)
>>> -019 |kthread(_create = 0xFFFFFF88730A5A80)
>>> -020 |ret_from_fork(asm)
> This doesn't clarify much. Please explain how things got to be
> unbalanced, don't ever just make a problem dissapear like this without
> understanding what the root cause is, that'll just get your reputation
> sullied.
Agreed, thanks for the comments and suggestions. Yes, I'm trying to cross-check
further and find ways to repro the issue. I will get back once I find a better
way to handle it. I should have just asked for comments/feedback on the issue
first instead of proposing a fix. Lesson learnt :)