2020-04-01 21:42:37

by Qian Cai

[permalink] [raw]
Subject: [PATCH v2] sched/core: fix illegal RCU from offline CPUs

From: Peter Zijlstra <[email protected]>

In the CPU-offline process, it calls mmdrop() after idle entry and the
subsequent call to cpuhp_report_idle_dead(). Once execution passes the
call to rcu_report_dead(), RCU is ignoring the CPU, which results in
lockdep complaining when mmdrop() uses RCU from either memcg or
debugobjects below.

Fix it by cleaning up the active_mm state from BP instead. Every arch
which has CONFIG_HOTPLUG_CPU should have already called idle_task_exit()
from AP. The only exception is parisc because it switches them to
&init_mm unconditionally (see smp_boot_one_cpu() and smp_cpu_init()),
but the patch will still work there because it calls mmgrab(&init_mm) in
smp_cpu_init() and then should call mmdrop(&init_mm) in finish_cpu().

WARNING: suspicious RCU usage
-----------------------------
kernel/workqueue.c:710 RCU or wq_pool_mutex should be held!

other info that might help us debug this:

RCU used illegally from offline CPU!
Call Trace:
dump_stack+0xf4/0x164 (unreliable)
lockdep_rcu_suspicious+0x140/0x164
get_work_pool+0x110/0x150
__queue_work+0x1bc/0xca0
queue_work_on+0x114/0x120
css_release+0x9c/0xc0
percpu_ref_put_many+0x204/0x230
free_pcp_prepare+0x264/0x570
free_unref_page+0x38/0xf0
__mmdrop+0x21c/0x2c0
idle_task_exit+0x170/0x1b0
pnv_smp_cpu_kill_self+0x38/0x2e0
cpu_die+0x48/0x64
arch_cpu_idle_dead+0x30/0x50
do_idle+0x2f4/0x470
cpu_startup_entry+0x38/0x40
start_secondary+0x7a8/0xa80
start_secondary_resume+0x10/0x14

<Peter to sign off here>
Signed-off-by: Qian Cai <[email protected]>
---
arch/powerpc/platforms/powernv/smp.c | 1 -
include/linux/sched/mm.h | 2 ++
kernel/cpu.c | 18 +++++++++++++++++-
kernel/sched/core.c | 5 +++--
4 files changed, 22 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/smp.c b/arch/powerpc/platforms/powernv/smp.c
index 13e251699346..b2ba3e95bda7 100644
--- a/arch/powerpc/platforms/powernv/smp.c
+++ b/arch/powerpc/platforms/powernv/smp.c
@@ -167,7 +167,6 @@ static void pnv_smp_cpu_kill_self(void)
/* Standard hot unplug procedure */

idle_task_exit();
- current->active_mm = NULL; /* for sanity */
cpu = smp_processor_id();
DBG("CPU%d offline\n", cpu);
generic_set_cpu_dead(cpu);
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index c49257a3b510..a132d875d351 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -49,6 +49,8 @@ static inline void mmdrop(struct mm_struct *mm)
__mmdrop(mm);
}

+void mmdrop(struct mm_struct *mm);
+
/*
* This has to be called after a get_task_mm()/mmget_not_zero()
* followed by taking the mmap_sem for writing before modifying the
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 2371292f30b0..244d30544377 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -3,6 +3,7 @@
*
* This code is licenced under the GPL.
*/
+#include <linux/sched/mm.h>
#include <linux/proc_fs.h>
#include <linux/smp.h>
#include <linux/init.h>
@@ -564,6 +565,21 @@ static int bringup_cpu(unsigned int cpu)
return bringup_wait_for_ap(cpu);
}

+static int finish_cpu(unsigned int cpu)
+{
+ struct task_struct *idle = idle_thread_get(cpu);
+ struct mm_struct *mm = idle->active_mm;
+
+ /*
+ * idle_task_exit() will have switched to &init_mm, now
+ * clean up any remaining active_mm state.
+ */
+ if (mm != &init_mm)
+ idle->active_mm = &init_mm;
+ mmdrop(mm);
+ return 0;
+}
+
/*
* Hotplug state machine related functions
*/
@@ -1549,7 +1565,7 @@ static struct cpuhp_step cpuhp_hp_states[] = {
[CPUHP_BRINGUP_CPU] = {
.name = "cpu:bringup",
.startup.single = bringup_cpu,
- .teardown.single = NULL,
+ .teardown.single = finish_cpu,
.cant_stop = true,
},
/* Final state before CPU kills itself */
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index a2694ba82874..8787958339d5 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6200,13 +6200,14 @@ void idle_task_exit(void)
struct mm_struct *mm = current->active_mm;

BUG_ON(cpu_online(smp_processor_id()));
+ BUG_ON(current != this_rq()->idle);

if (mm != &init_mm) {
switch_mm(mm, &init_mm, current);
- current->active_mm = &init_mm;
finish_arch_post_lock_switch();
}
- mmdrop(mm);
+
+ /* finish_cpu(), as ran on the BP, will clean up the active_mm state */
}

/*
--
2.21.0 (Apple Git-122.2)


2020-04-02 11:24:25

by Michael Ellerman

[permalink] [raw]
Subject: Re: [PATCH v2] sched/core: fix illegal RCU from offline CPUs

Qian Cai <[email protected]> writes:
> From: Peter Zijlstra <[email protected]>
>
> In the CPU-offline process, it calls mmdrop() after idle entry and the
> subsequent call to cpuhp_report_idle_dead(). Once execution passes the
> call to rcu_report_dead(), RCU is ignoring the CPU, which results in
> lockdep complaining when mmdrop() uses RCU from either memcg or
> debugobjects below.
>
> Fix it by cleaning up the active_mm state from BP instead. Every arch
> which has CONFIG_HOTPLUG_CPU should have already called idle_task_exit()
> from AP. The only exception is parisc because it switches them to
> &init_mm unconditionally (see smp_boot_one_cpu() and smp_cpu_init()),
> but the patch will still work there because it calls mmgrab(&init_mm) in
> smp_cpu_init() and then should call mmdrop(&init_mm) in finish_cpu().

Thanks for debugging this. How did you hit it in the first place?

A link to the original thread would have helped me:

https://lore.kernel.org/lkml/[email protected]/

> WARNING: suspicious RCU usage
> -----------------------------
> kernel/workqueue.c:710 RCU or wq_pool_mutex should be held!
>
> other info that might help us debug this:
>
> RCU used illegally from offline CPU!
> Call Trace:
> dump_stack+0xf4/0x164 (unreliable)
> lockdep_rcu_suspicious+0x140/0x164
> get_work_pool+0x110/0x150
> __queue_work+0x1bc/0xca0
> queue_work_on+0x114/0x120
> css_release+0x9c/0xc0
> percpu_ref_put_many+0x204/0x230
> free_pcp_prepare+0x264/0x570
> free_unref_page+0x38/0xf0
> __mmdrop+0x21c/0x2c0
> idle_task_exit+0x170/0x1b0
> pnv_smp_cpu_kill_self+0x38/0x2e0
> cpu_die+0x48/0x64
> arch_cpu_idle_dead+0x30/0x50
> do_idle+0x2f4/0x470
> cpu_startup_entry+0x38/0x40
> start_secondary+0x7a8/0xa80
> start_secondary_resume+0x10/0x14

Do we know when this started happening? ie. can we determine a Fixes
tag?

> <Peter to sign off here>
> Signed-off-by: Qian Cai <[email protected]>
> ---
> arch/powerpc/platforms/powernv/smp.c | 1 -
> include/linux/sched/mm.h | 2 ++
> kernel/cpu.c | 18 +++++++++++++++++-
> kernel/sched/core.c | 5 +++--
> 4 files changed, 22 insertions(+), 4 deletions(-)
>
> diff --git a/arch/powerpc/platforms/powernv/smp.c b/arch/powerpc/platforms/powernv/smp.c
> index 13e251699346..b2ba3e95bda7 100644
> --- a/arch/powerpc/platforms/powernv/smp.c
> +++ b/arch/powerpc/platforms/powernv/smp.c
> @@ -167,7 +167,6 @@ static void pnv_smp_cpu_kill_self(void)
> /* Standard hot unplug procedure */
>
> idle_task_exit();
> - current->active_mm = NULL; /* for sanity */

If I'm reading it right, we'll now be running with active_mm == init_mm
in the offline loop.

I guess that's fine, I can't think of any reason it would matter, and it
seems like we were NULL'ing it out just for paranoia's sake not because
of any actual problem.

Acked-by: Michael Ellerman <[email protected]> (powerpc)


cheers

> diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
> index c49257a3b510..a132d875d351 100644
> --- a/include/linux/sched/mm.h
> +++ b/include/linux/sched/mm.h
> @@ -49,6 +49,8 @@ static inline void mmdrop(struct mm_struct *mm)
> __mmdrop(mm);
> }
>
> +void mmdrop(struct mm_struct *mm);
> +
> /*
> * This has to be called after a get_task_mm()/mmget_not_zero()
> * followed by taking the mmap_sem for writing before modifying the
> diff --git a/kernel/cpu.c b/kernel/cpu.c
> index 2371292f30b0..244d30544377 100644
> --- a/kernel/cpu.c
> +++ b/kernel/cpu.c
> @@ -3,6 +3,7 @@
> *
> * This code is licenced under the GPL.
> */
> +#include <linux/sched/mm.h>
> #include <linux/proc_fs.h>
> #include <linux/smp.h>
> #include <linux/init.h>
> @@ -564,6 +565,21 @@ static int bringup_cpu(unsigned int cpu)
> return bringup_wait_for_ap(cpu);
> }
>
> +static int finish_cpu(unsigned int cpu)
> +{
> + struct task_struct *idle = idle_thread_get(cpu);
> + struct mm_struct *mm = idle->active_mm;
> +
> + /*
> + * idle_task_exit() will have switched to &init_mm, now
> + * clean up any remaining active_mm state.
> + */
> + if (mm != &init_mm)
> + idle->active_mm = &init_mm;
> + mmdrop(mm);
> + return 0;
> +}
> +
> /*
> * Hotplug state machine related functions
> */
> @@ -1549,7 +1565,7 @@ static struct cpuhp_step cpuhp_hp_states[] = {
> [CPUHP_BRINGUP_CPU] = {
> .name = "cpu:bringup",
> .startup.single = bringup_cpu,
> - .teardown.single = NULL,
> + .teardown.single = finish_cpu,
> .cant_stop = true,
> },
> /* Final state before CPU kills itself */
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index a2694ba82874..8787958339d5 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -6200,13 +6200,14 @@ void idle_task_exit(void)
> struct mm_struct *mm = current->active_mm;
>
> BUG_ON(cpu_online(smp_processor_id()));
> + BUG_ON(current != this_rq()->idle);
>
> if (mm != &init_mm) {
> switch_mm(mm, &init_mm, current);
> - current->active_mm = &init_mm;
> finish_arch_post_lock_switch();
> }
> - mmdrop(mm);
> +
> + /* finish_cpu(), as ran on the BP, will clean up the active_mm state */
> }
>
> /*
> --
> 2.21.0 (Apple Git-122.2)

2020-04-02 14:02:44

by Qian Cai

[permalink] [raw]
Subject: Re: [PATCH v2] sched/core: fix illegal RCU from offline CPUs



> On Apr 2, 2020, at 7:24 AM, Michael Ellerman <[email protected]> wrote:
>
> Qian Cai <[email protected]> writes:
>> From: Peter Zijlstra <[email protected]>
>>
>> In the CPU-offline process, it calls mmdrop() after idle entry and the
>> subsequent call to cpuhp_report_idle_dead(). Once execution passes the
>> call to rcu_report_dead(), RCU is ignoring the CPU, which results in
>> lockdep complaining when mmdrop() uses RCU from either memcg or
>> debugobjects below.
>>
>> Fix it by cleaning up the active_mm state from BP instead. Every arch
>> which has CONFIG_HOTPLUG_CPU should have already called idle_task_exit()
>> from AP. The only exception is parisc because it switches them to
>> &init_mm unconditionally (see smp_boot_one_cpu() and smp_cpu_init()),
>> but the patch will still work there because it calls mmgrab(&init_mm) in
>> smp_cpu_init() and then should call mmdrop(&init_mm) in finish_cpu().
>
> Thanks for debugging this. How did you hit it in the first place?

Just repeatedly offline/online CPUs which will eventually cause an idle thread
refcount goes to 0 and trigger __mmdrop() and of course it needs to enable
lockdep (PROVE_RCU?) as well as having luck to hit the cgroup, workqueue
or debugobject code paths to call RCU.

>
> A link to the original thread would have helped me:
>
> https://lore.kernel.org/lkml/[email protected]/
>
>> WARNING: suspicious RCU usage
>> -----------------------------
>> kernel/workqueue.c:710 RCU or wq_pool_mutex should be held!
>>
>> other info that might help us debug this:
>>
>> RCU used illegally from offline CPU!
>> Call Trace:
>> dump_stack+0xf4/0x164 (unreliable)
>> lockdep_rcu_suspicious+0x140/0x164
>> get_work_pool+0x110/0x150
>> __queue_work+0x1bc/0xca0
>> queue_work_on+0x114/0x120
>> css_release+0x9c/0xc0
>> percpu_ref_put_many+0x204/0x230
>> free_pcp_prepare+0x264/0x570
>> free_unref_page+0x38/0xf0
>> __mmdrop+0x21c/0x2c0
>> idle_task_exit+0x170/0x1b0
>> pnv_smp_cpu_kill_self+0x38/0x2e0
>> cpu_die+0x48/0x64
>> arch_cpu_idle_dead+0x30/0x50
>> do_idle+0x2f4/0x470
>> cpu_startup_entry+0x38/0x40
>> start_secondary+0x7a8/0xa80
>> start_secondary_resume+0x10/0x14
>
> Do we know when this started happening? ie. can we determine a Fixes
> tag?

I don’t know. I looked at some commits that it seems the code was like that
even 10-year ago. It must be nobody who cares to run lockdep (PROVE_RCU?)
with CPU hotplug very regularly.

>
>> <Peter to sign off here>
>> Signed-off-by: Qian Cai <[email protected]>
>> ---
>> arch/powerpc/platforms/powernv/smp.c | 1 -
>> include/linux/sched/mm.h | 2 ++
>> kernel/cpu.c | 18 +++++++++++++++++-
>> kernel/sched/core.c | 5 +++--
>> 4 files changed, 22 insertions(+), 4 deletions(-)
>>
>> diff --git a/arch/powerpc/platforms/powernv/smp.c b/arch/powerpc/platforms/powernv/smp.c
>> index 13e251699346..b2ba3e95bda7 100644
>> --- a/arch/powerpc/platforms/powernv/smp.c
>> +++ b/arch/powerpc/platforms/powernv/smp.c
>> @@ -167,7 +167,6 @@ static void pnv_smp_cpu_kill_self(void)
>> /* Standard hot unplug procedure */
>>
>> idle_task_exit();
>> - current->active_mm = NULL; /* for sanity */
>
> If I'm reading it right, we'll now be running with active_mm == init_mm
> in the offline loop.
>
> I guess that's fine, I can't think of any reason it would matter, and it
> seems like we were NULL'ing it out just for paranoia's sake not because
> of any actual problem.
>
> Acked-by: Michael Ellerman <[email protected]> (powerpc)
>
>
> cheers
>
>> diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
>> index c49257a3b510..a132d875d351 100644
>> --- a/include/linux/sched/mm.h
>> +++ b/include/linux/sched/mm.h
>> @@ -49,6 +49,8 @@ static inline void mmdrop(struct mm_struct *mm)
>> __mmdrop(mm);
>> }
>>
>> +void mmdrop(struct mm_struct *mm);
>> +
>> /*
>> * This has to be called after a get_task_mm()/mmget_not_zero()
>> * followed by taking the mmap_sem for writing before modifying the
>> diff --git a/kernel/cpu.c b/kernel/cpu.c
>> index 2371292f30b0..244d30544377 100644
>> --- a/kernel/cpu.c
>> +++ b/kernel/cpu.c
>> @@ -3,6 +3,7 @@
>> *
>> * This code is licenced under the GPL.
>> */
>> +#include <linux/sched/mm.h>
>> #include <linux/proc_fs.h>
>> #include <linux/smp.h>
>> #include <linux/init.h>
>> @@ -564,6 +565,21 @@ static int bringup_cpu(unsigned int cpu)
>> return bringup_wait_for_ap(cpu);
>> }
>>
>> +static int finish_cpu(unsigned int cpu)
>> +{
>> + struct task_struct *idle = idle_thread_get(cpu);
>> + struct mm_struct *mm = idle->active_mm;
>> +
>> + /*
>> + * idle_task_exit() will have switched to &init_mm, now
>> + * clean up any remaining active_mm state.
>> + */
>> + if (mm != &init_mm)
>> + idle->active_mm = &init_mm;
>> + mmdrop(mm);
>> + return 0;
>> +}
>> +
>> /*
>> * Hotplug state machine related functions
>> */
>> @@ -1549,7 +1565,7 @@ static struct cpuhp_step cpuhp_hp_states[] = {
>> [CPUHP_BRINGUP_CPU] = {
>> .name = "cpu:bringup",
>> .startup.single = bringup_cpu,
>> - .teardown.single = NULL,
>> + .teardown.single = finish_cpu,
>> .cant_stop = true,
>> },
>> /* Final state before CPU kills itself */
>> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
>> index a2694ba82874..8787958339d5 100644
>> --- a/kernel/sched/core.c
>> +++ b/kernel/sched/core.c
>> @@ -6200,13 +6200,14 @@ void idle_task_exit(void)
>> struct mm_struct *mm = current->active_mm;
>>
>> BUG_ON(cpu_online(smp_processor_id()));
>> + BUG_ON(current != this_rq()->idle);
>>
>> if (mm != &init_mm) {
>> switch_mm(mm, &init_mm, current);
>> - current->active_mm = &init_mm;
>> finish_arch_post_lock_switch();
>> }
>> - mmdrop(mm);
>> +
>> + /* finish_cpu(), as ran on the BP, will clean up the active_mm state */
>> }
>>
>> /*
>> --
>> 2.21.0 (Apple Git-122.2)

2020-04-02 15:55:10

by Paul E. McKenney

[permalink] [raw]
Subject: Re: [PATCH v2] sched/core: fix illegal RCU from offline CPUs

On Thu, Apr 02, 2020 at 10:00:16AM -0400, Qian Cai wrote:
>
>
> > On Apr 2, 2020, at 7:24 AM, Michael Ellerman <[email protected]> wrote:
> >
> > Qian Cai <[email protected]> writes:
> >> From: Peter Zijlstra <[email protected]>
> >>
> >> In the CPU-offline process, it calls mmdrop() after idle entry and the
> >> subsequent call to cpuhp_report_idle_dead(). Once execution passes the
> >> call to rcu_report_dead(), RCU is ignoring the CPU, which results in
> >> lockdep complaining when mmdrop() uses RCU from either memcg or
> >> debugobjects below.
> >>
> >> Fix it by cleaning up the active_mm state from BP instead. Every arch
> >> which has CONFIG_HOTPLUG_CPU should have already called idle_task_exit()
> >> from AP. The only exception is parisc because it switches them to
> >> &init_mm unconditionally (see smp_boot_one_cpu() and smp_cpu_init()),
> >> but the patch will still work there because it calls mmgrab(&init_mm) in
> >> smp_cpu_init() and then should call mmdrop(&init_mm) in finish_cpu().
> >
> > Thanks for debugging this. How did you hit it in the first place?
>
> Just repeatedly offline/online CPUs which will eventually cause an idle thread
> refcount goes to 0 and trigger __mmdrop() and of course it needs to enable
> lockdep (PROVE_RCU?) as well as having luck to hit the cgroup, workqueue
> or debugobject code paths to call RCU.
>
> >
> > A link to the original thread would have helped me:
> >
> > https://lore.kernel.org/lkml/[email protected]/
> >
> >> WARNING: suspicious RCU usage
> >> -----------------------------
> >> kernel/workqueue.c:710 RCU or wq_pool_mutex should be held!
> >>
> >> other info that might help us debug this:
> >>
> >> RCU used illegally from offline CPU!
> >> Call Trace:
> >> dump_stack+0xf4/0x164 (unreliable)
> >> lockdep_rcu_suspicious+0x140/0x164
> >> get_work_pool+0x110/0x150
> >> __queue_work+0x1bc/0xca0
> >> queue_work_on+0x114/0x120
> >> css_release+0x9c/0xc0
> >> percpu_ref_put_many+0x204/0x230
> >> free_pcp_prepare+0x264/0x570
> >> free_unref_page+0x38/0xf0
> >> __mmdrop+0x21c/0x2c0
> >> idle_task_exit+0x170/0x1b0
> >> pnv_smp_cpu_kill_self+0x38/0x2e0
> >> cpu_die+0x48/0x64
> >> arch_cpu_idle_dead+0x30/0x50
> >> do_idle+0x2f4/0x470
> >> cpu_startup_entry+0x38/0x40
> >> start_secondary+0x7a8/0xa80
> >> start_secondary_resume+0x10/0x14
> >
> > Do we know when this started happening? ie. can we determine a Fixes
> > tag?
>
> I don’t know. I looked at some commits that it seems the code was like that
> even 10-year ago. It must be nobody who cares to run lockdep (PROVE_RCU?)
> with CPU hotplug very regularly.

I do run this combination quite frequently, but only as part of
rcutorture, which might not be a representative workload. For one thing,
it has a minimal userspace consisting only of a trivial init program.
I don't recall having ever seen this. (I have seen one recent complaint
about an IPI being sent to an offline CPU, but I cannot prove that this
was not due to RCU bugs that I was chasing at the time.)

Thanx, Paul

> >> <Peter to sign off here>
> >> Signed-off-by: Qian Cai <[email protected]>
> >> ---
> >> arch/powerpc/platforms/powernv/smp.c | 1 -
> >> include/linux/sched/mm.h | 2 ++
> >> kernel/cpu.c | 18 +++++++++++++++++-
> >> kernel/sched/core.c | 5 +++--
> >> 4 files changed, 22 insertions(+), 4 deletions(-)
> >>
> >> diff --git a/arch/powerpc/platforms/powernv/smp.c b/arch/powerpc/platforms/powernv/smp.c
> >> index 13e251699346..b2ba3e95bda7 100644
> >> --- a/arch/powerpc/platforms/powernv/smp.c
> >> +++ b/arch/powerpc/platforms/powernv/smp.c
> >> @@ -167,7 +167,6 @@ static void pnv_smp_cpu_kill_self(void)
> >> /* Standard hot unplug procedure */
> >>
> >> idle_task_exit();
> >> - current->active_mm = NULL; /* for sanity */
> >
> > If I'm reading it right, we'll now be running with active_mm == init_mm
> > in the offline loop.
> >
> > I guess that's fine, I can't think of any reason it would matter, and it
> > seems like we were NULL'ing it out just for paranoia's sake not because
> > of any actual problem.
> >
> > Acked-by: Michael Ellerman <[email protected]> (powerpc)
> >
> >
> > cheers
> >
> >> diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
> >> index c49257a3b510..a132d875d351 100644
> >> --- a/include/linux/sched/mm.h
> >> +++ b/include/linux/sched/mm.h
> >> @@ -49,6 +49,8 @@ static inline void mmdrop(struct mm_struct *mm)
> >> __mmdrop(mm);
> >> }
> >>
> >> +void mmdrop(struct mm_struct *mm);
> >> +
> >> /*
> >> * This has to be called after a get_task_mm()/mmget_not_zero()
> >> * followed by taking the mmap_sem for writing before modifying the
> >> diff --git a/kernel/cpu.c b/kernel/cpu.c
> >> index 2371292f30b0..244d30544377 100644
> >> --- a/kernel/cpu.c
> >> +++ b/kernel/cpu.c
> >> @@ -3,6 +3,7 @@
> >> *
> >> * This code is licenced under the GPL.
> >> */
> >> +#include <linux/sched/mm.h>
> >> #include <linux/proc_fs.h>
> >> #include <linux/smp.h>
> >> #include <linux/init.h>
> >> @@ -564,6 +565,21 @@ static int bringup_cpu(unsigned int cpu)
> >> return bringup_wait_for_ap(cpu);
> >> }
> >>
> >> +static int finish_cpu(unsigned int cpu)
> >> +{
> >> + struct task_struct *idle = idle_thread_get(cpu);
> >> + struct mm_struct *mm = idle->active_mm;
> >> +
> >> + /*
> >> + * idle_task_exit() will have switched to &init_mm, now
> >> + * clean up any remaining active_mm state.
> >> + */
> >> + if (mm != &init_mm)
> >> + idle->active_mm = &init_mm;
> >> + mmdrop(mm);
> >> + return 0;
> >> +}
> >> +
> >> /*
> >> * Hotplug state machine related functions
> >> */
> >> @@ -1549,7 +1565,7 @@ static struct cpuhp_step cpuhp_hp_states[] = {
> >> [CPUHP_BRINGUP_CPU] = {
> >> .name = "cpu:bringup",
> >> .startup.single = bringup_cpu,
> >> - .teardown.single = NULL,
> >> + .teardown.single = finish_cpu,
> >> .cant_stop = true,
> >> },
> >> /* Final state before CPU kills itself */
> >> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> >> index a2694ba82874..8787958339d5 100644
> >> --- a/kernel/sched/core.c
> >> +++ b/kernel/sched/core.c
> >> @@ -6200,13 +6200,14 @@ void idle_task_exit(void)
> >> struct mm_struct *mm = current->active_mm;
> >>
> >> BUG_ON(cpu_online(smp_processor_id()));
> >> + BUG_ON(current != this_rq()->idle);
> >>
> >> if (mm != &init_mm) {
> >> switch_mm(mm, &init_mm, current);
> >> - current->active_mm = &init_mm;
> >> finish_arch_post_lock_switch();
> >> }
> >> - mmdrop(mm);
> >> +
> >> + /* finish_cpu(), as ran on the BP, will clean up the active_mm state */
> >> }
> >>
> >> /*
> >> --
> >> 2.21.0 (Apple Git-122.2)
>

2020-04-02 16:22:06

by Qian Cai

[permalink] [raw]
Subject: Re: [PATCH v2] sched/core: fix illegal RCU from offline CPUs



> On Apr 2, 2020, at 11:54 AM, Paul E. McKenney <[email protected]> wrote:
>
> I do run this combination quite frequently, but only as part of
> rcutorture, which might not be a representative workload. For one thing,
> it has a minimal userspace consisting only of a trivial init program.
> I don't recall having ever seen this. (I have seen one recent complaint
> about an IPI being sent to an offline CPU, but I cannot prove that this
> was not due to RCU bugs that I was chasing at the time.)

Yes, a trivial init is tough while running systemd should be able to catch it as it will use cgroup.

2020-04-02 18:07:33

by Paul E. McKenney

[permalink] [raw]
Subject: Re: [PATCH v2] sched/core: fix illegal RCU from offline CPUs

On Thu, Apr 02, 2020 at 12:19:54PM -0400, Qian Cai wrote:
>
>
> > On Apr 2, 2020, at 11:54 AM, Paul E. McKenney <[email protected]> wrote:
> >
> > I do run this combination quite frequently, but only as part of
> > rcutorture, which might not be a representative workload. For one thing,
> > it has a minimal userspace consisting only of a trivial init program.
> > I don't recall having ever seen this. (I have seen one recent complaint
> > about an IPI being sent to an offline CPU, but I cannot prove that this
> > was not due to RCU bugs that I was chasing at the time.)
>
> Yes, a trivial init is tough while running systemd should be able to catch it as it will use cgroup.

Not planning to add systemd to my rcutorture runs. ;-)

Thanx, Paul

2020-04-17 13:30:46

by Qian Cai

[permalink] [raw]
Subject: Re: [PATCH v2] sched/core: fix illegal RCU from offline CPUs



> On Apr 2, 2020, at 7:24 AM, Michael Ellerman <[email protected]> wrote:
>
> Qian Cai <[email protected]> writes:
>> From: Peter Zijlstra <[email protected]>
>>
>> In the CPU-offline process, it calls mmdrop() after idle entry and the
>> subsequent call to cpuhp_report_idle_dead(). Once execution passes the
>> call to rcu_report_dead(), RCU is ignoring the CPU, which results in
>> lockdep complaining when mmdrop() uses RCU from either memcg or
>> debugobjects below.
>>
>> Fix it by cleaning up the active_mm state from BP instead. Every arch
>> which has CONFIG_HOTPLUG_CPU should have already called idle_task_exit()
>> from AP. The only exception is parisc because it switches them to
>> &init_mm unconditionally (see smp_boot_one_cpu() and smp_cpu_init()),
>> but the patch will still work there because it calls mmgrab(&init_mm) in
>> smp_cpu_init() and then should call mmdrop(&init_mm) in finish_cpu().
>
> Thanks for debugging this. How did you hit it in the first place?
>
> A link to the original thread would have helped me:
>
> https://lore.kernel.org/lkml/[email protected]/
>
>> WARNING: suspicious RCU usage
>> -----------------------------
>> kernel/workqueue.c:710 RCU or wq_pool_mutex should be held!
>>
>> other info that might help us debug this:
>>
>> RCU used illegally from offline CPU!
>> Call Trace:
>> dump_stack+0xf4/0x164 (unreliable)
>> lockdep_rcu_suspicious+0x140/0x164
>> get_work_pool+0x110/0x150
>> __queue_work+0x1bc/0xca0
>> queue_work_on+0x114/0x120
>> css_release+0x9c/0xc0
>> percpu_ref_put_many+0x204/0x230
>> free_pcp_prepare+0x264/0x570
>> free_unref_page+0x38/0xf0
>> __mmdrop+0x21c/0x2c0
>> idle_task_exit+0x170/0x1b0
>> pnv_smp_cpu_kill_self+0x38/0x2e0
>> cpu_die+0x48/0x64
>> arch_cpu_idle_dead+0x30/0x50
>> do_idle+0x2f4/0x470
>> cpu_startup_entry+0x38/0x40
>> start_secondary+0x7a8/0xa80
>> start_secondary_resume+0x10/0x14
>
> Do we know when this started happening? ie. can we determine a Fixes
> tag?
>
>> <Peter to sign off here>
>> Signed-off-by: Qian Cai <[email protected]>
>> ---
>> arch/powerpc/platforms/powernv/smp.c | 1 -
>> include/linux/sched/mm.h | 2 ++
>> kernel/cpu.c | 18 +++++++++++++++++-
>> kernel/sched/core.c | 5 +++--
>> 4 files changed, 22 insertions(+), 4 deletions(-)
>>
>> diff --git a/arch/powerpc/platforms/powernv/smp.c b/arch/powerpc/platforms/powernv/smp.c
>> index 13e251699346..b2ba3e95bda7 100644
>> --- a/arch/powerpc/platforms/powernv/smp.c
>> +++ b/arch/powerpc/platforms/powernv/smp.c
>> @@ -167,7 +167,6 @@ static void pnv_smp_cpu_kill_self(void)
>> /* Standard hot unplug procedure */
>>
>> idle_task_exit();
>> - current->active_mm = NULL; /* for sanity */
>
> If I'm reading it right, we'll now be running with active_mm == init_mm
> in the offline loop.
>
> I guess that's fine, I can't think of any reason it would matter, and it
> seems like we were NULL'ing it out just for paranoia's sake not because
> of any actual problem.
>
> Acked-by: Michael Ellerman <[email protected]> (powerpc)

Peter, can you take a look at this patch when you have a chance?

>
>
> cheers
>
>> diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
>> index c49257a3b510..a132d875d351 100644
>> --- a/include/linux/sched/mm.h
>> +++ b/include/linux/sched/mm.h
>> @@ -49,6 +49,8 @@ static inline void mmdrop(struct mm_struct *mm)
>> __mmdrop(mm);
>> }
>>
>> +void mmdrop(struct mm_struct *mm);
>> +
>> /*
>> * This has to be called after a get_task_mm()/mmget_not_zero()
>> * followed by taking the mmap_sem for writing before modifying the
>> diff --git a/kernel/cpu.c b/kernel/cpu.c
>> index 2371292f30b0..244d30544377 100644
>> --- a/kernel/cpu.c
>> +++ b/kernel/cpu.c
>> @@ -3,6 +3,7 @@
>> *
>> * This code is licenced under the GPL.
>> */
>> +#include <linux/sched/mm.h>
>> #include <linux/proc_fs.h>
>> #include <linux/smp.h>
>> #include <linux/init.h>
>> @@ -564,6 +565,21 @@ static int bringup_cpu(unsigned int cpu)
>> return bringup_wait_for_ap(cpu);
>> }
>>
>> +static int finish_cpu(unsigned int cpu)
>> +{
>> + struct task_struct *idle = idle_thread_get(cpu);
>> + struct mm_struct *mm = idle->active_mm;
>> +
>> + /*
>> + * idle_task_exit() will have switched to &init_mm, now
>> + * clean up any remaining active_mm state.
>> + */
>> + if (mm != &init_mm)
>> + idle->active_mm = &init_mm;
>> + mmdrop(mm);
>> + return 0;
>> +}
>> +
>> /*
>> * Hotplug state machine related functions
>> */
>> @@ -1549,7 +1565,7 @@ static struct cpuhp_step cpuhp_hp_states[] = {
>> [CPUHP_BRINGUP_CPU] = {
>> .name = "cpu:bringup",
>> .startup.single = bringup_cpu,
>> - .teardown.single = NULL,
>> + .teardown.single = finish_cpu,
>> .cant_stop = true,
>> },
>> /* Final state before CPU kills itself */
>> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
>> index a2694ba82874..8787958339d5 100644
>> --- a/kernel/sched/core.c
>> +++ b/kernel/sched/core.c
>> @@ -6200,13 +6200,14 @@ void idle_task_exit(void)
>> struct mm_struct *mm = current->active_mm;
>>
>> BUG_ON(cpu_online(smp_processor_id()));
>> + BUG_ON(current != this_rq()->idle);
>>
>> if (mm != &init_mm) {
>> switch_mm(mm, &init_mm, current);
>> - current->active_mm = &init_mm;
>> finish_arch_post_lock_switch();
>> }
>> - mmdrop(mm);
>> +
>> + /* finish_cpu(), as ran on the BP, will clean up the active_mm state */
>> }
>>
>> /*
>> --
>> 2.21.0 (Apple Git-122.2)

2020-04-21 14:01:11

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH v2] sched/core: fix illegal RCU from offline CPUs

On Fri, Apr 17, 2020 at 09:26:56AM -0400, Qian Cai wrote:

> > Acked-by: Michael Ellerman <[email protected]> (powerpc)
>
> Peter, can you take a look at this patch when you have a chance?

Sorry, -ETOOMUCHEMAIL, got it now, thanks!

2020-05-01 18:26:18

by tip-bot2 for Haifeng Xu

[permalink] [raw]
Subject: [tip: sched/core] sched/core: Fix illegal RCU from offline CPUs

The following commit has been merged into the sched/core branch of tip:

Commit-ID: bf2c59fce4074e55d622089b34be3a6bc95484fb
Gitweb: https://git.kernel.org/tip/bf2c59fce4074e55d622089b34be3a6bc95484fb
Author: Peter Zijlstra <[email protected]>
AuthorDate: Wed, 01 Apr 2020 17:40:33 -04:00
Committer: Peter Zijlstra <[email protected]>
CommitterDate: Thu, 30 Apr 2020 20:14:41 +02:00

sched/core: Fix illegal RCU from offline CPUs

In the CPU-offline process, it calls mmdrop() after idle entry and the
subsequent call to cpuhp_report_idle_dead(). Once execution passes the
call to rcu_report_dead(), RCU is ignoring the CPU, which results in
lockdep complaining when mmdrop() uses RCU from either memcg or
debugobjects below.

Fix it by cleaning up the active_mm state from BP instead. Every arch
which has CONFIG_HOTPLUG_CPU should have already called idle_task_exit()
from AP. The only exception is parisc because it switches them to
&init_mm unconditionally (see smp_boot_one_cpu() and smp_cpu_init()),
but the patch will still work there because it calls mmgrab(&init_mm) in
smp_cpu_init() and then should call mmdrop(&init_mm) in finish_cpu().

WARNING: suspicious RCU usage
-----------------------------
kernel/workqueue.c:710 RCU or wq_pool_mutex should be held!

other info that might help us debug this:

RCU used illegally from offline CPU!
Call Trace:
dump_stack+0xf4/0x164 (unreliable)
lockdep_rcu_suspicious+0x140/0x164
get_work_pool+0x110/0x150
__queue_work+0x1bc/0xca0
queue_work_on+0x114/0x120
css_release+0x9c/0xc0
percpu_ref_put_many+0x204/0x230
free_pcp_prepare+0x264/0x570
free_unref_page+0x38/0xf0
__mmdrop+0x21c/0x2c0
idle_task_exit+0x170/0x1b0
pnv_smp_cpu_kill_self+0x38/0x2e0
cpu_die+0x48/0x64
arch_cpu_idle_dead+0x30/0x50
do_idle+0x2f4/0x470
cpu_startup_entry+0x38/0x40
start_secondary+0x7a8/0xa80
start_secondary_resume+0x10/0x14

Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Signed-off-by: Qian Cai <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Acked-by: Michael Ellerman <[email protected]> (powerpc)
Link: https://lkml.kernel.org/r/[email protected]
---
arch/powerpc/platforms/powernv/smp.c | 1 -
include/linux/sched/mm.h | 2 ++
kernel/cpu.c | 18 +++++++++++++++++-
kernel/sched/core.c | 5 +++--
4 files changed, 22 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/smp.c b/arch/powerpc/platforms/powernv/smp.c
index 13e2516..b2ba3e9 100644
--- a/arch/powerpc/platforms/powernv/smp.c
+++ b/arch/powerpc/platforms/powernv/smp.c
@@ -167,7 +167,6 @@ static void pnv_smp_cpu_kill_self(void)
/* Standard hot unplug procedure */

idle_task_exit();
- current->active_mm = NULL; /* for sanity */
cpu = smp_processor_id();
DBG("CPU%d offline\n", cpu);
generic_set_cpu_dead(cpu);
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index c49257a..a132d87 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -49,6 +49,8 @@ static inline void mmdrop(struct mm_struct *mm)
__mmdrop(mm);
}

+void mmdrop(struct mm_struct *mm);
+
/*
* This has to be called after a get_task_mm()/mmget_not_zero()
* followed by taking the mmap_sem for writing before modifying the
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 2371292..244d305 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -3,6 +3,7 @@
*
* This code is licenced under the GPL.
*/
+#include <linux/sched/mm.h>
#include <linux/proc_fs.h>
#include <linux/smp.h>
#include <linux/init.h>
@@ -564,6 +565,21 @@ static int bringup_cpu(unsigned int cpu)
return bringup_wait_for_ap(cpu);
}

+static int finish_cpu(unsigned int cpu)
+{
+ struct task_struct *idle = idle_thread_get(cpu);
+ struct mm_struct *mm = idle->active_mm;
+
+ /*
+ * idle_task_exit() will have switched to &init_mm, now
+ * clean up any remaining active_mm state.
+ */
+ if (mm != &init_mm)
+ idle->active_mm = &init_mm;
+ mmdrop(mm);
+ return 0;
+}
+
/*
* Hotplug state machine related functions
*/
@@ -1549,7 +1565,7 @@ static struct cpuhp_step cpuhp_hp_states[] = {
[CPUHP_BRINGUP_CPU] = {
.name = "cpu:bringup",
.startup.single = bringup_cpu,
- .teardown.single = NULL,
+ .teardown.single = finish_cpu,
.cant_stop = true,
},
/* Final state before CPU kills itself */
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 2e6ba9e..f6ae262 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6197,13 +6197,14 @@ void idle_task_exit(void)
struct mm_struct *mm = current->active_mm;

BUG_ON(cpu_online(smp_processor_id()));
+ BUG_ON(current != this_rq()->idle);

if (mm != &init_mm) {
switch_mm(mm, &init_mm, current);
- current->active_mm = &init_mm;
finish_arch_post_lock_switch();
}
- mmdrop(mm);
+
+ /* finish_cpu(), as ran on the BP, will clean up the active_mm state */
}

/*