The linux-next commit ff7db0bf24db ("sched/numa: Prefer using an idle CPU as a
migration target instead of comparing tasks") introduced a boot warning,
[ 86.520534][ T1] WARNING: suspicious RCU usage
[ 86.520540][ T1] 5.6.0-rc3-next-20200227 #7 Not tainted
[ 86.520545][ T1] -----------------------------
[ 86.520551][ T1] kernel/sched/fair.c:5914 suspicious rcu_dereference_check() usage!
[ 86.520555][ T1]
[ 86.520555][ T1] other info that might help us debug this:
[ 86.520555][ T1]
[ 86.520561][ T1]
[ 86.520561][ T1] rcu_scheduler_active = 2, debug_locks = 1
[ 86.520567][ T1] 1 lock held by systemd/1:
[ 86.520571][ T1] #0: ffff8887f4b14848 (&mm->mmap_sem#2){++++}, at: do_page_fault+0x1d2/0x998
[ 86.520594][ T1]
[ 86.520594][ T1] stack backtrace:
[ 86.520602][ T1] CPU: 1 PID: 1 Comm: systemd Not tainted 5.6.0-rc3-next-20200227 #7
[ 86.520607][ T1] Hardware name: HP ProLiant XL450 Gen9 Server/ProLiant XL450 Gen9 Server, BIOS U21 05/05/2016
[ 86.520612][ T1] Call Trace:
[ 86.520623][ T1] dump_stack+0xa0/0xea
[ 86.520634][ T1] lockdep_rcu_suspicious+0x102/0x10b
lockdep_rcu_suspicious at kernel/locking/lockdep.c:5648
[ 86.520641][ T1] update_numa_stats+0x577/0x710
test_idle_cores at kernel/sched/fair.c:5914
(inlined by) numa_idle_core at kernel/sched/fair.c:1565
(inlined by) update_numa_stats at kernel/sched/fair.c:1610
[ 86.520649][ T1] ? rcu_read_lock_held+0xac/0xc0
[ 86.520657][ T1] task_numa_migrate+0x4aa/0xdb0
[ 86.520664][ T1] ? task_numa_find_cpu+0x1010/0x1010
[ 86.520677][ T1] ? migrate_pages+0x29c/0x17c0
[ 86.520683][ T1] task_numa_fault+0x607/0xd90
[ 86.520691][ T1] ? task_numa_free+0x230/0x230
[ 86.520698][ T1] ? __kasan_check_read+0x11/0x20
[ 86.520704][ T1] ? do_raw_spin_unlock+0xa8/0x140
[ 86.520712][ T1] do_numa_page+0x33f/0x450
[ 86.520720][ T1] __handle_mm_fault+0xb81/0xb90
[ 86.520727][ T1] ? copy_page_range+0x420/0x420
[ 86.520736][ T1] handle_mm_fault+0xdc/0x2e0
[ 86.520742][ T1] do_page_fault+0x2c7/0x998
[ 86.520752][ T1] page_fault+0x34/0x40
[ 86.520758][ T1] RIP: 0033:0x7f95faf63c53
[ 86.520766][ T1] Code: 00 41 00 3d 00 00 41 00 74 3d 48 8d 05 d6 5a 2d 00 8b 00 85 c0 75 61 b8 01 01 00 00 0f 05 48 3d 00 f0 ff ff 0f 87 a5 00 00 00 <48> 8b 4c 24 38 64 48 33 0c 25 28 00 00 00 0f 85 ba 00 00 00 48 83
[ 86.520771][ T1] RSP: 002b:00007ffdda737790 EFLAGS: 00010207
[ 86.520778][ T1] RAX: 0000000000000024 RBX: 0000562a594b9fd0 RCX: 00007f95faf63c47
[ 86.520783][ T1] RDX: 00000000002a0000 RSI: 0000562a594b9fd1 RDI: 0000000000000023
[ 86.520788][ T1] RBP: 00007ffdda7379c0 R08: 00007f95fc734e30 R09: 00007ffdda737d60
[ 86.520793][ T1] R10: 0000000000000000 R11: 0000000000000246 R12: 0000562a59459fb4
[ 86.520798][ T1] R13: 0000000000000000 R14: 0000000000000001 R15: 0000000000000000
On Thu, 2020-02-27 at 09:09 -0500, Qian Cai wrote:
> The linux-next commit ff7db0bf24db ("sched/numa: Prefer using an idle CPU as a
> migration target instead of comparing tasks") introduced a boot warning,
This?
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index a61d83ea2930..ca780cd1eae2 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1607,7 +1607,9 @@ static void update_numa_stats(struct task_numa_env *env,
 			if (ns->idle_cpu == -1)
 				ns->idle_cpu = cpu;
 
+			rcu_read_lock();
 			idle_core = numa_idle_core(idle_core, cpu);
+			rcu_read_unlock();
 		}
 	}
>
> [...]
On Thu, Feb 27 2020, Qian Cai wrote:
> [...]
>
Hmph right, we have
numa_idle_core()->test_idle_cores()->rcu_dereference().
Dunno if it's preferable to wrap the entirety of update_numa_stats() or
if that fine-grained read-side section is ok.
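For reference, fair.c:5914 here is the rcu_dereference() in
test_idle_cores(). A rough sketch of that helper at this snapshot (from
memory, not verbatim) shows why lockdep complains: the sd_llc_shared
pointer it chases is RCU-protected, and this path never takes
rcu_read_lock():

	static inline bool test_idle_cores(int cpu, bool def)
	{
		struct sched_domain_shared *sds;

		/* RCU-protected; lockdep flags this outside rcu_read_lock() */
		sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
		if (sds)
			return READ_ONCE(sds->has_idle_cores);

		return def;
	}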
On Thu, 2020-02-27 at 15:26 +0000, Valentin Schneider wrote:
> On Thu, Feb 27 2020, Qian Cai wrote:
> > [...]
>
> Hmph right, we have
> numa_idle_core()->test_idle_cores()->rcu_dereference().
>
> Dunno if it's preferable to wrap the entirety of update_numa_stats() or
> if that fine-grained read-side section is ok.
This is the most fine-grained version I could come up with.
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index a61d83ea2930..980d03fa157c 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1562,9 +1562,16 @@ static inline int numa_idle_core(int idle_core, int cpu)
 {
 #ifdef CONFIG_SCHED_SMT
 	if (!static_branch_likely(&sched_smt_present) ||
-	    idle_core >= 0 || !test_idle_cores(cpu, false))
-		return idle_core;
+	    idle_core >= 0) {
+		bool idle;
+		rcu_read_lock();
+		idle = test_idle_cores(cpu, false);
+		rcu_read_unlock();
+
+		if (!idle)
+			return idle_core;
+	}
 
 	/*
 	 * Prefer cores instead of packing HT siblings
 	 * and triggering future load balancing.
On Thu, 2020-02-27 at 11:35 -0500, Qian Cai wrote:
> [...]
>
> This is the most fine-grained version I could come up with.
Correction -- the first version inverted the short-circuit logic (it
only called test_idle_cores() when SMT was absent or an idle core had
already been found). This one keeps the original flow:
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index a61d83ea2930..580d56f9c10b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1561,10 +1561,18 @@ numa_type numa_classify(unsigned int imbalance_pct,
 static inline int numa_idle_core(int idle_core, int cpu)
 {
 #ifdef CONFIG_SCHED_SMT
+	bool idle;
+
 	if (!static_branch_likely(&sched_smt_present) ||
-	    idle_core >= 0 || !test_idle_cores(cpu, false))
+	    idle_core >= 0)
 		return idle_core;
 
+	rcu_read_lock();
+	idle = test_idle_cores(cpu, false);
+	rcu_read_unlock();
+
+	if (!idle)
+		return idle_core;
 	/*
 	 * Prefer cores instead of packing HT siblings
 	 * and triggering future load balancing.
On Thu, Feb 27, 2020 at 11:47:04AM -0500, Qian Cai wrote:
> [...]
>
Thanks for reporting this!
The proposed fix would add a lot of RCU lock/unlock pairs. While they
are cheap, they're not free, and it's a fairly standard pattern to
acquire the RCU read lock when scanning CPUs during a domain search
(load balancing, NOHZ balance, idle balance, etc.). While in this
context the lock is only needed for SMT, I don't think it's worth
fine-graining this or conditionally acquiring the RCU lock, so can we
keep it simple?
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 11cdba201425..d34ac4ea5cee 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1592,6 +1592,7 @@ static void update_numa_stats(struct task_numa_env *env,
 	memset(ns, 0, sizeof(*ns));
 	ns->idle_cpu = -1;
 
+	rcu_read_lock();
 	for_each_cpu(cpu, cpumask_of_node(nid)) {
 		struct rq *rq = cpu_rq(cpu);
 
@@ -1611,6 +1612,7 @@ static void update_numa_stats(struct task_numa_env *env,
 			idle_core = numa_idle_core(idle_core, cpu);
 		}
 	}
+	rcu_read_unlock();
 
 	ns->weight = cpumask_weight(cpumask_of_node(nid));
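For comparison, a single read-side section around the whole scan mirrors
the shape already used elsewhere in the scheduler, e.g. the sched-domain
walk in select_task_rq_fair(); roughly (sketch, details elided):

	rcu_read_lock();
	for_each_domain(cpu, sd) {
		/* sd and everything hanging off it are RCU-protected */
	}
	rcu_read_unlock();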
On Thu, Feb 27 2020, Mel Gorman wrote:
> Thanks for reporting this!
>
> The proposed fix would add a lot of RCU lock/unlock pairs. While they
> are cheap, they're not free, and it's a fairly standard pattern to
> acquire the RCU read lock when scanning CPUs during a domain search
> (load balancing, NOHZ balance, idle balance, etc.). While in this
> context the lock is only needed for SMT, I don't think it's worth
> fine-graining this or conditionally acquiring the RCU lock, so can we
> keep it simple?
>
> [...]
>
That's closer to what I was trying to suggest (i.e. broaden the section
rather than reduce it).
On Thu, Feb 27, 2020 at 05:19:34PM +0000, Mel Gorman wrote:
> [...]
>
> Thanks for reporting this!
>
> The proposed fix would add a lot of RCU lock/unlock pairs. While they
> are cheap, they're not free, and it's a fairly standard pattern to
> acquire the RCU read lock when scanning CPUs during a domain search
> (load balancing, NOHZ balance, idle balance, etc.). While in this
> context the lock is only needed for SMT, I don't think it's worth
> fine-graining this or conditionally acquiring the RCU lock, so can we
> keep it simple?
Indeed, scanning CPUs within a single RCU read-side critical section
should be OK, as long as each CPU isn't burning too much time. ;-)
Thanx, Paul