select_idle_cpu() scans the LLC domain for idle CPUs, which is always
expensive, so commit
1ad3aaf3fcd2 ("sched/core: Implement new approach to scale select_idle_cpu()")
introduced a way to limit how many CPUs we scan.

But this also leads to the following issue:

Our threads are all bound to the first few CPUs of the LLC domain, and
right now all of them run on the last CPU of that range. nr is always
smaller than the weight of the LLC span, and every CPU visited consumes
one of those nr attempts whether or not the task is allowed to run
there, so for_each_cpu_wrap() can exhaust nr on disallowed CPUs without
ever finding a CPU our threads can run on, and the threads stay on the
last CPU all the time.
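For example, here is a hypothetical trace with the task affined to CPUs
8-11, target == 11 and an assumed nr of 4 (the LLC span contains many
more CPUs than the task is allowed to use):

	/* assumed: target == 11, allowed CPUs == 8-11, nr == 4 */
	for_each_cpu_wrap(cpu, sched_domain_span(sd), target) {
		if (!--nr)				/* 4th visited CPU (14): nr hits 0    */
			return si_cpu;			/* -1: CPUs 8-10 were never examined  */
		if (!cpumask_test_cpu(cpu, p->cpus_ptr))
			continue;			/* CPUs 12 and 13 still cost one nr   */
		if (available_idle_cpu(cpu))		/* only CPU 11 gets here, and it is   */
			break;				/* busy                               */
		if (si_cpu == -1 && sched_idle_cpu(cpu))
			si_cpu = cpu;
	}
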
Fixes: 1ad3aaf3fcd2 ("sched/core: Implement new approach to scale select_idle_cpu()")
Signed-off-by: Cheng Jian <[email protected]>
---
kernel/sched/fair.c | 7 ++++---
1 file changed, 4 insertions(+), 3 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 08a233e97a01..16a29b570803 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5834,6 +5834,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
s64 delta;
int this = smp_processor_id();
int cpu, nr = INT_MAX, si_cpu = -1;
+ struct cpumask cpus;
this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
if (!this_sd)
@@ -5859,11 +5860,11 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
time = cpu_clock(this);
- for_each_cpu_wrap(cpu, sched_domain_span(sd), target) {
+ cpumask_and(&cpus, sched_domain_span(sd), p->cpus_ptr);
+
+ for_each_cpu_wrap(cpu, &cpus, target) {
if (!--nr)
return si_cpu;
- if (!cpumask_test_cpu(cpu, p->cpus_ptr))
- continue;
if (available_idle_cpu(cpu))
break;
if (si_cpu == -1 && sched_idle_cpu(cpu))
--
2.20.1
On 2019/12/12 22:41, Cheng Jian wrote:
> Our threads are all bound to the first few CPUs of the LLC domain, and
> right now all of them run on the last CPU of that range. nr is always
> smaller than the weight of the LLC span, and every CPU visited consumes
> one of those nr attempts whether or not the task is allowed to run
> there, so for_each_cpu_wrap() can exhaust nr on disallowed CPUs without
> ever finding a CPU our threads can run on, and the threads stay on the
> last CPU all the time.
Test:

Run on an ARM64 machine with 4 NUMA nodes and 128 cores.
// cat a.c
#include <time.h>

int main(void)
{
	struct timespec time, save;
	int ret;

	/* Sleep for 1 ns in a loop: every expiry is a fresh wakeup that
	 * goes through select_idle_cpu(). */
	time.tv_sec = 0;
	time.tv_nsec = 1;

	while (1) {
		ret = nanosleep(&time, &save);
		if (ret)	/* interrupted: sleep for the remaining time */
			nanosleep(&save, &save);
	}

	return 0;
}
# cat a.sh
for i in `seq 0 9`
do
	taskset -c 8-11 ./a.out &
done
Then run:

gcc a.c -o a.out
sh a.sh

Without this patch, you can see that all the tasks run on CPU 11 all the
time:
%Cpu8  :  0.0 us,  0.0 sy,  0.0 ni, 98.4 id,  0.0 wa,  1.6 hi,  0.0 si,  0.0 st
%Cpu9  :  0.0 us,  0.0 sy,  0.0 ni,100.0 id,  0.0 wa,  0.0 hi,  0.0 si,  0.0 st
%Cpu10 :  0.0 us,  0.0 sy,  0.0 ni,100.0 id,  0.0 wa,  0.0 hi,  0.0 si,  0.0 st
%Cpu11 :  5.7 us, 40.0 sy,  0.0 ni, 45.7 id,  0.0 wa,  8.6 hi,  0.0 si,  0.0 st
On Thu, Dec 12, 2019 at 10:41:02PM +0800, Cheng Jian wrote:
> select_idle_cpu() scans the LLC domain for idle CPUs, which is always
> expensive, so commit
> 1ad3aaf3fcd2 ("sched/core: Implement new approach to scale select_idle_cpu()")
> introduced a way to limit how many CPUs we scan.
>
> But this also leads to the following issue:
>
> Our threads are all bound to the first few CPUs of the LLC domain, and
> right now all of them run on the last CPU of that range. nr is always
> smaller than the weight of the LLC span, and every CPU visited consumes
> one of those nr attempts whether or not the task is allowed to run
> there, so for_each_cpu_wrap() can exhaust nr on disallowed CPUs without
> ever finding a CPU our threads can run on, and the threads stay on the
> last CPU all the time.
>
> Fixes: 1ad3aaf3fcd2 ("sched/core: Implement new approach to scale select_idle_cpu()")
> Signed-off-by: Cheng Jian <[email protected]>
> ---
> kernel/sched/fair.c | 7 ++++---
> 1 file changed, 4 insertions(+), 3 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 08a233e97a01..16a29b570803 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -5834,6 +5834,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
> s64 delta;
> int this = smp_processor_id();
> int cpu, nr = INT_MAX, si_cpu = -1;
> + struct cpumask cpus;
NAK, you must not put a cpumask on stack.
On Thu, Dec 12, 2019 at 10:41:02PM +0800, Cheng Jian wrote:
> Fixes: 1ad3aaf3fcd2 ("sched/core: Implement new approach to scale select_idle_cpu()")
The 'funny' thing is that select_idle_core() actually does the right
thing.
Copying that should work:
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 08a233e97a01..416d574dcebf 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5828,6 +5837,7 @@ static inline int select_idle_smt(struct task_struct *p, int target)
*/
static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target)
{
+ struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
struct sched_domain *this_sd;
u64 avg_cost, avg_idle;
u64 time, cost;
@@ -5859,11 +5869,11 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
time = cpu_clock(this);
- for_each_cpu_wrap(cpu, sched_domain_span(sd), target) {
+ cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
+
+ for_each_cpu_wrap(cpu, cpus, target) {
if (!--nr)
return si_cpu;
- if (!cpumask_test_cpu(cpu, p->cpus_ptr))
- continue;
if (available_idle_cpu(cpu))
break;
if (si_cpu == -1 && sched_idle_cpu(cpu))
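
For context, select_idle_mask is an existing per-CPU scratch mask (the
same kind of thing as load_balance_mask), allocated once at boot when
CONFIG_CPUMASK_OFFSTACK is set, so the above reuses a pre-allocated
buffer instead of allocating or touching the stack in the wakeup path.
Roughly (paraphrasing the current tree, exact code may differ):

	/* kernel/sched/fair.c (approximate) */
	DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
	DEFINE_PER_CPU(cpumask_var_t, select_idle_mask);

	/* kernel/sched/core.c, sched_init() (approximate) */
	#ifdef CONFIG_CPUMASK_OFFSTACK
		for_each_possible_cpu(i) {
			per_cpu(load_balance_mask, i) = (cpumask_var_t)kzalloc_node(
				cpumask_size(), GFP_KERNEL, cpu_to_node(i));
			per_cpu(select_idle_mask, i) = (cpumask_var_t)kzalloc_node(
				cpumask_size(), GFP_KERNEL, cpu_to_node(i));
		}
	#endif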
On 12/12/2019 15:24, Peter Zijlstra wrote:
> On Thu, Dec 12, 2019 at 10:41:02PM +0800, Cheng Jian wrote:
>
>> Fixes: 1ad3aaf3fcd2 ("sched/core: Implement new approach to scale select_idle_cpu()")
>
> The 'funny' thing is that select_idle_core() actually does the right
> thing.
>
> Copying that should work:
>
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 08a233e97a01..416d574dcebf 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -5828,6 +5837,7 @@ static inline int select_idle_smt(struct task_struct *p, int target)
> */
> static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target)
> {
> + struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
> struct sched_domain *this_sd;
> u64 avg_cost, avg_idle;
> u64 time, cost;
> @@ -5859,11 +5869,11 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
>
> time = cpu_clock(this);
>
> - for_each_cpu_wrap(cpu, sched_domain_span(sd), target) {
> + cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
> +
> + for_each_cpu_wrap(cpu, cpus, target) {
> if (!--nr)
> return si_cpu;
> - if (!cpumask_test_cpu(cpu, p->cpus_ptr))
> - continue;
> if (available_idle_cpu(cpu))
> break;
> if (si_cpu == -1 && sched_idle_cpu(cpu))
>
That looks sane enough. I'd only argue the changelog should directly point
out that the issue is we consume some CPUs out of 'nr' that are not allowed
for the task and thus waste our attempts.
On 2019/12/12 23:04, Peter Zijlstra wrote:
> On Thu, Dec 12, 2019 at 10:41:02PM +0800, Cheng Jian wrote:
>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>> index 08a233e97a01..16a29b570803 100644
>> --- a/kernel/sched/fair.c
>> +++ b/kernel/sched/fair.c
>> @@ -5834,6 +5834,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
>> s64 delta;
>> int this = smp_processor_id();
>> int cpu, nr = INT_MAX, si_cpu = -1;
>> + struct cpumask cpus;
> NAK, you must not put a cpumask on stack.
>
> .
Hi, Peter

I saw select_idle_core() doing the same thing, and yesterday I was
wondering why the per-CPU variable was needed for this. Now I think I
understand: the cpumask may be too large, and putting it on the stack
may cause an overflow. Is that correct?

I'm sorry for making a mistake like this. I will fix it in v2.

Thank you very much.
-- Cheng Jian
On Fri, Dec 13, 2019 at 09:51:30AM +0800, chengjian (D) wrote:
> Hi, Peter
>
> I saw select_idle_core() doing the same thing, and yesterday I was
> wondering why the per-CPU variable was needed for this. Now I think I
> understand: the cpumask may be too large, and putting it on the stack
> may cause an overflow. Is that correct?
Yes, for instance when NR_CPUS=4096, struct cpumask ends up being 512
bytes, and that is _far_ too large for an on-stack variable; remember we
have relatively small, fixed-size stacks.
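
For reference, a rough sketch of the size arithmetic and of the usual
off-stack pattern (NR_CPUS=4096 is just the example value from above;
the wakeup path must not allocate, which is why a pre-allocated per-CPU
scratch mask is used there instead):

	/*
	 * sizeof(struct cpumask) == BITS_TO_LONGS(NR_CPUS) * sizeof(long)
	 * With NR_CPUS == 4096 on a 64-bit kernel: 64 * 8 == 512 bytes.
	 *
	 * In a context that is allowed to sleep, the off-stack pattern is:
	 */
	cpumask_var_t mask;

	if (!alloc_cpumask_var(&mask, GFP_KERNEL))	/* only really allocates with CONFIG_CPUMASK_OFFSTACK */
		return -ENOMEM;

	cpumask_and(mask, sched_domain_span(sd), p->cpus_ptr);
	/* ... use mask ... */
	free_cpumask_var(mask);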
On 2019/12/12 23:44, Valentin Schneider wrote:
> On 12/12/2019 15:24, Peter Zijlstra wrote:
>> On Thu, Dec 12, 2019 at 10:41:02PM +0800, Cheng Jian wrote:
>>
>>> Fixes: 1ad3aaf3fcd2 ("sched/core: Implement new approach to scale select_idle_cpu()")
>> The 'funny' thing is that select_idle_core() actually does the right
>> thing.
>>
>> Copying that should work:
>>
>>
>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>> index 08a233e97a01..416d574dcebf 100644
>> --- a/kernel/sched/fair.c
>> +++ b/kernel/sched/fair.c
>> @@ -5828,6 +5837,7 @@ static inline int select_idle_smt(struct task_struct *p, int target)
>> */
>> static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target)
>> {
>> + struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
>> struct sched_domain *this_sd;
>> u64 avg_cost, avg_idle;
>> u64 time, cost;
>> @@ -5859,11 +5869,11 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
>>
>> time = cpu_clock(this);
>>
>> - for_each_cpu_wrap(cpu, sched_domain_span(sd), target) {
>> + cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
>> +
>> + for_each_cpu_wrap(cpu, cpus, target) {
>> if (!--nr)
>> return si_cpu;
>> - if (!cpumask_test_cpu(cpu, p->cpus_ptr))
>> - continue;
>> if (available_idle_cpu(cpu))
>> break;
>> if (si_cpu == -1 && sched_idle_cpu(cpu))
>>
> That looks sane enough. I'd only argue the changelog should directly point
> out that the issue is we consume some CPUs out of 'nr' that are not allowed
> for the task and thus waste our attempts.
>
> .
Hi, Valentin

Yeah, your wording is clearer.
Thank you so much.
-- Cheng Jian
On 2019/12/12 23:24, Peter Zijlstra wrote:
> On Thu, Dec 12, 2019 at 10:41:02PM +0800, Cheng Jian wrote:
>
>> Fixes: 1ad3aaf3fcd2 ("sched/core: Implement new approach to scale select_idle_cpu()")
> The 'funny' thing is that select_idle_core() actually does the right
> thing.
>
> Copying that should work:
>
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 08a233e97a01..416d574dcebf 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -5828,6 +5837,7 @@ static inline int select_idle_smt(struct task_struct *p, int target)
> */
> static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target)
> {
> + struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
> struct sched_domain *this_sd;
> u64 avg_cost, avg_idle;
> u64 time, cost;
> @@ -5859,11 +5869,11 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
>
> time = cpu_clock(this);
>
> - for_each_cpu_wrap(cpu, sched_domain_span(sd), target) {
> + cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
> +
> + for_each_cpu_wrap(cpu, cpus, target) {
> if (!--nr)
> return si_cpu;
> - if (!cpumask_test_cpu(cpu, p->cpus_ptr))
> - continue;
> if (available_idle_cpu(cpu))
> break;
> if (si_cpu == -1 && sched_idle_cpu(cpu))
>
> .
In select_idle_smt():

/*
 * Scan the local SMT mask for idle CPUs.
 */
static int select_idle_smt(struct task_struct *p, int target)
{
	int cpu, si_cpu = -1;

	if (!static_branch_likely(&sched_smt_present))
		return -1;

	for_each_cpu(cpu, cpu_smt_mask(target)) {
		if (!cpumask_test_cpu(cpu, p->cpus_ptr))
			continue;
		if (available_idle_cpu(cpu))
			return cpu;
		if (si_cpu == -1 && sched_idle_cpu(cpu))
			si_cpu = cpu;
	}

	return si_cpu;
}

Why don't we do the same thing in this function? Although
cpu_smt_mask(target) usually contains only a few CPUs, it would still be
better to apply 'p->cpus_ptr' first.
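
Something like this, reusing the select_idle_mask scratch buffer from
the suggestion above (just a rough, untested sketch, not part of the
posted patch):

static int select_idle_smt(struct task_struct *p, int target)
{
	struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
	int cpu, si_cpu = -1;

	if (!static_branch_likely(&sched_smt_present))
		return -1;

	/* Apply the affinity mask once instead of testing every CPU. */
	cpumask_and(cpus, cpu_smt_mask(target), p->cpus_ptr);

	for_each_cpu(cpu, cpus) {
		if (available_idle_cpu(cpu))
			return cpu;
		if (si_cpu == -1 && sched_idle_cpu(cpu))
			si_cpu = cpu;
	}

	return si_cpu;
}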
On 13/12/2019 09:57, chengjian (D) wrote:
>
> In select_idle_smt():
>
> /*
>  * Scan the local SMT mask for idle CPUs.
>  */
> static int select_idle_smt(struct task_struct *p, int target)
> {
> 	int cpu, si_cpu = -1;
>
> 	if (!static_branch_likely(&sched_smt_present))
> 		return -1;
>
> 	for_each_cpu(cpu, cpu_smt_mask(target)) {
> 		if (!cpumask_test_cpu(cpu, p->cpus_ptr))
> 			continue;
> 		if (available_idle_cpu(cpu))
> 			return cpu;
> 		if (si_cpu == -1 && sched_idle_cpu(cpu))
> 			si_cpu = cpu;
> 	}
>
> 	return si_cpu;
> }
>
> Why don't we do the same thing in this function? Although
> cpu_smt_mask(target) usually contains only a few CPUs, it would still
> be better to apply 'p->cpus_ptr' first.
>
>
Like you said the gains here would probably be small - the highest SMT
count I'm aware of is SMT8 (POWER9). Still, if we end up with both
select_idle_core() and select_idle_cpu() using that pattern, it would make
sense IMO to align select_idle_smt() with those.
On Fri, Dec 13, 2019 at 11:28:00AM +0000, Valentin Schneider wrote:
> On 13/12/2019 09:57, chengjian (D) wrote:
> >
> > In select_idle_smt():
> >
> > /*
> >  * Scan the local SMT mask for idle CPUs.
> >  */
> > static int select_idle_smt(struct task_struct *p, int target)
> > {
> > 	int cpu, si_cpu = -1;
> >
> > 	if (!static_branch_likely(&sched_smt_present))
> > 		return -1;
> >
> > 	for_each_cpu(cpu, cpu_smt_mask(target)) {
> > 		if (!cpumask_test_cpu(cpu, p->cpus_ptr))
> > 			continue;
> > 		if (available_idle_cpu(cpu))
> > 			return cpu;
> > 		if (si_cpu == -1 && sched_idle_cpu(cpu))
> > 			si_cpu = cpu;
> > 	}
> >
> > 	return si_cpu;
> > }
> >
> > Why don't we do the same thing in this function? Although
> > cpu_smt_mask(target) usually contains only a few CPUs, it would still
> > be better to apply 'p->cpus_ptr' first.
> >
> >
>
> Like you said the gains here would probably be small - the highest SMT
> count I'm aware of is SMT8 (POWER9). Still, if we end up with both
> select_idle_core() and select_idle_cpu() using that pattern, it would make
> sense IMO to align select_idle_smt() with those.
The cpumask_and() operation added would also have cost. I really don't
see that paying off.
The other sites have the problem that we combine an iteration limit with
affinity constraints. This loop doesn't do that and therefore doesn't
suffer the problem.
On 13/12/2019 12:09, Peter Zijlstra wrote:
>> Like you said the gains here would probably be small - the highest SMT
>> count I'm aware of is SMT8 (POWER9). Still, if we end up with both
>> select_idle_core() and select_idle_cpu() using that pattern, it would make
>> sense IMO to align select_idle_smt() with those.
>
> The cpumask_and() operation added would also have cost. I really don't
> see that paying off.
>
> The other sites have the problem that we combine an iteration limit with
> affinity constraints. This loop doesn't do that and therefore doesn't
> suffer the problem.
>
select_idle_core() doesn't really have an iteration limit, right? That
being said, yeah, the cpumask_and() for e.g. SMT2 systems would be
mostly wasteful.