Received-SPF: pass (google.com: domain of linux-kernel-owner@vger.kernel.org designates 23.128.96.18 as permitted sender) client-ip=23.128.96.18;
MIME-Version: 1.0
References: <20201209062404.175565-1-aubrey.li@linux.intel.com>
 <20201209143510.GO3371@techsingularity.net> <3802e27a-56ed-9495-21b9-7c4277065155@linux.intel.com>
 <20201210113441.GS3371@techsingularity.net> <31308700-aa28-b1f7-398e-ee76772b6b87@linux.intel.com>
 <20201210125833.GT3371@techsingularity.net> <20201211174442.GU3040@hirez.programming.kicks-ass.net>
In-Reply-To: <20201211174442.GU3040@hirez.programming.kicks-ass.net>
From:   Vincent Guittot <vincent.guittot@linaro.org>
Date:   Mon, 14 Dec 2020 10:18:16 +0100
Message-ID: <CAKfTPtB1sRmZAf5MEZOquHg3z7TTriAbqpwVup5xwu42DN2yCA@mail.gmail.com>
Subject: Re: [RFC PATCH v7] sched/fair: select idle cpu from idle cpumask for
 task wakeup
To:     Peter Zijlstra <peterz@infradead.org>
Cc:     Mel Gorman <mgorman@techsingularity.net>,
        "Li, Aubrey" <aubrey.li@linux.intel.com>,
        Ingo Molnar <mingo@redhat.com>,
        Juri Lelli <juri.lelli@redhat.com>,
        Valentin Schneider <valentin.schneider@arm.com>,
        Qais Yousef <qais.yousef@arm.com>,
        Dietmar Eggemann <dietmar.eggemann@arm.com>,
        Steven Rostedt <rostedt@goodmis.org>,
        Ben Segall <bsegall@google.com>,
        Tim Chen <tim.c.chen@linux.intel.com>,
        linux-kernel <linux-kernel@vger.kernel.org>,
        Mel Gorman <mgorman@suse.de>, Jiang Biao <benbjiang@gmail.com>
Content-Type: text/plain; charset="UTF-8"
Content-Transfer-Encoding: quoted-printable
Precedence: bulk

On Fri, 11 Dec 2020 at 18:45, Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Thu, Dec 10, 2020 at 12:58:33PM +0000, Mel Gorman wrote:
> > The prequisite patch to make that approach work was rejected though
> > as on its own, it's not very helpful and Vincent didn't like that the
> > load_balance_mask was abused to make it effective.
>
> So last time I poked at all this, I found that using more masks was a
> performance issue as well due to added cache misses.
>
> Anyway, I finally found some time to look at all this, and while reading
> over the whole SiS crud to refresh the brain came up with the below...
>
> It's still naf, but at least it gets rid of a bunch of duplicate
> scanning and LoC decreases, so it should be awesome. Ofcourse, as
> always, benchmarks will totally ruin everything, we'll see, I started
> some.
>
> It goes on top of Mel's first two patches (which is about as far as I
> got)...

We have several different things that Aubrey and Mel patches are
trying to achieve:

Aubrey wants to avoid scanning busy cpus
- patch works well on busy system: I see a significant benefit with
hackbench on a lot of group on my 2 nodes * 28 cores * 4 smt
    hackbench -l 2000 -g 128
tip 3.334
w/ patch 2.978 (+12%)

- Aubey also tried to not scan the cpus which are idle for a short
duration (when a tick not stopped) but there are problems because not
stopping a tick doesn't mean a short idle. In fact , something similar
to find_idlest_group_cpu() should be done in this case but then it's
no more a fast path search

Mel wants to minimize looping over the cpus
-patch 4 is an obvious win on light loaded system because it avoids
looping twice the cpus mask when some cpus are idle but no core
-But patch 3 generates perf r=C3=A9gression
    hackbench -l 2000 -g 1
tip 12.293
/w all Mel's patches 15.163 -14%
/w Aubrey + Mel patches minus patch 3 : 12.788 +3.8% But I think that
Aubreay's patch doesn't help here. Test without aubrey's patch are
running

-he also tried to used load_balance_mask to do something similar to the bel=
ow


>
> ---
>  fair.c |  124 ++++++++++++++++++++++++----------------------------------=
-------
>  1 file changed, 47 insertions(+), 77 deletions(-)
>
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -6021,11 +6021,13 @@ static inline void set_idle_cores(int cp
>
>  static inline bool test_idle_cores(int cpu, bool def)
>  {
> -       struct sched_domain_shared *sds;
> +       if (static_branch_likely(&sched_smt_present)) {
> +               struct sched_domain_shared *sds;
>
> -       sds =3D rcu_dereference(per_cpu(sd_llc_shared, cpu));
> -       if (sds)
> -               return READ_ONCE(sds->has_idle_cores);
> +               sds =3D rcu_dereference(per_cpu(sd_llc_shared, cpu));
> +               if (sds)
> +                       return READ_ONCE(sds->has_idle_cores);
> +       }
>
>         return def;
>  }
> @@ -6059,77 +6061,39 @@ void __update_idle_core(struct rq *rq)
>         rcu_read_unlock();
>  }
>
> -/*
> - * Scan the entire LLC domain for idle cores; this dynamically switches =
off if
> - * there are no idle cores left in the system; tracked through
> - * sd_llc->shared->has_idle_cores and enabled through update_idle_core()=
 above.
> - */
> -static int select_idle_core(struct task_struct *p, struct sched_domain *=
sd, int target)
> +static int __select_idle_core(int core, struct cpumask *cpus, int *idle_=
cpu)
>  {
> -       struct cpumask *cpus =3D this_cpu_cpumask_var_ptr(select_idle_mas=
k);
> -       int core, cpu;
> -
> -       if (!static_branch_likely(&sched_smt_present))
> -               return -1;
> -
> -       if (!test_idle_cores(target, false))
> -               return -1;
> -
> -       cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
> -
> -       for_each_cpu_wrap(core, cpus, target) {
> -               bool idle =3D true;
> +       bool idle =3D true;
> +       int cpu;
>
> -               for_each_cpu(cpu, cpu_smt_mask(core)) {
> -                       if (!available_idle_cpu(cpu)) {
> -                               idle =3D false;
> -                               break;
> -                       }
> +       for_each_cpu(cpu, cpu_smt_mask(core)) {
> +               if (!available_idle_cpu(cpu)) {
> +                       idle =3D false;
> +                       continue;

By not breaking on the first not idle cpu of the core, you will
largely increase the number of loops. On my system, it increases 4
times from 28 up tu 112

>                 }
> -
> -               if (idle)
> -                       return core;
> -
> -               cpumask_andnot(cpus, cpus, cpu_smt_mask(core));
> +               if (idle_cpu)
> +                       *idle_cpu =3D cpu;
>         }
>
> -       /*
> -        * Failed to find an idle core; stop looking for one.
> -        */
> -       set_idle_cores(target, 0);
> +       if (idle)
> +               return core;
>
> +       cpumask_andnot(cpus, cpus, cpu_smt_mask(core));
>         return -1;
>  }
>
> -/*
> - * Scan the local SMT mask for idle CPUs.
> - */
> -static int select_idle_smt(struct task_struct *p, struct sched_domain *s=
d, int target)
> -{
> -       int cpu;
> -
> -       if (!static_branch_likely(&sched_smt_present))
> -               return -1;
> -
> -       for_each_cpu(cpu, cpu_smt_mask(target)) {
> -               if (!cpumask_test_cpu(cpu, p->cpus_ptr) ||
> -                   !cpumask_test_cpu(cpu, sched_domain_span(sd)))
> -                       continue;
> -               if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
> -                       return cpu;
> -       }
> +#else /* CONFIG_SCHED_SMT */
>
> -       return -1;
> +static inline void set_idle_cores(int cpu, int val)
> +{
>  }
>
> -#else /* CONFIG_SCHED_SMT */
> -
> -static inline int select_idle_core(struct task_struct *p, struct sched_d=
omain *sd, int target)
> +static inline bool test_idle_cores(int cpu, bool def)
>  {
> -       return -1;
> +       return def;
>  }
>
> -static inline int select_idle_smt(struct task_struct *p, struct sched_do=
main *sd, int target)
> +static inline int __select_idle_core(int core, struct cpumask *cpus, int=
 *idle_cpu)
>  {
>         return -1;
>  }
> @@ -6144,10 +6108,11 @@ static inline int select_idle_smt(struct
>  static int select_idle_cpu(struct task_struct *p, struct sched_domain *s=
d, int target)
>  {
>         struct cpumask *cpus =3D this_cpu_cpumask_var_ptr(select_idle_mas=
k);
> +       int this =3D smp_processor_id();
> +       bool smt =3D test_idle_cores(this, false);
> +       int i, cpu, idle_cpu =3D -1, nr =3D INT_MAX;
>         struct sched_domain *this_sd;
>         u64 time;
> -       int this =3D smp_processor_id();
> -       int cpu, nr =3D INT_MAX;
>
>         this_sd =3D rcu_dereference(*this_cpu_ptr(&sd_llc));
>         if (!this_sd)
> @@ -6155,7 +6120,7 @@ static int select_idle_cpu(struct task_s
>
>         cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
>
> -       if (sched_feat(SIS_PROP)) {
> +       if (sched_feat(SIS_PROP) && !smt) {
>                 u64 avg_cost, avg_idle, span_avg;
>
>                 /*
> @@ -6175,18 +6140,31 @@ static int select_idle_cpu(struct task_s
>         }
>
>         for_each_cpu_wrap(cpu, cpus, target) {
> -               if (!--nr)
> -                       return -1;
> -               if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
> -                       break;
> +               if (smt) {

sds->has_idle_core being set doesn't means that there is one idle core
which means for the same real HW state (i.e. no idle cores)
if sds->has_idle_core is set, we will loop all cpus here and never get
a chance to return a sched_idle_cpu()
but if sds->has_idle_core is clear, we will loop on a limited number
of cpus and test sched_idle_cpu()

> +                       i =3D __select_idle_core(cpu, cpus, &idle_cpu);
> +                       if ((unsigned)i < nr_cpumask_bits)
> +                               return i;
> +
> +               } else {
> +                       if (!--nr)
> +                               return -1;
> +
> +                       if (available_idle_cpu(cpu) || sched_idle_cpu(cpu=
)) {
> +                               idle_cpu =3D cpu;
> +                               break;
> +                       }
> +               }
>         }
>
> -       if (sched_feat(SIS_PROP)) {
> +       if (smt)
> +               set_idle_cores(this, false);
> +
> +       if (sched_feat(SIS_PROP) && !smt) {
>                 time =3D cpu_clock(this) - time;
>                 update_avg(&this_sd->avg_scan_cost, time);
>         }
>
> -       return cpu;
> +       return idle_cpu;
>  }
>
>  /*
> @@ -6315,18 +6293,10 @@ static int select_idle_sibling(struct ta
>         if (!sd)
>                 return target;
>
> -       i =3D select_idle_core(p, sd, target);
> -       if ((unsigned)i < nr_cpumask_bits)
> -               return i;
> -
>         i =3D select_idle_cpu(p, sd, target);
>         if ((unsigned)i < nr_cpumask_bits)
>                 return i;
>
> -       i =3D select_idle_smt(p, sd, target);
> -       if ((unsigned)i < nr_cpumask_bits)
> -               return i;
> -
>         return target;
>  }
>