2020-10-20 17:11:21

by Joel Fernandes

[permalink] [raw]
Subject: [PATCH v8 -tip 08/26] sched/fair: Snapshot the min_vruntime of CPUs on force idle

During force-idle, we end up doing cross-cpu comparison of vruntimes
during pick_next_task. If we simply compare (vruntime-min_vruntime)
across CPUs, and if the CPUs only have 1 task each, we will always
end up comparing 0 with 0 and pick just one of the tasks all the time.
This starves the task that was not picked. To fix this, take a snapshot
of the min_vruntime when entering force idle and use it for comparison.
This min_vruntime snapshot will only be used for cross-CPU vruntime
comparison, and nothing else.

This resolves several performance issues that were seen in ChromeOS
audio usecase.

Tested-by: Julien Desfossez <[email protected]>
Signed-off-by: Joel Fernandes (Google) <[email protected]>
---
kernel/sched/core.c | 33 ++++++++++++++++++++-------------
kernel/sched/fair.c | 40 ++++++++++++++++++++++++++++++++++++++++
kernel/sched/sched.h | 5 +++++
3 files changed, 65 insertions(+), 13 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 469428979182..a5404ec9e89a 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -115,19 +115,8 @@ static inline bool prio_less(struct task_struct *a, struct task_struct *b)
if (pa == -1) /* dl_prio() doesn't work because of stop_class above */
return !dl_time_before(a->dl.deadline, b->dl.deadline);

- if (pa == MAX_RT_PRIO + MAX_NICE) { /* fair */
- u64 vruntime = b->se.vruntime;
-
- /*
- * Normalize the vruntime if tasks are in different cpus.
- */
- if (task_cpu(a) != task_cpu(b)) {
- vruntime -= task_cfs_rq(b)->min_vruntime;
- vruntime += task_cfs_rq(a)->min_vruntime;
- }
-
- return !((s64)(a->se.vruntime - vruntime) <= 0);
- }
+ if (pa == MAX_RT_PRIO + MAX_NICE) /* fair */
+ return cfs_prio_less(a, b);

return false;
}
@@ -4648,6 +4637,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
struct task_struct *next, *max = NULL;
const struct sched_class *class;
const struct cpumask *smt_mask;
+ bool fi_before = false;
bool need_sync;
int i, j, cpu;

@@ -4712,6 +4702,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
rq->core->core_cookie = 0UL;
if (rq->core->core_forceidle) {
need_sync = true;
+ fi_before = true;
rq->core->core_forceidle = false;
}
for_each_cpu(i, smt_mask) {
@@ -4723,6 +4714,14 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
update_rq_clock(rq_i);
}

+ /* Reset the snapshot if core is no longer in force-idle. */
+ if (!fi_before) {
+ for_each_cpu(i, smt_mask) {
+ struct rq *rq_i = cpu_rq(i);
+ rq_i->cfs.min_vruntime_fi = rq_i->cfs.min_vruntime;
+ }
+ }
+
/*
* Try and select tasks for each sibling in decending sched_class
* order.
@@ -4859,6 +4858,14 @@ next_class:;
resched_curr(rq_i);
}

+ /* Snapshot if core is in force-idle. */
+ if (!fi_before && rq->core->core_forceidle) {
+ for_each_cpu(i, smt_mask) {
+ struct rq *rq_i = cpu_rq(i);
+ rq_i->cfs.min_vruntime_fi = rq_i->cfs.min_vruntime;
+ }
+ }
+
done:
set_next_task(rq, next);
return next;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 56bea0decda1..9cae08c3fca1 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -10686,6 +10686,46 @@ static inline void task_tick_core(struct rq *rq, struct task_struct *curr)
__entity_slice_used(&curr->se, MIN_NR_TASKS_DURING_FORCEIDLE))
resched_curr(rq);
}
+
+bool cfs_prio_less(struct task_struct *a, struct task_struct *b)
+{
+ bool samecpu = task_cpu(a) == task_cpu(b);
+ struct sched_entity *sea = &a->se;
+ struct sched_entity *seb = &b->se;
+ struct cfs_rq *cfs_rqa;
+ struct cfs_rq *cfs_rqb;
+ s64 delta;
+
+ if (samecpu) {
+ /* vruntime is per cfs_rq */
+ while (!is_same_group(sea, seb)) {
+ int sea_depth = sea->depth;
+ int seb_depth = seb->depth;
+ if (sea_depth >= seb_depth)
+ sea = parent_entity(sea);
+ if (sea_depth <= seb_depth)
+ seb = parent_entity(seb);
+ }
+
+ delta = (s64)(sea->vruntime - seb->vruntime);
+ goto out;
+ }
+
+ /* crosscpu: compare root level se's vruntime to decide priority */
+ while (sea->parent)
+ sea = sea->parent;
+ while (seb->parent)
+ seb = seb->parent;
+
+ cfs_rqa = sea->cfs_rq;
+ cfs_rqb = seb->cfs_rq;
+
+ /* normalize vruntime WRT their rq's base */
+ delta = (s64)(sea->vruntime - seb->vruntime) +
+ (s64)(cfs_rqb->min_vruntime_fi - cfs_rqa->min_vruntime_fi);
+out:
+ return delta > 0;
+}
#else
static inline void task_tick_core(struct rq *rq, struct task_struct *curr) {}
#endif
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 884d23d5e55d..dfdb0ebb07a8 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -524,6 +524,9 @@ struct cfs_rq {

u64 exec_clock;
u64 min_vruntime;
+#ifdef CONFIG_SCHED_CORE
+ u64 min_vruntime_fi;
+#endif
#ifndef CONFIG_64BIT
u64 min_vruntime_copy;
#endif
@@ -1106,6 +1109,8 @@ static inline raw_spinlock_t *rq_lockp(struct rq *rq)
return &rq->__lock;
}

+bool cfs_prio_less(struct task_struct *a, struct task_struct *b);
+
#else /* !CONFIG_SCHED_CORE */

static inline bool sched_core_enabled(struct rq *rq)
--
2.29.0.rc1.297.gfa9743e501-goog


2020-10-26 13:34:48

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH v8 -tip 08/26] sched/fair: Snapshot the min_vruntime of CPUs on force idle

On Mon, Oct 19, 2020 at 09:43:18PM -0400, Joel Fernandes (Google) wrote:

> @@ -4723,6 +4714,14 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
> update_rq_clock(rq_i);
> }
>
> + /* Reset the snapshot if core is no longer in force-idle. */
> + if (!fi_before) {
> + for_each_cpu(i, smt_mask) {
> + struct rq *rq_i = cpu_rq(i);
> + rq_i->cfs.min_vruntime_fi = rq_i->cfs.min_vruntime;
> + }
> + }

So this is the thing that drags vruntime_fi along when (both?) siblings
are active, right? But should we not do that after pick? Consider 2
tasks a weight 1 and a weight 10 task, one for each sibling. By syncing
the vruntime before picking, the cfs_prio_less() loop will not be able
to distinguish between these two, since they'll both have effectively
the same lag.

If however, you syn after pick, then the weight 1 task will have accreud
far more runtime than the weight 10 task, and consequently the weight 10
task will have preference when a decision will have to be made.

(also, if this were the right place, the whole thing should've been part
of the for_each_cpu() loop right before this)

> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 56bea0decda1..9cae08c3fca1 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -10686,6 +10686,46 @@ static inline void task_tick_core(struct rq *rq, struct task_struct *curr)
> __entity_slice_used(&curr->se, MIN_NR_TASKS_DURING_FORCEIDLE))
> resched_curr(rq);
> }
> +
> +bool cfs_prio_less(struct task_struct *a, struct task_struct *b)
> +{
> + bool samecpu = task_cpu(a) == task_cpu(b);
> + struct sched_entity *sea = &a->se;
> + struct sched_entity *seb = &b->se;
> + struct cfs_rq *cfs_rqa;
> + struct cfs_rq *cfs_rqb;
> + s64 delta;
> +
> + if (samecpu) {
> + /* vruntime is per cfs_rq */
> + while (!is_same_group(sea, seb)) {
> + int sea_depth = sea->depth;
> + int seb_depth = seb->depth;
> + if (sea_depth >= seb_depth)
> + sea = parent_entity(sea);
> + if (sea_depth <= seb_depth)
> + seb = parent_entity(seb);
> + }
> +
> + delta = (s64)(sea->vruntime - seb->vruntime);
> + goto out;
> + }
> +
> + /* crosscpu: compare root level se's vruntime to decide priority */
> + while (sea->parent)
> + sea = sea->parent;
> + while (seb->parent)
> + seb = seb->parent;

This seems unfortunate, I think we can do better.

> +
> + cfs_rqa = sea->cfs_rq;
> + cfs_rqb = seb->cfs_rq;
> +
> + /* normalize vruntime WRT their rq's base */
> + delta = (s64)(sea->vruntime - seb->vruntime) +
> + (s64)(cfs_rqb->min_vruntime_fi - cfs_rqa->min_vruntime_fi);
> +out:
> + return delta > 0;
> +}


How's something like this?

- after each pick, such that the pick itself sees the divergence (see
above); either:

- pull the vruntime_fi forward, when !fi
- freeze the vruntime_fi, when newly fi (A)

- either way, update vruntime_fi for each cfs_rq in the active
hierachy.

- when comparing, and fi, update the vruntime_fi hierachy until we
encounter a mark from (A), per doing it during the pick, but before
runtime, this guaranteees it hasn't moved since (A).

XXX, still buggered on SMT>2, imagine having {ta, tb, fi, i} on an SMT4,
then when comparing any two tasks that do not involve the fi, we should
(probably) have pulled them fwd -- but we can't actually pull them,
because then the fi thing would break, mooo.


--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -115,19 +115,8 @@ static inline bool prio_less(struct task
if (pa == -1) /* dl_prio() doesn't work because of stop_class above */
return !dl_time_before(a->dl.deadline, b->dl.deadline);

- if (pa == MAX_RT_PRIO + MAX_NICE) { /* fair */
- u64 vruntime = b->se.vruntime;
-
- /*
- * Normalize the vruntime if tasks are in different cpus.
- */
- if (task_cpu(a) != task_cpu(b)) {
- vruntime -= task_cfs_rq(b)->min_vruntime;
- vruntime += task_cfs_rq(a)->min_vruntime;
- }
-
- return !((s64)(a->se.vruntime - vruntime) <= 0);
- }
+ if (pa == MAX_RT_PRIO + MAX_NICE) /* fair */
+ return cfs_prio_less(a, b);

return false;
}
@@ -4642,12 +4631,15 @@ pick_task(struct rq *rq, const struct sc
return cookie_pick;
}

+extern void task_vruntime_update(struct rq *rq, struct task_struct *p);
+
static struct task_struct *
pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
struct task_struct *next, *max = NULL;
const struct sched_class *class;
const struct cpumask *smt_mask;
+ bool fi_before = false;
bool need_sync;
int i, j, cpu;

@@ -4707,6 +4699,7 @@ pick_next_task(struct rq *rq, struct tas
need_sync = !!rq->core->core_cookie;
if (rq->core->core_forceidle) {
need_sync = true;
+ fi_before = true;
rq->core->core_forceidle = false;
}

@@ -4757,6 +4750,11 @@ pick_next_task(struct rq *rq, struct tas
continue;

rq_i->core_pick = p;
+ if (rq_i->idle == p && rq_i->nr_running) {
+ rq->core->core_forceidle = true;
+ if (!fi_before)
+ rq->core->core_forceidle_seq++;
+ }

/*
* If this new candidate is of higher priority than the
@@ -4775,6 +4773,7 @@ pick_next_task(struct rq *rq, struct tas
max = p;

if (old_max) {
+ rq->core->core_forceidle = false;
for_each_cpu(j, smt_mask) {
if (j == i)
continue;
@@ -4823,10 +4822,8 @@ pick_next_task(struct rq *rq, struct tas
if (!rq_i->core_pick)
continue;

- if (is_task_rq_idle(rq_i->core_pick) && rq_i->nr_running &&
- !rq_i->core->core_forceidle) {
- rq_i->core->core_forceidle = true;
- }
+ if (!(fi_before && rq->core->core_forceidle))
+ task_vruntime_update(rq_i, rq_i->core_pick);

if (i == cpu) {
rq_i->core_pick = NULL;
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -10686,6 +10686,67 @@ static inline void task_tick_core(struct
__entity_slice_used(&curr->se, MIN_NR_TASKS_DURING_FORCEIDLE))
resched_curr(rq);
}
+
+static void se_fi_update(struct sched_entity *se, unsigned int fi_seq, bool forceidle)
+{
+ for_each_sched_entity(se) {
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+ if (forceidle) {
+ if (cfs_rq->forceidle_seq == fi_seq)
+ break;
+ cfs_rq->forceidle_seq = fi_seq;
+ }
+
+ cfs_rq->min_vruntime_fi = cfs_rq->min_vruntime;
+ }
+}
+
+void task_vruntime_update(struct rq *rq, struct task_struct *p)
+{
+ struct sched_entity *se = &p->se;
+
+ if (p->sched_class != &fair_sched_class)
+ return;
+
+ se_fi_update(se, rq->core->core_forceidle_seq, rq->core->core_forceidle);
+}
+
+bool cfs_prio_less(struct task_struct *a, struct task_struct *b)
+{
+ struct rq *rq = task_rq(a);
+ struct sched_entity *sea = &a->se;
+ struct sched_entity *seb = &b->se;
+ struct cfs_rq *cfs_rqa;
+ struct cfs_rq *cfs_rqb;
+ s64 delta;
+
+ SCHED_WARN_ON(task_rq(b)->core != rq->core);
+
+ while (sea->cfs_rq->tg != seb->cfs_rq->tg) {
+ int sea_depth = sea->depth;
+ int seb_depth = seb->depth;
+
+ if (sea_depth >= seb_depth)
+ sea = parent_entity(sea);
+ if (sea_depth <= seb_depth)
+ seb = parent_entity(seb);
+ }
+
+ if (rq->core->core_forceidle) {
+ se_fi_update(sea, rq->core->core_forceidle_seq, true);
+ se_fi_update(seb, rq->core->core_forceidle_seq, true);
+ }
+
+ cfs_rqa = sea->cfs_rq;
+ cfs_rqb = seb->cfs_rq;
+
+ /* normalize vruntime WRT their rq's base */
+ delta = (s64)(sea->vruntime - seb->vruntime) +
+ (s64)(cfs_rqb->min_vruntime_fi - cfs_rqa->min_vruntime_fi);
+
+ return delta > 0;
+}
#else
static inline void task_tick_core(struct rq *rq, struct task_struct *curr) {}
#endif
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -522,6 +522,11 @@ struct cfs_rq {
unsigned int h_nr_running; /* SCHED_{NORMAL,BATCH,IDLE} */
unsigned int idle_h_nr_running; /* SCHED_IDLE */

+#ifdef CONFIG_SCHED_CORE
+ unsigned int forceidle_seq;
+ u64 min_vruntime_fi;
+#endif
+
u64 exec_clock;
u64 min_vruntime;
#ifndef CONFIG_64BIT
@@ -1061,7 +1066,8 @@ struct rq {
unsigned int core_task_seq;
unsigned int core_pick_seq;
unsigned long core_cookie;
- unsigned char core_forceidle;
+ unsigned int core_forceidle;
+ unsigned int core_forceidle_seq;
#endif
};

@@ -1106,6 +1112,8 @@ static inline raw_spinlock_t *rq_lockp(s
return &rq->__lock;
}

+bool cfs_prio_less(struct task_struct *a, struct task_struct *b);
+
#else /* !CONFIG_SCHED_CORE */

static inline bool sched_core_enabled(struct rq *rq)

2020-10-28 21:43:14

by Joel Fernandes

[permalink] [raw]
Subject: Re: [PATCH v8 -tip 08/26] sched/fair: Snapshot the min_vruntime of CPUs on force idle

On Mon, Oct 26, 2020 at 01:47:24PM +0100, Peter Zijlstra wrote:
> On Mon, Oct 19, 2020 at 09:43:18PM -0400, Joel Fernandes (Google) wrote:
>
> > @@ -4723,6 +4714,14 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
> > update_rq_clock(rq_i);
> > }
> >
> > + /* Reset the snapshot if core is no longer in force-idle. */
> > + if (!fi_before) {
> > + for_each_cpu(i, smt_mask) {
> > + struct rq *rq_i = cpu_rq(i);
> > + rq_i->cfs.min_vruntime_fi = rq_i->cfs.min_vruntime;
> > + }
> > + }
>
> So this is the thing that drags vruntime_fi along when (both?) siblings
> are active, right? But should we not do that after pick? Consider 2
> tasks a weight 1 and a weight 10 task, one for each sibling. By syncing
> the vruntime before picking, the cfs_prio_less() loop will not be able
> to distinguish between these two, since they'll both have effectively
> the same lag.

Just to set some terms, by lag you mean "the delta between task->vruntime and
cfs_rq->min_vruntime_fi" right ?

Assuming this is what you mean,

Say there's 2 task T1 and T2 on 2 different CPUs.
T1 has weight=10 vruntime=200
T2 has weight=1 vruntime=1000

Say T1's cfs_rq R1 has min_vruntime of 100
T2's cfs_rq R2 has min_vruntime of 200

First time we force idle, we do the "sync" in my patch (which is what I think
you can call snapshotting of min_vruntime). Assuming we do the sync _before_
picking:
This causes R1's ->min_vruntime_fi to be 100.
R2's ->min_vruntime_fi to be 200.

So during picking, cfs_prio_less() will see R1's "lag" as (200-100) = 100.
And R2's "lag" as (1000-200) = 800.

So the lags are different and I didn't get what you mean by "have effectively
the same lag".

Could you let me know what I am missing?

Also your patch is great and I feel it does the same thing as my patch except
for doing the min_vruntime snapshot up the hierarchy (which is great!). BTW,
do you really need force_idle_seq?

It seems to me, since you have fi_before, you can just set another variable
on the stack if the fi status changed during picking, and pass that along to
se_fi_update() or is there another case where the force_idle_seq is needed?

Thanks!

- Joel


> If however, you syn after pick, then the weight 1 task will have accreud
> far more runtime than the weight 10 task, and consequently the weight 10
> task will have preference when a decision will have to be made.
>
> (also, if this were the right place, the whole thing should've been part
> of the for_each_cpu() loop right before this)
>
> > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > index 56bea0decda1..9cae08c3fca1 100644
> > --- a/kernel/sched/fair.c
> > +++ b/kernel/sched/fair.c
> > @@ -10686,6 +10686,46 @@ static inline void task_tick_core(struct rq *rq, struct task_struct *curr)
> > __entity_slice_used(&curr->se, MIN_NR_TASKS_DURING_FORCEIDLE))
> > resched_curr(rq);
> > }
> > +
> > +bool cfs_prio_less(struct task_struct *a, struct task_struct *b)
> > +{
> > + bool samecpu = task_cpu(a) == task_cpu(b);
> > + struct sched_entity *sea = &a->se;
> > + struct sched_entity *seb = &b->se;
> > + struct cfs_rq *cfs_rqa;
> > + struct cfs_rq *cfs_rqb;
> > + s64 delta;
> > +
> > + if (samecpu) {
> > + /* vruntime is per cfs_rq */
> > + while (!is_same_group(sea, seb)) {
> > + int sea_depth = sea->depth;
> > + int seb_depth = seb->depth;
> > + if (sea_depth >= seb_depth)
> > + sea = parent_entity(sea);
> > + if (sea_depth <= seb_depth)
> > + seb = parent_entity(seb);
> > + }
> > +
> > + delta = (s64)(sea->vruntime - seb->vruntime);
> > + goto out;
> > + }
> > +
> > + /* crosscpu: compare root level se's vruntime to decide priority */
> > + while (sea->parent)
> > + sea = sea->parent;
> > + while (seb->parent)
> > + seb = seb->parent;
>
> This seems unfortunate, I think we can do better.
>
> > +
> > + cfs_rqa = sea->cfs_rq;
> > + cfs_rqb = seb->cfs_rq;
> > +
> > + /* normalize vruntime WRT their rq's base */
> > + delta = (s64)(sea->vruntime - seb->vruntime) +
> > + (s64)(cfs_rqb->min_vruntime_fi - cfs_rqa->min_vruntime_fi);
> > +out:
> > + return delta > 0;
> > +}
>
>
> How's something like this?
>
> - after each pick, such that the pick itself sees the divergence (see
> above); either:
>
> - pull the vruntime_fi forward, when !fi
> - freeze the vruntime_fi, when newly fi (A)
>
> - either way, update vruntime_fi for each cfs_rq in the active
> hierachy.
>
> - when comparing, and fi, update the vruntime_fi hierachy until we
> encounter a mark from (A), per doing it during the pick, but before
> runtime, this guaranteees it hasn't moved since (A).
>
> XXX, still buggered on SMT>2, imagine having {ta, tb, fi, i} on an SMT4,
> then when comparing any two tasks that do not involve the fi, we should
> (probably) have pulled them fwd -- but we can't actually pull them,
> because then the fi thing would break, mooo.
>
>
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -115,19 +115,8 @@ static inline bool prio_less(struct task
> if (pa == -1) /* dl_prio() doesn't work because of stop_class above */
> return !dl_time_before(a->dl.deadline, b->dl.deadline);
>
> - if (pa == MAX_RT_PRIO + MAX_NICE) { /* fair */
> - u64 vruntime = b->se.vruntime;
> -
> - /*
> - * Normalize the vruntime if tasks are in different cpus.
> - */
> - if (task_cpu(a) != task_cpu(b)) {
> - vruntime -= task_cfs_rq(b)->min_vruntime;
> - vruntime += task_cfs_rq(a)->min_vruntime;
> - }
> -
> - return !((s64)(a->se.vruntime - vruntime) <= 0);
> - }
> + if (pa == MAX_RT_PRIO + MAX_NICE) /* fair */
> + return cfs_prio_less(a, b);
>
> return false;
> }
> @@ -4642,12 +4631,15 @@ pick_task(struct rq *rq, const struct sc
> return cookie_pick;
> }
>
> +extern void task_vruntime_update(struct rq *rq, struct task_struct *p);
> +
> static struct task_struct *
> pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
> {
> struct task_struct *next, *max = NULL;
> const struct sched_class *class;
> const struct cpumask *smt_mask;
> + bool fi_before = false;
> bool need_sync;
> int i, j, cpu;
>
> @@ -4707,6 +4699,7 @@ pick_next_task(struct rq *rq, struct tas
> need_sync = !!rq->core->core_cookie;
> if (rq->core->core_forceidle) {
> need_sync = true;
> + fi_before = true;
> rq->core->core_forceidle = false;
> }
>
> @@ -4757,6 +4750,11 @@ pick_next_task(struct rq *rq, struct tas
> continue;
>
> rq_i->core_pick = p;
> + if (rq_i->idle == p && rq_i->nr_running) {
> + rq->core->core_forceidle = true;
> + if (!fi_before)
> + rq->core->core_forceidle_seq++;
> + }
>
> /*
> * If this new candidate is of higher priority than the
> @@ -4775,6 +4773,7 @@ pick_next_task(struct rq *rq, struct tas
> max = p;
>
> if (old_max) {
> + rq->core->core_forceidle = false;
> for_each_cpu(j, smt_mask) {
> if (j == i)
> continue;
> @@ -4823,10 +4822,8 @@ pick_next_task(struct rq *rq, struct tas
> if (!rq_i->core_pick)
> continue;
>
> - if (is_task_rq_idle(rq_i->core_pick) && rq_i->nr_running &&
> - !rq_i->core->core_forceidle) {
> - rq_i->core->core_forceidle = true;
> - }
> + if (!(fi_before && rq->core->core_forceidle))
> + task_vruntime_update(rq_i, rq_i->core_pick);
>
> if (i == cpu) {
> rq_i->core_pick = NULL;
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -10686,6 +10686,67 @@ static inline void task_tick_core(struct
> __entity_slice_used(&curr->se, MIN_NR_TASKS_DURING_FORCEIDLE))
> resched_curr(rq);
> }
> +
> +static void se_fi_update(struct sched_entity *se, unsigned int fi_seq, bool forceidle)
> +{
> + for_each_sched_entity(se) {
> + struct cfs_rq *cfs_rq = cfs_rq_of(se);
> +
> + if (forceidle) {
> + if (cfs_rq->forceidle_seq == fi_seq)
> + break;
> + cfs_rq->forceidle_seq = fi_seq;
> + }
> +
> + cfs_rq->min_vruntime_fi = cfs_rq->min_vruntime;
> + }
> +}
> +
> +void task_vruntime_update(struct rq *rq, struct task_struct *p)
> +{
> + struct sched_entity *se = &p->se;
> +
> + if (p->sched_class != &fair_sched_class)
> + return;
> +
> + se_fi_update(se, rq->core->core_forceidle_seq, rq->core->core_forceidle);
> +}
> +
> +bool cfs_prio_less(struct task_struct *a, struct task_struct *b)
> +{
> + struct rq *rq = task_rq(a);
> + struct sched_entity *sea = &a->se;
> + struct sched_entity *seb = &b->se;
> + struct cfs_rq *cfs_rqa;
> + struct cfs_rq *cfs_rqb;
> + s64 delta;
> +
> + SCHED_WARN_ON(task_rq(b)->core != rq->core);
> +
> + while (sea->cfs_rq->tg != seb->cfs_rq->tg) {
> + int sea_depth = sea->depth;
> + int seb_depth = seb->depth;
> +
> + if (sea_depth >= seb_depth)
> + sea = parent_entity(sea);
> + if (sea_depth <= seb_depth)
> + seb = parent_entity(seb);
> + }
> +
> + if (rq->core->core_forceidle) {
> + se_fi_update(sea, rq->core->core_forceidle_seq, true);
> + se_fi_update(seb, rq->core->core_forceidle_seq, true);
> + }
> +
> + cfs_rqa = sea->cfs_rq;
> + cfs_rqb = seb->cfs_rq;
> +
> + /* normalize vruntime WRT their rq's base */
> + delta = (s64)(sea->vruntime - seb->vruntime) +
> + (s64)(cfs_rqb->min_vruntime_fi - cfs_rqa->min_vruntime_fi);
> +
> + return delta > 0;
> +}
> #else
> static inline void task_tick_core(struct rq *rq, struct task_struct *curr) {}
> #endif
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -522,6 +522,11 @@ struct cfs_rq {
> unsigned int h_nr_running; /* SCHED_{NORMAL,BATCH,IDLE} */
> unsigned int idle_h_nr_running; /* SCHED_IDLE */
>
> +#ifdef CONFIG_SCHED_CORE
> + unsigned int forceidle_seq;
> + u64 min_vruntime_fi;
> +#endif
> +
> u64 exec_clock;
> u64 min_vruntime;
> #ifndef CONFIG_64BIT
> @@ -1061,7 +1066,8 @@ struct rq {
> unsigned int core_task_seq;
> unsigned int core_pick_seq;
> unsigned long core_cookie;
> - unsigned char core_forceidle;
> + unsigned int core_forceidle;
> + unsigned int core_forceidle_seq;
> #endif
> };
>
> @@ -1106,6 +1112,8 @@ static inline raw_spinlock_t *rq_lockp(s
> return &rq->__lock;
> }
>
> +bool cfs_prio_less(struct task_struct *a, struct task_struct *b);
> +
> #else /* !CONFIG_SCHED_CORE */
>
> static inline bool sched_core_enabled(struct rq *rq)

2020-10-29 10:03:21

by Joel Fernandes

[permalink] [raw]
Subject: Re: [PATCH v8 -tip 08/26] sched/fair: Snapshot the min_vruntime of CPUs on force idle

Hi Peter,

I am still working on understanding your approach and will reply soon, but I
just wanted to clarify the question on my approach:

On Mon, Oct 26, 2020 at 01:47:24PM +0100, Peter Zijlstra wrote:
> On Mon, Oct 19, 2020 at 09:43:18PM -0400, Joel Fernandes (Google) wrote:
>
> > @@ -4723,6 +4714,14 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
> > update_rq_clock(rq_i);
> > }
> >
> > + /* Reset the snapshot if core is no longer in force-idle. */
> > + if (!fi_before) {
> > + for_each_cpu(i, smt_mask) {
> > + struct rq *rq_i = cpu_rq(i);
> > + rq_i->cfs.min_vruntime_fi = rq_i->cfs.min_vruntime;
> > + }
> > + }
>
> So this is the thing that drags vruntime_fi along when (both?) siblings
> are active, right? But should we not do that after pick? Consider 2
> tasks a weight 1 and a weight 10 task, one for each sibling. By syncing
> the vruntime before picking, the cfs_prio_less() loop will not be able
> to distinguish between these two, since they'll both have effectively
> the same lag.

Actually the snapshot I take is local to the rq's business, it is not
core-wide. So there's no core-wide sync in the patch.

So for each rq, I'm assigning the rq's ->min_vruntime_fi to its own
->min_vruntime.

And then later in cfs_prio_less(), I'm normalizing task's ->vruntime with
respect to the ->min_vruntime_fi and using that delta for comparison. So each
->rq is really dealing with its own lag, its not sync'ed.

About why it is to be done before, if we are not in force-idle any more, then
every time we pick, then rq's min_vruntime_fi will be assigned its
->min_vruntime which will be used later, as baseline for that particular rq,
during the cfs_prio_less(). So it has to be done before at least in my
approach. This is no different from not having this patch.

However, if we are in force-idle, then the min_vruntime_fi stays put on all
siblings so that the delta continues to grow on all cpus *since* the force
idle started.

The reason I also do it after is, if the core just entered force idle but
wasn't previously in it, then we take a snapshot at that point and continue
to use that snapshot in future selections as long as the core is in force
idle.

This approach may have some drawbacks but it reduces the task latencies quite
a bit in my testing.

Thoughts?

thanks,

- Joel


> If however, you syn after pick, then the weight 1 task will have accreud
> far more runtime than the weight 10 task, and consequently the weight 10
> task will have preference when a decision will have to be made.
>
> (also, if this were the right place, the whole thing should've been part
> of the for_each_cpu() loop right before this)

> > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > index 56bea0decda1..9cae08c3fca1 100644
> > --- a/kernel/sched/fair.c
> > +++ b/kernel/sched/fair.c
> > @@ -10686,6 +10686,46 @@ static inline void task_tick_core(struct rq *rq, struct task_struct *curr)
> > __entity_slice_used(&curr->se, MIN_NR_TASKS_DURING_FORCEIDLE))
> > resched_curr(rq);
> > }
> > +
> > +bool cfs_prio_less(struct task_struct *a, struct task_struct *b)
> > +{
> > + bool samecpu = task_cpu(a) == task_cpu(b);
> > + struct sched_entity *sea = &a->se;
> > + struct sched_entity *seb = &b->se;
> > + struct cfs_rq *cfs_rqa;
> > + struct cfs_rq *cfs_rqb;
> > + s64 delta;
> > +
> > + if (samecpu) {
> > + /* vruntime is per cfs_rq */
> > + while (!is_same_group(sea, seb)) {
> > + int sea_depth = sea->depth;
> > + int seb_depth = seb->depth;
> > + if (sea_depth >= seb_depth)
> > + sea = parent_entity(sea);
> > + if (sea_depth <= seb_depth)
> > + seb = parent_entity(seb);
> > + }
> > +
> > + delta = (s64)(sea->vruntime - seb->vruntime);
> > + goto out;
> > + }
> > +
> > + /* crosscpu: compare root level se's vruntime to decide priority */
> > + while (sea->parent)
> > + sea = sea->parent;
> > + while (seb->parent)
> > + seb = seb->parent;
>
> This seems unfortunate, I think we can do better.
>
> > +
> > + cfs_rqa = sea->cfs_rq;
> > + cfs_rqb = seb->cfs_rq;
> > +
> > + /* normalize vruntime WRT their rq's base */
> > + delta = (s64)(sea->vruntime - seb->vruntime) +
> > + (s64)(cfs_rqb->min_vruntime_fi - cfs_rqa->min_vruntime_fi);
> > +out:
> > + return delta > 0;
> > +}
>
>
> How's something like this?
>
> - after each pick, such that the pick itself sees the divergence (see
> above); either:
>
> - pull the vruntime_fi forward, when !fi
> - freeze the vruntime_fi, when newly fi (A)
>
> - either way, update vruntime_fi for each cfs_rq in the active
> hierachy.
>
> - when comparing, and fi, update the vruntime_fi hierachy until we
> encounter a mark from (A), per doing it during the pick, but before
> runtime, this guaranteees it hasn't moved since (A).
>
> XXX, still buggered on SMT>2, imagine having {ta, tb, fi, i} on an SMT4,
> then when comparing any two tasks that do not involve the fi, we should
> (probably) have pulled them fwd -- but we can't actually pull them,
> because then the fi thing would break, mooo.
>
>
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -115,19 +115,8 @@ static inline bool prio_less(struct task
> if (pa == -1) /* dl_prio() doesn't work because of stop_class above */
> return !dl_time_before(a->dl.deadline, b->dl.deadline);
>
> - if (pa == MAX_RT_PRIO + MAX_NICE) { /* fair */
> - u64 vruntime = b->se.vruntime;
> -
> - /*
> - * Normalize the vruntime if tasks are in different cpus.
> - */
> - if (task_cpu(a) != task_cpu(b)) {
> - vruntime -= task_cfs_rq(b)->min_vruntime;
> - vruntime += task_cfs_rq(a)->min_vruntime;
> - }
> -
> - return !((s64)(a->se.vruntime - vruntime) <= 0);
> - }
> + if (pa == MAX_RT_PRIO + MAX_NICE) /* fair */
> + return cfs_prio_less(a, b);
>
> return false;
> }
> @@ -4642,12 +4631,15 @@ pick_task(struct rq *rq, const struct sc
> return cookie_pick;
> }
>
> +extern void task_vruntime_update(struct rq *rq, struct task_struct *p);
> +
> static struct task_struct *
> pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
> {
> struct task_struct *next, *max = NULL;
> const struct sched_class *class;
> const struct cpumask *smt_mask;
> + bool fi_before = false;
> bool need_sync;
> int i, j, cpu;
>
> @@ -4707,6 +4699,7 @@ pick_next_task(struct rq *rq, struct tas
> need_sync = !!rq->core->core_cookie;
> if (rq->core->core_forceidle) {
> need_sync = true;
> + fi_before = true;
> rq->core->core_forceidle = false;
> }
>
> @@ -4757,6 +4750,11 @@ pick_next_task(struct rq *rq, struct tas
> continue;
>
> rq_i->core_pick = p;
> + if (rq_i->idle == p && rq_i->nr_running) {
> + rq->core->core_forceidle = true;
> + if (!fi_before)
> + rq->core->core_forceidle_seq++;
> + }
>
> /*
> * If this new candidate is of higher priority than the
> @@ -4775,6 +4773,7 @@ pick_next_task(struct rq *rq, struct tas
> max = p;
>
> if (old_max) {
> + rq->core->core_forceidle = false;
> for_each_cpu(j, smt_mask) {
> if (j == i)
> continue;
> @@ -4823,10 +4822,8 @@ pick_next_task(struct rq *rq, struct tas
> if (!rq_i->core_pick)
> continue;
>
> - if (is_task_rq_idle(rq_i->core_pick) && rq_i->nr_running &&
> - !rq_i->core->core_forceidle) {
> - rq_i->core->core_forceidle = true;
> - }
> + if (!(fi_before && rq->core->core_forceidle))
> + task_vruntime_update(rq_i, rq_i->core_pick);
>
> if (i == cpu) {
> rq_i->core_pick = NULL;
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -10686,6 +10686,67 @@ static inline void task_tick_core(struct
> __entity_slice_used(&curr->se, MIN_NR_TASKS_DURING_FORCEIDLE))
> resched_curr(rq);
> }
> +
> +static void se_fi_update(struct sched_entity *se, unsigned int fi_seq, bool forceidle)
> +{
> + for_each_sched_entity(se) {
> + struct cfs_rq *cfs_rq = cfs_rq_of(se);
> +
> + if (forceidle) {
> + if (cfs_rq->forceidle_seq == fi_seq)
> + break;
> + cfs_rq->forceidle_seq = fi_seq;
> + }
> +
> + cfs_rq->min_vruntime_fi = cfs_rq->min_vruntime;
> + }
> +}
> +
> +void task_vruntime_update(struct rq *rq, struct task_struct *p)
> +{
> + struct sched_entity *se = &p->se;
> +
> + if (p->sched_class != &fair_sched_class)
> + return;
> +
> + se_fi_update(se, rq->core->core_forceidle_seq, rq->core->core_forceidle);
> +}
> +
> +bool cfs_prio_less(struct task_struct *a, struct task_struct *b)
> +{
> + struct rq *rq = task_rq(a);
> + struct sched_entity *sea = &a->se;
> + struct sched_entity *seb = &b->se;
> + struct cfs_rq *cfs_rqa;
> + struct cfs_rq *cfs_rqb;
> + s64 delta;
> +
> + SCHED_WARN_ON(task_rq(b)->core != rq->core);
> +
> + while (sea->cfs_rq->tg != seb->cfs_rq->tg) {
> + int sea_depth = sea->depth;
> + int seb_depth = seb->depth;
> +
> + if (sea_depth >= seb_depth)
> + sea = parent_entity(sea);
> + if (sea_depth <= seb_depth)
> + seb = parent_entity(seb);
> + }
> +
> + if (rq->core->core_forceidle) {
> + se_fi_update(sea, rq->core->core_forceidle_seq, true);
> + se_fi_update(seb, rq->core->core_forceidle_seq, true);
> + }
> +
> + cfs_rqa = sea->cfs_rq;
> + cfs_rqb = seb->cfs_rq;
> +
> + /* normalize vruntime WRT their rq's base */
> + delta = (s64)(sea->vruntime - seb->vruntime) +
> + (s64)(cfs_rqb->min_vruntime_fi - cfs_rqa->min_vruntime_fi);
> +
> + return delta > 0;
> +}
> #else
> static inline void task_tick_core(struct rq *rq, struct task_struct *curr) {}
> #endif
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -522,6 +522,11 @@ struct cfs_rq {
> unsigned int h_nr_running; /* SCHED_{NORMAL,BATCH,IDLE} */
> unsigned int idle_h_nr_running; /* SCHED_IDLE */
>
> +#ifdef CONFIG_SCHED_CORE
> + unsigned int forceidle_seq;
> + u64 min_vruntime_fi;
> +#endif
> +
> u64 exec_clock;
> u64 min_vruntime;
> #ifndef CONFIG_64BIT
> @@ -1061,7 +1066,8 @@ struct rq {
> unsigned int core_task_seq;
> unsigned int core_pick_seq;
> unsigned long core_cookie;
> - unsigned char core_forceidle;
> + unsigned int core_forceidle;
> + unsigned int core_forceidle_seq;
> #endif
> };
>
> @@ -1106,6 +1112,8 @@ static inline raw_spinlock_t *rq_lockp(s
> return &rq->__lock;
> }
>
> +bool cfs_prio_less(struct task_struct *a, struct task_struct *b);
> +
> #else /* !CONFIG_SCHED_CORE */
>
> static inline bool sched_core_enabled(struct rq *rq)

2020-10-29 17:01:43

by Joel Fernandes

[permalink] [raw]
Subject: Re: [PATCH v8 -tip 08/26] sched/fair: Snapshot the min_vruntime of CPUs on force idle

On Mon, Oct 26, 2020 at 01:47:24PM +0100, Peter Zijlstra wrote:
> On Mon, Oct 19, 2020 at 09:43:18PM -0400, Joel Fernandes (Google) wrote:
>
> > @@ -4723,6 +4714,14 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
> > update_rq_clock(rq_i);
> > }
> >
> > + /* Reset the snapshot if core is no longer in force-idle. */
> > + if (!fi_before) {
> > + for_each_cpu(i, smt_mask) {
> > + struct rq *rq_i = cpu_rq(i);
> > + rq_i->cfs.min_vruntime_fi = rq_i->cfs.min_vruntime;
> > + }
> > + }
>
> So this is the thing that drags vruntime_fi along when (both?) siblings
> are active, right? But should we not do that after pick? Consider 2
> tasks a weight 1 and a weight 10 task, one for each sibling. By syncing
> the vruntime before picking, the cfs_prio_less() loop will not be able
> to distinguish between these two, since they'll both have effectively
> the same lag.
>
> If however, you syn after pick, then the weight 1 task will have accreud
> far more runtime than the weight 10 task, and consequently the weight 10
> task will have preference when a decision will have to be made.
>
> (also, if this were the right place, the whole thing should've been part
> of the for_each_cpu() loop right before this)
>
> > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > index 56bea0decda1..9cae08c3fca1 100644
> > --- a/kernel/sched/fair.c
> > +++ b/kernel/sched/fair.c
> > @@ -10686,6 +10686,46 @@ static inline void task_tick_core(struct rq *rq, struct task_struct *curr)
> > __entity_slice_used(&curr->se, MIN_NR_TASKS_DURING_FORCEIDLE))
> > resched_curr(rq);
> > }
> > +
> > +bool cfs_prio_less(struct task_struct *a, struct task_struct *b)
> > +{
> > + bool samecpu = task_cpu(a) == task_cpu(b);
> > + struct sched_entity *sea = &a->se;
> > + struct sched_entity *seb = &b->se;
> > + struct cfs_rq *cfs_rqa;
> > + struct cfs_rq *cfs_rqb;
> > + s64 delta;
> > +
> > + if (samecpu) {
> > + /* vruntime is per cfs_rq */
> > + while (!is_same_group(sea, seb)) {
> > + int sea_depth = sea->depth;
> > + int seb_depth = seb->depth;
> > + if (sea_depth >= seb_depth)
> > + sea = parent_entity(sea);
> > + if (sea_depth <= seb_depth)
> > + seb = parent_entity(seb);
> > + }
> > +
> > + delta = (s64)(sea->vruntime - seb->vruntime);
> > + goto out;
> > + }
> > +
> > + /* crosscpu: compare root level se's vruntime to decide priority */
> > + while (sea->parent)
> > + sea = sea->parent;
> > + while (seb->parent)
> > + seb = seb->parent;
>
> This seems unfortunate, I think we can do better.
>
> > +
> > + cfs_rqa = sea->cfs_rq;
> > + cfs_rqb = seb->cfs_rq;
> > +
> > + /* normalize vruntime WRT their rq's base */
> > + delta = (s64)(sea->vruntime - seb->vruntime) +
> > + (s64)(cfs_rqb->min_vruntime_fi - cfs_rqa->min_vruntime_fi);
> > +out:
> > + return delta > 0;
> > +}
>
>
> How's something like this?
>
> - after each pick, such that the pick itself sees the divergence (see
> above); either:
>
> - pull the vruntime_fi forward, when !fi
> - freeze the vruntime_fi, when newly fi (A)
>
> - either way, update vruntime_fi for each cfs_rq in the active
> hierachy.
>
> - when comparing, and fi, update the vruntime_fi hierachy until we
> encounter a mark from (A), per doing it during the pick, but before
> runtime, this guaranteees it hasn't moved since (A).
>
> XXX, still buggered on SMT>2, imagine having {ta, tb, fi, i} on an SMT4,
> then when comparing any two tasks that do not involve the fi, we should
> (probably) have pulled them fwd -- but we can't actually pull them,
> because then the fi thing would break, mooo.

Hi Peter, Vineeth,

I tried Peter's diff (had to backport it to 4.19 as that's what our devices are
on). Let me know if I screwed the backport:
https://chromium.googlesource.com/chromiumos/third_party/kernel/+/6469fd5b6bcd0eb1e054307aa4b54bc9e937346d%5E%21/

With Peter's changes, I see really bad wakeup latencies when running a simple
synthetic test (1 task busy on a CPU with the sibling doing a 70% run-sleep
cycle with 1 second period. The 'perf sched latency' worst case is of the
order of 100s of ms, which I don't see with my patch.

I do feel Peter's patch is better though and I'll try to debug it.

thanks,

- Joel


>
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -115,19 +115,8 @@ static inline bool prio_less(struct task
> if (pa == -1) /* dl_prio() doesn't work because of stop_class above */
> return !dl_time_before(a->dl.deadline, b->dl.deadline);
>
> - if (pa == MAX_RT_PRIO + MAX_NICE) { /* fair */
> - u64 vruntime = b->se.vruntime;
> -
> - /*
> - * Normalize the vruntime if tasks are in different cpus.
> - */
> - if (task_cpu(a) != task_cpu(b)) {
> - vruntime -= task_cfs_rq(b)->min_vruntime;
> - vruntime += task_cfs_rq(a)->min_vruntime;
> - }
> -
> - return !((s64)(a->se.vruntime - vruntime) <= 0);
> - }
> + if (pa == MAX_RT_PRIO + MAX_NICE) /* fair */
> + return cfs_prio_less(a, b);
>
> return false;
> }
> @@ -4642,12 +4631,15 @@ pick_task(struct rq *rq, const struct sc
> return cookie_pick;
> }
>
> +extern void task_vruntime_update(struct rq *rq, struct task_struct *p);
> +
> static struct task_struct *
> pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
> {
> struct task_struct *next, *max = NULL;
> const struct sched_class *class;
> const struct cpumask *smt_mask;
> + bool fi_before = false;
> bool need_sync;
> int i, j, cpu;
>
> @@ -4707,6 +4699,7 @@ pick_next_task(struct rq *rq, struct tas
> need_sync = !!rq->core->core_cookie;
> if (rq->core->core_forceidle) {
> need_sync = true;
> + fi_before = true;
> rq->core->core_forceidle = false;
> }
>
> @@ -4757,6 +4750,11 @@ pick_next_task(struct rq *rq, struct tas
> continue;
>
> rq_i->core_pick = p;
> + if (rq_i->idle == p && rq_i->nr_running) {
> + rq->core->core_forceidle = true;
> + if (!fi_before)
> + rq->core->core_forceidle_seq++;
> + }
>
> /*
> * If this new candidate is of higher priority than the
> @@ -4775,6 +4773,7 @@ pick_next_task(struct rq *rq, struct tas
> max = p;
>
> if (old_max) {
> + rq->core->core_forceidle = false;
> for_each_cpu(j, smt_mask) {
> if (j == i)
> continue;
> @@ -4823,10 +4822,8 @@ pick_next_task(struct rq *rq, struct tas
> if (!rq_i->core_pick)
> continue;
>
> - if (is_task_rq_idle(rq_i->core_pick) && rq_i->nr_running &&
> - !rq_i->core->core_forceidle) {
> - rq_i->core->core_forceidle = true;
> - }
> + if (!(fi_before && rq->core->core_forceidle))
> + task_vruntime_update(rq_i, rq_i->core_pick);
>
> if (i == cpu) {
> rq_i->core_pick = NULL;
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -10686,6 +10686,67 @@ static inline void task_tick_core(struct
> __entity_slice_used(&curr->se, MIN_NR_TASKS_DURING_FORCEIDLE))
> resched_curr(rq);
> }
> +
> +static void se_fi_update(struct sched_entity *se, unsigned int fi_seq, bool forceidle)
> +{
> + for_each_sched_entity(se) {
> + struct cfs_rq *cfs_rq = cfs_rq_of(se);
> +
> + if (forceidle) {
> + if (cfs_rq->forceidle_seq == fi_seq)
> + break;
> + cfs_rq->forceidle_seq = fi_seq;
> + }
> +
> + cfs_rq->min_vruntime_fi = cfs_rq->min_vruntime;
> + }
> +}
> +
> +void task_vruntime_update(struct rq *rq, struct task_struct *p)
> +{
> + struct sched_entity *se = &p->se;
> +
> + if (p->sched_class != &fair_sched_class)
> + return;
> +
> + se_fi_update(se, rq->core->core_forceidle_seq, rq->core->core_forceidle);
> +}
> +
> +bool cfs_prio_less(struct task_struct *a, struct task_struct *b)
> +{
> + struct rq *rq = task_rq(a);
> + struct sched_entity *sea = &a->se;
> + struct sched_entity *seb = &b->se;
> + struct cfs_rq *cfs_rqa;
> + struct cfs_rq *cfs_rqb;
> + s64 delta;
> +
> + SCHED_WARN_ON(task_rq(b)->core != rq->core);
> +
> + while (sea->cfs_rq->tg != seb->cfs_rq->tg) {
> + int sea_depth = sea->depth;
> + int seb_depth = seb->depth;
> +
> + if (sea_depth >= seb_depth)
> + sea = parent_entity(sea);
> + if (sea_depth <= seb_depth)
> + seb = parent_entity(seb);
> + }
> +
> + if (rq->core->core_forceidle) {
> + se_fi_update(sea, rq->core->core_forceidle_seq, true);
> + se_fi_update(seb, rq->core->core_forceidle_seq, true);
> + }
> +
> + cfs_rqa = sea->cfs_rq;
> + cfs_rqb = seb->cfs_rq;
> +
> + /* normalize vruntime WRT their rq's base */
> + delta = (s64)(sea->vruntime - seb->vruntime) +
> + (s64)(cfs_rqb->min_vruntime_fi - cfs_rqa->min_vruntime_fi);
> +
> + return delta > 0;
> +}
> #else
> static inline void task_tick_core(struct rq *rq, struct task_struct *curr) {}
> #endif
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -522,6 +522,11 @@ struct cfs_rq {
> unsigned int h_nr_running; /* SCHED_{NORMAL,BATCH,IDLE} */
> unsigned int idle_h_nr_running; /* SCHED_IDLE */
>
> +#ifdef CONFIG_SCHED_CORE
> + unsigned int forceidle_seq;
> + u64 min_vruntime_fi;
> +#endif
> +
> u64 exec_clock;
> u64 min_vruntime;
> #ifndef CONFIG_64BIT
> @@ -1061,7 +1066,8 @@ struct rq {
> unsigned int core_task_seq;
> unsigned int core_pick_seq;
> unsigned long core_cookie;
> - unsigned char core_forceidle;
> + unsigned int core_forceidle;
> + unsigned int core_forceidle_seq;
> #endif
> };
>
> @@ -1106,6 +1112,8 @@ static inline raw_spinlock_t *rq_lockp(s
> return &rq->__lock;
> }
>
> +bool cfs_prio_less(struct task_struct *a, struct task_struct *b);
> +
> #else /* !CONFIG_SCHED_CORE */
>
> static inline bool sched_core_enabled(struct rq *rq)

2020-10-29 18:27:58

by Joel Fernandes

[permalink] [raw]
Subject: Re: [PATCH v8 -tip 08/26] sched/fair: Snapshot the min_vruntime of CPUs on force idle

On Mon, Oct 26, 2020 at 01:47:24PM +0100, Peter Zijlstra wrote:
[..]
> How's something like this?
>
> - after each pick, such that the pick itself sees the divergence (see
> above); either:
>
> - pull the vruntime_fi forward, when !fi
> - freeze the vruntime_fi, when newly fi (A)
>
> - either way, update vruntime_fi for each cfs_rq in the active
> hierachy.
>
> - when comparing, and fi, update the vruntime_fi hierachy until we
> encounter a mark from (A), per doing it during the pick, but before
> runtime, this guaranteees it hasn't moved since (A).
>
> XXX, still buggered on SMT>2, imagine having {ta, tb, fi, i} on an SMT4,
> then when comparing any two tasks that do not involve the fi, we should
> (probably) have pulled them fwd -- but we can't actually pull them,
> because then the fi thing would break, mooo.
>
v>
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -115,19 +115,8 @@ static inline bool prio_less(struct task
> if (pa == -1) /* dl_prio() doesn't work because of stop_class above */
> return !dl_time_before(a->dl.deadline, b->dl.deadline);
>
> - if (pa == MAX_RT_PRIO + MAX_NICE) { /* fair */
> - u64 vruntime = b->se.vruntime;
> -
> - /*
> - * Normalize the vruntime if tasks are in different cpus.
> - */
> - if (task_cpu(a) != task_cpu(b)) {
> - vruntime -= task_cfs_rq(b)->min_vruntime;
> - vruntime += task_cfs_rq(a)->min_vruntime;
> - }
> -
> - return !((s64)(a->se.vruntime - vruntime) <= 0);
> - }
> + if (pa == MAX_RT_PRIO + MAX_NICE) /* fair */
> + return cfs_prio_less(a, b);
>
> return false;
> }
> @@ -4642,12 +4631,15 @@ pick_task(struct rq *rq, const struct sc
> return cookie_pick;
> }
>
> +extern void task_vruntime_update(struct rq *rq, struct task_struct *p);
> +
> static struct task_struct *
> pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
> {
> struct task_struct *next, *max = NULL;
> const struct sched_class *class;
> const struct cpumask *smt_mask;
> + bool fi_before = false;
> bool need_sync;
> int i, j, cpu;
>
> @@ -4707,6 +4699,7 @@ pick_next_task(struct rq *rq, struct tas
> need_sync = !!rq->core->core_cookie;
> if (rq->core->core_forceidle) {
> need_sync = true;
> + fi_before = true;
> rq->core->core_forceidle = false;
> }
>
> @@ -4757,6 +4750,11 @@ pick_next_task(struct rq *rq, struct tas
> continue;
>
> rq_i->core_pick = p;
> + if (rq_i->idle == p && rq_i->nr_running) {
> + rq->core->core_forceidle = true;
> + if (!fi_before)
> + rq->core->core_forceidle_seq++;
> + }
>
> /*
> * If this new candidate is of higher priority than the
> @@ -4775,6 +4773,7 @@ pick_next_task(struct rq *rq, struct tas
> max = p;
>
> if (old_max) {
> + rq->core->core_forceidle = false;
> for_each_cpu(j, smt_mask) {
> if (j == i)
> continue;
> @@ -4823,10 +4822,8 @@ pick_next_task(struct rq *rq, struct tas
> if (!rq_i->core_pick)
> continue;
>
> - if (is_task_rq_idle(rq_i->core_pick) && rq_i->nr_running &&
> - !rq_i->core->core_forceidle) {
> - rq_i->core->core_forceidle = true;
> - }
> + if (!(fi_before && rq->core->core_forceidle))
> + task_vruntime_update(rq_i, rq_i->core_pick);

Shouldn't this be:

if (!fi_before && rq->core->core_forceidle)
task_vruntime_update(rq_i, rq_i->core_pick);

?

>
> if (i == cpu) {
> rq_i->core_pick = NULL;
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -10686,6 +10686,67 @@ static inline void task_tick_core(struct
> __entity_slice_used(&curr->se, MIN_NR_TASKS_DURING_FORCEIDLE))
> resched_curr(rq);
> }
> +
> +static void se_fi_update(struct sched_entity *se, unsigned int fi_seq, bool forceidle)
> +{
> + for_each_sched_entity(se) {
> + struct cfs_rq *cfs_rq = cfs_rq_of(se);
> +
> + if (forceidle) {
> + if (cfs_rq->forceidle_seq == fi_seq)
> + break;
> + cfs_rq->forceidle_seq = fi_seq;
> + }
> +
> + cfs_rq->min_vruntime_fi = cfs_rq->min_vruntime;
> + }
> +}
> +
> +void task_vruntime_update(struct rq *rq, struct task_struct *p)
> +{
> + struct sched_entity *se = &p->se;
> +
> + if (p->sched_class != &fair_sched_class)
> + return;
> +
> + se_fi_update(se, rq->core->core_forceidle_seq, rq->core->core_forceidle);
> +}
> +
> +bool cfs_prio_less(struct task_struct *a, struct task_struct *b)
> +{
> + struct rq *rq = task_rq(a);
> + struct sched_entity *sea = &a->se;
> + struct sched_entity *seb = &b->se;
> + struct cfs_rq *cfs_rqa;
> + struct cfs_rq *cfs_rqb;
> + s64 delta;
> +
> + SCHED_WARN_ON(task_rq(b)->core != rq->core);
> +
> + while (sea->cfs_rq->tg != seb->cfs_rq->tg) {
> + int sea_depth = sea->depth;
> + int seb_depth = seb->depth;
> +
> + if (sea_depth >= seb_depth)
> + sea = parent_entity(sea);
> + if (sea_depth <= seb_depth)
> + seb = parent_entity(seb);
> + }
> +
> + if (rq->core->core_forceidle) {
> + se_fi_update(sea, rq->core->core_forceidle_seq, true);
> + se_fi_update(seb, rq->core->core_forceidle_seq, true);
> + }

As we chatted on IRC you mentioned the reason for the sync here is:

say we have 2 cgroups (a,b) under root, and we go force-idle in a, then we
update a and root. Then we pick and end up in b, but b hasn't been updated
yet.

One thing I was wondering about that was, if the pick of 'b' happens much
later than 'a', then the snapshot might be happening too late right?

Maybe the snapshot should happen on all cfs_rqs on all siblings in
pick_next_task() itself? That way everything gets updated at the instant the
force-idle started. Thought that may be a bit more slow.

thanks,

- Joel

2020-10-29 19:02:53

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH v8 -tip 08/26] sched/fair: Snapshot the min_vruntime of CPUs on force idle

On Thu, Oct 29, 2020 at 02:24:29PM -0400, Joel Fernandes wrote:

> > @@ -4823,10 +4822,8 @@ pick_next_task(struct rq *rq, struct tas
> > if (!rq_i->core_pick)
> > continue;
> >
> > - if (is_task_rq_idle(rq_i->core_pick) && rq_i->nr_running &&
> > - !rq_i->core->core_forceidle) {
> > - rq_i->core->core_forceidle = true;
> > - }
> > + if (!(fi_before && rq->core->core_forceidle))
> > + task_vruntime_update(rq_i, rq_i->core_pick);
>
> Shouldn't this be:
>
> if (!fi_before && rq->core->core_forceidle)
> task_vruntime_update(rq_i, rq_i->core_pick);
>
> ?

*groan*, I should've written a comment there :/

When we're not fi, we need to update.
when we're fi and we were not fi, we must update
When we're fi and we were already fi, we must not update

Which gives:

fib fi X

0 0 1
0 1 0
1 0 1
1 1 1

which is: !(!fib && fi) or something.

> > +bool cfs_prio_less(struct task_struct *a, struct task_struct *b)
> > +{
> > + struct rq *rq = task_rq(a);
> > + struct sched_entity *sea = &a->se;
> > + struct sched_entity *seb = &b->se;
> > + struct cfs_rq *cfs_rqa;
> > + struct cfs_rq *cfs_rqb;
> > + s64 delta;
> > +
> > + SCHED_WARN_ON(task_rq(b)->core != rq->core);
> > +
> > + while (sea->cfs_rq->tg != seb->cfs_rq->tg) {
> > + int sea_depth = sea->depth;
> > + int seb_depth = seb->depth;
> > +
> > + if (sea_depth >= seb_depth)
> > + sea = parent_entity(sea);
> > + if (sea_depth <= seb_depth)
> > + seb = parent_entity(seb);
> > + }
> > +
> > + if (rq->core->core_forceidle) {
> > + se_fi_update(sea, rq->core->core_forceidle_seq, true);
> > + se_fi_update(seb, rq->core->core_forceidle_seq, true);
> > + }
>
> As we chatted on IRC you mentioned the reason for the sync here is:
>
> say we have 2 cgroups (a,b) under root, and we go force-idle in a, then we
> update a and root. Then we pick and end up in b, but b hasn't been updated
> yet.
>
> One thing I was wondering about that was, if the pick of 'b' happens much
> later than 'a', then the snapshot might be happening too late right?

No, since this is the first pick in b since fi, it cannot have advanced.
So by updating to fi_seq before picking, we guarantee it is unchanged
since we went fi.

2020-10-30 02:39:23

by Joel Fernandes

[permalink] [raw]
Subject: Re: [PATCH v8 -tip 08/26] sched/fair: Snapshot the min_vruntime of CPUs on force idle

On Thu, Oct 29, 2020 at 2:59 PM Peter Zijlstra <[email protected]> wrote:
>
> On Thu, Oct 29, 2020 at 02:24:29PM -0400, Joel Fernandes wrote:
>
> > > @@ -4823,10 +4822,8 @@ pick_next_task(struct rq *rq, struct tas
> > > if (!rq_i->core_pick)
> > > continue;
> > >
> > > - if (is_task_rq_idle(rq_i->core_pick) && rq_i->nr_running &&
> > > - !rq_i->core->core_forceidle) {
> > > - rq_i->core->core_forceidle = true;
> > > - }
> > > + if (!(fi_before && rq->core->core_forceidle))
> > > + task_vruntime_update(rq_i, rq_i->core_pick);
> >
> > Shouldn't this be:
> >
> > if (!fi_before && rq->core->core_forceidle)
> > task_vruntime_update(rq_i, rq_i->core_pick);
> >
> > ?
>
> *groan*, I should've written a comment there :/
>
> When we're not fi, we need to update.
> when we're fi and we were not fi, we must update
> When we're fi and we were already fi, we must not update
>
> Which gives:
>
> fib fi X
>
> 0 0 1
> 0 1 0
> 1 0 1
> 1 1 1
>
> which is: !(!fib && fi) or something.
>

Got it! This is what my initial patch intended to do as well, but
yours is better.

> > > +bool cfs_prio_less(struct task_struct *a, struct task_struct *b)
> > > +{
> > > + struct rq *rq = task_rq(a);
> > > + struct sched_entity *sea = &a->se;
> > > + struct sched_entity *seb = &b->se;
> > > + struct cfs_rq *cfs_rqa;
> > > + struct cfs_rq *cfs_rqb;
> > > + s64 delta;
> > > +
> > > + SCHED_WARN_ON(task_rq(b)->core != rq->core);
> > > +
> > > + while (sea->cfs_rq->tg != seb->cfs_rq->tg) {
> > > + int sea_depth = sea->depth;
> > > + int seb_depth = seb->depth;
> > > +
> > > + if (sea_depth >= seb_depth)
> > > + sea = parent_entity(sea);
> > > + if (sea_depth <= seb_depth)
> > > + seb = parent_entity(seb);
> > > + }
> > > +
> > > + if (rq->core->core_forceidle) {
> > > + se_fi_update(sea, rq->core->core_forceidle_seq, true);
> > > + se_fi_update(seb, rq->core->core_forceidle_seq, true);
> > > + }
> >
> > As we chatted on IRC you mentioned the reason for the sync here is:
> >
> > say we have 2 cgroups (a,b) under root, and we go force-idle in a, then we
> > update a and root. Then we pick and end up in b, but b hasn't been updated
> > yet.
> >
> > One thing I was wondering about that was, if the pick of 'b' happens much
> > later than 'a', then the snapshot might be happening too late right?
>
> No, since this is the first pick in b since fi, it cannot have advanced.
> So by updating to fi_seq before picking, we guarantee it is unchanged
> since we went fi.

Makes complete sense.

I got it to a point where the latencies are much lower, but still not
at a point where it's as good as the initial patch I posted.

There could be more bugs. At the moment, the only one I corrected in
your patch is making the truth table do !(!fib && fi). But there is
still something else going on.

Thanks!

- Joel

2020-10-30 02:45:03

by Joel Fernandes

[permalink] [raw]
Subject: Re: [PATCH v8 -tip 08/26] sched/fair: Snapshot the min_vruntime of CPUs on force idle

On Thu, Oct 29, 2020 at 10:36 PM Joel Fernandes <[email protected]> wrote:

> > > > +bool cfs_prio_less(struct task_struct *a, struct task_struct *b)
> > > > +{
> > > > + struct rq *rq = task_rq(a);
> > > > + struct sched_entity *sea = &a->se;
> > > > + struct sched_entity *seb = &b->se;
> > > > + struct cfs_rq *cfs_rqa;
> > > > + struct cfs_rq *cfs_rqb;
> > > > + s64 delta;
> > > > +
> > > > + SCHED_WARN_ON(task_rq(b)->core != rq->core);
> > > > +
> > > > + while (sea->cfs_rq->tg != seb->cfs_rq->tg) {
> > > > + int sea_depth = sea->depth;
> > > > + int seb_depth = seb->depth;
> > > > +
> > > > + if (sea_depth >= seb_depth)
> > > > + sea = parent_entity(sea);
> > > > + if (sea_depth <= seb_depth)
> > > > + seb = parent_entity(seb);
> > > > + }
> > > > +
> > > > + if (rq->core->core_forceidle) {
> > > > + se_fi_update(sea, rq->core->core_forceidle_seq, true);
> > > > + se_fi_update(seb, rq->core->core_forceidle_seq, true);
> > > > + }
> > >
> > > As we chatted on IRC you mentioned the reason for the sync here is:
> > >
> > > say we have 2 cgroups (a,b) under root, and we go force-idle in a, then we
> > > update a and root. Then we pick and end up in b, but b hasn't been updated
> > > yet.
> > >
> > > One thing I was wondering about that was, if the pick of 'b' happens much
> > > later than 'a', then the snapshot might be happening too late right?
> >
> > No, since this is the first pick in b since fi, it cannot have advanced.
> > So by updating to fi_seq before picking, we guarantee it is unchanged
> > since we went fi.
>
> Makes complete sense.
>
> I got it to a point where the latencies are much lower, but still not
> at a point where it's as good as the initial patch I posted.
>
> There could be more bugs. At the moment, the only one I corrected in
> your patch is making the truth table do !(!fib && fi). But there is
> still something else going on.

Forgot to ask, do you also need to do the task_vruntime_update() for
the unconstrained pick?

That's in line with what you mentioned: That you still need to do the
update if fi_before == false and fi_now == false.

So something like this?
@@ -4209,6 +4209,10 @@ pick_next_task(struct rq *rq, struct
task_struct *prev, struct rq_flags *rf)
next = p;
trace_printk("unconstrained pick: %s/%d %lx\n",
next->comm, next->pid,
next->core_cookie);
+
+ WARN_ON_ONCE(fi_before);
+ task_vruntime_update(rq_i, p);
+
goto done;
}

Quoting the truth table:

> > fib fi X
> >
> > 0 0 1
> > 0 1 0
> > 1 0 1
> > 1 1 1
> >

thanks,

- Joel

2020-10-30 08:44:41

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH v8 -tip 08/26] sched/fair: Snapshot the min_vruntime of CPUs on force idle

On Thu, Oct 29, 2020 at 10:42:29PM -0400, Joel Fernandes wrote:

> Forgot to ask, do you also need to do the task_vruntime_update() for
> the unconstrained pick?

Humm.. interesting case.

Yes, however... since in that case the picks aren't synchronized it's a
wee bit dodgy. I'll throw it on the pile together with SMT4.


Also, I'm still hoping you can make this form work:

https://lkml.kernel.org/r/[email protected]

(note that the put_prev_task() needs an additional rq argument)

That's both simpler code and faster.

2020-10-31 21:43:05

by Joel Fernandes

[permalink] [raw]
Subject: Re: [PATCH v8 -tip 08/26] sched/fair: Snapshot the min_vruntime of CPUs on force idle

On Fri, Oct 30, 2020 at 09:41:12AM +0100, Peter Zijlstra wrote:
> On Thu, Oct 29, 2020 at 10:42:29PM -0400, Joel Fernandes wrote:
>
> > Forgot to ask, do you also need to do the task_vruntime_update() for
> > the unconstrained pick?
>
> Humm.. interesting case.
>
> Yes, however... since in that case the picks aren't synchronized it's a
> wee bit dodgy. I'll throw it on the pile together with SMT4.

Ok. I threw it into the patch below anyway to not take any chances. I got the
version of your patch to a point where the perf is looking good (there's
still some room for improvement, but I am much much happier with the tests).

The changes are:
- Fix but in !(fib && !fi) thingie.
- Pass prio_less with with force-idle information ... prio_less() is called
from various paths. We need to be careful what we pass it. Its possible we
select idle on something, but we are not done with selection yet - we
continue doing prio_less(). So just pass it fi_before.
- During enqueue, prio_less is used then, we may need to sync even then.
- Make cfs_prio_less sync even when !fi.
- Sync the min_vruntime for unconstrained picks and during prio_less() even
when the core is not in FI. Clarify why you feel for its dodgy for sync'ing
when unconstrained?

> Also, I'm still hoping you can make this form work:
>
> https://lkml.kernel.org/r/[email protected]
>
> (note that the put_prev_task() needs an additional rq argument)
>
> That's both simpler code and faster.

Sure, I'll give that a go soon.

(This is based on our older 4.19 kernel, so the task_tick_fair() bit needed
changes to make it work - which aren't needed in v8).

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 711f8a78a947..c15fd5bbc707 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -97,7 +97,7 @@ static inline int __task_prio(struct task_struct *p)
*/

/* real prio, less is less */
-static inline bool prio_less(struct task_struct *a, struct task_struct *b)
+static inline bool prio_less(struct task_struct *a, struct task_struct *b, bool in_fi)
{

int pa = __task_prio(a), pb = __task_prio(b);
@@ -112,7 +112,7 @@ static inline bool prio_less(struct task_struct *a, struct task_struct *b)
return !dl_time_before(a->dl.deadline, b->dl.deadline);

if (pa == MAX_RT_PRIO + MAX_NICE) /* fair */
- return cfs_prio_less(a, b);
+ return cfs_prio_less(a, b, in_fi);

return false;
}
@@ -126,7 +126,7 @@ static inline bool __sched_core_less(struct task_struct *a, struct task_struct *
return false;

/* flip prio, so high prio is leftmost */
- if (prio_less(b, a))
+ if (prio_less(b, a, task_rq(a)->core->core_forceidle))
return true;

return false;
@@ -4025,7 +4025,7 @@ void sched_core_irq_exit(void)
* - Else returns idle_task.
*/
static struct task_struct *
-pick_task(struct rq *rq, const struct sched_class *class, struct task_struct *max)
+pick_task(struct rq *rq, const struct sched_class *class, struct task_struct *max, bool in_fi)
{
struct task_struct *class_pick, *cookie_pick;
unsigned long cookie = rq->core->core_cookie;
@@ -4040,7 +4040,7 @@ pick_task(struct rq *rq, const struct sched_class *class, struct task_struct *ma
* higher priority than max.
*/
if (max && class_pick->core_cookie &&
- prio_less(class_pick, max))
+ prio_less(class_pick, max, in_fi))
return idle_sched_class.pick_task(rq);

return class_pick;
@@ -4059,13 +4059,15 @@ pick_task(struct rq *rq, const struct sched_class *class, struct task_struct *ma
* the core (so far) and it must be selected, otherwise we must go with
* the cookie pick in order to satisfy the constraint.
*/
- if (prio_less(cookie_pick, class_pick) &&
- (!max || prio_less(max, class_pick)))
+ if (prio_less(cookie_pick, class_pick, in_fi) &&
+ (!max || prio_less(max, class_pick, in_fi)))
return class_pick;

return cookie_pick;
}

+extern void task_vruntime_update(struct rq *rq, struct task_struct *p, bool in_fi);
+
static struct task_struct *
pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
@@ -4141,15 +4143,6 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
update_rq_clock(rq_i);
}

- if (!fi_before) {
- for_each_cpu(i, smt_mask) {
- struct rq *rq_i = cpu_rq(i);
-
- /* Reset the snapshot if core is no longer in force-idle. */
- rq_i->cfs.min_vruntime_fi = rq_i->cfs.min_vruntime;
- }
- }
-
/*
* Try and select tasks for each sibling in decending sched_class
* order.
@@ -4174,7 +4167,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
* highest priority task already selected for this
* core.
*/
- p = pick_task(rq_i, class, max);
+ p = pick_task(rq_i, class, max, fi_before);
if (!p) {
/*
* If there weren't no cookies; we don't need
@@ -4199,6 +4192,10 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
*/
if (i == cpu && !need_sync && !p->core_cookie) {
next = p;
+
+ WARN_ON_ONCE(fi_before);
+ task_vruntime_update(rq_i, p, false);
+
goto done;
}

@@ -4206,6 +4203,11 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
occ++;

rq_i->core_pick = p;
+ if (rq_i->idle == p && rq_i->nr_running) {
+ rq->core->core_forceidle = true;
+ if (!fi_before)
+ rq->core->core_forceidle_seq++;
+ }

/*
* If this new candidate is of higher priority than the
@@ -4224,6 +4226,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
max = p;

if (old_max) {
+ rq->core->core_forceidle = false;
for_each_cpu(j, smt_mask) {
if (j == i)
continue;
@@ -4268,29 +4271,22 @@ next_class:;

WARN_ON_ONCE(!rq_i->core_pick);

- if (is_idle_task(rq_i->core_pick) && rq_i->nr_running) {
- rq_i->core_forceidle = true;
- need_sync = true;
- }
+ if (!(!fi_before && rq->core->core_forceidle)) {
+ task_vruntime_update(rq_i, rq_i->core_pick, true);

- rq_i->core_pick->core_occupation = occ;
+ rq_i->core_pick->core_occupation = occ;

- if (i == cpu)
- continue;
+ if (i == cpu)
+ continue;

- if (rq_i->curr != rq_i->core_pick)
- resched_curr(rq_i);
+ if (rq_i->curr != rq_i->core_pick) {
+ resched_curr(rq_i);
+ }
+ }

/* Did we break L1TF mitigation requirements? */
- WARN_ON_ONCE(!cookie_match(next, rq_i->core_pick));
- }
-
- if (!fi_before && need_sync) {
- for_each_cpu(i, smt_mask) {
- struct rq *rq_i = cpu_rq(i);
-
- /* Snapshot if core is in force-idle. */
- rq_i->cfs.min_vruntime_fi = rq_i->cfs.min_vruntime;
+ if (unlikely(!cookie_match(next, rq_i->core_pick))) {
+ WARN_ON_ONCE(1);
}
}
done:
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7e36608477aa..6d8e16bc3d79 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -483,36 +483,30 @@ static inline u64 cfs_rq_min_vruntime(struct cfs_rq *cfs_rq)
}

#ifdef CONFIG_FAIR_GROUP_SCHED
-bool cfs_prio_less(struct task_struct *a, struct task_struct *b)
+static void se_fi_update(struct sched_entity *se, unsigned int fi_seq, bool forceidle);
+bool cfs_prio_less(struct task_struct *a, struct task_struct *b, bool in_fi)
{
- bool samecpu = task_cpu(a) == task_cpu(b);
+ struct rq *rq = task_rq(a);
struct sched_entity *sea = &a->se;
struct sched_entity *seb = &b->se;
struct cfs_rq *cfs_rqa;
struct cfs_rq *cfs_rqb;
s64 delta;

- if (samecpu) {
- /* vruntime is per cfs_rq */
- while (!is_same_group(sea, seb)) {
- int sea_depth = sea->depth;
- int seb_depth = seb->depth;
+ SCHED_WARN_ON(task_rq(b)->core != rq->core);

- if (sea_depth >= seb_depth)
- sea = parent_entity(sea);
- if (sea_depth <= seb_depth)
- seb = parent_entity(seb);
- }
+ while (sea->cfs_rq->tg != seb->cfs_rq->tg) {
+ int sea_depth = sea->depth;
+ int seb_depth = seb->depth;

- delta = (s64)(sea->vruntime - seb->vruntime);
- goto out;
+ if (sea_depth >= seb_depth)
+ sea = parent_entity(sea);
+ if (sea_depth <= seb_depth)
+ seb = parent_entity(seb);
}

- /* crosscpu: compare root level se's vruntime to decide priority */
- while (sea->parent)
- sea = sea->parent;
- while (seb->parent)
- seb = seb->parent;
+ se_fi_update(sea, rq->core->core_forceidle_seq, in_fi);
+ se_fi_update(seb, rq->core->core_forceidle_seq, in_fi);

cfs_rqa = sea->cfs_rq;
cfs_rqb = seb->cfs_rq;
@@ -520,7 +514,7 @@ bool cfs_prio_less(struct task_struct *a, struct task_struct *b)
/* normalize vruntime WRT their rq's base */
delta = (s64)(sea->vruntime - seb->vruntime) +
(s64)(cfs_rqb->min_vruntime_fi - cfs_rqa->min_vruntime_fi);
-out:
+
return delta > 0;
}
#endif /* CONFIG_FAIR_GROUP_SCHED */
@@ -10929,8 +10923,6 @@ static void core_sched_deactivate_fair(struct rq *rq)
*/
static void resched_forceidle_sibling(struct rq *rq, struct sched_entity *se)
{
- int cpu = cpu_of(rq), sibling_cpu;
-
/*
* If runqueue has only one task which used up its slice and if the
* sibling is forced idle, then trigger schedule to give forced idle
@@ -10944,23 +10936,9 @@ static void resched_forceidle_sibling(struct rq *rq, struct sched_entity *se)
* forced idle cpu has atleast MIN_NR_TASKS_DURING_FORCEIDLE - 1 tasks
* and use that to check if we need to give up the cpu.
*/
- if (rq->cfs.nr_running > 1 ||
- !__entity_slice_used(se, MIN_NR_TASKS_DURING_FORCEIDLE))
- return;
-
- for_each_cpu(sibling_cpu, cpu_smt_mask(cpu)) {
- struct rq *sibling_rq;
- if (sibling_cpu == cpu)
- continue;
- if (cpu_is_offline(sibling_cpu))
- continue;
-
- sibling_rq = cpu_rq(sibling_cpu);
- if (sibling_rq->core_forceidle) {
- resched_curr(rq);
- break;
- }
- }
+ if (rq->core->core_forceidle && rq->cfs.nr_running == 1 &&
+ __entity_slice_used(se, MIN_NR_TASKS_DURING_FORCEIDLE))
+ resched_curr(rq);
}
#endif

@@ -11102,6 +11080,41 @@ static void propagate_entity_cfs_rq(struct sched_entity *se)
update_load_avg(cfs_rq, se, UPDATE_TG);
}
}
+static void se_fi_update(struct sched_entity *se, unsigned int fi_seq, bool forceidle)
+{
+ bool root = true;
+ long old, new;
+
+ for_each_sched_entity(se) {
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+ if (forceidle) {
+ if (cfs_rq->forceidle_seq == fi_seq)
+ break;
+ cfs_rq->forceidle_seq = fi_seq;
+ }
+
+ if (root) {
+ old = cfs_rq->min_vruntime_fi;
+ new = cfs_rq->min_vruntime;
+ root = false;
+ trace_printk("cfs_rq(min_vruntime_fi) %Lu->%Lu\n",
+ old, new);
+ }
+
+ cfs_rq->min_vruntime_fi = cfs_rq->min_vruntime;
+ }
+}
+
+void task_vruntime_update(struct rq *rq, struct task_struct *p, bool in_fi)
+{
+ struct sched_entity *se = &p->se;
+
+ if (p->sched_class != &fair_sched_class)
+ return;
+
+ se_fi_update(se, rq->core->core_forceidle_seq, in_fi);
+}
+
#else
static void propagate_entity_cfs_rq(struct sched_entity *se) { }
#endif
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 45c8ce5c2333..74ad557d551e 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -497,9 +497,13 @@ struct cfs_rq {
unsigned int nr_running;
unsigned int h_nr_running;

+#ifdef CONFIG_SCHED_CORE
+ unsigned int forceidle_seq;
+ u64 min_vruntime_fi;
+#endif
+
u64 exec_clock;
u64 min_vruntime;
- u64 min_vruntime_fi;
#ifndef CONFIG_64BIT
u64 min_vruntime_copy;
#endif
@@ -974,7 +978,6 @@ struct rq {
unsigned int core_enabled;
unsigned int core_sched_seq;
struct rb_root core_tree;
- bool core_forceidle;
unsigned char core_pause_pending;
unsigned int core_this_irq_nest;

@@ -982,6 +985,8 @@ struct rq {
unsigned int core_task_seq;
unsigned int core_pick_seq;
unsigned long core_cookie;
+ unsigned int core_forceidle;
+ unsigned int core_forceidle_seq;
unsigned int core_irq_nest;
#endif
};
@@ -1061,6 +1066,8 @@ extern void queue_core_balance(struct rq *rq);
void sched_core_add(struct rq *rq, struct task_struct *p);
void sched_core_remove(struct rq *rq, struct task_struct *p);

+bool cfs_prio_less(struct task_struct *a, struct task_struct *b, bool fi);
+
#else /* !CONFIG_SCHED_CORE */

static inline bool sched_core_enabled(struct rq *rq)
@@ -2549,7 +2556,7 @@ unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned
#define perf_domain_span(pd) NULL
#endif

-bool cfs_prio_less(struct task_struct *a, struct task_struct *b);
+bool cfs_prio_less(struct task_struct *a, struct task_struct *b, bool fi);

#ifdef CONFIG_SMP
extern struct static_key_false sched_energy_present;