During force-idle, we end up doing cross-cpu comparison of vruntimes
during pick_next_task. If we simply compare (vruntime-min_vruntime)
across CPUs, and if the CPUs only have 1 task each, we will always
end up comparing 0 with 0 and pick just one of the tasks all the time.
This starves the task that was not picked. To fix this, take a snapshot
of the min_vruntime when entering force idle and use it for comparison.
This min_vruntime snapshot will only be used for cross-CPU vruntime
comparison, and nothing else.
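As an illustrative example (numbers made up): suppose CPU0 has only
task A with vruntime == min_vruntime == 1000, and CPU1 has only task B
with vruntime == min_vruntime == 200. Comparing (vruntime -
min_vruntime) yields 0 vs 0 on every pick, so the tie always breaks the
same way and one task starves. With snapshots taken at force-idle entry
(min_vruntime_fi == 1000 and 200 respectively), the task that actually
ran advances past its snapshot while the starved one does not, so a
later pick compares e.g. (1012 - 1000) == 12 against (200 - 200) == 0
and correctly prefers the starved task.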
This resolves several performance issues that were seen in the ChromeOS
audio use case.
NOTE: This patch will be improved in a later patch. It is just
kept here as the basis for the later patch and to make rebasing
easier. Further, it may make reverting the improvement easier in
case the improvement causes any regression.
Tested-by: Julien Desfossez <[email protected]>
Signed-off-by: Joel Fernandes (Google) <[email protected]>
---
kernel/sched/core.c | 33 ++++++++++++++++++++-------------
kernel/sched/fair.c | 40 ++++++++++++++++++++++++++++++++++++++++
kernel/sched/sched.h | 5 +++++
3 files changed, 65 insertions(+), 13 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 52d0e83072a4..4ee4902c2cf5 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -115,19 +115,8 @@ static inline bool prio_less(struct task_struct *a, struct task_struct *b)
if (pa == -1) /* dl_prio() doesn't work because of stop_class above */
return !dl_time_before(a->dl.deadline, b->dl.deadline);
- if (pa == MAX_RT_PRIO + MAX_NICE) { /* fair */
- u64 vruntime = b->se.vruntime;
-
- /*
- * Normalize the vruntime if tasks are in different cpus.
- */
- if (task_cpu(a) != task_cpu(b)) {
- vruntime -= task_cfs_rq(b)->min_vruntime;
- vruntime += task_cfs_rq(a)->min_vruntime;
- }
-
- return !((s64)(a->se.vruntime - vruntime) <= 0);
- }
+ if (pa == MAX_RT_PRIO + MAX_NICE) /* fair */
+ return cfs_prio_less(a, b);
return false;
}
@@ -5144,6 +5133,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
struct task_struct *next, *max = NULL;
const struct sched_class *class;
const struct cpumask *smt_mask;
+ bool fi_before = false;
bool need_sync;
int i, j, cpu;
@@ -5208,6 +5198,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
rq->core->core_cookie = 0UL;
if (rq->core->core_forceidle) {
need_sync = true;
+ fi_before = true;
rq->core->core_forceidle = false;
}
for_each_cpu(i, smt_mask) {
@@ -5219,6 +5210,14 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
update_rq_clock(rq_i);
}
+ /* Reset the snapshot if core is no longer in force-idle. */
+ if (!fi_before) {
+ for_each_cpu(i, smt_mask) {
+ struct rq *rq_i = cpu_rq(i);
+ rq_i->cfs.min_vruntime_fi = rq_i->cfs.min_vruntime;
+ }
+ }
+
/*
* Try and select tasks for each sibling in decending sched_class
* order.
@@ -5355,6 +5354,14 @@ next_class:;
resched_curr(rq_i);
}
+ /* Snapshot if core is in force-idle. */
+ if (!fi_before && rq->core->core_forceidle) {
+ for_each_cpu(i, smt_mask) {
+ struct rq *rq_i = cpu_rq(i);
+ rq_i->cfs.min_vruntime_fi = rq_i->cfs.min_vruntime;
+ }
+ }
+
done:
set_next_task(rq, next);
return next;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 42965c4fd71f..de82f88ba98c 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -10726,6 +10726,46 @@ static inline void task_tick_core(struct rq *rq, struct task_struct *curr)
__entity_slice_used(&curr->se, MIN_NR_TASKS_DURING_FORCEIDLE))
resched_curr(rq);
}
+
+bool cfs_prio_less(struct task_struct *a, struct task_struct *b)
+{
+ bool samecpu = task_cpu(a) == task_cpu(b);
+ struct sched_entity *sea = &a->se;
+ struct sched_entity *seb = &b->se;
+ struct cfs_rq *cfs_rqa;
+ struct cfs_rq *cfs_rqb;
+ s64 delta;
+
+ if (samecpu) {
+ /* vruntime is per cfs_rq */
+ while (!is_same_group(sea, seb)) {
+ int sea_depth = sea->depth;
+ int seb_depth = seb->depth;
+ if (sea_depth >= seb_depth)
+ sea = parent_entity(sea);
+ if (sea_depth <= seb_depth)
+ seb = parent_entity(seb);
+ }
+
+ delta = (s64)(sea->vruntime - seb->vruntime);
+ goto out;
+ }
+
+ /* crosscpu: compare root level se's vruntime to decide priority */
+ while (sea->parent)
+ sea = sea->parent;
+ while (seb->parent)
+ seb = seb->parent;
+
+ cfs_rqa = sea->cfs_rq;
+ cfs_rqb = seb->cfs_rq;
+
+ /* normalize vruntime WRT their rq's base */
+ delta = (s64)(sea->vruntime - seb->vruntime) +
+ (s64)(cfs_rqb->min_vruntime_fi - cfs_rqa->min_vruntime_fi);
+out:
+ return delta > 0;
+}
#else
static inline void task_tick_core(struct rq *rq, struct task_struct *curr) {}
#endif
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index be656ca8693d..d934cc51acf1 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -517,6 +517,9 @@ struct cfs_rq {
u64 exec_clock;
u64 min_vruntime;
+#ifdef CONFIG_SCHED_CORE
+ u64 min_vruntime_fi;
+#endif
#ifndef CONFIG_64BIT
u64 min_vruntime_copy;
#endif
@@ -1130,6 +1133,8 @@ static inline raw_spinlock_t *rq_lockp(struct rq *rq)
return &rq->__lock;
}
+bool cfs_prio_less(struct task_struct *a, struct task_struct *b);
+
#else /* !CONFIG_SCHED_CORE */
static inline bool sched_core_enabled(struct rq *rq)
--
2.29.2.299.gdc1121823c-goog
On Tue, Nov 17, 2020 at 06:19:39PM -0500, Joel Fernandes (Google) wrote:
> During force-idle, we end up doing cross-cpu comparison of vruntimes
> during pick_next_task. If we simply compare (vruntime-min_vruntime)
> across CPUs, and if the CPUs only have 1 task each, we will always
> end up comparing 0 with 0 and pick just one of the tasks all the time.
> This starves the task that was not picked. To fix this, take a snapshot
> of the min_vruntime when entering force idle and use it for comparison.
> This min_vruntime snapshot will only be used for cross-CPU vruntime
> comparison, and nothing else.
>
> This resolves several performance issues that were seen in ChromeOS
> audio usecase.
>
> NOTE: Note, this patch will be improved in a later patch. It is just
> kept here as the basis for the later patch and to make rebasing
> easier. Further, it may make reverting the improvement easier in
> case the improvement causes any regression.
>
This seems cumbersome, is there no way to track the min_vruntime via
rq->core->min_vruntime?
Balbir Singh.
Hi Balbir,
On 11/22/20 6:44 AM, Balbir Singh wrote:
>
> This seems cumbersome, is there no way to track the min_vruntime via
> rq->core->min_vruntime?
Do you mean to have a core wide min_vruntime? We had a
similar approach from v3 to v7 and it had major issues which
broke the assumptions of cfs. There were some lengthy
discussions and Peter explained in-depth about the issues:
https://lwn.net/ml/linux-kernel/[email protected]/
https://lwn.net/ml/linux-kernel/[email protected]/
Thanks,
Vineeth
On Mon, Nov 23, 2020 at 07:31:31AM -0500, Vineeth Pillai wrote:
> Hi Balbir,
>
> On 11/22/20 6:44 AM, Balbir Singh wrote:
> >
> > This seems cumbersome, is there no way to track the min_vruntime via
> > rq->core->min_vruntime?
> Do you mean to have a core wide min_vruntime? We had a
> similar approach from v3 to v7 and it had major issues which
> broke the assumptions of cfs. There were some lengthy
> discussions and Peter explained in-depth about the issues:
>
> https://lwn.net/ml/linux-kernel/[email protected]/
> https://lwn.net/ml/linux-kernel/[email protected]/
>
One of the equations in the link is

"From which immediately follows that:

            T_k + T_l
    S_k+l = ---------                       (13)
            W_k + W_l

  On which we can define a combined lag:

    lag_k+l(i) := S_k+l - s_i               (14)

  And that gives us the tools to compare tasks across a combined runqueue.
"
S_k+l reads like rq->core->vruntime, but it sounds like the equivalent
of rq->core->vruntime is updated when we enter forced idle as opposed to
all the time.
Balbir
On Tue, Nov 24, 2020 at 10:31:49AM +1100, Balbir Singh wrote:
> On Mon, Nov 23, 2020 at 07:31:31AM -0500, Vineeth Pillai wrote:
> > Hi Balbir,
> >
> > On 11/22/20 6:44 AM, Balbir Singh wrote:
> > >
> > > This seems cumbersome, is there no way to track the min_vruntime via
> > > rq->core->min_vruntime?
> > Do you mean to have a core wide min_vruntime? We had a
> > similar approach from v3 to v7 and it had major issues which
> > broke the assumptions of cfs. There were some lengthy
> > discussions and Peter explained in-depth about the issues:
> >
> > https://lwn.net/ml/linux-kernel/[email protected]/
> > https://lwn.net/ml/linux-kernel/[email protected]/
> >
>
> One of the equations in the link is
>
> ">From which immediately follows that:
>
> T_k + T_l
> S_k+l = --------- (13)
> W_k + W_l
>
> On which we can define a combined lag:
>
> lag_k+l(i) := S_k+l - s_i (14)
>
> And that gives us the tools to compare tasks across a combined runqueue.
> "
>
> S_k+l reads like rq->core->vruntime, but it sounds like the equivalent
> of rq->core->vruntime is updated when we enter forced idle as opposed to
> all the time.
Yes, but actually computing and maintaining it is hella hard. Try it
with the very first example in that email (the infeasible weight
scenario) and tell me how it works for you ;-)
Also note that the text below (6) mentions dynamic, then look up the
EEVDF paper which describes some of the dynamics -- the paper is
incomplete and contains a bug, I forget if it ever got updated or if
there's another paper that completes it (the BFQ I/O scheduler started
from that and fixed it).
I'm not saying it cannot be done, I'm just saying it is really rather
involved and probably not worth it.
The basic observation the current approach relies on is that all that
faffery basically boils down to the fact that vruntime only means
something when there is contention. And that only the progression is
important not the actual value. That is, this is all fundamentally a
differential equation and our integration constant is meaningless (also
embodied in (7)).
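To make the "integration constant" point concrete (illustration only):
for any constant C, replacing every vruntime v_i on a runqueue by
v_i + C leaves every pairwise difference v_i - v_j, and therefore every
scheduling decision on that runqueue, unchanged. So the absolute values
on two different runqueues carry no information relative to each other;
only differences taken from reference points sampled at a common moment
in time -- which is what the snapshot provides -- can be compared.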
Also, I think the code as proposed here relies on SMT2 and is buggered
for SMT3+. Now, that second link above describes means of making SMT3+
work, but we're not there yet.
On Tue, Nov 24, 2020 at 10:09:55AM +0100, Peter Zijlstra wrote:
> On Tue, Nov 24, 2020 at 10:31:49AM +1100, Balbir Singh wrote:
> > On Mon, Nov 23, 2020 at 07:31:31AM -0500, Vineeth Pillai wrote:
> > > Hi Balbir,
> > >
> > > On 11/22/20 6:44 AM, Balbir Singh wrote:
> > > >
> > > > This seems cumbersome, is there no way to track the min_vruntime via
> > > > rq->core->min_vruntime?
> > > Do you mean to have a core wide min_vruntime? We had a
> > > similar approach from v3 to v7 and it had major issues which
> > > broke the assumptions of cfs. There were some lengthy
> > > discussions and Peter explained in-depth about the issues:
> > >
> > > https://lwn.net/ml/linux-kernel/[email protected]/
> > > https://lwn.net/ml/linux-kernel/[email protected]/
> > >
> >
> > One of the equations in the link is
> >
> > ">From which immediately follows that:
> >
> > T_k + T_l
> > S_k+l = --------- (13)
> > W_k + W_l
> >
> > On which we can define a combined lag:
> >
> > lag_k+l(i) := S_k+l - s_i (14)
> >
> > And that gives us the tools to compare tasks across a combined runqueue.
> > "
> >
> > S_k+l reads like rq->core->vruntime, but it sounds like the equivalent
> > of rq->core->vruntime is updated when we enter forced idle as opposed to
> > all the time.
>
> Yes, but actually computing and maintaining it is hella hard. Try it
> with the very first example in that email (the infeasible weight
> scenario) and tell me how it works for you ;-)
>
OK, I was hoping it could be done in the new RBTree's enqueue/dequeue,
but yes I've not implemented it and I should go back and take a look at
the first example again.
> Also note that the text below (6) mentions dynamic, then look up the
> EEVDF paper which describes some of the dynamics -- the paper is
> incomplete and contains a bug, I forget if it ever got updated or if
> there's another paper that completes it (the BQF I/O scheduler started
> from that and fixed it).
I see, I am yet to read the EEVDF paper, but now I am out on a tangent
:)
>
> I'm not saying it cannot be done, I'm just saying it is really rather
> involved and probably not worth it.
>
Fair enough
> The basic observation the current approach relies on is that al that
> faffery basically boils down to the fact that vruntime only means
> something when there is contention. And that only the progression is
> important not the actual value. That is, this is all fundamentally a
> differential equation and our integration constant is meaningless (also
> embodied in (7)).
>
I'll reread (6) and (7). I am trying to understand forced idle and
contention together. From what I understand of the patches, there are
two cases:
1. Two tasks that are core scheduled: in that case vruntime works as
expected on each CPU, but we need to compare their combined vruntime
against other tasks on each CPU respectively for them to be
selected/chosen?
2. When one of the tasks selected is part of the core scheduling group
and the other CPU does not select a core scheduled task, we need to ask
ourselves whether that CPU should force idle, and that's where this
logic comes into play?
> Also, I think the code as proposed here relies on SMT2 and is buggered
> for SMT3+. Now, that second link above describes means of making SMT3+
> work, but we're not there yet.
Thanks,
Balbir Singh.
On Thu, Nov 26, 2020 at 10:17:15AM +1100, Balbir Singh wrote:
> On Tue, Nov 24, 2020 at 10:09:55AM +0100, Peter Zijlstra wrote:
> > The basic observation the current approach relies on is that al that
> > faffery basically boils down to the fact that vruntime only means
> > something when there is contention. And that only the progression is
> > important not the actual value. That is, this is all fundamentally a
> > differential equation and our integration constant is meaningless (also
> > embodied in (7)).
> >
>
> I'll reread (6) and (7), I am trying to understand forced idle and
> contention together, from what I understand of the patches, there is
When we force-idle there is contention by definition; there's a task
that wanted to run, but couldn't.
> 1. two tasks that are core scheduled, in that case vruntime works as
> expected on each CPU, but we need to compare their combined vrtuntime
> against other tasks on each CPU respectively for them to be
> selected/chosen?
We need to compare across CPUs when the cookies don't match. This is
required to avoid starving one or the other.
> 2. When one of the tasks selected is a part of the core scheduling group
> and the other CPU does not select a core scheduled tasks, we need to ask
> ourselves if that CPU should force idle and that's where this logic
> comes into play?
When one CPU selects a cookie task, and the other CPU cannot find a
matching task, it must go idle (as idle matches everyone). This is the
basic core-scheduling constraint.
So suppose you have two tasks, A and B, both with a cookie, but not
matching.
Normal scheduling would run A and B concurrent on the two siblings. Core
scheduling obviously cannot do this. When we pick A, the other CPU is
not allowed to run B and thus will have to be forced idle and
vice-versa.
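A minimal sketch of that constraint (illustrative only, assumes SMT2,
and the sibling_best_task() helper below is made up; this is not the
real pick_next_task() code):

	static struct task_struct *
	pick_for_sibling(struct rq *rq_i, unsigned long cookie)
	{
		/* hypothetical helper: best runnable task on this sibling */
		struct task_struct *p = sibling_best_task(rq_i);

		if (!p)
			return rq_i->idle;

		/*
		 * Idle matches everyone.  A task whose cookie does not match
		 * the one already selected on the core may not run
		 * concurrently with it, so this sibling is forced idle.
		 */
		if (p->core_cookie != cookie)
			return rq_i->idle;

		return p;
	}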
The next problem is avoiding starvation. Assuming equal weight between
the tasks, we'd want to end up running A and B in alternating cycles.
This means having to compare runtimes between A and B, but when they're
on different runqueues the actual vruntime values can be wildly
divergent and cannot be readily compared (the integration constant is
meaningless but really annoying ;-).
We also cannot use min_vruntime (which is the same as the task vruntime
when there is only a single task), because then you cannot observe
progress. The difference between min_vruntime and the task's vruntime is
always 0, so you can't tell who just ran and who got starved.
This is where our snapshots come into play: we snapshot vruntime after
task selection (before running), such that at the next pick we can tell
who made progress and who got starved.
By marking the vruntime of both runqueues at the same point in time we
basically normalize away that integration constant. You effectively
reset the vruntime to 0 (through (7), but without iterating all the
tasks and adjusting it).
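For reference, the cross-CPU comparison in the patch above boils down
to the following (sketch only, using root-level sched entities and
ignoring the cgroup walk the real cfs_prio_less() does for the
same-CPU case):

	static bool prio_less_fi(struct sched_entity *sea, struct cfs_rq *cfs_rqa,
				 struct sched_entity *seb, struct cfs_rq *cfs_rqb)
	{
		/* progress each side made since the force-idle snapshot */
		s64 prog_a = (s64)(sea->vruntime - cfs_rqa->min_vruntime_fi);
		s64 prog_b = (s64)(seb->vruntime - cfs_rqb->min_vruntime_fi);

		/* the side that advanced further is the lower-priority one */
		return prog_a > prog_b;
	}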
Does that make sense?
Once you get this, read that second email linked.