Extend / fix 86bfbb7ce4f6 ("sched/fair: Add lag based placement") by
noting that lag is fundamentally a temporal measure. It should not be
carried around indefinitely.
OTOH it should also not be instantly discarded, doing so will allow a
task to game the system by purposefully (micro) sleeping at the end of
its time quantum.
Since lag is intimately tied to the virtual time base, a wall-time
based decay is also insufficient, notably competition is required for
any of this to make sense.
Instead, delay the dequeue and keep the 'tasks' on the runqueue,
competing until they are eligible.
Strictly speaking, we only care about keeping them until the 0-lag
point, but that is a difficult proposition, instead carry them around
until they get picked again, and dequeue them at that point.
Since we should have dequeued them at the 0-lag point, truncate lag
(eg. don't let them earn positive lag).
XXX test the cfs-throttle stuff
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
---
include/linux/sched.h | 1
kernel/sched/core.c | 22 +++++--
kernel/sched/fair.c | 148 +++++++++++++++++++++++++++++++++++++++++++-----
kernel/sched/features.h | 12 +++
kernel/sched/sched.h | 2
5 files changed, 167 insertions(+), 18 deletions(-)
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -542,6 +542,7 @@ struct sched_entity {
struct list_head group_node;
unsigned int on_rq;
+ unsigned int sched_delayed;
u64 exec_start;
u64 sum_exec_runtime;
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2154,10 +2154,14 @@ void activate_task(struct rq *rq, struct
void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
{
- WRITE_ONCE(p->on_rq, (flags & DEQUEUE_SLEEP) ? 0 : TASK_ON_RQ_MIGRATING);
- ASSERT_EXCLUSIVE_WRITER(p->on_rq);
+ bool sleep = flags & DEQUEUE_SLEEP;
- dequeue_task(rq, p, flags);
+ if (dequeue_task(rq, p, flags)) {
+ WRITE_ONCE(p->on_rq, sleep ? 0 : TASK_ON_RQ_MIGRATING);
+ ASSERT_EXCLUSIVE_WRITER(p->on_rq);
+ } else {
+ SCHED_WARN_ON(!sleep); /* only sleep can fail */
+ }
}
static inline int __normal_prio(int policy, int rt_prio, int nice)
@@ -3858,12 +3862,17 @@ static int ttwu_runnable(struct task_str
rq = __task_rq_lock(p, &rf);
if (task_on_rq_queued(p)) {
+ update_rq_clock(rq);
+ if (p->se.sched_delayed) {
+ /* mustn't run a delayed task */
+ SCHED_WARN_ON(task_on_cpu(rq, p));
+ enqueue_task(rq, p, ENQUEUE_DELAYED);
+ }
if (!task_on_cpu(rq, p)) {
/*
* When on_rq && !on_cpu the task is preempted, see if
* it should preempt the task that is current now.
*/
- update_rq_clock(rq);
wakeup_preempt(rq, p, wake_flags);
}
ttwu_do_wakeup(p);
@@ -4243,11 +4252,16 @@ int try_to_wake_up(struct task_struct *p
* case the whole 'p->on_rq && ttwu_runnable()' case below
* without taking any locks.
*
+ * Specifically, given current runs ttwu() we must be before
+ * schedule()'s deactivate_task(), as such this must not
+ * observe sched_delayed.
+ *
* In particular:
* - we rely on Program-Order guarantees for all the ordering,
* - we're serialized against set_special_state() by virtue of
* it disabling IRQs (this allows not taking ->pi_lock).
*/
+ SCHED_WARN_ON(p->se.sched_delayed);
if (!ttwu_state_match(p, state, &success))
goto out;
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5270,6 +5270,10 @@ static inline int cfs_rq_throttled(struc
static inline bool cfs_bandwidth_used(void);
static void
+requeue_delayed_entity(struct sched_entity *se);
+
+/* XXX bool and pull in the requeue_delayed_entity thing */
+static void
enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
bool curr = cfs_rq->curr == se;
@@ -5356,18 +5360,33 @@ static void clear_buddies(struct cfs_rq
static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
-static void
+static bool
dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
- int action = UPDATE_TG;
-
- if (entity_is_task(se) && task_on_rq_migrating(task_of(se)))
- action |= DO_DETACH;
-
/*
* Update run-time statistics of the 'current'.
*/
- update_curr(cfs_rq);
+ if (flags & DEQUEUE_DELAYED) {
+ SCHED_WARN_ON(!se->sched_delayed);
+ se->sched_delayed = 0;
+ } else {
+ bool sleep = flags & DEQUEUE_SLEEP;
+
+ SCHED_WARN_ON(sleep && se->sched_delayed);
+ update_curr(cfs_rq);
+
+ if (sched_feat(DELAY_DEQUEUE) && sleep &&
+ !entity_eligible(cfs_rq, se)) {
+ if (cfs_rq->next == se)
+ cfs_rq->next = NULL;
+ se->sched_delayed = 1;
+ return false;
+ }
+ }
+
+ int action = UPDATE_TG;
+ if (entity_is_task(se) && task_on_rq_migrating(task_of(se)))
+ action |= DO_DETACH;
/*
* When dequeuing a sched_entity, we must:
@@ -5407,6 +5426,8 @@ dequeue_entity(struct cfs_rq *cfs_rq, st
if (cfs_rq->nr_running == 0)
update_idle_cfs_rq_clock_pelt(cfs_rq);
+
+ return true;
}
static void
@@ -5432,6 +5453,7 @@ set_next_entity(struct cfs_rq *cfs_rq, s
}
update_stats_curr_start(cfs_rq, se);
+ SCHED_WARN_ON(cfs_rq->curr);
cfs_rq->curr = se;
/*
@@ -5452,6 +5474,8 @@ set_next_entity(struct cfs_rq *cfs_rq, s
se->prev_sum_exec_runtime = se->sum_exec_runtime;
}
+static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags);
+
/*
* Pick the next process, keeping these things in mind, in this order:
* 1) keep things fair between processes/task groups
@@ -5460,16 +5484,29 @@ set_next_entity(struct cfs_rq *cfs_rq, s
* 4) do not run the "skip" process, if something else is available
*/
static struct sched_entity *
-pick_next_entity(struct cfs_rq *cfs_rq)
+pick_next_entity(struct rq *rq, struct cfs_rq *cfs_rq)
{
/*
* Enabling NEXT_BUDDY will affect latency but not fairness.
*/
if (sched_feat(NEXT_BUDDY) &&
- cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next))
+ cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next)) {
+ /* ->next will never be delayed */
+ SCHED_WARN_ON(cfs_rq->next->sched_delayed);
return cfs_rq->next;
+ }
- return pick_eevdf(cfs_rq);
+ struct sched_entity *se = pick_eevdf(cfs_rq);
+ if (se->sched_delayed) {
+ dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
+ SCHED_WARN_ON(se->sched_delayed);
+ SCHED_WARN_ON(se->on_rq);
+ if (sched_feat(DELAY_ZERO) && se->vlag > 0)
+ se->vlag = 0;
+
+ return NULL;
+ }
+ return se;
}
static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
@@ -5493,6 +5530,7 @@ static void put_prev_entity(struct cfs_r
/* in !on_rq case, update occurred at dequeue */
update_load_avg(cfs_rq, prev, 0);
}
+ SCHED_WARN_ON(cfs_rq->curr != prev);
cfs_rq->curr = NULL;
}
@@ -5793,6 +5831,10 @@ static bool throttle_cfs_rq(struct cfs_r
if (!se->on_rq)
goto done;
+ /*
+ * XXX should be fine vs sched_delay; if won't run after this.
+ * Either pick dequeues it, or unthrottle. Double check!!
+ */
dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
if (cfs_rq_is_idle(group_cfs_rq(se)))
@@ -5882,8 +5924,11 @@ void unthrottle_cfs_rq(struct cfs_rq *cf
for_each_sched_entity(se) {
struct cfs_rq *qcfs_rq = cfs_rq_of(se);
- if (se->on_rq)
+ if (se->on_rq) {
+ if (se->sched_delayed)
+ requeue_delayed_entity(se);
break;
+ }
enqueue_entity(qcfs_rq, se, ENQUEUE_WAKEUP);
if (cfs_rq_is_idle(group_cfs_rq(se)))
@@ -6729,6 +6774,40 @@ static int sched_idle_cpu(int cpu)
}
#endif
+static void
+requeue_delayed_entity(struct sched_entity *se)
+{
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+ /*
+ * se->sched_delayed should imply both: se->on_rq == 1 and
+ * cfs_rq->curr != se. Because a delayed entity is one that is still on
+ * the runqueue competing until elegibility.
+ *
+ * Except for groups, consider current going idle and newidle pulling a
+ * task in the same group -- in that case 'cfs_rq->curr == se'.
+ */
+ SCHED_WARN_ON(!se->sched_delayed);
+ SCHED_WARN_ON(!se->on_rq);
+ SCHED_WARN_ON(entity_is_task(se) && cfs_rq->curr == se);
+
+ if (sched_feat(DELAY_ZERO)) {
+ update_entity_lag(cfs_rq, se);
+ if (se->vlag > 0) {
+ cfs_rq->nr_running--;
+ if (se != cfs_rq->curr)
+ __dequeue_entity(cfs_rq, se);
+ se->vlag = 0;
+ place_entity(cfs_rq, se, 0);
+ if (se != cfs_rq->curr)
+ __enqueue_entity(cfs_rq, se);
+ cfs_rq->nr_running++;
+ }
+ }
+
+ se->sched_delayed = 0;
+}
+
/*
* The enqueue_task method is called before nr_running is
* increased. Here we update the fair scheduling stats and
@@ -6742,6 +6821,11 @@ enqueue_task_fair(struct rq *rq, struct
int idle_h_nr_running = task_has_idle_policy(p);
int task_new = !(flags & ENQUEUE_WAKEUP);
+ if (flags & ENQUEUE_DELAYED) {
+ requeue_delayed_entity(se);
+ return;
+ }
+
/*
* The code below (indirectly) updates schedutil which looks at
* the cfs_rq utilization to select a frequency.
@@ -6759,8 +6843,11 @@ enqueue_task_fair(struct rq *rq, struct
cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT);
for_each_sched_entity(se) {
- if (se->on_rq)
+ if (se->on_rq) {
+ if (se->sched_delayed)
+ requeue_delayed_entity(se);
break;
+ }
cfs_rq = cfs_rq_of(se);
enqueue_entity(cfs_rq, se, flags);
@@ -6836,6 +6923,7 @@ static int dequeue_entities(struct rq *r
{
bool was_sched_idle = sched_idle_rq(rq);
bool task_sleep = flags & DEQUEUE_SLEEP;
+ bool task_delayed = flags & DEQUEUE_DELAYED;
struct task_struct *p = NULL;
struct cfs_rq *cfs_rq;
int idle_h_nr_running;
@@ -6849,7 +6937,13 @@ static int dequeue_entities(struct rq *r
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
- dequeue_entity(cfs_rq, se, flags);
+
+ if (!dequeue_entity(cfs_rq, se, flags)) {
+ if (p && &p->se == se)
+ return -1;
+
+ break;
+ }
/* h_nr_running is the hierachical count of tasks */
if (p) {
@@ -6877,6 +6971,7 @@ static int dequeue_entities(struct rq *r
break;
}
flags |= DEQUEUE_SLEEP;
+ flags &= ~DEQUEUE_DELAYED;
}
for_each_sched_entity(se) {
@@ -6906,6 +7001,18 @@ static int dequeue_entities(struct rq *r
/* balance early to pull high priority tasks */
if (unlikely(!was_sched_idle && sched_idle_rq(rq)))
rq->next_balance = jiffies;
+
+ if (task_delayed) {
+ SCHED_WARN_ON(!task_sleep);
+ SCHED_WARN_ON(p->on_rq != 1);
+
+ /* Fix-up what dequeue_task_fair() skipped */
+ util_est_update(&rq->cfs, p, task_sleep);
+ hrtick_update(rq);
+
+ /* Fix-up what deactivate_task() skipped. */
+ WRITE_ONCE(p->on_rq, 0);
+ }
}
return 1;
@@ -6923,6 +7030,10 @@ static bool dequeue_task_fair(struct rq
if (dequeue_entities(rq, &p->se, flags) < 0)
return false;
+ /*
+ * It doesn't make sense to update util_est for the delayed dequeue
+ * case where ttwu will make it appear the sleep never happened.
+ */
util_est_update(&rq->cfs, p, flags & DEQUEUE_SLEEP);
hrtick_update(rq);
return true;
@@ -8463,7 +8574,9 @@ static struct task_struct *pick_task_fai
if (unlikely(check_cfs_rq_runtime(cfs_rq)))
goto again;
- se = pick_next_entity(cfs_rq);
+ se = pick_next_entity(rq, cfs_rq);
+ if (!se)
+ goto again;
cfs_rq = group_cfs_rq(se);
} while (cfs_rq);
@@ -12803,10 +12916,17 @@ static void attach_task_cfs_rq(struct ta
static void switched_from_fair(struct rq *rq, struct task_struct *p)
{
detach_task_cfs_rq(p);
+ // XXX think harder on this
+ // this could actually be handled correctly I suppose; keep the whole
+ // se enqueued while boosted. horrible complexity though
+ p->se.sched_delayed = 0;
+ // XXX also vlag ?!?
}
static void switched_to_fair(struct rq *rq, struct task_struct *p)
{
+ SCHED_WARN_ON(p->se.sched_delayed);
+
attach_task_cfs_rq(p);
set_task_max_allowed_capacity(p);
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -29,6 +29,18 @@ SCHED_FEAT(NEXT_BUDDY, false)
SCHED_FEAT(CACHE_HOT_BUDDY, true)
/*
+ * Delay dequeueing tasks until they get selected or woken.
+ *
+ * By delaying the dequeue for non-eligible tasks, they remain in the
+ * competition and can burn off their negative lag. When they get selected
+ * they'll have positive lag by definition.
+ *
+ * DELAY_ZERO clips the lag on dequeue (or wakeup) to 0.
+ */
+SCHED_FEAT(DELAY_DEQUEUE, true)
+SCHED_FEAT(DELAY_ZERO, true)
+
+/*
* Allow wakeup-time preemption of the current task:
*/
SCHED_FEAT(WAKEUP_PREEMPTION, true)
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2245,6 +2245,7 @@ extern const u32 sched_prio_to_wmult[40
#define DEQUEUE_MOVE 0x04 /* Matches ENQUEUE_MOVE */
#define DEQUEUE_NOCLOCK 0x08 /* Matches ENQUEUE_NOCLOCK */
#define DEQUEUE_MIGRATING 0x100 /* Matches ENQUEUE_MIGRATING */
+#define DEQUEUE_DELAYED 0x200 /* Matches ENQUEUE_DELAYED */
#define ENQUEUE_WAKEUP 0x01
#define ENQUEUE_RESTORE 0x02
@@ -2260,6 +2261,7 @@ extern const u32 sched_prio_to_wmult[40
#endif
#define ENQUEUE_INITIAL 0x80
#define ENQUEUE_MIGRATING 0x100
+#define ENQUEUE_DELAYED 0x200
#define RETRY_TASK ((void *)-1UL)
On 2024-04-05 at 12:28:02 +0200, Peter Zijlstra wrote:
> Extend / fix 86bfbb7ce4f6 ("sched/fair: Add lag based placement") by
> noting that lag is fundamentally a temporal measure. It should not be
> carried around indefinitely.
>
> OTOH it should also not be instantly discarded, doing so will allow a
> task to game the system by purposefully (micro) sleeping at the end of
> its time quantum.
>
> Since lag is intimately tied to the virtual time base, a wall-time
> based decay is also insufficient, notably competition is required for
> any of this to make sense.
>
> Instead, delay the dequeue and keep the 'tasks' on the runqueue,
> competing until they are eligible.
>
> Strictly speaking, we only care about keeping them until the 0-lag
> point, but that is a difficult proposition, instead carry them around
> until they get picked again, and dequeue them at that point.
>
> Since we should have dequeued them at the 0-lag point, truncate lag
> (eg. don't let them earn positive lag).
>
> XXX test the cfs-throttle stuff
>
> Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
> ---
Tested schbench on a Xeon server, which has 240 CPUs / 2 sockets:
schbench -m 2 -r 100
The result seems OK to me.
baseline:
NO_DELAY_DEQUEUE
NO_DELAY_ZERO
Wakeup Latencies percentiles (usec) runtime 100 (s) (1658446 total samples)
50.0th: 5 (361126 samples)
90.0th: 11 (654121 samples)
* 99.0th: 25 (123032 samples)
99.9th: 673 (13845 samples)
min=1, max=8337
Request Latencies percentiles (usec) runtime 100 (s) (1662381 total samples)
50.0th: 14992 (524771 samples)
90.0th: 15344 (657370 samples)
* 99.0th: 15568 (129769 samples)
99.9th: 15888 (10017 samples)
min=3529, max=43841
RPS percentiles (requests) runtime 100 (s) (101 total samples)
20.0th: 16544 (37 samples)
* 50.0th: 16608 (30 samples)
90.0th: 16736 (31 samples)
min=16403, max=17698
average rps: 16623.81
DELAY_DEQUEUE
DELAY_ZERO
Wakeup Latencies percentiles (usec) runtime 100 (s) (1668161 total samples)
50.0th: 6 (394867 samples)
90.0th: 12 (653021 samples)
* 99.0th: 31 (142636 samples)
99.9th: 755 (14547 samples)
min=1, max=5226
Request Latencies percentiles (usec) runtime 100 (s) (1671859 total samples)
50.0th: 14384 (511809 samples)
90.0th: 14992 (653508 samples)
* 99.0th: 15408 (149257 samples)
99.9th: 15984 (12090 samples)
min=3546, max=38360
RPS percentiles (requests) runtime 100 (s) (101 total samples)
20.0th: 16672 (45 samples)
* 50.0th: 16736 (52 samples)
90.0th: 16736 (0 samples)
min=16629, max=16800
average rps: 16718.59
The 99th percentile wakeup latency increases a little bit, but should still be in the
acceptable range (25 -> 31 us). Meanwhile the throughput increases accordingly. Here
are the possible reasons I can think of:
1. wakeup latency: The time to find an eligible entity in the tree
during wakeup might take longer - if there are more delayed-dequeue
tasks in the tree.
2. throughput: Inhibiting the task dequeue reduces how often we touch the
task group's load_avg via dequeue_entity() -> { update_load_avg(), update_cfs_group() },
which reduces the cache contention among CPUs and improves throughput.
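
As an illustration only (simplified from the hunk quoted below, not the
literal code), this is roughly the part of dequeue_entity() that a delayed
dequeue skips on the first, sleeping dequeue and only runs later on the
DEQUEUE_DELAYED pass:

	if (sched_feat(DELAY_DEQUEUE) && sleep &&
	    !entity_eligible(cfs_rq, se)) {
		se->sched_delayed = 1;
		return false;		/* nothing below runs on this dequeue */
	}
	...
	update_load_avg(cfs_rq, se, action);	/* may write tg->load_avg */
	...
	update_cfs_group(se);			/* recomputes group shares */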
> + } else {
> + bool sleep = flags & DEQUEUE_SLEEP;
> +
> + SCHED_WARN_ON(sleep && se->sched_delayed);
> + update_curr(cfs_rq);
> +
> + if (sched_feat(DELAY_DEQUEUE) && sleep &&
> + !entity_eligible(cfs_rq, se)) {
Regarding the eligible check, it was found that there could be an overflow
issue which brings false negatives from entity_eligible(); it was described here:
https://lore.kernel.org/lkml/[email protected]/
and also reported on another machine:
https://lore.kernel.org/lkml/[email protected]/
I don't have a good idea for avoiding that overflow properly. While I'm trying to
reproduce it locally, do you have any guidance on how to address it?
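
For what it is worth, here is a toy user-space illustration (not the kernel
code, and the numbers are made up) of how a key * load style product, as
used in the eligibility math, can wrap a signed 64-bit value once the
(vruntime - min_vruntime) key of a long-delayed, low-weight entity grows
large enough:

	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		/* made-up key: vruntime - min_vruntime of a long-delayed entity */
		int64_t key  = 100000000000000LL;	/* 1e14 */
		/* made-up avg_load: sum of scaled-down weights on the cfs_rq */
		int64_t load = 128 * 1024;

		/* multiply in unsigned space to show the wrap without UB */
		int64_t prod = (int64_t)((uint64_t)key * (uint64_t)load);

		printf("key=%lld load=%lld key*load as s64=%lld\n",
		       (long long)key, (long long)load, (long long)prod);
		/*
		 * prod comes out negative, which would flip the sign of an
		 * eligibility-style comparison, i.e. a false negative.
		 */
		return 0;
	}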
thanks,
Chenyu
On Sat, Apr 06, 2024 at 05:23:25PM +0800, Chen Yu wrote:
> The 99th percentile wakeup latency increases a little bit, but should still be in the
> acceptable range (25 -> 31 us).
Ah, my test runs haven't been stable enough to observe that.
> Meanwhile the throughput increases accordingly. Here
> are the possible reasons I can think of:
>
> 1. wakeup latency: The time to find an eligible entity in the tree
> during wakeup might take longer - if there are more delayed-dequeue
> tasks in the tree.
Another possible cause might be that previously a schedule() would be
1 dequeue, 1 pick.
But now it can be much more variable, a pick can basically do N dequeues
and N+1 picks.
So not only do we do more picks, but if you're focussed on worst case
latency, it goes up, because we can do multiple dequeues for a single
pick.
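
Concretely, condensed from the pick_task_fair() / pick_next_entity() hunks
above:

  again:
	se = pick_next_entity(rq, cfs_rq);	/* pick_eevdf(); if it returns a
						 * delayed entity, that entity is
						 * dequeued and NULL comes back */
	if (!se)
		goto again;			/* ... so we pick once more */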
If we find this to really be a problem, I had some half baked ideas to
fix it, but it added significant complexity, so keep it simple until
need proves we need more etc.
> 2. throughput: Inhibiting the task dequeue reduces how often we touch the
> task group's load_avg via dequeue_entity() -> { update_load_avg(), update_cfs_group() },
> which reduces the cache contention among CPUs and improves throughput.
Ah, yes, there's that.
> > + } else {
> > + bool sleep = flags & DEQUEUE_SLEEP;
> > +
> > + SCHED_WARN_ON(sleep && se->sched_delayed);
> > + update_curr(cfs_rq);
> > +
> > + if (sched_feat(DELAY_DEQUEUE) && sleep &&
> > + !entity_eligible(cfs_rq, se)) {
>
> Regarding the eligible check, it was found that there could be an overflow
> issue which brings false negatives from entity_eligible(); it was described here:
> https://lore.kernel.org/lkml/[email protected]/
> and also reported on another machine:
> https://lore.kernel.org/lkml/[email protected]/
> I don't have a good idea for avoiding that overflow properly. While I'm trying to
> reproduce it locally, do you have any guidance on how to address it?
I have not yet seen those, let me go stare at them now. Thanks!
>
>> The 99th percentile wakeup latency increases a little bit, but should still be in the
>> acceptable range (25 -> 31 us).
>
> Ah, my test runs haven't been stable enough to observe that.
>
>> Meanwhile the throughput increases accordingly. Here
>> are the possible reasons I can think of:
>>
>> 1. wakeup latency: The time to find an eligible entity in the tree
>> during wakeup might take longer - if there are more delayed-dequeue
>> tasks in the tree.
>
> Another possible cause might be that previously a schedule() would be
> 1 dequeue, 1 pick.
>
> But now it can be much more variable, a pick can basically do N dequeues
> and N+1 picks.
>
> So not only do we do more picks, but if you're focussed on worst case
> latency, it goes up, because we can do multiple dequeues for a single
> pick.
>
> If we find this to really be a problem, I had some half baked ideas to
> fix it, but it added significant complexity, so keep it simple until
> need proves we need more etc.
I have an alternative approach to delayed-dequeue inspired by the
original CFS implementation.
The idea is to keep the task's vruntime when it goes to sleep.
When the task is woken up, check whether its lag is positive at wakeup time;
if it is, clamp it to 0 by setting vruntime to avg_vruntime().
<Sleep>
In dequeue_entity(): Remove the task from runqueue, but keep the task's
vruntime, and do not calculate vlag at this time.
<Wake Up on the same CPU>
In enqueue_entity():
1. Do not call place_entity().
2. If the task's vruntime is less than the cfs_rq's avg_vruntime(),
set the task's vruntime to avg_vruntime(), and update the task's
deadline according to its timeslice.
3. Insert the task into the runqueue.
<Wake Up on different CPU>
In migrate_task_rq_fair():
1. Calculate the task's vlag as if it is on the original cfs_rq.
2. Set the task's vlag to 0 if it is positive.
In enqueue_entity(): Use place_entity() to calculate the task's new
vruntime and deadline according to the vlag and the new runqueue before
inserting it into the runqueue.
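
In rough pseudo-code (hypothetical helper names, only meant to illustrate
the idea, not a tested patch):

	/* <Sleep>: dequeue, keep se->vruntime as-is, don't compute vlag here */

	/* <Wake up on the same CPU>, in enqueue_entity(), instead of place_entity(): */
	if ((s64)(se->vruntime - avg_vruntime(cfs_rq)) < 0)	/* lag would be positive */
		se->vruntime = avg_vruntime(cfs_rq);		/* clamp the lag to 0 */
	se->deadline = se->vruntime + calc_delta_fair(se->slice, se);
	__enqueue_entity(cfs_rq, se);

	/* <Wake up on another CPU>, in migrate_task_rq_fair(): */
	s64 vlag = lag_on(old_cfs_rq, se);	/* hypothetical: lag vs the old cfs_rq */
	se->vlag = min_t(s64, vlag, 0);		/* positive lag -> 0 */
	/* enqueue_entity() then runs place_entity() with this vlag as usual */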
Hello Peter,
On 4/5/2024 3:58 PM, Peter Zijlstra wrote:
> Extend / fix 86bfbb7ce4f6 ("sched/fair: Add lag based placement") by
> noting that lag is fundamentally a temporal measure. It should not be
> carried around indefinitely.
>
> OTOH it should also not be instantly discarded, doing so will allow a
> task to game the system by purposefully (micro) sleeping at the end of
> its time quantum.
>
> Since lag is intimately tied to the virtual time base, a wall-time
> based decay is also insufficient, notably competition is required for
> any of this to make sense.
>
> Instead, delay the dequeue and keep the 'tasks' on the runqueue,
> competing until they are eligible.
>
> Strictly speaking, we only care about keeping them until the 0-lag
> point, but that is a difficult proposition, instead carry them around
> until they get picked again, and dequeue them at that point.
>
> Since we should have dequeued them at the 0-lag point, truncate lag
> (eg. don't let them earn positive lag).
>
> XXX test the cfs-throttle stuff
I ran into a few issues when testing the series on top of tip:sched/core
at commit 4475cd8bfd9b ("sched/balancing: Simplify the sg_status bitmask
and use separate ->overloaded and ->overutilized flags"). All of these
splats surfaced when running unixbench with Delayed Dequeue (echoing
NO_DELAY_DEQUEUE to /sys/kernel/debug/sched/features seems to make the
system stable when running Unixbench spawn)
Unixbench (https://github.com/kdlucas/byte-unixbench.git) command:
./Run spawn -c 512
Splats appear soon into the run. Following are the splats and their
corresponding code blocks from my 3rd Generation EPYC system
(2 x 64C/128T):
1. NULL pointer dereferencing in can_migrate_task():
BUG: kernel NULL pointer dereference, address: 0000000000000040
#PF: supervisor read access in kernel mode
#PF: error_code(0x0000) - not-present page
PGD 0 P4D 0
Oops: 0000 [#1] PREEMPT SMP NOPTI
CPU: 154 PID: 1507736 Comm: spawn Not tainted 6.9.0-rc1-test+ #958
Hardware name: Dell Inc. PowerEdge R6525/024PW1, BIOS 2.7.3 03/30/2022
RIP: 0010:can_migrate_task+0x2b/0x6c0
Code: ...
RSP: 0018:ffffb6bb9e6a3bc0 EFLAGS: 00010086
RAX: 0000000000000000 RBX: ffffb6bb9e6a3c80 RCX: ffff90ad0d209400
RDX: 0000000000000008 RSI: ffffb6bb9e6a3c80 RDI: ffff90eb3b236438
RBP: ffff90eb3b236438 R08: 0000005c743512ab R09: ffffffffffff0000
R10: 0000000000000001 R11: 0000000000000100 R12: ffff90eb3b236438
R13: ffff90eb3b2364f0 R14: ffff90eb3b6359c0 R15: ffff90eb3b6359c0
FS: 0000000000000000(0000) GS:ffff90eb3df00000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000000000000040 CR3: 000000807da3c006 CR4: 0000000000770ef0
PKRU: 55555554
Call Trace:
<TASK>
? __die+0x24/0x70
? page_fault_oops+0x14a/0x510
? exc_page_fault+0x77/0x170
? asm_exc_page_fault+0x26/0x30
? can_migrate_task+0x2b/0x6c0
sched_balance_rq+0x7a8/0x1190
sched_balance_newidle+0x1e2/0x490
pick_next_task_fair+0x36/0x4a0
__schedule+0x1c0/0x1710
? srso_alias_return_thunk+0x5/0xfbef5
? refill_stock+0x1a/0x30
? srso_alias_return_thunk+0x5/0xfbef5
? obj_cgroup_uncharge_pages+0x4d/0xd0
do_task_dead+0x42/0x50
do_exit+0x777/0xad0
do_group_exit+0x30/0x80
__x64_sys_exit_group+0x18/0x20
do_syscall_64+0x79/0x120
? srso_alias_return_thunk+0x5/0xfbef5
? srso_alias_return_thunk+0x5/0xfbef5
? irqentry_exit_to_user_mode+0x5b/0x170
entry_SYSCALL_64_after_hwframe+0x6c/0x74
RIP: 0033:0x7f963a6eac31
Code: Unable to access opcode bytes at 0x7f963a6eac07.
RSP: 002b:00007ffc6b7158c8 EFLAGS: 00000246 ORIG_RAX: 00000000000000e7
RAX: ffffffffffffffda RBX: 00007f963a816a00 RCX: 00007f963a6eac31
RDX: 000000000000003c RSI: 00000000000000e7 RDI: 0000000000000000
RBP: 0000000000000000 R08: ffffffffffffff80 R09: 0000000000000020
R10: 0000000000000000 R11: 0000000000000246 R12: 00007f963a816a00
R13: 0000000000000000 R14: 00007f963a81bee8 R15: 00007f963a81bf00
</TASK>
Modules linked in: ...
CR2: 0000000000000040
---[ end trace 0000000000000000 ]---
$ scripts/faddr2line vmlinux can_migrate_task+0x2b/0x6c0
can_migrate_task+0x2b/0x6c0:
throttled_lb_pair at kernel/sched/fair.c:5738
(inlined by) can_migrate_task at kernel/sched/fair.c:9090
Corresponds to:
static inline int throttled_lb_pair(struct task_group *tg,
int src_cpu, int dest_cpu)
{
struct cfs_rq *src_cfs_rq, *dest_cfs_rq;
src_cfs_rq = tg->cfs_rq[src_cpu]; /* <----- Here -----< */
dest_cfs_rq = tg->cfs_rq[dest_cpu];
return throttled_hierarchy(src_cfs_rq) ||
throttled_hierarchy(dest_cfs_rq);
}
(inlined by)
int can_migrate_task(struct task_struct *p, struct lb_env *env)
{
/* Called here */
if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
return 0;
...
}
2. A NULL pointer dereferencing in pick_next_task_fair():
BUG: kernel NULL pointer dereference, address: 0000000000000098
#PF: supervisor read access in kernel mode
#PF: error_code(0x0000) - not-present page
PGD 0 P4D 0
Oops: 0000 [#1] PREEMPT SMP NOPTI
CPU: 107 PID: 1206665 Comm: spawn Tainted: G W 6.9.0-rc1-test+ #958
Hardware name: Dell Inc. PowerEdge R6525/024PW1, BIOS 2.7.3 03/30/2022
RIP: 0010:pick_next_task_fair+0x327/0x4a0
Code: ...
RSP: 0018:ffffb613c212fd28 EFLAGS: 00010002
RAX: 0000004ed2799383 RBX: 0000000000000000 RCX: ffff8f65baf3f800
RDX: ffff8f65baf3ca00 RSI: 0000000000000000 RDI: 000000825ae302ab
RBP: ffff8f64b13b59c0 R08: 0000000000000015 R09: 0000000000000314
R10: 0000000000000001 R11: 0000000000000001 R12: ffff8f261ac199c0
R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000
FS: 00007f768b79e740(0000) GS:ffff8f64b1380000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000000000000098 CR3: 00000040d1010006 CR4: 0000000000770ef0
PKRU: 55555554
Call Trace:
<TASK>
? __die+0x24/0x70
? page_fault_oops+0x14a/0x510
? srso_alias_return_thunk+0x5/0xfbef5
? report_bug+0x18e/0x1a0
? srso_alias_return_thunk+0x5/0xfbef5
? exc_page_fault+0x77/0x170
? asm_exc_page_fault+0x26/0x30
? pick_next_task_fair+0x327/0x4a0
? pick_next_task_fair+0x320/0x4a0
__schedule+0x1c0/0x1710
? release_task+0x2fc/0x4c0
? srso_alias_return_thunk+0x5/0xfbef5
schedule+0x30/0x120
syscall_exit_to_user_mode+0x98/0x1b0
do_syscall_64+0x85/0x120
? srso_alias_return_thunk+0x5/0xfbef5
? __count_memcg_events+0x69/0x100
? srso_alias_return_thunk+0x5/0xfbef5
? count_memcg_events.constprop.0+0x1a/0x30
? srso_alias_return_thunk+0x5/0xfbef5
? handle_mm_fault+0x17d/0x2e0
? srso_alias_return_thunk+0x5/0xfbef5
? do_user_addr_fault+0x33d/0x6f0
? srso_alias_return_thunk+0x5/0xfbef5
? srso_alias_return_thunk+0x5/0xfbef5
? irqentry_exit_to_user_mode+0x5b/0x170
entry_SYSCALL_64_after_hwframe+0x6c/0x74
RIP: 0033:0x7f768b4eab57
Code: ...
RSP: 002b:00007fff5f6e2018 EFLAGS: 00000246 ORIG_RAX: 0000000000000038
RAX: 000000000026d13b RBX: 00007f768b7ee040 RCX: 00007f768b4eab57
RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000001200011
RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000000
R10: 00007f768b79ea10 R11: 0000000000000246 R12: 0000000000000001
R13: 000055b811d1a140 R14: 000055b811d1cd88 R15: 00007f768b7ee040
</TASK>
Modules linked in: ...
CR2: 0000000000000098
---[ end trace 0000000000000000 ]---
$ scripts/faddr2line vmlinux pick_next_task_fair+0x327/0x4a0
pick_next_task_fair+0x327/0x4a0:
is_same_group at kernel/sched/fair.c:418
(inlined by) pick_next_task_fair at kernel/sched/fair.c:8625
struct cfs_rq *
is_same_group(struct sched_entity *se, struct sched_entity *pse)
{
if (se->cfs_rq == pse->cfs_rq) /* <----- HERE -----< */
return se->cfs_rq;
return NULL;
}
(inlined by)
struct task_struct *
pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
...
if (prev != p) {
...
while (!(cfs_rq = is_same_group(se, pse) /* <---- HERE ----< */)) {
...
}
...
}
...
}
3. A NULL Pointer dereferencing in __dequeue_entity():
BUG: kernel NULL pointer dereference, address: 0000000000000000
#PF: supervisor read access in kernel mode
#PF: error_code(0x0000) - not-present page
PGD 0 P4D 0
Oops: 0000 [#1] PREEMPT SMP NOPTI
CPU: 95 PID: 60896 Comm: spawn Not tainted 6.9.0-rc1-test+ #958
Hardware name: Dell Inc. PowerEdge R6525/024PW1, BIOS 2.7.3 03/30/2022
RIP: 0010:__rb_erase_color+0x88/0x260
Code: ...
RSP: 0018:ffffab158755fc08 EFLAGS: 00010046
RAX: 0000000000000000 RBX: ffffffff841314b0 RCX: 0000000017fc8dd0
RDX: 0000000000000000 RSI: ffff8decfb1fe450 RDI: ffff8decf80bcdd0
RBP: ffff8decf80bcdd0 R08: ffff8decf80bcdd0 R09: ffffffffffffbb60
R10: 0000000000000001 R11: 0000000000000001 R12: 0000000000000000
R13: ffff8decfb1fe450 R14: ffff8ded0ec03400 R15: ffff8decfb1fe400
FS: 00007f1ded0a2740(0000) GS:ffff8e2bb0d80000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000000000000000 CR3: 00000040e66f0005 CR4: 0000000000770ef0
PKRU: 55555554
Call Trace:
<TASK>
? __die+0x24/0x70
? page_fault_oops+0x14a/0x510
? exc_page_fault+0x77/0x170
? asm_exc_page_fault+0x26/0x30
? __pfx_min_vruntime_cb_rotate+0x10/0x10
? __rb_erase_color+0x88/0x260
__dequeue_entity+0x1b7/0x310
set_next_entity+0xc0/0x1e0
pick_next_task_fair+0x355/0x4a0
__schedule+0x1c0/0x1710
? native_queued_spin_lock_slowpath+0x2a4/0x2f0
schedule+0x30/0x120
do_wait+0xad/0x100
kernel_wait4+0xa9/0x150
? __pfx_child_wait_callback+0x10/0x10
do_syscall_64+0x79/0x120
? srso_alias_return_thunk+0x5/0xfbef5
? __count_memcg_events+0x69/0x100
? srso_alias_return_thunk+0x5/0xfbef5
? count_memcg_events.constprop.0+0x1a/0x30
? srso_alias_return_thunk+0x5/0xfbef5
? handle_mm_fault+0x17d/0x2e0
? srso_alias_return_thunk+0x5/0xfbef5
? do_user_addr_fault+0x33d/0x6f0
? srso_alias_return_thunk+0x5/0xfbef5
? srso_alias_return_thunk+0x5/0xfbef5
? irqentry_exit_to_user_mode+0x5b/0x170
entry_SYSCALL_64_after_hwframe+0x6c/0x74
RIP: 0033:0x7f1deceea3ea
Code: ...
RSP: 002b:00007ffd7fd37ca8 EFLAGS: 00000246 ORIG_RAX: 000000000000003d
RAX: ffffffffffffffda RBX: 00007ffd7fd37cb4 RCX: 00007f1deceea3ea
RDX: 0000000000000000 RSI: 00007ffd7fd37cb4 RDI: 00000000ffffffff
RBP: 0000000000000002 R08: 00000000000136f5 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000246 R12: 00007ffd7fd37dd8
R13: 000055fd8debf140 R14: 000055fd8dec1d88 R15: 00007f1ded0f2040
</TASK>
Modules linked in: ...
CR2: 0000000000000000
---[ end trace 0000000000000000 ]---
Note: I only ran into this issue with unixbench spawn. A bunch of other
benchmarks (hackbench, stream, tbench, netperf, schbench, other variants
of unixbench) ran fine without bringing down the machine.
Attaching my config below in case this is config specific.
>
> Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
> ---
> include/linux/sched.h | 1
> kernel/sched/core.c | 22 +++++--
> kernel/sched/fair.c | 148 +++++++++++++++++++++++++++++++++++++++++++-----
> kernel/sched/features.h | 12 +++
> kernel/sched/sched.h | 2
> 5 files changed, 167 insertions(+), 18 deletions(-)
>
> [..snip..]
>
--
Thanks and Regards,
Prateek
On Fri, 2024-04-12 at 16:12 +0530, K Prateek Nayak wrote:
>
> I ran into a few issues when testing the series on top of tip:sched/core
> at commit 4475cd8bfd9b ("sched/balancing: Simplify the sg_status bitmask
> and use separate ->overloaded and ->overutilized flags"). All of these
> splats surfaced when running unixbench with Delayed Dequeue (echoing
> NO_DELAY_DEQUEUE to /sys/kernel/debug/sched/features seems to make the
> system stable when running Unixbench spawn)
>
> Unixbench (https://github.com/kdlucas/byte-unixbench.git) command:
>
> ./Run spawn -c 512
That plus a hackbench loop works a treat.
>
> Splats appear soon into the run. Following are the splats and their
> corresponding code blocks from my 3rd Generation EPYC system
> (2 x 64C/128T):
Seems a big box is not required. With a low fat sched config (no group
sched), starting ./Run spawn -c 16 (cpus*2) along with a hackbench loop
reliably blows my old i7-4790 box out of the water nearly instantly.
DUMPFILE: vmcore
CPUS: 8
DATE: Mon Apr 15 07:20:29 CEST 2024
UPTIME: 00:07:23
LOAD AVERAGE: 1632.20, 684.99, 291.84
TASKS: 1401
NODENAME: homer
RELEASE: 6.9.0.g0bbac3f-master
VERSION: #7 SMP Mon Apr 15 06:40:05 CEST 2024
MACHINE: x86_64 (3591 Mhz)
MEMORY: 16 GB
PANIC: "Oops: 0000 [#1] SMP NOPTI" (check log for details)
PID: 22664
COMMAND: "hackbench"
TASK: ffff888100acbf00 [THREAD_INFO: ffff888100acbf00]
CPU: 5
STATE: TASK_WAKING (PANIC)
crash> bt -sx
PID: 22664 TASK: ffff888100acbf00 CPU: 5 COMMAND: "hackbench"
#0 [ffff88817cc17920] machine_kexec+0x156 at ffffffff810642d6
#1 [ffff88817cc17970] __crash_kexec+0xd7 at ffffffff81153147
#2 [ffff88817cc17a28] crash_kexec+0x23 at ffffffff811535f3
#3 [ffff88817cc17a38] oops_end+0xbe at ffffffff810329be
#4 [ffff88817cc17a58] page_fault_oops+0x81 at ffffffff81071951
#5 [ffff88817cc17ab8] exc_page_fault+0x62 at ffffffff8194f6f2
#6 [ffff88817cc17ae0] asm_exc_page_fault+0x22 at ffffffff81a00ba2
[exception RIP: pick_task_fair+71]
RIP: ffffffff810d5b57 RSP: ffff88817cc17b90 RFLAGS: 00010046
RAX: 0000000000000000 RBX: ffff88840ed70ec0 RCX: 00000001d7ec138c
RDX: ffffffffe7a7f400 RSI: 0000000000000000 RDI: 0000000000000000
RBP: ffff88840ed70ec0 R8: 0000000000000c00 R9: 000000675402f79e
R10: ffff88817cc17b30 R11: 00000000000000bb R12: ffff88840ed70f40
R13: ffffffff81f64f16 R14: ffff888100acc560 R15: ffff888100acbf00
ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0018
#7 [ffff88817cc17bb0] pick_next_task_fair+0x42 at ffffffff810d92c2
#8 [ffff88817cc17be0] __schedule+0x10d at ffffffff8195936d
#9 [ffff88817cc17c50] schedule+0x1c at ffffffff81959ddc
#10 [ffff88817cc17c60] schedule_timeout+0x18c at ffffffff8195fc4c
#11 [ffff88817cc17cc8] unix_stream_read_generic+0x2b7 at ffffffff81869917
#12 [ffff88817cc17da8] unix_stream_recvmsg+0x68 at ffffffff8186a2d8
#13 [ffff88817cc17de0] sock_read_iter+0x159 at ffffffff8170bd69
#14 [ffff88817cc17e70] vfs_read+0x2ce at ffffffff812f195e
#15 [ffff88817cc17ef8] ksys_read+0x40 at ffffffff812f21d0
#16 [ffff88817cc17f30] do_syscall_64+0x57 at ffffffff8194b947
#17 [ffff88817cc17f50] entry_SYSCALL_64_after_hwframe+0x76 at ffffffff81a0012b
RIP: 00007f625660871e RSP: 00007ffc75d48188 RFLAGS: 00000246
RAX: ffffffffffffffda RBX: 00007ffc75d48200 RCX: 00007f625660871e
RDX: 0000000000000064 RSI: 00007ffc75d48190 RDI: 0000000000000007
RBP: 00007ffc75d48260 R8: 00007ffc75d48140 R9: 00007f6256612010
R10: 00007f62565f5070 R11: 0000000000000246 R12: 0000000000000064
R13: 0000000000000000 R14: 0000000000000064 R15: 0000000000000000
ORIG_RAX: 0000000000000000 CS: 0033 SS: 002b
crash> dis pick_task_fair+71
0xffffffff810d5b57 <pick_task_fair+71>: cmpb $0x0,0x4c(%rax)
crash> gdb list *pick_task_fair+71
0xffffffff810d5b57 is in pick_task_fair (kernel/sched/fair.c:5498).
5493 SCHED_WARN_ON(cfs_rq->next->sched_delayed);
5494 return cfs_rq->next;
5495 }
5496
5497 struct sched_entity *se = pick_eevdf(cfs_rq);
5498 if (se->sched_delayed) {
5499 dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
5500 SCHED_WARN_ON(se->sched_delayed);
5501 SCHED_WARN_ON(se->on_rq);
5502 if (sched_feat(DELAY_ZERO) && se->vlag > 0)
crash> struct -ox sched_entity
struct sched_entity {
[0x0] struct load_weight load;
[0x10] struct rb_node run_node;
[0x28] u64 deadline;
[0x30] u64 min_vruntime;
[0x38] struct list_head group_node;
[0x48] unsigned int on_rq;
[0x4c] unsigned char sched_delayed;
[0x4d] unsigned char custom_slice;
[0x50] u64 exec_start;
[0x58] u64 sum_exec_runtime;
[0x60] u64 prev_sum_exec_runtime;
[0x68] u64 vruntime;
[0x70] s64 vlag;
[0x78] u64 slice;
[0x80] u64 nr_migrations;
[0xc0] struct sched_avg avg;
}
SIZE: 0x100
crash>
Hi Peter,
On 4/5/24 11:28, Peter Zijlstra wrote:
> Extend / fix 86bfbb7ce4f6 ("sched/fair: Add lag based placement") by
> noting that lag is fundamentally a temporal measure. It should not be
> carried around indefinitely.
>
> OTOH it should also not be instantly discarded, doing so will allow a
> task to game the system by purposefully (micro) sleeping at the end of
> its time quantum.
>
> Since lag is intimately tied to the virtual time base, a wall-time
> based decay is also insufficient, notably competition is required for
> any of this to make sense.
>
> Instead, delay the dequeue and keep the 'tasks' on the runqueue,
> competing until they are eligible.
>
> Strictly speaking, we only care about keeping them until the 0-lag
> point, but that is a difficult proposition, instead carry them around
> until they get picked again, and dequeue them at that point.
>
> Since we should have dequeued them at the 0-lag point, truncate lag
> (eg. don't let them earn positive lag).
>
> XXX test the cfs-throttle stuff
>
> Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Playing around with a Pixel 6 running a 6.6-based kernel with this
series backported on top, I spotted a noticeable performance improvement
on the speedometer benchmark:
- m6.6-stock-* is the 6.6 mainline Android kernel unmodified.
- m6.6-eevdf-complete-* is the 6.6 mainline Android kernel with
this series applied on top (along with a few required backported
patches).
+-------------------+-----------------------+-----------+
| metric | tag | perc_diff |
+-------------------+-----------------------+-----------+
| Speedometer Score | m6.6-stock-1 | 0.0% |
| Speedometer Score | m6.6-stock-2 | 1.23% |
| Speedometer Score | m6.6-stock-3 | -0.22% |
| Speedometer Score | m6.6-eevdf-complete-1 | 4.54% |
| Speedometer Score | m6.6-eevdf-complete-2 | 4.95% |
| Speedometer Score | m6.6-eevdf-complete-3 | 6.07% |
+-------------------+-----------------------+-----------+
There are also some interesting improvements in terms of frame timing for the
uibenchjanktests benchmark, in particular in the metrics for missed
deadlines and jank (late) frames, which seems to indicate better
latencies.
+-----------------------+-----------------------+-----------+
| metric | tag | perc_diff |
+-----------------------+-----------------------+-----------+
| gfx-avg-frame-time-50 | m6.6-stock-1 | 0.0 |
| gfx-avg-frame-time-90 | m6.6-stock-1 | 0.0 |
| gfx-avg-frame-time-95 | m6.6-stock-1 | 0.0 |
| gfx-avg-frame-time-99 | m6.6-stock-1 | 0.0 |
| gfx-avg-frame-time-50 | m6.6-stock-2 | 3.46 |
| gfx-avg-frame-time-90 | m6.6-stock-2 | 1.19 |
| gfx-avg-frame-time-95 | m6.6-stock-2 | 0.24 |
| gfx-avg-frame-time-99 | m6.6-stock-2 | 0.48 |
| gfx-avg-frame-time-50 | m6.6-eevdf-complete-1 | -30.45 |
| gfx-avg-frame-time-90 | m6.6-eevdf-complete-1 | -48.44 |
| gfx-avg-frame-time-95 | m6.6-eevdf-complete-1 | -51.32 |
| gfx-avg-frame-time-99 | m6.6-eevdf-complete-1 | -52.48 |
| gfx-avg-frame-time-50 | m6.6-eevdf-complete-2 | -30.32 |
| gfx-avg-frame-time-90 | m6.6-eevdf-complete-2 | -48.16 |
| gfx-avg-frame-time-95 | m6.6-eevdf-complete-2 | -51.08 |
| gfx-avg-frame-time-99 | m6.6-eevdf-complete-2 | -51.7 |
+-----------------------+-----------------------+-----------+
+-----------------------------------+-----------------------+-----------+
| metric | tag | perc_diff |
+-----------------------------------+-----------------------+-----------+
| gfx-avg-num-frame-deadline-missed | m6.6-stock-1 | 0.0 |
| gfx-max-num-frame-deadline-missed | m6.6-stock-1 | 0.0 |
| gfx-avg-num-frame-deadline-missed | m6.6-stock-2 | -3.21 |
| gfx-max-num-frame-deadline-missed | m6.6-stock-2 | -3.21 |
| gfx-avg-num-frame-deadline-missed | m6.6-eevdf-complete-1 | -85.29 |
| gfx-max-num-frame-deadline-missed | m6.6-eevdf-complete-1 | -85.29 |
| gfx-avg-num-frame-deadline-missed | m6.6-eevdf-complete-2 | -84.8 |
| gfx-max-num-frame-deadline-missed | m6.6-eevdf-complete-2 | -84.8 |
+-----------------------------------+-----------------------+-----------+
+----------------------------+-----------------------+-----------+
| metric | tag | perc_diff |
+----------------------------+-----------------------+-----------+
| gfx-avg-high-input-latency | m6.6-stock-1 | 0.0 |
| gfx-max-high-input-latency | m6.6-stock-1 | 0.0 |
| gfx-avg-high-input-latency | m6.6-stock-2 | 0.93 |
| gfx-max-high-input-latency | m6.6-stock-2 | 0.93 |
| gfx-avg-high-input-latency | m6.6-eevdf-complete-1 | -18.35 |
| gfx-max-high-input-latency | m6.6-eevdf-complete-1 | -18.35 |
| gfx-avg-high-input-latency | m6.6-eevdf-complete-2 | -18.05 |
| gfx-max-high-input-latency | m6.6-eevdf-complete-2 | -18.05 |
+----------------------------+-----------------------+-----------+
+--------------+-----------------------+-----------+
| metric | tag | perc_diff |
+--------------+-----------------------+-----------+
| gfx-avg-jank | m6.6-stock-1 | 0.0 |
| gfx-max-jank | m6.6-stock-1 | 0.0 |
| gfx-avg-jank | m6.6-stock-2 | 1.56 |
| gfx-max-jank | m6.6-stock-2 | 1.56 |
| gfx-avg-jank | m6.6-eevdf-complete-1 | -82.81 |
| gfx-max-jank | m6.6-eevdf-complete-1 | -82.81 |
| gfx-avg-jank | m6.6-eevdf-complete-2 | -78.12 |
| gfx-max-jank | m6.6-eevdf-complete-2 | -78.12 |
+--------------+-----------------------+-----------+
Bisecting through the patches in this series, I ended up with patch 08/10
as the one that improved things overall for these benchmarks.
I'd like to investigate this further to understand the reason behind some of
these dramatic improvements.
> ---
> [..snip..]
(+ Chen Yu, Oliver Sang)
Hello Mike,
On 4/15/2024 4:26 PM, Mike Galbraith wrote:
> On Fri, 2024-04-12 at 16:12 +0530, K Prateek Nayak wrote:
>>
>> I ran into a few issues when testing the series on top of tip:sched/core
>> at commit 4475cd8bfd9b ("sched/balancing: Simplify the sg_status bitmask
>> and use separate ->overloaded and ->overutilized flags"). All of these
>> splats surfaced when running unixbench with Delayed Dequeue (echoing
>> NO_DELAY_DEQUEUE to /sys/kernel/debug/sched/features seems to make the
>> system stable when running Unixbench spawn)
>>
>> Unixbench (https://github.com/kdlucas/byte-unixbench.git) command:
>>
>> ./Run spawn -c 512
>
> That plus a hackbench loop works a treat.
>
>>
>> Splats appear soon into the run. Following are the splats and their
>> corresponding code blocks from my 3rd Generation EPYC system
>> (2 x 64C/128T):
>
> Seems a big box is not required. With a low fat sched config (no group
> sched), starting ./Run spawn -c 16 (cpus*2) along with a hackbench loop
> reliably blows my old i7-4790 box out of the water nearly instantly.
>
> DUMPFILE: vmcore
> CPUS: 8
> DATE: Mon Apr 15 07:20:29 CEST 2024
> UPTIME: 00:07:23
> LOAD AVERAGE: 1632.20, 684.99, 291.84
> TASKS: 1401
> NODENAME: homer
> RELEASE: 6.9.0.g0bbac3f-master
> VERSION: #7 SMP Mon Apr 15 06:40:05 CEST 2024
> MACHINE: x86_64 (3591 Mhz)
> MEMORY: 16 GB
> PANIC: "Oops: 0000 [#1] SMP NOPTI" (check log for details)
> PID: 22664
> COMMAND: "hackbench"
> TASK: ffff888100acbf00 [THREAD_INFO: ffff888100acbf00]
> CPU: 5
> STATE: TASK_WAKING (PANIC)
>
> crash> bt -sx
> PID: 22664 TASK: ffff888100acbf00 CPU: 5 COMMAND: "hackbench"
> #0 [ffff88817cc17920] machine_kexec+0x156 at ffffffff810642d6
> #1 [ffff88817cc17970] __crash_kexec+0xd7 at ffffffff81153147
> #2 [ffff88817cc17a28] crash_kexec+0x23 at ffffffff811535f3
> #3 [ffff88817cc17a38] oops_end+0xbe at ffffffff810329be
> #4 [ffff88817cc17a58] page_fault_oops+0x81 at ffffffff81071951
> #5 [ffff88817cc17ab8] exc_page_fault+0x62 at ffffffff8194f6f2
> #6 [ffff88817cc17ae0] asm_exc_page_fault+0x22 at ffffffff81a00ba2
> [exception RIP: pick_task_fair+71]
> RIP: ffffffff810d5b57 RSP: ffff88817cc17b90 RFLAGS: 00010046
> RAX: 0000000000000000 RBX: ffff88840ed70ec0 RCX: 00000001d7ec138c
> RDX: ffffffffe7a7f400 RSI: 0000000000000000 RDI: 0000000000000000
> RBP: ffff88840ed70ec0 R8: 0000000000000c00 R9: 000000675402f79e
> R10: ffff88817cc17b30 R11: 00000000000000bb R12: ffff88840ed70f40
> R13: ffffffff81f64f16 R14: ffff888100acc560 R15: ffff888100acbf00
> ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0018
> #7 [ffff88817cc17bb0] pick_next_task_fair+0x42 at ffffffff810d92c2
> #8 [ffff88817cc17be0] __schedule+0x10d at ffffffff8195936d
> #9 [ffff88817cc17c50] schedule+0x1c at ffffffff81959ddc
> #10 [ffff88817cc17c60] schedule_timeout+0x18c at ffffffff8195fc4c
> #11 [ffff88817cc17cc8] unix_stream_read_generic+0x2b7 at ffffffff81869917
> #12 [ffff88817cc17da8] unix_stream_recvmsg+0x68 at ffffffff8186a2d8
> #13 [ffff88817cc17de0] sock_read_iter+0x159 at ffffffff8170bd69
> #14 [ffff88817cc17e70] vfs_read+0x2ce at ffffffff812f195e
> #15 [ffff88817cc17ef8] ksys_read+0x40 at ffffffff812f21d0
> #16 [ffff88817cc17f30] do_syscall_64+0x57 at ffffffff8194b947
> #17 [ffff88817cc17f50] entry_SYSCALL_64_after_hwframe+0x76 at ffffffff81a0012b
> RIP: 00007f625660871e RSP: 00007ffc75d48188 RFLAGS: 00000246
> RAX: ffffffffffffffda RBX: 00007ffc75d48200 RCX: 00007f625660871e
> RDX: 0000000000000064 RSI: 00007ffc75d48190 RDI: 0000000000000007
> RBP: 00007ffc75d48260 R8: 00007ffc75d48140 R9: 00007f6256612010
> R10: 00007f62565f5070 R11: 0000000000000246 R12: 0000000000000064
> R13: 0000000000000000 R14: 0000000000000064 R15: 0000000000000000
> ORIG_RAX: 0000000000000000 CS: 0033 SS: 002b
> crash> dis pick_task_fair+71
> 0xffffffff810d5b57 <pick_task_fair+71>: cmpb $0x0,0x4c(%rax)
> crash> gdb list *pick_task_fair+71
> 0xffffffff810d5b57 is in pick_task_fair (kernel/sched/fair.c:5498).
> 5493 SCHED_WARN_ON(cfs_rq->next->sched_delayed);
> 5494 return cfs_rq->next;
> 5495 }
> 5496
> 5497 struct sched_entity *se = pick_eevdf(cfs_rq);
> 5498 if (se->sched_delayed) {
Wondering if you are running into the issue where pick_eevdf() returns
NULL despite there being a runnable CFS task on the runqueue. Can you try
running with the following patch from Chenyu -
https://lore.kernel.org/lkml/[email protected]/
> 5499 dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
> 5500 SCHED_WARN_ON(se->sched_delayed);
> 5501 SCHED_WARN_ON(se->on_rq);
> 5502 if (sched_feat(DELAY_ZERO) && se->vlag > 0)
> crash> struct -ox sched_entity
> struct sched_entity {
> [0x0] struct load_weight load;
> [0x10] struct rb_node run_node;
> [0x28] u64 deadline;
> [0x30] u64 min_vruntime;
> [0x38] struct list_head group_node;
> [0x48] unsigned int on_rq;
> [0x4c] unsigned char sched_delayed;
> [0x4d] unsigned char custom_slice;
> [0x50] u64 exec_start;
> [0x58] u64 sum_exec_runtime;
> [0x60] u64 prev_sum_exec_runtime;
> [0x68] u64 vruntime;
> [0x70] s64 vlag;
> [0x78] u64 slice;
> [0x80] u64 nr_migrations;
> [0xc0] struct sched_avg avg;
> }
> SIZE: 0x100
> crash>
>
--
Thanks and Regards,
Prateek
On Tue, 2024-04-16 at 08:48 +0530, K Prateek Nayak wrote:
>
> Wondering if you are running into the issue where pick_eevdf() returns
> NULL despite there being a runnable CFS task on the runqueue. Can you try
> running with the following patch from Chenyu -
> https://lore.kernel.org/lkml/[email protected]/
I hadn't tried it in the master branch, but I had in my 6.1-eevdf tree,
which contains the previous delay-dequeue version, because I found that it
too would explode, which surprised me after having run it quite a bit woe-
free. The patch helped 6.1-eevdf not at all. I just tried it in master to
be sure, and while the death throes were not identical, this one began
with a SCHED_WARN_ON() in put_prev_entity(); the box still went belly up.
BTW/FWIW, turning on AUTOGROUP seemingly rendered the box immune (unless
booted with noautogroup 'course), and that immunity even held when I
launched all players in the same cgroup (hmm).
-Mike
Greetings,
I tossed a couple rocks at it today, and seem to have hit the little
bugger. The root cause seems to be doing the delay dequeue business on
exiting tasks. Hunk #1 of hacklet below seems to quell the explosions.
crash> bt -sx
PID: 21722 TASK: ffff88815710ee40 CPU: 6 COMMAND: "hackbench"
#0 [ffff88822773bb90] machine_kexec+0x156 at ffffffff810642d6
#1 [ffff88822773bbe0] __crash_kexec+0xd7 at ffffffff81152a07
#2 [ffff88822773bc98] crash_kexec+0x23 at ffffffff81152eb3
#3 [ffff88822773bca8] oops_end+0xbe at ffffffff810329be
#4 [ffff88822773bcc8] page_fault_oops+0x81 at ffffffff81071951
#5 [ffff88822773bd28] exc_page_fault+0x62 at ffffffff8194e9e2
#6 [ffff88822773bd50] asm_exc_page_fault+0x22 at ffffffff81a00ba2
[exception RIP: pick_next_task_fair+178]
RIP: ffffffff810d8d12 RSP: ffff88822773be00 RFLAGS: 00010006
RAX: ffff88813cb780b8 RBX: ffff88840edb0e80 RCX: 0000000000000000
RDX: 0000000000000000 RSI: ffff88813cb78080 RDI: ffff88840ec30f00
RBP: ffff88813cb78000 R8: ffff88815710eec0 R9: 0000000000000001
R10: ffff88822773bdc8 R11: 0000000000000013 R12: 0000000000030e80
R13: ffff88815710ee40 R14: ffff88813cb78080 R15: ffff88815710ee40
ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0018
#7 [ffff88822773be28] __schedule+0x10d at ffffffff8195865d
#8 [ffff88822773be98] do_task_dead+0x3e at ffffffff810cc00e
#9 [ffff88822773beb0] do_exit+0x770 at ffffffff8108f0e0
#10 [ffff88822773bf00] do_group_exit+0x2c at ffffffff8108f64c
#11 [ffff88822773bf28] __x64_sys_exit_group+0x14 at ffffffff8108f6f4
#12 [ffff88822773bf30] do_syscall_64+0x57 at ffffffff8194ac37
#13 [ffff88822773bf50] entry_SYSCALL_64_after_hwframe+0x76 at ffffffff81a0012b
RIP: 00007f4f2aa76136 RSP: 00007ffcbba84748 RFLAGS: 00000246
RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f4f2aa76136
RDX: 0000000000000000 RSI: 000000000000003c RDI: 0000000000000000
RBP: 00007f4f2ab86970 R8: 00000000000000e7 R9: ffffffffffffff80
R10: 0000000000000004 R11: 0000000000000246 R12: 00007f4f2ab86970
R13: 0000000000000001 R14: 00007f4f2ab8a328 R15: 0000000000000000
ORIG_RAX: 00000000000000e7 CS: 0033 SS: 002b
crash> task_struct ffff88815710ee40 | grep sched_delayed
sched_delayed = 1 '\001',
crash>
---
kernel/sched/fair.c | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5374,6 +5374,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, st
update_curr(cfs_rq);
if (sched_feat(DELAY_DEQUEUE) && sleep &&
+ !(entity_is_task(se) && (task_of(se)->flags & PF_EXITING)) &&
!entity_eligible(cfs_rq, se)) {
if (cfs_rq->next == se)
cfs_rq->next = NULL;
@@ -5495,14 +5496,14 @@ pick_next_entity(struct rq *rq, struct c
}
struct sched_entity *se = pick_eevdf(cfs_rq);
- if (se->sched_delayed) {
+ while (se && se->sched_delayed) {
dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
SCHED_WARN_ON(se->sched_delayed);
SCHED_WARN_ON(se->on_rq);
if (sched_feat(DELAY_ZERO) && se->vlag > 0)
se->vlag = 0;
- return NULL;
+ se = pick_eevdf(cfs_rq);
}
return se;
}
Hello Mike,
On 4/18/2024 9:54 PM, Mike Galbraith wrote:
> Greetings,
>
> I tossed a couple rocks at it today, and seem to have hit the little
> bugger. The root cause seems to be doing the delay dequeue business on
> exiting tasks. Hunk #1 of hacklet below seems to quell the explosions.
>
> crash> bt -sx
> PID: 21722 TASK: ffff88815710ee40 CPU: 6 COMMAND: "hackbench"
> #0 [ffff88822773bb90] machine_kexec+0x156 at ffffffff810642d6
> #1 [ffff88822773bbe0] __crash_kexec+0xd7 at ffffffff81152a07
> #2 [ffff88822773bc98] crash_kexec+0x23 at ffffffff81152eb3
> #3 [ffff88822773bca8] oops_end+0xbe at ffffffff810329be
> #4 [ffff88822773bcc8] page_fault_oops+0x81 at ffffffff81071951
> #5 [ffff88822773bd28] exc_page_fault+0x62 at ffffffff8194e9e2
> #6 [ffff88822773bd50] asm_exc_page_fault+0x22 at ffffffff81a00ba2
> [exception RIP: pick_next_task_fair+178]
> RIP: ffffffff810d8d12 RSP: ffff88822773be00 RFLAGS: 00010006
> RAX: ffff88813cb780b8 RBX: ffff88840edb0e80 RCX: 0000000000000000
> RDX: 0000000000000000 RSI: ffff88813cb78080 RDI: ffff88840ec30f00
> RBP: ffff88813cb78000 R8: ffff88815710eec0 R9: 0000000000000001
> R10: ffff88822773bdc8 R11: 0000000000000013 R12: 0000000000030e80
> R13: ffff88815710ee40 R14: ffff88813cb78080 R15: ffff88815710ee40
> ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0018
> #7 [ffff88822773be28] __schedule+0x10d at ffffffff8195865d
> #8 [ffff88822773be98] do_task_dead+0x3e at ffffffff810cc00e
> #9 [ffff88822773beb0] do_exit+0x770 at ffffffff8108f0e0
> #10 [ffff88822773bf00] do_group_exit+0x2c at ffffffff8108f64c
> #11 [ffff88822773bf28] __x64_sys_exit_group+0x14 at ffffffff8108f6f4
> #12 [ffff88822773bf30] do_syscall_64+0x57 at ffffffff8194ac37
> #13 [ffff88822773bf50] entry_SYSCALL_64_after_hwframe+0x76 at ffffffff81a0012b
> RIP: 00007f4f2aa76136 RSP: 00007ffcbba84748 RFLAGS: 00000246
> RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f4f2aa76136
> RDX: 0000000000000000 RSI: 000000000000003c RDI: 0000000000000000
> RBP: 00007f4f2ab86970 R8: 00000000000000e7 R9: ffffffffffffff80
> R10: 0000000000000004 R11: 0000000000000246 R12: 00007f4f2ab86970
> R13: 0000000000000001 R14: 00007f4f2ab8a328 R15: 0000000000000000
> ORIG_RAX: 00000000000000e7 CS: 0033 SS: 002b
> crash> task_struct ffff88815710ee40 | grep sched_delayed
> sched_delayed = 1 '\001',
> crash>
>
My machine too survived the spawn test with the below patch. Thank you
for looking into this. I'll resume testing with it included.
--
Thanks and Regards,
Prateek
> ---
> kernel/sched/fair.c | 5 +++--
> 1 file changed, 3 insertions(+), 2 deletions(-)
>
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -5374,6 +5374,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, st
> update_curr(cfs_rq);
>
> if (sched_feat(DELAY_DEQUEUE) && sleep &&
> + !(entity_is_task(se) && (task_of(se)->flags & PF_EXITING)) &&
> !entity_eligible(cfs_rq, se)) {
> if (cfs_rq->next == se)
> cfs_rq->next = NULL;
> @@ -5495,14 +5496,14 @@ pick_next_entity(struct rq *rq, struct c
> }
>
> struct sched_entity *se = pick_eevdf(cfs_rq);
> - if (se->sched_delayed) {
> + while (se && se->sched_delayed) {
> dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
> SCHED_WARN_ON(se->sched_delayed);
> SCHED_WARN_ON(se->on_rq);
> if (sched_feat(DELAY_ZERO) && se->vlag > 0)
> se->vlag = 0;
>
> - return NULL;
> + se = pick_eevdf(cfs_rq);
> }
> return se;
> }
>
(removes apparently busted bytedance.com address and retries xmit)
Greetings!
With this version, the better CPU distribution (for tbench synchronous
net-blasting) closed the CFS vs EEVDF throughput deficit. I verified
both by rolling the previous version forward and back-porting to 6.1
where I've got CFS and EEVDF to re-compare, now with both dequeue delay
patch versions.
As usual, there will be winners and losers, but (modulo dead buglet) it
looks kinda promising to me.
Distribution of single pinned buddy pair measured in master:
DELAY_DEQUEUE
----------------------------------------------------------------------------------------------------------
Task | Runtime ms | Switches | Avg delay ms | Max delay ms | Sum delay ms |
----------------------------------------------------------------------------------------------------------
tbench:(2) | 6277.099 ms | 1597104 | avg: 0.003 ms | max: 0.129 ms | sum: 4334.723 ms |
tbench_srv:(2) | 5724.971 ms | 1682629 | avg: 0.001 ms | max: 0.083 ms | sum: 2076.616 ms |
----------------------------------------------------------------------------------------------------------
TOTAL: | 12021.128 ms | 3280275 | | 1.729 ms | 6425.483 ms |
----------------------------------------------------------------------------------------------------------
client/server CPU distribution ~52%/48%
NO_DELAY_DEQUEUE
----------------------------------------------------------------------------------------------------------
Task | Runtime ms | Switches | Avg delay ms | Max delay ms | Sum delay ms |
----------------------------------------------------------------------------------------------------------
tbench:(2) | 6724.774 ms | 1546761 | avg: 0.002 ms | max: 0.409 ms | sum: 2443.549 ms |
tbench_srv:(2) | 5275.329 ms | 1571688 | avg: 0.002 ms | max: 0.086 ms | sum: 2734.151 ms |
----------------------------------------------------------------------------------------------------------
TOTAL: | 12019.641 ms | 3119000 | | 9996.367 ms | 15187.833 ms |
----------------------------------------------------------------------------------------------------------
client/server CPU distribution ~56%/44%
Note the switches and delay sums. For tbench, they translate directly into
throughput. The other shoe lands with async CPU-hog net-blasters; for
those, scheduler cycles tend to be wasted cycles.
-Mike
On Fri, Apr 05, 2024 at 12:28:02PM +0200, Peter Zijlstra wrote:
> Extend / fix 86bfbb7ce4f6 ("sched/fair: Add lag based placement") by
> noting that lag is fundamentally a temporal measure. It should not be
> carried around indefinitely.
>
> OTOH it should also not be instantly discarded, doing so will allow a
> task to game the system by purposefully (micro) sleeping at the end of
> its time quantum.
>
> Since lag is intimately tied to the virtual time base, a wall-time
> based decay is also insufficient, notably competition is required for
> any of this to make sense.
>
> Instead, delay the dequeue and keep the 'tasks' on the runqueue,
> competing until they are eligible.
>
> Strictly speaking, we only care about keeping them until the 0-lag
> point, but that is a difficult proposition, instead carry them around
> until they get picked again, and dequeue them at that point.
>
> Since we should have dequeued them at the 0-lag point, truncate lag
> (eg. don't let them earn positive lag).
>
I stumbled upon a subtle change between CFS and EEVDF which kinda relates
to the question of how long lag should be carried around, but on the other
side of the 0-lag value.
Assume that we have two tasks taking turns on a single CPU.
Task 1 does something and wakes up task 2.
Task 2 does something and goes to sleep.
And we just keep repeating that.
Task 1 and task 2 only run for very short amounts of time, i.e. much
shorter than a regular time slice.
Let's further assume that task 1 runs longer than task 2.
In CFS, this means that the vruntime of task 1 starts to outrun the
vruntime of task 2, i.e. vruntime(task2) < vruntime(task1). Hence, task 2
always gets picked on wake up because it has the smaller vruntime.
In EEVDF, this translates to a permanent positive lag for task 2, which
likewise causes it to get scheduled consistently on wake up.
Let's now assume that, occasionally, task 2 runs a little bit longer than
task 1. In CFS, this means that task 2 can close the vruntime gap by a
bit, but it can easily remain below the value of task 1. Task 2 would
still get picked on wake up.
With EEVDF, in its current form, task 2 will now get a negative lag, which
in turn will cause it not to be picked on the next wake up.
So it seems we have a change in how far the two variants look into the
past: CFS is willing to take more history into account, whereas EEVDF is
not (update_entity_lag() sets the lag value from scratch, and
place_entity() does not take the original vruntime into account).
All of this can be seen as correct by design: a task that consumes more
time than the others has to give way to them. The big difference now is
that CFS allowed a task to collect some bonus by constantly using less CPU
time than the others and to trade that bonus against occasionally taking
more CPU time. EEVDF could do the same thing by allowing the accumulation
of (positive) lag, which could then be traded against the one time the
task ends up with negative lag.
This might of course clash with other EEVDF assumptions, but it ties into
the question of how long lag should be preserved.
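To make the asymmetry concrete, here is a toy calculation (a standalone
sketch with invented numbers, not kernel code; the "patched" variant
corresponds to the diff at the end of this mail):

#include <stdio.h>

typedef long long s64;

int main(void)
{
	/* invented values at the moment task 2 gets dequeued */
	s64 avg_vruntime = 1000;  /* assumed weighted average of the queue     */
	s64 vruntime     = 1300;  /* task 2 just overran its usual share once  */
	s64 old_vlag     = 250;   /* bonus task 2 accumulated over many cycles */

	/* current update_entity_lag(): previous lag is discarded */
	s64 lag_current = avg_vruntime - vruntime;

	/* "patched" variant: previous lag is carried over */
	s64 lag_patched = old_vlag + avg_vruntime - vruntime;

	printf("current: %lld, patched: %lld\n", lag_current, lag_patched);
	return 0;
}

With these numbers the current code lands at -300 (task 2 now has to wait
its turn), while the accumulating variant lands at -50 (the earlier bonus
absorbs most of the overrun); both values would subsequently be clamped to
+/- limit as usual.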
I stumbled upon a performance degradation caused by this nuance. EEVDF is
not necessarily at fault here. The problem rather lies in a component
that relies on CFS allowing the above-mentioned bonus accumulation while
knowing that the corresponding task 2 has to run.
See: https://lore.kernel.org/netdev/ZWbapeL34Z8AMR5f@DESKTOP-2CCOB1S./T/
With the reasoning above, the degradation can be explained perfectly, and
it can even be fixed by allowing the accumulation of lag, i.e. by applying
the patch below.
I was just wondering whether my assumptions here are correct and whether
this is an implicit change in behavior that one should be aware of.
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 03be0d1330a6..b83a72311d2a 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -701,7 +701,7 @@ static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se)
s64 lag, limit;
SCHED_WARN_ON(!se->on_rq);
- lag = avg_vruntime(cfs_rq) - se->vruntime;
+ lag = se->vlag + avg_vruntime(cfs_rq) - se->vruntime;
limit = calc_delta_fair(max_t(u64, 2*se->slice, TICK_NSEC), se);
se->vlag = clamp(lag, -limit, limit);
Hi,
On 4/15/24 18:07, Luis Machado wrote:
> Hi Peter,
>
> On 4/5/24 11:28, Peter Zijlstra wrote:
>> Extend / fix 86bfbb7ce4f6 ("sched/fair: Add lag based placement") by
>> noting that lag is fundamentally a temporal measure. It should not be
>> carried around indefinitely.
>>
>> OTOH it should also not be instantly discarded, doing so will allow a
>> task to game the system by purposefully (micro) sleeping at the end of
>> its time quantum.
>>
>> Since lag is intimately tied to the virtual time base, a wall-time
>> based decay is also insufficient, notably competition is required for
>> any of this to make sense.
>>
>> Instead, delay the dequeue and keep the 'tasks' on the runqueue,
>> competing until they are eligible.
>>
>> Strictly speaking, we only care about keeping them until the 0-lag
>> point, but that is a difficult proposition, instead carry them around
>> until they get picked again, and dequeue them at that point.
>>
>> Since we should have dequeued them at the 0-lag point, truncate lag
>> (eg. don't let them earn positive lag).
>>
>> XXX test the cfs-throttle stuff
>>
>> Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
>
> Playing around with a Pixel 6 running a 6.6-based kernel with this
> series backported on top, I spotted a noticeable performance improvement
> on the speedometer benchmark:
>
> - m6.6-stock-* is the 6.6 mainline Android kernel unmodified.
>
> - m6.6-eevdf-complete-* is the 6.6 mainline Android kernel with
> this series applied on top (along with a few required backported
> patches).
>
> +-------------------+-----------------------+-----------+
> | metric | tag | perc_diff |
> +-------------------+-----------------------+-----------+
> | Speedometer Score | m6.6-stock-1 | 0.0% |
> | Speedometer Score | m6.6-stock-2 | 1.23% |
> | Speedometer Score | m6.6-stock-3 | -0.22% |
> | Speedometer Score | m6.6-eevdf-complete-1 | 4.54% |
> | Speedometer Score | m6.6-eevdf-complete-2 | 4.95% |
> | Speedometer Score | m6.6-eevdf-complete-3 | 6.07% |
> +-------------------+-----------------------+-----------+
>
> Also some interesting improvements in terms of frame timing for the
> uibenchjanktests benchmark. In particular the metrics of missed
> deadlines and jank (late) frames, which seems to indicate better
> latencies.
>
> +-----------------------+-----------------------+-----------+
> | metric | tag | perc_diff |
> +-----------------------+-----------------------+-----------+
> | gfx-avg-frame-time-50 | m6.6-stock-1 | 0.0 |
> | gfx-avg-frame-time-90 | m6.6-stock-1 | 0.0 |
> | gfx-avg-frame-time-95 | m6.6-stock-1 | 0.0 |
> | gfx-avg-frame-time-99 | m6.6-stock-1 | 0.0 |
> | gfx-avg-frame-time-50 | m6.6-stock-2 | 3.46 |
> | gfx-avg-frame-time-90 | m6.6-stock-2 | 1.19 |
> | gfx-avg-frame-time-95 | m6.6-stock-2 | 0.24 |
> | gfx-avg-frame-time-99 | m6.6-stock-2 | 0.48 |
> | gfx-avg-frame-time-50 | m6.6-eevdf-complete-1 | -30.45 |
> | gfx-avg-frame-time-90 | m6.6-eevdf-complete-1 | -48.44 |
> | gfx-avg-frame-time-95 | m6.6-eevdf-complete-1 | -51.32 |
> | gfx-avg-frame-time-99 | m6.6-eevdf-complete-1 | -52.48 |
> | gfx-avg-frame-time-50 | m6.6-eevdf-complete-2 | -30.32 |
> | gfx-avg-frame-time-90 | m6.6-eevdf-complete-2 | -48.16 |
> | gfx-avg-frame-time-95 | m6.6-eevdf-complete-2 | -51.08 |
> | gfx-avg-frame-time-99 | m6.6-eevdf-complete-2 | -51.7 |
> +-----------------------+-----------------------+-----------+
>
> +-----------------------------------+-----------------------+-----------+
> | metric | tag | perc_diff |
> +-----------------------------------+-----------------------+-----------+
> | gfx-avg-num-frame-deadline-missed | m6.6-stock-1 | 0.0 |
> | gfx-max-num-frame-deadline-missed | m6.6-stock-1 | 0.0 |
> | gfx-avg-num-frame-deadline-missed | m6.6-stock-2 | -3.21 |
> | gfx-max-num-frame-deadline-missed | m6.6-stock-2 | -3.21 |
> | gfx-avg-num-frame-deadline-missed | m6.6-eevdf-complete-1 | -85.29 |
> | gfx-max-num-frame-deadline-missed | m6.6-eevdf-complete-1 | -85.29 |
> | gfx-avg-num-frame-deadline-missed | m6.6-eevdf-complete-2 | -84.8 |
> | gfx-max-num-frame-deadline-missed | m6.6-eevdf-complete-2 | -84.8 |
> +-----------------------------------+-----------------------+-----------+
>
> +----------------------------+-----------------------+-----------+
> | metric | tag | perc_diff |
> +----------------------------+-----------------------+-----------+
> | gfx-avg-high-input-latency | m6.6-stock-1 | 0.0 |
> | gfx-max-high-input-latency | m6.6-stock-1 | 0.0 |
> | gfx-avg-high-input-latency | m6.6-stock-2 | 0.93 |
> | gfx-max-high-input-latency | m6.6-stock-2 | 0.93 |
> | gfx-avg-high-input-latency | m6.6-eevdf-complete-1 | -18.35 |
> | gfx-max-high-input-latency | m6.6-eevdf-complete-1 | -18.35 |
> | gfx-avg-high-input-latency | m6.6-eevdf-complete-2 | -18.05 |
> | gfx-max-high-input-latency | m6.6-eevdf-complete-2 | -18.05 |
> +----------------------------+-----------------------+-----------+
>
> +--------------+-----------------------+-----------+
> | metric | tag | perc_diff |
> +--------------+-----------------------+-----------+
> | gfx-avg-jank | m6.6-stock-1 | 0.0 |
> | gfx-max-jank | m6.6-stock-1 | 0.0 |
> | gfx-avg-jank | m6.6-stock-2 | 1.56 |
> | gfx-max-jank | m6.6-stock-2 | 1.56 |
> | gfx-avg-jank | m6.6-eevdf-complete-1 | -82.81 |
> | gfx-max-jank | m6.6-eevdf-complete-1 | -82.81 |
> | gfx-avg-jank | m6.6-eevdf-complete-2 | -78.12 |
> | gfx-max-jank | m6.6-eevdf-complete-2 | -78.12 |
> +--------------+-----------------------+-----------+
>
> Bisecting through the patches in this series, I ended up with patch 08/10
> as the one that improved things overall for these benchmarks.
>
> I'd like to investigate this further to understand the reason behind some of
> these dramatic improvements.
>
Investigating these improvements a bit more, I noticed they came with a significantly
higher power usage on the Pixel6 (where EAS is enabled). I bisected it down to the delayed
dequeue patch. Disabling DELAY_DEQUEUE and DELAY_ZERO at runtime doesn't help in bringing
the power usage down.
Though I don't fully understand the reason behind this change in behavior yet, I did spot
the benchmark processes running almost entirely on the big core cluster, with little
to no use of the little core and mid core clusters.
That would explain higher power usage and also the significant jump in performance.
I wonder if the delayed dequeue logic is having an unwanted effect on the calculation of
utilization/load of the runqueue and, as a consequence, we're scheduling things to run on
higher OPP's in the big cores, leading to poor decisions for energy efficiency.
On Thu, Apr 18, 2024 at 06:24:59PM +0200, Mike Galbraith wrote:
> Greetings,
>
> I tossed a couple rocks at it today, and seem to have hit the little
> bugger. The root cause seems to be doing the delay dequeue business on
> exiting tasks. Hunk #1 of hacklet below seems to quell the explosions.
>
Neat, Thanks! I was just about ready to go look at this again.
>
> ---
> kernel/sched/fair.c | 5 +++--
> 1 file changed, 3 insertions(+), 2 deletions(-)
>
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -5374,6 +5374,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, st
> update_curr(cfs_rq);
>
> if (sched_feat(DELAY_DEQUEUE) && sleep &&
> + !(entity_is_task(se) && (task_of(se)->flags & PF_EXITING)) &&
> !entity_eligible(cfs_rq, se)) {
> if (cfs_rq->next == se)
> cfs_rq->next = NULL;
> @@ -5495,14 +5496,14 @@ pick_next_entity(struct rq *rq, struct c
> }
>
> struct sched_entity *se = pick_eevdf(cfs_rq);
> - if (se->sched_delayed) {
> + while (se && se->sched_delayed) {
> dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
> SCHED_WARN_ON(se->sched_delayed);
> SCHED_WARN_ON(se->on_rq);
> if (sched_feat(DELAY_ZERO) && se->vlag > 0)
> se->vlag = 0;
>
> - return NULL;
> + se = pick_eevdf(cfs_rq);
> }
> return se;
> }
>
On Thu, Apr 11, 2024 at 09:32:23AM +0800, Yan-Jie Wang wrote:
> I have an alternative approach to delayed-dequeue inspired by the original
> CFS implementation.
>
> The idea is to keep the task's vruntime when it goes to sleep.
> When the task is woken up, see if the lag is positive at the woken time, if
> it is the case, clamp it to 0 by setting vruntime to avg_vruntime().
Problem is that avg_vruntime() can go backwards, e.g. by dequeueing a
negative-lag task. This gets really hard to argue about real quick.
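A toy example (two equal-weight entities, invented numbers) of what
"going backwards" means here:

#include <stdio.h>

int main(void)
{
	/* two equal-weight entities; avg_vruntime is just the mean here */
	double v1 = 100.0, v2 = 140.0;

	double avg_before = (v1 + v2) / 2.0; /* 120; v2's lag is 120 - 140 = -20 */
	double avg_after  = v1;              /* dequeue v2: the average drops to 100 */

	printf("avg_vruntime before: %.0f, after dequeue: %.0f\n",
	       avg_before, avg_after);
	return 0;
}

Clamp a waking task's vruntime to that and its reference point may already
have moved backwards underneath it.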
Keeping the task competing (delaying the dequeue) is by far the simplest
solution -- conceptually.
The code just became a total mess because of cgroups :/
On Wed, Apr 24, 2024 at 04:15:42PM +0100, Luis Machado wrote:
> > Bisecting through the patches in this series, I ended up with patch 08/10
> > as the one that improved things overall for these benchmarks.
> >
> > I'd like to investigate this further to understand the reason behind some of
> > these dramatic improvements.
> >
>
> Investigating these improvements a bit more, I noticed they came with a significantly
> higher power usage on the Pixel6 (where EAS is enabled). I bisected it down to the delayed
> dequeue patch. Disabling DELAY_DEQUEUE and DELAY_ZERO at runtime doesn't help in bringing
> the power usage down.
Hmm, that is unexpected. The intent was for NO_DELAY_DEQUEUE to fully
disable things. I'll go have a prod at it.
> Though I don't fully understand the reason behind this change in behavior yet, I did spot
> the benchmark processes running almost entirely on the big core cluster, with little
> to no use of the little core and mid core clusters.
>
> That would explain higher power usage and also the significant jump in performance.
ISTR you (arm) have these tools to trace and plot the various util
values. This should be readily reflected there if that is the case, no?
> I wonder if the delayed dequeue logic is having an unwanted effect on the calculation of
> utilization/load of the runqueue and, as a consequence, we're scheduling things to run on
> higher OPP's in the big cores, leading to poor decisions for energy efficiency.
Notably util_est_update() gets delayed. Given we don't actually do an
enqueue when a delayed task gets woken, it didn't seem to make sense to
update that sooner.
I'll go over all that again.
On Thu, Apr 18, 2024 at 06:24:59PM +0200, Mike Galbraith wrote:
> The root cause seems to be doing the delay dequeue business on
> exiting tasks.
> ---
> kernel/sched/fair.c | 5 +++--
> 1 file changed, 3 insertions(+), 2 deletions(-)
>
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -5374,6 +5374,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, st
> update_curr(cfs_rq);
>
> if (sched_feat(DELAY_DEQUEUE) && sleep &&
> + !(entity_is_task(se) && (task_of(se)->flags & PF_EXITING)) &&
> !entity_eligible(cfs_rq, se)) {
> if (cfs_rq->next == se)
> cfs_rq->next = NULL;
So I think this can be easier done in dequeue_task_fair(), where we
still know this is a task.
Perhaps something like (I'll test later):
if (p->flags & PF_EXITING)
flags &= ~DEQUEUE_SLEEP;
But now I need to go think about the case of removing a cgroup...
*urgh*.
On Thu, Apr 25, 2024 at 12:42:20PM +0200, Peter Zijlstra wrote:
> > I wonder if the delayed dequeue logic is having an unwanted effect on the calculation of
> > utilization/load of the runqueue and, as a consequence, we're scheduling things to run on
> > higher OPP's in the big cores, leading to poor decisions for energy efficiency.
>
> Notably util_est_update() gets delayed. Given we don't actually do an
> enqueue when a delayed task gets woken, it didn't seem to make sense to
> update that sooner.
The PELT runnable values will be inflated because of delayed dequeue.
cpu_util() uses those in the @boost case, and as such this can indeed
affect things.
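A crude toy model (invented numbers; nothing PELT-accurate beyond a
~32ms half-life decay) of how keeping a delayed task counted as runnable
inflates the signal:

#include <stdio.h>

int main(void)
{
	const double y = 0.97857206;	/* per-ms decay, y^32 ~= 0.5 */
	double avg_plain = 0.0, avg_delayed = 0.0;

	for (int ms = 0; ms < 500; ms++) {
		/*
		 * invented workload: truly runnable half of each ms; with
		 * delayed dequeue the task additionally counts as runnable
		 * for another quarter while waiting to be picked and dropped
		 */
		double runnable_plain   = 0.50;
		double runnable_delayed = 0.75;

		avg_plain   = avg_plain   * y + runnable_plain   * (1.0 - y);
		avg_delayed = avg_delayed * y + runnable_delayed * (1.0 - y);
	}

	printf("steady state: %.2f without vs %.2f with the delay\n",
	       avg_plain, avg_delayed);
	return 0;
}

The numbers are made up; the point is only that the same amount of actual
work yields a proportionally higher runnable signal for as long as the
delay persists.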
This can also slightly affect the cgroup case, but since the delay goes
away as contention goes away, and the cgroup case must already assume
worst case overlap, this seems limited.
/me goes ponder things moar.
On Fri, Apr 26, 2024 at 11:32:41AM +0200, Peter Zijlstra wrote:
> On Thu, Apr 25, 2024 at 01:49:49PM +0200, Peter Zijlstra wrote:
> > On Thu, Apr 25, 2024 at 12:42:20PM +0200, Peter Zijlstra wrote:
> >
> > > > I wonder if the delayed dequeue logic is having an unwanted effect on the calculation of
> > > > utilization/load of the runqueue and, as a consequence, we're scheduling things to run on
> > > > higher OPP's in the big cores, leading to poor decisions for energy efficiency.
> > >
> > > Notably util_est_update() gets delayed. Given we don't actually do an
> > > enqueue when a delayed task gets woken, it didn't seem to make sense to
> > > update that sooner.
> >
> > The PELT runnable values will be inflated because of delayed dequeue.
> > cpu_util() uses those in the @boost case, and as such this can indeed
> > affect things.
> >
> > This can also slightly affect the cgroup case, but since the delay goes
> > away as contention goes away, and the cgroup case must already assume
> > worst case overlap, this seems limited.
> >
> > /me goes ponder things moar.
>
> First order approximation of a fix would be something like the totally
> untested below I suppose...
>
> ---
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index cfd1fd188d29..f3f70b5adca0 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -5391,6 +5391,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
> if (cfs_rq->next == se)
> cfs_rq->next = NULL;
> se->sched_delayed = 1;
> + update_load_avg(cfs_rq, se, 0);
> return false;
> }
> }
> @@ -6817,6 +6818,7 @@ requeue_delayed_entity(struct sched_entity *se)
> }
>
> se->sched_delayed = 0;
> + update_load_avg(qcfs_rq, se, 0);
Compiler demands this ^^^^^ be cfs_rq instead ;-) brown_paper_bags--;
> }
>
> /*
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index d07a3b98f1fb..d16529613123 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -810,6 +810,9 @@ static inline void se_update_runnable(struct sched_entity *se)
>
> static inline long se_runnable(struct sched_entity *se)
> {
> + if (se->sched_delayed)
> + return false;
> +
> if (entity_is_task(se))
> return !!se->on_rq;
> else
> @@ -823,6 +826,9 @@ static inline void se_update_runnable(struct sched_entity *se) {}
>
> static inline long se_runnable(struct sched_entity *se)
> {
> + if (se->sched_delayed)
> + return false;
> +
> return !!se->on_rq;
> }
> #endif
On 4/26/24 10:32, Peter Zijlstra wrote:
> On Thu, Apr 25, 2024 at 01:49:49PM +0200, Peter Zijlstra wrote:
>> On Thu, Apr 25, 2024 at 12:42:20PM +0200, Peter Zijlstra wrote:
>>
>>>> I wonder if the delayed dequeue logic is having an unwanted effect on the calculation of
>>>> utilization/load of the runqueue and, as a consequence, we're scheduling things to run on
>>>> higher OPP's in the big cores, leading to poor decisions for energy efficiency.
>>>
>>> Notably util_est_update() gets delayed. Given we don't actually do an
>>> enqueue when a delayed task gets woken, it didn't seem to make sense to
>>> update that sooner.
>>
>> The PELT runnable values will be inflated because of delayed dequeue.
>> cpu_util() uses those in the @boost case, and as such this can indeed
>> affect things.
>>
>> This can also slightly affect the cgroup case, but since the delay goes
>> away as contention goes away, and the cgroup case must already assume
>> worst case overlap, this seems limited.
>>
>> /me goes ponder things moar.
>
> First order approximation of a fix would be something like the totally
> untested below I suppose...
Thanks Peter. Let me give it a try and I'll report back.
>
> ---
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index cfd1fd188d29..f3f70b5adca0 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -5391,6 +5391,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
> if (cfs_rq->next == se)
> cfs_rq->next = NULL;
> se->sched_delayed = 1;
> + update_load_avg(cfs_rq, se, 0);
> return false;
> }
> }
> @@ -6817,6 +6818,7 @@ requeue_delayed_entity(struct sched_entity *se)
> }
>
> se->sched_delayed = 0;
> + update_load_avg(qcfs_rq, se, 0);
> }
>
> /*
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index d07a3b98f1fb..d16529613123 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -810,6 +810,9 @@ static inline void se_update_runnable(struct sched_entity *se)
>
> static inline long se_runnable(struct sched_entity *se)
> {
> + if (se->sched_delayed)
> + return false;
> +
> if (entity_is_task(se))
> return !!se->on_rq;
> else
> @@ -823,6 +826,9 @@ static inline void se_update_runnable(struct sched_entity *se) {}
>
> static inline long se_runnable(struct sched_entity *se)
> {
> + if (se->sched_delayed)
> + return false;
> +
> return !!se->on_rq;
> }
> #endif
On 4/25/24 11:42, Peter Zijlstra wrote:
> On Wed, Apr 24, 2024 at 04:15:42PM +0100, Luis Machado wrote:
>
>>> Bisecting through the patches in this series, I ended up with patch 08/10
>>> as the one that improved things overall for these benchmarks.
>>>
>>> I'd like to investigate this further to understand the reason behind some of
>>> these dramatic improvements.
>>>
>>
>> Investigating these improvements a bit more, I noticed they came with a significantly
>> higher power usage on the Pixel6 (where EAS is enabled). I bisected it down to the delayed
>> dequeue patch. Disabling DELAY_DEQUEUE and DELAY_ZERO at runtime doesn't help in bringing
>> the power usage down.
>
> Hmm, that is unexpected. The intent was for NO_DELAY_DEQUEUE to fully
> disable things. I'll go have a prod at it.
I'm running a few more numbers to confirm this situation with the feature switches.
>
>> Though I don't fully understand the reason behind this change in behavior yet, I did spot
>> the benchmark processes running almost entirely on the big core cluster, with little
>> to no use of the little core and mid core clusters.
>>
>> That would explain higher power usage and also the significant jump in performance.
>
> ISTR you (arm) have these tools to trace and plot the various util
> values. This should be readily reflected there if that is the case, no?
Indeed we do, but I'm still in the process of compiling those numbers into a meaningful plot,
so I'm afraid I don't have those handy yet, sorry.
>
>> I wonder if the delayed dequeue logic is having an unwanted effect on the calculation of
>> utilization/load of the runqueue and, as a consequence, we're scheduling things to run on
>> higher OPP's in the big cores, leading to poor decisions for energy efficiency.
>
> Notably util_est_update() gets delayed. Given we don't actually do an
> enqueue when a delayed task gets woken, it didn't seem to make sense to
> update that sooner.
>
> I'll go over all that again
On Thu, Apr 25, 2024 at 01:49:49PM +0200, Peter Zijlstra wrote:
> On Thu, Apr 25, 2024 at 12:42:20PM +0200, Peter Zijlstra wrote:
>
> > > I wonder if the delayed dequeue logic is having an unwanted effect on the calculation of
> > > utilization/load of the runqueue and, as a consequence, we're scheduling things to run on
> > > higher OPP's in the big cores, leading to poor decisions for energy efficiency.
> >
> > Notably util_est_update() gets delayed. Given we don't actually do an
> > enqueue when a delayed task gets woken, it didn't seem to make sense to
> > update that sooner.
>
> The PELT runnable values will be inflated because of delayed dequeue.
> cpu_util() uses those in the @boost case, and as such this can indeed
> affect things.
>
> This can also slightly affect the cgroup case, but since the delay goes
> away as contention goes away, and the cgroup case must already assume
> worst case overlap, this seems limited.
>
> /me goes ponder things moar.
First order approximation of a fix would be something like the totally
untested below I suppose...
---
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index cfd1fd188d29..f3f70b5adca0 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5391,6 +5391,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
if (cfs_rq->next == se)
cfs_rq->next = NULL;
se->sched_delayed = 1;
+ update_load_avg(cfs_rq, se, 0);
return false;
}
}
@@ -6817,6 +6818,7 @@ requeue_delayed_entity(struct sched_entity *se)
}
se->sched_delayed = 0;
+ update_load_avg(qcfs_rq, se, 0);
}
/*
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index d07a3b98f1fb..d16529613123 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -810,6 +810,9 @@ static inline void se_update_runnable(struct sched_entity *se)
static inline long se_runnable(struct sched_entity *se)
{
+ if (se->sched_delayed)
+ return false;
+
if (entity_is_task(se))
return !!se->on_rq;
else
@@ -823,6 +826,9 @@ static inline void se_update_runnable(struct sched_entity *se) {}
static inline long se_runnable(struct sched_entity *se)
{
+ if (se->sched_delayed)
+ return false;
+
return !!se->on_rq;
}
#endif
On Fri, Apr 26, 2024 at 12:56:07PM +0200, Peter Zijlstra wrote:
> On Thu, Apr 25, 2024 at 01:28:55PM +0200, Peter Zijlstra wrote:
> > On Thu, Apr 18, 2024 at 06:24:59PM +0200, Mike Galbraith wrote:
> > > The root cause seems to be doing the delay dequeue business on
> > > exiting tasks.
> >
> > > ---
> > > kernel/sched/fair.c | 5 +++--
> > > 1 file changed, 3 insertions(+), 2 deletions(-)
> > >
> > > --- a/kernel/sched/fair.c
> > > +++ b/kernel/sched/fair.c
> > > @@ -5374,6 +5374,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, st
> > > update_curr(cfs_rq);
> > >
> > > if (sched_feat(DELAY_DEQUEUE) && sleep &&
> > > + !(entity_is_task(se) && (task_of(se)->flags & PF_EXITING)) &&
> > > !entity_eligible(cfs_rq, se)) {
> > > if (cfs_rq->next == se)
> > > cfs_rq->next = NULL;
> >
> > So I think this can be easier done in dequeue_task_fair(), where we
> > still know this is a task.
> >
> > Perhaps something like (I'll test later):
> >
> > if (p->flags & PF_EXITING)
> > flags &= ~DEQUEUE_SLEEP;
> >
> > But now I need to go think about the case of removing a cgroup...
> > *urgh*.
>
> I ended up with the below instead; lemme go run this unixbench spawn on it.
Seems to survive that.
I pushed out the patches with updates to queue/sched/eevdf
On Thu, Apr 25, 2024 at 01:28:55PM +0200, Peter Zijlstra wrote:
> On Thu, Apr 18, 2024 at 06:24:59PM +0200, Mike Galbraith wrote:
> > The root cause seems to be doing the delay dequeue business on
> > exiting tasks.
>
> > ---
> > kernel/sched/fair.c | 5 +++--
> > 1 file changed, 3 insertions(+), 2 deletions(-)
> >
> > --- a/kernel/sched/fair.c
> > +++ b/kernel/sched/fair.c
> > @@ -5374,6 +5374,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, st
> > update_curr(cfs_rq);
> >
> > if (sched_feat(DELAY_DEQUEUE) && sleep &&
> > + !(entity_is_task(se) && (task_of(se)->flags & PF_EXITING)) &&
> > !entity_eligible(cfs_rq, se)) {
> > if (cfs_rq->next == se)
> > cfs_rq->next = NULL;
>
> So I think this can be easier done in dequeue_task_fair(), where we
> still know this is a task.
>
> Perhaps something like (I'll test later):
>
> if (p->flags & PF_EXITING)
> flags &= ~DEQUEUE_SLEEP;
>
> But now I need to go think about the case of removing a cgroup...
> *urgh*.
I ended up with the below instead; lemme go run this unixbench spawn on it.
---
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 95666034e76c..b5918fa9a0f0 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8429,7 +8431,20 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
static void task_dead_fair(struct task_struct *p)
{
- remove_entity_load_avg(&p->se);
+ struct sched_entity *se = &p->se;
+
+ if (p->se.sched_delayed) {
+ struct rq_flags rf;
+ struct rq *rq;
+
+ rq = task_rq_lock(p, &rf);
+ update_rq_clock(rq);
+ if (se->sched_delayed)
+ dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
+ task_rq_unlock(rq, p, &rf);
+ }
+
+ remove_entity_load_avg(se);
}
/*
@@ -13089,28 +13104,34 @@ void online_fair_sched_group(struct task_group *tg)
void unregister_fair_sched_group(struct task_group *tg)
{
- unsigned long flags;
- struct rq *rq;
int cpu;
destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
for_each_possible_cpu(cpu) {
- if (tg->se[cpu])
- remove_entity_load_avg(tg->se[cpu]);
+ struct cfs_rq *cfs_rq = tg->cfs_rq[cpu];
+ struct sched_entity *se = tg->se[cpu];
+ struct rq *rq = cpu_rq(cpu);
+
+ if (se) {
+ if (se->sched_delayed) {
+ guard(rq_lock_irqsave)(rq);
+ update_rq_clock(rq);
+ if (se->sched_delayed)
+ dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
+ list_del_leaf_cfs_rq(cfs_rq);
+ }
+ remove_entity_load_avg(se);
+ }
/*
* Only empty task groups can be destroyed; so we can speculatively
* check on_list without danger of it being re-added.
*/
- if (!tg->cfs_rq[cpu]->on_list)
- continue;
-
- rq = cpu_rq(cpu);
-
- raw_spin_rq_lock_irqsave(rq, flags);
- list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
- raw_spin_rq_unlock_irqrestore(rq, flags);
+ if (cfs_rq->on_list) {
+ guard(rq_lock_irqsave)(rq);
+ list_del_leaf_cfs_rq(cfs_rq);
+ }
}
}
On Fri, 2024-04-26 at 13:16 +0200, Peter Zijlstra wrote:
> >
> > I ended up with the below instead; lemme go run this unixbench
> > spawn on it.
>
> Seems to survive that.
>
> I pushed out the patches with updates to queue/sched/eevdf
Yup, solid... but fwiw, tbench liked the previous version better.
trusty (with a 'c') ole i7-4790 box, tbench 8
for i in 1 2 3; do tbench.sh 8 10 2>&1|grep Throughput; done
6.9.0.gc942a0c-master +eevdf.current
NO_DELAY_DEQUEUE
Throughput 3285.04 MB/sec 8 clients 8 procs max_latency=3.481 ms
Throughput 3289.66 MB/sec 8 clients 8 procs max_latency=8.124 ms
Throughput 3293.83 MB/sec 8 clients 8 procs max_latency=2.210 ms
DELAY_DEQUEUE
Throughput 3246.3 MB/sec 8 clients 8 procs max_latency=2.181 ms
Throughput 3236.96 MB/sec 8 clients 8 procs max_latency=6.988 ms
Throughput 3248.6 MB/sec 8 clients 8 procs max_latency=2.130 ms
6.9.0.gc942a0c-master +eevdf.prev
NO_DELAY_DEQUEUE
Throughput 3457.92 MB/sec 8 clients 8 procs max_latency=3.885 ms
Throughput 3470.95 MB/sec 8 clients 8 procs max_latency=4.475 ms
Throughput 3467.87 MB/sec 8 clients 8 procs max_latency=2.182 ms
DELAY_DEQUEUE
Throughput 3712.96 MB/sec 8 clients 8 procs max_latency=4.231 ms
Throughput 3667.87 MB/sec 8 clients 8 procs max_latency=5.020 ms
Throughput 3679.65 MB/sec 8 clients 8 procs max_latency=2.847 ms
Trees are identical modulo extracted eevdf additions. The previous win
that put eevdf on par with cfs went missing.. and then some.
For reference, cfs vs eevdf log extract for previously mentioned gain.
6.1.87-cfs
Throughput 3660.98 MB/sec 8 clients 8 procs max_latency=2.204 ms
Throughput 3678.67 MB/sec 8 clients 8 procs max_latency=10.127 ms
Throughput 3631.89 MB/sec 8 clients 8 procs max_latency=13.019 ms
1.000
6.1.87-eevdf - naked eevdf +fixes
Throughput 3441.86 MB/sec 8 clients 8 procs max_latency=3.943 ms
Throughput 3439.68 MB/sec 8 clients 8 procs max_latency=4.285 ms
Throughput 3432.28 MB/sec 8 clients 8 procs max_latency=3.557 ms
vs cfs .940
6.1.87-eevdf +delay_dequeue.prev patch set
DELAY_DEQUEUE
Throughput 3696.94 MB/sec 8 clients 8 procs max_latency=2.179 ms
Throughput 3694.64 MB/sec 8 clients 8 procs max_latency=6.322 ms
Throughput 3654.49 MB/sec 8 clients 8 procs max_latency=4.101 ms
vs cfs 1.006
box waxes nostalgic (son, when I was yo age [flex])
4.19.312
Throughput 4099.07 MB/sec 8 clients 8 procs max_latency=2.169 ms
Throughput 4107.49 MB/sec 8 clients 8 procs max_latency=12.404 ms
Throughput 4118.41 MB/sec 8 clients 8 procs max_latency=14.150 ms
-Mike
On Fri, 2024-04-26 at 18:03 +0200, Mike Galbraith wrote:
> fwiw, tbench liked the previous version better.
The culprit lies somewhere in the new PREEMPT_SHORT patch.
-Mike
On Sat, 2024-04-27 at 08:42 +0200, Mike Galbraith wrote:
> On Fri, 2024-04-26 at 18:03 +0200, Mike Galbraith wrote:
> > fwiw, tbench liked the previous version better.
>
> The culprit lies somewhere in the new PREEMPT_SHORT patch.
<squint>
Ah, moving the curr eligibility check into pick_curr() left an
ineligible curr to be seen/used further down in pick_eevdf().
found:
if (!best || (curr && entity_before(curr, best)))
best = curr;
Nit: PREEMPT_SHORT depending on RUN_TO_PARITY looks odd.
-Mike
On Sun, Apr 28, 2024 at 06:32:49PM +0200, Mike Galbraith wrote:
> On Sat, 2024-04-27 at 08:42 +0200, Mike Galbraith wrote:
> > On Fri, 2024-04-26 at 18:03 +0200, Mike Galbraith wrote:
> > > fwiw, tbench liked the previous version better.
> >
> > The culprit lies somewhere in the new PREEMPT_SHORT patch.
>
> <squint>
>
> Ah, moving the curr eligibility check into pick_curr() left an
> ineligible curr to be seen/used further down in pick_eevdf().
>
> found:
> if (!best || (curr && entity_before(curr, best)))
> best = curr;
Hmm yes, over-aggressive cleanup, that. Let me try again.
> Nit: PREEMPT_SHORT depending on RUN_TO_PARITY looks odd.
The thinking was that without RUN_TO_PARITY we'll always do a full pick
and it will always pick a (new) shorter deadline task over current.
The PREEMPT_SHORT thing really is an exception to keep RUN_TO_PARITY
from ruining the latency game for short tasks.
Hi Peter,
On 4/26/24 10:32, Peter Zijlstra wrote:
> On Thu, Apr 25, 2024 at 01:49:49PM +0200, Peter Zijlstra wrote:
>> On Thu, Apr 25, 2024 at 12:42:20PM +0200, Peter Zijlstra wrote:
>>
>>>> I wonder if the delayed dequeue logic is having an unwanted effect on the calculation of
>>>> utilization/load of the runqueue and, as a consequence, we're scheduling things to run on
>>>> higher OPP's in the big cores, leading to poor decisions for energy efficiency.
>>>
>>> Notably util_est_update() gets delayed. Given we don't actually do an
>>> enqueue when a delayed task gets woken, it didn't seem to make sense to
>>> update that sooner.
>>
>> The PELT runnable values will be inflated because of delayed dequeue.
>> cpu_util() uses those in the @boost case, and as such this can indeed
>> affect things.
>>
>> This can also slightly affect the cgroup case, but since the delay goes
>> away as contention goes away, and the cgroup case must already assume
>> worst case overlap, this seems limited.
>>
>> /me goes ponder things moar.
>
> First order approximation of a fix would be something like the totally
> untested below I suppose...
I gave this a try on the Pixel 6, and I noticed some improvement (see below), but not
enough to bring it back to the original levels.
(1) m6.6-stock - Basic EEVDF with wakeup preemption fix (baseline)
(2) m6.6-eevdf-complete: m6.6-stock plus this series.
(3) m6.6-eevdf-complete-no-delay-dequeue: (2) + NO_DELAY_DEQUEUE
(4) m6.6-eevdf-complete-no-delay-dequeue-no-delay-zero: (2) + NO_DELAY_DEQUEUE + NO_DELAY_ZERO
(5) m6.6-eevdf-complete-no-delay-zero: (2) + NO_DELAY_ZERO
(6) m6.6-eevdf-complete-pelt-fix: (2) + the proposed load_avg update patch.
I included (3), (4) and (5) to exercise the impact of disabling the individual
scheduler features.
Energy use.
+------------+------------------------------------------------------+-----------+
| cluster | tag | perc_diff |
+------------+------------------------------------------------------+-----------+
| CPU | m6.6-stock | 0.0% |
| CPU-Big | m6.6-stock | 0.0% |
| CPU-Little | m6.6-stock | 0.0% |
| CPU-Mid | m6.6-stock | 0.0% |
| GPU | m6.6-stock | 0.0% |
| Total | m6.6-stock | 0.0% |
| CPU | m6.6-eevdf-complete | 114.51% |
| CPU-Big | m6.6-eevdf-complete | 90.75% |
| CPU-Little | m6.6-eevdf-complete | 98.74% |
| CPU-Mid | m6.6-eevdf-complete | 213.9% |
| GPU | m6.6-eevdf-complete | -7.04% |
| Total | m6.6-eevdf-complete | 100.92% |
| CPU | m6.6-eevdf-complete-no-delay-dequeue | 117.77% |
| CPU-Big | m6.6-eevdf-complete-no-delay-dequeue | 113.79% |
| CPU-Little | m6.6-eevdf-complete-no-delay-dequeue | 97.47% |
| CPU-Mid | m6.6-eevdf-complete-no-delay-dequeue | 189.0% |
| GPU | m6.6-eevdf-complete-no-delay-dequeue | -6.74% |
| Total | m6.6-eevdf-complete-no-delay-dequeue | 103.84% |
| CPU | m6.6-eevdf-complete-no-delay-dequeue-no-delay-zero | 120.45% |
| CPU-Big | m6.6-eevdf-complete-no-delay-dequeue-no-delay-zero | 113.65% |
| CPU-Little | m6.6-eevdf-complete-no-delay-dequeue-no-delay-zero | 99.04% |
| CPU-Mid | m6.6-eevdf-complete-no-delay-dequeue-no-delay-zero | 201.14% |
| GPU | m6.6-eevdf-complete-no-delay-dequeue-no-delay-zero | -5.37% |
| Total | m6.6-eevdf-complete-no-delay-dequeue-no-delay-zero | 106.38% |
| CPU | m6.6-eevdf-complete-no-delay-zero | 119.05% |
| CPU-Big | m6.6-eevdf-complete-no-delay-zero | 107.55% |
| CPU-Little | m6.6-eevdf-complete-no-delay-zero | 98.66% |
| CPU-Mid | m6.6-eevdf-complete-no-delay-zero | 206.58% |
| GPU | m6.6-eevdf-complete-no-delay-zero | -5.25% |
| Total | m6.6-eevdf-complete-no-delay-zero | 105.14% |
| CPU | m6.6-eevdf-complete-pelt-fix | 105.56% |
| CPU-Big | m6.6-eevdf-complete-pelt-fix | 100.45% |
| CPU-Little | m6.6-eevdf-complete-pelt-fix | 94.4% |
| CPU-Mid | m6.6-eevdf-complete-pelt-fix | 150.94% |
| GPU | m6.6-eevdf-complete-pelt-fix | -3.96% |
| Total | m6.6-eevdf-complete-pelt-fix | 93.31% |
+------------+------------------------------------------------------+-----------+
Utilization and load levels.
+---------+------------------------------------------------------+----------+-----------+
| cluster | tag | variable | perc_diff |
+---------+------------------------------------------------------+----------+-----------+
| little | m6.6-stock | load | 0.0% |
| little | m6.6-stock | util | 0.0% |
| little | m6.6-eevdf-complete | load | 29.56% |
| little | m6.6-eevdf-complete | util | 55.4% |
| little | m6.6-eevdf-complete-no-delay-dequeue | load | 42.89% |
| little | m6.6-eevdf-complete-no-delay-dequeue | util | 69.47% |
| little | m6.6-eevdf-complete-no-delay-dequeue-no-delay-zero | load | 51.05% |
| little | m6.6-eevdf-complete-no-delay-dequeue-no-delay-zero | util | 76.55% |
| little | m6.6-eevdf-complete-no-delay-zero | load | 34.51% |
| little | m6.6-eevdf-complete-no-delay-zero | util | 72.53% |
| little | m6.6-eevdf-complete-pelt-fix | load | 29.96% |
| little | m6.6-eevdf-complete-pelt-fix | util | 59.82% |
| mid | m6.6-stock | load | 0.0% |
| mid | m6.6-stock | util | 0.0% |
| mid | m6.6-eevdf-complete | load | 29.37% |
| mid | m6.6-eevdf-complete | util | 75.22% |
| mid | m6.6-eevdf-complete-no-delay-dequeue | load | 36.4% |
| mid | m6.6-eevdf-complete-no-delay-dequeue | util | 80.28% |
| mid | m6.6-eevdf-complete-no-delay-dequeue-no-delay-zero | load | 30.35% |
| mid | m6.6-eevdf-complete-no-delay-dequeue-no-delay-zero | util | 90.2% |
| mid | m6.6-eevdf-complete-no-delay-zero | load | 37.83% |
| mid | m6.6-eevdf-complete-no-delay-zero | util | 93.79% |
| mid | m6.6-eevdf-complete-pelt-fix | load | 33.57% |
| mid | m6.6-eevdf-complete-pelt-fix | util | 67.83% |
| big | m6.6-stock | load | 0.0% |
| big | m6.6-stock | util | 0.0% |
| big | m6.6-eevdf-complete | load | 97.39% |
| big | m6.6-eevdf-complete | util | 12.63% |
| big | m6.6-eevdf-complete-no-delay-dequeue | load | 139.69% |
| big | m6.6-eevdf-complete-no-delay-dequeue | util | 22.58% |
| big | m6.6-eevdf-complete-no-delay-dequeue-no-delay-zero | load | 125.36% |
| big | m6.6-eevdf-complete-no-delay-dequeue-no-delay-zero | util | 23.15% |
| big | m6.6-eevdf-complete-no-delay-zero | load | 128.56% |
| big | m6.6-eevdf-complete-no-delay-zero | util | 25.03% |
| big | m6.6-eevdf-complete-pelt-fix | load | 130.73% |
| big | m6.6-eevdf-complete-pelt-fix | util | 17.52% |
+---------+------------------------------------------------------+----------+-----------+
On 4/29/24 15:33, Luis Machado wrote:
> Hi Peter,
>
> On 4/26/24 10:32, Peter Zijlstra wrote:
>> On Thu, Apr 25, 2024 at 01:49:49PM +0200, Peter Zijlstra wrote:
>>> On Thu, Apr 25, 2024 at 12:42:20PM +0200, Peter Zijlstra wrote:
>>>
>>>>> I wonder if the delayed dequeue logic is having an unwanted effect on the calculation of
>>>>> utilization/load of the runqueue and, as a consequence, we're scheduling things to run on
>>>>> higher OPP's in the big cores, leading to poor decisions for energy efficiency.
>>>>
>>>> Notably util_est_update() gets delayed. Given we don't actually do an
>>>> enqueue when a delayed task gets woken, it didn't seem to make sense to
>>>> update that sooner.
>>>
>>> The PELT runnable values will be inflated because of delayed dequeue.
>>> cpu_util() uses those in the @boost case, and as such this can indeed
>>> affect things.
>>>
>>> This can also slightly affect the cgroup case, but since the delay goes
>>> away as contention goes away, and the cgroup case must already assume
>>> worst case overlap, this seems limited.
>>>
>>> /me goes ponder things moar.
>>
>> First order approximation of a fix would be something like the totally
>> untested below I suppose...
>
> I gave this a try on the Pixel 6, and I noticed some improvement (see below), but not
> enough to bring it back to the original levels.
>
> (1) m6.6-stock - Basic EEVDF with wakeup preemption fix (baseline)
> (2) m6.6-eevdf-complete: m6.6-stock plus this series.
> (3) m6.6-eevdf-complete-no-delay-dequeue: (2) + NO_DELAY_DEQUEUE
> (4) m6.6-eevdf-complete-no-delay-dequeue-no-delay-zero: (2) + NO_DELAY_DEQUEUE + NO_DELAY_ZERO
> (5) m6.6-eevdf-complete-no-delay-zero: (2) + NO_DELAY_ZERO
> (6) m6.6-eevdf-complete-pelt-fix: (2) + the proposed load_avg update patch.
>
> I included (3), (4) and (5) to exercise the impact of disabling the individual
> scheduler features.
>
>
> Energy use.
>
> +------------+------------------------------------------------------+-----------+
> | cluster | tag | perc_diff |
> +------------+------------------------------------------------------+-----------+
> | CPU | m6.6-stock | 0.0% |
> | CPU-Big | m6.6-stock | 0.0% |
> | CPU-Little | m6.6-stock | 0.0% |
> | CPU-Mid | m6.6-stock | 0.0% |
> | GPU | m6.6-stock | 0.0% |
> | Total | m6.6-stock | 0.0% |
> | CPU | m6.6-eevdf-complete | 114.51% |
> | CPU-Big | m6.6-eevdf-complete | 90.75% |
> | CPU-Little | m6.6-eevdf-complete | 98.74% |
> | CPU-Mid | m6.6-eevdf-complete | 213.9% |
> | GPU | m6.6-eevdf-complete | -7.04% |
> | Total | m6.6-eevdf-complete | 100.92% |
> | CPU | m6.6-eevdf-complete-no-delay-dequeue | 117.77% |
> | CPU-Big | m6.6-eevdf-complete-no-delay-dequeue | 113.79% |
> | CPU-Little | m6.6-eevdf-complete-no-delay-dequeue | 97.47% |
> | CPU-Mid | m6.6-eevdf-complete-no-delay-dequeue | 189.0% |
> | GPU | m6.6-eevdf-complete-no-delay-dequeue | -6.74% |
> | Total | m6.6-eevdf-complete-no-delay-dequeue | 103.84% |
> | CPU | m6.6-eevdf-complete-no-delay-dequeue-no-delay-zero | 120.45% |
> | CPU-Big | m6.6-eevdf-complete-no-delay-dequeue-no-delay-zero | 113.65% |
> | CPU-Little | m6.6-eevdf-complete-no-delay-dequeue-no-delay-zero | 99.04% |
> | CPU-Mid | m6.6-eevdf-complete-no-delay-dequeue-no-delay-zero | 201.14% |
> | GPU | m6.6-eevdf-complete-no-delay-dequeue-no-delay-zero | -5.37% |
> | Total | m6.6-eevdf-complete-no-delay-dequeue-no-delay-zero | 106.38% |
> | CPU | m6.6-eevdf-complete-no-delay-zero | 119.05% |
> | CPU-Big | m6.6-eevdf-complete-no-delay-zero | 107.55% |
> | CPU-Little | m6.6-eevdf-complete-no-delay-zero | 98.66% |
> | CPU-Mid | m6.6-eevdf-complete-no-delay-zero | 206.58% |
> | GPU | m6.6-eevdf-complete-no-delay-zero | -5.25% |
> | Total | m6.6-eevdf-complete-no-delay-zero | 105.14% |
> | CPU | m6.6-eevdf-complete-pelt-fix | 105.56% |
> | CPU-Big | m6.6-eevdf-complete-pelt-fix | 100.45% |
> | CPU-Little | m6.6-eevdf-complete-pelt-fix | 94.4% |
> | CPU-Mid | m6.6-eevdf-complete-pelt-fix | 150.94% |
> | GPU | m6.6-eevdf-complete-pelt-fix | -3.96% |
> | Total | m6.6-eevdf-complete-pelt-fix | 93.31% |
> +------------+------------------------------------------------------+-----------+
>
> Utilization and load levels.
>
> +---------+------------------------------------------------------+----------+-----------+
> | cluster | tag | variable | perc_diff |
> +---------+------------------------------------------------------+----------+-----------+
> | little | m6.6-stock | load | 0.0% |
> | little | m6.6-stock | util | 0.0% |
> | little | m6.6-eevdf-complete | load | 29.56% |
> | little | m6.6-eevdf-complete | util | 55.4% |
> | little | m6.6-eevdf-complete-no-delay-dequeue | load | 42.89% |
> | little | m6.6-eevdf-complete-no-delay-dequeue | util | 69.47% |
> | little | m6.6-eevdf-complete-no-delay-dequeue-no-delay-zero | load | 51.05% |
> | little | m6.6-eevdf-complete-no-delay-dequeue-no-delay-zero | util | 76.55% |
> | little | m6.6-eevdf-complete-no-delay-zero | load | 34.51% |
> | little | m6.6-eevdf-complete-no-delay-zero | util | 72.53% |
> | little | m6.6-eevdf-complete-pelt-fix | load | 29.96% |
> | little | m6.6-eevdf-complete-pelt-fix | util | 59.82% |
> | mid | m6.6-stock | load | 0.0% |
> | mid | m6.6-stock | util | 0.0% |
> | mid | m6.6-eevdf-complete | load | 29.37% |
> | mid | m6.6-eevdf-complete | util | 75.22% |
> | mid | m6.6-eevdf-complete-no-delay-dequeue | load | 36.4% |
> | mid | m6.6-eevdf-complete-no-delay-dequeue | util | 80.28% |
> | mid | m6.6-eevdf-complete-no-delay-dequeue-no-delay-zero | load | 30.35% |
> | mid | m6.6-eevdf-complete-no-delay-dequeue-no-delay-zero | util | 90.2% |
> | mid | m6.6-eevdf-complete-no-delay-zero | load | 37.83% |
> | mid | m6.6-eevdf-complete-no-delay-zero | util | 93.79% |
> | mid | m6.6-eevdf-complete-pelt-fix | load | 33.57% |
> | mid | m6.6-eevdf-complete-pelt-fix | util | 67.83% |
> | big | m6.6-stock | load | 0.0% |
> | big | m6.6-stock | util | 0.0% |
> | big | m6.6-eevdf-complete | load | 97.39% |
> | big | m6.6-eevdf-complete | util | 12.63% |
> | big | m6.6-eevdf-complete-no-delay-dequeue | load | 139.69% |
> | big | m6.6-eevdf-complete-no-delay-dequeue | util | 22.58% |
> | big | m6.6-eevdf-complete-no-delay-dequeue-no-delay-zero | load | 125.36% |
> | big | m6.6-eevdf-complete-no-delay-dequeue-no-delay-zero | util | 23.15% |
> | big | m6.6-eevdf-complete-no-delay-zero | load | 128.56% |
> | big | m6.6-eevdf-complete-no-delay-zero | util | 25.03% |
> | big | m6.6-eevdf-complete-pelt-fix | load | 130.73% |
> | big | m6.6-eevdf-complete-pelt-fix | util | 17.52% |
> +---------+------------------------------------------------------+----------+-----------+
Going through the code, my understanding is that the util_est functions seem to be getting
called correctly, and in the right order. That is, we first util_est_enqueue, then util_est_dequeue
and finally util_est_update. So the stats *should* be correct.
On dequeuing (dequeue_task_fair), we immediately call util_est_dequeue, even for the case of
a DEQUEUE_DELAYED task, since we're no longer going to run the dequeue_delayed task for now, even
though it is still in the rq.
We delay the util_est_update of dequeue_delayed tasks until a later time in dequeue_entities.
Eventually the dequeue_delayed task will have its lag zeroed when it becomes eligible again,
(requeue_delayed_entity) while still being in the rq. It will then get dequeued/enqueued (requeued),
and marked as a non-dequeue-delayed task.
Next time we attempt to enqueue such a task (enqueue_task_fair), it will skip the ENQUEUE_DELAYED
block and call util_est_enqueue.
Still, something seems to be signalling that util/load is high, and causing migration to the big cores.
Maybe we're not decaying the util/load properly at some point, and inflated numbers start to happen.
I'll continue investigating.
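To make the ordering above concrete, here is a tiny userspace model of the bookkeeping (purely illustrative -- not the kernel code; the struct layout, the EWMA weight and the numbers are all made up):

/* Toy model: the rq-level estimate is a running sum that util_est_enqueue()
 * adds to and util_est_dequeue() subtracts from immediately, while the
 * per-task estimate is only refreshed (util_est_update) once the delayed
 * task is really taken off the rq.
 */
#include <stdio.h>

struct task { unsigned int util_est; int delayed; };
struct rq   { unsigned int util_est_sum; };

static void util_est_enqueue(struct rq *rq, struct task *p)
{
        rq->util_est_sum += p->util_est;
}

static void util_est_dequeue(struct rq *rq, struct task *p)
{
        rq->util_est_sum -= p->util_est;
}

static void util_est_update(struct task *p, unsigned int sample)
{
        /* stand-in for the EWMA update done at the real dequeue */
        p->util_est = (3 * p->util_est + sample) / 4;
}

int main(void)
{
        struct rq rq = { 0 };
        struct task p = { .util_est = 200, .delayed = 0 };

        util_est_enqueue(&rq, &p);      /* task wakes up                       */
        util_est_dequeue(&rq, &p);      /* task "sleeps" ...                   */
        p.delayed = 1;                  /* ... but the dequeue is delayed      */
        /* no util_est_update() here: the task is still on the rq              */

        p.delayed = 0;                  /* real dequeue happens later          */
        util_est_update(&p, 120);       /* EWMA now sees the later, lower util */

        printf("rq sum=%u, task est=%u\n", rq.util_est_sum, p.util_est);
        return 0;
}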
On 5/2/24 11:26, Luis Machado wrote:
> On 4/29/24 15:33, Luis Machado wrote:
>> Hi Peter,
>>
>> On 4/26/24 10:32, Peter Zijlstra wrote:
>>> On Thu, Apr 25, 2024 at 01:49:49PM +0200, Peter Zijlstra wrote:
>>>> On Thu, Apr 25, 2024 at 12:42:20PM +0200, Peter Zijlstra wrote:
>>>>
>>>>>> I wonder if the delayed dequeue logic is having an unwanted effect on the calculation of
>>>>>> utilization/load of the runqueue and, as a consequence, we're scheduling things to run on
>>>>>> higher OPP's in the big cores, leading to poor decisions for energy efficiency.
>>>>>
>>>>> Notably util_est_update() gets delayed. Given we don't actually do an
>>>>> enqueue when a delayed task gets woken, it didn't seem to make sense to
>>>>> update that sooner.
>>>>
>>>> The PELT runnable values will be inflated because of delayed dequeue.
>>>> cpu_util() uses those in the @boost case, and as such this can indeed
>>>> affect things.
>>>>
>>>> This can also slightly affect the cgroup case, but since the delay goes
>>>> away as contention goes away, and the cgroup case must already assume
>>>> worst case overlap, this seems limited.
>>>>
>>>> /me goes ponder things moar.
>>>
>>> First order approximation of a fix would be something like the totally
>>> untested below I suppose...
>>
>> I gave this a try on the Pixel 6, and I noticed some improvement (see below), but not
>> enough to bring it back to the original levels.
>>
>> (1) m6.6-stock - Basic EEVDF with wakeup preemption fix (baseline)
>> (2) m6.6-eevdf-complete: m6.6-stock plus this series.
>> (3) m6.6-eevdf-complete-no-delay-dequeue: (2) + NO_DELAY_DEQUEUE
>> (4) m6.6-eevdf-complete-no-delay-dequeue-no-delay-zero: (2) + NO_DELAY_DEQUEUE + NO_DELAY_ZERO
>> (5) m6.6-eevdf-complete-no-delay-zero: (2) + NO_DELAY_ZERO
>> (6) m6.6-eevdf-complete-pelt-fix: (2) + the proposed load_avg update patch.
>>
>> I included (3), (4) and (5) to exercise the impact of disabling the individual
>> scheduler features.
>>
>>
>> Energy use.
>>
>> +------------+------------------------------------------------------+-----------+
>> | cluster | tag | perc_diff |
>> +------------+------------------------------------------------------+-----------+
>> | CPU | m6.6-stock | 0.0% |
>> | CPU-Big | m6.6-stock | 0.0% |
>> | CPU-Little | m6.6-stock | 0.0% |
>> | CPU-Mid | m6.6-stock | 0.0% |
>> | GPU | m6.6-stock | 0.0% |
>> | Total | m6.6-stock | 0.0% |
>> | CPU | m6.6-eevdf-complete | 114.51% |
>> | CPU-Big | m6.6-eevdf-complete | 90.75% |
>> | CPU-Little | m6.6-eevdf-complete | 98.74% |
>> | CPU-Mid | m6.6-eevdf-complete | 213.9% |
>> | GPU | m6.6-eevdf-complete | -7.04% |
>> | Total | m6.6-eevdf-complete | 100.92% |
>> | CPU | m6.6-eevdf-complete-no-delay-dequeue | 117.77% |
>> | CPU-Big | m6.6-eevdf-complete-no-delay-dequeue | 113.79% |
>> | CPU-Little | m6.6-eevdf-complete-no-delay-dequeue | 97.47% |
>> | CPU-Mid | m6.6-eevdf-complete-no-delay-dequeue | 189.0% |
>> | GPU | m6.6-eevdf-complete-no-delay-dequeue | -6.74% |
>> | Total | m6.6-eevdf-complete-no-delay-dequeue | 103.84% |
>> | CPU | m6.6-eevdf-complete-no-delay-dequeue-no-delay-zero | 120.45% |
>> | CPU-Big | m6.6-eevdf-complete-no-delay-dequeue-no-delay-zero | 113.65% |
>> | CPU-Little | m6.6-eevdf-complete-no-delay-dequeue-no-delay-zero | 99.04% |
>> | CPU-Mid | m6.6-eevdf-complete-no-delay-dequeue-no-delay-zero | 201.14% |
>> | GPU | m6.6-eevdf-complete-no-delay-dequeue-no-delay-zero | -5.37% |
>> | Total | m6.6-eevdf-complete-no-delay-dequeue-no-delay-zero | 106.38% |
>> | CPU | m6.6-eevdf-complete-no-delay-zero | 119.05% |
>> | CPU-Big | m6.6-eevdf-complete-no-delay-zero | 107.55% |
>> | CPU-Little | m6.6-eevdf-complete-no-delay-zero | 98.66% |
>> | CPU-Mid | m6.6-eevdf-complete-no-delay-zero | 206.58% |
>> | GPU | m6.6-eevdf-complete-no-delay-zero | -5.25% |
>> | Total | m6.6-eevdf-complete-no-delay-zero | 105.14% |
>> | CPU | m6.6-eevdf-complete-pelt-fix | 105.56% |
>> | CPU-Big | m6.6-eevdf-complete-pelt-fix | 100.45% |
>> | CPU-Little | m6.6-eevdf-complete-pelt-fix | 94.4% |
>> | CPU-Mid | m6.6-eevdf-complete-pelt-fix | 150.94% |
>> | GPU | m6.6-eevdf-complete-pelt-fix | -3.96% |
>> | Total | m6.6-eevdf-complete-pelt-fix | 93.31% |
>> +------------+------------------------------------------------------+-----------+
>>
>> Utilization and load levels.
>>
>> +---------+------------------------------------------------------+----------+-----------+
>> | cluster | tag | variable | perc_diff |
>> +---------+------------------------------------------------------+----------+-----------+
>> | little | m6.6-stock | load | 0.0% |
>> | little | m6.6-stock | util | 0.0% |
>> | little | m6.6-eevdf-complete | load | 29.56% |
>> | little | m6.6-eevdf-complete | util | 55.4% |
>> | little | m6.6-eevdf-complete-no-delay-dequeue | load | 42.89% |
>> | little | m6.6-eevdf-complete-no-delay-dequeue | util | 69.47% |
>> | little | m6.6-eevdf-complete-no-delay-dequeue-no-delay-zero | load | 51.05% |
>> | little | m6.6-eevdf-complete-no-delay-dequeue-no-delay-zero | util | 76.55% |
>> | little | m6.6-eevdf-complete-no-delay-zero | load | 34.51% |
>> | little | m6.6-eevdf-complete-no-delay-zero | util | 72.53% |
>> | little | m6.6-eevdf-complete-pelt-fix | load | 29.96% |
>> | little | m6.6-eevdf-complete-pelt-fix | util | 59.82% |
>> | mid | m6.6-stock | load | 0.0% |
>> | mid | m6.6-stock | util | 0.0% |
>> | mid | m6.6-eevdf-complete | load | 29.37% |
>> | mid | m6.6-eevdf-complete | util | 75.22% |
>> | mid | m6.6-eevdf-complete-no-delay-dequeue | load | 36.4% |
>> | mid | m6.6-eevdf-complete-no-delay-dequeue | util | 80.28% |
>> | mid | m6.6-eevdf-complete-no-delay-dequeue-no-delay-zero | load | 30.35% |
>> | mid | m6.6-eevdf-complete-no-delay-dequeue-no-delay-zero | util | 90.2% |
>> | mid | m6.6-eevdf-complete-no-delay-zero | load | 37.83% |
>> | mid | m6.6-eevdf-complete-no-delay-zero | util | 93.79% |
>> | mid | m6.6-eevdf-complete-pelt-fix | load | 33.57% |
>> | mid | m6.6-eevdf-complete-pelt-fix | util | 67.83% |
>> | big | m6.6-stock | load | 0.0% |
>> | big | m6.6-stock | util | 0.0% |
>> | big | m6.6-eevdf-complete | load | 97.39% |
>> | big | m6.6-eevdf-complete | util | 12.63% |
>> | big | m6.6-eevdf-complete-no-delay-dequeue | load | 139.69% |
>> | big | m6.6-eevdf-complete-no-delay-dequeue | util | 22.58% |
>> | big | m6.6-eevdf-complete-no-delay-dequeue-no-delay-zero | load | 125.36% |
>> | big | m6.6-eevdf-complete-no-delay-dequeue-no-delay-zero | util | 23.15% |
>> | big | m6.6-eevdf-complete-no-delay-zero | load | 128.56% |
>> | big | m6.6-eevdf-complete-no-delay-zero | util | 25.03% |
>> | big | m6.6-eevdf-complete-pelt-fix | load | 130.73% |
>> | big | m6.6-eevdf-complete-pelt-fix | util | 17.52% |
>> +---------+------------------------------------------------------+----------+-----------+
>
> Going through the code, my understanding is that the util_est functions seem to be getting
> called correctly, and in the right order. That is, we first util_est_enqueue, then util_est_dequeue
> and finally util_est_update. So the stats *should* be correct.
>
> On dequeuing (dequeue_task_fair), we immediately call util_est_dequeue, even for the case of
> a DEQUEUE_DELAYED task, since we're no longer going to run the dequeue_delayed task for now, even
> though it is still in the rq.
>
> We delay the util_est_update of dequeue_delayed tasks until a later time in dequeue_entities.
>
> Eventually the dequeue_delayed task will have its lag zeroed when it becomes eligible again,
> (requeue_delayed_entity) while still being in the rq. It will then get dequeued/enqueued (requeued),
> and marked as a non-dequeue-delayed task.
>
> Next time we attempt to enqueue such a task (enqueue_task_fair), it will skip the ENQUEUE_DELAYED
> block and call util_est_enqueue.
>
> Still, something seems to be signalling that util/load is high, and causing migration to the big cores.
>
> Maybe we're not decaying the util/load properly at some point, and inflated numbers start to happen.
>
> I'll continue investigating.
>
Just a quick update on this. While investigating this behavior, I
spotted very high loadavg values on an idle system. For instance:
load average: 4733.84, 4721.24, 4680.33
I wonder if someone else also spotted this.
These values keep increasing slowly but steadily. When the system
is under load they increase a bit more rapidly. Makes me wonder
if we're missing a nr_uninterruptible decrement in some path,
since that is what seems to be causing the loadavg to be off.
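To see why a missed decrement would show up exactly like this, here is a small standalone model of the 1-minute load average (the fixed-point constants are what I recall from kernel/sched/loadavg.c; the leak rate is invented):

/* Toy model only -- not kernel code. A leaked rq->nr_uninterruptible count
 * keeps feeding the load average even though nothing is runnable, so the
 * reported loadavg ratchets up on an otherwise idle system.
 */
#include <stdio.h>

#define FSHIFT  11
#define FIXED_1 (1 << FSHIFT)           /* 1.0 in fixed point            */
#define EXP_1   1884                    /* 1/exp(5s/1min) in fixed point */

static unsigned long calc_load(unsigned long load, unsigned long exp,
                               unsigned long active)
{
        unsigned long newload = load * exp + active * (FIXED_1 - exp);

        if (active >= load)
                newload += FIXED_1 - 1;
        return newload / FIXED_1;
}

int main(void)
{
        unsigned long avenrun = 0;
        long nr_running = 0;            /* idle system                        */
        long nr_uninterruptible = 0;    /* grows by 1 per "missed" decrement  */

        for (int tick = 0; tick < 720; tick++) {        /* ~1h of 5s samples  */
                if (tick % 2 == 0)
                        nr_uninterruptible++;           /* the suspected leak */

                long active = nr_running + nr_uninterruptible;
                avenrun = calc_load(avenrun, EXP_1, active * FIXED_1);

                if (tick % 120 == 0)
                        printf("t=%4ds load=%lu.%02lu\n", tick * 5,
                               avenrun >> FSHIFT,
                               ((avenrun & (FIXED_1 - 1)) * 100) >> FSHIFT);
        }
        return 0;
}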
On Wed, May 15, 2024 at 11:36:49AM +0200, Peter Zijlstra wrote:
> On Fri, May 10, 2024 at 03:49:46PM +0100, Luis Machado wrote:
> > Just a quick update on this. While investigating this behavior, I
> > spotted very high loadavg values on an idle system. For instance:
> >
> > load average: 4733.84, 4721.24, 4680.33
> >
> > I wonder if someone else also spotted this.
>
> Hadn't spotted it, but now that you mention it, I can definitely see it.
>
> Let me go prod with something sharp. Thanks!
What's the point of making notes if you then don't read them... *sigh*.
Does this help?
---
kernel/sched/core.c | 23 ++++++++++++-----------
kernel/sched/fair.c | 4 ++--
kernel/sched/sched.h | 8 ++++++++
3 files changed, 22 insertions(+), 13 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 5ffd7e047393..43f061bcfe54 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2154,14 +2154,18 @@ void activate_task(struct rq *rq, struct task_struct *p, int flags)
void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
{
- bool sleep = flags & DEQUEUE_SLEEP;
+ SCHED_WARN_ON(flags & DEQUEUE_SLEEP);
- if (dequeue_task(rq, p, flags)) {
- WRITE_ONCE(p->on_rq, sleep ? 0 : TASK_ON_RQ_MIGRATING);
- ASSERT_EXCLUSIVE_WRITER(p->on_rq);
- } else {
- SCHED_WARN_ON(!sleep); /* only sleep can fail */
- }
+ dequeue_task(rq, p, flags);
+
+ WRITE_ONCE(p->on_rq, TASK_ON_RQ_MIGRATING);
+ ASSERT_EXCLUSIVE_WRITER(p->on_rq);
+}
+
+static void block_task(struct rq *rq, struct task_struct *p, int flags)
+{
+ if (dequeue_task(rq, p, DEQUEUE_SLEEP | flags))
+ __block_task(rq, p);
}
static inline int __normal_prio(int policy, int rt_prio, int nice)
@@ -6693,9 +6697,6 @@ static void __sched notrace __schedule(unsigned int sched_mode)
!(prev_state & TASK_NOLOAD) &&
!(prev_state & TASK_FROZEN);
- if (prev->sched_contributes_to_load)
- rq->nr_uninterruptible++;
-
/*
* __schedule() ttwu()
* prev_state = prev->state; if (p->on_rq && ...)
@@ -6707,7 +6708,7 @@ static void __sched notrace __schedule(unsigned int sched_mode)
*
* After this, schedule() must not care about p->state any more.
*/
- deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK);
+ block_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK);
if (prev->in_iowait) {
atomic_inc(&rq->nr_iowait);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 536eabcb1a71..596a5fabe490 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7032,8 +7032,8 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
util_est_update(&rq->cfs, p, task_sleep);
hrtick_update(rq);
- /* Fix-up what deactivate_task() skipped. */
- WRITE_ONCE(p->on_rq, 0);
+ /* Fix-up what block_task() skipped. */
+ __block_task(rq, p);
}
}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 12841d8195c5..48e5f49d9bc2 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2560,6 +2560,14 @@ static inline void sub_nr_running(struct rq *rq, unsigned count)
sched_update_tick_dependency(rq);
}
+static inline void __block_task(struct rq *rq, struct task_struct *p)
+{
+ WRITE_ONCE(p->on_rq, 0);
+ ASSERT_EXCLUSIVE_WRITER(p->on_rq);
+ if (p->sched_contributes_to_load)
+ rq->nr_uninterruptible++;
+}
+
extern void activate_task(struct rq *rq, struct task_struct *p, int flags);
extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags);
On Fri, May 10, 2024 at 03:49:46PM +0100, Luis Machado wrote:
> Just a quick update on this. While investigating this behavior, I
> spotted very high loadavg values on an idle system. For instance:
>
> load average: 4733.84, 4721.24, 4680.33
>
> I wonder if someone else also spotted this.
Hadn't spotted it, but now that you mention it, I can definitely see it.
Let me go prod with something sharp. Thanks!
On Wed, 2024-05-15 at 13:48 +0200, Peter Zijlstra wrote:
> On Wed, May 15, 2024 at 11:36:49AM +0200, Peter Zijlstra wrote:
> > On Fri, May 10, 2024 at 03:49:46PM +0100, Luis Machado wrote:
> > > Just a quick update on this. While investigating this behavior, I
> > > spotted very high loadavg values on an idle system. For instance:
> > >
> > > load average: 4733.84, 4721.24, 4680.33
> > >
> > > I wonder if someone else also spotted this.
> >
> > Hadn't spotted it, but now that you mention it, I can definitely see it.
(ditto.. wart not enormous enough for awesome powers of observation:)
> >
> > Let me go prod with something sharp. Thanks!
>
> What's the point of making notes if you then don't read them... *sigh*.
>
> Does this help?
That fixed it up here.
> ---
> kernel/sched/core.c | 23 ++++++++++++-----------
> kernel/sched/fair.c | 4 ++--
> kernel/sched/sched.h | 8 ++++++++
> 3 files changed, 22 insertions(+), 13 deletions(-)
>
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 5ffd7e047393..43f061bcfe54 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -2154,14 +2154,18 @@ void activate_task(struct rq *rq, struct task_struct *p, int flags)
>
> void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
> {
> - bool sleep = flags & DEQUEUE_SLEEP;
> + SCHED_WARN_ON(flags & DEQUEUE_SLEEP);
>
> - if (dequeue_task(rq, p, flags)) {
> - WRITE_ONCE(p->on_rq, sleep ? 0 : TASK_ON_RQ_MIGRATING);
> - ASSERT_EXCLUSIVE_WRITER(p->on_rq);
> - } else {
> - SCHED_WARN_ON(!sleep); /* only sleep can fail */
> - }
> + dequeue_task(rq, p, flags);
> +
> + WRITE_ONCE(p->on_rq, TASK_ON_RQ_MIGRATING);
> + ASSERT_EXCLUSIVE_WRITER(p->on_rq);
> +}
> +
> +static void block_task(struct rq *rq, struct task_struct *p, int flags)
> +{
> + if (dequeue_task(rq, p, DEQUEUE_SLEEP | flags))
> + __block_task(rq, p);
> }
>
> static inline int __normal_prio(int policy, int rt_prio, int nice)
> @@ -6693,9 +6697,6 @@ static void __sched notrace __schedule(unsigned int sched_mode)
> !(prev_state & TASK_NOLOAD) &&
> !(prev_state & TASK_FROZEN);
>
> - if (prev->sched_contributes_to_load)
> - rq->nr_uninterruptible++;
> -
> /*
> * __schedule() ttwu()
> * prev_state = prev->state; if (p->on_rq && ...)
> @@ -6707,7 +6708,7 @@ static void __sched notrace __schedule(unsigned int sched_mode)
> *
> * After this, schedule() must not care about p->state any more.
> */
> - deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK);
> + block_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK);
>
> if (prev->in_iowait) {
> atomic_inc(&rq->nr_iowait);
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 536eabcb1a71..596a5fabe490 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -7032,8 +7032,8 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
> util_est_update(&rq->cfs, p, task_sleep);
> hrtick_update(rq);
>
> - /* Fix-up what deactivate_task() skipped. */
> - WRITE_ONCE(p->on_rq, 0);
> + /* Fix-up what block_task() skipped. */
> + __block_task(rq, p);
> }
> }
>
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index 12841d8195c5..48e5f49d9bc2 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -2560,6 +2560,14 @@ static inline void sub_nr_running(struct rq *rq, unsigned count)
> sched_update_tick_dependency(rq);
> }
>
> +static inline void __block_task(struct rq *rq, struct task_struct *p)
> +{
> + WRITE_ONCE(p->on_rq, 0);
> + ASSERT_EXCLUSIVE_WRITER(p->on_rq);
> + if (p->sched_contributes_to_load)
> + rq->nr_uninterruptible++;
> +}
> +
> extern void activate_task(struct rq *rq, struct task_struct *p, int flags);
> extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags);
>
On 5/15/24 12:48, Peter Zijlstra wrote:
> On Wed, May 15, 2024 at 11:36:49AM +0200, Peter Zijlstra wrote:
>> On Fri, May 10, 2024 at 03:49:46PM +0100, Luis Machado wrote:
>>> Just a quick update on this. While investigating this behavior, I
>>> spotted very high loadavg values on an idle system. For instance:
>>>
>>> load average: 4733.84, 4721.24, 4680.33
>>>
>>> I wonder if someone else also spotted this.
>>
>> Hadn't spotted it, but now that you mention it, I can definitely see it.
>>
>> Let me go prod with something sharp. Thanks!
>
> What's the point of making notes if you then don't read them... *sigh*.
Makes it look like you did read them? :-)
>
> Does this help?
>
It does address the load_avg issues as Mike G pointed out. Thanks for the quick patch!
Unfortunately, for me, it didn't help with the energy regression. So my investigation
continues.
Something still seems to be driving things out of the smaller cores and into
the bigger cores. And the load_avg accounting proved to be a red herring.
> ---
> kernel/sched/core.c | 23 ++++++++++++-----------
> kernel/sched/fair.c | 4 ++--
> kernel/sched/sched.h | 8 ++++++++
> 3 files changed, 22 insertions(+), 13 deletions(-)
>
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 5ffd7e047393..43f061bcfe54 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -2154,14 +2154,18 @@ void activate_task(struct rq *rq, struct task_struct *p, int flags)
>
> void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
> {
> - bool sleep = flags & DEQUEUE_SLEEP;
> + SCHED_WARN_ON(flags & DEQUEUE_SLEEP);
>
> - if (dequeue_task(rq, p, flags)) {
> - WRITE_ONCE(p->on_rq, sleep ? 0 : TASK_ON_RQ_MIGRATING);
> - ASSERT_EXCLUSIVE_WRITER(p->on_rq);
> - } else {
> - SCHED_WARN_ON(!sleep); /* only sleep can fail */
> - }
> + dequeue_task(rq, p, flags);
> +
> + WRITE_ONCE(p->on_rq, TASK_ON_RQ_MIGRATING);
> + ASSERT_EXCLUSIVE_WRITER(p->on_rq);
> +}
> +
> +static void block_task(struct rq *rq, struct task_struct *p, int flags)
> +{
> + if (dequeue_task(rq, p, DEQUEUE_SLEEP | flags))
> + __block_task(rq, p);
> }
>
> static inline int __normal_prio(int policy, int rt_prio, int nice)
> @@ -6693,9 +6697,6 @@ static void __sched notrace __schedule(unsigned int sched_mode)
> !(prev_state & TASK_NOLOAD) &&
> !(prev_state & TASK_FROZEN);
>
> - if (prev->sched_contributes_to_load)
> - rq->nr_uninterruptible++;
> -
> /*
> * __schedule() ttwu()
> * prev_state = prev->state; if (p->on_rq && ...)
> @@ -6707,7 +6708,7 @@ static void __sched notrace __schedule(unsigned int sched_mode)
> *
> * After this, schedule() must not care about p->state any more.
> */
> - deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK);
> + block_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK);
>
> if (prev->in_iowait) {
> atomic_inc(&rq->nr_iowait);
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 536eabcb1a71..596a5fabe490 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -7032,8 +7032,8 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
> util_est_update(&rq->cfs, p, task_sleep);
> hrtick_update(rq);
>
> - /* Fix-up what deactivate_task() skipped. */
> - WRITE_ONCE(p->on_rq, 0);
> + /* Fix-up what block_task() skipped. */
> + __block_task(rq, p);
> }
> }
>
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index 12841d8195c5..48e5f49d9bc2 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -2560,6 +2560,14 @@ static inline void sub_nr_running(struct rq *rq, unsigned count)
> sched_update_tick_dependency(rq);
> }
>
> +static inline void __block_task(struct rq *rq, struct task_struct *p)
> +{
> + WRITE_ONCE(p->on_rq, 0);
> + ASSERT_EXCLUSIVE_WRITER(p->on_rq);
> + if (p->sched_contributes_to_load)
> + rq->nr_uninterruptible++;
> +}
> +
> extern void activate_task(struct rq *rq, struct task_struct *p, int flags);
> extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags);
>
On Mon, Apr 29, 2024 at 03:33:04PM +0100, Luis Machado wrote:
> (2) m6.6-eevdf-complete: m6.6-stock plus this series.
> (3) m6.6-eevdf-complete-no-delay-dequeue: (2) + NO_DELAY_DEQUEUE
> +------------+------------------------------------------------------+-----------+
> | cluster | tag | perc_diff |
> +------------+------------------------------------------------------+-----------+
> | CPU | m6.6-stock | 0.0% |
> | CPU-Big | m6.6-stock | 0.0% |
> | CPU-Little | m6.6-stock | 0.0% |
> | CPU-Mid | m6.6-stock | 0.0% |
> | GPU | m6.6-stock | 0.0% |
> | Total | m6.6-stock | 0.0% |
> | CPU | m6.6-eevdf-complete-no-delay-dequeue | 117.77% |
> | CPU-Big | m6.6-eevdf-complete-no-delay-dequeue | 113.79% |
> | CPU-Little | m6.6-eevdf-complete-no-delay-dequeue | 97.47% |
> | CPU-Mid | m6.6-eevdf-complete-no-delay-dequeue | 189.0% |
> | GPU | m6.6-eevdf-complete-no-delay-dequeue | -6.74% |
> | Total | m6.6-eevdf-complete-no-delay-dequeue | 103.84% |
This one is still flummoxing me. I've gone over the patch a few times on
different days and I'm not seeing it. Without DELAY_DEQUEUE it should
behave as before.
Let me try and split this patch up into smaller parts such that you can
try and bisect this.
Peter,
On 5/23/24 09:45, Peter Zijlstra wrote:
> On Mon, Apr 29, 2024 at 03:33:04PM +0100, Luis Machado wrote:
>
>> (2) m6.6-eevdf-complete: m6.6-stock plus this series.
>> (3) m6.6-eevdf-complete-no-delay-dequeue: (2) + NO_DELAY_DEQUEUE
>
>> +------------+------------------------------------------------------+-----------+
>> | cluster | tag | perc_diff |
>> +------------+------------------------------------------------------+-----------+
>> | CPU | m6.6-stock | 0.0% |
>> | CPU-Big | m6.6-stock | 0.0% |
>> | CPU-Little | m6.6-stock | 0.0% |
>> | CPU-Mid | m6.6-stock | 0.0% |
>> | GPU | m6.6-stock | 0.0% |
>> | Total | m6.6-stock | 0.0% |
>
>> | CPU | m6.6-eevdf-complete-no-delay-dequeue | 117.77% |
>> | CPU-Big | m6.6-eevdf-complete-no-delay-dequeue | 113.79% |
>> | CPU-Little | m6.6-eevdf-complete-no-delay-dequeue | 97.47% |
>> | CPU-Mid | m6.6-eevdf-complete-no-delay-dequeue | 189.0% |
>> | GPU | m6.6-eevdf-complete-no-delay-dequeue | -6.74% |
>> | Total | m6.6-eevdf-complete-no-delay-dequeue | 103.84% |
>
> This one is still flummoxing me. I've gone over the patch a few times on
> different days and I'm not seeing it. Without DELAY_DEQUEUE it should
> behave as before.
>
> Let me try and split this patch up into smaller parts such that you can
> try and bisect this.
>
Same situation on my end. I've been chasing this for some time and I don't fully
understand why things go off the rails energy-wise as soon as DELAY_DEQUEUE is
enabled, now that the load_avg accounting red herring is gone.
I do have one additional piece of information though. Hopefully it will be useful.
Booting the kernel with NO_DELAY_DEQUEUE (default to false), things work fine. Then
if I switch to DELAY_DEQUEUE at runtime, things start using a lot more power.
The interesting bit is if I switch to NO_DELAY_DEQUEUE again at runtime, things don't
go back to normal. Rather they stay the same, using a lot more energy.
I wonder if we're leaving some unbalanced state somewhere while DELAY_DEQUEUE is on,
something that is signalling we have more load/utilization than we actually do.
The PELT signals look reasonable from what I can see. We don't seem to be boosting
frequencies, but we're running things mostly on big cores with DELAY_DEQUEUE on.
I'll keep investigating this. Please let me know if you need some additional data or
testing and I can get that going.
On Thu, May 23, 2024 at 10:06:04AM +0100, Luis Machado wrote:
> Booting the kernel with NO_DELAY_DEQUEUE (default to false), things work fine. Then
> if I switch to DELAY_DEQUEUE at runtime, things start using a lot more power.
>
> The interesting bit is if I switch to NO_DELAY_DEQUEUE again at runtime, things don't
> go back to normal. Rather they stay the same, using a lot more energy.
Ooh, cute.. weird. I'll try and see if we leak state somehow.
On Thu, May 02, 2024 at 11:26:14AM +0100, Luis Machado wrote:
> Going through the code, my understanding is that the util_est functions seem to be getting
> called correctly, and in the right order. That is, we first util_est_enqueue, then util_est_dequeue
> and finally util_est_update. So the stats *should* be correct.
>
> On dequeuing (dequeue_task_fair), we immediately call util_est_dequeue, even for the case of
> a DEQUEUE_DELAYED task, since we're no longer going to run the dequeue_delayed task for now, even
> though it is still in the rq.
>
> We delay the util_est_update of dequeue_delayed tasks until a later time in dequeue_entities.
>
> Eventually the dequeue_delayed task will have its lag zeroed when it becomes eligible again,
> (requeue_delayed_entity) while still being in the rq. It will then get dequeued/enqueued (requeued),
> and marked as a non-dequeue-delayed task.
>
> Next time we attempt to enqueue such a task (enqueue_task_fair), it will skip the ENQUEUE_DELAYED
> block and call util_est_enqueue.
>
> Still, something seems to be signalling that util/load is high, and causing migration to the big cores.
>
> Maybe we're not decaying the util/load properly at some point, and inflated numbers start to happen.
>
> I'll continue investigating.
So I re-read all this util_est stuff again this evening and I am
confused :-) Please bear with me.
So the old code does:
dequeue_task_fair()
util_est_dequeue();
// actual dequeue -- updates pelt
util_est_update();
While the new code does:
dequeue_task_fair()
util_est_dequeue();
if (!dequeue())
return;
util_est_update();
delayed_dequeue:
util_est_update();
Specifically, we only call util_est_update() if/when we do the actual
dequeue -- potentially at a later point in time. Because, as I argued in
the comments, ttwu()'s ENQUEUE_DELAYED will not actually enqueue the
task (since it is still enqueued) and therefore the update would be
spurious.
However!!, if we do dequeue, then we'll end up updating the EWMA with a
potentially different task_util(p).
And this is where the confusion comes... this extra time on the runqueue
would not be running and thus decrease util_avg, as such task_util_est()
should be lower than before and tasks should tend towards smaller cores,
rather than larger cores as you seem to be seeing.
[ there is the 'dequeued + UTIL_EST_MARGIN < task_runnable()' escape
clause, which we might be hitting... dunno ]
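(Back-of-the-envelope for the above -- assuming the usual ~32ms PELT half-life and ignoring the fixed-point/segmented accumulation, so this is only a rough model, not the kernel math; build with -lm:)

/* How much util_avg should drop while the task sits delayed on the rq
 * without running, given a ~32ms half-life.
 */
#include <stdio.h>
#include <math.h>

int main(void)
{
        double util = 400.0;            /* util_avg at the "sleep" point */

        for (int ms = 0; ms <= 64; ms += 8)
                printf("delayed-but-not-running for %2d ms -> util_avg ~%.0f\n",
                       ms, util * pow(0.5, ms / 32.0));
        return 0;
}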
In any case, if you haven't tried it already, could I ask you to test
the below (once you're back in the office)?
Also, this doesn't really explain why things go sideways once you enable
DELAY_DEQUEUE and then don't re-align if you disable it again. I mean,
it should eventually age out the dodgy EWMA contributions and start
working as expected.
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7008,7 +7008,6 @@ static int dequeue_entities(struct rq *r
SCHED_WARN_ON(p->on_rq != 1);
/* Fix-up what dequeue_task_fair() skipped */
- util_est_update(&rq->cfs, p, task_sleep);
hrtick_update(rq);
/* Fix-up what block_task() skipped. */
@@ -7028,13 +7027,11 @@ static bool dequeue_task_fair(struct rq
{
util_est_dequeue(&rq->cfs, p);
- if (dequeue_entities(rq, &p->se, flags) < 0)
+ if (dequeue_entities(rq, &p->se, flags) < 0) {
+ util_est_update(&rq->cfs, p, DEQUEUE_SLEEP);
return false;
+ }
- /*
- * It doesn't make sense to update util_est for the delayed dequeue
- * case where ttwu will make it appear the sleep never happened.
- */
util_est_update(&rq->cfs, p, flags & DEQUEUE_SLEEP);
hrtick_update(rq);
return true;
On 23/05/2024 10:33, Peter Zijlstra wrote:
> On Thu, May 23, 2024 at 10:06:04AM +0100, Luis Machado wrote:
>
>> Booting the kernel with NO_DELAY_DEQUEUE (default to false), things work fine. Then
>> if I switch to DELAY_DEQUEUE at runtime, things start using a lot more power.
>>
>> The interesting bit is if I switch to NO_DELAY_DEQUEUE again at runtime, things don't
>> go back to normal. Rather they stay the same, using a lot more energy.
>
> Ooh, cute.. weird. I'll try and see if we leak state somehow.
Hi. I'm working on uclamp anyway so I just gave this series (and all
fixes mentioned in this thread) a go.
It seems after running with this series for a while, suddenly uclamp_max
no longer works. Running a task with uclamp_max of 110 still sees an rq
uclamp_max of 1024. Also weirdly, the PELT signal of that uclamp_max is
completely broken. It goes up to a very high value and then suddenly
drops to almost 0 from time to time.
I wonder if the reason Luis saw that is that he was the first one to
run delayed dequeue with uclamp. I'm not familiar enough with the series
to know how it could possibly affect uclamp, so investigating on this
front may be worth trying. Maybe some leaked uclamp state on the rq?
On 5/29/24 23:50, Peter Zijlstra wrote:
> On Thu, May 02, 2024 at 11:26:14AM +0100, Luis Machado wrote:
>
>> Going through the code, my understanding is that the util_est functions seem to be getting
>> called correctly, and in the right order. That is, we first util_est_enqueue, then util_est_dequeue
>> and finally util_est_update. So the stats *should* be correct.
>>
>> On dequeuing (dequeue_task_fair), we immediately call util_est_dequeue, even for the case of
>> a DEQUEUE_DELAYED task, since we're no longer going to run the dequeue_delayed task for now, even
>> though it is still in the rq.
>>
>> We delay the util_est_update of dequeue_delayed tasks until a later time in dequeue_entities.
>>
>> Eventually the dequeue_delayed task will have its lag zeroed when it becomes eligible again,
>> (requeue_delayed_entity) while still being in the rq. It will then get dequeued/enqueued (requeued),
>> and marked as a non-dequeue-delayed task.
>>
>> Next time we attempt to enqueue such a task (enqueue_task_fair), it will skip the ENQUEUE_DELAYED
>> block and call util_est_enqueue.
>>
>> Still, something seems to be signalling that util/load is high, and causing migration to the big cores.
>>
>> Maybe we're not decaying the util/load properly at some point, and inflated numbers start to happen.
>>
>> I'll continue investigating.
Apologies for not getting this earlier, but I wanted to collect as much
data as possible so we can hopefully see what's going wrong with this
particular patch in the series.
(picking up from the previous questions on find_energy_efficient_cpu and
overutilized)
For the rate of calls to find_energy_efficient_cpu, I checked again and
I didn't see any significant differences there. With delayed dequeue
or without delayed dequeue, I see almost the same number of calls,
with the expected small variation between runs.
As for the overutilized part, there is a bit of a change within
find_energy_efficient_cpu. When I first enable delayed dequeue via
the debugfs (I boot up the system with it disabled for a clean state),
I start to see a lot more cases of us bailing out of
find_energy_efficient_cpu due to the overutilized condition.
Strangely though, things seem to stabilize after a while, and the
rate of overutilized conditions seen within find_energy_efficient_cpu
goes down to normal levels.
Here's some numbers from a trace session:
Delayed dequeue off (clean state, 2 iterations):
------------------------------------------------
Calls to find_energy_efficient_cpu: 211087
Overutilized when in find_energy_efficient_cpu: 36224
Calls to find_energy_efficient_cpu: 205804
Overutilized when in find_energy_efficient_cpu: 35274
Delayed dequeue on (3 iterations):
------------------------------------------------
Calls to find_energy_efficient_cpu: 204079
Overutilized when in find_energy_efficient_cpu: 86966
Calls to find_energy_efficient_cpu: 207351
Overutilized when in find_energy_efficient_cpu: 111233
Calls to find_energy_efficient_cpu: 208556
Overutilized when in find_energy_efficient_cpu: 77986
Delayed dequeue off again (3 iterations):
------------------------------------------------
Calls to find_energy_efficient_cpu: 210396
Overutilized when in find_energy_efficient_cpu: 34108
Calls to find_energy_efficient_cpu: 210978
Overutilized when in find_energy_efficient_cpu: 33122
Calls to find_energy_efficient_cpu: 210181
Overutilized when in find_energy_efficient_cpu: 33060
Delayed dequeue on again (3 iterations):
------------------------------------------------
Calls to find_energy_efficient_cpu: 202932
Overutilized when in find_energy_efficient_cpu: 35060
Calls to find_energy_efficient_cpu: 206098
Overutilized when in find_energy_efficient_cpu: 57373
Calls to find_energy_efficient_cpu: 199444
Overutilized when in find_energy_efficient_cpu: 33959
On the matter of switched_from_fair() being called with a task
that is sched_delayed == 1, I can confirm I see some occurrences
of that. They are not that numerous, but do happen from time to
time in the trace data. I haven't investigated things in that
direction yet though.
But we do make those tasks sched_delayed = 0 in there, so things
should go back to normal eventually, unless we fail to dequeue
the sched_delayed task properly for this case.
>
> So I re-read all this util_est stuff again this evening and I am
> confused :-) Please bear with me.
>
> So the old code does:
>
> dequeue_task_fair()
> util_est_dequeue();
> // actual dequeue -- updates pelt
> util_est_update();
>
>
> While the new code does:
>
> dequeue_task_fair()
> util_est_dequeue();
> if (!dequeue())
> return;
> util_est_update();
>
> delayed_dequeue:
> util_est_update();
>
>
The above is my understanding as well.
> Specifically, we only call util_est_update() if/when we do the actual
> dequeue -- potentially at a later point in time. Because, as I argued in
> the comments, ttwu()'s ENQUEUE_DELAYED will not actually enqueue the
> task (since it is still enqueued) and therefore the update would be
> spurious.
>
> However!!, if we do dequeue, then we'll end up updating the EWMA with a
> potentially different task_util(p).
>
> And this is where the confusion comes... this extra time on the runqueue
> would not be running and thus decrease util_avg, as such task_util_est()
> should be lower than before and tasks should tend towards smaller cores,
> rather than larger cores as you seem to be seeing.
Right. I also expected the util_avg to decrease with more time on the
runqueue. And that might be happening...
>
> [ there is the 'dequeued + UTIL_EST_MARGIN < task_runnable()' escape
> clause, which we might be hitting... dunno ]
>
> In any case, if you haven't tried it already, could I ask you to test
> the below (once you're back in the office)?
>
> Also, this doesn't really explain why things go sideways once you enable
> DELAY_DEQUEUE and then don't re-align if you disable it again. I mean,
> it should eventually age out the dodgy EWMA contributions and start
> working as expected.
>
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -7008,7 +7008,6 @@ static int dequeue_entities(struct rq *r
> SCHED_WARN_ON(p->on_rq != 1);
>
> /* Fix-up what dequeue_task_fair() skipped */
> - util_est_update(&rq->cfs, p, task_sleep);
> hrtick_update(rq);
>
> /* Fix-up what block_task() skipped. */
> @@ -7028,13 +7027,11 @@ static bool dequeue_task_fair(struct rq
> {
> util_est_dequeue(&rq->cfs, p);
>
> - if (dequeue_entities(rq, &p->se, flags) < 0)
> + if (dequeue_entities(rq, &p->se, flags) < 0) {
> + util_est_update(&rq->cfs, p, DEQUEUE_SLEEP);
> return false;
> + }
>
> - /*
> - * It doesn't make sense to update util_est for the delayed dequeue
> - * case where ttwu will make it appear the sleep never happened.
> - */
> util_est_update(&rq->cfs, p, flags & DEQUEUE_SLEEP);
> hrtick_update(rq);
> return true;
... as the above patch unfortunately doesn't improve things. The util_est_update
might be a red herring here.
Exchanging some information with Hongyan today, he was a bit suspicious of the uclamp
behavior with the eevdf complete series applied.
Checking the uclamp code, I see we have some refcounting tied to enqueuing/dequeuing
of tasks, and the uclamp values are organized in buckets.
Just for fun I added a few trace_printk's in uclamp_eff_value, uclamp_rq_inc_id and
uclamp_rq_dec_id.
Booting up the system with delayed_dequeue disabled and running the benchmark, I
see the uclamp bucket management pretty stable. Tasks get added to the uclamp
buckets but then get removed. At the end of the benchmark, the uclamp buckets
are (almost always) clean of tasks.
Enabling delayed dequeue, I can see the uclamp buckets slowly filling up with
tasks. At the end of the benchmark, I see uclamp buckets with 30, 40 or 50
tasks still. If I do another run, I can see 80, 100 tasks still.
I suspect refcounting might be going wrong somewhere due to delayed dequeue
tasks, but that's more of a guess right now. Hopefully that is useful
information. I'll resume investigation tomorrow.
On Mon, Jun 03, 2024 at 08:30:43PM +0100, Luis Machado wrote:
> Exchanging some information with Hongyan today, he was a bit suspicious of the uclamp
> behavior with the eevdf complete series applied.
>
> Checking the uclamp code, I see we have some refcounting tied to enqueuing/dequeuing
> of tasks, and the uclamp values are organized in buckets.
>
> Just for fun I added a few trace_printk's in uclamp_eff_value, uclamp_rq_inc_id and
> uclamp_rq_dec_id.
>
> Booting up the system with delayed_dequeue disabled and running the benchmark, I
> see the uclamp bucket management pretty stable. Tasks get added to the uclamp
> buckets but then get removed. At the end of the benchmark, the uclamp buckets
> are (almost always) clean of tasks.
>
> Enabling delayed dequeue, I can see the uclamp buckets slowly filling up with
> tasks. At the end of the benchmark, I see uclamp buckets with 30, 40 or 50
> tasks still. If I do another run, I can see 80, 100 tasks still.
>
> I suspect refcounting might be going wrong somewhere due to delayed dequeue
> tasks, but that's more of a guess right now. Hopefully that is useful
> information. I'll resume investigation tomorrow.
Thank you both!!
Does the below help?
Note how dequeue_task() does uclamp_rq_dec() unconditionally, which is
then not balanced in the case below.
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3664,6 +3664,7 @@ static int ttwu_runnable(struct task_str
/* mustn't run a delayed task */
SCHED_WARN_ON(task_on_cpu(rq, p));
enqueue_task(rq, p, ENQUEUE_DELAYED);
+ uclamp_rq_inc(rq, p);
}
if (!task_on_cpu(rq, p)) {
/*
Hi Peter,
On 04/06/2024 11:11, Peter Zijlstra wrote:
> On Mon, Jun 03, 2024 at 08:30:43PM +0100, Luis Machado wrote:
>
>> Exchanging some information with Hongyan today, he was a bit suspicious of the uclamp
>> behavior with the eevdf complete series applied.
>>
>> Checking the uclamp code, I see we have some refcounting tied to enqueuing/dequeuing
>> of tasks, and the uclamp values are organized in buckets.
>>
>> Just for fun I added a few trace_printk's in uclamp_eff_value, uclamp_rq_inc_id and
>> uclamp_rq_dec_id.
>>
>> Booting up the system with delayed_dequeue disabled and running the benchmark, I
>> see the uclamp bucket management pretty stable. Tasks get added to the uclamp
>> buckets but then get removed. At the end of the benchmark, the uclamp buckets
>> are (almost always) clean of tasks.
>>
>> Enabling delayed dequeue, I can see the uclamp buckets slowly filling up with
>> tasks. At the end of the benchmark, I see uclamp buckets with 30, 40 or 50
>> tasks still. If I do another run, I can see 80, 100 tasks still.
>>
>> I suspect refcounting might be going wrong somewhere due to delayed dequeue
>> tasks, but that's more of a guess right now. Hopefully that is useful
>> information. I'll resume investigation tomorrow.
>
> Thank you both!!
>
> Does the below help?
>
> Note how dequeue_task() does uclamp_rq_dec() unconditionally, which is
> then not balanced in the case below.
>
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -3664,6 +3664,7 @@ static int ttwu_runnable(struct task_str
> /* mustn't run a delayed task */
> SCHED_WARN_ON(task_on_cpu(rq, p));
> enqueue_task(rq, p, ENQUEUE_DELAYED);
> + uclamp_rq_inc(rq, p);
> }
> if (!task_on_cpu(rq, p)) {
> /*
Further communication with Luis makes me investigate more on uclamp
bucket leaks.
I added a separate variable uclamp_on_rq in task_struct and WARN when
the state didn't match. The thing is, without your above fix, the first
WARN happens on uclamp_rq_inc(), not dec(), which means a task is
already added to rq uclamp and is somehow added again by mistake.
Because this is a double inc(), your fix won't fix this problem. I added
the trace here when the double inc() happened, in case you can see
something.
Juno-r2-Debian login: [ 176.010780] ------------[ cut here ]------------
[ 176.015414] WARNING: CPU: 5 PID: 0 at kernel/sched/core.c:1594 uclamp_rq_inc+0x188/0x1a0
[ 176.023539] Modules linked in:
[ 176.026602] CPU: 5 PID: 0 Comm: swapper/5 Not tainted 6.10.0-rc1-ga946a2b48119-dirty #1034
[ 176.034882] Hardware name: ARM Juno development board (r2) (DT)
[ 176.040815] pstate: 600000c5 (nZCv daIF -PAN -UAO -TCO -DIT -SSBS BTYPE=--)
[ 176.047796] pc : uclamp_rq_inc+0x188/0x1a0
[ 176.051903] lr : enqueue_task+0x48/0x70
[ 176.055752] sp : ffffffc081fb3de0
[ 176.059073] x29: ffffffc081fb3de0 x28: 0000000000000000 x27: ffffff8800342300
[ 176.066233] x26: ffffff88084311b8 x25: ffffffc081ed2000 x24: ffffffc081ca7008
[ 176.073388] x23: 0000000000000000 x22: 00000000000000c0 x21: ffffff897efbbf40
[ 176.080543] x20: ffffff8808431180 x19: 0000000000000049 x18: 0000000000000408
[ 176.087700] x17: ffffffc8fd301000 x16: ffffffc081fb0000 x15: 000000000000afc8
[ 176.094857] x14: 0000000000000001 x13: 0000000000000004 x12: 000000020c8392c2
[ 176.102017] x11: 0000000000000800 x10: 0000000000000800 x9 : 0000000000000000
[ 176.109177] x8 : 0000000000000001 x7 : 0000000000000021 x6 : ffffffffffffff4c
[ 176.116337] x5 : 0000000000007e90 x4 : 00000000000fc33c x3 : 00000000000139b4
[ 176.123487] x2 : 0000000000000049 x1 : ffffff8808431180 x0 : ffffff897efbbf40
[ 176.130639] Call trace:
[ 176.133089] uclamp_rq_inc+0x188/0x1a0
[ 176.136849] activate_task+0x48/0x60
[ 176.140433] ttwu_do_activate+0x80/0x22c
[ 176.144364] sched_ttwu_pending+0x9c/0x120
[ 176.148470] __flush_smp_call_function_queue+0x3d0/0x568
[ 176.153792] generic_smp_call_function_single_interrupt+0x10/0x18
[ 176.159902] do_handle_IPI+0xa0/0x230
[ 176.163577] ipi_handler+0x18/0x24
[ 176.166987] handle_percpu_devid_irq+0x80/0x204
[ 176.171532] generic_handle_domain_irq+0x28/0x40
[ 176.176161] gic_handle_irq+0x58/0xd0
[ 176.179833] call_on_irq_stack+0x24/0x34
[ 176.183764] el1_interrupt+0x70/0xbc
[ 176.187348] el1h_64_irq_handler+0x14/0x1c
[ 176.191449] el1h_64_irq+0x64/0x68
[ 176.194857] arch_local_irq_enable+0x4/0x8
[ 176.198960] cpuidle_enter+0x34/0x48
[ 176.202547] do_idle+0x1c4/0x234
[ 176.205786] cpu_startup_entry+0x30/0x34
[ 176.209723] secondary_start_kernel+0x124/0x144
[ 176.214267] __secondary_switched+0x6c/0x70
[ 176.218458] ---[ end trace 0000000000000000 ]---
Or, do you mean to add a uclamp_rq_dec() instead? But even if I skip the
uclamp_rq_inc() at that line (which is equivalent to doing
uclamp_rq_dec() after enqueue_task()), I still get a WARN like this:
Juno-r2-Debian login: [ 77.183511] ------------[ cut here ]------------
[ 77.188157] WARNING: CPU: 2 PID: 114 at kernel/sched/core.c:1594 uclamp_rq_inc+0x188/0x1a0
[ 77.196455] Modules linked in:
[ 77.199519] CPU: 2 PID: 114 Comm: kworker/u25:3 Not tainted 6.10.0-rc1-ga946a2b48119-dirty #1035
[ 77.208325] Hardware name: ARM Juno development board (r2) (DT)
[ 77.214255] Workqueue: rpciod rpc_async_schedule
[ 77.218893] pstate: 600000c5 (nZCv daIF -PAN -UAO -TCO -DIT -SSBS BTYPE=--)
[ 77.225873] pc : uclamp_rq_inc+0x188/0x1a0
[ 77.229979] lr : enqueue_task+0x44/0x7c
[ 77.233826] sp : ffffffc082af3c80
[ 77.237143] x29: ffffffc082af3c80 x28: ffffffc081cd6000 x27: 0000000000000402
[ 77.244305] x26: ffffffc8fd285000 x25: ffffffc081cbaf40 x24: ffffff8800f92300
[ 77.251466] x23: ffffff897ef3ff40 x22: 00000000000000c0 x21: ffffff897ef3ff40
[ 77.258625] x20: ffffff88028f3480 x19: 0000000000000009 x18: 0000000000000000
[ 77.265784] x17: 0000000000000000 x16: 0000000000000001 x15: 00000000000003ae
[ 77.272943] x14: 000000000003eca5 x13: 000000000003eca5 x12: 00000000000000eb
[ 77.280102] x11: 0000000000000000 x10: 0000000000001800 x9 : 0000000000000000
[ 77.287261] x8 : 0000000000000001 x7 : 0000000000000000 x6 : 000000000000003f
[ 77.294420] x5 : 0000000000000311 x4 : 0000000000000113 x3 : 000000000000017f
[ 77.301578] x2 : 0000000000000009 x1 : ffffff88028f3480 x0 : ffffff897ef3ff40
[ 77.308738] Call trace:
[ 77.311185] uclamp_rq_inc+0x188/0x1a0
[ 77.314944] activate_task+0x48/0x60
[ 77.318530] ttwu_do_activate+0x80/0x22c
[ 77.322464] try_to_wake_up+0x1e8/0x50c
[ 77.326311] wake_up_process+0x14/0x1c
[ 77.330070] kick_pool+0xb8/0x144
[ 77.333395] process_one_work+0x104/0x3a4
[ 77.337418] worker_thread+0x2b8/0x424
[ 77.341176] kthread+0xf8/0x194
[ 77.344327] ret_from_fork+0x10/0x20
[ 77.347913] ---[ end trace 0000000000000000 ]---
I pasted the trace here in case useful.
Hongyan
On 6/4/24 11:11, Peter Zijlstra wrote:
> On Mon, Jun 03, 2024 at 08:30:43PM +0100, Luis Machado wrote:
>
>> Exchanging some information with Hongyan today, he was a bit suspicious of the uclamp
>> behavior with the eevdf complete series applied.
>>
>> Checking the uclamp code, I see we have some refcounting tied to enqueuing/dequeuing
>> of tasks, and the uclamp values are organized in buckets.
>>
>> Just for fun I added a few trace_printk's in uclamp_eff_value, uclamp_rq_inc_id and
>> uclamp_rq_dec_id.
>>
>> Booting up the system with delayed_dequeue disabled and running the benchmark, I
>> see the uclamp bucket management pretty stable. Tasks get added to the uclamp
>> buckets but then get removed. At the end of the benchmark, the uclamp buckets
>> are (almost always) clean of tasks.
>>
>> Enabling delayed dequeue, I can see the uclamp buckets slowly filling up with
>> tasks. At the end of the benchmark, I see uclamp buckets with 30, 40 or 50
>> tasks still. If I do another run, I can see 80, 100 tasks still.
>>
>> I suspect refcounting might be going wrong somewhere due to delayed dequeue
>> tasks, but that's more of a guess right now. Hopefully that is useful
>> information. I'll resume investigation tomorrow.
>
> Thank you both!!
>
> Does the below help?
>
> Note how dequeue_task() does uclamp_rq_dec() unconditionally, which is
> then not balanced in the case below.
>
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -3664,6 +3664,7 @@ static int ttwu_runnable(struct task_str
> /* mustn't run a delayed task */
> SCHED_WARN_ON(task_on_cpu(rq, p));
> enqueue_task(rq, p, ENQUEUE_DELAYED);
> + uclamp_rq_inc(rq, p);
> }
> if (!task_on_cpu(rq, p)) {
> /*
As Hongyan pointed out in a separate message, the above makes things
worse, as we end up with even more leftover tasks in the uclamp
buckets.
I'm trying a fix in kernel/sched/core.c:enqueue_task that only
calls uclamp_rq_inc if the task is not sched_delayed, so:
- uclamp_rq_inc(rq, p);
+ if (!p->se.sched_delayed)
+ uclamp_rq_inc(rq, p);
I'm not entirely sure it is correct, but it seems to fix things,
but I'm still running some tests.
With the current code, given uclamp_rq_inc and uclamp_rq_dec get
called in enqueue_task and dequeue_task, the additional enqueue_task
call from ttwu_runnable for a delayed_dequeue task may do an additional
unconditional call to uclamp_rq_inc, no?
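To illustrate the imbalance I'm suspecting, a toy model of the bucket refcount (only a model of the pattern, not the kernel's uclamp code; the "missing dec" path here is hypothetical):

/* Every enqueue_task() is supposed to pair with exactly one dequeue_task().
 * If the extra ENQUEUE_DELAYED enqueue from ttwu_runnable() adds an inc that
 * never gets a matching dec, the bucket count ratchets up by one per delayed
 * wakeup and tasks appear "stuck" in the buckets.
 */
#include <stdio.h>

static unsigned int bucket_tasks;       /* stand-in for rq->uclamp[].bucket[].tasks */

static void uclamp_rq_inc(void) { bucket_tasks++; }
static void uclamp_rq_dec(void) { if (bucket_tasks) bucket_tasks--; }

int main(void)
{
        for (int cycle = 0; cycle < 80; cycle++) {
                uclamp_rq_inc();        /* enqueue_task() on the original wakeup          */
                uclamp_rq_dec();        /* dequeue_task() at sleep; dequeue gets delayed  */
                uclamp_rq_inc();        /* ttwu_runnable(): enqueue_task(ENQUEUE_DELAYED) */
                /* hypothetically, no further dec on this path */
        }
        printf("leftover tasks in bucket: %u\n", bucket_tasks);
        return 0;
}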
On 04/06/2024 15:23, Luis Machado wrote:
> On 6/4/24 11:11, Peter Zijlstra wrote:
>> On Mon, Jun 03, 2024 at 08:30:43PM +0100, Luis Machado wrote:
>>
>>> Exchanging some information with Hongyan today, he was a bit suspicious of the uclamp
>>> behavior with the eevdf complete series applied.
>>>
>>> Checking the uclamp code, I see we have some refcounting tied to enqueuing/dequeuing
>>> of tasks, and the uclamp values are organized in buckets.
>>>
>>> Just for fun I added a few trace_printk's in uclamp_eff_value, uclamp_rq_inc_id and
>>> uclamp_rq_dec_id.
>>>
>>> Booting up the system with delayed_dequeue disabled and running the benchmark, I
>>> see the uclamp bucket management pretty stable. Tasks get added to the uclamp
>>> buckets but then get removed. At the end of the benchmark, the uclamp buckets
>>> are (almost always) clean of tasks.
>>>
>>> Enabling delayed dequeue, I can see the uclamp buckets slowly filling up with
>>> tasks. At the end of the benchmark, I see uclamp buckets with 30, 40 or 50
>>> tasks still. If I do another run, I can see 80, 100 tasks still.
>>>
>>> I suspect refcounting might be going wrong somewhere due to delayed dequeue
>>> tasks, but that's more of a guess right now. Hopefully that is useful
>>> information. I'll resume investigation tomorrow.
>>
>> Thank you both!!
>>
>> Does the below help?
>>
>> Note how dequeue_task() does uclamp_rq_dec() unconditionally, which is
>> then not balanced in the case below.
>>
>> --- a/kernel/sched/core.c
>> +++ b/kernel/sched/core.c
>> @@ -3664,6 +3664,7 @@ static int ttwu_runnable(struct task_str
>> /* mustn't run a delayed task */
>> SCHED_WARN_ON(task_on_cpu(rq, p));
>> enqueue_task(rq, p, ENQUEUE_DELAYED);
>> + uclamp_rq_inc(rq, p);
>> }
>> if (!task_on_cpu(rq, p)) {
>> /*
>
> As Hongyan pointed out in a separate message, the above makes things
> worse, as we end up with even more leftover tasks in the uclamp
> buckets.
>
> I'm trying a fix in kernel/sched/core.c:enqueue_task that only
> calls uclamp_rq_inc if the task is not sched_delayed, so:
>
> - uclamp_rq_inc(rq, p);
> + if (!p->se.sched_delayed)
> + uclamp_rq_inc(rq, p);
>
> I'm not entirely sure it is correct, but it seems to fix things,
> but I'm still running some tests.
So far this doesn't seem to trigger any WARN or bad behavior in my tests.
>
> With the current code, given uclamp_rq_inc and uclamp_rq_dec get
> called in enqueue_task and dequeue_task, the additional enqueue_task
> call from ttwu_runnable for a delayed_dequeue task may do an additional
> unconditional call to uclamp_rq_inc, no?
On Tue, Jun 04, 2024 at 03:23:41PM +0100, Luis Machado wrote:
> On 6/4/24 11:11, Peter Zijlstra wrote:
> > Note how dequeue_task() does uclamp_rq_dec() unconditionally, which is
> > then not balanced in the case below.
> >
> > --- a/kernel/sched/core.c
> > +++ b/kernel/sched/core.c
> > @@ -3664,6 +3664,7 @@ static int ttwu_runnable(struct task_str
> > /* mustn't run a delayed task */
> > SCHED_WARN_ON(task_on_cpu(rq, p));
> > enqueue_task(rq, p, ENQUEUE_DELAYED);
> > + uclamp_rq_inc(rq, p);
> > }
> > if (!task_on_cpu(rq, p)) {
> > /*
>
> As Hongyan pointed out in a separate message, the above makes things
> worse, as we end up with even more leftover tasks in the uclamp
> buckets.
>
> I'm trying a fix in kernel/sched/core.c:enqueue_task that only
> calls uclamp_rq_inc if the task is not sched_delayed, so:
>
> - uclamp_rq_inc(rq, p);
> + if (!p->se.sched_delayed)
> + uclamp_rq_inc(rq, p);
>
> I'm not entirely sure it is correct, but it seems to fix things,
> but I'm still running some tests.
>
> With the current code, given uclamp_rq_inc and uclamp_rq_dec get
> called in enqueue_task and dequeue_task, the additional enqueue_task
> call from ttwu_runnable for a delayed_dequeue task may do an additional
> unconditional call to uclamp_rq_inc, no?
Yes, I got enqueue_task() and class->enqueue_task() confused this
morning.
But with the above, you skip inc for sched_delayed, but dequeue_task()
will have done the dec, so isn't it then still unbalanced?
Oh well, I'll go stare at this tomorrow.
In any case, is there a uclamp self-test somewhere?
On 6/5/24 08:22, Peter Zijlstra wrote:
> On Tue, Jun 04, 2024 at 09:12:20PM +0200, Peter Zijlstra wrote:
>
>> But with the above, you skip inc for sched_delayed, but dequeue_task()
>> will have done the dec, so isn't it then still unbalanced?
>>
>> Oh well, I'll go stare at this tomorrow.
It did not appear to keep things unbalanced towards uclamp_rq_dec, but
that might've just been luck in not hitting the right code path.
In any case ...
>
> OK, just before I went to play in my giant hamster wheel it hit me.
>
> When dequeue_task() 'fails' and sets sched_delayed, we'll have done
> uclamp_rq_dec().
>
> Then, since the delayed task is still on the rq -- per the failure -- it
> can be migrated, this will again do dequeue_task() which will *again* do
> a uclamp_rq_dec().
>
> So now we have a double dequeue -- *FAIL*.
>
> Worse, the migration will then do an enqueue_task() on the new rq
> causing uclamp_rq_inc(). If you then get a ttwu() / ENQUEUE_DELAYED, you
> can tickle yet another uclamp_rq_inc() for another fail.
>
> Something like the below avoids uclamp_rq_{inc,dec}() when
> ->sched_delayed, and moves it post class->enqueue_task() such that for
> the ENQUEUE_DELAYED case, we *will* do the inc after ->sched_delayed
> gets cleared.
>
> Hmm?
>
> ---
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 9ac1054c2a4bb..965e6464e68e9 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -1676,6 +1676,9 @@ static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p)
> if (unlikely(!p->sched_class->uclamp_enabled))
> return;
>
> + if (p->se.sched_delayed)
> + return;
> +
> for_each_clamp_id(clamp_id)
> uclamp_rq_inc_id(rq, p, clamp_id);
>
> @@ -1700,6 +1703,9 @@ static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p)
> if (unlikely(!p->sched_class->uclamp_enabled))
> return;
>
> + if (p->se.sched_delayed)
> + return;
> +
> for_each_clamp_id(clamp_id)
> uclamp_rq_dec_id(rq, p, clamp_id);
> }
> @@ -1979,9 +1985,10 @@ void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
> psi_enqueue(p, (flags & ENQUEUE_WAKEUP) && !(flags & ENQUEUE_MIGRATED));
> }
>
> - uclamp_rq_inc(rq, p);
> p->sched_class->enqueue_task(rq, p, flags);
>
> + uclamp_rq_inc(rq, p);
> +
> if (sched_core_enabled(rq))
> sched_core_enqueue(rq, p);
> }
> @@ -2003,6 +2010,7 @@ bool dequeue_task(struct rq *rq, struct task_struct *p, int flags)
> }
>
> uclamp_rq_dec(rq, p);
> +
> return p->sched_class->dequeue_task(rq, p, flags);
> }
>
... thanks for the patch! The above seems to do it for me. I can see
more reasonable energy use with the eevdf-complete series. Still a
bit higher. Might be noise, we'll see.
I'll go stare at it and run some more tests on our end with this fix
applied on top and will report back.
On Wed, Jun 05, 2024 at 10:14:47AM +0100, Luis Machado wrote:
> ... thanks for the patch! The above seems to do it for me. I can see
> more reasonable energy use with the eevdf-complete series. Still a
> bit higher. Might be noise, we'll see.
W00t!!!
Let me write a decent Changelog and stuff it in the git tree along with
all the other bits.
Thanks for all the testing.
On Tue, Jun 04, 2024 at 09:12:20PM +0200, Peter Zijlstra wrote:
> But with the above, you skip inc for sched_delayed, but dequeue_task()
> will have done the dec, so isn't it then still unbalanced?
>
> Oh well, I'll go stare at this tomorrow.
OK, just before I went to play in my giant hamster wheel it hit me.
When dequeue_task() 'fails' and sets sched_delayed, we'll have done
uclamp_rq_dec().
Then, since the delayed task is still on the rq -- per the failure -- it
can be migrated, this will again do dequeue_task() which will *again* do
a uclamp_rq_dec().
So now we have a double dequeue -- *FAIL*.
Worse, the migration will then do an enqueue_task() on the new rq
causing uclamp_rq_inc(). If you then get a ttwu() / ENQUEUE_DELAYED, you
can tickle yet another uclamp_rq_inc() for another fail.
Something like the below avoids uclamp_rq_{inc,dec}() when
->sched_delayed, and moves the inc to after class->enqueue_task() such
that for the ENQUEUE_DELAYED case we *will* do it after ->sched_delayed
gets cleared.
Hmm?
---
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 9ac1054c2a4bb..965e6464e68e9 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1676,6 +1676,9 @@ static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p)
 	if (unlikely(!p->sched_class->uclamp_enabled))
 		return;

+	if (p->se.sched_delayed)
+		return;
+
 	for_each_clamp_id(clamp_id)
 		uclamp_rq_inc_id(rq, p, clamp_id);

@@ -1700,6 +1703,9 @@ static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p)
 	if (unlikely(!p->sched_class->uclamp_enabled))
 		return;

+	if (p->se.sched_delayed)
+		return;
+
 	for_each_clamp_id(clamp_id)
 		uclamp_rq_dec_id(rq, p, clamp_id);
 }
@@ -1979,9 +1985,10 @@ void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
 		psi_enqueue(p, (flags & ENQUEUE_WAKEUP) && !(flags & ENQUEUE_MIGRATED));
 	}

-	uclamp_rq_inc(rq, p);
 	p->sched_class->enqueue_task(rq, p, flags);

+	uclamp_rq_inc(rq, p);
+
 	if (sched_core_enabled(rq))
 		sched_core_enqueue(rq, p);
 }
@@ -2003,6 +2010,7 @@ bool dequeue_task(struct rq *rq, struct task_struct *p, int flags)
 	}

 	uclamp_rq_dec(rq, p);
+
 	return p->sched_class->dequeue_task(rq, p, flags);
 }
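
To make the failure sequence above concrete, here is a small userspace
sketch of the accounting (the names are stand-ins, not the kernel's data
structures): two plain counters play the role of the per-rq uclamp
buckets, and main() walks through sleep (delayed dequeue), migration and
the ENQUEUE_DELAYED wakeup. With the ->sched_delayed guards and the inc
moved after the class enqueue hook the counters stay balanced; without
them the old rq underflows and the new rq counts the task twice.

#include <stdbool.h>
#include <stdio.h>

struct rq { int uclamp; };		/* stand-in for the per-rq uclamp bucket count */
struct task { bool sched_delayed; };

static void uclamp_rq_inc(struct rq *rq, struct task *p)
{
	if (p->sched_delayed)		/* guard added by the patch above */
		return;
	rq->uclamp++;
}

static void uclamp_rq_dec(struct rq *rq, struct task *p)
{
	if (p->sched_delayed)		/* guard added by the patch above */
		return;
	rq->uclamp--;
}

/* a sleep 'fails' the class dequeue and leaves the task delayed on the rq */
static void dequeue_task(struct rq *rq, struct task *p, bool sleep)
{
	uclamp_rq_dec(rq, p);		/* dec runs before the flag gets set */
	if (sleep)
		p->sched_delayed = true;
}

static void enqueue_task(struct rq *rq, struct task *p, bool delayed_wakeup)
{
	if (delayed_wakeup)
		p->sched_delayed = false;	/* requeue clears the flag first */
	uclamp_rq_inc(rq, p);		/* inc now runs after the class hook */
}

int main(void)
{
	struct rq rq0 = { 0 }, rq1 = { 0 };
	struct task p = { .sched_delayed = false };

	enqueue_task(&rq0, &p, false);	/* task runnable on rq0:   rq0=1 rq1=0 */
	dequeue_task(&rq0, &p, true);	/* sleeps, stays delayed:  rq0=0 rq1=0 */
	dequeue_task(&rq0, &p, false);	/* migrated off rq0:       rq0=0 rq1=0 */
	enqueue_task(&rq1, &p, false);	/* migrated onto rq1:      rq0=0 rq1=0 */
	enqueue_task(&rq1, &p, true);	/* ttwu / ENQUEUE_DELAYED: rq0=0 rq1=1 */

	/* prints "rq0=0 rq1=1"; without the guards it would read -1 and 2 */
	printf("rq0=%d rq1=%d\n", rq0.uclamp, rq1.uclamp);
	return 0;
}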
On 6/5/24 10:42, Peter Zijlstra wrote:
> On Wed, Jun 05, 2024 at 10:14:47AM +0100, Luis Machado wrote:
>> ... thanks for the patch! The above seems to do it for me. I can see
>> more reasonable energy use with the eevdf-complete series. Still a
>> bit higher. Might be noise, we'll see.
>
> W00t!!!
>
> Let me write a decent Changelog and stuff it in the git tree along with
> all the other bits.
>
> Thanks for all the testing.
I've been doing some more testing of the eevdf-complete series with
the Pixel6/EAS platform. Hopefully these numbers will prove useful.
The energy regression from the original delayed-dequeue patch seems to
have been mostly fixed (with your proposed patch). Energy readings for
the big and mid cores are mostly stable and comparable with stock eevdf
(without eevdf-complete).
The only difference I can spot now is in the energy use of the little
cores. Compared to stock eevdf, the delayed-dequeue code seems to make
the energy use of the little cores a bit spiky: sometimes we get the
expected level of energy use, but other times we see 40%, 60% or even
90% more.
For instance...
(1) m6.6-eevdf-stock*: stock eevdf runs
(2) m6.6-eevdf-complete-ndd-dz*: eevdf-complete + NO_DELAY_DEQUEUE + DELAY_ZERO
(3) m6.6-eevdf-complete-dd-dz*: eevdf-complete + DELAY_DEQUEUE + DELAY_ZERO
+------------+---------------------------------+-----------+
| channel | tag | perc_diff |
+------------+---------------------------------+-----------+
| CPU-Little | m6.6-eevdf-stock-1 | 0.0% |
| CPU-Little | m6.6-eevdf-stock-2 | -4.21% |
| CPU-Little | m6.6-eevdf-stock-3 | -7.86% |
| CPU-Little | m6.6-eevdf-stock-4 | -5.67% |
| CPU-Little | m6.6-eevdf-stock-5 | -6.61% |
| CPU-Little | m6.6-eevdf-complete-ndd-dz-1 | -2.21% |
| CPU-Little | m6.6-eevdf-complete-ndd-dz-2 | -9.99% |
| CPU-Little | m6.6-eevdf-complete-ndd-dz-3 | -6.1% |
| CPU-Little | m6.6-eevdf-complete-ndd-dz-4 | -5.66% |
| CPU-Little | m6.6-eevdf-complete-ndd-dz-5 | -7.12% |
| CPU-Little | m6.6-eevdf-complete-dd-dz-1 | 96.69% |
| CPU-Little | m6.6-eevdf-complete-dd-dz-2 | 22.1% |
| CPU-Little | m6.6-eevdf-complete-dd-dz-3 | 44.82% |
| CPU-Little | m6.6-eevdf-complete-dd-dz-4 | -0.23% |
| CPU-Little | m6.6-eevdf-complete-dd-dz-5 | 8.28% |
+------------+---------------------------------+-----------+
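
As an aside, NO_DELAY_DEQUEUE, DELAY_DEQUEUE and DELAY_ZERO above are
scheduler feature flags from the series, so the three configurations can
be flipped at runtime on a debug build. A minimal sketch for selecting
configuration (2) -- assuming CONFIG_SCHED_DEBUG and the features file at
/sys/kernel/debug/sched/features, both of which may differ on a given
kernel -- could look like this:

#include <stdio.h>

/* write one feature token to the (assumed) sched features debugfs file */
static int set_sched_feat(const char *feat)
{
	FILE *f = fopen("/sys/kernel/debug/sched/features", "w");

	if (!f)
		return -1;
	fputs(feat, f);
	return fclose(f);
}

int main(void)
{
	/* "NO_<FEAT>" clears a feature, "<FEAT>" sets it */
	if (set_sched_feat("NO_DELAY_DEQUEUE") || set_sched_feat("DELAY_ZERO")) {
		perror("sched features");
		return 1;
	}
	return 0;
}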
Looking at what might explain the spiky behavior with DELAY_DEQUEUE, I
noticed the idle residency data (we have two idle states) shows similar
spikiness and offers some clues.
Looks like (1) and (2) enter idle states in a consistent manner and make
better use of the deeper idle state (idle 1), whereas (3) is more erratic
and more prone to taking the shallower idle state (idle 0) instead of the
deeper one.
+-------------------------------+---------+------------+-------+
| tag | cluster | idle_state | time |
+-------------------------------+---------+------------+-------+
| m6.6-eevdf-stock-1 | little | not idle | 63.49 |
| m6.6-eevdf-stock-1 | little | idle 0 | 30.66 |
| m6.6-eevdf-stock-1 | little | idle 1 | 12.15 |
| m6.6-eevdf-stock-2 | little | not idle | 62.6 |
| m6.6-eevdf-stock-2 | little | idle 0 | 31.13 |
| m6.6-eevdf-stock-2 | little | idle 1 | 14.56 |
| m6.6-eevdf-stock-3 | little | not idle | 63.98 |
| m6.6-eevdf-stock-3 | little | idle 0 | 31.54 |
| m6.6-eevdf-stock-3 | little | idle 1 | 15.91 |
| m6.6-eevdf-stock-4 | little | not idle | 64.18 |
| m6.6-eevdf-stock-4 | little | idle 0 | 31.32 |
| m6.6-eevdf-stock-4 | little | idle 1 | 15.83 |
| m6.6-eevdf-stock-5 | little | not idle | 63.32 |
| m6.6-eevdf-stock-5 | little | idle 0 | 30.4 |
| m6.6-eevdf-stock-5 | little | idle 1 | 14.33 |
| m6.6-eevdf-complete-ndd-dz-1 | little | not idle | 62.62 |
| m6.6-eevdf-complete-ndd-dz-1 | little | idle 0 | 29.48 |
| m6.6-eevdf-complete-ndd-dz-1 | little | idle 1 | 13.19 |
| m6.6-eevdf-complete-ndd-dz-2 | little | not idle | 64.12 |
| m6.6-eevdf-complete-ndd-dz-2 | little | idle 0 | 27.62 |
| m6.6-eevdf-complete-ndd-dz-2 | little | idle 1 | 14.73 |
| m6.6-eevdf-complete-ndd-dz-3 | little | not idle | 62.86 |
| m6.6-eevdf-complete-ndd-dz-3 | little | idle 0 | 27.87 |
| m6.6-eevdf-complete-ndd-dz-3 | little | idle 1 | 14.97 |
| m6.6-eevdf-complete-ndd-dz-4 | little | not idle | 63.01 |
| m6.6-eevdf-complete-ndd-dz-4 | little | idle 0 | 28.2 |
| m6.6-eevdf-complete-ndd-dz-4 | little | idle 1 | 14.11 |
| m6.6-eevdf-complete-ndd-dz-5 | little | not idle | 62.1 |
| m6.6-eevdf-complete-ndd-dz-5 | little | idle 0 | 29.06 |
| m6.6-eevdf-complete-ndd-dz-5 | little | idle 1 | 14.73 |
| m6.6-eevdf-complete-dd-dz-1 | little | not idle | 46.18 |
| m6.6-eevdf-complete-dd-dz-1 | little | idle 0 | 53.78 |
| m6.6-eevdf-complete-dd-dz-1 | little | idle 1 | 3.75 |
| m6.6-eevdf-complete-dd-dz-2 | little | not idle | 57.64 |
| m6.6-eevdf-complete-dd-dz-2 | little | idle 0 | 40.47 |
| m6.6-eevdf-complete-dd-dz-2 | little | idle 1 | 7.39 |
| m6.6-eevdf-complete-dd-dz-3 | little | not idle | 43.14 |
| m6.6-eevdf-complete-dd-dz-3 | little | idle 0 | 57.73 |
| m6.6-eevdf-complete-dd-dz-3 | little | idle 1 | 3.65 |
| m6.6-eevdf-complete-dd-dz-4 | little | not idle | 58.97 |
| m6.6-eevdf-complete-dd-dz-4 | little | idle 0 | 36.4 |
| m6.6-eevdf-complete-dd-dz-4 | little | idle 1 | 9.42 |
| m6.6-eevdf-complete-dd-dz-5 | little | not idle | 55.85 |
| m6.6-eevdf-complete-dd-dz-5 | little | idle 0 | 36.75 |
| m6.6-eevdf-complete-dd-dz-5 | little | idle 1 | 13.14 |
+-------------------------------+---------+------------+-------+
I can't draw a precise conclusion; it might be down to delayed util_est
updates or to the additional time delayed-dequeue tasks spend on the
runqueue. Either way, delayed dequeue does change the overall energy
behavior a bit on these heterogeneous platforms.