2024-04-05 13:06:48

by Peter Zijlstra

Subject: [RFC][PATCH 08/10] sched/fair: Implement delayed dequeue

Extend / fix 86bfbb7ce4f6 ("sched/fair: Add lag based placement") by
noting that lag is fundamentally a temporal measure. It should not be
carried around indefinitely.

OTOH it should also not be instantly discarded, doing so will allow a
task to game the system by purposefully (micro) sleeping at the end of
its time quantum.

Since lag is intimately tied to the virtual time base, a wall-time
based decay is also insufficient, notably competition is required for
any of this to make sense.

Instead, delay the dequeue and keep the 'tasks' on the runqueue,
competing until they are eligible.

Strictly speaking, we only care about keeping them until the 0-lag
point, but that is a difficult proposition, instead carry them around
until they get picked again, and dequeue them at that point.

Since we should have dequeued them at the 0-lag point, truncate lag
(eg. don't let them earn positive lag).

XXX test the cfs-throttle stuff

Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
---
include/linux/sched.h | 1
kernel/sched/core.c | 22 +++++--
kernel/sched/fair.c | 148 +++++++++++++++++++++++++++++++++++++++++++-----
kernel/sched/features.h | 12 +++
kernel/sched/sched.h | 2
5 files changed, 167 insertions(+), 18 deletions(-)

--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -542,6 +542,7 @@ struct sched_entity {

struct list_head group_node;
unsigned int on_rq;
+ unsigned int sched_delayed;

u64 exec_start;
u64 sum_exec_runtime;
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2154,10 +2154,14 @@ void activate_task(struct rq *rq, struct

void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
{
- WRITE_ONCE(p->on_rq, (flags & DEQUEUE_SLEEP) ? 0 : TASK_ON_RQ_MIGRATING);
- ASSERT_EXCLUSIVE_WRITER(p->on_rq);
+ bool sleep = flags & DEQUEUE_SLEEP;

- dequeue_task(rq, p, flags);
+ if (dequeue_task(rq, p, flags)) {
+ WRITE_ONCE(p->on_rq, sleep ? 0 : TASK_ON_RQ_MIGRATING);
+ ASSERT_EXCLUSIVE_WRITER(p->on_rq);
+ } else {
+ SCHED_WARN_ON(!sleep); /* only sleep can fail */
+ }
}

static inline int __normal_prio(int policy, int rt_prio, int nice)
@@ -3858,12 +3862,17 @@ static int ttwu_runnable(struct task_str

rq = __task_rq_lock(p, &rf);
if (task_on_rq_queued(p)) {
+ update_rq_clock(rq);
+ if (p->se.sched_delayed) {
+ /* mustn't run a delayed task */
+ SCHED_WARN_ON(task_on_cpu(rq, p));
+ enqueue_task(rq, p, ENQUEUE_DELAYED);
+ }
if (!task_on_cpu(rq, p)) {
/*
* When on_rq && !on_cpu the task is preempted, see if
* it should preempt the task that is current now.
*/
- update_rq_clock(rq);
wakeup_preempt(rq, p, wake_flags);
}
ttwu_do_wakeup(p);
@@ -4243,11 +4252,16 @@ int try_to_wake_up(struct task_struct *p
* case the whole 'p->on_rq && ttwu_runnable()' case below
* without taking any locks.
*
+ * Specifically, given current runs ttwu() we must be before
+ * schedule()'s deactivate_task(), as such this must not
+ * observe sched_delayed.
+ *
* In particular:
* - we rely on Program-Order guarantees for all the ordering,
* - we're serialized against set_special_state() by virtue of
* it disabling IRQs (this allows not taking ->pi_lock).
*/
+ SCHED_WARN_ON(p->se.sched_delayed);
if (!ttwu_state_match(p, state, &success))
goto out;

--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5270,6 +5270,10 @@ static inline int cfs_rq_throttled(struc
static inline bool cfs_bandwidth_used(void);

static void
+requeue_delayed_entity(struct sched_entity *se);
+
+/* XXX bool and pull in the requeue_delayed_entity thing */
+static void
enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
bool curr = cfs_rq->curr == se;
@@ -5356,18 +5360,33 @@ static void clear_buddies(struct cfs_rq

static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);

-static void
+static bool
dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
- int action = UPDATE_TG;
-
- if (entity_is_task(se) && task_on_rq_migrating(task_of(se)))
- action |= DO_DETACH;
-
/*
* Update run-time statistics of the 'current'.
*/
- update_curr(cfs_rq);
+ if (flags & DEQUEUE_DELAYED) {
+ SCHED_WARN_ON(!se->sched_delayed);
+ se->sched_delayed = 0;
+ } else {
+ bool sleep = flags & DEQUEUE_SLEEP;
+
+ SCHED_WARN_ON(sleep && se->sched_delayed);
+ update_curr(cfs_rq);
+
+ if (sched_feat(DELAY_DEQUEUE) && sleep &&
+ !entity_eligible(cfs_rq, se)) {
+ if (cfs_rq->next == se)
+ cfs_rq->next = NULL;
+ se->sched_delayed = 1;
+ return false;
+ }
+ }
+
+ int action = UPDATE_TG;
+ if (entity_is_task(se) && task_on_rq_migrating(task_of(se)))
+ action |= DO_DETACH;

/*
* When dequeuing a sched_entity, we must:
@@ -5407,6 +5426,8 @@ dequeue_entity(struct cfs_rq *cfs_rq, st

if (cfs_rq->nr_running == 0)
update_idle_cfs_rq_clock_pelt(cfs_rq);
+
+ return true;
}

static void
@@ -5432,6 +5453,7 @@ set_next_entity(struct cfs_rq *cfs_rq, s
}

update_stats_curr_start(cfs_rq, se);
+ SCHED_WARN_ON(cfs_rq->curr);
cfs_rq->curr = se;

/*
@@ -5452,6 +5474,8 @@ set_next_entity(struct cfs_rq *cfs_rq, s
se->prev_sum_exec_runtime = se->sum_exec_runtime;
}

+static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags);
+
/*
* Pick the next process, keeping these things in mind, in this order:
* 1) keep things fair between processes/task groups
@@ -5460,16 +5484,29 @@ set_next_entity(struct cfs_rq *cfs_rq, s
* 4) do not run the "skip" process, if something else is available
*/
static struct sched_entity *
-pick_next_entity(struct cfs_rq *cfs_rq)
+pick_next_entity(struct rq *rq, struct cfs_rq *cfs_rq)
{
/*
* Enabling NEXT_BUDDY will affect latency but not fairness.
*/
if (sched_feat(NEXT_BUDDY) &&
- cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next))
+ cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next)) {
+ /* ->next will never be delayed */
+ SCHED_WARN_ON(cfs_rq->next->sched_delayed);
return cfs_rq->next;
+ }

- return pick_eevdf(cfs_rq);
+ struct sched_entity *se = pick_eevdf(cfs_rq);
+ if (se->sched_delayed) {
+ dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
+ SCHED_WARN_ON(se->sched_delayed);
+ SCHED_WARN_ON(se->on_rq);
+ if (sched_feat(DELAY_ZERO) && se->vlag > 0)
+ se->vlag = 0;
+
+ return NULL;
+ }
+ return se;
}

static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
@@ -5493,6 +5530,7 @@ static void put_prev_entity(struct cfs_r
/* in !on_rq case, update occurred at dequeue */
update_load_avg(cfs_rq, prev, 0);
}
+ SCHED_WARN_ON(cfs_rq->curr != prev);
cfs_rq->curr = NULL;
}

@@ -5793,6 +5831,10 @@ static bool throttle_cfs_rq(struct cfs_r
if (!se->on_rq)
goto done;

+ /*
+ * XXX should be fine vs sched_delay; it won't run after this.
+ * Either pick dequeues it, or unthrottle. Double check!!
+ */
dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);

if (cfs_rq_is_idle(group_cfs_rq(se)))
@@ -5882,8 +5924,11 @@ void unthrottle_cfs_rq(struct cfs_rq *cf
for_each_sched_entity(se) {
struct cfs_rq *qcfs_rq = cfs_rq_of(se);

- if (se->on_rq)
+ if (se->on_rq) {
+ if (se->sched_delayed)
+ requeue_delayed_entity(se);
break;
+ }
enqueue_entity(qcfs_rq, se, ENQUEUE_WAKEUP);

if (cfs_rq_is_idle(group_cfs_rq(se)))
@@ -6729,6 +6774,40 @@ static int sched_idle_cpu(int cpu)
}
#endif

+static void
+requeue_delayed_entity(struct sched_entity *se)
+{
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+ /*
+ * se->sched_delayed should imply both: se->on_rq == 1 and
+ * cfs_rq->curr != se. Because a delayed entity is one that is still on
+ * the runqueue competing until eligibility.
+ *
+ * Except for groups, consider current going idle and newidle pulling a
+ * task in the same group -- in that case 'cfs_rq->curr == se'.
+ */
+ SCHED_WARN_ON(!se->sched_delayed);
+ SCHED_WARN_ON(!se->on_rq);
+ SCHED_WARN_ON(entity_is_task(se) && cfs_rq->curr == se);
+
+ if (sched_feat(DELAY_ZERO)) {
+ update_entity_lag(cfs_rq, se);
+ if (se->vlag > 0) {
+ cfs_rq->nr_running--;
+ if (se != cfs_rq->curr)
+ __dequeue_entity(cfs_rq, se);
+ se->vlag = 0;
+ place_entity(cfs_rq, se, 0);
+ if (se != cfs_rq->curr)
+ __enqueue_entity(cfs_rq, se);
+ cfs_rq->nr_running++;
+ }
+ }
+
+ se->sched_delayed = 0;
+}
+
/*
* The enqueue_task method is called before nr_running is
* increased. Here we update the fair scheduling stats and
@@ -6742,6 +6821,11 @@ enqueue_task_fair(struct rq *rq, struct
int idle_h_nr_running = task_has_idle_policy(p);
int task_new = !(flags & ENQUEUE_WAKEUP);

+ if (flags & ENQUEUE_DELAYED) {
+ requeue_delayed_entity(se);
+ return;
+ }
+
/*
* The code below (indirectly) updates schedutil which looks at
* the cfs_rq utilization to select a frequency.
@@ -6759,8 +6843,11 @@ enqueue_task_fair(struct rq *rq, struct
cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT);

for_each_sched_entity(se) {
- if (se->on_rq)
+ if (se->on_rq) {
+ if (se->sched_delayed)
+ requeue_delayed_entity(se);
break;
+ }
cfs_rq = cfs_rq_of(se);
enqueue_entity(cfs_rq, se, flags);

@@ -6836,6 +6923,7 @@ static int dequeue_entities(struct rq *r
{
bool was_sched_idle = sched_idle_rq(rq);
bool task_sleep = flags & DEQUEUE_SLEEP;
+ bool task_delayed = flags & DEQUEUE_DELAYED;
struct task_struct *p = NULL;
struct cfs_rq *cfs_rq;
int idle_h_nr_running;
@@ -6849,7 +6937,13 @@ static int dequeue_entities(struct rq *r

for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
- dequeue_entity(cfs_rq, se, flags);
+
+ if (!dequeue_entity(cfs_rq, se, flags)) {
+ if (p && &p->se == se)
+ return -1;
+
+ break;
+ }

/* h_nr_running is the hierachical count of tasks */
if (p) {
@@ -6877,6 +6971,7 @@ static int dequeue_entities(struct rq *r
break;
}
flags |= DEQUEUE_SLEEP;
+ flags &= ~DEQUEUE_DELAYED;
}

for_each_sched_entity(se) {
@@ -6906,6 +7001,18 @@ static int dequeue_entities(struct rq *r
/* balance early to pull high priority tasks */
if (unlikely(!was_sched_idle && sched_idle_rq(rq)))
rq->next_balance = jiffies;
+
+ if (task_delayed) {
+ SCHED_WARN_ON(!task_sleep);
+ SCHED_WARN_ON(p->on_rq != 1);
+
+ /* Fix-up what dequeue_task_fair() skipped */
+ util_est_update(&rq->cfs, p, task_sleep);
+ hrtick_update(rq);
+
+ /* Fix-up what deactivate_task() skipped. */
+ WRITE_ONCE(p->on_rq, 0);
+ }
}

return 1;
@@ -6923,6 +7030,10 @@ static bool dequeue_task_fair(struct rq
if (dequeue_entities(rq, &p->se, flags) < 0)
return false;

+ /*
+ * It doesn't make sense to update util_est for the delayed dequeue
+ * case where ttwu will make it appear the sleep never happened.
+ */
util_est_update(&rq->cfs, p, flags & DEQUEUE_SLEEP);
hrtick_update(rq);
return true;
@@ -8463,7 +8574,9 @@ static struct task_struct *pick_task_fai
if (unlikely(check_cfs_rq_runtime(cfs_rq)))
goto again;

- se = pick_next_entity(cfs_rq);
+ se = pick_next_entity(rq, cfs_rq);
+ if (!se)
+ goto again;
cfs_rq = group_cfs_rq(se);
} while (cfs_rq);

@@ -12803,10 +12916,17 @@ static void attach_task_cfs_rq(struct ta
static void switched_from_fair(struct rq *rq, struct task_struct *p)
{
detach_task_cfs_rq(p);
+ // XXX think harder on this
+ // this could actually be handled correctly I suppose; keep the whole
+ // se enqueued while boosted. horrible complexity though
+ p->se.sched_delayed = 0;
+ // XXX also vlag ?!?
}

static void switched_to_fair(struct rq *rq, struct task_struct *p)
{
+ SCHED_WARN_ON(p->se.sched_delayed);
+
attach_task_cfs_rq(p);

set_task_max_allowed_capacity(p);
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -29,6 +29,18 @@ SCHED_FEAT(NEXT_BUDDY, false)
SCHED_FEAT(CACHE_HOT_BUDDY, true)

/*
+ * Delay dequeueing tasks until they get selected or woken.
+ *
+ * By delaying the dequeue for non-eligible tasks, they remain in the
+ * competition and can burn off their negative lag. When they get selected
+ * they'll have positive lag by definition.
+ *
+ * DELAY_ZERO clips the lag on dequeue (or wakeup) to 0.
+ */
+SCHED_FEAT(DELAY_DEQUEUE, true)
+SCHED_FEAT(DELAY_ZERO, true)
+
+/*
* Allow wakeup-time preemption of the current task:
*/
SCHED_FEAT(WAKEUP_PREEMPTION, true)
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2245,6 +2245,7 @@ extern const u32 sched_prio_to_wmult[40
#define DEQUEUE_MOVE 0x04 /* Matches ENQUEUE_MOVE */
#define DEQUEUE_NOCLOCK 0x08 /* Matches ENQUEUE_NOCLOCK */
#define DEQUEUE_MIGRATING 0x100 /* Matches ENQUEUE_MIGRATING */
+#define DEQUEUE_DELAYED 0x200 /* Matches ENQUEUE_DELAYED */

#define ENQUEUE_WAKEUP 0x01
#define ENQUEUE_RESTORE 0x02
@@ -2260,6 +2261,7 @@ extern const u32 sched_prio_to_wmult[40
#endif
#define ENQUEUE_INITIAL 0x80
#define ENQUEUE_MIGRATING 0x100
+#define ENQUEUE_DELAYED 0x200

#define RETRY_TASK ((void *)-1UL)





2024-04-06 09:29:42

by Chen Yu

Subject: Re: [RFC][PATCH 08/10] sched/fair: Implement delayed dequeue

On 2024-04-05 at 12:28:02 +0200, Peter Zijlstra wrote:
> Extend / fix 86bfbb7ce4f6 ("sched/fair: Add lag based placement") by
> noting that lag is fundamentally a temporal measure. It should not be
> carried around indefinitely.
>
> OTOH it should also not be instantly discarded, doing so will allow a
> task to game the system by purposefully (micro) sleeping at the end of
> its time quantum.
>
> Since lag is intimately tied to the virtual time base, a wall-time
> based decay is also insufficient, notably competition is required for
> any of this to make sense.
>
> Instead, delay the dequeue and keep the 'tasks' on the runqueue,
> competing until they are eligible.
>
> Strictly speaking, we only care about keeping them until the 0-lag
> point, but that is a difficult proposition, instead carry them around
> until they get picked again, and dequeue them at that point.
>
> Since we should have dequeued them at the 0-lag point, truncate lag
> (eg. don't let them earn positive lag).
>
> XXX test the cfs-throttle stuff
>
> Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
> ---

Tested schbench on a Xeon server, which has 240 CPUs / 2 sockets:
schbench -m 2 -r 100
The result seems OK to me.

baseline:
NO_DELAY_DEQUEUE
NO_DELAY_ZERO
Wakeup Latencies percentiles (usec) runtime 100 (s) (1658446 total samples)
50.0th: 5 (361126 samples)
90.0th: 11 (654121 samples)
* 99.0th: 25 (123032 samples)
99.9th: 673 (13845 samples)
min=1, max=8337
Request Latencies percentiles (usec) runtime 100 (s) (1662381 total samples)
50.0th: 14992 (524771 samples)
90.0th: 15344 (657370 samples)
* 99.0th: 15568 (129769 samples)
99.9th: 15888 (10017 samples)
min=3529, max=43841
RPS percentiles (requests) runtime 100 (s) (101 total samples)
20.0th: 16544 (37 samples)
* 50.0th: 16608 (30 samples)
90.0th: 16736 (31 samples)
min=16403, max=17698
average rps: 16623.81


DELAY_DEQUEUE
DELAY_ZERO
Wakeup Latencies percentiles (usec) runtime 100 (s) (1668161 total samples)
50.0th: 6 (394867 samples)
90.0th: 12 (653021 samples)
* 99.0th: 31 (142636 samples)
99.9th: 755 (14547 samples)
min=1, max=5226
Request Latencies percentiles (usec) runtime 100 (s) (1671859 total samples)
50.0th: 14384 (511809 samples)
90.0th: 14992 (653508 samples)
* 99.0th: 15408 (149257 samples)
99.9th: 15984 (12090 samples)
min=3546, max=38360
RPS percentiles (requests) runtime 100 (s) (101 total samples)
20.0th: 16672 (45 samples)
* 50.0th: 16736 (52 samples)
90.0th: 16736 (0 samples)
min=16629, max=16800
average rps: 16718.59


The 99th wakeup latency increases a little bit, and should be in the acceptable
range (25 -> 31 us). Meanwhile the throughput increases accordingly. Here are
the possible reasons I can think of:

1. wakeup latency: The time to find an eligible entity in the tree
during wakeup might take longer - if there are more delayed-dequeue
tasks in the tree.
2. throughput: Inhibiting the task dequeue can reduce how often we touch the
task group's load_avg: dequeue_entity() -> { update_load_avg(), update_cfs_group() },
which reduces the cache contention among CPUs, and improves throughput.


> + } else {
> + bool sleep = flags & DEQUEUE_SLEEP;
> +
> + SCHED_WARN_ON(sleep && se->sched_delayed);
> + update_curr(cfs_rq);
> +
> + if (sched_feat(DELAY_DEQUEUE) && sleep &&
> + !entity_eligible(cfs_rq, se)) {

Regarding the eligible check, it was found that there could be an overflow
issue that brings a false negative of entity_eligible(), which was described here:
https://lore.kernel.org/lkml/[email protected]/
and also reported on another machine:
https://lore.kernel.org/lkml/[email protected]/
I don't have a good idea how to avoid that overflow properly. While I'm trying to
reproduce it locally, do you have any guidance on how to address it?
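
If it is the kind of overflow I suspect, the pattern is the weighted key
wrapping around s64 and flipping the comparison. A toy illustration for
reference only (this is not the kernel's entity_eligible() arithmetic,
just the wrap):

/* Toy illustration of an s64 multiply wrap, for reference only. */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint64_t key  = 1ull << 40;     /* large (vruntime - min_vruntime) */
        uint64_t load = 1ull << 23;     /* summed load weight              */

        /* 2^63 does not fit in s64, so the product wraps negative ...     */
        int64_t weighted = (int64_t)(key * load);

        /* ... and an "is it still >= 0"-style eligibility check fails.    */
        printf("weighted = %lld, looks eligible: %d\n",
               (long long)weighted, weighted >= 0);
        return 0;
}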

thanks,
Chenyu

2024-04-08 09:07:04

by Peter Zijlstra

Subject: Re: [RFC][PATCH 08/10] sched/fair: Implement delayed dequeue

On Sat, Apr 06, 2024 at 05:23:25PM +0800, Chen Yu wrote:

> The 99th wakeup latency increases a little bit, and should be in the acceptable
> range (25 -> 31 us).

Ah, my test runs haven't been stable enough to observe that.

> Meanwhile the throughput increases accordingly. Here are
> the possible reasons I can think of:
>
> 1. wakeup latency: The time to find an eligible entity in the tree
> during wakeup might take longer - if there are more delayed-dequeue
> tasks in the tree.

Another possible cause might be that previously a schedule() would be
1 dequeue, 1 pick.

But now it can be much more variable, a pick can basically do N dequeues
and N+1 picks.

So not only do we do more picks, but if you're focussed on worst case
latency, it goes up, because we can do multiple dequeues for a single
pick.

If we find this to really be a problem, I had some half baked ideas to
fix it, but it added significant complexity, so keep it simple until
need proves we need more etc.
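
To make the "N dequeues per pick" point concrete, here is a rough sketch
of the pick path with delayed dequeue, boiled down from the
pick_task_fair()/pick_next_entity() changes in the patch above
(throttling and the cgroup details are left out); every delayed entity
found at the top of the tree costs one extra dequeue plus a restart of
the whole pick:

/* Sketch only; simplified from the patch, not the actual code. */
static struct task_struct *pick_task_fair_sketch(struct rq *rq)
{
        struct cfs_rq *cfs_rq;
        struct sched_entity *se;

again:
        cfs_rq = &rq->cfs;
        if (!cfs_rq->nr_running)
                return NULL;

        do {
                se = pick_next_entity(rq, cfs_rq);
                if (!se)                /* picked a delayed entity: it was  */
                        goto again;     /* dequeued, so restart the pick    */
                cfs_rq = group_cfs_rq(se);
        } while (cfs_rq);

        return task_of(se);
}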

> 2. throughput: Inhibiting the task dequeue can reduce how often we touch the
> task group's load_avg: dequeue_entity() -> { update_load_avg(), update_cfs_group() },
> which reduces the cache contention among CPUs, and improves throughput.

Ah, yes, there's that.

> > + } else {
> > + bool sleep = flags & DEQUEUE_SLEEP;
> > +
> > + SCHED_WARN_ON(sleep && se->sched_delayed);
> > + update_curr(cfs_rq);
> > +
> > + if (sched_feat(DELAY_DEQUEUE) && sleep &&
> > + !entity_eligible(cfs_rq, se)) {
>
> Regarding the eligible check, it was found that there could be an overflow
> issue that brings a false negative of entity_eligible(), which was described here:
> https://lore.kernel.org/lkml/[email protected]/
> and also reported on another machine:
> https://lore.kernel.org/lkml/[email protected]/
> I don't have a good idea how to avoid that overflow properly. While I'm trying to
> reproduce it locally, do you have any guidance on how to address it?

I have not yet seen those, let me go stare at them now. Thanks!

2024-04-11 01:33:19

by Yan-Jie Wang

Subject: Re: [RFC][PATCH 08/10] sched/fair: Implement delayed dequeue

>
>> The 99th wakeup latency increases a little bit, and should be in the acceptable
>> range (25 -> 31 us).
>
> Ah, my test runs haven't been stable enough to observe that.
>
>> Meanwhile the throughput increases accordingly. Here are
>> the possible reasons I can think of:
>>
>> 1. wakeup latency: The time to find an eligible entity in the tree
>> during wakeup might take longer - if there are more delayed-dequeue
>> tasks in the tree.
>
> Another possible cause might be that previously a schedule() would be
> 1 dequeue, 1 pick.
>
> But now it can be much more variable, a pick can basically do N dequeues
> and N+1 picks.
>
> So not only do we do more picks, but if you're focussed on worst case
> latency, it goes up, because we can do multiple dequeues for a single
> pick.
>
> If we find this to really be a problem, I had some half baked ideas to
> fix it, but it added significant complexity, so keep it simple until
> need proves we need more etc.

I have an alternative approach to delayed-dequeue inspired by the
original CFS implementation.

The idea is to keep the task's vruntime when it goes to sleep.
When the task is woken up, check whether the lag is positive at wakeup time;
if it is, clamp it to 0 by setting vruntime to avg_vruntime().


<Sleep>

In dequeue_entity(): Remove the task from the runqueue, but keep the task's
vruntime, and do not calculate vlag at this time.


<Wake Up on the same CPU>

In enqueue_entity():
1. Do not call place_entity().
2. If the task's vruntime is less than the cfs_rq's avg_vruntime(),
set the task's vruntime to avg_vruntime(), and update the task's
deadline according to its timeslice.
3. Insert the task into the runqueue.


<Wake Up on different CPU>

In migrate_task_rq_fair():
1. Calculate the task's vlag as if it is on the original cfs_rq.
2. Set the task's vlag to 0 if it is positive.

In enqueue_entity(): Use place_entity() to calculate the task's new
vruntime and deadline according to the vlag and the new runqueue before
inserting it into the runqueue.
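
A minimal sketch of the same-CPU wakeup clamp described above
(hypothetical code; it borrows the fair.c helpers avg_vruntime() and
calc_delta_fair() purely for illustration, this is not a tested patch):

/* Hypothetical sketch of the proposal above, not actual kernel code. */
static void place_woken_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
        u64 avg = avg_vruntime(cfs_rq);

        /*
         * The sleeper kept its old vruntime. Positive lag means it is
         * behind the average; forget that credit by lifting it to avg.
         */
        if ((s64)(avg - se->vruntime) > 0) {
                se->vruntime = avg;
                /* re-derive the deadline from the new vruntime and slice */
                se->deadline = se->vruntime + calc_delta_fair(se->slice, se);
        }
}

The cross-CPU case would still go through place_entity() with the vlag
already clamped to non-positive, as described in the last paragraph.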


2024-04-12 10:43:54

by K Prateek Nayak

Subject: Re: [RFC][PATCH 08/10] sched/fair: Implement delayed dequeue

Hello Peter,

On 4/5/2024 3:58 PM, Peter Zijlstra wrote:
> Extend / fix 86bfbb7ce4f6 ("sched/fair: Add lag based placement") by
> noting that lag is fundamentally a temporal measure. It should not be
> carried around indefinitely.
>
> OTOH it should also not be instantly discarded, doing so will allow a
> task to game the system by purposefully (micro) sleeping at the end of
> its time quantum.
>
> Since lag is intimately tied to the virtual time base, a wall-time
> based decay is also insufficient, notably competition is required for
> any of this to make sense.
>
> Instead, delay the dequeue and keep the 'tasks' on the runqueue,
> competing until they are eligible.
>
> Strictly speaking, we only care about keeping them until the 0-lag
> point, but that is a difficult proposition, instead carry them around
> until they get picked again, and dequeue them at that point.
>
> Since we should have dequeued them at the 0-lag point, truncate lag
> (eg. don't let them earn positive lag).
>
> XXX test the cfs-throttle stuff

I ran into a few issues when testing the series on top of tip:sched/core
at commit 4475cd8bfd9b ("sched/balancing: Simplify the sg_status bitmask
and use separate ->overloaded and ->overutilized flags"). All of these
splats surfaced when running unixbench with Delayed Dequeue (echoing
NO_DELAY_DEQUEUE to /sys/kernel/debug/sched/features seems to make the
system stable when running Unixbench spawn)

Unixbench (https://github.com/kdlucas/byte-unixbench.git) command:

./Run spawn -c 512

Splats appear soon into the run. Following are the splats and their
corresponding code blocks from my 3rd Generation EPYC system
(2 x 64C/128T):

1. NULL pointer dereferencing in can_migrate_task():

BUG: kernel NULL pointer dereference, address: 0000000000000040
#PF: supervisor read access in kernel mode
#PF: error_code(0x0000) - not-present page
PGD 0 P4D 0
Oops: 0000 [#1] PREEMPT SMP NOPTI
CPU: 154 PID: 1507736 Comm: spawn Not tainted 6.9.0-rc1-test+ #958
Hardware name: Dell Inc. PowerEdge R6525/024PW1, BIOS 2.7.3 03/30/2022
RIP: 0010:can_migrate_task+0x2b/0x6c0
Code: ...
RSP: 0018:ffffb6bb9e6a3bc0 EFLAGS: 00010086
RAX: 0000000000000000 RBX: ffffb6bb9e6a3c80 RCX: ffff90ad0d209400
RDX: 0000000000000008 RSI: ffffb6bb9e6a3c80 RDI: ffff90eb3b236438
RBP: ffff90eb3b236438 R08: 0000005c743512ab R09: ffffffffffff0000
R10: 0000000000000001 R11: 0000000000000100 R12: ffff90eb3b236438
R13: ffff90eb3b2364f0 R14: ffff90eb3b6359c0 R15: ffff90eb3b6359c0
FS: 0000000000000000(0000) GS:ffff90eb3df00000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000000000000040 CR3: 000000807da3c006 CR4: 0000000000770ef0
PKRU: 55555554
Call Trace:
<TASK>
? __die+0x24/0x70
? page_fault_oops+0x14a/0x510
? exc_page_fault+0x77/0x170
? asm_exc_page_fault+0x26/0x30
? can_migrate_task+0x2b/0x6c0
sched_balance_rq+0x7a8/0x1190
sched_balance_newidle+0x1e2/0x490
pick_next_task_fair+0x36/0x4a0
__schedule+0x1c0/0x1710
? srso_alias_return_thunk+0x5/0xfbef5
? refill_stock+0x1a/0x30
? srso_alias_return_thunk+0x5/0xfbef5
? obj_cgroup_uncharge_pages+0x4d/0xd0
do_task_dead+0x42/0x50
do_exit+0x777/0xad0
do_group_exit+0x30/0x80
__x64_sys_exit_group+0x18/0x20
do_syscall_64+0x79/0x120
? srso_alias_return_thunk+0x5/0xfbef5
? srso_alias_return_thunk+0x5/0xfbef5
? irqentry_exit_to_user_mode+0x5b/0x170
entry_SYSCALL_64_after_hwframe+0x6c/0x74
RIP: 0033:0x7f963a6eac31
Code: Unable to access opcode bytes at 0x7f963a6eac07.
RSP: 002b:00007ffc6b7158c8 EFLAGS: 00000246 ORIG_RAX: 00000000000000e7
RAX: ffffffffffffffda RBX: 00007f963a816a00 RCX: 00007f963a6eac31
RDX: 000000000000003c RSI: 00000000000000e7 RDI: 0000000000000000
RBP: 0000000000000000 R08: ffffffffffffff80 R09: 0000000000000020
R10: 0000000000000000 R11: 0000000000000246 R12: 00007f963a816a00
R13: 0000000000000000 R14: 00007f963a81bee8 R15: 00007f963a81bf00
</TASK>
Modules linked in: ...
CR2: 0000000000000040
---[ end trace 0000000000000000 ]---

$ scripts/faddr2line vmlinux can_migrate_task+0x2b/0x6c0
can_migrate_task+0x2b/0x6c0:
throttled_lb_pair at kernel/sched/fair.c:5738
(inlined by) can_migrate_task at kernel/sched/fair.c:9090

Corresponds to:

static inline int throttled_lb_pair(struct task_group *tg,
int src_cpu, int dest_cpu)
{
struct cfs_rq *src_cfs_rq, *dest_cfs_rq;

src_cfs_rq = tg->cfs_rq[src_cpu]; /* <----- Here -----< */
dest_cfs_rq = tg->cfs_rq[dest_cpu];

return throttled_hierarchy(src_cfs_rq) ||
throttled_hierarchy(dest_cfs_rq);
}

(inlined by)
int can_migrate_task(struct task_struct *p, struct lb_env *env)
{
/* Called here */
if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
return 0;

...
}


2. A NULL pointer dereferencing in pick_next_task_fair():

BUG: kernel NULL pointer dereference, address: 0000000000000098
#PF: supervisor read access in kernel mode
#PF: error_code(0x0000) - not-present page
PGD 0 P4D 0
Oops: 0000 [#1] PREEMPT SMP NOPTI
CPU: 107 PID: 1206665 Comm: spawn Tainted: G W 6.9.0-rc1-test+ #958
Hardware name: Dell Inc. PowerEdge R6525/024PW1, BIOS 2.7.3 03/30/2022
RIP: 0010:pick_next_task_fair+0x327/0x4a0
Code: ...
RSP: 0018:ffffb613c212fd28 EFLAGS: 00010002
RAX: 0000004ed2799383 RBX: 0000000000000000 RCX: ffff8f65baf3f800
RDX: ffff8f65baf3ca00 RSI: 0000000000000000 RDI: 000000825ae302ab
RBP: ffff8f64b13b59c0 R08: 0000000000000015 R09: 0000000000000314
R10: 0000000000000001 R11: 0000000000000001 R12: ffff8f261ac199c0
R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000
FS: 00007f768b79e740(0000) GS:ffff8f64b1380000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000000000000098 CR3: 00000040d1010006 CR4: 0000000000770ef0
PKRU: 55555554
Call Trace:
<TASK>
? __die+0x24/0x70
? page_fault_oops+0x14a/0x510
? srso_alias_return_thunk+0x5/0xfbef5
? report_bug+0x18e/0x1a0
? srso_alias_return_thunk+0x5/0xfbef5
? exc_page_fault+0x77/0x170
? asm_exc_page_fault+0x26/0x30
? pick_next_task_fair+0x327/0x4a0
? pick_next_task_fair+0x320/0x4a0
__schedule+0x1c0/0x1710
? release_task+0x2fc/0x4c0
? srso_alias_return_thunk+0x5/0xfbef5
schedule+0x30/0x120
syscall_exit_to_user_mode+0x98/0x1b0
do_syscall_64+0x85/0x120
? srso_alias_return_thunk+0x5/0xfbef5
? __count_memcg_events+0x69/0x100
? srso_alias_return_thunk+0x5/0xfbef5
? count_memcg_events.constprop.0+0x1a/0x30
? srso_alias_return_thunk+0x5/0xfbef5
? handle_mm_fault+0x17d/0x2e0
? srso_alias_return_thunk+0x5/0xfbef5
? do_user_addr_fault+0x33d/0x6f0
? srso_alias_return_thunk+0x5/0xfbef5
? srso_alias_return_thunk+0x5/0xfbef5
? irqentry_exit_to_user_mode+0x5b/0x170
entry_SYSCALL_64_after_hwframe+0x6c/0x74
RIP: 0033:0x7f768b4eab57
Code: ...
RSP: 002b:00007fff5f6e2018 EFLAGS: 00000246 ORIG_RAX: 0000000000000038
RAX: 000000000026d13b RBX: 00007f768b7ee040 RCX: 00007f768b4eab57
RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000001200011
RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000000
R10: 00007f768b79ea10 R11: 0000000000000246 R12: 0000000000000001
R13: 000055b811d1a140 R14: 000055b811d1cd88 R15: 00007f768b7ee040
</TASK>
Modules linked in: ...
CR2: 0000000000000098
---[ end trace 0000000000000000 ]---

$ scripts/faddr2line vmlinux pick_next_task_fair+0x327/0x4a0
pick_next_task_fair+0x327/0x4a0:
is_same_group at kernel/sched/fair.c:418
(inlined by) pick_next_task_fair at kernel/sched/fair.c:8625

struct cfs_rq *
is_same_group(struct sched_entity *se, struct sched_entity *pse)
{
if (se->cfs_rq == pse->cfs_rq) /* <----- HERE -----< */
return se->cfs_rq;

return NULL;
}

(inlined by)
struct task_struct *
pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
...
if (prev != p) {
...
while (!(cfs_rq = is_same_group(se, pse) /* <---- HERE ----< */)) {
...
}
...
}
...
}


3. A NULL Pointer dereferencing in __dequeue_entity():

BUG: kernel NULL pointer dereference, address: 0000000000000000
#PF: supervisor read access in kernel mode
#PF: error_code(0x0000) - not-present page
PGD 0 P4D 0
Oops: 0000 [#1] PREEMPT SMP NOPTI
CPU: 95 PID: 60896 Comm: spawn Not tainted 6.9.0-rc1-test+ #958
Hardware name: Dell Inc. PowerEdge R6525/024PW1, BIOS 2.7.3 03/30/2022
RIP: 0010:__rb_erase_color+0x88/0x260
Code: ...
RSP: 0018:ffffab158755fc08 EFLAGS: 00010046
RAX: 0000000000000000 RBX: ffffffff841314b0 RCX: 0000000017fc8dd0
RDX: 0000000000000000 RSI: ffff8decfb1fe450 RDI: ffff8decf80bcdd0
RBP: ffff8decf80bcdd0 R08: ffff8decf80bcdd0 R09: ffffffffffffbb60
R10: 0000000000000001 R11: 0000000000000001 R12: 0000000000000000
R13: ffff8decfb1fe450 R14: ffff8ded0ec03400 R15: ffff8decfb1fe400
FS: 00007f1ded0a2740(0000) GS:ffff8e2bb0d80000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000000000000000 CR3: 00000040e66f0005 CR4: 0000000000770ef0
PKRU: 55555554
Call Trace:
<TASK>
? __die+0x24/0x70
? page_fault_oops+0x14a/0x510
? exc_page_fault+0x77/0x170
? asm_exc_page_fault+0x26/0x30
? __pfx_min_vruntime_cb_rotate+0x10/0x10
? __rb_erase_color+0x88/0x260
__dequeue_entity+0x1b7/0x310
set_next_entity+0xc0/0x1e0
pick_next_task_fair+0x355/0x4a0
__schedule+0x1c0/0x1710
? native_queued_spin_lock_slowpath+0x2a4/0x2f0
schedule+0x30/0x120
do_wait+0xad/0x100
kernel_wait4+0xa9/0x150
? __pfx_child_wait_callback+0x10/0x10
do_syscall_64+0x79/0x120
? srso_alias_return_thunk+0x5/0xfbef5
? __count_memcg_events+0x69/0x100
? srso_alias_return_thunk+0x5/0xfbef5
? count_memcg_events.constprop.0+0x1a/0x30
? srso_alias_return_thunk+0x5/0xfbef5
? handle_mm_fault+0x17d/0x2e0
? srso_alias_return_thunk+0x5/0xfbef5
? do_user_addr_fault+0x33d/0x6f0
? srso_alias_return_thunk+0x5/0xfbef5
? srso_alias_return_thunk+0x5/0xfbef5
? irqentry_exit_to_user_mode+0x5b/0x170
entry_SYSCALL_64_after_hwframe+0x6c/0x74
RIP: 0033:0x7f1deceea3ea
Code: ...
RSP: 002b:00007ffd7fd37ca8 EFLAGS: 00000246 ORIG_RAX: 000000000000003d
RAX: ffffffffffffffda RBX: 00007ffd7fd37cb4 RCX: 00007f1deceea3ea
RDX: 0000000000000000 RSI: 00007ffd7fd37cb4 RDI: 00000000ffffffff
RBP: 0000000000000002 R08: 00000000000136f5 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000246 R12: 00007ffd7fd37dd8
R13: 000055fd8debf140 R14: 000055fd8dec1d88 R15: 00007f1ded0f2040
</TASK>
Modules linked in: ...
CR2: 0000000000000000
---[ end trace 0000000000000000 ]---


Note: I only ran into this issue with unixbench spawn. A bunch of other
benchmarks (hackbench, stream, tbench, netperf, schbench, other variants
of unixbench) ran fine without bringing down the machine.

Attaching my config below in case this is config specific.

>
> Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
> ---
> include/linux/sched.h | 1
> kernel/sched/core.c | 22 +++++--
> kernel/sched/fair.c | 148 +++++++++++++++++++++++++++++++++++++++++++-----
> kernel/sched/features.h | 12 +++
> kernel/sched/sched.h | 2
> 5 files changed, 167 insertions(+), 18 deletions(-)
>
> [..snip..]
>

--
Thanks and Regards,
Prateek


Attachments:
eevdf_complete_config (278.14 kB)

2024-04-15 10:57:47

by Mike Galbraith

Subject: Re: [RFC][PATCH 08/10] sched/fair: Implement delayed dequeue

On Fri, 2024-04-12 at 16:12 +0530, K Prateek Nayak wrote:
>
> I ran into a few issues when testing the series on top of tip:sched/core
> at commit 4475cd8bfd9b ("sched/balancing: Simplify the sg_status bitmask
> and use separate ->overloaded and ->overutilized flags"). All of these
> splats surfaced when running unixbench with Delayed Dequeue (echoing
> NO_DELAY_DEQUEUE to /sys/kernel/debug/sched/features seems to make the
> system stable when running Unixbench spawn)
>
> Unixbench (https://github.com/kdlucas/byte-unixbench.git) command:
>
>         ./Run spawn -c 512

That plus a hackbench loop works a treat.

>
> Splats appear soon into the run. Following are the splats and their
> corresponding code blocks from my 3rd Generation EPYC system
> (2 x 64C/128T):

Seems a big box is not required. With a low fat sched config (no group
sched), starting ./Run spawn -c 16 (cpus*2) along with a hackbench loop
reliably blows my old i7-4790 box out of the water nearly instantly.

DUMPFILE: vmcore
CPUS: 8
DATE: Mon Apr 15 07:20:29 CEST 2024
UPTIME: 00:07:23
LOAD AVERAGE: 1632.20, 684.99, 291.84
TASKS: 1401
NODENAME: homer
RELEASE: 6.9.0.g0bbac3f-master
VERSION: #7 SMP Mon Apr 15 06:40:05 CEST 2024
MACHINE: x86_64 (3591 Mhz)
MEMORY: 16 GB
PANIC: "Oops: 0000 [#1] SMP NOPTI" (check log for details)
PID: 22664
COMMAND: "hackbench"
TASK: ffff888100acbf00 [THREAD_INFO: ffff888100acbf00]
CPU: 5
STATE: TASK_WAKING (PANIC)

crash> bt -sx
PID: 22664 TASK: ffff888100acbf00 CPU: 5 COMMAND: "hackbench"
#0 [ffff88817cc17920] machine_kexec+0x156 at ffffffff810642d6
#1 [ffff88817cc17970] __crash_kexec+0xd7 at ffffffff81153147
#2 [ffff88817cc17a28] crash_kexec+0x23 at ffffffff811535f3
#3 [ffff88817cc17a38] oops_end+0xbe at ffffffff810329be
#4 [ffff88817cc17a58] page_fault_oops+0x81 at ffffffff81071951
#5 [ffff88817cc17ab8] exc_page_fault+0x62 at ffffffff8194f6f2
#6 [ffff88817cc17ae0] asm_exc_page_fault+0x22 at ffffffff81a00ba2
[exception RIP: pick_task_fair+71]
RIP: ffffffff810d5b57 RSP: ffff88817cc17b90 RFLAGS: 00010046
RAX: 0000000000000000 RBX: ffff88840ed70ec0 RCX: 00000001d7ec138c
RDX: ffffffffe7a7f400 RSI: 0000000000000000 RDI: 0000000000000000
RBP: ffff88840ed70ec0 R8: 0000000000000c00 R9: 000000675402f79e
R10: ffff88817cc17b30 R11: 00000000000000bb R12: ffff88840ed70f40
R13: ffffffff81f64f16 R14: ffff888100acc560 R15: ffff888100acbf00
ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0018
#7 [ffff88817cc17bb0] pick_next_task_fair+0x42 at ffffffff810d92c2
#8 [ffff88817cc17be0] __schedule+0x10d at ffffffff8195936d
#9 [ffff88817cc17c50] schedule+0x1c at ffffffff81959ddc
#10 [ffff88817cc17c60] schedule_timeout+0x18c at ffffffff8195fc4c
#11 [ffff88817cc17cc8] unix_stream_read_generic+0x2b7 at ffffffff81869917
#12 [ffff88817cc17da8] unix_stream_recvmsg+0x68 at ffffffff8186a2d8
#13 [ffff88817cc17de0] sock_read_iter+0x159 at ffffffff8170bd69
#14 [ffff88817cc17e70] vfs_read+0x2ce at ffffffff812f195e
#15 [ffff88817cc17ef8] ksys_read+0x40 at ffffffff812f21d0
#16 [ffff88817cc17f30] do_syscall_64+0x57 at ffffffff8194b947
#17 [ffff88817cc17f50] entry_SYSCALL_64_after_hwframe+0x76 at ffffffff81a0012b
RIP: 00007f625660871e RSP: 00007ffc75d48188 RFLAGS: 00000246
RAX: ffffffffffffffda RBX: 00007ffc75d48200 RCX: 00007f625660871e
RDX: 0000000000000064 RSI: 00007ffc75d48190 RDI: 0000000000000007
RBP: 00007ffc75d48260 R8: 00007ffc75d48140 R9: 00007f6256612010
R10: 00007f62565f5070 R11: 0000000000000246 R12: 0000000000000064
R13: 0000000000000000 R14: 0000000000000064 R15: 0000000000000000
ORIG_RAX: 0000000000000000 CS: 0033 SS: 002b
crash> dis pick_task_fair+71
0xffffffff810d5b57 <pick_task_fair+71>: cmpb $0x0,0x4c(%rax)
crash> gdb list *pick_task_fair+71
0xffffffff810d5b57 is in pick_task_fair (kernel/sched/fair.c:5498).
5493 SCHED_WARN_ON(cfs_rq->next->sched_delayed);
5494 return cfs_rq->next;
5495 }
5496
5497 struct sched_entity *se = pick_eevdf(cfs_rq);
5498 if (se->sched_delayed) {
5499 dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
5500 SCHED_WARN_ON(se->sched_delayed);
5501 SCHED_WARN_ON(se->on_rq);
5502 if (sched_feat(DELAY_ZERO) && se->vlag > 0)
crash> struct -ox sched_entity
struct sched_entity {
[0x0] struct load_weight load;
[0x10] struct rb_node run_node;
[0x28] u64 deadline;
[0x30] u64 min_vruntime;
[0x38] struct list_head group_node;
[0x48] unsigned int on_rq;
[0x4c] unsigned char sched_delayed;
[0x4d] unsigned char custom_slice;
[0x50] u64 exec_start;
[0x58] u64 sum_exec_runtime;
[0x60] u64 prev_sum_exec_runtime;
[0x68] u64 vruntime;
[0x70] s64 vlag;
[0x78] u64 slice;
[0x80] u64 nr_migrations;
[0xc0] struct sched_avg avg;
}
SIZE: 0x100
crash>


2024-04-15 17:20:24

by Luis Machado

Subject: Re: [RFC][PATCH 08/10] sched/fair: Implement delayed dequeue

Hi Peter,

On 4/5/24 11:28, Peter Zijlstra wrote:
> Extend / fix 86bfbb7ce4f6 ("sched/fair: Add lag based placement") by
> noting that lag is fundamentally a temporal measure. It should not be
> carried around indefinitely.
>
> OTOH it should also not be instantly discarded, doing so will allow a
> task to game the system by purposefully (micro) sleeping at the end of
> its time quantum.
>
> Since lag is intimately tied to the virtual time base, a wall-time
> based decay is also insufficient, notably competition is required for
> any of this to make sense.
>
> Instead, delay the dequeue and keep the 'tasks' on the runqueue,
> competing until they are eligible.
>
> Strictly speaking, we only care about keeping them until the 0-lag
> point, but that is a difficult proposition, instead carry them around
> until they get picked again, and dequeue them at that point.
>
> Since we should have dequeued them at the 0-lag point, truncate lag
> (eg. don't let them earn positive lag).
>
> XXX test the cfs-throttle stuff
>
> Signed-off-by: Peter Zijlstra (Intel) <[email protected]>

Playing around with a Pixel 6 running a 6.6-based kernel with this
series backported on top, I spotted a noticeable performance improvement
on the speedometer benchmark:

- m6.6-stock-* is the 6.6 mainline Android kernel unmodified.

- m6.6-eevdf-complete-* is the 6.6 mainline Android kernel with
this series applied on top (along with a few required backported
patches).

+-------------------+-----------------------+-----------+
| metric | tag | perc_diff |
+-------------------+-----------------------+-----------+
| Speedometer Score | m6.6-stock-1 | 0.0% |
| Speedometer Score | m6.6-stock-2 | 1.23% |
| Speedometer Score | m6.6-stock-3 | -0.22% |
| Speedometer Score | m6.6-eevdf-complete-1 | 4.54% |
| Speedometer Score | m6.6-eevdf-complete-2 | 4.95% |
| Speedometer Score | m6.6-eevdf-complete-3 | 6.07% |
+-------------------+-----------------------+-----------+

There are also some interesting improvements in terms of frame timing for
the uibenchjanktests benchmark, in particular in the metrics for missed
deadlines and jank (late) frames, which seem to indicate better
latencies.

+-----------------------+-----------------------+-----------+
| metric | tag | perc_diff |
+-----------------------+-----------------------+-----------+
| gfx-avg-frame-time-50 | m6.6-stock-1 | 0.0 |
| gfx-avg-frame-time-90 | m6.6-stock-1 | 0.0 |
| gfx-avg-frame-time-95 | m6.6-stock-1 | 0.0 |
| gfx-avg-frame-time-99 | m6.6-stock-1 | 0.0 |
| gfx-avg-frame-time-50 | m6.6-stock-2 | 3.46 |
| gfx-avg-frame-time-90 | m6.6-stock-2 | 1.19 |
| gfx-avg-frame-time-95 | m6.6-stock-2 | 0.24 |
| gfx-avg-frame-time-99 | m6.6-stock-2 | 0.48 |
| gfx-avg-frame-time-50 | m6.6-eevdf-complete-1 | -30.45 |
| gfx-avg-frame-time-90 | m6.6-eevdf-complete-1 | -48.44 |
| gfx-avg-frame-time-95 | m6.6-eevdf-complete-1 | -51.32 |
| gfx-avg-frame-time-99 | m6.6-eevdf-complete-1 | -52.48 |
| gfx-avg-frame-time-50 | m6.6-eevdf-complete-2 | -30.32 |
| gfx-avg-frame-time-90 | m6.6-eevdf-complete-2 | -48.16 |
| gfx-avg-frame-time-95 | m6.6-eevdf-complete-2 | -51.08 |
| gfx-avg-frame-time-99 | m6.6-eevdf-complete-2 | -51.7 |
+-----------------------+-----------------------+-----------+

+-----------------------------------+-----------------------+-----------+
| metric | tag | perc_diff |
+-----------------------------------+-----------------------+-----------+
| gfx-avg-num-frame-deadline-missed | m6.6-stock-1 | 0.0 |
| gfx-max-num-frame-deadline-missed | m6.6-stock-1 | 0.0 |
| gfx-avg-num-frame-deadline-missed | m6.6-stock-2 | -3.21 |
| gfx-max-num-frame-deadline-missed | m6.6-stock-2 | -3.21 |
| gfx-avg-num-frame-deadline-missed | m6.6-eevdf-complete-1 | -85.29 |
| gfx-max-num-frame-deadline-missed | m6.6-eevdf-complete-1 | -85.29 |
| gfx-avg-num-frame-deadline-missed | m6.6-eevdf-complete-2 | -84.8 |
| gfx-max-num-frame-deadline-missed | m6.6-eevdf-complete-2 | -84.8 |
+-----------------------------------+-----------------------+-----------+

+----------------------------+-----------------------+-----------+
| metric | tag | perc_diff |
+----------------------------+-----------------------+-----------+
| gfx-avg-high-input-latency | m6.6-stock-1 | 0.0 |
| gfx-max-high-input-latency | m6.6-stock-1 | 0.0 |
| gfx-avg-high-input-latency | m6.6-stock-2 | 0.93 |
| gfx-max-high-input-latency | m6.6-stock-2 | 0.93 |
| gfx-avg-high-input-latency | m6.6-eevdf-complete-1 | -18.35 |
| gfx-max-high-input-latency | m6.6-eevdf-complete-1 | -18.35 |
| gfx-avg-high-input-latency | m6.6-eevdf-complete-2 | -18.05 |
| gfx-max-high-input-latency | m6.6-eevdf-complete-2 | -18.05 |
+----------------------------+-----------------------+-----------+

+--------------+-----------------------+-----------+
| metric | tag | perc_diff |
+--------------+-----------------------+-----------+
| gfx-avg-jank | m6.6-stock-1 | 0.0 |
| gfx-max-jank | m6.6-stock-1 | 0.0 |
| gfx-avg-jank | m6.6-stock-2 | 1.56 |
| gfx-max-jank | m6.6-stock-2 | 1.56 |
| gfx-avg-jank | m6.6-eevdf-complete-1 | -82.81 |
| gfx-max-jank | m6.6-eevdf-complete-1 | -82.81 |
| gfx-avg-jank | m6.6-eevdf-complete-2 | -78.12 |
| gfx-max-jank | m6.6-eevdf-complete-2 | -78.12 |
+--------------+-----------------------+-----------+

Bisecting through the patches in this series, I ended up with patch 08/10
as the one that improved things overall for these benchmarks.

I'd like to investigate this further to understand the reason behind some of
these dramatic improvements.

> ---
> include/linux/sched.h | 1
> kernel/sched/core.c | 22 +++++--
> kernel/sched/fair.c | 148 +++++++++++++++++++++++++++++++++++++++++++-----
> kernel/sched/features.h | 12 +++
> kernel/sched/sched.h | 2
> 5 files changed, 167 insertions(+), 18 deletions(-)
>
> [..snip..]
>


2024-04-16 03:19:18

by K Prateek Nayak

Subject: Re: [RFC][PATCH 08/10] sched/fair: Implement delayed dequeue

(+ Chen Yu, Oliver Sang)

Hello Mike,

On 4/15/2024 4:26 PM, Mike Galbraith wrote:
> On Fri, 2024-04-12 at 16:12 +0530, K Prateek Nayak wrote:
>>
>> I ran into a few issues when testing the series on top of tip:sched/core
>> at commit 4475cd8bfd9b ("sched/balancing: Simplify the sg_status bitmask
>> and use separate ->overloaded and ->overutilized flags"). All of these
>> splats surfaced when running unixbench with Delayed Dequeue (echoing
>> NO_DELAY_DEQUEUE to /sys/kernel/debug/sched/features seems to make the
>> system stable when running Unixbench spawn)
>>
>> Unixbench (https://github.com/kdlucas/byte-unixbench.git) command:
>>
>>         ./Run spawn -c 512
>
> That plus a hackbench loop works a treat.
>
>>
>> Splats appear soon into the run. Following are the splats and their
>> corresponding code blocks from my 3rd Generation EPYC system
>> (2 x 64C/128T):
>
> Seems a big box is not required. With a low fat sched config (no group
> sched), starting ./Run spawn -c 16 (cpus*2) along with a hackbench loop
> reliably blows my old i7-4790 box out of the water nearly instantly.
>
> DUMPFILE: vmcore
> CPUS: 8
> DATE: Mon Apr 15 07:20:29 CEST 2024
> UPTIME: 00:07:23
> LOAD AVERAGE: 1632.20, 684.99, 291.84
> TASKS: 1401
> NODENAME: homer
> RELEASE: 6.9.0.g0bbac3f-master
> VERSION: #7 SMP Mon Apr 15 06:40:05 CEST 2024
> MACHINE: x86_64 (3591 Mhz)
> MEMORY: 16 GB
> PANIC: "Oops: 0000 [#1] SMP NOPTI" (check log for details)
> PID: 22664
> COMMAND: "hackbench"
> TASK: ffff888100acbf00 [THREAD_INFO: ffff888100acbf00]
> CPU: 5
> STATE: TASK_WAKING (PANIC)
>
> crash> bt -sx
> PID: 22664 TASK: ffff888100acbf00 CPU: 5 COMMAND: "hackbench"
> #0 [ffff88817cc17920] machine_kexec+0x156 at ffffffff810642d6
> #1 [ffff88817cc17970] __crash_kexec+0xd7 at ffffffff81153147
> #2 [ffff88817cc17a28] crash_kexec+0x23 at ffffffff811535f3
> #3 [ffff88817cc17a38] oops_end+0xbe at ffffffff810329be
> #4 [ffff88817cc17a58] page_fault_oops+0x81 at ffffffff81071951
> #5 [ffff88817cc17ab8] exc_page_fault+0x62 at ffffffff8194f6f2
> #6 [ffff88817cc17ae0] asm_exc_page_fault+0x22 at ffffffff81a00ba2
> [exception RIP: pick_task_fair+71]
> RIP: ffffffff810d5b57 RSP: ffff88817cc17b90 RFLAGS: 00010046
> RAX: 0000000000000000 RBX: ffff88840ed70ec0 RCX: 00000001d7ec138c
> RDX: ffffffffe7a7f400 RSI: 0000000000000000 RDI: 0000000000000000
> RBP: ffff88840ed70ec0 R8: 0000000000000c00 R9: 000000675402f79e
> R10: ffff88817cc17b30 R11: 00000000000000bb R12: ffff88840ed70f40
> R13: ffffffff81f64f16 R14: ffff888100acc560 R15: ffff888100acbf00
> ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0018
> #7 [ffff88817cc17bb0] pick_next_task_fair+0x42 at ffffffff810d92c2
> #8 [ffff88817cc17be0] __schedule+0x10d at ffffffff8195936d
> #9 [ffff88817cc17c50] schedule+0x1c at ffffffff81959ddc
> #10 [ffff88817cc17c60] schedule_timeout+0x18c at ffffffff8195fc4c
> #11 [ffff88817cc17cc8] unix_stream_read_generic+0x2b7 at ffffffff81869917
> #12 [ffff88817cc17da8] unix_stream_recvmsg+0x68 at ffffffff8186a2d8
> #13 [ffff88817cc17de0] sock_read_iter+0x159 at ffffffff8170bd69
> #14 [ffff88817cc17e70] vfs_read+0x2ce at ffffffff812f195e
> #15 [ffff88817cc17ef8] ksys_read+0x40 at ffffffff812f21d0
> #16 [ffff88817cc17f30] do_syscall_64+0x57 at ffffffff8194b947
> #17 [ffff88817cc17f50] entry_SYSCALL_64_after_hwframe+0x76 at ffffffff81a0012b
> RIP: 00007f625660871e RSP: 00007ffc75d48188 RFLAGS: 00000246
> RAX: ffffffffffffffda RBX: 00007ffc75d48200 RCX: 00007f625660871e
> RDX: 0000000000000064 RSI: 00007ffc75d48190 RDI: 0000000000000007
> RBP: 00007ffc75d48260 R8: 00007ffc75d48140 R9: 00007f6256612010
> R10: 00007f62565f5070 R11: 0000000000000246 R12: 0000000000000064
> R13: 0000000000000000 R14: 0000000000000064 R15: 0000000000000000
> ORIG_RAX: 0000000000000000 CS: 0033 SS: 002b
> crash> dis pick_task_fair+71
> 0xffffffff810d5b57 <pick_task_fair+71>: cmpb $0x0,0x4c(%rax)
> crash> gdb list *pick_task_fair+71
> 0xffffffff810d5b57 is in pick_task_fair (kernel/sched/fair.c:5498).
> 5493 SCHED_WARN_ON(cfs_rq->next->sched_delayed);
> 5494 return cfs_rq->next;
> 5495 }
> 5496
> 5497 struct sched_entity *se = pick_eevdf(cfs_rq);
> 5498 if (se->sched_delayed) {

Wondering if you are running into the issue where pick_eevdf() returns
NULL despite there being a runnable CFS task on the runqueue. Can you try
running with the following patch from Chenyu -
https://lore.kernel.org/lkml/[email protected]/

> 5499 dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
> 5500 SCHED_WARN_ON(se->sched_delayed);
> 5501 SCHED_WARN_ON(se->on_rq);
> 5502 if (sched_feat(DELAY_ZERO) && se->vlag > 0)
> crash> struct -ox sched_entity
> struct sched_entity {
> [0x0] struct load_weight load;
> [0x10] struct rb_node run_node;
> [0x28] u64 deadline;
> [0x30] u64 min_vruntime;
> [0x38] struct list_head group_node;
> [0x48] unsigned int on_rq;
> [0x4c] unsigned char sched_delayed;
> [0x4d] unsigned char custom_slice;
> [0x50] u64 exec_start;
> [0x58] u64 sum_exec_runtime;
> [0x60] u64 prev_sum_exec_runtime;
> [0x68] u64 vruntime;
> [0x70] s64 vlag;
> [0x78] u64 slice;
> [0x80] u64 nr_migrations;
> [0xc0] struct sched_avg avg;
> }
> SIZE: 0x100
> crash>
>

--
Thanks and Regards,
Prateek

2024-04-16 05:37:31

by Mike Galbraith

[permalink] [raw]
Subject: Re: [RFC][PATCH 08/10] sched/fair: Implement delayed dequeue

On Tue, 2024-04-16 at 08:48 +0530, K Prateek Nayak wrote:
>
> Wondering if you are running into the issue where pick_eevdf() returns
> NULL despite there being a runnable CFS task on the runqueue. Can you try
> running with the following patch from Chenyu -
> https://lore.kernel.org/lkml/[email protected]/

I hadn't tried it in the master branch, but had in my 6.1-eevdf tree, which
contains the previous delay-dequeue version, because I found that it
too would explode, which surprised me after having run it quite a bit
woe-free. The patch helped 6.1-eevdf not at all. I just tried it in master
to be sure, and while the death throes were not identical (this one began
with a SCHED_WARN_ON() in put_prev_entity()), the box still went belly up.

BTW/FWIW, turning on AUTOGROUP seemingly rendered the box immune (unless
booted with noautogroup, of course), and that immunity even held when I
launched all players in the same cgroup (hmm).

-Mike

2024-04-18 16:26:06

by Mike Galbraith

[permalink] [raw]
Subject: Re: [RFC][PATCH 08/10] sched/fair: Implement delayed dequeue

Greetings,

I tossed a couple rocks at it today, and seem to have hit the little
bugger. The root cause seems to be doing the delay dequeue business on
exiting tasks. Hunk #1 of hacklet below seems to quell the explosions.

crash> bt -sx
PID: 21722 TASK: ffff88815710ee40 CPU: 6 COMMAND: "hackbench"
#0 [ffff88822773bb90] machine_kexec+0x156 at ffffffff810642d6
#1 [ffff88822773bbe0] __crash_kexec+0xd7 at ffffffff81152a07
#2 [ffff88822773bc98] crash_kexec+0x23 at ffffffff81152eb3
#3 [ffff88822773bca8] oops_end+0xbe at ffffffff810329be
#4 [ffff88822773bcc8] page_fault_oops+0x81 at ffffffff81071951
#5 [ffff88822773bd28] exc_page_fault+0x62 at ffffffff8194e9e2
#6 [ffff88822773bd50] asm_exc_page_fault+0x22 at ffffffff81a00ba2
[exception RIP: pick_next_task_fair+178]
RIP: ffffffff810d8d12 RSP: ffff88822773be00 RFLAGS: 00010006
RAX: ffff88813cb780b8 RBX: ffff88840edb0e80 RCX: 0000000000000000
RDX: 0000000000000000 RSI: ffff88813cb78080 RDI: ffff88840ec30f00
RBP: ffff88813cb78000 R8: ffff88815710eec0 R9: 0000000000000001
R10: ffff88822773bdc8 R11: 0000000000000013 R12: 0000000000030e80
R13: ffff88815710ee40 R14: ffff88813cb78080 R15: ffff88815710ee40
ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0018
#7 [ffff88822773be28] __schedule+0x10d at ffffffff8195865d
#8 [ffff88822773be98] do_task_dead+0x3e at ffffffff810cc00e
#9 [ffff88822773beb0] do_exit+0x770 at ffffffff8108f0e0
#10 [ffff88822773bf00] do_group_exit+0x2c at ffffffff8108f64c
#11 [ffff88822773bf28] __x64_sys_exit_group+0x14 at ffffffff8108f6f4
#12 [ffff88822773bf30] do_syscall_64+0x57 at ffffffff8194ac37
#13 [ffff88822773bf50] entry_SYSCALL_64_after_hwframe+0x76 at ffffffff81a0012b
RIP: 00007f4f2aa76136 RSP: 00007ffcbba84748 RFLAGS: 00000246
RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f4f2aa76136
RDX: 0000000000000000 RSI: 000000000000003c RDI: 0000000000000000
RBP: 00007f4f2ab86970 R8: 00000000000000e7 R9: ffffffffffffff80
R10: 0000000000000004 R11: 0000000000000246 R12: 00007f4f2ab86970
R13: 0000000000000001 R14: 00007f4f2ab8a328 R15: 0000000000000000
ORIG_RAX: 00000000000000e7 CS: 0033 SS: 002b
crash> task_struct ffff88815710ee40 | grep sched_delayed
sched_delayed = 1 '\001',
crash>

---
kernel/sched/fair.c | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)

--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5374,6 +5374,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, st
update_curr(cfs_rq);

if (sched_feat(DELAY_DEQUEUE) && sleep &&
+ !(entity_is_task(se) && (task_of(se)->flags & PF_EXITING)) &&
!entity_eligible(cfs_rq, se)) {
if (cfs_rq->next == se)
cfs_rq->next = NULL;
@@ -5495,14 +5496,14 @@ pick_next_entity(struct rq *rq, struct c
}

struct sched_entity *se = pick_eevdf(cfs_rq);
- if (se->sched_delayed) {
+ while (se && se->sched_delayed) {
dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
SCHED_WARN_ON(se->sched_delayed);
SCHED_WARN_ON(se->on_rq);
if (sched_feat(DELAY_ZERO) && se->vlag > 0)
se->vlag = 0;

- return NULL;
+ se = pick_eevdf(cfs_rq);
}
return se;
}


2024-04-18 17:09:13

by K Prateek Nayak

[permalink] [raw]
Subject: Re: [RFC][PATCH 08/10] sched/fair: Implement delayed dequeue

Hello Mike,

On 4/18/2024 9:54 PM, Mike Galbraith wrote:
> Greetings,
>
> I tossed a couple rocks at it today, and seem to have hit the little
> bugger. The root cause seems to be doing the delay dequeue business on
> exiting tasks. Hunk #1 of hacklet below seems to quell the explosions.
>
> crash> bt -sx
> PID: 21722 TASK: ffff88815710ee40 CPU: 6 COMMAND: "hackbench"
> #0 [ffff88822773bb90] machine_kexec+0x156 at ffffffff810642d6
> #1 [ffff88822773bbe0] __crash_kexec+0xd7 at ffffffff81152a07
> #2 [ffff88822773bc98] crash_kexec+0x23 at ffffffff81152eb3
> #3 [ffff88822773bca8] oops_end+0xbe at ffffffff810329be
> #4 [ffff88822773bcc8] page_fault_oops+0x81 at ffffffff81071951
> #5 [ffff88822773bd28] exc_page_fault+0x62 at ffffffff8194e9e2
> #6 [ffff88822773bd50] asm_exc_page_fault+0x22 at ffffffff81a00ba2
> [exception RIP: pick_next_task_fair+178]
> RIP: ffffffff810d8d12 RSP: ffff88822773be00 RFLAGS: 00010006
> RAX: ffff88813cb780b8 RBX: ffff88840edb0e80 RCX: 0000000000000000
> RDX: 0000000000000000 RSI: ffff88813cb78080 RDI: ffff88840ec30f00
> RBP: ffff88813cb78000 R8: ffff88815710eec0 R9: 0000000000000001
> R10: ffff88822773bdc8 R11: 0000000000000013 R12: 0000000000030e80
> R13: ffff88815710ee40 R14: ffff88813cb78080 R15: ffff88815710ee40
> ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0018
> #7 [ffff88822773be28] __schedule+0x10d at ffffffff8195865d
> #8 [ffff88822773be98] do_task_dead+0x3e at ffffffff810cc00e
> #9 [ffff88822773beb0] do_exit+0x770 at ffffffff8108f0e0
> #10 [ffff88822773bf00] do_group_exit+0x2c at ffffffff8108f64c
> #11 [ffff88822773bf28] __x64_sys_exit_group+0x14 at ffffffff8108f6f4
> #12 [ffff88822773bf30] do_syscall_64+0x57 at ffffffff8194ac37
> #13 [ffff88822773bf50] entry_SYSCALL_64_after_hwframe+0x76 at ffffffff81a0012b
> RIP: 00007f4f2aa76136 RSP: 00007ffcbba84748 RFLAGS: 00000246
> RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f4f2aa76136
> RDX: 0000000000000000 RSI: 000000000000003c RDI: 0000000000000000
> RBP: 00007f4f2ab86970 R8: 00000000000000e7 R9: ffffffffffffff80
> R10: 0000000000000004 R11: 0000000000000246 R12: 00007f4f2ab86970
> R13: 0000000000000001 R14: 00007f4f2ab8a328 R15: 0000000000000000
> ORIG_RAX: 00000000000000e7 CS: 0033 SS: 002b
> crash> task_struct ffff88815710ee40 | grep sched_delayed
> sched_delayed = 1 '\001',
> crash>
>

My machine too survived the spawn test with the below patch. Thank you
for looking into this. I'll resume testing with it included.

--
Thanks and Regards,
Prateek

> ---
> kernel/sched/fair.c | 5 +++--
> 1 file changed, 3 insertions(+), 2 deletions(-)
>
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -5374,6 +5374,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, st
> update_curr(cfs_rq);
>
> if (sched_feat(DELAY_DEQUEUE) && sleep &&
> + !(entity_is_task(se) && (task_of(se)->flags & PF_EXITING)) &&
> !entity_eligible(cfs_rq, se)) {
> if (cfs_rq->next == se)
> cfs_rq->next = NULL;
> @@ -5495,14 +5496,14 @@ pick_next_entity(struct rq *rq, struct c
> }
>
> struct sched_entity *se = pick_eevdf(cfs_rq);
> - if (se->sched_delayed) {
> + while (se && se->sched_delayed) {
> dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
> SCHED_WARN_ON(se->sched_delayed);
> SCHED_WARN_ON(se->on_rq);
> if (sched_feat(DELAY_ZERO) && se->vlag > 0)
> se->vlag = 0;
>
> - return NULL;
> + se = pick_eevdf(cfs_rq);
> }
> return se;
> }
>


2024-04-20 05:58:15

by Mike Galbraith

[permalink] [raw]
Subject: Re: [RFC][PATCH 08/10] sched/fair: Implement delayed dequeue

(removes apparently busted bytedance.com address and retries xmit)

Greetings!

With this version, the better CPU distribution (for tbench synchronous
net-blasting) closed the CFS vs EEVDF throughput deficit. I verified
this both by rolling the previous version forward and by back-porting to
6.1, where I've got CFS and EEVDF to re-compare, now with both dequeue
delay patch versions.

As usual, there will be winners and losers, but (modulo dead buglet) it
looks kinda promising to me.

Distribution of single pinned buddy pair measured in master:
DELAY_DEQUEUE
----------------------------------------------------------------------------------------------------------
Task | Runtime ms | Switches | Avg delay ms | Max delay ms | Sum delay ms |
----------------------------------------------------------------------------------------------------------
tbench:(2) | 6277.099 ms | 1597104 | avg: 0.003 ms | max: 0.129 ms | sum: 4334.723 ms |
tbench_srv:(2) | 5724.971 ms | 1682629 | avg: 0.001 ms | max: 0.083 ms | sum: 2076.616 ms |
----------------------------------------------------------------------------------------------------------
TOTAL: | 12021.128 ms | 3280275 | | 1.729 ms | 6425.483 ms |
----------------------------------------------------------------------------------------------------------
client/server CPU distribution ~52%/48%

NO_DELAY_DEQUEUE
----------------------------------------------------------------------------------------------------------
Task | Runtime ms | Switches | Avg delay ms | Max delay ms | Sum delay ms |
----------------------------------------------------------------------------------------------------------
tbench:(2) | 6724.774 ms | 1546761 | avg: 0.002 ms | max: 0.409 ms | sum: 2443.549 ms |
tbench_srv:(2) | 5275.329 ms | 1571688 | avg: 0.002 ms | max: 0.086 ms | sum: 2734.151 ms |
----------------------------------------------------------------------------------------------------------
TOTAL: | 12019.641 ms | 3119000 | | 9996.367 ms | 15187.833 ms |
----------------------------------------------------------------------------------------------------------
client/server CPU distribution ~56%/44%

Note switches and delay sum. For tbench, they translate directly to
throughput. The other shoe lands with async CPU hog net-blasters; for
those, scheduler cycles tend to be wasted cycles.

-Mike

2024-04-22 13:14:18

by Tobias Huschle

[permalink] [raw]
Subject: Re: [RFC][PATCH 08/10] sched/fair: Implement delayed dequeue

On Fri, Apr 05, 2024 at 12:28:02PM +0200, Peter Zijlstra wrote:
> Extend / fix 86bfbb7ce4f6 ("sched/fair: Add lag based placement") by
> noting that lag is fundamentally a temporal measure. It should not be
> carried around indefinitely.
>
> OTOH it should also not be instantly discarded, doing so will allow a
> task to game the system by purposefully (micro) sleeping at the end of
> its time quantum.
>
> Since lag is intimately tied to the virtual time base, a wall-time
> based decay is also insufficient, notably competition is required for
> any of this to make sense.
>
> Instead, delay the dequeue and keep the 'tasks' on the runqueue,
> competing until they are eligible.
>
> Strictly speaking, we only care about keeping them until the 0-lag
> point, but that is a difficult proposition, instead carry them around
> until they get picked again, and dequeue them at that point.
>
> Since we should have dequeued them at the 0-lag point, truncate lag
> (eg. don't let them earn positive lag).
>

I stumbled upon a subtle change between CFS and EEVDF which kinda relates
to the question of how long lag should be carried around, but on the other
side of the 0-lag value.

Assume that we have two tasks taking turns on a single CPU.
Task 1 does something and wakes up Task 2.
Task 2 does something and goes to sleep.
And we're just repeating that.
Task 1 and task 2 only run for very short amounts of time, i.e. much
shorter than a regular time slice.

Let's further assume that task 1 runs longer than task 2.
In CFS, this means that the vruntime of task 1 starts to outrun the vruntime
of task 2, so that vruntime(task2) < vruntime(task1). Hence, task 2
always gets picked on wake-up because it has the smaller vruntime.
In EEVDF, this would translate to a permanent positive lag, which also
causes task 2 to get consistently scheduled on wake-up.

Let's now assume that, occasionally, task 2 runs a little bit longer than
task 1. In CFS, this means that task 2 can close the vruntime gap by a
bit, but it can easily remain below the value of task 1. Task 2 would
still get picked on wake-up.
With EEVDF, in its current form, task 2 will now get a negative lag, which,
in turn, will cause it not to be picked on the next wake-up.

So, it seems we have a change in how far the two variants look
into the past: CFS is willing to take more history into account, whereas
EEVDF is not (with update_entity_lag setting the lag value from scratch,
and place_entity not taking the original vruntime into account).

All of this can be seen as correct by design: a task consumes more time
than the others, so it has to give way to them. The big difference
now is that CFS allowed a task to collect some bonus by constantly using
less CPU time than others and trading that time against occasionally taking
more CPU time. EEVDF could do the same thing, by allowing the accumulation
of (positive) lag, which can then be traded against the one time the task
would get negative lag.

This might of course clash with other EEVDF assumptions but blends with the
question on how long lag should be preserved.
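
To make the divergence concrete, here is a small standalone sketch
(illustrative numbers only, and the clamping of lag to the +/- limit is
left out) comparing the current rule, which recomputes lag from scratch
at dequeue, with the accumulating rule from the patch further down:

#include <stdio.h>

int main(void)
{
	/* avg_vruntime and task 2's vruntime at three successive dequeues */
	long avg[] = { 100, 200, 300 };
	long vrt[] = {  90, 195, 308 };	/* task 2 ran long the third time */
	long cur, acc = 0;

	for (int i = 0; i < 3; i++) {
		cur = avg[i] - vrt[i];		/* current rule      */
		acc = acc + avg[i] - vrt[i];	/* accumulating rule */
		printf("dequeue %d: current vlag=%ld, accumulated vlag=%ld\n",
		       i + 1, cur, acc);
	}

	/*
	 * current rule:      +10, +5, -8  -> task 2 loses its wake-up
	 *                                    preference after one long run
	 * accumulating rule: +10, +15, +7 -> the earlier frugality still
	 *                                    pays off
	 */
	return 0;
}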

I stumbled upon a performance degradation caused by this nuance. EEVDF is
not necessarily at fault here. The problem rather lies in a component
that relies on CFS allowing the above-mentioned bonus aggregation while
knowing that the corresponding task 2 has to run.
See: https://lore.kernel.org/netdev/ZWbapeL34Z8AMR5f@DESKTOP-2CCOB1S./T/
With the reasoning above, the degradation can be explained perfectly, and
even be fixed by allowing the accumulation of lag by applying the patch
below.

I was just wondering if my assumptions here are correct and whether this
would be an implicit change that one should be aware of.


diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 03be0d1330a6..b83a72311d2a 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -701,7 +701,7 @@ static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se)
s64 lag, limit;

SCHED_WARN_ON(!se->on_rq);
- lag = avg_vruntime(cfs_rq) - se->vruntime;
+ lag = se->vlag + avg_vruntime(cfs_rq) - se->vruntime;

limit = calc_delta_fair(max_t(u64, 2*se->slice, TICK_NSEC), se);
se->vlag = clamp(lag, -limit, limit);

2024-04-24 15:16:22

by Luis Machado

[permalink] [raw]
Subject: Re: [RFC][PATCH 08/10] sched/fair: Implement delayed dequeue

Hi,

On 4/15/24 18:07, Luis Machado wrote:
> Hi Peter,
>
> On 4/5/24 11:28, Peter Zijlstra wrote:
>> Extend / fix 86bfbb7ce4f6 ("sched/fair: Add lag based placement") by
>> noting that lag is fundamentally a temporal measure. It should not be
>> carried around indefinitely.
>>
>> OTOH it should also not be instantly discarded, doing so will allow a
>> task to game the system by purposefully (micro) sleeping at the end of
>> its time quantum.
>>
>> Since lag is intimately tied to the virtual time base, a wall-time
>> based decay is also insufficient, notably competition is required for
>> any of this to make sense.
>>
>> Instead, delay the dequeue and keep the 'tasks' on the runqueue,
>> competing until they are eligible.
>>
>> Strictly speaking, we only care about keeping them until the 0-lag
>> point, but that is a difficult proposition, instead carry them around
>> until they get picked again, and dequeue them at that point.
>>
>> Since we should have dequeued them at the 0-lag point, truncate lag
>> (eg. don't let them earn positive lag).
>>
>> XXX test the cfs-throttle stuff
>>
>> Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
>
> Playing around with a Pixel 6 running a 6.6-based kernel with this
> series backported on top, I spotted a noticeable performance improvement
> on the speedometer benchmark:
>
> - m6.6-stock-* is the 6.6 mainline Android kernel unmodified.
>
> - m6.6-eevdf-complete-* is the 6.6 mainline Android kernel with
> this series applied on top (along with a few required backported
> patches).
>
> +-------------------+-----------------------+-----------+
> | metric | tag | perc_diff |
> +-------------------+-----------------------+-----------+
> | Speedometer Score | m6.6-stock-1 | 0.0% |
> | Speedometer Score | m6.6-stock-2 | 1.23% |
> | Speedometer Score | m6.6-stock-3 | -0.22% |
> | Speedometer Score | m6.6-eevdf-complete-1 | 4.54% |
> | Speedometer Score | m6.6-eevdf-complete-2 | 4.95% |
> | Speedometer Score | m6.6-eevdf-complete-3 | 6.07% |
> +-------------------+-----------------------+-----------+
>
> Also some interesting improvements in terms of frame timing for the
> uibenchjanktests benchmark. In particular the metrics of missed
> deadlines and jank (late) frames, which seems to indicate better
> latencies.
>
> +-----------------------+-----------------------+-----------+
> | metric | tag | perc_diff |
> +-----------------------+-----------------------+-----------+
> | gfx-avg-frame-time-50 | m6.6-stock-1 | 0.0 |
> | gfx-avg-frame-time-90 | m6.6-stock-1 | 0.0 |
> | gfx-avg-frame-time-95 | m6.6-stock-1 | 0.0 |
> | gfx-avg-frame-time-99 | m6.6-stock-1 | 0.0 |
> | gfx-avg-frame-time-50 | m6.6-stock-2 | 3.46 |
> | gfx-avg-frame-time-90 | m6.6-stock-2 | 1.19 |
> | gfx-avg-frame-time-95 | m6.6-stock-2 | 0.24 |
> | gfx-avg-frame-time-99 | m6.6-stock-2 | 0.48 |
> | gfx-avg-frame-time-50 | m6.6-eevdf-complete-1 | -30.45 |
> | gfx-avg-frame-time-90 | m6.6-eevdf-complete-1 | -48.44 |
> | gfx-avg-frame-time-95 | m6.6-eevdf-complete-1 | -51.32 |
> | gfx-avg-frame-time-99 | m6.6-eevdf-complete-1 | -52.48 |
> | gfx-avg-frame-time-50 | m6.6-eevdf-complete-2 | -30.32 |
> | gfx-avg-frame-time-90 | m6.6-eevdf-complete-2 | -48.16 |
> | gfx-avg-frame-time-95 | m6.6-eevdf-complete-2 | -51.08 |
> | gfx-avg-frame-time-99 | m6.6-eevdf-complete-2 | -51.7 |
> +-----------------------+-----------------------+-----------+
>
> +-----------------------------------+-----------------------+-----------+
> | metric | tag | perc_diff |
> +-----------------------------------+-----------------------+-----------+
> | gfx-avg-num-frame-deadline-missed | m6.6-stock-1 | 0.0 |
> | gfx-max-num-frame-deadline-missed | m6.6-stock-1 | 0.0 |
> | gfx-avg-num-frame-deadline-missed | m6.6-stock-2 | -3.21 |
> | gfx-max-num-frame-deadline-missed | m6.6-stock-2 | -3.21 |
> | gfx-avg-num-frame-deadline-missed | m6.6-eevdf-complete-1 | -85.29 |
> | gfx-max-num-frame-deadline-missed | m6.6-eevdf-complete-1 | -85.29 |
> | gfx-avg-num-frame-deadline-missed | m6.6-eevdf-complete-2 | -84.8 |
> | gfx-max-num-frame-deadline-missed | m6.6-eevdf-complete-2 | -84.8 |
> +-----------------------------------+-----------------------+-----------+
>
> +----------------------------+-----------------------+-----------+
> | metric | tag | perc_diff |
> +----------------------------+-----------------------+-----------+
> | gfx-avg-high-input-latency | m6.6-stock-1 | 0.0 |
> | gfx-max-high-input-latency | m6.6-stock-1 | 0.0 |
> | gfx-avg-high-input-latency | m6.6-stock-2 | 0.93 |
> | gfx-max-high-input-latency | m6.6-stock-2 | 0.93 |
> | gfx-avg-high-input-latency | m6.6-eevdf-complete-1 | -18.35 |
> | gfx-max-high-input-latency | m6.6-eevdf-complete-1 | -18.35 |
> | gfx-avg-high-input-latency | m6.6-eevdf-complete-2 | -18.05 |
> | gfx-max-high-input-latency | m6.6-eevdf-complete-2 | -18.05 |
> +----------------------------+-----------------------+-----------+
>
> +--------------+-----------------------+-----------+
> | metric | tag | perc_diff |
> +--------------+-----------------------+-----------+
> | gfx-avg-jank | m6.6-stock-1 | 0.0 |
> | gfx-max-jank | m6.6-stock-1 | 0.0 |
> | gfx-avg-jank | m6.6-stock-2 | 1.56 |
> | gfx-max-jank | m6.6-stock-2 | 1.56 |
> | gfx-avg-jank | m6.6-eevdf-complete-1 | -82.81 |
> | gfx-max-jank | m6.6-eevdf-complete-1 | -82.81 |
> | gfx-avg-jank | m6.6-eevdf-complete-2 | -78.12 |
> | gfx-max-jank | m6.6-eevdf-complete-2 | -78.12 |
> +--------------+-----------------------+-----------+
>
> Bisecting through the patches in this series, I ended up with patch 08/10
> as the one that improved things overall for these benchmarks.
>
> I'd like to investigate this further to understand the reason behind some of
> these dramatic improvements.
>

Investigating these improvements a bit more, I noticed they came with a significantly
higher power usage on the Pixel6 (where EAS is enabled). I bisected it down to the delayed
dequeue patch. Disabling DELAY_DEQUEUE and DELAY_ZERO at runtime doesn't help in bringing
the power usage down.

Though I don't fully understand the reason behind this change in behavior yet, I did spot
the benchmark processes running almost entirely on the big core cluster, with little
to no use of the little core and mid core clusters.

That would explain higher power usage and also the significant jump in performance.

I wonder if the delayed dequeue logic is having an unwanted effect on the calculation of
utilization/load of the runqueue and, as a consequence, we're scheduling things to run on
higher OPP's in the big cores, leading to poor decisions for energy efficiency.



2024-04-24 18:36:06

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [RFC][PATCH 08/10] sched/fair: Implement delayed dequeue

On Thu, Apr 18, 2024 at 06:24:59PM +0200, Mike Galbraith wrote:
> Greetings,
>
> I tossed a couple rocks at it today, and seem to have hit the little
> bugger. The root cause seems to be doing the delay dequeue business on
> exiting tasks. Hunk #1 of hacklet below seems to quell the explosions.
>

Neat, Thanks! I was just about ready to go look at this again.

>
> ---
> kernel/sched/fair.c | 5 +++--
> 1 file changed, 3 insertions(+), 2 deletions(-)
>
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -5374,6 +5374,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, st
> update_curr(cfs_rq);
>
> if (sched_feat(DELAY_DEQUEUE) && sleep &&
> + !(entity_is_task(se) && (task_of(se)->flags & PF_EXITING)) &&
> !entity_eligible(cfs_rq, se)) {
> if (cfs_rq->next == se)
> cfs_rq->next = NULL;
> @@ -5495,14 +5496,14 @@ pick_next_entity(struct rq *rq, struct c
> }
>
> struct sched_entity *se = pick_eevdf(cfs_rq);
> - if (se->sched_delayed) {
> + while (se && se->sched_delayed) {
> dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
> SCHED_WARN_ON(se->sched_delayed);
> SCHED_WARN_ON(se->on_rq);
> if (sched_feat(DELAY_ZERO) && se->vlag > 0)
> se->vlag = 0;
>
> - return NULL;
> + se = pick_eevdf(cfs_rq);
> }
> return se;
> }
>

2024-04-25 10:25:39

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [RFC][PATCH 08/10] sched/fair: Implement delayed dequeue

On Thu, Apr 11, 2024 at 09:32:23AM +0800, Yan-Jie Wang wrote:
> I have an alternative approach to delayed-dequeue inspired by the original
> CFS implementation.
>
> The idea is to keep the task's vruntime when it goes to sleep.
> When the task is woken up, see if the lag is positive at wakeup time; if
> that is the case, clamp it to 0 by setting vruntime to avg_vruntime().

Problem is that avg_vruntime() can go backwards, eg by dequeueing a
negative lag task. This gets really hard to argue about real quick.
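
A toy example with made-up numbers: two queued entities of equal weight
at vruntime 100 and 140 give avg_vruntime() = 120; the one at 140 has
negative lag, and dequeueing it drops the average back to 100, so the
0-lag point the woken task would be measured against has just moved
backwards.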

Keeping the task competing (delaying the dequeue) is by far the simplest
solution -- conceptually.

The code just became a total mess because of cgroups :/

2024-04-25 10:42:41

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [RFC][PATCH 08/10] sched/fair: Implement delayed dequeue

On Wed, Apr 24, 2024 at 04:15:42PM +0100, Luis Machado wrote:

> > Bisecting through the patches in this series, I ended up with patch 08/10
> > as the one that improved things overall for these benchmarks.
> >
> > I'd like to investigate this further to understand the reason behind some of
> > these dramatic improvements.
> >
>
> Investigating these improvements a bit more, I noticed they came with a significantly
> higher power usage on the Pixel6 (where EAS is enabled). I bisected it down to the delayed
> dequeue patch. Disabling DELAY_DEQUEUE and DELAY_ZERO at runtime doesn't help in bringing
> the power usage down.

Hmm, that is unexpected. The intent was for NO_DELAY_DEQUEUE to fully
disable things. I'll go have a prod at it.

> Though I don't fully understand the reason behind this change in behavior yet, I did spot
> the benchmark processes running almost entirely on the big core cluster, with little
> to no use of the little core and mid core clusters.
>
> That would explain higher power usage and also the significant jump in performance.

ISTR you (arm) have these tools to trace and plot the various util
values. This should be readily reflected there if that is the case, no?

> I wonder if the delayed dequeue logic is having an unwanted effect on the calculation of
> utilization/load of the runqueue and, as a consequence, we're scheduling things to run on
> higher OPP's in the big cores, leading to poor decisions for energy efficiency.

Notably util_est_update() gets delayed. Given we don't actually do an
enqueue when a delayed task gets woken, it didn't seem to make sense to
update that sooner.

I'll go over all that again.

2024-04-25 11:29:20

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [RFC][PATCH 08/10] sched/fair: Implement delayed dequeue

On Thu, Apr 18, 2024 at 06:24:59PM +0200, Mike Galbraith wrote:
> The root cause seems to be doing the delay dequeue business on
> exiting tasks.

> ---
> kernel/sched/fair.c | 5 +++--
> 1 file changed, 3 insertions(+), 2 deletions(-)
>
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -5374,6 +5374,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, st
> update_curr(cfs_rq);
>
> if (sched_feat(DELAY_DEQUEUE) && sleep &&
> + !(entity_is_task(se) && (task_of(se)->flags & PF_EXITING)) &&
> !entity_eligible(cfs_rq, se)) {
> if (cfs_rq->next == se)
> cfs_rq->next = NULL;

So I think this can be done more easily in dequeue_task_fair(), where we
still know this is a task.

Perhaps something like (I'll test later):

if (p->flags & PF_EXITING)
flags &= ~DEQUEUE_SLEEP;

But now I need to go think about the case of removing a cgroup...
*urgh*.

2024-04-25 14:44:24

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [RFC][PATCH 08/10] sched/fair: Implement delayed dequeue

On Thu, Apr 25, 2024 at 12:42:20PM +0200, Peter Zijlstra wrote:

> > I wonder if the delayed dequeue logic is having an unwanted effect on the calculation of
> > utilization/load of the runqueue and, as a consequence, we're scheduling things to run on
> > higher OPP's in the big cores, leading to poor decisions for energy efficiency.
>
> Notably util_est_update() gets delayed. Given we don't actually do an
> enqueue when a delayed task gets woken, it didn't seem to make sense to
> update that sooner.

The PELT runnable values will be inflated because of delayed dequeue.
cpu_util() uses those in the @boost case, and as such this can indeed
affect things.
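
Rough toy model of that effect (made-up numbers; boosted_util() below is
just a stand-in for this sketch and reduces the boost to a bare max(),
which is only approximately what the real cpu_util() path does):

#include <stdio.h>

static unsigned long boosted_util(unsigned long util_avg,
				  unsigned long runnable_avg)
{
	/* runnable boosting, reduced to its essence */
	return runnable_avg > util_avg ? runnable_avg : util_avg;
}

int main(void)
{
	unsigned long util_avg = 300;		/* time actually spent running    */
	unsigned long runnable_sane = 350;	/* runnable_avg w/o delayed tasks */
	unsigned long runnable_delayed = 520;	/* inflated by delayed entities   */

	printf("boosted estimate, no delayed tasks: %lu\n",
	       boosted_util(util_avg, runnable_sane));
	printf("boosted estimate, delayed tasks   : %lu\n",
	       boosted_util(util_avg, runnable_delayed));
	return 0;
}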

This can also slightly affect the cgroup case, but since the delay goes
away as contention goes away, and the cgroup case must already assume
worst case overlap, this seems limited.

/me goes ponder things moar.

2024-04-26 09:37:33

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [RFC][PATCH 08/10] sched/fair: Implement delayed dequeue

On Fri, Apr 26, 2024 at 11:32:41AM +0200, Peter Zijlstra wrote:
> On Thu, Apr 25, 2024 at 01:49:49PM +0200, Peter Zijlstra wrote:
> > On Thu, Apr 25, 2024 at 12:42:20PM +0200, Peter Zijlstra wrote:
> >
> > > > I wonder if the delayed dequeue logic is having an unwanted effect on the calculation of
> > > > utilization/load of the runqueue and, as a consequence, we're scheduling things to run on
> > > > higher OPP's in the big cores, leading to poor decisions for energy efficiency.
> > >
> > > Notably util_est_update() gets delayed. Given we don't actually do an
> > > enqueue when a delayed task gets woken, it didn't seem to make sense to
> > > update that sooner.
> >
> > The PELT runnable values will be inflated because of delayed dequeue.
> > cpu_util() uses those in the @boost case, and as such this can indeed
> > affect things.
> >
> > This can also slightly affect the cgroup case, but since the delay goes
> > away as contention goes away, and the cgroup case must already assume
> > worst case overlap, this seems limited.
> >
> > /me goes ponder things moar.
>
> First order approximation of a fix would be something like the totally
> untested below I suppose...
>
> ---
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index cfd1fd188d29..f3f70b5adca0 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -5391,6 +5391,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
> if (cfs_rq->next == se)
> cfs_rq->next = NULL;
> se->sched_delayed = 1;
> + update_load_avg(cfs_rq, se, 0);
> return false;
> }
> }
> @@ -6817,6 +6818,7 @@ requeue_delayed_entity(struct sched_entity *se)
> }
>
> se->sched_delayed = 0;
> + update_load_avg(qcfs_rq, se, 0);

Compiler demands this ^^^^^ be cfs_rq instead ;-) brown_paper_bags--;

> }
>
> /*
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index d07a3b98f1fb..d16529613123 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -810,6 +810,9 @@ static inline void se_update_runnable(struct sched_entity *se)
>
> static inline long se_runnable(struct sched_entity *se)
> {
> + if (se->sched_delayed)
> + return false;
> +
> if (entity_is_task(se))
> return !!se->on_rq;
> else
> @@ -823,6 +826,9 @@ static inline void se_update_runnable(struct sched_entity *se) {}
>
> static inline long se_runnable(struct sched_entity *se)
> {
> + if (se->sched_delayed)
> + return false;
> +
> return !!se->on_rq;
> }
> #endif

2024-04-26 10:19:30

by Luis Machado

[permalink] [raw]
Subject: Re: [RFC][PATCH 08/10] sched/fair: Implement delayed dequeue

On 4/26/24 10:32, Peter Zijlstra wrote:
> On Thu, Apr 25, 2024 at 01:49:49PM +0200, Peter Zijlstra wrote:
>> On Thu, Apr 25, 2024 at 12:42:20PM +0200, Peter Zijlstra wrote:
>>
>>>> I wonder if the delayed dequeue logic is having an unwanted effect on the calculation of
>>>> utilization/load of the runqueue and, as a consequence, we're scheduling things to run on
>>>> higher OPP's in the big cores, leading to poor decisions for energy efficiency.
>>>
>>> Notably util_est_update() gets delayed. Given we don't actually do an
>>> enqueue when a delayed task gets woken, it didn't seem to make sense to
>>> update that sooner.
>>
>> The PELT runnable values will be inflated because of delayed dequeue.
>> cpu_util() uses those in the @boost case, and as such this can indeed
>> affect things.
>>
>> This can also slightly affect the cgroup case, but since the delay goes
>> away as contention goes away, and the cgroup case must already assume
>> worst case overlap, this seems limited.
>>
>> /me goes ponder things moar.
>
> First order approximation of a fix would be something like the totally
> untested below I suppose...


Thanks Peter. Let me give it a try and I'll report back.

>
> ---
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index cfd1fd188d29..f3f70b5adca0 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -5391,6 +5391,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
> if (cfs_rq->next == se)
> cfs_rq->next = NULL;
> se->sched_delayed = 1;
> + update_load_avg(cfs_rq, se, 0);
> return false;
> }
> }
> @@ -6817,6 +6818,7 @@ requeue_delayed_entity(struct sched_entity *se)
> }
>
> se->sched_delayed = 0;
> + update_load_avg(qcfs_rq, se, 0);
> }
>
> /*
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index d07a3b98f1fb..d16529613123 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -810,6 +810,9 @@ static inline void se_update_runnable(struct sched_entity *se)
>
> static inline long se_runnable(struct sched_entity *se)
> {
> + if (se->sched_delayed)
> + return false;
> +
> if (entity_is_task(se))
> return !!se->on_rq;
> else
> @@ -823,6 +826,9 @@ static inline void se_update_runnable(struct sched_entity *se) {}
>
> static inline long se_runnable(struct sched_entity *se)
> {
> + if (se->sched_delayed)
> + return false;
> +
> return !!se->on_rq;
> }
> #endif


2024-04-26 10:22:42

by Luis Machado

[permalink] [raw]
Subject: Re: [RFC][PATCH 08/10] sched/fair: Implement delayed dequeue

On 4/25/24 11:42, Peter Zijlstra wrote:
> On Wed, Apr 24, 2024 at 04:15:42PM +0100, Luis Machado wrote:
>
>>> Bisecting through the patches in this series, I ended up with patch 08/10
>>> as the one that improved things overall for these benchmarks.
>>>
>>> I'd like to investigate this further to understand the reason behind some of
>>> these dramatic improvements.
>>>
>>
>> Investigating these improvements a bit more, I noticed they came with a significantly
>> higher power usage on the Pixel6 (where EAS is enabled). I bisected it down to the delayed
>> dequeue patch. Disabling DELAY_DEQUEUE and DELAY_ZERO at runtime doesn't help in bringing
>> the power usage down.
>
> Hmm, that is unexpected. The intent was for NO_DELAY_DEQUEUE to fully
> disable things. I'll go have a prod at it.

I'm running a few more numbers to confirm this situation with the feature switches.

>
>> Though I don't fully understand the reason behind this change in behavior yet, I did spot
>> the benchmark processes running almost entirely on the big core cluster, with little
>> to no use of the little core and mid core clusters.
>>
>> That would explain higher power usage and also the significant jump in performance.
>
> ISTR you (arm) have these tools to trace and plot the various util
> values. This should be readily reflected there if that is the case, no?

Indeed we do, but I'm still in the process of compiling those numbers into a meaningful plot,
so I'm afraid I don't have those handy yet, sorry.

>
>> I wonder if the delayed dequeue logic is having an unwanted effect on the calculation of
>> utilization/load of the runqueue and, as a consequence, we're scheduling things to run on
>> higher OPP's in the big cores, leading to poor decisions for energy efficiency.
>
> Notably util_est_update() gets delayed. Given we don't actually do an
> enqueue when a delayed task gets woken, it didn't seem to make sense to
> update that sooner.
>
> I'll go over all that again

2024-04-26 10:37:19

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [RFC][PATCH 08/10] sched/fair: Implement delayed dequeue

On Thu, Apr 25, 2024 at 01:49:49PM +0200, Peter Zijlstra wrote:
> On Thu, Apr 25, 2024 at 12:42:20PM +0200, Peter Zijlstra wrote:
>
> > > I wonder if the delayed dequeue logic is having an unwanted effect on the calculation of
> > > utilization/load of the runqueue and, as a consequence, we're scheduling things to run on
> > > higher OPP's in the big cores, leading to poor decisions for energy efficiency.
> >
> > Notably util_est_update() gets delayed. Given we don't actually do an
> > enqueue when a delayed task gets woken, it didn't seem to make sense to
> > update that sooner.
>
> The PELT runnable values will be inflated because of delayed dequeue.
> cpu_util() uses those in the @boost case, and as such this can indeed
> affect things.
>
> This can also slightly affect the cgroup case, but since the delay goes
> away as contention goes away, and the cgroup case must already assume
> worst case overlap, this seems limited.
>
> /me goes ponder things moar.

First order approximation of a fix would be something like the totally
untested below I suppose...

---
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index cfd1fd188d29..f3f70b5adca0 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5391,6 +5391,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
if (cfs_rq->next == se)
cfs_rq->next = NULL;
se->sched_delayed = 1;
+ update_load_avg(cfs_rq, se, 0);
return false;
}
}
@@ -6817,6 +6818,7 @@ requeue_delayed_entity(struct sched_entity *se)
}

se->sched_delayed = 0;
+ update_load_avg(qcfs_rq, se, 0);
}

/*
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index d07a3b98f1fb..d16529613123 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -810,6 +810,9 @@ static inline void se_update_runnable(struct sched_entity *se)

static inline long se_runnable(struct sched_entity *se)
{
+ if (se->sched_delayed)
+ return false;
+
if (entity_is_task(se))
return !!se->on_rq;
else
@@ -823,6 +826,9 @@ static inline void se_update_runnable(struct sched_entity *se) {}

static inline long se_runnable(struct sched_entity *se)
{
+ if (se->sched_delayed)
+ return false;
+
return !!se->on_rq;
}
#endif

2024-04-26 11:16:28

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [RFC][PATCH 08/10] sched/fair: Implement delayed dequeue

On Fri, Apr 26, 2024 at 12:56:07PM +0200, Peter Zijlstra wrote:
> On Thu, Apr 25, 2024 at 01:28:55PM +0200, Peter Zijlstra wrote:
> > On Thu, Apr 18, 2024 at 06:24:59PM +0200, Mike Galbraith wrote:
> > > The root cause seems to be doing the delay dequeue business on
> > > exiting tasks.
> >
> > > ---
> > > kernel/sched/fair.c | 5 +++--
> > > 1 file changed, 3 insertions(+), 2 deletions(-)
> > >
> > > --- a/kernel/sched/fair.c
> > > +++ b/kernel/sched/fair.c
> > > @@ -5374,6 +5374,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, st
> > > update_curr(cfs_rq);
> > >
> > > if (sched_feat(DELAY_DEQUEUE) && sleep &&
> > > + !(entity_is_task(se) && (task_of(se)->flags & PF_EXITING)) &&
> > > !entity_eligible(cfs_rq, se)) {
> > > if (cfs_rq->next == se)
> > > cfs_rq->next = NULL;
> >
> > So I think this can be done more easily in dequeue_task_fair(), where we
> > still know this is a task.
> >
> > Perhaps something like (I'll test later):
> >
> > if (p->flags & PF_EXITING)
> > flags &= ~DEQUEUE_SLEEP;
> >
> > But now I need to go think about the case of removing a cgroup...
> > *urgh*.
>
> I ended up with the below instead; lemme go run this unixbench spawn on it.

Seems to survive that.

I pushed out the patches with updates to queue/sched/eevdf

2024-04-26 14:11:54

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [RFC][PATCH 08/10] sched/fair: Implement delayed dequeue

On Thu, Apr 25, 2024 at 01:28:55PM +0200, Peter Zijlstra wrote:
> On Thu, Apr 18, 2024 at 06:24:59PM +0200, Mike Galbraith wrote:
> > The root cause seems to be doing the delay dequeue business on
> > exiting tasks.
>
> > ---
> > kernel/sched/fair.c | 5 +++--
> > 1 file changed, 3 insertions(+), 2 deletions(-)
> >
> > --- a/kernel/sched/fair.c
> > +++ b/kernel/sched/fair.c
> > @@ -5374,6 +5374,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, st
> > update_curr(cfs_rq);
> >
> > if (sched_feat(DELAY_DEQUEUE) && sleep &&
> > + !(entity_is_task(se) && (task_of(se)->flags & PF_EXITING)) &&
> > !entity_eligible(cfs_rq, se)) {
> > if (cfs_rq->next == se)
> > cfs_rq->next = NULL;
>
> > So I think this can be done more easily in dequeue_task_fair(), where we
> still know this is a task.
>
> Perhaps something like (I'll test later):
>
> if (p->flags & PF_EXITING)
> flags &= ~DEQUEUE_SLEEP;
>
> But now I need to go think about the case of removing a cgroup...
> *urgh*.

I ended up with the below instead; lemme go run this unixbench spawn on it.

---
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 95666034e76c..b5918fa9a0f0 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8429,7 +8431,20 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)

static void task_dead_fair(struct task_struct *p)
{
- remove_entity_load_avg(&p->se);
+ struct sched_entity *se = &p->se;
+
+ if (p->se.sched_delayed) {
+ struct rq_flags rf;
+ struct rq *rq;
+
+ rq = task_rq_lock(p, &rf);
+ update_rq_clock(rq);
+ if (se->sched_delayed)
+ dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
+ task_rq_unlock(rq, p, &rf);
+ }
+
+ remove_entity_load_avg(se);
}

/*
@@ -13089,28 +13104,34 @@ void online_fair_sched_group(struct task_group *tg)

void unregister_fair_sched_group(struct task_group *tg)
{
- unsigned long flags;
- struct rq *rq;
int cpu;

destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));

for_each_possible_cpu(cpu) {
- if (tg->se[cpu])
- remove_entity_load_avg(tg->se[cpu]);
+ struct cfs_rq *cfs_rq = tg->cfs_rq[cpu];
+ struct sched_entity *se = tg->se[cpu];
+ struct rq *rq = cpu_rq(cpu);
+
+ if (se) {
+ if (se->sched_delayed) {
+ guard(rq_lock_irqsave)(rq);
+ update_rq_clock(rq);
+ if (se->sched_delayed)
+ dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
+ list_del_leaf_cfs_rq(cfs_rq);
+ }
+ remove_entity_load_avg(se);
+ }

/*
* Only empty task groups can be destroyed; so we can speculatively
* check on_list without danger of it being re-added.
*/
- if (!tg->cfs_rq[cpu]->on_list)
- continue;
-
- rq = cpu_rq(cpu);
-
- raw_spin_rq_lock_irqsave(rq, flags);
- list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
- raw_spin_rq_unlock_irqrestore(rq, flags);
+ if (cfs_rq->on_list) {
+ guard(rq_lock_irqsave)(rq);
+ list_del_leaf_cfs_rq(cfs_rq);
+ }
}
}


2024-04-26 16:05:04

by Mike Galbraith

[permalink] [raw]
Subject: Re: [RFC][PATCH 08/10] sched/fair: Implement delayed dequeue

On Fri, 2024-04-26 at 13:16 +0200, Peter Zijlstra wrote:
> >
> > I ended up with the below instead; lemme go run this unixbench
> > spawn on it.
>
> Seems to survive that.
>
> I pushed out the patches with updates to queue/sched/eevdf

Yup, solid... but fwiw, tbench liked the previous version better.

trusty (with a 'c') ole i7-4790 box, tbench 8

for i in 1 2 3; do tbench.sh 8 10 2>&1|grep Throughput; done

6.9.0.gc942a0c-master +eevdf.current
NO_DELAY_DEQUEUE
Throughput 3285.04 MB/sec 8 clients 8 procs max_latency=3.481 ms
Throughput 3289.66 MB/sec 8 clients 8 procs max_latency=8.124 ms
Throughput 3293.83 MB/sec 8 clients 8 procs max_latency=2.210 ms
DELAY_DEQUEUE
Throughput 3246.3 MB/sec 8 clients 8 procs max_latency=2.181 ms
Throughput 3236.96 MB/sec 8 clients 8 procs max_latency=6.988 ms
Throughput 3248.6 MB/sec 8 clients 8 procs max_latency=2.130 ms

6.9.0.gc942a0c-master +eevdf.prev
NO_DELAY_DEQUEUE
Throughput 3457.92 MB/sec 8 clients 8 procs max_latency=3.885 ms
Throughput 3470.95 MB/sec 8 clients 8 procs max_latency=4.475 ms
Throughput 3467.87 MB/sec 8 clients 8 procs max_latency=2.182 ms
DELAY_DEQUEUE
Throughput 3712.96 MB/sec 8 clients 8 procs max_latency=4.231 ms
Throughput 3667.87 MB/sec 8 clients 8 procs max_latency=5.020 ms
Throughput 3679.65 MB/sec 8 clients 8 procs max_latency=2.847 ms

Trees are identical modulo extracted eevdf additions. The previous win
that put eevdf on par with cfs went missing... and then some.

For reference, cfs vs eevdf log extract for previously mentioned gain.

6.1.87-cfs
Throughput 3660.98 MB/sec 8 clients 8 procs max_latency=2.204 ms
Throughput 3678.67 MB/sec 8 clients 8 procs max_latency=10.127 ms
Throughput 3631.89 MB/sec 8 clients 8 procs max_latency=13.019 ms
1.000

6.1.87-eevdf - naked eevdf +fixes
Throughput 3441.86 MB/sec 8 clients 8 procs max_latency=3.943 ms
Throughput 3439.68 MB/sec 8 clients 8 procs max_latency=4.285 ms
Throughput 3432.28 MB/sec 8 clients 8 procs max_latency=3.557 ms
vs cfs .940

6.1.87-eevdf +delay_dequeue.prev patch set
DELAY_DEQUEUE
Throughput 3696.94 MB/sec 8 clients 8 procs max_latency=2.179 ms
Throughput 3694.64 MB/sec 8 clients 8 procs max_latency=6.322 ms
Throughput 3654.49 MB/sec 8 clients 8 procs max_latency=4.101 ms
vs cfs 1.006

box waxes nostalgic (son, when I was yo age [flex])
4.19.312
Throughput 4099.07 MB/sec 8 clients 8 procs max_latency=2.169 ms
Throughput 4107.49 MB/sec 8 clients 8 procs max_latency=12.404 ms
Throughput 4118.41 MB/sec 8 clients 8 procs max_latency=14.150 ms

-Mike

2024-04-27 06:44:09

by Mike Galbraith

[permalink] [raw]
Subject: Re: [RFC][PATCH 08/10] sched/fair: Implement delayed dequeue

On Fri, 2024-04-26 at 18:03 +0200, Mike Galbraith wrote:
> fwiw, tbench liked the previous version better.

The culprit lies somewhere in the new PREEMPT_SHORT patch.

-Mike

2024-04-28 16:34:10

by Mike Galbraith

[permalink] [raw]
Subject: Re: [RFC][PATCH 08/10] sched/fair: Implement delayed dequeue

On Sat, 2024-04-27 at 08:42 +0200, Mike Galbraith wrote:
> On Fri, 2024-04-26 at 18:03 +0200, Mike Galbraith wrote:
> > fwiw, tbench liked the previous version better.
>
> The culprit lies somewhere in the new PREEMPT_SHORT patch.

<squint>

Ah, moving the curr eligibility check into pick_curr() left an
ineligible curr to be seen/used further down in pick_eevdf().

found:
if (!best || (curr && entity_before(curr, best)))
best = curr;

Nit: PREEMPT_SHORT depending on RUN_TO_PARITY looks odd.

-Mike

2024-04-29 12:15:49

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [RFC][PATCH 08/10] sched/fair: Implement delayed dequeue

On Sun, Apr 28, 2024 at 06:32:49PM +0200, Mike Galbraith wrote:
> On Sat, 2024-04-27 at 08:42 +0200, Mike Galbraith wrote:
> > On Fri, 2024-04-26 at 18:03 +0200, Mike Galbraith wrote:
> > > fwiw, tbench liked the previous version better.
> >
> > The culprit lies somewhere in the new PREEMPT_SHORT patch.
>
> <squint>
>
> Ah, moving the curr eligibility check into pick_curr() left an
> ineligible curr to be seen/used further down in pick_eevdf().
>
> found:
> if (!best || (curr && entity_before(curr, best)))
> best = curr;

Hmm yes, over-aggressive cleanup, that. Let me try again.

> Nit: PREEMPT_SHORT depending on RUN_TO_PARITY looks odd.

The thinking was that without RUN_TO_PARITY we'll always do a full pick
and it will always pick a (new) shorter deadline task over current.

The PREEMPT_SHORT thing really is an exception to keep RUN_TO_PARITY
from ruining the latency game for short tasks.

2024-04-29 14:33:41

by Luis Machado

[permalink] [raw]
Subject: Re: [RFC][PATCH 08/10] sched/fair: Implement delayed dequeue

Hi Peter,

On 4/26/24 10:32, Peter Zijlstra wrote:
> On Thu, Apr 25, 2024 at 01:49:49PM +0200, Peter Zijlstra wrote:
>> On Thu, Apr 25, 2024 at 12:42:20PM +0200, Peter Zijlstra wrote:
>>
>>>> I wonder if the delayed dequeue logic is having an unwanted effect on the calculation of
>>>> utilization/load of the runqueue and, as a consequence, we're scheduling things to run on
>>>> higher OPP's in the big cores, leading to poor decisions for energy efficiency.
>>>
>>> Notably util_est_update() gets delayed. Given we don't actually do an
>>> enqueue when a delayed task gets woken, it didn't seem to make sense to
>>> update that sooner.
>>
>> The PELT runnable values will be inflated because of delayed dequeue.
>> cpu_util() uses those in the @boost case, and as such this can indeed
>> affect things.
>>
>> This can also slightly affect the cgroup case, but since the delay goes
>> away as contention goes away, and the cgroup case must already assume
>> worst case overlap, this seems limited.
>>
>> /me goes ponder things moar.
>
> First order approximation of a fix would be something like the totally
> untested below I suppose...

I gave this a try on the Pixel 6, and I noticed some improvement (see below), but not
enough to bring it back to the original levels.

(1) m6.6-stock - Basic EEVDF with wakeup preemption fix (baseline)
(2) m6.6-eevdf-complete: m6.6-stock plus this series.
(3) m6.6-eevdf-complete-no-delay-dequeue: (2) + NO_DELAY_DEQUEUE
(4) m6.6-eevdf-complete-no-delay-dequeue-no-delay-zero: (2) + NO_DELAY_DEQUEUE + NO_DELAY_ZERO
(5) m6.6-eevdf-complete-no-delay-zero: (2) + NO_DELAY_ZERO
(6) m6.6-eevdf-complete-pelt-fix: (2) + the proposed load_avg update patch.

I included (3), (4) and (5) to exercise the impact of disabling the individual
scheduler features.


Energy use.

+------------+------------------------------------------------------+-----------+
| cluster | tag | perc_diff |
+------------+------------------------------------------------------+-----------+
| CPU | m6.6-stock | 0.0% |
| CPU-Big | m6.6-stock | 0.0% |
| CPU-Little | m6.6-stock | 0.0% |
| CPU-Mid | m6.6-stock | 0.0% |
| GPU | m6.6-stock | 0.0% |
| Total | m6.6-stock | 0.0% |
| CPU | m6.6-eevdf-complete | 114.51% |
| CPU-Big | m6.6-eevdf-complete | 90.75% |
| CPU-Little | m6.6-eevdf-complete | 98.74% |
| CPU-Mid | m6.6-eevdf-complete | 213.9% |
| GPU | m6.6-eevdf-complete | -7.04% |
| Total | m6.6-eevdf-complete | 100.92% |
| CPU | m6.6-eevdf-complete-no-delay-dequeue | 117.77% |
| CPU-Big | m6.6-eevdf-complete-no-delay-dequeue | 113.79% |
| CPU-Little | m6.6-eevdf-complete-no-delay-dequeue | 97.47% |
| CPU-Mid | m6.6-eevdf-complete-no-delay-dequeue | 189.0% |
| GPU | m6.6-eevdf-complete-no-delay-dequeue | -6.74% |
| Total | m6.6-eevdf-complete-no-delay-dequeue | 103.84% |
| CPU | m6.6-eevdf-complete-no-delay-dequeue-no-delay-zero | 120.45% |
| CPU-Big | m6.6-eevdf-complete-no-delay-dequeue-no-delay-zero | 113.65% |
| CPU-Little | m6.6-eevdf-complete-no-delay-dequeue-no-delay-zero | 99.04% |
| CPU-Mid | m6.6-eevdf-complete-no-delay-dequeue-no-delay-zero | 201.14% |
| GPU | m6.6-eevdf-complete-no-delay-dequeue-no-delay-zero | -5.37% |
| Total | m6.6-eevdf-complete-no-delay-dequeue-no-delay-zero | 106.38% |
| CPU | m6.6-eevdf-complete-no-delay-zero | 119.05% |
| CPU-Big | m6.6-eevdf-complete-no-delay-zero | 107.55% |
| CPU-Little | m6.6-eevdf-complete-no-delay-zero | 98.66% |
| CPU-Mid | m6.6-eevdf-complete-no-delay-zero | 206.58% |
| GPU | m6.6-eevdf-complete-no-delay-zero | -5.25% |
| Total | m6.6-eevdf-complete-no-delay-zero | 105.14% |
| CPU | m6.6-eevdf-complete-pelt-fix | 105.56% |
| CPU-Big | m6.6-eevdf-complete-pelt-fix | 100.45% |
| CPU-Little | m6.6-eevdf-complete-pelt-fix | 94.4% |
| CPU-Mid | m6.6-eevdf-complete-pelt-fix | 150.94% |
| GPU | m6.6-eevdf-complete-pelt-fix | -3.96% |
| Total | m6.6-eevdf-complete-pelt-fix | 93.31% |
+------------+------------------------------------------------------+-----------+

Utilization and load levels.

+---------+------------------------------------------------------+----------+-----------+
| cluster | tag | variable | perc_diff |
+---------+------------------------------------------------------+----------+-----------+
| little | m6.6-stock | load | 0.0% |
| little | m6.6-stock | util | 0.0% |
| little | m6.6-eevdf-complete | load | 29.56% |
| little | m6.6-eevdf-complete | util | 55.4% |
| little | m6.6-eevdf-complete-no-delay-dequeue | load | 42.89% |
| little | m6.6-eevdf-complete-no-delay-dequeue | util | 69.47% |
| little | m6.6-eevdf-complete-no-delay-dequeue-no-delay-zero | load | 51.05% |
| little | m6.6-eevdf-complete-no-delay-dequeue-no-delay-zero | util | 76.55% |
| little | m6.6-eevdf-complete-no-delay-zero | load | 34.51% |
| little | m6.6-eevdf-complete-no-delay-zero | util | 72.53% |
| little | m6.6-eevdf-complete-pelt-fix | load | 29.96% |
| little | m6.6-eevdf-complete-pelt-fix | util | 59.82% |
| mid | m6.6-stock | load | 0.0% |
| mid | m6.6-stock | util | 0.0% |
| mid | m6.6-eevdf-complete | load | 29.37% |
| mid | m6.6-eevdf-complete | util | 75.22% |
| mid | m6.6-eevdf-complete-no-delay-dequeue | load | 36.4% |
| mid | m6.6-eevdf-complete-no-delay-dequeue | util | 80.28% |
| mid | m6.6-eevdf-complete-no-delay-dequeue-no-delay-zero | load | 30.35% |
| mid | m6.6-eevdf-complete-no-delay-dequeue-no-delay-zero | util | 90.2% |
| mid | m6.6-eevdf-complete-no-delay-zero | load | 37.83% |
| mid | m6.6-eevdf-complete-no-delay-zero | util | 93.79% |
| mid | m6.6-eevdf-complete-pelt-fix | load | 33.57% |
| mid | m6.6-eevdf-complete-pelt-fix | util | 67.83% |
| big | m6.6-stock | load | 0.0% |
| big | m6.6-stock | util | 0.0% |
| big | m6.6-eevdf-complete | load | 97.39% |
| big | m6.6-eevdf-complete | util | 12.63% |
| big | m6.6-eevdf-complete-no-delay-dequeue | load | 139.69% |
| big | m6.6-eevdf-complete-no-delay-dequeue | util | 22.58% |
| big | m6.6-eevdf-complete-no-delay-dequeue-no-delay-zero | load | 125.36% |
| big | m6.6-eevdf-complete-no-delay-dequeue-no-delay-zero | util | 23.15% |
| big | m6.6-eevdf-complete-no-delay-zero | load | 128.56% |
| big | m6.6-eevdf-complete-no-delay-zero | util | 25.03% |
| big | m6.6-eevdf-complete-pelt-fix | load | 130.73% |
| big | m6.6-eevdf-complete-pelt-fix | util | 17.52% |
+---------+------------------------------------------------------+----------+-----------+

2024-05-02 10:26:44

by Luis Machado

[permalink] [raw]
Subject: Re: [RFC][PATCH 08/10] sched/fair: Implement delayed dequeue

On 4/29/24 15:33, Luis Machado wrote:
> Hi Peter,
>
> On 4/26/24 10:32, Peter Zijlstra wrote:
>> On Thu, Apr 25, 2024 at 01:49:49PM +0200, Peter Zijlstra wrote:
>>> On Thu, Apr 25, 2024 at 12:42:20PM +0200, Peter Zijlstra wrote:
>>>
>>>>> I wonder if the delayed dequeue logic is having an unwanted effect on the calculation of
>>>>> utilization/load of the runqueue and, as a consequence, we're scheduling things to run on
>>>>> higher OPP's in the big cores, leading to poor decisions for energy efficiency.
>>>>
>>>> Notably util_est_update() gets delayed. Given we don't actually do an
>>>> enqueue when a delayed task gets woken, it didn't seem to make sense to
>>>> update that sooner.
>>>
>>> The PELT runnable values will be inflated because of delayed dequeue.
>>> cpu_util() uses those in the @boost case, and as such this can indeed
>>> affect things.
>>>
>>> This can also slightly affect the cgroup case, but since the delay goes
>>> away as contention goes away, and the cgroup case must already assume
>>> worst case overlap, this seems limited.
>>>
>>> /me goes ponder things moar.
>>
>> First order approximation of a fix would be something like the totally
>> untested below I suppose...
>
> I gave this a try on the Pixel 6, and I noticed some improvement (see below), but not
> enough to bring it back to the original levels.
>
> (1) m6.6-stock - Basic EEVDF with wakeup preemption fix (baseline)
> (2) m6.6-eevdf-complete: m6.6-stock plus this series.
> (3) m6.6-eevdf-complete-no-delay-dequeue: (2) + NO_DELAY_DEQUEUE
> (4) m6.6-eevdf-complete-no-delay-dequeue-no-delay-zero: (2) + NO_DELAY_DEQUEUE + NO_DELAY_ZERO
> (5) m6.6-eevdf-complete-no-delay-zero: (2) + NO_DELAY_ZERO
> (6) m6.6-eevdf-complete-pelt-fix: (2) + the proposed load_avg update patch.
>
> I included (3), (4) and (5) to exercise the impact of disabling the individual
> scheduler features.
>
>
> Energy use.
>
> +------------+------------------------------------------------------+-----------+
> | cluster | tag | perc_diff |
> +------------+------------------------------------------------------+-----------+
> | CPU | m6.6-stock | 0.0% |
> | CPU-Big | m6.6-stock | 0.0% |
> | CPU-Little | m6.6-stock | 0.0% |
> | CPU-Mid | m6.6-stock | 0.0% |
> | GPU | m6.6-stock | 0.0% |
> | Total | m6.6-stock | 0.0% |
> | CPU | m6.6-eevdf-complete | 114.51% |
> | CPU-Big | m6.6-eevdf-complete | 90.75% |
> | CPU-Little | m6.6-eevdf-complete | 98.74% |
> | CPU-Mid | m6.6-eevdf-complete | 213.9% |
> | GPU | m6.6-eevdf-complete | -7.04% |
> | Total | m6.6-eevdf-complete | 100.92% |
> | CPU | m6.6-eevdf-complete-no-delay-dequeue | 117.77% |
> | CPU-Big | m6.6-eevdf-complete-no-delay-dequeue | 113.79% |
> | CPU-Little | m6.6-eevdf-complete-no-delay-dequeue | 97.47% |
> | CPU-Mid | m6.6-eevdf-complete-no-delay-dequeue | 189.0% |
> | GPU | m6.6-eevdf-complete-no-delay-dequeue | -6.74% |
> | Total | m6.6-eevdf-complete-no-delay-dequeue | 103.84% |
> | CPU | m6.6-eevdf-complete-no-delay-dequeue-no-delay-zero | 120.45% |
> | CPU-Big | m6.6-eevdf-complete-no-delay-dequeue-no-delay-zero | 113.65% |
> | CPU-Little | m6.6-eevdf-complete-no-delay-dequeue-no-delay-zero | 99.04% |
> | CPU-Mid | m6.6-eevdf-complete-no-delay-dequeue-no-delay-zero | 201.14% |
> | GPU | m6.6-eevdf-complete-no-delay-dequeue-no-delay-zero | -5.37% |
> | Total | m6.6-eevdf-complete-no-delay-dequeue-no-delay-zero | 106.38% |
> | CPU | m6.6-eevdf-complete-no-delay-zero | 119.05% |
> | CPU-Big | m6.6-eevdf-complete-no-delay-zero | 107.55% |
> | CPU-Little | m6.6-eevdf-complete-no-delay-zero | 98.66% |
> | CPU-Mid | m6.6-eevdf-complete-no-delay-zero | 206.58% |
> | GPU | m6.6-eevdf-complete-no-delay-zero | -5.25% |
> | Total | m6.6-eevdf-complete-no-delay-zero | 105.14% |
> | CPU | m6.6-eevdf-complete-pelt-fix | 105.56% |
> | CPU-Big | m6.6-eevdf-complete-pelt-fix | 100.45% |
> | CPU-Little | m6.6-eevdf-complete-pelt-fix | 94.4% |
> | CPU-Mid | m6.6-eevdf-complete-pelt-fix | 150.94% |
> | GPU | m6.6-eevdf-complete-pelt-fix | -3.96% |
> | Total | m6.6-eevdf-complete-pelt-fix | 93.31% |
> +------------+------------------------------------------------------+-----------+
>
> Utilization and load levels.
>
> +---------+------------------------------------------------------+----------+-----------+
> | cluster | tag | variable | perc_diff |
> +---------+------------------------------------------------------+----------+-----------+
> | little | m6.6-stock | load | 0.0% |
> | little | m6.6-stock | util | 0.0% |
> | little | m6.6-eevdf-complete | load | 29.56% |
> | little | m6.6-eevdf-complete | util | 55.4% |
> | little | m6.6-eevdf-complete-no-delay-dequeue | load | 42.89% |
> | little | m6.6-eevdf-complete-no-delay-dequeue | util | 69.47% |
> | little | m6.6-eevdf-complete-no-delay-dequeue-no-delay-zero | load | 51.05% |
> | little | m6.6-eevdf-complete-no-delay-dequeue-no-delay-zero | util | 76.55% |
> | little | m6.6-eevdf-complete-no-delay-zero | load | 34.51% |
> | little | m6.6-eevdf-complete-no-delay-zero | util | 72.53% |
> | little | m6.6-eevdf-complete-pelt-fix | load | 29.96% |
> | little | m6.6-eevdf-complete-pelt-fix | util | 59.82% |
> | mid | m6.6-stock | load | 0.0% |
> | mid | m6.6-stock | util | 0.0% |
> | mid | m6.6-eevdf-complete | load | 29.37% |
> | mid | m6.6-eevdf-complete | util | 75.22% |
> | mid | m6.6-eevdf-complete-no-delay-dequeue | load | 36.4% |
> | mid | m6.6-eevdf-complete-no-delay-dequeue | util | 80.28% |
> | mid | m6.6-eevdf-complete-no-delay-dequeue-no-delay-zero | load | 30.35% |
> | mid | m6.6-eevdf-complete-no-delay-dequeue-no-delay-zero | util | 90.2% |
> | mid | m6.6-eevdf-complete-no-delay-zero | load | 37.83% |
> | mid | m6.6-eevdf-complete-no-delay-zero | util | 93.79% |
> | mid | m6.6-eevdf-complete-pelt-fix | load | 33.57% |
> | mid | m6.6-eevdf-complete-pelt-fix | util | 67.83% |
> | big | m6.6-stock | load | 0.0% |
> | big | m6.6-stock | util | 0.0% |
> | big | m6.6-eevdf-complete | load | 97.39% |
> | big | m6.6-eevdf-complete | util | 12.63% |
> | big | m6.6-eevdf-complete-no-delay-dequeue | load | 139.69% |
> | big | m6.6-eevdf-complete-no-delay-dequeue | util | 22.58% |
> | big | m6.6-eevdf-complete-no-delay-dequeue-no-delay-zero | load | 125.36% |
> | big | m6.6-eevdf-complete-no-delay-dequeue-no-delay-zero | util | 23.15% |
> | big | m6.6-eevdf-complete-no-delay-zero | load | 128.56% |
> | big | m6.6-eevdf-complete-no-delay-zero | util | 25.03% |
> | big | m6.6-eevdf-complete-pelt-fix | load | 130.73% |
> | big | m6.6-eevdf-complete-pelt-fix | util | 17.52% |
> +---------+------------------------------------------------------+----------+-----------+

Going through the code, my understanding is that the util_est functions are being called correctly
and in the right order: first util_est_enqueue, then util_est_dequeue and finally util_est_update.
So the stats *should* be correct.

On dequeue (dequeue_task_fair) we call util_est_dequeue immediately, even for a DEQUEUE_DELAYED
task: the dequeue_delayed task is not going to run for now, even though it is still on the rq.

We delay the util_est_update of dequeue_delayed tasks until a later time in dequeue_entities.
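
To make the ordering explicit, this is roughly how I picture the dequeue side. It is a hand-written
sketch of my reading of the series, with signatures approximated and most of the bodies elided, not
the actual code:

static bool dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
{
	bool done = true;

	/* entity dequeue elided; a sleeping but still-ineligible se stays
	 * on the rq, marked sched_delayed, and done becomes false */

	/* the !DEQUEUE_DELAYED cases are elided here */
	if (flags & DEQUEUE_DELAYED) {
		/* the se really leaves the rq only now, so only now update util_est */
		util_est_update(&rq->cfs, task_of(se), flags & DEQUEUE_SLEEP);
	}

	return done;
}

static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
{
	/* the util_est contribution is removed right away, delayed or not */
	util_est_dequeue(&rq->cfs, p);

	/* may leave a sleeping-but-ineligible se on the rq as sched_delayed */
	return dequeue_entities(rq, &p->se, flags);
}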

Eventually the dequeue_delayed task will have its lag zeroed when it becomes eligible again
(requeue_delayed_entity), while still being on the rq. It will then get dequeued/enqueued (requeued)
and marked as a non-dequeue-delayed task.

Next time we attempt to enqueue such a task (enqueue_task_fair), it will skip the ENQUEUE_DELAYED
block and call util_est_enqueue.
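
The wakeup/enqueue side, again as a rough sketch of my understanding rather than the real function
(bodies elided, details approximated):

static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
{
	struct sched_entity *se = &p->se;

	if (flags & ENQUEUE_DELAYED) {
		/* se is still on the rq: zero its lag, requeue it, clear sched_delayed */
		requeue_delayed_entity(se);
		return;		/* note: no util_est_enqueue() on this path */
	}

	/* a task that is no longer sched_delayed takes the normal path */
	util_est_enqueue(&rq->cfs, p);

	/* rest of the enqueue path elided */
}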

Still, something seems to be signalling that util/load is high, and causing migration to the big cores.

Maybe we're not decaying the util/load properly at some point, and the inflated numbers build up
from there.

I'll continue investigating.