2023-05-31 12:58:10

by Peter Zijlstra

[permalink] [raw]
Subject: [PATCH 07/15] sched/smp: Use lag to simplify cross-runqueue placement

Using lag is both more correct and simpler when moving between
runqueues.

Notably, min_vruntime() was invented as a cheap approximation of
avg_vruntime() for this very purpose (SMP migration). Since we now
have the real thing, use it.

Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
---
kernel/sched/fair.c | 145 ++++++----------------------------------------------
1 file changed, 19 insertions(+), 126 deletions(-)

--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5083,7 +5083,7 @@ place_entity(struct cfs_rq *cfs_rq, stru
*
* EEVDF: placement strategy #1 / #2
*/
- if (sched_feat(PLACE_LAG) && cfs_rq->nr_running > 1) {
+ if (sched_feat(PLACE_LAG) && cfs_rq->nr_running) {
struct sched_entity *curr = cfs_rq->curr;
unsigned long load;

@@ -5171,61 +5171,21 @@ static void check_enqueue_throttle(struc

static inline bool cfs_bandwidth_used(void);

-/*
- * MIGRATION
- *
- * dequeue
- * update_curr()
- * update_min_vruntime()
- * vruntime -= min_vruntime
- *
- * enqueue
- * update_curr()
- * update_min_vruntime()
- * vruntime += min_vruntime
- *
- * this way the vruntime transition between RQs is done when both
- * min_vruntime are up-to-date.
- *
- * WAKEUP (remote)
- *
- * ->migrate_task_rq_fair() (p->state == TASK_WAKING)
- * vruntime -= min_vruntime
- *
- * enqueue
- * update_curr()
- * update_min_vruntime()
- * vruntime += min_vruntime
- *
- * this way we don't have the most up-to-date min_vruntime on the originating
- * CPU and an up-to-date min_vruntime on the destination CPU.
- */
-
static void
enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
- bool renorm = !(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATED);
bool curr = cfs_rq->curr == se;

/*
* If we're the current task, we must renormalise before calling
* update_curr().
*/
- if (renorm && curr)
- se->vruntime += cfs_rq->min_vruntime;
+ if (curr)
+ place_entity(cfs_rq, se, 0);

update_curr(cfs_rq);

/*
- * Otherwise, renormalise after, such that we're placed at the current
- * moment in time, instead of some random moment in the past. Being
- * placed in the past could significantly boost this task to the
- * fairness detriment of existing tasks.
- */
- if (renorm && !curr)
- se->vruntime += cfs_rq->min_vruntime;
-
- /*
* When enqueuing a sched_entity, we must:
* - Update loads to have both entity and cfs_rq synced with now.
* - For group_entity, update its runnable_weight to reflect the new
@@ -5236,11 +5196,22 @@ enqueue_entity(struct cfs_rq *cfs_rq, st
*/
update_load_avg(cfs_rq, se, UPDATE_TG | DO_ATTACH);
se_update_runnable(se);
+ /*
+ * XXX update_load_avg() above will have attached us to the pelt sum;
+ * but update_cfs_group() here will re-adjust the weight and have to
+ * undo/redo all that. Seems wasteful.
+ */
update_cfs_group(se);
- account_entity_enqueue(cfs_rq, se);

- if (flags & ENQUEUE_WAKEUP)
+ /*
+ * XXX now that the entity has been re-weighted, and its lag adjusted,
+ * we can place the entity.
+ */
+ if (!curr)
place_entity(cfs_rq, se, 0);
+
+ account_entity_enqueue(cfs_rq, se);
+
/* Entity has migrated, no longer consider this task hot */
if (flags & ENQUEUE_MIGRATED)
se->exec_start = 0;
@@ -5335,23 +5306,12 @@ dequeue_entity(struct cfs_rq *cfs_rq, st

clear_buddies(cfs_rq, se);

- if (flags & DEQUEUE_SLEEP)
- update_entity_lag(cfs_rq, se);
-
+ update_entity_lag(cfs_rq, se);
if (se != cfs_rq->curr)
__dequeue_entity(cfs_rq, se);
se->on_rq = 0;
account_entity_dequeue(cfs_rq, se);

- /*
- * Normalize after update_curr(); which will also have moved
- * min_vruntime if @se is the one holding it back. But before doing
- * update_min_vruntime() again, which will discount @se's position and
- * can move min_vruntime forward still more.
- */
- if (!(flags & DEQUEUE_SLEEP))
- se->vruntime -= cfs_rq->min_vruntime;
-
/* return excess runtime on last dequeue */
return_cfs_rq_runtime(cfs_rq);

@@ -8102,18 +8062,6 @@ static void migrate_task_rq_fair(struct
{
struct sched_entity *se = &p->se;

- /*
- * As blocked tasks retain absolute vruntime the migration needs to
- * deal with this by subtracting the old and adding the new
- * min_vruntime -- the latter is done by enqueue_entity() when placing
- * the task on the new runqueue.
- */
- if (READ_ONCE(p->__state) == TASK_WAKING) {
- struct cfs_rq *cfs_rq = cfs_rq_of(se);
-
- se->vruntime -= u64_u32_load(cfs_rq->min_vruntime);
- }
-
if (!task_on_rq_migrating(p)) {
remove_entity_load_avg(se);

@@ -12482,8 +12430,8 @@ static void task_tick_fair(struct rq *rq
*/
static void task_fork_fair(struct task_struct *p)
{
- struct cfs_rq *cfs_rq;
struct sched_entity *se = &p->se, *curr;
+ struct cfs_rq *cfs_rq;
struct rq *rq = this_rq();
struct rq_flags rf;

@@ -12492,22 +12440,9 @@ static void task_fork_fair(struct task_s

cfs_rq = task_cfs_rq(current);
curr = cfs_rq->curr;
- if (curr) {
+ if (curr)
update_curr(cfs_rq);
- se->vruntime = curr->vruntime;
- }
place_entity(cfs_rq, se, 1);
-
- if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
- /*
- * Upon rescheduling, sched_class::put_prev_task() will place
- * 'current' within the tree based on its new key value.
- */
- swap(curr->vruntime, se->vruntime);
- resched_curr(rq);
- }
-
- se->vruntime -= cfs_rq->min_vruntime;
rq_unlock(rq, &rf);
}

@@ -12536,34 +12471,6 @@ prio_changed_fair(struct rq *rq, struct
check_preempt_curr(rq, p, 0);
}

-static inline bool vruntime_normalized(struct task_struct *p)
-{
- struct sched_entity *se = &p->se;
-
- /*
- * In both the TASK_ON_RQ_QUEUED and TASK_ON_RQ_MIGRATING cases,
- * the dequeue_entity(.flags=0) will already have normalized the
- * vruntime.
- */
- if (p->on_rq)
- return true;
-
- /*
- * When !on_rq, vruntime of the task has usually NOT been normalized.
- * But there are some cases where it has already been normalized:
- *
- * - A forked child which is waiting for being woken up by
- * wake_up_new_task().
- * - A task which has been woken up by try_to_wake_up() and
- * waiting for actually being woken up by sched_ttwu_pending().
- */
- if (!se->sum_exec_runtime ||
- (READ_ONCE(p->__state) == TASK_WAKING && p->sched_remote_wakeup))
- return true;
-
- return false;
-}
-
#ifdef CONFIG_FAIR_GROUP_SCHED
/*
* Propagate the changes of the sched_entity across the tg tree to make it
@@ -12634,16 +12541,6 @@ static void attach_entity_cfs_rq(struct
static void detach_task_cfs_rq(struct task_struct *p)
{
struct sched_entity *se = &p->se;
- struct cfs_rq *cfs_rq = cfs_rq_of(se);
-
- if (!vruntime_normalized(p)) {
- /*
- * Fix up our vruntime so that the current sleep doesn't
- * cause 'unlimited' sleep bonus.
- */
- place_entity(cfs_rq, se, 0);
- se->vruntime -= cfs_rq->min_vruntime;
- }

detach_entity_cfs_rq(se);
}
@@ -12651,12 +12548,8 @@ static void detach_task_cfs_rq(struct ta
static void attach_task_cfs_rq(struct task_struct *p)
{
struct sched_entity *se = &p->se;
- struct cfs_rq *cfs_rq = cfs_rq_of(se);

attach_entity_cfs_rq(se);
-
- if (!vruntime_normalized(p))
- se->vruntime += cfs_rq->min_vruntime;
}

static void switched_from_fair(struct rq *rq, struct task_struct *p)
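
The scheme is easy to see in miniature. Assuming an entity that carries
vruntime and vlag and a queue that can report its load-weighted average
vruntime (stand-in names and types; an illustration of the idea, not the
kernel code):

#include <stdint.h>

struct sched_entity { int64_t vruntime, vlag; };
struct cfs_rq { int64_t avg_vruntime; };	/* assumed kept up to date */

/* dequeue: record how far the entity is from this queue's average;
 * vlag > 0 means it is owed service, vlag < 0 means it overran. */
static void record_lag(struct cfs_rq *rq, struct sched_entity *se)
{
	se->vlag = rq->avg_vruntime - se->vruntime;
}

/* enqueue, possibly on another CPU: re-derive vruntime from the
 * destination's average so the entity keeps the same relative lag. */
static void place_with_lag(struct cfs_rq *rq, struct sched_entity *se)
{
	se->vruntime = rq->avg_vruntime - se->vlag;
}

Because lag is relative to the queue average rather than to an absolute
min_vruntime, nothing has to be subtracted on the way out and re-added on
the way in, which is what lets the patch delete the MIGRATION comment
block and both renormalisation paths.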




Subject: [tip: sched/core] sched/smp: Use lag to simplify cross-runqueue placement

The following commit has been merged into the sched/core branch of tip:

Commit-ID: e8f331bcc270354a803c2127c486190d33eac441
Gitweb: https://git.kernel.org/tip/e8f331bcc270354a803c2127c486190d33eac441
Author: Peter Zijlstra <[email protected]>
AuthorDate: Wed, 31 May 2023 13:58:46 +02:00
Committer: Ingo Molnar <[email protected]>
CommitterDate: Wed, 19 Jul 2023 09:43:58 +02:00

sched/smp: Use lag to simplify cross-runqueue placement

Using lag is both more correct and simpler when moving between
runqueues.

Notably, min_vruntime() was invented as a cheap approximation of
avg_vruntime() for this very purpose (SMP migration). Since we now
have the real thing, use it.

Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Signed-off-by: Ingo Molnar <[email protected]>
Link: https://lore.kernel.org/r/[email protected]
---
kernel/sched/fair.c | 145 +++++--------------------------------------
1 file changed, 19 insertions(+), 126 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 58798da..57e8bc1 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5083,7 +5083,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
*
* EEVDF: placement strategy #1 / #2
*/
- if (sched_feat(PLACE_LAG) && cfs_rq->nr_running > 1) {
+ if (sched_feat(PLACE_LAG) && cfs_rq->nr_running) {
struct sched_entity *curr = cfs_rq->curr;
unsigned long load;

@@ -5172,61 +5172,21 @@ static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq);

static inline bool cfs_bandwidth_used(void);

-/*
- * MIGRATION
- *
- * dequeue
- * update_curr()
- * update_min_vruntime()
- * vruntime -= min_vruntime
- *
- * enqueue
- * update_curr()
- * update_min_vruntime()
- * vruntime += min_vruntime
- *
- * this way the vruntime transition between RQs is done when both
- * min_vruntime are up-to-date.
- *
- * WAKEUP (remote)
- *
- * ->migrate_task_rq_fair() (p->state == TASK_WAKING)
- * vruntime -= min_vruntime
- *
- * enqueue
- * update_curr()
- * update_min_vruntime()
- * vruntime += min_vruntime
- *
- * this way we don't have the most up-to-date min_vruntime on the originating
- * CPU and an up-to-date min_vruntime on the destination CPU.
- */
-
static void
enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
- bool renorm = !(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATED);
bool curr = cfs_rq->curr == se;

/*
* If we're the current task, we must renormalise before calling
* update_curr().
*/
- if (renorm && curr)
- se->vruntime += cfs_rq->min_vruntime;
+ if (curr)
+ place_entity(cfs_rq, se, 0);

update_curr(cfs_rq);

/*
- * Otherwise, renormalise after, such that we're placed at the current
- * moment in time, instead of some random moment in the past. Being
- * placed in the past could significantly boost this task to the
- * fairness detriment of existing tasks.
- */
- if (renorm && !curr)
- se->vruntime += cfs_rq->min_vruntime;
-
- /*
* When enqueuing a sched_entity, we must:
* - Update loads to have both entity and cfs_rq synced with now.
* - For group_entity, update its runnable_weight to reflect the new
@@ -5237,11 +5197,22 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
*/
update_load_avg(cfs_rq, se, UPDATE_TG | DO_ATTACH);
se_update_runnable(se);
+ /*
+ * XXX update_load_avg() above will have attached us to the pelt sum;
+ * but update_cfs_group() here will re-adjust the weight and have to
+ * undo/redo all that. Seems wasteful.
+ */
update_cfs_group(se);
- account_entity_enqueue(cfs_rq, se);

- if (flags & ENQUEUE_WAKEUP)
+ /*
+ * XXX now that the entity has been re-weighted, and its lag adjusted,
+ * we can place the entity.
+ */
+ if (!curr)
place_entity(cfs_rq, se, 0);
+
+ account_entity_enqueue(cfs_rq, se);
+
/* Entity has migrated, no longer consider this task hot */
if (flags & ENQUEUE_MIGRATED)
se->exec_start = 0;
@@ -5346,23 +5317,12 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)

clear_buddies(cfs_rq, se);

- if (flags & DEQUEUE_SLEEP)
- update_entity_lag(cfs_rq, se);
-
+ update_entity_lag(cfs_rq, se);
if (se != cfs_rq->curr)
__dequeue_entity(cfs_rq, se);
se->on_rq = 0;
account_entity_dequeue(cfs_rq, se);

- /*
- * Normalize after update_curr(); which will also have moved
- * min_vruntime if @se is the one holding it back. But before doing
- * update_min_vruntime() again, which will discount @se's position and
- * can move min_vruntime forward still more.
- */
- if (!(flags & DEQUEUE_SLEEP))
- se->vruntime -= cfs_rq->min_vruntime;
-
/* return excess runtime on last dequeue */
return_cfs_rq_runtime(cfs_rq);

@@ -8208,18 +8168,6 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
{
struct sched_entity *se = &p->se;

- /*
- * As blocked tasks retain absolute vruntime the migration needs to
- * deal with this by subtracting the old and adding the new
- * min_vruntime -- the latter is done by enqueue_entity() when placing
- * the task on the new runqueue.
- */
- if (READ_ONCE(p->__state) == TASK_WAKING) {
- struct cfs_rq *cfs_rq = cfs_rq_of(se);
-
- se->vruntime -= u64_u32_load(cfs_rq->min_vruntime);
- }
-
if (!task_on_rq_migrating(p)) {
remove_entity_load_avg(se);

@@ -12709,8 +12657,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
*/
static void task_fork_fair(struct task_struct *p)
{
- struct cfs_rq *cfs_rq;
struct sched_entity *se = &p->se, *curr;
+ struct cfs_rq *cfs_rq;
struct rq *rq = this_rq();
struct rq_flags rf;

@@ -12719,22 +12667,9 @@ static void task_fork_fair(struct task_struct *p)

cfs_rq = task_cfs_rq(current);
curr = cfs_rq->curr;
- if (curr) {
+ if (curr)
update_curr(cfs_rq);
- se->vruntime = curr->vruntime;
- }
place_entity(cfs_rq, se, 1);
-
- if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
- /*
- * Upon rescheduling, sched_class::put_prev_task() will place
- * 'current' within the tree based on its new key value.
- */
- swap(curr->vruntime, se->vruntime);
- resched_curr(rq);
- }
-
- se->vruntime -= cfs_rq->min_vruntime;
rq_unlock(rq, &rf);
}

@@ -12763,34 +12698,6 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
check_preempt_curr(rq, p, 0);
}

-static inline bool vruntime_normalized(struct task_struct *p)
-{
- struct sched_entity *se = &p->se;
-
- /*
- * In both the TASK_ON_RQ_QUEUED and TASK_ON_RQ_MIGRATING cases,
- * the dequeue_entity(.flags=0) will already have normalized the
- * vruntime.
- */
- if (p->on_rq)
- return true;
-
- /*
- * When !on_rq, vruntime of the task has usually NOT been normalized.
- * But there are some cases where it has already been normalized:
- *
- * - A forked child which is waiting for being woken up by
- * wake_up_new_task().
- * - A task which has been woken up by try_to_wake_up() and
- * waiting for actually being woken up by sched_ttwu_pending().
- */
- if (!se->sum_exec_runtime ||
- (READ_ONCE(p->__state) == TASK_WAKING && p->sched_remote_wakeup))
- return true;
-
- return false;
-}
-
#ifdef CONFIG_FAIR_GROUP_SCHED
/*
* Propagate the changes of the sched_entity across the tg tree to make it
@@ -12861,16 +12768,6 @@ static void attach_entity_cfs_rq(struct sched_entity *se)
static void detach_task_cfs_rq(struct task_struct *p)
{
struct sched_entity *se = &p->se;
- struct cfs_rq *cfs_rq = cfs_rq_of(se);
-
- if (!vruntime_normalized(p)) {
- /*
- * Fix up our vruntime so that the current sleep doesn't
- * cause 'unlimited' sleep bonus.
- */
- place_entity(cfs_rq, se, 0);
- se->vruntime -= cfs_rq->min_vruntime;
- }

detach_entity_cfs_rq(se);
}
@@ -12878,12 +12775,8 @@ static void detach_task_cfs_rq(struct task_struct *p)
static void attach_task_cfs_rq(struct task_struct *p)
{
struct sched_entity *se = &p->se;
- struct cfs_rq *cfs_rq = cfs_rq_of(se);

attach_entity_cfs_rq(se);
-
- if (!vruntime_normalized(p))
- se->vruntime += cfs_rq->min_vruntime;
}

static void switched_from_fair(struct rq *rq, struct task_struct *p)

2023-09-12 15:32:21

by Sebastian Andrzej Siewior

[permalink] [raw]
Subject: Re: [PATCH 07/15] sched/smp: Use lag to simplify cross-runqueue placement

On 2023-05-31 13:58:46 [+0200], Peter Zijlstra wrote:
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -12492,22 +12440,9 @@ static void task_fork_fair(struct task_s
>
> cfs_rq = task_cfs_rq(current);
> curr = cfs_rq->curr;
> - if (curr) {
> + if (curr)
> update_curr(cfs_rq);
> - se->vruntime = curr->vruntime;
> - }
> place_entity(cfs_rq, se, 1);
> -
> - if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {

Since this removal there is no user of sysctl_sched_child_runs_first
anymore. The sysctl file sched_child_runs_first still exists, but has
no functionality behind it.
Is this intended, or should it be removed?


> rq_unlock(rq, &rf);
> }

Sebastian
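
For reference, the leftover file is just a sysctl table entry whose backing
variable no longer has readers; roughly (a paraphrased sketch of the fair.c
sysctl table, not a verbatim quote):

unsigned int sysctl_sched_child_runs_first __read_mostly;

static struct ctl_table sched_fair_sysctls[] = {
	{
		.procname	= "sched_child_runs_first",
		.data		= &sysctl_sched_child_runs_first,
		.maxlen		= sizeof(unsigned int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	/* ... */
};

Writing to the file still updates the variable; nothing in the scheduler
reads it back.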

2023-09-13 12:21:29

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH 07/15] sched/smp: Use lag to simplify cross-runqueue placement

On Tue, Sep 12, 2023 at 05:32:21PM +0200, Sebastian Andrzej Siewior wrote:
> On 2023-05-31 13:58:46 [+0200], Peter Zijlstra wrote:
> > --- a/kernel/sched/fair.c
> > +++ b/kernel/sched/fair.c
> > @@ -12492,22 +12440,9 @@ static void task_fork_fair(struct task_s
> >
> > cfs_rq = task_cfs_rq(current);
> > curr = cfs_rq->curr;
> > - if (curr) {
> > + if (curr)
> > update_curr(cfs_rq);
> > - se->vruntime = curr->vruntime;
> > - }
> > place_entity(cfs_rq, se, 1);
> > -
> > - if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
>
> Since the removal of sysctl_sched_child_runs_first there is no user of
> this anymore. There is still the sysctl file sched_child_runs_first with
> no functionality.
> Is this intended or should it be removed?

Hurmph... I think that knob has been somewhat dysfunctional for a long
while and it might be time to remove it.

Ingo?

2023-10-04 01:19:13

by Daniel Jordan

[permalink] [raw]
Subject: [PATCH] sched/fair: Preserve PLACE_DEADLINE_INITIAL deadline

An entity is supposed to get an earlier deadline with
PLACE_DEADLINE_INITIAL when it's forked, but the deadline gets
overwritten soon after in enqueue_entity() the first time a forked
entity is woken so that PLACE_DEADLINE_INITIAL is effectively a no-op.

Placing in task_fork_fair() seems unnecessary since none of the values
that get set (slice, vruntime, deadline) are used before they're set
again at enqueue time, so get rid of that and just pass ENQUEUE_INITIAL
to enqueue_entity() via wake_up_new_task().

Fixes: e8f331bcc270 ("sched/smp: Use lag to simplify cross-runqueue placement")
Signed-off-by: Daniel Jordan <[email protected]>
---

Tested on top of peterz/sched/eevdf from 2023-10-03.

kernel/sched/core.c | 2 +-
kernel/sched/fair.c | 1 -
2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 779cdc7969c81..500e2dbfd41dd 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4854,7 +4854,7 @@ void wake_up_new_task(struct task_struct *p)
update_rq_clock(rq);
post_init_entity_util_avg(p);

- activate_task(rq, p, ENQUEUE_NOCLOCK);
+ activate_task(rq, p, ENQUEUE_INITIAL | ENQUEUE_NOCLOCK);
trace_sched_wakeup_new(p);
wakeup_preempt(rq, p, WF_FORK);
#ifdef CONFIG_SMP
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index a0b4dac2662c9..5872b8a3f5891 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -12446,7 +12446,6 @@ static void task_fork_fair(struct task_struct *p)
curr = cfs_rq->curr;
if (curr)
update_curr(cfs_rq);
- place_entity(cfs_rq, se, ENQUEUE_INITIAL);
rq_unlock(rq, &rf);
}

--
2.41.0
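
The feature at stake reduces to a halved first request in place_entity();
paraphrased sketch of the relevant lines (not a verbatim quote):

	u64 vslice = calc_delta_fair(se->slice, se);

	/* Half-size first request => earlier deadline, so a forked task
	 * tends to be picked sooner after fork. */
	if (sched_feat(PLACE_DEADLINE_INITIAL) && (flags & ENQUEUE_INITIAL))
		vslice /= 2;

	se->deadline = se->vruntime + vslice;

Without ENQUEUE_INITIAL at enqueue time, the wakeup's place_entity() call
recomputes the deadline with the full vslice; that recomputation is the
overwrite described above.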

2023-10-04 13:10:32

by Daniel Jordan

[permalink] [raw]
Subject: [PATCH v2] sched/fair: Preserve PLACE_DEADLINE_INITIAL deadline

An entity is supposed to get an earlier deadline with
PLACE_DEADLINE_INITIAL when it's forked, but the deadline gets
overwritten soon after in enqueue_entity() the first time a forked
entity is woken so that PLACE_DEADLINE_INITIAL is effectively a no-op.

Placing in task_fork_fair() seems unnecessary since none of the values
that get set (slice, vruntime, deadline) are used before they're set
again at enqueue time, so get rid of that (and with it all of
task_fork_fair()) and just pass ENQUEUE_INITIAL to enqueue_entity() via
wake_up_new_task().

Fixes: e8f331bcc270 ("sched/smp: Use lag to simplify cross-runqueue placement")
Signed-off-by: Daniel Jordan <[email protected]>
---

v2
- place_entity() seems like the only reason for task_fork_fair() to exist
after the recent removal of sysctl_sched_child_runs_first, so take out
the whole function.

Still based on today's peterz/sched/eevdf

kernel/sched/core.c | 2 +-
kernel/sched/fair.c | 24 ------------------------
2 files changed, 1 insertion(+), 25 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 779cdc7969c81..500e2dbfd41dd 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4854,7 +4854,7 @@ void wake_up_new_task(struct task_struct *p)
update_rq_clock(rq);
post_init_entity_util_avg(p);

- activate_task(rq, p, ENQUEUE_NOCLOCK);
+ activate_task(rq, p, ENQUEUE_INITIAL | ENQUEUE_NOCLOCK);
trace_sched_wakeup_new(p);
wakeup_preempt(rq, p, WF_FORK);
#ifdef CONFIG_SMP
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index a0b4dac2662c9..3827b302eeb9b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -12427,29 +12427,6 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
task_tick_core(rq, curr);
}

-/*
- * called on fork with the child task as argument from the parent's context
- * - child not yet on the tasklist
- * - preemption disabled
- */
-static void task_fork_fair(struct task_struct *p)
-{
- struct sched_entity *se = &p->se, *curr;
- struct cfs_rq *cfs_rq;
- struct rq *rq = this_rq();
- struct rq_flags rf;
-
- rq_lock(rq, &rf);
- update_rq_clock(rq);
-
- cfs_rq = task_cfs_rq(current);
- curr = cfs_rq->curr;
- if (curr)
- update_curr(cfs_rq);
- place_entity(cfs_rq, se, ENQUEUE_INITIAL);
- rq_unlock(rq, &rf);
-}
-
/*
* Priority of the task has changed. Check to see if we preempt
* the current task.
@@ -12953,7 +12930,6 @@ DEFINE_SCHED_CLASS(fair) = {
#endif

.task_tick = task_tick_fair,
- .task_fork = task_fork_fair,

.prio_changed = prio_changed_fair,
.switched_from = switched_from_fair,
--
2.41.0

2023-10-04 15:47:52

by Chen Yu

[permalink] [raw]
Subject: Re: [PATCH v2] sched/fair: Preserve PLACE_DEADLINE_INITIAL deadline

Hi Daniel,

On 2023-10-04 at 09:09:08 -0400, Daniel Jordan wrote:
> An entity is supposed to get an earlier deadline with
> PLACE_DEADLINE_INITIAL when it's forked, but the deadline gets
> overwritten soon after in enqueue_entity() the first time a forked
> entity is woken so that PLACE_DEADLINE_INITIAL is effectively a no-op.
>
> Placing in task_fork_fair() seems unnecessary since none of the values
> that get set (slice, vruntime, deadline) are used before they're set
> again at enqueue time, so get rid of that (and with it all of
> task_fork_fair()) and just pass ENQUEUE_INITIAL to enqueue_entity() via
> wake_up_new_task().
>
> Fixes: e8f331bcc270 ("sched/smp: Use lag to simplify cross-runqueue placement")
> Signed-off-by: Daniel Jordan <[email protected]>
> ---
>
> v2
> - place_entity() seems like the only reason for task_fork_fair() to exist
> after the recent removal of sysctl_sched_child_runs_first, so take out
> the whole function.

At first glance I thought that if we remove task_fork_fair(), we lose one
chance to update the parent task's statistics via update_curr(): we might
end up with an out-of-date parent deadline and make the preemption decision
on stale data in wake_up_new_task() -> wakeup_preempt() -> pick_eevdf().
But on second thought I found that wake_up_new_task() -> enqueue_entity()
itself invokes update_curr(), so this should not be a problem.

Then I wondered why we can't just skip place_entity() in enqueue_entity()
when ENQUEUE_WAKEUP is not set, just like the code before e8f331bcc270; that
way the newly forked task's deadline would not be overwritten by
wake_up_new_task() -> enqueue_entity(). Then I realized that, after
e8f331bcc270, the task's vruntime and deadline are both calculated by
place_entity() rather than being renormalised to cfs_rq->min_vruntime in
enqueue_entity(), so we cannot simply skip place_entity() in
enqueue_entity().
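
The before/after contrast above, reduced to a sketch (an illustrative
paraphrase of the two versions of enqueue_entity(), not verbatim kernel
code):

	/* Before e8f331bcc270: enqueue only renormalised an absolute
	 * vruntime, so it could safely be skipped for a freshly forked,
	 * already-placed entity: */
	if (renorm)
		se->vruntime += cfs_rq->min_vruntime;

	/* After e8f331bcc270: enqueue derives vruntime and deadline from
	 * the queue's current average via place_entity(), so every enqueue
	 * must place; skipping it would leave both fields stale: */
	place_entity(cfs_rq, se, 0);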

Per my understanding, this patch looks good,

Reviewed-by: Chen Yu <[email protected]>

thanks,
Chenyu

2023-10-05 14:37:47

by K Prateek Nayak

[permalink] [raw]
Subject: Re: [PATCH] sched/fair: Preserve PLACE_DEADLINE_INITIAL deadline

Hello Daniel,

On 10/4/2023 6:47 AM, Daniel Jordan wrote:
> An entity is supposed to get an earlier deadline with
> PLACE_DEADLINE_INITIAL when it's forked, but the deadline gets
> overwritten soon after in enqueue_entity() the first time a forked
> entity is woken so that PLACE_DEADLINE_INITIAL is effectively a no-op.
>
> Placing in task_fork_fair() seems unnecessary since none of the values
> that get set (slice, vruntime, deadline) are used before they're set
> again at enqueue time, so get rid of that and just pass ENQUEUE_INITIAL
> to enqueue_entity() via wake_up_new_task().
>
> Fixes: e8f331bcc270 ("sched/smp: Use lag to simplify cross-runqueue placement")
> Signed-off-by: Daniel Jordan <[email protected]>

I got a chance to test this on a 3rd Generation EPYC system. I don't
see anything out of the ordinary except for a small regression in
hackbench. I'll leave the full results below.

o System details

- 3rd Generation EPYC System
- 2 sockets each with 64C/128T
- NPS1 (Each socket is a NUMA node)
- Boost enabled, C2 Disabled (POLL and MWAIT based C1 remained enabled)


o Kernel Details

- tip: tip:sched/core at commit d4d6596b4386 ("sched/headers: Remove
duplicate header inclusions")

- place-deadline-fix: tip + this patch


o Benchmark Results

==================================================================
Test : hackbench
Units : Normalized time in seconds
Interpretation: Lower is better
Statistic : AMean
==================================================================
Case: tip[pct imp](CV) place-deadline-fix[pct imp](CV)
1-groups 1.00 [ -0.00]( 2.58) 1.04 [ -3.63]( 3.14)
2-groups 1.00 [ -0.00]( 1.87) 1.03 [ -2.98]( 1.85)
4-groups 1.00 [ -0.00]( 1.63) 1.02 [ -2.35]( 1.59)
8-groups 1.00 [ -0.00]( 1.38) 1.03 [ -2.92]( 1.20)
16-groups 1.00 [ -0.00]( 2.67) 1.02 [ -1.61]( 2.08)


==================================================================
Test : tbench
Units : Normalized throughput
Interpretation: Higher is better
Statistic : AMean
==================================================================
Clients: tip[pct imp](CV) place-deadline-fix[pct imp](CV)
1 1.00 [ 0.00]( 0.59) 1.02 [ 2.09]( 0.07)
2 1.00 [ 0.00]( 1.19) 1.02 [ 2.38]( 0.82)
4 1.00 [ 0.00]( 0.33) 1.03 [ 2.89]( 0.99)
8 1.00 [ 0.00]( 0.76) 1.02 [ 2.10]( 0.46)
16 1.00 [ 0.00]( 1.10) 1.01 [ 0.81]( 0.49)
32 1.00 [ 0.00]( 1.47) 1.02 [ 1.77]( 0.58)
64 1.00 [ 0.00]( 1.77) 1.02 [ 1.83]( 1.77)
128 1.00 [ 0.00]( 0.41) 1.02 [ 2.49]( 0.52)
256 1.00 [ 0.00]( 0.63) 1.03 [ 3.03]( 1.38)
512 1.00 [ 0.00]( 0.32) 1.02 [ 1.61]( 0.45)
1024 1.00 [ 0.00]( 0.22) 1.01 [ 1.00]( 0.26)


==================================================================
Test : stream-10
Units : Normalized Bandwidth, MB/s
Interpretation: Higher is better
Statistic : HMean
==================================================================
Test: tip[pct imp](CV) place-deadline-fix[pct imp](CV)
Copy 1.00 [ 0.00]( 9.30) 0.85 [-15.36](11.26)
Scale 1.00 [ 0.00]( 6.67) 0.98 [ -2.36]( 7.53)
Add 1.00 [ 0.00]( 6.77) 0.92 [ -7.86]( 7.83)
Triad 1.00 [ 0.00]( 7.36) 0.94 [ -5.57]( 6.82)


==================================================================
Test : stream-100
Units : Normalized Bandwidth, MB/s
Interpretation: Higher is better
Statistic : HMean
==================================================================
Test: tip[pct imp](CV) place-deadline-fix[pct imp](CV)
Copy 1.00 [ 0.00]( 1.83) 0.96 [ -3.68]( 5.08)
Scale 1.00 [ 0.00]( 6.41) 1.03 [ 2.66]( 5.28)
Add 1.00 [ 0.00]( 6.23) 1.02 [ 1.54]( 4.97)
Triad 1.00 [ 0.00]( 0.89) 0.94 [ -5.68]( 6.78)


==================================================================
Test : netperf
Units : Normalized Throughput
Interpretation: Higher is better
Statistic : AMean
==================================================================
Clients: tip[pct imp](CV) place-deadline-fix[pct imp](CV)
1-clients 1.00 [ 0.00]( 0.05) 1.02 [ 1.83]( 1.98)
2-clients 1.00 [ 0.00]( 0.93) 1.02 [ 1.87]( 2.45)
4-clients 1.00 [ 0.00]( 0.54) 1.02 [ 2.19]( 1.99)
8-clients 1.00 [ 0.00]( 0.48) 1.02 [ 2.29]( 2.27)
16-clients 1.00 [ 0.00]( 0.42) 1.02 [ 1.60]( 1.70)
32-clients 1.00 [ 0.00]( 0.78) 1.02 [ 1.88]( 2.08)
64-clients 1.00 [ 0.00]( 1.45) 1.02 [ 2.33]( 2.18)
128-clients 1.00 [ 0.00]( 0.97) 1.02 [ 2.38]( 1.95)
256-clients 1.00 [ 0.00]( 4.57) 1.02 [ 2.50]( 5.42)
512-clients 1.00 [ 0.00](52.74) 1.03 [ 3.38](49.69)


==================================================================
Test : schbench
Units : Normalized 99th percentile latency in us
Interpretation: Lower is better
Statistic : Median
==================================================================
#workers: tip[pct imp](CV) place-deadline-fix[pct imp](CV)
1 1.00 [ -0.00]( 3.95) 0.90 [ 10.26](31.80)
2 1.00 [ -0.00](10.45) 1.08 [ -7.89](15.33)
4 1.00 [ -0.00]( 4.76) 0.93 [ 7.14]( 3.95)
8 1.00 [ -0.00]( 9.35) 1.06 [ -6.25]( 8.90)
16 1.00 [ -0.00]( 8.84) 0.92 [ 8.06]( 4.39)
32 1.00 [ -0.00]( 3.33) 1.04 [ -4.40]( 3.68)
64 1.00 [ -0.00]( 6.70) 0.96 [ 4.17]( 2.75)
128 1.00 [ -0.00]( 0.71) 0.96 [ 3.55]( 1.26)
256 1.00 [ -0.00](31.20) 1.28 [-28.21]( 9.69)
512 1.00 [ -0.00]( 4.98) 1.00 [ 0.48]( 2.76)


==================================================================
Test : ycsb-cassandra
Units : Normalized throughput
Interpretation: Higher is better
Statistic : Mean
==================================================================
Metric tip place-deadline-fix(pct imp)
Throughput 1.00 1.01 (%diff: 1.06%)


==================================================================
Test : ycsb-mongodb
Units : Normalized throughput
Interpretation: Higher is better
Statistic : Mean
==================================================================
Metric tip place-deadline-fix(pct imp)
Throughput 1.00 1.00 (%diff: 0.25%)


==================================================================
Test : DeathStarBench
Units : Normalized throughput
Interpretation: Higher is better
Statistic : Mean
==================================================================
Pinning scaling tip place-deadline-fix(pct imp)
1CCD 1 1.00 1.00 (%diff: -0.32%)
2CCD 2 1.00 1.00 (%diff: -0.26%)
4CCD 4 1.00 1.00 (%diff: 0.17%)
8CCD 8 1.00 1.00 (%diff: -0.17%)


--
I see there is a v2. I'll give that a spin as well.

> ---
>
> Tested on top of peterz/sched/eevdf from 2023-10-03.
>
> kernel/sched/core.c | 2 +-
> kernel/sched/fair.c | 1 -
> 2 files changed, 1 insertion(+), 2 deletions(-)
>
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 779cdc7969c81..500e2dbfd41dd 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -4854,7 +4854,7 @@ void wake_up_new_task(struct task_struct *p)
> update_rq_clock(rq);
> post_init_entity_util_avg(p);
>
> - activate_task(rq, p, ENQUEUE_NOCLOCK);
> + activate_task(rq, p, ENQUEUE_INITIAL | ENQUEUE_NOCLOCK);
> trace_sched_wakeup_new(p);
> wakeup_preempt(rq, p, WF_FORK);
> #ifdef CONFIG_SMP
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index a0b4dac2662c9..5872b8a3f5891 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -12446,7 +12446,6 @@ static void task_fork_fair(struct task_struct *p)
> curr = cfs_rq->curr;
> if (curr)
> update_curr(cfs_rq);
> - place_entity(cfs_rq, se, ENQUEUE_INITIAL);
> rq_unlock(rq, &rf);
> }
>

--
Thanks and Regards,
Prateek

2023-10-06 16:33:07

by Daniel Jordan

[permalink] [raw]
Subject: Re: [PATCH v2] sched/fair: Preserve PLACE_DEADLINE_INITIAL deadline

Hi Chenyu,

On Wed, Oct 04, 2023 at 11:46:21PM +0800, Chen Yu wrote:
> Hi Daniel,
>
> On 2023-10-04 at 09:09:08 -0400, Daniel Jordan wrote:
> > An entity is supposed to get an earlier deadline with
> > PLACE_DEADLINE_INITIAL when it's forked, but the deadline gets
> > overwritten soon after in enqueue_entity() the first time a forked
> > entity is woken so that PLACE_DEADLINE_INITIAL is effectively a no-op.
> >
> > Placing in task_fork_fair() seems unnecessary since none of the values
> > that get set (slice, vruntime, deadline) are used before they're set
> > again at enqueue time, so get rid of that (and with it all of
> > task_fork_fair()) and just pass ENQUEUE_INITIAL to enqueue_entity() via
> > wake_up_new_task().
> >
> > Fixes: e8f331bcc270 ("sched/smp: Use lag to simplify cross-runqueue placement")
> > Signed-off-by: Daniel Jordan <[email protected]>
> > ---
> >
> > v2
> > - place_entity() seems like the only reason for task_fork_fair() to exist
> > after the recent removal of sysctl_sched_child_runs_first, so take out
> > the whole function.
>
> At first glance I thought that if we remove task_fork_fair(), we lose one
> chance to update the parent task's statistics via update_curr(): we might
> end up with an out-of-date parent deadline and make the preemption decision
> on stale data in wake_up_new_task() -> wakeup_preempt() -> pick_eevdf().
> But on second thought I found that wake_up_new_task() -> enqueue_entity()
> itself invokes update_curr(), so this should not be a problem.
>
> Then I wondered why we can't just skip place_entity() in enqueue_entity()
> when ENQUEUE_WAKEUP is not set, just like the code before e8f331bcc270; that
> way the newly forked task's deadline would not be overwritten by
> wake_up_new_task() -> enqueue_entity(). Then I realized that, after
> e8f331bcc270, the task's vruntime and deadline are both calculated by
> place_entity() rather than being renormalised to cfs_rq->min_vruntime in
> enqueue_entity(), so we cannot simply skip place_entity() in
> enqueue_entity().

This all made me wonder whether the order of update_curr() for the parent
and place_entity() for the child matters. And it does, since placing uses
avg_vruntime(), which wants an up-to-date vruntime for current and an
up-to-date min_vruntime for the cfs_rq. It's good that 'curr' in
enqueue_entity() is false on fork, so the parent's vruntime is up to date,
but it seems placing should always happen after update_curr().
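
Concretely, avg_vruntime() folds the running entity into the cached
weighted sums, which is why current's vruntime has to be fresh; a
simplified sketch (field names follow the EEVDF patches, rounding details
trimmed):

static u64 avg_vruntime(struct cfs_rq *cfs_rq)
{
	struct sched_entity *curr = cfs_rq->curr;
	s64 avg = cfs_rq->avg_vruntime;	/* \Sum w_i * (v_i - min_vruntime) */
	long load = cfs_rq->avg_load;	/* \Sum w_i */

	if (curr && curr->on_rq) {
		unsigned long weight = scale_load_down(curr->load.weight);

		/* a stale curr->vruntime here skews the whole average */
		avg += (curr->vruntime - cfs_rq->min_vruntime) * weight;
		load += weight;
	}

	if (load)
		avg = div_s64(avg, load);

	return cfs_rq->min_vruntime + avg;
}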

> Per my understanding, this patch looks good,
>
> Reviewed-by: Chen Yu <[email protected]>

Thanks!

Daniel

2023-10-06 16:37:15

by Daniel Jordan

[permalink] [raw]
Subject: Re: [PATCH] sched/fair: Preserve PLACE_DEADLINE_INITIAL deadline

Hi Prateek,

On Thu, Oct 05, 2023 at 11:26:07AM +0530, K Prateek Nayak wrote:
> Hello Daniel,
>
> On 10/4/2023 6:47 AM, Daniel Jordan wrote:
> > An entity is supposed to get an earlier deadline with
> > PLACE_DEADLINE_INITIAL when it's forked, but the deadline gets
> > overwritten soon after in enqueue_entity() the first time a forked
> > entity is woken so that PLACE_DEADLINE_INITIAL is effectively a no-op.
> >
> > Placing in task_fork_fair() seems unnecessary since none of the values
> > that get set (slice, vruntime, deadline) are used before they're set
> > again at enqueue time, so get rid of that and just pass ENQUEUE_INITIAL
> > to enqueue_entity() via wake_up_new_task().
> >
> > Fixes: e8f331bcc270 ("sched/smp: Use lag to simplify cross-runqueue placement")
> > Signed-off-by: Daniel Jordan <[email protected]>
>
> I got a chance to test this on a 3rd Generation EPYC system. I don't
> see anything out of the ordinary except for a small regression in
> hackbench. I'll leave the full results below.

Thanks for testing!

> o System details
>
> - 3rd Generation EPYC System
> - 2 sockets each with 64C/128T
> - NPS1 (Each socket is a NUMA node)
> - Boost enabled, C2 Disabled (POLL and MWAIT based C1 remained enabled)
>
>
> o Kernel Details
>
> - tip: tip:sched/core at commit d4d6596b4386 ("sched/headers: Remove
> duplicate header inclusions")
>
> - place-deadline-fix: tip + this patch
>
>
> o Benchmark Results
>
> ==================================================================
> Test : hackbench
> Units : Normalized time in seconds
> Interpretation: Lower is better
> Statistic : AMean
> ==================================================================
> Case: tip[pct imp](CV) place-deadline-fix[pct imp](CV)
> 1-groups 1.00 [ -0.00]( 2.58) 1.04 [ -3.63]( 3.14)
> 2-groups 1.00 [ -0.00]( 1.87) 1.03 [ -2.98]( 1.85)
> 4-groups 1.00 [ -0.00]( 1.63) 1.02 [ -2.35]( 1.59)
> 8-groups 1.00 [ -0.00]( 1.38) 1.03 [ -2.92]( 1.20)
> 16-groups 1.00 [ -0.00]( 2.67) 1.02 [ -1.61]( 2.08)

Huh, the numbers do seem a bit outside the noise. Doesn't hackbench only
fork at the beginning? I glanced at the perf messaging source just now,
but I'm not sure if you use that version. Anyway, I wouldn't expect this
patch to have much of an effect in that case.

2023-10-06 16:49:32

by Daniel Jordan

[permalink] [raw]
Subject: [PATCH] sched/fair: Always update_curr() before placing at enqueue

Placing wants current's vruntime and the cfs_rq's min_vruntime up to
date so that avg_vruntime() is too, and similarly it wants the entity
re-weighted and its lag adjusted so that vslice and vlag are fresh, so
always do update_curr() and update_cfs_group() beforehand.

There doesn't seem to be a reason to treat the 'curr' case specially
after e8f331bcc270 since vruntime doesn't get normalized anymore.

Fixes: e8f331bcc270 ("sched/smp: Use lag to simplify cross-runqueue placement")
Signed-off-by: Daniel Jordan <[email protected]>
---

Not sure what the XXX above place_entity() is for, maybe it can go away?

Based on tip/sched/core.

kernel/sched/fair.c | 14 ++------------
1 file changed, 2 insertions(+), 12 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 04fbcbda97d5f..db2ca9bf9cc49 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5047,15 +5047,6 @@ static inline bool cfs_bandwidth_used(void);
static void
enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
- bool curr = cfs_rq->curr == se;
-
- /*
- * If we're the current task, we must renormalise before calling
- * update_curr().
- */
- if (curr)
- place_entity(cfs_rq, se, flags);
-
update_curr(cfs_rq);

/*
@@ -5080,8 +5071,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
> * XXX now that the entity has been re-weighted, and its lag adjusted,
* we can place the entity.
*/
- if (!curr)
- place_entity(cfs_rq, se, flags);
+ place_entity(cfs_rq, se, flags);

account_entity_enqueue(cfs_rq, se);

@@ -5091,7 +5081,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)

check_schedstat_required();
update_stats_enqueue_fair(cfs_rq, se, flags);
- if (!curr)
+ if (cfs_rq->curr != se)
__enqueue_entity(cfs_rq, se);
se->on_rq = 1;

--
2.41.0
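
With the patch applied the enqueue path is one straight-line sequence; in
outline (a sketch of the resulting enqueue_entity(), details elided):

static void
enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
	update_curr(cfs_rq);			/* freshen curr and min_vruntime */
	update_load_avg(cfs_rq, se, UPDATE_TG | DO_ATTACH);
	se_update_runnable(se);
	update_cfs_group(se);			/* re-weight first ... */
	place_entity(cfs_rq, se, flags);	/* ... then place, unconditionally */
	account_entity_enqueue(cfs_rq, se);
	/* ... schedstats; __enqueue_entity() unless se is cfs_rq->curr ... */
}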

2023-10-12 04:48:47

by K Prateek Nayak

[permalink] [raw]
Subject: Re: [PATCH v2] sched/fair: Preserve PLACE_DEADLINE_INITIAL deadline

Hello Daniel,

Same as v1, I do not see any regressions with this version either.
I'll leave the full results below.

o Machine details

- 3rd Generation EPYC System
- 2 sockets each with 64C/128T
- NPS1 (Each socket is a NUMA node)
- C2 Disabled (POLL and C1(MWAIT) remained enabled)

o Kernel Details

- tip: tip:sched/core at commit 238437d88cea ("intel_idle: Add ibrs_off
module parameter to force-disable IBRS")
[For DeathStarBench comparisons alone since I ran to the issue
which below commit solves]
+ min_deadline fix commit 8dafa9d0eb1a ("sched/eevdf: Fix
min_deadline heap integrity") from tip:sched/urgent

- place-initial-fix: tip + this patch as is

o Benchmark Results

==================================================================
Test : hackbench
Units : Normalized time in seconds
Interpretation: Lower is better
Statistic : AMean
==================================================================
Case: tip[pct imp](CV) place-initial-fix[pct imp](CV)
1-groups 1.00 [ -0.00]( 2.11) 1.01 [ -1.08]( 2.60)
2-groups 1.00 [ -0.00]( 1.31) 1.01 [ -0.93]( 1.61)
4-groups 1.00 [ -0.00]( 1.04) 1.00 [ -0.00]( 1.25)
8-groups 1.00 [ -0.00]( 1.34) 0.99 [ 1.15]( 0.85)
16-groups 1.00 [ -0.00]( 2.45) 1.00 [ -0.27]( 2.32)


==================================================================
Test : tbench
Units : Normalized throughput
Interpretation: Higher is better
Statistic : AMean
==================================================================
Clients: tip[pct imp](CV) place-initial-fix[pct imp](CV)
1 1.00 [ 0.00]( 0.46) 0.99 [ -0.59]( 0.88)
2 1.00 [ 0.00]( 0.64) 0.99 [ -1.43]( 0.69)
4 1.00 [ 0.00]( 0.59) 0.99 [ -1.49]( 0.76)
8 1.00 [ 0.00]( 0.34) 1.00 [ -0.35]( 0.20)
16 1.00 [ 0.00]( 0.72) 0.98 [ -1.96]( 1.97)
32 1.00 [ 0.00]( 0.65) 1.00 [ -0.24]( 1.07)
64 1.00 [ 0.00]( 0.59) 1.00 [ -0.14]( 1.18)
128 1.00 [ 0.00]( 1.19) 0.99 [ -1.04]( 0.93)
256 1.00 [ 0.00]( 0.16) 1.00 [ -0.18]( 0.34)
512 1.00 [ 0.00]( 0.20) 0.99 [ -0.62]( 0.02)
1024 1.00 [ 0.00]( 0.06) 1.00 [ -0.49]( 0.37)


==================================================================
Test : stream-10
Units : Normalized Bandwidth, MB/s
Interpretation: Higher is better
Statistic : HMean
==================================================================
Test: tip[pct imp](CV) place-initial-fix[pct imp](CV)
Copy 1.00 [ 0.00]( 6.04) 1.00 [ -0.21]( 7.98)
Scale 1.00 [ 0.00]( 5.44) 0.99 [ -0.75]( 5.75)
Add 1.00 [ 0.00]( 5.44) 0.99 [ -1.48]( 5.40)
Triad 1.00 [ 0.00]( 7.82) 1.02 [ 2.21]( 8.33)


==================================================================
Test : stream-100
Units : Normalized Bandwidth, MB/s
Interpretation: Higher is better
Statistic : HMean
==================================================================
Test: tip[pct imp](CV) place-initial-fix[pct imp](CV)
Copy 1.00 [ 0.00]( 1.14) 1.00 [ 0.40]( 1.12)
Scale 1.00 [ 0.00]( 4.60) 1.01 [ 1.05]( 4.99)
Add 1.00 [ 0.00]( 4.91) 1.00 [ -0.14]( 4.97)
Triad 1.00 [ 0.00]( 0.60) 0.96 [ -3.53]( 6.13)


==================================================================
Test : netperf
Units : Normalized Throughput
Interpretation: Higher is better
Statistic : AMean
==================================================================
Clients: tip[pct imp](CV) place-initial-fix[pct imp](CV)
1-clients 1.00 [ 0.00]( 0.61) 1.00 [ 0.40]( 0.75)
2-clients 1.00 [ 0.00]( 0.44) 1.00 [ -0.47]( 0.91)
4-clients 1.00 [ 0.00]( 0.75) 1.00 [ -0.23]( 0.84)
8-clients 1.00 [ 0.00]( 0.65) 1.00 [ -0.07]( 0.62)
16-clients 1.00 [ 0.00]( 0.49) 1.00 [ -0.29]( 0.56)
32-clients 1.00 [ 0.00]( 0.57) 1.00 [ -0.14]( 0.46)
64-clients 1.00 [ 0.00]( 1.67) 1.00 [ -0.14]( 1.81)
128-clients 1.00 [ 0.00]( 1.11) 1.01 [ 0.64]( 1.04)
256-clients 1.00 [ 0.00]( 2.64) 0.99 [ -1.29]( 5.25)
512-clients 1.00 [ 0.00](52.49) 0.99 [ -0.57](53.01)


==================================================================
Test : schbench
Units : Normalized 99th percentile latency in us
Interpretation: Lower is better
Statistic : Median
==================================================================
#workers: tip[pct imp](CV) place-initial-fix[pct imp](CV)
1 1.00 [ -0.00]( 8.41) 1.05 [ -5.41](13.45)
2 1.00 [ -0.00]( 5.29) 0.88 [ 12.50](13.21)
4 1.00 [ -0.00]( 1.32) 1.00 [ -0.00]( 4.80)
8 1.00 [ -0.00]( 9.52) 0.94 [ 6.25]( 8.85)
16 1.00 [ -0.00]( 1.61) 0.97 [ 3.23]( 5.00)
32 1.00 [ -0.00]( 7.27) 0.88 [ 12.50]( 2.30)
64 1.00 [ -0.00]( 6.96) 1.07 [ -6.94]( 4.94)
128 1.00 [ -0.00]( 3.41) 0.99 [ 1.44]( 2.69)
256 1.00 [ -0.00](32.95) 0.81 [ 19.17](16.38)
512 1.00 [ -0.00]( 3.20) 0.98 [ 1.66]( 2.35)


==================================================================
Test : ycsb-cassandra
Units : Normalized throughput
Interpretation: Higher is better
Statistic : Mean
==================================================================
metric tip place-initial-fix(%diff)
throughput 1.00 0.99 (%diff: -0.67%)


==================================================================
Test : ycsb-mongodb
Units : Normalized throughput
Interpretation: Higher is better
Statistic : Mean
==================================================================
metric tip place-initial-fix(%diff)
throughput 1.00 0.99 (%diff: -0.68%)


==================================================================
Test : DeathStarBench
Units : Normalized throughput
Interpretation: Higher is better
Statistic : Mean
Note : Comparisons contains additional commit 8dafa9d0eb1a
("sched/eevdf: Fix min_deadline heap integrity") from
tip:sched/urgent to fix an EEVDF issue being hit
==================================================================
Pinning scaling tip place-initial-fix (%diff)
1CCD 1 1.00 1.00 (%diff: -0.09%)
2CCD 2 1.00 1.02 (%diff: 2.46%)
4CCD 4 1.00 1.00 (%diff: 0.45%)
8CCD 8 1.00 1.00 (%diff: -0.46%)

--

On 10/4/2023 6:39 PM, Daniel Jordan wrote:
> An entity is supposed to get an earlier deadline with
> PLACE_DEADLINE_INITIAL when it's forked, but the deadline gets
> overwritten soon after in enqueue_entity() the first time a forked
> entity is woken so that PLACE_DEADLINE_INITIAL is effectively a no-op.
>
> Placing in task_fork_fair() seems unnecessary since none of the values
> that get set (slice, vruntime, deadline) are used before they're set
> again at enqueue time, so get rid of that (and with it all of
> task_fork_fair()) and just pass ENQUEUE_INITIAL to enqueue_entity() via
> wake_up_new_task().
>
> Fixes: e8f331bcc270 ("sched/smp: Use lag to simplify cross-runqueue placement")
> Signed-off-by: Daniel Jordan <[email protected]>

Tested-by: K Prateek Nayak <[email protected]>

> ---
>
> v2
> - place_entity() seems like the only reason for task_fork_fair() to exist
> after the recent removal of sysctl_sched_child_runs_first, so take out
> the whole function.
>
> Still based on today's peterz/sched/eevdf
>
> kernel/sched/core.c | 2 +-
> kernel/sched/fair.c | 24 ------------------------
> 2 files changed, 1 insertion(+), 25 deletions(-)
>
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 779cdc7969c81..500e2dbfd41dd 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -4854,7 +4854,7 @@ void wake_up_new_task(struct task_struct *p)
> update_rq_clock(rq);
> post_init_entity_util_avg(p);
>
> - activate_task(rq, p, ENQUEUE_NOCLOCK);
> + activate_task(rq, p, ENQUEUE_INITIAL | ENQUEUE_NOCLOCK);
> trace_sched_wakeup_new(p);
> wakeup_preempt(rq, p, WF_FORK);
> #ifdef CONFIG_SMP
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index a0b4dac2662c9..3827b302eeb9b 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -12427,29 +12427,6 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
> task_tick_core(rq, curr);
> }
>
> -/*
> - * called on fork with the child task as argument from the parent's context
> - * - child not yet on the tasklist
> - * - preemption disabled
> - */
> -static void task_fork_fair(struct task_struct *p)
> -{
> - struct sched_entity *se = &p->se, *curr;
> - struct cfs_rq *cfs_rq;
> - struct rq *rq = this_rq();
> - struct rq_flags rf;
> -
> - rq_lock(rq, &rf);
> - update_rq_clock(rq);
> -
> - cfs_rq = task_cfs_rq(current);
> - curr = cfs_rq->curr;
> - if (curr)
> - update_curr(cfs_rq);
> - place_entity(cfs_rq, se, ENQUEUE_INITIAL);
> - rq_unlock(rq, &rf);
> -}
> -
> /*
> * Priority of the task has changed. Check to see if we preempt
> * the current task.
> @@ -12953,7 +12930,6 @@ DEFINE_SCHED_CLASS(fair) = {
> #endif
>
> .task_tick = task_tick_fair,
> - .task_fork = task_fork_fair,
>
> .prio_changed = prio_changed_fair,
> .switched_from = switched_from_fair,

--
Thanks and Regards,
Prateek

2023-10-16 05:39:40

by K Prateek Nayak

[permalink] [raw]
Subject: Re: [PATCH] sched/fair: Always update_curr() before placing at enqueue

Hello Daniel,

I see a good and consistent improvement in Stream (with shorter loops)
with this change. Everything else is more or less the same.

I'll leave the detailed results below.

On 10/6/2023 10:18 PM, Daniel Jordan wrote:
> Placing wants current's vruntime and the cfs_rq's min_vruntime up to
> date so that avg_vruntime() is too, and similarly it wants the entity
> re-weighted and its lag adjusted so that vslice and vlag are fresh, so
> always do update_curr() and update_cfs_group() beforehand.
>
> There doesn't seem to be a reason to treat the 'curr' case specially
> after e8f331bcc270 since vruntime doesn't get normalized anymore.
>
> Fixes: e8f331bcc270 ("sched/smp: Use lag to simplify cross-runqueue placement")
> Signed-off-by: Daniel Jordan <[email protected]>

o Machine details

- 3rd Generation EPYC System
- 2 sockets each with 64C/128T
- NPS1 (Each socket is a NUMA node)
- C2 Disabled (POLL and C1(MWAIT) remained enabled)

o Kernel Details

- tip: tip:sched/core at commit 3657680f38cd ("sched/psi: Delete the
'update_total' function parameter from update_triggers()") +
cherry-pick commit 8dafa9d0eb1a sched/eevdf: Fix min_deadline heap
integrity") from tip:sched/urgent + cherry-pick commit b01db23d5923
("sched/eevdf: Fix pick_eevdf()") from tip:sched/urgent

update_curr_opt: tip + this patch

o Benchmark Results

==================================================================
Test : hackbench
Units : Normalized time in seconds
Interpretation: Lower is better
Statistic : AMean
==================================================================
Case: tip[pct imp](CV) update_curr_opt[pct imp](CV)
1-groups 1.00 [ -0.00]( 2.69) 1.01 [ -0.71]( 2.88)
2-groups 1.00 [ -0.00]( 1.69) 1.01 [ -0.62]( 1.40)
4-groups 1.00 [ -0.00]( 1.25) 1.01 [ -1.17]( 1.03)
8-groups 1.00 [ -0.00]( 1.36) 1.00 [ -0.43]( 0.83)
16-groups 1.00 [ -0.00]( 1.44) 1.00 [ -0.13]( 2.32)


==================================================================
Test : tbench
Units : Normalized throughput
Interpretation: Higher is better
Statistic : AMean
==================================================================
Clients: tip[pct imp](CV) update_curr_opt[pct imp](CV)
1 1.00 [ 0.00]( 0.33) 1.01 [ 0.52]( 1.51)
2 1.00 [ 0.00]( 0.22) 1.00 [ -0.01]( 0.37)
4 1.00 [ 0.00]( 0.25) 0.99 [ -0.71]( 0.60)
8 1.00 [ 0.00]( 0.71) 1.00 [ -0.26]( 0.36)
16 1.00 [ 0.00]( 0.79) 0.99 [ -1.21]( 0.77)
32 1.00 [ 0.00]( 0.94) 0.99 [ -0.82]( 1.46)
64 1.00 [ 0.00]( 1.76) 0.99 [ -0.92]( 1.25)
128 1.00 [ 0.00]( 0.68) 0.98 [ -2.22]( 1.19)
256 1.00 [ 0.00]( 1.23) 0.99 [ -1.43]( 0.79)
512 1.00 [ 0.00]( 0.28) 0.99 [ -0.93]( 0.14)
1024 1.00 [ 0.00]( 0.20) 0.99 [ -1.44]( 0.41)


==================================================================
Test : stream-10
Units : Normalized Bandwidth, MB/s
Interpretation: Higher is better
Statistic : HMean
==================================================================
Test: tip[pct imp](CV) update_curr_opt[pct imp](CV)
Copy 1.00 [ 0.00](11.88) 1.22 [ 21.92]( 7.37)
Scale 1.00 [ 0.00]( 7.01) 1.04 [ 4.02]( 4.89)
Add 1.00 [ 0.00]( 6.56) 1.11 [ 11.03]( 4.77)
Triad 1.00 [ 0.00]( 8.81) 1.14 [ 14.12]( 3.89)


==================================================================
Test : stream-100
Units : Normalized Bandwidth, MB/s
Interpretation: Higher is better
Statistic : HMean
==================================================================
Test: tip[pct imp](CV) update_curr_opt[pct imp](CV)
Copy 1.00 [ 0.00]( 1.07) 1.01 [ 0.77]( 1.59)
Scale 1.00 [ 0.00]( 4.81) 0.97 [ -2.99]( 7.18)
Add 1.00 [ 0.00]( 4.56) 0.98 [ -2.39]( 6.86)
Triad 1.00 [ 0.00]( 1.78) 1.00 [ -0.35]( 4.22)


==================================================================
Test : netperf
Units : Normalized Throughput
Interpretation: Higher is better
Statistic : AMean
==================================================================
Clients: tip[pct imp](CV) update_curr_opt[pct imp](CV)
1-clients 1.00 [ 0.00]( 0.62) 0.99 [ -1.03]( 0.24)
2-clients 1.00 [ 0.00]( 0.36) 1.00 [ -0.32]( 0.66)
4-clients 1.00 [ 0.00]( 0.31) 1.00 [ -0.17]( 0.44)
8-clients 1.00 [ 0.00]( 0.39) 1.00 [ 0.24]( 0.67)
16-clients 1.00 [ 0.00]( 0.58) 1.00 [ 0.50]( 0.46)
32-clients 1.00 [ 0.00]( 0.71) 1.01 [ 0.54]( 0.66)
64-clients 1.00 [ 0.00]( 2.13) 1.00 [ 0.35]( 1.80)
128-clients 1.00 [ 0.00]( 0.94) 0.99 [ -0.71]( 0.97)
256-clients 1.00 [ 0.00]( 6.09) 1.01 [ 1.28]( 3.41)
512-clients 1.00 [ 0.00](55.28) 1.01 [ 1.32](49.78)


==================================================================
Test : schbench
Units : Normalized 99th percentile latency in us
Interpretation: Lower is better
Statistic : Median
==================================================================
#workers: tip[pct imp](CV) update_curr_opt[pct imp](CV)
1 1.00 [ -0.00]( 2.91) 0.97 [ 2.56]( 1.53)
2 1.00 [ -0.00](21.78) 0.89 [ 10.53](10.23)
4 1.00 [ -0.00]( 4.88) 1.07 [ -7.32]( 6.82)
8 1.00 [ -0.00]( 2.49) 1.00 [ -0.00]( 9.53)
16 1.00 [ -0.00]( 3.70) 1.02 [ -1.75]( 0.99)
32 1.00 [ -0.00](12.65) 0.83 [ 16.51]( 4.41)
64 1.00 [ -0.00]( 3.98) 0.97 [ 2.59]( 8.27)
128 1.00 [ -0.00]( 1.49) 0.96 [ 3.60]( 8.01)
256 1.00 [ -0.00](40.79) 0.80 [ 20.39](36.89)
512 1.00 [ -0.00]( 1.12) 0.98 [ 2.20]( 0.75)


==================================================================
Test : ycsb-cassandra
Units : Normalized throughput
Interpretation: Higher is better
Statistic : Mean
==================================================================
metric tip update_curr_opt (%diff)
throughput 1.00 1.00 (%diff: -0.45%)


==================================================================
Test : ycsb-mongodb
Units : Normalized throughput
Interpretation: Higher is better
Statistic : Mean
==================================================================
metric tip update_curr_opt (%diff)
throughput 1.00 1.00 (%diff: -0.13%)


==================================================================
Test : DeathStarBench
Units : Normalized throughput
Interpretation: Higher is better
Statistic : Mean
==================================================================
Pinning scaling tip update_curr_opt (%diff)
1CCD 1 1.00 1.01 (%diff: 0.57%)
2CCD 2 1.00 1.00 (%diff: -0.27%)
4CCD 4 1.00 1.00 (%diff: 0.06%)
8CCD 8 1.00 1.00 (%diff: 0.45%)

--
Feel free to include

Tested-by: K Prateek Nayak <[email protected]>

I'll keep a lookout for future versions.

> ---
>
> Not sure what the XXX above place_entity() is for, maybe it can go away?
>
> Based on tip/sched/core.
>
> kernel/sched/fair.c | 14 ++------------
> 1 file changed, 2 insertions(+), 12 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 04fbcbda97d5f..db2ca9bf9cc49 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -5047,15 +5047,6 @@ static inline bool cfs_bandwidth_used(void);
> static void
> enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
> {
> - bool curr = cfs_rq->curr == se;
> -
> - /*
> - * If we're the current task, we must renormalise before calling
> - * update_curr().
> - */
> - if (curr)
> - place_entity(cfs_rq, se, flags);
> -
> update_curr(cfs_rq);
>
> /*
> @@ -5080,8 +5071,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
> > * XXX now that the entity has been re-weighted, and its lag adjusted,
> * we can place the entity.
> */
> - if (!curr)
> - place_entity(cfs_rq, se, flags);
> + place_entity(cfs_rq, se, flags);
>
> account_entity_enqueue(cfs_rq, se);
>
> @@ -5091,7 +5081,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
>
> check_schedstat_required();
> update_stats_enqueue_fair(cfs_rq, se, flags);
> - if (!curr)
> + if (cfs_rq->curr != se)
> __enqueue_entity(cfs_rq, se);
> se->on_rq = 1;
>

--
Thanks and Regards,
Prateek

2023-10-18 00:45:30

by Daniel Jordan

[permalink] [raw]
Subject: Re: [PATCH] sched/fair: Always update_curr() before placing at enqueue

On Fri, Oct 06, 2023 at 09:58:10PM +0200, Peter Zijlstra wrote:
> On Fri, Oct 06, 2023 at 12:48:26PM -0400, Daniel Jordan wrote:
> > Placing wants current's vruntime and the cfs_rq's min_vruntime up to
> > date so that avg_vruntime() is too, and similarly it wants the entity
> > re-weighted and its lag adjusted so that vslice and vlag are fresh, so
> > always do update_curr() and update_cfs_group() beforehand.
> >
> > There doesn't seem to be a reason to treat the 'curr' case specially
> > after e8f331bcc270 since vruntime doesn't get normalized anymore.
> >
> > Fixes: e8f331bcc270 ("sched/smp: Use lag to simplify cross-runqueue placement")
> > Signed-off-by: Daniel Jordan <[email protected]>
> > ---
> >
> > Not sure what the XXX above place_entity() is for, maybe it can go away?
> >
> > Based on tip/sched/core.
> >
> > kernel/sched/fair.c | 14 ++------------
> > 1 file changed, 2 insertions(+), 12 deletions(-)
> >
> > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > index 04fbcbda97d5f..db2ca9bf9cc49 100644
> > --- a/kernel/sched/fair.c
> > +++ b/kernel/sched/fair.c
> > @@ -5047,15 +5047,6 @@ static inline bool cfs_bandwidth_used(void);
> > static void
> > enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
> > {
> > - bool curr = cfs_rq->curr == se;
> > -
> > - /*
> > - * If we're the current task, we must renormalise before calling
> > - * update_curr().
> > - */
> > - if (curr)
> > - place_entity(cfs_rq, se, flags);
> > -
> > update_curr(cfs_rq);
>
> IIRC part of the reason for this order is the:
>
> dequeue
> update
> enqueue
>
> pattern we have all over the place. You don't want the enqueue to move
> time forward in this case.
>
> Could be that all magically works, but please double check.

Yes, I wasn't thinking of the dequeue/update/enqueue places.
Considering these, it seems like there's more to fix (from before EEVDF
even).

Sorry for the delayed response; I've been staring at this for a while,
thinking I'd have it all sorted by the next day. It'll take a bit longer
to sort out all the cases, but I'll keep going.
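
For reference, the dequeue/update/enqueue pattern in question shows up at
sites such as set_user_nice(); a simplified sketch:

	queued = task_on_rq_queued(p);
	if (queued)
		dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK);

	p->static_prio = NICE_TO_PRIO(nice);
	set_load_weight(p, true);

	if (queued)
		enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);

The task comes off the queue, a property changes, and it goes straight
back on; if the enqueue-side placement moved time forward here, a mere
reweight would perturb a task that never really left the queue.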