2021-10-08 00:37:48

by Josh Don

Subject: [PATCH] sched/core: forced idle accounting

Adds accounting for "forced idle" time, which is time where a cookie'd
task forces its SMT sibling to idle, despite the presence of runnable
tasks.

Forced idle time is one means to measure the cost of enabling core
scheduling (ie. the capacity lost due to the need to force idle).

Signed-off-by: Josh Don <[email protected]>
---
include/linux/sched.h | 1 +
kernel/sched/core.c | 46 ++++++++++++++++-----
kernel/sched/core_sched.c | 85 ++++++++++++++++++++++++++++++++++++++-
kernel/sched/debug.c | 3 ++
kernel/sched/sched.h | 11 ++++-
5 files changed, 135 insertions(+), 11 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 2aa30e8e1440..d114ba350802 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -784,6 +784,7 @@ struct task_struct {
struct rb_node core_node;
unsigned long core_cookie;
unsigned int core_occupation;
+ u64 core_forceidle_sum;
#endif

#ifdef CONFIG_CGROUP_SCHED
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index ada028e579b0..baa4f48cacff 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -181,15 +181,23 @@ void sched_core_enqueue(struct rq *rq, struct task_struct *p)
rb_add(&p->core_node, &rq->core_tree, rb_sched_core_less);
}

-void sched_core_dequeue(struct rq *rq, struct task_struct *p)
+void sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags)
{
rq->core->core_task_seq++;

- if (!sched_core_enqueued(p))
- return;
+ if (sched_core_enqueued(p)) {
+ rb_erase(&p->core_node, &rq->core_tree);
+ RB_CLEAR_NODE(&p->core_node);
+ }

- rb_erase(&p->core_node, &rq->core_tree);
- RB_CLEAR_NODE(&p->core_node);
+ /*
+ * Migrating the last task off the cpu, with the cpu in forced idle
+ * state. Reschedule to create an accounting edge for forced idle,
+ * and re-examine whether the core is still in forced idle state.
+ */
+ if (!(flags & DEQUEUE_SAVE) && rq->nr_running == 1 &&
+ rq->core->core_forceidle && rq->curr == rq->idle)
+ resched_curr(rq);
}

/*
@@ -364,7 +372,8 @@ void sched_core_put(void)
#else /* !CONFIG_SCHED_CORE */

static inline void sched_core_enqueue(struct rq *rq, struct task_struct *p) { }
-static inline void sched_core_dequeue(struct rq *rq, struct task_struct *p) { }
+static inline void
+sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags) { }

#endif /* CONFIG_SCHED_CORE */

@@ -2020,7 +2029,7 @@ static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
{
if (sched_core_enabled(rq))
- sched_core_dequeue(rq, p);
+ sched_core_dequeue(rq, p, flags);

if (!(flags & DEQUEUE_NOCLOCK))
update_rq_clock(rq);
@@ -5256,6 +5265,7 @@ void scheduler_tick(void)
if (sched_feat(LATENCY_WARN))
resched_latency = cpu_resched_latency(rq);
calc_global_load_tick(rq);
+ sched_core_tick(rq);

rq_unlock(rq, &rf);

@@ -5668,6 +5678,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
struct task_struct *next, *p, *max = NULL;
const struct cpumask *smt_mask;
bool fi_before = false;
+ bool core_clock_updated = (rq == rq->core);
unsigned long cookie;
int i, cpu, occ = 0;
struct rq *rq_i;
@@ -5721,9 +5732,15 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
/* reset state */
rq->core->core_cookie = 0UL;
if (rq->core->core_forceidle) {
+ if (!core_clock_updated) {
+ update_rq_clock(rq->core);
+ core_clock_updated = true;
+ }
+ sched_core_account_forceidle(rq);
+ rq->core->core_forceidle = false;
+ rq->core->core_forceidle_start = 0;
need_sync = true;
fi_before = true;
- rq->core->core_forceidle = false;
}

/*
@@ -5765,7 +5782,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
for_each_cpu_wrap(i, smt_mask, cpu) {
rq_i = cpu_rq(i);

- if (i != cpu)
+ if (i != cpu && (rq_i != rq->core || !core_clock_updated))
update_rq_clock(rq_i);

p = rq_i->core_pick = pick_task(rq_i);
@@ -5804,6 +5821,9 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
}
}

+ if (rq->core->core_forceidle && cookie)
+ rq->core->core_forceidle_start = rq_clock(rq->core);
+
rq->core->core_pick_seq = rq->core->core_task_seq;
next = rq->core_pick;
rq->core_sched_seq = rq->core->core_pick_seq;
@@ -6051,6 +6071,13 @@ static void sched_core_cpu_deactivate(unsigned int cpu)
core_rq->core_forceidle = rq->core_forceidle;
core_rq->core_forceidle_seq = rq->core_forceidle_seq;

+ /*
+ * Accounting edge for forced idle is handled in pick_next_task().
+ * Don't need another one here, since the hotplug thread shouldn't
+ * have a cookie.
+ */
+ core_rq->core_forceidle_start = 0;
+
/* install new leader */
for_each_cpu(t, smt_mask) {
rq = cpu_rq(t);
@@ -9424,6 +9451,7 @@ void __init sched_init(void)
rq->core_enabled = 0;
rq->core_tree = RB_ROOT;
rq->core_forceidle = false;
+ rq->core_forceidle_start = 0;

rq->core_cookie = 0UL;
#endif
diff --git a/kernel/sched/core_sched.c b/kernel/sched/core_sched.c
index 48ac72696012..aae4ac2ac7ec 100644
--- a/kernel/sched/core_sched.c
+++ b/kernel/sched/core_sched.c
@@ -73,7 +73,7 @@ static unsigned long sched_core_update_cookie(struct task_struct *p,

enqueued = sched_core_enqueued(p);
if (enqueued)
- sched_core_dequeue(rq, p);
+ sched_core_dequeue(rq, p, DEQUEUE_SAVE);

old_cookie = p->core_cookie;
p->core_cookie = cookie;
@@ -85,6 +85,10 @@ static unsigned long sched_core_update_cookie(struct task_struct *p,
* If task is currently running, it may not be compatible anymore after
* the cookie change, so enter the scheduler on its CPU to schedule it
* away.
+ *
+ * Note that it is possible that as a result of this cookie change, the
+ * core has now entered/left forced idle state. Defer accounting to the
+ * next scheduling edge, rather than always forcing a reschedule here.
*/
if (task_running(rq, p))
resched_curr(rq);
@@ -109,6 +113,7 @@ void sched_core_fork(struct task_struct *p)
{
RB_CLEAR_NODE(&p->core_node);
p->core_cookie = sched_core_clone_cookie(current);
+ p->core_forceidle_sum = 0;
}

void sched_core_free(struct task_struct *p)
@@ -228,3 +233,81 @@ int sched_core_share_pid(unsigned int cmd, pid_t pid, enum pid_type type,
return err;
}

+/* REQUIRES: rq->core's clock recently updated. */
+void sched_core_account_forceidle(struct rq *rq)
+{
+ const struct cpumask *smt_mask = cpu_smt_mask(cpu_of(rq));
+ unsigned int smt_count;
+ u64 delta, now = rq_clock(rq->core);
+ struct rq *rq_i;
+ struct task_struct *p;
+ int i;
+
+ lockdep_assert_rq_held(rq);
+
+ WARN_ON_ONCE(!rq->core->core_forceidle);
+
+ if (rq->core->core_forceidle_start == 0)
+ return;
+
+ delta = now - rq->core->core_forceidle_start;
+ if (unlikely((s64)delta <= 0))
+ return;
+
+ rq->core->core_forceidle_start = now;
+
+ /*
+ * For larger SMT configurations, we need to scale the charged
+ * forced idle amount since there can be more than one forced idle
+ * sibling and more than one running cookied task.
+ */
+ smt_count = cpumask_weight(smt_mask);
+ if (smt_count > 2) {
+ unsigned int nr_forced_idle = 0, nr_running = 0;
+
+ for_each_cpu(i, smt_mask) {
+ rq_i = cpu_rq(i);
+ p = rq_i->core_pick ?: rq_i->curr;
+
+ if (p != rq_i->idle)
+ nr_running++;
+ else if (rq_i->nr_running)
+ nr_forced_idle++;
+ }
+
+ if (WARN_ON_ONCE(!nr_running)) {
+ /* can't be forced idle without a running task */
+ } else {
+ delta *= nr_forced_idle;
+ delta /= nr_running;
+ }
+ }
+
+ for_each_cpu(i, smt_mask) {
+ rq_i = cpu_rq(i);
+ p = rq_i->core_pick ?: rq_i->curr;
+
+ if (!p->core_cookie)
+ continue;
+
+ p->core_forceidle_sum += delta;
+
+ /* Optimize for common case. */
+ if (smt_count == 2)
+ break;
+ }
+}
+
+void sched_core_tick(struct rq *rq)
+{
+ if (!sched_core_enabled(rq))
+ return;
+
+ if (!rq->core->core_forceidle)
+ return;
+
+ if (rq != rq->core)
+ update_rq_clock(rq->core);
+ sched_core_account_forceidle(rq);
+}
+
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 26fac5e28bc0..0fe6a1bb8b60 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -1045,6 +1045,9 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
__PS("uclamp.max", p->uclamp_req[UCLAMP_MAX].value);
__PS("effective uclamp.min", uclamp_eff_value(p, UCLAMP_MIN));
__PS("effective uclamp.max", uclamp_eff_value(p, UCLAMP_MAX));
+#endif
+#ifdef CONFIG_SCHED_CORE
+ PN(core_forceidle_sum);
#endif
P(policy);
P(prio);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index a00fc7057d97..4678b85754f2 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1113,6 +1113,7 @@ struct rq {
unsigned long core_cookie;
unsigned char core_forceidle;
unsigned int core_forceidle_seq;
+ u64 core_forceidle_start;
#endif
};

@@ -1253,11 +1254,15 @@ static inline bool sched_core_enqueued(struct task_struct *p)
}

extern void sched_core_enqueue(struct rq *rq, struct task_struct *p);
-extern void sched_core_dequeue(struct rq *rq, struct task_struct *p);
+extern void sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags);

extern void sched_core_get(void);
extern void sched_core_put(void);

+extern void sched_core_account_forceidle(struct rq *rq);
+
+extern void sched_core_tick(struct rq *rq);
+
#else /* !CONFIG_SCHED_CORE */

static inline bool sched_core_enabled(struct rq *rq)
@@ -1300,6 +1305,10 @@ static inline bool sched_group_cookie_match(struct rq *rq,
{
return true;
}
+
+static inline void sched_core_account_forceidle(struct rq *rq) {}
+
+static inline void sched_core_tick(struct rq *rq) {}
#endif /* CONFIG_SCHED_CORE */

static inline void lockdep_assert_rq_held(struct rq *rq)
--
2.33.0.882.g93a45727a2-goog


2021-10-08 21:05:20

by Josh Don

Subject: Re: [PATCH] sched/core: forced idle accounting

On Thu, Oct 7, 2021 at 5:08 PM Josh Don <[email protected]> wrote:
>
> @@ -6051,6 +6071,13 @@ static void sched_core_cpu_deactivate(unsigned int cpu)
> core_rq->core_forceidle = rq->core_forceidle;
> core_rq->core_forceidle_seq = rq->core_forceidle_seq;
>
> + /*
> + * Accounting edge for forced idle is handled in pick_next_task().
> + * Don't need another one here, since the hotplug thread shouldn't
> + * have a cookie.
> + */
> + core_rq->core_forceidle_start = 0;
> +
> /* install new leader */
> for_each_cpu(t, smt_mask) {
> rq = cpu_rq(t);

Realized there needs to be a similar edge in sched_core_flip(). I'll
include that in a v2, after seeing if there are any other comments on
this patch.

2021-10-09 15:57:24

by Peter Zijlstra

Subject: Re: [PATCH] sched/core: forced idle accounting

On Thu, Oct 07, 2021 at 05:08:25PM -0700, Josh Don wrote:
> Adds accounting for "forced idle" time, which is time where a cookie'd
> task forces its SMT sibling to idle, despite the presence of runnable
> tasks.
>
> Forced idle time is one means to measure the cost of enabling core
> scheduling (ie. the capacity lost due to the need to force idle).

It seems an excessive amount of code for what it says to do.

> + smt_count = cpumask_weight(smt_mask);

That's a fairly expensive operation to find a number that's going to be
the same over and over and over...

> + if (smt_count > 2) {
> + unsigned int nr_forced_idle = 0, nr_running = 0;
> +
> + for_each_cpu(i, smt_mask) {
> + rq_i = cpu_rq(i);
> + p = rq_i->core_pick ?: rq_i->curr;
> +
> + if (p != rq_i->idle)
> + nr_running++;
> + else if (rq_i->nr_running)
> + nr_forced_idle++;
> + }
> +
> + if (WARN_ON_ONCE(!nr_running)) {
> + /* can't be forced idle without a running task */
> + } else {
> + delta *= nr_forced_idle;
> + delta /= nr_running;
> + }

Now the comment sayeth:

> + /*
> + * For larger SMT configurations, we need to scale the charged
> + * forced idle amount since there can be more than one forced idle
> + * sibling and more than one running cookied task.
> + */

But why?

> + }
> +
> + for_each_cpu(i, smt_mask) {
> + rq_i = cpu_rq(i);
> + p = rq_i->core_pick ?: rq_i->curr;
> +
> + if (!p->core_cookie)
> + continue;
> +
> + p->core_forceidle_sum += delta;
> +
> + /* Optimize for common case. */
> + if (smt_count == 2)
> + break;
> + }
> +}

2021-10-09 18:17:59

by Tao Zhou

Subject: Re: [PATCH] sched/core: forced idle accounting

Hi Josh,

On Thu, Oct 07, 2021 at 05:08:25PM -0700, Josh Don wrote:
[snip]
> + /*
> + * For larger SMT configurations, we need to scale the charged
> + * forced idle amount since there can be more than one forced idle
> + * sibling and more than one running cookied task.
> + */
> + smt_count = cpumask_weight(smt_mask);
> + if (smt_count > 2) {
> + unsigned int nr_forced_idle = 0, nr_running = 0;
> +
> + for_each_cpu(i, smt_mask) {
> + rq_i = cpu_rq(i);
> + p = rq_i->core_pick ?: rq_i->curr;
> +
> + if (p != rq_i->idle)
> + nr_running++;
> + else if (rq_i->nr_running)
> + nr_forced_idle++;
> + }
> +
> + if (WARN_ON_ONCE(!nr_running)) {
> + /* can't be forced idle without a running task */
> + } else {
> + delta *= nr_forced_idle;
> + delta /= nr_running;
> + }

Is it possible to use (smt_count - core_occupation) / core_occupation
to evaluate this delta ?

[snip]

Thanks,
Tao

2021-10-11 17:36:36

by Hao Luo

Subject: Re: [PATCH] sched/core: forced idle accounting

On Thu, Oct 7, 2021 at 5:08 PM Josh Don <[email protected]> wrote:
>
[snip]
> -void sched_core_dequeue(struct rq *rq, struct task_struct *p)
> +void sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags)
> {
> rq->core->core_task_seq++;
>
> - if (!sched_core_enqueued(p))
> - return;
> + if (sched_core_enqueued(p)) {
> + rb_erase(&p->core_node, &rq->core_tree);
> + RB_CLEAR_NODE(&p->core_node);
> + }
>
> - rb_erase(&p->core_node, &rq->core_tree);
> - RB_CLEAR_NODE(&p->core_node);
> + /*
> + * Migrating the last task off the cpu, with the cpu in forced idle
> + * state. Reschedule to create an accounting edge for forced idle,
> + * and re-examine whether the core is still in forced idle state.
> + */
> + if (!(flags & DEQUEUE_SAVE) && rq->nr_running == 1 &&
> + rq->core->core_forceidle && rq->curr == rq->idle)
> + resched_curr(rq);

Resched_curr is probably an unwanted side effect of dequeue. Maybe we
could extract the check and resched_curr out into a function, and call
the function outside of sched_core_dequeue(). In that way, the
interface of dequeue doesn't need to change.

[snip]
> @@ -5765,7 +5782,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
> for_each_cpu_wrap(i, smt_mask, cpu) {
> rq_i = cpu_rq(i);
>
> - if (i != cpu)
> + if (i != cpu && (rq_i != rq->core || !core_clock_updated))
> update_rq_clock(rq_i);

Do you mean (rq_i != rq->core && !core_clock_updated)? I thought
rq->core has core_clock updated always.


2021-10-12 00:15:43

by Josh Don

Subject: Re: [PATCH] sched/core: forced idle accounting

On Sat, Oct 9, 2021 at 8:55 AM Peter Zijlstra <[email protected]> wrote:
>
> On Thu, Oct 07, 2021 at 05:08:25PM -0700, Josh Don wrote:
> > Adds accounting for "forced idle" time, which is time where a cookie'd
> > task forces its SMT sibling to idle, despite the presence of runnable
> > tasks.
> >
> > Forced idle time is one means to measure the cost of enabling core
> > scheduling (ie. the capacity lost due to the need to force idle).
>
> It seems an excessive amount of code for what it says to do.

I think I can cut some of that down by simplifying the SMT>2 case :)

>
> > + smt_count = cpumask_weight(smt_mask);
>
> That's a fairly expensive operation to find a number that's going to be
> the same over and over and over...

Per Tao's suggestion, the nr_running and nr_forced_idle can be
computed and cached in pick(). Then there won't be any extra overhead
here, other than a potential division when SMT>2.
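
Roughly what I have in mind (sketch only; the cached field names below are
placeholders, not necessarily what v2 will end up using):

	/* In pick_next_task(), once the core-wide picks are settled: */
	unsigned int nr_forced_idle = 0, nr_running = 0;

	for_each_cpu(i, smt_mask) {
		rq_i = cpu_rq(i);
		p = rq_i->core_pick;

		if (p != rq_i->idle)
			nr_running++;		/* sibling runs a real task */
		else if (rq_i->nr_running)
			nr_forced_idle++;	/* runnable task, but forced idle */
	}

	/* Hypothetical cached fields, consumed later by the accounting path. */
	rq->core->core_forceidle_count = nr_forced_idle;
	rq->core->core_forceidle_occupation = nr_running;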

> > + if (smt_count > 2) {
> > + unsigned int nr_forced_idle = 0, nr_running = 0;
> > +
> > + for_each_cpu(i, smt_mask) {
> > + rq_i = cpu_rq(i);
> > + p = rq_i->core_pick ?: rq_i->curr;
> > +
> > + if (p != rq_i->idle)
> > + nr_running++;
> > + else if (rq_i->nr_running)
> > + nr_forced_idle++;
> > + }
> > +
> > + if (WARN_ON_ONCE(!nr_running)) {
> > + /* can't be forced idle without a running task */
> > + } else {
> > + delta *= nr_forced_idle;
> > + delta /= nr_running;
> > + }
>
> Now the comment sayeth:
>
> > + /*
> > + * For larger SMT configurations, we need to scale the charged
> > + * forced idle amount since there can be more than one forced idle
> > + * sibling and more than one running cookied task.
> > + */
>
> But why?

We scale by the number of cpus actually forced idle, since we don't
want to falsely over or under charge forced idle time (defined
strictly as time where we have a runnable task but idle the cpu). The
more important scaling here though is the division over the number of
running entities. This is done so that the aggregate amount of forced
idle over some group of threads makes sense. Ie if we have a cpu with
SMT8, and a group of 7 threads sharing a cookie, we don't want to
accrue 7 units of forced idle time per unit time while the 8th SMT is
forced idle.
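
To put numbers on that (userspace sketch, only restating the scaling the
patch already does):

	#include <stdio.h>

	int main(void)
	{
		/* SMT8 core: 7 running cookie'd threads, 1 sibling forced idle */
		unsigned long long delta = 1000000;	/* 1ms of forced idle, in ns */
		unsigned int nr_forced_idle = 1, nr_running = 7;

		/* same scaling as sched_core_account_forceidle() */
		delta = delta * nr_forced_idle / nr_running;

		/*
		 * Each of the 7 cookie'd tasks is charged ~142857ns, so the
		 * group's aggregate charge is ~1ms per ms of forced idle,
		 * not 7ms.
		 */
		printf("per-task charge: %llu ns\n", delta);
		return 0;
	}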

> > + }
> > +
> > + for_each_cpu(i, smt_mask) {
> > + rq_i = cpu_rq(i);
> > + p = rq_i->core_pick ?: rq_i->curr;
> > +
> > + if (!p->core_cookie)
> > + continue;
> > +
> > + p->core_forceidle_sum += delta;
> > +
> > + /* Optimize for common case. */
> > + if (smt_count == 2)
> > + break;
> > + }
> > +}

2021-10-12 00:17:31

by Josh Don

Subject: Re: [PATCH] sched/core: forced idle accounting

On Sat, Oct 9, 2021 at 11:11 AM Tao Zhou <[email protected]> wrote:
[snip]
> > + smt_count = cpumask_weight(smt_mask);
> > + if (smt_count > 2) {
> > + unsigned int nr_forced_idle = 0, nr_running = 0;
> > +
> > + for_each_cpu(i, smt_mask) {
> > + rq_i = cpu_rq(i);
> > + p = rq_i->core_pick ?: rq_i->curr;
> > +
> > + if (p != rq_i->idle)
> > + nr_running++;
> > + else if (rq_i->nr_running)
> > + nr_forced_idle++;
> > + }
> > +
> > + if (WARN_ON_ONCE(!nr_running)) {
> > + /* can't be forced idle without a running task */
> > + } else {
> > + delta *= nr_forced_idle;
> > + delta /= nr_running;
> > + }
>
> Is it possible to use (smt_count - core_occupation) / core_occupation
> to evaluate this delta ?

Good point, we can use the pre-computed info here. This will help
simplify this logic.
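
i.e. roughly (sketch only; I still need to double check that core_occupation
has exactly the semantics we want at this point):

	/* occ siblings run a task; the rest of the SMT mask is forced idle */
	delta *= smt_count - occ;
	delta /= occ;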

Thanks,
Josh

2021-10-12 00:34:55

by Josh Don

Subject: Re: [PATCH] sched/core: forced idle accounting

On Mon, Oct 11, 2021 at 10:33 AM Hao Luo <[email protected]> wrote:
>
> On Thu, Oct 7, 2021 at 5:08 PM Josh Don <[email protected]> wrote:
> > -void sched_core_dequeue(struct rq *rq, struct task_struct *p)
> > +void sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags)
> > {
> > rq->core->core_task_seq++;
> >
> > - if (!sched_core_enqueued(p))
> > - return;
> > + if (sched_core_enqueued(p)) {
> > + rb_erase(&p->core_node, &rq->core_tree);
> > + RB_CLEAR_NODE(&p->core_node);
> > + }
> >
> > - rb_erase(&p->core_node, &rq->core_tree);
> > - RB_CLEAR_NODE(&p->core_node);
> > + /*
> > + * Migrating the last task off the cpu, with the cpu in forced idle
> > + * state. Reschedule to create an accounting edge for forced idle,
> > + * and re-examine whether the core is still in forced idle state.
> > + */
> > + if (!(flags & DEQUEUE_SAVE) && rq->nr_running == 1 &&
> > + rq->core->core_forceidle && rq->curr == rq->idle)
> > + resched_curr(rq);
>
> Resched_curr is probably an unwanted side effect of dequeue. Maybe we
> could extract the check and resched_curr out into a function, and call
> the function outside of sched_core_dequeue(). In that way, the
> interface of dequeue doesn't need to change.

This resched is an atypical case; normal load balancing won't steal
the last runnable task off a cpu. The main reasons this resched could
trigger are: migration due to affinity change, and migration due to
sched core doing a cookie_steal. Could bubble this up to
deactivate_task(), but seems less brittle to keep this in dequeue()
with the check against DEQUEUE_SAVE (since this creates an important
accounting edge). Thoughts?

> > /*
> > @@ -5765,7 +5782,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
> > for_each_cpu_wrap(i, smt_mask, cpu) {
> > rq_i = cpu_rq(i);
> >
> > - if (i != cpu)
> > + if (i != cpu && (rq_i != rq->core || !core_clock_updated))
> > update_rq_clock(rq_i);
>
> Do you mean (rq_i != rq->core && !core_clock_updated)? I thought
> rq->core has core_clock updated always.

rq->clock is updated on entry to pick_next_task(). rq->core is only
updated if rq == rq->core, or if we've done the clock update for
rq->core above.
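
Spelling out the condition from the patch (restating it, no new logic):

	/*
	 * i == cpu:         rq's clock was already updated on entry to
	 *                   pick_next_task(), skip.
	 * rq_i == rq->core: updated iff core_clock_updated (either
	 *                   rq == rq->core, or we updated it in the forced
	 *                   idle branch above).
	 * otherwise:        sibling rq, still needs its clock update.
	 */
	if (i != cpu && (rq_i != rq->core || !core_clock_updated))
		update_rq_clock(rq_i);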

2021-10-12 12:30:17

by Peter Zijlstra

Subject: Re: [PATCH] sched/core: forced idle accounting

On Mon, Oct 11, 2021 at 05:12:43PM -0700, Josh Don wrote:

> > > + if (WARN_ON_ONCE(!nr_running)) {
> > > + /* can't be forced idle without a running task */
> > > + } else {
> > > + delta *= nr_forced_idle;
> > > + delta /= nr_running;
> > > + }
> >
> > Now the comment sayeth:
> >
> > > + /*
> > > + * For larger SMT configurations, we need to scale the charged
> > > + * forced idle amount since there can be more than one forced idle
> > > + * sibling and more than one running cookied task.
> > > + */
> >
> > But why?
>
> We scale by the number of cpus actually forced idle, since we don't
> want to falsely over or under charge forced idle time (defined
> strictly as time where we have a runnable task but idle the cpu). The
> more important scaling here though is the division over the number of
> running entities. This is done so that the aggregate amount of forced
> idle over some group of threads makes sense. Ie if we have a cpu with
> SMT8, and a group of 7 threads sharing a cookie, we don't want to
> accrue 7 units of forced idle time per unit time while the 8th SMT is
> forced idle.

So why not simply compute the strict per-cpu force-idle time and let
userspace sort out the rest?

2021-10-12 19:50:16

by Josh Don

Subject: Re: [PATCH] sched/core: forced idle accounting

On Tue, Oct 12, 2021 at 5:27 AM Peter Zijlstra <[email protected]> wrote:
>
> On Mon, Oct 11, 2021 at 05:12:43PM -0700, Josh Don wrote:
>
> > > > + if (WARN_ON_ONCE(!nr_running)) {
> > > > + /* can't be forced idle without a running task */
> > > > + } else {
> > > > + delta *= nr_forced_idle;
> > > > + delta /= nr_running;
> > > > + }
> > >
> > > Now the comment sayeth:
> > >
> > > > + /*
> > > > + * For larger SMT configurations, we need to scale the charged
> > > > + * forced idle amount since there can be more than one forced idle
> > > > + * sibling and more than one running cookied task.
> > > > + */
> > >
> > > But why?
> >
> > We scale by the number of cpus actually forced idle, since we don't
> > want to falsely over or under charge forced idle time (defined
> > strictly as time where we have a runnable task but idle the cpu). The
> > more important scaling here though is the division over the number of
> > running entities. This is done so that the aggregate amount of forced
> > idle over some group of threads makes sense. Ie if we have a cpu with
> > SMT8, and a group of 7 threads sharing a cookie, we don't want to
> > accrue 7 units of forced idle time per unit time while the 8th SMT is
> > forced idle.
>
> So why not simply compute the strict per-cpu force-idle time and let
> userspace sort out the rest?

Do you mean to compute force idle solely as a per-cpu value? I think
that would be fine in addition to the per-thread field, but a
desirable property here is proper attribution to the cause of the
force idle. That lets system management understand which jobs are the
most antagonistic from a coresched perspective, and is a signal
(albeit noisy, due to system state and load balancing decisions) for
scaling their capacity requirements.

2021-10-14 14:37:13

by Peter Zijlstra

Subject: Re: [PATCH] sched/core: forced idle accounting

On Tue, Oct 12, 2021 at 12:45:28PM -0700, Josh Don wrote:
> On Tue, Oct 12, 2021 at 5:27 AM Peter Zijlstra <[email protected]> wrote:

> > > We scale by the number of cpus actually forced idle, since we don't
> > > want to falsely over or under charge forced idle time (defined
> > > strictly as time where we have a runnable task but idle the cpu). The
> > > more important scaling here though is the division over the number of
> > > running entities. This is done so that the aggregate amount of forced
> > > idle over some group of threads makes sense. Ie if we have a cpu with
> > > SMT8, and a group of 7 threads sharing a cookie, we don't want to
> > > accrue 7 units of forced idle time per unit time while the 8th SMT is
> > > forced idle.
> >
> > So why not simply compute the strict per-cpu force-idle time and let
> > userspace sort out the rest?
>
> Do you mean to compute force idle solely as a per-cpu value? I think
> that would be fine in addition to the per-thread field, but a
> desirable property here is proper attribution to the cause of the
> force idle. That lets system management understand which jobs are the
> most antagonistic from a coresched perspective, and is a signal
> (albeit noisy, due to system state and load balancing decisions) for
> scaling their capacity requirements.

Urgh, reading is hard. I hadn't noticed you did per-task accounting (and
the original changelog doesn't clarify this either).

Also, should all this be under SCHED_DEBUG? Or be part of SCHEDSTATS?

2021-10-14 19:19:50

by Hao Luo

Subject: Re: [PATCH] sched/core: forced idle accounting

On Mon, Oct 11, 2021 at 5:31 PM Josh Don <[email protected]> wrote:
>
> On Mon, Oct 11, 2021 at 10:33 AM Hao Luo <[email protected]> wrote:
> >
> > On Thu, Oct 7, 2021 at 5:08 PM Josh Don <[email protected]> wrote:
> > > -void sched_core_dequeue(struct rq *rq, struct task_struct *p)
> > > +void sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags)
> > > {
> > > rq->core->core_task_seq++;
> > >
> > > - if (!sched_core_enqueued(p))
> > > - return;
> > > + if (sched_core_enqueued(p)) {
> > > + rb_erase(&p->core_node, &rq->core_tree);
> > > + RB_CLEAR_NODE(&p->core_node);
> > > + }
> > >
> > > - rb_erase(&p->core_node, &rq->core_tree);
> > > - RB_CLEAR_NODE(&p->core_node);
> > > + /*
> > > + * Migrating the last task off the cpu, with the cpu in forced idle
> > > + * state. Reschedule to create an accounting edge for forced idle,
> > > + * and re-examine whether the core is still in forced idle state.
> > > + */
> > > + if (!(flags & DEQUEUE_SAVE) && rq->nr_running == 1 &&
> > > + rq->core->core_forceidle && rq->curr == rq->idle)
> > > + resched_curr(rq);
> >
> > Resched_curr is probably an unwanted side effect of dequeue. Maybe we
> > could extract the check and resched_curr out into a function, and call
> > the function outside of sched_core_dequeue(). In that way, the
> > interface of dequeue doesn't need to change.
>
> This resched is an atypical case; normal load balancing won't steal
> the last runnable task off a cpu. The main reasons this resched could
> trigger are: migration due to affinity change, and migration due to
> sched core doing a cookie_steal. Could bubble this up to
> deactivate_task(), but seems less brittle to keep this in dequeue()
> with the check against DEQUEUE_SAVE (since this creates an important
> accounting edge). Thoughts?
>

I prefer bubbling it up to deactivate_task(). Depending on how many
callers of deactivate_task() need this resched, IMHO it is even fine
to put it in deactivate_task's caller. Wrapping it in a function may
help clarify its purpose.

> > > /*
> > > @@ -5765,7 +5782,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
> > > for_each_cpu_wrap(i, smt_mask, cpu) {
> > > rq_i = cpu_rq(i);
> > >
> > > - if (i != cpu)
> > > + if (i != cpu && (rq_i != rq->core || !core_clock_updated))
> > > update_rq_clock(rq_i);
> >
> > Do you mean (rq_i != rq->core && !core_clock_updated)? I thought
> > rq->core has core_clock updated always.
>
> rq->clock is updated on entry to pick_next_task(). rq->core is only
> updated if rq == rq->core, or if we've done the clock update for
> rq->core above.

I meant 'if (i != cpu && rq_i != rq->core)'. Because at this point,
core_clock should already have been updated, is that not the case?
Anyway, the tracking of clock updates here is too confusing to me.

2021-10-15 07:21:02

by Josh Don

Subject: Re: [PATCH] sched/core: forced idle accounting

On Thu, Oct 14, 2021 at 10:58 AM Hao Luo <[email protected]> wrote:
>
> On Mon, Oct 11, 2021 at 5:31 PM Josh Don <[email protected]> wrote:
> >
> > On Mon, Oct 11, 2021 at 10:33 AM Hao Luo <[email protected]> wrote:
> > >
> > > On Thu, Oct 7, 2021 at 5:08 PM Josh Don <[email protected]> wrote:
> > > > -void sched_core_dequeue(struct rq *rq, struct task_struct *p)
> > > > +void sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags)
> > > > {
> > > > rq->core->core_task_seq++;
> > > >
> > > > - if (!sched_core_enqueued(p))
> > > > - return;
> > > > + if (sched_core_enqueued(p)) {
> > > > + rb_erase(&p->core_node, &rq->core_tree);
> > > > + RB_CLEAR_NODE(&p->core_node);
> > > > + }
> > > >
> > > > - rb_erase(&p->core_node, &rq->core_tree);
> > > > - RB_CLEAR_NODE(&p->core_node);
> > > > + /*
> > > > + * Migrating the last task off the cpu, with the cpu in forced idle
> > > > + * state. Reschedule to create an accounting edge for forced idle,
> > > > + * and re-examine whether the core is still in forced idle state.
> > > > + */
> > > > + if (!(flags & DEQUEUE_SAVE) && rq->nr_running == 1 &&
> > > > + rq->core->core_forceidle && rq->curr == rq->idle)
> > > > + resched_curr(rq);
> > >
> > > Resched_curr is probably an unwanted side effect of dequeue. Maybe we
> > > could extract the check and resched_curr out into a function, and call
> > > the function outside of sched_core_dequeue(). In that way, the
> > > interface of dequeue doesn't need to change.
> >
> > This resched is an atypical case; normal load balancing won't steal
> > the last runnable task off a cpu. The main reasons this resched could
> > trigger are: migration due to affinity change, and migration due to
> > sched core doing a cookie_steal. Could bubble this up to
> > deactivate_task(), but seems less brittle to keep this in dequeue()
> > with the check against DEQUEUE_SAVE (since this creates an important
> > accounting edge). Thoughts?
> >
>
> I prefer bubbling it up to deactivate_task(). Depending on how many
> callers of deactivate_task() need this resched, IMHO it is even fine
> to put it in deactivate_task's caller. Wrapping it in a function may
> help clarify its purpose.

I'd argue against bubbling up above deactivate_task(); makes things
much more brittle if a new use for deactivate_task() is added in the
future.

Tried both ways; IMO it seems slightly better to leave this in dequeue()
vs deactivate(); less confusing to have one hook instead of two for
coresched to handle dequeuing a task.

> > > > /*
> > > > @@ -5765,7 +5782,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
> > > > for_each_cpu_wrap(i, smt_mask, cpu) {
> > > > rq_i = cpu_rq(i);
> > > >
> > > > - if (i != cpu)
> > > > + if (i != cpu && (rq_i != rq->core || !core_clock_updated))
> > > > update_rq_clock(rq_i);
> > >
> > > Do you mean (rq_i != rq->core && !core_clock_updated)? I thought
> > > rq->core has core_clock updated always.
> >
> > rq->clock is updated on entry to pick_next_task(). rq->core is only
> > updated if rq == rq->core, or if we've done the clock update for
> > rq->core above.
>
> I meant 'if (i != cpu && rq_i != rq->core)'. Because at this point,
> core_clock should already have been updated, is that not the case?
> Anyway, the tracking of clock updates here is too confusing to me.

Added a comment here, but the logic flow is:
- rq->clock is always updated on entry to pick_next_task()
- rq->core->clock _may_ be updated by the time we get to this part of
pick_next_task(). We have to be careful to avoid a double update,
hence the need for the core_clock_updated var.

2021-10-15 07:21:03

by Josh Don

Subject: Re: [PATCH] sched/core: forced idle accounting

On Thu, Oct 14, 2021 at 7:24 AM Peter Zijlstra <[email protected]> wrote:
>
> On Tue, Oct 12, 2021 at 12:45:28PM -0700, Josh Don wrote:
> > On Tue, Oct 12, 2021 at 5:27 AM Peter Zijlstra <[email protected]> wrote:
>
> > > > We scale by the number of cpus actually forced idle, since we don't
> > > > want to falsely over or under charge forced idle time (defined
> > > > strictly as time where we have a runnable task but idle the cpu). The
> > > > more important scaling here though is the division over the number of
> > > > running entities. This is done so that the aggregate amount of forced
> > > > idle over some group of threads makes sense. Ie if we have a cpu with
> > > > SMT8, and a group of 7 threads sharing a cookie, we don't want to
> > > > accrue 7 units of forced idle time per unit time while the 8th SMT is
> > > > forced idle.
> > >
> > > So why not simply compute the strict per-cpu force-idle time and let
> > > userspace sort out the rest?
> >
> > Do you mean to compute force idle solely as a per-cpu value? I think
> > that would be fine in addition to the per-thread field, but a
> > desirable property here is proper attribution to the cause of the
> > force idle. That lets system management understand which jobs are the
> > most antagonistic from a coresched perspective, and is a signal
> > (albeit noisy, due to system state and load balancing decisions) for
> > scaling their capacity requirements.
>
> Urgh, reading is hard. I hadn't noticed you did per-task accounting (and
> the original changelog doesn't clarify this either).

Yea, I'll add that to the description, along with a few other
implementation details.

> Also, should all this be under SCHED_DEBUG? Or be part of SCHEDSTATS?

schedstats seems like a good home, that way users can avoid most of
the extra overhead if schedstats is disabled.
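
FWIW, with the patch as posted the per-task sum is already visible in
/proc/<pid>/sched on CONFIG_SCHED_DEBUG builds (that's what the debug.c
hunk exports), so a userspace sketch like the following can read it:

	#include <stdio.h>
	#include <string.h>

	/* Dump a task's core_forceidle_sum line from /proc/<pid>/sched. */
	int main(int argc, char **argv)
	{
		char path[64], line[256];
		FILE *f;

		snprintf(path, sizeof(path), "/proc/%s/sched",
			 argc > 1 ? argv[1] : "self");
		f = fopen(path, "r");
		if (!f) {
			perror("fopen");
			return 1;
		}
		while (fgets(line, sizeof(line), f))
			if (strstr(line, "core_forceidle_sum"))
				fputs(line, stdout);
		fclose(f);
		return 0;
	}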

2021-10-15 07:24:45

by Hao Luo

Subject: Re: [PATCH] sched/core: forced idle accounting

On Thu, Oct 14, 2021 at 4:29 PM Josh Don <[email protected]> wrote:
>
> On Thu, Oct 14, 2021 at 10:58 AM Hao Luo <[email protected]> wrote:
> >
> > On Mon, Oct 11, 2021 at 5:31 PM Josh Don <[email protected]> wrote:
> > >
> > > On Mon, Oct 11, 2021 at 10:33 AM Hao Luo <[email protected]> wrote:
> > > >
> > > > On Thu, Oct 7, 2021 at 5:08 PM Josh Don <[email protected]> wrote:
> > > > > -void sched_core_dequeue(struct rq *rq, struct task_struct *p)
> > > > > +void sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags)
> > > > > {
> > > > > rq->core->core_task_seq++;
> > > > >
> > > > > - if (!sched_core_enqueued(p))
> > > > > - return;
> > > > > + if (sched_core_enqueued(p)) {
> > > > > + rb_erase(&p->core_node, &rq->core_tree);
> > > > > + RB_CLEAR_NODE(&p->core_node);
> > > > > + }
> > > > >
> > > > > - rb_erase(&p->core_node, &rq->core_tree);
> > > > > - RB_CLEAR_NODE(&p->core_node);
> > > > > + /*
> > > > > + * Migrating the last task off the cpu, with the cpu in forced idle
> > > > > + * state. Reschedule to create an accounting edge for forced idle,
> > > > > + * and re-examine whether the core is still in forced idle state.
> > > > > + */
> > > > > + if (!(flags & DEQUEUE_SAVE) && rq->nr_running == 1 &&
> > > > > + rq->core->core_forceidle && rq->curr == rq->idle)
> > > > > + resched_curr(rq);
> > > >
> > > > Resched_curr is probably an unwanted side effect of dequeue. Maybe we
> > > > could extract the check and resched_curr out into a function, and call
> > > > the function outside of sched_core_dequeue(). In that way, the
> > > > interface of dequeue doesn't need to change.
> > >
> > > This resched is an atypical case; normal load balancing won't steal
> > > the last runnable task off a cpu. The main reasons this resched could
> > > trigger are: migration due to affinity change, and migration due to
> > > sched core doing a cookie_steal. Could bubble this up to
> > > deactivate_task(), but seems less brittle to keep this in dequeue()
> > > with the check against DEQUEUE_SAVE (since this creates an important
> > > accounting edge). Thoughts?
> > >
> >
> > I prefer bubbling it up to deactivate_task(). Depending on how many
> > callers of deactivate_task() need this resched, IMHO it is even fine
> > to put it in deactivate_task's caller. Wrapping it in a function may
> > help clarify its purpose.
>
> I'd argue against bubbling up above deactivate_task(); makes things
> much more brittle if a new use for deactivate_task() is added in the
> future.
>
> Tried both ways; IMO it seems slightly better to leave this in dequeue()
> vs deactivate(); less confusing to have one hook instead of two for
> coresched to handle dequeuing a task.
>

Ack. No problem. I don't have strong objections here.

> > > > > /*
> > > > > @@ -5765,7 +5782,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
> > > > > for_each_cpu_wrap(i, smt_mask, cpu) {
> > > > > rq_i = cpu_rq(i);
> > > > >
> > > > > - if (i != cpu)
> > > > > + if (i != cpu && (rq_i != rq->core || !core_clock_updated))
> > > > > update_rq_clock(rq_i);
> > > >
> > > > Do you mean (rq_i != rq->core && !core_clock_updated)? I thought
> > > > rq->core has core_clock updated always.
> > >
> > > rq->clock is updated on entry to pick_next_task(). rq->core is only
> > > updated if rq == rq->core, or if we've done the clock update for
> > > rq->core above.
> >
> > I meant 'if (i != cpu && rq_i != rq->core)'. Because at this point,
> > core_clock should already have been updated, is that not the case?
> > Anyway, the tracking of clock updates here is too confusing to me.
>
> Added a comment here, but the logic flow is:
> - rq->clock is always updated on entry to pick_next_task()
> - rq->core->clock _may_ be updated by the time we get to this part of
> pick_next_task(). We have to be careful to avoid a double update,
> hence the need for the core_clock_updated var.

Yeah. Sync'ed offline and that cleared my confusion. Thanks.

Hao