2021-10-18 20:37:51

by Josh Don

Subject: [PATCH v2] sched/core: forced idle accounting

Adds accounting for "forced idle" time, which is time where a cookie'd
task forces its SMT sibling to idle, despite the presence of runnable
tasks.

Forced idle time is one means to measure the cost of enabling core
scheduling (ie. the capacity lost due to the need to force idle).

Forced idle time is attributed to the thread responsible for causing the
forced idle. A few details:
- Forced idle time is displayed via /proc/PID/sched. It also requires
that schedstats is enabled.
- Forced idle is only accounted when a sibling hyperthread is held
idle despite the presence of runnable tasks. No time is charged if
a sibling is idle but has no runnable tasks.
- Tasks with 0 cookie are never charged forced idle.
- For SMT > 2, we scale the amount of forced idle charged based on the
number of forced idle siblings. Additionally, we split the time up and
evenly charge it to all running tasks, as each is equally responsible
for the forced idle.
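
As a rough illustration of the SMT > 2 scaling rule above (not part of the
patch; the values are made up and plain division stands in for div_u64()):

/* Toy userspace sketch of the charging math, with hypothetical numbers. */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t delta = 3000000;		/* ns of forced idle in this interval */
	unsigned int forceidle_count = 2;	/* SMT-4 core: two siblings forced idle */
	unsigned int occupation = 2;		/* two cookied tasks were running */

	/* Mirrors: delta *= count; delta = div_u64(delta, occupation); */
	uint64_t charged = delta * forceidle_count / occupation;

	/*
	 * Each running cookied task is charged 'charged', so the total
	 * across the core equals delta * forceidle_count, i.e. the CPU
	 * time lost to forced idle.
	 */
	printf("per-task charge: %llu ns, core total: %llu ns\n",
	       (unsigned long long)charged,
	       (unsigned long long)(charged * occupation));
	return 0;
}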

Signed-off-by: Josh Don <[email protected]>
---
v2:
- simplify the SMT>2 case by tracking the forced idle count in pick()
- added an accounting edge for sched_core_flip()

include/linux/sched.h | 4 ++
kernel/sched/core.c | 82 +++++++++++++++++++++++++++++----------
kernel/sched/core_sched.c | 71 ++++++++++++++++++++++++++++++++-
kernel/sched/debug.c | 4 ++
kernel/sched/fair.c | 2 +-
kernel/sched/sched.h | 20 +++++++++-
6 files changed, 159 insertions(+), 24 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 96abf9bd3864..ab6d401f367e 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -523,7 +523,11 @@ struct sched_statistics {
u64 nr_wakeups_affine_attempts;
u64 nr_wakeups_passive;
u64 nr_wakeups_idle;
+
+#ifdef CONFIG_SCHED_CORE
+ u64 core_forceidle_sum;
#endif
+#endif /* CONFIG_SCHEDSTATS */
} ____cacheline_aligned;

struct sched_entity {
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index ada028e579b0..ff49d259bf92 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -144,7 +144,7 @@ static inline bool __sched_core_less(struct task_struct *a, struct task_struct *
return false;

/* flip prio, so high prio is leftmost */
- if (prio_less(b, a, task_rq(a)->core->core_forceidle))
+ if (prio_less(b, a, !!task_rq(a)->core->core_forceidle_count))
return true;

return false;
@@ -181,15 +181,23 @@ void sched_core_enqueue(struct rq *rq, struct task_struct *p)
rb_add(&p->core_node, &rq->core_tree, rb_sched_core_less);
}

-void sched_core_dequeue(struct rq *rq, struct task_struct *p)
+void sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags)
{
rq->core->core_task_seq++;

- if (!sched_core_enqueued(p))
- return;
+ if (sched_core_enqueued(p)) {
+ rb_erase(&p->core_node, &rq->core_tree);
+ RB_CLEAR_NODE(&p->core_node);
+ }

- rb_erase(&p->core_node, &rq->core_tree);
- RB_CLEAR_NODE(&p->core_node);
+ /*
+ * Migrating the last task off the cpu, with the cpu in forced idle
+ * state. Reschedule to create an accounting edge for forced idle,
+ * and re-examine whether the core is still in forced idle state.
+ */
+ if (!(flags & DEQUEUE_SAVE) && rq->nr_running == 1 &&
+ rq->core->core_forceidle_count && rq->curr == rq->idle)
+ resched_curr(rq);
}

/*
@@ -280,6 +288,8 @@ static void __sched_core_flip(bool enabled)
for_each_cpu(t, smt_mask)
cpu_rq(t)->core_enabled = enabled;

+ cpu_rq(cpu)->core->core_forceidle_start = 0;
+
sched_core_unlock(cpu, &flags);

cpumask_andnot(&sched_core_mask, &sched_core_mask, smt_mask);
@@ -364,7 +374,8 @@ void sched_core_put(void)
#else /* !CONFIG_SCHED_CORE */

static inline void sched_core_enqueue(struct rq *rq, struct task_struct *p) { }
-static inline void sched_core_dequeue(struct rq *rq, struct task_struct *p) { }
+static inline void
+sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags) { }

#endif /* CONFIG_SCHED_CORE */

@@ -2020,7 +2031,7 @@ static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
{
if (sched_core_enabled(rq))
- sched_core_dequeue(rq, p);
+ sched_core_dequeue(rq, p, flags);

if (!(flags & DEQUEUE_NOCLOCK))
update_rq_clock(rq);
@@ -5256,6 +5267,7 @@ void scheduler_tick(void)
if (sched_feat(LATENCY_WARN))
resched_latency = cpu_resched_latency(rq);
calc_global_load_tick(rq);
+ sched_core_tick(rq);

rq_unlock(rq, &rf);

@@ -5668,6 +5680,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
struct task_struct *next, *p, *max = NULL;
const struct cpumask *smt_mask;
bool fi_before = false;
+ bool core_clock_updated = (rq == rq->core);
unsigned long cookie;
int i, cpu, occ = 0;
struct rq *rq_i;
@@ -5720,10 +5733,18 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)

/* reset state */
rq->core->core_cookie = 0UL;
- if (rq->core->core_forceidle) {
+ if (rq->core->core_forceidle_count) {
+ if (!core_clock_updated) {
+ update_rq_clock(rq->core);
+ core_clock_updated = true;
+ }
+ sched_core_account_forceidle(rq);
+ /* reset after accounting force idle */
+ rq->core->core_forceidle_start = 0;
+ rq->core->core_forceidle_count = 0;
+ rq->core->core_forceidle_occupation = 0;
need_sync = true;
fi_before = true;
- rq->core->core_forceidle = false;
}

/*
@@ -5765,7 +5786,12 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
for_each_cpu_wrap(i, smt_mask, cpu) {
rq_i = cpu_rq(i);

- if (i != cpu)
+ /*
+ * Current cpu always has its clock updated on entrance to
+ * pick_next_task(). If the current cpu is not the core,
+ * the core may also have been updated above.
+ */
+ if (i != cpu && (rq_i != rq->core || !core_clock_updated))
update_rq_clock(rq_i);

p = rq_i->core_pick = pick_task(rq_i);
@@ -5795,7 +5821,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)

if (p == rq_i->idle) {
if (rq_i->nr_running) {
- rq->core->core_forceidle = true;
+ rq->core->core_forceidle_count++;
if (!fi_before)
rq->core->core_forceidle_seq++;
}
@@ -5804,6 +5830,12 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
}
}

+ if (rq->core->core_forceidle_count) {
+ if (cookie)
+ rq->core->core_forceidle_start = rq_clock(rq->core);
+ rq->core->core_forceidle_occupation = occ;
+ }
+
rq->core->core_pick_seq = rq->core->core_task_seq;
next = rq->core_pick;
rq->core_sched_seq = rq->core->core_pick_seq;
@@ -5840,8 +5872,8 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
* 1 0 1
* 1 1 0
*/
- if (!(fi_before && rq->core->core_forceidle))
- task_vruntime_update(rq_i, rq_i->core_pick, rq->core->core_forceidle);
+ if (!(fi_before && rq->core->core_forceidle_count))
+ task_vruntime_update(rq_i, rq_i->core_pick, !!rq->core->core_forceidle_count);

rq_i->core_pick->core_occupation = occ;

@@ -6045,11 +6077,19 @@ static void sched_core_cpu_deactivate(unsigned int cpu)
goto unlock;

/* copy the shared state to the new leader */
- core_rq->core_task_seq = rq->core_task_seq;
- core_rq->core_pick_seq = rq->core_pick_seq;
- core_rq->core_cookie = rq->core_cookie;
- core_rq->core_forceidle = rq->core_forceidle;
- core_rq->core_forceidle_seq = rq->core_forceidle_seq;
+ core_rq->core_task_seq = rq->core_task_seq;
+ core_rq->core_pick_seq = rq->core_pick_seq;
+ core_rq->core_cookie = rq->core_cookie;
+ core_rq->core_forceidle_count = rq->core_forceidle_count;
+ core_rq->core_forceidle_seq = rq->core_forceidle_seq;
+ core_rq->core_forceidle_occupation = rq->core_forceidle_occupation;
+
+ /*
+ * Accounting edge for forced idle is handled in pick_next_task().
+ * Don't need another one here, since the hotplug thread shouldn't
+ * have a cookie.
+ */
+ core_rq->core_forceidle_start = 0;

/* install new leader */
for_each_cpu(t, smt_mask) {
@@ -9423,7 +9463,9 @@ void __init sched_init(void)
rq->core_pick = NULL;
rq->core_enabled = 0;
rq->core_tree = RB_ROOT;
- rq->core_forceidle = false;
+ rq->core_forceidle_count = 0;
+ rq->core_forceidle_occupation = 0;
+ rq->core_forceidle_start = 0;

rq->core_cookie = 0UL;
#endif
diff --git a/kernel/sched/core_sched.c b/kernel/sched/core_sched.c
index 48ac72696012..6587c4c409d2 100644
--- a/kernel/sched/core_sched.c
+++ b/kernel/sched/core_sched.c
@@ -73,7 +73,7 @@ static unsigned long sched_core_update_cookie(struct task_struct *p,

enqueued = sched_core_enqueued(p);
if (enqueued)
- sched_core_dequeue(rq, p);
+ sched_core_dequeue(rq, p, DEQUEUE_SAVE);

old_cookie = p->core_cookie;
p->core_cookie = cookie;
@@ -85,6 +85,10 @@ static unsigned long sched_core_update_cookie(struct task_struct *p,
* If task is currently running, it may not be compatible anymore after
* the cookie change, so enter the scheduler on its CPU to schedule it
* away.
+ *
+ * Note that it is possible that as a result of this cookie change, the
+ * core has now entered/left forced idle state. Defer accounting to the
+ * next scheduling edge, rather than always forcing a reschedule here.
*/
if (task_running(rq, p))
resched_curr(rq);
@@ -228,3 +232,68 @@ int sched_core_share_pid(unsigned int cmd, pid_t pid, enum pid_type type,
return err;
}

+#ifdef CONFIG_SCHEDSTATS
+
+/* REQUIRES: rq->core's clock recently updated. */
+void sched_core_account_forceidle(struct rq *rq)
+{
+ const struct cpumask *smt_mask = cpu_smt_mask(cpu_of(rq));
+ u64 delta, now = rq_clock(rq->core);
+ struct rq *rq_i;
+ struct task_struct *p;
+ int i;
+
+ lockdep_assert_rq_held(rq);
+
+ WARN_ON_ONCE(!rq->core->core_forceidle_count);
+
+ if (rq->core->core_forceidle_start == 0)
+ return;
+
+ delta = now - rq->core->core_forceidle_start;
+ if (unlikely((s64)delta <= 0))
+ return;
+
+ rq->core->core_forceidle_start = now;
+
+ if (!schedstat_enabled())
+ return;
+
+ if (WARN_ON_ONCE(!rq->core->core_forceidle_occupation)) {
+ /* can't be forced idle without a running task */
+ } else if (rq->core->core_forceidle_count > 1 ||
+ rq->core->core_forceidle_occupation > 1) {
+ /*
+ * For larger SMT configurations, we need to scale the charged
+ * forced idle amount since there can be more than one forced
+ * idle sibling and more than one running cookied task.
+ */
+ delta *= rq->core->core_forceidle_count;
+ delta = div_u64(delta, rq->core->core_forceidle_occupation);
+ }
+
+ for_each_cpu(i, smt_mask) {
+ rq_i = cpu_rq(i);
+ p = rq_i->core_pick ?: rq_i->curr;
+
+ if (!p->core_cookie)
+ continue;
+
+ __schedstat_add(p->stats.core_forceidle_sum, delta);
+ }
+}
+
+void sched_core_tick(struct rq *rq)
+{
+ if (!sched_core_enabled(rq))
+ return;
+
+ if (!rq->core->core_forceidle_count)
+ return;
+
+ if (rq != rq->core)
+ update_rq_clock(rq->core);
+ sched_core_account_forceidle(rq);
+}
+
+#endif /* CONFIG_SCHEDSTATS */
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 7dcbaa31c5d9..aa29211de1bf 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -1023,6 +1023,10 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,

__PN(avg_atom);
__PN(avg_per_cpu);
+
+#ifdef CONFIG_SCHED_CORE
+ PN_SCHEDSTAT(core_forceidle_sum);
+#endif
}

__P(nr_switches);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 87db481e8a56..da3a6732ddac 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -11047,7 +11047,7 @@ static inline void task_tick_core(struct rq *rq, struct task_struct *curr)
* MIN_NR_TASKS_DURING_FORCEIDLE - 1 tasks and use that to check
* if we need to give up the CPU.
*/
- if (rq->core->core_forceidle && rq->cfs.nr_running == 1 &&
+ if (rq->core->core_forceidle_count && rq->cfs.nr_running == 1 &&
__entity_slice_used(&curr->se, MIN_NR_TASKS_DURING_FORCEIDLE))
resched_curr(rq);
}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index a00fc7057d97..fb7c9e3f2787 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1111,8 +1111,10 @@ struct rq {
unsigned int core_task_seq;
unsigned int core_pick_seq;
unsigned long core_cookie;
- unsigned char core_forceidle;
+ unsigned int core_forceidle_count;
unsigned int core_forceidle_seq;
+ unsigned int core_forceidle_occupation;
+ u64 core_forceidle_start;
#endif
};

@@ -1253,7 +1255,7 @@ static inline bool sched_core_enqueued(struct task_struct *p)
}

extern void sched_core_enqueue(struct rq *rq, struct task_struct *p);
-extern void sched_core_dequeue(struct rq *rq, struct task_struct *p);
+extern void sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags);

extern void sched_core_get(void);
extern void sched_core_put(void);
@@ -1302,6 +1304,20 @@ static inline bool sched_group_cookie_match(struct rq *rq,
}
#endif /* CONFIG_SCHED_CORE */

+#if defined(CONFIG_SCHED_CORE) && defined(CONFIG_SCHEDSTATS)
+
+extern void sched_core_account_forceidle(struct rq *rq);
+
+extern void sched_core_tick(struct rq *rq);
+
+#else
+
+static inline void sched_core_account_forceidle(struct rq *rq) {}
+
+static inline void sched_core_tick(struct rq *rq) {}
+
+#endif /* CONFIG_SCHED_CORE && CONFIG_SCHEDSTATS */
+
static inline void lockdep_assert_rq_held(struct rq *rq)
{
lockdep_assert_held(__rq_lockp(rq));
--
2.33.0.1079.g6e70778dc9-goog


2021-10-19 09:36:25

by Tao Zhou

Subject: Re: [PATCH v2] sched/core: forced idle accounting

Hi Josh,

On Mon, Oct 18, 2021 at 01:34:28PM -0700, Josh Don wrote:
> Adds accounting for "forced idle" time, which is time where a cookie'd
> task forces its SMT sibling to idle, despite the presence of runnable
> tasks.
>
> Forced idle time is one means to measure the cost of enabling core
> scheduling (ie. the capacity lost due to the need to force idle).
>
> Forced idle time is attributed to the thread responsible for causing the
> forced idle. A few details:
> - Forced idle time is displayed via /proc/PID/sched. It also requires
> that schedstats is enabled.
> - Forced idle is only accounted when a sibling hyperthread is held
> idle despite the presence of runnable tasks. No time is charged if
> a sibling is idle but has no runnable tasks.

This is clear to me now; I did not get this last time. Thanks.

> - Tasks with 0 cookie are never charged forced idle.
> - For SMT > 2, we scale the amount of forced idle charged based on the
> number of forced idle siblings. Additionally, we split the time up and
> evenly charge it to all running tasks, as each is equally responsible
> for the forced idle.
>
[snip]

> @@ -5765,7 +5786,12 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
> for_each_cpu_wrap(i, smt_mask, cpu) {
> rq_i = cpu_rq(i);
>
> - if (i != cpu)
> + /*
> + * Current cpu always has its clock updated on entrance to
> + * pick_next_task(). If the current cpu is not the core,
> + * the core may also have been updated above.
> + */

I borrowed one sentence from the discussion of the last version to make this
comment more comprehensive. Hope it helps:

/*
* Current cpu always has its clock updated on entrance to
* pick_next_task(). If the current cpu is not the core,
* the core may also have been updated above. We have to be
* careful to avoid double update of core rq's clock.
*/

> + if (i != cpu && (rq_i != rq->core || !core_clock_updated))
> update_rq_clock(rq_i);
>
[snip]

> + if (WARN_ON_ONCE(!rq->core->core_forceidle_occupation)) {
> + /* can't be forced idle without a running task */
> + } else if (rq->core->core_forceidle_count > 1 ||
> + rq->core->core_forceidle_occupation > 1) {

This also omits the smt count check from the last version.

> + /*
> + * For larger SMT configurations, we need to scale the charged
> + * forced idle amount since there can be more than one forced
> + * idle sibling and more than one running cookied task.
> + */
> + delta *= rq->core->core_forceidle_count;
> + delta = div_u64(delta, rq->core->core_forceidle_occupation);
> + }

[snip]


Thanks,
Tao

2021-11-09 20:22:18

by Peter Zijlstra

Subject: Re: [PATCH v2] sched/core: forced idle accounting

On Mon, Oct 18, 2021 at 01:34:28PM -0700, Josh Don wrote:
> @@ -280,6 +288,8 @@ static void __sched_core_flip(bool enabled)
> for_each_cpu(t, smt_mask)
> cpu_rq(t)->core_enabled = enabled;
>
> + cpu_rq(cpu)->core->core_forceidle_start = 0;
> +
> sched_core_unlock(cpu, &flags);
>
> cpumask_andnot(&sched_core_mask, &sched_core_mask, smt_mask);

Not specific to this patch, but perhaps we should do better here and
explicitly initialize more state...

2021-11-09 21:42:27

by Peter Zijlstra

Subject: Re: [PATCH v2] sched/core: forced idle accounting

On Mon, Oct 18, 2021 at 01:34:28PM -0700, Josh Don wrote:
> @@ -5804,6 +5830,12 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
> }
> }
>
> + if (rq->core->core_forceidle_count) {

Does this want to be something like:

if (schedstat_enabled() && .. ) ?

afaict without schedstat on this is dead code.

> + if (cookie)
> + rq->core->core_forceidle_start = rq_clock(rq->core);
> + rq->core->core_forceidle_occupation = occ;
> + }
> +
> rq->core->core_pick_seq = rq->core->core_task_seq;
> next = rq->core_pick;
> rq->core_sched_seq = rq->core->core_pick_seq;

2021-11-10 00:48:50

by Josh Don

Subject: Re: [PATCH v2] sched/core: forced idle accounting

On Tue, Nov 9, 2021 at 3:15 AM Peter Zijlstra <[email protected]> wrote:
>
> On Mon, Oct 18, 2021 at 01:34:28PM -0700, Josh Don wrote:
> > @@ -5804,6 +5830,12 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
> > }
> > }
> >
> > + if (rq->core->core_forceidle_count) {
>
> Does this want to be something like:
>
> if (schedstat_enabled() && .. ) ?
>
> afaict without schedstat on this is dead code.
>

Makes sense to me, and similarly we can bail out of sched_core_tick()
early in the disabled case.

2021-11-10 00:50:05

by Josh Don

Subject: Re: [PATCH v2] sched/core: forced idle accounting

On Tue, Nov 9, 2021 at 2:21 AM Peter Zijlstra <[email protected]> wrote:
>
> On Mon, Oct 18, 2021 at 01:34:28PM -0700, Josh Don wrote:
> > @@ -280,6 +288,8 @@ static void __sched_core_flip(bool enabled)
> > for_each_cpu(t, smt_mask)
> > cpu_rq(t)->core_enabled = enabled;
> >
> > + cpu_rq(cpu)->core->core_forceidle_start = 0;
> > +
> > sched_core_unlock(cpu, &flags);
> >
> > cpumask_andnot(&sched_core_mask, &sched_core_mask, smt_mask);
>
> Not specific to this patch, but perhaps we should do better here and
> explicitly initialize more state...

The others are not needed for correctness, but I do agree it makes
sense to keep the state consistent. I think that set is

core_pick
core_cookie
core_forceidle_count
core_forceidle_occupation

I can throw another patch on top of this to reset those as well.
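
A rough sketch of what that reset might look like (hypothetical helper, not
the actual follow-up patch; the real change would likely live directly in
__sched_core_flip()):

/* Hypothetical sketch only; field names match the patch above. */
static void sched_core_reset_shared_state(struct rq *core_rq)
{
	core_rq->core_pick = NULL;
	core_rq->core_cookie = 0UL;
	core_rq->core_forceidle_count = 0;
	core_rq->core_forceidle_occupation = 0;
	core_rq->core_forceidle_start = 0;
}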

2021-11-10 08:40:34

by Peter Zijlstra

Subject: Re: [PATCH v2] sched/core: forced idle accounting

On Tue, Nov 09, 2021 at 02:59:00PM -0800, Josh Don wrote:
> On Tue, Nov 9, 2021 at 3:15 AM Peter Zijlstra <[email protected]> wrote:
> >
> > On Mon, Oct 18, 2021 at 01:34:28PM -0700, Josh Don wrote:
> > > @@ -5804,6 +5830,12 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
> > > }
> > > }
> > >
> > > + if (rq->core->core_forceidle_count) {
> >
> > Does this want to be something like:
> >
> > if (schedstat_enabled() && .. ) ?
> >
> > afaict without schedstat on this is dead code.
> >
>
> Makes sense to me, and similarly we can bail out of sched_core_tick()
> early in the disabled case.

Ok, I already changed the patch to include the above, lemme also edit
sched_core_tick().

2021-11-10 09:03:46

by Peter Zijlstra

Subject: Re: [PATCH v2] sched/core: forced idle accounting

On Wed, Nov 10, 2021 at 09:38:53AM +0100, Peter Zijlstra wrote:
> On Tue, Nov 09, 2021 at 02:59:00PM -0800, Josh Don wrote:
> > On Tue, Nov 9, 2021 at 3:15 AM Peter Zijlstra <[email protected]> wrote:
> > >
> > > On Mon, Oct 18, 2021 at 01:34:28PM -0700, Josh Don wrote:
> > > > @@ -5804,6 +5830,12 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
> > > > }
> > > > }
> > > >
> > > > + if (rq->core->core_forceidle_count) {
> > >
> > > Does this want to be something like:
> > >
> > > if (schedstat_enabled() && .. ) ?
> > >
> > > afaict without schedstat on this is dead code.
> > >
> >
> > Makes sense to me, and similarly we can bail out of sched_core_tick()
> > early in the disabled case.
>
> Ok, I already changed the patch to include the above, lemme also edit
> sched_core_tick().


Something like so then?

--- a/kernel/sched/core_sched.c
+++ b/kernel/sched/core_sched.c
@@ -235,7 +235,7 @@ int sched_core_share_pid(unsigned int cm
#ifdef CONFIG_SCHEDSTATS

/* REQUIRES: rq->core's clock recently updated. */
-void sched_core_account_forceidle(struct rq *rq)
+void __sched_core_account_forceidle(struct rq *rq)
{
const struct cpumask *smt_mask = cpu_smt_mask(cpu_of(rq));
u64 delta, now = rq_clock(rq->core);
@@ -256,9 +256,6 @@ void sched_core_account_forceidle(struct

rq->core->core_forceidle_start = now;

- if (!schedstat_enabled())
- return;
-
if (WARN_ON_ONCE(!rq->core->core_forceidle_occupation)) {
/* can't be forced idle without a running task */
} else if (rq->core->core_forceidle_count > 1 ||
@@ -283,17 +280,15 @@ void sched_core_account_forceidle(struct
}
}

-void sched_core_tick(struct rq *rq)
+void __sched_core_tick(struct rq *rq)
{
- if (!sched_core_enabled(rq))
- return;
-
if (!rq->core->core_forceidle_count)
return;

if (rq != rq->core)
update_rq_clock(rq->core);
- sched_core_account_forceidle(rq);
+
+ __sched_core_account_forceidle(rq);
}

#endif /* CONFIG_SCHEDSTATS */
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1304,20 +1304,6 @@ static inline bool sched_group_cookie_ma
}
#endif /* CONFIG_SCHED_CORE */

-#if defined(CONFIG_SCHED_CORE) && defined(CONFIG_SCHEDSTATS)
-
-extern void sched_core_account_forceidle(struct rq *rq);
-
-extern void sched_core_tick(struct rq *rq);
-
-#else
-
-static inline void sched_core_account_forceidle(struct rq *rq) {}
-
-static inline void sched_core_tick(struct rq *rq) {}
-
-#endif /* CONFIG_SCHED_CORE && CONFIG_SCHEDSTATS */
-
static inline void lockdep_assert_rq_held(struct rq *rq)
{
lockdep_assert_held(__rq_lockp(rq));
@@ -1870,6 +1856,32 @@ static inline void flush_smp_call_functi
#include "stats.h"
#include "autogroup.h"

+#if defined(CONFIG_SCHED_CORE) && defined(CONFIG_SCHEDSTATS)
+
+extern void __sched_core_account_forceidle(struct rq *rq);
+
+static inline void sched_core_account_forceidle(struct rq *rq)
+{
+ if (schedstat_enabled())
+ __sched_core_account_forceidle(rq);
+}
+
+extern void __sched_core_tick(struct rq *rq);
+
+static inline void sched_core_tick(struct rq *rq)
+{
+ if (sched_core_enabled(rq) && schedstat_enabled())
+ __sched_core_tick(rq);
+}
+
+#else
+
+static inline void sched_core_account_forceidle(struct rq *rq) {}
+
+static inline void sched_core_tick(struct rq *rq) {}
+
+#endif /* CONFIG_SCHED_CORE && CONFIG_SCHEDSTATS */
+
#ifdef CONFIG_CGROUP_SCHED

/*

2021-11-10 19:05:47

by Josh Don

Subject: Re: [PATCH v2] sched/core: forced idle accounting

On Wed, Nov 10, 2021 at 12:59 AM Peter Zijlstra <[email protected]> wrote:
>
> > Ok, I already changed the patch to include the above, lemme also edit
> > sched_core_tick().
>
>
> Something like so then?
>
[snip]

Thanks Peter! Looks good to me.

One thing to note though: now that we don't have the
schedstat_enabled() check in __sched_core_account_forceidle(), I'd
recommend either adding a "REQUIRES: schedstat enabled" comment to be
explicit, or switching to schedstat_add().
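
For context, the difference between the two helpers is roughly the following
(approximate; see kernel/sched/stats.h for the real definitions):

#define __schedstat_add(var, amt)	do { var += (amt); } while (0)
#define   schedstat_add(var, amt)	do { if (schedstat_enabled()) { var += (amt); } } while (0)

Switching to schedstat_add() would re-check the static key at the call site,
whereas keeping __schedstat_add() relies on the caller (now the
schedstat_enabled() check in the inline wrapper) having gated it already.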

Subject: [tip: sched/core] sched/core: Forced idle accounting

The following commit has been merged into the sched/core branch of tip:

Commit-ID: 4feee7d12603deca8775f9f9ae5e121093837444
Gitweb: https://git.kernel.org/tip/4feee7d12603deca8775f9f9ae5e121093837444
Author: Josh Don <[email protected]>
AuthorDate: Mon, 18 Oct 2021 13:34:28 -07:00
Committer: Peter Zijlstra <[email protected]>
CommitterDate: Wed, 17 Nov 2021 14:49:00 +01:00

sched/core: Forced idle accounting

Adds accounting for "forced idle" time, which is time where a cookie'd
task forces its SMT sibling to idle, despite the presence of runnable
tasks.

Forced idle time is one means to measure the cost of enabling core
scheduling (ie. the capacity lost due to the need to force idle).

Forced idle time is attributed to the thread responsible for causing
the forced idle.

A few details:
- Forced idle time is displayed via /proc/PID/sched. It also requires
that schedstats is enabled.
- Forced idle is only accounted when a sibling hyperthread is held
idle despite the presence of runnable tasks. No time is charged if
a sibling is idle but has no runnable tasks.
- Tasks with 0 cookie are never charged forced idle.
- For SMT > 2, we scale the amount of forced idle charged based on the
number of forced idle siblings. Additionally, we split the time up and
evenly charge it to all running tasks, as each is equally responsible
for the forced idle.
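
For observation, a minimal userspace sketch (not part of the patch) that
prints the new statistic from /proc/PID/sched; matching on the substring
avoids depending on the exact field formatting produced by PN_SCHEDSTAT()
in debug.c:

/* Minimal sketch: print the core_forceidle_sum line from /proc/PID/sched. */
#include <stdio.h>
#include <string.h>

int main(int argc, char **argv)
{
	char path[64], line[256];
	FILE *f;

	snprintf(path, sizeof(path), "/proc/%s/sched", argc > 1 ? argv[1] : "self");
	f = fopen(path, "r");
	if (!f) {
		perror("fopen");
		return 1;
	}

	while (fgets(line, sizeof(line), f)) {
		/* Field name comes from PN_SCHEDSTAT(core_forceidle_sum). */
		if (strstr(line, "core_forceidle_sum"))
			fputs(line, stdout);
	}
	fclose(f);
	return 0;
}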

Signed-off-by: Josh Don <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Link: https://lkml.kernel.org/r/[email protected]
---
include/linux/sched.h | 4 ++-
kernel/sched/core.c | 82 ++++++++++++++++++++++++++++----------
kernel/sched/core_sched.c | 66 ++++++++++++++++++++++++++++++-
kernel/sched/debug.c | 4 ++-
kernel/sched/fair.c | 2 +-
kernel/sched/sched.h | 32 ++++++++++++++-
6 files changed, 166 insertions(+), 24 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 78c351e..d2e261a 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -523,7 +523,11 @@ struct sched_statistics {
u64 nr_wakeups_affine_attempts;
u64 nr_wakeups_passive;
u64 nr_wakeups_idle;
+
+#ifdef CONFIG_SCHED_CORE
+ u64 core_forceidle_sum;
#endif
+#endif /* CONFIG_SCHEDSTATS */
} ____cacheline_aligned;

struct sched_entity {
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 3c9b0fd..beaa8be 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -144,7 +144,7 @@ static inline bool __sched_core_less(struct task_struct *a, struct task_struct *
return false;

/* flip prio, so high prio is leftmost */
- if (prio_less(b, a, task_rq(a)->core->core_forceidle))
+ if (prio_less(b, a, !!task_rq(a)->core->core_forceidle_count))
return true;

return false;
@@ -181,15 +181,23 @@ void sched_core_enqueue(struct rq *rq, struct task_struct *p)
rb_add(&p->core_node, &rq->core_tree, rb_sched_core_less);
}

-void sched_core_dequeue(struct rq *rq, struct task_struct *p)
+void sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags)
{
rq->core->core_task_seq++;

- if (!sched_core_enqueued(p))
- return;
+ if (sched_core_enqueued(p)) {
+ rb_erase(&p->core_node, &rq->core_tree);
+ RB_CLEAR_NODE(&p->core_node);
+ }

- rb_erase(&p->core_node, &rq->core_tree);
- RB_CLEAR_NODE(&p->core_node);
+ /*
+ * Migrating the last task off the cpu, with the cpu in forced idle
+ * state. Reschedule to create an accounting edge for forced idle,
+ * and re-examine whether the core is still in forced idle state.
+ */
+ if (!(flags & DEQUEUE_SAVE) && rq->nr_running == 1 &&
+ rq->core->core_forceidle_count && rq->curr == rq->idle)
+ resched_curr(rq);
}

/*
@@ -280,6 +288,8 @@ static void __sched_core_flip(bool enabled)
for_each_cpu(t, smt_mask)
cpu_rq(t)->core_enabled = enabled;

+ cpu_rq(cpu)->core->core_forceidle_start = 0;
+
sched_core_unlock(cpu, &flags);

cpumask_andnot(&sched_core_mask, &sched_core_mask, smt_mask);
@@ -364,7 +374,8 @@ void sched_core_put(void)
#else /* !CONFIG_SCHED_CORE */

static inline void sched_core_enqueue(struct rq *rq, struct task_struct *p) { }
-static inline void sched_core_dequeue(struct rq *rq, struct task_struct *p) { }
+static inline void
+sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags) { }

#endif /* CONFIG_SCHED_CORE */

@@ -2005,7 +2016,7 @@ static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
{
if (sched_core_enabled(rq))
- sched_core_dequeue(rq, p);
+ sched_core_dequeue(rq, p, flags);

if (!(flags & DEQUEUE_NOCLOCK))
update_rq_clock(rq);
@@ -5244,6 +5255,7 @@ void scheduler_tick(void)
if (sched_feat(LATENCY_WARN))
resched_latency = cpu_resched_latency(rq);
calc_global_load_tick(rq);
+ sched_core_tick(rq);

rq_unlock(rq, &rf);

@@ -5656,6 +5668,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
struct task_struct *next, *p, *max = NULL;
const struct cpumask *smt_mask;
bool fi_before = false;
+ bool core_clock_updated = (rq == rq->core);
unsigned long cookie;
int i, cpu, occ = 0;
struct rq *rq_i;
@@ -5708,10 +5721,18 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)

/* reset state */
rq->core->core_cookie = 0UL;
- if (rq->core->core_forceidle) {
+ if (rq->core->core_forceidle_count) {
+ if (!core_clock_updated) {
+ update_rq_clock(rq->core);
+ core_clock_updated = true;
+ }
+ sched_core_account_forceidle(rq);
+ /* reset after accounting force idle */
+ rq->core->core_forceidle_start = 0;
+ rq->core->core_forceidle_count = 0;
+ rq->core->core_forceidle_occupation = 0;
need_sync = true;
fi_before = true;
- rq->core->core_forceidle = false;
}

/*
@@ -5753,7 +5774,12 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
for_each_cpu_wrap(i, smt_mask, cpu) {
rq_i = cpu_rq(i);

- if (i != cpu)
+ /*
+ * Current cpu always has its clock updated on entrance to
+ * pick_next_task(). If the current cpu is not the core,
+ * the core may also have been updated above.
+ */
+ if (i != cpu && (rq_i != rq->core || !core_clock_updated))
update_rq_clock(rq_i);

p = rq_i->core_pick = pick_task(rq_i);
@@ -5783,7 +5809,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)

if (p == rq_i->idle) {
if (rq_i->nr_running) {
- rq->core->core_forceidle = true;
+ rq->core->core_forceidle_count++;
if (!fi_before)
rq->core->core_forceidle_seq++;
}
@@ -5792,6 +5818,12 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
}
}

+ if (schedstat_enabled() && rq->core->core_forceidle_count) {
+ if (cookie)
+ rq->core->core_forceidle_start = rq_clock(rq->core);
+ rq->core->core_forceidle_occupation = occ;
+ }
+
rq->core->core_pick_seq = rq->core->core_task_seq;
next = rq->core_pick;
rq->core_sched_seq = rq->core->core_pick_seq;
@@ -5828,8 +5860,8 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
* 1 0 1
* 1 1 0
*/
- if (!(fi_before && rq->core->core_forceidle))
- task_vruntime_update(rq_i, rq_i->core_pick, rq->core->core_forceidle);
+ if (!(fi_before && rq->core->core_forceidle_count))
+ task_vruntime_update(rq_i, rq_i->core_pick, !!rq->core->core_forceidle_count);

rq_i->core_pick->core_occupation = occ;

@@ -6033,11 +6065,19 @@ static void sched_core_cpu_deactivate(unsigned int cpu)
goto unlock;

/* copy the shared state to the new leader */
- core_rq->core_task_seq = rq->core_task_seq;
- core_rq->core_pick_seq = rq->core_pick_seq;
- core_rq->core_cookie = rq->core_cookie;
- core_rq->core_forceidle = rq->core_forceidle;
- core_rq->core_forceidle_seq = rq->core_forceidle_seq;
+ core_rq->core_task_seq = rq->core_task_seq;
+ core_rq->core_pick_seq = rq->core_pick_seq;
+ core_rq->core_cookie = rq->core_cookie;
+ core_rq->core_forceidle_count = rq->core_forceidle_count;
+ core_rq->core_forceidle_seq = rq->core_forceidle_seq;
+ core_rq->core_forceidle_occupation = rq->core_forceidle_occupation;
+
+ /*
+ * Accounting edge for forced idle is handled in pick_next_task().
+ * Don't need another one here, since the hotplug thread shouldn't
+ * have a cookie.
+ */
+ core_rq->core_forceidle_start = 0;

/* install new leader */
for_each_cpu(t, smt_mask) {
@@ -9413,7 +9453,9 @@ void __init sched_init(void)
rq->core_pick = NULL;
rq->core_enabled = 0;
rq->core_tree = RB_ROOT;
- rq->core_forceidle = false;
+ rq->core_forceidle_count = 0;
+ rq->core_forceidle_occupation = 0;
+ rq->core_forceidle_start = 0;

rq->core_cookie = 0UL;
#endif
diff --git a/kernel/sched/core_sched.c b/kernel/sched/core_sched.c
index 517f72b..1fb4567 100644
--- a/kernel/sched/core_sched.c
+++ b/kernel/sched/core_sched.c
@@ -73,7 +73,7 @@ static unsigned long sched_core_update_cookie(struct task_struct *p,

enqueued = sched_core_enqueued(p);
if (enqueued)
- sched_core_dequeue(rq, p);
+ sched_core_dequeue(rq, p, DEQUEUE_SAVE);

old_cookie = p->core_cookie;
p->core_cookie = cookie;
@@ -85,6 +85,10 @@ static unsigned long sched_core_update_cookie(struct task_struct *p,
* If task is currently running, it may not be compatible anymore after
* the cookie change, so enter the scheduler on its CPU to schedule it
* away.
+ *
+ * Note that it is possible that as a result of this cookie change, the
+ * core has now entered/left forced idle state. Defer accounting to the
+ * next scheduling edge, rather than always forcing a reschedule here.
*/
if (task_running(rq, p))
resched_curr(rq);
@@ -232,3 +236,63 @@ out:
return err;
}

+#ifdef CONFIG_SCHEDSTATS
+
+/* REQUIRES: rq->core's clock recently updated. */
+void __sched_core_account_forceidle(struct rq *rq)
+{
+ const struct cpumask *smt_mask = cpu_smt_mask(cpu_of(rq));
+ u64 delta, now = rq_clock(rq->core);
+ struct rq *rq_i;
+ struct task_struct *p;
+ int i;
+
+ lockdep_assert_rq_held(rq);
+
+ WARN_ON_ONCE(!rq->core->core_forceidle_count);
+
+ if (rq->core->core_forceidle_start == 0)
+ return;
+
+ delta = now - rq->core->core_forceidle_start;
+ if (unlikely((s64)delta <= 0))
+ return;
+
+ rq->core->core_forceidle_start = now;
+
+ if (WARN_ON_ONCE(!rq->core->core_forceidle_occupation)) {
+ /* can't be forced idle without a running task */
+ } else if (rq->core->core_forceidle_count > 1 ||
+ rq->core->core_forceidle_occupation > 1) {
+ /*
+ * For larger SMT configurations, we need to scale the charged
+ * forced idle amount since there can be more than one forced
+ * idle sibling and more than one running cookied task.
+ */
+ delta *= rq->core->core_forceidle_count;
+ delta = div_u64(delta, rq->core->core_forceidle_occupation);
+ }
+
+ for_each_cpu(i, smt_mask) {
+ rq_i = cpu_rq(i);
+ p = rq_i->core_pick ?: rq_i->curr;
+
+ if (!p->core_cookie)
+ continue;
+
+ __schedstat_add(p->stats.core_forceidle_sum, delta);
+ }
+}
+
+void __sched_core_tick(struct rq *rq)
+{
+ if (!rq->core->core_forceidle_count)
+ return;
+
+ if (rq != rq->core)
+ update_rq_clock(rq->core);
+
+ __sched_core_account_forceidle(rq);
+}
+
+#endif /* CONFIG_SCHEDSTATS */
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 7dcbaa3..aa29211 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -1023,6 +1023,10 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,

__PN(avg_atom);
__PN(avg_per_cpu);
+
+#ifdef CONFIG_SCHED_CORE
+ PN_SCHEDSTAT(core_forceidle_sum);
+#endif
}

__P(nr_switches);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 6e476f6..884f29d 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -11068,7 +11068,7 @@ static inline void task_tick_core(struct rq *rq, struct task_struct *curr)
* MIN_NR_TASKS_DURING_FORCEIDLE - 1 tasks and use that to check
* if we need to give up the CPU.
*/
- if (rq->core->core_forceidle && rq->cfs.nr_running == 1 &&
+ if (rq->core->core_forceidle_count && rq->cfs.nr_running == 1 &&
__entity_slice_used(&curr->se, MIN_NR_TASKS_DURING_FORCEIDLE))
resched_curr(rq);
}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 0e66749..eb97115 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1111,8 +1111,10 @@ struct rq {
unsigned int core_task_seq;
unsigned int core_pick_seq;
unsigned long core_cookie;
- unsigned char core_forceidle;
+ unsigned int core_forceidle_count;
unsigned int core_forceidle_seq;
+ unsigned int core_forceidle_occupation;
+ u64 core_forceidle_start;
#endif
};

@@ -1253,7 +1255,7 @@ static inline bool sched_core_enqueued(struct task_struct *p)
}

extern void sched_core_enqueue(struct rq *rq, struct task_struct *p);
-extern void sched_core_dequeue(struct rq *rq, struct task_struct *p);
+extern void sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags);

extern void sched_core_get(void);
extern void sched_core_put(void);
@@ -1854,6 +1856,32 @@ static inline void flush_smp_call_function_from_idle(void) { }
#include "stats.h"
#include "autogroup.h"

+#if defined(CONFIG_SCHED_CORE) && defined(CONFIG_SCHEDSTATS)
+
+extern void __sched_core_account_forceidle(struct rq *rq);
+
+static inline void sched_core_account_forceidle(struct rq *rq)
+{
+ if (schedstat_enabled())
+ __sched_core_account_forceidle(rq);
+}
+
+extern void __sched_core_tick(struct rq *rq);
+
+static inline void sched_core_tick(struct rq *rq)
+{
+ if (sched_core_enabled(rq) && schedstat_enabled())
+ __sched_core_tick(rq);
+}
+
+#else
+
+static inline void sched_core_account_forceidle(struct rq *rq) {}
+
+static inline void sched_core_tick(struct rq *rq) {}
+
+#endif /* CONFIG_SCHED_CORE && CONFIG_SCHEDSTATS */
+
#ifdef CONFIG_CGROUP_SCHED

/*