2021-12-23 12:30:26

by Cruz Zhao

Subject: [PATCH 1/2] sched/core: Cookied forceidle accounting per cpu

Add accounting for "forced idle" time per cpu, i.e. time during which
a cpu is forced to idle by a cookie'd task running on its SMT sibling.

Josh Don's patch 4feee7d12603 ("sched/core: Forced idle accounting")
provides one means to measure the cost of enabling core scheduling
from the perspective of the task; this patch provides another, from
the perspective of the cpu.

A few details:
- Cookied forceidle time is displayed via the last column of
/proc/stat; a userspace sketch for reading it follows below.
- Cookied forceidle time is only accounted when this cpu is forced
idle and a sibling hyperthread is running with a cookie'd task.
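
For illustration, a minimal userspace sketch for reading the new
column (assumes CONFIG_SCHED_CORE=y so the extra field is present,
and that it stays the last field on the aggregate "cpu" line; not
part of this patch):

#include <stdio.h>
#include <string.h>

int main(void)
{
        char line[512], *tok, *last = NULL;
        FILE *fp = fopen("/proc/stat", "r");

        if (!fp || !fgets(line, sizeof(line), fp))
                return 1;

        /* The cookied forceidle field is the last one on the line. */
        for (tok = strtok(line, " \n"); tok; tok = strtok(NULL, " \n"))
                last = tok;

        /* Like the other fields, the value is in USER_HZ ticks. */
        printf("cookied forceidle: %s ticks\n", last ? last : "?");
        fclose(fp);
        return 0;
}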

Signed-off-by: Cruz Zhao <[email protected]>
---
fs/proc/stat.c | 15 +++++++++++++++
include/linux/kernel_stat.h | 3 +++
kernel/sched/core.c | 4 ++--
kernel/sched/core_sched.c | 20 ++++++++++++++++++--
kernel/sched/sched.h | 10 ++--------
5 files changed, 40 insertions(+), 12 deletions(-)

diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index 4fb8729..3a2fbc9 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -109,6 +109,9 @@ static int show_stat(struct seq_file *p, void *v)
{
int i, j;
u64 user, nice, system, idle, iowait, irq, softirq, steal;
+#ifdef CONFIG_SCHED_CORE
+ u64 cookied_forceidle = 0;
+#endif
u64 guest, guest_nice;
u64 sum = 0;
u64 sum_softirq = 0;
@@ -140,6 +143,9 @@ static int show_stat(struct seq_file *p, void *v)
guest_nice += cpustat[CPUTIME_GUEST_NICE];
sum += kstat_cpu_irqs_sum(i);
sum += arch_irq_stat_cpu(i);
+#ifdef CONFIG_SCHED_CORE
+ cookied_forceidle += cpustat[CPUTIME_COOKIED_FORCEIDLE];
+#endif

for (j = 0; j < NR_SOFTIRQS; j++) {
unsigned int softirq_stat = kstat_softirqs_cpu(j, i);
@@ -160,6 +166,9 @@ static int show_stat(struct seq_file *p, void *v)
seq_put_decimal_ull(p, " ", nsec_to_clock_t(steal));
seq_put_decimal_ull(p, " ", nsec_to_clock_t(guest));
seq_put_decimal_ull(p, " ", nsec_to_clock_t(guest_nice));
+#ifdef CONFIG_SCHED_CORE
+ seq_put_decimal_ull(p, " ", nsec_to_clock_t(cookied_forceidle));
+#endif
seq_putc(p, '\n');

for_each_online_cpu(i) {
@@ -179,6 +188,9 @@ static int show_stat(struct seq_file *p, void *v)
steal = cpustat[CPUTIME_STEAL];
guest = cpustat[CPUTIME_GUEST];
guest_nice = cpustat[CPUTIME_GUEST_NICE];
+#ifdef CONFIG_SCHED_CORE
+ cookied_forceidle = cpustat[CPUTIME_COOKIED_FORCEIDLE];
+#endif
seq_printf(p, "cpu%d", i);
seq_put_decimal_ull(p, " ", nsec_to_clock_t(user));
seq_put_decimal_ull(p, " ", nsec_to_clock_t(nice));
@@ -190,6 +202,9 @@ static int show_stat(struct seq_file *p, void *v)
seq_put_decimal_ull(p, " ", nsec_to_clock_t(steal));
seq_put_decimal_ull(p, " ", nsec_to_clock_t(guest));
seq_put_decimal_ull(p, " ", nsec_to_clock_t(guest_nice));
+#ifdef CONFIG_SCHED_CORE
+ seq_put_decimal_ull(p, " ", nsec_to_clock_t(cookied_forceidle));
+#endif
seq_putc(p, '\n');
}
seq_put_decimal_ull(p, "intr ", (unsigned long long)sum);
diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h
index 69ae6b2..a21b065 100644
--- a/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
@@ -28,6 +28,9 @@ enum cpu_usage_stat {
CPUTIME_STEAL,
CPUTIME_GUEST,
CPUTIME_GUEST_NICE,
+#ifdef CONFIG_SCHED_CORE
+ CPUTIME_COOKIED_FORCEIDLE,
+#endif
NR_STATS,
};

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 956d699..f4f4b24 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5723,7 +5723,6 @@ static inline struct task_struct *pick_task(struct rq *rq)
need_sync = !!rq->core->core_cookie;

/* reset state */
- rq->core->core_cookie = 0UL;
if (rq->core->core_forceidle_count) {
if (!core_clock_updated) {
update_rq_clock(rq->core);
@@ -5737,6 +5736,7 @@ static inline struct task_struct *pick_task(struct rq *rq)
need_sync = true;
fi_before = true;
}
+ rq->core->core_cookie = 0UL;

/*
* core->core_task_seq, core->core_pick_seq, rq->core_sched_seq
@@ -5821,7 +5821,7 @@ static inline struct task_struct *pick_task(struct rq *rq)
}
}

- if (schedstat_enabled() && rq->core->core_forceidle_count) {
+ if (rq->core->core_forceidle_count) {
if (cookie)
rq->core->core_forceidle_start = rq_clock(rq->core);
rq->core->core_forceidle_occupation = occ;
diff --git a/kernel/sched/core_sched.c b/kernel/sched/core_sched.c
index 1fb4567..bc5f45f 100644
--- a/kernel/sched/core_sched.c
+++ b/kernel/sched/core_sched.c
@@ -239,13 +239,14 @@ int sched_core_share_pid(unsigned int cmd, pid_t pid, enum pid_type type,
#ifdef CONFIG_SCHEDSTATS

/* REQUIRES: rq->core's clock recently updated. */
-void __sched_core_account_forceidle(struct rq *rq)
+void sched_core_account_forceidle(struct rq *rq)
{
const struct cpumask *smt_mask = cpu_smt_mask(cpu_of(rq));
u64 delta, now = rq_clock(rq->core);
struct rq *rq_i;
struct task_struct *p;
int i;
+ u64 *cpustat;

lockdep_assert_rq_held(rq);

@@ -260,6 +261,21 @@ void __sched_core_account_forceidle(struct rq *rq)

rq->core->core_forceidle_start = now;

+ for_each_cpu(i, smt_mask) {
+ rq_i = cpu_rq(i);
+ p = rq_i->core_pick ?: rq_i->curr;
+
+ if (!rq->core->core_cookie)
+ continue;
+ if (p == rq_i->idle && rq_i->nr_running) {
+ cpustat = kcpustat_cpu(i).cpustat;
+ cpustat[CPUTIME_COOKIED_FORCEIDLE] += delta;
+ }
+ }
+
+ if (!schedstat_enabled())
+ return;
+
if (WARN_ON_ONCE(!rq->core->core_forceidle_occupation)) {
/* can't be forced idle without a running task */
} else if (rq->core->core_forceidle_count > 1 ||
@@ -292,7 +308,7 @@ void __sched_core_tick(struct rq *rq)
if (rq != rq->core)
update_rq_clock(rq->core);

- __sched_core_account_forceidle(rq);
+ sched_core_account_forceidle(rq);
}

#endif /* CONFIG_SCHEDSTATS */
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index de53be9..09cb1f2 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1858,19 +1858,13 @@ static inline void flush_smp_call_function_from_idle(void) { }

#if defined(CONFIG_SCHED_CORE) && defined(CONFIG_SCHEDSTATS)

-extern void __sched_core_account_forceidle(struct rq *rq);
-
-static inline void sched_core_account_forceidle(struct rq *rq)
-{
- if (schedstat_enabled())
- __sched_core_account_forceidle(rq);
-}
+extern void sched_core_account_forceidle(struct rq *rq);

extern void __sched_core_tick(struct rq *rq);

static inline void sched_core_tick(struct rq *rq)
{
- if (sched_core_enabled(rq) && schedstat_enabled())
+ if (sched_core_enabled(rq))
__sched_core_tick(rq);
}

--
1.8.3.1



2022-01-05 01:48:23

by Josh Don

Subject: Re: [PATCH 1/2] sched/core: Cookied forceidle accounting per cpu

Hi Cruz,

Could you add a bit more background to help me understand what case
this patch solves? Is your issue that you want to be able to
attribute forced idle time to the specific cpus it happens on, or do
you simply want a more convenient way of summing forced idle without
iterating your cookie'd tasks and summing the schedstat manually?

> @@ -190,6 +202,9 @@ static int show_stat(struct seq_file *p, void *v)
> seq_put_decimal_ull(p, " ", nsec_to_clock_t(steal));
> seq_put_decimal_ull(p, " ", nsec_to_clock_t(guest));
> seq_put_decimal_ull(p, " ", nsec_to_clock_t(guest_nice));
> +#ifdef CONFIG_SCHED_CORE
> + seq_put_decimal_ull(p, " ", nsec_to_clock_t(cookied_forceidle));
> +#endif

IMO it would be better to always print this stat, otherwise it sets a
weird precedent for new stats added in the future (much more difficult
for userspace to reason about which column corresponds with which
field, since it would depend on kernel config).
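
For example (field values illustrative only), a kernel with
CONFIG_SCHED_CORE=y would print an extra 11th field on each cpu line:

cpu0 4705 356 584 3699 23 23 0 0 0 0 127

while the same kernel built without it prints only the usual 10, so a
parser cannot tell from column position alone what it is reading.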

Also, did you intend to put this in /proc/stat instead of
/proc/schedstat (the latter of which would be more attractive to
prevent calculation of these stats unless schedstat was enabled)?

> diff --git a/kernel/sched/core_sched.c b/kernel/sched/core_sched.c
> @@ -260,6 +261,21 @@ void __sched_core_account_forceidle(struct rq *rq)
>
> rq->core->core_forceidle_start = now;
>
> + for_each_cpu(i, smt_mask) {
> + rq_i = cpu_rq(i);
> + p = rq_i->core_pick ?: rq_i->curr;
> +
> + if (!rq->core->core_cookie)
> + continue;

I see this is temporary given your other patch, but just a note that
if your other patch is dropped, this check can be pulled outside the
loop.

> + if (p == rq_i->idle && rq_i->nr_running) {
> + cpustat = kcpustat_cpu(i).cpustat;
> + cpustat[CPUTIME_COOKIED_FORCEIDLE] += delta;
> + }
> + }

I don't think this is right. If a cpu was simply idle while some other
SMT sibling on its core was forced idle, and then a task happens to
wake on the idle cpu, that cpu will now be charged the full delta here
as forced idle (when actually it was never forced idle, we just
haven't been through pick_next_task yet). One workaround would be to
add a boolean to struct rq to cache whether the rq was in forced idle
state.
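
A rough sketch of that workaround (the "core_forceidle" field name is
hypothetical, just to illustrate the idea):

/* In struct rq, under CONFIG_SCHED_CORE: */
unsigned int            core_forceidle; /* idle was forced at pick time */

/* In the core-wide pick, once rq_i's selection is known: */
rq_i->core_forceidle = (p == rq_i->idle && rq_i->nr_running);

/* The accounting loop then charges only cpus that were actually
 * forced idle, not cpus that happened to be idle on their own: */
for_each_cpu(i, smt_mask) {
        rq_i = cpu_rq(i);
        if (rq_i->core_forceidle) {
                cpustat = kcpustat_cpu(i).cpustat;
                cpustat[CPUTIME_COOKIED_FORCEIDLE] += delta;
        }
}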

Best,
Josh

2022-01-05 11:33:26

by Cruz Zhao

Subject: Re: [PATCH 1/2] sched/core: Cookied forceidle accounting per cpu



On 2022/1/5 9:48 AM, Josh Don wrote:
> Hi Cruz,
>
Firstly, attributing forced idle time to the specific cpus it happens
on helps us measure the effect of steal_cookie_task(). We found that
steal_cookie_task() sometimes conflicts with load balancing: for
example, a cookie'd task is stolen by steal_cookie_task(), but is
soon migrated to another core by load balancing. Secondly, a more
convenient way of summing forced idle time than iterating over
cookie'd tasks is indeed what we need. In a multi-tenant scenario,
maintaining the list of cookie'd tasks is complex and iterating over
it is expensive.

> Could you add a bit more background to help me understand what case
> this patch solves? Is your issue that you want to be able to
> attribute forced idle time to the specific cpus it happens on, or do
> you simply want a more convenient way of summing forced idle without
> iterating your cookie'd tasks and summing the schedstat manually?
>
>> @@ -190,6 +202,9 @@ static int show_stat(struct seq_file *p, void *v)
>> seq_put_decimal_ull(p, " ", nsec_to_clock_t(steal));
>> seq_put_decimal_ull(p, " ", nsec_to_clock_t(guest));
>> seq_put_decimal_ull(p, " ", nsec_to_clock_t(guest_nice));
>> +#ifdef CONFIG_SCHED_CORE
>> + seq_put_decimal_ull(p, " ", nsec_to_clock_t(cookied_forceidle));
>> +#endif
>
I'll move this into /proc/schedstat and fix the problem of plain idle
time being accounted as forced idle in the next version.

Many thanks for suggestions.

Best,
Cruz Zhao
> IMO it would be better to always print this stat, otherwise it sets a
> weird precedent for new stats added in the future (much more difficult
> for userspace to reason about which column corresponds with which
> field, since it would depend on kernel config).
>
> Also, did you intend to put this in /proc/stat instead of
> /proc/schedstat (the latter of which would be more attractive to
> prevent calculation of these stats unless schedstat was enabled)?
>
>> diff --git a/kernel/sched/core_sched.c b/kernel/sched/core_sched.c
>> @@ -260,6 +261,21 @@ void __sched_core_account_forceidle(struct rq *rq)
>>
>> rq->core->core_forceidle_start = now;
>>
>> + for_each_cpu(i, smt_mask) {
>> + rq_i = cpu_rq(i);
>> + p = rq_i->core_pick ?: rq_i->curr;
>> +
>> + if (!rq->core->core_cookie)
>> + continue;
>
> I see this is temporary given your other patch, but just a note that
> if your other patch is dropped, this check can be pulled outside the
> loop.
>
>> + if (p == rq_i->idle && rq_i->nr_running) {
>> + cpustat = kcpustat_cpu(i).cpustat;
>> + cpustat[CPUTIME_COOKIED_FORCEIDLE] += delta;
>> + }
>> + }
>
> I don't think this is right. If a cpu was simply idle while some other
> SMT sibling on its core was forced idle, and then a task happens to
> wake on the idle cpu, that cpu will now be charged the full delta here
> as forced idle (when actually it was never forced idle, we just
> haven't been through pick_next_task yet). One workaround would be to
> add a boolean to struct rq to cache whether the rq was in forced idle
> state.
>
> Best,
> Josh

2022-01-05 20:47:36

by Josh Don

Subject: Re: [PATCH 1/2] sched/core: Cookied forceidle accounting per cpu

On Wed, Jan 5, 2022 at 3:33 AM cruzzhao <[email protected]> wrote:
>
> Firstly, attributing forced idle time to the specific cpus it happens
> on helps us measure the effect of steal_cookie_task(). We found that
> steal_cookie_task() sometimes conflicts with load balancing: for
> example, a cookie'd task is stolen by steal_cookie_task(), but is
> soon migrated to another core by load balancing.

I don't see how this is very helpful for steal_cookie_task(), since it
isn't a targeted metric for that specific case. If you were interested
in that specifically, I'd think you'd want to look at more direct
metrics, such as task migration counts, or adding some
accounting/histogram for the time between steal and load balance away.
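
Very roughly, and purely illustrative (the field and helper below are
hypothetical):

/* In steal_cookie_task(), stamp the stolen task: */
p->core_steal_ts = rq_clock(this_rq());

/* In the load-balance migration path, histogram the gap: */
if (p->core_steal_ts) {
        account_steal_lb_gap(rq_clock(rq) - p->core_steal_ts);
        p->core_steal_ts = 0;
}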

> Secondly, a more convenient way of summing forced idle time than
> iterating over cookie'd tasks is indeed what we need. In a
> multi-tenant scenario, maintaining the list of cookie'd tasks is
> complex and iterating over it is expensive.

That motivation makes more sense to me. Have you considered
accumulating this at the cgroup level (ie. attributing it as another
type of usage)?

2022-01-06 12:09:56

by Cruz Zhao

Subject: Re: [PATCH 1/2] sched/core: Cookied forceidle accounting per cpu



On 2022/1/6 4:47 AM, Josh Don wrote:
> On Wed, Jan 5, 2022 at 3:33 AM cruzzhao <[email protected]> wrote:

>
> I don't see how this is very helpful for steal_cookie_task(), since it
> isn't a targeted metric for that specific case. If you were interested
> in that specifically, I'd think you'd want to look at more direct
> metrics, such as task migration counts, or adding some
> accounting/histogram for the time between steal and load balance away.
>

I've already read the patch "sched: CGroup tagging interface for core
scheduling", but it hasn't been merged into linux-next. IMO it's better
to do this at the cgroup level after the cgroup tagging interface is
introduced.

Best,
Cruz Zhao

>
> That motivation makes more sense to me. Have you considered
> accumulating this at the cgroup level (ie. attributing it as another
> type of usage)?

2022-01-06 19:49:49

by Josh Don

Subject: Re: [PATCH 1/2] sched/core: Cookied forceidle accounting per cpu

On Thu, Jan 6, 2022 at 4:10 AM cruzzhao <[email protected]> wrote:
>
> >
> > That motivation makes more sense to me. Have you considered
> > accumulating this at the cgroup level (ie. attributing it as another
> > type of usage)?
>
> I've already read the patch "sched: CGroup tagging interface for core
> scheduling", but it hasn't been merged into linux-next. IMO it's better
> to do this at the cgroup level after the cgroup tagging interface is
> introduced.
>
> Best,
> Cruz Zhao

There are no plans to introduce cgroup-level tagging for core sched,
but the accounting is a separate issue. Just as tasks account usage
both to themselves and to their cgroup hierarchy, we could account
forced idle the same way, adding another field to
cpu_extra_stat_show. That still gives you the total system forced
idle time by looking at the root cgroup, and allows you to slice the
accounting by different job groups. It also makes the accounting a
single value per cgroup rather than a per-cpu value (I still don't
see the value of attributing it to specific cpus, as I described in
my prior reply).
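
A sketch of what that could look like (the task_group field below is
hypothetical; cpu_extra_stat_show() is the existing hook behind the
cgroup cpu.stat file):

#ifdef CONFIG_SCHED_CORE
        seq_printf(sf, "core_sched.force_idle_usec %llu\n",
                   READ_ONCE(tg->forceidle_sum) / NSEC_PER_USEC);
#endif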