2022-01-07 23:42:33

by Josh Don

Subject: [PATCH 1/2] cgroup: add cpu.stat_percpu

cpu.stat displays aggregate metrics for a cgroup, such as cpu usage. It
would also be useful to be able to break these down by cpu; to that end,
this patch adds a new interface, 'cpu.stat_percpu', which displays the
per-cpu values of these stats.

Each line of the output corresponds to a particular metric: the name of
the metric, followed by space-delimited per-cpu values. The reason for
this layout (vs. having each line correspond to a particular cpu) is to
make it easier to display extra subsystem-specific per-cpu fields.
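
For illustration, on a hypothetical 4-cpu machine the output would look
something like the following (values made up):

  usage_usec 48231 59120 51384 62958
  user_usec 30124 41012 33291 44820
  system_usec 18107 18108 18093 18138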

Signed-off-by: Josh Don <[email protected]>
---
include/linux/cgroup-defs.h | 5 +
kernel/cgroup/cgroup-internal.h | 1 +
kernel/cgroup/cgroup.c | 10 ++
kernel/cgroup/rstat.c | 159 ++++++++++++++++++++++++++++----
4 files changed, 155 insertions(+), 20 deletions(-)

diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index db2e147e069f..7778a011f457 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -461,6 +461,11 @@ struct cgroup {
struct cgroup_base_stat bstat;
struct prev_cputime prev_cputime; /* for printing out cputime */

+ /* Per-cpu basic resource statistics. These are NULL on root. */
+ struct cgroup_base_stat __percpu *bstat_cpu;
+ struct cgroup_base_stat __percpu *last_bstat_cpu;
+ struct prev_cputime __percpu *prev_cputime_cpu;
+
/*
* list of pidlists, up to two for each namespace (one for procs, one
* for tasks); created on demand.
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
index bfbeabc17a9d..07e932c4f875 100644
--- a/kernel/cgroup/cgroup-internal.h
+++ b/kernel/cgroup/cgroup-internal.h
@@ -254,6 +254,7 @@ int cgroup_rstat_init(struct cgroup *cgrp);
void cgroup_rstat_exit(struct cgroup *cgrp);
void cgroup_rstat_boot(void);
void cgroup_base_stat_cputime_show(struct seq_file *seq);
+void cgroup_base_stat_percpu_cputime_show(struct seq_file *seq);

/*
* namespace.c
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 919194de39c8..4f5ddce529eb 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -3604,6 +3604,12 @@ static int cpu_stat_show(struct seq_file *seq, void *v)
return ret;
}

+static int cpu_stat_percpu_show(struct seq_file *seq, void *v)
+{
+ cgroup_base_stat_percpu_cputime_show(seq);
+ return 0;
+}
+
#ifdef CONFIG_PSI
static int cgroup_io_pressure_show(struct seq_file *seq, void *v)
{
@@ -5014,6 +5020,10 @@ static struct cftype cgroup_base_files[] = {
.name = "cpu.stat",
.seq_show = cpu_stat_show,
},
+ {
+ .name = "cpu.stat_percpu",
+ .seq_show = cpu_stat_percpu_show,
+ },
#ifdef CONFIG_PSI
{
.name = "io.pressure",
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index 1486768f2318..1af37333e5bf 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -253,7 +253,19 @@ int cgroup_rstat_init(struct cgroup *cgrp)
if (!cgrp->rstat_cpu) {
cgrp->rstat_cpu = alloc_percpu(struct cgroup_rstat_cpu);
if (!cgrp->rstat_cpu)
- return -ENOMEM;
+ goto error_nomem;
+
+ cgrp->last_bstat_cpu = alloc_percpu(struct cgroup_base_stat);
+ if (!cgrp->last_bstat_cpu)
+ goto error_nomem;
+
+ cgrp->bstat_cpu = alloc_percpu(struct cgroup_base_stat);
+ if (!cgrp->bstat_cpu)
+ goto error_nomem;
+
+ cgrp->prev_cputime_cpu = alloc_percpu(struct prev_cputime);
+ if (!cgrp->prev_cputime_cpu)
+ goto error_nomem;
}

/* ->updated_children list is self terminated */
@@ -265,6 +277,21 @@ int cgroup_rstat_init(struct cgroup *cgrp)
}

return 0;
+
+error_nomem:
+ free_percpu(cgrp->rstat_cpu);
+ cgrp->rstat_cpu = NULL;
+
+ free_percpu(cgrp->last_bstat_cpu);
+ cgrp->last_bstat_cpu = NULL;
+
+ free_percpu(cgrp->bstat_cpu);
+ cgrp->bstat_cpu = NULL;
+
+ free_percpu(cgrp->prev_cputime_cpu);
+ cgrp->prev_cputime_cpu = NULL;
+
+ return -ENOMEM;
}

void cgroup_rstat_exit(struct cgroup *cgrp)
@@ -284,6 +311,12 @@ void cgroup_rstat_exit(struct cgroup *cgrp)

free_percpu(cgrp->rstat_cpu);
cgrp->rstat_cpu = NULL;
+ free_percpu(cgrp->last_bstat_cpu);
+ cgrp->last_bstat_cpu = NULL;
+ free_percpu(cgrp->bstat_cpu);
+ cgrp->bstat_cpu = NULL;
+ free_percpu(cgrp->prev_cputime_cpu);
+ cgrp->prev_cputime_cpu = NULL;
}

void __init cgroup_rstat_boot(void)
@@ -319,22 +352,29 @@ static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu)
struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
struct cgroup *parent = cgroup_parent(cgrp);
struct cgroup_base_stat cur, delta;
+ struct cgroup_base_stat *bstat_cpu, *last_bstat_cpu;
unsigned seq;

/* Root-level stats are sourced from system-wide CPU stats */
if (!parent)
return;

+ /* these are not present on root */
+ bstat_cpu = per_cpu_ptr(cgrp->bstat_cpu, cpu);
+ last_bstat_cpu = per_cpu_ptr(cgrp->last_bstat_cpu, cpu);
+
/* fetch the current per-cpu values */
do {
seq = __u64_stats_fetch_begin(&rstatc->bsync);
cur.cputime = rstatc->bstat.cputime;
} while (__u64_stats_fetch_retry(&rstatc->bsync, seq));

+
/* propagate percpu delta to global */
delta = cur;
cgroup_base_stat_sub(&delta, &rstatc->last_bstat);
cgroup_base_stat_add(&cgrp->bstat, &delta);
+ cgroup_base_stat_add(bstat_cpu, &delta);
cgroup_base_stat_add(&rstatc->last_bstat, &delta);

/* propagate global delta to parent (unless that's root) */
@@ -343,6 +383,11 @@ static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu)
cgroup_base_stat_sub(&delta, &cgrp->last_bstat);
cgroup_base_stat_add(&parent->bstat, &delta);
cgroup_base_stat_add(&cgrp->last_bstat, &delta);
+
+ delta = *bstat_cpu;
+ cgroup_base_stat_sub(&delta, last_bstat_cpu);
+ cgroup_base_stat_add(per_cpu_ptr(parent->bstat_cpu, cpu), &delta);
+ cgroup_base_stat_add(last_bstat_cpu, &delta);
}
}

@@ -400,6 +445,30 @@ void __cgroup_account_cputime_field(struct cgroup *cgrp,
cgroup_base_stat_cputime_account_end(cgrp, rstatc, flags);
}

+/* See root_cgroup_cputime. Note that this does not first reset cputime. */
+static void root_cgroup_add_cputime_cpu(struct task_cputime *cputime, int cpu)
+{
+ struct kernel_cpustat kcpustat;
+ u64 *cpustat = kcpustat.cpustat;
+ u64 user = 0;
+ u64 sys = 0;
+
+ kcpustat_cpu_fetch(&kcpustat, cpu);
+
+ user += cpustat[CPUTIME_USER];
+ user += cpustat[CPUTIME_NICE];
+ cputime->utime += user;
+
+ sys += cpustat[CPUTIME_SYSTEM];
+ sys += cpustat[CPUTIME_IRQ];
+ sys += cpustat[CPUTIME_SOFTIRQ];
+ cputime->stime += sys;
+
+ cputime->sum_exec_runtime += user;
+ cputime->sum_exec_runtime += sys;
+ cputime->sum_exec_runtime += cpustat[CPUTIME_STEAL];
+}
+
/*
* compute the cputime for the root cgroup by getting the per cpu data
* at a global level, then categorizing the fields in a manner consistent
@@ -414,25 +483,7 @@ static void root_cgroup_cputime(struct task_cputime *cputime)
cputime->utime = 0;
cputime->sum_exec_runtime = 0;
for_each_possible_cpu(i) {
- struct kernel_cpustat kcpustat;
- u64 *cpustat = kcpustat.cpustat;
- u64 user = 0;
- u64 sys = 0;
-
- kcpustat_cpu_fetch(&kcpustat, i);
-
- user += cpustat[CPUTIME_USER];
- user += cpustat[CPUTIME_NICE];
- cputime->utime += user;
-
- sys += cpustat[CPUTIME_SYSTEM];
- sys += cpustat[CPUTIME_IRQ];
- sys += cpustat[CPUTIME_SOFTIRQ];
- cputime->stime += sys;
-
- cputime->sum_exec_runtime += user;
- cputime->sum_exec_runtime += sys;
- cputime->sum_exec_runtime += cpustat[CPUTIME_STEAL];
+ root_cgroup_add_cputime_cpu(cputime, i);
}
}

@@ -464,3 +515,71 @@ void cgroup_base_stat_cputime_show(struct seq_file *seq)
"system_usec %llu\n",
usage, utime, stime);
}
+
+void cgroup_base_stat_percpu_cputime_show(struct seq_file *seq)
+{
+ static DEFINE_MUTEX(mutex);
+ static DEFINE_PER_CPU(struct cgroup_base_stat, cached_percpu_stats);
+ struct cgroup_base_stat *cached_bstat;
+ struct cgroup *cgrp = seq_css(seq)->cgroup;
+ u64 val;
+ int cpu;
+
+ /* protects cached_percpu_stats */
+ mutex_lock(&mutex);
+
+ if (cgroup_parent(cgrp)) {
+ struct cgroup_base_stat *bstat_cpu;
+
+ cgroup_rstat_flush_hold(cgrp);
+
+ for_each_possible_cpu(cpu) {
+ bstat_cpu = per_cpu_ptr(cgrp->bstat_cpu, cpu);
+ cached_bstat = per_cpu_ptr(&cached_percpu_stats, cpu);
+
+ cached_bstat->cputime.sum_exec_runtime =
+ bstat_cpu->cputime.sum_exec_runtime;
+ cputime_adjust(&bstat_cpu->cputime,
+ per_cpu_ptr(cgrp->prev_cputime_cpu, cpu),
+ &cached_bstat->cputime.utime,
+ &cached_bstat->cputime.stime);
+ }
+
+ cgroup_rstat_flush_release();
+ } else {
+ for_each_possible_cpu(cpu) {
+ cached_bstat = per_cpu_ptr(&cached_percpu_stats, cpu);
+ memset(cached_bstat, 0, sizeof(*cached_bstat));
+ root_cgroup_add_cputime_cpu(&cached_bstat->cputime, cpu);
+ }
+ }
+
+ seq_puts(seq, "usage_usec");
+ for_each_possible_cpu(cpu) {
+ cached_bstat = per_cpu_ptr(&cached_percpu_stats, cpu);
+ val = cached_bstat->cputime.sum_exec_runtime;
+ do_div(val, NSEC_PER_USEC);
+ seq_printf(seq, " %llu", val);
+ }
+ seq_puts(seq, "\n");
+
+ seq_puts(seq, "user_usec");
+ for_each_possible_cpu(cpu) {
+ cached_bstat = per_cpu_ptr(&cached_percpu_stats, cpu);
+ val = cached_bstat->cputime.utime;
+ do_div(val, NSEC_PER_USEC);
+ seq_printf(seq, " %llu", val);
+ }
+ seq_puts(seq, "\n");
+
+ seq_puts(seq, "system_usec");
+ for_each_possible_cpu(cpu) {
+ cached_bstat = per_cpu_ptr(&cached_percpu_stats, cpu);
+ val = cached_bstat->cputime.stime;
+ do_div(val, NSEC_PER_USEC);
+ seq_printf(seq, " %llu", val);
+ }
+ seq_puts(seq, "\n");
+
+ mutex_unlock(&mutex);
+}
--
2.34.1.575.g55b058a8bb-goog



2022-01-07 23:42:51

by Josh Don

Subject: [PATCH 2/2] sched: show percpu throttled stats

Now that we have a cpu.stat_percpu interface, we can expose per-cpu
statistics through it. For the cpu controller, CFS bandwidth throttling
time is a useful metric to break down per-cpu.
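
As an illustrative example (values made up), on a 4-cpu machine this adds
a line like the following to cpu.stat_percpu:

  throttled_usec 0 1523 0 87421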

Signed-off-by: Josh Don <[email protected]>
---
include/linux/cgroup-defs.h | 3 ++-
kernel/cgroup/cgroup.c | 15 +++++++++++----
kernel/sched/core.c | 38 +++++++++++++++++++++++++++++++++++--
kernel/sched/fair.c | 5 ++++-
kernel/sched/sched.h | 1 +
5 files changed, 54 insertions(+), 8 deletions(-)

diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 7778a011f457..e6903a6e0a10 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -636,7 +636,8 @@ struct cgroup_subsys {
void (*css_reset)(struct cgroup_subsys_state *css);
void (*css_rstat_flush)(struct cgroup_subsys_state *css, int cpu);
int (*css_extra_stat_show)(struct seq_file *seq,
- struct cgroup_subsys_state *css);
+ struct cgroup_subsys_state *css,
+ bool percpu);

int (*can_attach)(struct cgroup_taskset *tset);
void (*cancel_attach)(struct cgroup_taskset *tset);
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 4f5ddce529eb..05d9aeb3ff48 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -3574,7 +3574,8 @@ static int cgroup_stat_show(struct seq_file *seq, void *v)
}

static int __maybe_unused cgroup_extra_stat_show(struct seq_file *seq,
- struct cgroup *cgrp, int ssid)
+ struct cgroup *cgrp, int ssid,
+ bool percpu)
{
struct cgroup_subsys *ss = cgroup_subsys[ssid];
struct cgroup_subsys_state *css;
@@ -3587,7 +3588,7 @@ static int __maybe_unused cgroup_extra_stat_show(struct seq_file *seq,
if (!css)
return 0;

- ret = ss->css_extra_stat_show(seq, css);
+ ret = ss->css_extra_stat_show(seq, css, percpu);
css_put(css);
return ret;
}
@@ -3599,15 +3600,21 @@ static int cpu_stat_show(struct seq_file *seq, void *v)

cgroup_base_stat_cputime_show(seq);
#ifdef CONFIG_CGROUP_SCHED
- ret = cgroup_extra_stat_show(seq, cgrp, cpu_cgrp_id);
+ ret = cgroup_extra_stat_show(seq, cgrp, cpu_cgrp_id, /*percpu=*/false);
#endif
return ret;
}

static int cpu_stat_percpu_show(struct seq_file *seq, void *v)
{
+ struct cgroup __maybe_unused *cgrp = seq_css(seq)->cgroup;
+ int ret = 0;
+
cgroup_base_stat_percpu_cputime_show(seq);
- return 0;
+#ifdef CONFIG_CGROUP_SCHED
+ ret = cgroup_extra_stat_show(seq, cgrp, cpu_cgrp_id, /*percpu=*/true);
+#endif
+ return ret;
}

#ifdef CONFIG_PSI
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 4604e80fac5e..8b383e58aaa2 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -10651,8 +10651,8 @@ static struct cftype cpu_legacy_files[] = {
{ } /* Terminate */
};

-static int cpu_extra_stat_show(struct seq_file *sf,
- struct cgroup_subsys_state *css)
+static void __cpu_extra_stat_show(struct seq_file *sf,
+ struct cgroup_subsys_state *css)
{
#ifdef CONFIG_CFS_BANDWIDTH
{
@@ -10674,6 +10674,40 @@ static int cpu_extra_stat_show(struct seq_file *sf,
throttled_usec, cfs_b->nr_burst, burst_usec);
}
#endif
+}
+
+static void __cpu_extra_stat_percpu_show(struct seq_file *sf,
+ struct cgroup_subsys_state *css)
+{
+#ifdef CONFIG_CFS_BANDWIDTH
+ {
+ struct task_group *tg = css_tg(css);
+ struct cfs_rq *cfs_rq;
+ u64 throttled_usec;
+ int cpu;
+
+ seq_puts(sf, "throttled_usec");
+ for_each_possible_cpu(cpu) {
+ cfs_rq = tg->cfs_rq[cpu];
+
+ throttled_usec = READ_ONCE(cfs_rq->throttled_time);
+ do_div(throttled_usec, NSEC_PER_USEC);
+
+ seq_printf(sf, " %llu", throttled_usec);
+ }
+ seq_puts(sf, "\n");
+ }
+#endif
+}
+
+static int cpu_extra_stat_show(struct seq_file *sf,
+ struct cgroup_subsys_state *css,
+ bool percpu)
+{
+ if (percpu)
+ __cpu_extra_stat_percpu_show(sf, css);
+ else
+ __cpu_extra_stat_show(sf, css);
return 0;
}

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index d4b463e015cd..2de0ce23ee99 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4878,6 +4878,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
struct sched_entity *se;
long task_delta, idle_task_delta;
+ u64 throttled_time;

se = cfs_rq->tg->se[cpu_of(rq)];

@@ -4886,7 +4887,9 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
update_rq_clock(rq);

raw_spin_lock(&cfs_b->lock);
- cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock;
+ throttled_time = rq_clock(rq) - cfs_rq->throttled_clock;
+ cfs_b->throttled_time += throttled_time;
+ cfs_rq->throttled_time += throttled_time;
list_del_rcu(&cfs_rq->throttled_list);
raw_spin_unlock(&cfs_b->lock);

diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index de53be905739..c1ac2b8d8dd5 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -620,6 +620,7 @@ struct cfs_rq {
u64 throttled_clock;
u64 throttled_clock_task;
u64 throttled_clock_task_time;
+ u64 throttled_time;
int throttled;
int throttle_count;
struct list_head throttled_list;
--
2.34.1.575.g55b058a8bb-goog


2022-01-11 12:50:20

by Peter Zijlstra

Subject: Re: [PATCH 1/2] cgroup: add cpu.stat_percpu

On Fri, Jan 07, 2022 at 03:41:37PM -0800, Josh Don wrote:

> + seq_puts(seq, "usage_usec");
> + for_each_possible_cpu(cpu) {
> + cached_bstat = per_cpu_ptr(&cached_percpu_stats, cpu);
> + val = cached_bstat->cputime.sum_exec_runtime;
> + do_div(val, NSEC_PER_USEC);
> + seq_printf(seq, " %llu", val);
> + }
> + seq_puts(seq, "\n");
> +
> + seq_puts(seq, "user_usec");
> + for_each_possible_cpu(cpu) {
> + cached_bstat = per_cpu_ptr(&cached_percpu_stats, cpu);
> + val = cached_bstat->cputime.utime;
> + do_div(val, NSEC_PER_USEC);
> + seq_printf(seq, " %llu", val);
> + }
> + seq_puts(seq, "\n");
> +
> + seq_puts(seq, "system_usec");
> + for_each_possible_cpu(cpu) {
> + cached_bstat = per_cpu_ptr(&cached_percpu_stats, cpu);
> + val = cached_bstat->cputime.stime;
> + do_div(val, NSEC_PER_USEC);
> + seq_printf(seq, " %llu", val);
> + }
> + seq_puts(seq, "\n");

This is an anti-pattern; given enough CPUs (easy) this will trivially
overflow the 1 page seq buffer.

People are already struggling to fix existing ABI, lets not make the
problem worse.

2022-01-11 23:38:38

by Josh Don

Subject: Re: [PATCH 1/2] cgroup: add cpu.stat_percpu

On Tue, Jan 11, 2022 at 4:50 AM Peter Zijlstra <[email protected]> wrote:
>
> On Fri, Jan 07, 2022 at 03:41:37PM -0800, Josh Don wrote:
>
> > + seq_puts(seq, "usage_usec");
> > + for_each_possible_cpu(cpu) {
> > + cached_bstat = per_cpu_ptr(&cached_percpu_stats, cpu);
> > + val = cached_bstat->cputime.sum_exec_runtime;
> > + do_div(val, NSEC_PER_USEC);
> > + seq_printf(seq, " %llu", val);
> > + }
> > + seq_puts(seq, "\n");
> > +
> > + seq_puts(seq, "user_usec");
> > + for_each_possible_cpu(cpu) {
> > + cached_bstat = per_cpu_ptr(&cached_percpu_stats, cpu);
> > + val = cached_bstat->cputime.utime;
> > + do_div(val, NSEC_PER_USEC);
> > + seq_printf(seq, " %llu", val);
> > + }
> > + seq_puts(seq, "\n");
> > +
> > + seq_puts(seq, "system_usec");
> > + for_each_possible_cpu(cpu) {
> > + cached_bstat = per_cpu_ptr(&cached_percpu_stats, cpu);
> > + val = cached_bstat->cputime.stime;
> > + do_div(val, NSEC_PER_USEC);
> > + seq_printf(seq, " %llu", val);
> > + }
> > + seq_puts(seq, "\n");
>
> This is an anti-pattern; given enough CPUs (easy) this will trivially
> overflow the 1 page seq buffer.
>
> People are already struggling to fix existing ABI, lets not make the
> problem worse.

Is the concern there just the extra overhead from making multiple
trips into this handler and re-allocating the buffer until it is large
enough to take all the output? In that case, we could pre-allocate
with a size of the right order of magnitude, similar to /proc/stat.

Lack of per-cpu stats is a gap between cgroup v1 and v2, for which v2
can easily support this interface given that it already tracks the
stats percpu internally. I opted to dump them all in a single file
here, to match the consolidation that occurred from cpuacct->cpu.stat.
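
For reference, /proc/stat pre-sizes its seq_file buffer in its open
handler; roughly (sketch from memory, not the exact fs/proc/stat.c code):

        static int stat_open(struct inode *inode, struct file *file)
        {
                /* estimate the buffer size up front so one pass suffices */
                unsigned int size = 1024 + 128 * num_online_cpus();

                return single_open_size(file, show_stat, NULL, size);
        }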

2022-01-12 08:30:50

by Peter Zijlstra

Subject: Re: [PATCH 1/2] cgroup: add cpu.stat_percpu

On Tue, Jan 11, 2022 at 03:38:20PM -0800, Josh Don wrote:
> On Tue, Jan 11, 2022 at 4:50 AM Peter Zijlstra <[email protected]> wrote:
> >
> > On Fri, Jan 07, 2022 at 03:41:37PM -0800, Josh Don wrote:
> >
> > > + seq_puts(seq, "usage_usec");
> > > + for_each_possible_cpu(cpu) {
> > > + cached_bstat = per_cpu_ptr(&cached_percpu_stats, cpu);
> > > + val = cached_bstat->cputime.sum_exec_runtime;
> > > + do_div(val, NSEC_PER_USEC);
> > > + seq_printf(seq, " %llu", val);
> > > + }
> > > + seq_puts(seq, "\n");
> > > +
> > > + seq_puts(seq, "user_usec");
> > > + for_each_possible_cpu(cpu) {
> > > + cached_bstat = per_cpu_ptr(&cached_percpu_stats, cpu);
> > > + val = cached_bstat->cputime.utime;
> > > + do_div(val, NSEC_PER_USEC);
> > > + seq_printf(seq, " %llu", val);
> > > + }
> > > + seq_puts(seq, "\n");
> > > +
> > > + seq_puts(seq, "system_usec");
> > > + for_each_possible_cpu(cpu) {
> > > + cached_bstat = per_cpu_ptr(&cached_percpu_stats, cpu);
> > > + val = cached_bstat->cputime.stime;
> > > + do_div(val, NSEC_PER_USEC);
> > > + seq_printf(seq, " %llu", val);
> > > + }
> > > + seq_puts(seq, "\n");
> >
> > This is an anti-pattern; given enough CPUs (easy) this will trivially
> > overflow the 1 page seq buffer.
> >
> > People are already struggling to fix existing ABI, lets not make the
> > problem worse.
>
> Is the concern there just the extra overhead from making multiple
> trips into this handler and re-allocating the buffer until it is large
> enough to take all the output? In that case, we could pre-allocate
> with a size of the right order of magnitude, similar to /proc/stat.
>
> Lack of per-cpu stats is a gap between cgroup v1 and v2, for which v2
> can easily support this interface given that it already tracks the
> stats percpu internally. I opted to dump them all in a single file
> here, to match the consolidation that occurred from cpuacct->cpu.stat.

Hmm.. fancy new stuff there :-) Yes, I think that would alleviate the
immediate problem. I suppose /proc/interrupts ought to get some of that
too.

Still, I'm not sure having so much data in a single file is wise. But
I've not really kept up with the discussions around this problem much.

2022-01-12 20:24:09

by Tejun Heo

Subject: Re: [PATCH 1/2] cgroup: add cpu.stat_percpu

Hello,

On Tue, Jan 11, 2022 at 03:38:20PM -0800, Josh Don wrote:
> Is the concern there just the extra overhead from making multiple
> trips into this handler and re-allocating the buffer until it is large
> enough to take all the output? In that case, we could pre-allocate
> with a size of the right order of magnitude, similar to /proc/stat.
>
> Lack of per-cpu stats is a gap between cgroup v1 and v2, for which v2
> can easily support this interface given that it already tracks the
> stats percpu internally. I opted to dump them all in a single file
> here, to match the consolidation that occurred from cpuacct->cpu.stat.

Yeah, nack on this. That part was dropped intentionally. These text pseudo
files aren't a great medium for this sort of (potentially large) data dump
and they scale really badly with new fields which we may want to expose in
the future. For detailed introspection, a better route would be using bpf
and if that's inconvenient for some reason trying to make them more
convenient.

Thanks.

--
tejun

2022-01-13 01:55:25

by Josh Don

Subject: Re: [PATCH 1/2] cgroup: add cpu.stat_percpu

Hi Tejun,

On Wed, Jan 12, 2022 at 12:24 PM Tejun Heo <[email protected]> wrote:
>
> Hello,
>
> On Tue, Jan 11, 2022 at 03:38:20PM -0800, Josh Don wrote:
> > Is the concern there just the extra overhead from making multiple
> > trips into this handler and re-allocating the buffer until it is large
> > enough to take all the output? In that case, we could pre-allocate
> > with a size of the right order of magnitude, similar to /proc/stat.
> >
> > Lack of per-cpu stats is a gap between cgroup v1 and v2, for which v2
> > can easily support this interface given that it already tracks the
> > stats percpu internally. I opted to dump them all in a single file
> > here, to match the consolidation that occurred from cpuacct->cpu.stat.
>
> Yeah, nack on this. That part was dropped intentionally. These text pseudo
> files aren't a great medium for this sort of (potentially large) data dump
> and they scale really badly with new fields which we may want to expose in
> the future. For detailed introspection, a better route would be using bpf
> and if that's inconvenient for some reason trying to make them more
> convenient.

Fair enough. Apart from needing to expose an rstat flush mechanism to
BPF, the main issues that come to mind are conveniently associating
with a particular cgroup, and allowing non-root to collect stats
independently. Hao's patch series
(https://lkml.kernel.org/r/%[email protected]%3E)
makes this particularly convenient, but that's a topic for a separate
conversation :)