2011-02-16 03:20:32

by Paul Turner

[permalink] [raw]
Subject: [CFS Bandwidth Control v4 5/7] sched: add exports tracking cfs bandwidth control statistics

From: Nikhil Rao <[email protected]>

This change introduces statistics exports for the cpu sub-system, these are
added through the use of a stat file similar to that exported by other
subsystems.

The following exports are included:

nr_periods: number of periods in which execution occurred
nr_throttled: the number of periods above in which execution was throttled
throttled_time: cumulative wall-time that any cpus have been throttled for
this group

Signed-off-by: Paul Turner <[email protected]>
Signed-off-by: Nikhil Rao <[email protected]>
Signed-off-by: Bharata B Rao <[email protected]>
---
kernel/sched.c | 26 ++++++++++++++++++++++++++
kernel/sched_fair.c | 16 +++++++++++++++-
2 files changed, 41 insertions(+), 1 deletion(-)

Index: tip/kernel/sched.c
===================================================================
--- tip.orig/kernel/sched.c
+++ tip/kernel/sched.c
@@ -254,6 +254,11 @@ struct cfs_bandwidth {
ktime_t period;
u64 runtime, quota;
struct hrtimer period_timer;
+
+ /* throttle statistics */
+ u64 nr_periods;
+ u64 nr_throttled;
+ u64 throttled_time;
};
#endif

@@ -389,6 +394,7 @@ struct cfs_rq {
#ifdef CONFIG_CFS_BANDWIDTH
u64 quota_assigned, quota_used;
int throttled;
+ u64 throttled_timestamp;
#endif
#endif
};
@@ -426,6 +432,10 @@ void init_cfs_bandwidth(struct cfs_bandw

hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
cfs_b->period_timer.function = sched_cfs_period_timer;
+
+ cfs_b->nr_periods = 0;
+ cfs_b->nr_throttled = 0;
+ cfs_b->throttled_time = 0;
}

static
@@ -9332,6 +9342,18 @@ static int cpu_cfs_period_write_u64(stru
return tg_set_cfs_period(cgroup_tg(cgrp), cfs_period_us);
}

+static int cpu_stats_show(struct cgroup *cgrp, struct cftype *cft,
+ struct cgroup_map_cb *cb)
+{
+ struct task_group *tg = cgroup_tg(cgrp);
+ struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
+
+ cb->fill(cb, "nr_periods", cfs_b->nr_periods);
+ cb->fill(cb, "nr_throttled", cfs_b->nr_throttled);
+ cb->fill(cb, "throttled_time", cfs_b->throttled_time);
+
+ return 0;
+}
#endif /* CONFIG_CFS_BANDWIDTH */
#endif /* CONFIG_FAIR_GROUP_SCHED */

@@ -9378,6 +9400,10 @@ static struct cftype cpu_files[] = {
.read_u64 = cpu_cfs_period_read_u64,
.write_u64 = cpu_cfs_period_write_u64,
},
+ {
+ .name = "stat",
+ .read_map = cpu_stats_show,
+ },
#endif
#ifdef CONFIG_RT_GROUP_SCHED
{
Index: tip/kernel/sched_fair.c
===================================================================
--- tip.orig/kernel/sched_fair.c
+++ tip/kernel/sched_fair.c
@@ -1519,17 +1519,25 @@ static void throttle_cfs_rq(struct cfs_r

out_throttled:
cfs_rq->throttled = 1;
+ cfs_rq->throttled_timestamp = rq_of(cfs_rq)->clock;
update_cfs_rq_load_contribution(cfs_rq, 1);
}

static void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
{
struct rq *rq = rq_of(cfs_rq);
+ struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
struct sched_entity *se;

se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];

update_rq_clock(rq);
+ /* update stats */
+ raw_spin_lock(&cfs_b->lock);
+ cfs_b->throttled_time += (rq->clock - cfs_rq->throttled_timestamp);
+ raw_spin_unlock(&cfs_b->lock);
+ cfs_rq->throttled_timestamp = 0;
+
/* (Try to) avoid maintaining share statistics for idle time */
cfs_rq->load_stamp = cfs_rq->load_last = rq->clock_task;

@@ -1571,7 +1579,7 @@ static void account_cfs_rq_quota(struct

static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
{
- int i, idle = 1;
+ int i, idle = 1, num_throttled = 0;
u64 delta;
const struct cpumask *span;

@@ -1593,6 +1601,7 @@ static int do_sched_cfs_period_timer(str

if (!cfs_rq_throttled(cfs_rq))
continue;
+ num_throttled++;

delta = tg_request_cfs_quota(cfs_rq->tg);

@@ -1608,6 +1617,11 @@ static int do_sched_cfs_period_timer(str
}
}

+ /* update throttled stats */
+ cfs_b->nr_periods++;
+ if (num_throttled)
+ cfs_b->nr_throttled++;
+
return idle;
}



2011-02-22 03:14:29

by Balbir Singh

[permalink] [raw]
Subject: Re: [CFS Bandwidth Control v4 5/7] sched: add exports tracking cfs bandwidth control statistics

* Paul Turner <[email protected]> [2011-02-15 19:18:36]:

> From: Nikhil Rao <[email protected]>
>
> This change introduces statistics exports for the cpu sub-system, these are
> added through the use of a stat file similar to that exported by other
> subsystems.
>
> The following exports are included:
>
> nr_periods: number of periods in which execution occurred
> nr_throttled: the number of periods above in which execution was throttle
> throttled_time: cumulative wall-time that any cpus have been throttled for
> this group
>
> Signed-off-by: Paul Turner <[email protected]>
> Signed-off-by: Nikhil Rao <[email protected]>
> Signed-off-by: Bharata B Rao <[email protected]>
> ---
> kernel/sched.c | 26 ++++++++++++++++++++++++++
> kernel/sched_fair.c | 16 +++++++++++++++-
> 2 files changed, 41 insertions(+), 1 deletion(-)
>
> Index: tip/kernel/sched.c
> ===================================================================
> --- tip.orig/kernel/sched.c
> +++ tip/kernel/sched.c
> @@ -254,6 +254,11 @@ struct cfs_bandwidth {
> ktime_t period;
> u64 runtime, quota;
> struct hrtimer period_timer;
> +
> + /* throttle statistics */
> + u64 nr_periods;
> + u64 nr_throttled;
> + u64 throttled_time;
> };
> #endif
>
> @@ -389,6 +394,7 @@ struct cfs_rq {
> #ifdef CONFIG_CFS_BANDWIDTH
> u64 quota_assigned, quota_used;
> int throttled;
> + u64 throttled_timestamp;
> #endif
> #endif
> };
> @@ -426,6 +432,10 @@ void init_cfs_bandwidth(struct cfs_bandw
>
> hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
> cfs_b->period_timer.function = sched_cfs_period_timer;
> +
> + cfs_b->nr_periods = 0;
> + cfs_b->nr_throttled = 0;
> + cfs_b->throttled_time = 0;
> }
>
> static
> @@ -9332,6 +9342,18 @@ static int cpu_cfs_period_write_u64(stru
> return tg_set_cfs_period(cgroup_tg(cgrp), cfs_period_us);
> }
>
> +static int cpu_stats_show(struct cgroup *cgrp, struct cftype *cft,
> + struct cgroup_map_cb *cb)
> +{
> + struct task_group *tg = cgroup_tg(cgrp);
> + struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
> +
> + cb->fill(cb, "nr_periods", cfs_b->nr_periods);
> + cb->fill(cb, "nr_throttled", cfs_b->nr_throttled);
> + cb->fill(cb, "throttled_time", cfs_b->throttled_time);
> +
> + return 0;
> +}
> #endif /* CONFIG_CFS_BANDWIDTH */
> #endif /* CONFIG_FAIR_GROUP_SCHED */
>
> @@ -9378,6 +9400,10 @@ static struct cftype cpu_files[] = {
> .read_u64 = cpu_cfs_period_read_u64,
> .write_u64 = cpu_cfs_period_write_u64,
> },
> + {
> + .name = "stat",
> + .read_map = cpu_stats_show,
> + },
> #endif
> #ifdef CONFIG_RT_GROUP_SCHED
> {
> Index: tip/kernel/sched_fair.c
> ===================================================================
> --- tip.orig/kernel/sched_fair.c
> +++ tip/kernel/sched_fair.c
> @@ -1519,17 +1519,25 @@ static void throttle_cfs_rq(struct cfs_r
>
> out_throttled:
> cfs_rq->throttled = 1;
> + cfs_rq->throttled_timestamp = rq_of(cfs_rq)->clock;
> update_cfs_rq_load_contribution(cfs_rq, 1);
> }
>
> static void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
> {
> struct rq *rq = rq_of(cfs_rq);
> + struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
> struct sched_entity *se;
>
> se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
>
> update_rq_clock(rq);
> + /* update stats */
> + raw_spin_lock(&cfs_b->lock);
> + cfs_b->throttled_time += (rq->clock - cfs_rq->throttled_timestamp);
> + raw_spin_unlock(&cfs_b->lock);
> + cfs_rq->throttled_timestamp = 0;
> +
> /* (Try to) avoid maintaining share statistics for idle time */
> cfs_rq->load_stamp = cfs_rq->load_last = rq->clock_task;
>
> @@ -1571,7 +1579,7 @@ static void account_cfs_rq_quota(struct
>
> static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
> {
> - int i, idle = 1;
> + int i, idle = 1, num_throttled = 0;
> u64 delta;
> const struct cpumask *span;
>
> @@ -1593,6 +1601,7 @@ static int do_sched_cfs_period_timer(str
>
> if (!cfs_rq_throttled(cfs_rq))
> continue;
> + num_throttled++;
>
> delta = tg_request_cfs_quota(cfs_rq->tg);
>
> @@ -1608,6 +1617,11 @@ static int do_sched_cfs_period_timer(str
> }
> }
>
> + /* update throttled stats */
> + cfs_b->nr_periods++;
> + if (num_throttled)
> + cfs_b->nr_throttled++;
> +
> return idle;
> }
>

Should we consider integrating this into cpuacct? It would be difficult
if we spill stats over between controllers.

--
Three Cheers,
Balbir

2011-02-22 04:13:18

by Bharata B Rao

[permalink] [raw]
Subject: Re: [CFS Bandwidth Control v4 5/7] sched: add exports tracking cfs bandwidth control statistics

On Tue, Feb 22, 2011 at 08:44:20AM +0530, Balbir Singh wrote:
> * Paul Turner <[email protected]> [2011-02-15 19:18:36]:
>
> > From: Nikhil Rao <[email protected]>
> >
> > This change introduces statistics exports for the cpu sub-system, these are
> > added through the use of a stat file similar to that exported by other
> > subsystems.
> >
> > The following exports are included:
> >
> > nr_periods: number of periods in which execution occurred
> > nr_throttled: the number of periods above in which execution was throttle
> > throttled_time: cumulative wall-time that any cpus have been throttled for
> > this group
> >
> > Signed-off-by: Paul Turner <[email protected]>
> > Signed-off-by: Nikhil Rao <[email protected]>
> > Signed-off-by: Bharata B Rao <[email protected]>
> > ---
> > kernel/sched.c | 26 ++++++++++++++++++++++++++
> > kernel/sched_fair.c | 16 +++++++++++++++-
> > 2 files changed, 41 insertions(+), 1 deletion(-)
> >
> > Index: tip/kernel/sched.c
> > ===================================================================
> > --- tip.orig/kernel/sched.c
> > +++ tip/kernel/sched.c
> > @@ -254,6 +254,11 @@ struct cfs_bandwidth {
> > ktime_t period;
> > u64 runtime, quota;
> > struct hrtimer period_timer;
> > +
> > + /* throttle statistics */
> > + u64 nr_periods;
> > + u64 nr_throttled;
> > + u64 throttled_time;
> > };
> > #endif
> >
> > @@ -389,6 +394,7 @@ struct cfs_rq {
> > #ifdef CONFIG_CFS_BANDWIDTH
> > u64 quota_assigned, quota_used;
> > int throttled;
> > + u64 throttled_timestamp;
> > #endif
> > #endif
> > };
> > @@ -426,6 +432,10 @@ void init_cfs_bandwidth(struct cfs_bandw
> >
> > hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
> > cfs_b->period_timer.function = sched_cfs_period_timer;
> > +
> > + cfs_b->nr_periods = 0;
> > + cfs_b->nr_throttled = 0;
> > + cfs_b->throttled_time = 0;
> > }
> >
> > static
> > @@ -9332,6 +9342,18 @@ static int cpu_cfs_period_write_u64(stru
> > return tg_set_cfs_period(cgroup_tg(cgrp), cfs_period_us);
> > }
> >
> > +static int cpu_stats_show(struct cgroup *cgrp, struct cftype *cft,
> > + struct cgroup_map_cb *cb)
> > +{
> > + struct task_group *tg = cgroup_tg(cgrp);
> > + struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
> > +
> > + cb->fill(cb, "nr_periods", cfs_b->nr_periods);
> > + cb->fill(cb, "nr_throttled", cfs_b->nr_throttled);
> > + cb->fill(cb, "throttled_time", cfs_b->throttled_time);
> > +
> > + return 0;
> > +}
> > #endif /* CONFIG_CFS_BANDWIDTH */
> > #endif /* CONFIG_FAIR_GROUP_SCHED */
> >
> > @@ -9378,6 +9400,10 @@ static struct cftype cpu_files[] = {
> > .read_u64 = cpu_cfs_period_read_u64,
> > .write_u64 = cpu_cfs_period_write_u64,
> > },
> > + {
> > + .name = "stat",
> > + .read_map = cpu_stats_show,
> > + },
> > #endif
> > #ifdef CONFIG_RT_GROUP_SCHED
> > {
> > Index: tip/kernel/sched_fair.c
> > ===================================================================
> > --- tip.orig/kernel/sched_fair.c
> > +++ tip/kernel/sched_fair.c
> > @@ -1519,17 +1519,25 @@ static void throttle_cfs_rq(struct cfs_r
> >
> > out_throttled:
> > cfs_rq->throttled = 1;
> > + cfs_rq->throttled_timestamp = rq_of(cfs_rq)->clock;
> > update_cfs_rq_load_contribution(cfs_rq, 1);
> > }
> >
> > static void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
> > {
> > struct rq *rq = rq_of(cfs_rq);
> > + struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
> > struct sched_entity *se;
> >
> > se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
> >
> > update_rq_clock(rq);
> > + /* update stats */
> > + raw_spin_lock(&cfs_b->lock);
> > + cfs_b->throttled_time += (rq->clock - cfs_rq->throttled_timestamp);
> > + raw_spin_unlock(&cfs_b->lock);
> > + cfs_rq->throttled_timestamp = 0;
> > +
> > /* (Try to) avoid maintaining share statistics for idle time */
> > cfs_rq->load_stamp = cfs_rq->load_last = rq->clock_task;
> >
> > @@ -1571,7 +1579,7 @@ static void account_cfs_rq_quota(struct
> >
> > static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
> > {
> > - int i, idle = 1;
> > + int i, idle = 1, num_throttled = 0;
> > u64 delta;
> > const struct cpumask *span;
> >
> > @@ -1593,6 +1601,7 @@ static int do_sched_cfs_period_timer(str
> >
> > if (!cfs_rq_throttled(cfs_rq))
> > continue;
> > + num_throttled++;
> >
> > delta = tg_request_cfs_quota(cfs_rq->tg);
> >
> > @@ -1608,6 +1617,11 @@ static int do_sched_cfs_period_timer(str
> > }
> > }
> >
> > + /* update throttled stats */
> > + cfs_b->nr_periods++;
> > + if (num_throttled)
> > + cfs_b->nr_throttled++;
> > +
> > return idle;
> > }
> >
>
> Should we consider integrating this in cpuacct, it would be difficult
> if we spill over stats between controllers.

Given that cpuacct controller can be mounted independently, I am not sure
if we should integrate these stats. These stats come from cpu controller.

I initially had similar stats as part of /proc/sched_debug since there
are a bunch of other group specific stats (including rt throttle stats)
in /proc/sched_debug.

Regards,
Bharata.

2011-02-22 04:40:44

by Balbir Singh

[permalink] [raw]
Subject: Re: [CFS Bandwidth Control v4 5/7] sched: add exports tracking cfs bandwidth control statistics

* Bharata B Rao <[email protected]> [2011-02-22 09:43:33]:

> >
> > Should we consider integrating this in cpuacct, it would be difficult
> > if we spill over stats between controllers.
>
> Given that cpuacct controller can be mounted independently, I am not sure
> if we should integrate these stats. These stats come from cpu controller.

The accounting controller was created to account. I'd still prefer
cpuacct, so that I can find everything in one place. NOTE: cpuacct was
created so that we do accounting without control - just account. I think
splitting stats creates a usability mess - no?

--
Three Cheers,
Balbir

2011-02-23 08:04:16

by Paul Turner

[permalink] [raw]
Subject: Re: [CFS Bandwidth Control v4 5/7] sched: add exports tracking cfs bandwidth control statistics

On Mon, Feb 21, 2011 at 8:40 PM, Balbir Singh <[email protected]> wrote:
> * Bharata B Rao <[email protected]> [2011-02-22 09:43:33]:
>
>> >
>> > Should we consider integrating this in cpuacct, it would be difficult
>> > if we spill over stats between controllers.
>>
>> Given that cpuacct controller can be mounted independently, I am not sure
>> if we should integrate these stats. These stats come from cpu controller.
>
> The accounting controller was created to account. I'd still prefer
> cpuacct, so that I can find everything in one place. NOTE: cpuacct was
> created so that we do accounting with control - just account. I think
> splitting stats creates a usability mess - no?
>

One problem with rolling it into cpuacct is that some of the
statistics have a 1:1 association with the hierarchy being throttled.
For example, the number of periods in which throttling occurred or the
count of elapsed periods.

If it were rolled into cpuacct the only meaningful export would be the
total throttled time -- perhaps this is sufficient?

> --
>        Three Cheers,
>        Balbir
>

2011-02-23 10:14:01

by Balbir Singh

[permalink] [raw]
Subject: Re: [CFS Bandwidth Control v4 5/7] sched: add exports tracking cfs bandwidth control statistics

* Paul Turner <[email protected]> [2011-02-23 00:03:42]:

> On Mon, Feb 21, 2011 at 8:40 PM, Balbir Singh <[email protected]> wrote:
> > * Bharata B Rao <[email protected]> [2011-02-22 09:43:33]:
> >
> >> >
> >> > Should we consider integrating this in cpuacct, it would be difficult
> >> > if we spill over stats between controllers.
> >>
> >> Given that cpuacct controller can be mounted independently, I am not sure
> >> if we should integrate these stats. These stats come from cpu controller.
> >
> > The accounting controller was created to account. I'd still prefer
> > cpuacct, so that I can find everything in one place. NOTE: cpuacct was
> > created so that we do accounting with control - just account. I think
> > splitting stats creates a usability mess - no?
> >
>
> One problem with rolling it into cpuacct is that some of the
> statistics have a 1:1 association with the hierarchy being throttled.
> For example, the number of periods in which throttling occurred or the
> count of elapsed periods.
>
> If it were rolled into cpuacct the only meaningful export would be the
> total throttled time -- perhaps this is sufficient?
>

Good point, let's keep it as is. nr_throttled and nr_periods are also
important (although they can be derived).

--
Three Cheers,
Balbir

2011-02-23 13:32:57

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [CFS Bandwidth Control v4 5/7] sched: add exports tracking cfs bandwidth control statistics

On Tue, 2011-02-15 at 19:18 -0800, Paul Turner wrote:
> + raw_spin_lock(&cfs_b->lock);
> + cfs_b->throttled_time += (rq->clock - cfs_rq->throttled_timestamp);
> + raw_spin_unlock(&cfs_b->lock);

That seems to put the cost of things on the wrong side. Read is rare,
update is frequent, and you made the frequent thing the most expensive
one.

2011-02-25 03:26:47

by Paul Turner

[permalink] [raw]
Subject: Re: [CFS Bandwidth Control v4 5/7] sched: add exports tracking cfs bandwidth control statistics

On Wed, Feb 23, 2011 at 5:32 AM, Peter Zijlstra <[email protected]> wrote:
> On Tue, 2011-02-15 at 19:18 -0800, Paul Turner wrote:
>> +       raw_spin_lock(&cfs_b->lock);
>> +       cfs_b->throttled_time += (rq->clock - cfs_rq->throttled_timestamp);
>> +       raw_spin_unlock(&cfs_b->lock);
>
> That seems to put the cost of things on the wrong side. Read is rare,
> update is frequent, and you made the frequent thing the most expensive
> one.

Hum.. the trade-off here is non-trivial I think

- This update is only once per-quota period (*if* we throttled within
that period). This places the frequency in the 10s-100s of ms range.
- Sampling would probably occur on an order of once a second (assuming
some enterprise management system that cares about these statistics).

If we make the update cheaper by moving this per-cpu, then yes the
updates are cheaper but the reads now having per-cpu cost makes the
overall cost about the same (multiplying frequency by delta cost).

We could move the global accrual to an atomic, but this isn't any
cheaper given that this lock shouldn't be
contended.

>
>
>

2011-02-25 08:55:13

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [CFS Bandwidth Control v4 5/7] sched: add exports tracking cfs bandwidth control statistics

On Thu, 2011-02-24 at 19:26 -0800, Paul Turner wrote:
> On Wed, Feb 23, 2011 at 5:32 AM, Peter Zijlstra <[email protected]> wrote:
> > On Tue, 2011-02-15 at 19:18 -0800, Paul Turner wrote:
> >> + raw_spin_lock(&cfs_b->lock);
> >> + cfs_b->throttled_time += (rq->clock - cfs_rq->throttled_timestamp);
> >> + raw_spin_unlock(&cfs_b->lock);
> >
> > That seems to put the cost of things on the wrong side. Read is rare,
> > update is frequent, and you made the frequent thing the most expensive
> > one.
>
> Hum.. the trade-off here is non-trivial I think
>
> - This update is only once per-quota period (*if* we throttled within
> that period). This places the frequency in the 10s-100s of ms range.
> - Sampling would probably occur on an order of once a second (assuming
> some enterprise management system that cares about these statistics).

Ugh. People are really polling state like that? If the event is rare
pushing state is much saner.