Overview
~~~~~~~~
The cpusets subsystem allows a different set of CPUs to be assigned to each
cgroup. A typical use case is to split large systems into small CPU/memory
partitions and confine certain users/applications to these subsets of the
system.
Sometimes, to get a quick overview of the state of each partition, we may be
interested in the load average of the CPUs assigned to a particular cpuset,
rather than the global load average of the system.
Proposed solution
~~~~~~~~~~~~~~~~~
The proposal is to add a new file to the cpuset subsystem that reports the load
average of the CPUs assigned to a particular cpuset cgroup.
Example:
# echo 0-1 > /sys/fs/cgroup/cpuset/foo/cpuset.cpus
# echo 2-3 > /sys/fs/cgroup/cpuset/bar/cpuset.cpus
# echo $$ > /sys/fs/cgroup/cpuset/foo/tasks
# for i in `seq 4`; do yes > /dev/null & done
... after ~5mins ...
# cat /proc/loadavg /sys/fs/cgroup/cpuset/{foo,bar}/cpuset.loadavg
3.99 2.66 1.24 6/377 2855
3.98 2.64 1.20
0.01 0.02 0.04
In this case we can easily see that the cpuset "foo" is the busiest in the
system.
ChangeLog v1->v2:
- convert rq->nr_uninterruptible to a percpu variable
- fix nr_uninterruptible accounting in the wakeup/sleep paths
- use DEFINE_PER_CPU() instead of NR_CPUS arrays
- in patch 2/3 add a comment to explain the validity of evaluating the cpuset
load average as the sum of the individual per-cpu load averages
[ Thanks to Peter Z. for the review and suggestions of v1 ]
TODO:
- report nr_running and nr_threads in cpuset.loadavg, producing the same
  output as /proc/loadavg; this would allow nice tricks such as
  "mount --bind cpuset.loadavg /proc/loadavg" in a new mount namespace for a
  user that we want to confine to a specific cpuset cgroup, etc. (see the
  sketch below)
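A rough sketch of that bind-mount trick, written in C so it can set up its own
mount namespace (only an illustration: it assumes a cpuset "foo" already
exists, needs root/CAP_SYS_ADMIN, and only becomes really useful once
cpuset.loadavg grows the missing /proc/loadavg fields):

/*
 * Hypothetical illustration of the TODO item above; not part of the patch set.
 */
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <sys/mount.h>
#include <unistd.h>

int main(void)
{
	const char *src = "/sys/fs/cgroup/cpuset/foo/cpuset.loadavg";

	/* private mount namespace, so the bind mount stays local */
	if (unshare(CLONE_NEWNS) < 0) {
		perror("unshare");
		return 1;
	}
	/* keep mount events from propagating back to the parent namespace */
	if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, NULL) < 0) {
		perror("mount MS_PRIVATE");
		return 1;
	}
	/* overlay /proc/loadavg with the cpuset's view */
	if (mount(src, "/proc/loadavg", NULL, MS_BIND, NULL) < 0) {
		perror("mount MS_BIND");
		return 1;
	}
	/* spawn a shell that only sees the cpuset's load average */
	execl("/bin/sh", "sh", (char *)NULL);
	perror("execl");
	return 1;
}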
[PATCH v2 1/3] sched: introduce distinct per-cpu load average
[PATCH v2 2/3] cpusets: add load average interface
[PATCH v2 3/3] cpusets: add documentation of the loadavg file
Documentation/cgroups/cpusets.txt | 1 +
include/linux/sched.h | 6 ++
kernel/cpuset.c | 108 +++++++++++++++++++++++++++++++++++
kernel/sched/core.c | 112 +++++++++++++++++++++++++++++++------
kernel/sched/debug.c | 3 +-
kernel/sched/sched.h | 8 +--
6 files changed, 214 insertions(+), 24 deletions(-)
Account the load average, nr_running and nr_uninterruptible tasks per-cpu.
The new task_struct attribute on_cpu_uninterruptible is added to keep track
of the CPU at deactivate time, when the task is put into the uninterruptible
sleep state.
Moreover, rq->nr_uninterruptible is converted to a percpu variable to
maintain a coherent nr_uninterruptible counter for each CPU (rather than
having a single global counter defined as the sum over all CPUs). This
adds less performance overhead than introducing atomic operations in the
wakeup/sleep path.
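To illustrate the accounting scheme, here is a simplified user-space model
(just a sketch with made-up names, not kernel code): a sleep is charged to the
sleeping CPU's set of per-cpu slots, the wakeup is discharged from that same
set but on whatever CPU performs the wakeup, and only the sum over all slots
yields the real per-CPU count:

#include <stdio.h>

#define NCPUS 4

/* counter[sleep_cpu][updating_cpu] models rq->nr_uninterruptible's percpu slots */
static long counter[NCPUS][NCPUS];

/* deactivate_task(): a task goes to uninterruptible sleep on sleep_cpu */
static void task_sleeps(int sleep_cpu)
{
	counter[sleep_cpu][sleep_cpu]++;	/* __this_cpu_inc() on the sleeping CPU */
}

/* try_to_wake_up(): waker_cpu wakes a task that slept on sleep_cpu */
static void task_wakes(int sleep_cpu, int waker_cpu)
{
	counter[sleep_cpu][waker_cpu]--;	/* __this_cpu_dec() on the waking CPU */
}

/* nr_uninterruptible_cpu(): only the sum over all slots is meaningful */
static long nr_uninterruptible_cpu(int cpu)
{
	long sum = 0;

	for (int i = 0; i < NCPUS; i++)
		sum += counter[cpu][i];
	return sum;
}

int main(void)
{
	task_sleeps(0);		/* two tasks enter uninterruptible sleep on CPU 0 */
	task_sleeps(0);
	task_wakes(0, 3);	/* one of them is woken up by CPU 3 */

	/* prints 1: one task is still uninterruptible on CPU 0 */
	printf("nr_uninterruptible_cpu(0) = %ld\n", nr_uninterruptible_cpu(0));
	return 0;
}

Note that the model needs NCPUS x NCPUS counters; the real patch has the same
quadratic storage, one percpu variable per runqueue.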
This feature is required by the cpusets cgroup subsystem to report the
load average per-cpuset.
Signed-off-by: Andrea Righi <[email protected]>
---
include/linux/sched.h | 6 +++
kernel/sched/core.c | 112 ++++++++++++++++++++++++++++++++++++++++++-------
kernel/sched/debug.c | 3 +-
kernel/sched/sched.h | 8 +---
4 files changed, 105 insertions(+), 24 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 0dd42a0..e5dfe2a 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -80,6 +80,8 @@ struct blk_plug;
*/
extern unsigned long avenrun[]; /* Load averages */
extern void get_avenrun(unsigned long *loads, unsigned long offset, int shift);
+extern void get_cpu_avenrun(unsigned long *loads, int cpu,
+ unsigned long offset, int shift);
#define FSHIFT 11 /* nr of bits of precision */
#define FIXED_1 (1<<FSHIFT) /* 1.0 as fixed-point */
@@ -98,7 +100,9 @@ extern int nr_threads;
DECLARE_PER_CPU(unsigned long, process_counts);
extern int nr_processes(void);
extern unsigned long nr_running(void);
+extern unsigned long nr_running_cpu(int cpu);
extern unsigned long nr_uninterruptible(void);
+extern unsigned long nr_uninterruptible_cpu(int cpu);
extern unsigned long nr_iowait(void);
extern unsigned long nr_iowait_cpu(int cpu);
extern unsigned long this_cpu_load(void);
@@ -1197,6 +1201,8 @@ struct task_struct {
#ifdef CONFIG_SMP
struct llist_node wake_entry;
int on_cpu;
+ /* Used to keep track of nr_uninterruptible tasks per-cpu */
+ int on_cpu_uninterruptible;
#endif
int on_rq;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 2d8927f..a1487ee 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -726,16 +726,20 @@ static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
void activate_task(struct rq *rq, struct task_struct *p, int flags)
{
- if (task_contributes_to_load(p))
- rq->nr_uninterruptible--;
+ if (task_contributes_to_load(p)) {
+ struct rq *prev_rq = cpu_rq(p->on_cpu_uninterruptible);
+ __this_cpu_dec(*prev_rq->nr_uninterruptible);
+ }
enqueue_task(rq, p, flags);
}
void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
{
- if (task_contributes_to_load(p))
- rq->nr_uninterruptible++;
+ if (task_contributes_to_load(p)) {
+ __this_cpu_inc(*rq->nr_uninterruptible);
+ p->on_cpu_uninterruptible = cpu_of(rq);
+ }
dequeue_task(rq, p, flags);
}
@@ -1277,8 +1281,10 @@ static void
ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags)
{
#ifdef CONFIG_SMP
- if (p->sched_contributes_to_load)
- rq->nr_uninterruptible--;
+ if (p->sched_contributes_to_load) {
+ struct rq *prev_rq = cpu_rq(p->on_cpu_uninterruptible);
+ __this_cpu_dec(*prev_rq->nr_uninterruptible);
+ }
#endif
ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING);
@@ -1916,12 +1922,17 @@ unsigned long nr_running(void)
return sum;
}
+unsigned long nr_running_cpu(int cpu)
+{
+ return cpu_rq(cpu)->nr_running;
+}
+
unsigned long nr_uninterruptible(void)
{
unsigned long i, sum = 0;
for_each_possible_cpu(i)
- sum += cpu_rq(i)->nr_uninterruptible;
+ sum += nr_uninterruptible_cpu(i);
/*
* Since we read the counters lockless, it might be slightly
@@ -1933,6 +1944,18 @@ unsigned long nr_uninterruptible(void)
return sum;
}
+unsigned long nr_uninterruptible_cpu(int cpu)
+{
+ struct rq *this = cpu_rq(cpu);
+ unsigned long val = 0;
+ int i;
+
+ for_each_online_cpu(i)
+ val += per_cpu(*this->nr_uninterruptible, i);
+
+ return val;
+}
+
unsigned long long nr_context_switches(void)
{
int i;
@@ -1980,7 +2003,8 @@ unsigned long this_cpu_load(void)
*
* nr_active = 0;
* for_each_possible_cpu(cpu)
- * nr_active += cpu_of(cpu)->nr_running + cpu_of(cpu)->nr_uninterruptible;
+ * nr_active += cpu_of(cpu)->nr_running +
+ *              cpu_of(cpu)->nr_uninterruptible;
*
* avenrun[n] = avenrun[0] * exp_n + nr_active * (1 - exp_n)
*
@@ -2004,13 +2028,6 @@ unsigned long this_cpu_load(void)
* This places an upper-bound on the IRQ-off latency of the machine. Then
* again, being late doesn't loose the delta, just wrecks the sample.
*
- * - cpu_rq()->nr_uninterruptible isn't accurately tracked per-cpu because
- * this would add another cross-cpu cacheline miss and atomic operation
- * to the wakeup path. Instead we increment on whatever cpu the task ran
- * when it went into uninterruptible state and decrement on whatever cpu
- * did the wakeup. This means that only the sum of nr_uninterruptible over
- * all cpus yields the correct result.
- *
* This covers the NO_HZ=n code, for extra head-aches, see the comment below.
*/
@@ -2035,12 +2052,15 @@ void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
loads[2] = (avenrun[2] + offset) << shift;
}
+static DEFINE_PER_CPU(unsigned long [3], cpu_avenrun);
+
static long calc_load_fold_active(struct rq *this_rq)
{
long nr_active, delta = 0;
+ int cpu = cpu_of(this_rq);
nr_active = this_rq->nr_running;
- nr_active += (long) this_rq->nr_uninterruptible;
+ nr_active += (long) nr_uninterruptible_cpu(cpu);
if (nr_active != this_rq->calc_load_active) {
delta = nr_active - this_rq->calc_load_active;
@@ -2062,6 +2082,23 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active)
return load >> FSHIFT;
}
+static void calc_global_load_percpu(void)
+{
+ long active;
+ int cpu;
+
+ for_each_online_cpu(cpu) {
+ unsigned long *this_avenrun = per_cpu(cpu_avenrun, cpu);
+
+ active = cpu_rq(cpu)->calc_load_active;
+ active = active > 0 ? active * FIXED_1 : 0;
+
+ this_avenrun[0] = calc_load(this_avenrun[0], EXP_1, active);
+ this_avenrun[1] = calc_load(this_avenrun[1], EXP_5, active);
+ this_avenrun[2] = calc_load(this_avenrun[2], EXP_15, active);
+ }
+}
+
#ifdef CONFIG_NO_HZ
/*
* Handle NO_HZ for the global load-average.
@@ -2248,6 +2285,25 @@ calc_load_n(unsigned long load, unsigned long exp,
return calc_load(load, fixed_power_int(exp, FSHIFT, n), active);
}
+static void calc_global_load_n_percpu(unsigned int n)
+{
+ long active;
+ int cpu;
+
+ for_each_online_cpu(cpu) {
+ unsigned long *this_avenrun = per_cpu(cpu_avenrun, cpu);
+
+ active = cpu_rq(cpu)->calc_load_active;
+ active = active > 0 ? active * FIXED_1 : 0;
+
+ this_avenrun[0] = calc_load_n(this_avenrun[0],
+ EXP_1, active, n);
+ this_avenrun[1] = calc_load_n(this_avenrun[1],
+ EXP_5, active, n);
+ this_avenrun[2] = calc_load_n(this_avenrun[2],
+ EXP_15, active, n);
+ }
+}
/*
* NO_HZ can leave us missing all per-cpu ticks calling
* calc_load_account_active(), but since an idle CPU folds its delta into
@@ -2275,6 +2331,8 @@ static void calc_global_nohz(void)
avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
+ calc_global_load_n_percpu(n);
+
calc_load_update += n * LOAD_FREQ;
}
@@ -2320,6 +2378,8 @@ void calc_global_load(unsigned long ticks)
avenrun[1] = calc_load(avenrun[1], EXP_5, active);
avenrun[2] = calc_load(avenrun[2], EXP_15, active);
+ calc_global_load_percpu();
+
calc_load_update += LOAD_FREQ;
/*
@@ -2328,6 +2388,24 @@ void calc_global_load(unsigned long ticks)
calc_global_nohz();
}
+/**
+ * get_cpu_avenrun - get the load average array of a single cpu
+ * @loads: pointer to dest load array
+ * @cpu: the cpu to read the load average
+ * @offset: offset to add
+ * @shift: shift count to shift the result left
+ *
+ * These values are estimates at best, so no need for locking.
+ */
+void get_cpu_avenrun(unsigned long *loads, int cpu,
+ unsigned long offset, int shift)
+{
+ unsigned long *this_avenrun = per_cpu(cpu_avenrun, cpu);
+
+ loads[0] = (this_avenrun[0] + offset) << shift;
+ loads[1] = (this_avenrun[1] + offset) << shift;
+ loads[2] = (this_avenrun[2] + offset) << shift;
+}
/*
* Called from update_cpu_load() to periodically update this CPU's
* active count.
@@ -6873,6 +6951,8 @@ void __init sched_init(void)
#endif
init_rq_hrtick(rq);
atomic_set(&rq->nr_iowait, 0);
+ rq->nr_uninterruptible = alloc_percpu(unsigned long);
+ BUG_ON(!rq->nr_uninterruptible);
}
set_load_weight(&init_task);
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 6f79596..ac6c73f 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -276,7 +276,8 @@ do { \
rq->load.weight);
P(nr_switches);
P(nr_load_updates);
- P(nr_uninterruptible);
+ SEQ_printf(m, " .%-30s: %lu\n", "nr_uninterruptible",
+ nr_uninterruptible_cpu(cpu));
PN(next_balance);
P(curr->pid);
PN(clock);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 7a7db09..8a0d303 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -383,13 +383,7 @@ struct rq {
struct list_head leaf_rt_rq_list;
#endif
- /*
- * This is part of a global counter where only the total sum
- * over all CPUs matters. A task can increase this counter on
- * one CPU and if it got migrated afterwards it may decrease
- * it on another CPU. Always updated under the runqueue lock:
- */
- unsigned long nr_uninterruptible;
+ unsigned long __percpu *nr_uninterruptible;
struct task_struct *curr, *idle, *stop;
unsigned long next_balance;
--
1.7.10.4
Signed-off-by: Andrea Righi <[email protected]>
---
Documentation/cgroups/cpusets.txt | 1 +
1 file changed, 1 insertion(+)
diff --git a/Documentation/cgroups/cpusets.txt b/Documentation/cgroups/cpusets.txt
index cefd3d8..d5ddc36 100644
--- a/Documentation/cgroups/cpusets.txt
+++ b/Documentation/cgroups/cpusets.txt
@@ -179,6 +179,7 @@ files describing that cpuset:
- cpuset.memory_spread_slab flag: if set, spread slab cache evenly on allowed nodes
- cpuset.sched_load_balance flag: if set, load balance within CPUs on that cpuset
- cpuset.sched_relax_domain_level: the searching range when migrating tasks
+ - cpuset.loadavg: the load average of the CPUs in that cpuset
In addition, only the root cpuset has the following file:
- cpuset.memory_pressure_enabled flag: compute memory_pressure?
--
1.7.10.4
Add the new file cpuset.loadavg to report the load average of the CPUs
assigned to the cpuset cgroup.
The load average is reported using the typical three values as they
appear in /proc/loadavg, averaged over 1, 5 and 15 minutes.
Example:
# cat /sys/fs/cgroup/cpuset/foo/cpuset.loadavg
3.98 2.64 1.20
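For reference, the three values decay exactly like the global load average;
the following minimal user-space sketch (an illustration reusing the
well-known FSHIFT/EXP_* fixed-point constants, not the patched kernel code)
shows how a sustained load of 4 runnable tasks ramps up over 5 minutes:

#include <stdio.h>

#define FSHIFT	11			/* nr of bits of precision */
#define FIXED_1	(1 << FSHIFT)		/* 1.0 as fixed-point */
#define EXP_1	1884			/* 1/exp(5sec/1min) as fixed-point */
#define EXP_5	2014			/* 1/exp(5sec/5min) */
#define EXP_15	2037			/* 1/exp(5sec/15min) */

#define LOAD_INT(x)	((x) >> FSHIFT)
#define LOAD_FRAC(x)	LOAD_INT(((x) & (FIXED_1 - 1)) * 100)

/* same formula as the kernel's calc_load() */
static unsigned long calc_load(unsigned long load, unsigned long exp,
			       unsigned long active)
{
	load *= exp;
	load += active * (FIXED_1 - exp);
	return load >> FSHIFT;
}

int main(void)
{
	unsigned long avenrun[3] = { 0, 0, 0 };
	unsigned long active = 4 * FIXED_1;	/* 4 runnable tasks */

	/* 60 samples, one every 5 seconds: 5 minutes of sustained load */
	for (int i = 0; i < 60; i++) {
		avenrun[0] = calc_load(avenrun[0], EXP_1, active);
		avenrun[1] = calc_load(avenrun[1], EXP_5, active);
		avenrun[2] = calc_load(avenrun[2], EXP_15, active);
	}

	/* after 5 minutes of full load this prints roughly "3.9 2.5 1.1" */
	printf("%lu.%02lu %lu.%02lu %lu.%02lu\n",
	       LOAD_INT(avenrun[0]), LOAD_FRAC(avenrun[0]),
	       LOAD_INT(avenrun[1]), LOAD_FRAC(avenrun[1]),
	       LOAD_INT(avenrun[2]), LOAD_FRAC(avenrun[2]));
	return 0;
}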
Signed-off-by: Andrea Righi <[email protected]>
---
kernel/cpuset.c | 108 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 108 insertions(+)
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index f33c715..1bb10d1 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1465,6 +1465,7 @@ typedef enum {
FILE_MEMORY_PRESSURE,
FILE_SPREAD_PAGE,
FILE_SPREAD_SLAB,
+ FILE_LOADAVG,
} cpuset_filetype_t;
static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
@@ -1686,6 +1687,107 @@ static s64 cpuset_read_s64(struct cgroup *cont, struct cftype *cft)
return 0;
}
+/*
+ * XXX: move all of this to a better place and unify the duplicated
+ * definitions of these macros.
+ */
+#define LOAD_INT(x) ((x) >> FSHIFT)
+#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100)
+
+static void cpuset_show_loadavg(struct seq_file *m, const struct cpuset *cs)
+{
+ unsigned long avnrun[3] = {};
+ int cpu;
+
+ /*
+ * The global load average is an exponentially decaying average of:
+ *
+ * x(t) = nr_running(t) + nr_uninterruptible(t)
+ *
+ * The global load average of the system is evaluated as:
+ *
+ * load(t) = load(t - 1) * exp_k + x(t) * (1 - exp_k)
+ *
+ * So, the load average of a cpuset with N CPUS can be evaluated as:
+ *
+ * load_cs(t) = load_cs(t - 1) * exp_k + x_cs(t) * (1 - exp_k),
+ * x_cs(t) = \sum{i = 1}^{N} x_i(t)
+ *
+ * This is equivalent to the sum of all the partial load averages of
+ * each CPU assigned to the cpuset:
+ *
+ * load_cs(t) = \sum{i = 1}^{N} load_i(t)
+ *
+ * Proof:
+ *
+ * load_1(t) = load_1(t - 1) * exp_k + x_1(t) * (1 - exp_k)
+ * load_2(t) = load_2(t - 1) * exp_k + x_2(t) * (1 - exp_k)
+ * ...
+ * load_N(t) = load_N(t - 1) * exp_k + x_N(t) * (1 - exp_k)
+ *
+ * ===>
+ *
+ * load_1(t) = x_1(1) * (1 - exp_k) * exp_k^{t - 1} +
+ * x_1(2) * (1 - exp_k) * exp_k^{t - 2} +
+ * ... +
+ * x_1(t)(1 - exp_k)
+ * load_2(t) = x_2(1) * (1 - exp_k) * exp_k^{t - 1} +
+ * x_2(2) * (1 - exp_k) * exp_k^{t - 2} +
+ * ... +
+ * x_2(t)(1 - exp_k)
+ * ...
+ * load_N(t) = x_N(1) * (1 - exp_k) * exp_k^{t - 1} +
+ * x_N(2) * (1 - exp_k) * exp_k^{t - 2} +
+ * ... +
+ * x_N(t)(1 - exp_k)
+ *
+ * ===>
+ *
+ * load_1(t) + load_2(t) + ... + load_N(t) =
+ * \sum_{i = 1}^{N} x_i(1) * (1 - exp_k) * exp_k^{t - 1} +
+ * \sum_{i = 1}^{N} x_i(2) * (1 - exp_k) * exp_k^{t - 2} +
+ * ... +
+ * \sum_{i = 1}^{N} x_i(t) * (1 - exp_k) = load_cs(t)
+ */
+ for_each_cpu(cpu, cs->cpus_allowed) {
+ unsigned long cpu_avnrun[3];
+ int i;
+
+ get_cpu_avenrun(cpu_avnrun, cpu, FIXED_1/200, 0);
+
+ for (i = 0; i < ARRAY_SIZE(cpu_avnrun); i++)
+ avnrun[i] += cpu_avnrun[i];
+ }
+ /*
+ * TODO: also report nr_running/nr_threads and last_pid, producing the
+ * same output as /proc/loadavg.
+ *
+ * For nr_running we can just sum the nr_running_cpu() of the cores
+ * assigned to this cs; what should we report in nr_threads? maybe
+ * cgroup_task_count()? and what about last_pid?
+ */
+ seq_printf(m, "%lu.%02lu %lu.%02lu %lu.%02lu\n",
+ LOAD_INT(avnrun[0]), LOAD_FRAC(avnrun[0]),
+ LOAD_INT(avnrun[1]), LOAD_FRAC(avnrun[1]),
+ LOAD_INT(avnrun[2]), LOAD_FRAC(avnrun[2]));
+}
+
+static int cpuset_read_seq_string(struct cgroup *cont, struct cftype *cft,
+ struct seq_file *m)
+{
+ struct cpuset *cs = cgroup_cs(cont);
+ cpuset_filetype_t type = cft->private;
+
+ switch (type) {
+ case FILE_LOADAVG:
+ cpuset_show_loadavg(m, cs);
+ break;
+ default:
+ BUG();
+ }
+
+ return 0;
+}
/*
* for the common functions, 'private' gives the type of file
@@ -1780,6 +1882,12 @@ static struct cftype files[] = {
.private = FILE_MEMORY_PRESSURE_ENABLED,
},
+ {
+ .name = "loadavg",
+ .read_seq_string = cpuset_read_seq_string,
+ .private = FILE_LOADAVG,
+ },
+
{ } /* terminate */
};
--
1.7.10.4
On Sat, 2012-10-20 at 21:06 +0200, Andrea Righi wrote:
> @@ -383,13 +383,7 @@ struct rq {
> struct list_head leaf_rt_rq_list;
> #endif
>
> + unsigned long __percpu *nr_uninterruptible;
This is O(nr_cpus^2) memory..
> +unsigned long nr_uninterruptible_cpu(int cpu)
> +{
> + struct rq *this = cpu_rq(cpu);
> + unsigned long val = 0;
> + int i;
> +
> + for_each_online_cpu(i)
> + val += per_cpu(*this->nr_uninterruptible, i);
> +
> + return val;
> +}
>
>
I suspect you've got an accounting leak here on hot-plug.
>
> unsigned long nr_uninterruptible(void)
> {
> unsigned long i, sum = 0;
>
> for_each_possible_cpu(i)
> - sum += cpu_rq(i)->nr_uninterruptible;
> + sum += nr_uninterruptible_cpu(i);
>
> /*
> * Since we read the counters lockless, it might be slightly
And this makes O(n^2) runtime!
On Mon, Oct 22, 2012 at 01:10:40PM +0200, Peter Zijlstra wrote:
> On Sat, 2012-10-20 at 21:06 +0200, Andrea Righi wrote:
> > @@ -383,13 +383,7 @@ struct rq {
> > struct list_head leaf_rt_rq_list;
> > #endif
> >
>
> > + unsigned long __percpu *nr_uninterruptible;
>
> This is O(nr_cpus^2) memory..
>
Correct. This doesn't add much overhead to the wakeup/sleep path, but on
large SMP systems it is bad both in terms of memory footprint and in terms
of performance overhead in the other parts of the code.
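To put rough numbers on the memory side (a back-of-the-envelope sketch; it
assumes one unsigned long per slot and ignores percpu allocator overhead):

#include <stdio.h>

int main(void)
{
	/* one percpu counter per runqueue -> nr_cpus * nr_cpus slots */
	unsigned long ncpus[] = { 8, 64, 1024, 4096 };

	for (int i = 0; i < 4; i++) {
		unsigned long n = ncpus[i];
		unsigned long bytes = n * n * sizeof(unsigned long);

		printf("%4lu CPUs: %10lu bytes (%6lu KiB)\n",
		       n, bytes, bytes >> 10);
	}
	return 0;
}

On a 4096-CPU machine with 8-byte longs that is already 128 MiB spent on these
counters alone, before even counting the cross-CPU reads in
nr_uninterruptible_cpu().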
>
> > +unsigned long nr_uninterruptible_cpu(int cpu)
> > +{
> > + struct rq *this = cpu_rq(cpu);
> > + unsigned long val = 0;
> > + int i;
> > +
> > + for_each_online_cpu(i)
> > + val += per_cpu(*this->nr_uninterruptible, i);
> > +
> > + return val;
> > +}
> >
> >
> I suspect you've got an accounting leak here on hot-plug.
And I think you're right about the accounting leak with cpu hotplug.
I'll run more tests on this part until I come up with a better approach to
the nr_uninterruptible accounting in general.
Thanks!
-Andrea
> >
> > unsigned long nr_uninterruptible(void)
> > {
> > unsigned long i, sum = 0;
> >
> > for_each_possible_cpu(i)
> > - sum += cpu_rq(i)->nr_uninterruptible;
> > + sum += nr_uninterruptible_cpu(i);
> >
> > /*
> > * Since we read the counters lockless, it might be slightly
>
> And this makes O(n^2) runtime!
>
On Sat, Oct 20, 2012 at 10:05 PM, Andrea Righi <[email protected]> wrote:
> Overview
> ~~~~~~~~
> The cpusets subsystem allows a different set of CPUs to be assigned to each
> cgroup. A typical use case is to split large systems into small CPU/memory
> partitions and confine certain users/applications to these subsets of the
> system.
>
> Sometimes, to get a quick overview of the state of each partition, we may be
> interested in the load average of the CPUs assigned to a particular cpuset,
> rather than the global load average of the system.
>
> Proposed solution
> ~~~~~~~~~~~~~~~~~
> The proposal is to add a new file to the cpuset subsystem that reports the load
> average of the CPUs assigned to a particular cpuset cgroup.
Hi,
What is the status of this patch?
--
Ozan Çağlayan
Research Assistant
Galatasaray University - Computer Engineering Dept.
http://www.ozancaglayan.com