2013-09-23 19:40:11

by Waiman Long

[permalink] [raw]
Subject: [PATCH 0/2] sched: Reduce contention on tg's load_avg & runnable_avg

These 2 patches try to reduce contention on the load_avg and
runnable_avg values of the task_group structure in large NUMA
machine by:

patch 1: Move them into their own cacheline & reduce actual read/write
to them.

patch 2: Reduce the frequecy of doing the average computation.

Waiman Long (2):
sched: reduce contention on tg's load_avg & runnable_avg
sched, numa: reduce load/runnable_avg computation frequency

init/Kconfig | 13 ++++++++
kernel/sched/core.c | 1 +
kernel/sched/fair.c | 84 +++++++++++++++++++++++++++++++++++++++++++++-----
kernel/sched/sched.h | 29 ++++++++++++++++-
4 files changed, 117 insertions(+), 10 deletions(-)


2013-09-23 19:40:16

by Waiman Long

[permalink] [raw]
Subject: [PATCH 2/2] sched, numa: reduce load/runnable_avg computation frequency

For large NUMA machine, there can be significant cacheline contention
due to the computation of the task_group's load_avg and runnable_avg
values, which should happen more or less simultaneously if the clock
ticks of the CPUs are synchronized. If they are not (like with the
highres=off clocksource=tsc kernel command-line options), it will not
be an issue at all. Having heavy cacheline contention reduces the CPU
cycles for other tasks. As a result, it may be desirable to reduce
the frequency of those computations. This trades less up-to-date
load data with slight better performance.

The frequency reduction is currently determined by the number of nodes
in the NUMA machine. An 8-node machine will have its load computation
frequency reduce to 1/8 of the original. The selection of cores for
load_avg computation are such that they are clustered around 2 nodes
each time to reduce inter-socket contention as much as possible for
each clock tick.

This frequency reduction feature is optional and is enabled by setting
the FGS_SPARSE_AVG configuration parameter. The table below shows the
performance difference of running kernbench (3.11 allmodconfig) with
this parameter enabled and disabled on a 3.11 kernel (with the tg load
reduction patch) on a 8-node 80-core DL980 with Westmere-EX processors:

FGS_SPARSE_AVG HT E.Time S.Time U.Time
-------------- -- ------ ------ ------
off off 149.3s 914.5s 6153.8s
on off 149.1s 910.6s 6111.8s
off on 140.0s 1313.3s 9944.3s
on on 138.2s 1316.7s 9907.1s

There are some slight decreases in the elapsed time as well as the
reported system and user times.

The follow perf profile shows the %CPU cycles spent in the scheduler
tick related functions with FGS_SPARSE_AVG off:

3.77% cc1 [kernel.kallsyms] [k] update_cfs_rq_blocked_load
1.50% cc1 [kernel.kallsyms] [k] update_curr
0.78% cc1 [kernel.kallsyms] [k] update_cfs_shares
0.11% cc1 [kernel.kallsyms] [k] entity_tick

With FGS_SPARSE_AVG on, the profile becomes:

0.84% cc1 [kernel.kallsyms] [k] update_cfs_rq_blocked_load
0.27% cc1 [kernel.kallsyms] [k] entity_tick
0.04% cc1 [kernel.kallsyms] [k] update_curr
0.07% cc1 [kernel.kallsyms] [k] update_cfs_shares

So there are significant reduction in the cpu time spent in those
functions.

Signed-off-by: Waiman Long <[email protected]>
---
init/Kconfig | 13 +++++++++++
kernel/sched/core.c | 1 +
kernel/sched/fair.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++++-
kernel/sched/sched.h | 23 ++++++++++++++++++++
4 files changed, 91 insertions(+), 1 deletions(-)

diff --git a/init/Kconfig b/init/Kconfig
index fed81b5..24ca9f1 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1033,6 +1033,19 @@ config RT_GROUP_SCHED
realtime bandwidth for them.
See Documentation/scheduler/sched-rt-group.txt for more information.

+config FGS_SPARSE_AVG
+ bool "Less frequent load computation to reduce overhead"
+ depends on FAIR_GROUP_SCHED
+ depends on NUMA
+ default n
+ help
+ This option reduces the frequency of doing average load computation
+ at clock tick time to reduce overhead and cacheline contention on a
+ NUMA systems. The frequency reduction factor is determined by the
+ number of nodes in the NUMA machine. It can slightly improve the
+ performance of a NUMA system at the expense of less up-to-date
+ average load value.
+
endif #CGROUP_SCHED

config BLK_CGROUP
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 05c39f0..55595d2 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6360,6 +6360,7 @@ void __init sched_init_smp(void)
free_cpumask_var(non_isolated_cpus);

init_sched_rt_class();
+ init_tg_period_mask(&root_task_group);
}
#else
void __init sched_init_smp(void)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 979f04d..f8f4abf 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1368,6 +1368,9 @@ static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
cfs_rq->tg_load_save =
atomic_long_add_return(tg_contrib, &tg->load_avg);
cfs_rq->tg_load_contrib += tg_contrib;
+#ifdef CONFIG_FGS_SPARSE_AVG
+ cfs_rq->tg_load_jiffies = jiffies;
+#endif
}
}

@@ -1390,6 +1393,9 @@ static inline void __update_tg_runnable_avg(struct sched_avg *sa,
cfs_rq->tg_runnable_save =
atomic_add_return(contrib, &tg->runnable_avg);
cfs_rq->tg_runnable_contrib += contrib;
+#ifdef CONFIG_FGS_SPARSE_AVG
+ cfs_rq->tg_runnable_jiffies = jiffies;
+#endif
}
}

@@ -1407,9 +1413,25 @@ static inline void __update_group_entity_contrib(struct sched_entity *se)
* Retrieve & clear the saved tg's load_avg and use it if not 0
*/
load_avg = cfs_rq->tg_load_save;
+#ifdef CONFIG_FGS_SPARSE_AVG
+ /*
+ * Reload the saved load_avg value if it expires
+ */
+ if (likely(load_avg)) {
+ unsigned long now = jiffies;
+
+ if (now - cfs_rq->tg_load_jiffies > tg->period_mask) {
+ load_avg = cfs_rq->tg_load_save
+ = atomic_long_read(&tg->load_avg);
+ cfs_rq->tg_load_jiffies = now;
+ }
+ } else
+ load_avg = atomic_long_read(&tg->load_avg);
+#else
cfs_rq->tg_load_save = 0;
if (unlikely(!load_avg))
load_avg = atomic_long_read(&tg->load_avg);
+#endif
se->avg.load_avg_contrib = div_u64(contrib, load_avg + 1);

/*
@@ -1436,9 +1458,25 @@ static inline void __update_group_entity_contrib(struct sched_entity *se)
* our upper bound of 1-cpu.
*/
runnable_avg = cfs_rq->tg_runnable_save;
+#ifdef CONFIG_FGS_SPARSE_AVG
+ /*
+ * Reload the saved runnable_avg value if it expires
+ */
+ if (likely(runnable_avg)) {
+ unsigned long now = jiffies;
+
+ if (now - cfs_rq->tg_runnable_jiffies > tg->period_mask) {
+ runnable_avg = cfs_rq->tg_runnable_save
+ = atomic_read(&tg->runnable_avg);
+ cfs_rq->tg_runnable_jiffies = now;
+ }
+ } else
+ runnable_avg = atomic_read(&tg->runnable_avg);
+#else
cfs_rq->tg_runnable_save = 0;
if (unlikely(!runnable_avg))
runnable_avg = atomic_read(&tg->runnable_avg);
+#endif
if (runnable_avg < NICE_0_LOAD) {
se->avg.load_avg_contrib *= runnable_avg;
se->avg.load_avg_contrib >>= NICE_0_SHIFT;
@@ -2037,6 +2075,20 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
static void
entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
{
+ int force_update = 1;
+
+#ifdef CONFIG_FGS_SPARSE_AVG
+ /*
+ * Force the update of load_avg only if it is its turn or the
+ * previous update was more than one period away.
+ */
+ unsigned long now = jiffies;
+ int period_mask = cfs_rq->tg->period_mask;
+
+ if (((now & period_mask) != numa_node_id()) &&
+ (now - cfs_rq->tg_load_jiffies <= period_mask + 1))
+ force_update = 0;
+#endif
/*
* Update run-time statistics of the 'current'.
*/
@@ -2045,7 +2097,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
/*
* Ensure that runnable average is periodically updated.
*/
- update_cfs_rq_blocked_load(cfs_rq, 1);
+ update_cfs_rq_blocked_load(cfs_rq, force_update);
update_cfs_shares(cfs_rq);
update_entity_load_avg(curr, 1);

@@ -6052,6 +6104,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
tg->shares = NICE_0_LOAD;

init_cfs_bandwidth(tg_cfs_bandwidth(tg));
+ init_tg_period_mask(tg);

for_each_possible_cpu(i) {
cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 255e340..76cbe26 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -147,6 +147,10 @@ struct task_group {
/* runqueue "owned" by this group on each cpu */
struct cfs_rq **cfs_rq;
unsigned long shares;
+#ifdef CONFIG_FGS_SPARSE_AVG
+ /* Mask for spreading average computation */
+ unsigned period_mask;
+#endif

#ifdef CONFIG_SMP
atomic_long_t load_avg ____cacheline_aligned;
@@ -287,6 +291,12 @@ struct cfs_rq {
int tg_runnable_save;
unsigned long tg_load_contrib;
long tg_load_save;
+#ifdef CONFIG_FGS_SPARSE_AVG
+ /* Time of last load_avg update */
+ unsigned long tg_load_jiffies;
+ /* Time of last runnable_avg update */
+ unsigned long tg_runnable_jiffies;
+#endif
#endif /* CONFIG_FAIR_GROUP_SCHED */

/*
@@ -1105,6 +1115,19 @@ static inline u64 sched_avg_period(void)
return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2;
}

+#ifdef CONFIG_FGS_SPARSE_AVG
+static inline void init_tg_period_mask(struct task_group *tg)
+{
+ /*
+ * Convert number of nodes into bitmask that cover all the nodes
+ * e.g. 8 => 0b1111, 7 =>0b1111, 4 => 0b111
+ */
+ tg->period_mask = (1 << fls(num_possible_nodes() - 1)) - 1;
+}
+#else
+static inline void init_tg_period_mask(struct task_group *tg) {}
+#endif /* CONFIG_FGS_SPARSE_AVG */
+
#ifdef CONFIG_SCHED_HRTICK

/*
--
1.7.1

2013-09-23 19:40:38

by Waiman Long

[permalink] [raw]
Subject: [PATCH 1/2] sched: reduce contention on tg's load_avg & runnable_avg

It was found that with a perf profile of a kernbench running on a
8-socket 80-core system (HT off) on a 3.11 kernel that the scheduler
accounts for a non-insignificant portion of the total cpu cycles.

4.58% cc1 [kernel.kallsyms] [k] entity_tick
2.41% cc1 [kernel.kallsyms] [k] update_cfs_rq_blocked_load
0.38% cc1 [kernel.kallsyms] [k] update_curr
0.25% cc1 [kernel.kallsyms] [k] update_cfs_shares

The scheduler accounts for about 7.6% of the total CPU cycles. Of
the top 2 function in the above list, the reading and writing of the
tg->load_avg variable account for over 90% of the CPU cycles:

atomic_long_add(tg_contrib, &tg->load_avg);
atomic_long_read(&tg->load_avg) + 1);

This patch reduces the contention on the load_avg variable (and
secondarily on the runnable_avg variable) by the following 2 measures:

1. Make the load_avg and runnable_avg fields of the task_group
structure sit in their own cacheline without sharing it with others.
2. Use atomic_long_add_return() to update the fields and save the
returned value in a temporary location in the cfs structure to
be used later instead of reading the fields directly.

The second change does require some changes in the ordering of how
some of the average counts are being computed and hence may have a
slight effect on their behavior.

With these 2 changes, the perf profile becomes:

3.77% cc1 [kernel.kallsyms] [k] update_cfs_rq_blocked_load
1.50% cc1 [kernel.kallsyms] [k] update_curr
0.78% cc1 [kernel.kallsyms] [k] update_cfs_shares
0.11% cc1 [kernel.kallsyms] [k] entity_tick

The % CPU cycle is reduced to about 6.2%. The kernbench elapsed time
was reduced from 151.5s to 149s.

Apparently, the %CPU cycle reduction was bigger with HT on:

1.08% cc1 [kernel.kallsyms] [k] update_cfs_rq_blocked_load
0.21% cc1 [kernel.kallsyms] [k] update_cfs_shares
0.16% cc1 [kernel.kallsyms] [k] entity_tick
0.09% cc1 [kernel.kallsyms] [k] update_curr

However, the actual elapsed time reduction was less (from 139.8s to
139s) in this case.

Signed-off-by: Waiman Long <[email protected]>
---
kernel/sched/fair.c | 29 ++++++++++++++++++++++-------
kernel/sched/sched.h | 6 ++++--
2 files changed, 26 insertions(+), 9 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 68f1609..979f04d 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1075,7 +1075,10 @@ static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)
* to gain a more accurate current total weight. See
* update_cfs_rq_load_contribution().
*/
- tg_weight = atomic_long_read(&tg->load_avg);
+ /* Use the saved version of tg's load_avg, if available */
+ tg_weight = cfs_rq->tg_load_save;
+ if (!tg_weight)
+ tg_weight = atomic_long_read(&tg->load_avg);
tg_weight -= cfs_rq->tg_load_contrib;
tg_weight += cfs_rq->load.weight;

@@ -1362,7 +1365,8 @@ static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
tg_contrib -= cfs_rq->tg_load_contrib;

if (force_update || abs(tg_contrib) > cfs_rq->tg_load_contrib / 8) {
- atomic_long_add(tg_contrib, &tg->load_avg);
+ cfs_rq->tg_load_save =
+ atomic_long_add_return(tg_contrib, &tg->load_avg);
cfs_rq->tg_load_contrib += tg_contrib;
}
}
@@ -1383,7 +1387,8 @@ static inline void __update_tg_runnable_avg(struct sched_avg *sa,
contrib -= cfs_rq->tg_runnable_contrib;

if (abs(contrib) > cfs_rq->tg_runnable_contrib / 64) {
- atomic_add(contrib, &tg->runnable_avg);
+ cfs_rq->tg_runnable_save =
+ atomic_add_return(contrib, &tg->runnable_avg);
cfs_rq->tg_runnable_contrib += contrib;
}
}
@@ -1393,12 +1398,19 @@ static inline void __update_group_entity_contrib(struct sched_entity *se)
struct cfs_rq *cfs_rq = group_cfs_rq(se);
struct task_group *tg = cfs_rq->tg;
int runnable_avg;
+ long load_avg;

u64 contrib;

contrib = cfs_rq->tg_load_contrib * tg->shares;
- se->avg.load_avg_contrib = div_u64(contrib,
- atomic_long_read(&tg->load_avg) + 1);
+ /*
+ * Retrieve & clear the saved tg's load_avg and use it if not 0
+ */
+ load_avg = cfs_rq->tg_load_save;
+ cfs_rq->tg_load_save = 0;
+ if (unlikely(!load_avg))
+ load_avg = atomic_long_read(&tg->load_avg);
+ se->avg.load_avg_contrib = div_u64(contrib, load_avg + 1);

/*
* For group entities we need to compute a correction term in the case
@@ -1423,7 +1435,10 @@ static inline void __update_group_entity_contrib(struct sched_entity *se)
* of consequential size guaranteed to see n_i*w_i quickly converge to
* our upper bound of 1-cpu.
*/
- runnable_avg = atomic_read(&tg->runnable_avg);
+ runnable_avg = cfs_rq->tg_runnable_save;
+ cfs_rq->tg_runnable_save = 0;
+ if (unlikely(!runnable_avg))
+ runnable_avg = atomic_read(&tg->runnable_avg);
if (runnable_avg < NICE_0_LOAD) {
se->avg.load_avg_contrib *= runnable_avg;
se->avg.load_avg_contrib >>= NICE_0_SHIFT;
@@ -2030,9 +2045,9 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
/*
* Ensure that runnable average is periodically updated.
*/
- update_entity_load_avg(curr, 1);
update_cfs_rq_blocked_load(cfs_rq, 1);
update_cfs_shares(cfs_rq);
+ update_entity_load_avg(curr, 1);

#ifdef CONFIG_SCHED_HRTICK
/*
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index ef0a7b2..255e340 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -149,8 +149,8 @@ struct task_group {
unsigned long shares;

#ifdef CONFIG_SMP
- atomic_long_t load_avg;
- atomic_t runnable_avg;
+ atomic_long_t load_avg ____cacheline_aligned;
+ atomic_t runnable_avg ____cacheline_aligned;
#endif
#endif

@@ -284,7 +284,9 @@ struct cfs_rq {
#ifdef CONFIG_FAIR_GROUP_SCHED
/* Required to track per-cpu representation of a task_group */
u32 tg_runnable_contrib;
+ int tg_runnable_save;
unsigned long tg_load_contrib;
+ long tg_load_save;
#endif /* CONFIG_FAIR_GROUP_SCHED */

/*
--
1.7.1