2017-06-23 16:55:47

by Rik van Riel

Subject: [PATCH 0/4] NUMA improvements with task wakeup and load balancing

With these patches, and Peter Zijlstra's select_idle_sibling
scalability improvement, Jirka has seen these performance
gains on a 4.11 kernel:

NAS shows improvements in the 20-100% range
SPECjbb2005 shows improvements around 6-8% in single instance mode
SPECjvm2008 shows improvements around 10%

Unfortunately the full set of tests takes about a week to
run, so numbers are not broken out for individual patches.

We have done previous runs with other scheduler changes,
which did not work out - they showed improvements on some
workloads, and regressions on others.

4.11 performance still lags behind 3.10 for some workloads.
I am trying to figure out why, and close that gap.

Diffstat:

fair.c | 271 +++++++++++++++++++++--------------------------------------------
1 file changed, 88 insertions(+), 183 deletions(-)


2017-06-23 16:55:48

by Rik van Riel

Subject: [PATCH 3/4] sched,numa: implement numa node level wake_affine

From: Rik van Riel <[email protected]>

Since select_idle_sibling can place a task anywhere on a socket,
comparing loads between individual CPU cores makes no real sense
for deciding whether to do an affine wakeup across sockets, either.

Instead, compare the load between the sockets in a similar way the
load balancer and the numa balancing code do.

Signed-off-by: Rik van Riel <[email protected]>
---
kernel/sched/fair.c | 130 ++++++++++++++++++++++++++++------------------------
1 file changed, 71 insertions(+), 59 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 949de24e36bd..d03a21e6627d 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2590,6 +2590,60 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr)
}
}
}
+
+/*
+ * Can p be moved from prev_cpu to this_cpu without causing a load
+ * imbalance that would trigger the load balancer?
+ */
+static inline bool numa_wake_affine(struct sched_domain *sd,
+ struct task_struct *p, int this_cpu,
+ int prev_cpu, int sync)
+{
+ struct numa_stats prev_load, this_load;
+ s64 this_eff_load, prev_eff_load;
+
+ update_numa_stats(&prev_load, cpu_to_node(prev_cpu));
+ update_numa_stats(&this_load, cpu_to_node(this_cpu));
+
+ /*
+ * If sync wakeup then subtract the (maximum possible)
+ * effect of the currently running task from the load
+ * of the current CPU:
+ */
+ if (sync) {
+ unsigned long current_load = task_h_load(current);
+
+ if (this_load.load > current_load)
+ this_load.load -= current_load;
+ else
+ this_load.load = 0;
+ }
+
+ /*
+ * In low-load situations, where this_cpu's node is idle due to the
+ * sync cause above having dropped this_load.load to 0, move the task.
+ * Moving to an idle socket will not create a bad imbalance.
+ *
+ * Otherwise check if the nodes are near enough in load to allow this
+ * task to be woken on this_cpu's node.
+ */
+ if (this_load.load > 0) {
+ unsigned long task_load = task_h_load(p);
+
+ this_eff_load = 100;
+ this_eff_load *= prev_load.compute_capacity;
+
+ prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
+ prev_eff_load *= this_load.compute_capacity;
+
+ this_eff_load *= this_load.load + task_load;
+ prev_eff_load *= prev_load.load - task_load;
+
+ return this_eff_load <= prev_eff_load;
+ }
+
+ return true;
+}
#else
static void task_tick_numa(struct rq *rq, struct task_struct *curr)
{
@@ -2602,6 +2656,13 @@ static inline void account_numa_enqueue(struct rq *rq, struct task_struct *p)
static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p)
{
}
+
+static inline bool numa_wake_affine(struct sched_domain *sd,
+ struct task_struct *p, int this_cpu,
+ int prev_cpu, int sync)
+{
+ return true;
+}
#endif /* CONFIG_NUMA_BALANCING */

static void
@@ -5360,74 +5421,25 @@ static int wake_wide(struct task_struct *p)
static int wake_affine(struct sched_domain *sd, struct task_struct *p,
int prev_cpu, int sync)
{
- s64 this_load, load;
- s64 this_eff_load, prev_eff_load;
- int idx, this_cpu;
- struct task_group *tg;
- unsigned long weight;
- int balanced;
-
- idx = sd->wake_idx;
- this_cpu = smp_processor_id();
- load = source_load(prev_cpu, idx);
- this_load = target_load(this_cpu, idx);
+ int this_cpu = smp_processor_id();
+ bool affine = false;

/*
* Common case: CPUs are in the same socket, and select_idle_sibling
* will do its thing regardless of what we return.
*/
if (cpus_share_cache(prev_cpu, this_cpu))
- return true;
-
- /*
- * If sync wakeup then subtract the (maximum possible)
- * effect of the currently running task from the load
- * of the current CPU:
- */
- if (sync) {
- tg = task_group(current);
- weight = current->se.avg.load_avg;
-
- this_load += effective_load(tg, this_cpu, -weight, -weight);
- load += effective_load(tg, prev_cpu, 0, -weight);
- }
-
- tg = task_group(p);
- weight = p->se.avg.load_avg;
-
- /*
- * In low-load situations, where prev_cpu is idle and this_cpu is idle
- * due to the sync cause above having dropped this_load to 0, we'll
- * always have an imbalance, but there's really nothing you can do
- * about that, so that's good too.
- *
- * Otherwise check if either cpus are near enough in load to allow this
- * task to be woken on this_cpu.
- */
- this_eff_load = 100;
- this_eff_load *= capacity_of(prev_cpu);
-
- prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
- prev_eff_load *= capacity_of(this_cpu);
-
- if (this_load > 0) {
- this_eff_load *= this_load +
- effective_load(tg, this_cpu, weight, weight);
-
- prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight);
- }
-
- balanced = this_eff_load <= prev_eff_load;
+ affine = true;
+ else
+ affine = numa_wake_affine(sd, p, this_cpu, prev_cpu, sync);

schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts);
+ if (affine) {
+ schedstat_inc(sd->ttwu_move_affine);
+ schedstat_inc(p->se.statistics.nr_wakeups_affine);
+ }

- if (!balanced)
- return 0;
-
- schedstat_inc(sd->ttwu_move_affine);
- schedstat_inc(p->se.statistics.nr_wakeups_affine);
-
- return 1;
+ return affine;
}

static inline int task_util(struct task_struct *p);
--
2.9.4
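
For readers following the arithmetic in numa_wake_affine() above: the comparison
weighs each node's load by the other node's capacity, with an imbalance_pct
margin in favour of leaving the task where it is. Below is a minimal userspace
sketch of just that comparison, with made-up numbers; struct node_stats is an
illustrative stand-in for the two numa_stats fields used here, the sync-wakeup
adjustment is left out for brevity, and none of this is part of the patch.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Illustrative stand-in for the numa_stats fields used by the patch. */
struct node_stats {
	unsigned long load;		/* summed runnable load on the node */
	unsigned long compute_capacity;	/* summed CPU capacity on the node */
};

/*
 * Same shape as the comparison in numa_wake_affine(): allow the affine
 * wakeup if moving a task of weight task_load to the target node keeps
 * the two nodes within the imbalance_pct margin.  Assumes
 * prev.load >= task_load, as the patch does.
 */
static bool node_wake_affine(struct node_stats prev, struct node_stats target,
			     unsigned long task_load, int imbalance_pct)
{
	int64_t this_eff_load, prev_eff_load;

	if (target.load == 0)
		return true;	/* target node idle: moving cannot hurt */

	this_eff_load = 100;
	this_eff_load *= prev.compute_capacity;
	this_eff_load *= target.load + task_load;

	prev_eff_load = 100 + (imbalance_pct - 100) / 2;
	prev_eff_load *= target.compute_capacity;
	prev_eff_load *= prev.load - task_load;

	return this_eff_load <= prev_eff_load;
}

int main(void)
{
	/* Made-up numbers: two equal-capacity sockets, prev busier than target. */
	struct node_stats prev   = { .load = 3072, .compute_capacity = 4096 };
	struct node_stats target = { .load = 1024, .compute_capacity = 4096 };

	/* 125 is a typical imbalance_pct for a NUMA-level sched domain. */
	printf("affine wakeup allowed: %d\n",
	       node_wake_affine(prev, target, 512, 125));
	return 0;
}

With the numbers above the wakeup is allowed (prints 1); raise target.load to
around 3072 and the imbalance_pct margin tips the decision the other way.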

2017-06-23 16:55:46

by Rik van Riel

Subject: [PATCH 1/4] sched,numa: override part of migrate_degrades_locality when idle balancing

From: Rik van Riel <[email protected]>

Several tests in the NAS benchmark seem to run a lot slower with
NUMA balancing enabled, than with NUMA balancing disabled. The
slower run time corresponds with increased idle time.

Overriding the final test of migrate_degrades_locality (but still
doing the other NUMA tests first) seems to improve performance
of those benchmarks.

Reported-by: Jirka Hladky <[email protected]>
Signed-off-by: Rik van Riel <[email protected]>
Signed-off-by: Rik van Riel <[email protected]>
---
kernel/sched/fair.c | 4 ++++
1 file changed, 4 insertions(+)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 2a0e71034e36..2180c8591e16 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6635,6 +6635,10 @@ static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
if (dst_nid == p->numa_preferred_nid)
return 0;

+ /* Leaving a core idle is often worse than degrading locality. */
+ if (env->idle != CPU_NOT_IDLE)
+ return -1;
+
if (numa_group) {
src_faults = group_faults(p, src_nid);
dst_faults = group_faults(p, dst_nid);
--
2.9.4
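
For context on what returning -1 buys here: migrate_degrades_locality() feeds
the cache-hotness decision in can_migrate_task(). The sketch below is a
simplified paraphrase of that caller (from the 4.11-era fair.c, not a verbatim
quote); with the hunk above, an idle-CPU balance pass returns -1 before the
NUMA fault comparison, so the pull is judged only by the generic task_hot()
recency check instead of being vetoed on locality grounds.

/*
 * Paraphrased consumer of migrate_degrades_locality():
 *
 *    1  migration degrades NUMA locality  -> treat the task as cache hot
 *    0  migration improves / is neutral   -> allow the migration
 *   -1  no NUMA opinion                   -> fall back to task_hot()
 */
static bool migration_allowed(struct task_struct *p, struct lb_env *env)
{
	int tsk_cache_hot = migrate_degrades_locality(p, env);

	if (tsk_cache_hot == -1)
		tsk_cache_hot = task_hot(p, env);	/* generic recency check */

	/* hot tasks only move after repeated failed balance attempts */
	return tsk_cache_hot <= 0 ||
	       env->sd->nr_balance_failed > env->sd->cache_nice_tries;
}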

2017-06-23 16:55:44

by Rik van Riel

Subject: [PATCH 2/4] sched: simplify wake_affine for single socket case

From: Rik van Riel <[email protected]>

When this_cpu and prev_cpu are in the same socket, select_idle_sibling
will do its thing regardless of the return value of wake_affine. Just
return true and don't look at all the other things.

Signed-off-by: Rik van Riel <[email protected]>
---
kernel/sched/fair.c | 13 ++++++++++++-
1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 2180c8591e16..949de24e36bd 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5373,6 +5373,13 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p,
this_load = target_load(this_cpu, idx);

/*
+ * Common case: CPUs are in the same socket, and select_idle_sibling
+ * will do its thing regardless of what we return.
+ */
+ if (cpus_share_cache(prev_cpu, this_cpu))
+ return true;
+
+ /*
* If sync wakeup then subtract the (maximum possible)
* effect of the currently running task from the load
* of the current CPU:
@@ -5960,11 +5967,15 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f

if (affine_sd) {
sd = NULL; /* Prefer wake_affine over balance flags */
- if (cpu != prev_cpu && wake_affine(affine_sd, p, prev_cpu, sync))
+ if (cpu == prev_cpu)
+ goto pick_cpu;
+
+ if (wake_affine(affine_sd, p, prev_cpu, sync))
new_cpu = cpu;
}

if (!sd) {
+ pick_cpu:
if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */
new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);

--
2.9.4

2017-06-23 16:56:30

by Rik van Riel

Subject: [PATCH 4/4] sched,fair: remove effective_load

From: Rik van Riel <[email protected]>

The function effective_load was only used by the NUMA balancing
code, and not by the regular load balancing code. Now that the
NUMA balancing code no longer uses it either, get rid of it.

Signed-off-by: Rik van Riel <[email protected]>
---
kernel/sched/fair.c | 124 +---------------------------------------------------
1 file changed, 1 insertion(+), 123 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index d03a21e6627d..5d98836d9f73 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1387,7 +1387,6 @@ static unsigned long weighted_cpuload(const int cpu);
static unsigned long source_load(int cpu, int type);
static unsigned long target_load(int cpu, int type);
static unsigned long capacity_of(int cpu);
-static long effective_load(struct task_group *tg, int cpu, long wl, long wg);

/* Cached statistics for all CPUs within a node */
struct numa_stats {
@@ -3048,8 +3047,7 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa,
* differential update where we store the last value we propagated. This in
* turn allows skipping updates if the differential is 'small'.
*
- * Updating tg's load_avg is necessary before update_cfs_share() (which is
- * done) and effective_load() (which is not done because it is too costly).
+ * Updating tg's load_avg is necessary before update_cfs_share().
*/
static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
{
@@ -5251,126 +5249,6 @@ static unsigned long cpu_avg_load_per_task(int cpu)
return 0;
}

-#ifdef CONFIG_FAIR_GROUP_SCHED
-/*
- * effective_load() calculates the load change as seen from the root_task_group
- *
- * Adding load to a group doesn't make a group heavier, but can cause movement
- * of group shares between cpus. Assuming the shares were perfectly aligned one
- * can calculate the shift in shares.
- *
- * Calculate the effective load difference if @wl is added (subtracted) to @tg
- * on this @cpu and results in a total addition (subtraction) of @wg to the
- * total group weight.
- *
- * Given a runqueue weight distribution (rw_i) we can compute a shares
- * distribution (s_i) using:
- *
- * s_i = rw_i / \Sum rw_j (1)
- *
- * Suppose we have 4 CPUs and our @tg is a direct child of the root group and
- * has 7 equal weight tasks, distributed as below (rw_i), with the resulting
- * shares distribution (s_i):
- *
- * rw_i = { 2, 4, 1, 0 }
- * s_i = { 2/7, 4/7, 1/7, 0 }
- *
- * As per wake_affine() we're interested in the load of two CPUs (the CPU the
- * task used to run on and the CPU the waker is running on), we need to
- * compute the effect of waking a task on either CPU and, in case of a sync
- * wakeup, compute the effect of the current task going to sleep.
- *
- * So for a change of @wl to the local @cpu with an overall group weight change
- * of @wl we can compute the new shares distribution (s'_i) using:
- *
- * s'_i = (rw_i + @wl) / (@wg + \Sum rw_j) (2)
- *
- * Suppose we're interested in CPUs 0 and 1, and want to compute the load
- * differences in waking a task to CPU 0. The additional task changes the
- * weight and shares distributions like:
- *
- * rw'_i = { 3, 4, 1, 0 }
- * s'_i = { 3/8, 4/8, 1/8, 0 }
- *
- * We can then compute the difference in effective weight by using:
- *
- * dw_i = S * (s'_i - s_i) (3)
- *
- * Where 'S' is the group weight as seen by its parent.
- *
- * Therefore the effective change in loads on CPU 0 would be 5/56 (3/8 - 2/7)
- * times the weight of the group. The effect on CPU 1 would be -4/56 (4/8 -
- * 4/7) times the weight of the group.
- */
-static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
-{
- struct sched_entity *se = tg->se[cpu];
-
- if (!tg->parent) /* the trivial, non-cgroup case */
- return wl;
-
- for_each_sched_entity(se) {
- struct cfs_rq *cfs_rq = se->my_q;
- long W, w = cfs_rq_load_avg(cfs_rq);
-
- tg = cfs_rq->tg;
-
- /*
- * W = @wg + \Sum rw_j
- */
- W = wg + atomic_long_read(&tg->load_avg);
-
- /* Ensure \Sum rw_j >= rw_i */
- W -= cfs_rq->tg_load_avg_contrib;
- W += w;
-
- /*
- * w = rw_i + @wl
- */
- w += wl;
-
- /*
- * wl = S * s'_i; see (2)
- */
- if (W > 0 && w < W)
- wl = (w * (long)scale_load_down(tg->shares)) / W;
- else
- wl = scale_load_down(tg->shares);
-
- /*
- * Per the above, wl is the new se->load.weight value; since
- * those are clipped to [MIN_SHARES, ...) do so now. See
- * calc_cfs_shares().
- */
- if (wl < MIN_SHARES)
- wl = MIN_SHARES;
-
- /*
- * wl = dw_i = S * (s'_i - s_i); see (3)
- */
- wl -= se->avg.load_avg;
-
- /*
- * Recursively apply this logic to all parent groups to compute
- * the final effective load change on the root group. Since
- * only the @tg group gets extra weight, all parent groups can
- * only redistribute existing shares. @wl is the shift in shares
- * resulting from this level per the above.
- */
- wg = 0;
- }
-
- return wl;
-}
-#else
-
-static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
-{
- return wl;
-}
-
-#endif
-
static void record_wakee(struct task_struct *p)
{
/*
--
2.9.4

2017-06-24 06:58:35

by Ingo Molnar

Subject: Re: [PATCH 1/4] sched,numa: override part of migrate_degrades_locality when idle balancing


* [email protected] <[email protected]> wrote:

> From: Rik van Riel <[email protected]>
>
> Several tests in the NAS benchmark seem to run a lot slower with
> NUMA balancing enabled, than with NUMA balancing disabled. The
> slower run time corresponds with increased idle time.
>
> Overriding the final test of migrate_degrades_locality (but still
> doing the other NUMA tests first) seems to improve performance
> of those benchmarks.
>
> Reported-by: Jirka Hladky <[email protected]>
> Signed-off-by: Rik van Riel <[email protected]>
> Signed-off-by: Rik van Riel <[email protected]>

Note, I removed the first SOB from the commit, assuming that you only meant to
include the second one?

Thanks,

Ingo

Subject: [tip:sched/core] sched/numa: Override part of migrate_degrades_locality() when idle balancing

Commit-ID: 739294fb03f590401bbd7faa6d31a507e3ffada5
Gitweb: http://git.kernel.org/tip/739294fb03f590401bbd7faa6d31a507e3ffada5
Author: Rik van Riel <[email protected]>
AuthorDate: Fri, 23 Jun 2017 12:55:27 -0400
Committer: Ingo Molnar <[email protected]>
CommitDate: Sat, 24 Jun 2017 08:57:46 +0200

sched/numa: Override part of migrate_degrades_locality() when idle balancing

Several tests in the NAS benchmark seem to run a lot slower with
NUMA balancing enabled, than with NUMA balancing disabled. The
slower run time corresponds with increased idle time.

Overriding the final test of migrate_degrades_locality (but still
doing the other NUMA tests first) seems to improve performance
of those benchmarks.

Reported-by: Jirka Hladky <[email protected]>
Signed-off-by: Rik van Riel <[email protected]>
Cc: Linus Torvalds <[email protected]>
Cc: Mel Gorman <[email protected]>
Cc: Mike Galbraith <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: [email protected]
Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Ingo Molnar <[email protected]>
---
kernel/sched/fair.c | 4 ++++
1 file changed, 4 insertions(+)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 694c258..6e0c052 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6688,6 +6688,10 @@ static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
if (dst_nid == p->numa_preferred_nid)
return 0;

+ /* Leaving a core idle is often worse than degrading locality. */
+ if (env->idle != CPU_NOT_IDLE)
+ return -1;
+
if (numa_group) {
src_faults = group_faults(p, src_nid);
dst_faults = group_faults(p, dst_nid);

Subject: [tip:sched/core] sched/fair: Simplify wake_affine() for the single socket case

Commit-ID: 7d894e6e34a5cdd12309c7e4a3f830277ad4b7bf
Gitweb: http://git.kernel.org/tip/7d894e6e34a5cdd12309c7e4a3f830277ad4b7bf
Author: Rik van Riel <[email protected]>
AuthorDate: Fri, 23 Jun 2017 12:55:28 -0400
Committer: Ingo Molnar <[email protected]>
CommitDate: Sat, 24 Jun 2017 08:57:52 +0200

sched/fair: Simplify wake_affine() for the single socket case

When 'this_cpu' and 'prev_cpu' are in the same socket, select_idle_sibling()
will do its thing regardless of the return value of wake_affine().

Just return true and don't look at all the other things.

Signed-off-by: Rik van Riel <[email protected]>
Cc: Linus Torvalds <[email protected]>
Cc: Mel Gorman <[email protected]>
Cc: Mike Galbraith <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: [email protected]
Cc: [email protected]
Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Ingo Molnar <[email protected]>
---
kernel/sched/fair.c | 13 ++++++++++++-
1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 6e0c052..fe19016 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5420,6 +5420,13 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p,
this_load = target_load(this_cpu, idx);

/*
+ * Common case: CPUs are in the same socket, and select_idle_sibling()
+ * will do its thing regardless of what we return:
+ */
+ if (cpus_share_cache(prev_cpu, this_cpu))
+ return true;
+
+ /*
* If sync wakeup then subtract the (maximum possible)
* effect of the currently running task from the load
* of the current CPU:
@@ -6007,11 +6014,15 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f

if (affine_sd) {
sd = NULL; /* Prefer wake_affine over balance flags */
- if (cpu != prev_cpu && wake_affine(affine_sd, p, prev_cpu, sync))
+ if (cpu == prev_cpu)
+ goto pick_cpu;
+
+ if (wake_affine(affine_sd, p, prev_cpu, sync))
new_cpu = cpu;
}

if (!sd) {
+ pick_cpu:
if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */
new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);


Subject: [tip:sched/core] sched/numa: Implement NUMA node level wake_affine()

Commit-ID: 3fed382b46baac83703130fe4cd3d9147f427fb9
Gitweb: http://git.kernel.org/tip/3fed382b46baac83703130fe4cd3d9147f427fb9
Author: Rik van Riel <[email protected]>
AuthorDate: Fri, 23 Jun 2017 12:55:29 -0400
Committer: Ingo Molnar <[email protected]>
CommitDate: Sat, 24 Jun 2017 08:57:52 +0200

sched/numa: Implement NUMA node level wake_affine()

Since select_idle_sibling() can place a task anywhere on a socket,
comparing loads between individual CPU cores makes no real sense
for deciding whether to do an affine wakeup across sockets, either.

Instead, compare the load between the sockets in a similar way the
load balancer and the numa balancing code do.

Signed-off-by: Rik van Riel <[email protected]>
Cc: Linus Torvalds <[email protected]>
Cc: Mel Gorman <[email protected]>
Cc: Mike Galbraith <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: [email protected]
Cc: [email protected]
Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Ingo Molnar <[email protected]>
---
kernel/sched/fair.c | 130 ++++++++++++++++++++++++++++------------------------
1 file changed, 71 insertions(+), 59 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index fe19016..79ac078 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2586,6 +2586,60 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr)
}
}
}
+
+/*
+ * Can a task be moved from prev_cpu to this_cpu without causing a load
+ * imbalance that would trigger the load balancer?
+ */
+static inline bool numa_wake_affine(struct sched_domain *sd,
+ struct task_struct *p, int this_cpu,
+ int prev_cpu, int sync)
+{
+ struct numa_stats prev_load, this_load;
+ s64 this_eff_load, prev_eff_load;
+
+ update_numa_stats(&prev_load, cpu_to_node(prev_cpu));
+ update_numa_stats(&this_load, cpu_to_node(this_cpu));
+
+ /*
+ * If sync wakeup then subtract the (maximum possible)
+ * effect of the currently running task from the load
+ * of the current CPU:
+ */
+ if (sync) {
+ unsigned long current_load = task_h_load(current);
+
+ if (this_load.load > current_load)
+ this_load.load -= current_load;
+ else
+ this_load.load = 0;
+ }
+
+ /*
+ * In low-load situations, where this_cpu's node is idle due to the
+ * sync cause above having dropped this_load.load to 0, move the task.
+ * Moving to an idle socket will not create a bad imbalance.
+ *
+ * Otherwise check if the nodes are near enough in load to allow this
+ * task to be woken on this_cpu's node.
+ */
+ if (this_load.load > 0) {
+ unsigned long task_load = task_h_load(p);
+
+ this_eff_load = 100;
+ this_eff_load *= prev_load.compute_capacity;
+
+ prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
+ prev_eff_load *= this_load.compute_capacity;
+
+ this_eff_load *= this_load.load + task_load;
+ prev_eff_load *= prev_load.load - task_load;
+
+ return this_eff_load <= prev_eff_load;
+ }
+
+ return true;
+}
#else
static void task_tick_numa(struct rq *rq, struct task_struct *curr)
{
@@ -2598,6 +2652,13 @@ static inline void account_numa_enqueue(struct rq *rq, struct task_struct *p)
static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p)
{
}
+
+static inline bool numa_wake_affine(struct sched_domain *sd,
+ struct task_struct *p, int this_cpu,
+ int prev_cpu, int sync)
+{
+ return true;
+}
#endif /* CONFIG_NUMA_BALANCING */

static void
@@ -5407,74 +5468,25 @@ static int wake_wide(struct task_struct *p)
static int wake_affine(struct sched_domain *sd, struct task_struct *p,
int prev_cpu, int sync)
{
- s64 this_load, load;
- s64 this_eff_load, prev_eff_load;
- int idx, this_cpu;
- struct task_group *tg;
- unsigned long weight;
- int balanced;
-
- idx = sd->wake_idx;
- this_cpu = smp_processor_id();
- load = source_load(prev_cpu, idx);
- this_load = target_load(this_cpu, idx);
+ int this_cpu = smp_processor_id();
+ bool affine = false;

/*
* Common case: CPUs are in the same socket, and select_idle_sibling()
* will do its thing regardless of what we return:
*/
if (cpus_share_cache(prev_cpu, this_cpu))
- return true;
-
- /*
- * If sync wakeup then subtract the (maximum possible)
- * effect of the currently running task from the load
- * of the current CPU:
- */
- if (sync) {
- tg = task_group(current);
- weight = current->se.avg.load_avg;
-
- this_load += effective_load(tg, this_cpu, -weight, -weight);
- load += effective_load(tg, prev_cpu, 0, -weight);
- }
-
- tg = task_group(p);
- weight = p->se.avg.load_avg;
-
- /*
- * In low-load situations, where prev_cpu is idle and this_cpu is idle
- * due to the sync cause above having dropped this_load to 0, we'll
- * always have an imbalance, but there's really nothing you can do
- * about that, so that's good too.
- *
- * Otherwise check if either cpus are near enough in load to allow this
- * task to be woken on this_cpu.
- */
- this_eff_load = 100;
- this_eff_load *= capacity_of(prev_cpu);
-
- prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
- prev_eff_load *= capacity_of(this_cpu);
-
- if (this_load > 0) {
- this_eff_load *= this_load +
- effective_load(tg, this_cpu, weight, weight);
-
- prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight);
- }
-
- balanced = this_eff_load <= prev_eff_load;
+ affine = true;
+ else
+ affine = numa_wake_affine(sd, p, this_cpu, prev_cpu, sync);

schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts);
+ if (affine) {
+ schedstat_inc(sd->ttwu_move_affine);
+ schedstat_inc(p->se.statistics.nr_wakeups_affine);
+ }

- if (!balanced)
- return 0;
-
- schedstat_inc(sd->ttwu_move_affine);
- schedstat_inc(p->se.statistics.nr_wakeups_affine);
-
- return 1;
+ return affine;
}

static inline int task_util(struct task_struct *p);

Subject: [tip:sched/core] sched/fair: Remove effective_load()

Commit-ID: 815abf5af45f04f759f12f3172afd15226fd7f71
Gitweb: http://git.kernel.org/tip/815abf5af45f04f759f12f3172afd15226fd7f71
Author: Rik van Riel <[email protected]>
AuthorDate: Fri, 23 Jun 2017 12:55:30 -0400
Committer: Ingo Molnar <[email protected]>
CommitDate: Sat, 24 Jun 2017 08:57:53 +0200

sched/fair: Remove effective_load()

The effective_load() function was only used by the NUMA balancing
code, and not by the regular load balancing code. Now that the
NUMA balancing code no longer uses it either, get rid of it.

Signed-off-by: Rik van Riel <[email protected]>
Cc: Linus Torvalds <[email protected]>
Cc: Mel Gorman <[email protected]>
Cc: Mike Galbraith <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: [email protected]
Cc: [email protected]
Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Ingo Molnar <[email protected]>
---
kernel/sched/fair.c | 124 +---------------------------------------------------
1 file changed, 1 insertion(+), 123 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 79ac078..6f4f155 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1382,7 +1382,6 @@ static unsigned long weighted_cpuload(const int cpu);
static unsigned long source_load(int cpu, int type);
static unsigned long target_load(int cpu, int type);
static unsigned long capacity_of(int cpu);
-static long effective_load(struct task_group *tg, int cpu, long wl, long wg);

/* Cached statistics for all CPUs within a node */
struct numa_stats {
@@ -3045,8 +3044,7 @@ __update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq)
* differential update where we store the last value we propagated. This in
* turn allows skipping updates if the differential is 'small'.
*
- * Updating tg's load_avg is necessary before update_cfs_share() (which is
- * done) and effective_load() (which is not done because it is too costly).
+ * Updating tg's load_avg is necessary before update_cfs_share().
*/
static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
{
@@ -5298,126 +5296,6 @@ static unsigned long cpu_avg_load_per_task(int cpu)
return 0;
}

-#ifdef CONFIG_FAIR_GROUP_SCHED
-/*
- * effective_load() calculates the load change as seen from the root_task_group
- *
- * Adding load to a group doesn't make a group heavier, but can cause movement
- * of group shares between cpus. Assuming the shares were perfectly aligned one
- * can calculate the shift in shares.
- *
- * Calculate the effective load difference if @wl is added (subtracted) to @tg
- * on this @cpu and results in a total addition (subtraction) of @wg to the
- * total group weight.
- *
- * Given a runqueue weight distribution (rw_i) we can compute a shares
- * distribution (s_i) using:
- *
- * s_i = rw_i / \Sum rw_j (1)
- *
- * Suppose we have 4 CPUs and our @tg is a direct child of the root group and
- * has 7 equal weight tasks, distributed as below (rw_i), with the resulting
- * shares distribution (s_i):
- *
- * rw_i = { 2, 4, 1, 0 }
- * s_i = { 2/7, 4/7, 1/7, 0 }
- *
- * As per wake_affine() we're interested in the load of two CPUs (the CPU the
- * task used to run on and the CPU the waker is running on), we need to
- * compute the effect of waking a task on either CPU and, in case of a sync
- * wakeup, compute the effect of the current task going to sleep.
- *
- * So for a change of @wl to the local @cpu with an overall group weight change
- * of @wl we can compute the new shares distribution (s'_i) using:
- *
- * s'_i = (rw_i + @wl) / (@wg + \Sum rw_j) (2)
- *
- * Suppose we're interested in CPUs 0 and 1, and want to compute the load
- * differences in waking a task to CPU 0. The additional task changes the
- * weight and shares distributions like:
- *
- * rw'_i = { 3, 4, 1, 0 }
- * s'_i = { 3/8, 4/8, 1/8, 0 }
- *
- * We can then compute the difference in effective weight by using:
- *
- * dw_i = S * (s'_i - s_i) (3)
- *
- * Where 'S' is the group weight as seen by its parent.
- *
- * Therefore the effective change in loads on CPU 0 would be 5/56 (3/8 - 2/7)
- * times the weight of the group. The effect on CPU 1 would be -4/56 (4/8 -
- * 4/7) times the weight of the group.
- */
-static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
-{
- struct sched_entity *se = tg->se[cpu];
-
- if (!tg->parent) /* the trivial, non-cgroup case */
- return wl;
-
- for_each_sched_entity(se) {
- struct cfs_rq *cfs_rq = se->my_q;
- long W, w = cfs_rq_load_avg(cfs_rq);
-
- tg = cfs_rq->tg;
-
- /*
- * W = @wg + \Sum rw_j
- */
- W = wg + atomic_long_read(&tg->load_avg);
-
- /* Ensure \Sum rw_j >= rw_i */
- W -= cfs_rq->tg_load_avg_contrib;
- W += w;
-
- /*
- * w = rw_i + @wl
- */
- w += wl;
-
- /*
- * wl = S * s'_i; see (2)
- */
- if (W > 0 && w < W)
- wl = (w * (long)scale_load_down(tg->shares)) / W;
- else
- wl = scale_load_down(tg->shares);
-
- /*
- * Per the above, wl is the new se->load.weight value; since
- * those are clipped to [MIN_SHARES, ...) do so now. See
- * calc_cfs_shares().
- */
- if (wl < MIN_SHARES)
- wl = MIN_SHARES;
-
- /*
- * wl = dw_i = S * (s'_i - s_i); see (3)
- */
- wl -= se->avg.load_avg;
-
- /*
- * Recursively apply this logic to all parent groups to compute
- * the final effective load change on the root group. Since
- * only the @tg group gets extra weight, all parent groups can
- * only redistribute existing shares. @wl is the shift in shares
- * resulting from this level per the above.
- */
- wg = 0;
- }
-
- return wl;
-}
-#else
-
-static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
-{
- return wl;
-}
-
-#endif
-
static void record_wakee(struct task_struct *p)
{
/*

2017-06-24 23:45:57

by Rik van Riel

Subject: Re: [PATCH 1/4] sched,numa: override part of migrate_degrades_locality when idle balancing

On Sat, 2017-06-24 at 08:58 +0200, Ingo Molnar wrote:
> * [email protected] <[email protected]> wrote:
>
> > From: Rik van Riel <[email protected]>
> >
> > Several tests in the NAS benchmark seem to run a lot slower with
> > NUMA balancing enabled, than with NUMA balancing disabled. The
> > slower run time corresponds with increased idle time.
> >
> > Overriding the final test of migrate_degrades_locality (but still
> > doing the other NUMA tests first) seems to improve performance
> > of those benchmarks.
> >
> > Reported-by: Jirka Hladky <[email protected]>
> > Signed-off-by: Rik van Riel <[email protected]>
> > Signed-off-by: Rik van Riel <[email protected]>
>
> Note, I removed the first SOB from the commit, assuming that you only
> meant to 
> include the second one?

Thank you. It looks like I used the wrong email address
in the SOB I wrote in the patch, and git added the second
one automatically. The second one is indeed the one I want.

2017-06-26 14:44:04

by Peter Zijlstra

Subject: Re: [PATCH 3/4] sched,numa: implement numa node level wake_affine

On Fri, Jun 23, 2017 at 12:55:29PM -0400, [email protected] wrote:
> From: Rik van Riel <[email protected]>
>
> Since select_idle_sibling can place a task anywhere on a socket,
> comparing loads between individual CPU cores makes no real sense
> for deciding whether to do an affine wakeup across sockets, either.
>
> Instead, compare the load between the sockets in a similar way the
> load balancer and the numa balancing code do.

This seems to assume LLC == NUMA, which isn't strictly so.


2017-06-26 14:44:50

by Peter Zijlstra

Subject: Re: [PATCH 4/4] sched,fair: remove effective_load

On Fri, Jun 23, 2017 at 12:55:30PM -0400, [email protected] wrote:
> From: Rik van Riel <[email protected]>
>
> The function effective_load was only used by the NUMA balancing
> code, and not by the regular load balancing code. Now that the
> NUMA balancing code no longer uses it either, get rid of it.

Hmm,... funny. It used to be used by wake-affine. I'll have to go check
what happened.

2017-06-26 14:46:26

by Peter Zijlstra

Subject: Re: [PATCH 4/4] sched,fair: remove effective_load

On Mon, Jun 26, 2017 at 04:44:37PM +0200, Peter Zijlstra wrote:
> On Fri, Jun 23, 2017 at 12:55:30PM -0400, [email protected] wrote:
> > From: Rik van Riel <[email protected]>
> >
> > The function effective_load was only used by the NUMA balancing
> > code, and not by the regular load balancing code. Now that the
> > NUMA balancing code no longer uses it either, get rid of it.
>
> Hmm,... funny. It used to be used by wake-affine. I'll have to go check
> what happened.

Ah, it fell prey to that LLC == NUMA confusion from the previous patch.

That really looks buggered.

2017-06-26 14:55:51

by Rik van Riel

Subject: Re: [PATCH 4/4] sched,fair: remove effective_load

On Mon, 2017-06-26 at 16:46 +0200, Peter Zijlstra wrote:
> On Mon, Jun 26, 2017 at 04:44:37PM +0200, Peter Zijlstra wrote:
> > On Fri, Jun 23, 2017 at 12:55:30PM -0400, [email protected] wrote:
> > > From: Rik van Riel <[email protected]>
> > >
> > > The function effective_load was only used by the NUMA balancing
> > > code, and not by the regular load balancing code. Now that the
> > > NUMA balancing code no longer uses it either, get rid of it.
> >
> > Hmm,... funny. It used to be used by wake-affine. I'll have to go
> > check
> > what happened.
>
> Ah, it fell prey to that LLC == NUMA confusion from the previous
> patch.
>
> That really looks buggered.

Do the changelog or comments of that patch need fixing,
to avoid LLC / NUMA confusion?

I remember us talking about that in the past, but I do
not remember whether or not I changed the comments after
that discussion...

2017-06-26 15:04:10

by Peter Zijlstra

Subject: Re: [PATCH 4/4] sched,fair: remove effective_load

On Mon, Jun 26, 2017 at 10:55:41AM -0400, Rik van Riel wrote:
> On Mon, 2017-06-26 at 16:46 +0200, Peter Zijlstra wrote:
> > On Mon, Jun 26, 2017 at 04:44:37PM +0200, Peter Zijlstra wrote:
> > > On Fri, Jun 23, 2017 at 12:55:30PM -0400, [email protected] wrote:
> > > > From: Rik van Riel <[email protected]>
> > > >
> > > > The function effective_load was only used by the NUMA balancing
> > > > code, and not by the regular load balancing code. Now that the
> > > > NUMA balancing code no longer uses it either, get rid of it.
> > >
> > > Hmm,... funny. It used to be used by wake-affine. I'll have to go
> > > check
> > > what happened.
> >
> > Ah, it fell prey to that LLC == NUMA confusion from the previous
> > patch.
> >
> > That really looks buggered.
>
> Do the changelog or comments of that patch need fixing,
> to avoid LLC / NUMA confusion?

Neither, I think the code is actually wrong for the case where LLC <
NUMA (a somewhat rare case these days, granted, but something that might
still happen on !x86 perhaps).

2017-06-26 15:20:58

by Rik van Riel

Subject: Re: [PATCH 4/4] sched,fair: remove effective_load

On Mon, 2017-06-26 at 17:04 +0200, Peter Zijlstra wrote:
> On Mon, Jun 26, 2017 at 10:55:41AM -0400, Rik van Riel wrote:
> > On Mon, 2017-06-26 at 16:46 +0200, Peter Zijlstra wrote:
> > > On Mon, Jun 26, 2017 at 04:44:37PM +0200, Peter Zijlstra wrote:
> > > > On Fri, Jun 23, 2017 at 12:55:30PM -0400, [email protected]
> > > > wrote:
> > > > > From: Rik van Riel <[email protected]>
> > > > >
> > > > > The function effective_load was only used by the NUMA
> > > > > balancing
> > > > > code, and not by the regular load balancing code. Now that
> > > > > the
> > > > > NUMA balancing code no longer uses it either, get rid of it.
> > > >
> > > > Hmm,... funny. It used to be used by wake-affine. I'll have to
> > > > go
> > > > check
> > > > what happened.
> > >
> > > Ah, it fell prey to that LLC == NUMA confusion from the previous
> > > patch.
> > >
> > > That really looks buggered.
> >
> > Do the changelog or comments of that patch need fixing,
> > to avoid LLC / NUMA confusion?
>
> Neither, I think the code is actually wrong for the case where LLC <
> NUMA (a somewhat rare case these days, granted, but something that
> might
> still happen on !x86 perhaps).

Oh, indeed. I guess in wake_affine() we should test
whether the CPUs are in the same NUMA node, rather than
doing cpus_share_cache() ?

Or, alternatively, have an update_numa_stats() variant
for numa_wake_affine() that works on the LLC level?

2017-06-26 16:13:10

by Peter Zijlstra

Subject: Re: [PATCH 4/4] sched,fair: remove effective_load

On Mon, Jun 26, 2017 at 11:20:54AM -0400, Rik van Riel wrote:

> Oh, indeed. I guess in wake_affine() we should test
> whether the CPUs are in the same NUMA node, rather than
> doing cpus_share_cache() ?

Well, since select_idle_sibling() is on LLC; the early test on
cpus_share_cache(prev,this) seems to actually make sense.

But then cutting out all the other bits seems wrong. Not in the least
because !NUMA_BALANCING should also still keep working.

> Or, alternatively, have an update_numa_stats() variant
> for numa_wake_affine() that works on the LLC level?

I think we want to retain the existing behaviour for everything
larger than LLC, and when NUMA_BALANCING, smaller than NUMA.

Also note that your use of task_h_load() in the new numa thing suffers
from exactly the problem effective_load() is trying to solve.

2017-06-26 19:34:56

by Rik van Riel

Subject: Re: [PATCH 4/4] sched,fair: remove effective_load

On Mon, 2017-06-26 at 18:12 +0200, Peter Zijlstra wrote:
> On Mon, Jun 26, 2017 at 11:20:54AM -0400, Rik van Riel wrote:
>
> > Oh, indeed.  I guess in wake_affine() we should test
> > whether the CPUs are in the same NUMA node, rather than
> > doing cpus_share_cache() ?
>
> Well, since select_idle_sibling() is on LLC; the early test on
> cpus_share_cache(prev,this) seems to actually make sense.
>
> But then cutting out all the other bits seems wrong. Not in the least
> because !NUMA_BALANCING should also still keep working.

Even when !NUMA_BALANCING, I suspect it makes little sense
to compare the loads just on the cores in question, since
select_idle_sibling() will likely move the task somewhere
else.

I suspect we want to compare the load on the whole LLC
for that reason, even with NUMA_BALANCING disabled.

> > Or, alternatively, have an update_numa_stats() variant
> > for numa_wake_affine() that works on the LLC level?
>
> I think we want to retain the existing behaviour for everything
> larger than LLC, and when NUMA_BALANCING, smaller than NUMA.

What do you mean by this, exactly?

How does the "existing behaviour" of only looking at
the load on two cores make sense when doing LLC-level
task placement?

> Also note that your use of task_h_load() in the new numa thing
> suffers
> from exactly the problem effective_load() is trying to solve.

Are you saying task_h_load is wrong in task_numa_compare()
too, then? Should both use effective_load()?

2017-06-27 05:39:23

by Peter Zijlstra

Subject: Re: [PATCH 4/4] sched,fair: remove effective_load

On Mon, Jun 26, 2017 at 03:34:49PM -0400, Rik van Riel wrote:
> On Mon, 2017-06-26 at 18:12 +0200, Peter Zijlstra wrote:
> > On Mon, Jun 26, 2017 at 11:20:54AM -0400, Rik van Riel wrote:
> >
> > > Oh, indeed. I guess in wake_affine() we should test
> > > whether the CPUs are in the same NUMA node, rather than
> > > doing cpus_share_cache() ?
> >
> > Well, since select_idle_sibling() is on LLC; the early test on
> > cpus_share_cache(prev,this) seems to actually make sense.
> >
> > But then cutting out all the other bits seems wrong. Not in the least
> > because !NUMA_BALANCING should also still keep working.
>
> Even when !NUMA_BALANCING, I suspect it makes little sense
> to compare the loads just on the cores in question, since
> select_idle_sibling() will likely move the task somewhere
> else.
>
> I suspect we want to compare the load on the whole LLC
> for that reason, even with NUMA_BALANCING disabled.

But we don't have that data around :/ One thing we could do is try and
keep a copy of the last s*_lb_stats around in the sched_domain_shared
stuff or something and try and use that.

That way we can strictly keep things at the LLC level and not confuse
things with NUMA.

Similarly, we could use that same data to then avoid re-computing things
for the NUMA domain as well and do away with numa_stats.

> > > Or, alternatively, have an update_numa_stats() variant
> > > for numa_wake_affine() that works on the LLC level?
> >
> > I think we want to retain the existing behaviour for everything
> > larger than LLC, and when NUMA_BALANCING, smaller than NUMA.
>
> What do you mean by this, exactly?

As you noted, when prev and this are in the same LLC, it doesn't matter
and select_idle_sibling() will do its thing. So anything smaller than
the LLC need not do anything.

When NUMA_BALANCING we have the numa_stats thing and we can, as you
propose use that.

If LLC < NUMA or !NUMA_BALANCING we have a region that needs to do
_something_.

> How does the "existing behaviour" of only looking at
> the load on two cores make sense when doing LLC-level
> task placement?

Right, might not be ideal, but it's what we have now. Supposedly it's
better than not doing anything at all.

But see above for other ideas.

> > Also note that your use of task_h_load() in the new numa thing
> > suffers
> > from exactly the problem effective_load() is trying to solve.
>
> Are you saying task_h_load is wrong in task_numa_compare()
> too, then? Should both use effective_load()?

I need more than the few minutes I currently have, but probably. The
question is of course, how much does it matter and how painful will it
be to do it better.

2017-06-27 14:56:08

by Rik van Riel

Subject: Re: [PATCH 4/4] sched,fair: remove effective_load

On Tue, 2017-06-27 at 07:39 +0200, Peter Zijlstra wrote:
> On Mon, Jun 26, 2017 at 03:34:49PM -0400, Rik van Riel wrote:
> > On Mon, 2017-06-26 at 18:12 +0200, Peter Zijlstra wrote:
> > > On Mon, Jun 26, 2017 at 11:20:54AM -0400, Rik van Riel wrote:
> > >
> > > > Oh, indeed.  I guess in wake_affine() we should test
> > > > whether the CPUs are in the same NUMA node, rather than
> > > > doing cpus_share_cache() ?
> > >
> > > Well, since select_idle_sibling() is on LLC; the early test on
> > > cpus_share_cache(prev,this) seems to actually make sense.
> > >
> > > But then cutting out all the other bits seems wrong. Not in the
> > > least
> > > because !NUMA_BALANCING should also still keep working.
> >
> > Even when !NUMA_BALANCING, I suspect it makes little sense
> > to compare the loads just on the cores in question, since
> > select_idle_sibling() will likely move the task somewhere
> > else.
> >
> > I suspect we want to compare the load on the whole LLC
> > for that reason, even with NUMA_BALANCING disabled.
>
> But we don't have that data around :/ One thing we could do is try
> and
> keep a copy of the last s*_lb_stats around in the sched_domain_shared
> stuff or something and try and use that.
>
> That way we can strictly keep things at the LLC level and not confuse
> things with NUMA.
>
> Similarly, we could use that same data to then avoid re-computing
> things
> for the NUMA domain as well and do away with numa_stats.

That does seem like a useful optimization, though
I guess we would have to invalidate the cached data
every time we actually move a task?

The current code simply walks all the CPUs in the
cpumask_t, and adds up capacity and load. Doing
that appears to be better than poor task placement
(Jirka's numbers speak for themselves), but optimizing
this code path does seem like a worthwhile goal.

I'll look into it.
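
(For reference, the per-node walk mentioned above has roughly the shape
below; this is a simplified sketch rather than the exact update_numa_stats()
code, which derives a few more fields from the same sums.)

/*
 * Sketch: sum runnable load and capacity over every CPU in the node,
 * as the wakeup path currently does for both the previous and the
 * waking node.
 */
static void sum_node_load(struct numa_stats *ns, int nid)
{
	int cpu;

	memset(ns, 0, sizeof(*ns));
	for_each_cpu(cpu, cpumask_of_node(nid)) {
		ns->load += weighted_cpuload(cpu);
		ns->compute_capacity += capacity_of(cpu);
	}
}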

> > > > Or, alternatively, have an update_numa_stats() variant
> > > > for numa_wake_affine() that works on the LLC level?
> > >
> > > I think we want to retain the existing behaviour for everything
> > > larger than LLC, and when NUMA_BALANCING, smaller than NUMA.
> >
> > What do you mean by this, exactly?
>
> As you noted, when prev and this are in the same LLC, it doesn't
> matter
> and select_idle_sibling() will do its thing. So anything smaller than
> the LLC need not do anything.
>
> When NUMA_BALANCING we have the numa_stats thing and we can, as you
> propose use that.
>
> If LLC < NUMA or !NUMA_BALANCING we have a region that needs to do
> _something_.

Agreed. I will fix this. Given that this is a bit
of a corner case, I guess I can fix this with follow-up
patches, to be merged into -tip before the whole series
is sent on to Linus?

> > > Also note that your use of task_h_load() in the new numa thing
> > > suffers
> > > from exactly the problem effective_load() is trying to solve.
> >
> > Are you saying task_h_load is wrong in task_numa_compare()
> > too, then?  Should both use effective_load()?
>
> I need more than the few minutes I currently have, but probably. The
> question is of course, how much does it matter and how painful will
> it
> be to do it better.

I suspect it does not matter at all currently, since the
load balancing code does not use effective_load, and
having the wake_affine logic calculate things differently
from the load balancer is likely to result in both pieces
of code fighting against each other.

I suspect we should either use task_h_load everywhere,
or effective_load everywhere, but not have a mix and
match situation where one is used in some places, and
the other in others.

--
All rights reversed



2017-06-27 18:27:51

by Rik van Riel

Subject: Re: [PATCH 4/4] sched,fair: remove effective_load

On Mon, 2017-06-26 at 18:12 +0200, Peter Zijlstra wrote:

> Also note that your use of task_h_load() in the new numa thing
> suffers
> from exactly the problem effective_load() is trying to solve.

Thinking about this some more, I suspect that
using effective_load() in combination with
select_idle_sibling(), which will often place
the task on a different CPU than the one specified,
may not lead to entirely useful results...

--
All rights reversed



2017-08-01 12:19:22

by Peter Zijlstra

Subject: [PATCH] sched/fair: Fix wake_affine() for !NUMA_BALANCING

On Tue, Jun 27, 2017 at 10:55:58AM -0400, Rik van Riel wrote:
> On Tue, 2017-06-27 at 07:39 +0200, Peter Zijlstra wrote:

> > If LLC < NUMA or !NUMA_BALANCING we have a region that needs to do
> > _something_.
>
> Agreed. I will fix this. Given that this is a bit
> of a corner case,


Ok, so it turns out all of facebook sits inside that corner case because
they have !NUMA_BALANCING.

How does the below patch work?

---
Subject: sched/fair: Fix wake_affine() for !NUMA_BALANCING
From: Peter Zijlstra <[email protected]>
Date: Mon Jul 31 17:50:05 CEST 2017

In commit:

3fed382b46ba ("sched/numa: Implement NUMA node level wake_affine()")

Rik changed wake_affine to consider NUMA information when balancing
between LLC domains.

There are a number of problems here which this patch tries to address:

- LLC < NODE; in this case we'd use the wrong information to balance
- !NUMA_BALANCING: in this case, the new code doesn't do any
balancing at all
- re-computes the NUMA data for every wakeup, this can mean iterating
up to 64 CPUs for every wakeup.
- default affine wakeups inside a cache

We address these by saving the load/capacity values for each
sched_domain during regular load-balance and using these values in
wake_affine_llc(). The obvious down-side to using cached values is
that they can be too old and poorly reflect reality.

But this way we can use LLC wide information and thus not rely on
assuming LLC matches NODE. We also don't rely on NUMA_BALANCING nor do
we have to aggregate two nodes (or even cache domains) worth of CPUs
for each wakeup.

Cc: Josef Bacik <[email protected]>
Cc: Rik van Riel <[email protected]>
Fixes: 3fed382b46ba ("sched/numa: Implement NUMA node level wake_affine()")
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
---
include/linux/sched/topology.h | 9 ++
kernel/sched/fair.c | 182 +++++++++++++++++++++++++----------------
2 files changed, 123 insertions(+), 68 deletions(-)

--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -71,6 +71,15 @@ struct sched_domain_shared {
atomic_t ref;
atomic_t nr_busy_cpus;
int has_idle_cores;
+
+ /*
+ * Some variables from the most recent sd_lb_stats for this domain.
+ *
+ * used by wake_affine().
+ */
+ unsigned long nr_running;
+ unsigned long load;
+ unsigned long capacity;
};

struct sched_domain {
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2586,59 +2586,6 @@ void task_tick_numa(struct rq *rq, struc
}
}

-/*
- * Can a task be moved from prev_cpu to this_cpu without causing a load
- * imbalance that would trigger the load balancer?
- */
-static inline bool numa_wake_affine(struct sched_domain *sd,
- struct task_struct *p, int this_cpu,
- int prev_cpu, int sync)
-{
- struct numa_stats prev_load, this_load;
- s64 this_eff_load, prev_eff_load;
-
- update_numa_stats(&prev_load, cpu_to_node(prev_cpu));
- update_numa_stats(&this_load, cpu_to_node(this_cpu));
-
- /*
- * If sync wakeup then subtract the (maximum possible)
- * effect of the currently running task from the load
- * of the current CPU:
- */
- if (sync) {
- unsigned long current_load = task_h_load(current);
-
- if (this_load.load > current_load)
- this_load.load -= current_load;
- else
- this_load.load = 0;
- }
-
- /*
- * In low-load situations, where this_cpu's node is idle due to the
- * sync cause above having dropped this_load.load to 0, move the task.
- * Moving to an idle socket will not create a bad imbalance.
- *
- * Otherwise check if the nodes are near enough in load to allow this
- * task to be woken on this_cpu's node.
- */
- if (this_load.load > 0) {
- unsigned long task_load = task_h_load(p);
-
- this_eff_load = 100;
- this_eff_load *= prev_load.compute_capacity;
-
- prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
- prev_eff_load *= this_load.compute_capacity;
-
- this_eff_load *= this_load.load + task_load;
- prev_eff_load *= prev_load.load - task_load;
-
- return this_eff_load <= prev_eff_load;
- }
-
- return true;
-}
#else
static void task_tick_numa(struct rq *rq, struct task_struct *curr)
{
@@ -2652,14 +2599,6 @@ static inline void account_numa_dequeue(
{
}

-#ifdef CONFIG_SMP
-static inline bool numa_wake_affine(struct sched_domain *sd,
- struct task_struct *p, int this_cpu,
- int prev_cpu, int sync)
-{
- return true;
-}
-#endif /* !SMP */
#endif /* CONFIG_NUMA_BALANCING */

static void
@@ -5356,20 +5295,110 @@ static int wake_wide(struct task_struct
return 1;
}

+struct llc_stats {
+ unsigned long nr_running;
+ unsigned long load;
+ unsigned long capacity;
+ int has_capacity;
+};
+
+static void get_llc_stats(struct llc_stats *stats, int cpu)
+{
+ struct sched_domain_shared *sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
+
+ stats->nr_running = READ_ONCE(sds->nr_running);
+ stats->load = READ_ONCE(sds->load);
+ stats->capacity = READ_ONCE(sds->capacity);
+ stats->has_capacity = stats->nr_running < per_cpu(sd_llc_size, cpu);
+}
+
+/*
+ * Can a task be moved from prev_cpu to this_cpu without causing a load
+ * imbalance that would trigger the load balancer?
+ *
+ * Since we're running on 'stale' values, we might in fact create an imbalance
+ * but recomputing these values is expensive, as that'd mean iterating 2 cache
+ * domains worth of CPUs.
+ */
+static bool
+wake_affine_llc(struct sched_domain *sd, struct task_struct *p,
+ int this_cpu, int prev_cpu, int sync)
+{
+ struct llc_stats prev_stats, this_stats;
+ s64 this_eff_load, prev_eff_load;
+ unsigned long task_load;
+
+ get_llc_stats(&prev_stats, prev_cpu);
+ get_llc_stats(&this_stats, this_cpu);
+
+ /*
+ * If sync wakeup then subtract the (maximum possible)
+ * effect of the currently running task from the load
+ * of the current LLC.
+ */
+ if (sync) {
+ unsigned long current_load = task_h_load(current);
+
+ /* in this case load hits 0 and this LLC is considered 'idle' */
+ if (current_load > this_stats.load)
+ return true;
+
+ this_stats.load -= current_load;
+ }
+
+ /*
+ * The has_capacity stuff is not SMT aware, but by trying to balance
+ * the nr_running on both ends we try and fill the domain at equal
+ * rates, thereby first consuming cores before siblings.
+ */
+
+ /* if the old cache has capacity, stay there */
+ if (prev_stats.has_capacity && prev_stats.nr_running < this_stats.nr_running+1)
+ return false;
+
+ /* if this cache has capacity, come here */
+ if (this_stats.has_capacity && this_stats.nr_running < prev_stats.nr_running+1)
+ return true;
+
+
+ /*
+ * Check to see if we can move the load without causing too much
+ * imbalance.
+ */
+ task_load = task_h_load(p);
+
+ this_eff_load = 100;
+ this_eff_load *= prev_stats.capacity;
+
+ prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
+ prev_eff_load *= this_stats.capacity;
+
+ this_eff_load *= this_stats.load + task_load;
+ prev_eff_load *= prev_stats.load - task_load;
+
+ return this_eff_load <= prev_eff_load;
+}
+
static int wake_affine(struct sched_domain *sd, struct task_struct *p,
int prev_cpu, int sync)
{
int this_cpu = smp_processor_id();
- bool affine = false;
+ bool affine;

/*
- * Common case: CPUs are in the same socket, and select_idle_sibling()
- * will do its thing regardless of what we return:
+ * Default to no affine wakeups; wake_affine() should not effect a task
+ * placement the load-balancer feels inclined to undo. The conservative
+ * option is therefore to not move tasks when they wake up.
*/
- if (cpus_share_cache(prev_cpu, this_cpu))
- affine = true;
- else
- affine = numa_wake_affine(sd, p, this_cpu, prev_cpu, sync);
+ affine = false;
+
+ /*
+ * If the wakeup is across cache domains, try to evaluate if movement
+ * makes sense, otherwise rely on select_idle_siblings() to do
+ * placement inside the cache domain.
+ */
+ if (!cpus_share_cache(prev_cpu, this_cpu))
+ affine = wake_affine_llc(sd, p, this_cpu, prev_cpu, sync);

schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts);
if (affine) {
@@ -7049,6 +7078,7 @@ struct sg_lb_stats {
struct sd_lb_stats {
struct sched_group *busiest; /* Busiest group in this sd */
struct sched_group *local; /* Local group in this sd */
+ unsigned long total_running;
unsigned long total_load; /* Total load of all groups in sd */
unsigned long total_capacity; /* Total capacity of all groups in sd */
unsigned long avg_load; /* Average load across all groups in sd */
@@ -7068,6 +7098,7 @@ static inline void init_sd_lb_stats(stru
*sds = (struct sd_lb_stats){
.busiest = NULL,
.local = NULL,
+ .total_running = 0UL,
.total_load = 0UL,
.total_capacity = 0UL,
.busiest_stat = {
@@ -7503,6 +7534,7 @@ static inline enum fbq_type fbq_classify
*/
static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
{
+ struct sched_domain_shared *shared = env->sd->shared;
struct sched_domain *child = env->sd->child;
struct sched_group *sg = env->sd->groups;
struct sg_lb_stats *local = &sds->local_stat;
@@ -7559,6 +7591,7 @@ static inline void update_sd_lb_stats(st

next_group:
/* Now, start updating sd_lb_stats */
+ sds->total_running += sgs->sum_nr_running;
sds->total_load += sgs->group_load;
sds->total_capacity += sgs->group_capacity;

@@ -7574,6 +7607,18 @@ static inline void update_sd_lb_stats(st
env->dst_rq->rd->overload = overload;
}

+ /*
+ * Since these are sums over groups they can contain some CPUs
+ * multiple times for the NUMA domains.
+ *
+ * Currently only wake_affine_llc() and find_busiest_group()
+ * use these numbers; only the latter is affected by this problem.
+ *
+ * XXX fix that.
+ */
+ WRITE_ONCE(shared->nr_running, sds->total_running);
+ WRITE_ONCE(shared->load, sds->total_load);
+ WRITE_ONCE(shared->capacity, sds->total_capacity);
}

/**
@@ -7803,6 +7848,7 @@ static struct sched_group *find_busiest_
if (!sds.busiest || busiest->sum_nr_running == 0)
goto out_balanced;

+ /* XXX broken for overlapping NUMA groups */
sds.avg_load = (SCHED_CAPACITY_SCALE * sds.total_load)
/ sds.total_capacity;
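
For readers following the wake_affine_llc() math above, the final comparison boils down to weighing the destination LLC's load (plus the waking task) against the source LLC's load (minus the task), with the source side given half of the domain's imbalance_pct as slack. A minimal standalone sketch with made-up numbers, purely for illustration; the helper name and the imbalance_pct value of 125 are assumptions, not taken from the patch:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Mirrors the this_eff_load/prev_eff_load comparison from the patch. */
static bool allow_affine_wakeup(long this_load, long this_capacity,
				long prev_load, long prev_capacity,
				long task_load, int imbalance_pct)
{
	int64_t this_eff = (int64_t)100 * prev_capacity *
			   (this_load + task_load);
	int64_t prev_eff = (int64_t)(100 + (imbalance_pct - 100) / 2) *
			   this_capacity * (prev_load - task_load);

	return this_eff <= prev_eff;
}

int main(void)
{
	/*
	 * Equal capacities (1024) and imbalance_pct 125: the waking CPU's
	 * LLC gets the task whenever
	 *   this_load + task_load <= 1.12 * (prev_load - task_load).
	 */
	printf("%d\n", allow_affine_wakeup(800, 1024, 1200, 1024, 150, 125));  /* 1: move */
	printf("%d\n", allow_affine_wakeup(1100, 1024, 1200, 1024, 150, 125)); /* 0: stay */
	return 0;
}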


2017-08-01 19:26:58

by Josef Bacik

[permalink] [raw]
Subject: Re: [PATCH] sched/fair: Fix wake_affine() for !NUMA_BALANCING

On Tue, Aug 01, 2017 at 02:19:12PM +0200, Peter Zijlstra wrote:
> On Tue, Jun 27, 2017 at 10:55:58AM -0400, Rik van Riel wrote:
> > On Tue, 2017-06-27 at 07:39 +0200, Peter Zijlstra wrote:
>
> > > If LLC < NUMA or !NUMA_BALANCING we have a region that needs to do
> > > _something_.
> >
> > Agreed. I will fix this. Given that this is a bit
> > of a corner case,
>
>
> Ok, so it turns out all of facebook sits inside that corner case because
> they have !NUMA_BALANCING.
>
> How does the below patch work?
>
> ---
> Subject: sched/fair: Fix wake_affine() for !NUMA_BALANCING
> From: Peter Zijlstra <[email protected]>
> Date: Mon Jul 31 17:50:05 CEST 2017
>
> In commit:
>
> 3fed382b46ba ("sched/numa: Implement NUMA node level wake_affine()")
>
> Rik changed wake_affine to consider NUMA information when balancing
> between LLC domains.
>
> There are a number of problems here which this patch tries to address:
>
> - LLC < NODE; in this case we'd use the wrong information to balance
> - !NUMA_BALANCING: in this case, the new code doesn't do any
> balancing at all
> - re-computes the NUMA data for every wakeup, which can mean iterating
>   up to 64 CPUs each time.
> - default affine wakeups inside a cache
>
> We address these by saving the load/capacity values for each
> sched_domain during regular load-balance and using these values in
> wake_affine_llc(). The obvious down-side to using cached values is
> that they can be too old and poorly reflect reality.
>
> But this way we can use LLC wide information and thus not rely on
> assuming LLC matches NODE. We also don't rely on NUMA_BALANCING nor do
> we have to aggregate two nodes' (or even cache domains') worth of CPUs
> for each wakeup.
>
> Cc: Josef Bacik <[email protected]>
> Cc: Rik van Riel <[email protected]>
> Fixes: 3fed382b46ba ("sched/numa: Implement NUMA node level wake_affine()")
> Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
> ---
> include/linux/sched/topology.h | 9 ++
> kernel/sched/fair.c | 182 +++++++++++++++++++++++++----------------
> 2 files changed, 123 insertions(+), 68 deletions(-)
>
> --- a/include/linux/sched/topology.h
> +++ b/include/linux/sched/topology.h
> @@ -71,6 +71,15 @@ struct sched_domain_shared {
> atomic_t ref;
> atomic_t nr_busy_cpus;
> int has_idle_cores;
> +
> + /*
> + * Some variables from the most recent sd_lb_stats for this domain.
> + *
> + * used by wake_affine().
> + */
> + unsigned long nr_running;
> + unsigned long load;
> + unsigned long capacity;
> };
>
> struct sched_domain {
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -2586,59 +2586,6 @@ void task_tick_numa(struct rq *rq, struc
> }
> }
>
> -/*
> - * Can a task be moved from prev_cpu to this_cpu without causing a load
> - * imbalance that would trigger the load balancer?
> - */
> -static inline bool numa_wake_affine(struct sched_domain *sd,
> - struct task_struct *p, int this_cpu,
> - int prev_cpu, int sync)
> -{
> - struct numa_stats prev_load, this_load;
> - s64 this_eff_load, prev_eff_load;
> -
> - update_numa_stats(&prev_load, cpu_to_node(prev_cpu));
> - update_numa_stats(&this_load, cpu_to_node(this_cpu));
> -
> - /*
> - * If sync wakeup then subtract the (maximum possible)
> - * effect of the currently running task from the load
> - * of the current CPU:
> - */
> - if (sync) {
> - unsigned long current_load = task_h_load(current);
> -
> - if (this_load.load > current_load)
> - this_load.load -= current_load;
> - else
> - this_load.load = 0;
> - }
> -
> - /*
> - * In low-load situations, where this_cpu's node is idle due to the
> - * sync cause above having dropped this_load.load to 0, move the task.
> - * Moving to an idle socket will not create a bad imbalance.
> - *
> - * Otherwise check if the nodes are near enough in load to allow this
> - * task to be woken on this_cpu's node.
> - */
> - if (this_load.load > 0) {
> - unsigned long task_load = task_h_load(p);
> -
> - this_eff_load = 100;
> - this_eff_load *= prev_load.compute_capacity;
> -
> - prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
> - prev_eff_load *= this_load.compute_capacity;
> -
> - this_eff_load *= this_load.load + task_load;
> - prev_eff_load *= prev_load.load - task_load;
> -
> - return this_eff_load <= prev_eff_load;
> - }
> -
> - return true;
> -}
> #else
> static void task_tick_numa(struct rq *rq, struct task_struct *curr)
> {
> @@ -2652,14 +2599,6 @@ static inline void account_numa_dequeue(
> {
> }
>
> -#ifdef CONFIG_SMP
> -static inline bool numa_wake_affine(struct sched_domain *sd,
> - struct task_struct *p, int this_cpu,
> - int prev_cpu, int sync)
> -{
> - return true;
> -}
> -#endif /* !SMP */
> #endif /* CONFIG_NUMA_BALANCING */
>
> static void
> @@ -5356,20 +5295,110 @@ static int wake_wide(struct task_struct
> return 1;
> }
>
> +struct llc_stats {
> + unsigned long nr_running;
> + unsigned long load;
> + unsigned long capacity;
> + int has_capacity;
> +};
> +
> +static void get_llc_stats(struct llc_stats *stats, int cpu)
> +{
> + struct sched_domain_shared *sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
> +
> + stats->nr_running = READ_ONCE(sds->nr_running);
> + stats->load = READ_ONCE(sds->load);
> + stats->capacity = READ_ONCE(sds->capacity);
> + stats->has_capacity = stats->nr_running < per_cpu(sd_llc_size, cpu);
> +}
> +
> +/*
> + * Can a task be moved from prev_cpu to this_cpu without causing a load
> + * imbalance that would trigger the load balancer?
> + *
> + * Since we're running on 'stale' values, we might in fact create an imbalance
> + * but recomputing these values is expensive, as that'd mean iteration 2 cache
> + * domains worth of CPUs.
> + */
> +static bool
> +wake_affine_llc(struct sched_domain *sd, struct task_struct *p,
> + int this_cpu, int prev_cpu, int sync)
> +{
> + struct llc_stats prev_stats, this_stats;
> + s64 this_eff_load, prev_eff_load;
> + unsigned long task_load;
> +
> + get_llc_stats(&prev_stats, prev_cpu);
> + get_llc_stats(&this_stats, this_cpu);
> +
> + /*
> + * If sync wakeup then subtract the (maximum possible)
> + * effect of the currently running task from the load
> + * of the current LLC.
> + */
> + if (sync) {
> + unsigned long current_load = task_h_load(current);
> +
> + /* in this case load hits 0 and this LLC is considered 'idle' */
> + if (current_load > this_stats.load)
> + return true;
> +
> + this_stats.load -= current_load;
> + }
> +
> + /*
> + * The has_capacity stuff is not SMT aware, but by trying to balance
> + * the nr_running on both ends we try and fill the domain at equal
> + * rates, thereby first consuming cores before siblings.
> + */
> +
> + /* if the old cache has capacity, stay there */
> + if (prev_stats.has_capacity && prev_stats.nr_running < this_stats.nr_running+1)
> + return false;
> +
> + /* if this cache has capacity, come here */
> + if (this_stats.has_capacity && this_stats.nr_running < prev_stats.nr_running+1)
> + return true;
> +
> +
> + /*
> + * Check to see if we can move the load without causing too much
> + * imbalance.
> + */
> + task_load = task_h_load(p);
> +
> + this_eff_load = 100;
> + this_eff_load *= prev_stats.capacity;
> +
> + prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
> + prev_eff_load *= this_stats.capacity;
> +
> + this_eff_load *= this_stats.load + task_load;
> + prev_eff_load *= prev_stats.load - task_load;
> +
> + return this_eff_load <= prev_eff_load;
> +}
> +
> static int wake_affine(struct sched_domain *sd, struct task_struct *p,
> int prev_cpu, int sync)
> {
> int this_cpu = smp_processor_id();
> - bool affine = false;
> + bool affine;
>
> /*
> - * Common case: CPUs are in the same socket, and select_idle_sibling()
> - * will do its thing regardless of what we return:
> + * Default to no affine wakeups; wake_affine() should not effect a task
> + * placement the load-balancer feels inclined to undo. The conservative
> + * option is therefore to not move tasks when they wake up.
> */
> - if (cpus_share_cache(prev_cpu, this_cpu))
> - affine = true;
> - else
> - affine = numa_wake_affine(sd, p, this_cpu, prev_cpu, sync);
> + affine = false;
> +
> + /*
> + * If the wakeup is across cache domains, try to evaluate if movement
> + * makes sense, otherwise rely on select_idle_siblings() to do
> + * placement inside the cache domain.
> + */
> + if (!cpus_share_cache(prev_cpu, this_cpu))
> + affine = wake_affine_llc(sd, p, this_cpu, prev_cpu, sync);
>
> schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts);
> if (affine) {
> @@ -7049,6 +7078,7 @@ struct sg_lb_stats {
> struct sd_lb_stats {
> struct sched_group *busiest; /* Busiest group in this sd */
> struct sched_group *local; /* Local group in this sd */
> + unsigned long total_running;
> unsigned long total_load; /* Total load of all groups in sd */
> unsigned long total_capacity; /* Total capacity of all groups in sd */
> unsigned long avg_load; /* Average load across all groups in sd */
> @@ -7068,6 +7098,7 @@ static inline void init_sd_lb_stats(stru
> *sds = (struct sd_lb_stats){
> .busiest = NULL,
> .local = NULL,
> + .total_running = 0UL,
> .total_load = 0UL,
> .total_capacity = 0UL,
> .busiest_stat = {
> @@ -7503,6 +7534,7 @@ static inline enum fbq_type fbq_classify
> */
> static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
> {
> + struct sched_domain_shared *shared = env->sd->shared;
> struct sched_domain *child = env->sd->child;
> struct sched_group *sg = env->sd->groups;
> struct sg_lb_stats *local = &sds->local_stat;
> @@ -7559,6 +7591,7 @@ static inline void update_sd_lb_stats(st
>
> next_group:
> /* Now, start updating sd_lb_stats */
> + sds->total_running += sgs->sum_nr_running;
> sds->total_load += sgs->group_load;
> sds->total_capacity += sgs->group_capacity;
>
> @@ -7574,6 +7607,18 @@ static inline void update_sd_lb_stats(st
> env->dst_rq->rd->overload = overload;
> }
>
> + /*
> + * Since these are sums over groups they can contain some CPUs
> + * multiple times for the NUMA domains.
> + *
> + * Currently only wake_affine_llc() and find_busiest_group()
> + * uses these numbers, only the last is affected by this problem.
> + *
> + * XXX fix that.
> + */
> + WRITE_ONCE(shared->nr_running, sds->total_running);
> + WRITE_ONCE(shared->load, sds->total_load);
> + WRITE_ONCE(shared->capacity, sds->total_capacity);

This panics on boot for me because shared is NULL. Same happens in
select_task_rq_fair when it tries to do the READ_ONCE. Here is my .config in
case it's something strange with my config. Thanks,

Josef


Attachments:
config (103.41 kB)

2017-08-01 21:43:14

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH] sched/fair: Fix wake_affine() for !NUMA_BALANCING

On Tue, Aug 01, 2017 at 03:26:51PM -0400, Josef Bacik wrote:
> > @@ -7574,6 +7607,18 @@ static inline void update_sd_lb_stats(st
> > env->dst_rq->rd->overload = overload;
> > }
> >
> > + /*
> > + * Since these are sums over groups they can contain some CPUs
> > + * multiple times for the NUMA domains.
> > + *
> > + * Currently only wake_affine_llc() and find_busiest_group()
> > + * uses these numbers, only the last is affected by this problem.
> > + *
> > + * XXX fix that.
> > + */
> > + WRITE_ONCE(shared->nr_running, sds->total_running);
> > + WRITE_ONCE(shared->load, sds->total_load);
> > + WRITE_ONCE(shared->capacity, sds->total_capacity);
>
> This panics on boot for me because shared is NULL. Same happens in
> select_task_rq_fair when it tries to do the READ_ONCE. Here is my .config in
> case it's something strange with my config. Thanks,

Nah, it's just me being an idiot and booting the wrong kernel. Unless I
messed up again, this one boots.

There is state during boot and hotplug where there are no domains, and
thus also no shared. Simply ignoring things when that happens should be
good enough I think.


---
Subject: sched/fair: Fix wake_affine() for !NUMA_BALANCING
From: Peter Zijlstra <[email protected]>
Date: Mon Jul 31 17:50:05 CEST 2017

In commit:

3fed382b46ba ("sched/numa: Implement NUMA node level wake_affine()")

Rik changed wake_affine to consider NUMA information when balancing
between LLC domains.

There are a number of problems here which this patch tries to address:

- LLC < NODE; in this case we'd use the wrong information to balance
- !NUMA_BALANCING: in this case, the new code doesn't do any
balancing at all
- re-computes the NUMA data for every wakeup, which can mean iterating
  up to 64 CPUs each time.
- default affine wakeups inside a cache

We address these by saving the load/capacity values for each
sched_domain during regular load-balance and using these values in
wake_affine_llc(). The obvious down-side to using cached values is
that they can be too old and poorly reflect reality.

But this way we can use LLC wide information and thus not rely on
assuming LLC matches NODE. We also don't rely on NUMA_BALANCING nor do
we have to aggregate two nodes' (or even cache domains') worth of CPUs
for each wakeup.

Fixes: 3fed382b46ba ("sched/numa: Implement NUMA node level wake_affine()")
Cc: Josef Bacik <[email protected]>
Cc: Rik van Riel <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
---
include/linux/sched/topology.h | 9 +
kernel/sched/fair.c | 190 ++++++++++++++++++++++++++---------------
2 files changed, 131 insertions(+), 68 deletions(-)

--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -71,6 +71,15 @@ struct sched_domain_shared {
atomic_t ref;
atomic_t nr_busy_cpus;
int has_idle_cores;
+
+ /*
+ * Some variables from the most recent sd_lb_stats for this domain.
+ *
+ * used by wake_affine().
+ */
+ unsigned long nr_running;
+ unsigned long load;
+ unsigned long capacity;
};

struct sched_domain {
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2586,59 +2586,6 @@ void task_tick_numa(struct rq *rq, struc
}
}

-/*
- * Can a task be moved from prev_cpu to this_cpu without causing a load
- * imbalance that would trigger the load balancer?
- */
-static inline bool numa_wake_affine(struct sched_domain *sd,
- struct task_struct *p, int this_cpu,
- int prev_cpu, int sync)
-{
- struct numa_stats prev_load, this_load;
- s64 this_eff_load, prev_eff_load;
-
- update_numa_stats(&prev_load, cpu_to_node(prev_cpu));
- update_numa_stats(&this_load, cpu_to_node(this_cpu));
-
- /*
- * If sync wakeup then subtract the (maximum possible)
- * effect of the currently running task from the load
- * of the current CPU:
- */
- if (sync) {
- unsigned long current_load = task_h_load(current);
-
- if (this_load.load > current_load)
- this_load.load -= current_load;
- else
- this_load.load = 0;
- }
-
- /*
- * In low-load situations, where this_cpu's node is idle due to the
- * sync cause above having dropped this_load.load to 0, move the task.
- * Moving to an idle socket will not create a bad imbalance.
- *
- * Otherwise check if the nodes are near enough in load to allow this
- * task to be woken on this_cpu's node.
- */
- if (this_load.load > 0) {
- unsigned long task_load = task_h_load(p);
-
- this_eff_load = 100;
- this_eff_load *= prev_load.compute_capacity;
-
- prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
- prev_eff_load *= this_load.compute_capacity;
-
- this_eff_load *= this_load.load + task_load;
- prev_eff_load *= prev_load.load - task_load;
-
- return this_eff_load <= prev_eff_load;
- }
-
- return true;
-}
#else
static void task_tick_numa(struct rq *rq, struct task_struct *curr)
{
@@ -2652,14 +2599,6 @@ static inline void account_numa_dequeue(
{
}

-#ifdef CONFIG_SMP
-static inline bool numa_wake_affine(struct sched_domain *sd,
- struct task_struct *p, int this_cpu,
- int prev_cpu, int sync)
-{
- return true;
-}
-#endif /* !SMP */
#endif /* CONFIG_NUMA_BALANCING */

static void
@@ -5356,20 +5295,115 @@ static int wake_wide(struct task_struct
return 1;
}

+struct llc_stats {
+ unsigned long nr_running;
+ unsigned long load;
+ unsigned long capacity;
+ int has_capacity;
+};
+
+static void get_llc_stats(struct llc_stats *stats, int cpu)
+{
+ struct sched_domain_shared *sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
+
+ if (!sds) {
+ memset(&stats, 0, sizeof(*stats));
+ return;
+ }
+
+ stats->nr_running = READ_ONCE(sds->nr_running);
+ stats->load = READ_ONCE(sds->load);
+ stats->capacity = READ_ONCE(sds->capacity);
+ stats->has_capacity = stats->nr_running < per_cpu(sd_llc_size, cpu);
+}
+
+/*
+ * Can a task be moved from prev_cpu to this_cpu without causing a load
+ * imbalance that would trigger the load balancer?
+ *
+ * Since we're running on 'stale' values, we might in fact create an imbalance
+ * but recomputing these values is expensive, as that'd mean iteration 2 cache
+ * domains worth of CPUs.
+ */
+static bool
+wake_affine_llc(struct sched_domain *sd, struct task_struct *p,
+ int this_cpu, int prev_cpu, int sync)
+{
+ struct llc_stats prev_stats, this_stats;
+ s64 this_eff_load, prev_eff_load;
+ unsigned long task_load;
+
+ get_llc_stats(&prev_stats, prev_cpu);
+ get_llc_stats(&this_stats, this_cpu);
+
+ /*
+ * If sync wakeup then subtract the (maximum possible)
+ * effect of the currently running task from the load
+ * of the current LLC.
+ */
+ if (sync) {
+ unsigned long current_load = task_h_load(current);
+
+ /* in this case load hits 0 and this LLC is considered 'idle' */
+ if (current_load > this_stats.load)
+ return true;
+
+ this_stats.load -= current_load;
+ }
+
+ /*
+ * The has_capacity stuff is not SMT aware, but by trying to balance
+ * the nr_running on both ends we try and fill the domain at equal
+ * rates, thereby first consuming cores before siblings.
+ */
+
+ /* if the old cache has capacity, stay there */
+ if (prev_stats.has_capacity && prev_stats.nr_running < this_stats.nr_running+1)
+ return false;
+
+ /* if this cache has capacity, come here */
+ if (this_stats.has_capacity && this_stats.nr_running < prev_stats.nr_running+1)
+ return true;
+
+
+ /*
+ * Check to see if we can move the load without causing too much
+ * imbalance.
+ */
+ task_load = task_h_load(p);
+
+ this_eff_load = 100;
+ this_eff_load *= prev_stats.capacity;
+
+ prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
+ prev_eff_load *= this_stats.capacity;
+
+ this_eff_load *= this_stats.load + task_load;
+ prev_eff_load *= prev_stats.load - task_load;
+
+ return this_eff_load <= prev_eff_load;
+}
+
static int wake_affine(struct sched_domain *sd, struct task_struct *p,
int prev_cpu, int sync)
{
int this_cpu = smp_processor_id();
- bool affine = false;
+ bool affine;

/*
- * Common case: CPUs are in the same socket, and select_idle_sibling()
- * will do its thing regardless of what we return:
+ * Default to no affine wakeups; wake_affine() should not effect a task
+ * placement the load-balancer feels inclined to undo. The conservative
+ * option is therefore to not move tasks when they wake up.
*/
- if (cpus_share_cache(prev_cpu, this_cpu))
- affine = true;
- else
- affine = numa_wake_affine(sd, p, this_cpu, prev_cpu, sync);
+ affine = false;
+
+ /*
+ * If the wakeup is across cache domains, try to evaluate if movement
+ * makes sense, otherwise rely on select_idle_siblings() to do
+ * placement inside the cache domain.
+ */
+ if (!cpus_share_cache(prev_cpu, this_cpu))
+ affine = wake_affine_llc(sd, p, this_cpu, prev_cpu, sync);

schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts);
if (affine) {
@@ -7049,6 +7083,7 @@ struct sg_lb_stats {
struct sd_lb_stats {
struct sched_group *busiest; /* Busiest group in this sd */
struct sched_group *local; /* Local group in this sd */
+ unsigned long total_running;
unsigned long total_load; /* Total load of all groups in sd */
unsigned long total_capacity; /* Total capacity of all groups in sd */
unsigned long avg_load; /* Average load across all groups in sd */
@@ -7068,6 +7103,7 @@ static inline void init_sd_lb_stats(stru
*sds = (struct sd_lb_stats){
.busiest = NULL,
.local = NULL,
+ .total_running = 0UL,
.total_load = 0UL,
.total_capacity = 0UL,
.busiest_stat = {
@@ -7503,6 +7539,7 @@ static inline enum fbq_type fbq_classify
*/
static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
{
+ struct sched_domain_shared *shared = env->sd->shared;
struct sched_domain *child = env->sd->child;
struct sched_group *sg = env->sd->groups;
struct sg_lb_stats *local = &sds->local_stat;
@@ -7559,6 +7596,7 @@ static inline void update_sd_lb_stats(st

next_group:
/* Now, start updating sd_lb_stats */
+ sds->total_running += sgs->sum_nr_running;
sds->total_load += sgs->group_load;
sds->total_capacity += sgs->group_capacity;

@@ -7574,6 +7612,21 @@ static inline void update_sd_lb_stats(st
env->dst_rq->rd->overload = overload;
}

+ if (!shared)
+ return;
+
+ /*
+ * Since these are sums over groups they can contain some CPUs
+ * multiple times for the NUMA domains.
+ *
+ * Currently only wake_affine_llc() and find_busiest_group()
+ * uses these numbers, only the last is affected by this problem.
+ *
+ * XXX fix that.
+ */
+ WRITE_ONCE(shared->nr_running, sds->total_running);
+ WRITE_ONCE(shared->load, sds->total_load);
+ WRITE_ONCE(shared->capacity, sds->total_capacity);
}

/**
@@ -7803,6 +7856,7 @@ static struct sched_group *find_busiest_
if (!sds.busiest || busiest->sum_nr_running == 0)
goto out_balanced;

+ /* XXX broken for overlapping NUMA groups */
sds.avg_load = (SCHED_CAPACITY_SCALE * sds.total_load)
/ sds.total_capacity;
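
One implicit detail of the consumer side above: get_llc_stats() does rcu_dereference() without taking any lock itself, so it presumably relies on the wakeup path already running inside an RCU read-side critical section. A hypothetical standalone reader (the helper name is made up; only the access pattern mirrors the patch) would look roughly like:

static unsigned long llc_load_of(int cpu)
{
	struct sched_domain_shared *sds;
	unsigned long load = 0;

	rcu_read_lock();
	sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
	if (sds)
		load = READ_ONCE(sds->load);
	rcu_read_unlock();

	return load;
}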


2017-08-24 22:30:24

by Chris Wilson

[permalink] [raw]
Subject: Re: [PATCH] sched/fair: Fix wake_affine() for !NUMA_BALANCING

We've stumbled across the same regression on Broxton/Apollolake (J3455).

Quoting Peter Zijlstra (2017-08-01 22:43:07)
> On Tue, Aug 01, 2017 at 03:26:51PM -0400, Josef Bacik wrote:
> > > @@ -7574,6 +7607,18 @@ static inline void update_sd_lb_stats(st
> > > env->dst_rq->rd->overload = overload;
> > > }
> > >
> > > + /*
> > > + * Since these are sums over groups they can contain some CPUs
> > > + * multiple times for the NUMA domains.
> > > + *
> > > + * Currently only wake_affine_llc() and find_busiest_group()
> > > + * uses these numbers, only the last is affected by this problem.
> > > + *
> > > + * XXX fix that.
> > > + */
> > > + WRITE_ONCE(shared->nr_running, sds->total_running);
> > > + WRITE_ONCE(shared->load, sds->total_load);
> > > + WRITE_ONCE(shared->capacity, sds->total_capacity);
> >
> > This panics on boot for me because shared is NULL. Same happens in
> > select_task_rq_fair when it tries to do the READ_ONCE. Here is my .config in
> > case it's something strange with my config. Thanks,
>
> Nah, it's just me being an idiot and booting the wrong kernel. Unless I
> messed up again, this one boots.
>
> There is state during boot and hotplug where there are no domains, and
> thus also no shared. Simply ignoring things when that happens should be
> good enough I think.

This is still not as effective as the previous code in spreading across
siblings.

> +/*
> + * Can a task be moved from prev_cpu to this_cpu without causing a load
> + * imbalance that would trigger the load balancer?
> + *
> + * Since we're running on 'stale' values, we might in fact create an imbalance
> + * but recomputing these values is expensive, as that'd mean iteration 2 cache
> + * domains worth of CPUs.
> + */
> +static bool
> +wake_affine_llc(struct sched_domain *sd, struct task_struct *p,
> + int this_cpu, int prev_cpu, int sync)
> +{
> + struct llc_stats prev_stats, this_stats;
> + s64 this_eff_load, prev_eff_load;
> + unsigned long task_load;
> +
> + get_llc_stats(&prev_stats, prev_cpu);
> + get_llc_stats(&this_stats, this_cpu);
> +
> + /*
> + * If sync wakeup then subtract the (maximum possible)
> + * effect of the currently running task from the load
> + * of the current LLC.
> + */
> + if (sync) {
> + unsigned long current_load = task_h_load(current);
> +
> + /* in this case load hits 0 and this LLC is considered 'idle' */
> + if (current_load > this_stats.load)
> + return true;
> +
> + this_stats.load -= current_load;
> + }
> +
> + /*
> + * The has_capacity stuff is not SMT aware, but by trying to balance
> + * the nr_running on both ends we try and fill the domain at equal
> + * rates, thereby first consuming cores before siblings.
> + */
> +
> + /* if the old cache has capacity, stay there */
> + if (prev_stats.has_capacity && prev_stats.nr_running < this_stats.nr_running+1)
> + return false;
> +
> + /* if this cache has capacity, come here */
> + if (this_stats.has_capacity && this_stats.nr_running < prev_stats.nr_running+1)

I think you mean,

if (this_stats.has_capacity && this_stats.nr_running + 1 < prev_stats.nr_running)

and with that our particular graphics benchmark behaves much as it did
before (the regression appears fixed). But I'll let Eero double-check.

> + return true;
> +
> +
> + /*
> + * Check to see if we can move the load without causing too much
> + * imbalance.
> + */
> + task_load = task_h_load(p);
> +
> + this_eff_load = 100;
> + this_eff_load *= prev_stats.capacity;
> +
> + prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
> + prev_eff_load *= this_stats.capacity;
> +
> + this_eff_load *= this_stats.load + task_load;
> + prev_eff_load *= prev_stats.load - task_load;
> +
> + return this_eff_load <= prev_eff_load;
> +}
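
For what it's worth, the posted and the suggested conditions only diverge when the two caches are close in occupancy. A standalone comparison with hypothetical nr_running values, ignoring has_capacity (not kernel code, just arithmetic):

#include <stdbool.h>
#include <stdio.h>

/* condition as posted: pull when this LLC runs no more tasks than prev */
static bool pull_as_posted(unsigned long this_nr, unsigned long prev_nr)
{
	return this_nr < prev_nr + 1;		/* this_nr <= prev_nr */
}

/* condition as suggested above: pull only when prev is at least two ahead */
static bool pull_as_suggested(unsigned long this_nr, unsigned long prev_nr)
{
	return this_nr + 1 < prev_nr;		/* prev_nr >= this_nr + 2 */
}

int main(void)
{
	/* equally busy LLCs: posted version pulls, suggested version stays put */
	printf("%d %d\n", pull_as_posted(2, 2), pull_as_suggested(2, 2)); /* 1 0 */
	/* clearly busier previous LLC: both agree on pulling */
	printf("%d %d\n", pull_as_posted(2, 4), pull_as_suggested(2, 4)); /* 1 1 */
	return 0;
}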

2017-08-25 15:46:43

by Chris Wilson

[permalink] [raw]
Subject: Re: [PATCH] sched/fair: Fix wake_affine() for !NUMA_BALANCING

Quoting Peter Zijlstra (2017-08-01 22:43:07)
> @@ -5356,20 +5295,115 @@ static int wake_wide(struct task_struct
> return 1;
> }
>
> +struct llc_stats {
> + unsigned long nr_running;
> + unsigned long load;
> + unsigned long capacity;
> + int has_capacity;
> +};
> +
> +static void get_llc_stats(struct llc_stats *stats, int cpu)
> +{
> + struct sched_domain_shared *sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
> +
> + if (!sds) {
> + memset(&stats, 0, sizeof(*stats));
^
Just in case, stray & before stats.
-Chris
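
Presumably the intended version simply drops the stray '&' so that the caller's structure, not the local pointer, gets zeroed. Reconstructed from the patch above (untested):

static void get_llc_stats(struct llc_stats *stats, int cpu)
{
	struct sched_domain_shared *sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));

	if (!sds) {
		memset(stats, 0, sizeof(*stats));	/* zero *stats, not &stats */
		return;
	}

	stats->nr_running = READ_ONCE(sds->nr_running);
	stats->load = READ_ONCE(sds->load);
	stats->capacity = READ_ONCE(sds->capacity);
	stats->has_capacity = stats->nr_running < per_cpu(sd_llc_size, cpu);
}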