2021-10-28 13:06:31

by Mel Gorman

Subject: [PATCH] sched/fair: Adjust the allowed NUMA imbalance when SD_NUMA spans multiple LLCs

Commit 7d2b5dd0bcc4 ("sched/numa: Allow a floating imbalance between NUMA
nodes") allowed an imbalance between NUMA nodes such that communicating
tasks would not be pulled apart by the load balancer. This works fine when
there is a 1:1 relationship between LLC and node but can be suboptimal
for multiple LLCs if independent tasks prematurely use CPUs sharing cache.

Zen* has multiple LLCs per node with local memory channels and due to
the allowed imbalance, it's far harder to tune some workloads to run
optimally than it is on hardware that has 1 LLC per node. This patch
adjusts the imbalance on multi-LLC machines to allow an imbalance up to
the point where LLCs should be balanced between nodes.

On a Zen3 machine running STREAM parallelised with OMP to have one
instance per LLC and without binding, the results are

stream
                            5.15.0-rc3             5.15.0-rc3
                               vanilla     sched-numaimb-v1r2
MB/sec copy-16     166652.10 (   0.00%)    534760.46 ( 220.88%)
MB/sec scale-16    141550.36 (   0.00%)    386871.58 ( 173.31%)
MB/sec add-16      156696.00 (   0.00%)    631731.80 ( 303.16%)
MB/sec triad-16    155560.36 (   0.00%)    622624.28 ( 300.25%)

STREAM can use directives to force the spread if the OpenMP
implementation is new enough, but that doesn't help if an application
uses threads and it's not known in advance how many threads will be
created.
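
For illustration, a minimal hedged sketch of the kind of OpenMP spread
placement meant here (a hypothetical example, not taken from STREAM or
from this patch; build with -fopenmp). Setting OMP_PROC_BIND=spread,
optionally with OMP_PLACES, in the environment achieves the same without
code changes:

#include <omp.h>
#include <stdio.h>

int main(void)
{
        /*
         * proc_bind(spread) (OpenMP 4.0+) asks the runtime to spread the
         * thread team across the available places rather than packing the
         * threads onto neighbouring CPUs.
         */
        #pragma omp parallel proc_bind(spread)
        {
                printf("thread %d of %d\n",
                       omp_get_thread_num(), omp_get_num_threads());
        }
        return 0;
}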

Coremark is a CPU and cache intensive benchmark parallelised with
pthreads. When running with 1 thread per instance, the vanilla kernel
allows threads to contend on cache. With the patch:

                               5.15.0-rc3             5.15.0-rc3
                                  vanilla     sched-numaimb-v1r2
Min       Score-16    366090.84 (   0.00%)   401505.65 (   9.67%)
Hmean     Score-16    391416.56 (   0.00%)   452546.28 *  15.62%*
Stddev    Score-16     16452.12 (   0.00%)    31480.31 ( -91.35%)
CoeffVar  Score-16         4.20 (   0.00%)        6.92 ( -64.99%)
Max       Score-16    416666.67 (   0.00%)   483529.77 (  16.05%)

It can also make a big difference for semi-realistic workloads like
specjbb, which can execute arbitrary numbers of threads without advance
knowledge of how they should be placed.

specjbb2005
                               5.15.0-rc3             5.15.0-rc3
                                  vanilla     sched-numaimb-v1r2
Hmean     tput-1       72211.33 (   0.00%)     69510.46 (  -3.74%)
Hmean     tput-8      564617.72 (   0.00%)    614862.80 *   8.90%*
Hmean     tput-16    1001427.52 (   0.00%)   1128073.47 *  12.65%*
Hmean     tput-24    1391106.98 (   0.00%)   1605210.23 *  15.39%*
Hmean     tput-32    1685885.77 (   0.00%)   1971077.42 *  16.92%*
Hmean     tput-40    1840316.70 (   0.00%)   2341328.12 *  27.22%*
Hmean     tput-48    1900286.97 (   0.00%)   2643100.06 *  39.09%*
Hmean     tput-56    2161832.49 (   0.00%)   2288492.08 (   5.86%)
Hmean     tput-64    1979696.79 (   0.00%)   2970706.40 *  50.06%*
Hmean     tput-72    2075744.37 (   0.00%)   3036188.04 *  46.27%*
Hmean     tput-80    2044842.51 (   0.00%)   3116143.03 *  52.39%*
Hmean     tput-88    2546189.47 (   0.00%)   3095464.00 *  21.57%*
Hmean     tput-96    2775456.33 (   0.00%)   2628754.25 (  -5.29%)
Hmean     tput-104   2591994.59 (   0.00%)   3081532.21 *  18.89%*
Hmean     tput-112   2817717.85 (   0.00%)   2932890.32 (   4.09%)
Hmean     tput-120   2525230.39 (   0.00%)   2967773.00 *  17.52%*
Hmean     tput-128   2709652.37 (   0.00%)   2912141.50 *   7.47%*

Signed-off-by: Mel Gorman <[email protected]>
---
 kernel/sched/fair.c     | 27 +++++++++++++++++----------
 kernel/sched/sched.h    |  1 +
 kernel/sched/topology.c | 15 +++++++++++++++
 3 files changed, 33 insertions(+), 10 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ff69f245b939..fda58bcbb1c0 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1545,7 +1545,7 @@ struct task_numa_env {
static unsigned long cpu_load(struct rq *rq);
static unsigned long cpu_runnable(struct rq *rq);
static unsigned long cpu_util(int cpu);
-static inline long adjust_numa_imbalance(int imbalance,
+static inline long adjust_numa_imbalance(int imbalance, int dst_cpu,
int dst_running, int dst_weight);

static inline enum
@@ -1926,8 +1926,8 @@ static void task_numa_find_cpu(struct task_numa_env *env,
src_running = env->src_stats.nr_running - 1;
dst_running = env->dst_stats.nr_running + 1;
imbalance = max(0, dst_running - src_running);
- imbalance = adjust_numa_imbalance(imbalance, dst_running,
- env->dst_stats.weight);
+ imbalance = adjust_numa_imbalance(imbalance, env->dst_cpu,
+ dst_running, env->dst_stats.weight);

/* Use idle CPU if there is no imbalance */
if (!imbalance) {
@@ -8989,9 +8989,13 @@ static bool update_pick_idlest(struct sched_group *idlest,
* This is an approximation as the number of running tasks may not be
* related to the number of busy CPUs due to sched_setaffinity.
*/
-static inline bool allow_numa_imbalance(int dst_running, int dst_weight)
+static inline bool
+allow_numa_imbalance(int dst_cpu, int dst_running, int dst_weight)
{
- return (dst_running < (dst_weight >> 2));
+ /* Allowed NUMA imbalance */
+ dst_weight >>= per_cpu(sd_numaimb_shift, dst_cpu);
+
+ return dst_running < dst_weight;
}

/*
@@ -9111,8 +9115,9 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)

case group_has_spare:
if (sd->flags & SD_NUMA) {
+ int idlest_cpu = cpumask_first(sched_group_span(idlest));
+
#ifdef CONFIG_NUMA_BALANCING
- int idlest_cpu;
/*
* If there is spare capacity at NUMA, try to select
* the preferred node
@@ -9120,7 +9125,6 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
if (cpu_to_node(this_cpu) == p->numa_preferred_nid)
return NULL;

- idlest_cpu = cpumask_first(sched_group_span(idlest));
if (cpu_to_node(idlest_cpu) == p->numa_preferred_nid)
return idlest;
#endif
@@ -9130,8 +9134,10 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
* a real need of migration, periodic load balance will
* take care of it.
*/
- if (allow_numa_imbalance(local_sgs.sum_nr_running, sd->span_weight))
+ if (allow_numa_imbalance(idlest_cpu,
+ local_sgs.sum_nr_running, sd->span_weight)) {
return NULL;
+ }
}

/*
@@ -9221,10 +9227,10 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd

#define NUMA_IMBALANCE_MIN 2

-static inline long adjust_numa_imbalance(int imbalance,
+static inline long adjust_numa_imbalance(int imbalance, int dst_cpu,
int dst_running, int dst_weight)
{
- if (!allow_numa_imbalance(dst_running, dst_weight))
+ if (!allow_numa_imbalance(dst_cpu, dst_running, dst_weight))
return imbalance;

/*
@@ -9336,6 +9342,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
/* Consider allowing a small imbalance between NUMA groups */
if (env->sd->flags & SD_NUMA) {
env->imbalance = adjust_numa_imbalance(env->imbalance,
+ env->src_cpu,
busiest->sum_nr_running, busiest->group_weight);
}

diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 3d3e5793e117..f2620d6b9918 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1777,6 +1777,7 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
DECLARE_PER_CPU(struct sched_domain __rcu *, sd_llc);
DECLARE_PER_CPU(int, sd_llc_size);
DECLARE_PER_CPU(int, sd_llc_id);
+DECLARE_PER_CPU(int, sd_numaimb_shift);
DECLARE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared);
DECLARE_PER_CPU(struct sched_domain __rcu *, sd_numa);
DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing);
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 4e8698e62f07..08fb02510967 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -644,6 +644,7 @@ static void destroy_sched_domains(struct sched_domain *sd)
DEFINE_PER_CPU(struct sched_domain __rcu *, sd_llc);
DEFINE_PER_CPU(int, sd_llc_size);
DEFINE_PER_CPU(int, sd_llc_id);
+DEFINE_PER_CPU(int, sd_numaimb_shift);
DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared);
DEFINE_PER_CPU(struct sched_domain __rcu *, sd_numa);
DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing);
@@ -672,6 +673,20 @@ static void update_top_cache_domain(int cpu)
sd = lowest_flag_domain(cpu, SD_NUMA);
rcu_assign_pointer(per_cpu(sd_numa, cpu), sd);

+	/*
+	 * Save the threshold where an imbalance is allowed between SD_NUMA
+	 * domains. If LLC spans the entire node, then imbalances are allowed
+	 * until 25% of the domain is active. Otherwise, allow an imbalance
+	 * up to the point where LLCs between NUMA nodes should be balanced
+	 * to maximise cache and memory bandwidth utilisation.
+	 */
+	if (sd) {
+		if (sd->span_weight == size)
+			per_cpu(sd_numaimb_shift, cpu) = 2;
+		else
+			per_cpu(sd_numaimb_shift, cpu) = max(2, ilog2(sd->span_weight / size * num_online_nodes()));
+	}
+
sd = highest_flag_domain(cpu, SD_ASYM_PACKING);
rcu_assign_pointer(per_cpu(sd_asym_packing, cpu), sd);
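
For concreteness, a hedged sketch of what the shift calculation above works
out to. It is not part of the patch; the LLC size, SD_NUMA span and node
count are assumptions matching the Zen3 machine described later in the
thread:

#include <stdio.h>

/* Minimal stand-in for the kernel's ilog2(). */
static int ilog2_approx(unsigned int v)
{
        int r = -1;

        while (v) {
                v >>= 1;
                r++;
        }
        return r;
}

int main(void)
{
        unsigned int llc_size  = 16;    /* assumed sd_llc_size (one CCX) */
        unsigned int numa_span = 256;   /* assumed lowest SD_NUMA span: 2 nodes x 128 CPUs */
        unsigned int nr_nodes  = 2;     /* assumed num_online_nodes() */
        int shift;

        /* Mirrors the else branch above: max(2, ilog2(256 / 16 * 2)) == 5. */
        shift = ilog2_approx(numa_span / llc_size * nr_nodes);
        if (shift < 2)
                shift = 2;

        /* allow_numa_imbalance() then shifts dst_weight by 5 instead of the
         * old fixed 2, shrinking the "allowed imbalance" window on nodes
         * with multiple LLCs. */
        printf("sd_numaimb_shift = %d\n", shift);
        return 0;
}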


2021-11-05 19:46:12

by Srinivasan, Sadagopan

Subject: RE: [PATCH] sched/fair: Adjust the allowed NUMA imbalance when SD_NUMA spans multiple LLCs

+Krupa


2021-11-05 21:29:13

by Valentin Schneider

Subject: Re: [PATCH] sched/fair: Adjust the allowed NUMA imbalance when SD_NUMA spans multiple LLCs

On 28/10/21 14:03, Mel Gorman wrote:
> Commit 7d2b5dd0bcc4 ("sched/numa: Allow a floating imbalance between NUMA
> nodes") allowed an imbalance between NUMA nodes such that communicating
> tasks would not be pulled apart by the load balancer. This works fine when
> there is a 1:1 relationship between LLC and node but can be suboptimal
> for multiple LLCs if independent tasks prematurely use CPUs sharing cache.
>
> Zen* has multiple LLCs per node with local memory channels and due to
> the allowed imbalance, it's far harder to tune some workloads to run
> optimally than it is on hardware that has 1 LLC per node. This patch
> adjusts the imbalance on multi-LLC machines to allow an imbalance up to
> the point where LLCs should be balanced between nodes.
>

I've run out of brain juice for today and didn't get to decipher the logic
you're implementing, but for now I do have a comment on the topology
detection side of things (see inline).

> --- a/kernel/sched/topology.c
> +++ b/kernel/sched/topology.c
> @@ -644,6 +644,7 @@ static void destroy_sched_domains(struct sched_domain *sd)
> DEFINE_PER_CPU(struct sched_domain __rcu *, sd_llc);
> DEFINE_PER_CPU(int, sd_llc_size);
> DEFINE_PER_CPU(int, sd_llc_id);
> +DEFINE_PER_CPU(int, sd_numaimb_shift);
> DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared);
> DEFINE_PER_CPU(struct sched_domain __rcu *, sd_numa);
> DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing);
> @@ -672,6 +673,20 @@ static void update_top_cache_domain(int cpu)
> sd = lowest_flag_domain(cpu, SD_NUMA);
> rcu_assign_pointer(per_cpu(sd_numa, cpu), sd);
>
> + /*
> + * Save the threshold where an imbalance is allowed between SD_NUMA
> + * domains. If LLC spans the entire node, then imbalances are allowed
> + * until 25% of the domain is active. Otherwise, allow an imbalance
> + * up to the point where LLCs between NUMA nodes should be balanced
> + * to maximise cache and memory bandwidth utilisation.
> + */
> + if (sd) {
> + if (sd->span_weight == size)
> + per_cpu(sd_numaimb_shift, cpu) = 2;
> + else
> + per_cpu(sd_numaimb_shift, cpu) = max(2, ilog2(sd->span_weight / size * num_online_nodes()));
> + }
> +

So nodes are covered by the NODE topology level which *doesn't* have
SD_NUMA set. I always get confused on how MC/DIE/NODE is supposed to look
on those sub-NUMA clustering thingies, but either way consider:

NUMA-20 [                          ]
NODE    [            ][            ]
DIE     [            ][            ]
MC      [     ][     ][     ][     ]

NODE level gets degenerated, update_top_cache_domain() is invoked with:

NUMA-20 [                          ]
DIE     [            ][            ]
MC      [     ][     ][     ][     ]

That lowest_flag_domain(cpu, SD_NUMA) will span the entire system.

Conversely, with this topology where node == LLC:

NUMA-20 [                          ]
NODE    [            ][            ]
DIE     [            ][            ]
MC      [            ][            ]

You get

NUMA-20 [                          ]
MC      [            ][            ]

lowest_flag_domain(cpu, SD_NUMA)->span_weight > size, even though LLC =
node.

Long story short, I think you want to use sd->child here - that *should*
point to a domain that spans exactly one node (it's gonna be NODE, or some
other domain that has the same span because NODE was degenerated).

> sd = highest_flag_domain(cpu, SD_ASYM_PACKING);
> rcu_assign_pointer(per_cpu(sd_asym_packing, cpu), sd);
>

2021-11-08 14:59:03

by Mel Gorman

Subject: Re: [PATCH] sched/fair: Adjust the allowed NUMA imbalance when SD_NUMA spans multiple LLCs

On Fri, Nov 05, 2021 at 06:22:52PM +0000, Valentin Schneider wrote:
> On 28/10/21 14:03, Mel Gorman wrote:
> > Commit 7d2b5dd0bcc4 ("sched/numa: Allow a floating imbalance between NUMA
> > nodes") allowed an imbalance between NUMA nodes such that communicating
> > tasks would not be pulled apart by the load balancer. This works fine when
> > there is a 1:1 relationship between LLC and node but can be suboptimal
> > for multiple LLCs if independent tasks prematurely use CPUs sharing cache.
> >
> > Zen* has multiple LLCs per node with local memory channels and due to
> > the allowed imbalance, it's far harder to tune some workloads to run
> > optimally than it is on hardware that has 1 LLC per node. This patch
> > adjusts the imbalance on multi-LLC machines to allow an imbalance up to
> > the point where LLCs should be balanced between nodes.
> >
>
> I've run out of brain juice for today and didn't get to decipher the logic
> you're implementing, but for now I do have a comment on the topology
> detection side of things (see inline).
>
> > --- a/kernel/sched/topology.c
> > +++ b/kernel/sched/topology.c
> > @@ -644,6 +644,7 @@ static void destroy_sched_domains(struct sched_domain *sd)
> > DEFINE_PER_CPU(struct sched_domain __rcu *, sd_llc);
> > DEFINE_PER_CPU(int, sd_llc_size);
> > DEFINE_PER_CPU(int, sd_llc_id);
> > +DEFINE_PER_CPU(int, sd_numaimb_shift);
> > DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared);
> > DEFINE_PER_CPU(struct sched_domain __rcu *, sd_numa);
> > DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing);
> > @@ -672,6 +673,20 @@ static void update_top_cache_domain(int cpu)
> > sd = lowest_flag_domain(cpu, SD_NUMA);
> > rcu_assign_pointer(per_cpu(sd_numa, cpu), sd);
> >
> > + /*
> > + * Save the threshold where an imbalance is allowed between SD_NUMA
> > + * domains. If LLC spans the entire node, then imbalances are allowed
> > + * until 25% of the domain is active. Otherwise, allow an imbalance
> > + * up to the point where LLCs between NUMA nodes should be balanced
> > + * to maximise cache and memory bandwidth utilisation.
> > + */
> > + if (sd) {
> > + if (sd->span_weight == size)
> > + per_cpu(sd_numaimb_shift, cpu) = 2;
> > + else
> > + per_cpu(sd_numaimb_shift, cpu) = max(2, ilog2(sd->span_weight / size * num_online_nodes()));
> > + }
> > +
>
> So nodes are covered by the NODE topology level which *doesn't* have
> SD_NUMA set. I always get confused on how MC/DIE/NODE is supposed to look
> on those sub-NUMA clustering thingies, but either way consider:
>

The Zen machines don't have sub-NUMA clustering as such; each LLC is not
represented as a separate NUMA node. For example, a Zen3 machine looks
like this:

available: 2 nodes (0-1)
node 0 cpus: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191
node 0 size: 128278 MB
node 0 free: 126193 MB
node 1 cpus: 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255
node 1 size: 111571 MB
node 1 free: 109755 MB
node distances:
node   0   1
  0:  10  32
  1:  32  10

Each node has 8 LLCs that are treated as if they are an equal distance
from memory, so node != LLC, but for peak memory utilisation it's best to
spread based on LLC and not just NUMA distance.

Hence, the intent of the patch is "allow some NUMA imbalance but not
enough imbalance that tasks share LLC prematurely". I didn't see an
obvious way of doing that with SD flag trickery.
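
As a hedged back-of-the-envelope illustration of that intent (the numbers
assume the machine above and the find_idlest_group() path of the patch;
this is not code from the patch itself):

#include <stdio.h>

int main(void)
{
        /* Assumed: the lowest SD_NUMA domain spans both nodes (256 CPUs)
         * and each node has 8 LLCs of 16 CPUs. */
        int numa_span = 256;

        /* Old 25% rule: keep new tasks on the local node until 64 are
         * running there, which can stack several independent tasks on
         * each of the 8 local LLCs. */
        int vanilla_threshold = numa_span >> 2;

        /* With sd_numaimb_shift == 5: keep them local only until there is
         * roughly one task per local LLC, then use the other node. */
        int patched_threshold = numa_span >> 5;

        printf("vanilla %d vs patched %d\n",
               vanilla_threshold, patched_threshold);
        return 0;
}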

--
Mel Gorman
SUSE Labs

2021-11-08 15:44:45

by Peter Zijlstra

Subject: Re: [PATCH] sched/fair: Adjust the allowed NUMA imbalance when SD_NUMA spans multiple LLCs

On Thu, Oct 28, 2021 at 02:03:05PM +0100, Mel Gorman wrote:

> @@ -1926,8 +1926,8 @@ static void task_numa_find_cpu(struct task_numa_env *env,
> src_running = env->src_stats.nr_running - 1;
> dst_running = env->dst_stats.nr_running + 1;
> imbalance = max(0, dst_running - src_running);
> - imbalance = adjust_numa_imbalance(imbalance, dst_running,
> - env->dst_stats.weight);
> + imbalance = adjust_numa_imbalance(imbalance, env->dst_cpu,
> + dst_running, env->dst_stats.weight);

Can we please align at (0 ?

>
> /* Use idle CPU if there is no imbalance */
> if (!imbalance) {

> diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
> index 4e8698e62f07..08fb02510967 100644
> --- a/kernel/sched/topology.c
> +++ b/kernel/sched/topology.c
> @@ -644,6 +644,7 @@ static void destroy_sched_domains(struct sched_domain *sd)
> DEFINE_PER_CPU(struct sched_domain __rcu *, sd_llc);
> DEFINE_PER_CPU(int, sd_llc_size);
> DEFINE_PER_CPU(int, sd_llc_id);
> +DEFINE_PER_CPU(int, sd_numaimb_shift);

Why does it make sense for this to be a per-cpu variable? Yes, I suppose
people can get creative with cpusets, but what you're trying to capture
seems like a global system property, no?

At most this seems to want to be a sched_domain value.

> DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared);
> DEFINE_PER_CPU(struct sched_domain __rcu *, sd_numa);
> DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing);
> @@ -672,6 +673,20 @@ static void update_top_cache_domain(int cpu)
> sd = lowest_flag_domain(cpu, SD_NUMA);
> rcu_assign_pointer(per_cpu(sd_numa, cpu), sd);
>
> + /*
> + * Save the threshold where an imbalance is allowed between SD_NUMA
> + * domains. If LLC spans the entire node, then imbalances are allowed
> + * until 25% of the domain is active. Otherwise, allow an imbalance
> + * up to the point where LLCs between NUMA nodes should be balanced
> + * to maximise cache and memory bandwidth utilisation.
> + */
> + if (sd) {
> + if (sd->span_weight == size)
> + per_cpu(sd_numaimb_shift, cpu) = 2;
> + else
> + per_cpu(sd_numaimb_shift, cpu) = max(2, ilog2(sd->span_weight / size * num_online_nodes()));
> + }
> +
> sd = highest_flag_domain(cpu, SD_ASYM_PACKING);
> rcu_assign_pointer(per_cpu(sd_asym_packing, cpu), sd);

I think I'm with Valentin here, this seems like something that wants to
use the sd/sd->child relation.

That also makes this the wrong place to do things since this is after
the degenerate code.

Perhaps this can be done in sd_init(), after all, we build the thing
bottom-up, so by the time we initialize the NODE, the MC level should
already be present.

I'm thinking you can perhaps use something like:

        if (!(sd->flags & SD_SHARE_PKG_RESOURCES) &&
            (child->flags & SD_SHARE_PKG_RESOURCES)) {

                /* this is the first domain not sharing LLC */
                sd->new_magic_imb = /* magic incantation goes here */
        }

2021-11-08 16:02:44

by Mel Gorman

Subject: Re: [PATCH] sched/fair: Adjust the allowed NUMA imbalance when SD_NUMA spans multiple LLCs

On Mon, Nov 08, 2021 at 12:14:27PM +0100, Peter Zijlstra wrote:
> On Thu, Oct 28, 2021 at 02:03:05PM +0100, Mel Gorman wrote:
>
> > @@ -1926,8 +1926,8 @@ static void task_numa_find_cpu(struct task_numa_env *env,
> > src_running = env->src_stats.nr_running - 1;
> > dst_running = env->dst_stats.nr_running + 1;
> > imbalance = max(0, dst_running - src_running);
> > - imbalance = adjust_numa_imbalance(imbalance, dst_running,
> > - env->dst_stats.weight);
> > + imbalance = adjust_numa_imbalance(imbalance, env->dst_cpu,
> > + dst_running, env->dst_stats.weight);
>
> Can we please align at (0 ?
>

i.e.
        imbalance = adjust_numa_imbalance(imbalance, env->dst_cpu,
                                          dst_running,
                                          env->dst_stats.weight);

?

> >
> > /* Use idle CPU if there is no imbalance */
> > if (!imbalance) {
>
> > diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
> > index 4e8698e62f07..08fb02510967 100644
> > --- a/kernel/sched/topology.c
> > +++ b/kernel/sched/topology.c
> > @@ -644,6 +644,7 @@ static void destroy_sched_domains(struct sched_domain *sd)
> > DEFINE_PER_CPU(struct sched_domain __rcu *, sd_llc);
> > DEFINE_PER_CPU(int, sd_llc_size);
> > DEFINE_PER_CPU(int, sd_llc_id);
> > +DEFINE_PER_CPU(int, sd_numaimb_shift);
>
> Why does it make sense for this to be a per-cpu variable? Yes, I suppose
> people can get creative with cpusets, but what you're trying to capture
> seems like a global system property, no?
>

I thought things might get weird around CPU hotplug and as llc_size was
tracked per-cpu, I thought it made sense to also do it for
sd_numaimb_shift.

> At most this seems to want to be a sched_domain value.
>
> > DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared);
> > DEFINE_PER_CPU(struct sched_domain __rcu *, sd_numa);
> > DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing);
> > @@ -672,6 +673,20 @@ static void update_top_cache_domain(int cpu)
> > sd = lowest_flag_domain(cpu, SD_NUMA);
> > rcu_assign_pointer(per_cpu(sd_numa, cpu), sd);
> >
> > + /*
> > + * Save the threshold where an imbalance is allowed between SD_NUMA
> > + * domains. If LLC spans the entire node, then imbalances are allowed
> > + * until 25% of the domain is active. Otherwise, allow an imbalance
> > + * up to the point where LLCs between NUMA nodes should be balanced
> > + * to maximise cache and memory bandwidth utilisation.
> > + */
> > + if (sd) {
> > + if (sd->span_weight == size)
> > + per_cpu(sd_numaimb_shift, cpu) = 2;
> > + else
> > + per_cpu(sd_numaimb_shift, cpu) = max(2, ilog2(sd->span_weight / size * num_online_nodes()));
> > + }
> > +
> > sd = highest_flag_domain(cpu, SD_ASYM_PACKING);
> > rcu_assign_pointer(per_cpu(sd_asym_packing, cpu), sd);
>
> I think I'm with Valentin here, this seems like something that wants to
> use the sd/sd->child relation.
>
> That also makes this the wrong place to do things since this is after
> the degenerate code.
>
> Perhaps this can be done in sd_init(), after all, we build the thing
> bottom-up, so by the time we initialize the NODE, the MC level should
> already be present.
>
> I'm thinking you can perhaps use something like:
>
>         if (!(sd->flags & SD_SHARE_PKG_RESOURCES) &&
>             (child->flags & SD_SHARE_PKG_RESOURCES)) {
>
>                 /* this is the first domain not sharing LLC */
>                 sd->new_magic_imb = /* magic incantation goes here */
>         }

Thanks, I'll give it a shot and see what I come up with, it'll probably
take me a few days to clear my table of other crud to focus on it.

--
Mel Gorman
SUSE Labs

2021-11-08 16:59:48

by Peter Zijlstra

Subject: Re: [PATCH] sched/fair: Adjust the allowed NUMA imbalance when SD_NUMA spans multiple LLCs

On Mon, Nov 08, 2021 at 11:59:48AM +0000, Mel Gorman wrote:
> On Mon, Nov 08, 2021 at 12:14:27PM +0100, Peter Zijlstra wrote:
> > On Thu, Oct 28, 2021 at 02:03:05PM +0100, Mel Gorman wrote:
> >
> > > @@ -1926,8 +1926,8 @@ static void task_numa_find_cpu(struct task_numa_env *env,
> > > src_running = env->src_stats.nr_running - 1;
> > > dst_running = env->dst_stats.nr_running + 1;
> > > imbalance = max(0, dst_running - src_running);
> > > - imbalance = adjust_numa_imbalance(imbalance, dst_running,
> > > - env->dst_stats.weight);
> > > + imbalance = adjust_numa_imbalance(imbalance, env->dst_cpu,
> > > + dst_running, env->dst_stats.weight);
> >
> > Can we please align at (0 ?
> >
>
> i.e.
> imbalance = adjust_numa_imbalance(imbalance, env->dst_cpu,
> dst_running,
> env->dst_stats.weight);
>
> ?

Yep. For those using vim: :set cino=(0:0

Might as well clean that up while we touch the thing anyway.

> > >
> > > /* Use idle CPU if there is no imbalance */
> > > if (!imbalance) {
> >
> > > diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
> > > index 4e8698e62f07..08fb02510967 100644
> > > --- a/kernel/sched/topology.c
> > > +++ b/kernel/sched/topology.c
> > > @@ -644,6 +644,7 @@ static void destroy_sched_domains(struct sched_domain *sd)
> > > DEFINE_PER_CPU(struct sched_domain __rcu *, sd_llc);
> > > DEFINE_PER_CPU(int, sd_llc_size);
> > > DEFINE_PER_CPU(int, sd_llc_id);
> > > +DEFINE_PER_CPU(int, sd_numaimb_shift);
> >
> > Why does it make sense for this to be a per-cpu variable? Yes, I suppose
> > people can get creative with cpusets, but what you're trying to capture
> > seems like a global system property, no?
> >
>
> I thought things might get weird around CPU hotplug and as llc_size was
> tracked per-cpu, I thought it made sense to also do it for
> sd_numaimb_shift.

Ah, there were performance arguments for llc_id (saves a bunch of
indirections trying to look up the LLC domain) and llc_size IIRC. While
in this case, the user actually has a struct sched_domain handy.


> > I'm thinking you can perhaps use something like:
> >
> >         if (!(sd->flags & SD_SHARE_PKG_RESOURCES) &&
> >             (child->flags & SD_SHARE_PKG_RESOURCES)) {
> >
> >                 /* this is the first domain not sharing LLC */
> >                 sd->new_magic_imb = /* magic incantation goes here */
> >         }
>
> Thanks, I'll give it a shot and see what I come up with, it'll probably
> take me a few days to clear my table of other crud to focus on it.

Sure thing.

2021-11-11 09:38:26

by Mel Gorman

Subject: Re: [PATCH] sched/fair: Adjust the allowed NUMA imbalance when SD_NUMA spans multiple LLCs

On Mon, Nov 08, 2021 at 03:21:05PM +0100, Peter Zijlstra wrote:
> > > > diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
> > > > index 4e8698e62f07..08fb02510967 100644
> > > > --- a/kernel/sched/topology.c
> > > > +++ b/kernel/sched/topology.c
> > > > @@ -644,6 +644,7 @@ static void destroy_sched_domains(struct sched_domain *sd)
> > > > DEFINE_PER_CPU(struct sched_domain __rcu *, sd_llc);
> > > > DEFINE_PER_CPU(int, sd_llc_size);
> > > > DEFINE_PER_CPU(int, sd_llc_id);
> > > > +DEFINE_PER_CPU(int, sd_numaimb_shift);
> > >
> > > Why does it make sense for this to be a per-cpu variable? Yes, I suppose
> > > people can get creative with cpusets, but what you're trying to capture
> > > seems like a global system property, no?
> > >
> >
> > I thought things might get weird around CPU hotplug and as llc_size was
> > tracked per-cpu, I thought it made sense to also do it for
> > sd_numaimb_shift.
>
> Ah, there were performance arguments for llc_id (saves a bunch of
> indirections trying to look up the LLC domain) and llc_size IIRC. While
> in this case, the user actually has a struct sched_domain handy.
>

Very true, or at least had very recent access to it.

> > > I think I'm with Valentin here, this seems like something that wants to
> > > use the sd/sd->child relation.
> > >
> > > That also makes this the wrong place to do things since this is after
> > > the degenerate code.
> > >
> > > Perhaps this can be done in sd_init(), after all, we build the thing
> > > bottom-up, so by the time we initialize the NODE, the MC level should
> > > already be present.
> > >
> > > I'm thinking you can perhaps use something like:
> > >
> > >         if (!(sd->flags & SD_SHARE_PKG_RESOURCES) &&
> > >             (child->flags & SD_SHARE_PKG_RESOURCES)) {
> > >
> > >                 /* this is the first domain not sharing LLC */
> > >                 sd->new_magic_imb = /* magic incantation goes here */
> > >         }
> >
> > Thanks, I'll give it a shot and see what I come up with, it'll probably
> > take me a few days to clear my table of other crud to focus on it.
>
> Sure thing.

At sd_init time, the sd span_weights and groups have not been fully
calculated yet, so while the child domains are partially initialised,
we are missing some information, particularly the span of SD_NUMA (it
can be inferred but it's non-obvious).

build_sched_domains
  build_sched_domain
    sd_init
  (record span_weight)
  build_sched_groups

I thought it would be easier to figure out the allowed NUMA imbalance
after the groups are set up. Basing it on the sd/sd->child relation both
addresses Zen and removes the magic "25%" value, at least for 2-socket
machines, with a change in behaviour for larger machines to balance
cache usage versus memory bandwidth.
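
For reference, a hedged sketch of the arithmetic that the imb_numa_nr
calculation in the diff below implies for the Zen3 machine described
earlier in the thread (the spans and node count are assumptions, not
values taken from the patch):

/*
 * Hedged illustration, not part of the patch: assumed values for a
 * 2-node Zen3-like machine with 16-CPU LLCs and 128 CPUs per node.
 */
int example_imb_numa_nr(void)
{
        int sd_span    = 128;   /* first !SD_SHARE_PKG_RESOURCES domain: one node */
        int child_span = 16;    /* its child, the LLC (MC) domain */
        int nr_nodes   = 2;     /* assumed num_online_nodes() */

        int nr_groups   = sd_span / child_span; /* 8 LLCs per node */
        int imb_numa_nr = nr_groups / nr_nodes; /* 8 / 2 == 4 */

        /* The SD_NUMA parent then gets imb_numa_nr == 4, so
         * allow_numa_imbalance() tolerates an imbalance while fewer than
         * 4 tasks are running in the group being considered. */
        return imb_numa_nr;
}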

It also became obvious when working on this that there was an inconsistency
between the domains used by find_busiest_group and find_idlest_group
which deserves a separate patch but is included in this monolithic patch.

End result;

---8<--
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index 8f0f778b7c91..a8832e6e97f0 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -86,6 +86,7 @@ struct sched_domain {
unsigned int busy_factor; /* less balancing by factor if busy */
unsigned int imbalance_pct; /* No balance until over watermark */
unsigned int cache_nice_tries; /* Leave cache hot tasks for # tries */
+ unsigned int imb_numa_nr; /* Nr imbalanced tasks allowed between nodes */

int nohz_idle; /* NOHZ IDLE status */
int flags; /* See SD_* */
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index f6a05d9b5443..582b739a5f02 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1531,6 +1531,7 @@ struct task_numa_env {

int src_cpu, src_nid;
int dst_cpu, dst_nid;
+ int imb_numa_nr;

struct numa_stats src_stats, dst_stats;

@@ -1927,7 +1928,7 @@ static void task_numa_find_cpu(struct task_numa_env *env,
dst_running = env->dst_stats.nr_running + 1;
imbalance = max(0, dst_running - src_running);
imbalance = adjust_numa_imbalance(imbalance, dst_running,
- env->dst_stats.weight);
+ env->imb_numa_nr);

/* Use idle CPU if there is no imbalance */
if (!imbalance) {
@@ -1992,8 +1993,10 @@ static int task_numa_migrate(struct task_struct *p)
*/
rcu_read_lock();
sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu));
- if (sd)
+ if (sd) {
env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
+ env.imb_numa_nr = sd->imb_numa_nr;
+ }
rcu_read_unlock();

/*
@@ -8989,13 +8992,14 @@ static bool update_pick_idlest(struct sched_group *idlest,
}

/*
- * Allow a NUMA imbalance if busy CPUs is less than 25% of the domain.
- * This is an approximation as the number of running tasks may not be
- * related to the number of busy CPUs due to sched_setaffinity.
+ * Allow a NUMA imbalance if busy CPUs is less than the allowed
+ * imbalance. This is an approximation as the number of running
+ * tasks may not be related to the number of busy CPUs due to
+ * sched_setaffinity.
*/
-static inline bool allow_numa_imbalance(int dst_running, int dst_weight)
+static inline bool allow_numa_imbalance(int dst_running, int imb_numa_nr)
{
- return (dst_running < (dst_weight >> 2));
+ return dst_running < imb_numa_nr;
}

/*
@@ -9134,7 +9138,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
* a real need of migration, periodic load balance will
* take care of it.
*/
- if (allow_numa_imbalance(local_sgs.sum_nr_running, sd->span_weight))
+ if (allow_numa_imbalance(local_sgs.sum_nr_running, sd->imb_numa_nr))
return NULL;
}

@@ -9226,9 +9230,9 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
#define NUMA_IMBALANCE_MIN 2

static inline long adjust_numa_imbalance(int imbalance,
- int dst_running, int dst_weight)
+ int dst_running, int imb_numa_nr)
{
- if (!allow_numa_imbalance(dst_running, dst_weight))
+ if (!allow_numa_imbalance(dst_running, imb_numa_nr))
return imbalance;

/*
@@ -9340,7 +9344,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
/* Consider allowing a small imbalance between NUMA groups */
if (env->sd->flags & SD_NUMA) {
env->imbalance = adjust_numa_imbalance(env->imbalance,
- busiest->sum_nr_running, busiest->group_weight);
+ busiest->sum_nr_running, env->sd->imb_numa_nr);
}

return;
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 4e8698e62f07..a47cae382617 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -2227,6 +2227,30 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
}
}

+	/* Calculate allowed NUMA imbalance */
+	for_each_cpu(i, cpu_map) {
+		for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
+			struct sched_domain *child = sd->child;
+
+			if (!(sd->flags & SD_SHARE_PKG_RESOURCES) &&
+			    (child->flags & SD_SHARE_PKG_RESOURCES)) {
+				struct sched_domain *sd_numa = sd;
+				int imb_numa_nr, nr_groups;
+
+				nr_groups = sd->span_weight / child->span_weight;
+				imb_numa_nr = nr_groups / num_online_nodes();
+
+				while (sd_numa) {
+					if (sd_numa->flags & SD_NUMA) {
+						sd_numa->imb_numa_nr = imb_numa_nr;
+						break;
+					}
+					sd_numa = sd_numa->parent;
+				}
+			}
+		}
+	}
+
/* Calculate CPU capacity for physical packages and nodes */
for (i = nr_cpumask_bits-1; i >= 0; i--) {
if (!cpumask_test_cpu(i, cpu_map))

--
Mel Gorman
SUSE Labs