2021-10-15 17:57:26

by tip-bot2 for Tim Chen

Subject: [tip: sched/core] sched: Add cluster scheduler level for x86

The following commit has been merged into the sched/core branch of tip:

Commit-ID: 66558b730f2533cc2bf2b74d51f5f80b81e2bad0
Gitweb: https://git.kernel.org/tip/66558b730f2533cc2bf2b74d51f5f80b81e2bad0
Author: Tim Chen <[email protected]>
AuthorDate: Fri, 24 Sep 2021 20:51:04 +12:00
Committer: Peter Zijlstra <[email protected]>
CommitterDate: Fri, 15 Oct 2021 11:25:16 +02:00

sched: Add cluster scheduler level for x86

There are x86 CPU architectures (e.g. Jacobsville) where the L2 cache is
shared among a cluster of cores instead of being exclusive to one
single core.

To prevent oversubscription of the L2 cache, load should be balanced
between such L2 clusters, especially for tasks with no shared data.
On benchmarks such as the SPECrate mcf test, this change provides a
performance boost, especially on a medium-load system. On a Jacobsville
with 24 Atom cores, arranged into 6 clusters of 4 cores each, the
benchmark numbers are as follows:

Improvement over baseline kernel for mcf_r
copies     run time     base rate
   1        -0.1%         -0.2%
   6        25.1%         25.1%
  12        18.8%         19.0%
  24         0.3%          0.3%

So this looks pretty good. In terms of the system's task distribution,
some pretty bad clumping can be seen for the vanilla kernel without
the L2 cluster domain for the 6 and 12 copies cases. With the extra
cluster domain, the load does get evened out between the clusters.

Note this patch isn't a universal win, as spreading isn't necessarily
a win, particularly for those workloads which can benefit from packing.

Signed-off-by: Tim Chen <[email protected]>
Signed-off-by: Barry Song <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Link: https://lore.kernel.org/r/[email protected]
---
arch/x86/Kconfig | 11 ++++++++-
arch/x86/include/asm/smp.h | 7 +++++-
arch/x86/include/asm/topology.h | 3 ++-
arch/x86/kernel/cpu/cacheinfo.c | 1 +-
arch/x86/kernel/cpu/common.c | 3 ++-
arch/x86/kernel/smpboot.c | 44 +++++++++++++++++++++++++++++++-
6 files changed, 68 insertions(+), 1 deletion(-)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index ab83c22..349e59b 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1001,6 +1001,17 @@ config NR_CPUS
This is purely to save memory: each supported CPU adds about 8KB
to the kernel image.

+config SCHED_CLUSTER
+	bool "Cluster scheduler support"
+	depends on SMP
+	default y
+	help
+	  Cluster scheduler support improves the CPU scheduler's decision
+	  making when dealing with machines that have clusters of CPUs.
+	  Cluster usually means a couple of CPUs which are placed closely
+	  by sharing mid-level caches, last-level cache tags or internal
+	  busses.
+
config SCHED_SMT
def_bool y if SMP

diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index 630ff08..08b0e90 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -16,7 +16,9 @@ DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_core_map);
DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_die_map);
/* cpus sharing the last level cache: */
DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map);
+DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_l2c_shared_map);
DECLARE_PER_CPU_READ_MOSTLY(u16, cpu_llc_id);
+DECLARE_PER_CPU_READ_MOSTLY(u16, cpu_l2c_id);
DECLARE_PER_CPU_READ_MOSTLY(int, cpu_number);

static inline struct cpumask *cpu_llc_shared_mask(int cpu)
@@ -24,6 +26,11 @@ static inline struct cpumask *cpu_llc_shared_mask(int cpu)
return per_cpu(cpu_llc_shared_map, cpu);
}

+static inline struct cpumask *cpu_l2c_shared_mask(int cpu)
+{
+	return per_cpu(cpu_l2c_shared_map, cpu);
+}
+
DECLARE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_cpu_to_apicid);
DECLARE_EARLY_PER_CPU_READ_MOSTLY(u32, x86_cpu_to_acpiid);
DECLARE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_bios_cpu_apicid);
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index 9239399..cc16477 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -103,6 +103,7 @@ static inline void setup_node_to_cpumask_map(void) { }
#include <asm-generic/topology.h>

extern const struct cpumask *cpu_coregroup_mask(int cpu);
+extern const struct cpumask *cpu_clustergroup_mask(int cpu);

#define topology_logical_package_id(cpu) (cpu_data(cpu).logical_proc_id)
#define topology_physical_package_id(cpu) (cpu_data(cpu).phys_proc_id)
@@ -113,7 +114,9 @@ extern const struct cpumask *cpu_coregroup_mask(int cpu);
extern unsigned int __max_die_per_package;

#ifdef CONFIG_SMP
+#define topology_cluster_id(cpu) (per_cpu(cpu_l2c_id, cpu))
#define topology_die_cpumask(cpu) (per_cpu(cpu_die_map, cpu))
+#define topology_cluster_cpumask(cpu) (cpu_clustergroup_mask(cpu))
#define topology_core_cpumask(cpu) (per_cpu(cpu_core_map, cpu))
#define topology_sibling_cpumask(cpu) (per_cpu(cpu_sibling_map, cpu))

diff --git a/arch/x86/kernel/cpu/cacheinfo.c b/arch/x86/kernel/cpu/cacheinfo.c
index b5e36bd..fe98a14 100644
--- a/arch/x86/kernel/cpu/cacheinfo.c
+++ b/arch/x86/kernel/cpu/cacheinfo.c
@@ -846,6 +846,7 @@ void init_intel_cacheinfo(struct cpuinfo_x86 *c)
l2 = new_l2;
#ifdef CONFIG_SMP
per_cpu(cpu_llc_id, cpu) = l2_id;
+ per_cpu(cpu_l2c_id, cpu) = l2_id;
#endif
}

diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 0f88859..1c7897c 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -85,6 +85,9 @@ u16 get_llc_id(unsigned int cpu)
}
EXPORT_SYMBOL_GPL(get_llc_id);

+/* L2 cache ID of each logical CPU */
+DEFINE_PER_CPU_READ_MOSTLY(u16, cpu_l2c_id) = BAD_APICID;
+
/* correctly size the local cpu masks */
void __init setup_cpu_local_masks(void)
{
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 85f6e24..5094ab0 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -101,6 +101,8 @@ EXPORT_PER_CPU_SYMBOL(cpu_die_map);

DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map);

+DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_l2c_shared_map);
+
/* Per CPU bogomips and other parameters */
DEFINE_PER_CPU_READ_MOSTLY(struct cpuinfo_x86, cpu_info);
EXPORT_PER_CPU_SYMBOL(cpu_info);
@@ -464,6 +466,21 @@ static bool match_die(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
return false;
}

+static bool match_l2c(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
+{
+	int cpu1 = c->cpu_index, cpu2 = o->cpu_index;
+
+	/* Do not match if we do not have a valid APICID for cpu: */
+	if (per_cpu(cpu_l2c_id, cpu1) == BAD_APICID)
+		return false;
+
+	/* Do not match if L2 cache id does not match: */
+	if (per_cpu(cpu_l2c_id, cpu1) != per_cpu(cpu_l2c_id, cpu2))
+		return false;
+
+	return topology_sane(c, o, "l2c");
+}
+
/*
* Unlike the other levels, we do not enforce keeping a
* multicore group inside a NUMA node. If this happens, we will
@@ -523,7 +540,7 @@ static bool match_llc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
}


-#if defined(CONFIG_SCHED_SMT) || defined(CONFIG_SCHED_MC)
+#if defined(CONFIG_SCHED_SMT) || defined(CONFIG_SCHED_CLUSTER) || defined(CONFIG_SCHED_MC)
static inline int x86_sched_itmt_flags(void)
{
return sysctl_sched_itmt_enabled ? SD_ASYM_PACKING : 0;
@@ -541,12 +558,21 @@ static int x86_smt_flags(void)
return cpu_smt_flags() | x86_sched_itmt_flags();
}
#endif
+#ifdef CONFIG_SCHED_CLUSTER
+static int x86_cluster_flags(void)
+{
+	return cpu_cluster_flags() | x86_sched_itmt_flags();
+}
+#endif
#endif

static struct sched_domain_topology_level x86_numa_in_package_topology[] = {
#ifdef CONFIG_SCHED_SMT
{ cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT) },
#endif
+#ifdef CONFIG_SCHED_CLUSTER
+ { cpu_clustergroup_mask, x86_cluster_flags, SD_INIT_NAME(CLS) },
+#endif
#ifdef CONFIG_SCHED_MC
{ cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC) },
#endif
@@ -557,6 +583,9 @@ static struct sched_domain_topology_level x86_topology[] = {
#ifdef CONFIG_SCHED_SMT
{ cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT) },
#endif
+#ifdef CONFIG_SCHED_CLUSTER
+ { cpu_clustergroup_mask, x86_cluster_flags, SD_INIT_NAME(CLS) },
+#endif
#ifdef CONFIG_SCHED_MC
{ cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC) },
#endif
@@ -584,6 +613,7 @@ void set_cpu_sibling_map(int cpu)
if (!has_mp) {
cpumask_set_cpu(cpu, topology_sibling_cpumask(cpu));
cpumask_set_cpu(cpu, cpu_llc_shared_mask(cpu));
+ cpumask_set_cpu(cpu, cpu_l2c_shared_mask(cpu));
cpumask_set_cpu(cpu, topology_core_cpumask(cpu));
cpumask_set_cpu(cpu, topology_die_cpumask(cpu));
c->booted_cores = 1;
@@ -602,6 +632,9 @@ void set_cpu_sibling_map(int cpu)
if ((i == cpu) || (has_mp && match_llc(c, o)))
link_mask(cpu_llc_shared_mask, cpu, i);

+ if ((i == cpu) || (has_mp && match_l2c(c, o)))
+ link_mask(cpu_l2c_shared_mask, cpu, i);
+
if ((i == cpu) || (has_mp && match_die(c, o)))
link_mask(topology_die_cpumask, cpu, i);
}
@@ -652,6 +685,11 @@ const struct cpumask *cpu_coregroup_mask(int cpu)
return cpu_llc_shared_mask(cpu);
}

+const struct cpumask *cpu_clustergroup_mask(int cpu)
+{
+	return cpu_l2c_shared_mask(cpu);
+}
+
static void impress_friends(void)
{
int cpu;
@@ -1335,6 +1373,7 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL);
zalloc_cpumask_var(&per_cpu(cpu_die_map, i), GFP_KERNEL);
zalloc_cpumask_var(&per_cpu(cpu_llc_shared_map, i), GFP_KERNEL);
+ zalloc_cpumask_var(&per_cpu(cpu_l2c_shared_map, i), GFP_KERNEL);
}

/*
@@ -1564,7 +1603,10 @@ static void remove_siblinginfo(int cpu)

for_each_cpu(sibling, cpu_llc_shared_mask(cpu))
cpumask_clear_cpu(cpu, cpu_llc_shared_mask(sibling));
+ for_each_cpu(sibling, cpu_l2c_shared_mask(cpu))
+ cpumask_clear_cpu(cpu, cpu_l2c_shared_mask(sibling));
cpumask_clear(cpu_llc_shared_mask(cpu));
+ cpumask_clear(cpu_l2c_shared_mask(cpu));
cpumask_clear(topology_sibling_cpumask(cpu));
cpumask_clear(topology_core_cpumask(cpu));
cpumask_clear(topology_die_cpumask(cpu));
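
For reference, the cpu_cluster_flags() helper used in the topology tables
above comes from the generic scheduler headers rather than from this patch.
A hedged sketch of its definition as added elsewhere in the same series
(include/linux/sched/topology.h; treat the exact flag as an assumption,
since the cluster flags have changed across kernel versions):

#ifdef CONFIG_SCHED_CLUSTER
static inline int cpu_cluster_flags(void)
{
	/* CPUs in one cluster share mid-level cache resources */
	return SD_SHARE_PKG_RESOURCES;
}
#endif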


2021-10-20 13:17:39

by Tom Lendacky

Subject: Re: [tip: sched/core] sched: Add cluster scheduler level for x86

On 10/15/21 4:44 AM, tip-bot2 for Tim Chen wrote:
> The following commit has been merged into the sched/core branch of tip:
>
> [ ... ]

I've bisected to this patch which now results in my EPYC systems issuing a
lot of:

[ 4.788480] BUG: arch topology borken
[ 4.789578] the SMT domain not a subset of the CLS domain

messages (one for each CPU in the system).

I haven't had a chance to dig deeper and understand everything, does
anyone have some quick insights/ideas?

Thanks,
Tom

2021-10-20 19:59:55

by Peter Zijlstra

Subject: Re: [tip: sched/core] sched: Add cluster scheduler level for x86

On Wed, Oct 20, 2021 at 08:12:51AM -0500, Tom Lendacky wrote:
> On 10/15/21 4:44 AM, tip-bot2 for Tim Chen wrote:
> > [ ... ]
>
> I've bisected to this patch which now results in my EPYC systems issuing a
> lot of:
>
> [ 4.788480] BUG: arch topology borken
> [ 4.789578] the SMT domain not a subset of the CLS domain
>
> messages (one for each CPU in the system).
>
> I haven't had a chance to dig deeper and understand everything, does anyone
> have some quick insights/ideas?

Urgh, sorry about that. So this stuff uses cpu_l2c_id to build 'clusters',
basically CPUs that share L2, as a subset of the 'multi-core' topology,
which is all CPUs that share LLC (L3 typically).

Your EPYC seems to think the SMT group is not a strict subset of the L2.
The implication is that you have threads with different L2s, which would
frankly be quite 'exotic' if true :-)
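
For context, the "arch topology borken" message comes from the scheduler's
domain builder, which checks that each child domain's span nests inside its
parent's. A hedged paraphrase of that check in kernel/sched/topology.c
(close to, but not a verbatim quote of, the kernel source):

	if (!cpumask_subset(sched_domain_span(child), sched_domain_span(sd))) {
		pr_err("BUG: arch topology borken\n");
		pr_err("     the %s domain not a subset of the %s domain\n",
		       child->name, sd->name);
		/* Fixup: ensure sd spans at least the child's CPUs. */
		cpumask_or(sched_domain_span(sd), sched_domain_span(sd),
			   sched_domain_span(child));
	}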


If it does boot, what does something like:

for i in /sys/devices/system/cpu/cpu*/topology/*{_id,_list}; do echo -n "${i}: " ; cat $i; done

produce?

I'll try and figure out how AMD sets l2c_id, that stuff is always a bit
of a maze.

2021-10-20 20:12:43

by Tom Lendacky

Subject: Re: [tip: sched/core] sched: Add cluster scheduler level for x86

On 10/20/21 2:51 PM, Peter Zijlstra wrote:
> On Wed, Oct 20, 2021 at 08:12:51AM -0500, Tom Lendacky wrote:
>> On 10/15/21 4:44 AM, tip-bot2 for Tim Chen wrote:
>>> The following commit has been merged into the sched/core branch of tip:

>
> If it does boot, what does something like:
>
> for i in /sys/devices/system/cpu/cpu*/topology/*{_id,_list}; do echo -n "${i}: " ; cat $i; done
>
> produce?

The output is about 160K in size, I'll email it to you off-list.

>
> I'll try and figure out how AMD sets l2c_id, that stuff is always a bit
> of a maze.

Thanks!

Tom

>

2021-10-20 20:31:07

by Peter Zijlstra

Subject: Re: [tip: sched/core] sched: Add cluster scheduler level for x86

On Wed, Oct 20, 2021 at 03:08:41PM -0500, Tom Lendacky wrote:
> On 10/20/21 2:51 PM, Peter Zijlstra wrote:
> > On Wed, Oct 20, 2021 at 08:12:51AM -0500, Tom Lendacky wrote:
> > > On 10/15/21 4:44 AM, tip-bot2 for Tim Chen wrote:
> > > > The following commit has been merged into the sched/core branch of tip:
>
> >
> > If it does boot, what does something like:
> >
> > for i in /sys/devices/system/cpu/cpu*/topology/*{_id,_list}; do echo -n "${i}: " ; cat $i; done
> >
> > produce?
>
> The output is about 160K in size, I'll email it to you off-list.

/sys/devices/system/cpu/cpu0/topology/cluster_cpus_list: 0
/sys/devices/system/cpu/cpu0/topology/core_cpus_list: 0,128

/sys/devices/system/cpu/cpu128/topology/cluster_cpus_list: 128
/sys/devices/system/cpu/cpu128/topology/core_cpus_list: 0,128

So for some reason that thing thinks each SMT thread has its own L2,
which seems rather unlikely. Or SMT has started to mean something
radically different than it used to be :-)

Let me continue trying to make sense of cacheinfo.c

2021-10-20 20:38:26

by Peter Zijlstra

Subject: Re: [tip: sched/core] sched: Add cluster scheduler level for x86

On Wed, Oct 20, 2021 at 10:25:42PM +0200, Peter Zijlstra wrote:
> On Wed, Oct 20, 2021 at 03:08:41PM -0500, Tom Lendacky wrote:
> > On 10/20/21 2:51 PM, Peter Zijlstra wrote:
> > > On Wed, Oct 20, 2021 at 08:12:51AM -0500, Tom Lendacky wrote:
> > > > On 10/15/21 4:44 AM, tip-bot2 for Tim Chen wrote:
> > > > > The following commit has been merged into the sched/core branch of tip:
> >
> > >
> > > If it does boot, what does something like:
> > >
> > > for i in /sys/devices/system/cpu/cpu*/topology/*{_id,_list}; do echo -n "${i}: " ; cat $i; done
> > >
> > > produce?
> >
> > The output is about 160K in size, I'll email it to you off-list.
>
> /sys/devices/system/cpu/cpu0/topology/cluster_cpus_list: 0
> /sys/devices/system/cpu/cpu0/topology/core_cpus_list: 0,128
>
> /sys/devices/system/cpu/cpu128/topology/cluster_cpus_list: 128
> /sys/devices/system/cpu/cpu128/topology/core_cpus_list: 0,128
>
> So for some reason that thing thinks each SMT thread has its own L2,
> which seems rather unlikely. Or SMT has started to mean something
> radically different than it used to be :-)
>
> Let me continue trying to make sense of cacheinfo.c

OK, I think I see what's happening.

AFAICT cacheinfo.c does *NOT* set l2c_id on AMD/Hygon hardware, this
means it's set to BAD_APICID.

This then results in match_l2c() never matching. And as a direct
consequence set_cpu_sibling_map() will generate cpu_l2c_shared_mask with
just the one CPU set.

And we have the above result and things come unstuck if we assume:
SMT <= L2 <= LLC
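
Expressed as code, the nesting the domain builder expects looks roughly
like the following; a minimal sketch using the masks from the patch above,
not actual kernel code. With cpu_l2c_shared_mask() collapsed to a single
CPU, the first subset test fails for every SMT sibling pair:

static bool topology_nests(int cpu)
{
	return cpumask_subset(topology_sibling_cpumask(cpu),	/* SMT */
			      cpu_clustergroup_mask(cpu)) &&	/* CLS */
	       cpumask_subset(cpu_clustergroup_mask(cpu),	/* CLS */
			      cpu_coregroup_mask(cpu));		/* MC  */
}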

Now, the big question, how to fix this... Does AMD have means of
actually setting l2c_id or should we fall back to using match_smt() for
l2c_id == BAD_APICID ?

2021-10-20 20:42:15

by Peter Zijlstra

Subject: Re: [tip: sched/core] sched: Add cluster scheduler level for x86

On Wed, Oct 20, 2021 at 10:36:19PM +0200, Peter Zijlstra wrote:

> OK, I think I see what's happening.
>
> AFAICT cacheinfo.c does *NOT* set l2c_id on AMD/Hygon hardware, this
> means it's set to BAD_APICID.
>
> This then results in match_l2c() to never match. And as a direct
> consequence set_cpu_sibling_map() will generate cpu_l2c_shared_mask with
> just the one CPU set.
>
> And we have the above result and things come unstuck if we assume:
> SMT <= L2 <= LLC
>
> Now, the big question, how to fix this... Does AMD have means of
> actually setting l2c_id or should we fall back to using match_smt() for
> l2c_id == BAD_APICID ?

The latter looks something like the below and ought to make EPYC at
least function as it did before.


---
arch/x86/kernel/smpboot.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 849159797101..c2671b2333d1 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -472,7 +472,7 @@ static bool match_l2c(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
 
 	/* Do not match if we do not have a valid APICID for cpu: */
 	if (per_cpu(cpu_l2c_id, cpu1) == BAD_APICID)
-		return false;
+		return match_smt(c, o); /* assume at least SMT shares L2 */
 
 	/* Do not match if L2 cache id does not match: */
 	if (per_cpu(cpu_l2c_id, cpu1) != per_cpu(cpu_l2c_id, cpu2))

2021-10-20 20:42:48

by Tom Lendacky

Subject: Re: [tip: sched/core] sched: Add cluster scheduler level for x86

On 10/20/21 3:36 PM, Peter Zijlstra wrote:
> On Wed, Oct 20, 2021 at 10:25:42PM +0200, Peter Zijlstra wrote:
>> [ ... ]
>
> OK, I think I see what's happening.
>
> AFAICT cacheinfo.c does *NOT* set l2c_id on AMD/Hygon hardware, this
> means it's set to BAD_APICID.
>
> This then results in match_l2c() never matching. And as a direct
> consequence set_cpu_sibling_map() will generate cpu_l2c_shared_mask with
> just the one CPU set.
>
> And we have the above result and things come unstuck if we assume:
> SMT <= L2 <= LLC
>
> Now, the big question, how to fix this... Does AMD have means of
> actually setting l2c_id or should we fall back to using match_smt() for
> l2c_id == BAD_APICID ?

Let me include Suravee, who has more experience with our topology and
cache information.

Thanks,
Tom

>

2021-10-20 20:54:22

by Tom Lendacky

Subject: Re: [tip: sched/core] sched: Add cluster scheduler level for x86

On 10/20/21 3:40 PM, Peter Zijlstra wrote:
> On Wed, Oct 20, 2021 at 10:36:19PM +0200, Peter Zijlstra wrote:
>
>> [ ... ]
>
> The latter looks something like the below and ought to make EPYC at
> least function as it did before.
>
>
> ---
> arch/x86/kernel/smpboot.c | 2 +-
> 1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
> index 849159797101..c2671b2333d1 100644
> --- a/arch/x86/kernel/smpboot.c
> +++ b/arch/x86/kernel/smpboot.c
> @@ -472,7 +472,7 @@ static bool match_l2c(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
>
> /* Do not match if we do not have a valid APICID for cpu: */
> if (per_cpu(cpu_l2c_id, cpu1) == BAD_APICID)
> - return false;
> + return match_smt(c, o); /* assume at least SMT shares L2 */
>
> /* Do not match if L2 cache id does not match: */
> if (per_cpu(cpu_l2c_id, cpu1) != per_cpu(cpu_l2c_id, cpu2))
>

Adding Suravee.

2021-10-21 10:35:36

by Barry Song

[permalink] [raw]
Subject: Re: [tip: sched/core] sched: Add cluster scheduler level for x86

On Thu, Oct 21, 2021 at 9:43 AM Peter Zijlstra <[email protected]> wrote:
>
> On Wed, Oct 20, 2021 at 10:36:19PM +0200, Peter Zijlstra wrote:
>
> > [ ... ]
>
> The latter looks something like the below and ought to make EPYC at
> least function as it did before.
>
>
> ---
> arch/x86/kernel/smpboot.c | 2 +-
> 1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
> index 849159797101..c2671b2333d1 100644
> --- a/arch/x86/kernel/smpboot.c
> +++ b/arch/x86/kernel/smpboot.c
> @@ -472,7 +472,7 @@ static bool match_l2c(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
>
> /* Do not match if we do not have a valid APICID for cpu: */
> if (per_cpu(cpu_l2c_id, cpu1) == BAD_APICID)
> - return false;
> + return match_smt(c, o); /* assume at least SMT shares L2 */

Rather than making fake cluster_cpus and cluster_cpus_list entries which
will be exposed to userspace through /sys/devices/system/cpu/cpuX/topology,
could we just fix the sched_domain mask with the below? It would be odd to
users that a CPU has a BAD cluster_id but "meaningful" cluster_cpus and
cluster_cpus_list in sysfs.

diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 5094ab0bae58..0f9d706a7507 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -687,6 +687,15 @@ const struct cpumask *cpu_coregroup_mask(int cpu)
 
 const struct cpumask *cpu_clustergroup_mask(int cpu)
 {
+	/*
+	 * If L2 (the cluster) is not represented, make the cluster
+	 * sched_domain the same as the SMT domain, so that this redundant
+	 * sched_domain can be dropped and we avoid the complaint "the SMT
+	 * domain not a subset of the cluster domain".
+	 */
+	if (cpumask_subset(cpu_l2c_shared_mask(cpu), cpu_smt_mask(cpu)))
+		return cpu_smt_mask(cpu);
+
 	return cpu_l2c_shared_mask(cpu);
 }



>
> /* Do not match if L2 cache id does not match: */
> if (per_cpu(cpu_l2c_id, cpu1) != per_cpu(cpu_l2c_id, cpu2))

Thanks
Barry

2021-10-21 10:57:24

by Barry Song

Subject: Re: [tip: sched/core] sched: Add cluster scheduler level for x86

On Thu, Oct 21, 2021 at 11:32 PM Barry Song <[email protected]> wrote:
>
> On Thu, Oct 21, 2021 at 9:43 AM Peter Zijlstra <[email protected]> wrote:
> >
> > [ ... ]
>
> Rather than making fake cluster_cpus and cluster_cpus_list entries which
> will be exposed to userspace through /sys/devices/system/cpu/cpuX/topology,
> could we just fix the sched_domain mask with the below? It would be odd to
> users that a CPU has a BAD cluster_id but "meaningful" cluster_cpus and
> cluster_cpus_list in sysfs.
>
> diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
> index 5094ab0bae58..0f9d706a7507 100644
> --- a/arch/x86/kernel/smpboot.c
> +++ b/arch/x86/kernel/smpboot.c
> @@ -687,6 +687,15 @@ const struct cpumask *cpu_coregroup_mask(int cpu)
>
>  const struct cpumask *cpu_clustergroup_mask(int cpu)
>  {
> +	/*
> +	 * If L2 (the cluster) is not represented, make the cluster
> +	 * sched_domain the same as the SMT domain, so that this redundant
> +	 * sched_domain can be dropped and we avoid the complaint "the SMT
> +	 * domain not a subset of the cluster domain".
> +	 */
> +	if (cpumask_subset(cpu_l2c_shared_mask(cpu), cpu_smt_mask(cpu)))
> +		return cpu_smt_mask(cpu);
> +
>  	return cpu_l2c_shared_mask(cpu);
>  }

We need to change this line as well; otherwise, sysfs will still use
cpu_clustergroup_mask():
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index cc164777e661..c8cd4e9732bd 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -116,7 +116,7 @@ extern unsigned int __max_die_per_package;
 
 #ifdef CONFIG_SMP
 #define topology_cluster_id(cpu)		(per_cpu(cpu_l2c_id, cpu))
 #define topology_die_cpumask(cpu)		(per_cpu(cpu_die_map, cpu))
-#define topology_cluster_cpumask(cpu)		(cpu_clustergroup_mask(cpu))
+#define topology_cluster_cpumask(cpu)		(cpu_l2c_shared_mask(cpu))
 #define topology_core_cpumask(cpu)		(per_cpu(cpu_core_map, cpu))
 #define topology_sibling_cpumask(cpu)		(per_cpu(cpu_sibling_map, cpu))

so sysfs will use topology_cluster_cpumask(), which describes the hardware
as it is, while cpu_clustergroup_mask() describes what is given to the
scheduler.
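
With that split, userspace still sees the raw hardware view through sysfs.
A minimal sketch of reading it back, assuming the cluster_cpus_list layout
quoted earlier in the thread:

#include <stdio.h>

/* Hedged sketch: print the raw cluster siblings sysfs reports for cpu0. */
int main(void)
{
	char buf[256];
	FILE *f = fopen("/sys/devices/system/cpu/cpu0/topology/cluster_cpus_list", "r");

	if (!f) {
		perror("fopen");
		return 1;
	}
	if (fgets(buf, sizeof(buf), f))
		printf("cpu0 cluster siblings: %s", buf);
	fclose(f);
	return 0;
}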

>
>
>
> >
> > /* Do not match if L2 cache id does not match: */
> > if (per_cpu(cpu_l2c_id, cpu1) != per_cpu(cpu_l2c_id, cpu2))
>
> Thanks
> Barry

Thanks
barry

2021-10-21 13:28:41

by Peter Zijlstra

Subject: Re: [tip: sched/core] sched: Add cluster scheduler level for x86

On Thu, Oct 21, 2021 at 11:32:36PM +1300, Barry Song wrote:
> On Thu, Oct 21, 2021 at 9:43 AM Peter Zijlstra <[email protected]> wrote:
> >
> > [ ... ]
>
> Rather than making fake cluster_cpus and cluster_cpus_list entries which
> will be exposed to userspace through /sys/devices/system/cpu/cpuX/topology,
> could we just fix the sched_domain mask with the below?

I don't think it's fake; SMT fundamentally has to share all cache
levels. And having the sched domains differ in setup from the reported
(nonsensical) topology also isn't appealing.

by Song Bao Hua (Barry Song)

Subject: RE: [tip: sched/core] sched: Add cluster scheduler level for x86



> -----Original Message-----
> From: Peter Zijlstra [mailto:[email protected]]
> Sent: Friday, October 22, 2021 2:23 AM
> To: Barry Song <[email protected]>
> Cc: Tom Lendacky <[email protected]>; LKML
> <[email protected]>; [email protected]; Tim Chen
> <[email protected]>; Song Bao Hua (Barry Song)
> <[email protected]>; x86 <[email protected]>
> Subject: Re: [tip: sched/core] sched: Add cluster scheduler level for x86
>
> On Thu, Oct 21, 2021 at 11:32:36PM +1300, Barry Song wrote:
> > On Thu, Oct 21, 2021 at 9:43 AM Peter Zijlstra <[email protected]> wrote:
> > >
> > > [ ... ]
> >
> > Rather than making fake cluster_cpus and cluster_cpus_list entries which
> > will be exposed to userspace through /sys/devices/system/cpu/cpuX/topology,
> > could we just fix the sched_domain mask with the below?
>
> I don't think it's fake; SMT fundamentally has to share all cache
> levels. And having the sched domains differ in setup from the reported
> (nonsensical) topology also isn't appealing.

Fair enough. I was actually inspired by cpu_coregroup_mask(), which is a
combination of a couple of cpumask sets, in drivers/base/arch_topology.c:

const struct cpumask *cpu_coregroup_mask(int cpu)
{
	const cpumask_t *core_mask = cpumask_of_node(cpu_to_node(cpu));

	/* Find the smaller of NUMA, core or LLC siblings */
	if (cpumask_subset(&cpu_topology[cpu].core_sibling, core_mask)) {
		/* not numa in package, lets use the package siblings */
		core_mask = &cpu_topology[cpu].core_sibling;
	}
	if (cpu_topology[cpu].llc_id != -1) {
		if (cpumask_subset(&cpu_topology[cpu].llc_sibling, core_mask))
			core_mask = &cpu_topology[cpu].llc_sibling;
	}

	return core_mask;
}

Thanks
Barry

2021-10-22 13:36:36

by Tom Lendacky

Subject: Re: [tip: sched/core] sched: Add cluster scheduler level for x86

On 10/20/21 3:40 PM, Peter Zijlstra wrote:
> On Wed, Oct 20, 2021 at 10:36:19PM +0200, Peter Zijlstra wrote:
>
>> [ ... ]
>
> The latter looks something like the below and ought to make EPYC at
> least function as it did before.
>
>
> ---
> arch/x86/kernel/smpboot.c | 2 +-
> 1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
> index 849159797101..c2671b2333d1 100644
> --- a/arch/x86/kernel/smpboot.c
> +++ b/arch/x86/kernel/smpboot.c
> @@ -472,7 +472,7 @@ static bool match_l2c(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
>
> /* Do not match if we do not have a valid APICID for cpu: */
> if (per_cpu(cpu_l2c_id, cpu1) == BAD_APICID)
> - return false;
> + return match_smt(c, o); /* assume at least SMT shares L2 */

This does eliminate the message and seems like an appropriate thing to do,
in general, if the l2c id is not set.

We're looking into setting the l2c id for AMD platforms, but need to test
against some older platforms. We'll let you know the results next week.

In the mean time, it is probably best to at least apply your above patch.

Thanks,
Tom

>
> /* Do not match if L2 cache id does not match: */
> if (per_cpu(cpu_l2c_id, cpu1) != per_cpu(cpu_l2c_id, cpu2))
>

2021-10-22 13:39:17

by Peter Zijlstra

Subject: Re: [tip: sched/core] sched: Add cluster scheduler level for x86

On Fri, Oct 22, 2021 at 08:31:40AM -0500, Tom Lendacky wrote:
> On 10/20/21 3:40 PM, Peter Zijlstra wrote:
> > arch/x86/kernel/smpboot.c | 2 +-
> > 1 file changed, 1 insertion(+), 1 deletion(-)
> >
> > diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
> > index 849159797101..c2671b2333d1 100644
> > --- a/arch/x86/kernel/smpboot.c
> > +++ b/arch/x86/kernel/smpboot.c
> > @@ -472,7 +472,7 @@ static bool match_l2c(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
> > /* Do not match if we do not have a valid APICID for cpu: */
> > if (per_cpu(cpu_l2c_id, cpu1) == BAD_APICID)
> > - return false;
> > + return match_smt(c, o); /* assume at least SMT shares L2 */
>
> This does eliminate the message and seems like an appropriate thing to do,
> in general, if the l2c id is not set.
>
> We're looking into setting the l2c id for AMD platforms, but need to test
> against some older platforms. We'll let you know the results next week.
>
> In the mean time, it is probably best to at least apply your above patch.

OK, I'll go write up a Changelog and stick on your Reported- and
Tested-by tags.

Thanks!