Cluster scheduling domain was introduced in 5.16 to help even out load
between the clusters. In a last level cache, there can be multiple
clusters, with each cluster having its own resources and multiple CPUs
in it. With cluster scheduling, contention on cluster resource (e.g. L2
cache) can be reduced for better performance.
These patches made cluster scheduling configurable at run time and
boot time. When system is moderately loaded, it is worthwhile to do the
extra load balancing to balance out load between the clusters to reduce
contention on cluster resources (e.g. L2 cache). If the system is
fully utilized, load balancing among clusters is unlikely to help
reduce contention on the resources of a cluster, as the cluster
is fully busy.
On a Jacobsville system with 24 Atom cores, where 4 Atom cores per cluster
share an L2, we ran the mcf benchmark from very low load of 1 benchmark
copy to 24 benchmark copies on the 24 CPUs system. We see that
throughput is boosted for medium load but there is little improvement
from cluster scheduling when the system is fully loaded.
Improvement over baseline kernel for mcf_r
copies run time base rate
1 -0.1% -0.2%
6 25.1% 25.1%
12 18.8% 19.0%
24 0.3% 0.3%
If the system is expected to operate close to full utilization, the sys
admin could choose to turn off the cluster feature to reduce scheduler
overhead from load balancing at the cluster level.
Cluster scheduling is disabled by default for x86 hybrid CPUs in the
last patch of this series. For such an asymmetric system, the system
should rely strictly on CPU priority to determine the order
of task scheduling.
Tim Chen (5):
scheduler: Create SDTL_SKIP flag to skip topology level
scheduler: Add SD_CLUSTER topology flag to cluster sched domain
scheduler: Add runtime knob sysctl_sched_cluster
scheduler: Add boot time enabling/disabling of cluster scheduling
scheduler: Default cluster scheduling to off on x86 hybrid CPU
.../admin-guide/kernel-parameters.txt | 4 +
arch/x86/kernel/smpboot.c | 26 +++++++
drivers/base/arch_topology.c | 23 +++++-
include/linux/sched/sd_flags.h | 7 ++
include/linux/sched/sysctl.h | 6 ++
include/linux/sched/topology.h | 3 +-
include/linux/topology.h | 7 ++
kernel/sched/core.c | 1 +
kernel/sched/sched.h | 6 ++
kernel/sched/topology.c | 75 ++++++++++++++++++-
kernel/sysctl.c | 11 +++
11 files changed, 163 insertions(+), 6 deletions(-)
--
2.20.1
A system admin may not want to use cluster scheduling. Make changes to
allow cluster topology level to be skipped when building sched domains.
Create SDTL_SKIP bit on the sched_domain_topology_level flag so we can
check if the cluster topology level should be skipped when building
sched domains.
Signed-off-by: Tim Chen <[email protected]>
---
include/linux/sched/topology.h | 1 +
kernel/sched/topology.c | 10 +++++++++-
2 files changed, 10 insertions(+), 1 deletion(-)
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index c07bfa2d80f2..a6ad1e02e57a 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -182,6 +182,7 @@ typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
typedef int (*sched_domain_flags_f)(void);
#define SDTL_OVERLAP 0x01
+#define SDTL_SKIP 0x02
struct sd_data {
struct sched_domain *__percpu *sd;
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index d201a7052a29..ee6b14ad3324 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -1652,8 +1652,16 @@ static struct sched_domain_topology_level default_topology[] = {
static struct sched_domain_topology_level *sched_domain_topology =
default_topology;
+static struct sched_domain_topology_level *next_tl(struct sched_domain_topology_level *tl)
+{
+ ++tl;
+ while (tl->mask && tl->flags & SDTL_SKIP)
+ ++tl;
+ return tl;
+}
+
#define for_each_sd_topology(tl) \
- for (tl = sched_domain_topology; tl->mask; tl++)
+ for (tl = sched_domain_topology; tl->mask; tl = next_tl(tl))
void set_sched_topology(struct sched_domain_topology_level *tl)
{
--
2.20.1
Allow run time configuration of the scheduler to use cluster
scheduling. Configuration can be changed via the sysctl variable
/proc/sys/kernel/sched_cluster. Setting it to 1 enables cluster
scheduling and setting it to 0 turns it off.
Cluster scheduling should benefit independent tasks by load balancing
them between clusters. It reaps the most benefit when the system's CPUs
are not fully busy, so we can spread the tasks out between the clusters to
reduce contention on cluster resource (e.g. L2 cache).
However, if the system is expected to operate close to full utilization,
the system admin could turn this feature off so as not to incur
extra load balancing overhead between the cluster domains.
Signed-off-by: Tim Chen <[email protected]>
---
arch/x86/kernel/smpboot.c | 8 +++++++
drivers/base/arch_topology.c | 13 +++++++----
include/linux/sched/sysctl.h | 6 +++++
include/linux/topology.h | 1 +
kernel/sched/core.c | 1 +
kernel/sched/sched.h | 6 +++++
kernel/sched/topology.c | 44 ++++++++++++++++++++++++++++++++++++
kernel/sysctl.c | 11 +++++++++
8 files changed, 86 insertions(+), 4 deletions(-)
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index ac2909f0cab3..bab5251f8e03 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -57,6 +57,7 @@
#include <linux/pgtable.h>
#include <linux/overflow.h>
#include <linux/syscore_ops.h>
+#include <linux/cpuset.h>
#include <asm/acpi.h>
#include <asm/desc.h>
@@ -127,6 +128,13 @@ int arch_update_cpu_topology(void)
return retval;
}
+void arch_rebuild_cpu_topology(void)
+{
+ x86_topology_update = true;
+ rebuild_sched_domains();
+ x86_topology_update = false;
+}
+
static inline void smpboot_setup_warm_reset_vector(unsigned long start_eip)
{
unsigned long flags;
diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c
index ff16a36a908b..bb129929410b 100644
--- a/drivers/base/arch_topology.c
+++ b/drivers/base/arch_topology.c
@@ -205,16 +205,21 @@ int topology_update_cpu_topology(void)
return update_topology;
}
+void __weak arch_rebuild_cpu_topology(void)
+{
+ update_topology = 1;
+ rebuild_sched_domains();
+ pr_debug("sched_domain hierarchy rebuilt, flags updated\n");
+ update_topology = 0;
+}
+
/*
* Updating the sched_domains can't be done directly from cpufreq callbacks
* due to locking, so queue the work for later.
*/
static void update_topology_flags_workfn(struct work_struct *work)
{
- update_topology = 1;
- rebuild_sched_domains();
- pr_debug("sched_domain hierarchy rebuilt, flags updated\n");
- update_topology = 0;
+ arch_rebuild_cpu_topology();
}
static DEFINE_PER_CPU(u32, freq_factor) = 1;
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index 304f431178fd..bd1c29e8be50 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -75,6 +75,12 @@ int sysctl_numa_balancing(struct ctl_table *table, int write, void *buffer,
int sysctl_schedstats(struct ctl_table *table, int write, void *buffer,
size_t *lenp, loff_t *ppos);
+#ifdef CONFIG_SCHED_CLUSTER
+extern unsigned int sysctl_sched_cluster;
+int sched_cluster_handler(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos);
+#endif
+
#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
extern unsigned int sysctl_sched_energy_aware;
int sched_energy_aware_handler(struct ctl_table *table, int write,
diff --git a/include/linux/topology.h b/include/linux/topology.h
index 0b3704ad13c8..42bcfd5d9fdb 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -44,6 +44,7 @@
if (nr_cpus_node(node))
int arch_update_cpu_topology(void);
+void arch_rebuild_cpu_topology(void);
/* Conform to ACPI 2.0 SLIT distance definitions */
#define LOCAL_DISTANCE 10
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 76f9deeaa942..42b5890a9873 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -9201,6 +9201,7 @@ int sched_cpu_dying(unsigned int cpu)
void __init sched_init_smp(void)
{
sched_init_numa();
+ set_sched_cluster();
/*
* There's no userspace yet to cause hotplug operations; hence all the
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 0e66749486e7..867ec74d9de0 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1651,6 +1651,12 @@ this_rq_lock_irq(struct rq_flags *rf)
return rq;
}
+#ifdef CONFIG_SCHED_CLUSTER
+extern void set_sched_cluster(void);
+#else
+static inline void set_sched_cluster(void) { }
+#endif
+
#ifdef CONFIG_NUMA
enum numa_topology_type {
NUMA_DIRECT,
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 0c11531a64a0..e362bba29f95 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -1655,6 +1655,50 @@ static struct sched_domain_topology_level default_topology[] = {
static struct sched_domain_topology_level *sched_domain_topology =
default_topology;
+#ifdef CONFIG_SCHED_CLUSTER
+void set_sched_cluster(void)
+{
+ struct sched_domain_topology_level *tl;
+
+ for (tl = sched_domain_topology; tl->mask; tl++) {
+ if (tl->sd_flags && (tl->sd_flags() & SD_CLUSTER)) {
+ if (!sysctl_sched_cluster)
+ tl->flags |= SDTL_SKIP;
+ else
+ tl->flags &= ~SDTL_SKIP;
+ break;
+ }
+ }
+}
+
+/* set via /proc/sys/kernel/sched_cluster */
+unsigned int __read_mostly sysctl_sched_cluster = 1;
+
+static DEFINE_MUTEX(sched_cluster_mutex);
+int sched_cluster_handler(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ int ret;
+ unsigned int oldval;
+
+ if (write && !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ mutex_lock(&sched_cluster_mutex);
+ oldval = sysctl_sched_cluster;
+ ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+ if (!ret && write) {
+ if (oldval != sysctl_sched_cluster) {
+ set_sched_cluster();
+ arch_rebuild_cpu_topology();
+ }
+ }
+ mutex_unlock(&sched_cluster_mutex);
+
+ return ret;
+}
+#endif
+
static struct sched_domain_topology_level *next_tl(struct sched_domain_topology_level *tl)
{
++tl;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 083be6af29d7..149ddfafaacc 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1901,6 +1901,17 @@ static struct ctl_table kern_table[] = {
.extra2 = SYSCTL_ONE,
},
#endif
+#ifdef CONFIG_SCHED_CLUSTER
+ {
+ .procname = "sched_cluster",
+ .data = &sysctl_sched_cluster,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = sched_cluster_handler,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+#endif
#ifdef CONFIG_PROVE_LOCKING
{
.procname = "prove_locking",
--
2.20.1
Add SD_CLUSTER to flag cluster sched domain topology.
Scheduler needs to know if a topology level is at cluster level. It can
then decide if cluster sched domain should be built based on boot or run
time configuration specified in next patch.
Signed-off-by: Tim Chen <[email protected]>
---
include/linux/sched/sd_flags.h | 7 +++++++
include/linux/sched/topology.h | 2 +-
kernel/sched/topology.c | 3 +++
3 files changed, 11 insertions(+), 1 deletion(-)
diff --git a/include/linux/sched/sd_flags.h b/include/linux/sched/sd_flags.h
index 57bde66d95f7..2321f1b4cee6 100644
--- a/include/linux/sched/sd_flags.h
+++ b/include/linux/sched/sd_flags.h
@@ -164,3 +164,10 @@ SD_FLAG(SD_OVERLAP, SDF_SHARED_PARENT | SDF_NEEDS_GROUPS)
* NEEDS_GROUPS: No point in preserving domain if it has a single group.
*/
SD_FLAG(SD_NUMA, SDF_SHARED_PARENT | SDF_NEEDS_GROUPS)
+
+/*
+ * Domain members in the same CPU cluster
+ *
+ * NEEDS_GROUPS: Cluster resources are shared between groups.
+ */
+SD_FLAG(SD_CLUSTER, SDF_NEEDS_GROUPS)
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index a6ad1e02e57a..d71e75f03619 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -45,7 +45,7 @@ static inline int cpu_smt_flags(void)
#ifdef CONFIG_SCHED_CLUSTER
static inline int cpu_cluster_flags(void)
{
- return SD_SHARE_PKG_RESOURCES;
+ return SD_CLUSTER | SD_SHARE_PKG_RESOURCES;
}
#endif
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index ee6b14ad3324..0c11531a64a0 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -1504,6 +1504,8 @@ static unsigned long __read_mostly *sched_numa_onlined_nodes;
* function:
*
* SD_SHARE_CPUCAPACITY - describes SMT topologies
+ * SD_CLUSTER - describes multiple CPU clusters in a
+ * last level cache
* SD_SHARE_PKG_RESOURCES - describes shared caches
* SD_NUMA - describes NUMA topologies
*
@@ -1514,6 +1516,7 @@ static unsigned long __read_mostly *sched_numa_onlined_nodes;
*/
#define TOPOLOGY_SD_FLAGS \
(SD_SHARE_CPUCAPACITY | \
+ SD_CLUSTER | \
SD_SHARE_PKG_RESOURCES | \
SD_NUMA | \
SD_ASYM_PACKING)
--
2.20.1
Add boot time parameter sched_cluster to enable or disable cluster
scheduling. Set boot parameter as follow:
sched_cluster=0 disables cluster scheduling
sched_cluster=1 enables cluster scheduling
Signed-off-by: Tim Chen <[email protected]>
---
Documentation/admin-guide/kernel-parameters.txt | 4 ++++
kernel/sched/topology.c | 16 ++++++++++++++++
2 files changed, 20 insertions(+)
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 9725c546a0d4..40ad997430e1 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -5023,6 +5023,10 @@
sched_verbose [KNL] Enables verbose scheduler debug messages.
+ sched_cluster= Enable or disable cluster scheduling.
+ 0 -- disable.
+ 1 -- enable.
+
schedstats= [KNL,X86] Enable or disable scheduled statistics.
Allowed values are enable and disable. This feature
incurs a small amount of overhead in the scheduler
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index e362bba29f95..087854d505f7 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -1697,6 +1697,22 @@ int sched_cluster_handler(struct ctl_table *table, int write,
return ret;
}
+
+static int __init sched_cluster_option(char *str)
+{
+ int enable;
+
+ if (get_option(&str, &enable)) {
+ if (enable != 0 && enable != 1)
+ return -EINVAL;
+
+ sysctl_sched_cluster = enable;
+ return 0;
+ }
+
+ return -EINVAL;
+}
+early_param("sched_cluster", sched_cluster_option);
#endif
static struct sched_domain_topology_level *next_tl(struct sched_domain_topology_level *tl)
--
2.20.1
For x86 hybrid CPUs like Alder Lake, the order of CPU selection should
be based strictly on CPU priority. Turn off cluster scheduling
to avoid interference with such CPU selection order.
Signed-off-by: Tim Chen <[email protected]>
---
arch/x86/kernel/smpboot.c | 18 ++++++++++++++++++
drivers/base/arch_topology.c | 10 ++++++++++
include/linux/topology.h | 6 ++++++
kernel/sched/topology.c | 4 +++-
4 files changed, 37 insertions(+), 1 deletion(-)
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index bab5251f8e03..4aa1c859dcea 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -46,6 +46,7 @@
#include <linux/sched/topology.h>
#include <linux/sched/hotplug.h>
#include <linux/sched/task_stack.h>
+#include <linux/sched/sysctl.h>
#include <linux/percpu.h>
#include <linux/memblock.h>
#include <linux/err.h>
@@ -135,6 +136,23 @@ void arch_rebuild_cpu_topology(void)
x86_topology_update = false;
}
+#ifdef CONFIG_SCHED_CLUSTER
+void arch_set_def_cluster_topology(void)
+{
+ /*
+ * For hybrid CPUs, scheduling order between the CPUs should be
+ * based strictly on CPU priority. Turn off cluster scheduling
+ * for hybrid CPUs.
+ */
+ if (sysctl_sched_cluster > 1) {
+ if (cpu_feature_enabled(X86_FEATURE_HYBRID_CPU))
+ sysctl_sched_cluster = 0;
+ else
+ sysctl_sched_cluster = 1;
+ }
+}
+#endif
+
static inline void smpboot_setup_warm_reset_vector(unsigned long start_eip)
{
unsigned long flags;
diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c
index bb129929410b..a3668cc4b727 100644
--- a/drivers/base/arch_topology.c
+++ b/drivers/base/arch_topology.c
@@ -18,6 +18,7 @@
#include <linux/init.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
+#include <linux/sched/sysctl.h>
static DEFINE_PER_CPU(struct scale_freq_data __rcu *, sft_data);
static struct cpumask scale_freq_counters_mask;
@@ -213,6 +214,15 @@ void __weak arch_rebuild_cpu_topology(void)
update_topology = 0;
}
+#ifdef CONFIG_SCHED_CLUSTER
+void __weak arch_set_def_cluster_topology(void)
+{
+ /* Use cluster topology by default unless disabled in boot option */
+ if (sysctl_sched_cluster > 1)
+ sysctl_sched_cluster = 1;
+}
+#endif
+
/*
* Updating the sched_domains can't be done directly from cpufreq callbacks
* due to locking, so queue the work for later.
diff --git a/include/linux/topology.h b/include/linux/topology.h
index 42bcfd5d9fdb..87d893c05f0c 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -46,6 +46,12 @@
int arch_update_cpu_topology(void);
void arch_rebuild_cpu_topology(void);
+#ifdef CONFIG_SCHED_CLUSTER
+void arch_set_def_cluster_topology(void);
+#else
+static inline void arch_set_def_cluster_topology(void) { }
+#endif
+
/* Conform to ACPI 2.0 SLIT distance definitions */
#define LOCAL_DISTANCE 10
#define REMOTE_DISTANCE 20
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 087854d505f7..07dfc4666c09 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -1660,6 +1660,8 @@ void set_sched_cluster(void)
{
struct sched_domain_topology_level *tl;
+ arch_set_def_cluster_topology();
+
for (tl = sched_domain_topology; tl->mask; tl++) {
if (tl->sd_flags && (tl->sd_flags() & SD_CLUSTER)) {
if (!sysctl_sched_cluster)
@@ -1672,7 +1674,7 @@ void set_sched_cluster(void)
}
/* set via /proc/sys/kernel/sched_cluster */
-unsigned int __read_mostly sysctl_sched_cluster = 1;
+unsigned int __read_mostly sysctl_sched_cluster = 2;
static DEFINE_MUTEX(sched_cluster_mutex);
int sched_cluster_handler(struct ctl_table *table, int write,
--
2.20.1
Hi Tim,
On 2021/12/4 4:32, Tim Chen wrote:
> Add boot time parameter sched_cluster to enable or disable cluster
> scheduling. Set boot parameter as follow:
>
> sched_cluster=0 disables cluster scheduling
> sched_cluster=1 enables cluster scheduling
>
> Signed-off-by: Tim Chen <[email protected]>
> ---
> Documentation/admin-guide/kernel-parameters.txt | 4 ++++
> kernel/sched/topology.c | 16 ++++++++++++++++
> 2 files changed, 20 insertions(+)
>
> diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
> index 9725c546a0d4..40ad997430e1 100644
> --- a/Documentation/admin-guide/kernel-parameters.txt
> +++ b/Documentation/admin-guide/kernel-parameters.txt
> @@ -5023,6 +5023,10 @@
>
> sched_verbose [KNL] Enables verbose scheduler debug messages.
>
> + sched_cluster= Enable or disable cluster scheduling.
> + 0 -- disable.
> + 1 -- enable.
> +
One minor question here. We use 0|1 for sched_cluster here, but "enable" or
"disable" for schedstats below. Should we keep consistent?
> schedstats= [KNL,X86] Enable or disable scheduled statistics.
> Allowed values are enable and disable. This feature
> incurs a small amount of overhead in the scheduler
> diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
> index e362bba29f95..087854d505f7 100644
> --- a/kernel/sched/topology.c
> +++ b/kernel/sched/topology.c
> @@ -1697,6 +1697,22 @@ int sched_cluster_handler(struct ctl_table *table, int write,
>
> return ret;
> }
> +
> +static int __init sched_cluster_option(char *str)
> +{
> + int enable;
> +
> + if (get_option(&str, &enable)) {
> + if (enable != 0 && enable != 1)
> + return -EINVAL;
> +
> + sysctl_sched_cluster = enable;
> + return 0;
> + }
> +
> + return -EINVAL;
> +}
> +early_param("sched_cluster", sched_cluster_option);
> #endif
>
> static struct sched_domain_topology_level *next_tl(struct sched_domain_topology_level *tl)
>
On Fri, Dec 03, 2021 at 12:32:37PM -0800, Tim Chen wrote:
> Tim Chen (5):
> scheduler: Create SDTL_SKIP flag to skip topology level
> scheduler: Add SD_CLUSTER topology flag to cluster sched domain
> scheduler: Add runtime knob sysctl_sched_cluster
> scheduler: Add boot time enabling/disabling of cluster scheduling
> scheduler: Default cluster scheduling to off on x86 hybrid CPU
s/scheduler:/sched:/, surely?
> .../admin-guide/kernel-parameters.txt | 4 +
> arch/x86/kernel/smpboot.c | 26 +++++++
> drivers/base/arch_topology.c | 23 +++++-
> include/linux/sched/sd_flags.h | 7 ++
> include/linux/sched/sysctl.h | 6 ++
> include/linux/sched/topology.h | 3 +-
> include/linux/topology.h | 7 ++
> kernel/sched/core.c | 1 +
> kernel/sched/sched.h | 6 ++
> kernel/sched/topology.c | 75 ++++++++++++++++++-
> kernel/sysctl.c | 11 +++
> 11 files changed, 163 insertions(+), 6 deletions(-)
*groan*,... I was more thinking of something like so.
---
arch/x86/kernel/smpboot.c | 14 ++++++++++++++
1 file changed, 14 insertions(+)
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index ac2909f0cab3..617012f4619f 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -579,6 +579,17 @@ static struct sched_domain_topology_level x86_numa_in_package_topology[] = {
{ NULL, },
};
+static struct sched_domain_topology_level x86_hybrid_topology[] = {
+#ifdef CONFIG_SCHED_SMT
+ { cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT) },
+#endif
+#ifdef CONFIG_SCHED_MC
+ { cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC) },
+#endif
+ { cpu_cpu_mask, SD_INIT_NAME(DIE) },
+ { NULL, },
+};
+
static struct sched_domain_topology_level x86_topology[] = {
#ifdef CONFIG_SCHED_SMT
{ cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT) },
@@ -1469,8 +1480,11 @@ void __init native_smp_cpus_done(unsigned int max_cpus)
calculate_max_logical_packages();
+ /* XXX for now assume numa-in-package and hybrid don't overlap */
if (x86_has_numa_in_package)
set_sched_topology(x86_numa_in_package_topology);
+ if (cpu_feature_enabled(X86_FEATURE_HYBRID_CPU))
+ set_sched_topology(x86_hybrid_topology);
nmi_selftest();
impress_friends();
On Sat, 2021-12-04 at 10:14 +0100, Peter Zijlstra wrote:
> On Fri, Dec 03, 2021 at 12:32:37PM -0800, Tim Chen wrote:
> > Tim Chen (5):
> > scheduler: Create SDTL_SKIP flag to skip topology level
> > scheduler: Add SD_CLUSTER topology flag to cluster sched domain
> > scheduler: Add runtime knob sysctl_sched_cluster
> > scheduler: Add boot time enabling/disabling of cluster scheduling
> > scheduler: Default cluster scheduling to off on x86 hybrid CPU
>
> s/scheduler:/sched:/, surely?
>
> > .../admin-guide/kernel-parameters.txt | 4 +
> > arch/x86/kernel/smpboot.c | 26 +++++++
> > drivers/base/arch_topology.c | 23 +++++-
> > include/linux/sched/sd_flags.h | 7 ++
> > include/linux/sched/sysctl.h | 6 ++
> > include/linux/sched/topology.h | 3 +-
> > include/linux/topology.h | 7 ++
> > kernel/sched/core.c | 1 +
> > kernel/sched/sched.h | 6 ++
> > kernel/sched/topology.c | 75
> > ++++++++++++++++++-
> > kernel/sysctl.c | 11 +++
> > 11 files changed, 163 insertions(+), 6 deletions(-)
>
> *groan*,... I was more thinking of something like so.
>
Thanks. This is much more straightforward for
updating the default topology for hybrid CPU.
Tim
> ---
> arch/x86/kernel/smpboot.c | 14 ++++++++++++++
> 1 file changed, 14 insertions(+)
>
> diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
> index ac2909f0cab3..617012f4619f 100644
> --- a/arch/x86/kernel/smpboot.c
> +++ b/arch/x86/kernel/smpboot.c
> @@ -579,6 +579,17 @@ static struct sched_domain_topology_level
> x86_numa_in_package_topology[] = {
> { NULL, },
> };
>
> +static struct sched_domain_topology_level x86_hybrid_topology[] = {
> +#ifdef CONFIG_SCHED_SMT
> + { cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT) },
> +#endif
> +#ifdef CONFIG_SCHED_MC
> + { cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC) },
> +#endif
> + { cpu_cpu_mask, SD_INIT_NAME(DIE) },
> + { NULL, },
> +};
> +
> static struct sched_domain_topology_level x86_topology[] = {
> #ifdef CONFIG_SCHED_SMT
> { cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT) },
> @@ -1469,8 +1480,11 @@ void __init native_smp_cpus_done(unsigned int
> max_cpus)
>
> calculate_max_logical_packages();
>
> + /* XXX for now assume numa-in-package and hybrid don't overlap
> */
> if (x86_has_numa_in_package)
> set_sched_topology(x86_numa_in_package_topology);
> + if (cpu_feature_enabled(X86_FEATURE_HYBRID_CPU))
> + set_sched_topology(x86_hybrid_topology);
>
> nmi_selftest();
> impress_friends();
On Sat, Dec 04, 2021 at 10:14:02AM +0100, Peter Zijlstra wrote:
> On Fri, Dec 03, 2021 at 12:32:37PM -0800, Tim Chen wrote:
> > Tim Chen (5):
> > scheduler: Create SDTL_SKIP flag to skip topology level
> > scheduler: Add SD_CLUSTER topology flag to cluster sched domain
> > scheduler: Add runtime knob sysctl_sched_cluster
> > scheduler: Add boot time enabling/disabling of cluster scheduling
> > scheduler: Default cluster scheduling to off on x86 hybrid CPU
>
> s/scheduler:/sched:/, surely?
>
> > .../admin-guide/kernel-parameters.txt | 4 +
> > arch/x86/kernel/smpboot.c | 26 +++++++
> > drivers/base/arch_topology.c | 23 +++++-
> > include/linux/sched/sd_flags.h | 7 ++
> > include/linux/sched/sysctl.h | 6 ++
> > include/linux/sched/topology.h | 3 +-
> > include/linux/topology.h | 7 ++
> > kernel/sched/core.c | 1 +
> > kernel/sched/sched.h | 6 ++
> > kernel/sched/topology.c | 75 ++++++++++++++++++-
> > kernel/sysctl.c | 11 +++
> > 11 files changed, 163 insertions(+), 6 deletions(-)
>
> *groan*,... I was more thinking of something like so.
>
> ---
> arch/x86/kernel/smpboot.c | 14 ++++++++++++++
> 1 file changed, 14 insertions(+)
>
> diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
> index ac2909f0cab3..617012f4619f 100644
> --- a/arch/x86/kernel/smpboot.c
> +++ b/arch/x86/kernel/smpboot.c
> @@ -579,6 +579,17 @@ static struct sched_domain_topology_level x86_numa_in_package_topology[] = {
> { NULL, },
> };
>
> +static struct sched_domain_topology_level x86_hybrid_topology[] = {
> +#ifdef CONFIG_SCHED_SMT
> + { cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT) },
> +#endif
> +#ifdef CONFIG_SCHED_MC
> + { cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC) },
> +#endif
> + { cpu_cpu_mask, SD_INIT_NAME(DIE) },
> + { NULL, },
> +};
> +
> static struct sched_domain_topology_level x86_topology[] = {
> #ifdef CONFIG_SCHED_SMT
> { cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT) },
> @@ -1469,8 +1480,11 @@ void __init native_smp_cpus_done(unsigned int max_cpus)
>
> calculate_max_logical_packages();
>
> + /* XXX for now assume numa-in-package and hybrid don't overlap */
> if (x86_has_numa_in_package)
> set_sched_topology(x86_numa_in_package_topology);
> + if (cpu_feature_enabled(X86_FEATURE_HYBRID_CPU))
> + set_sched_topology(x86_hybrid_topology);
>
> nmi_selftest();
> impress_friends();
FWIW, I tested this and now I don't see cluster scheduling interfering
with asymmetric packing when running on a x86 hybrid topology.
Tested-by: Ricardo Neri <[email protected]>
On Sat, 2021-12-04 at 10:14 +0100, Peter Zijlstra wrote:
>
>
Peter,
If you wish, you can use the following change log for your
abbreviated patch.
---
sched: Don't use cluster topology for x86 hybrid CPUs
For x86 hybrid CPUs like Alder Lake, the order of CPU selection should
be based strictly on CPU priority. Don't include cluster topology for
hybrid CPUs to avoid interference with such CPU selection order.
On Alder Lake, the Atom CPU cluster has more capacity (4 Atom CPUs)
vs Big core cluster (2 hyperthread CPUs). This could potentially
bias CPU selection towards Atom over Big Core, when Big core
CPU has higher priority.
---
Thanks.
Acked-by: Tim Chen <[email protected]>
Tim
> ---
> arch/x86/kernel/smpboot.c | 14 ++++++++++++++
> 1 file changed, 14 insertions(+)
>
> diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
> index ac2909f0cab3..617012f4619f 100644
> --- a/arch/x86/kernel/smpboot.c
> +++ b/arch/x86/kernel/smpboot.c
> @@ -579,6 +579,17 @@ static struct sched_domain_topology_level
> x86_numa_in_package_topology[] = {
> { NULL, },
> };
>
> +static struct sched_domain_topology_level x86_hybrid_topology[] = {
> +#ifdef CONFIG_SCHED_SMT
> + { cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT) },
> +#endif
> +#ifdef CONFIG_SCHED_MC
> + { cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC) },
> +#endif
> + { cpu_cpu_mask, SD_INIT_NAME(DIE) },
> + { NULL, },
> +};
> +
> static struct sched_domain_topology_level x86_topology[] = {
> #ifdef CONFIG_SCHED_SMT
> { cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT) },
> @@ -1469,8 +1480,11 @@ void __init native_smp_cpus_done(unsigned int
> max_cpus)
>
> calculate_max_logical_packages();
>
> + /* XXX for now assume numa-in-package and hybrid don't overlap
> */
> if (x86_has_numa_in_package)
> set_sched_topology(x86_numa_in_package_topology);
> + if (cpu_feature_enabled(X86_FEATURE_HYBRID_CPU))
> + set_sched_topology(x86_hybrid_topology);
>
> nmi_selftest();
> impress_friends();
The following commit has been merged into the sched/urgent branch of tip:
Commit-ID: cabdc3a8475b918e55744f43719b26a82dc8fa6b
Gitweb: https://git.kernel.org/tip/cabdc3a8475b918e55744f43719b26a82dc8fa6b
Author: Peter Zijlstra <[email protected]>
AuthorDate: Sat, 04 Dec 2021 10:14:02 +01:00
Committer: Peter Zijlstra <[email protected]>
CommitterDate: Wed, 08 Dec 2021 22:15:37 +01:00
sched,x86: Don't use cluster topology for x86 hybrid CPUs
For x86 hybrid CPUs like Alder Lake, the order of CPU selection should
be based strictly on CPU priority. Don't include cluster topology for
hybrid CPUs to avoid interference with such CPU selection order.
On Alder Lake, the Atom CPU cluster has more capacity (4 Atom CPUs) vs
Big core cluster (2 hyperthread CPUs). This could potentially bias CPU
selection towards Atom over Big Core, when Big core CPU has higher
priority.
Fixes: 66558b730f25 ("sched: Add cluster scheduler level for x86")
Suggested-by: Tim Chen <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Acked-by: Tim Chen <[email protected]>
Tested-by: Ricardo Neri <[email protected]>
Link: https://lkml.kernel.org/r/[email protected]
---
arch/x86/kernel/smpboot.c | 14 ++++++++++++++
1 file changed, 14 insertions(+)
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index ac2909f..617012f 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -579,6 +579,17 @@ static struct sched_domain_topology_level x86_numa_in_package_topology[] = {
{ NULL, },
};
+static struct sched_domain_topology_level x86_hybrid_topology[] = {
+#ifdef CONFIG_SCHED_SMT
+ { cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT) },
+#endif
+#ifdef CONFIG_SCHED_MC
+ { cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC) },
+#endif
+ { cpu_cpu_mask, SD_INIT_NAME(DIE) },
+ { NULL, },
+};
+
static struct sched_domain_topology_level x86_topology[] = {
#ifdef CONFIG_SCHED_SMT
{ cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT) },
@@ -1469,8 +1480,11 @@ void __init native_smp_cpus_done(unsigned int max_cpus)
calculate_max_logical_packages();
+ /* XXX for now assume numa-in-package and hybrid don't overlap */
if (x86_has_numa_in_package)
set_sched_topology(x86_numa_in_package_topology);
+ if (cpu_feature_enabled(X86_FEATURE_HYBRID_CPU))
+ set_sched_topology(x86_hybrid_topology);
nmi_selftest();
impress_friends();