From: Wang Qing <[email protected]>
Use nested cluster structures in DT to support describing multi-level
cluster topologies.
Note: the clusters described in DT are currently not physical
boundaries. Since changing "cluster" to "socket" is too involved and error
prone, this patch will not have any effect on one-level cluster topologies,
but it can support multi-level cluster topologies in order to support
CLUSTER_SCHED.
Signed-off-by: Wang Qing <[email protected]>
---
drivers/base/arch_topology.c | 25 +++++++++++++++++--------
1 file changed, 17 insertions(+), 8 deletions(-)
diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c
index 1d6636ebaac5..f2ea8113d619 100644
--- a/drivers/base/arch_topology.c
+++ b/drivers/base/arch_topology.c
@@ -491,7 +491,7 @@ static int __init get_cpu_for_node(struct device_node *node)
}
static int __init parse_core(struct device_node *core, int package_id,
- int core_id)
+ int cluster_id, int core_id)
{
char name[20];
bool leaf = true;
@@ -507,6 +507,7 @@ static int __init parse_core(struct device_node *core, int package_id,
cpu = get_cpu_for_node(t);
if (cpu >= 0) {
cpu_topology[cpu].package_id = package_id;
+ cpu_topology[cpu].cluster_id = cluster_id;
cpu_topology[cpu].core_id = core_id;
cpu_topology[cpu].thread_id = i;
} else if (cpu != -ENODEV) {
@@ -528,6 +529,7 @@ static int __init parse_core(struct device_node *core, int package_id,
}
cpu_topology[cpu].package_id = package_id;
+ cpu_topology[cpu].cluster_id = cluster_id;
cpu_topology[cpu].core_id = core_id;
} else if (leaf && cpu != -ENODEV) {
pr_err("%pOF: Can't get CPU for leaf core\n", core);
@@ -544,13 +546,15 @@ static int __init parse_cluster(struct device_node *cluster, int depth)
bool has_cores = false;
struct device_node *c;
static int package_id __initdata;
+ static int cluster_id __initdata;
int core_id = 0;
int i, ret;
/*
- * First check for child clusters; we currently ignore any
- * information about the nesting of clusters and present the
- * scheduler with a flat list of them.
+ * nesting of clusters :
+ * level 1: package_id
+ * level 2: cluster_id
+ * level 3+: ignore
*/
i = 0;
do {
@@ -559,6 +563,14 @@ static int __init parse_cluster(struct device_node *cluster, int depth)
if (c) {
leaf = false;
ret = parse_cluster(c, depth + 1);
+ if (depth == 0) {
+ package_id++;
+ cluster_id = 0;
+ } else if (depth == 1)
+ cluster_id++;
+ else
+ pr_err("Ignore nested clusters with more than two levels!\n");
+
of_node_put(c);
if (ret != 0)
return ret;
@@ -582,7 +594,7 @@ static int __init parse_cluster(struct device_node *cluster, int depth)
}
if (leaf) {
- ret = parse_core(c, package_id, core_id++);
+ ret = parse_core(c, package_id, cluster_id, core_id++);
} else {
pr_err("%pOF: Non-leaf cluster with core %s\n",
cluster, name);
@@ -599,9 +611,6 @@ static int __init parse_cluster(struct device_node *cluster, int depth)
if (leaf && !has_cores)
pr_warn("%pOF: empty cluster\n", cluster);
- if (leaf)
- package_id++;
-
return 0;
}
--
2.7.4
On 05/05/2022 10:35, Qing Wang wrote:
> From: Wang Qing <[email protected]>
>
> Use nested cluster structures in DT to support describing multi-level
> cluster topologies.
>
> Notice: the clusters describing in DT currently are not physical
> boundaries, since changing "cluster" to "socket" is too involved and error
> prone, this patch will not have any effect on one-level cluster topo, but
> can support the mutil-level cluster topo to support CLUSTER_SCHED.
>
> Signed-off-by: Wang Qing <[email protected]>
> ---
> drivers/base/arch_topology.c | 25 +++++++++++++++++--------
> 1 file changed, 17 insertions(+), 8 deletions(-)
>
> diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c
> index 1d6636ebaac5..f2ea8113d619 100644
> --- a/drivers/base/arch_topology.c
> +++ b/drivers/base/arch_topology.c
> @@ -491,7 +491,7 @@ static int __init get_cpu_for_node(struct device_node *node)
> }
>
> static int __init parse_core(struct device_node *core, int package_id,
> - int core_id)
> + int cluster_id, int core_id)
> {
> char name[20];
> bool leaf = true;
> @@ -507,6 +507,7 @@ static int __init parse_core(struct device_node *core, int package_id,
> cpu = get_cpu_for_node(t);
> if (cpu >= 0) {
> cpu_topology[cpu].package_id = package_id;
> + cpu_topology[cpu].cluster_id = cluster_id;
> cpu_topology[cpu].core_id = core_id;
> cpu_topology[cpu].thread_id = i;
> } else if (cpu != -ENODEV) {
> @@ -528,6 +529,7 @@ static int __init parse_core(struct device_node *core, int package_id,
> }
>
> cpu_topology[cpu].package_id = package_id;
> + cpu_topology[cpu].cluster_id = cluster_id;
> cpu_topology[cpu].core_id = core_id;
> } else if (leaf && cpu != -ENODEV) {
> pr_err("%pOF: Can't get CPU for leaf core\n", core);
> @@ -544,13 +546,15 @@ static int __init parse_cluster(struct device_node *cluster, int depth)
> bool has_cores = false;
> struct device_node *c;
> static int package_id __initdata;
> + static int cluster_id __initdata;
Starting with cluster_id = 0 breaks existing platforms like
./arch/arm64/boot/dts/arm/juno.dts. For them it still has to be set to -1.
You get e.g.:
# cat /sys/kernel/debug/sched/domains/cpu1/domain*/name
CLS
DIE
instead of:
# cat /sys/kernel/debug/sched/domains/cpu1/domain*/name
MC
DIE
> int core_id = 0;
> int i, ret;
>
> /*
> - * First check for child clusters; we currently ignore any
> - * information about the nesting of clusters and present the
> - * scheduler with a flat list of them.
> + * nesting of clusters :
> + * level 1: package_id
> + * level 2: cluster_id
> + * level 3+: ignore
> */
> i = 0;
> do {
> @@ -559,6 +563,14 @@ static int __init parse_cluster(struct device_node *cluster, int depth)
> if (c) {
> leaf = false;
> ret = parse_cluster(c, depth + 1);
> + if (depth == 0) {
> + package_id++;
> + cluster_id = 0;
- cluster_id = 0;
+ cluster_id = -1;
This would have to be cluster_id = -1, since 0 is a valid second-level
cluster ID. Otherwise you're not removing the CLS data from CPU4 to CPU7 in
the `Armv9 with L2 complexes` cpu-map example I used for testing:
cpu-map {
cluster0 {
cluster0 {
core0 {
cpu = <&cpu0>;
};
core1 {
cpu = <&cpu1>;
};
};
cluster1 {
core0 {
cpu = <&cpu2>;
};
core1 {
cpu = <&cpu3>;
};
};
};
cluster1 {
core0 {
cpu = <&cpu4>;
};
core1 {
cpu = <&cpu5>;
};
core2 {
cpu = <&cpu6>;
};
};
cluster2 {
core0 {
cpu = <&cpu7>;
};
};
};
> + } else if (depth == 1)
> + cluster_id++;
> + else
> + pr_err("Ignore nested clusters with more than two levels!\n");
> +
> of_node_put(c);
> if (ret != 0)
> return ret;
> @@ -582,7 +594,7 @@ static int __init parse_cluster(struct device_node *cluster, int depth)
> }
>
> if (leaf) {
> - ret = parse_core(c, package_id, core_id++);
> + ret = parse_core(c, package_id, cluster_id, core_id++);
> } else {
> pr_err("%pOF: Non-leaf cluster with core %s\n",
> cluster, name);
> @@ -599,9 +611,6 @@ static int __init parse_cluster(struct device_node *cluster, int depth)
> if (leaf && !has_cores)
> pr_warn("%pOF: empty cluster\n", cluster);
>
> - if (leaf)
> - package_id++;
> -
> return 0;
> }
Looks like you also need to adapt update_siblings_masks() to only set
cpu in &cpu_topo->thread_sibling and &cpuid_topo->thread_sibling when
`cpu_topo->thread_id != -1`.
@@ -723,11 +723,11 @@ void update_siblings_masks(unsigned int cpuid)
cpumask_set_cpu(cpuid, &cpu_topo->core_sibling);
cpumask_set_cpu(cpu, &cpuid_topo->core_sibling);
- if (cpuid_topo->core_id != cpu_topo->core_id)
- continue;
-
- cpumask_set_cpu(cpuid, &cpu_topo->thread_sibling);
- cpumask_set_cpu(cpu, &cpuid_topo->thread_sibling);
+ if (cpu_topo->thread_id != -1 &&
+ cpuid_topo->core_id == cpu_topo->core_id) {
+ cpumask_set_cpu(cpuid, &cpu_topo->thread_sibling);
+ cpumask_set_cpu(cpu, &cpuid_topo->thread_sibling);
+
>> From: Wang Qing <[email protected]>
>>
>> Use nested cluster structures in DT to support describing multi-level
>> cluster topologies.
>>
>> Notice: the clusters describing in DT currently are not physical
>> boundaries, since changing "cluster" to "socket" is too involved and error
>> prone, this patch will not have any effect on one-level cluster topo, but
>> can support the mutil-level cluster topo to support CLUSTER_SCHED.
>>
>> Signed-off-by: Wang Qing <[email protected]>
>> ---
>> drivers/base/arch_topology.c | 25 +++++++++++++++++--------
>> 1 file changed, 17 insertions(+), 8 deletions(-)
>>
>> diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c
>> index 1d6636ebaac5..f2ea8113d619 100644
>> --- a/drivers/base/arch_topology.c
>> +++ b/drivers/base/arch_topology.c
>> @@ -491,7 +491,7 @@ static int __init get_cpu_for_node(struct device_node *node)
>> }
>>
>> static int __init parse_core(struct device_node *core, int package_id,
>> - int core_id)
>> + int cluster_id, int core_id)
>> {
>> char name[20];
>> bool leaf = true;
>> @@ -507,6 +507,7 @@ static int __init parse_core(struct device_node *core, int package_id,
>> cpu = get_cpu_for_node(t);
>> if (cpu >= 0) {
>> cpu_topology[cpu].package_id = package_id;
>> + cpu_topology[cpu].cluster_id = cluster_id;
>> cpu_topology[cpu].core_id = core_id;
>> cpu_topology[cpu].thread_id = i;
>> } else if (cpu != -ENODEV) {
>> @@ -528,6 +529,7 @@ static int __init parse_core(struct device_node *core, int package_id,
>> }
>>
>> cpu_topology[cpu].package_id = package_id;
>> + cpu_topology[cpu].cluster_id = cluster_id;
>> cpu_topology[cpu].core_id = core_id;
>> } else if (leaf && cpu != -ENODEV) {
>> pr_err("%pOF: Can't get CPU for leaf core\n", core);
>> @@ -544,13 +546,15 @@ static int __init parse_cluster(struct device_node *cluster, int depth)
>> bool has_cores = false;
>> struct device_node *c;
>> static int package_id __initdata;
>> + static int cluster_id __initdata;
>
>Starting with cluster_id = 0 breaks existing platforms like
>./arch/arm64/boot/dts/arm/juno.dts. For them it has to be still set to -1.
Yes, I noticed this problem; please help review V2.
>
>You get e.g.:
>
># cat /sys/kernel/debug/sched/domains/cpu1/domain*/name
>CLS
>DIE
>
>instead of:
>
># cat /sys/kernel/debug/sched/domains/cpu1/domain*/name
>MC
>DIE
>
>
>> int core_id = 0;
>> int i, ret;
>>
>> /*
>> - * First check for child clusters; we currently ignore any
>> - * information about the nesting of clusters and present the
>> - * scheduler with a flat list of them.
>> + * nesting of clusters :
>> + * level 1: package_id
>> + * level 2: cluster_id
>> + * level 3+: ignore
>> */
>> i = 0;
>> do {
>> @@ -559,6 +563,14 @@ static int __init parse_cluster(struct device_node *cluster, int depth)
>> if (c) {
>> leaf = false;
>> ret = parse_cluster(c, depth + 1);
>> + if (depth == 0) {
>> + package_id++;
>> + cluster_id = 0;
>
>- cluster_id = 0;
>+ cluster_id = -1;
This modification alone is not enough.
>
>Would have to be cluster_id = -1. 0 is a valid 2. level cluster.
>Otherwise you're not removing the CLS data from CPU4 to CPU7 in the
>`Armv9 with L2 complexes` cpu-map example I used for testing:
>
> cpu-map {
> cluster0 {
> cluster0 {
> core0 {
> cpu = <&cpu0>;
> };
> core1 {
> cpu = <&cpu1>;
> };
> };
> cluster1 {
> core0 {
> cpu = <&cpu2>;
> };
> core1 {
> cpu = <&cpu3>;
> };
> };
> };
> cluster1 {
> core0 {
> cpu = <&cpu4>;
> };
> core1 {
> cpu = <&cpu5>;
> };
> core2 {
> cpu = <&cpu6>;
> };
> };
> cluster2 {
> core0 {
> cpu = <&cpu7>;
> };
> };
> };
>
>> + } else if (depth == 1)
>> + cluster_id++;
>> + else
>> + pr_err("Ignore nested clusters with more than two levels!\n");
>> +
>> of_node_put(c);
>> if (ret != 0)
>> return ret;
>> @@ -582,7 +594,7 @@ static int __init parse_cluster(struct device_node *cluster, int depth)
>> }
>>
>> if (leaf) {
>> - ret = parse_core(c, package_id, core_id++);
>> + ret = parse_core(c, package_id, cluster_id, core_id++);
>> } else {
>> pr_err("%pOF: Non-leaf cluster with core %s\n",
>> cluster, name);
>> @@ -599,9 +611,6 @@ static int __init parse_cluster(struct device_node *cluster, int depth)
>> if (leaf && !has_cores)
>> pr_warn("%pOF: empty cluster\n", cluster);
>>
>> - if (leaf)
>> - package_id++;
>> -
>> return 0;
>> }
>
>Looks like you also need to adapt update_siblings_masks() to only set
>cpu in &cpu_topo->thread_sibling and &cpuid_topo->thread_sibling when
>`cpu_topo->thread_id != -1`.
>
>@@ -723,11 +723,11 @@ void update_siblings_masks(unsigned int cpuid)
> cpumask_set_cpu(cpuid, &cpu_topo->core_sibling);
> cpumask_set_cpu(cpu, &cpuid_topo->core_sibling);
>
>- if (cpuid_topo->core_id != cpu_topo->core_id)
>- continue;
>-
>- cpumask_set_cpu(cpuid, &cpu_topo->thread_sibling);
>- cpumask_set_cpu(cpu, &cpuid_topo->thread_sibling);
>+ if (cpu_topo->thread_id != -1 &&
>+ cpuid_topo->core_id == cpu_topo->core_id) {
>+ cpumask_set_cpu(cpuid, &cpu_topo->thread_sibling);
>+ cpumask_set_cpu(cpu, &cpuid_topo->thread_sibling);
>+
This seems like another problem?
Thanks,
Qing
On 11/05/2022 11:38, 王擎 wrote:
>
>>> From: Wang Qing <[email protected]>
[...]
>> Would have to be cluster_id = -1. 0 is a valid 2. level cluster.
>> Otherwise you're not removing the CLS data from CPU4 to CPU7 in the
>> `Armv9 with L2 complexes` cpu-map example I used for testing:
>>
>> cpu-map {
>> cluster0 {
>> cluster0 {
>> core0 {
>> cpu = <&cpu0>;
>> };
>> core1 {
>> cpu = <&cpu1>;
>> };
>> };
>> cluster1 {
>> core0 {
>> cpu = <&cpu2>;
>> };
>> core1 {
>> cpu = <&cpu3>;
>> };
>> };
>> };
>> cluster1 {
>> core0 {
>> cpu = <&cpu4>;
>> };
>> core1 {
>> cpu = <&cpu5>;
>> };
>> core2 {
>> cpu = <&cpu6>;
>> };
>> };
>> cluster2 {
>> core0 {
>> cpu = <&cpu7>;
>> };
>> };
>> };
[...]
>> Looks like you also need to adapt update_siblings_masks() to only set
>> cpu in &cpu_topo->thread_sibling and &cpuid_topo->thread_sibling when
>> `cpu_topo->thread_id != -1`.
>>
>> @@ -723,11 +723,11 @@ void update_siblings_masks(unsigned int cpuid)
>> cpumask_set_cpu(cpuid, &cpu_topo->core_sibling);
>> cpumask_set_cpu(cpu, &cpuid_topo->core_sibling);
>>
>> - if (cpuid_topo->core_id != cpu_topo->core_id)
>> - continue;
>> -
>> - cpumask_set_cpu(cpuid, &cpu_topo->thread_sibling);
>> - cpumask_set_cpu(cpu, &cpuid_topo->thread_sibling);
>> + if (cpu_topo->thread_id != -1 &&
>> + cpuid_topo->core_id == cpu_topo->core_id) {
>> + cpumask_set_cpu(cpuid, &cpu_topo->thread_sibling);
>> + cpumask_set_cpu(cpu, &cpuid_topo->thread_sibling);
>> +
>
> This seems like another problem?
I don't think so. If you run a system with the cpu-map mentioned above
you get:
# cat /sys/kernel/debug/sched/domains/cpu0/domain*/name
SMT <-- !!!
CLS
MC
root@e125579:~# cat /proc/schedstat | awk '{print $1 " " $2 }' | grep ^[cd] | head -5
cpu0 0
domain0 05 <-- !!!
domain1 07 <-- !!!
domain2 0f
domain3 ff
So you get an unwanted SMT level for CPU0-3 and messed-up cpumasks
without this change.
>>
>>>> From: Wang Qing <[email protected]>
>
>[...]
>
>>> Would have to be cluster_id = -1. 0 is a valid 2. level cluster.
>>> Otherwise you're not removing the CLS data from CPU4 to CPU7 in the
>>> `Armv9 with L2 complexes` cpu-map example I used for testing:
>>>
>>> cpu-map {
>>> cluster0 {
>>> cluster0 {
>>> core0 {
>>> cpu = <&cpu0>;
>>> };
>>> core1 {
>>> cpu = <&cpu1>;
>>> };
>>> };
>>> cluster1 {
>>> core0 {
>>> cpu = <&cpu2>;
>>> };
>>> core1 {
>>> cpu = <&cpu3>;
>>> };
>>> };
>>> };
>>> cluster1 {
>>> core0 {
>>> cpu = <&cpu4>;
>>> };
>>> core1 {
>>> cpu = <&cpu5>;
>>> };
>>> core2 {
>>> cpu = <&cpu6>;
>>> };
>>> };
>>> cluster2 {
>>> core0 {
>>> cpu = <&cpu7>;
>>> };
>>> };
>>> };
>
>[...]
>
>>> Looks like you also need to adapt update_siblings_masks() to only set
>>> cpu in &cpu_topo->thread_sibling and &cpuid_topo->thread_sibling when
>>> `cpu_topo->thread_id != -1`.
>>>
>>> @@ -723,11 +723,11 @@ void update_siblings_masks(unsigned int cpuid)
>>> cpumask_set_cpu(cpuid, &cpu_topo->core_sibling);
>>> cpumask_set_cpu(cpu, &cpuid_topo->core_sibling);
>>>
>>> - if (cpuid_topo->core_id != cpu_topo->core_id)
>>> - continue;
>>> -
>>> - cpumask_set_cpu(cpuid, &cpu_topo->thread_sibling);
>>> - cpumask_set_cpu(cpu, &cpuid_topo->thread_sibling);
>>> + if (cpu_topo->thread_id != -1 &&
>>> + cpuid_topo->core_id == cpu_topo->core_id) {
>>> + cpumask_set_cpu(cpuid, &cpu_topo->thread_sibling);
>>> + cpumask_set_cpu(cpu, &cpuid_topo->thread_sibling);
>>> +
>>
>> This seems like another problem?
>
>I don't think so. If you run a system with the cpu-map mentioned above
That is not enough here: with the cpu-map below, CPU0, 1, 4 and 5 would all end up as SMT siblings:
cluster0 {
cluster0 {
core0 {
thread0 {
cpu = <&CPU0>;
};
thread1 {
cpu = <&CPU1>;
};
};
};
core1 {
thread0 {
cpu = <&CPU2>;
};
thread1 {
cpu = <&CPU3>;
};
};
};
cluster1 {
core0 {
thread0 {
cpu = <&CPU4>;
};
thread1 {
cpu = <&CPU5>;
};
};
};
core1 {
thread0 {
cpu = <&CPU6>;
};
thread1 {
cpu = <&CPU7>;
};
};
};
...
I will handle this.
Thanks,
Qing
>you get:
>
># cat /sys/kernel/debug/sched/domains/cpu0/domain*/name
>SMT <-- !!!
>CLS
>MC
>
>root@e125579:~# cat /proc/schedstat | awk '{print $1 " " $2 }' | grep
>^[cd] | head -5
>cpu0 0
>domain0 05 <-- !!!
>domain1 07 <-- !!!
>domain2 0f
>domain3 ff
>
>So you get an unwanted SMT level for CPU0-3 and messed-up cpumasks
>without this change.