From: Wang Qing <[email protected]>
On some architectures (e.g. ARM64), caches are implemented as below:
cluster: ****** cluster 0 ***** ****** cluster 1 *****
core: 0 1 2 3 4 5 6 7
cache(Leveln): **cache0** **cache1** **cache2** **cache3**
sd_llc_id(current): 0 0 0 0 4 4 4 4
sd_llc_id(should be): 0 0 2 2 4 4 6 6
Caches and CPUs have different topologies; this causes cpus_share_cache()
to return the wrong value, which will affect CPU load balancing.
Cache topology should be separated from CPU topology; it can preferentially
be obtained from the "next-level-cache" property in the DTS.
Signed-off-by: Wang Qing <[email protected]>
---
arch/arm64/kernel/smp.c | 1 +
drivers/base/arch_topology.c | 23 +++++++++++++++++++++++
include/linux/arch_topology.h | 3 +++
kernel/sched/topology.c | 33 +++++++++++++++++++++++++++++++--
4 files changed, 58 insertions(+), 2 deletions(-)
diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
index 27df5c1..94cf649
--- a/arch/arm64/kernel/smp.c
+++ b/arch/arm64/kernel/smp.c
@@ -723,6 +723,7 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
unsigned int this_cpu;
init_cpu_topology();
+ init_cpu_cache_topology();
this_cpu = smp_processor_id();
store_cpu_topology(this_cpu);
diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c
index 9761541..613213f
--- a/drivers/base/arch_topology.c
+++ b/drivers/base/arch_topology.c
@@ -613,6 +613,7 @@ static int __init parse_dt_topology(void)
*/
struct cpu_topology cpu_topology[NR_CPUS];
EXPORT_SYMBOL_GPL(cpu_topology);
+struct device_node *cache_topology[NR_CPUS][MAX_CPU_CACHE_LEVEL];
const struct cpumask *cpu_coregroup_mask(int cpu)
{
@@ -738,4 +739,26 @@ void __init init_cpu_topology(void)
else if (of_have_populated_dt() && parse_dt_topology())
reset_cpu_topology();
}
+
+void __init init_cpu_cache_topology(void)
+{
+ struct device_node *node_cpu, *node_cache;
+ int cpu, level;
+
+ for_each_possible_cpu(cpu) {
+ node_cpu = of_get_cpu_node(cpu, NULL);
+ if (!node_cpu)
+ continue;
+
+ level = 0;
+ node_cache = node_cpu;
+ while (level < MAX_CPU_CACHE_LEVEL) {
+ node_cache = of_parse_phandle(node_cache, "next-level-cache", 0);
+ if (!node_cache)
+ break;
+ cache_topology[cpu][level++] = node_cache;
+ }
+ of_node_put(node_cpu);
+ }
+}
#endif
diff --git a/include/linux/arch_topology.h b/include/linux/arch_topology.h
index cce6136b..d37f47d
--- a/include/linux/arch_topology.h
+++ b/include/linux/arch_topology.h
@@ -72,6 +72,8 @@ struct cpu_topology {
};
#ifdef CONFIG_GENERIC_ARCH_TOPOLOGY
+#define MAX_CPU_CACHE_LEVEL 7
+extern struct device_node *cache_topology[NR_CPUS][MAX_CPU_CACHE_LEVEL];
extern struct cpu_topology cpu_topology[NR_CPUS];
#define topology_physical_package_id(cpu) (cpu_topology[cpu].package_id)
@@ -82,6 +84,7 @@ extern struct cpu_topology cpu_topology[NR_CPUS];
#define topology_cluster_cpumask(cpu) (&cpu_topology[cpu].cluster_sibling)
#define topology_llc_cpumask(cpu) (&cpu_topology[cpu].llc_sibling)
void init_cpu_topology(void);
+void init_cpu_cache_topology(void);
void store_cpu_topology(unsigned int cpuid);
const struct cpumask *cpu_coregroup_mask(int cpu);
const struct cpumask *cpu_clustergroup_mask(int cpu);
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index d201a70..10850d6
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -650,6 +650,36 @@ DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing);
DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity);
DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity);
+static void set_sd_llc(int cpu, struct sched_domain *sd, int *first_cpu, int *cpu_num)
+{
+ int cache_level, cpu_id;
+ int first, last;
+ int id = cpumask_first(sched_domain_span(sd));
+ int size = cpumask_weight(sched_domain_span(sd));
+
+ *first_cpu = id;
+ *cpu_num = size;
+
+ for (cache_level = 0; cache_level < MAX_CPU_CACHE_LEVEL; cache_level++) {
+ if (!cache_topology[cpu][cache_level])
+ break;
+
+ first = -1;
+ last = id;
+ for (cpu_id = 0; cpu_id < NR_CPUS; cpu_id++) {
+ if (cache_topology[cpu][cache_level] == cache_topology[cpu_id][cache_level]) {
+ if (cpu_id >= id && cpu_id < id + size) {
+ first = (first == -1)?cpu_id:first;
+ last = cpu_id;
+ } else
+ return;
+ }
+ }
+ *first_cpu = first;
+ *cpu_num = last - first + 1;
+ }
+}
+
static void update_top_cache_domain(int cpu)
{
struct sched_domain_shared *sds = NULL;
@@ -659,8 +689,7 @@ static void update_top_cache_domain(int cpu)
sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
if (sd) {
- id = cpumask_first(sched_domain_span(sd));
- size = cpumask_weight(sched_domain_span(sd));
+ set_sd_llc(cpu, sd, &id, &size);
sds = sd->shared;
}
--
2.7.4
Hi Qing,
Thank you for the patch! Perhaps something to improve:
[auto build test WARNING on arm64/for-next/core]
[also build test WARNING on driver-core/driver-core-testing tip/sched/core linus/master v5.17-rc7 next-20220310]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]
url: https://github.com/0day-ci/linux/commits/Qing-Wang/sched-topology-make-cache-topology-separate-from-cpu-topology/20220310-210139
base: https://git.kernel.org/pub/scm/linux/kernel/git/arm64/linux.git for-next/core
config: i386-randconfig-a003 (https://download.01.org/0day-ci/archive/20220311/[email protected]/config)
compiler: gcc-9 (Ubuntu 9.4.0-1ubuntu1~20.04) 9.4.0
reproduce (this is a W=1 build):
# https://github.com/0day-ci/linux/commit/035814a878b0c4cb313ebd35464e2149d8592d89
git remote add linux-review https://github.com/0day-ci/linux
git fetch --no-tags linux-review Qing-Wang/sched-topology-make-cache-topology-separate-from-cpu-topology/20220310-210139
git checkout 035814a878b0c4cb313ebd35464e2149d8592d89
# save the config file to linux build tree
mkdir build_dir
make W=1 O=build_dir ARCH=i386 SHELL=/bin/bash kernel/sched/
If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <[email protected]>
All warnings (new ones prefixed by >>):
kernel/sched/topology.c: In function 'set_sd_llc':
kernel/sched/topology.c:663:38: error: 'MAX_CPU_CACHE_LEVEL' undeclared (first use in this function)
663 | for (cache_level = 0; cache_level < MAX_CPU_CACHE_LEVEL; cache_level++) {
| ^~~~~~~~~~~~~~~~~~~
kernel/sched/topology.c:663:38: note: each undeclared identifier is reported only once for each function it appears in
kernel/sched/topology.c:664:8: error: 'cache_topology' undeclared (first use in this function); did you mean 'cpu_topology'?
664 | if (!cache_topology[cpu][cache_level])
| ^~~~~~~~~~~~~~
| cpu_topology
>> kernel/sched/topology.c:653:28: warning: parameter 'cpu' set but not used [-Wunused-but-set-parameter]
653 | static void set_sd_llc(int cpu, struct sched_domain *sd, int *first_cpu, int *cpu_num)
| ~~~~^~~
vim +/cpu +653 kernel/sched/topology.c
652
> 653 static void set_sd_llc(int cpu, struct sched_domain *sd, int *first_cpu, int *cpu_num)
654 {
655 int cache_level, cpu_id;
656 int first, last;
657 int id = cpumask_first(sched_domain_span(sd));
658 int size = cpumask_weight(sched_domain_span(sd));
659
660 *first_cpu = id;
661 *cpu_num = size;
662
> 663 for (cache_level = 0; cache_level < MAX_CPU_CACHE_LEVEL; cache_level++) {
664 if (!cache_topology[cpu][cache_level])
665 break;
666
667 first = -1;
668 last = id;
669 for (cpu_id = 0; cpu_id < NR_CPUS; cpu_id++) {
670 if (cache_topology[cpu][cache_level] == cache_topology[cpu_id][cache_level]) {
671 if (cpu_id >= id && cpu_id < id + size) {
672 first = (first == -1)?cpu_id:first;
673 last = cpu_id;
674 } else
675 return;
676 }
677 }
678 *first_cpu = first;
679 *cpu_num = last - first + 1;
680 }
681 }
682
---
0-DAY CI Kernel Test Service
https://lists.01.org/hyperkitty/list/[email protected]
>>
>>
>> >>
>> >>
>> >> >On Thu, 10 Mar 2022 at 13:59, Qing Wang <[email protected]> wrote:
>> >> >>
>> >> >> From: Wang Qing <[email protected]>
>> >> >>
>> >> >> Some architectures(e.g. ARM64), caches are implemented below:
>> >> >> cluster: ****** cluster 0 ***** ****** cluster 1 *****
>> >> >> core: 0 1 2 3 4 5 6 7
>> >> (add cache level 1) c0 c1 c2 c3 c4 c5 c6 c7
>> >> >> cache(Leveln): **cache0** **cache1** **cache2** **cache3**
>> >> (add cache level 3) *************share level 3 cache ***************
>> >> >> sd_llc_id(current): 0 0 0 0 4 4 4 4
>> >> >> sd_llc_id(should be): 0 0 2 2 4 4 6 6
>> >> >>
>> >> Here, n always be 2 in ARM64, but others are also possible.
>> >> core[0,1] form a complex(ARMV9), which share L2 cache, core[2,3] is the same.
>> >>
>> >> >> Caches and cpus have different topology, this causes cpus_share_cache()
>> >> >> return the wrong value, which will affect the CPU load balance.
>> >> >>
>> >> >What does your current scheduler topology look like?
>> >> >
>> >> >For CPU 0 to 3, do you have the below ?
>> >> >DIE [0 - 3] [4-7]
>> >> >MC [0] [1] [2] [3]
>> >>
>> >> The current scheduler topology consistent with CPU topology:
>> >> DIE [0-7]
>> >> MC [0-3] [4-7] (SD_SHARE_PKG_RESOURCES)
>> >> Most Android phones have this topology.
>> >> >
>> >> >But you would like something like below for cpu 0-1 instead ?
>> >> >DIE [0 - 3] [4-7]
>> >> >CLS [0 - 1] [2 - 3]
>> >> >MC [0] [1]
>> >> >
>> >> >with SD_SHARE_PKG_RESOURCES only set to MC level ?
>> >>
>> >> We don't change the current scheduler topology, but the
>> >> cache topology should be separated like below:
>> >
>> >The scheduler topology is not only cpu topology but a mixed of cpu and
>> >cache/memory cache topology
>> >
>> >> [0-7] (shared level 3 cache )
>> >> [0-1] [2-3][4-5][6-7] (shared level 2 cache )
>> >
>> >So you don't bother the intermediate cluster level which is even simpler.
>> >you have to modify generic arch topology so that cpu_coregroup_mask
>> >returns the correct cpu mask directly.
>> >
>> >You will notice a llc_sibling field that is currently used by acpi but
>> >not DT to return llc cpu mask
>> >
>> cpu_topology[].llc_sibling describe the last level cache of whole system,
>> not in the sched_domain.
>>
>> in the above cache topology, llc_sibling is 0xff([0-7]) , it describes
>
>If llc_sibling was 0xff([0-7] on your system, you would have only one level:
>MC[0-7]
Sorry, but I don't get it: why does llc_sibling being 0xff ([0-7]) mean MC[0-7]?
In our system (Android), llc_sibling is indeed 0xff ([0-7]), because they all
share the llc (L3), but we also have two levels:
DIE [0-7]
MC [0-3][4-7]
It makes sense: [0-3] are little cores, [4-7] are big cores, so we only up-migrate
on misfit. We won't change it.
>
>> the L3 cache sibling, but sd_llc_id describes the maximum shared cache
>> in sd, which should be [0-1] instead of [0-3].
>
>sd_llc_id describes the last sched_domain with SD_SHARE_PKG_RESOURCES.
>If you want llc to be [0-3] make sure that the
>sched_domain_topology_level array returns the correct cpumask with
>this flag
Actually, we want sd_llc to be [0-1] [2-3], but if the MC domain doesn't have the
SD_SHARE_PKG_RESOURCES flag, sd_llc will be [0][1][2][3], which is not right.
So we must separate sd_llc from the sd topology, or the requirement cannot be
met in any case under the existing mechanism.
Thanks,
Wang
On Fri, 11 Mar 2022 at 09:18, 王擎 <[email protected]> wrote:
>
>
> >>
> >>
> >> >On Thu, 10 Mar 2022 at 13:59, Qing Wang <[email protected]> wrote:
> >> >>
> >> >> From: Wang Qing <[email protected]>
> >> >>
> >> >> Some architectures(e.g. ARM64), caches are implemented below:
> >> >> cluster: ****** cluster 0 ***** ****** cluster 1 *****
> >> >> core: 0 1 2 3 4 5 6 7
> >> (add cache level 1) c0 c1 c2 c3 c4 c5 c6 c7
> >> >> cache(Leveln): **cache0** **cache1** **cache2** **cache3**
> >> (add cache level 3) *************share level 3 cache ***************
> >> >> sd_llc_id(current): 0 0 0 0 4 4 4 4
> >> >> sd_llc_id(should be): 0 0 2 2 4 4 6 6
> >> >>
> >> Here, n always be 2 in ARM64, but others are also possible.
> >> core[0,1] form a complex(ARMV9), which share L2 cache, core[2,3] is the same.
> >>
> >> >> Caches and cpus have different topology, this causes cpus_share_cache()
> >> >> return the wrong value, which will affect the CPU load balance.
> >> >>
> >> >What does your current scheduler topology look like?
> >> >
> >> >For CPU 0 to 3, do you have the below ?
> >> >DIE [0 - 3] [4-7]
> >> >MC [0] [1] [2] [3]
> >>
> >> The current scheduler topology consistent with CPU topology:
> >> DIE [0-7]
> >> MC [0-3] [4-7] (SD_SHARE_PKG_RESOURCES)
> >> Most Android phones have this topology.
> >> >
> >> >But you would like something like below for cpu 0-1 instead ?
> >> >DIE [0 - 3] [4-7]
> >> >CLS [0 - 1] [2 - 3]
> >> >MC [0] [1]
> >> >
> >> >with SD_SHARE_PKG_RESOURCES only set to MC level ?
> >>
> >> We don't change the current scheduler topology, but the
> >> cache topology should be separated like below:
> >
> >The scheduler topology is not only cpu topology but a mixed of cpu and
> >cache/memory cache topology
> >
> >> [0-7] (shared level 3 cache )
> >> [0-1] [2-3][4-5][6-7] (shared level 2 cache )
> >
> >So you don't bother the intermediate cluster level which is even simpler.
> >you have to modify generic arch topology so that cpu_coregroup_mask
> >returns the correct cpu mask directly.
> >
> >You will notice a llc_sibling field that is currently used by acpi but
> >not DT to return llc cpu mask
> >
> cpu_topology[].llc_sibling describe the last level cache of whole system,
> not in the sched_domain.
>
> in the above cache topology, llc_sibling is 0xff([0-7]) , it describes
If llc_sibling was 0xff([0-7] on your system, you would have only one level:
MC[0-7]
> the L3 cache sibling, but sd_llc_id describes the maximum shared cache
> in sd, which should be [0-1] instead of [0-3].
sd_llc_id describes the last sched_domain with SD_SHARE_PKG_RESOURCES.
If you want llc to be [0-3] make sure that the
sched_domain_topology_level array returns the correct cpumask with
this flag
>
> Thanks,
> Wang
>On Thu, 10 Mar 2022 at 13:59, Qing Wang <[email protected]> wrote:
>>
>> From: Wang Qing <[email protected]>
>>
>> Some architectures(e.g. ARM64), caches are implemented below:
>> cluster: ****** cluster 0 ***** ****** cluster 1 *****
>> core: 0 1 2 3 4 5 6 7
(add cache level 1) c0 c1 c2 c3 c4 c5 c6 c7
>> cache(Leveln): **cache0** **cache1** **cache2** **cache3**
(add cache level 3) *************share level 3 cache ***************
>> sd_llc_id(current): 0 0 0 0 4 4 4 4
>> sd_llc_id(should be): 0 0 2 2 4 4 6 6
>>
Here, n is always 2 on ARM64, but other values are also possible.
core[0,1] form a complex (ARMv9) which shares the L2 cache; core[2,3] are the same.
>> Caches and cpus have different topology, this causes cpus_share_cache()
>> return the wrong value, which will affect the CPU load balance.
>>
>What does your current scheduler topology look like?
>
>For CPU 0 to 3, do you have the below ?
>DIE [0 - 3] [4-7]
>MC [0] [1] [2] [3]
The current scheduler topology is consistent with the CPU topology:
DIE [0-7]
MC [0-3] [4-7] (SD_SHARE_PKG_RESOURCES)
Most Android phones have this topology.
>
>But you would like something like below for cpu 0-1 instead ?
>DIE [0 - 3] [4-7]
>CLS [0 - 1] [2 - 3]
>MC [0] [1]
>
>with SD_SHARE_PKG_RESOURCES only set to MC level ?
We don't change the current scheduler topology, but the
cache topology should be separated like below:
[0-7] (shared level 3 cache )
[0-1] [2-3][4-5][6-7] (shared level 2 cache )
>
>>
>> Cache topology should be separated with CPU topology, it can be obtained
>> from "next-level-cache" in DTS preferentially.
>>
>> Signed-off-by: Wang Qing <[email protected]>
>> ---
>> arch/arm64/kernel/smp.c | 1 +
>> drivers/base/arch_topology.c | 23 +++++++++++++++++++++++
>> include/linux/arch_topology.h | 3 +++
>> kernel/sched/topology.c | 33 +++++++++++++++++++++++++++++++--
>> 4 files changed, 58 insertions(+), 2 deletions(-)
>>
>> diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
>> index 27df5c1..94cf649
>> --- a/arch/arm64/kernel/smp.c
>> +++ b/arch/arm64/kernel/smp.c
>> @@ -723,6 +723,7 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
>> unsigned int this_cpu;
>>
>> init_cpu_topology();
>> + init_cpu_cache_topology();
>>
>> this_cpu = smp_processor_id();
>> store_cpu_topology(this_cpu);
>> diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c
>> index 9761541..613213f
>> --- a/drivers/base/arch_topology.c
>> +++ b/drivers/base/arch_topology.c
>> @@ -613,6 +613,7 @@ static int __init parse_dt_topology(void)
>> */
>> struct cpu_topology cpu_topology[NR_CPUS];
>> EXPORT_SYMBOL_GPL(cpu_topology);
>> +struct device_node *cache_topology[NR_CPUS][MAX_CPU_CACHE_LEVEL];
>
>AFAICT, arch_topology.c is only used by arm/arm64 and riscv so this is
>not initialized for other archs
I see, will be fixed in V2.
Thanks,
Wang
>
>>
>> const struct cpumask *cpu_coregroup_mask(int cpu)
>> {
>> @@ -738,4 +739,26 @@ void __init init_cpu_topology(void)
>> else if (of_have_populated_dt() && parse_dt_topology())
>> reset_cpu_topology();
>> }
>> +
>> +void __init init_cpu_cache_topology(void)
>> +{
>> + struct device_node *node_cpu, *node_cache;
>> + int cpu, level;
>> +
>> + for_each_possible_cpu(cpu) {
>> + node_cpu = of_get_cpu_node(cpu, NULL);
>> + if (!node_cpu)
>> + continue;
>> +
>> + level = 0;
>> + node_cache = node_cpu;
>> + while (level < MAX_CPU_CACHE_LEVEL) {
>> + node_cache = of_parse_phandle(node_cache, "next-level-cache", 0);
>> + if (!node_cache)
>> + break;
>> + cache_topology[cpu][level++] = node_cache;
>> + }
>> + of_node_put(node_cpu);
>> + }
>> +}
>> #endif
>> diff --git a/include/linux/arch_topology.h b/include/linux/arch_topology.h
>> index cce6136b..d37f47d
>> --- a/include/linux/arch_topology.h
>> +++ b/include/linux/arch_topology.h
>> @@ -72,6 +72,8 @@ struct cpu_topology {
>> };
>>
>> #ifdef CONFIG_GENERIC_ARCH_TOPOLOGY
>> +#define MAX_CPU_CACHE_LEVEL 7
>> +extern struct device_node *cache_topology[NR_CPUS][MAX_CPU_CACHE_LEVEL];
>> extern struct cpu_topology cpu_topology[NR_CPUS];
>>
>> #define topology_physical_package_id(cpu) (cpu_topology[cpu].package_id)
>> @@ -82,6 +84,7 @@ extern struct cpu_topology cpu_topology[NR_CPUS];
>> #define topology_cluster_cpumask(cpu) (&cpu_topology[cpu].cluster_sibling)
>> #define topology_llc_cpumask(cpu) (&cpu_topology[cpu].llc_sibling)
>> void init_cpu_topology(void);
>> +void init_cpu_cache_topology(void);
>> void store_cpu_topology(unsigned int cpuid);
>> const struct cpumask *cpu_coregroup_mask(int cpu);
>> const struct cpumask *cpu_clustergroup_mask(int cpu);
>> diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
>> index d201a70..10850d6
>> --- a/kernel/sched/topology.c
>> +++ b/kernel/sched/topology.c
>> @@ -650,6 +650,36 @@ DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing);
>> DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity);
>> DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity);
>>
>> +static void set_sd_llc(int cpu, struct sched_domain *sd, int *first_cpu, int *cpu_num)
>> +{
>> + int cache_level, cpu_id;
>> + int first, last;
>> + int id = cpumask_first(sched_domain_span(sd));
>> + int size = cpumask_weight(sched_domain_span(sd));
>> +
>> + *first_cpu = id;
>> + *cpu_num = size;
>> +
>> + for (cache_level = 0; cache_level < MAX_CPU_CACHE_LEVEL; cache_level++) {
>> + if (!cache_topology[cpu][cache_level])
>> + break;
>> +
>> + first = -1;
>> + last = id;
>> + for (cpu_id = 0; cpu_id < NR_CPUS; cpu_id++) {
>> + if (cache_topology[cpu][cache_level] == cache_topology[cpu_id][cache_level]) {
>> + if (cpu_id >= id && cpu_id < id + size) {
>> + first = (first == -1)?cpu_id:first;
>> + last = cpu_id;
>> + } else
>> + return;
>> + }
>> + }
>> + *first_cpu = first;
>> + *cpu_num = last - first + 1;
>> + }
>> +}
>> +
>> static void update_top_cache_domain(int cpu)
>> {
>> struct sched_domain_shared *sds = NULL;
>> @@ -659,8 +689,7 @@ static void update_top_cache_domain(int cpu)
>>
>> sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
>> if (sd) {
>> - id = cpumask_first(sched_domain_span(sd));
>> - size = cpumask_weight(sched_domain_span(sd));
>> + set_sd_llc(cpu, sd, &id, &size);
>
>In scheduler, we look for the last level of SD_SHARE_PKG_RESOURCES to
>find shared memory. It seems that cpu_coregroup_mask doesn't return
>the correct cpumask in your case as it returns a full cluster instead
>of a subset
>
>> sds = sd->shared;
>
>sds must stay aligned with id and size so instead of modifying id and
>size you should returns a cpumask that reflects your topology
>
>> }
>>
>> --
>> 2.7.4
>
>>
>>
>> >On Thu, 10 Mar 2022 at 13:59, Qing Wang <[email protected]> wrote:
>> >>
>> >> From: Wang Qing <[email protected]>
>> >>
>> >> Some architectures(e.g. ARM64), caches are implemented below:
>> >> cluster: ****** cluster 0 ***** ****** cluster 1 *****
>> >> core: 0 1 2 3 4 5 6 7
>> (add cache level 1) c0 c1 c2 c3 c4 c5 c6 c7
>> >> cache(Leveln): **cache0** **cache1** **cache2** **cache3**
>> (add cache level 3) *************share level 3 cache ***************
>> >> sd_llc_id(current): 0 0 0 0 4 4 4 4
>> >> sd_llc_id(should be): 0 0 2 2 4 4 6 6
>> >>
>> Here, n always be 2 in ARM64, but others are also possible.
>> core[0,1] form a complex(ARMV9), which share L2 cache, core[2,3] is the same.
>>
>> >> Caches and cpus have different topology, this causes cpus_share_cache()
>> >> return the wrong value, which will affect the CPU load balance.
>> >>
>> >What does your current scheduler topology look like?
>> >
>> >For CPU 0 to 3, do you have the below ?
>> >DIE [0 - 3] [4-7]
>> >MC [0] [1] [2] [3]
>>
>> The current scheduler topology consistent with CPU topology:
>> DIE [0-7]
>> MC [0-3] [4-7] (SD_SHARE_PKG_RESOURCES)
>> Most Android phones have this topology.
>> >
>> >But you would like something like below for cpu 0-1 instead ?
>> >DIE [0 - 3] [4-7]
>> >CLS [0 - 1] [2 - 3]
>> >MC [0] [1]
>> >
>> >with SD_SHARE_PKG_RESOURCES only set to MC level ?
>>
>> We don't change the current scheduler topology, but the
>> cache topology should be separated like below:
>
>The scheduler topology is not only cpu topology but a mixed of cpu and
>cache/memory cache topology
>
>> [0-7] (shared level 3 cache )
>> [0-1] [2-3][4-5][6-7] (shared level 2 cache )
>
>So you don't bother the intermediate cluster level which is even simpler.
>you have to modify generic arch topology so that cpu_coregroup_mask
>returns the correct cpu mask directly.
>
>You will notice a llc_sibling field that is currently used by acpi but
>not DT to return llc cpu mask
>
cpu_topology[].llc_sibling describes the last-level cache of the whole system,
not within a sched_domain.
in the above cache topology, llc_sibling is 0xff([0-7]) , it describes
the L3 cache sibling, but sd_llc_id describes the maximum shared cache
in sd, which should be [0-1] instead of [0-3].
Thanks,
Wang
On Thu, Mar 10, 2022 at 04:58:44AM -0800, Qing Wang wrote:
> From: Wang Qing <[email protected]>
>
> Some architectures(e.g. ARM64), caches are implemented below:
> cluster: ****** cluster 0 ***** ****** cluster 1 *****
> core: 0 1 2 3 4 5 6 7
> cache(Leveln): **cache0** **cache1** **cache2** **cache3**
> sd_llc_id(current): 0 0 0 0 4 4 4 4
> sd_llc_id(should be): 0 0 2 2 4 4 6 6
>
> Caches and cpus have different topology, this causes cpus_share_cache()
> return the wrong value, which will affect the CPU load balance.
>
> Cache topology should be separated with CPU topology, it can be obtained
> from "next-level-cache" in DTS preferentially.
If your clusters do not have cache, then you're currently setting
SD_SHARE_PKG_RESOURCES wrong, if they do, things are correct.
If you want to represent L2, use the new fangled cluster level or
something, that's what it's there for.
That is, you can represent the above like:
DIE: 0-7
MC: 0-3, 4-7
CLS: 0-1,2-3, 4-5,6-7
But if there is cache at MC, LLC is what it is.
On Thu, 10 Mar 2022 at 13:59, Qing Wang <[email protected]> wrote:
>
> From: Wang Qing <[email protected]>
>
> Some architectures(e.g. ARM64), caches are implemented below:
> cluster: ****** cluster 0 ***** ****** cluster 1 *****
> core: 0 1 2 3 4 5 6 7
> cache(Leveln): **cache0** **cache1** **cache2** **cache3**
> sd_llc_id(current): 0 0 0 0 4 4 4 4
> sd_llc_id(should be): 0 0 2 2 4 4 6 6
>
> Caches and cpus have different topology, this causes cpus_share_cache()
> return the wrong value, which will affect the CPU load balance.
What does your current scheduler topology look like?
For CPU 0 to 3, do you have the below ?
DIE [0 - 3] [4-7]
MC [0] [1] [2] [3]
But you would like something like below for cpu 0-1 instead ?
DIE [0 - 3] [4-7]
CLS [0 - 1] [2 - 3]
MC [0] [1]
with SD_SHARE_PKG_RESOURCES only set to MC level ?
>
> Cache topology should be separated with CPU topology, it can be obtained
> from "next-level-cache" in DTS preferentially.
>
> Signed-off-by: Wang Qing <[email protected]>
> ---
> arch/arm64/kernel/smp.c | 1 +
> drivers/base/arch_topology.c | 23 +++++++++++++++++++++++
> include/linux/arch_topology.h | 3 +++
> kernel/sched/topology.c | 33 +++++++++++++++++++++++++++++++--
> 4 files changed, 58 insertions(+), 2 deletions(-)
>
> diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
> index 27df5c1..94cf649
> --- a/arch/arm64/kernel/smp.c
> +++ b/arch/arm64/kernel/smp.c
> @@ -723,6 +723,7 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
> unsigned int this_cpu;
>
> init_cpu_topology();
> + init_cpu_cache_topology();
>
> this_cpu = smp_processor_id();
> store_cpu_topology(this_cpu);
> diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c
> index 9761541..613213f
> --- a/drivers/base/arch_topology.c
> +++ b/drivers/base/arch_topology.c
> @@ -613,6 +613,7 @@ static int __init parse_dt_topology(void)
> */
> struct cpu_topology cpu_topology[NR_CPUS];
> EXPORT_SYMBOL_GPL(cpu_topology);
> +struct device_node *cache_topology[NR_CPUS][MAX_CPU_CACHE_LEVEL];
AFAICT, arch_topology.c is only used by arm/arm64 and riscv so this is
not initialized for other archs
>
> const struct cpumask *cpu_coregroup_mask(int cpu)
> {
> @@ -738,4 +739,26 @@ void __init init_cpu_topology(void)
> else if (of_have_populated_dt() && parse_dt_topology())
> reset_cpu_topology();
> }
> +
> +void __init init_cpu_cache_topology(void)
> +{
> + struct device_node *node_cpu, *node_cache;
> + int cpu, level;
> +
> + for_each_possible_cpu(cpu) {
> + node_cpu = of_get_cpu_node(cpu, NULL);
> + if (!node_cpu)
> + continue;
> +
> + level = 0;
> + node_cache = node_cpu;
> + while (level < MAX_CPU_CACHE_LEVEL) {
> + node_cache = of_parse_phandle(node_cache, "next-level-cache", 0);
> + if (!node_cache)
> + break;
> + cache_topology[cpu][level++] = node_cache;
> + }
> + of_node_put(node_cpu);
> + }
> +}
> #endif
> diff --git a/include/linux/arch_topology.h b/include/linux/arch_topology.h
> index cce6136b..d37f47d
> --- a/include/linux/arch_topology.h
> +++ b/include/linux/arch_topology.h
> @@ -72,6 +72,8 @@ struct cpu_topology {
> };
>
> #ifdef CONFIG_GENERIC_ARCH_TOPOLOGY
> +#define MAX_CPU_CACHE_LEVEL 7
> +extern struct device_node *cache_topology[NR_CPUS][MAX_CPU_CACHE_LEVEL];
> extern struct cpu_topology cpu_topology[NR_CPUS];
>
> #define topology_physical_package_id(cpu) (cpu_topology[cpu].package_id)
> @@ -82,6 +84,7 @@ extern struct cpu_topology cpu_topology[NR_CPUS];
> #define topology_cluster_cpumask(cpu) (&cpu_topology[cpu].cluster_sibling)
> #define topology_llc_cpumask(cpu) (&cpu_topology[cpu].llc_sibling)
> void init_cpu_topology(void);
> +void init_cpu_cache_topology(void);
> void store_cpu_topology(unsigned int cpuid);
> const struct cpumask *cpu_coregroup_mask(int cpu);
> const struct cpumask *cpu_clustergroup_mask(int cpu);
> diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
> index d201a70..10850d6
> --- a/kernel/sched/topology.c
> +++ b/kernel/sched/topology.c
> @@ -650,6 +650,36 @@ DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing);
> DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity);
> DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity);
>
> +static void set_sd_llc(int cpu, struct sched_domain *sd, int *first_cpu, int *cpu_num)
> +{
> + int cache_level, cpu_id;
> + int first, last;
> + int id = cpumask_first(sched_domain_span(sd));
> + int size = cpumask_weight(sched_domain_span(sd));
> +
> + *first_cpu = id;
> + *cpu_num = size;
> +
> + for (cache_level = 0; cache_level < MAX_CPU_CACHE_LEVEL; cache_level++) {
> + if (!cache_topology[cpu][cache_level])
> + break;
> +
> + first = -1;
> + last = id;
> + for (cpu_id = 0; cpu_id < NR_CPUS; cpu_id++) {
> + if (cache_topology[cpu][cache_level] == cache_topology[cpu_id][cache_level]) {
> + if (cpu_id >= id && cpu_id < id + size) {
> + first = (first == -1)?cpu_id:first;
> + last = cpu_id;
> + } else
> + return;
> + }
> + }
> + *first_cpu = first;
> + *cpu_num = last - first + 1;
> + }
> +}
> +
> static void update_top_cache_domain(int cpu)
> {
> struct sched_domain_shared *sds = NULL;
> @@ -659,8 +689,7 @@ static void update_top_cache_domain(int cpu)
>
> sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
> if (sd) {
> - id = cpumask_first(sched_domain_span(sd));
> - size = cpumask_weight(sched_domain_span(sd));
> + set_sd_llc(cpu, sd, &id, &size);
In scheduler, we look for the last level of SD_SHARE_PKG_RESOURCES to
find shared memory. It seems that cpu_coregroup_mask doesn't return
the correct cpumask in your case as it returns a full cluster instead
of a subset
> sds = sd->shared;
sds must stay aligned with id and size so instead of modifying id and
size you should returns a cpumask that reflects your topology
> }
>
> --
> 2.7.4
>
On Fri, 11 Mar 2022 at 03:03, 王擎 <[email protected]> wrote:
>
>
> >On Thu, 10 Mar 2022 at 13:59, Qing Wang <[email protected]> wrote:
> >>
> >> From: Wang Qing <[email protected]>
> >>
> >> Some architectures(e.g. ARM64), caches are implemented below:
> >> cluster: ****** cluster 0 ***** ****** cluster 1 *****
> >> core: 0 1 2 3 4 5 6 7
> (add cache level 1) c0 c1 c2 c3 c4 c5 c6 c7
> >> cache(Leveln): **cache0** **cache1** **cache2** **cache3**
> (add cache level 3) *************share level 3 cache ***************
> >> sd_llc_id(current): 0 0 0 0 4 4 4 4
> >> sd_llc_id(should be): 0 0 2 2 4 4 6 6
> >>
> Here, n always be 2 in ARM64, but others are also possible.
> core[0,1] form a complex(ARMV9), which share L2 cache, core[2,3] is the same.
>
> >> Caches and cpus have different topology, this causes cpus_share_cache()
> >> return the wrong value, which will affect the CPU load balance.
> >>
> >What does your current scheduler topology look like?
> >
> >For CPU 0 to 3, do you have the below ?
> >DIE [0 - 3] [4-7]
> >MC [0] [1] [2] [3]
>
> The current scheduler topology consistent with CPU topology:
> DIE [0-7]
> MC [0-3] [4-7] (SD_SHARE_PKG_RESOURCES)
> Most Android phones have this topology.
> >
> >But you would like something like below for cpu 0-1 instead ?
> >DIE [0 - 3] [4-7]
> >CLS [0 - 1] [2 - 3]
> >MC [0] [1]
> >
> >with SD_SHARE_PKG_RESOURCES only set to MC level ?
>
> We don't change the current scheduler topology, but the
> cache topology should be separated like below:
The scheduler topology is not only cpu topology but a mixed of cpu and
cache/memory cache topology
> [0-7] (shared level 3 cache )
> [0-1] [2-3][4-5][6-7] (shared level 2 cache )
So you don't bother the intermediate cluster level which is even simpler.
you have to modify generic arch topology so that cpu_coregroup_mask
returns the correct cpu mask directly.
You will notice a llc_sibling field that is currently used by acpi but
not DT to return llc cpu mask
> >
> >>
> >> Cache topology should be separated with CPU topology, it can be obtained
> >> from "next-level-cache" in DTS preferentially.
> >>
> >> Signed-off-by: Wang Qing <[email protected]>
> >> ---
> >> arch/arm64/kernel/smp.c | 1 +
> >> drivers/base/arch_topology.c | 23 +++++++++++++++++++++++
> >> include/linux/arch_topology.h | 3 +++
> >> kernel/sched/topology.c | 33 +++++++++++++++++++++++++++++++--
> >> 4 files changed, 58 insertions(+), 2 deletions(-)
> >>
> >> diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
> >> index 27df5c1..94cf649
> >> --- a/arch/arm64/kernel/smp.c
> >> +++ b/arch/arm64/kernel/smp.c
> >> @@ -723,6 +723,7 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
> >> unsigned int this_cpu;
> >>
> >> init_cpu_topology();
> >> + init_cpu_cache_topology();
> >>
> >> this_cpu = smp_processor_id();
> >> store_cpu_topology(this_cpu);
> >> diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c
> >> index 9761541..613213f
> >> --- a/drivers/base/arch_topology.c
> >> +++ b/drivers/base/arch_topology.c
> >> @@ -613,6 +613,7 @@ static int __init parse_dt_topology(void)
> >> */
> >> struct cpu_topology cpu_topology[NR_CPUS];
> >> EXPORT_SYMBOL_GPL(cpu_topology);
> >> +struct device_node *cache_topology[NR_CPUS][MAX_CPU_CACHE_LEVEL];
> >
> >AFAICT, arch_topology.c is only used by arm/arm64 and riscv so this is
> >not initialized for other archs
>
> I see, will be fixed in V2.
>
> Thanks,
> Wang
> >
> >>
> >> const struct cpumask *cpu_coregroup_mask(int cpu)
> >> {
> >> @@ -738,4 +739,26 @@ void __init init_cpu_topology(void)
> >> else if (of_have_populated_dt() && parse_dt_topology())
> >> reset_cpu_topology();
> >> }
> >> +
> >> +void __init init_cpu_cache_topology(void)
> >> +{
> >> + struct device_node *node_cpu, *node_cache;
> >> + int cpu, level;
> >> +
> >> + for_each_possible_cpu(cpu) {
> >> + node_cpu = of_get_cpu_node(cpu, NULL);
> >> + if (!node_cpu)
> >> + continue;
> >> +
> >> + level = 0;
> >> + node_cache = node_cpu;
> >> + while (level < MAX_CPU_CACHE_LEVEL) {
> >> + node_cache = of_parse_phandle(node_cache, "next-level-cache", 0);
> >> + if (!node_cache)
> >> + break;
> >> + cache_topology[cpu][level++] = node_cache;
> >> + }
> >> + of_node_put(node_cpu);
> >> + }
> >> +}
> >> #endif
> >> diff --git a/include/linux/arch_topology.h b/include/linux/arch_topology.h
> >> index cce6136b..d37f47d
> >> --- a/include/linux/arch_topology.h
> >> +++ b/include/linux/arch_topology.h
> >> @@ -72,6 +72,8 @@ struct cpu_topology {
> >> };
> >>
> >> #ifdef CONFIG_GENERIC_ARCH_TOPOLOGY
> >> +#define MAX_CPU_CACHE_LEVEL 7
> >> +extern struct device_node *cache_topology[NR_CPUS][MAX_CPU_CACHE_LEVEL];
> >> extern struct cpu_topology cpu_topology[NR_CPUS];
> >>
> >> #define topology_physical_package_id(cpu) (cpu_topology[cpu].package_id)
> >> @@ -82,6 +84,7 @@ extern struct cpu_topology cpu_topology[NR_CPUS];
> >> #define topology_cluster_cpumask(cpu) (&cpu_topology[cpu].cluster_sibling)
> >> #define topology_llc_cpumask(cpu) (&cpu_topology[cpu].llc_sibling)
> >> void init_cpu_topology(void);
> >> +void init_cpu_cache_topology(void);
> >> void store_cpu_topology(unsigned int cpuid);
> >> const struct cpumask *cpu_coregroup_mask(int cpu);
> >> const struct cpumask *cpu_clustergroup_mask(int cpu);
> >> diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
> >> index d201a70..10850d6
> >> --- a/kernel/sched/topology.c
> >> +++ b/kernel/sched/topology.c
> >> @@ -650,6 +650,36 @@ DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing);
> >> DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity);
> >> DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity);
> >>
> >> +static void set_sd_llc(int cpu, struct sched_domain *sd, int *first_cpu, int *cpu_num)
> >> +{
> >> + int cache_level, cpu_id;
> >> + int first, last;
> >> + int id = cpumask_first(sched_domain_span(sd));
> >> + int size = cpumask_weight(sched_domain_span(sd));
> >> +
> >> + *first_cpu = id;
> >> + *cpu_num = size;
> >> +
> >> + for (cache_level = 0; cache_level < MAX_CPU_CACHE_LEVEL; cache_level++) {
> >> + if (!cache_topology[cpu][cache_level])
> >> + break;
> >> +
> >> + first = -1;
> >> + last = id;
> >> + for (cpu_id = 0; cpu_id < NR_CPUS; cpu_id++) {
> >> + if (cache_topology[cpu][cache_level] == cache_topology[cpu_id][cache_level]) {
> >> + if (cpu_id >= id && cpu_id < id + size) {
> >> + first = (first == -1)?cpu_id:first;
> >> + last = cpu_id;
> >> + } else
> >> + return;
> >> + }
> >> + }
> >> + *first_cpu = first;
> >> + *cpu_num = last - first + 1;
> >> + }
> >> +}
> >> +
> >> static void update_top_cache_domain(int cpu)
> >> {
> >> struct sched_domain_shared *sds = NULL;
> >> @@ -659,8 +689,7 @@ static void update_top_cache_domain(int cpu)
> >>
> >> sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
> >> if (sd) {
> >> - id = cpumask_first(sched_domain_span(sd));
> >> - size = cpumask_weight(sched_domain_span(sd));
> >> + set_sd_llc(cpu, sd, &id, &size);
> >
> >In scheduler, we look for the last level of SD_SHARE_PKG_RESOURCES to
> >find shared memory. It seems that cpu_coregroup_mask doesn't return
> >the correct cpumask in your case as it returns a full cluster instead
> >of a subset
> >
> >> sds = sd->shared;
> >
> >sds must stay aligned with id and size so instead of modifying id and
> >size you should returns a cpumask that reflects your topology
> >
> >> }
> >>
> >> --
> >> 2.7.4
> >
On Fri, 11 Mar 2022 at 10:30, 王擎 <[email protected]> wrote:
>
>
> >>
> >>
> >> >>
> >> >>
> >> >> >On Thu, 10 Mar 2022 at 13:59, Qing Wang <[email protected]> wrote:
> >> >> >>
> >> >> >> From: Wang Qing <[email protected]>
> >> >> >>
> >> >> >> Some architectures(e.g. ARM64), caches are implemented below:
> >> >> >> cluster: ****** cluster 0 ***** ****** cluster 1 *****
> >> >> >> core: 0 1 2 3 4 5 6 7
> >> >> (add cache level 1) c0 c1 c2 c3 c4 c5 c6 c7
> >> >> >> cache(Leveln): **cache0** **cache1** **cache2** **cache3**
> >> >> (add cache level 3) *************share level 3 cache ***************
> >> >> >> sd_llc_id(current): 0 0 0 0 4 4 4 4
> >> >> >> sd_llc_id(should be): 0 0 2 2 4 4 6 6
> >> >> >>
> >> >> Here, n always be 2 in ARM64, but others are also possible.
> >> >> core[0,1] form a complex(ARMV9), which share L2 cache, core[2,3] is the same.
> >> >>
> >> >> >> Caches and cpus have different topology, this causes cpus_share_cache()
> >> >> >> return the wrong value, which will affect the CPU load balance.
> >> >> >>
> >> >> >What does your current scheduler topology look like?
> >> >> >
> >> >> >For CPU 0 to 3, do you have the below ?
> >> >> >DIE [0 - 3] [4-7]
> >> >> >MC [0] [1] [2] [3]
> >> >>
> >> >> The current scheduler topology consistent with CPU topology:
> >> >> DIE [0-7]
> >> >> MC [0-3] [4-7] (SD_SHARE_PKG_RESOURCES)
> >> >> Most Android phones have this topology.
> >> >> >
> >> >> >But you would like something like below for cpu 0-1 instead ?
> >> >> >DIE [0 - 3] [4-7]
> >> >> >CLS [0 - 1] [2 - 3]
> >> >> >MC [0] [1]
> >> >> >
> >> >> >with SD_SHARE_PKG_RESOURCES only set to MC level ?
> >> >>
> >> >> We don't change the current scheduler topology, but the
> >> >> cache topology should be separated like below:
> >> >
> >> >The scheduler topology is not only cpu topology but a mixed of cpu and
> >> >cache/memory cache topology
> >> >
> >> >> [0-7] (shared level 3 cache )
> >> >> [0-1] [2-3][4-5][6-7] (shared level 2 cache )
> >> >
> >> >So you don't bother the intermediate cluster level which is even simpler.
> >> >you have to modify generic arch topology so that cpu_coregroup_mask
> >> >returns the correct cpu mask directly.
> >> >
> >> >You will notice a llc_sibling field that is currently used by acpi but
> >> >not DT to return llc cpu mask
> >> >
> >> cpu_topology[].llc_sibling describe the last level cache of whole system,
> >> not in the sched_domain.
> >>
> >> in the above cache topology, llc_sibling is 0xff([0-7]) , it describes
> >
> >If llc_sibling was 0xff([0-7] on your system, you would have only one level:
> >MC[0-7]
>
> Sorry, but I don't get it, why llc_sibling was 0xff([0-7] means MC[0-7]?
> In our system(Android), llc_sibling is indeed 0xff([0-7]) , because they
> shared the llc(L3), but we also have two level:
> DIE [0-7]
> MC [0-3][4-6]
> It makes sense, [0-3] are little cores, [4-7] are bit cores, se only up migrate
> when misfit. We won't change it.
>
> >
> >> the L3 cache sibling, but sd_llc_id describes the maximum shared cache
> >> in sd, which should be [0-1] instead of [0-3].
> >
> >sd_llc_id describes the last sched_domain with SD_SHARE_PKG_RESOURCES.
> >If you want llc to be [0-3] make sure that the
> >sched_domain_topology_level array returns the correct cpumask with
> >this flag
>
> Acturely, we want sd_llc to be [0-1] [2-3], but if the MC domain don't have
sd_llc_id refers to a scheduler domain, but your patch breaks this; so
if you want an llc that reflects this topo: [0-1] [2-3], you must
provide a sched_domain level with this topo
Side question: why don't you want llc to be the L3 one?
> SD_SHARE_PKG_RESOURCES flag, the sd_llc will be [0][1][2][3]. It's not true.
The only entry point for describing the scheduler domain is the
sched_domain_topology_level array which provides some cpumask and some
associated flags. By default, SD_SHARE_PKG_RESOURCES is set for
scheduler MC level which implies that the cpus shared their cache. If
this is not the case for your system, you should either remove this
flag or update the cpumask to reflect which CPUs really share their
caches.
>
> So we must separate sd_llc from sd topology, or the demand cannot be
> met in any case under the existing mechanism.
There is a default array with DIE, MC, CLS and SMT levels with
SD_SHARE_PKG_RESOURCES set up to MC which is considered to be the LLC
but a different array than the default one can be provided with
set_sched_topology()
Thanks
Vincent
>
> Thanks,
> Wang
>
> > From: Wang Qing <[email protected]>
> >
> > Some architectures(e.g. ARM64), caches are implemented below:
> > cluster: ****** cluster 0 ***** ****** cluster 1 *****
> > core: 0 1 2 3 4 5 6 7
> > cache(Leveln): **cache0** **cache1** **cache2** **cache3**
> > sd_llc_id(current): 0 0 0 0 4 4 4 4
> > sd_llc_id(should be): 0 0 2 2 4 4 6 6
> >
> > Caches and cpus have different topology, this causes cpus_share_cache()
> > return the wrong value, which will affect the CPU load balance.
> >
> > Cache topology should be separated with CPU topology, it can be obtained
> > from "next-level-cache" in DTS preferentially.
>
> If your clusters do not have cache, then you're currently setting
> SD_SHARE_PKG_RESOURCES wrong, if they do, things are correct.
If there is a shared cache(L3) between clusters(cls 0 and cls 1) for all cores,
but not within the cluster like above, should we set SD_SHARE_PKG_RESOURCES
for MC0(cls 0), or just set SD_SHARE_PKG_RESOURCES for CLS?
>
> If you want to represent L2, use the new fangled cluster level or
> something, that's what it's there for.
>
> That is, you can represent the above like:
>
> DIE: 0-7
> MC: 0-3, 4-7
> CLS: 0-1,1-2, 4-5,6-7
>
> But if there is cache at MC, LLC is what it is.
There is no CLS support for LTS now, any plans to backport?
Thanks,
Wang
On Mon, Mar 14, 2022 at 02:37:48AM +0000, 王擎 wrote:
> > If you want to represent L2, use the new fangled cluster level or
> > something, that's what it's there for.
> >
> > That is, you can represent the above like:
> >
> > DIE: 0-7
> > MC: 0-3, 4-7
> > CLS: 0-1,1-2, 4-5,6-7
> >
> > But if there is cache at MC, LLC is what it is.
>
> There is no CLS support for LTS now, any plans to backport?
-ENOTMYPROBLEM
On Tue, 15 Mar 2022 at 02:58, 王擎 <[email protected]> wrote:
>
>
> >>
> >>
> >> >>
> >> >>
> >> >> >>
> >> >> >>
> >> >> >> >On Thu, 10 Mar 2022 at 13:59, Qing Wang <[email protected]> wrote:
> >> >> >> >>
> >> >> >> >> From: Wang Qing <[email protected]>
> >> >> >> >>
> >> >> >> >> Some architectures(e.g. ARM64), caches are implemented below:
> >> >> >> >> cluster: ****** cluster 0 ***** ****** cluster 1 *****
> >> >> >> >> core: 0 1 2 3 4 5 6 7
> >> >> >> (add cache level 1) c0 c1 c2 c3 c4 c5 c6 c7
> >> >> >> >> cache(Leveln): **cache0** **cache1** **cache2** **cache3**
> >> >> >> (add cache level 3) *************share level 3 cache ***************
> >> >> >> >> sd_llc_id(current): 0 0 0 0 4 4 4 4
> >> >> >> >> sd_llc_id(should be): 0 0 2 2 4 4 6 6
> >> >> >> >>
> >> >> >> Here, n always be 2 in ARM64, but others are also possible.
> >> >> >> core[0,1] form a complex(ARMV9), which share L2 cache, core[2,3] is the same.
> >> >> >>
> >> >> >> >> Caches and cpus have different topology, this causes cpus_share_cache()
> >> >> >> >> return the wrong value, which will affect the CPU load balance.
> >> >> >> >>
> >> >> >> >What does your current scheduler topology look like?
> >> >> >> >
> >> >> >> >For CPU 0 to 3, do you have the below ?
> >> >> >> >DIE [0 - 3] [4-7]
> >> >> >> >MC [0] [1] [2] [3]
> >> >> >>
> >> >> >> The current scheduler topology consistent with CPU topology:
> >> >> >> DIE [0-7]
> >> >> >> MC [0-3] [4-7] (SD_SHARE_PKG_RESOURCES)
> >> >> >> Most Android phones have this topology.
> >> >> >> >
> >> >> >> >But you would like something like below for cpu 0-1 instead ?
> >> >> >> >DIE [0 - 3] [4-7]
> >> >> >> >CLS [0 - 1] [2 - 3]
> >> >> >> >MC [0] [1]
> >> >> >> >
> >> >> >> >with SD_SHARE_PKG_RESOURCES only set to MC level ?
> >> >> >>
> >> >> >> We don't change the current scheduler topology, but the
> >> >> >> cache topology should be separated like below:
> >> >> >
> >> >> >The scheduler topology is not only cpu topology but a mixed of cpu and
> >> >> >cache/memory cache topology
> >> >> >
> >> >> >> [0-7] (shared level 3 cache )
> >> >> >> [0-1] [2-3][4-5][6-7] (shared level 2 cache )
> >> >> >
> >> >> >So you don't bother the intermediate cluster level which is even simpler.
> >> >> >you have to modify generic arch topology so that cpu_coregroup_mask
> >> >> >returns the correct cpu mask directly.
> >> >> >
> >> >> >You will notice a llc_sibling field that is currently used by acpi but
> >> >> >not DT to return llc cpu mask
> >> >> >
> >> >> cpu_topology[].llc_sibling describe the last level cache of whole system,
> >> >> not in the sched_domain.
> >> >>
> >> >> in the above cache topology, llc_sibling is 0xff([0-7]) , it describes
> >> >
> >> >If llc_sibling was 0xff([0-7] on your system, you would have only one level:
> >> >MC[0-7]
> >>
> >> Sorry, but I don't get it, why llc_sibling was 0xff([0-7] means MC[0-7]?
> >> In our system(Android), llc_sibling is indeed 0xff([0-7]) , because they
> >> shared the llc(L3), but we also have two level:
> >> DIE [0-7]
> >> MC [0-3][4-6]
> >> It makes sense, [0-3] are little cores, [4-7] are bit cores, se only up migrate
> >> when misfit. We won't change it.
> >>
> >> >
> >> >> the L3 cache sibling, but sd_llc_id describes the maximum shared cache
> >> >> in sd, which should be [0-1] instead of [0-3].
> >> >
> >> >sd_llc_id describes the last sched_domain with SD_SHARE_PKG_RESOURCES.
> >> >If you want llc to be [0-3] make sure that the
> >> >sched_domain_topology_level array returns the correct cpumask with
> >> >this flag
> >>
> >> Acturely, we want sd_llc to be [0-1] [2-3], but if the MC domain don't have
> >
> >sd_llc_id refers to a scheduler domain but your patch breaks this so
> >if you want a llc that reflects this topo: [0-1] [2-3] you must
> >provide a sched_domain level with this topo
>
> Maybe we should add a shared-cache level(SC), like what CLS does:
>
> DIE [0-7] (shared level 3 cache, SD_SHARE_PKG_RESOURCES)
> MC [0-3] [4-7] (not SD_SHARE_PKG_RESOURCES)
> CLS (if necessary)
> SC [0-1][2-3][4-5][6-7] (shared level 2 cache, SD_SHARE_PKG_RESOURCES)
> SMT (if necessary)
>
> SC means a couple of CPUs which are placed closely by sharing
> mid-level caches, but not enough to be a cluster.
what you name SC above looks the same as CLS which should not be mixed
with Arm cluster terminology
> >
> >side question, why don't you want llc to be the L3 one ?
>
> Yes, we should set SD_SHARE_PKG_RESOURCES to DIE, but we also want to
> represent the mid-level caches to improve throughput.
so your topology (from a scheduler poV) should be for cpu0:
DIE [0 - 3] [4 - 7] (SD_SHARE_PKG_RESOURCES)
MC [0 - 1] [2 - 3] (SD_SHARE_PKG_RESOURCES)
CLS [0] [1] (SD_SHARE_PKG_RESOURCES)
so the llc will be the DIE level and load balancing will spread tasks
across the different groups of cpus
And regarding EAS, it will look at DIE level for task placement
And of course, this doesn't need any change in scheduler but the
arch_topology.c to return the correct cpumask for your system
>
> Thanks,
> Wang
> >
> >> SD_SHARE_PKG_RESOURCES flag, the sd_llc will be [0][1][2][3]. It's not true.
> >
> >The only entry point for describing the scheduler domain is the
> >sched_domain_topology_level array which provides some cpumask and some
> >associated flags. By default, SD_SHARE_PKG_RESOURCES is set for
> >scheduler MC level which implies that the cpus shared their cache. If
> >this is not the case for your system, you should either remove this
> >flag or update the cpumask to reflect which CPUs really share their
> >caches.
> >
> >>
> >> So we must separate sd_llc from sd topology, or the demand cannot be
> >> met in any case under the existing mechanism.
> >
> >There is a default array with DIE, MC, CLS and SMT levels with
> >SD_SHARE_PKG_RESOURCES set up to MC which is considered to be the LLC
> >but a different array than the default one can be provided with
> >set_sched_topology()
> >
> >Thanks
> >Vincent
> >
> >>
> >> Thanks,
> >> Wang
> >>
>>
>>
>> >>
>> >>
>> >> >>
>> >> >>
>> >> >> >On Thu, 10 Mar 2022 at 13:59, Qing Wang <[email protected]> wrote:
>> >> >> >>
>> >> >> >> From: Wang Qing <[email protected]>
>> >> >> >>
>> >> >> >> Some architectures(e.g. ARM64), caches are implemented below:
>> >> >> >> cluster: ****** cluster 0 ***** ****** cluster 1 *****
>> >> >> >> core: 0 1 2 3 4 5 6 7
>> >> >> (add cache level 1) c0 c1 c2 c3 c4 c5 c6 c7
>> >> >> >> cache(Leveln): **cache0** **cache1** **cache2** **cache3**
>> >> >> (add cache level 3) *************share level 3 cache ***************
>> >> >> >> sd_llc_id(current): 0 0 0 0 4 4 4 4
>> >> >> >> sd_llc_id(should be): 0 0 2 2 4 4 6 6
>> >> >> >>
>> >> >> Here, n always be 2 in ARM64, but others are also possible.
>> >> >> core[0,1] form a complex(ARMV9), which share L2 cache, core[2,3] is the same.
>> >> >>
>> >> >> >> Caches and cpus have different topology, this causes cpus_share_cache()
>> >> >> >> return the wrong value, which will affect the CPU load balance.
>> >> >> >>
>> >> >> >What does your current scheduler topology look like?
>> >> >> >
>> >> >> >For CPU 0 to 3, do you have the below ?
>> >> >> >DIE [0 - 3] [4-7]
>> >> >> >MC [0] [1] [2] [3]
>> >> >>
>> >> >> The current scheduler topology consistent with CPU topology:
>> >> >> DIE [0-7]
>> >> >> MC [0-3] [4-7] (SD_SHARE_PKG_RESOURCES)
>> >> >> Most Android phones have this topology.
>> >> >> >
>> >> >> >But you would like something like below for cpu 0-1 instead ?
>> >> >> >DIE [0 - 3] [4-7]
>> >> >> >CLS [0 - 1] [2 - 3]
>> >> >> >MC [0] [1]
>> >> >> >
>> >> >> >with SD_SHARE_PKG_RESOURCES only set to MC level ?
>> >> >>
>> >> >> We don't change the current scheduler topology, but the
>> >> >> cache topology should be separated like below:
>> >> >
>> >> >The scheduler topology is not only cpu topology but a mixed of cpu and
>> >> >cache/memory cache topology
>> >> >
>> >> >> [0-7] (shared level 3 cache )
>> >> >> [0-1] [2-3][4-5][6-7] (shared level 2 cache )
>> >> >
>> >> >So you don't bother the intermediate cluster level which is even simpler.
>> >> >you have to modify generic arch topology so that cpu_coregroup_mask
>> >> >returns the correct cpu mask directly.
>> >> >
>> >> >You will notice a llc_sibling field that is currently used by acpi but
>> >> >not DT to return llc cpu mask
>> >> >
>> >> cpu_topology[].llc_sibling describe the last level cache of whole system,
>> >> not in the sched_domain.
>> >>
>> >> in the above cache topology, llc_sibling is 0xff([0-7]) , it describes
>> >
>> >If llc_sibling was 0xff([0-7] on your system, you would have only one level:
>> >MC[0-7]
>>
>> Sorry, but I don't get it, why llc_sibling was 0xff([0-7] means MC[0-7]?
>> In our system(Android), llc_sibling is indeed 0xff([0-7]) , because they
>> shared the llc(L3), but we also have two level:
>> DIE [0-7]
>> MC [0-3][4-6]
>> It makes sense, [0-3] are little cores, [4-7] are bit cores, se only up migrate
>> when misfit. We won't change it.
>>
>> >
>> >> the L3 cache sibling, but sd_llc_id describes the maximum shared cache
>> >> in sd, which should be [0-1] instead of [0-3].
>> >
>> >sd_llc_id describes the last sched_domain with SD_SHARE_PKG_RESOURCES.
>> >If you want llc to be [0-3] make sure that the
>> >sched_domain_topology_level array returns the correct cpumask with
>> >this flag
>>
>> Acturely, we want sd_llc to be [0-1] [2-3], but if the MC domain don't have
>
>sd_llc_id refers to a scheduler domain but your patch breaks this so
>if you want a llc that reflects this topo: [0-1] [2-3] you must
>provide a sched_domain level with this topo
Maybe we should add a shared-cache level(SC), like what CLS does:
DIE [0-7] (shared level 3 cache, SD_SHARE_PKG_RESOURCES)
MC [0-3] [4-7] (not SD_SHARE_PKG_RESOURCES)
CLS (if necessary)
SC [0-1][2-3][4-5][6-7] (shared level 2 cache, SD_SHARE_PKG_RESOURCES)
SMT (if necessary)
SC means a couple of CPUs which are placed closely together, sharing
mid-level caches, but not close enough to form a cluster.
>
>side question, why don't you want llc to be the L3 one ?
Yes, we should set SD_SHARE_PKG_RESOURCES to DIE, but we also want to
represent the mid-level caches to improve throughput.
Thanks,
Wang
>
>> SD_SHARE_PKG_RESOURCES flag, the sd_llc will be [0][1][2][3]. It's not true.
>
>The only entry point for describing the scheduler domain is the
>sched_domain_topology_level array which provides some cpumask and some
>associated flags. By default, SD_SHARE_PKG_RESOURCES is set for
>scheduler MC level which implies that the cpus shared their cache. If
>this is not the case for your system, you should either remove this
>flag or update the cpumask to reflect which CPUs really share their
>caches.
>
>>
>> So we must separate sd_llc from sd topology, or the demand cannot be
>> met in any case under the existing mechanism.
>
>There is a default array with DIE, MC, CLS and SMT levels with
>SD_SHARE_PKG_RESOURCES set up to MC which is considered to be the LLC
>but a different array than the default one can be provided with
>set_sched_topology()
>
>Thanks
>Vincent
>
>>
>> Thanks,
>> Wang
>>
Hi Qing,
On 2022/4/2 17:34, 王擎 wrote:
>
>>>
>>>
>>>>>
>>>>>
>>>>>>>
>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>> On Thu, 10 Mar 2022 at 13:59, Qing Wang <[email protected]> wrote:
>>>>>>>>>>>
>>>>>>>>>>> From: Wang Qing <[email protected]>
>>>>>>>>>>>
>>>>>>>>>>> Some architectures(e.g. ARM64), caches are implemented below:
>>>>>>>>>>> cluster: ****** cluster 0 ***** ****** cluster 1 *****
>>>>>>>>>>> core: 0 1 2 3 4 5 6 7
>>>>>>>>> (add cache level 1) c0 c1 c2 c3 c4 c5 c6 c7
>>>>>>>>>>> cache(Leveln): **cache0** **cache1** **cache2** **cache3**
>>>>>>>>> (add cache level 3) *************share level 3 cache ***************
>>>>>>>>>>> sd_llc_id(current): 0 0 0 0 4 4 4 4
>>>>>>>>>>> sd_llc_id(should be): 0 0 2 2 4 4 6 6
>>>>>>>>>>>
>>>>>>>>> Here, n always be 2 in ARM64, but others are also possible.
>>>>>>>>> core[0,1] form a complex(ARMV9), which share L2 cache, core[2,3] is the same.
>>>>>>>>>
>>>>>>>>>>> Caches and cpus have different topology, this causes cpus_share_cache()
>>>>>>>>>>> return the wrong value, which will affect the CPU load balance.
>>>>>>>>>>>
>>>>>>>>>> What does your current scheduler topology look like?
>>>>>>>>>>
>>>>>>>>>> For CPU 0 to 3, do you have the below ?
>>>>>>>>>> DIE [0 - 3] [4-7]
>>>>>>>>>> MC [0] [1] [2] [3]
>>>>>>>>>
>>>>>>>>> The current scheduler topology consistent with CPU topology:
>>>>>>>>> DIE [0-7]
>>>>>>>>> MC [0-3] [4-7] (SD_SHARE_PKG_RESOURCES)
>>>>>>>>> Most Android phones have this topology.
>>>>>>>>>>
>>>>>>>>>> But you would like something like below for cpu 0-1 instead ?
>>>>>>>>>> DIE [0 - 3] [4-7]
>>>>>>>>>> CLS [0 - 1] [2 - 3]
>>>>>>>>>> MC [0] [1]
>>>>>>>>>>
>>>>>>>>>> with SD_SHARE_PKG_RESOURCES only set to MC level ?
>>>>>>>>>
>>>>>>>>> We don't change the current scheduler topology, but the
>>>>>>>>> cache topology should be separated like below:
>>>>>>>>
>>>>>>>> The scheduler topology is not only cpu topology but a mixed of cpu and
>>>>>>>> cache/memory cache topology
>>>>>>>>
>>>>>>>>> [0-7] (shared level 3 cache )
>>>>>>>>> [0-1] [2-3][4-5][6-7] (shared level 2 cache )
>>>>>>>>
>>>>>>>> So you don't bother the intermediate cluster level which is even simpler.
>>>>>>>> you have to modify generic arch topology so that cpu_coregroup_mask
>>>>>>>> returns the correct cpu mask directly.
>>>>>>>>
>>>>>>>> You will notice a llc_sibling field that is currently used by acpi but
>>>>>>>> not DT to return llc cpu mask
>>>>>>>>
>>>>>>> cpu_topology[].llc_sibling describe the last level cache of whole system,
>>>>>>> not in the sched_domain.
>>>>>>>
>>>>>>> in the above cache topology, llc_sibling is 0xff([0-7]) , it describes
>>>>>>
>>>>>> If llc_sibling was 0xff([0-7] on your system, you would have only one level:
>>>>>> MC[0-7]
>>>>>
>>>>> Sorry, but I don't get it, why llc_sibling was 0xff([0-7] means MC[0-7]?
>>>>> In our system(Android), llc_sibling is indeed 0xff([0-7]) , because they
>>>>> shared the llc(L3), but we also have two level:
>>>>> DIE [0-7]
>>>>> MC [0-3][4-6]
>>>>> It makes sense, [0-3] are little cores, [4-7] are bit cores, se only up migrate
>>>>> when misfit. We won't change it.
>>>>>
>>>>>>
>>>>>>> the L3 cache sibling, but sd_llc_id describes the maximum shared cache
>>>>>>> in sd, which should be [0-1] instead of [0-3].
>>>>>>
>>>>>> sd_llc_id describes the last sched_domain with SD_SHARE_PKG_RESOURCES.
>>>>>> If you want llc to be [0-3] make sure that the
>>>>>> sched_domain_topology_level array returns the correct cpumask with
>>>>>> this flag
>>>>>
>>>>> Acturely, we want sd_llc to be [0-1] [2-3], but if the MC domain don't have
>>>>
>>>> sd_llc_id refers to a scheduler domain but your patch breaks this so
>>>> if you want a llc that reflects this topo: [0-1] [2-3] you must
>>>> provide a sched_domain level with this topo
>>>
>>> Maybe we should add a shared-cache level(SC), like what CLS does:
>>>
>>> DIE [0-7] (shared level 3 cache, SD_SHARE_PKG_RESOURCES)
>>> MC [0-3] [4-7] (not SD_SHARE_PKG_RESOURCES)
>>> CLS (if necessary)
>>> SC [0-1][2-3][4-5][6-7] (shared level 2 cache, SD_SHARE_PKG_RESOURCES)
>>> SMT (if necessary)
>>>
>>> SC means a couple of CPUs which are placed closely by sharing
>>> mid-level caches, but not enough to be a cluster.
>>
>> what you name SC above looks the same as CLS which should not be mixed
>> with Arm cluster terminology
>
> Do you mean cluster is equal to shared cache instead of containing, SC just
> means shared cache, but not form a cluster, a CLS can contain many SCs.
>
The cluster is a topology level above the CPUs but under LLC. On Kunpeng 920 the cpus
in a CLS will share L3T and on Intel's Jacobsville cpus in a CLS will share L2[1].
It seems you're using a DT-based system. I think the parsing of the cluster level is not
supported on DT yet, so you cannot see it. Otherwise, with the right cpu topology reported,
you will have a CLS level in which the cpus share the L2 cache, just like Jacobsville.
[1] https://lore.kernel.org/all/[email protected]/
> If as you said, SC looks the same as CLS, should we rename CLS to SC to
> avoid confusion?
>
> Thanks,
> Wang
>
>>
>>
>> >>
>> >>
>> >> >>
>> >> >>
>> >> >> >>
>> >> >> >>
>> >> >> >> >On Thu, 10 Mar 2022 at 13:59, Qing Wang <[email protected]> wrote:
>> >> >> >> >>
>> >> >> >> >> From: Wang Qing <[email protected]>
>> >> >> >> >>
>> >> >> >> >> Some architectures(e.g. ARM64), caches are implemented below:
>> >> >> >> >> cluster: ****** cluster 0 ***** ****** cluster 1 *****
>> >> >> >> >> core: 0 1 2 3 4 5 6 7
>> >> >> >> (add cache level 1) c0 c1 c2 c3 c4 c5 c6 c7
>> >> >> >> >> cache(Leveln): **cache0** **cache1** **cache2** **cache3**
>> >> >> >> (add cache level 3) *************share level 3 cache ***************
>> >> >> >> >> sd_llc_id(current): 0 0 0 0 4 4 4 4
>> >> >> >> >> sd_llc_id(should be): 0 0 2 2 4 4 6 6
>> >> >> >> >>
>> >> >> >> Here, n always be 2 in ARM64, but others are also possible.
>> >> >> >> core[0,1] form a complex(ARMV9), which share L2 cache, core[2,3] is the same.
>> >> >> >>
>> >> >> >> >> Caches and cpus have different topology, this causes cpus_share_cache()
>> >> >> >> >> return the wrong value, which will affect the CPU load balance.
>> >> >> >> >>
>> >> >> >> >What does your current scheduler topology look like?
>> >> >> >> >
>> >> >> >> >For CPU 0 to 3, do you have the below ?
>> >> >> >> >DIE [0 - 3] [4-7]
>> >> >> >> >MC [0] [1] [2] [3]
>> >> >> >>
>> >> >> >> The current scheduler topology consistent with CPU topology:
>> >> >> >> DIE [0-7]
>> >> >> >> MC [0-3] [4-7] (SD_SHARE_PKG_RESOURCES)
>> >> >> >> Most Android phones have this topology.
>> >> >> >> >
>> >> >> >> >But you would like something like below for cpu 0-1 instead ?
>> >> >> >> >DIE [0 - 3] [4-7]
>> >> >> >> >CLS [0 - 1] [2 - 3]
>> >> >> >> >MC [0] [1]
>> >> >> >> >
>> >> >> >> >with SD_SHARE_PKG_RESOURCES only set to MC level ?
>> >> >> >>
>> >> >> >> We don't change the current scheduler topology, but the
>> >> >> >> cache topology should be separated like below:
>> >> >> >
>> >> >> >The scheduler topology is not only cpu topology but a mixed of cpu and
>> >> >> >cache/memory cache topology
>> >> >> >
>> >> >> >> [0-7] (shared level 3 cache )
>> >> >> >> [0-1] [2-3][4-5][6-7] (shared level 2 cache )
>> >> >> >
>> >> >> >So you don't bother the intermediate cluster level which is even simpler.
>> >> >> >you have to modify generic arch topology so that cpu_coregroup_mask
>> >> >> >returns the correct cpu mask directly.
>> >> >> >
>> >> >> >You will notice a llc_sibling field that is currently used by acpi but
>> >> >> >not DT to return llc cpu mask
>> >> >> >
>> >> >> cpu_topology[].llc_sibling describe the last level cache of whole system,
>> >> >> not in the sched_domain.
>> >> >>
>> >> >> in the above cache topology, llc_sibling is 0xff([0-7]) , it describes
>> >> >
>> >> >If llc_sibling was 0xff([0-7] on your system, you would have only one level:
>> >> >MC[0-7]
>> >>
>> >> Sorry, but I don't get it, why llc_sibling was 0xff([0-7] means MC[0-7]?
>> >> In our system(Android), llc_sibling is indeed 0xff([0-7]) , because they
>> >> shared the llc(L3), but we also have two level:
>> >> DIE [0-7]
>> >> MC [0-3][4-6]
>> >> It makes sense, [0-3] are little cores, [4-7] are bit cores, se only up migrate
>> >> when misfit. We won't change it.
>> >>
>> >> >
>> >> >> the L3 cache sibling, but sd_llc_id describes the maximum shared cache
>> >> >> in sd, which should be [0-1] instead of [0-3].
>> >> >
>> >> >sd_llc_id describes the last sched_domain with SD_SHARE_PKG_RESOURCES.
>> >> >If you want llc to be [0-3] make sure that the
>> >> >sched_domain_topology_level array returns the correct cpumask with
>> >> >this flag
>> >>
>> >> Acturely, we want sd_llc to be [0-1] [2-3], but if the MC domain don't have
>> >
>> >sd_llc_id refers to a scheduler domain but your patch breaks this so
>> >if you want a llc that reflects this topo: [0-1] [2-3] you must
>> >provide a sched_domain level with this topo
>>
>> Maybe we should add a shared-cache level(SC), like what CLS does:
>>
>> DIE [0-7] (shared level 3 cache, SD_SHARE_PKG_RESOURCES)
>> MC [0-3] [4-7] (not SD_SHARE_PKG_RESOURCES)
>> CLS (if necessary)
>> SC [0-1][2-3][4-5][6-7] (shared level 2 cache, SD_SHARE_PKG_RESOURCES)
>> SMT (if necessary)
>>
>> SC means a couple of CPUs which are placed closely by sharing
>> mid-level caches, but not enough to be a cluster.
>
>what you name SC above looks the same as CLS which should not be mixed
>with Arm cluster terminology
Do you mean a cluster is equal to a shared cache rather than containing one? SC just
means a shared cache that doesn't form a cluster; a CLS can contain many SCs.
If as you said, SC looks the same as CLS, should we rename CLS to SC to
avoid confusion?
Thanks,
Wang
On Sat, 2 Apr 2022 at 11:34, 王擎 <[email protected]> wrote:
>
>
> >>
> >>
> >> >>
> >> >>
> >> >> >>
> >> >> >>
> >> >> >> >>
> >> >> >> >>
> >> >> >> >> >On Thu, 10 Mar 2022 at 13:59, Qing Wang <[email protected]> wrote:
> >> >> >> >> >>
> >> >> >> >> >> From: Wang Qing <[email protected]>
> >> >> >> >> >>
> >> >> >> >> >> Some architectures(e.g. ARM64), caches are implemented below:
> >> >> >> >> >> cluster: ****** cluster 0 ***** ****** cluster 1 *****
> >> >> >> >> >> core: 0 1 2 3 4 5 6 7
> >> >> >> >> (add cache level 1) c0 c1 c2 c3 c4 c5 c6 c7
> >> >> >> >> >> cache(Leveln): **cache0** **cache1** **cache2** **cache3**
> >> >> >> >> (add cache level 3) *************share level 3 cache ***************
> >> >> >> >> >> sd_llc_id(current): 0 0 0 0 4 4 4 4
> >> >> >> >> >> sd_llc_id(should be): 0 0 2 2 4 4 6 6
> >> >> >> >> >>
> >> >> >> >> Here, n is always 2 on ARM64, but other values are also possible.
> >> >> >> >> core[0,1] form a complex(ARMV9), which share L2 cache, core[2,3] is the same.
> >> >> >> >>
> >> >> >> >> >> Caches and cpus have different topology, this causes cpus_share_cache()
> >> >> >> >> >> return the wrong value, which will affect the CPU load balance.
> >> >> >> >> >>
> >> >> >> >> >What does your current scheduler topology look like?
> >> >> >> >> >
> >> >> >> >> >For CPU 0 to 3, do you have the below ?
> >> >> >> >> >DIE [0 - 3] [4-7]
> >> >> >> >> >MC [0] [1] [2] [3]
> >> >> >> >>
> >> >> >> >> The current scheduler topology consistent with CPU topology:
> >> >> >> >> DIE [0-7]
> >> >> >> >> MC [0-3] [4-7] (SD_SHARE_PKG_RESOURCES)
> >> >> >> >> Most Android phones have this topology.
> >> >> >> >> >
> >> >> >> >> >But you would like something like below for cpu 0-1 instead ?
> >> >> >> >> >DIE [0 - 3] [4-7]
> >> >> >> >> >CLS [0 - 1] [2 - 3]
> >> >> >> >> >MC [0] [1]
> >> >> >> >> >
> >> >> >> >> >with SD_SHARE_PKG_RESOURCES only set to MC level ?
> >> >> >> >>
> >> >> >> >> We don't change the current scheduler topology, but the
> >> >> >> >> cache topology should be separated like below:
> >> >> >> >
> >> >> >> >The scheduler topology is not only cpu topology but a mixed of cpu and
> >> >> >> >cache/memory cache topology
> >> >> >> >
> >> >> >> >> [0-7] (shared level 3 cache )
> >> >> >> >> [0-1] [2-3][4-5][6-7] (shared level 2 cache )
> >> >> >> >
> >> >> >> >So you don't bother the intermediate cluster level which is even simpler.
> >> >> >> >you have to modify generic arch topology so that cpu_coregroup_mask
> >> >> >> >returns the correct cpu mask directly.
> >> >> >> >
> >> >> >> >You will notice a llc_sibling field that is currently used by acpi but
> >> >> >> >not DT to return llc cpu mask
> >> >> >> >
> >> >> >> cpu_topology[].llc_sibling describe the last level cache of whole system,
> >> >> >> not in the sched_domain.
> >> >> >>
> >> >> >> in the above cache topology, llc_sibling is 0xff([0-7]) , it describes
> >> >> >
> >> >> >If llc_sibling was 0xff([0-7] on your system, you would have only one level:
> >> >> >MC[0-7]
> >> >>
> >> >> Sorry, but I don't get it — why does llc_sibling being 0xff ([0-7]) mean MC[0-7]?
> >> >> In our system(Android), llc_sibling is indeed 0xff([0-7]) , because they
> >> >> shared the llc(L3), but we also have two level:
> >> >> DIE [0-7]
> >> >> MC [0-3][4-7]
> >> >> It makes sense, [0-3] are little cores, [4-7] are big cores, so we only up-migrate
> >> >> when misfit. We won't change it.
> >> >>
> >> >> >
> >> >> >> the L3 cache sibling, but sd_llc_id describes the maximum shared cache
> >> >> >> in sd, which should be [0-1] instead of [0-3].
> >> >> >
> >> >> >sd_llc_id describes the last sched_domain with SD_SHARE_PKG_RESOURCES.
> >> >> >If you want llc to be [0-3] make sure that the
> >> >> >sched_domain_topology_level array returns the correct cpumask with
> >> >> >this flag
> >> >>
> >> >> Actually, we want sd_llc to be [0-1] [2-3], but if the MC domain doesn't have
> >> >
> >> >sd_llc_id refers to a scheduler domain but your patch breaks this so
> >> >if you want a llc that reflects this topo: [0-1] [2-3] you must
> >> >provide a sched_domain level with this topo
> >>
> >> Maybe we should add a shared-cache level(SC), like what CLS does:
> >>
> >> DIE [0-7] (shared level 3 cache, SD_SHARE_PKG_RESOURCES)
> >> MC [0-3] [4-7] (not SD_SHARE_PKG_RESOURCES)
> >> CLS (if necessary)
> >> SC [0-1][2-3][4-5][6-7] (shared level 2 cache, SD_SHARE_PKG_RESOURCES)
> >> SMT (if necessary)
> >>
> >> SC means a couple of CPUs which are placed closely by sharing
> >> mid-level caches, but not enough to be a cluster.
> >
> >what you name SC above looks the same as CLS which should not be mixed
> >with Arm cluster terminology
>
> Do you mean a cluster is equal to a shared cache rather than containing one?
> SC just means shared cache; it does not form a cluster, and a CLS can contain many SCs.
CLS in the scheduler topology is not strictly tied to the "Arm
cluster" but it's the generic name to describe an intermediate group
of CPUs with common properties. CLS is also used by some intel
platforms as an example. What I mean is that you can use the scheduler
CLS level to describe what you call an Arm SC level.
>
> If as you said, SC looks the same as CLS, should we rename CLS to SC to
> avoid confusion?
CLS is a generic scheduler name and I don't think that we need to
rename it to a Arm specific label
Thanks,
Vincent
>
> Thanks,
> Wang
>>
>>
>> >>
>> >>
>> >> >>
>> >> >>
>> >> >> >>
>> >> >> >>
>> >> >> >> >>
>> >> >> >> >>
>> >> >> >> >> >On Thu, 10 Mar 2022 at 13:59, Qing Wang <[email protected]> wrote:
>> >> >> >> >> >>
>> >> >> >> >> >> From: Wang Qing <[email protected]>
>> >> >> >> >> >>
>> >> >> >> >> >> Some architectures(e.g. ARM64), caches are implemented below:
>> >> >> >> >> >> cluster: ****** cluster 0 ***** ****** cluster 1 *****
>> >> >> >> >> >> core: 0 1 2 3 4 5 6 7
>> >> >> >> >> (add cache level 1) c0 c1 c2 c3 c4 c5 c6 c7
>> >> >> >> >> >> cache(Leveln): **cache0** **cache1** **cache2** **cache3**
>> >> >> >> >> (add cache level 3) *************share level 3 cache ***************
>> >> >> >> >> >> sd_llc_id(current): 0 0 0 0 4 4 4 4
>> >> >> >> >> >> sd_llc_id(should be): 0 0 2 2 4 4 6 6
>> >> >> >> >> >>
>> >> >> >> >> Here, n is always 2 on ARM64, but other values are also possible.
>> >> >> >> >> core[0,1] form a complex(ARMV9), which share L2 cache, core[2,3] is the same.
>> >> >> >> >>
>> >> >> >> >> >> Caches and cpus have different topology, this causes cpus_share_cache()
>> >> >> >> >> >> return the wrong value, which will affect the CPU load balance.
>> >> >> >> >> >>
>> >> >> >> >> >What does your current scheduler topology look like?
>> >> >> >> >> >
>> >> >> >> >> >For CPU 0 to 3, do you have the below ?
>> >> >> >> >> >DIE [0 - 3] [4-7]
>> >> >> >> >> >MC [0] [1] [2] [3]
>> >> >> >> >>
>> >> >> >> >> The current scheduler topology consistent with CPU topology:
>> >> >> >> >> DIE [0-7]
>> >> >> >> >> MC [0-3] [4-7] (SD_SHARE_PKG_RESOURCES)
>> >> >> >> >> Most Android phones have this topology.
>> >> >> >> >> >
>> >> >> >> >> >But you would like something like below for cpu 0-1 instead ?
>> >> >> >> >> >DIE [0 - 3] [4-7]
>> >> >> >> >> >CLS [0 - 1] [2 - 3]
>> >> >> >> >> >MC [0] [1]
>> >> >> >> >> >
>> >> >> >> >> >with SD_SHARE_PKG_RESOURCES only set to MC level ?
>> >> >> >> >>
>> >> >> >> >> We don't change the current scheduler topology, but the
>> >> >> >> >> cache topology should be separated like below:
>> >> >> >> >
>> >> >> >> >The scheduler topology is not only cpu topology but a mixed of cpu and
>> >> >> >> >cache/memory cache topology
>> >> >> >> >
>> >> >> >> >> [0-7] (shared level 3 cache )
>> >> >> >> >> [0-1] [2-3][4-5][6-7] (shared level 2 cache )
>> >> >> >> >
>> >> >> >> >So you don't bother the intermediate cluster level which is even simpler.
>> >> >> >> >you have to modify generic arch topology so that cpu_coregroup_mask
>> >> >> >> >returns the correct cpu mask directly.
>> >> >> >> >
>> >> >> >> >You will notice a llc_sibling field that is currently used by acpi but
>> >> >> >> >not DT to return llc cpu mask
>> >> >> >> >
>> >> >> >> cpu_topology[].llc_sibling describe the last level cache of whole system,
>> >> >> >> not in the sched_domain.
>> >> >> >>
>> >> >> >> in the above cache topology, llc_sibling is 0xff([0-7]) , it describes
>> >> >> >
>> >> >> >If llc_sibling was 0xff([0-7] on your system, you would have only one level:
>> >> >> >MC[0-7]
>> >> >>
>> >> >> Sorry, but I don't get it — why does llc_sibling being 0xff ([0-7]) mean MC[0-7]?
>> >> >> In our system(Android), llc_sibling is indeed 0xff([0-7]) , because they
>> >> >> shared the llc(L3), but we also have two level:
>> >> >> DIE [0-7]
>> >> >> MC [0-3][4-7]
>> >> >> It makes sense, [0-3] are little cores, [4-7] are big cores, so we only up-migrate
>> >> >> when misfit. We won't change it.
>> >> >>
>> >> >> >
>> >> >> >> the L3 cache sibling, but sd_llc_id describes the maximum shared cache
>> >> >> >> in sd, which should be [0-1] instead of [0-3].
>> >> >> >
>> >> >> >sd_llc_id describes the last sched_domain with SD_SHARE_PKG_RESOURCES.
>> >> >> >If you want llc to be [0-3] make sure that the
>> >> >> >sched_domain_topology_level array returns the correct cpumask with
>> >> >> >this flag
>> >> >>
>> >> >> Actually, we want sd_llc to be [0-1] [2-3], but if the MC domain doesn't have
>> >> >
>> >> >sd_llc_id refers to a scheduler domain but your patch breaks this so
>> >> >if you want a llc that reflects this topo: [0-1] [2-3] you must
>> >> >provide a sched_domain level with this topo
>> >>
>> >> Maybe we should add a shared-cache level(SC), like what CLS does:
>> >>
>> >> DIE [0-7] (shared level 3 cache, SD_SHARE_PKG_RESOURCES)
>> >> MC [0-3] [4-7] (not SD_SHARE_PKG_RESOURCES)
>> >> CLS (if necessary)
>> >> SC [0-1][2-3][4-5][6-7] (shared level 2 cache, SD_SHARE_PKG_RESOURCES)
>> >> SMT (if necessary)
>> >>
>> >> SC means a couple of CPUs which are placed closely by sharing
>> >> mid-level caches, but not enough to be a cluster.
>> >
>> >what you name SC above looks the same as CLS which should not be mixed
>> >with Arm cluster terminology
>>
>> Do you mean a cluster is equal to a shared cache rather than containing one?
>> SC just means shared cache; it does not form a cluster, and a CLS can contain many SCs.
>
>CLS in the scheduler topology is not strictly tied to the "Arm
>cluster" but it's the generic name to describe an intermediate group
>of CPUs with common properties. CLS is also used by some intel
>platforms as an example. What I mean is that you can use the scheduler
>CLS level to describe what you call an Arm SC level.
It won't work, because cluster_sibling is assigned according to cluster_id,
which is strictly tied to the "Arm cluster".
And if we have already used CLS to describe the cluster sched domain, how do
we describe a shared-cache sched domain, like a complex, which shares its own
cache within a cluster?
>
>>
>> If as you said, SC looks the same as CLS, should we rename CLS to SC to
>> avoid confusion?
>
>CLS is a generic scheduler name and I don't think that we need to
>rename it to a Arm specific label
I still insist on adding an SC level within the CLS, because we may have
already used CLS to describe the cluster sched domain; please consider it.
Thanks,
Wang
>
>Thanks,
>Vincent
>
>>
>> Thanks,
>> Wang