This patch series adds the cpu topology for RISC-V. It contains
both the DT binding and the actual source code. It has been tested on
QEMU and the HiFive Unleashed board.
The idea is based on cpu-map in ARM, with changes related to how
we define SMT systems. The reason for adopting a similar approach
to ARM is that it provides a very clear way of defining the
topology compared to parsing cache nodes to figure out which cpus
share the same package or core. I am open to any other idea to
implement cpu-topology as well.
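For example, under the proposed binding a two-core SMT package would be
described roughly as follows (the &hartN labels are just placeholders):

cpu-topology {
	package0 {
		core0 {
			cpu0 = <&hart0>;
			cpu1 = <&hart1>;
		};
		core1 {
			cpu0 = <&hart2>;
			cpu1 = <&hart3>;
		};
	};
};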
Atish Patra (2):
dt-bindings: topology: Add RISC-V cpu topology.
RISC-V: Introduce cpu topology.
.../devicetree/bindings/riscv/topology.txt | 154 ++++++++++++++++
arch/riscv/include/asm/topology.h | 28 +++
arch/riscv/kernel/Makefile | 1 +
arch/riscv/kernel/smpboot.c | 5 +-
arch/riscv/kernel/topology.c | 194 +++++++++++++++++++++
5 files changed, 381 insertions(+), 1 deletion(-)
create mode 100644 Documentation/devicetree/bindings/riscv/topology.txt
create mode 100644 arch/riscv/include/asm/topology.h
create mode 100644 arch/riscv/kernel/topology.c
--
2.7.4
Currently, cpu topology is not defined for RISC-V.
Parse the topology from the new "cpu-topology" DT node
to create the different cpu sibling maps.
As of now, only the bare minimum is implemented, but the
binding is capable of describing any type of topology in the future.
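For reference, the sysfs output below is roughly what one would expect from
a flat, single-package, four-core description along the lines of the HiFive
Unleashed example in the binding patch (the &cpuN labels are placeholders):

cpu-topology {
	package0 {
		core0 { cpu0 = <&cpu0>; };
		core1 { cpu0 = <&cpu1>; };
		core2 { cpu0 = <&cpu2>; };
		core3 { cpu0 = <&cpu3>; };
	};
};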
CPU topology after applying the patch.
$cat /sys/devices/system/cpu/cpu2/topology/core_siblings_list
0-3
$cat /sys/devices/system/cpu/cpu3/topology/core_siblings_list
0-3
$cat /sys/devices/system/cpu/cpu3/topology/physical_package_id
0
$cat /sys/devices/system/cpu/cpu3/topology/core_id
3
Signed-off-by: Atish Patra <[email protected]>
---
arch/riscv/include/asm/topology.h | 28 ++++++
arch/riscv/kernel/Makefile | 1 +
arch/riscv/kernel/smpboot.c | 5 +-
arch/riscv/kernel/topology.c | 194 ++++++++++++++++++++++++++++++++++++++
4 files changed, 227 insertions(+), 1 deletion(-)
create mode 100644 arch/riscv/include/asm/topology.h
create mode 100644 arch/riscv/kernel/topology.c
diff --git a/arch/riscv/include/asm/topology.h b/arch/riscv/include/asm/topology.h
new file mode 100644
index 00000000..d412edc8
--- /dev/null
+++ b/arch/riscv/include/asm/topology.h
@@ -0,0 +1,28 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef __ASM_TOPOLOGY_H
+#define __ASM_TOPOLOGY_H
+
+#include <linux/cpumask.h>
+#include <asm-generic/topology.h>
+
+struct riscv_cpu_topology {
+	int core_id;
+	int package_id;
+	int hart_id;
+	cpumask_t thread_sibling;
+	cpumask_t core_sibling;
+};
+
+extern struct riscv_cpu_topology cpu_topology[NR_CPUS];
+
+#define topology_physical_package_id(cpu) (cpu_topology[cpu].package_id)
+#define topology_core_id(cpu) (cpu_topology[cpu].core_id)
+#define topology_core_cpumask(cpu) (&cpu_topology[cpu].core_sibling)
+#define topology_sibling_cpumask(cpu) (&cpu_topology[cpu].thread_sibling)
+
+void init_cpu_topology(void);
+void remove_cpu_topology(unsigned int cpuid);
+void set_topology_masks(unsigned int cpuid);
+
+#endif /* __ASM_TOPOLOGY_H */
diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile
index e1274fc0..128766f8 100644
--- a/arch/riscv/kernel/Makefile
+++ b/arch/riscv/kernel/Makefile
@@ -27,6 +27,7 @@ obj-y += riscv_ksyms.o
obj-y += stacktrace.o
obj-y += vdso.o
obj-y += cacheinfo.o
+obj-y += topology.o
obj-y += vdso/
CFLAGS_setup.o := -mcmodel=medany
diff --git a/arch/riscv/kernel/smpboot.c b/arch/riscv/kernel/smpboot.c
index 56abab6a..1324f4b2 100644
--- a/arch/riscv/kernel/smpboot.c
+++ b/arch/riscv/kernel/smpboot.c
@@ -45,6 +45,7 @@ void __init smp_prepare_boot_cpu(void)
void __init smp_prepare_cpus(unsigned int max_cpus)
{
+ init_cpu_topology();
}
void __init setup_smp(void)
@@ -98,13 +99,15 @@ void __init smp_cpus_done(unsigned int max_cpus)
asmlinkage void __init smp_callin(void)
{
struct mm_struct *mm = &init_mm;
+ int cpu = smp_processor_id();
/* All kernel threads share the same mm context. */
atomic_inc(&mm->mm_count);
current->active_mm = mm;
trap_init();
- notify_cpu_starting(smp_processor_id());
+ notify_cpu_starting(cpu);
+ set_topology_masks(cpu);
set_cpu_online(smp_processor_id(), 1);
local_flush_tlb_all();
local_irq_enable();
diff --git a/arch/riscv/kernel/topology.c b/arch/riscv/kernel/topology.c
new file mode 100644
index 00000000..5195de14
--- /dev/null
+++ b/arch/riscv/kernel/topology.c
@@ -0,0 +1,194 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2018 Western Digital Corporation or its affiliates.
+ *
+ * Based on the arm64 version arch/arm64/kernel/topology.c
+ *
+ */
+
+#include <linux/cpu.h>
+#include <linux/cpumask.h>
+#include <linux/of.h>
+#include <linux/smp.h>
+#include <linux/string.h>
+
+#include <asm/topology.h>
+
+/*
+ * cpu topology array
+ */
+struct riscv_cpu_topology cpu_topology[NR_CPUS];
+EXPORT_SYMBOL_GPL(cpu_topology);
+
+void set_topology_masks(unsigned int cpuid)
+{
+	struct riscv_cpu_topology *ctopo, *cpuid_topo = &cpu_topology[cpuid];
+	int cpu;
+
+	/* update core and thread sibling masks */
+	for_each_online_cpu(cpu) {
+		ctopo = &cpu_topology[cpu];
+
+		if (cpuid_topo->package_id != ctopo->package_id)
+			continue;
+
+		cpumask_set_cpu(cpuid, &ctopo->core_sibling);
+		cpumask_set_cpu(cpu, &cpuid_topo->core_sibling);
+
+		if (cpuid_topo->core_id != ctopo->core_id)
+			continue;
+
+		cpumask_set_cpu(cpuid, &ctopo->thread_sibling);
+		cpumask_set_cpu(cpu, &cpuid_topo->thread_sibling);
+	}
+}
+
+static int __init get_hartid_for_cnode(struct device_node *node,
+				       unsigned int count)
+{
+	char name[10];
+	struct device_node *cpu_node;
+	int cpu;
+
+	snprintf(name, sizeof(name), "cpu%d", count);
+	cpu_node = of_parse_phandle(node, name, 0);
+	if (!cpu_node)
+		return -1;
+
+	cpu = of_cpu_node_to_id(cpu_node);
+	if (cpu < 0)
+		pr_err("Unable to find CPU node for %pOF\n", cpu_node);
+
+	of_node_put(cpu_node);
+	return cpu;
+}
+
+static int __init parse_core(struct device_node *core, int pid)
+{
+	char name[10];
+	struct device_node *cnode;
+	int count, hid = 0;
+	int coreid = 0;
+	bool found_hart = false;
+
+	do {
+		snprintf(name, sizeof(name), "core%d", coreid);
+		cnode = of_get_child_by_name(core, name);
+		if (cnode) {
+			count = 0;
+			do {
+				hid = get_hartid_for_cnode(cnode, count);
+				if (hid >= 0) {
+					found_hart = true;
+					cpu_topology[hid].package_id = pid;
+					cpu_topology[hid].core_id = coreid;
+					cpu_topology[hid].hart_id = hid;
+				}
+				count++;
+			} while (hid >= 0);
+			coreid++;
+			of_node_put(cnode);
+		}
+	} while (cnode);
+
+	if (!found_hart) {
+		pr_err("%pOF: no hart found\n", core);
+		of_node_put(cnode);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+static int __init parse_package(struct device_node *package)
+{
+	char name[10];
+	struct device_node *pnode;
+	int ret, package_id = 0;
+
+	/*
+	 * Current RISC-V systems don't have nested package nodes; they have
+	 * a flat package hierarchy. Once we have such a system, the following
+	 * code can be modified to support that.
+	 */
+	do {
+		snprintf(name, sizeof(name), "package%d", package_id);
+		pnode = of_get_child_by_name(package, name);
+		if (pnode) {
+			ret = parse_core(pnode, package_id);
+			if (ret < 0)
+				pr_warn("%pOF: empty package\n", pnode);
+
+			package_id++;
+		}
+	} while (pnode);
+
+	return 0;
+}
+
+static int __init parse_dt_topology(void)
+{
+	struct device_node *cn, *map;
+	int ret = 0;
+	int cpu;
+
+	cn = of_find_node_by_path("/cpus");
+	if (!cn) {
+		pr_err("No CPU information found in DT\n");
+		return 0;
+	}
+
+	map = of_get_child_by_name(cn, "cpu-topology");
+	if (!map)
+		goto out;
+
+	ret = parse_package(map);
+	if (ret != 0)
+		goto out_map;
+
+	/*
+	 * Check that all cores are in the topology; the SMP code will
+	 * only mark cores described in the DT as possible.
+	 */
+	for_each_possible_cpu(cpu)
+		if (cpu_topology[cpu].package_id == -1)
+			ret = -EINVAL;
+
+out_map:
+	of_node_put(map);
+out:
+	of_node_put(cn);
+	return ret;
+}
+
+static void clear_all_topology_masks(int cpu)
+{
+	struct riscv_cpu_topology *ctopo = &cpu_topology[cpu];
+
+	cpumask_clear(&ctopo->core_sibling);
+	cpumask_set_cpu(cpu, &ctopo->core_sibling);
+	cpumask_clear(&ctopo->thread_sibling);
+	cpumask_set_cpu(cpu, &ctopo->thread_sibling);
+}
+
+static void __init reset_cpu_topology(void)
+{
+	unsigned int cpu;
+
+	for_each_possible_cpu(cpu) {
+		struct riscv_cpu_topology *ctopo = &cpu_topology[cpu];
+
+		ctopo->hart_id = 0;
+		ctopo->core_id = 0;
+		ctopo->package_id = -1;
+
+		clear_all_topology_masks(cpu);
+	}
+}
+
+void __init init_cpu_topology(void)
+{
+	reset_cpu_topology();
+
+	if (of_have_populated_dt() && parse_dt_topology())
+		reset_cpu_topology();
+}
--
2.7.4
Define a RISC-V cpu topology. This is based on cpu-map in the ARM world,
but it doesn't need a separate thread node for defining SMT systems.
Multiple cpu phandle properties can be parsed to identify the sibling
hardware threads. Moreover, we do not have a cluster concept in RISC-V,
so package is a better word choice than cluster for RISC-V.
Signed-off-by: Atish Patra <[email protected]>
---
.../devicetree/bindings/riscv/topology.txt | 154 +++++++++++++++++++++
1 file changed, 154 insertions(+)
create mode 100644 Documentation/devicetree/bindings/riscv/topology.txt
diff --git a/Documentation/devicetree/bindings/riscv/topology.txt b/Documentation/devicetree/bindings/riscv/topology.txt
new file mode 100644
index 00000000..96039ed3
--- /dev/null
+++ b/Documentation/devicetree/bindings/riscv/topology.txt
@@ -0,0 +1,154 @@
+===========================================
+RISC-V cpu topology binding description
+===========================================
+
+===========================================
+1 - Introduction
+===========================================
+
+In a RISC-V system, the hierarchy of CPUs can be defined through the following
+nodes that are used to describe the layout of physical CPUs in the system:
+
+- package
+- core
+
+The cpu nodes (bindings defined in [1]) represent the devices that
+correspond to physical CPUs and are to be mapped to the hierarchy levels.
+Simultaneous multi-threading (SMT) systems can also represent their topology
+by defining multiple cpu phandles inside a core node. The details are explained
+in paragraph 3.
+
+The remainder of this document provides the topology bindings for ARM, based
+on the Devicetree Specification, available from:
+
+https://www.devicetree.org/specifications/
+
+If not stated otherwise, whenever a reference to a cpu node phandle is made its
+value must point to a cpu node compliant with the cpu node bindings as
+documented in [1].
+A topology description containing phandles to cpu nodes that are not compliant
+with bindings standardized in [1] is therefore considered invalid.
+
+This cpu topology binding description is mostly based on the topology defined
+in ARM [2].
+===========================================
+2 - cpu-topology node
+===========================================
+
+The RISC-V CPU topology is defined within the "cpu-topology" node, which is a direct
+child of the "cpus" node and provides a container where the actual topology
+nodes are listed.
+
+- cpu-topology node
+
+	Usage: Optional - RISC-V SMP systems need to provide CPU topology to
+			  the OS. RISC-V uniprocessor systems do not require a
+			  topology description and therefore should not define a
+			  cpu-topology node.
+
+	Description: The cpu-topology node is just a container node where its
+		     subnodes describe the CPU topology.
+
+	Node name must be "cpu-topology".
+
+	The cpu-topology node's parent node must be the cpus node.
+
+	The cpu-topology node's child nodes can be:
+
+	- one or more package nodes
+
+	Any other configuration is considered invalid.
+
+The cpu-topology node can only contain two types of child nodes:
+
+- package node
+- core node
+
+whose bindings are described in paragraph 3.
+
+=================================================
+2.1 - cpu-topology child nodes naming convention
+=================================================
+
+cpu-topology child nodes must follow a naming convention where the node name
+must be "packageN", "coreN" depending on the node type (i.e. package/core).
+For SMT systems, coreN node can contain several cpuN to indicate individual
+SMT harts (where N = {0, 1, ...} is the node number; nodes which are siblings
+within a single common parent node must be given a unique and sequential N
+value, starting from 0). cpu-topology child nodes which do not share a common
+parent node can have the same name (i.e. same number N as other cpu-topology
+child nodes at different device tree levels) since name uniqueness will be
+guaranteed by the device tree hierarchy.
+
+===========================================
+3 - package/core node bindings
+===========================================
+
+Bindings for package/core nodes are defined as follows:
+
+- package node
+
+	Description: must be declared within a cpu-topology node, one node
+		     per package. A system can contain several layers of
+		     package nodes; a package node can itself be contained
+		     in a parent package node.
+
+	The package node name must be "packageN" as described in 2.1 above.
+	A package node cannot be a leaf node.
+
+	A package node's child nodes must be:
+
+	- one or more package nodes; or
+	- one or more core nodes
+
+	Any other configuration is considered invalid.
+
+- core node
+
+	Description: must be declared in a package node, one node per core in
+		     the package.
+
+	The core node name must be "coreN" as described in 2.1 above.
+
+	A core node must always be a leaf node.
+
+	Properties for core nodes:
+
+	- cpuN
+		Usage: required
+		Value type: <phandle>
+		Definition: a phandle to the cpu node that corresponds to the
+			    core node.
+		For SMT systems, a core node will contain multiple cpuN phandles.
+
+	Any other configuration is considered invalid.
+
+===========================================
+4 - Example dts
+===========================================
+
+Example : HiFive Unleashed (RISC-V 64 bit, 4 core system)
+
+L100: cpu-topology {
+	package0 {
+		core0 {
+			cpu0 = <&L12>;
+		};
+		core1 {
+			cpu0 = <&L15>;
+		};
+		core2 {
+			cpu0 = <&L18>;
+		};
+		core3 {
+			cpu0 = <&L21>;
+		};
+	};
+};
+===============================================================================
+[1] RISC-V cpus documentation
+    Documentation/devicetree/bindings/riscv/cpus.txt
+[2] ARM topology documentation
+    Documentation/devicetree/bindings/arm/topology.txt
+
+===============================================================================
--
2.7.4
On Thu, Nov 1, 2018 at 6:04 PM Atish Patra <[email protected]> wrote:
>
> Define a RISC-V cpu topology. This is based on cpu-map in ARM world.
> But it doesn't need a separate thread node for defining SMT systems.
> Multiple cpu phandle properties can be parsed to identify the sibling
> hardware threads. Moreover, we do not have cluster concept in RISC-V.
> So package is a better word choice than cluster for RISC-V.
There was a proposal to add package info for ARM recently. Not sure
what happened to that, but we don't need 2 different ways.
There's never going to be clusters for RISC-V? What prevents that?
Seems shortsighted to me.
>
> Signed-off-by: Atish Patra <[email protected]>
> ---
> .../devicetree/bindings/riscv/topology.txt | 154 +++++++++++++++++++++
> 1 file changed, 154 insertions(+)
> create mode 100644 Documentation/devicetree/bindings/riscv/topology.txt
>
> diff --git a/Documentation/devicetree/bindings/riscv/topology.txt b/Documentation/devicetree/bindings/riscv/topology.txt
> new file mode 100644
> index 00000000..96039ed3
> --- /dev/null
> +++ b/Documentation/devicetree/bindings/riscv/topology.txt
> @@ -0,0 +1,154 @@
> +===========================================
> +RISC-V cpu topology binding description
> +===========================================
> +
> +===========================================
> +1 - Introduction
> +===========================================
> +
> +In a RISC-V system, the hierarchy of CPUs can be defined through following nodes that
> +are used to describe the layout of physical CPUs in the system:
> +
> +- packages
> +- core
> +
> +The cpu nodes (bindings defined in [1]) represent the devices that
> +correspond to physical CPUs and are to be mapped to the hierarchy levels.
> +Simultaneous multi-threading (SMT) systems can also represent their topology
> +by defining multiple cpu phandles inside core node. The details are explained
> +in paragraph 3.
I don't see a reason to do this differently than ARM. That said, I
don't think the thread part is in use on ARM, so it could possibly be
changed.
> +
> +The remainder of this document provides the topology bindings for ARM, based
for ARM?
> +on the Devicetree Specification, available from:
> +
> +https://www.devicetree.org/specifications/
> +
> +If not stated otherwise, whenever a reference to a cpu node phandle is made its
> +value must point to a cpu node compliant with the cpu node bindings as
> +documented in [1].
> +A topology description containing phandles to cpu nodes that are not compliant
> +with bindings standardized in [1] is therefore considered invalid.
> +
> +This cpu topology binding description is mostly based on the topology defined
> +in ARM [2].
> +===========================================
> +2 - cpu-topology node
cpu-map. Why change this?
What I would like to see is the ARM topology binding reworked to be
common or some good reasons why it doesn't work for RISC-V as-is.
Rob
On Fri, Nov 02, 2018 at 08:09:39AM -0500, Rob Herring wrote:
> On Thu, Nov 1, 2018 at 6:04 PM Atish Patra <[email protected]> wrote:
> >
> > Define a RISC-V cpu topology. This is based on cpu-map in ARM world.
> > But it doesn't need a separate thread node for defining SMT systems.
> > Multiple cpu phandle properties can be parsed to identify the sibling
> > hardware threads. Moreover, we do not have cluster concept in RISC-V.
> > So package is a better word choice than cluster for RISC-V.
>
> There was a proposal to add package info for ARM recently. Not sure
> what happened to that, but we don't need 2 different ways.
>
We still need that, I can brush it up and post what Lorenzo had previously
proposed[1]. We want to keep both DT and ACPI CPU topology story aligned.
--
Regards,
Sudeep
[1] https://marc.info/?l=devicetree&m=151817774202854&w=2
On Fri, Nov 2, 2018 at 8:31 AM Sudeep Holla <[email protected]> wrote:
>
> On Fri, Nov 02, 2018 at 08:09:39AM -0500, Rob Herring wrote:
> > On Thu, Nov 1, 2018 at 6:04 PM Atish Patra <[email protected]> wrote:
> > >
> > > Define a RISC-V cpu topology. This is based on cpu-map in ARM world.
> > > But it doesn't need a separate thread node for defining SMT systems.
> > > Multiple cpu phandle properties can be parsed to identify the sibling
> > > hardware threads. Moreover, we do not have cluster concept in RISC-V.
> > > So package is a better word choice than cluster for RISC-V.
> >
> > There was a proposal to add package info for ARM recently. Not sure
> > what happened to that, but we don't need 2 different ways.
> >
>
> We still need that, I can brush it up and post what Lorenzo had previously
> proposed[1]. We want to keep both DT and ACPI CPU topology story aligned.
Frankly, I don't care what the ACPI story is. I care whether each cpu
arch does its own thing in DT or not. If a package prop works for
RISC-V folks and that happens to align with ACPI, then okay. Though I
tend to prefer a package represented as a node rather than a property
as I think that's more consistent.
Any comments on the thread aspect (whether it has ever been used)?
Though I think thread as a node level is more consistent with each
topology level being a node (same with package).
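For reference, a sketch of how one SMT core is described under the existing
ARM cpu-map binding (Documentation/devicetree/bindings/arm/topology.txt),
with each hardware thread as its own node (the &CPUn labels are placeholders):

cpu-map {
	cluster0 {
		core0 {
			thread0 {
				cpu = <&CPU0>;
			};
			thread1 {
				cpu = <&CPU1>;
			};
		};
	};
};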
Rob
On Fri, Nov 02, 2018 at 10:11:38AM -0500, Rob Herring wrote:
> On Fri, Nov 2, 2018 at 8:31 AM Sudeep Holla <[email protected]> wrote:
> >
> > On Fri, Nov 02, 2018 at 08:09:39AM -0500, Rob Herring wrote:
> > > On Thu, Nov 1, 2018 at 6:04 PM Atish Patra <[email protected]> wrote:
> > > >
> > > > Define a RISC-V cpu topology. This is based on cpu-map in ARM world.
> > > > But it doesn't need a separate thread node for defining SMT systems.
> > > > Multiple cpu phandle properties can be parsed to identify the sibling
> > > > hardware threads. Moreover, we do not have cluster concept in RISC-V.
> > > > So package is a better word choice than cluster for RISC-V.
> > >
> > > There was a proposal to add package info for ARM recently. Not sure
> > > what happened to that, but we don't need 2 different ways.
> > >
> >
> > We still need that, I can brush it up and post what Lorenzo had previously
> > proposed[1]. We want to keep both DT and ACPI CPU topology story aligned.
>
> Frankly, I don't care what the ACPI story is. I care whether each cpu
Sorry I meant feature parity with ACPI and didn't refer to the mechanics.
> arch does its own thing in DT or not. If a package prop works for
> RISC-V folks and that happens to align with ACPI, then okay. Though I
> tend to prefer a package represented as a node rather than a property
> as I think that's more consistent.
>
Sounds good. One of the reasons for making it an *optional* property is
backward compatibility. But we should be able to deal with that even with a
node.
> Any comments on the thread aspect (whether it has ever been used)?
> Though I think thread as a node level is more consistent with each
> topology level being a node (same with package).
>
Not 100% sure, the only multi threaded core in the market I know is
Cavium TX2 which is ACPI.
--
Regards,
Sudeep
Hello All,
Στις 2018-11-02 01:04, Atish Patra έγραψε:
> This patch series adds the cpu topology for RISC-V. It contains
> both the DT binding and actual source code. It has been tested on
> QEMU & Unleashed board.
>
> The idea is based on cpu-map in ARM with changes related to how
> we define SMT systems. The reason for adopting a similar approach
> to ARM as I feel it provides a very clear way of defining the
> topology compared to parsing cache nodes to figure out which cpus
> share the same package or core. I am open to any other idea to
> implement cpu-topology as well.
>
I was also about to start a discussion about CPU topology on RISC-V
after the last swtools group meeting. The goal is to provide the
scheduler with hints on how to distribute tasks more efficiently
between harts, by populating the scheduling domain topology levels
(https://elixir.bootlin.com/linux/v4.19/ident/sched_domain_topology_level).
What we want to do is define cpu groups and assign them to
scheduling domains with the appropriate SD_ flags
(https://github.com/torvalds/linux/blob/master/include/linux/sched/topology.h#L16).
So the cores that belong to a scheduling domain may share:
CPU capacity (SD_SHARE_CPUCAPACITY / SD_ASYM_CPUCAPACITY)
Package resources -e.g. caches, units etc- (SD_SHARE_PKG_RESOURCES)
Power domain (SD_SHARE_POWERDOMAIN)
In this context I believe using words like "core", "package",
"socket" etc can be misleading. For example the sample topology you
use on the documentation says that there are 4 cores that are part
of a package, however "package" has a different meaning to the
scheduler. Also we don't say anything in case they share a power
domain or if they have the same capacity or not. This mapping deals
only with cache hierarchy or other shared resources.
How about defining a dt scheme to describe the scheduler domain
topology levels instead? E.g.:
2 sets (or clusters if you prefer) of 2 SMT cores, each set with
a different capacity and power domain:
sched_topology {
	level0 { // SMT
		shared = "power", "capacity", "resources";
		group0 {
			members = <&hart0>, <&hart1>;
		}
		group1 {
			members = <&hart2>, <&hart3>;
		}
		group2 {
			members = <&hart4>, <&hart5>;
		}
		group3 {
			members = <&hart6>, <&hart7>;
		}
	}
	level1 { // MC
		shared = "power", "capacity"
		group0 {
			members = <&hart0>, <&hart1>, <&hart2>, <&hart3>;
		}
		group1 {
			members = <&hart4>, <&hart5>, <&hart6>, <&hart7>;
		}
	}
	top_level { // A group with all harts in it
		shared = "" // There is nothing common for ALL harts, we could have capacity here
	}
}
Regards,
Nick
On 11/2/18 6:09 AM, Rob Herring wrote:
> On Thu, Nov 1, 2018 at 6:04 PM Atish Patra <[email protected]> wrote:
>>
>> Define a RISC-V cpu topology. This is based on cpu-map in ARM world.
>> But it doesn't need a separate thread node for defining SMT systems.
>> Multiple cpu phandle properties can be parsed to identify the sibling
>> hardware threads. Moreover, we do not have cluster concept in RISC-V.
>> So package is a better word choice than cluster for RISC-V.
>
> There was a proposal to add package info for ARM recently. Not sure
> what happened to that, but we don't need 2 different ways.
>
> There's never going to be clusters for RISC-V? What prevents that?
> Seems shortsighted to me.
>
Agreed. My intention was to keep it simple at first go.
If package node is added, that would work for us as well.
>>
>> Signed-off-by: Atish Patra <[email protected]>
>> ---
>> .../devicetree/bindings/riscv/topology.txt | 154 +++++++++++++++++++++
>> 1 file changed, 154 insertions(+)
>> create mode 100644 Documentation/devicetree/bindings/riscv/topology.txt
>>
>> diff --git a/Documentation/devicetree/bindings/riscv/topology.txt b/Documentation/devicetree/bindings/riscv/topology.txt
>> new file mode 100644
>> index 00000000..96039ed3
>> --- /dev/null
>> +++ b/Documentation/devicetree/bindings/riscv/topology.txt
>> @@ -0,0 +1,154 @@
>> +===========================================
>> +RISC-V cpu topology binding description
>> +===========================================
>> +
>> +===========================================
>> +1 - Introduction
>> +===========================================
>> +
>> +In a RISC-V system, the hierarchy of CPUs can be defined through following nodes that
>> +are used to describe the layout of physical CPUs in the system:
>> +
>> +- packages
>> +- core
>> +
>> +The cpu nodes (bindings defined in [1]) represent the devices that
>> +correspond to physical CPUs and are to be mapped to the hierarchy levels.
>> +Simultaneous multi-threading (SMT) systems can also represent their topology
>> +by defining multiple cpu phandles inside core node. The details are explained
>> +in paragraph 3.
>
> I don't see a reason to do this differently than ARM. That said, I
> don't think the thread part is in use on ARM, so it could possibly be
> changed.
>
>> +
>> +The remainder of this document provides the topology bindings for ARM, based
>
> for ARM?
>
Sorry for the typo.
>> +on the Devicetree Specification, available from:
>> +
>> +https://www.devicetree.org/specifications/
>> +
>> +If not stated otherwise, whenever a reference to a cpu node phandle is made its
>> +value must point to a cpu node compliant with the cpu node bindings as
>> +documented in [1].
>> +A topology description containing phandles to cpu nodes that are not compliant
>> +with bindings standardized in [1] is therefore considered invalid.
>> +
>> +This cpu topology binding description is mostly based on the topology defined
>> +in ARM [2].
>> +===========================================
>> +2 - cpu-topology node
>
> cpu-map. Why change this?
>
To my ears, it felt like a better name. But I don't mind dropping it in favor
of cpu-map if we are going to standardize cpu-map for both ARM & RISC-V.
> What I would like to see is the ARM topology binding reworked to be
> common or some good reasons why it doesn't work for RISC-V as-is.
>
IMHO, the ARM topology code can be reworked and put in a common place so that
RISC-V can leverage that.
My intention for this RFC patch was to start the ball rolling on cpu
topology in RISC-V and see if the DT approach is fine with everybody. I
would be happy to see the ARM code move to a common code base where
RISC-V can reuse it.
Regards,
Atish
> Rob
>
On 11/2/18 8:50 AM, Sudeep Holla wrote:
> On Fri, Nov 02, 2018 at 10:11:38AM -0500, Rob Herring wrote:
>> On Fri, Nov 2, 2018 at 8:31 AM Sudeep Holla <[email protected]> wrote:
>>>
>>> On Fri, Nov 02, 2018 at 08:09:39AM -0500, Rob Herring wrote:
>>>> On Thu, Nov 1, 2018 at 6:04 PM Atish Patra <[email protected]> wrote:
>>>>>
>>>>> Define a RISC-V cpu topology. This is based on cpu-map in ARM world.
>>>>> But it doesn't need a separate thread node for defining SMT systems.
>>>>> Multiple cpu phandle properties can be parsed to identify the sibling
>>>>> hardware threads. Moreover, we do not have cluster concept in RISC-V.
>>>>> So package is a better word choice than cluster for RISC-V.
>>>>
>>>> There was a proposal to add package info for ARM recently. Not sure
>>>> what happened to that, but we don't need 2 different ways.
>>>>
>>>
>>> We still need that, I can brush it up and post what Lorenzo had previously
>>> proposed[1]. We want to keep both DT and ACPI CPU topology story aligned.
>>
>> Frankly, I don't care what the ACPI story is. I care whether each cpu
>
> Sorry I meant feature parity with ACPI and didn't refer to the mechanics.
>
>> arch does its own thing in DT or not. If a package prop works for
>> RISC-V folks and that happens to align with ACPI, then okay. Though I
>> tend to prefer a package represented as a node rather than a property
>> as I think that's more consistent.
>>
>
> Sounds good. One of the reason for making it *optional* property is for
> backward compatibility. But we should be able to deal with that even with
> node.
>
If you are introducing a package node, can we make the cluster node
optional? I feel it is a redundant node for use cases where one doesn't
have differently grouped cpus in a package.
If we have some architecture that requires clusters, it can be added to
the DT at that time, I believe.
>> Any comments on the thread aspect (whether it has ever been used)?
>> Though I think thread as a node level is more consistent with each
>> topology level being a node (same with package).
>>
> Not 100% sure, the only multi threaded core in the market I know is
> Cavium TX2 which is ACPI.
>
Are there any advantages to keeping the thread node if it's not being used?
If I am not wrong, we can always use multiple cpuN phandles to represent SMT
harts. It will result in less code and DT documentation as well.
Regards,
Atish
> --
> Regards,
> Sudeep
>
On Fri, Nov 2, 2018 at 3:53 PM Atish Patra <[email protected]> wrote:
>
> On 11/2/18 8:50 AM, Sudeep Holla wrote:
> > On Fri, Nov 02, 2018 at 10:11:38AM -0500, Rob Herring wrote:
> >> On Fri, Nov 2, 2018 at 8:31 AM Sudeep Holla <[email protected]> wrote:
> >>>
> >>> On Fri, Nov 02, 2018 at 08:09:39AM -0500, Rob Herring wrote:
> >>>> On Thu, Nov 1, 2018 at 6:04 PM Atish Patra <[email protected]> wrote:
> >>>>>
> >>>>> Define a RISC-V cpu topology. This is based on cpu-map in ARM world.
> >>>>> But it doesn't need a separate thread node for defining SMT systems.
> >>>>> Multiple cpu phandle properties can be parsed to identify the sibling
> >>>>> hardware threads. Moreover, we do not have cluster concept in RISC-V.
> >>>>> So package is a better word choice than cluster for RISC-V.
> >>>>
> >>>> There was a proposal to add package info for ARM recently. Not sure
> >>>> what happened to that, but we don't need 2 different ways.
> >>>>
> >>>
> >>> We still need that, I can brush it up and post what Lorenzo had previously
> >>> proposed[1]. We want to keep both DT and ACPI CPU topology story aligned.
> >>
> >> Frankly, I don't care what the ACPI story is. I care whether each cpu
> >
> > Sorry I meant feature parity with ACPI and didn't refer to the mechanics.
> >
> >> arch does its own thing in DT or not. If a package prop works for
> >> RISC-V folks and that happens to align with ACPI, then okay. Though I
> >> tend to prefer a package represented as a node rather than a property
> >> as I think that's more consistent.
> >>
> >
> > Sounds good. One of the reason for making it *optional* property is for
> > backward compatibility. But we should be able to deal with that even with
> > node.
> >
>
> If you are introducing a package node, can we make cluster node
> optional? I feel it is a redundant node for use cases where one doesn't
> have a different grouped cpus in a package.
Certainly not. A common doc can make every level optional and each
arch can define what's mandatory.
> We may have some architecture that requires cluster, it can be added to
> the DT at that time, I believe.
>
> >> Any comments on the thread aspect (whether it has ever been used)?
> >> Though I think thread as a node level is more consistent with each
> >> topology level being a node (same with package).
> >>
> > Not 100% sure, the only multi threaded core in the market I know is
> > Cavium TX2 which is ACPI.
> >
>
> Any advantages of keeping thread node if it's not being used. If I am
> not wrong, we can always use multiple cpuN phandles to represent SMT
> nodes. It will result in less code and DT documentation as well.
It's more consistent to make each level a node IMO and we've already
discussed and defined it this way. I don't see how it's really less
code or documentation.
BTW, powerpc defined threads with multiple reg entries in the cpu
nodes. You could do that if you wanted.
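For illustration, a rough sketch of that style: per the devicetree
specification, a cpu node's reg property carries one entry per hardware
thread, so a two-thread core could look like this (values are made up):

cpus {
	#address-cells = <1>;
	#size-cells = <0>;

	cpu@0 {
		device_type = "cpu";
		/* one reg entry per hardware thread of this core */
		reg = <0x0 0x1>;
	};
};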
Rob
On 11/2/18 11:59 AM, Nick Kossifidis wrote:
> Hello All,
>
> Στις 2018-11-02 01:04, Atish Patra έγραψε:
>> This patch series adds the cpu topology for RISC-V. It contains
>> both the DT binding and actual source code. It has been tested on
>> QEMU & Unleashed board.
>>
>> The idea is based on cpu-map in ARM with changes related to how
>> we define SMT systems. The reason for adopting a similar approach
>> to ARM as I feel it provides a very clear way of defining the
>> topology compared to parsing cache nodes to figure out which cpus
>> share the same package or core. I am open to any other idea to
>> implement cpu-topology as well.
>>
>
> I was also about to start a discussion about CPU topology on RISC-V
> after the last swtools group meeting. The goal is to provide the
> scheduler with hints on how to distribute tasks more efficiently
> between harts, by populating the scheduling domain topology levels
> (https://elixir.bootlin.com/linux/v4.19/ident/sched_domain_topology_level).
> What we want to do is define cpu groups and assign them to
> scheduling domains with the appropriate SD_ flags
> (https://github.com/torvalds/linux/blob/master/include/linux/sched/topology.h#L16).
>
Scheduler domain topology is already getting all the hints in the
following way.
static struct sched_domain_topology_level default_topology[] = {
#ifdef CONFIG_SCHED_SMT
	{ cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
#endif
#ifdef CONFIG_SCHED_MC
	{ cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
#endif
	{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
	{ NULL, },
};

#ifdef CONFIG_SCHED_SMT
static inline const struct cpumask *cpu_smt_mask(int cpu)
{
	return topology_sibling_cpumask(cpu);
}
#endif

const struct cpumask *cpu_coregroup_mask(int cpu)
{
	return &cpu_topology[cpu].core_sibling;
}
> So the cores that belong to a scheduling domain may share:
> CPU capacity (SD_SHARE_CPUCAPACITY / SD_ASYM_CPUCAPACITY)
> Package resources -e.g. caches, units etc- (SD_SHARE_PKG_RESOURCES)
> Power domain (SD_SHARE_POWERDOMAIN)
>
> In this context I believe using words like "core", "package",
> "socket" etc can be misleading. For example the sample topology you
> use on the documentation says that there are 4 cores that are part
> of a package, however "package" has a different meaning to the
> scheduler. Also we don't say anything in case they share a power
> domain or if they have the same capacity or not. This mapping deals
> only with cache hierarchy or other shared resources.
>
> How about defining a dt scheme to describe the scheduler domain
> topology levels instead ? e.g:
>
> 2 sets (or clusters if you prefer) of 2 SMT cores, each set with
> a different capacity and power domain:
>
> sched_topology {
> level0 { // SMT
> shared = "power", "capacity", "resources";
> group0 {
> members = <&hart0>, <&hart1>;
> }
> group1 {
> members = <&hart2>, <&hart3>;
> }
> group2 {
> members = <&hart4>, <&hart5>;
> }
> group3 {
> members = <&hart6>, <&hart7>;
> }
> }
> level1 { // MC
> shared = "power", "capacity"
> group0 {
> members = <&hart0>, <&hart1>, <&hart2>, <&hart3>;
> }
> group1 {
> members = <&hart4>, <&hart5>, <&hart6>, <&hart7>;
> }
> }
> top_level { // A group with all harts in it
> shared = "" // There is nothing common for ALL harts, we could have
> capacity here
> }
> }
>
I agree that naming could have been better in the past. But it is what
it is now. I don't see any big advantages in this approach compared to
the existing approach, where the DT specifies what the hardware looks like
and the scheduler sets up its domains based on different cpumasks.
Regards,
Atish
> Regards,
> Nick
>
> _______________________________________________
> linux-riscv mailing list
> [email protected]
> http://lists.infradead.org/mailman/listinfo/linux-riscv
>
Στις 2018-11-02 23:14, Atish Patra έγραψε:
> On 11/2/18 11:59 AM, Nick Kossifidis wrote:
>> Hello All,
>>
>> Στις 2018-11-02 01:04, Atish Patra έγραψε:
>>> This patch series adds the cpu topology for RISC-V. It contains
>>> both the DT binding and actual source code. It has been tested on
>>> QEMU & Unleashed board.
>>>
>>> The idea is based on cpu-map in ARM with changes related to how
>>> we define SMT systems. The reason for adopting a similar approach
>>> to ARM as I feel it provides a very clear way of defining the
>>> topology compared to parsing cache nodes to figure out which cpus
>>> share the same package or core. I am open to any other idea to
>>> implement cpu-topology as well.
>>>
>>
>> I was also about to start a discussion about CPU topology on RISC-V
>> after the last swtools group meeting. The goal is to provide the
>> scheduler with hints on how to distribute tasks more efficiently
>> between harts, by populating the scheduling domain topology levels
>> (https://elixir.bootlin.com/linux/v4.19/ident/sched_domain_topology_level).
>> What we want to do is define cpu groups and assign them to
>> scheduling domains with the appropriate SD_ flags
>> (https://github.com/torvalds/linux/blob/master/include/linux/sched/topology.h#L16).
>>
>
> Scheduler domain topology is already getting all the hints in the
> following way.
>
> static struct sched_domain_topology_level default_topology[] = {
> #ifdef CONFIG_SCHED_SMT
> { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
> #endif
> #ifdef CONFIG_SCHED_MC
> { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
> #endif
> { cpu_cpu_mask, SD_INIT_NAME(DIE) },
> { NULL, },
> };
>
> #ifdef CONFIG_SCHED_SMT
> static inline const struct cpumask *cpu_smt_mask(int cpu)
> {
> return topology_sibling_cpumask(cpu);
> }
> #endif
>
> const struct cpumask *cpu_coregroup_mask(int cpu)
> {
> return &cpu_topology[cpu].core_sibling;
> }
>
>
That's a static definition of two scheduling domains that only deal
with SMT and MC, the only difference between them is the
SD_SHARE_PKG_RESOURCES flag. You can't even have multiple levels
of shared resources this way, whatever you have larger than a core
is ignored (it just goes to the MC domain). There is also no handling
of SD_SHARE_POWERDOMAIN or SD_SHARE_CPUCAPACITY.
>> So the cores that belong to a scheduling domain may share:
>> CPU capacity (SD_SHARE_CPUCAPACITY / SD_ASYM_CPUCAPACITY)
>> Package resources -e.g. caches, units etc- (SD_SHARE_PKG_RESOURCES)
>> Power domain (SD_SHARE_POWERDOMAIN)
>>
>> In this context I believe using words like "core", "package",
>> "socket" etc can be misleading. For example the sample topology you
>> use on the documentation says that there are 4 cores that are part
>> of a package, however "package" has a different meaning to the
>> scheduler. Also we don't say anything in case they share a power
>> domain or if they have the same capacity or not. This mapping deals
>> only with cache hierarchy or other shared resources.
>>
>> How about defining a dt scheme to describe the scheduler domain
>> topology levels instead ? e.g:
>>
>> 2 sets (or clusters if you prefer) of 2 SMT cores, each set with
>> a different capacity and power domain:
>>
>> sched_topology {
>> level0 { // SMT
>> shared = "power", "capacity", "resources";
>> group0 {
>> members = <&hart0>, <&hart1>;
>> }
>> group1 {
>> members = <&hart2>, <&hart3>;
>> }
>> group2 {
>> members = <&hart4>, <&hart5>;
>> }
>> group3 {
>> members = <&hart6>, <&hart7>;
>> }
>> }
>> level1 { // MC
>> shared = "power", "capacity"
>> group0 {
>> members = <&hart0>, <&hart1>, <&hart2>, <&hart3>;
>> }
>> group1 {
>> members = <&hart4>, <&hart5>, <&hart6>, <&hart7>;
>> }
>> }
>> top_level { // A group with all harts in it
>> shared = "" // There is nothing common for ALL harts, we could
>> have
>> capacity here
>> }
>> }
>>
>
> I agree that naming could have been better in the past. But it is what
> it is now. I don't see any big advantages in this approach compared to
> the existing approach where DT specifies what hardware looks like and
> scheduler sets up it's domain based on different cpumasks.
>
It is what it is on ARM, it doesn't have to be the same on RISC-V; anyway,
the name is a minor issue. The advantage of this approach is that you define
the scheduling domains on the device tree without needing a "translation" of
a topology map to scheduling domains. It can handle any scenario the
scheduler can handle, using all the available flags. In your approach, no
matter what gets put to the device tree, the only hint the scheduler will get
is one level of SMT, one level of MC and the rest of the system. No power
domain sharing, no asymmetric scheduling, no multiple levels possible. Many
features of the scheduler remain unused. This approach can also get extended
more easily to e.g. support NUMA nodes and associate memory regions with
groups.
Regards,
Nick
On Fri, 02 Nov 2018 06:09:39 PDT (-0700), [email protected] wrote:
> On Thu, Nov 1, 2018 at 6:04 PM Atish Patra <[email protected]> wrote:
>>
>> Define a RISC-V cpu topology. This is based on cpu-map in ARM world.
>> But it doesn't need a separate thread node for defining SMT systems.
>> Multiple cpu phandle properties can be parsed to identify the sibling
>> hardware threads. Moreover, we do not have cluster concept in RISC-V.
>> So package is a better word choice than cluster for RISC-V.
>
> There was a proposal to add package info for ARM recently. Not sure
> what happened to that, but we don't need 2 different ways.
>
> There's never going to be clusters for RISC-V? What prevents that?
> Seems shortsighted to me.
>
>>
>> Signed-off-by: Atish Patra <[email protected]>
>> ---
>> .../devicetree/bindings/riscv/topology.txt | 154 +++++++++++++++++++++
>> 1 file changed, 154 insertions(+)
>> create mode 100644 Documentation/devicetree/bindings/riscv/topology.txt
>>
>> diff --git a/Documentation/devicetree/bindings/riscv/topology.txt b/Documentation/devicetree/bindings/riscv/topology.txt
>> new file mode 100644
>> index 00000000..96039ed3
>> --- /dev/null
>> +++ b/Documentation/devicetree/bindings/riscv/topology.txt
>> @@ -0,0 +1,154 @@
>> +===========================================
>> +RISC-V cpu topology binding description
>> +===========================================
>> +
>> +===========================================
>> +1 - Introduction
>> +===========================================
>> +
>> +In a RISC-V system, the hierarchy of CPUs can be defined through following nodes that
>> +are used to describe the layout of physical CPUs in the system:
>> +
>> +- packages
>> +- core
>> +
>> +The cpu nodes (bindings defined in [1]) represent the devices that
>> +correspond to physical CPUs and are to be mapped to the hierarchy levels.
>> +Simultaneous multi-threading (SMT) systems can also represent their topology
>> +by defining multiple cpu phandles inside core node. The details are explained
>> +in paragraph 3.
>
> I don't see a reason to do this differently than ARM. That said, I
> don't think the thread part is in use on ARM, so it could possibly be
> changed.
>
>> +
>> +The remainder of this document provides the topology bindings for ARM, based
>
> for ARM?
>
>> +on the Devicetree Specification, available from:
>> +
>> +https://www.devicetree.org/specifications/
>> +
>> +If not stated otherwise, whenever a reference to a cpu node phandle is made its
>> +value must point to a cpu node compliant with the cpu node bindings as
>> +documented in [1].
>> +A topology description containing phandles to cpu nodes that are not compliant
>> +with bindings standardized in [1] is therefore considered invalid.
>> +
>> +This cpu topology binding description is mostly based on the topology defined
>> +in ARM [2].
>> +===========================================
>> +2 - cpu-topology node
>
> cpu-map. Why change this?
>
> What I would like to see is the ARM topology binding reworked to be
> common or some good reasons why it doesn't work for RISC-V as-is.
I think it would be great if CPU topologies were not a RISC-V specific thing.
We don't really do anything different than anyone else, so it'd be great if we
could all share the same spec and code. Looking quickly at the ARM cpu-map
bindings, I don't see any reason why we can't just use the same thing on RISC-V
-- it's not quite how I'd do it, but I don't think the differences are worth
having another implementation. Mechanically I'm not sure how to do this:
should there just be a "Documentation/devicetree/bindings/cpu-map.txt"?
If everyone is OK with that then I vote we just go ahead and genericise the ARM
"cpu-map" stuff for CPU topology. Sharing the implementation looks fairly
straight-forward as well.
On Mon, Nov 5, 2018 at 1:39 PM Palmer Dabbelt <[email protected]> wrote:
>
> On Fri, 02 Nov 2018 06:09:39 PDT (-0700), [email protected] wrote:
> > On Thu, Nov 1, 2018 at 6:04 PM Atish Patra <[email protected]> wrote:
> >>
> >> Define a RISC-V cpu topology. This is based on cpu-map in ARM world.
> >> But it doesn't need a separate thread node for defining SMT systems.
> >> Multiple cpu phandle properties can be parsed to identify the sibling
> >> hardware threads. Moreover, we do not have cluster concept in RISC-V.
> >> So package is a better word choice than cluster for RISC-V.
> >
> > There was a proposal to add package info for ARM recently. Not sure
> > what happened to that, but we don't need 2 different ways.
> >
> > There's never going to be clusters for RISC-V? What prevents that?
> > Seems shortsighted to me.
> >
> >>
> >> Signed-off-by: Atish Patra <[email protected]>
> >> ---
> >> .../devicetree/bindings/riscv/topology.txt | 154 +++++++++++++++++++++
> >> 1 file changed, 154 insertions(+)
> >> create mode 100644 Documentation/devicetree/bindings/riscv/topology.txt
> >>
> >> diff --git a/Documentation/devicetree/bindings/riscv/topology.txt b/Documentation/devicetree/bindings/riscv/topology.txt
> >> new file mode 100644
> >> index 00000000..96039ed3
> >> --- /dev/null
> >> +++ b/Documentation/devicetree/bindings/riscv/topology.txt
> >> @@ -0,0 +1,154 @@
> >> +===========================================
> >> +RISC-V cpu topology binding description
> >> +===========================================
> >> +
> >> +===========================================
> >> +1 - Introduction
> >> +===========================================
> >> +
> >> +In a RISC-V system, the hierarchy of CPUs can be defined through following nodes that
> >> +are used to describe the layout of physical CPUs in the system:
> >> +
> >> +- packages
> >> +- core
> >> +
> >> +The cpu nodes (bindings defined in [1]) represent the devices that
> >> +correspond to physical CPUs and are to be mapped to the hierarchy levels.
> >> +Simultaneous multi-threading (SMT) systems can also represent their topology
> >> +by defining multiple cpu phandles inside core node. The details are explained
> >> +in paragraph 3.
> >
> > I don't see a reason to do this differently than ARM. That said, I
> > don't think the thread part is in use on ARM, so it could possibly be
> > changed.
> >
> >> +
> >> +The remainder of this document provides the topology bindings for ARM, based
> >
> > for ARM?
> >
> >> +on the Devicetree Specification, available from:
> >> +
> >> +https://www.devicetree.org/specifications/
> >> +
> >> +If not stated otherwise, whenever a reference to a cpu node phandle is made its
> >> +value must point to a cpu node compliant with the cpu node bindings as
> >> +documented in [1].
> >> +A topology description containing phandles to cpu nodes that are not compliant
> >> +with bindings standardized in [1] is therefore considered invalid.
> >> +
> >> +This cpu topology binding description is mostly based on the topology defined
> >> +in ARM [2].
> >> +===========================================
> >> +2 - cpu-topology node
> >
> > cpu-map. Why change this?
> >
> > What I would like to see is the ARM topology binding reworked to be
> > common or some good reasons why it doesn't work for RISC-V as-is.
>
> I think it would be great if CPU topologies were not a RISC-V specific thing.
> We don't really do anything different than anyone else, so it'd be great if we
> could all share the same spec and code. Looking quickly at the ARM cpu-map
> bindings, I don't see any reason why we can't just use the same thing on RISC-V
> -- it's not quite how I'd do it, but I don't think the differences are worth
> having another implementation. Mechanically I'm not sure how to do this:
> should there just be a "Documentation/devicetree/bindings/cpu-map.txt"?
Yes, but ".../bindings/cpu/cpu-topology.txt".
And if we need $arch extensions, they can be moved there. (Really, I'd
like to get rid of /bindings/$arch/* except for maybe a few things.)
> If everyone is OK with that then I vote we just go ahead and genericise the ARM
> "cpu-map" stuff for CPU topology. Sharing the implementation looks fairly
> straight-forward as well.
On 11/5/18 12:11 PM, Rob Herring wrote:
> On Mon, Nov 5, 2018 at 1:39 PM Palmer Dabbelt <[email protected]> wrote:
>>
>> On Fri, 02 Nov 2018 06:09:39 PDT (-0700), [email protected] wrote:
>>> On Thu, Nov 1, 2018 at 6:04 PM Atish Patra <[email protected]> wrote:
>>>>
>>>> Define a RISC-V cpu topology. This is based on cpu-map in ARM world.
>>>> But it doesn't need a separate thread node for defining SMT systems.
>>>> Multiple cpu phandle properties can be parsed to identify the sibling
>>>> hardware threads. Moreover, we do not have cluster concept in RISC-V.
>>>> So package is a better word choice than cluster for RISC-V.
>>>
>>> There was a proposal to add package info for ARM recently. Not sure
>>> what happened to that, but we don't need 2 different ways.
>>>
>>> There's never going to be clusters for RISC-V? What prevents that?
>>> Seems shortsighted to me.
>>>
>>>>
>>>> Signed-off-by: Atish Patra <[email protected]>
>>>> ---
>>>> .../devicetree/bindings/riscv/topology.txt | 154 +++++++++++++++++++++
>>>> 1 file changed, 154 insertions(+)
>>>> create mode 100644 Documentation/devicetree/bindings/riscv/topology.txt
>>>>
>>>> diff --git a/Documentation/devicetree/bindings/riscv/topology.txt b/Documentation/devicetree/bindings/riscv/topology.txt
>>>> new file mode 100644
>>>> index 00000000..96039ed3
>>>> --- /dev/null
>>>> +++ b/Documentation/devicetree/bindings/riscv/topology.txt
>>>> @@ -0,0 +1,154 @@
>>>> +===========================================
>>>> +RISC-V cpu topology binding description
>>>> +===========================================
>>>> +
>>>> +===========================================
>>>> +1 - Introduction
>>>> +===========================================
>>>> +
>>>> +In a RISC-V system, the hierarchy of CPUs can be defined through following nodes that
>>>> +are used to describe the layout of physical CPUs in the system:
>>>> +
>>>> +- packages
>>>> +- core
>>>> +
>>>> +The cpu nodes (bindings defined in [1]) represent the devices that
>>>> +correspond to physical CPUs and are to be mapped to the hierarchy levels.
>>>> +Simultaneous multi-threading (SMT) systems can also represent their topology
>>>> +by defining multiple cpu phandles inside core node. The details are explained
>>>> +in paragraph 3.
>>>
>>> I don't see a reason to do this differently than ARM. That said, I
>>> don't think the thread part is in use on ARM, so it could possibly be
>>> changed.
>>>
>>>> +
>>>> +The remainder of this document provides the topology bindings for ARM, based
>>>
>>> for ARM?
>>>
>>>> +on the Devicetree Specification, available from:
>>>> +
>>>> +https://www.devicetree.org/specifications/
>>>> +
>>>> +If not stated otherwise, whenever a reference to a cpu node phandle is made its
>>>> +value must point to a cpu node compliant with the cpu node bindings as
>>>> +documented in [1].
>>>> +A topology description containing phandles to cpu nodes that are not compliant
>>>> +with bindings standardized in [1] is therefore considered invalid.
>>>> +
>>>> +This cpu topology binding description is mostly based on the topology defined
>>>> +in ARM [2].
>>>> +===========================================
>>>> +2 - cpu-topology node
>>>
>>> cpu-map. Why change this?
>>>
>>> What I would like to see is the ARM topology binding reworked to be
>>> common or some good reasons why it doesn't work for RISC-V as-is.
>>
>> I think it would be great if CPU topologies were not a RISC-V specific thing.
>> We don't really do anything different than anyone else, so it'd be great if we
>> could all share the same spec and code. Looking quickly at the ARM cpu-map
>> bindings, I don't see any reason why we can't just use the same thing on RISC-V
>> -- it's not quite how I'd do it, but I don't think the differences are worth
>> having another implementation. Mechanically I'm not sure how to do this:
>> should there just be a "Documentation/devicetree/bindings/cpu-map.txt"?
>
> Yes, but ".../bindings/cpu/cpu-topology.txt".
>
> And if we need $arch extensions, they can be moved there. (Really, I'd
> like to get rid of /bindings/$arch/* except for maybe a few things.)
>
>> If everyone is OK with that then I vote we just go ahead and genericise the ARM
>> "cpu-map" stuff for CPU topology. Sharing the implementation looks fairly
>> straight-forward as well.
>
+1 for a common code base. I am happy to take it up if nobody else has
already started working on it.
Is there an ARM hardware test farm that can be used to test such changes?
Regards,
Atish
Στις 2018-11-05 21:38, Palmer Dabbelt έγραψε:
> On Fri, 02 Nov 2018 06:09:39 PDT (-0700), [email protected] wrote:
>> On Thu, Nov 1, 2018 at 6:04 PM Atish Patra <[email protected]>
>> wrote:
>>>
>>> Define a RISC-V cpu topology. This is based on cpu-map in ARM world.
>>> But it doesn't need a separate thread node for defining SMT systems.
>>> Multiple cpu phandle properties can be parsed to identify the sibling
>>> hardware threads. Moreover, we do not have cluster concept in RISC-V.
>>> So package is a better word choice than cluster for RISC-V.
>>
>> There was a proposal to add package info for ARM recently. Not sure
>> what happened to that, but we don't need 2 different ways.
>>
>> There's never going to be clusters for RISC-V? What prevents that?
>> Seems shortsighted to me.
>>
>>>
>>> Signed-off-by: Atish Patra <[email protected]>
>>> ---
>>> .../devicetree/bindings/riscv/topology.txt | 154
>>> +++++++++++++++++++++
>>> 1 file changed, 154 insertions(+)
>>> create mode 100644
>>> Documentation/devicetree/bindings/riscv/topology.txt
>>>
>>> diff --git a/Documentation/devicetree/bindings/riscv/topology.txt
>>> b/Documentation/devicetree/bindings/riscv/topology.txt
>>> new file mode 100644
>>> index 00000000..96039ed3
>>> --- /dev/null
>>> +++ b/Documentation/devicetree/bindings/riscv/topology.txt
>>> @@ -0,0 +1,154 @@
>>> +===========================================
>>> +RISC-V cpu topology binding description
>>> +===========================================
>>> +
>>> +===========================================
>>> +1 - Introduction
>>> +===========================================
>>> +
>>> +In a RISC-V system, the hierarchy of CPUs can be defined through
>>> following nodes that
>>> +are used to describe the layout of physical CPUs in the system:
>>> +
>>> +- packages
>>> +- core
>>> +
>>> +The cpu nodes (bindings defined in [1]) represent the devices that
>>> +correspond to physical CPUs and are to be mapped to the hierarchy
>>> levels.
>>> +Simultaneous multi-threading (SMT) systems can also represent their
>>> topology
>>> +by defining multiple cpu phandles inside core node. The details are
>>> explained
>>> +in paragraph 3.
>>
>> I don't see a reason to do this differently than ARM. That said, I
>> don't think the thread part is in use on ARM, so it could possibly be
>> changed.
>>
>>> +
>>> +The remainder of this document provides the topology bindings for
>>> ARM, based
>>
>> for ARM?
>>
>>> +on the Devicetree Specification, available from:
>>> +
>>> +https://www.devicetree.org/specifications/
>>> +
>>> +If not stated otherwise, whenever a reference to a cpu node phandle
>>> is made its
>>> +value must point to a cpu node compliant with the cpu node bindings
>>> as
>>> +documented in [1].
>>> +A topology description containing phandles to cpu nodes that are not
>>> compliant
>>> +with bindings standardized in [1] is therefore considered invalid.
>>> +
>>> +This cpu topology binding description is mostly based on the
>>> topology defined
>>> +in ARM [2].
>>> +===========================================
>>> +2 - cpu-topology node
>>
>> cpu-map. Why change this?
>>
>> What I would like to see is the ARM topology binding reworked to be
>> common or some good reasons why it doesn't work for RISC-V as-is.
>
> I think it would be great if CPU topologies were not a RISC-V specific
> thing. We don't really do anything different than anyone else, so
> it'd be great if we could all share the same spec and code. Looking
> quickly at the ARM cpu-map bindings, I don't see any reason why we
> can't just use the same thing on RISC-V -- it's not quite how I'd do
> it, but I don't think the differences are worth having another
> implementation. Mechanically I'm not sure how to do this: should
> there just be a "Documentation/devicetree/bindings/cpu-map.txt"?
>
> If everyone is OK with that then I vote we just go ahead and
> genericise the ARM "cpu-map" stuff for CPU topology. Sharing the
> implementation looks fairly straight-forward as well.
>
Please check this out...
https://lkml.org/lkml/2018/11/3/99
It's also arch-independent and it can handle the scheduler's capabilities
better than cpu-map.
On Tue, Nov 06, 2018 at 12:03:17PM +0200, Nick Kossifidis wrote:
> On 2018-11-05 21:38, Palmer Dabbelt wrote:
> > On Fri, 02 Nov 2018 06:09:39 PDT (-0700), [email protected] wrote:
> > > On Thu, Nov 1, 2018 at 6:04 PM Atish Patra <[email protected]>
> > > wrote:
> > > >
> > > > Define a RISC-V cpu topology. This is based on cpu-map in ARM world.
> > > > But it doesn't need a separate thread node for defining SMT systems.
> > > > Multiple cpu phandle properties can be parsed to identify the sibling
> > > > hardware threads. Moreover, we do not have cluster concept in RISC-V.
> > > > So package is a better word choice than cluster for RISC-V.
> > >
> > > There was a proposal to add package info for ARM recently. Not sure
> > > what happened to that, but we don't need 2 different ways.
> > >
> > > There's never going to be clusters for RISC-V? What prevents that?
> > > Seems shortsighted to me.
> > >
> > > >
> > > > Signed-off-by: Atish Patra <[email protected]>
> > > > ---
> > > > .../devicetree/bindings/riscv/topology.txt | 154
> > > > +++++++++++++++++++++
> > > > 1 file changed, 154 insertions(+)
> > > > create mode 100644
> > > > Documentation/devicetree/bindings/riscv/topology.txt
> > > >
> > > > diff --git
> > > > a/Documentation/devicetree/bindings/riscv/topology.txt
> > > > b/Documentation/devicetree/bindings/riscv/topology.txt
> > > > new file mode 100644
> > > > index 00000000..96039ed3
> > > > --- /dev/null
> > > > +++ b/Documentation/devicetree/bindings/riscv/topology.txt
> > > > @@ -0,0 +1,154 @@
> > > > +===========================================
> > > > +RISC-V cpu topology binding description
> > > > +===========================================
> > > > +
> > > > +===========================================
> > > > +1 - Introduction
> > > > +===========================================
> > > > +
> > > > +In a RISC-V system, the hierarchy of CPUs can be defined
> > > > through following nodes that
> > > > +are used to describe the layout of physical CPUs in the system:
> > > > +
> > > > +- packages
> > > > +- core
> > > > +
> > > > +The cpu nodes (bindings defined in [1]) represent the devices that
> > > > +correspond to physical CPUs and are to be mapped to the
> > > > hierarchy levels.
> > > > +Simultaneous multi-threading (SMT) systems can also represent
> > > > their topology
> > > > +by defining multiple cpu phandles inside core node. The details
> > > > are explained
> > > > +in paragraph 3.
> > >
> > > I don't see a reason to do this differently than ARM. That said, I
> > > don't think the thread part is in use on ARM, so it could possibly be
> > > changed.
> > >
> > > > +
> > > > +The remainder of this document provides the topology bindings
> > > > for ARM, based
> > >
> > > for ARM?
> > >
> > > > +on the Devicetree Specification, available from:
> > > > +
> > > > +https://www.devicetree.org/specifications/
> > > > +
> > > > +If not stated otherwise, whenever a reference to a cpu node
> > > > phandle is made its
> > > > +value must point to a cpu node compliant with the cpu node
> > > > bindings as
> > > > +documented in [1].
> > > > +A topology description containing phandles to cpu nodes that
> > > > are not compliant
> > > > +with bindings standardized in [1] is therefore considered invalid.
> > > > +
> > > > +This cpu topology binding description is mostly based on the
> > > > topology defined
> > > > +in ARM [2].
> > > > +===========================================
> > > > +2 - cpu-topology node
> > >
> > > cpu-map. Why change this?
> > >
> > > What I would like to see is the ARM topology binding reworked to be
> > > common or some good reasons why it doesn't work for RISC-V as-is.
> >
> > I think it would be great if CPU topologies were not a RISC-V specific
> > thing. We don't really do anything different than anyone else, so
> > it'd be great if we could all share the same spec and code. Looking
> > quickly at the ARM cpu-map bindings, I don't see any reason why we
> > can't just use the same thing on RISC-V -- it's not quite how I'd do
> > it, but I don't think the differences are worth having another
> > implementation. Mechanically I'm not sure how to do this: should
> > there just be a "Documentation/devicetree/bindings/cpu-map.txt"?
> >
> > If everyone is OK with that then I vote we just go ahead and
> > genericise the ARM "cpu-map" stuff for CPU topology. Sharing the
> > implementation looks fairly straight-forward as well.
> >
>
> Please check this out...
> https://lkml.org/lkml/2018/11/3/99
>
> It's also non arch-dependent and it can handle the scheduler's capabilities
> better than cpu-map.
Could you elaborate on what this does better than cpu-map? I'd be
curious to see if we could/should augment cpu-map to cater for those
details.
Generally, we deliberately avoid placing software abstractions in the DT
(e.g. scheduler domains), because the design of these abstractions is
specific to a given software stack rather than the hardware, and even in
the context of that specific software stack these abstractions change
over time.
Placing software abstractions in the DT may seem simpler initially, but
longer term it ends up requiring translation anyway as abstractions
change, and typically it involves placing some policy in the DT, which
restricts the ability of software to do something better.
Having some translation between the DT and the Linux-internal
abstraction is how we expect things to work generally, and allows for a
more natural representation in the DT (which may be more or less
powerful than the current Linux abstraction).
Thanks,
Mark.
On 2018-11-06 16:13, Sudeep Holla wrote:
> On Fri, Nov 02, 2018 at 08:58:39PM +0200, Nick Kossifidis wrote:
>> Hello All,
>>
>> On 2018-11-02 01:04, Atish Patra wrote:
>> > This patch series adds the cpu topology for RISC-V. It contains
>> > both the DT binding and actual source code. It has been tested on
>> > QEMU & Unleashed board.
>> >
>> > The idea is based on cpu-map in ARM with changes related to how
>> > we define SMT systems. The reason for adopting a similar approach
>> > to ARM as I feel it provides a very clear way of defining the
>> > topology compared to parsing cache nodes to figure out which cpus
>> > share the same package or core. I am open to any other idea to
>> > implement cpu-topology as well.
>> >
>>
>> I was also about to start a discussion about CPU topology on RISC-V
>> after the last swtools group meeting. The goal is to provide the
>> scheduler with hints on how to distribute tasks more efficiently
>> between harts, by populating the scheduling domain topology levels
>> (https://elixir.bootlin.com/linux/v4.19/ident/sched_domain_topology_level).
>> What we want to do is define cpu groups and assign them to
>> scheduling domains with the appropriate SD_ flags
>> (https://github.com/torvalds/linux/blob/master/include/linux/sched/topology.h#L16).
>>
>
> OK are we defining a CPU topology binding for Linux scheduler ?
> NACK for all the approaches that assumes any knowledge of OS scheduler.
>
Is there any standard regarding CPU topology in the device tree spec? As
far as I know there is none. We are talking about a Linux-specific Device
Tree binding, so I don't see why defining a binding for the Linux scheduler
is out of scope. Do you have cpu-map on other OSes as well?
>> So the cores that belong to a scheduling domain may share:
>> CPU capacity (SD_SHARE_CPUCAPACITY / SD_ASYM_CPUCAPACITY)
>> Package resources -e.g. caches, units etc- (SD_SHARE_PKG_RESOURCES)
>> Power domain (SD_SHARE_POWERDOMAIN)
>>
>
> Too Linux kernel/scheduler specific to be part of $subject
>
All lists on the cc list are Linux specific, so again I don't see your
point here: are we talking about defining a standard CPU topology scheme
for the device tree spec, or a Linux-specific CPU topology binding such as
cpu-map?
Even in this case your point is not valid: the information that two harts
share a common power domain, or do or don't have the same capacity/max
frequency (or maybe capabilities/extensions in the future), is not Linux
specific. I just used the Linux-specific macros used by the Linux scheduler
to point out the code path.
Even on other OSes we still need a way to include this information in the
CPU topology, and currently cpu-map doesn't. Also, the Linux implementation
of cpu-map ignores multiple levels of shared resources; we only get one
level for SMT and one level for MC, last time I checked.
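For reference, those two levels come from a scheduler topology table that
looks roughly like the kernel's generic default below (a sketch only; the
mask/flag helpers come from <linux/topology.h> and <linux/sched/topology.h>,
cpu_coregroup_mask() is provided by arch code, and the function name here
is made up):

#include <linux/init.h>
#include <linux/sched/topology.h>
#include <linux/topology.h>

/*
 * One SMT level, one MC level and one package-wide level, regardless of
 * how deeply the clusters in the DT are nested.
 */
static struct sched_domain_topology_level example_topology[] = {
#ifdef CONFIG_SCHED_SMT
	{ cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
#endif
#ifdef CONFIG_SCHED_MC
	{ cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
#endif
	{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
	{ NULL, },
};

static void __init example_set_sched_topology(void)
{
	/* Replace the scheduler's default table with the one above. */
	set_sched_topology(example_topology);
}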
>> In this context I believe using words like "core", "package",
>> "socket" etc can be misleading. For example the sample topology you
>> use on the documentation says that there are 4 cores that are part
>> of a package, however "package" has a different meaning to the
>> scheduler. Also we don't say anything in case they share a power
>> domain or if they have the same capacity or not. This mapping deals
>> only with cache hierarchy or other shared resources.
>>
>
> {Un,}fortunately those are terms used by hardware people.
>
And they are wrong: how the harts are physically packed doesn't imply
their actual topology. In general the "translation" is not always
straightforward; there are assumptions in place. We could use a "cluster"
of harts or a "group" of harts instead, they are more abstract.
Regards,
Nick
On Tue, Nov 06, 2018 at 05:26:01PM +0200, Nick Kossifidis wrote:
> On 2018-11-06 16:13, Sudeep Holla wrote:
> > On Fri, Nov 02, 2018 at 08:58:39PM +0200, Nick Kossifidis wrote:
> > > Hello All,
> > >
> > > On 2018-11-02 01:04, Atish Patra wrote:
> > > > This patch series adds the cpu topology for RISC-V. It contains
> > > > both the DT binding and actual source code. It has been tested on
> > > > QEMU & Unleashed board.
> > > >
> > > > The idea is based on cpu-map in ARM with changes related to how
> > > > we define SMT systems. The reason for adopting a similar approach
> > > > to ARM as I feel it provides a very clear way of defining the
> > > > topology compared to parsing cache nodes to figure out which cpus
> > > > share the same package or core. I am open to any other idea to
> > > > implement cpu-topology as well.
> > > >
> > >
> > > I was also about to start a discussion about CPU topology on RISC-V
> > > after the last swtools group meeting. The goal is to provide the
> > > scheduler with hints on how to distribute tasks more efficiently
> > > between harts, by populating the scheduling domain topology levels
> > > (https://elixir.bootlin.com/linux/v4.19/ident/sched_domain_topology_level).
> > > What we want to do is define cpu groups and assign them to
> > > scheduling domains with the appropriate SD_ flags
> > > (https://github.com/torvalds/linux/blob/master/include/linux/sched/topology.h#L16).
> > >
> >
> > OK are we defining a CPU topology binding for Linux scheduler ?
> > NACK for all the approaches that assumes any knowledge of OS scheduler.
> >
>
> Is there any standard regarding CPU topology on the device tree spec ? As
> far as I know there is none. We are talking about a Linux-specific Device
> Tree binding so I don't see why defining a binding for the Linux scheduler
> is out of scope.
There may not be much on CPU topology in the device tree spec, but that
doesn't mean we are defining something Linux specific here just because a
bunch of Linux kernel mailing lists are cc-ed. We do have a dedicated
device tree ML for a reason.
> Do you have cpu-map on other OSes as well ?
>
Nothing prevents them from using it. I have seen an increase in the number
of projects relying on DT these days.
> > > So the cores that belong to a scheduling domain may share:
> > > CPU capacity (SD_SHARE_CPUCAPACITY / SD_ASYM_CPUCAPACITY)
> > > Package resources -e.g. caches, units etc- (SD_SHARE_PKG_RESOURCES)
> > > Power domain (SD_SHARE_POWERDOMAIN)
> > >
> >
> > Too Linux kernel/scheduler specific to be part of $subject
> >
>
> All lists on the cc list are Linux specific, again I don't see your point
> here are we talking about defining a standard CPU topology scheme for the
> device tree spec or a Linux-specific CPU topology binding such as cpu-map ?
>
See above.
> Even on this case your point is not valid, the information of two harts
> sharing a common power domain or having the same or not capacity/max
> frequency (or maybe capabilities/extensions in the future), is not Linux
> specific. I just used the Linux specific macros used by the Linux scheduler
> to point out the code path.
The CPU topology can be different from the frequency or the power domains,
and we do have specific bindings to provide that information. So let's
try to keep that out of this discussion.
> Even on other OSes we still need a way to include this information on the
> CPU topology, and currently cpu-map doesn't. Also the Linux implementation
> of cpu-map ignores multiple levels of shared resources, we only get one
> level for SMT and one level for MC last time I checked.
>
But that doesn't make it any easier if you influence the bindings based on
the Linux scheduler. So just define how the hardware is and allow each OS
to choose its own way to utilise that information. That's how most of the
generic DT bindings are defined.
> > > In this context I believe using words like "core", "package",
> > > "socket" etc can be misleading. For example the sample topology you
> > > use on the documentation says that there are 4 cores that are part
> > > of a package, however "package" has a different meaning to the
> > > scheduler. Also we don't say anything in case they share a power
> > > domain or if they have the same capacity or not. This mapping deals
> > > only with cache hierarchy or other shared resources.
> > >
> >
> > {Un,}fortunately those are terms used by hardware people.
> >
>
> And they are wrong, how the harts are physically packed doesn't imply
> their actual topology. In general the "translation" is not always straight
> forward, there are assumptions in place. We could use "cluster" of harts or
> "group" of harts instead, they are more abstract.
>
Indeed. I agree those terminologies may not be the best, but they are the
ones already in use. We need to map onto those generic ones, though the
translations may not be simple. We have the same issues on ARM. If we try
to build such information into the DT, then it becomes more of a
configuration file for the OS than a platform description IMO.
--
Regards,
Sudeep
On Tue, Nov 06, 2018 at 05:26:01PM +0200, Nick Kossifidis wrote:
> On 2018-11-06 16:13, Sudeep Holla wrote:
> > On Fri, Nov 02, 2018 at 08:58:39PM +0200, Nick Kossifidis wrote:
> > > On 2018-11-02 01:04, Atish Patra wrote:
> > > > This patch series adds the cpu topology for RISC-V. It contains
> > > > both the DT binding and actual source code. It has been tested on
> > > > QEMU & Unleashed board.
> > > >
> > > > The idea is based on cpu-map in ARM with changes related to how
> > > > we define SMT systems. The reason for adopting a similar approach
> > > > to ARM as I feel it provides a very clear way of defining the
> > > > topology compared to parsing cache nodes to figure out which cpus
> > > > share the same package or core. I am open to any other idea to
> > > > implement cpu-topology as well.
> > >
> > > I was also about to start a discussion about CPU topology on RISC-V
> > > after the last swtools group meeting. The goal is to provide the
> > > scheduler with hints on how to distribute tasks more efficiently
> > > between harts, by populating the scheduling domain topology levels
> > > (https://elixir.bootlin.com/linux/v4.19/ident/sched_domain_topology_level).
> > > What we want to do is define cpu groups and assign them to
> > > scheduling domains with the appropriate SD_ flags
> > > (https://github.com/torvalds/linux/blob/master/include/linux/sched/topology.h#L16).
> >
> > OK are we defining a CPU topology binding for Linux scheduler ?
> > NACK for all the approaches that assumes any knowledge of OS scheduler.
>
> Is there any standard regarding CPU topology on the device tree spec ?
> As far as I know there is none. We are talking about a Linux-specific
> Device Tree binding so I don't see why defining a binding for the
> Linux scheduler is out of scope.
Speaking as a DT binding maintainer, please avoid OS-specific DT
bindings wherever possible.
While DT bindings live in the kernel tree, they are not intended to be
Linux-specific, and other OSs (e.g. FreeBSD, zephyr) are aiming to
support the same bindings.
In general, targeting a specific OS is a bad idea, because the
implementation details of that OS change over time, or the bindings end
up being tailored to one specific use-case. Exposing details to the OS
such that the OS can make decisions at runtime is typically better.
> Do you have cpu-map on other OSes as well ?
There is nothing OS-specific about cpu-map, and it may be of use to
other OSs.
> > > So the cores that belong to a scheduling domain may share:
> > > CPU capacity (SD_SHARE_CPUCAPACITY / SD_ASYM_CPUCAPACITY)
> > > Package resources -e.g. caches, units etc- (SD_SHARE_PKG_RESOURCES)
> > > Power domain (SD_SHARE_POWERDOMAIN)
> > >
> >
> > Too Linux kernel/scheduler specific to be part of $subject
>
> All lists on the cc list are Linux specific, again I don't see your
> point here are we talking about defining a standard CPU topology
> scheme for the device tree spec or a Linux-specific CPU topology
> binding such as cpu-map ?
The cpu-map binding is not intended to be Linux specific, and avoids
Linux-specific terminology.
While the cpu-map binding documentation is in the Linux source tree, the
binding itself is not intended to be Linux-specific, and it deliberately
avoids Linux implementation details.
> Even on this case your point is not valid, the information of two
> harts sharing a common power domain or having the same or not
> capacity/max frequency (or maybe capabilities/extensions in the
> future), is not Linux specific. I just used the Linux specific macros
> used by the Linux scheduler to point out the code path. Even on other
> OSes we still need a way to include this information on the CPU
> topology, and currently cpu-map doesn't. Also the Linux implementation
> of cpu-map ignores multiple levels of shared resources, we only get
> one level for SMT and one level for MC last time I checked.
Given clusters can be nested, as in the very first example, I don't see
what prevents multiple levels of shared resources.
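For reference, the existing binding already allows nesting along these
lines (an illustrative sketch only; the CPU labels and the depth of
nesting are made up):

cpus {
    #address-cells = <1>;
    #size-cells = <0>;

    cpu-map {
        cluster0 {                      /* e.g. a package */
            cluster0 {                  /* a cluster inside it */
                core0 {
                    thread0 { cpu = <&CPU0>; };
                    thread1 { cpu = <&CPU1>; };
                };
                core1 { cpu = <&CPU2>; };
            };
            cluster1 {
                core0 { cpu = <&CPU3>; };
            };
        };
    };

    CPU0: cpu@0 { /* ... */ };
    CPU1: cpu@1 { /* ... */ };
    CPU2: cpu@2 { /* ... */ };
    CPU3: cpu@3 { /* ... */ };
};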
Can you please give an example of the topology you're considering? Does
that share some resources across clusters at some level?
We are certainly open to improving the cpu-map binding.
Thanks,
Mark.
On Fri, Nov 02, 2018 at 08:58:39PM +0200, Nick Kossifidis wrote:
> Hello All,
>
> On 2018-11-02 01:04, Atish Patra wrote:
> > This patch series adds the cpu topology for RISC-V. It contains
> > both the DT binding and actual source code. It has been tested on
> > QEMU & Unleashed board.
> >
> > The idea is based on cpu-map in ARM with changes related to how
> > we define SMT systems. The reason for adopting a similar approach
> > to ARM as I feel it provides a very clear way of defining the
> > topology compared to parsing cache nodes to figure out which cpus
> > share the same package or core. I am open to any other idea to
> > implement cpu-topology as well.
> >
>
> I was also about to start a discussion about CPU topology on RISC-V
> after the last swtools group meeting. The goal is to provide the
> scheduler with hints on how to distribute tasks more efficiently
> between harts, by populating the scheduling domain topology levels
> (https://elixir.bootlin.com/linux/v4.19/ident/sched_domain_topology_level).
> What we want to do is define cpu groups and assign them to
> scheduling domains with the appropriate SD_ flags
> (https://github.com/torvalds/linux/blob/master/include/linux/sched/topology.h#L16).
>
OK, are we defining a CPU topology binding for the Linux scheduler?
NACK for all the approaches that assume any knowledge of the OS scheduler.
> So the cores that belong to a scheduling domain may share:
> CPU capacity (SD_SHARE_CPUCAPACITY / SD_ASYM_CPUCAPACITY)
> Package resources -e.g. caches, units etc- (SD_SHARE_PKG_RESOURCES)
> Power domain (SD_SHARE_POWERDOMAIN)
>
Too Linux kernel/scheduler specific to be part of $subject
> In this context I believe using words like "core", "package",
> "socket" etc can be misleading. For example the sample topology you
> use on the documentation says that there are 4 cores that are part
> of a package, however "package" has a different meaning to the
> scheduler. Also we don't say anything in case they share a power
> domain or if they have the same capacity or not. This mapping deals
> only with cache hierarchy or other shared resources.
>
{Un,}fortunately those are terms used by hardware people.
> How about defining a dt scheme to describe the scheduler domain
> topology levels instead ? e.g:
>
NACK as already mentioned above.
--
Regards,
Sudeep
On 2018-11-06 18:20, Mark Rutland wrote:
> On Tue, Nov 06, 2018 at 05:26:01PM +0200, Nick Kossifidis wrote:
>> On 2018-11-06 16:13, Sudeep Holla wrote:
>> > On Fri, Nov 02, 2018 at 08:58:39PM +0200, Nick Kossifidis wrote:
>> > > On 2018-11-02 01:04, Atish Patra wrote:
>> > > > This patch series adds the cpu topology for RISC-V. It contains
>> > > > both the DT binding and actual source code. It has been tested on
>> > > > QEMU & Unleashed board.
>> > > >
>> > > > The idea is based on cpu-map in ARM with changes related to how
>> > > > we define SMT systems. The reason for adopting a similar approach
>> > > > to ARM as I feel it provides a very clear way of defining the
>> > > > topology compared to parsing cache nodes to figure out which cpus
>> > > > share the same package or core. I am open to any other idea to
>> > > > implement cpu-topology as well.
>> > >
>> > > I was also about to start a discussion about CPU topology on RISC-V
>> > > after the last swtools group meeting. The goal is to provide the
>> > > scheduler with hints on how to distribute tasks more efficiently
>> > > between harts, by populating the scheduling domain topology levels
>> > > (https://elixir.bootlin.com/linux/v4.19/ident/sched_domain_topology_level).
>> > > What we want to do is define cpu groups and assign them to
>> > > scheduling domains with the appropriate SD_ flags
>> > > (https://github.com/torvalds/linux/blob/master/include/linux/sched/topology.h#L16).
>> >
>> > OK are we defining a CPU topology binding for Linux scheduler ?
>> > NACK for all the approaches that assumes any knowledge of OS scheduler.
>>
>> Is there any standard regarding CPU topology on the device tree spec ?
>> As far as I know there is none. We are talking about a Linux-specific
>> Device Tree binding so I don't see why defining a binding for the
>> Linux scheduler is out of scope.
>
> Speaking as a DT binding maintainer, please avoid OS-specific DT
> bindings wherever possible.
>
> While DT bindings live in the kernel tree, they are not intended to be
> Linux-specific, and other OSs (e.g. FreeBSD, zephyr) are aiming to
> support the same bindings.
>
> In general, targeting a specific OS is a bad idea, because the
> implementation details of that OS change over time, or the bindings end
> up being tailored to one specific use-case. Exposing details to the OS
> such that the OS can make decisions at runtime is typically better.
>
>> Do you have cpu-map on other OSes as well ?
>
> There is nothing OS-specific about cpu-map, and it may be of use to
> other OSs.
>
>> > > So the cores that belong to a scheduling domain may share:
>> > > CPU capacity (SD_SHARE_CPUCAPACITY / SD_ASYM_CPUCAPACITY)
>> > > Package resources -e.g. caches, units etc- (SD_SHARE_PKG_RESOURCES)
>> > > Power domain (SD_SHARE_POWERDOMAIN)
>> > >
>> >
>> > Too Linux kernel/scheduler specific to be part of $subject
>>
>> All lists on the cc list are Linux specific, again I don't see your
>> point here are we talking about defining a standard CPU topology
>> scheme for the device tree spec or a Linux-specific CPU topology
>> binding such as cpu-map ?
>
> The cpu-map binding is not intended to be Linux specific, and avoids
> Linux-specific terminology.
>
> While the cpu-map binding documentation is in the Linux source tree,
> the
> binding itseld is not intended to be Linux-specific, and it
> deliberately
> avoids Linux implementation details.
>
>> Even on this case your point is not valid, the information of two
>> harts sharing a common power domain or having the same or not
>> capacity/max frequency (or maybe capabilities/extensions in the
>> future), is not Linux specific. I just used the Linux specific macros
>> used by the Linux scheduler to point out the code path. Even on other
>> OSes we still need a way to include this information on the CPU
>> topology, and currently cpu-map doesn't. Also the Linux implementation
>> of cpu-map ignores multiple levels of shared resources, we only get
>> one level for SMT and one level for MC last time I checked.
>
> Given clusters can be nested, as in the very first example, I don't see
> what prevents multiple levels of shared resources.
>
> Can you please given an example of the topology your considering? Does
> that share some resources across clusters at some level?
>
> We are certainly open to improving the cpu-map binding.
>
> Thanks,
> Mark.
Mark and Sudeep, thanks a lot for your feedback. I guess you convinced me
that having a device tree binding for the scheduler is not a correct
approach; it's not a device after all, and I agree that the device tree
shouldn't become an OS configuration file. Regarding multiple levels of
shared resources, my point is that since cpu-map doesn't contain any
information about what is shared among the cluster/core members, it's not
easy to do any further translation. Last time I checked the arm code that
uses cpu-map, it only defines one domain for SMT, one for MC, and then
everything else is ignored. No matter how many clusters have been defined,
anything above the core level is the same (and then I guess you started
talking about adding "packages" on the representation side).
The reason I proposed to have a binding for the scheduler directly is not
only because it's simpler and closer to what really happens in the code; it
also makes more sense to me than the combination of cpu-map with all the
related mappings, e.g. for numa or caches or power domains.
However, you are right that we could definitely augment cpu-map to include
support for what I'm saying and clean things up. Since you are open to
improving it, here is a proposal that I hope you find interesting:
At first let's get rid of the <thread> nodes, they don't make sense:
thread0 {
    cpu = <&CPU0>;
};
A thread node can't have more than one cpu entry and any properties
should be on the cpu node itself, so it doesn't / can't add any
more information. We could just have an array of cpu nodes on the
<core> node, it's much cleaner this way.
core0 {
    members = <&CPU0>, <&CPU1>;
};
Then let's allow the cluster and core nodes to accept attributes that are
common for the cpus they contain. Right now this is considered invalid.
For power domains we have a generic binding described in
Documentation/devicetree/bindings/power/power_domain.txt, which basically
says that we need to put a power-domains = <power domain specifiers>
attribute on each of the cpu nodes.
The same happens with the capacity binding specified for arm in
Documentation/devicetree/bindings/arm/cpu-capacity.txt, which says we
should add capacity-dmips-mhz on each of the cpu nodes.
The same also happens with the generic numa binding in
Documentation/devicetree/bindings/numa.txt, which says we should add
numa-node-id on each of the cpu nodes.
We could allow these attributes to exist on cluster and core nodes as well
so that we can represent their properties better. It shouldn't be a big
deal and it can be done in a backwards-compatible way (if we don't find
them on the cpu node, climb up the topology hierarchy until we find them /
don't find them at all); a sketch of that lookup follows the two examples
below. All I'm saying is that I prefer this:
cpus {
    cpu@0 {
        ...
    };
    cpu@1 {
        ...
    };
    cpu@2 {
        ...
    };
    cpu@3 {
        ...
    };
};

cluster0 {
    cluster0 {
        core0 {
            power-domains = <&pdc 0>;
            numa-node-id = <0>;
            capacity-dmips-mhz = <578>;
            members = <&cpu0>, <&cpu1>;
        };
    };
    cluster1 {
        capacity-dmips-mhz = <1024>;
        core0 {
            power-domains = <&pdc 1>;
            numa-node-id = <1>;
            members = <&cpu2>;
        };
        core1 {
            power-domains = <&pdc 2>;
            numa-node-id = <2>;
            members = <&cpu3>;
        };
    };
};
over this:
cpus {
    cpu@0 {
        ...
        power-domains = <&pdc 0>;
        capacity-dmips-mhz = <578>;
        numa-node-id = <0>;
        ...
    };
    cpu@1 {
        ...
        power-domains = <&pdc 0>;
        capacity-dmips-mhz = <578>;
        numa-node-id = <0>;
        ...
    };
    cpu@2 {
        ...
        power-domains = <&pdc 1>;
        capacity-dmips-mhz = <1024>;
        numa-node-id = <1>;
        ...
    };
    cpu@3 {
        ...
        power-domains = <&pdc 2>;
        capacity-dmips-mhz = <1024>;
        numa-node-id = <2>;
        ...
    };
};

cluster0 {
    cluster0 {
        core0 {
            members = <&cpu0>, <&cpu1>;
        };
    };
    cluster1 {
        core0 {
            members = <&cpu2>;
        };
    };
    cluster2 {
        core0 {
            members = <&cpu3>;
        };
    };
};
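A minimal sketch of the backwards-compatible lookup mentioned above,
assuming we have already resolved the core/cluster node whose "members"
list contains the cpu (the helper name and error handling are made up):

#include <linux/errno.h>
#include <linux/of.h>

/*
 * Hypothetical helper: read a u32 property from the cpu node and, if it
 * is missing there, climb the proposed core/cluster hierarchy towards the
 * root until the property is found.
 */
static int topo_read_u32(struct device_node *cpu,
			 struct device_node *topo_leaf,
			 const char *prop, u32 *val)
{
	struct device_node *np;

	/* A value on the cpu node itself wins, so existing DTs keep working. */
	if (!of_property_read_u32(cpu, prop, val))
		return 0;

	/* Otherwise inherit it from the enclosing core/cluster nodes. */
	for (np = of_node_get(topo_leaf); np; np = of_get_next_parent(np)) {
		if (!of_property_read_u32(np, prop, val)) {
			of_node_put(np);
			return 0;
		}
	}

	return -ENODEV;
}

Something like topo_read_u32(cpu_np, core_np, "numa-node-id", &nid) would
then work with both the old and the new layout.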
When it comes to shared resources, the standard DT mappings we have are
for caches and are part of the device tree spec (coming from PowerPC's
ePAPR standard I think). The below comes from the HiFive Unleashed device
tree (U540Config.dts), which follows the spec:
cpus {
    cpu@1 {
        ...
        next-level-cache = <&L24 &L0>;
        ...
    };
    cpu@2 {
        ...
        next-level-cache = <&L24 &L0>;
        ...
    };
    cpu@3 {
        ...
        next-level-cache = <&L24 &L0>;
        ...
    };
    cpu@4 {
        ...
        next-level-cache = <&L24 &L0>;
        ...
    };
};

L2: soc {
    L0: cache-controller@2010000 {
        cache-block-size = <64>;
        cache-level = <2>;
        cache-sets = <2048>;
        cache-size = <2097152>;
        cache-unified;
        compatible = "sifive,ccache0", "cache";
        ...
    };
};
Note that the cache-controller node that's common between the 4 cores can
exist anywhere BUT the cluster node! However it's a property of the
cluster. A quick search through the tree got me r8a77980.dtsi, which
defines the cache on the cpus node, and I'm sure there are other similar
cases. Wouldn't this be better?
cluster0 {
    core0 {
        cache-controller@2010000 {
            cache-block-size = <64>;
            cache-level = <2>;
            cache-sets = <2048>;
            cache-size = <2097152>;
            cache-unified;
            compatible = "sifive,ccache0", "cache";
            ...
        };
        members = <&cpu0>, <&cpu1>, <&cpu2>, <&cpu3>;
    };
};
We could even remove next-level-cache from the cpu nodes and infer it from
the topology (search the topology upwards until we get a node that's
"cache"-compatible); we can again make this backwards-compatible.
Finally, from the examples above I'd like to stress that the distinction
between a cluster and a core doesn't make much sense and it also makes the
representation more complicated. To begin with, what would you call the
setup on the HiFive Unleashed? A cluster of 4 cores that share the same L3
cache? One core with 4 harts that share the same L3 cache? We could
represent it like this instead:
cluster0 {
    cache-controller@2010000 {
        cache-block-size = <64>;
        cache-level = <2>;
        cache-sets = <2048>;
        cache-size = <2097152>;
        cache-unified;
        compatible = "sifive,ccache0", "cache";
        ...
    };
    core0 {
        members = <&cpu0>;
    };
    core1 {
        members = <&cpu1>;
    };
    core2 {
        members = <&cpu2>;
    };
    core3 {
        members = <&cpu3>;
    };
};
We could e.g. keep only cluster nodes and allow them to contain either an
array of harts or other cluster sub-nodes, plus optionally a set of
attributes common to the members/sub-nodes of the cluster. This way, in
the first example we'll get:
cluster0 {
    cluster0 {
        power-domains = <&pdc 0>;
        numa-node-id = <0>;
        capacity-dmips-mhz = <578>;
        members = <&cpu0>, <&cpu1>;
    };
    cluster1 {
        capacity-dmips-mhz = <1024>;
        cluster0 {
            power-domains = <&pdc 1>;
            numa-node-id = <1>;
            members = <&cpu2>;
        };
        cluster1 {
            power-domains = <&pdc 2>;
            numa-node-id = <2>;
            members = <&cpu3>;
        };
    };
};
and in the second example:
cluster0 {
    cache-controller@2010000 {
        cache-block-size = <64>;
        cache-level = <2>;
        cache-sets = <2048>;
        cache-size = <2097152>;
        cache-unified;
        compatible = "sifive,ccache0", "cache";
        ...
    };
    members = <&cpu0>, <&cpu1>, <&cpu2>, <&cpu3>;
};
Thank you for your time !
Regards,
Nick
On Wed, Nov 07, 2018 at 04:31:34AM +0200, Nick Kossifidis wrote:
> Mark and Sundeep thanks a lot for your feedback, I guess you convinced
> me that having a device tree binding for the scheduler is not a
> correct approach. It's not a device after all and I agree that the
> device tree shouldn't become an OS configuration file.
Good to hear.
> Regarding multiple levels of shared resources my point is that since
> cpu-map doesn't contain any information of what is shared among the
> cluster/core members it's not easy to do any further translation. Last
> time I checked the arm code that uses cpu-map, it only defines one
> domain for SMT, one for MC and then everything else is ignored. No
> matter how many clusters have been defined, anything above the core
> level is the same (and then I guess you started talking about adding
> "packages" on the representation side).
While cpu-map doesn't contain that information today, we can *add* that
information to the cpu-map binding if necessary.
> The reason I proposed to have a binding for the scheduler directly is
> not only because it's simpler and closer to what really happens in the
> code, it also makes more sense to me than the combination of cpu-map
> with all the related mappings e.g. for numa or caches or power
> domains etc.
>
> However you are right we could definitely augment cpu-map to include
> support for what I'm saying and clean things up, and since you are
> open about improving it here is a proposal that I hope you find
> interesting:
>
> At first let's get rid of the <thread> nodes, they don't make sense:
>
> thread0 {
> cpu = <&CPU0>;
> };
>
> A thread node can't have more than one cpu entry and any properties
> should be on the cpu node itself, so it doesn't / can't add any
> more information. We could just have an array of cpu nodes on the
> <core> node, it's much cleaner this way.
>
> core0 {
> members = <&CPU0>, <&CPU1>;
> };
Hold on. Rather than reinventing things from first principles, can we
please discuss what you want to *achieve*, i.e. what information you
need?
Having a node is not a significant cost, and there are reasons we may
want thread nodes. For example, it means that we can always refer to any
level of topology with a phandle, and we might want to describe
thread-affine devices in future.
There are a tonne of existing bindings that are ugly, but re-inventing
them for taste reasons alone is more costly to the ecosystem than simply
using the existing bindings. We avoid re-inventing bindings unless there
is a functional problem e.g. cases which they cannot possibly describe.
> Then let's allow the cluster and core nodes to accept attributes that are
> common for the cpus they contain. Right now this is considered invalid.
>
> For power domains we have a generic binding described on
> Documentation/devicetree/bindings/power/power_domain.txt
> which basically says that we need to put power-domains = <power domain
> specifiers>
> attribute on each of the cpu nodes.
FWIW, given this is arguably topological, I'm not personally averse to
describing this in the cpu-map, if that actually gains us more than the
complexity required to support it.
Given we don't do this for device power domains, I suspect that it's
simpler to stick with the existing binding.
> The same happens with the capacity binding specified for arm on
> Documentation/devicetree/bindings/arm/cpu-capacity.txt
> which says we should add the capacity-dmips-mhz on each of the cpu nodes.
The cpu-map was intended to expose topological details, and this isn't
really a topological property. For example, Arm DynamIQ systems can have
heterogeneous CPUs within clusters.
I do not think it's worth moving this, tbh.
> The same also happens with the generic numa binding on
> Documentation/devicetree/bindings/numa.txt
> which says we should add the nuna-node-id on each of the cpu nodes.
Is there a strong gain from moving this?
[...]
> Finally from the examples above I'd like to stress out that the distinction
> between a cluster and a core doesn't make much sense and it also makes the
> representation more complicated. To begin with, how would you call the setup
> on HiFive Unleashed ? A cluster of 4 cores that share the same L3 cache ?
Not knowing much about the hardware, I can't really say.
I'm not sure I follow why the distinction between a cluster and a core
is nonsensical. A cluster is always a collection of cores.
A hart could be a core in its own right, or it could be a thread under a
core, which shares functional units with other harts within that core.
Arguably, we could have mandated that the topology always needed to
describe down to a thread, even if a core only had a single thread. That
ship has sailed, however.
Thanks,
Mark.
On Wed, Nov 07, 2018 at 04:31:34AM +0200, Nick Kossifidis wrote:
[...]
>
> Mark and Sudeep thanks a lot for your feedback, I guess you convinced me
> that having a device tree binding for the scheduler is not a correct
> approach.
>
Thanks :)
> It's not a device after all and I agree that the device tree shouldn't
> become an OS configuration file. Regarding multiple levels of shared
> resources my point is that since cpu-map doesn't contain any information of
> what is shared among the cluster/core members it's not easy to do any
> further translation. Last time I checked the arm code that uses cpu-map, it
> only defines one domain for SMT, one for MC and then everything else is
> ignored. No matter how many clusters have been defined, anything above the
> core level is the same (and then I guess you started talking about adding
> "packages" on the representation side).
>
Correct.
> The reason I proposed to have a binding for the scheduler directly is not
> only because it's simpler and closer to what really happens in the code, it
> also makes more sense to me than the combination of cpu-map with all the
> related mappings e.g. for numa or caches or power domains etc.
>
Again you are just looking at it from a Linux kernel perspective.
> However you are right we could definitely augment cpu-map to include support
> for what I'm saying and clean things up, and since you are open about
> improving it here is a proposal that I hope you find interesting: At first
> let's get rid of the <thread> nodes, they don't make sense:
>
> thread0 {
> cpu = <&CPU0>;
> };
>
Do you have any strong reasons to do so? Since it's already been there for
some time, I believe we can't remove it, for backward compatibility
reasons.
> A thread node can't have more than one cpu entry and any properties
> should be on the cpu node itself, so it doesn't / can't add any
> more information. We could just have an array of cpu nodes on the
> <core> node, it's much cleaner this way.
>
> core0 {
> members = <&CPU0>, <&CPU1>;
> };
>
I agree, but we have kernel code using it (arm64/kernel/topology.c). It's
too late to remove it. But we can always keep it optional if we move the
ARM64 binding to be the generic one to start with, and mandate it only for
ARM64.
> Then let's allow the cluster and core nodes to accept attributes that are
> common for the cpus they contain. Right now this is considered invalid.
>
Yes, we have discussed this in the past and decided not to. I am fine if
we need to change it, but assuming the topology implies other information
could be wrong. On ARM platforms we have learnt that, so we kept any such
information away from the topology. I assume it's the same with RISC-V:
different vendors implement things in different ways, so it's better to
consider those factors.
> For power domains we have a generic binding described on
> Documentation/devicetree/bindings/power/power_domain.txt
> which basically says that we need to put power-domains = <power domain
> specifiers> attribute on each of the cpu nodes.
>
OK, but what's wrong with that? It gives full flexibility.
> The same happens with the capacity binding specified for arm on
> Documentation/devicetree/bindings/arm/cpu-capacity.txt
> which says we should add the capacity-dmips-mhz on each of the cpu nodes.
>
Ditto, we may need this for our single cluster DSU systems.
> The same also happens with the generic numa binding on
> Documentation/devicetree/bindings/numa.txt
> which says we should add the nuna-node-id on each of the cpu nodes.
>
Yes, but again what's the problem ?
> We could allow for these attributes to exist on cluster and core nodes
> as well so that we can represent their properties better. It shouldn't
> be a big deal and it can be done in a backwards-compatible way (if we
> don't find them on the cpu node, climb up the topology hierarchy until
> we find them / not find them at all). All I'm saying is that I prefer this:
>
[...]
>
> cluster0 {
> cluster0 {
> core0 {
> power-domains = <&pdc 0>;
> numa-node-id = <0>;
> capacity-dmips-mhz = <578>;
> members = <&cpu0>, <&cpu1>;
> }
> };
> cluster1 {
> capacity-dmips-mhz = <1024>;
> core0 {
> power-domains = <&pdc 1>;
> numa-node-id = <1>;
> members = <&cpu2>;
> };
> core1 {
> power-domains = <&pdc 2>;
> numa-node-id = <2>;
> members = <&cpu3>;
> };
> };
> }
>
Why are you so keen on optimising the representation?
If you are worried about large systems, generate one instead of
handcrafting it.
> When it comes to shared resources, the standard dt mappings we have are for
> caches and are on the device spec standard (coming from power pc's ePAPR
> standard I think). The below comes from HiFive unleashed's device tree
> (U540Config.dts) that follows the spec:
>
I don't understand what you are trying to explain; ePAPR does specify
per-CPU entries.
[...]
> Note that the cache-controller node that's common between the 4 cores can
> exist anywhere BUT the cluster node ! However it's a property of the
> cluster.
> A quick search through the tree got me r8a77980.dtsi that defines the cache
> on the cpus node and I'm sure there are other similar cases. Wouldn't this
> be better ?
>
> cluster0 {
> core0 {
> cache-controller@2010000 {
> cache-block-size = <64>;
> cache-level = <2>;
> cache-sets = <2048>;
> cache-size = <2097152>;
> cache-unified;
> compatible = "sifive,ccache0", "cache";
> ...
> };
> members = <&cpu0>, <&cpu1>, <&cpu2>, <&cpu3>;
Not a good idea IMO.
> We could even remove next-level-cache from the cpu nodes and infer it from
> the topology (search the topology upwards until we get a node that's
> "cache"-compatible), we can again make this backwards-compatible.
>
Why are you assuming that they *have* to be so aligned with the topology?
How do you deal with different kinds of systems?
>
> Finally from the examples above I'd like to stress out that the distinction
> between a cluster and a core doesn't make much sense and it also makes the
> representation more complicated. To begin with, how would you call the setup
> on HiFive Unleashed ? A cluster of 4 cores that share the same L3 cache ?
> One core with 4 harts that share the same L3 cache ? We could represent it
> like this instead:
>
Just represent each physical cache and get the list of CPUs sharing it.
Doesn't matter what it is: cluster or cluster of clusters or cluster of
harts, blah, blah. It really doesn't matter.
>
> We could e.g. keep only cluster nodes and allow them to contain either an
> array of harts or other cluster sub-nodes + optionally a set of attributes,
> common to the members/sub-nodes of the cluster. This way we'll get in the
> first example:
>
All these fancy new ideas you are proposing are good if vendors follow
some things religiously, but I really doubt that's true. So just have
maximum flexibility, so that we can represent the maximum number of
systems without having to redefine the bindings again and again for the
same thing.
So instead of finding ways to optimise, you should really come up with a
list of shortcomings in the existing bindings so that we cover more
platforms with generic bindings. IMO you are too concerned with optimising
the DT representation, which may defeat the purpose of generic bindings.
--
Regards,
Sudeep
On 2018-11-07 14:06, Mark Rutland wrote:
> On Wed, Nov 07, 2018 at 04:31:34AM +0200, Nick Kossifidis wrote:
>> Mark and Sundeep thanks a lot for your feedback, I guess you convinced
>> me that having a device tree binding for the scheduler is not a
>> correct approach. It's not a device after all and I agree that the
>> device tree shouldn't become an OS configuration file.
>
> Good to hear.
>
>> Regarding multiple levels of shared resources my point is that since
>> cpu-map doesn't contain any information of what is shared among the
>> cluster/core members it's not easy to do any further translation. Last
>> time I checked the arm code that uses cpu-map, it only defines one
>> domain for SMT, one for MC and then everything else is ignored. No
>> matter how many clusters have been defined, anything above the core
>> level is the same (and then I guess you started talking about adding
>> "packages" on the representation side).
>
> While cpu-map doesn't contain that information today, we can *add* that
> information to the cpu-map binding if necessary.
>
>> The reason I proposed to have a binding for the scheduler directly is
>> not only because it's simpler and closer to what really happens in the
>> code, it also makes more sense to me than the combination of cpu-map
>> with all the related mappings e.g. for numa or caches or power
>> domains etc.
>>
>> However you are right we could definitely augment cpu-map to include
>> support for what I'm saying and clean things up, and since you are
>> open about improving it here is a proposal that I hope you find
>> interesting:
>>
>> At first let's get rid of the <thread> nodes, they don't make sense:
>>
>> thread0 {
>> cpu = <&CPU0>;
>> };
>>
>> A thread node can't have more than one cpu entry and any properties
>> should be on the cpu node itself, so it doesn't / can't add any
>> more information. We could just have an array of cpu nodes on the
>> <core> node, it's much cleaner this way.
>>
>> core0 {
>> members = <&CPU0>, <&CPU1>;
>> };
>
> Hold on. Rather than reinventing things from first principles, can we
> please discuss what you want to *achieve*, i.e. what information you
> need?
>
> Having a node is not a significant cost, and there are reasons we may
> want thread nodes. For example, it means that we can always refer to
> any
> level of topology with a phandle, and we might want to describe
> thread-affine devices in future.
>
You can use the phandle of the cpu node; the thread node doesn't add
anything more than complexity to the representation.
> There are a tonne of existing bindings that are ugly, but re-inventing
> them for taste reasons alone is more costly to the ecosystem than
> simply
> using the existing bindings. We avoid re-inventing bindings unless
> there
> is a functional problem e.g. cases which they cannot possibly describe.
>
We are talking about using something for RISC-V, and possibly something
common across different archs, so I don't see why we should keep the
ugliness of a binding. Plus, in this case the <thread> node can't possibly
describe anything other than a cpu = <node> alias; it's redundant.
>> Then let's allow the cluster and core nodes to accept attributes that
>> are
>> common for the cpus they contain. Right now this is considered
>> invalid.
>>
>> For power domains we have a generic binding described on
>> Documentation/devicetree/bindings/power/power_domain.txt
>> which basically says that we need to put power-domains = <power domain
>> specifiers>
>> attribute on each of the cpu nodes.
>
> FWIW, given this is arguably topological, I'm not personally averse to
> describing this in the cpu-map, if that actually gains us more than the
> complexity require to support it.
>
> Given we don't do this for device power domains, I suspect that it's
> simpler to stick with the existing binding.
>
>> The same happens with the capacity binding specified for arm on
>> Documentation/devicetree/bindings/arm/cpu-capacity.txt
>> which says we should add the capacity-dmips-mhz on each of the cpu
>> nodes.
>
> The cpu-map was intended to expose topological dtails, and this isn't
> really a topological property. For example, Arm DynamIQ systems can
> have
> heterogeneous CPUs within clusters.
>
> I do not think it's worth moving this, tbh.
>
>> The same also happens with the generic numa binding on
>> Documentation/devicetree/bindings/numa.txt
>> which says we should add the nuna-node-id on each of the cpu nodes.
>
> Is there a strong gain from moving this?
>
> [...]
>
Right now, with the device tree spec and the above bindings, we can use
the information from cpu nodes to infer the cache topology, the memory
topology, the power domain topology etc.
For example we have of_find_next_cache_node and of_find_last_cache_level,
which use "next-level-cache" and are used on powerpc (where this spec
comes from). We can further build on top of them (since this is now part
of the device tree spec anyway) to e.g. populate the levels of cache, the
levels of shared cache and finally create cpu masks for the different
cache sharing levels.
This is standards-compliant, arch-independent (they live in of/base.c),
and it provides a concrete hint on CPU topology, rather than grouping
harts into classes like "core", "package", "socket", "cluster" etc. that
don't mean anything specific and that we assume map to specific levels of
cache sharing.
The same goes for the power domain topology, the numa topology, or for
understanding how the cpus are grouped in terms of capacity: we can always
just use the above bindings on cpu nodes and be done with it.
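As an illustration, building such a mask on top of those helpers could
look roughly like this (a sketch only; the level counting is simplified,
error handling is minimal, and the function names are made up):

#include <linux/cpumask.h>
#include <linux/of.h>

/* Cache node describing a given level: the cpu node itself stands for L1. */
static struct device_node *cache_node_at_level(unsigned int cpu, int level)
{
	struct device_node *np = of_get_cpu_node(cpu, NULL);

	while (np && level-- > 1) {
		struct device_node *next = of_find_next_cache_node(np);

		of_node_put(np);
		np = next;
	}
	return np;
}

/* CPUs whose cache node at 'level' is the same one as that of 'cpu'. */
static void build_cache_sibling_mask(unsigned int cpu, int level,
				     struct cpumask *mask)
{
	struct device_node *mine = cache_node_at_level(cpu, level);
	unsigned int other;

	cpumask_clear(mask);
	for_each_possible_cpu(other) {
		struct device_node *theirs = cache_node_at_level(other, level);

		if (mine && theirs == mine)
			cpumask_set_cpu(other, mask);
		of_node_put(theirs);
	}
	of_node_put(mine);
}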
The way I see it, cpu-map doesn't add anything new to the spec at this
point; it's just there for convenience, so that people don't have to add
all this information on the cpu nodes. Instead of adding cache nodes and
using the next-level-cache property like the device tree spec says, we
have cpu-map, which presents how the harts are "packed" inside the chip,
assuming that their packing also aligns with how they share their caches
(it says nothing about how they share their power domain or other
attributes).
In a sense it goes against your rule that "re-inventing them for taste
reasons alone is more costly to the ecosystem than simply using the
existing bindings": I fail to see anything other than "taste reasons" when
it comes to cpu-map, since there are existing bindings for inferring the
CPU topology already, they are just not that convenient.
I'm proposing to add the option (not enforce) of putting those attributes
that are meant to be used on cpu nodes on the various groups/classes of
the cpu-map instead; this way cpu-map will provide something more
meaningful and at least improve the representation side of things. On the
implementation side it might save us the burden of inferring the topology
from parsing all cpu nodes each time. It's also backwards compatible with
what you already have; the only thing that's not backwards compatible is
the removal of the <thread> node, which I don't think is such a big deal
to fix.
>> Finally from the examples above I'd like to stress out that the
>> distinction
>> between a cluster and a core doesn't make much sense and it also makes
>> the
>> representation more complicated. To begin with, how would you call the
>> setup
>> on HiFive Unleashed ? A cluster of 4 cores that share the same L3
>> cache ?
>
> Not knowing much about the hardware, I can't really say.
>
> I'm not sure I follow why the distinction between a cluster and a core
> is non-sensical. A cluster is always a collection of cores.
>
> A hart could be a core in its own right, or it could be a thread under
> a
> core, which shares functional units with other harts within that core.
>
> Arguably, we could have mandated that the topology always needed to
> describe down to a thread, even if a core only had a single thread.
> That
> ship has sailed, however.
>
So we agree that "core" doesn't say anything useful regarding the
topology; why keep using this distinction in a binding for RISC-V and
possibly other archs (since we are also talking about an arch-independent
approach)?
Regards,
Nick
On 2018-11-07 14:28, Sudeep Holla wrote:
> On Wed, Nov 07, 2018 at 04:31:34AM +0200, Nick Kossifidis wrote:
> [...]
>
>>
>> Mark and Sudeep thanks a lot for your feedback, I guess you convinced
>> me
>> that having a device tree binding for the scheduler is not a correct
>> approach.
>>
>
> Thanks :)
>
>> It's not a device after all and I agree that the device tree shouldn't
>> become an OS configuration file. Regarding multiple levels of shared
>> resources my point is that since cpu-map doesn't contain any
>> information of
>> what is shared among the cluster/core members it's not easy to do any
>> further translation. Last time I checked the arm code that uses
>> cpu-map, it
>> only defines one domain for SMT, one for MC and then everything else
>> is
>> ignored. No matter how many clusters have been defined, anything above
>> the
>> core level is the same (and then I guess you started talking about
>> adding
>> "packages" on the representation side).
>>
>
> Correct.
>
>> The reason I proposed to have a binding for the scheduler directly is
>> not
>> only because it's simpler and closer to what really happens in the
>> code, it
>> also makes more sense to me than the combination of cpu-map with all
>> the
>> related mappings e.g. for numa or caches or power domains etc.
>>
>
> Again you are just looking at it with Linux kernel perspective.
>
>> However you are right we could definitely augment cpu-map to include
>> support
>> for what I'm saying and clean things up, and since you are open about
>> improving it here is a proposal that I hope you find interesting: At
>> first
>> let's get rid of the <thread> nodes, they don't make sense:
>>
>> thread0 {
>> cpu = <&CPU0>;
>> };
>>
>
> Do you have any strong reasons to do so ?
> Since it's already there for some time, I believe we can't remove it
> for
> backward compatibility reasons.
>
>> A thread node can't have more than one cpu entry and any properties
>> should be on the cpu node itself, so it doesn't / can't add any
>> more information. We could just have an array of cpu nodes on the
>> <core> node, it's much cleaner this way.
>>
>> core0 {
>> members = <&CPU0>, <&CPU1>;
>> };
>>
>
> I agree, but we have kernel code using it(arm64/kernel/topology.c).
> It's
> too late to remove it. But we can always keep to optional if we move
> the
> ARM64 binding as generic to start with and mandate it for only ARM64.
>
That's my point as well: if we are going to define something to be used by
everybody, then in this case, at least for RISC-V, there is no need to
carry this over from the ARM64 binding. It shouldn't be that hard to fix
this in the future for ARM64 as well; we may give the new mapping another
name, maybe cpu-map2 or cpu-topology, to slowly move to the new one.
Changing the dts files shouldn't be that hard, we can provide a script for
it. We can even keep some compatibility code that also understands
<thread> nodes and e.g. merges them together into a core node.
>> Then let's allow the cluster and core nodes to accept attributes that
>> are
>> common for the cpus they contain. Right now this is considered
>> invalid.
>>
>
> Yes, we have discussed in the past and decided not to. I am fine if we
> need to change it, but assuming the topology implies other information
> could be wrong. On ARM platforms we have learnt it, so we kept any
> information away from topology. I assume same with RISC-V, different
> vendors implement in different ways, so it's better to consider those
> factors.
>
>> For power domains we have a generic binding described on
>> Documentation/devicetree/bindings/power/power_domain.txt
>> which basically says that we need to put power-domains = <power domain
>> specifiers> attribute on each of the cpu nodes.
>>
>
> OK, but what's wrong with that? It gives full flexibility.
>
>> The same happens with the capacity binding specified for arm on
>> Documentation/devicetree/bindings/arm/cpu-capacity.txt
>> which says we should add the capacity-dmips-mhz on each of the cpu
>> nodes.
>>
>
> Ditto, we may need this for our single cluster DSU systems.
>
>> The same also happens with the generic numa binding on
>> Documentation/devicetree/bindings/numa.txt
>> which says we should add the numa-node-id on each of the cpu nodes.
>>
>
> Yes, but again what's the problem ?
>
There is no problem with the above bindings; the problem is that we have
to put them on each cpu node, which is messy. We could instead put them
(optionally) on the various groupings used in cpu-map. This would allow
cpu-map to be more specific about what is shared across the members of
each group (core/cluster/whatever).
As I wrote in my answer to Mark previously, the bindings for inferring
the cache topology, numa topology, power domain topology etc. are already
there, they are in the device tree spec and provide very specific
information we can use: much "stronger" hints of what's going on at the
hw level. The cpu-map doesn't provide such information, it just provides
a view of how the various harts/threads are "packed" on the chip, not
what they share inside each level of "packing". It's useful because it
saves people from having to define a bunch of cache nodes and describe
the cache hierarchy on the device tree using the standard spec.
So since cpu-map is there for convenience, let's make it more convenient!
What I'm saying is that cpu-map could be a more compact way of using the
existing bindings for adding properties on groups of harts instead of
putting them on each hart individually. It will simplify the
representation and may also optimize the implementation a bit (we may get
the information we need faster). I don't see any other reason for using
cpu-map on RISC-V or for making it global across archs.
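As a rough sketch of the lookup this would imply on the implementation
side, assuming purely for illustration that properties like numa-node-id
or capacity-dmips-mhz were allowed on the cpu-map group nodes (which is
not an existing binding), a helper could fall back from a leaf node to
its enclosing groups:

#include <linux/of.h>

/*
 * Hypothetical helper, not an existing API: start at a cpu-map leaf node
 * (core or thread) and climb towards the cpu-map node itself until the
 * requested property is found.
 */
static const void *cpu_map_get_property(struct device_node *leaf,
                                        const char *propname)
{
    struct device_node *np = of_node_get(leaf);
    const void *prop = NULL;

    while (np) {
        prop = of_get_property(np, propname, NULL);
        if (prop || !of_node_cmp(np->name, "cpu-map")) {
            of_node_put(np);
            break;
        }
        /* of_get_next_parent() drops np and returns its parent */
        np = of_get_next_parent(np);
    }

    return prop;
}

A caller would still check the cpu node itself first, so the existing
per-cpu placement of these properties keeps working unchanged.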
>> We could allow for these attributes to exist on cluster and core nodes
>> as well so that we can represent their properties better. It shouldn't
>> be a big deal and it can be done in a backwards-compatible way (if we
>> don't find them on the cpu node, climb up the topology hierarchy until
>> we find them / not find them at all). All I'm saying is that I prefer
>> this:
>>
>
> [...]
>
>>
>> cluster0 {
>>     cluster0 {
>>         core0 {
>>             power-domains = <&pdc 0>;
>>             numa-node-id = <0>;
>>             capacity-dmips-mhz = <578>;
>>             members = <&cpu0>, <&cpu1>;
>>         };
>>     };
>>     cluster1 {
>>         capacity-dmips-mhz = <1024>;
>>         core0 {
>>             power-domains = <&pdc 1>;
>>             numa-node-id = <1>;
>>             members = <&cpu2>;
>>         };
>>         core1 {
>>             power-domains = <&pdc 2>;
>>             numa-node-id = <2>;
>>             members = <&cpu3>;
>>         };
>>     };
>> };
>>
>
> Why are you so keen on optimising the representation ?
> If you are worried about large systems, generate one instead of
> handcrafted.
>
I don't see a reason not to try to optimize it; since we are talking
about a binding to be used by RISC-V and potentially everybody, I think
it makes sense to improve upon what we already have.
>> When it comes to shared resources, the standard dt mappings we have
>> are for caches and are on the device spec standard (coming from power
>> pc's ePAPR standard I think). The below comes from HiFive unleashed's
>> device tree (U540Config.dts) that follows the spec:
>>
>
> I don't understand what you are trying to explain, ePAPR does specify
> per CPU entries.
>
> [...]
>
I'm saying we could allow moving these properties onto the groupings of
cpu-map to simplify the representation and possibly optimize the
implementation. This way I believe cpu-map will be much more useful than
it is now.
>> Note that the cache-controller node that's common between the 4 cores
>> can exist anywhere BUT the cluster node ! However it's a property of
>> the cluster.
>> A quick search through the tree got me r8a77980.dtsi that defines the
>> cache on the cpus node and I'm sure there are other similar cases.
>> Wouldn't this be better ?
>>
>> cluster0 {
>>     core0 {
>>         cache-controller@2010000 {
>>             cache-block-size = <64>;
>>             cache-level = <2>;
>>             cache-sets = <2048>;
>>             cache-size = <2097152>;
>>             cache-unified;
>>             compatible = "sifive,ccache0", "cache";
>>             ...
>>         };
>>         members = <&cpu0>, <&cpu1>, <&cpu2>, <&cpu3>;
>
> Not a good idea IMO.
>
Where in your opinion should this cache node go ? In the first example
from a production dts it goes under the SoC node, and in another
production dts it goes under the cpus/ node. I think it makes more sense
to go under the core node or the cluster node, depending on how it's
shared. It also makes it easier for the implementation to understand the
levels of shared caches and to create cpu masks.
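To illustrate that last point using only what the device tree spec
already gives us (a sketch, not the posted patch), the mask of cpus
sharing a given cache node can be derived by walking each cpu's
next-level-cache chain:

#include <linux/cpumask.h>
#include <linux/of.h>

/*
 * Sketch: mark every possible cpu whose "next-level-cache" chain reaches
 * the given cache node, using the generic of_find_next_cache_node().
 */
static void cpus_sharing_cache(struct device_node *cache, struct cpumask *mask)
{
    unsigned int cpu;

    cpumask_clear(mask);

    for_each_possible_cpu(cpu) {
        struct device_node *np = of_get_cpu_node(cpu, NULL);

        while (np) {
            struct device_node *next;

            if (np == cache) {
                cpumask_set_cpu(cpu, mask);
                of_node_put(np);
                break;
            }
            next = of_find_next_cache_node(np);
            of_node_put(np);
            np = next;
        }
    }
}

Masks built this way, one per cache level, could then feed the scheduler
topology instead of being inferred from the core/cluster grouping.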
>> We could even remove next-level-cache from the cpu nodes and infer it
>> from the topology (search the topology upwards until we get a node
>> that's "cache"-compatible), we can again make this backwards-compatible.
>>
>
> Why are you assuming that they *have* to be so aligned with topology ?
> How do you deal with different kind of systems ?
>
It depends on how you define topology. I believe that the cache hierarchy
is a much "stronger" definition of the CPU topology than grouping
harts/threads into classes like "core", "socket", "package" etc. that
don't provide any information about what's going on inside the hardware.
You can think of the question the other way around: why are you assuming
that the current topology as specified in cpu-map is aligned with how the
harts and cores share their "resources"? Because that's what's going on
at the implementation side of cpu-map.
>>
>> Finally from the examples above I'd like to stress out that the
>> distinction between a cluster and a core doesn't make much sense and it
>> also makes the representation more complicated. To begin with, how
>> would you call the setup on HiFive Unleashed ? A cluster of 4 cores
>> that share the same L3 cache ? One core with 4 harts that share the
>> same L3 cache ? We could represent it like this instead:
>>
>
> Just represent each physical cache and get the list of CPUs sharing it.
> Doesn't matter what it is: cluster or cluster of clusters or cluster of
> harts, blah, blah. It really doesn't matter.
>
I totally agree with you, but in this case what's the reason for having
cpu-map ? We can infer the topology from what we already have; at least
cpu-map adds some convenience (but if we go all the way down the
"convenience" path we'll end up with something like my initial approach,
which, as we both agree now, was wrong).
>>
>> We could e.g. keep only cluster nodes and allow them to contain either
>> an array of harts or other cluster sub-nodes + optionally a set of
>> attributes, common to the members/sub-nodes of the cluster. This way
>> we'll get in the first example:
>>
>
> All these fancy new ideas you are proposing are good if vendors follow
> some things religiously, but I really doubt if that's true. So just
> have maximum flexibility so that we can represent maximum number of
> systems without having to redefine the bindings again and again for the
> same thing.
>
> So instead of finding ways to optimise, you should really come up with
> a list of shortcomings in the existing bindings so that we cover more
> platforms with generic bindings. IMO you are too concerned with
> optimisation of the DT representation, which may defeat the purpose of
> generic bindings.
>
I get your point, but if that's the case, isn't cpu-map an optimization
over the generic bindings for cpu nodes anyway ? As for the flexibility,
what I'm saying is to allow the flexibility of adding the properties that
we would otherwise add to cpu nodes, on the groups we use in cpu-map -
optionally, so that we are also backwards compatible. I see this as more
flexible, not less flexible. The same goes for removing the distinction
between "core", "cluster" etc.; having specific names for each of the
topology levels is IMHO less flexible than using abstract names.
Also what's the point of defining a binding if vendors don't follow it ?
I think it would be easier for the vendors to just use what's there
instead of defining a new binding and writing the code for it. On the
other hand you are probably more experienced in this than I am, I haven't
dealt with various different vendors and their different ways.
Regards,
Nick
On Thu, Nov 08, 2018 at 03:45:36PM +0200, Nick Kossifidis wrote:
> On 2018-11-07 14:06, Mark Rutland wrote:
> > On Wed, Nov 07, 2018 at 04:31:34AM +0200, Nick Kossifidis wrote:
> > > Mark and Sundeep thanks a lot for your feedback, I guess you convinced
> > > me that having a device tree binding for the scheduler is not a
> > > correct approach. It's not a device after all and I agree that the
> > > device tree shouldn't become an OS configuration file.
> >
> > Good to hear.
> >
> > > Regarding multiple levels of shared resources my point is that since
> > > cpu-map doesn't contain any information of what is shared among the
> > > cluster/core members it's not easy to do any further translation. Last
> > > time I checked the arm code that uses cpu-map, it only defines one
> > > domain for SMT, one for MC and then everything else is ignored. No
> > > matter how many clusters have been defined, anything above the core
> > > level is the same (and then I guess you started talking about adding
> > > "packages" on the representation side).
> >
> > While cpu-map doesn't contain that information today, we can *add* that
> > information to the cpu-map binding if necessary.
> >
> > > The reason I proposed to have a binding for the scheduler directly is
> > > not only because it's simpler and closer to what really happens in the
> > > code, it also makes more sense to me than the combination of cpu-map
> > > with all the related mappings e.g. for numa or caches or power
> > > domains etc.
> > >
> > > However you are right we could definitely augment cpu-map to include
> > > support for what I'm saying and clean things up, and since you are
> > > open about improving it here is a proposal that I hope you find
> > > interesting:
> > >
> > > At first let's get rid of the <thread> nodes, they don't make sense:
> > >
> > > thread0 {
> > > cpu = <&CPU0>;
> > > };
> > >
> > > A thread node can't have more than one cpu entry and any properties
> > > should be on the cpu node itself, so it doesn't / can't add any
> > > more information. We could just have an array of cpu nodes on the
> > > <core> node, it's much cleaner this way.
> > >
> > > core0 {
> > > members = <&CPU0>, <&CPU1>;
> > > };
> >
> > Hold on. Rather than reinventing things from first principles, can we
> > please discuss what you want to *achieve*, i.e. what information you
> > need?
> >
> > Having a node is not a significant cost, and there are reasons we may
> > want thread nodes. For example, it means that we can always refer to any
> > level of topology with a phandle, and we might want to describe
> > thread-affine devices in future.
>
> You can use the phandle of the cpu node, the thread node doesn't add
> anything more than complexity to the representation.
That adds complexity elsewhere, because the phandle of a CPU node is not
a child of the cpu-map node, and you can't simply walk up the tree
hierarchy to find parent nodes.
I see no reason to change this part of the binding. Given it's already
out there, with existing parsing code, changing it only serves to add
complexity to any common code which has to parse this.
Can we please drop the idea of dropping the thread node?
> > There are a tonne of existing bindings that are ugly, but re-inventing
> > them for taste reasons alone is more costly to the ecosystem than simply
> > using the existing bindings. We avoid re-inventing bindings unless there
> > is a functional problem e.g. cases which they cannot possibly describe.
>
> We are talking about using something for RISC-V and possibly common across
> different archs and, I don't see why we should keep the ugliness of a
> binding spec, plus in this case the <thread> node can't possibly describe
> anything else than a cpu=<node> alias, it's redundant.
While it may be ugly, removing it only serves to add complexity for
common parsing code, and removes flexibility that we may want in future.
Its presence does not cause any technical problem.
For better or worse we're all sharing the same codebase here. The common
case matters, as does the complexity of the codebase as a whole.
I realise it can be frustrating to have to use something you feel is
sub-optimal, but putting up with a few nodes which you personally feel
are redundant is of much lower cost to the ecosystem than having
incompatible bindings where we could have one. If you put up with that,
you can focus your efforts on more worthwhile technical exercises.
> > > Then let's allow the cluster and core nodes to accept attributes
> > > that are
> > > common for the cpus they contain. Right now this is considered
> > > invalid.
> > >
> > > For power domains we have a generic binding described on
> > > Documentation/devicetree/bindings/power/power_domain.txt
> > > which basically says that we need to put power-domains = <power domain
> > > specifiers>
> > > attribute on each of the cpu nodes.
> >
> > FWIW, given this is arguably topological, I'm not personally averse to
> > describing this in the cpu-map, if that actually gains us more than the
> > complexity require to support it.
> >
> > Given we don't do this for device power domains, I suspect that it's
> > simpler to stick with the existing binding.
> >
> > > The same happens with the capacity binding specified for arm on
> > > Documentation/devicetree/bindings/arm/cpu-capacity.txt
> > > which says we should add the capacity-dmips-mhz on each of the cpu
> > > nodes.
> >
> > The cpu-map was intended to expose topological details, and this isn't
> > really a topological property. For example, Arm DynamIQ systems can have
> > heterogeneous CPUs within clusters.
> >
> > I do not think it's worth moving this, tbh.
> >
> > > The same also happens with the generic numa binding on
> > > Documentation/devicetree/bindings/numa.txt
> > > which says we should add the numa-node-id on each of the cpu nodes.
> >
> > Is there a strong gain from moving this?
> >
> > [...]
>
> Right now with the device tree spec and the above bindings we can
> use the information from cpu nodes to infer the cache topology,
> the memory topology, the power domain topology etc.
>
> We have of_find_next_cache_node and of_find_last_cache_level for example
> that use "next-level-cache" and are used on powerpc (where this spec comes
> from), that we can further build on top of them (since this is now part
> of the device tree spec anyway), to e.g. populate the levels of cache,
> the levels of shared cache and finally create cpu masks for the different
> cache sharing levels.
The cpu-map binding does not describe cache topology. I never suggested
that it did.
> This is standards-compliant, arch-independent (they are on of/base.c), and
> it provides a concrete hint on CPU topology rather than grouping harts into classes
> like "core", "package", "socket", "cluster" etc that don't mean anything
> specific and we assume to map to specific levels of cache sharing.
Please note that I have never suggested "package", and I'm not sure what
that's in response to.
Please also note that while the cache topology and CPU topology are
somewhat related, in general (this is at least true on ARM systems)
there's no strict mapping between the two, which is why we have separate
bindings in the first place.
Distinct CPU topologies could look identical in cache topology, and
vice-versa.
> The same goes for the power domain topology, numa topology, or for
> understanding how the cpus are grouped in terms of capacity, we can
> always just use the above bindings on cpu nodes and be done with it.
>
> The way I see it cpu-map doesn't add anything new to the spec at this
> point, it's just there for convenience so that people don't have to add all
> these infos on the cpu nodes. Instead of adding cache nodes and use the
> next-level-cache property like the device tree spec says, we have cpu-map
> that presents how the harts are "packed" inside the chip, assuming that
> their packing also aligns on how they share their caches (they say
> nothing about how they share their power domain or other attributes).
The cpu-map and cache bindings describe separate topological details,
and neither is intended to replace the other. There are plenty of DT
files with both.
> In a sense it goes against your rule of "re-inventing them for taste
> reasons alone is more costly to the ecosystem than simply using the
> existing bindings", I fail to see anything else than "taste reasons"
> when it comes to cpu-map, since there are existing bindings for
> inferring the CPU topology already, they are just not that convenient.
If you believe that's the case, then you can perfectly use the existing
cache and NUMA bindings, and not describe the cpu topology at all, no
need to change cpu-map or come up with a new binding.
> I'm proposing to add the option (not enforce) of adding those
> attributes that are meant to be used on cpu nodes, on the various
> groups/classes of the cpu-map, this way cpu-map will provide something
> more meaningful and at least improve the representation side of
> things. On the implementation side it might save us the burden of
> infering the topology from parsing all cpu nodes each time. It's also
> backwards compatible with what you already have, the only thing that's
> not backwards compatible is the removal of the <thread> node, which I
> don't think is such a big deal to fix.
Sorry, but as I have stated repeatedly, this complicates the common code
for what you admit is a matter of taste. I would rather maintain the
simplicity of this binding and existing parsing code, and not
potentially break other parsers, if the only cost is a few nodes which
you personally consider to be redundant.
> > > Finally from the examples above I'd like to stress out that the
> > > distinction between a cluster and a core doesn't make much sense
> > > and it also makes the representation more complicated. To begin
> > > with, how would you call the setup on HiFive Unleashed ? A cluster
> > > of 4 cores that share the same L3 cache ?
> >
> > Not knowing much about the hardware, I can't really say.
> >
> > I'm not sure I follow why the distinction between a cluster and a core
> > is non-sensical. A cluster is always a collection of cores.
> >
> > A hart could be a core in its own right, or it could be a thread under a
> > core, which shares functional units with other harts within that core.
> >
> > Arguably, we could have mandated that the topology always needed to
> > describe down to a thread, even if a core only had a single thread. That
> > ship has sailed, however.
>
> So we agree, the "core" doesn't say anything useful regarding the
> topology, why keep using this distinction on a binding for RISC-V and
> possibly other archs (since we are also talking on an arch-independent
> approach) ?
We keep it because the binding has already been finalised and is in use in
practice. The existing binding and parsing code is *sufficient* for your
needs. Personal taste and minor savings in the number of nodes in a DT
are not compelling reasons to change this when the cost is complexity
that we have to maintain *forever*.
Thanks,
Mark.
On Thu, Nov 08, 2018 at 04:52:30PM +0200, Nick Kossifidis wrote:
> On 2018-11-07 14:28, Sudeep Holla wrote:
> >
> > I agree, but we have kernel code using it(arm64/kernel/topology.c). It's
> > too late to remove it. But we can always keep to optional if we move the
> > ARM64 binding as generic to start with and mandate it for only ARM64.
> >
>
> That's my point as well, if we are going to define something to be used
> by everybody and in this case, at least for RISC-V, there is no need to
> carry this from the ARM64 binding.
Sure, whatever you don't need in RISC-V you can see if they can be made
optional. I don't think that should be a problem.
> It shouldn't be that hard to fix this
> in the future for ARM64 as well, we may give the new mapping another name,
> maybe cpu-map2 or cpu-topology to slowly move to the new one.
No, we have it and we will continue to support it. There's nothing broken
to fix on ARM64. Why do you think that it's broken on ARM64 ?
> Changing the
> dts files shouldn't be this hard, we can provide a script for it. We can
> even contain some compatibility code that also understands <thread> nodes
> and e.g. merges them together on a core node.
>
Sure, hang on to this idea of scripting, we can make better use of it.
Details further down in the mail.
[...]
> > > The same also happens with the generic numa binding on
> > > Documentation/devicetree/bindings/numa.txt
> > > which says we should add the numa-node-id on each of the cpu nodes.
> > >
> >
> > Yes, but again what's the problem ?
> >
>
> There is no problem with the above bindings, the problem is that we have
> to put them on each cpu node which is messy. We could instead put them
> (optionally) on the various groupings used on cpu-map. This would allow
> cpu-map to be more specific of what is shared across the members of each
> group (core/cluster/whatever).
>
I think Mark has already explained why/how generic bindings are useful.
If you still have concerns, take it up separately and see how you can
build *perfect* bindings for RISC-V to avoid any legacy baggage.
We have reasons why we can't assume information about cache or power
domain topology from CPU topology. I have summarised them already and
am not going to re-discuss them. The bindings may not be perfect, but
they are already supported and nothing is broken here to fix. DT bindings
are *not* the same as code that you fix with a patch to the bindings
themselves. Once agreed and merged, they need to be treated like user
ABI.
> As I wrote on my answer to Mark previously, the bindings for infering
> the cache topology, numa topology, power domain topology etc are already
> there, they are in the devicet tree spec and provide very specific
> informations we can use. Much "stronger" hints of what's going on at
> the hw level. The cpu-map doesn't provide such information, it just
> provides a view of how the various harts/threads are "packed" on the chip,
> not what they share inside each level of "packing". It's useful because
> it saves people from having to define a bunch of cache nodes and describe
> the cache hierarchy on the device tree using the standard spec.
>
Ah, here it comes. If you want to save people's time or whatever, you can
use the scripting magic you mentioned above to define the bunch of nodes
you want to avoid.
> So since cpu-map is there for convenience let's make it more convenient !
> What I'm saying is that cpu-map could be a more compact way of using the
> existing bindings for adding properties on groups of harts instead of
> putting them on each hart individually. It will simplify the representation
> and may also optimize the implementation a bit (we may get the information
> we need faster). I don't see any other reason for using cpu-map on RISC-V
> or for making it global across archs.
>
Sure, I don't have strong opinions there. Just stop mentioning that this
is the only solution and all existing ones are broken. They are not, and
they need to be supported until they are explicitly deprecated, become
obsolete and are finally removed.
[...]
> >
> > Why are you so keen on optimising the representation ?
> > If you are worried about large systems, generate one instead of
> > handcrafted.
> >
>
> I don't see a reason not to try to optimize it, since we are talking
> about a binding to be used by RISC-V and potentially everybody, I think
> it makes sens to improve upon what we already have.
>
Sure, you always can, as long as you stop treating the existing ones as
broken. I have already said that DT bindings are not *normal code*. You
can't just replace existing ones with new optimised ones; you can only
add the new (*optimised*) ones alongside the existing ones. You *need* to
understand that concept first, otherwise there's no point in this endless
discussion IMO.
I will stop here as I will have to repeat whatever I have already
mentioned to comment on your arguments below.
In summary, I am not against improving the bindings if you think it's
possible, but I don't see how it's more beneficial, especially if we
are going to support the existing ones also. Mark has already given all
the details.
--
Regards,
Sudeep
On 2018-11-08 18:48, Sudeep Holla wrote:
> On Thu, Nov 08, 2018 at 04:52:30PM +0200, Nick Kossifidis wrote:
>> On 2018-11-07 14:28, Sudeep Holla wrote:
>> >
>> > I agree, but we have kernel code using it(arm64/kernel/topology.c). It's
>> > too late to remove it. But we can always keep to optional if we move the
>> > ARM64 binding as generic to start with and mandate it for only ARM64.
>> >
>>
>> That's my point as well, if we are going to define something to be
>> used
>> by everybody and in this case, at least for RISC-V, there is no need
>> to
>> carry this from the ARM64 binding.
>
> Sure, whatever you don't need in RISC-V you can see if they can be made
> optional. I don't think that should be a problem.
>
>> It shouldn't be that hard to fix this
>> in the future for ARM64 as well, we may give the new mapping another
>> name,
>> maybe cpu-map2 or cpu-topology to slowly move to the new one.
>
> No, we have it and we will continue to support it. It's not broken to
> fix on ARM64. Why do you think that it's broken on ARM64 ?
>
I never said it's broken, I just assumed that if this binding becomes
common across archs you'd probably like to switch to it, so it should
either be backwards compatible with what you have (or, as you said, "keep
them optional") or take steps for the transition. I'm OK either way; it's
not such a serious thing to have the <thread> nodes supported or to keep
the distinction between "cores", "clusters" or whatever, and I'm sorry if
it looked that way. I'm just saying that I'd like to have something
cleaner for RISC-V and/or a binding that's common for all. We can support
both and be backwards compatible, or we can keep it as is and mandate the
<thread> nodes only for ARM64 as you suggested.
>> Changing the
>> dts files shouldn't be this hard, we can provide a script for it. We
>> can
>> even contain some compatibility code that also understands <thread>
>> nodes
>> and e.g. merges them together on a core node.
>>
>
> Sure, hang on this idea of scripting, we can make a better use of it.
> Details later further in the mail.
>
> [...]
>
>> > > The same also happens with the generic numa binding on
>> > > Documentation/devicetree/bindings/numa.txt
>> > > which says we should add the numa-node-id on each of the cpu nodes.
>> > >
>> >
>> > Yes, but again what's the problem ?
>> >
>>
>> There is no problem with the above bindings, the problem is that we
>> have
>> to put them on each cpu node which is messy. We could instead put them
>> (optionally) on the various groupings used on cpu-map. This would
>> allow
>> cpu-map to be more specific of what is shared across the members of
>> each
>> group (core/cluster/whatever).
>>
>
> I think Mark has already explain why/how generic bindings are useful.
> If you still have concerns, take it up separately and see how you can
> build *perfect* bindings for RISC-V to avoid any legacy baggage.
>
> We have reasons why we can't assume information about cache or power
> domain topology from CPU topology. I have summarised them already and
> we are not discussing.
But that's what you do in arch/arm/kernel/topology.c: you assume
information on power domain and cache topology based on the CPU topology
when you define the scheduling domain topology levels.
PowerPC also does the same in arch/powerpc/kernel/smp.c, and basically if
you track the usage of struct sched_domain_topology_level you'll see that
every arch that uses it to instruct the scheduler about the scheduling
domains does so based on its CPU topology, which I think we both agree is
wrong.
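For reference, the pattern I'm referring to has roughly this shape (a
simplified sketch modelled on arch/arm/kernel/topology.c, with made-up
helper names; the sharing flags are asserted per topology level without
consulting the cache or power domain bindings):

#include <linux/sched/topology.h>
#include <linux/topology.h>

/* Flags asserted purely from the topology level (simplified). */
static int example_core_flags(void)
{
    return SD_SHARE_PKG_RESOURCES | SD_SHARE_POWERDOMAIN;
}

static const struct cpumask *example_smt_mask(int cpu)
{
    return topology_sibling_cpumask(cpu);
}

static const struct cpumask *example_mc_mask(int cpu)
{
    return topology_core_cpumask(cpu);
}

static struct sched_domain_topology_level example_topology[] = {
    { example_smt_mask, example_core_flags, SD_INIT_NAME(SMT) },
    { example_mc_mask,  example_core_flags, SD_INIT_NAME(MC)  },
    { cpu_cpu_mask, SD_INIT_NAME(DIE) },
    { NULL, },
};

/* Typically called once from the arch SMP bring-up code. */
void example_set_sched_topology(void)
{
    set_sched_topology(example_topology);
}

set_sched_topology() then replaces the scheduler's default table, so
whatever these masks and flags claim about sharing is exactly what the
scheduler will believe.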
> There may not be perfect bindings but there are
> already supported and nothing is broken here to fix. DT bindings are
> *not* same as code to fix it with a patch to the bindings themselves.
> Once agreed and merged, they need to be treated like user ABI.
>
If I'm not mistaken, the only use of the devicetree spec bindings for
caches is in your cacheinfo driver, for exporting them to userspace
through sysfs (thank you for cleaning this up btw). Even if we have the
cache topology using the generic bindings, we are not doing anything with
that information but export it to userspace.
As for the power domain bindings, we have drivers/base/power/domain.c
that handles the generic bindings, and it's being used by some drivers to
put devices into power domains (something used by the pm code), but from
a quick search it's not used for cpu nodes and I didn't see this info
being used for providing hints to the scheduler either. Even with the
generic power domain bindings in place, for CPU idle states on ARM we
have another binding that's basically the same with the addition of
"wakeup-latency-us".
For having different capacities there is no generic binding, but I guess
we could use ARM's.
Of the generic bindings that relate to the scheduler's behavior, only the
numa binding is used as expected, and only by ARM64; powerpc and sparc
use their own stuff.
So there may be nothing broken regarding the generic / already existing
bindings, but their support needs fixing. The way they are now we can't
use them on RISC-V for configuring the scheduler and supporting SMT and
MC properly, plus I'm not in favor of using the CPU topology blindly as
the other archs do.
>> As I wrote on my answer to Mark previously, the bindings for infering
>> the cache topology, numa topology, power domain topology etc are
>> already
>> there, they are in the devicet tree spec and provide very specific
>> informations we can use. Much "stronger" hints of what's going on at
>> the hw level. The cpu-map doesn't provide such information, it just
>> provides a view of how the various harts/threads are "packed" on the
>> chip,
>> not what they share inside each level of "packing". It's useful
>> because
>> it saves people from having to define a bunch of cache nodes and
>> describe
>> the cache hierarchy on the device tree using the standard spec.
>>
>
> Ah, here comes. If you want to save people's time or whatever, you can
> use
> your scripting magic you have mentioned above to define those bunch of
> nodes
> you want to avoid.
>
I mentioned the script as a transitioning method, but you are right, it
goes both ways.
>> So since cpu-map is there for convenience let's make it more
>> convenient !
>> What I'm saying is that cpu-map could be a more compact way of using
>> the
>> existing bindings for adding properties on groups of harts instead of
>> putting them on each hart individually. It will simplify the
>> representation
>> and may also optimize the implementation a bit (we may get the
>> information
>> we need faster). I don't see any other reason for using cpu-map on
>> RISC-V
>> or for making it global across archs.
>>
>
> Sure, I don't have strong opinions there. Just stop mentioning that
> this
> is the only solution and all existing ones are broken. They are not and
> needs to be supported until they are explicitly deprecated, becomes
> obsolete and finally removed.
>
> [...]
>
But I never said that's "the only solution and everything else is
broken" !
I'm just proposing an extension to cpu-map since Mark suggested that it's
possible. My goal is to try and consolidate cpu-map with what Atish
proposed for RISC-V (so that we can use the same code) and to point out
some issues in how we use and define the CPU topology.
>> >
>> > Why are you so keen on optimising the representation ?
>> > If you are worried about large systems, generate one instead of
>> > handcrafted.
>> >
>>
>> I don't see a reason not to try to optimize it, since we are talking
>> about a binding to be used by RISC-V and potentially everybody, I
>> think
>> it makes sens to improve upon what we already have.
>>
>
> Sure, you can always unless you stop treating existing ones are broken.
> I have already told DT bindings are not *normal code*. You can just
> replace existing ones with new optimised ones. You can only add the new
> (*optimised*) ones to the existing ones. You *need* to understand that
> concept first, otherwise there's not point in this endless discussion
> IMO.
>
> I will stop here as I will have to repeat whatever I have already
> mentioned to comment on your arguments below.
>
> In summary, I am not against improving the bindings if you think it's
> possible, but I don't see how it's more beneficially especially if we
> are going to support existing ones also. Mark has already given all the
> details.
>
ACK, thanks a lot for your time and the discussion so far, I really
appreciate it.
Regards,
Nick
On 2018-11-08 17:54, Mark Rutland wrote:
> On Thu, Nov 08, 2018 at 03:45:36PM +0200, Nick Kossifidis wrote:
>> On 2018-11-07 14:06, Mark Rutland wrote:
>> > On Wed, Nov 07, 2018 at 04:31:34AM +0200, Nick Kossifidis wrote:
>> > > Mark and Sundeep thanks a lot for your feedback, I guess you convinced
>> > > me that having a device tree binding for the scheduler is not a
>> > > correct approach. It's not a device after all and I agree that the
>> > > device tree shouldn't become an OS configuration file.
>> >
>> > Good to hear.
>> >
>> > > Regarding multiple levels of shared resources my point is that since
>> > > cpu-map doesn't contain any information of what is shared among the
>> > > cluster/core members it's not easy to do any further translation. Last
>> > > time I checked the arm code that uses cpu-map, it only defines one
>> > > domain for SMT, one for MC and then everything else is ignored. No
>> > > matter how many clusters have been defined, anything above the core
>> > > level is the same (and then I guess you started talking about adding
>> > > "packages" on the representation side).
>> >
>> > While cpu-map doesn't contain that information today, we can *add* that
>> > information to the cpu-map binding if necessary.
>> >
>> > > The reason I proposed to have a binding for the scheduler directly is
>> > > not only because it's simpler and closer to what really happens in the
>> > > code, it also makes more sense to me than the combination of cpu-map
>> > > with all the related mappings e.g. for numa or caches or power
>> > > domains etc.
>> > >
>> > > However you are right we could definitely augment cpu-map to include
>> > > support for what I'm saying and clean things up, and since you are
>> > > open about improving it here is a proposal that I hope you find
>> > > interesting:
>> > >
>> > > At first let's get rid of the <thread> nodes, they don't make sense:
>> > >
>> > > thread0 {
>> > > cpu = <&CPU0>;
>> > > };
>> > >
>> > > A thread node can't have more than one cpu entry and any properties
>> > > should be on the cpu node itself, so it doesn't / can't add any
>> > > more information. We could just have an array of cpu nodes on the
>> > > <core> node, it's much cleaner this way.
>> > >
>> > > core0 {
>> > > members = <&CPU0>, <&CPU1>;
>> > > };
>> >
>> > Hold on. Rather than reinventing things from first principles, can we
>> > please discuss what you want to *achieve*, i.e. what information you
>> > need?
>> >
>> > Having a node is not a significant cost, and there are reasons we may
>> > want thread nodes. For example, it means that we can always refer to any
>> > level of topology with a phandle, and we might want to describe
>> > thread-affine devices in future.
>>
>> You can use the phandle of the cpu node, the thread node doesn't add
>> anything more than complexity to the representation.
>
> That adds complexity elsewhere, because the phandle of a CPU node is
> not
> a child of the cpu-map node, and you can't simply walk up the tree
> hierarchy to find parent nodes.
>
> I see no reason to change this part of the binding. Given it's already
> out there, with existing parsing code, changing it only serves to add
> complexity to any common code which has to parse this.
>
> Can we please drop the idea of dropping the thread node?
>
>> > There are a tonne of existing bindings that are ugly, but re-inventing
>> > them for taste reasons alone is more costly to the ecosystem than simply
>> > using the existing bindings. We avoid re-inventing bindings unless there
>> > is a functional problem e.g. cases which they cannot possibly describe.
>>
>> We are talking about using something for RISC-V and possibly common
>> across
>> different archs and, I don't see why we should keep the ugliness of a
>> binding spec, plus in this case the <thread> node can't possibly
>> describe
>> anything else than a cpu=<node> alias, it's redundant.
>
> While it may be ugly, removing it only serves to add complexity for
> common parsing code, and removes flexibility that we may want in
> future.
> Its presence does not cause any technical problem.
>
> For better or worse we're all sharing the same codebase here. The
> common
> case matters, as does the complexity of the codebase as a whole.
>
> I realise it can be frustrating to have to use something you feel is
> sub-optimal, but putting up with a few nodes which you personally feel
> are redundant is of much lower cost to the ecosystem than having
> incompatible bindings where we could have one. If you put up with that,
> you can focus your efforts on more worthwhile technical exercises.
>
The only reason I mentioned the issue with the <thread> node is because
you said that you are open to improving cpu-map. I don't think it's
such a big deal and even if we decide against it, it's not a big deal
to be backwards compatible with what's already there. I fully understand
what you say about the impact on the ecosystem and agree with you.
>> > > Then let's allow the cluster and core nodes to accept attributes
>> > > that are
>> > > common for the cpus they contain. Right now this is considered
>> > > invalid.
>> > >
>> > > For power domains we have a generic binding described on
>> > > Documentation/devicetree/bindings/power/power_domain.txt
>> > > which basically says that we need to put power-domains = <power domain
>> > > specifiers>
>> > > attribute on each of the cpu nodes.
>> >
>> > FWIW, given this is arguably topological, I'm not personally averse to
>> > describing this in the cpu-map, if that actually gains us more than the
>> > complexity require to support it.
>> >
>> > Given we don't do this for device power domains, I suspect that it's
>> > simpler to stick with the existing binding.
>> >
>> > > The same happens with the capacity binding specified for arm on
>> > > Documentation/devicetree/bindings/arm/cpu-capacity.txt
>> > > which says we should add the capacity-dmips-mhz on each of the cpu
>> > > nodes.
>> >
>> > The cpu-map was intended to expose topological details, and this isn't
>> > really a topological property. For example, Arm DynamIQ systems can have
>> > heterogeneous CPUs within clusters.
>> >
>> > I do not think it's worth moving this, tbh.
>> >
>> > > The same also happens with the generic numa binding on
>> > > Documentation/devicetree/bindings/numa.txt
>> > > which says we should add the numa-node-id on each of the cpu nodes.
>> >
>> > Is there a strong gain from moving this?
>> >
>> > [...]
>>
>> Right now with the device tree spec and the above bindings we can
>> use the information from cpu nodes to infer the cache topology,
>> the memory topology, the power domain topology etc.
>>
>> We have of_find_next_cache_node and of_find_last_cache_level for
>> example
>> that use "next-level-cache" and are used on powerpc (where this spec
>> comes
>> from), that we can further build on top of them (since this is now
>> part
>> of the device tree spec anyway), to e.g. populate the levels of
>> cache,
>> the levels of shared cache and finally create cpu masks for the
>> different
>> cache sharing levels.
>
> The cpu-map binding does not describe cache topology. I never suggested
> that it did.
>
Sorry for the misunderstanding Mark !
On ARM the CPU topology is used for configuring the scheduling domain
topology (arch/arm/kernel/topology.c), and it blindly sets the
SD_SHARE_PKG_RESOURCES and SD_SHARE_POWERDOMAIN flags for a core without
taking into account the cache hierarchy (hence without validating that
SD_SHARE_PKG_RESOURCES is correct) or the power domains (but OK, I doubt
it's possible to have two threads on the same core that are powered
independently). It also supports only two scheduling domains, one for SMT
and another one for MC (hence my comments on multiple levels of shared
resources not being supported).
Since according to the docs cpu-map is a binding common between ARM and
ARM64 for describing the CPU topology, I assumed it was a part of that
process. It turns out it's only used on ARM64, where set_sched_topology()
is not used for configuring the scheduling domain topology (there is a
comment there that implies that you pass the clusters on to the
scheduler, which also led me to believe the issue is present there as
well). So my assumption was wrong and cpu-map has nothing to do with
configuring the scheduler and the issues I mentioned above.
>> This is standards-compliant, arch-independent (they are on of/base.c),
>> and
>> it provides a concrete hint on CPU topology rather grouping harts to
>> classes
>> like "core", "package", "socket", "cluster" etc that don't mean
>> anything
>> specific and we assume to map to specific levels of cache sharing.
>
> Please note that I have never suggested "package", and I'm not sure
> what
> that's in response to.
>
https://lkml.org/lkml/2018/11/7/918
[...]
>> In a sense it goes against your rule of "re-inventing them for taste
>> reasons alone is more costly to the ecosystem than simply using the
>> existing bindings", I fail to see anything else than "taste reasons"
>> when it comes to cpu-map, since there are existing bindings for
>> inferring the CPU topology already, they are just not that convenient.
>
> If you believe that's the case, then you can perfectly use the existing
> cache and NUMA bindings, and not describe the cpu topology at all, no
> need to change cpu-map or come up with a new binding.
>
The problem is that right now nobody is using the generic bindings
for configuring the scheduler domain topology. Only the numa bindings
are used as expected on ARM64. Since cpu-map is only there for
representing the cpu topology/layout, allowing them to exist there is
obviously not the right approach.
>> I'm proposing to add the option (not enforce) of adding those
>> attributes that are meant to be used on cpu nodes, on the various
>> groups/classes of the cpu-map, this way cpu-map will provide something
>> more meaningful and at least improve the representation side of
>> things. On the implementation side it might save us the burden of
>> infering the topology from parsing all cpu nodes each time. It's also
>> backwards compatible with what you already have, the only thing that's
>> not backwards compatible is the removal of the <thread> node, which I
>> don't think is such a big deal to fix.
>
> Sorry, but as I have stated repeatedly, this complicates the common
> code
> for what you admit is a matter of taste. I would rather maintain the
> simplicity of this binding and existing parsing code, and not
> potentially break other parsers, if the only cost is a few nodes which
> you personally consider to be redundant.
>
[...]
>> So we agree, the "core" doesn't say anything useful regarding the
>> topology, why keep using this distinction on a binding for RISC-V and
>> possibly other archs (since we are also talking on an arch-independent
>> approach) ?
>
> We keep it because the binding has already been finalised and is in use in
> practice. The existing binding and parsing code is *sufficient* for
> your
> needs. Personal taste and minor savings in the number of nodes in a DT
> are not compelling reasons to change this when the cost is complexity
> that we have to maintain *forever*.
>
Totally fine with that; as I mentioned above, I pointed these out since I
consider them an improvement. Forgive me if I gave you the impression
that it was a deal-breaker or something.
Regards,
Nick
On Fri, Nov 09, 2018 at 04:36:19AM +0200, Nick Kossifidis wrote:
> On 2018-11-08 18:48, Sudeep Holla wrote:
[...]
> we can keep it as is and mandate the <thread> nodes only for ARM64 as you
> suggested.
>
Fine by me.
[...]
> > We have reasons why we can't assume information about cache or power
> > domain topology from CPU topology. I have summarised them already and
> > we are not discussing.
>
> But that's what you do on arch/arm/kernel/topology.c, you assume
> information on power domain and cache topology based on the CPU topology
> when you define the scheduling domain topology levels.
>
NO, we don't assume any knowledge about power domain and cache topology
based on the CPU topology. We have updated scheduling domains for ACPI
based systems and plan to do the same for DT. The current code may not
be optimal.
> PowerPC also does the same on arch/powerpc/kernel/smp.c and basically if
> you track the usage of struct sched_domain_topology_level you'll see that
> every arch that uses it to instruct the scheduler about the scheduling
> domains, does it based on its CPU topology, which I think we both agree
> is wrong.
>
Again, that's arch specific, so I can't comment on anything other than ARM.
> > There may not be perfect bindings but there are
> > already supported and nothing is broken here to fix. DT bindings are
> > *not* same as code to fix it with a patch to the bindings themselves.
> > Once agreed and merged, they need to be treated like user ABI.
> >
>
> The only use of the devicetree spec bindings for caches if I'm not
> mistaken are used on your cacheinfo driver for exporting them to userspace
> through sysfs (thank you for cleaning this up btw). Even if we have the
> cache topology using the generic bindings, we are not doing anything with
> that information but export it to userspace.
>
OK, we may use LLC for sched domains in future, we do for ACPI systems.
> As for the power domain bindings, we have drivers/base/power/domain.c that
> handles the generic bindings and it's being used by some drivers to put
> devices to power domains (something used by the pm code), but from a quick
> search it's not used for cpu nodes and I didn't see this info being used
> for providing hints to the scheduler either. Even with the generic power
> domain bindings in place, for CPU idle states, on ARM we have another
> binding that's basically the same with the addition of "wakeup-latency-us".
>
OK, it's just an extension.
> For having different capacity there is no generic binding but I guess we
> could use ARM's.
>
Better.
> Of the generic bindings that relate to the scheduler's behavior, only the
> numa binding is used as expected and only by ARM64, powerpc and sparc use
> their own stuff.
>
Yes, PPC and SPARC have a lot of Open Firmware base, which is different
from DT.
> So there may be nothing broken regarding the generic / already existing
> bindings, but their support needs fixing. The way they are now we can't
> use them on RISC-V for configuring the scheduler and supporting SMT and
> MC properly + I'm not in favor of using CPU topology blindly as the
> other archs do.
>
Sure, if you want to improve support for the existing DT bindings that's
fine. Happy to see the patches.
DT bindings can be generic, you can still interpret and manage the sched
domains in the best way for RISC-V.
[...]
> I mentioned the script as a transitioning method, but you are right, it goes
> both ways.
>
Thanks :)
> But I never said that's "the only solution and everything else is broken" !
> I'm just proposing an extension to cpu-map since Mark suggested that it's
> possible. My goal is to try and consolidate cpu-map with what Atish proposed
> for RISC-V (so that we can use the same code) and point out some issues
> on how we use and define the CPU topology.
>
Good, we are in agreement here.
> ACK, thanks a lot for your time and the discussion so far, I really
> appreciate it.
>
No worries, glad if it helps.
--
Regards,
Sudeep