2023-08-14 09:44:56

by Thomas Gleixner

Subject: [patch V4 24/41] x86/cpu: Provide cpu_init/parse_topology()

Topology evaluation is a complete disaster and impenetrable mess. It's
scattered all over the place with some vendor implementations doing early
evaluation and some not. The most horrific part is the permanent
overwriting of smp_num_siblings and __max_die_per_package, instead of
establishing them once on the boot CPU and validating the result on the
APs.

The goals are:

- One topology evaluation entry point

- Proper sharing of pointlessly duplicated code

- Proper structuring of the evaluation logic and preferences.

- Evaluating important system wide information only once on the boot CPU

 - Making the 0xb/0x1f leaf parsing less convoluted and actually fixing
   the shortcomings of leaf 0x1f evaluation.

Start to consolidate the topology evaluation code by providing the entry
points for the early boot CPU evaluation and for the final parsing on the
boot CPU and the APs.

Move the trivial pieces into that new code:

- The initialization of cpuinfo_x86::topo

- The evaluation of CPUID leaf 1, which presets topo::initial_apicid

 - topo::apicid is set to topo::initial_apicid when invoked from early
   boot. When invoked for the final evaluation on the boot CPU it reads
   the actual APIC ID, which makes apic_get_initial_apicid() obsolete
   once everything is converted over.

Provide a temporary helper function topo_is_converted() which shields off
the not yet converted CPU vendors from invoking code which would break
them. This shielding covers all vendor CPUs which support SMP, but not the
historical pure UP ones, as they only need the topology info initialized
and possibly the initial APIC initialization.

Provide two new members in cpuinfo_x86::topo to store the maximum number of
SMT siblings and the number of dies per package and add them to the debugfs
readout. These two members will be used to populate this information on the
boot CPU and to validate the APs against it.
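
For illustration, on a fictional SMT2 system with 8 cores per package and
no module/tile/die levels enumerated (all values made up for this
example), the readout in /sys/kernel/debug/x86/topo/domains would look
like this:

  domain: Thread     shift: 1 dom_size:     2 max_threads:     2
  domain: Core       shift: 4 dom_size:     8 max_threads:    16
  domain: Module     shift: 4 dom_size:     1 max_threads:    16
  domain: Tile       shift: 4 dom_size:     1 max_threads:    16
  domain: Die        shift: 4 dom_size:     1 max_threads:    16
  domain: Package    shift: 4 dom_size:     1 max_threads:    16

A CPU with APIC ID 0x23 then resolves to pkg_id = 0x23 >> 4 = 2 and to
the package relative core_id = (0x23 & 0xf) >> 1 = 1.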

Signed-off-by: Thomas Gleixner <[email protected]>
Tested-by: Juergen Gross <[email protected]>
Tested-by: Sohil Mehta <[email protected]>
Tested-by: Michael Kelley <[email protected]>

---
V2: Make core ID package relative, not relative to the next level - Rui
---
 arch/x86/include/asm/topology.h       |   19 +++
 arch/x86/kernel/cpu/Makefile          |    3 
 arch/x86/kernel/cpu/common.c          |   23 +---
 arch/x86/kernel/cpu/cpu.h             |    6 +
 arch/x86/kernel/cpu/debugfs.c         |   37 ++++++
 arch/x86/kernel/cpu/topology.h        |   36 ++++++
 arch/x86/kernel/cpu/topology_common.c |  188 ++++++++++++++++++++++++++++++++++
 7 files changed, 295 insertions(+), 17 deletions(-)

--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -102,6 +102,25 @@ static inline void setup_node_to_cpumask

 #include <asm-generic/topology.h>
 
+/* Topology information */
+enum x86_topology_domains {
+	TOPO_SMT_DOMAIN,
+	TOPO_CORE_DOMAIN,
+	TOPO_MODULE_DOMAIN,
+	TOPO_TILE_DOMAIN,
+	TOPO_DIE_DOMAIN,
+	TOPO_PKG_DOMAIN,
+	TOPO_ROOT_DOMAIN,
+	TOPO_MAX_DOMAIN,
+};
+
+struct x86_topology_system {
+	unsigned int	dom_shifts[TOPO_MAX_DOMAIN];
+	unsigned int	dom_size[TOPO_MAX_DOMAIN];
+};
+
+extern struct x86_topology_system x86_topo_system;
+
 extern const struct cpumask *cpu_coregroup_mask(int cpu);
 extern const struct cpumask *cpu_clustergroup_mask(int cpu);

--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -17,7 +17,8 @@ KMSAN_SANITIZE_common.o := n
 # As above, instrumenting secondary CPU boot code causes boot hangs.
 KCSAN_SANITIZE_common.o := n
 
-obj-y			:= cacheinfo.o scattered.o topology.o
+obj-y			:= cacheinfo.o scattered.o
+obj-y			+= topology_common.o topology.o
 obj-y			+= common.o
 obj-y			+= rdrand.o
 obj-y			+= match.o
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1553,6 +1553,8 @@ static void __init early_identify_cpu(st
 	setup_force_cpu_cap(X86_FEATURE_CPUID);
 	cpu_parse_early_param();
 
+	cpu_init_topology(c);
+
 	if (this_cpu->c_early_init)
 		this_cpu->c_early_init(c);

@@ -1563,6 +1565,7 @@ static void __init early_identify_cpu(st
 		this_cpu->c_bsp_init(c);
 	} else {
 		setup_clear_cpu_cap(X86_FEATURE_CPUID);
+		cpu_init_topology(c);
 	}
 
 	setup_force_cpu_cap(X86_FEATURE_ALWAYS);
@@ -1708,18 +1711,6 @@ static void generic_identify(struct cpui

 	get_cpu_address_sizes(c);
 
-	if (c->cpuid_level >= 0x00000001) {
-		c->topo.initial_apicid = (cpuid_ebx(1) >> 24) & 0xFF;
-#ifdef CONFIG_X86_32
-# ifdef CONFIG_SMP
-		c->topo.apicid = apic->phys_pkg_id(c->topo.initial_apicid, 0);
-# else
-		c->topo.apicid = c->topo.initial_apicid;
-# endif
-#endif
-		c->topo.pkg_id = c->topo.initial_apicid;
-	}
-
 	get_model_name(c); /* Default name */

/*
@@ -1778,9 +1769,6 @@ static void identify_cpu(struct cpuinfo_
 	c->x86_model_id[0] = '\0'; /* Unset */
 	c->x86_max_cores = 1;
 	c->x86_coreid_bits = 0;
-	c->topo.cu_id = 0xff;
-	c->topo.llc_id = BAD_APICID;
-	c->topo.l2c_id = BAD_APICID;
 #ifdef CONFIG_X86_64
 	c->x86_clflush_size = 64;
 	c->x86_phys_bits = 36;
@@ -1799,6 +1787,8 @@ static void identify_cpu(struct cpuinfo_

 	generic_identify(c);
 
+	cpu_parse_topology(c);
+
 	if (this_cpu->c_identify)
 		this_cpu->c_identify(c);

@@ -1806,7 +1796,8 @@ static void identify_cpu(struct cpuinfo_
 	apply_forced_caps(c);
 
 #ifdef CONFIG_X86_64
-	c->topo.apicid = apic->phys_pkg_id(c->topo.initial_apicid, 0);
+	if (!topo_is_converted(c))
+		c->topo.apicid = apic->phys_pkg_id(c->topo.initial_apicid, 0);
 #endif

/*
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -2,6 +2,11 @@
 #ifndef ARCH_X86_CPU_H
 #define ARCH_X86_CPU_H
 
+#include <asm/cpu.h>
+#include <asm/topology.h>
+
+#include "topology.h"
+
 /* attempt to consolidate cpu attributes */
 struct cpu_dev {
 	const char	*c_vendor;
@@ -95,4 +100,5 @@ static inline bool spectre_v2_in_eibrs_m
 	       mode == SPECTRE_V2_EIBRS_RETPOLINE ||
 	       mode == SPECTRE_V2_EIBRS_LFENCE;
 }
+
 #endif /* ARCH_X86_CPU_H */
--- a/arch/x86/kernel/cpu/debugfs.c
+++ b/arch/x86/kernel/cpu/debugfs.c
@@ -5,6 +5,8 @@
 #include <asm/apic.h>
 #include <asm/processor.h>
 
+#include "cpu.h"
+
 static int cpu_debug_show(struct seq_file *m, void *p)
 {
 	unsigned long cpu = (unsigned long)m->private;
@@ -42,12 +44,47 @@ static const struct file_operations dfs_
 	.release	= single_release,
 };
 
+static int dom_debug_show(struct seq_file *m, void *p)
+{
+	static const char *domain_names[TOPO_ROOT_DOMAIN] = {
+		[TOPO_SMT_DOMAIN]	= "Thread",
+		[TOPO_CORE_DOMAIN]	= "Core",
+		[TOPO_MODULE_DOMAIN]	= "Module",
+		[TOPO_TILE_DOMAIN]	= "Tile",
+		[TOPO_DIE_DOMAIN]	= "Die",
+		[TOPO_PKG_DOMAIN]	= "Package",
+	};
+	unsigned int dom, nthreads = 1;
+
+	for (dom = 0; dom < TOPO_ROOT_DOMAIN; dom++) {
+		nthreads *= x86_topo_system.dom_size[dom];
+		seq_printf(m, "domain: %-10s shift: %u dom_size: %5u max_threads: %5u\n",
+			   domain_names[dom], x86_topo_system.dom_shifts[dom],
+			   x86_topo_system.dom_size[dom], nthreads);
+	}
+	return 0;
+}
+
+static int dom_debug_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, dom_debug_show, inode->i_private);
+}
+
+static const struct file_operations dfs_dom_ops = {
+	.open		= dom_debug_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
 static __init int cpu_init_debugfs(void)
 {
 	struct dentry *dir, *base = debugfs_create_dir("topo", arch_debugfs_dir);
 	unsigned long id;
 	char name[10];
 
+	debugfs_create_file("domains", 0444, base, NULL, &dfs_dom_ops);
+
 	dir = debugfs_create_dir("cpus", base);
 	for_each_possible_cpu(id) {
 		sprintf(name, "%lu", id);
--- /dev/null
+++ b/arch/x86/kernel/cpu/topology.h
@@ -0,0 +1,36 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef ARCH_X86_TOPOLOGY_H
+#define ARCH_X86_TOPOLOGY_H
+
+struct topo_scan {
+	struct cpuinfo_x86	*c;
+	unsigned int		dom_shifts[TOPO_MAX_DOMAIN];
+	unsigned int		dom_ncpus[TOPO_MAX_DOMAIN];
+};
+
+bool topo_is_converted(struct cpuinfo_x86 *c);
+void cpu_init_topology(struct cpuinfo_x86 *c);
+void cpu_parse_topology(struct cpuinfo_x86 *c);
+void topology_set_dom(struct topo_scan *tscan, enum x86_topology_domains dom,
+		      unsigned int shift, unsigned int ncpus);
+
+static inline u32 topo_shift_apicid(u32 apicid, enum x86_topology_domains dom)
+{
+	if (dom == TOPO_SMT_DOMAIN)
+		return apicid;
+	return apicid >> x86_topo_system.dom_shifts[dom - 1];
+}
+
+static inline u32 topo_relative_domain_id(u32 apicid, enum x86_topology_domains dom)
+{
+	if (dom != TOPO_SMT_DOMAIN)
+		apicid >>= x86_topo_system.dom_shifts[dom - 1];
+	return apicid & (x86_topo_system.dom_size[dom] - 1);
+}
+
+static inline u32 topo_domain_mask(enum x86_topology_domains dom)
+{
+	return (1U << x86_topo_system.dom_shifts[dom]) - 1;
+}
+
+#endif /* ARCH_X86_TOPOLOGY_H */
--- /dev/null
+++ b/arch/x86/kernel/cpu/topology_common.c
@@ -0,0 +1,188 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/cpu.h>
+
+#include <xen/xen.h>
+
+#include <asm/apic.h>
+#include <asm/processor.h>
+#include <asm/smp.h>
+
+#include "cpu.h"
+
+struct x86_topology_system x86_topo_system __ro_after_init;
+
+void topology_set_dom(struct topo_scan *tscan, enum x86_topology_domains dom,
+		      unsigned int shift, unsigned int ncpus)
+{
+	tscan->dom_shifts[dom] = shift;
+	tscan->dom_ncpus[dom] = ncpus;
+
+	/* Propagate to the upper levels */
+	for (dom++; dom < TOPO_MAX_DOMAIN; dom++) {
+		tscan->dom_shifts[dom] = tscan->dom_shifts[dom - 1];
+		tscan->dom_ncpus[dom] = tscan->dom_ncpus[dom - 1];
+	}
+}
+
+bool topo_is_converted(struct cpuinfo_x86 *c)
+{
+	/* Temporary until everything is converted over. */
+	switch (boot_cpu_data.x86_vendor) {
+	case X86_VENDOR_AMD:
+	case X86_VENDOR_CENTAUR:
+	case X86_VENDOR_INTEL:
+	case X86_VENDOR_HYGON:
+	case X86_VENDOR_ZHAOXIN:
+		return false;
+	default:
+		/* Let all UP systems use the below */
+		return true;
+	}
+}
+
+static bool fake_topology(struct topo_scan *tscan)
+{
+	/*
+	 * Preset the CORE level shift for CPUID less systems and XEN_PV,
+	 * which has useless CPUID information.
+	 */
+	topology_set_dom(tscan, TOPO_SMT_DOMAIN, 0, 1);
+	topology_set_dom(tscan, TOPO_CORE_DOMAIN, 1, 1);
+
+	return tscan->c->cpuid_level < 1 || xen_pv_domain();
+}
+
+static void parse_topology(struct topo_scan *tscan, bool early)
+{
+	const struct cpuinfo_topology topo_defaults = {
+		.cu_id			= 0xff,
+		.llc_id			= BAD_APICID,
+		.l2c_id			= BAD_APICID,
+	};
+	struct cpuinfo_x86 *c = tscan->c;
+	struct {
+		u32	unused0		: 16,
+			nproc		:  8,
+			apicid		:  8;
+	} ebx;
+
+	c->topo = topo_defaults;
+
+	if (fake_topology(tscan))
+		return;
+
+	/* Preset Initial APIC ID from CPUID leaf 1 */
+	cpuid_leaf_reg(1, CPUID_EBX, &ebx);
+	c->topo.initial_apicid = ebx.apicid;
+
+	/*
+	 * The initial invocation from early_identify_cpu() happens before
+	 * the APIC is mapped or X2APIC enabled. For establishing the
+	 * topology, that's not required. Use the initial APIC ID.
+	 */
+	if (early)
+		c->topo.apicid = c->topo.initial_apicid;
+	else
+		c->topo.apicid = read_apic_id();
+
+	/* The above is sufficient for UP */
+	if (!IS_ENABLED(CONFIG_SMP))
+		return;
+}
+
+static void topo_set_ids(struct topo_scan *tscan)
+{
+	struct cpuinfo_x86 *c = tscan->c;
+	u32 apicid = c->topo.apicid;
+
+	c->topo.pkg_id = topo_shift_apicid(apicid, TOPO_PKG_DOMAIN);
+	c->topo.die_id = topo_shift_apicid(apicid, TOPO_DIE_DOMAIN);
+
+	/* Package relative core ID */
+	c->topo.core_id = (apicid & topo_domain_mask(TOPO_PKG_DOMAIN)) >>
+		x86_topo_system.dom_shifts[TOPO_SMT_DOMAIN];
+}
+
+static void topo_set_max_cores(struct topo_scan *tscan)
+{
+	/*
+	 * Bug compatible for now. This is broken on hybrid systems:
+	 * 8 cores SMT + 8 cores w/o SMT
+	 * tscan.dom_ncpus[TOPO_CORE_DOMAIN] = 24; 24 / 2 = 12 !!
+	 *
+	 * Cannot be fixed without further topology enumeration changes.
+	 */
+	tscan->c->x86_max_cores = tscan->dom_ncpus[TOPO_CORE_DOMAIN] >>
+		x86_topo_system.dom_shifts[TOPO_SMT_DOMAIN];
+}
+
+void cpu_parse_topology(struct cpuinfo_x86 *c)
+{
+	unsigned int dom, cpu = smp_processor_id();
+	struct topo_scan tscan = { .c = c, };
+
+	parse_topology(&tscan, false);
+
+	if (!topo_is_converted(c))
+		return;
+
+	for (dom = TOPO_SMT_DOMAIN; dom < TOPO_MAX_DOMAIN; dom++) {
+		if (tscan.dom_shifts[dom] == x86_topo_system.dom_shifts[dom])
+			continue;
+		pr_err(FW_BUG "CPU%d: Topology domain %u shift %u != %u\n", cpu, dom,
+		       tscan.dom_shifts[dom], x86_topo_system.dom_shifts[dom]);
+	}
+
+	/* Bug compatible with the existing parsers */
+	if (tscan.dom_ncpus[TOPO_SMT_DOMAIN] > smp_num_siblings) {
+		if (system_state == SYSTEM_BOOTING) {
+			pr_warn_once("CPU%d: SMT detected and enabled late\n", cpu);
+			smp_num_siblings = tscan.dom_ncpus[TOPO_SMT_DOMAIN];
+		} else {
+			pr_warn_once("CPU%d: SMT detected after init. Too late!\n", cpu);
+		}
+	}
+
+	topo_set_ids(&tscan);
+	topo_set_max_cores(&tscan);
+}
+
+void __init cpu_init_topology(struct cpuinfo_x86 *c)
+{
+	struct topo_scan tscan = { .c = c, };
+	unsigned int dom, sft;
+
+	parse_topology(&tscan, true);
+
+	if (!topo_is_converted(c))
+		return;
+
+	/* Copy the shift values and calculate the unit sizes. */
+	memcpy(x86_topo_system.dom_shifts, tscan.dom_shifts, sizeof(x86_topo_system.dom_shifts));
+
+	dom = TOPO_SMT_DOMAIN;
+	x86_topo_system.dom_size[dom] = 1U << x86_topo_system.dom_shifts[dom];
+
+	for (dom++; dom < TOPO_MAX_DOMAIN; dom++) {
+		sft = x86_topo_system.dom_shifts[dom] - x86_topo_system.dom_shifts[dom - 1];
+		x86_topo_system.dom_size[dom] = 1U << sft;
+	}
+
+	topo_set_ids(&tscan);
+	topo_set_max_cores(&tscan);
+
+	/*
+	 * Bug compatible with the existing code. If the boot CPU does not
+	 * have SMT this ends up with one sibling. This needs way deeper
+	 * changes further down the road to get it right during early boot.
+	 */
+	smp_num_siblings = tscan.dom_ncpus[TOPO_SMT_DOMAIN];
+
+	/*
+	 * It's also not clear whether there are really as many dies as the
+	 * APIC space set aside for the die level indicates. Assume for now
+	 * that the actual number of CPUs gives a proper indication, to stay
+	 * bug compatible.
+	 */
+	__max_die_per_package = tscan.dom_ncpus[TOPO_DIE_DOMAIN] /
+				tscan.dom_ncpus[TOPO_DIE_DOMAIN - 1];
+}





2023-09-14 12:31:31

by K Prateek Nayak

Subject: Re: [patch V4 24/41] x86/cpu: Provide cpu_init/parse_topology()

Hello Thomas,

On 8/28/2023 7:58 PM, K Prateek Nayak wrote:
> Hello Thomas,
>
> On 8/28/2023 3:35 PM, Thomas Gleixner wrote:
>> Prateek!
>>
>> On Mon, Aug 28 2023 at 11:37, K. Prateek Nayak wrote:
>>> On 8/14/2023 2:24 PM, Thomas Gleixner wrote:
>>>
>>> Since these enums come from the description of the level types of CPUID
>>> leaf 0x1f, can we have a short description clarifying what each signifies?
>>> This will also help clarify the mappings for AMD's extended CPUID leaf
>>> 0x80000026 (specifically for CCX and CCD level types). I had the
>>> following in mind:
>>
>> Makes sense.
>>
>>>  	TOPO_MODULE_DOMAIN,
>>> +	/*
>>> +	 * If exists, represents a group of tiles within
>>> +	 * an instance of the next domain
>>> +	 *
>>> +	 * On Intel: This level contains a group of Tile
>>> +	 * type as described by CPUID leaf 0x1f
>>> +	 *
>>> +	 * On AMD: This is the group of "Complex" type
>>> +	 * instances as described by CPUID leaf
>>> +	 * 0x8000_0026
>>> +	 */
>>>  	TOPO_TILE_DOMAIN,
>>> +	/*
>>> +	 * If exists, represents a group of dies within an
>>> +	 * instance of the next domain
>>> +	 *
>>> +	 * On Intel: This level contains group of Die
>>> +	 * type as described by CPUID leaf 0x1f
>>> +	 *
>>> +	 * On AMD: This is the group of "CCD (Die)"
>>> +	 * type instances as described by CPUID leaf
>>> +	 * 0x8000_0026
>>> +	 */
>>>  	TOPO_DIE_DOMAIN,
>>> +	/*
>>> +	 * If exists, represents a group of packages
>>> +	 * within the root domain
>>> +	 */
>>>  	TOPO_PKG_DOMAIN,
>>> +	/* Topmost domain with a singular instance */
>>>  	TOPO_ROOT_DOMAIN,
>>>  	TOPO_MAX_DOMAIN,
>>>  };
>>
>> Now this begs the obvious question: what is the actual meaning of these
>> domains and what is their relevance for the kernel?
>>
>> It's probably undisputed what SMT/CORE mean and what their relevance is.
>> The PKG/DIE domains are pretty clear too.
>
> Yup! Those seem to be clear.
>
>>
>> Now we have:
>>
>> MODULE      (Intel only)
>>
>> TILE        Intel, AMD names it "Complex"
>
> So here is my interpretation of 0x1f since I could not find much in the
> SDM for these level types.

Turns out, there is indeed more information about these types in the SDM.
Intel SDM Vol. 3A, Sec. 9.9.1 "Hierarchical Mapping of Shared
Resources" contains a description of these levels. I'll quote it here for
ease of reference:

"""
The decomposition of an APIC_ID may consist of several sub fields
representing the topology within a physical processor package, the
higher-order bits of an APIC ID may also be used by cluster vendors to
represent the topology of cluster nodes of each coherent multiprocessor
systems:

o Cluster : Some multi-threading environments consists of multiple
            clusters of multi-processor systems. The CLUSTER_ID
            sub-field is usually supported by vendor firmware to
            distinguish different clusters. For nonclustered systems,
            CLUSTER_ID is usually 0 and system topology is reduced.

o Package : A physical processor package mates with a socket. A package
            may contain one or more software visible die. The PACKAGE_ID
            sub-field distinguishes different physical packages within a
            cluster.

o Die     : A software-visible chip inside a package. The DIE_ID
            sub-field distinguishes different die within a package. If
            there are no software visible die, the width of this bit
            field is 0.

o DieGrp  : A group of die that share certain resources.

o Tile    : A set of cores that share certain resources. The TILE_ID
            sub-field distinguishes different tiles. If there are no
            software visible tiles, the width of this bit field is 0.

o Module  : A set of cores that share certain resources. The MODULE_ID
            sub-field distinguishes different modules. If there are no
            software visible modules, the width of this bit field is 0.

o Core    : Processor cores may be contained within modules, within
            tiles, on software-visible die, or appear directly at the
            package domain. The CORE_ID sub-field distinguishes
            processor cores. For a single-core processor, the width of
            this bit field is 0.

o Logical Processor : A processor core provides one or more logical
            processors sharing execution resources. The
            LOGICAL_PROCESSOR_ID sub-field distinguishes logical
            processors in a core. The width of this bit field is
            non-zero if a processor core provides more than one logical
            processors.
"""

So some questions to Intel folks to determine whether mapping AMD's
Complex to Tile makes sense or not:

- What are the "certain resources" that a group of modules / tiles share?

- Module and Tile both use the phrase "set of cores" in their
description. Does this mean their existence is mutually exclusive?

AMD's Complex (CCX) marks the L3 cache boundary. If either of the
"certain resources" refers to the L3 cache, then Complex can be safely
mapped to the respective level without any fear of misinterpreting its
characteristics.

Also, I do not see a "DieGrp" domain in "x86_topology_domains". Is this
because it lacks the "software visible" aspect despite possibly existing?

> [..snip..]
--
Thanks and Regards,
Prateek

2023-09-15 13:50:58

by Thomas Gleixner

Subject: Re: [patch V4 24/41] x86/cpu: Provide cpu_init/parse_topology()

Prateek!

On Thu, Sep 14 2023 at 14:50, K. Prateek Nayak wrote:
> On 8/28/2023 7:58 PM, K Prateek Nayak wrote:
> o Logical Processor : A processor core provides one or more logical
> processors sharing execution resources. The
> LOGICAL_PROCESSOR_ID sub-field distinguishes logical
> processors in a core. The width of this bit field is
> non-zero if a processor core provides more than one logical
> processors.
> """
>
> So some questions to Intel folks to determine whether mapping AMD's
> Complex to Tile makes sense or not:
>
> - What are the "certain resources" a group of module / tile share?
>
> - Module and Tile both use the phrase "set of cores" in their
> description. Does this mean their existence is mutually exclusive?

That's definitely a good question.

> AMD's Complex (CCX) marks the L3 cache boundary. If either of the
> "certain resources" refer to L3 cache, then Complex can be safely mapped
> to the respective level without any fear of misinterpretation of the
> characteristics.

I don't think it's a good idea to try deducing cache hierarchy from the
basic topology. The boundaries have changed over time and AMD has made
it impossible on older CPUs to use the CPU topology for that.

I tried to do that and gave up, because I realized that we need both and
then have to do the proper association by combining the information.

> Also, I do not see a "DieGrp" domain in the "x86_topology_domains". Is
> this because of the lack of "software visible" aspect of it despite its
> possible existence?

No. It's subsumed by the package domain. Let me look at the mechanics
some more to make this more obvious.

Thanks,

tglx

2023-09-15 14:46:32

by Peter Zijlstra

Subject: Re: [patch V4 24/41] x86/cpu: Provide cpu_init/parse_topology()

On Tue, Aug 29, 2023 at 08:46:14AM +0530, K Prateek Nayak wrote:
> Hello Arjan,
>
> On 8/28/2023 8:04 PM, Arjan van de Ven wrote:
> > On 8/28/2023 7:28 AM, K Prateek Nayak wrote:
> >>>     - Are these really different between AMD and Intel or is this some
> >>>       naming convention issue which needs to be resolved?
> >>     They do have different characteristics since, on Sapphire
> >>     Rapids, the LLC is at a socket boundary despite having multiple
> >>     tiles. (Please correct me if I'm wrong, I'm going off of
> >>     llc_id shared in this report by Qiuxu Zhuo -
> >>     https://lore.kernel.org/all/[email protected]/)
> >>
> >
> > Sapphire reports itself as 1 tile though (since logically it is) as far as I know
> >
>
> I believe there are some variants with multiple tiles, at least the
> following press-release suggests that:
>
> https://www.intc.com/news-events/press-releases/detail/1598/intel-launches-4th-gen-xeon-scalable-processors-max-series
>
> specifically "... combining up to four Intel 7-built tiles on a single
> package, connected using Intel EMIB ...". Perhaps the one from Qiuxu
> Zhuo's report does not contain multiple tiles.

I think what Arjan was saying is that despite them being built using
multiple physical tiles, they describe themselves, in the topology leaf,
as being a single tile.

2023-09-15 18:56:30

by Arjan van de Ven

Subject: Re: [patch V4 24/41] x86/cpu: Provide cpu_init/parse_topology()

On 9/15/2023 4:54 AM, Peter Zijlstra wrote:
> On Tue, Aug 29, 2023 at 08:46:14AM +0530, K Prateek Nayak wrote:
>> Hello Arjan,
>>
>> On 8/28/2023 8:04 PM, Arjan van de Ven wrote:
>>> On 8/28/2023 7:28 AM, K Prateek Nayak wrote:
>>>>>     - Are these really different between AMD and Intel or is this some
>>>>>       naming convention issue which needs to be resolved?
>>>>     They do have different characteristics since, on Sapphire
>>>>     Rapids, the LLC is at a socket boundary despite having multiple
>>>>     tiles. (Please correct me if I'm wrong, I'm going off of
>>>>     llc_id shared in this report by Qiuxu Zhuo -
>>>>     https://lore.kernel.org/all/[email protected]/)
>>>>
>>>
>>> Sapphire reports itself as 1 tile though (since logically it is) as far as I know
>>>
>>
>> I believe there are some variants with multiple tiles, at least the
>> following press-release suggests that:
>>
>> https://www.intc.com/news-events/press-releases/detail/1598/intel-launches-4th-gen-xeon-scalable-processors-max-series
>>
>> specifically "... combining up to four Intel 7-built tiles on a single
>> package, connected using Intel EMIB ...". Perhaps the one from Qiuxu
>> Zhuo's report does not contain multiple tiles.
>
> I think what Arjan was saying is that despite them being built using
> multiple physical tiles, they describe themselves, in the topology leaf,
> as being a single tile.

and more than that -- from a software perspective, they truly act as if they are 1 tile
(you can do SNC to break that, sort of, but that's not the default and has its own list of downsides)


2023-09-19 10:53:10

by Thomas Gleixner

Subject: Re: [patch V4 24/41] x86/cpu: Provide cpu_init/parse_topology()

On Tue, Sep 19 2023 at 09:24, K Prateek Nayak wrote:
> If possible, can you please elaborate on the "software perspective"? Say
> CPUID leaf 0x1f reports multiple tiles, would the data access latency or
> cache to cache latency see a noticeable difference?
>
> I would like to understand what the characteristics of a "Tile" are and
> whether they are similar to AMD's CCX instances discoverable by AMD's
> extended CPUID leaf 0x80000026. That way, in future, when the generic
> topology is used by other subsystems, the data from "TOPO_TILE_DOMAIN"
> can be used generically for both Intel and AMD.

I'm not convinced that this is possible. The meaning of these elements
is unfortunately not cast in stone, so the association of e.g. cache
boundaries is not necessarily static across a larger range of CPU
generations.

We really need to differentiate performance characteristics, hardware
feature scope, power management scope etc. and create abstractions which
are actually useful for kernel facilities and user space.

From a performance perspective it's irrelevant whether the scope is TILE
or whatever. The information needed is the hierarchy, the performance
characteristics of a segment in the hierarchy and the costs for cross
segment access and cross segment migration.

For hardware features like perf the information needed is what the scope
of a particular resource is. Again it's not necessarily useful to know
what particular domain type it is.

Power management does not care about the particular type either. It needs
to know which scopes affect C-states, clock speeds ... to give the
scheduler information on how to steer workloads in the most efficient way.

Thanks,

tglx

2023-09-19 13:16:44

by K Prateek Nayak

Subject: Re: [patch V4 24/41] x86/cpu: Provide cpu_init/parse_topology()

Hello Arjan, Peter,

On 9/15/2023 7:34 PM, Arjan van de Ven wrote:
> On 9/15/2023 4:54 AM, Peter Zijlstra wrote:
>> On Tue, Aug 29, 2023 at 08:46:14AM +0530, K Prateek Nayak wrote:
>>> Hello Arjan,
>>>
>>> On 8/28/2023 8:04 PM, Arjan van de Ven wrote:
>>>> On 8/28/2023 7:28 AM, K Prateek Nayak wrote:
>>>>>>      - Are these really different between AMD and Intel or is this some
>>>>>>        naming convention issue which needs to be resolved?
>>>>>      They do have different characteristics since, on Sapphire
>>>>>      Rapids, the LLC is at a socket boundary despite having multiple
>>>>>      tiles. (Please correct me if I'm wrong, I'm going off of
>>>>>      llc_id shared in this report by Qiuxu Zhuo -
>>>>>      https://lore.kernel.org/all/[email protected]/)
>>>>>
>>>>
>>>> Sapphire reports itself as 1 tile though (since logically it is) as far as I know
>>>>
>>>
>>> I believe there are some variants with multiple tiles, at least the
>>> following press-release suggests that:
>>>
>>>    https://www.intc.com/news-events/press-releases/detail/1598/intel-launches-4th-gen-xeon-scalable-processors-max-series
>>>
>>> specifically "... combining up to four Intel 7-built tiles on a single
>>> package, connected using Intel EMIB ...". Perhaps the one from Qiuxu
>>> Zhuo's report does not contain multiple tiles.
>>
>> I think what Arjan was saying is that despite them being built using
>> multiple physical tiles, they describe themselves, in the topology leaf,
>> as being a single tile.
>
> and more than that -- from a software perspective, they truly act as if they are 1 tile

If possible, can you please elaborate on the "software perspective"? Say
CPUID leaf 0x1f reports multiple tiles, would the data access latency or
cache to cache latency see a noticeable difference?

I would like to understand what the characteristics of a "Tile" are and
whether they are similar to AMD's CCX instances discoverable by AMD's
extended CPUID leaf 0x80000026. That way, in future, when the generic
topology is used by other subsystems, the data from "TOPO_TILE_DOMAIN"
can be used generically for both Intel and AMD.

> (you can do SNC to break that, sort of, but that's not the default and has its own list of downsides)
>

Thank you for the basic clarification of the Tile instances.

--
Thanks and Regards,
Prateek

2023-09-19 21:09:05

by Arjan van de Ven

Subject: Re: [patch V4 24/41] x86/cpu: Provide cpu_init/parse_topology()


>>>> specifically "... combining up to four Intel 7-built tiles on a single
>>>> package, connected using Intel EMIB ...". Perhaps the one from Qiuxu
>>>> Zhuo's report does not contain multiple tiles.
>>>
>>> I think what Arjan was saying is that despite them being built using
>>> multiple physical tiles, they describe themselves, in the topology leaf,
>>> as being a single tile.
>>
>> and more than that -- from a software perspective, they truly act as if they are 1 tile
>
> If possible, can you please elaborate on the "software perspective"? Say
> CPUID leaf 0x1f reports multiple tiles, would the data access latency or
> cache to cache latency see a noticeable difference?

no. (not on SPR unless you turn on SNC, which is a whole different world)

>
> I would like to understand what the characteristics of a "Tile" are and
> whether they are similar to AMD's CCX instances discoverable by AMD's
> extended CPUID leaf 0x80000026. That way, in future, when the generic
> topology is used by other subsystems, the data from "TOPO_TILE_DOMAIN"
> can be used generically for both Intel and AMD.

SPR for all intents and purposes for software does not have tiles. So please
let's not design for that ;-)

The reality is that we really should not hardcode topology things to cache things.
Sure, today the tile is an L3 boundary for AMD, and on all non-tile systems by construction
of the topology tree.
But maybe some smart person at AMD decides
that for a next generation it's faster to split the L3 in half -- or to make that
extra HBM-like cache span 2 tiles or .. or ..

CPUID enumerates cache domains pretty much separately and that's a good thing.
We absolutely need a "cache view" of the system, but that is a mapping to topology,
not hardcoded in topology (so one level of indirection + of course cached/computed
bitmaps etc. for cheap access)
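
i.e. conceptually something like this (a made-up sketch, not actual
kernel structures), where the cache side only points at CPUs:

struct cache_domain {
	unsigned int		cache_level;	/* 2 = L2, 3 = L3, ... */
	unsigned int		id;		/* cache instance from CPUID */
	cpumask_var_t		shared_cpus;	/* computed once, then cached */
};

/* consumers ask the cache side, instead of hardcoding a topology level */
const struct cpumask *cache_domain_cpus(unsigned int cpu, unsigned int cache_level);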


2023-09-20 07:02:49

by K Prateek Nayak

[permalink] [raw]
Subject: Re: [patch V4 24/41] x86/cpu: Provide cpu_init/parse_topology()

Hello Thomas,

On 9/19/2023 1:43 PM, Thomas Gleixner wrote:
> On Tue, Sep 19 2023 at 09:24, K Prateek Nayak wrote:
>> If possible, can you please elaborate on the "software perspective"? Say
>> CPUID leaf 0x1f reports multiple tiles, would the data access latency or
>> cache to cache latency see a noticeable difference?
>>
>> I would like to understand what the characteristics of a "Tile" are and
>> whether they are similar to AMD's CCX instances discoverable by AMD's
>> extended CPUID leaf 0x80000026. That way, in future, when the generic
>> topology is used by other subsystems, the data from "TOPO_TILE_DOMAIN"
>> can be used generically for both Intel and AMD.
>
> I'm not convinced that this is possible. The meaning of these elements
> is unfortunately not cast in stone, so the association of e.g. cache
> boundaries is not necessarily static across a larger range of CPU
> generations.
>
> We really need to differentiate performance characteristics, hardware
> feature scope, power management scope etc. and create abstractions which
> are actually useful for kernel facilities and user space.
>
> From a performance perspective it's irrelevant whether the scope is TILE
> or whatever. The information needed is the hierarchy, the performance
> characteristics of a segment in the hierarchy and the costs for cross
> segment access and cross segment migration.

That makes sense. Thank you for the detailed explanation.

>
> For hardware features like perf the information needed is what the scope
> of a particular resource is. Again it's not necessarily useful to know
> what particular domain type it is.
>
> Power management does not care about the particular type either. It needs
> to know which scopes affect C-states, clock speeds ... to give the
> scheduler information on how to steer workloads in the most efficient way.
>
> Thanks,
>
> tglx

--
Thanks and Regards,
Prateek

2023-09-20 09:12:17

by K Prateek Nayak

Subject: Re: [patch V4 24/41] x86/cpu: Provide cpu_init/parse_topology()

Hello Arjan,

On 9/19/2023 7:14 PM, Arjan van de Ven wrote:
>
>>>>> specifically "... combining up to four Intel 7-built tiles on a single
>>>>> package, connected using Intel EMIB ...". Perhaps the one from Qiuxu
>>>>> Zhuo's report does not contain multiple tiles.
>>>>
>>>> I think what Arjan was saying is that despite them being built using
>>>> multiple physical tiles, they describe themselves, in the topology leaf,
>>>> as being a single tile.
>>>
>> and more than that -- from a software perspective, they truly act as if they are 1 tile
>>
>> If possible, can you please elaborate on the "software perspective"? Say
>> CPUID leaf 0x1f reports multiple tiles, would the data access latency or
>> cache to cache latency see a noticeable difference?
>
> no. (not on SPR unless you turn on SNC, which is a whole different world)
>
>>
>> I would like to understand what the characteristics of a "Tile" are and
>> whether they are similar to AMD's CCX instances discoverable by AMD's
>> extended CPUID leaf 0x80000026. That way, in future, when the generic
>> topology is used by other subsystems, the data from "TOPO_TILE_DOMAIN"
>> can be used generically for both Intel and AMD.
>
> SPR for all intents and purposes for software does not have tiles. So please
> let's not design for that ;-)
>
> The reality is that we really should not hardcode topology things to cache things.
> Sure, today the tile is an L3 boundary for AMD, and on all non-tile systems by construction
> of the topology tree.
> But maybe some smart person at AMD decides
> that for a next generation it's faster to split the L3 in half -- or to make that
> extra HBM-like cache span 2 tiles or .. or ..
>
> CPUID enumerates cache domains pretty much separately and that's a good thing.
> We absolutely need a "cache view" of the system, but that is a mapping to topology,
> not hardcoded in topology (so one level of indirection + of course cached/computed
> bitmaps etc. for cheap access)
>
>
Makes sense! I think that is the reason we have Cache Identifiers in the
first place. I'm satisfied with the explanation from Thomas
(https://lore.kernel.org/lkml/87y1h2wpfw.ffs@tglx/) as long as their use
in the future, in generic code, does not assume any characteristics that
are not generic to the whole of x86 :)

--
Thanks and Regards,
Prateek