2023-07-28 13:01:39

by Thomas Gleixner

[permalink] [raw]
Subject: [patch v2 21/38] x86/cpu: Provide cpu_init/parse_topology()

Topology evaluation is a complete disaster and impenetrable mess. It's
scattered all over the place with some vendor implementations doing early
evaluation and some not. The most horrific part is the permanent
overwriting of smp_num_siblings and __max_die_per_package, instead of
establishing them once on the boot CPU and validating the result on the
APs.

The goals are:

- One topology evaluation entry point

- Proper sharing of pointlessly duplicated code

- Proper structuring of the evaluation logic and preferences.

- Evaluating important system wide information only once on the boot CPU

- Making the 0xb/0x1f leaf parsing less convoluted and actually fixing
the shortcomings of leaf 0x1f evaluation.

Start to consolidate the topology evaluation code by providing the entry
points for the early boot CPU evaluation and for the final parsing on the
boot CPU and the APs.

Move the trivial pieces into that new code:

- The initialization of cpuinfo_x86::topo

- The evaluation of CPUID leaf 1, which presets topo::initial_apicid

- topo::apicid is set to topo::initial_apicid when invoked from early
boot. When invoked for the final evaluation on the boot CPU it reads
the actual APIC ID, which makes apic_get_initial_apicid() obsolete
once everything is converted over.

Provide a temporary helper function topo_is_converted() which shields off the
not yet converted CPU vendors from invoking code which would break them.
This shielding covers all vendor CPUs which support SMP, but not the
historical pure UP ones as they only need the topology info init and
eventually the initial APIC initialization.

Provide two new members in cpuinfo_x86::topo to store the maximum number of
SMT siblings and the number of dies per package and add them to the debugfs
readout. These two members will be used to populate this information on the
boot CPU and to validate the APs against it.

Signed-off-by: Thomas Gleixner <[email protected]>
---
arch/x86/include/asm/topology.h | 19 +++
arch/x86/kernel/cpu/Makefile | 3
arch/x86/kernel/cpu/common.c | 23 +---
arch/x86/kernel/cpu/cpu.h | 6 +
arch/x86/kernel/cpu/debugfs.c | 37 ++++++
arch/x86/kernel/cpu/topology.h | 32 +++++
arch/x86/kernel/cpu/topology_common.c | 187 ++++++++++++++++++++++++++++++++++
7 files changed, 290 insertions(+), 17 deletions(-)

--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -102,6 +102,25 @@ static inline void setup_node_to_cpumask

#include <asm-generic/topology.h>

+/* Topology information */
+enum x86_topology_domains {
+ TOPO_SMT_DOMAIN,
+ TOPO_CORE_DOMAIN,
+ TOPO_MODULE_DOMAIN,
+ TOPO_TILE_DOMAIN,
+ TOPO_DIE_DOMAIN,
+ TOPO_PKG_DOMAIN,
+ TOPO_ROOT_DOMAIN,
+ TOPO_MAX_DOMAIN,
+};
+
+struct x86_topology_system {
+ unsigned int dom_shifts[TOPO_MAX_DOMAIN];
+ unsigned int dom_size[TOPO_MAX_DOMAIN];
+};
+
+extern struct x86_topology_system x86_topo_system;
+
extern const struct cpumask *cpu_coregroup_mask(int cpu);
extern const struct cpumask *cpu_clustergroup_mask(int cpu);

--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -17,7 +17,8 @@ KMSAN_SANITIZE_common.o := n
# As above, instrumenting secondary CPU boot code causes boot hangs.
KCSAN_SANITIZE_common.o := n

-obj-y := cacheinfo.o scattered.o topology.o
+obj-y := cacheinfo.o scattered.o
+obj-y += topology_common.o topology.o
obj-y += common.o
obj-y += rdrand.o
obj-y += match.o
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1553,6 +1553,8 @@ static void __init early_identify_cpu(st
setup_force_cpu_cap(X86_FEATURE_CPUID);
cpu_parse_early_param();

+ cpu_init_topology(c);
+
if (this_cpu->c_early_init)
this_cpu->c_early_init(c);

@@ -1563,6 +1565,7 @@ static void __init early_identify_cpu(st
this_cpu->c_bsp_init(c);
} else {
setup_clear_cpu_cap(X86_FEATURE_CPUID);
+ cpu_init_topology(c);
}

setup_force_cpu_cap(X86_FEATURE_ALWAYS);
@@ -1708,18 +1711,6 @@ static void generic_identify(struct cpui

get_cpu_address_sizes(c);

- if (c->cpuid_level >= 0x00000001) {
- c->topo.initial_apicid = (cpuid_ebx(1) >> 24) & 0xFF;
-#ifdef CONFIG_X86_32
-# ifdef CONFIG_SMP
- c->topo.apicid = apic->phys_pkg_id(c->topo.initial_apicid, 0);
-# else
- c->topo.apicid = c->topo.initial_apicid;
-# endif
-#endif
- c->topo.pkg_id = c->topo.initial_apicid;
- }
-
get_model_name(c); /* Default name */

/*
@@ -1778,9 +1769,6 @@ static void identify_cpu(struct cpuinfo_
c->x86_model_id[0] = '\0'; /* Unset */
c->x86_max_cores = 1;
c->x86_coreid_bits = 0;
- c->topo.cu_id = 0xff;
- c->topo.llc_id = BAD_APICID;
- c->topo.l2c_id = BAD_APICID;
#ifdef CONFIG_X86_64
c->x86_clflush_size = 64;
c->x86_phys_bits = 36;
@@ -1799,6 +1787,8 @@ static void identify_cpu(struct cpuinfo_

generic_identify(c);

+ cpu_parse_topology(c);
+
if (this_cpu->c_identify)
this_cpu->c_identify(c);

@@ -1806,7 +1796,8 @@ static void identify_cpu(struct cpuinfo_
apply_forced_caps(c);

#ifdef CONFIG_X86_64
- c->topo.apicid = apic->phys_pkg_id(c->topo.initial_apicid, 0);
+ if (!topo_is_converted(c))
+ c->topo.apicid = apic->phys_pkg_id(c->topo.initial_apicid, 0);
#endif

/*
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -2,6 +2,11 @@
#ifndef ARCH_X86_CPU_H
#define ARCH_X86_CPU_H

+#include <asm/cpu.h>
+#include <asm/topology.h>
+
+#include "topology.h"
+
/* attempt to consolidate cpu attributes */
struct cpu_dev {
const char *c_vendor;
@@ -95,4 +100,5 @@ static inline bool spectre_v2_in_eibrs_m
mode == SPECTRE_V2_EIBRS_RETPOLINE ||
mode == SPECTRE_V2_EIBRS_LFENCE;
}
+
#endif /* ARCH_X86_CPU_H */
--- a/arch/x86/kernel/cpu/debugfs.c
+++ b/arch/x86/kernel/cpu/debugfs.c
@@ -5,6 +5,8 @@
#include <asm/apic.h>
#include <asm/processor.h>

+#include "cpu.h"
+
static int cpu_debug_show(struct seq_file *m, void *p)
{
unsigned long cpu = (unsigned long)m->private;
@@ -43,12 +45,47 @@ static const struct file_operations dfs_
.release = single_release,
};

+static int dom_debug_show(struct seq_file *m, void *p)
+{
+ static const char *domain_names[TOPO_ROOT_DOMAIN] = {
+ [TOPO_SMT_DOMAIN] = "Thread",
+ [TOPO_CORE_DOMAIN] = "Core",
+ [TOPO_MODULE_DOMAIN] = "Module",
+ [TOPO_TILE_DOMAIN] = "Tile",
+ [TOPO_DIE_DOMAIN] = "Die",
+ [TOPO_PKG_DOMAIN] = "Package",
+ };
+ unsigned int dom, nthreads = 1;
+
+ for (dom = 0; dom < TOPO_ROOT_DOMAIN; dom++) {
+ nthreads *= x86_topo_system.dom_size[dom];
+ seq_printf(m, "domain: %-10s shift: %u dom_size: %5u max_threads: %5u\n",
+ domain_names[dom], x86_topo_system.dom_shifts[dom],
+ x86_topo_system.dom_size[dom], nthreads);
+ }
+ return 0;
+}
+
+static int dom_debug_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, dom_debug_show, inode->i_private);
+}
+
+static const struct file_operations dfs_dom_ops = {
+ .open = dom_debug_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
static __init int cpu_init_debugfs(void)
{
struct dentry *dir, *base = debugfs_create_dir("topo", arch_debugfs_dir);
unsigned long id;
char name [10];

+ debugfs_create_file("domains", 0444, base, NULL, &dfs_dom_ops);
+
dir = debugfs_create_dir("cpus", base);
for_each_possible_cpu(id) {
sprintf(name, "%lu", id);
--- /dev/null
+++ b/arch/x86/kernel/cpu/topology.h
@@ -0,0 +1,32 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef ARCH_X86_TOPOLOGY_H
+#define ARCH_X86_TOPOLOGY_H
+
+struct topo_scan {
+ struct cpuinfo_x86 *c;
+ unsigned int dom_shifts[TOPO_MAX_DOMAIN];
+ unsigned int dom_ncpus[TOPO_MAX_DOMAIN];
+
+};
+
+bool topo_is_converted(struct cpuinfo_x86 *c);
+void cpu_init_topology(struct cpuinfo_x86 *c);
+void cpu_parse_topology(struct cpuinfo_x86 *c);
+void topology_set_dom(struct topo_scan *tscan, enum x86_topology_domains dom,
+ unsigned int shift, unsigned int ncpus);
+
+static inline u32 topo_shift_apicid(u32 apicid, enum x86_topology_domains dom)
+{
+ if (dom == TOPO_SMT_DOMAIN)
+ return apicid;
+ return apicid >> x86_topo_system.dom_shifts[dom - 1];
+}
+
+static inline u32 topo_relative_domain_id(u32 apicid, enum x86_topology_domains dom)
+{
+ if (dom != TOPO_SMT_DOMAIN)
+ apicid >>= x86_topo_system.dom_shifts[dom - 1];
+ return apicid & (x86_topo_system.dom_size[dom] - 1);
+}
+
+#endif /* ARCH_X86_TOPOLOGY_H */
--- /dev/null
+++ b/arch/x86/kernel/cpu/topology_common.c
@@ -0,0 +1,187 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/cpu.h>
+
+#include <xen/xen.h>
+
+#include <asm/apic.h>
+#include <asm/processor.h>
+#include <asm/smp.h>
+
+#include "cpu.h"
+
+struct x86_topology_system x86_topo_system __ro_after_init;
+
+void topology_set_dom(struct topo_scan *tscan, enum x86_topology_domains dom,
+ unsigned int shift, unsigned int ncpus)
+{
+ tscan->dom_shifts[dom] = shift;
+ tscan->dom_ncpus[dom] = ncpus;
+
+ /* Propagate to the upper levels */
+ for (dom++; dom < TOPO_MAX_DOMAIN; dom++) {
+ tscan->dom_shifts[dom] = tscan->dom_shifts[dom - 1];
+ tscan->dom_ncpus[dom] = tscan->dom_ncpus[dom - 1];
+ }
+}
+
+bool topo_is_converted(struct cpuinfo_x86 *c)
+{
+ /* Temporary until everything is converted over. */
+ switch (boot_cpu_data.x86_vendor) {
+ case X86_VENDOR_AMD:
+ case X86_VENDOR_CENTAUR:
+ case X86_VENDOR_INTEL:
+ case X86_VENDOR_HYGON:
+ case X86_VENDOR_ZHAOXIN:
+ return false;
+ default:
+ /* Let all UP systems use the below */
+ return true;
+ }
+}
+
+static bool fake_topology(struct topo_scan *tscan)
+{
+ /*
+ * Preset the CORE level shift for CPUID less systems and XEN_PV,
+ * which has useless CPUID information.
+ */
+ topology_set_dom(tscan, TOPO_SMT_DOMAIN, 0, 1);
+ topology_set_dom(tscan, TOPO_CORE_DOMAIN, 1, 1);
+
+ return tscan->c->cpuid_level < 1 || xen_pv_domain();
+}
+
+static void parse_topology(struct topo_scan *tscan, bool early)
+{
+ const struct cpuinfo_topology topo_defaults = {
+ .cu_id = 0xff,
+ .llc_id = BAD_APICID,
+ .l2c_id = BAD_APICID,
+ };
+ struct cpuinfo_x86 *c = tscan->c;
+ struct {
+ u32 unused0 : 16,
+ nproc : 8,
+ apicid : 8;
+ } ebx;
+
+ c->topo = topo_defaults;
+
+ if (fake_topology(tscan))
+ return;
+
+ /* Preset Initial APIC ID from CPUID leaf 1 */
+ cpuid_leaf_reg(1, CPUID_EBX, &ebx);
+ c->topo.initial_apicid = ebx.apicid;
+
+ /*
+ * The initial invocation from early_identify_cpu() happens before
+ * the APIC is mapped or X2APIC enabled. For establishing the
+ * topology, that's not required. Use the initial APIC ID.
+ */
+ if (early)
+ c->topo.apicid = c->topo.initial_apicid;
+ else
+ c->topo.apicid = read_apic_id();
+
+ /* The above is sufficient for UP */
+ if (!IS_ENABLED(CONFIG_SMP))
+ return;
+}
+
+static void topo_set_ids(struct topo_scan *tscan)
+{
+ struct cpuinfo_x86 *c = tscan->c;
+ u32 apicid = c->topo.apicid;
+
+ c->topo.pkg_id = topo_shift_apicid(apicid, TOPO_ROOT_DOMAIN);
+ c->topo.die_id = topo_shift_apicid(apicid, TOPO_DIE_DOMAIN);
+
+ /* Relative core ID */
+ c->topo.core_id = topo_relative_domain_id(apicid, TOPO_CORE_DOMAIN);
+}
+
+static void topo_set_max_cores(struct topo_scan *tscan)
+{
+ /*
+ * Bug compatible for now. This is broken on hybrid systems:
+ * 8 cores SMT + 8 cores w/o SMT
+ * tscan.dom_ncpus[TOPO_CORE_DOMAIN] = 24; 24 / 2 = 12 !!
+ *
+ * Cannot be fixed without further topology enumeration changes.
+ */
+ tscan->c->x86_max_cores = tscan->dom_ncpus[TOPO_CORE_DOMAIN] >>
+ x86_topo_system.dom_shifts[TOPO_SMT_DOMAIN];
+}
+
+void cpu_parse_topology(struct cpuinfo_x86 *c)
+{
+ unsigned int dom, cpu = smp_processor_id();
+ struct topo_scan tscan = { .c = c, };
+
+ parse_topology(&tscan, false);
+
+ if (!topo_is_converted(c))
+ return;
+
+ for (dom = TOPO_SMT_DOMAIN; dom < TOPO_MAX_DOMAIN; dom++) {
+ if (tscan.dom_shifts[dom] == x86_topo_system.dom_shifts[dom])
+ continue;
+ pr_err(FW_BUG "CPU%d: Topology domain %u shift %u != %u\n", cpu, dom,
+ tscan.dom_shifts[dom], x86_topo_system.dom_shifts[dom]);
+ }
+
+ /* Bug compatible with the existing parsers */
+ if (tscan.dom_ncpus[TOPO_SMT_DOMAIN] > smp_num_siblings) {
+ if (system_state == SYSTEM_BOOTING) {
+ pr_warn_once("CPU%d: SMT detected and enabled late\n", cpu);
+ smp_num_siblings = tscan.dom_ncpus[TOPO_SMT_DOMAIN];
+ } else {
+ pr_warn_once("CPU%d: SMT detected after init. Too late!\n", cpu);
+ }
+ }
+
+ topo_set_ids(&tscan);
+ topo_set_max_cores(&tscan);
+}
+
+void __init cpu_init_topology(struct cpuinfo_x86 *c)
+{
+ struct topo_scan tscan = { .c = c, };
+ unsigned int dom, sft;
+
+ parse_topology(&tscan, true);
+
+ if (!topo_is_converted(c))
+ return;
+
+ /* Copy the shift values and calculate the unit sizes. */
+ memcpy(x86_topo_system.dom_shifts, tscan.dom_shifts, sizeof(x86_topo_system.dom_shifts));
+
+ dom = TOPO_SMT_DOMAIN;
+ x86_topo_system.dom_size[dom] = 1U << x86_topo_system.dom_shifts[dom];
+
+ for (dom++; dom < TOPO_MAX_DOMAIN; dom++) {
+ sft = x86_topo_system.dom_shifts[dom] - x86_topo_system.dom_shifts[dom - 1];
+ x86_topo_system.dom_size[dom] = 1U << sft;
+ }
+
+ topo_set_ids(&tscan);
+ topo_set_max_cores(&tscan);
+
+ /*
+ * Bug compatible with the existing code. If the boot CPU does not
+ * have SMT this ends up with one sibling. This needs way deeper
+ * changes further down the road to get it right during early boot.
+ */
+ smp_num_siblings = tscan.dom_ncpus[TOPO_SMT_DOMAIN];
+
+ /*
+ * Nor is it clear whether there are as many dies as the APIC ID
+ * space at the die level indicates. But assume that the actual number
+ * of CPUs gives a proper indication for now to stay bug compatible.
+ */
+ __max_die_per_package = tscan.dom_ncpus[TOPO_DIE_DOMAIN] /
+ tscan.dom_ncpus[TOPO_DIE_DOMAIN - 1];
+}



2023-07-29 01:38:19

by Sohil Mehta

[permalink] [raw]
Subject: Re: [patch v2 21/38] x86/cpu: Provide cpu_init/parse_topology()

On 7/28/2023 5:13 AM, Thomas Gleixner wrote:
> Topology evaluation is a complete disaster and impenetrable mess. It's
> scattered all over the place with some vendor implementatins doing early

s/implementatins/implementations

> +static void parse_topology(struct topo_scan *tscan, bool early)
> +{
> + const struct cpuinfo_topology topo_defaults = {
> + .cu_id = 0xff,
> + .llc_id = BAD_APICID,
> + .l2c_id = BAD_APICID,
> + };
> + struct cpuinfo_x86 *c = tscan->c;
> + struct {
> + u32 unused0 : 16,
> + nproc : 8,
> + apicid : 8;
> + } ebx;
> +
> + c->topo = topo_defaults;
> +
> + if (fake_topology(tscan))
> + return;
> +

Spaces used for indenting "return" instead of a tab.



2023-07-31 04:58:52

by Michael Kelley (LINUX)

[permalink] [raw]
Subject: RE: [patch v2 21/38] x86/cpu: Provide cpu_init/parse_topology()

From: Thomas Gleixner <[email protected]> Sent: Friday, July 28, 2023 5:13 AM
>
> Topology evaluation is a complete disaster and impenetrable mess. It's
> scattered all over the place with some vendor implementatins doing early
> evaluation and some not. The most horrific part is the permanent
> overwriting of smt_max_siblings and __max_die_per_package, instead of
> establishing them once on the boot CPU and validating the result on the
> APs.
>
> The goals are:
>
> - One topology evaluation entry point
>
> - Proper sharing of pointlessly duplicated code
>
> - Proper structuring of the evaluation logic and preferences.
>
> - Evaluating important system wide information only once on the boot CPU
>
> - Making the 0xb/0x1f leaf parsing less convoluted and actually fixing
> the short comings of leaf 0x1f evaluation.
>
> Start to consolidate the topology evaluation code by providing the entry
> points for the early boot CPU evaluation and for the final parsing on the
> boot CPU and the APs.
>
> Move the trivial pieces into that new code:
>
> - The initialization of cpuinfo_x86::topo
>
> - The evaluation of CPUID leaf 1, which presets topo::initial_apicid
>
> - topo_apicid is set to topo::initial_apicid when invoked from early
> boot. When invoked for the final evaluation on the boot CPU it reads
> the actual APIC ID, which makes apic_get_initial_apicid() obsolete
> once everything is converted over.

>
> Provide a temporary helper function topo_converted() which shields off the
> not yet converted CPU vendors from invoking code which would break them.
> This shielding covers all vendor CPUs which support SMP, but not the
> historical pure UP ones as they only need the topology info init and
> eventually the initial APIC initialization.
>
> Provide two new members in cpuinfo_x86::topo to store the maximum number of
> SMT siblings and the number of dies per package and add them to the debugfs
> readout. These two members will be used to populate this information on the
> boot CPU and to validate the APs against it.
>
> Signed-off-by: Thomas Gleixner <[email protected]>
> ---
> arch/x86/include/asm/topology.h | 19 +++
> arch/x86/kernel/cpu/Makefile | 3
> arch/x86/kernel/cpu/common.c | 23 +---
> arch/x86/kernel/cpu/cpu.h | 6 +
> arch/x86/kernel/cpu/debugfs.c | 37 ++++++
> arch/x86/kernel/cpu/topology.h | 32 +++++
> arch/x86/kernel/cpu/topology_common.c | 187
> ++++++++++++++++++++++++++++++++++
> 7 files changed, 290 insertions(+), 17 deletions(-)
>

[snip]

> +
> +static void parse_topology(struct topo_scan *tscan, bool early)
> +{
> + const struct cpuinfo_topology topo_defaults = {
> + .cu_id = 0xff,
> + .llc_id = BAD_APICID,
> + .l2c_id = BAD_APICID,
> + };
> + struct cpuinfo_x86 *c = tscan->c;
> + struct {
> + u32 unused0 : 16,
> + nproc : 8,
> + apicid : 8;
> + } ebx;
> +
> + c->topo = topo_defaults;
> +
> + if (fake_topology(tscan))
> + return;
> +
> + /* Preset Initial APIC ID from CPUID leaf 1 */
> + cpuid_leaf_reg(1, CPUID_EBX, &ebx);
> + c->topo.initial_apicid = ebx.apicid;
> +
> + /*
> + * The initial invocation from early_identify_cpu() happens before
> + * the APIC is mapped or X2APIC enabled. For establishing the
> + * topology, that's not required. Use the initial APIC ID.
> + */
> + if (early)
> + c->topo.apicid = c->topo.initial_apicid;
> + else
> + c->topo.apicid = read_apic_id();

Using the value from the local APIC ID reg turns out to cause a problem in
some Hyper-V VM configurations. If a VM has multiple L3 caches (probably
due to multiple NUMA nodes) and the # of CPUs in the span of the L3 cache
is not a power of 2, the APIC IDs for the CPUs in the span of the 1st L3 cache
are sequential starting with 0. But then there is a gap before starting the
APIC IDs for the CPUs in the span of the 2nd L3 cache. The gap is
repeated if there are additional L3 caches.

The CPUID instruction executed on a guest vCPU correctly reports the APIC
IDs. However, the ACPI MADT assigns the APIC IDs sequentially with no
gaps, and the guest firmware sets the APIC_ID register for each local APIC
to match the MADT. When parse_topology() sets the apicid field based on
reading the local APIC ID register, the value it sets is different from the
initial_apicid value for CPUs in the span of the 2nd and subsequent L3
caches, because there's no gap in the APIC IDs read from the local APIC.
Linux boots and runs, but the topology is set up with the wrong span for
the L3 cache and for the associated scheduling domains.

The old code derives the apicid from the initial_apicid via the
phys_pkg_id() callback, so these bad Hyper-V VM configs skate by. The
wrong value in the local APIC ID register and MADT does not affect
anything, except that the check in validate_apic_and_package_id() fails
during boot, and a set of "Firmware bug:" messages is correctly output.

Three thoughts:

1) Are Hyper-V VMs the only place where the local APIC ID register might
have a bogus value? Probably so, but you never know what might crawl out.

2) The natural response is "Well, fix Hyper-V!" I first had this conversation
with the Hyper-V team about 5 years ago. Some cases of the problem were
fixed, but some cases remain unfixed. It's a long story.

3) Since Hyper-V code in Linux already has an override for the apic->read()
function, it's possible to do a hack in that override so that apicid gets set to
the same value as initial_apicid, which matches the old code. Here's the diff:

diff --git a/arch/x86/hyperv/hv_apic.c b/arch/x86/hyperv/hv_apic.c
index 72d9931da3a2..2e7b18557186 100644
--- a/arch/x86/hyperv/hv_apic.c
+++ b/arch/x86/hyperv/hv_apic.c
@@ -58,6 +58,8 @@ static u32 hv_apic_read(u32 reg)
u32 reg_val, hi;

switch (reg) {
+ case APIC_ID:
+ return __this_cpu_read(cpu_info.topo.initial_apicid) << 24;
case APIC_EOI:
rdmsr(HV_X64_MSR_EOI, reg_val, hi);
(void)hi;
@@ -311,6 +313,7 @@ void __init hv_apic_init(void)
* both xapic and x2apic because the field layout is the same.
*/
apic_update_callback(eoi, hv_apic_eoi_write);
+ apic->apic_id_registered = NULL;
if (!x2apic_enabled()) {
apic_update_callback(read, hv_apic_read);
apic_update_callback(write, hv_apic_write);

Setting apic->apic_id_registered to NULL is necessary because it does
read_apic_id() and checks that the value matches an APIC ID that was
registered when the MADT was parsed. This test fails for some vCPUs
in the VM because the APIC IDs from the MADT are also sequential
with no gaps as mentioned above. I don't see any big hazard in
bypassing the check.

The hv_apic_read() override is used only in VMs with an xapic.
I still need to check a few things, but I believe Hyper-V gets
MADT and local APIC ID reg numbering correct when an x2apic
is used, so I don't think any hacks are needed for that path.

Does anyone have suggestions on a different way to handle
this that's better than the above diff? Other thoughts?

Michael

2023-07-31 13:20:01

by Thomas Gleixner

[permalink] [raw]
Subject: RE: [patch v2 21/38] x86/cpu: Provide cpu_init/parse_topology()

On Mon, Jul 31 2023 at 04:05, Michael Kelley wrote:
>> + /*
>> + * The initial invocation from early_identify_cpu() happens before
>> + * the APIC is mapped or X2APIC enabled. For establishing the
>> + * topology, that's not required. Use the initial APIC ID.
>> + */
>> + if (early)
>> + c->topo.apicid = c->topo.initial_apicid;
>> + else
>> + c->topo.apicid = read_apic_id();
>
> Using the value from the local APIC ID reg turns out to cause a problem in
> some Hyper-V VM configurations. If a VM has multiple L3 caches (probably
> due to multiple NUMA nodes) and the # of CPUs in the span of the L3 cache
> is not a power of 2, the APIC IDs for the CPUs in the span of the 1st L3 cache
> are sequential starting with 0. But then there is a gap before starting the
> APIC IDs for the CPUs in the span of the 2nd L3 cache. The gap is
> repeated if there are additional L3 caches.
>
> The CPUID instruction executed on a guest vCPU correctly reports the APIC
> IDs. However, the ACPI MADT assigns the APIC IDs sequentially with no
> gaps, and the guest firmware sets the APIC_ID register for each local APIC
> to match the MADT. When parse_topology() sets the apicid field based on
> reading the local APIC ID register, the value it sets is different from the
> initial_apicid value for CPUs in the span of the 2nd and subsequent L3
> caches, because there's no gap in the APIC IDs read from the local APIC.
> Linux boots and runs, but the topology is set up with the wrong span for
> the L3 cache and for the associated scheduling domains.

TBH. That's an insanity. MADT and the actual APIC ID determine the
topology. So the gaps should be reflected in MADT and the actual APIC
IDs should be set correctly if the intent is to provide topology
information.

Just for the record. This hack works only on Intel today, because AMD
init sets topo.apicid = read_apic_id() unconditionally. So this is
inconsistent already, no?

> The old code derives the apicid from the initial_apicid via the
> phys_pkg_id() callback, so these bad Hyper-V VM configs skate by. The
> wrong value in the local APIC ID register and MADT does not affect
> anything, except that the check in validate_apic_and_package_id() fails
> during boot, and a set of "Firmware bug:" messages is correctly
> output.

So instead of fixing the firmware bugs, hyper-v just moves on and
pretends that everything works fine, right?

> Three thoughts:
>
> 1) Are Hyper-V VMs the only place where the local APIC ID register might
> have a bogus value? Probably so, but you never know what might crawl
> out.

Define bogus. MADT is the primary source of information because that's
how we know how many CPUs (APICs) are there and what their APIC ID is
which we can use to wake them up. So there is a reasonable expectation
that this information is consistent with the rest of the system.

The Intel SDM clearly says in Vol 3A section 9.4.5 Identifying Logical
Processors in an MP System:

"After the BIOS has completed the MP initialization protocol, each
logical processor can be uniquely identified by its local APIC
ID. Software can access these APIC IDs in either of the following
ways:"

These ways include reading from the APIC, reading the MADT and reading CPUID,
which implies that all of them must be consistent. For X2APIC it's actually written out:

"If the local APIC unit supports x2APIC and is operating in x2APIC
mode, 32-bit APIC ID can be read by executing a RDMSR instruction to
read the processor’s x2APIC ID register. This method is equivalent to
executing CPUID leaf 0BH described below."

AMD has not been following that in the early 64bit systems as they moved
the APIC ID space to start at 32 for the first CPU in the first socket
for whatever reasons. But since then the kernel reads back the APIC ID
on AMD systems into topo.apicid. But that was long ago and can easily be
dealt with because at least the real APIC ID and the MADT/MPTABLE
entries are consistent.

Hypervisors have their own CPUID space to override functionality with
their own magic stuff, but imposing their nutbolt ideas on the
architectural part of the system is not only wrong, it's disrespectful
against the OS developers who try to keep their system sane.

> 2) The natural response is "Well, fix Hyper-V!" I first had this conversation
> with the Hyper-V team about 5 years ago. Some cases of the problem were
> fixed, but some cases remain unfixed. It's a long story.
>
> 3) Since Hyper-V code in Linux already has an override for the apic->read()
> function, it's possible to do a hack in that override so that apicid gets set to
> the same value as initial_apicid, which matches the old code. Here's the diff:

This collides massively with the other work I'm doing, which uses the
MADT provided information to actually evaluate various topology related
things upfront and later during bringup. That's badly needed because lots
of today's infrastructure is based on heuristics and guesswork.

But it seems I wasted a month on reworking all of this just to be
stopped cold in the tracks by completely undocumented and unnecessary
hyper-v abuse.

So if Hyper-V insists on abusing the initial APIC ID as read from CPUID
for topology information related to L3, then hyper-v should override the
cache topology mechanism and not impose this insanity on the basic
topology evaluation infrastructure.

Yours seriously grumpy

tglx

2023-07-31 14:14:54

by Arjan van de Ven

[permalink] [raw]
Subject: Re: [patch v2 21/38] x86/cpu: Provide cpu_init/parse_topology()

On 7/30/2023 9:05 PM, Michael Kelley (LINUX) wrote:
> Does anyone have suggestions on a different way to handle
> this that's better than the above diff? Other thoughts?

How badly do you need xapic? Meaning, can x2apic just always be used instead?


2023-07-31 15:21:27

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [patch v2 21/38] x86/cpu: Provide cpu_init/parse_topology()

On Mon, Jul 31, 2023 at 02:34:39PM +0200, Thomas Gleixner wrote:

> This collides massively with the other work I'm doing, which uses the
> MADT provided information to actually evaluate various topology related
> things upfront and later during bringup. Thats badly needed because lots
> of todays infrastructure is based on heuristics and guesswork.
>
> But it seems I wasted a month on reworking all of this just to be
> stopped cold in the tracks by completely undocumented and unnecessary
> hyper-v abuse.
>
> So if Hyper-V insists on abusing the initial APIC ID as read from CPUID
> for topology information related to L3, then hyper-v should override the
> cache topology mechanism and not impose this insanity on the basic
> topology evaluation infrastructure.

So I'm very tempted to suggest you continue with the topology rewrite
and let Hyper-V keep the pieces. They're very clearly violating the SDM.

Things as they stand are untenable, the whole topology thing as it exists
today is an untenable shitshow.

Michael, is there anything you can do early (as in MADT parse early) to
fix up the APIC-IDs?

2023-07-31 16:28:06

by Thomas Gleixner

[permalink] [raw]
Subject: Re: [patch v2 21/38] x86/cpu: Provide cpu_init/parse_topology()

On Mon, Jul 31 2023 at 15:27, Peter Zijlstra wrote:
> On Mon, Jul 31, 2023 at 02:34:39PM +0200, Thomas Gleixner wrote:
>> This collides massively with the other work I'm doing, which uses the
>> MADT provided information to actually evaluate various topology related
>> things upfront and later during bringup. Thats badly needed because lots
>> of todays infrastructure is based on heuristics and guesswork.
>>
>> But it seems I wasted a month on reworking all of this just to be
>> stopped cold in the tracks by completely undocumented and unnecessary
>> hyper-v abuse.
>>
>> So if Hyper-V insists on abusing the initial APIC ID as read from CPUID
>> for topology information related to L3, then hyper-v should override the
>> cache topology mechanism and not impose this insanity on the basic
>> topology evaluation infrastructure.
>
> So I'm very tempted to suggest you continue with the topology rewrite
> and let Hyper-V keep the pieces. They're very clearly violating the SDM.
>
> Thing as they stand are untenable, the whole topology thing as it exists
> today is an untenable shitshow.
>
> Michael, is there anything you can do early (as in MADT parse early) to
> fix up the APIC-IDs?

I don't think so.

Michael, can you please provide me a table of:

APICID (real/MADT) APICID (CPUID)

from one of the tinker VMs please?

Thanks,

tglx

2023-07-31 16:32:15

by Michael Kelley (LINUX)

[permalink] [raw]
Subject: RE: [patch v2 21/38] x86/cpu: Provide cpu_init/parse_topology()

From: Thomas Gleixner <[email protected]> Sent: Monday, July 31, 2023 8:38 AM
>
> On Mon, Jul 31 2023 at 15:27, Peter Zijlstra wrote:
> > On Mon, Jul 31, 2023 at 02:34:39PM +0200, Thomas Gleixner wrote:
> >> This collides massively with the other work I'm doing, which uses the
> >> MADT provided information to actually evaluate various topology related
> >> things upfront and later during bringup. That's badly needed because lots
> >> of today's infrastructure is based on heuristics and guesswork.
> >>
> >> But it seems I wasted a month on reworking all of this just to be
> >> stopped cold in the tracks by completely undocumented and unnecessary
> >> hyper-v abuse.
> >>
> >> So if Hyper-V insists on abusing the initial APIC ID as read from CPUID
> >> for topology information related to L3, then hyper-v should override the
> >> cache topology mechanism and not impose this insanity on the basic
> >> topology evaluation infrastructure.
> >
> > So I'm very tempted to suggest you continue with the topology rewrite
> > and let Hyper-V keep the pieces. They're very clearly violating the SDM.
> >
> > Things as they stand are untenable, the whole topology thing as it exists
> > today is an untenable shitshow.
> >
> > Michael, is there anything you can do early (as in MADT parse early) to
> > fix up the APIC-IDs?
>
> I don't think so.
>
> Michael, can you please provide me a table of:
>
> APICID (real/MADT) APICID (CPUID)
>
> from one of the tinker VMs please?
>

The VM is an F72s_v2 in Azure running your patch set. The VM has
72 vCPUs in two NUMA nodes across two physical Intel processors, with
36 vCPUs in each NUMA node.

The output is from /sys/kernel/debug/x86/topo/cpus, so the initial_apicid
is from CPUID, while the apicid is from read_apic_id() and matches the
MADT. As expected, the two values match for the first 36 vCPUs, but differ
by 28 (decimal) for the remaining 36.

initial_apicid: 0 apicid: 0
initial_apicid: 1 apicid: 1
initial_apicid: 2 apicid: 2
initial_apicid: 3 apicid: 3
initial_apicid: 4 apicid: 4
initial_apicid: 5 apicid: 5
initial_apicid: 6 apicid: 6
initial_apicid: 7 apicid: 7
initial_apicid: 8 apicid: 8
initial_apicid: 9 apicid: 9
initial_apicid: a apicid: a
initial_apicid: b apicid: b
initial_apicid: c apicid: c
initial_apicid: d apicid: d
initial_apicid: e apicid: e
initial_apicid: f apicid: f
initial_apicid: 10 apicid: 10
initial_apicid: 11 apicid: 11
initial_apicid: 12 apicid: 12
initial_apicid: 13 apicid: 13
initial_apicid: 14 apicid: 14
initial_apicid: 15 apicid: 15
initial_apicid: 16 apicid: 16
initial_apicid: 17 apicid: 17
initial_apicid: 18 apicid: 18
initial_apicid: 19 apicid: 19
initial_apicid: 1a apicid: 1a
initial_apicid: 1b apicid: 1b
initial_apicid: 1c apicid: 1c
initial_apicid: 1d apicid: 1d
initial_apicid: 1e apicid: 1e
initial_apicid: 1f apicid: 1f
initial_apicid: 20 apicid: 20
initial_apicid: 21 apicid: 21
initial_apicid: 22 apicid: 22
initial_apicid: 23 apicid: 23
initial_apicid: 40 apicid: 24
initial_apicid: 41 apicid: 25
initial_apicid: 42 apicid: 26
initial_apicid: 43 apicid: 27
initial_apicid: 44 apicid: 28
initial_apicid: 45 apicid: 29
initial_apicid: 46 apicid: 2a
initial_apicid: 47 apicid: 2b
initial_apicid: 48 apicid: 2c
initial_apicid: 49 apicid: 2d
initial_apicid: 4a apicid: 2e
initial_apicid: 4b apicid: 2f
initial_apicid: 4c apicid: 30
initial_apicid: 4d apicid: 31
initial_apicid: 4e apicid: 32
initial_apicid: 4f apicid: 33
initial_apicid: 50 apicid: 34
initial_apicid: 51 apicid: 35
initial_apicid: 52 apicid: 36
initial_apicid: 53 apicid: 37
initial_apicid: 54 apicid: 38
initial_apicid: 55 apicid: 39
initial_apicid: 56 apicid: 3a
initial_apicid: 57 apicid: 3b
initial_apicid: 58 apicid: 3c
initial_apicid: 59 apicid: 3d
initial_apicid: 5a apicid: 3e
initial_apicid: 5b apicid: 3f
initial_apicid: 5c apicid: 40
initial_apicid: 5d apicid: 41
initial_apicid: 5e apicid: 42
initial_apicid: 5f apicid: 43
initial_apicid: 60 apicid: 44
initial_apicid: 61 apicid: 45
initial_apicid: 62 apicid: 46
initial_apicid: 63 apicid: 47

Michael

2023-07-31 19:22:17

by Michael Kelley (LINUX)

[permalink] [raw]
Subject: RE: [patch v2 21/38] x86/cpu: Provide cpu_init/parse_topology()

From: Thomas Gleixner <[email protected]> Sent: Monday, July 31, 2023 5:35 AM
>
> On Mon, Jul 31 2023 at 04:05, Michael Kelley wrote:
> >> + /*
> >> + * The initial invocation from early_identify_cpu() happens before
> >> + * the APIC is mapped or X2APIC enabled. For establishing the
> >> + * topology, that's not required. Use the initial APIC ID.
> >> + */
> >> + if (early)
> >> + c->topo.apicid = c->topo.initial_apicid;
> >> + else
> >> + c->topo.apicid = read_apic_id();
> >
> > Using the value from the local APIC ID reg turns out to cause a problem in
> > some Hyper-V VM configurations. If a VM has multiple L3 caches (probably
> > due to multiple NUMA nodes) and the # of CPUs in the span of the L3 cache
> > is not a power of 2, the APIC IDs for the CPUs in the span of the 1st L3 cache
> > are sequential starting with 0. But then there is a gap before starting the
> > APIC IDs for the CPUs in the span of the 2nd L3 cache. The gap is
> > repeated if there are additional L3 caches.
> >
> > The CPUID instruction executed on a guest vCPU correctly reports the APIC
> > IDs. However, the ACPI MADT assigns the APIC IDs sequentially with no
> > gaps, and the guest firmware sets the APIC_ID register for each local APIC
> > to match the MADT. When parse_topology() sets the apicid field based on
> > reading the local APIC ID register, the value it sets is different from the
> > initial_apicid value for CPUs in the span of the 2nd and subsequent L3
> > caches, because there's no gap in the APIC IDs read from the local APIC.
> > Linux boots and runs, but the topology is set up with the wrong span for
> > the L3 cache and for the associated scheduling domains.
>
> TBH. That's an insanity. MADT and the actual APIC ID determine the
> topology. So the gaps should be reflected in MADT and the actual APIC
> IDs should be set correctly if the intent is to provide topology
> information.
>
> Just for the record. This hack works only on Intel today, because AMD
> init sets topo.apicid = read_apic_id() unconditionally. So this is
> inconsistent already, no?
>

Correct. But given that the L3 cache span in the AMD Zen1 and Zen2
processors is only 8 CPUs, there's much less reason to configure a VM
that only uses some of the CPUs in an L3 cache span. Hyper-V does
the APIC ID numbering correctly for Zen3 with its 16 CPUs in the L3
cache span.

> > The old code derives the apicid from the initial_apicid via the
> > phys_pkg_id() callback, so these bad Hyper-V VM configs skate by. The
> > wrong value in the local APIC ID register and MADT does not affect
> > anything, except that the check in validate_apic_and_package_id() fails
> > during boot, and a set of "Firmware bug:" messages is correctly
> > output.
>
> So instead of fixing the firmware bugs, hyper-v just moves on and
> pretends that everything works fine, right?

What can I say. :-(

>
> > Three thoughts:
> >
> > 1) Are Hyper-V VMs the only place where the local APIC ID register might
> > have a bogus value? Probably so, but you never know what might crawl
> > out.
>
> Define bogus. MADT is the primary source of information because that's
> how we know how many CPUs (APICs) are there and what their APIC ID is
> which we can use to wake them up. So there is a reasonable expectation
> that this information is consistent with the rest of the system.

Commit d49597fd3bc7 "x86/cpu: Deal with broken firmware (VMWare/Xen)"
mentions VMware and XEN implementations that violate the spec. The
commit is from late 2016. Have these bad systems aged out and no longer
need accommodation?

>
> The Intel SDM clearly says in Vol 3A section 9.4.5 Identifying Logical
> Processors in an MP System:
>
> "After the BIOS has completed the MP initialization protocol, each
> logical processor can be uniquely identified by its local APIC
> ID. Software can access these APIC IDs in either of the following
> ways:"
>
> These ways include read from APIC, read MADT, read CPUID and implies
> that this must be consistent. For X2APIC it's actually written out:
>
> "If the local APIC unit supports x2APIC and is operating in x2APIC
> mode, 32-bit APIC ID can be read by executing a RDMSR instruction to
> read the processor’s x2APIC ID register. This method is equivalent to
> executing CPUID leaf 0BH described below."
>
> AMD has not been following that in the early 64bit systems as they moved
> the APIC ID space to start at 32 for the first CPU in the first socket
> for whatever reasons. But since then the kernel reads back the APIC ID
> on AMD systems into topo.apicid. But that was long ago and can easily be
> dealt with because at least the real APIC ID and the MADT/MPTABLE
> entries are consistent.
>
> Hypervisors have their own CPUID space to override functionality with
> their own magic stuff, but imposing their nutbolt ideas on the
> architectural part of the system is not only wrong, it's disrespectful
> against the OS developers who try to keep their system sane.
>
> > 2) The natural response is "Well, fix Hyper-V!" I first had this conversation
> > with the Hyper-V team about 5 years ago. Some cases of the problem were
> > fixed, but some cases remain unfixed. It's a long story.
> >
> > 3) Since Hyper-V code in Linux already has an override for the apic->read()
> > function, it's possible to do a hack in that override so that apicid gets set to
> > the same value as initial_apicid, which matches the old code. Here's the diff:
>
> This collides massively with the other work I'm doing, which uses the
> MADT provided information to actually evaluate various topology related
> things upfront and later during bringup. That's badly needed because lots
> of today's infrastructure is based on heuristics and guesswork.

Fair enough. And I've re-raised the issue with the Hyper-V team.

>
> But it seems I wasted a month on reworking all of this just to be
> stopped cold in the tracks by completely undocumented and unnecessary
> hyper-v abuse.
>
> So if Hyper-V insists on abusing the initial APIC ID as read from CPUID
> for topology information related to L3, then hyper-v should override the
> cache topology mechanism and not impose this insanity on the basic
> topology evaluation infrastructure.
>
> Yours seriously grumpy
>
> tglx

2023-07-31 22:30:15

by Thomas Gleixner

[permalink] [raw]
Subject: RE: [patch v2 21/38] x86/cpu: Provide cpu_init/parse_topology()

On Mon, Jul 31 2023 at 16:25, Michael Kelley wrote:
> From: Thomas Gleixner <[email protected]> Sent: Monday, July 31, 2023 5:35 AM
>> Define bogus. MADT is the primary source of information because that's
>> how we know how many CPUs (APICs) are there and what their APIC ID is
>> which we can use to wake them up. So there is a reasonable expectation
>> that this information is consistent with the rest of the system.
>
> Commit d49597fd3bc7 "x86/cpu: Deal with broken firmware (VMWare/Xen)"
> mentions VMware and XEN implementations that violate the spec. The
> commit is from late 2016. Have these bad systems aged out and no longer
> need accommodation?

They do, but this commit explicitly uses the MADT/real APIC ID value:

c->initial_apicid = apicid;

So the new mechanics are accommodating for those, right?

Thanks,

tglx

2023-07-31 22:30:16

by Michael Kelley (LINUX)

[permalink] [raw]
Subject: RE: [patch v2 21/38] x86/cpu: Provide cpu_init/parse_topology()

From: Thomas Gleixner <[email protected]> Sent: Monday, July 31, 2023 1:49 PM
>
> On Mon, Jul 31 2023 at 16:10, Michael Kelley wrote:
> > From: Thomas Gleixner <[email protected]> Sent: Monday, July 31, 2023 8:38 AM
> > The VM is an F72s_v2 in Azure running your patch set. The VM has
> > 72 vCPUs in two NUMA nodes across two physical Intel processors, with
> > 36 vCPUs in each NUMA node.
> >
> > The output is from /sys/kernel/debug/x86/topo/cpus, so the initial_apicid
> > is from CPUID, while the apicid is from read_apic_id() and matches the
> > MADT. As expected, the two values match for the first 36 vCPUs, but differ
> > by 28 (decimal) for the remaining 36.
> >
> > initial_apicid: 0 apicid: 0
> ...
> > initial_apicid: 23 apicid: 23
>
> > initial_apicid: 40 apicid: 24
> ...
> > initial_apicid: 63 apicid: 47
>
> Is there any indication in some other CPUID leaf which lets us deduce this
> wreckage?

You can detect being a Hyper-V guest with leaf 0x40000000. See Linux
kernel function ms_hyperv_platform(). But I'm not aware of anything
to indicate that a specific Hyper-V VM has the APIC numbering problem
vs. doesn't have the problem.

>
> I don't think the hypervisor space (0x40000xx) has anything helpful, but
> staring at the architectural ones provided by hyper-V to the guest might
> give us a hint. Can you provide a cpuid dump for the boot CPU please?
>

I'm not sure if you want the raw or decoded output. Here's both.

Michael

# taskset -c 0 cpuid -r -1
CPU:
0x00000000 0x00: eax=0x00000015 ebx=0x756e6547 ecx=0x6c65746e edx=0x49656e69
0x00000001 0x00: eax=0x000606a6 ebx=0x00400800 ecx=0xfeda3223 edx=0x1f8bfbff
0x00000002 0x00: eax=0x00feff01 ebx=0x000000f0 ecx=0x00000000 edx=0x00000000
0x00000003 0x00: eax=0x00000000 ebx=0x00000000 ecx=0x00000000 edx=0x00000000
0x00000004 0x00: eax=0x7c004121 ebx=0x02c0003f ecx=0x0000003f edx=0x00000000
0x00000004 0x01: eax=0x7c004122 ebx=0x01c0003f ecx=0x0000003f edx=0x00000000
0x00000004 0x02: eax=0x7c004143 ebx=0x04c0003f ecx=0x000003ff edx=0x00000000
0x00000004 0x03: eax=0x7c0fc163 ebx=0x02c0003f ecx=0x0000ffff edx=0x00000000
0x00000005 0x00: eax=0x00000000 ebx=0x00000000 ecx=0x00000000 edx=0x00000000
0x00000006 0x00: eax=0x00000000 ebx=0x00000000 ecx=0x00000000 edx=0x00000000
0x00000007 0x00: eax=0x00000000 ebx=0xd09f2fb9 ecx=0x00000000 edx=0x00000400
0x00000008 0x00: eax=0x00000000 ebx=0x00000000 ecx=0x00000000 edx=0x00000000
0x00000009 0x00: eax=0x00000000 ebx=0x00000000 ecx=0x00000000 edx=0x00000000
0x0000000a 0x00: eax=0x00000000 ebx=0x00000000 ecx=0x00000000 edx=0x00000000
0x0000000b 0x00: eax=0x00000001 ebx=0x00000002 ecx=0x00000100 edx=0x00000000
0x0000000b 0x01: eax=0x00000006 ebx=0x00000040 ecx=0x00000201 edx=0x00000000
0x0000000c 0x00: eax=0x00000000 ebx=0x00000000 ecx=0x00000000 edx=0x00000000
0x0000000d 0x00: eax=0x000000e7 ebx=0x00000a80 ecx=0x00000a80 edx=0x00000000
0x0000000d 0x01: eax=0x0000000b ebx=0x00000980 ecx=0x00000000 edx=0x00000000
0x0000000d 0x02: eax=0x00000100 ebx=0x00000240 ecx=0x00000000 edx=0x00000000
0x0000000d 0x05: eax=0x00000040 ebx=0x00000440 ecx=0x00000000 edx=0x00000000
0x0000000d 0x06: eax=0x00000200 ebx=0x00000480 ecx=0x00000000 edx=0x00000000
0x0000000d 0x07: eax=0x00000400 ebx=0x00000680 ecx=0x00000000 edx=0x00000000
0x0000000e 0x00: eax=0x00000000 ebx=0x00000000 ecx=0x00000000 edx=0x00000000
0x0000000f 0x00: eax=0x00000000 ebx=0x00000000 ecx=0x00000000 edx=0x00000000
0x00000010 0x00: eax=0x00000000 ebx=0x00000000 ecx=0x00000000 edx=0x00000000
0x00000011 0x00: eax=0x00000000 ebx=0x00000000 ecx=0x00000000 edx=0x00000000
0x00000012 0x00: eax=0x00000000 ebx=0x00000000 ecx=0x00000000 edx=0x00000000
0x00000013 0x00: eax=0x00000000 ebx=0x00000000 ecx=0x00000000 edx=0x00000000
0x00000014 0x00: eax=0x00000000 ebx=0x00000000 ecx=0x00000000 edx=0x00000000
0x00000015 0x00: eax=0x00000000 ebx=0x00000000 ecx=0x00000000 edx=0x00000000
0x40000000 0x00: eax=0x4000000a ebx=0x7263694d ecx=0x666f736f edx=0x76482074
0x40000001 0x00: eax=0x31237648 ebx=0x00000000 ecx=0x00000000 edx=0x00000000
0x40000002 0x00: eax=0x00004f7c ebx=0x000a0000 ecx=0x00000001 edx=0x000005b6
0x40000003 0x00: eax=0x00002e7f ebx=0x003b8030 ecx=0x00000002 edx=0x000ed7b2
0x40000004 0x00: eax=0x00064e24 ebx=0x00000fff ecx=0x0000002e edx=0x00000000
0x40000005 0x00: eax=0x000000f0 ebx=0x00000400 ecx=0x00005d00 edx=0x00000000
0x40000006 0x00: eax=0x0000000f ebx=0x00000000 ecx=0x00000000 edx=0x00000000
0x40000007 0x00: eax=0x00000000 ebx=0x00000000 ecx=0x00000000 edx=0x00000000
0x40000008 0x00: eax=0x00000000 ebx=0x00000000 ecx=0x00000000 edx=0x00000000
0x40000009 0x00: eax=0x00000000 ebx=0x00000000 ecx=0x00000000 edx=0x00000000
0x4000000a 0x00: eax=0x000e0101 ebx=0x00000000 ecx=0x00000000 edx=0x00000000
0x20000000 0x00: eax=0x00000000 ebx=0x00000000 ecx=0x00000000 edx=0x00000000
0x80000000 0x00: eax=0x80000008 ebx=0x00000000 ecx=0x00000000 edx=0x00000000
0x80000001 0x00: eax=0x00000000 ebx=0x00000000 ecx=0x00000121 edx=0x2c100800
0x80000002 0x00: eax=0x65746e49 ebx=0x2952286c ecx=0x6f655820 edx=0x2952286e
0x80000003 0x00: eax=0x616c5020 ebx=0x756e6974 ecx=0x3338206d edx=0x20433037
0x80000004 0x00: eax=0x20555043 ebx=0x2e322040 ecx=0x48473038 edx=0x0000007a
0x80000005 0x00: eax=0x00000000 ebx=0x00000000 ecx=0x00000000 edx=0x00000000
0x80000006 0x00: eax=0x00000000 ebx=0x00000000 ecx=0x01006040 edx=0x00000000
0x80000007 0x00: eax=0x00000000 ebx=0x00000000 ecx=0x00000000 edx=0x00000000
0x80000008 0x00: eax=0x0000302e ebx=0x00000000 ecx=0x00000000 edx=0x00000000
0x80860000 0x00: eax=0x00000000 ebx=0x00000000 ecx=0x00000000 edx=0x00000000
0xc0000000 0x00: eax=0x00000000 ebx=0x00000000 ecx=0x00000000 edx=0x00000000

CPU:
vendor_id = "GenuineIntel"
version information (1/eax):
processor type = primary processor (0)
family = 0x6 (6)
model = 0xa (10)
stepping id = 0x6 (6)
extended family = 0x0 (0)
extended model = 0x6 (6)
(family synth) = 0x6 (6)
(model synth) = 0x6a (106)
(simple synth) = Intel Core (Ice Lake) [Sunny Cove] {Sunny Cove}, 10nm
miscellaneous (1/ebx):
process local APIC physical ID = 0x0 (0)
cpu count = 0x40 (64)
CLFLUSH line size = 0x8 (8)
brand index = 0x0 (0)
brand id = 0x00 (0): unknown
feature information (1/edx):
x87 FPU on chip = true
VME: virtual-8086 mode enhancement = true
DE: debugging extensions = true
PSE: page size extensions = true
TSC: time stamp counter = true
RDMSR and WRMSR support = true
PAE: physical address extensions = true
MCE: machine check exception = true
CMPXCHG8B inst. = true
APIC on chip = true
SYSENTER and SYSEXIT = true
MTRR: memory type range registers = true
PTE global bit = true
MCA: machine check architecture = true
CMOV: conditional move/compare instr = true
PAT: page attribute table = true
PSE-36: page size extension = true
PSN: processor serial number = false
CLFLUSH instruction = true
DS: debug store = false
ACPI: thermal monitor and clock ctrl = false
MMX Technology = true
FXSAVE/FXRSTOR = true
SSE extensions = true
SSE2 extensions = true
SS: self snoop = true
hyper-threading / multi-core supported = true
TM: therm. monitor = false
IA64 = false
PBE: pending break event = false
feature information (1/ecx):
PNI/SSE3: Prescott New Instructions = true
PCLMULDQ instruction = true
DTES64: 64-bit debug store = false
MONITOR/MWAIT = false
CPL-qualified debug store = false
VMX: virtual machine extensions = true
SMX: safer mode extensions = false
Enhanced Intel SpeedStep Technology = false
TM2: thermal monitor 2 = false
SSSE3 extensions = true
context ID: adaptive or shared L1 data = false
SDBG: IA32_DEBUG_INTERFACE = false
FMA instruction = true
CMPXCHG16B instruction = true
xTPR disable = false
PDCM: perfmon and debug = false
PCID: process context identifiers = true
DCA: direct cache access = false
SSE4.1 extensions = true
SSE4.2 extensions = true
x2APIC: extended xAPIC support = false
MOVBE instruction = true
POPCNT instruction = true
time stamp counter deadline = false
AES instruction = true
XSAVE/XSTOR states = true
OS-enabled XSAVE/XSTOR = true
AVX: advanced vector extensions = true
F16C half-precision convert instruction = true
RDRAND instruction = true
hypervisor guest status = true
cache and TLB information (2):
0xff: cache data is in CPUID leaf 4
0xfe: TLB data is in CPUID leaf 0x18
0xf0: 64 byte prefetching
processor serial number = 0006-06A6-0000-0000-0000-0000
deterministic cache parameters (4):
--- cache 0 ---
cache type = data cache (1)
cache level = 0x1 (1)
self-initializing cache level = true
fully associative cache = false
extra threads sharing this cache = 0x1 (1)
extra processor cores on this die = 0x1f (31)
system coherency line size = 0x40 (64)
physical line partitions = 0x1 (1)
ways of associativity = 0xc (12)
number of sets = 0x40 (64)
WBINVD/INVD acts on lower caches = false
inclusive to lower caches = false
complex cache indexing = false
number of sets (s) = 64
(size synth) = 49152 (48 KB)
--- cache 1 ---
cache type = instruction cache (2)
cache level = 0x1 (1)
self-initializing cache level = true
fully associative cache = false
extra threads sharing this cache = 0x1 (1)
extra processor cores on this die = 0x1f (31)
system coherency line size = 0x40 (64)
physical line partitions = 0x1 (1)
ways of associativity = 0x8 (8)
number of sets = 0x40 (64)
WBINVD/INVD acts on lower caches = false
inclusive to lower caches = false
complex cache indexing = false
number of sets (s) = 64
(size synth) = 32768 (32 KB)
--- cache 2 ---
cache type = unified cache (3)
cache level = 0x2 (2)
self-initializing cache level = true
fully associative cache = false
extra threads sharing this cache = 0x1 (1)
extra processor cores on this die = 0x1f (31)
system coherency line size = 0x40 (64)
physical line partitions = 0x1 (1)
ways of associativity = 0x14 (20)
number of sets = 0x400 (1024)
WBINVD/INVD acts on lower caches = false
inclusive to lower caches = false
complex cache indexing = false
number of sets (s) = 1024
(size synth) = 1310720 (1.2 MB)
--- cache 3 ---
cache type = unified cache (3)
cache level = 0x3 (3)
self-initializing cache level = true
fully associative cache = false
extra threads sharing this cache = 0x3f (63)
extra processor cores on this die = 0x1f (31)
system coherency line size = 0x40 (64)
physical line partitions = 0x1 (1)
ways of associativity = 0xc (12)
number of sets = 0x10000 (65536)
WBINVD/INVD acts on lower caches = false
inclusive to lower caches = false
complex cache indexing = false
number of sets (s) = 65536
(size synth) = 50331648 (48 MB)
MONITOR/MWAIT (5):
smallest monitor-line size (bytes) = 0x0 (0)
largest monitor-line size (bytes) = 0x0 (0)
enum of Monitor-MWAIT exts supported = false
supports intrs as break-event for MWAIT = false
number of C0 sub C-states using MWAIT = 0x0 (0)
number of C1 sub C-states using MWAIT = 0x0 (0)
number of C2 sub C-states using MWAIT = 0x0 (0)
number of C3 sub C-states using MWAIT = 0x0 (0)
number of C4 sub C-states using MWAIT = 0x0 (0)
number of C5 sub C-states using MWAIT = 0x0 (0)
number of C6 sub C-states using MWAIT = 0x0 (0)
number of C7 sub C-states using MWAIT = 0x0 (0)
Thermal and Power Management Features (6):
digital thermometer = false
Intel Turbo Boost Technology = false
ARAT always running APIC timer = false
PLN power limit notification = false
ECMD extended clock modulation duty = false
PTM package thermal management = false
HWP base registers = false
HWP notification = false
HWP activity window = false
HWP energy performance preference = false
HWP package level request = false
HDC base registers = false
Intel Turbo Boost Max Technology 3.0 = false
HWP capabilities = false
HWP PECI override = false
flexible HWP = false
IA32_HWP_REQUEST MSR fast access mode = false
HW_FEEDBACK = false
ignoring idle logical processor HWP req = false
digital thermometer thresholds = 0x0 (0)
hardware coordination feedback = false
ACNT2 available = false
performance-energy bias capability = false
performance capability reporting = false
energy efficiency capability reporting = false
size of feedback struct (4KB pages) = 0x0 (0)
index of CPU's row in feedback struct = 0x0 (0)
extended feature flags (7):
FSGSBASE instructions = true
IA32_TSC_ADJUST MSR supported = false
SGX: Software Guard Extensions supported = false
BMI1 instructions = true
HLE hardware lock elision = true
AVX2: advanced vector extensions 2 = true
FDP_EXCPTN_ONLY = false
SMEP supervisor mode exec protection = true
BMI2 instructions = true
enhanced REP MOVSB/STOSB = true
INVPCID instruction = true
RTM: restricted transactional memory = true
RDT-CMT/PQoS cache monitoring = false
deprecated FPU CS/DS = true
MPX: intel memory protection extensions = false
RDT-CAT/PQE cache allocation = false
AVX512F: AVX-512 foundation instructions = true
AVX512DQ: double & quadword instructions = true
RDSEED instruction = true
ADX instructions = true
SMAP: supervisor mode access prevention = true
AVX512IFMA: fused multiply add = false
PCOMMIT instruction = false
CLFLUSHOPT instruction = true
CLWB instruction = false
Intel processor trace = false
AVX512PF: prefetch instructions = false
AVX512ER: exponent & reciprocal instrs = false
AVX512CD: conflict detection instrs = true
SHA instructions = false
AVX512BW: byte & word instructions = true
AVX512VL: vector length = true
PREFETCHWT1 = false
AVX512VBMI: vector byte manipulation = false
UMIP: user-mode instruction prevention = false
PKU protection keys for user-mode = false
OSPKE CR4.PKE and RDPKRU/WRPKRU = false
WAITPKG instructions = false
AVX512_VBMI2: byte VPCOMPRESS, VPEXPAND = false
CET_SS: CET shadow stack = false
GFNI: Galois Field New Instructions = false
VAES instructions = false
VPCLMULQDQ instruction = false
AVX512_VNNI: neural network instructions = false
AVX512_BITALG: bit count/shiffle = false
TME: Total Memory Encryption = false
AVX512: VPOPCNTDQ instruction = false
5-level paging = false
BNDLDX/BNDSTX MAWAU value in 64-bit mode = 0x0 (0)
RDPID: read processor D supported = false
CLDEMOTE supports cache line demote = false
MOVDIRI instruction = false
MOVDIR64B instruction = false
ENQCMD instruction = false
SGX_LC: SGX launch config supported = false
AVX512_4VNNIW: neural network instrs = false
AVX512_4FMAPS: multiply acc single prec = false
fast short REP MOV = false
AVX512_VP2INTERSECT: intersect mask regs = false
VERW md-clear microcode support = true
hybrid part = false
PCONFIG instruction = false
CET_IBT: CET indirect branch tracking = false
IBRS/IBPB: indirect branch restrictions = false
STIBP: 1 thr indirect branch predictor = false
L1D_FLUSH: IA32_FLUSH_CMD MSR = false
IA32_ARCH_CAPABILITIES MSR = false
IA32_CORE_CAPABILITIES MSR = false
SSBD: speculative store bypass disable = false
Direct Cache Access Parameters (9):
PLATFORM_DCA_CAP MSR bits = 0
Architecture Performance Monitoring Features (0xa/eax):
version ID = 0x0 (0)
number of counters per logical processor = 0x0 (0)
bit width of counter = 0x0 (0)
length of EBX bit vector = 0x0 (0)
Architecture Performance Monitoring Features (0xa/ebx):
core cycle event not available = false
instruction retired event not available = false
reference cycles event not available = false
last-level cache ref event not available = false
last-level cache miss event not avail = false
branch inst retired event not available = false
branch mispred retired event not avail = false
Architecture Performance Monitoring Features (0xa/edx):
number of fixed counters = 0x0 (0)
bit width of fixed counters = 0x0 (0)
anythread deprecation = false
x2APIC features / processor topology (0xb):
extended APIC ID = 0
--- level 0 ---
level number = 0x0 (0)
level type = thread (1)
bit width of level = 0x1 (1)
number of logical processors at level = 0x2 (2)
--- level 1 ---
level number = 0x1 (1)
level type = core (2)
bit width of level = 0x6 (6)
number of logical processors at level = 0x40 (64)
XSAVE features (0xd/0):
XCR0 lower 32 bits valid bit field mask = 0x000000e7
XCR0 upper 32 bits valid bit field mask = 0x00000000
XCR0 supported: x87 state = true
XCR0 supported: SSE state = true
XCR0 supported: AVX state = true
XCR0 supported: MPX BNDREGS = false
XCR0 supported: MPX BNDCSR = false
XCR0 supported: AVX-512 opmask = true
XCR0 supported: AVX-512 ZMM_Hi256 = true
XCR0 supported: AVX-512 Hi16_ZMM = true
IA32_XSS supported: PT state = false
XCR0 supported: PKRU state = false
XCR0 supported: CET_U state = false
XCR0 supported: CET_S state = false
IA32_XSS supported: HDC state = false
bytes required by fields in XCR0 = 0x00000a80 (2688)
bytes required by XSAVE/XRSTOR area = 0x00000a80 (2688)
XSAVE features (0xd/1):
XSAVEOPT instruction = true
XSAVEC instruction = true
XGETBV instruction = false
XSAVES/XRSTORS instructions = true
SAVE area size in bytes = 0x00000980 (2432)
IA32_XSS lower 32 bits valid bit field mask = 0x00000000
IA32_XSS upper 32 bits valid bit field mask = 0x00000000
AVX/YMM features (0xd/2):
AVX/YMM save state byte size = 0x00000100 (256)
AVX/YMM save state byte offset = 0x00000240 (576)
supported in IA32_XSS or XCR0 = XCR0 (user state)
64-byte alignment in compacted XSAVE = false
AVX-512 opmask features (0xd/5):
AVX-512 opmask save state byte size = 0x00000040 (64)
AVX-512 opmask save state byte offset = 0x00000440 (1088)
supported in IA32_XSS or XCR0 = XCR0 (user state)
64-byte alignment in compacted XSAVE = false
AVX-512 ZMM_Hi256 features (0xd/6):
AVX-512 ZMM_Hi256 save state byte size = 0x00000200 (512)
AVX-512 ZMM_Hi256 save state byte offset = 0x00000480 (1152)
supported in IA32_XSS or XCR0 = XCR0 (user state)
64-byte alignment in compacted XSAVE = false
AVX-512 Hi16_ZMM features (0xd/7):
AVX-512 Hi16_ZMM save state byte size = 0x00000400 (1024)
AVX-512 Hi16_ZMM save state byte offset = 0x00000680 (1664)
supported in IA32_XSS or XCR0 = XCR0 (user state)
64-byte alignment in compacted XSAVE = false
Quality of Service Monitoring Resource Type (0xf/0):
Maximum range of RMID = 0
supports L3 cache QoS monitoring = false
Resource Director Technology Allocation (0x10/0):
L3 cache allocation technology supported = false
L2 cache allocation technology supported = false
memory bandwidth allocation supported = false
0x00000011 0x00: eax=0x00000000 ebx=0x00000000 ecx=0x00000000 edx=0x00000000
Software Guard Extensions (SGX) capability (0x12/0):
SGX1 supported = false
SGX2 supported = false
SGX ENCLV E*VIRTCHILD, ESETCONTEXT = false
SGX ENCLS ETRACKC, ERDINFO, ELDBC, ELDUC = false
MISCSELECT.EXINFO supported: #PF & #GP = false
MISCSELECT.CPINFO supported: #CP = false
MaxEnclaveSize_Not64 (log2) = 0x0 (0)
MaxEnclaveSize_64 (log2) = 0x0 (0)
0x00000013 0x00: eax=0x00000000 ebx=0x00000000 ecx=0x00000000 edx=0x00000000
Intel Processor Trace (0x14):
IA32_RTIT_CR3_MATCH is accessible = false
configurable PSB & cycle-accurate = false
IP & TraceStop filtering; PT preserve = false
MTC timing packet; suppress COFI-based = false
PTWRITE support = false
power event trace support = false
ToPA output scheme support = false
ToPA can hold many output entries = false
single-range output scheme support = false
output to trace transport = false
IP payloads have LIP values & CS = false
Time Stamp Counter/Core Crystal Clock Information (0x15):
TSC/clock ratio = 0/0
nominal core crystal clock = 0 Hz
hypervisor_id = "Microsoft Hv"
hypervisor interface identification (0x40000001/eax):
version = "Hv#1"
hypervisor system identity (0x40000002):
build = 20348
version = 10.0
service pack = 1
service branch = 0
service number = 1462
hypervisor feature identification (0x40000003/eax):
VP run time = true
partition reference counter = true
basic synIC MSRs = true
synthetic timer MSRs = true
APIC access MSRs = true
hypercall MSRs = true
access virtual process index MSR = true
virtual system reset MSR = false
map/unmap statistics pages MSR = false
reference TSC access = true
guest idle state MSR = true
TSC/APIC frequency MSRs = true
guest debugging MSRs = false
hypervisor partition creation flags (0x40000003/ebx):
CreatePartitions = false
AccessPartitionId = false
AccessMemoryPool = false
AdjustMessageBuffers = false
PostMessages = true
SignalEvents = true
CreatePort = false
ConnectPort = false
AccessStats = false
Debugging = false
CPUManagement = false
ConfigureProfiler = false
AccessVSM = true
AccessVpRegisters = true
EnableExtendedHypercalls = true
StartVirtualProcessor = true
hypervisor power management features (0x40000003/ecx):
maximum process power state = 0x2 (2)
hypervisor feature identification (0x40000003/edx):
MWAIT available = false
guest debugging support available = true
performance monitor support available = false
CPU dynamic partitioning events avail = false
hypercall XMM input parameters available = true
virtual guest idle state available = true
hypervisor sleep state available = false
query NUMA distance available = true
determine timer frequency available = true
inject synthetic machine check available = true
guest crash MSRs available = true
debug MSRs available = false
NPIEP available = true
disable hypervisor available = false
extended gva ranges for flush virt addrs = true
hypercall XMM register return available = true
sint polling mode available = true
hypercall MSR lock available = true
use direct synthetic timers = true
hypervisor recommendations (0x40000004/eax):
use hypercalls for AS switches = false
use hypercalls for local TLB flushes = false
use hypercalls for remote TLB flushes = true
use MSRs to access EOI, ICR, TPR = false
use MSRs to initiate system RESET = false
use relaxed timing = true
use DMA remapping = false
use interrupt remapping = false
use x2APIC MSRs = false
deprecate AutoEOI = true
use SyntheticClusterIpi hypercall = true
use ExProcessorMasks = true
hypervisor is nested with Hyper-V = false
use INT for MBEC system calls = false
use enlightened VMCS interface = true
maximum number of spinlock retry attempts = 0xfff (4095)
hypervisor implementation limits (0x40000005):
maximum number of virtual processors = 0xf0 (240)
maximum number of logical processors = 0x400 (1024)
maximum number of physical interrupt vectors for remapping = 0x5d00 (23808)
hypervisor hardware features used (0x40000006/eax):
APIC overlay assist = true
MSR bitmaps = true
performance counters = true
second-level address translation = true
DMA remapping = false
interrupt remapping = false
memory patrol scrubber = false
DMA protection = false
HPET requested = false
synthetic timers are volatile = false
hypervisor root partition enlightenments (0x40000007):
StartLogicalProcessor = false
CreateRootvirtualProcessor = false
ProcessorPowerManagement = false
MwaitIdleStates = false
LogicalProcessorIdling = false
hypervisor shared virtual memory (0x40000008):
SvmSupported = false
MaxPasidSpacePasidCount = 0x0 (0)
hypervisor nested hypervisor features (0x40000009):
AccessSynicRegs = false
AccessIntrCtrlRegs = false
AccessHypercallMsrs = false
AccessVpIndex = false
AccessReenlightenmentControls = false
XmmRegistersForFastHypercallAvailable = false
FastHypercallOutputAvailable = false
SintPoillingModeAvailable = false
hypervisor nested virtualization features (0x4000000a):
enlightened VMCS version (low) = 0x1 (1)
enlightened VMCS version (high) = 0x1 (1)
direct virtual flush hypercalls support = true
HvFlushGuestPhysicalAddress* hypercalls = true
enlightened MSR bitmap support = true
extended feature flags (0x80000001/edx):
SYSCALL and SYSRET instructions = true
execution disable = true
1-GB large page support = true
RDTSCP = true
64-bit extensions technology available = true
Intel feature flags (0x80000001/ecx):
LAHF/SAHF supported in 64-bit mode = true
LZCNT advanced bit manipulation = true
3DNow! PREFETCH/PREFETCHW instructions = true
brand = "Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz"
L1 TLB/cache information: 2M/4M pages & L1 TLB (0x80000005/eax):
instruction # entries = 0x0 (0)
instruction associativity = 0x0 (0)
data # entries = 0x0 (0)
data associativity = 0x0 (0)
L1 TLB/cache information: 4K pages & L1 TLB (0x80000005/ebx):
instruction # entries = 0x0 (0)
instruction associativity = 0x0 (0)
data # entries = 0x0 (0)
data associativity = 0x0 (0)
L1 data cache information (0x80000005/ecx):
line size (bytes) = 0x0 (0)
lines per tag = 0x0 (0)
associativity = 0x0 (0)
size (KB) = 0x0 (0)
L1 instruction cache information (0x80000005/edx):
line size (bytes) = 0x0 (0)
lines per tag = 0x0 (0)
associativity = 0x0 (0)
size (KB) = 0x0 (0)
L2 TLB/cache information: 2M/4M pages & L2 TLB (0x80000006/eax):
instruction # entries = 0x0 (0)
instruction associativity = L2 off (0)
data # entries = 0x0 (0)
data associativity = L2 off (0)
L2 TLB/cache information: 4K pages & L2 TLB (0x80000006/ebx):
instruction # entries = 0x0 (0)
instruction associativity = L2 off (0)
data # entries = 0x0 (0)
data associativity = L2 off (0)
L2 unified cache information (0x80000006/ecx):
line size (bytes) = 0x40 (64)
lines per tag = 0x0 (0)
associativity = 8-way (6)
size (KB) = 0x100 (256)
L3 cache information (0x80000006/edx):
line size (bytes) = 0x0 (0)
lines per tag = 0x0 (0)
associativity = L2 off (0)
size (in 512KB units) = 0x0 (0)
RAS Capability (0x80000007/ebx):
MCA overflow recovery support = false
SUCCOR support = false
HWA: hardware assert support = false
scalable MCA support = false
Advanced Power Management Features (0x80000007/ecx):
CmpUnitPwrSampleTimeRatio = 0x0 (0)
Advanced Power Management Features (0x80000007/edx):
TS: temperature sensing diode = false
FID: frequency ID control = false
VID: voltage ID control = false
TTP: thermal trip = false
TM: thermal monitor = false
STC: software thermal control = false
100 MHz multiplier control = false
hardware P-State control = false
TscInvariant = false
CPB: core performance boost = false
read-only effective frequency interface = false
processor feedback interface = false
APM power reporting = false
connected standby = false
RAPL: running average power limit = false
Physical Address and Linear Address Size (0x80000008/eax):
maximum physical address bits = 0x2e (46)
maximum linear (virtual) address bits = 0x30 (48)
maximum guest physical address bits = 0x0 (0)
Extended Feature Extensions ID (0x80000008/ebx):
CLZERO instruction = false
instructions retired count support = false
always save/restore error pointers = false
RDPRU instruction = false
memory bandwidth enforcement = false
WBNOINVD instruction = false
IBPB: indirect branch prediction barrier = false
IBRS: indirect branch restr speculation = false
STIBP: 1 thr indirect branch predictor = false
STIBP always on preferred mode = false
ppin processor id number supported = false
SSBD: speculative store bypass disable = false
virtualized SSBD = false
SSBD fixed in hardware = false
Size Identifiers (0x80000008/ecx):
number of CPU cores = 0x1 (1)
ApicIdCoreIdSize = 0x0 (0)
performance time-stamp counter size = 0x0 (0)
Feature Extended Size (0x80000008/edx):
RDPRU instruction max input support = 0x0 (0)
(multi-processing synth) = multi-core (c=32), hyper-threaded (t=2)
(multi-processing method) = Intel leaf 0xb
(APIC widths synth): CORE_width=6 SMT_width=1
(APIC synth): PKG_ID=0 CORE_ID=0 SMT_ID=0
(uarch synth) = Intel Sunny Cove {Sunny Cove}, 10nm
(synth) = Intel Core (Ice Lake) [Sunny Cove] {Sunny Cove}, 10nm

2023-07-31 22:46:50

by Thomas Gleixner

[permalink] [raw]
Subject: RE: [patch v2 21/38] x86/cpu: Provide cpu_init/parse_topology()

On Mon, Jul 31 2023 at 21:27, Michael Kelley wrote:
> From: Thomas Gleixner <[email protected]> Sent: Monday, July 31, 2023 1:49 PM
>> Is there any indication in some other CPUID leaf which lets us deduce this
>> wreckage?
>
> You can detect being a Hyper-V guest with leaf 0x40000000. See Linux
> kernel function ms_hyperv_platform(). But I'm not aware of anything
> to indicate that a specific Hyper-V VM has the APIC numbering problem
> vs. doesn't have the problem.

That's what I said :) here:

>> I don't think the hypervisor space (0x40000xx) has anything helpful, but
>> staring at the architectural ones provided by hyper-V to the guest might
>> give us an hint. Can you provide a cpuid dump for the boot CPU please?
>>
>
> I'm not sure if you want the raw or decoded output. Here's both.

Either way is fine.

Clearly the hyper-v BIOS people put a lot of thoughts into this:

> x2APIC features / processor topology (0xb):
> extended APIC ID = 0
> --- level 0 ---
> level number = 0x0 (0)
> level type = thread (1)
> bit width of level = 0x1 (1)
> number of logical processors at level = 0x2 (2)
> --- level 1 ---
> level number = 0x1 (1)
> level type = core (2)
> bit width of level = 0x6 (6)
> number of logical processors at level = 0x40 (64)

FAIL: ^^^^^

While that field is not meant for topology evaluation it is at least
expected to tell the actual number of logical processors at that level
which are actually available.

The CPUID APIC ID aka initial_apicid clearly tells that the topology has
36 logical CPUs in package 0 and 36 in package 1 according to your
table.

On real hardware this looks like this:

--- level 1 ---
level number = 0x1 (1)
level type = core (2)
bit width of level = 0x6 (6)
number of logical processors at level = 0x38 (56)

Which corresponds to reality and is consistent. But sure, consistency is
overrated.

Thanks,

tglx





2023-07-31 23:01:33

by Thomas Gleixner

[permalink] [raw]
Subject: RE: [patch v2 21/38] x86/cpu: Provide cpu_init/parse_topology()

On Mon, Jul 31 2023 at 16:10, Michael Kelley wrote:
> From: Thomas Gleixner <[email protected]> Sent: Monday, July 31, 2023 8:38 AM
> The VM is an F72s_v2 in Azure running your patch set. The VM has
> 72 vCPUs in two NUMA nodes across two physical Intel processors, with
> 36 vCPUs in each NUMA node.
>
> The output is from /sys/kernel/debug/x86/topo/cpus, so the initial_apicid
> is from CPUID, while the apicid is from read_apic_id() and matches the
> MADT. As expected, the two values match for the first 36 vCPUs, but differ
> by 28 (decimal) for the remaining 36.
>
> initial_apicid: 0 apicid: 0
...
> initial_apicid: 23 apicid: 23

> initial_apicid: 40 apicid: 24
...
> initial_apicid: 63 apicid: 47

Is there any indication in some other CPUID leaf which lets us deduce this
wreckage?

I don't think the hypervisor space (0x40000xx) has anything helpful, but
staring at the architectural ones provided by hyper-V to the guest might
give us a hint. Can you provide a cpuid dump for the boot CPU please?

Thanks,

tglx



2023-08-01 07:36:06

by Gautham R. Shenoy

[permalink] [raw]
Subject: Re: [patch v2 21/38] x86/cpu: Provide cpu_init/parse_topology()

Hello Thomas,

On Fri, Jul 28, 2023 at 02:13:08PM +0200, Thomas Gleixner wrote:
> Topology evaluation is a complete disaster and impenetrable mess. It's
> scattered all over the place with some vendor implementatins doing early
> evaluation and some not. The most horrific part is the permanent
> overwriting of smt_max_siblings and __max_die_per_package, instead of
> establishing them once on the boot CPU and validating the result on the
> APs.
>
> The goals are:
>
> - One topology evaluation entry point
>
> - Proper sharing of pointlessly duplicated code
>
> - Proper structuring of the evaluation logic and preferences.
>
> - Evaluating important system wide information only once on the boot CPU
>
> - Making the 0xb/0x1f leaf parsing less convoluted and actually fixing
> the short comings of leaf 0x1f evaluation.
>
> Start to consolidate the topology evaluation code by providing the entry
> points for the early boot CPU evaluation and for the final parsing on the
> boot CPU and the APs.
>
> Move the trivial pieces into that new code:
>
> - The initialization of cpuinfo_x86::topo
>
> - The evaluation of CPUID leaf 1, which presets topo::initial_apicid
>
> - topo_apicid is set to topo::initial_apicid when invoked from early
> boot. When invoked for the final evaluation on the boot CPU it reads
> the actual APIC ID, which makes apic_get_initial_apicid() obsolete
> once everything is converted over.
>
> Provide a temporary helper function topo_converted() which shields off the
> not yet converted CPU vendors from invoking code which would break them.
> This shielding covers all vendor CPUs which support SMP, but not the
> historical pure UP ones as they only need the topology info init and
> eventually the initial APIC initialization.
>
> Provide two new members in cpuinfo_x86::topo to store the maximum number of
> SMT siblings and the number of dies per package and add them to the debugfs
> readout. These two members will be used to populate this information on the
> boot CPU and to validate the APs against it.
>
> Signed-off-by: Thomas Gleixner <[email protected]>
> ---
> arch/x86/include/asm/topology.h | 19 +++
> arch/x86/kernel/cpu/Makefile | 3
> arch/x86/kernel/cpu/common.c | 23 +---
> arch/x86/kernel/cpu/cpu.h | 6 +
> arch/x86/kernel/cpu/debugfs.c | 37 ++++++
> arch/x86/kernel/cpu/topology.h | 32 +++++
> arch/x86/kernel/cpu/topology_common.c | 187 ++++++++++++++++++++++++++++++++++
> 7 files changed, 290 insertions(+), 17 deletions(-)
>
> --- a/arch/x86/include/asm/topology.h
> +++ b/arch/x86/include/asm/topology.h
> @@ -102,6 +102,25 @@ static inline void setup_node_to_cpumask
>
> #include <asm-generic/topology.h>
>
> +/* Topology information */
> +enum x86_topology_domains {
> + TOPO_SMT_DOMAIN,
> + TOPO_CORE_DOMAIN,
> + TOPO_MODULE_DOMAIN,
> + TOPO_TILE_DOMAIN,
> + TOPO_DIE_DOMAIN,
> + TOPO_PKG_DOMAIN,
> + TOPO_ROOT_DOMAIN,
> + TOPO_MAX_DOMAIN,
> +};
> +

[..snip..]

> +static void topo_set_ids(struct topo_scan *tscan)
> +{
> + struct cpuinfo_x86 *c = tscan->c;
> + u32 apicid = c->topo.apicid;
> +
> + c->topo.pkg_id = topo_shift_apicid(apicid, TOPO_ROOT_DOMAIN);

Shouldn't this use TOPO_PKG_DOMAIN instead of TOPO_ROOT_DOMAIN ?


> + c->topo.die_id = topo_shift_apicid(apicid, TOPO_DIE_DOMAIN);
> +
> + /* Relative core ID */
> + c->topo.core_id = topo_relative_domain_id(apicid, TOPO_CORE_DOMAIN);
> +}
> +

--
Thanks and Regards
gautham.


2023-08-01 08:57:27

by Thomas Gleixner

[permalink] [raw]
Subject: Re: [patch v2 21/38] x86/cpu: Provide cpu_init/parse_topology()

On Tue, Aug 01 2023 at 12:35, Gautham R. Shenoy wrote:
> On Fri, Jul 28, 2023 at 02:13:08PM +0200, Thomas Gleixner wrote:
>> +static void topo_set_ids(struct topo_scan *tscan)
>> +{
>> + struct cpuinfo_x86 *c = tscan->c;
>> + u32 apicid = c->topo.apicid;
>> +
>> + c->topo.pkg_id = topo_shift_apicid(apicid, TOPO_ROOT_DOMAIN);
>
> Shouldn't this use TOPO_PKG_DOMAIN instead of TOPO_ROOT_DOMAIN ?

Yup. It does not make a difference in that case. That's why I didn't
notice, but let me fix this for consistency's sake.


2023-08-01 23:09:09

by Thomas Gleixner

[permalink] [raw]
Subject: RE: [patch v2 21/38] x86/cpu: Provide cpu_init/parse_topology()

Michael!

On Tue, Aug 01 2023 at 00:12, Thomas Gleixner wrote:
> On Mon, Jul 31 2023 at 21:27, Michael Kelley wrote:
> Clearly the hyper-v BIOS people put a lot of thoughts into this:
>
>> x2APIC features / processor topology (0xb):
>> extended APIC ID = 0
>> --- level 0 ---
>> level number = 0x0 (0)
>> level type = thread (1)
>> bit width of level = 0x1 (1)
>> number of logical processors at level = 0x2 (2)
>> --- level 1 ---
>> level number = 0x1 (1)
>> level type = core (2)
>> bit width of level = 0x6 (6)
>> number of logical processors at level = 0x40 (64)
>
> FAIL: ^^^^^
>
> While that field is not meant for topology evaluation it is at least
> expected to tell the actual number of logical processors at that level
> which are actually available.
>
> The CPUID APIC ID aka initial_apicid clearly tells that the topology has
> 36 logical CPUs in package 0 and 36 in package 1 according to your
> table.
>
> On real hardware this looks like this:
>
> --- level 1 ---
> level number = 0x1 (1)
> level type = core (2)
> bit width of level = 0x6 (6)
> number of logical processors at level = 0x38 (56)
>
> Which corresponds to reality and is consistent. But sure, consistency is
> overrated.

So I looked really hard to find some hint how to detect this situation
on the boot CPU, which allows us to mitigate it, but there is none at
all.

So we are caught between a rock and a hard place, which provides us two
mutually exclusive options to choose from:

1) Have a sane topology evaluation mechanism which solves the known
problems of hybrid systems, wrong sizing estimates and other
unpleasantries.

2) Support the Hyper-V BIOS trainwreck forever.

Unsurprisingly #2 is not really an option as #1 is a crucial issue for
the kernel and we need it resolved urgently as of yesterday.

So while I'm definitely a strong supporter of no-regression policy, I
have to make an argument here why this particular issue is _not_
covered:

1) Hyper-V BIOS/firmware violates the firmware specification and
requirements which are clearly spelled out in the SDM.

2) This violation is reported on every boot with one prominent
message per brought up AP where the initial APIC ID as provided by
CPUID leaf 0xB deviates from the APIC ID read from "hardware", which is
also provided by MADT starting with CPU 36 in the provided example:

"[FIRMWARE BUG] CPU36: APIC id mismatch. Firmware: 40 APIC: 24"

repeating itself up to CPU71 with the relevant diverging APIC IDs.

At least that's what the upstream kernel produces according to
validate_apic_and_package_id() in such a situation.

3) This has been known for years and the Hyper-V Linux team tried to get
this resolved, but obviously their arguments fell on deaf ears.

IOW, the firmware BUG message has been ignored willfully for years
due to "works for me, why should I care?" attitude.

Seriously, kernel development cannot be held hostage forever by the
wilful ignorance of a BIOS team, which refuses to adhere to
specifications and defines their own world order.

The x86 maintainer team is choosing the lesser of two evils and lets
those who created the problem and refused to resolve it deal with the
outcome.

Just to clarify. This is not preventing affected guests from booting.
The worst consequence is a slight performance regression because the
firmware provided topology information is not matching reality and
therefore the scheduler placement vs. L3 affinity sucks. That's clearly
not a kernel problem.

I'm happy to aid accelerating this thought process by elevating the
existing pr_err(FW_BUG....) to a solid WARN_ON_ONCE(). See below.

Thanks,

tglx
---
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1688,7 +1688,7 @@ static void validate_apic_and_package_id

apicid = apic->cpu_present_to_apicid(cpu);

- if (apicid != c->topo.apicid) {
+ if (WARN_ON_ONCE(apicid != c->topo.apicid)) {
pr_err(FW_BUG "CPU%u: APIC id mismatch. Firmware: %x APIC: %x\n",
cpu, apicid, c->topo.initial_apicid);
}

2023-08-02 15:05:23

by Michael Kelley (LINUX)

[permalink] [raw]
Subject: RE: [patch v2 21/38] x86/cpu: Provide cpu_init/parse_topology()

From: Thomas Gleixner <[email protected]> Sent: Tuesday, August 1, 2023 3:25 PM
>
> Michael!
>
> On Tue, Aug 01 2023 at 00:12, Thomas Gleixner wrote:
> > On Mon, Jul 31 2023 at 21:27, Michael Kelley wrote:
> > Clearly the hyper-v BIOS people put a lot of thoughts into this:
> >
> >> x2APIC features / processor topology (0xb):
> >> extended APIC ID = 0
> >> --- level 0 ---
> >> level number = 0x0 (0)
> >> level type = thread (1)
> >> bit width of level = 0x1 (1)
> >> number of logical processors at level = 0x2 (2)
> >> --- level 1 ---
> >> level number = 0x1 (1)
> >> level type = core (2)
> >> bit width of level = 0x6 (6)
> >> number of logical processors at level = 0x40 (64)
> >
> > FAIL: ^^^^^
> >
> > While that field is not meant for topology evaluation it is at least
> > expected to tell the actual number of logical processors at that level
> > which are actually available.
> >
> > The CPUID APIC ID aka initial_apicid clearly tells that the topology has
> > 36 logical CPUs in package 0 and 36 in package 1 according to your
> > table.
> >
> > On real hardware this looks like this:
> >
> > --- level 1 ---
> > level number = 0x1 (1)
> > level type = core (2)
> > bit width of level = 0x6 (6)
> > number of logical processors at level = 0x38 (56)
> >
> > Which corresponds to reality and is consistent. But sure, consistency is
> > overrated.
>
> So I looked really hard to find some hint how to detect this situation
> on the boot CPU, which allows us to mitigate it, but there is none at
> all.
>
> So we are caught between a rock and a hard place, which provides us two
> mutually exclusive options to chose from:
>
> 1) Have a sane topology evaluation mechanism which solves the known
> problems of hybrid systems, wrong sizing estimates and other
> unpleasantries.
>
> 2) Support the Hyper-V BIOS trainwreck forever.
>
> Unsurprisingly #2 is not really an option as #1 is a crucial issue for
> the kernel and we need it resolved urgently as of yesterday.
>
> So while I'm definitely a strong supporter of no-regression policy, I
> have to make an argument here why this particular issue is _not_
> covered:
>
> 1) Hyper-V BIOS/firmware violates the firmware specification and
> requirements which are clearly spelled out in the SDM.
>
> 2) This violatation is reported on every boot with one promiment
> message per brought up AP where the initial APIC ID as provided by
> CPUID leaf 0xB deviates from the APIC ID read from "hardware", which is
> also provided by MADT starting with CPU 36 in the provided example:
>
> "[FIRMWARE BUG] CPU36: APIC id mismatch. Firmware: 40 APIC: 24"
>
> repeating itself up to CPU71 with the relevant diverging APIC IDs.
>
> At least that's what the upstream kernel produces according to
> validate_apic_and_package_id() in such an situation.
>
> 3) This is known for years and the Hyper-V Linux team tried to get this
> resolved, but obviously their arguments fell on deaf ears.
>
> IOW, the firmware BUG message has been ignored willfully for years
> due to "works for me, why should I care?" attitude.
>
> Seriously, kernel development cannot be held hostage forever by the
> wilful ignorance of a BIOS team, which refuses to adhere to
> specifications and defines their own world order.
>
> The x86 maintainer team is chosing the lesser of two evils and lets
> those who created the problem and refused to resolve it deal with the
> outcome.

Fair enough. I don't have any basis to argue otherwise. I'm in
discussions with the Hyper-V team about getting it fully fixed in
Hyper-V, and it looks like there's some movement to make it happen.

>
> Just to clarify. This is not preventing affected guests from booting.
> The worst consequence is a slight performance regression because the
> firmware provided topology information is not matching reality and
> therefore the scheduler placement vs. L3 affinity sucks. That's clearly
> not a kernel problem.

Yes, if Linux still boots and runs, that helps. Then it really is up to the
(virtual) firmware in Hyper-V to provide the correct topology information
so performance is as expected.

>
> I'm happy to aid accelerating this thought process by elevating the
> existing pr_err(FW_BUG....) to a solid WARN_ON_ONCE(). See below.

Your choice. In this particular case, it won't make a difference either
way.

Michael

>
> Thanks,
>
> tglx
> ---
> --- a/arch/x86/kernel/cpu/common.c
> +++ b/arch/x86/kernel/cpu/common.c
> @@ -1688,7 +1688,7 @@ static void validate_apic_and_package_id
>
> apicid = apic->cpu_present_to_apicid(cpu);
>
> - if (apicid != c->topo.apicid) {
> + if (WARN_ON_ONCE(apicid != c->topo.apicid)) {
> pr_err(FW_BUG "CPU%u: APIC id mismatch. Firmware: %x APIC: %x\n",
> cpu, apicid, c->topo.initial_apicid);
> }