2023-07-24 18:32:41

by Thomas Gleixner

[permalink] [raw]
Subject: [patch 17/29] x86/cpu: Provide a sane leaf 0xb/0x1f parser

detect_extended_topology() along with its early() variant is a classic
example of duct tape engineering:

- It evaluates an array of subleafs with a boatload of local variables
for the relevant topology levels instead of using an array to save the
enumerated information and propagate it to the right level

- It has no boundary checks for subleafs

- It prevents updating the die_id with a crude workaround instead of
checking for leaf 0xb which does not provide die information.

- It's broken vs. the number of dies evaluation as it uses:

num_processors[DIE_LEVEL] / num_processors[CORE_LEVEL]

which only "works" correctly if none of the intermediate topology
levels (MODULE/TILE) are enumerated.

There is zero value in trying to "fix" that code as the only proper fix is
to rewrite it from scratch.

Implement a sane parser with proper code documentation, which will be used
for the consolidated topology evaluation in the next step.

Signed-off-by: Thomas Gleixner <[email protected]>
---
arch/x86/kernel/cpu/Makefile | 2
arch/x86/kernel/cpu/topology.h | 12 +++
arch/x86/kernel/cpu/topology_ext.c | 136 +++++++++++++++++++++++++++++++++++++
3 files changed, 149 insertions(+), 1 deletion(-)

--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -18,7 +18,7 @@ KMSAN_SANITIZE_common.o := n
KCSAN_SANITIZE_common.o := n

obj-y := cacheinfo.o scattered.o
-obj-y += topology_common.o topology.o
+obj-y += topology_common.o topology_ext.o topology.o
obj-y += common.o
obj-y += rdrand.o
obj-y += match.o
--- a/arch/x86/kernel/cpu/topology.h
+++ b/arch/x86/kernel/cpu/topology.h
@@ -16,6 +16,7 @@ void cpu_init_topology(struct cpuinfo_x8
void cpu_parse_topology(struct cpuinfo_x86 *c);
void topology_set_dom(struct topo_scan *tscan, enum x86_topology_domains dom,
unsigned int shift, unsigned int ncpus);
+bool cpu_parse_topology_ext(struct topo_scan *tscan);

static inline u16 topo_shift_apicid(u16 apicid, enum x86_topology_domains dom)
{
@@ -31,4 +32,15 @@ static inline u16 topo_relative_domain_i
return apicid & (x86_topo_system.dom_size[dom] - 1);
}

+/*
+ * Update a domain level after the fact without propagating. Used to fixup
+ * broken CPUID enumerations.
+ */
+static inline void topology_update_dom(struct topo_scan *tscan, enum x86_topology_domains dom,
+ unsigned int shift, unsigned int ncpus)
+{
+ tscan->dom_shifts[dom] = shift;
+ tscan->dom_ncpus[dom] = ncpus;
+}
+
#endif /* ARCH_X86_TOPOLOGY_H */
--- /dev/null
+++ b/arch/x86/kernel/cpu/topology_ext.c
@@ -0,0 +1,136 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/cpu.h>
+
+#include <asm/apic.h>
+#include <asm/memtype.h>
+#include <asm/processor.h>
+
+#include "cpu.h"
+
+enum topo_types {
+ INVALID_TYPE = 0,
+ SMT_TYPE = 1,
+ CORE_TYPE = 2,
+ MODULE_TYPE = 3,
+ TILE_TYPE = 4,
+ DIE_TYPE = 5,
+ DIEGRP_TYPE = 6,
+ MAX_TYPE = 7,
+};
+
+/*
+ * Use a lookup table for the case that there are future types > 6 which
+ * describe an intermediate domain level which does not exist today.
+ *
+ * A table will also be handy to parse the new AMD 0x80000026 leaf which
+ * has defined different domain types, but otherwise uses the same layout
+ * with some of the reserved bits used for new information.
+ */
+static const unsigned int topo_domain_map[MAX_TYPE] = {
+ [SMT_TYPE] = TOPO_SMT_DOMAIN,
+ [CORE_TYPE] = TOPO_CORE_DOMAIN,
+ [MODULE_TYPE] = TOPO_MODULE_DOMAIN,
+ [TILE_TYPE] = TOPO_TILE_DOMAIN,
+ [DIE_TYPE] = TOPO_DIE_DOMAIN,
+ [DIEGRP_TYPE] = TOPO_PKG_DOMAIN,
+};
+
+static inline bool topo_subleaf(struct topo_scan *tscan, u32 leaf, u32 subleaf)
+{
+ unsigned int dom, maxtype = leaf == 0xb ? CORE_TYPE + 1 : MAX_TYPE;
+ struct {
+ // eax
+ u32 x2apic_shift : 5, // Number of bits to shift APIC ID right
+ // for the topology ID at the next level
+ __rsvd0 : 27; // Reserved
+ // ebx
+ u32 num_processors : 16, // Number of processors at current level
+ __rsvd1 : 16; // Reserved
+ // ecx
+ u32 level : 8, // Current topology level. Same as sub leaf number
+ type : 8, // Level type. If 0, invalid
+ __rsvd2 : 16; // Reserved
+ // edx
+ u32 x2apic_id : 32; // X2APIC ID of the current logical processor
+ } sl;
+
+ cpuid_subleaf(leaf, subleaf, &sl);
+
+ if (!sl.num_processors || sl.type == INVALID_TYPE)
+ return false;
+
+ if (sl.type >= maxtype) {
+ /*
+ * As the subleafs are ordered in domain level order, this
+ * could be recovered in theory by propagating the
+ * information at the last parsed level.
+ *
+ * But if the infinite wisdom of hardware folks decides to
+ * create a new domain type between CORE and MODULE or DIE
+ * and DIEGRP, then that would overwrite the CORE or DIE
+ * information.
+ *
+ * It really would have been too obvious to make the domain
+ * type space sparse and leave a few reserved types between
+ * the points which might change instead of forcing
+ * software to either create a monstrosity of workarounds
+ * or just being up the creek without a paddle.
+ *
+ * Refuse to implement monstrosity, emit an error and try
+ * to survive.
+ */
+ pr_err_once("Topology: leaf 0x%x:%d Unknown domain type %u\n",
+ leaf, subleaf, sl.type);
+ return true;
+ }
+
+ dom = topo_domain_map[sl.type];
+ if (!dom) {
+ tscan->c->topo.initial_apicid = sl.x2apic_id;
+ } else if (tscan->c->topo.initial_apicid != sl.x2apic_id) {
+ pr_warn_once(FW_BUG "CPUID leaf 0x%x subleaf %d APIC ID mismatch %x != %x\n",
+ leaf, subleaf, tscan->c->topo.initial_apicid, sl.x2apic_id);
+ }
+
+ topology_set_dom(tscan, dom, sl.x2apic_shift, sl.num_processors);
+ return true;
+}
+
+static bool parse_topology_leaf(struct topo_scan *tscan, u32 leaf)
+{
+ u32 subleaf;
+
+ if (tscan->c->cpuid_level < leaf)
+ return false;
+
+ /* Read all available subleafs and populate the levels */
+ for (subleaf = 0; topo_subleaf(tscan, leaf, subleaf); subleaf++);
+
+ /* If subleaf 0 failed to parse, give up */
+ if (!subleaf)
+ return false;
+
+ /*
+ * There are machines in the wild which have shift 0 in the subleaf
+ * 0, but advertise 2 logical processors at that level. They are
+ * truly SMT.
+ */
+ if (!tscan->dom_shifts[TOPO_SMT_DOMAIN] && tscan->dom_ncpus[TOPO_SMT_DOMAIN] > 1) {
+ u16 sft = get_count_order(tscan->dom_ncpus[TOPO_SMT_DOMAIN]);
+
+ pr_warn_once(FW_BUG "CPUID leaf 0x%x subleaf 0 has shift level 0 but %u CPUs\n",
+ leaf, tscan->dom_ncpus[TOPO_SMT_DOMAIN]);
+ topology_update_dom(tscan, TOPO_SMT_DOMAIN, sft, tscan->dom_ncpus[TOPO_SMT_DOMAIN]);
+ }
+
+ set_cpu_cap(tscan->c, X86_FEATURE_XTOPOLOGY);
+ return true;
+}
+
+bool cpu_parse_topology_ext(struct topo_scan *tscan)
+{
+ /* Try leaf 0x1F first. If not available try leaf 0x0b */
+ if (parse_topology_leaf(tscan, 0x1f))
+ return true;
+ return parse_topology_leaf(tscan, 0x0b);
+}



2023-07-24 21:03:03

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [patch 17/29] x86/cpu: Provide a sane leaf 0xb/0x1f parser

On Mon, Jul 24, 2023 at 07:44:17PM +0200, Thomas Gleixner wrote:

> +static inline bool topo_subleaf(struct topo_scan *tscan, u32 leaf, u32 subleaf)
> +{
> + unsigned int dom, maxtype = leaf == 0xb ? CORE_TYPE + 1 : MAX_TYPE;
> + struct {
> + // eax
> + u32 x2apic_shift : 5, // Number of bits to shift APIC ID right
> + // for the topology ID at the next level
> + __rsvd0 : 27; // Reserved
> + // ebx
> + u32 num_processors : 16, // Number of processors at current level
> + __rsvd1 : 16; // Reserved
> + // ecx
> + u32 level : 8, // Current topology level. Same as sub leaf number
> + type : 8, // Level type. If 0, invalid
> + __rsvd2 : 16; // Reserved
> + // edx
> + u32 x2apic_id : 32; // X2APIC ID of the current logical processor

That comment seems inconsistent; either have them all aligned or move
all register names left.

But yeah, I think this is more or less what we ended up with last time I
went through this.

> + } sl;
> +
> + cpuid_subleaf(leaf, subleaf, &sl);
> +
> + if (!sl.num_processors || sl.type == INVALID_TYPE)
> + return false;
> +
> + if (sl.type >= maxtype) {
> + /*
> + * As the subleafs are ordered in domain level order, this
> + * could be recovered in theory by propagating the
> + * information at the last parsed level.
> + *
> + * But if the infinite wisdom of hardware folks decides to
> + * create a new domain type between CORE and MODULE or DIE
> + * and DIEGRP, then that would overwrite the CORE or DIE
> + * information.
> + *
> + * It really would have been too obvious to make the domain
> + * type space sparse and leave a few reserved types between
> + * the points which might change instead of forcing
> + * software to either create a monstrosity of workarounds
> + * or just being up the creek without a paddle.
> + *
> + * Refuse to implement monstrosity, emit an error and try
> + * to survive.
> + */
> + pr_err_once("Topology: leaf 0x%x:%d Unknown domain type %u\n",
> + leaf, subleaf, sl.type);
> + return true;
> + }
> +
> + dom = topo_domain_map[sl.type];
> + if (!dom) {
> + tscan->c->topo.initial_apicid = sl.x2apic_id;
> + } else if (tscan->c->topo.initial_apicid != sl.x2apic_id) {
> + pr_warn_once(FW_BUG "CPUID leaf 0x%x subleaf %d APIC ID mismatch %x != %x\n",
> + leaf, subleaf, tscan->c->topo.initial_apicid, sl.x2apic_id);
> + }
> +
> + topology_set_dom(tscan, dom, sl.x2apic_shift, sl.num_processors);
> + return true;
> +}
> +
> +static bool parse_topology_leaf(struct topo_scan *tscan, u32 leaf)
> +{
> + u32 subleaf;
> +
> + if (tscan->c->cpuid_level < leaf)
> + return false;
> +
> + /* Read all available subleafs and populate the levels */
> + for (subleaf = 0; topo_subleaf(tscan, leaf, subleaf); subleaf++);

Personally I prefer:

for (;;)
;

that is, have the semicolon on its own line, but meh.

> +
> + /* If subleaf 0 failed to parse, give up */
> + if (!subleaf)
> + return false;
> +
> + /*
> + * There are machines in the wild which have shift 0 in the subleaf
> + * 0, but advertise 2 logical processors at that level. They are
> + * truly SMT.
> + */
> + if (!tscan->dom_shifts[TOPO_SMT_DOMAIN] && tscan->dom_ncpus[TOPO_SMT_DOMAIN] > 1) {
> + u16 sft = get_count_order(tscan->dom_ncpus[TOPO_SMT_DOMAIN]);
> +
> + pr_warn_once(FW_BUG "CPUID leaf 0x%x subleaf 0 has shift level 0 but %u CPUs\n",
> + leaf, tscan->dom_ncpus[TOPO_SMT_DOMAIN]);
> + topology_update_dom(tscan, TOPO_SMT_DOMAIN, sft, tscan->dom_ncpus[TOPO_SMT_DOMAIN]);
> + }
> +
> + set_cpu_cap(tscan->c, X86_FEATURE_XTOPOLOGY);
> + return true;
> +}
> +
> +bool cpu_parse_topology_ext(struct topo_scan *tscan)
> +{
> + /* Try lead 0x1F first. If not available try leaf 0x0b */
> + if (parse_topology_leaf(tscan, 0x1f))
> + return true;
> + return parse_topology_leaf(tscan, 0x0b);
> +}
>

2023-07-24 21:17:37

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [patch 17/29] x86/cpu: Provide a sane leaf 0xb/0x1f parser

On Mon, Jul 24, 2023 at 10:49:42PM +0200, Peter Zijlstra wrote:
> On Mon, Jul 24, 2023 at 07:44:17PM +0200, Thomas Gleixner wrote:
>
> > +static inline bool topo_subleaf(struct topo_scan *tscan, u32 leaf, u32 subleaf)
> > +{
> > + unsigned int dom, maxtype = leaf == 0xb ? CORE_TYPE + 1 : MAX_TYPE;
> > + struct {
> > + // eax
> > + u32 x2apic_shift : 5, // Number of bits to shift APIC ID right
> > + // for the topology ID at the next level
> > + __rsvd0 : 27; // Reserved
> > + // ebx
> > + u32 num_processors : 16, // Number of processors at current level
> > + __rsvd1 : 16; // Reserved
> > + // ecx
> > + u32 level : 8, // Current topology level. Same as sub leaf number
> > + type : 8, // Level type. If 0, invalid
> > + __rsvd2 : 16; // Reserved
> > + // edx
> > + u32 x2apic_id : 32; // X2APIC ID of the current logical processor
>
> That comment seems inconsistent, either have then all aligned or move
> all register names left.

AMD code seems to have the reg names left aligned -- perhaps do the same
here.

2023-07-25 07:36:01

by Thomas Gleixner

[permalink] [raw]
Subject: Re: [patch 17/29] x86/cpu: Provide a sane leaf 0xb/0x1f parser

On Mon, Jul 24 2023 at 22:49, Peter Zijlstra wrote:
> On Mon, Jul 24, 2023 at 07:44:17PM +0200, Thomas Gleixner wrote:
>
>> +static inline bool topo_subleaf(struct topo_scan *tscan, u32 leaf, u32 subleaf)
>> +{
>> + unsigned int dom, maxtype = leaf == 0xb ? CORE_TYPE + 1 : MAX_TYPE;
>> + struct {
>> + // eax
>> + u32 x2apic_shift : 5, // Number of bits to shift APIC ID right
>> + // for the topology ID at the next level
>> + __rsvd0 : 27; // Reserved
>> + // ebx
>> + u32 num_processors : 16, // Number of processors at current level
>> + __rsvd1 : 16; // Reserved
>> + // ecx
>> + u32 level : 8, // Current topology level. Same as sub leaf number
>> + type : 8, // Level type. If 0, invalid
>> + __rsvd2 : 16; // Reserved
>> + // edx
>> + u32 x2apic_id : 32; // X2APIC ID of the current logical processor
>
> That comment seems inconsistent, either have then all aligned or move
> all register names left.

Bah. I had all the register names left at some point. No idea how I lost
that again. Probably when I rolled back to some earlier version after
screwing up :)

>> +
>> + /* Read all available subleafs and populate the levels */
>> + for (subleaf = 0; topo_subleaf(tscan, leaf, subleaf); subleaf++);
>
> Personally I prefer:
>
> for (;;)
> ;
>
> that is, have the semicolon on it's own line, but meh.

:)