2001-10-25 23:03:48

by Martin J. Bligh

[permalink] [raw]
Subject: Patch to read/parse the MPC oem tables

This patch will parse the OEM extensions to the mps tables
(if present). This gives me a mapping to tell which device
lies in which NUMA node (the current code just guesses).

Patch is against 2.4.13 - if it looks OK, please could you add it?

Thanks,

Martin.

Binary files virgin-2.4.13/arch/i386/kernel/.mpparse.c.swp and linux-2.4.13/arch/i386/kernel/.mpparse.c.swp differ
diff -urN virgin-2.4.13/arch/i386/kernel/mpparse.c linux-2.4.13/arch/i386/kernel/mpparse.c
--- virgin-2.4.13/arch/i386/kernel/mpparse.c Thu Oct 4 18:42:54 2001
+++ linux-2.4.13/arch/i386/kernel/mpparse.c Thu Oct 25 10:13:18 2001
@@ -36,6 +36,7 @@
*/
int apic_version [MAX_APICS];
int mp_bus_id_to_type [MAX_MP_BUSSES];
+int mp_bus_id_to_node [MAX_MP_BUSSES];
int mp_bus_id_to_pci_bus [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 };
int mp_current_pci_id;

@@ -55,6 +56,7 @@

/* Processor that is doing the boot up */
unsigned int boot_cpu_physical_apicid = -1U;
+unsigned int boot_cpu_logical_apicid = -1U;
/* Internal processor count */
static unsigned int num_processors;

@@ -118,18 +120,37 @@
return n;
}

+/*
+ * Have to match translation table entries to main table entries by counter
+ * hence the mpc_record variable .... can't see a less disgusting way of
+ * doing this ....
+ */
+static int mpc_record;
+static struct mpc_config_translation *translation_table[MAX_MPC_ENTRY];
+
static void __init MP_processor_info (struct mpc_config_processor *m)
{
- int ver;
+ int ver, quad, logical_apicid;

if (!(m->mpc_cpuflag & CPU_ENABLED))
return;

- printk("Processor #%d %s APIC version %d\n",
- m->mpc_apicid,
- mpc_family( (m->mpc_cpufeature & CPU_FAMILY_MASK)>>8 ,
- (m->mpc_cpufeature & CPU_MODEL_MASK)>>4),
- m->mpc_apicver);
+ if (clustered_apic_mode) {
+ quad = translation_table[mpc_record]->trans_quad;
+ logical_apicid = (quad << 4) +
+ (m->mpc_apicid ? m->mpc_apicid << 1 : 1);
+ printk("Processor #%d %s APIC version %d (quad %d, apic %d)\n",
+ m->mpc_apicid,
+ mpc_family((m->mpc_cpufeature & CPU_FAMILY_MASK)>>8 ,
+ (m->mpc_cpufeature & CPU_MODEL_MASK)>>4),
+ m->mpc_apicver, quad, logical_apicid);
+ } else {
+ printk("Processor #%d %s APIC version %d\n",
+ m->mpc_apicid,
+ mpc_family((m->mpc_cpufeature & CPU_FAMILY_MASK)>>8 ,
+ (m->mpc_cpufeature & CPU_MODEL_MASK)>>4),
+ m->mpc_apicver);
+ }

if (m->mpc_featureflag&(1<<0))
Dprintk(" Floating point unit present.\n");
@@ -181,6 +202,7 @@
if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
Dprintk(" Bootup CPU\n");
boot_cpu_physical_apicid = m->mpc_apicid;
+ boot_cpu_logical_apicid = logical_apicid;
}

num_processors++;
@@ -192,12 +214,11 @@
}
ver = m->mpc_apicver;

- if (clustered_apic_mode)
- /* Crude temporary hack. Assumes processors are sequential */
- phys_cpu_present_map |= 1 << (num_processors-1);
- else
+ if (clustered_apic_mode) {
+ phys_cpu_present_map |= (logical_apicid&0xf) << (4*quad);
+ } else {
phys_cpu_present_map |= 1 << m->mpc_apicid;
-
+ }
/*
* Validate version
*/
@@ -214,7 +235,13 @@

memcpy(str, m->mpc_bustype, 6);
str[6] = 0;
- Dprintk("Bus #%d is %s\n", m->mpc_busid, str);
+
+ if (clustered_apic_mode) {
+ mp_bus_id_to_node[m->mpc_busid] = translation_table[mpc_record]->trans_quad;
+ printk("Bus #%d is %s (node %d)\n", m->mpc_busid, str, mp_bus_id_to_node[m->mpc_busid]);
+ } else {
+ Dprintk("Bus #%d is %s\n", m->mpc_busid, str);
+ }

if (strncmp(str, BUSTYPE_ISA, sizeof(BUSTYPE_ISA)-1) == 0) {
mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA;
@@ -286,6 +313,62 @@
BUG();
}

+static void __init MP_translation_info (struct mpc_config_translation *m)
+{
+ printk("Translation: record %d, type %d, quad %d, global %d, local %d\n", mpc_record, m->trans_type,
+ m->trans_quad, m->trans_global, m->trans_local);
+
+ if (mpc_record >= MAX_MPC_ENTRY)
+ printk("MAX_MPC_ENTRY exceeded!\n");
+ translation_table[mpc_record] = m; /* stash this for later */
+}
+
+/*
+ * Read/parse the MPC oem tables
+ */
+
+static void __init smp_read_mpc_oem(struct mp_config_oemtable *oemtable, \
+ unsigned short oemsize)
+{
+ int count = sizeof (*oemtable); /* the header size */
+ unsigned char *oemptr = ((unsigned char *)oemtable)+count;
+
+ printk("Found an OEM MPC table at %08lx - parsing it ... \n", (u_long) oemtable);
+ if (memcmp(oemtable->oem_signature,MPC_OEM_SIGNATURE,4))
+ {
+ printk("SMP mpc oemtable: bad signature [%c%c%c%c]!\n",
+ oemtable->oem_signature[0],
+ oemtable->oem_signature[1],
+ oemtable->oem_signature[2],
+ oemtable->oem_signature[3]);
+ return;
+ }
+ if (mpf_checksum((unsigned char *)oemtable,oemtable->oem_length))
+ {
+ printk("SMP oem mptable: checksum error!\n");
+ return;
+ }
+ while (count < oemtable->oem_length) {
+ switch (*oemptr) {
+ case MP_TRANSLATION:
+ {
+ struct mpc_config_translation *m=
+ (struct mpc_config_translation *)oemptr;
+ MP_translation_info(m);
+ oemptr += sizeof(*m);
+ count += sizeof(*m);
+ ++mpc_record;
+ break;
+ }
+ default:
+ {
+ printk("Unrecognised OEM table entry type! - %d\n", (int) *oemptr);
+ return;
+ }
+ }
+ }
+}
+
/*
* Read/parse the MPC
*/
@@ -330,6 +413,13 @@
/* save the local APIC address, it might be non-default */
mp_lapic_addr = mpc->mpc_lapic;

+ if (clustered_apic_mode && mpc->mpc_oemptr) {
+ /* We need to process the oem mpc tables to tell us which quad things are in ... */
+ mpc_record = 0;
+ smp_read_mpc_oem((struct mp_config_oemtable *) mpc->mpc_oemptr, mpc->mpc_oemsize);
+ mpc_record = 0;
+ }
+
/*
* Now process the configuration blocks.
*/
@@ -381,7 +471,13 @@
count+=sizeof(*m);
break;
}
+ default:
+ {
+ count = mpc->mpc_length;
+ break;
+ }
}
+ ++mpc_record;
}
if (clustered_apic_mode && nr_ioapics > 2) {
/* don't initialise IO apics on secondary quads */
diff -urN virgin-2.4.13/include/asm-i386/mpspec.h linux-2.4.13/include/asm-i386/mpspec.h
--- virgin-2.4.13/include/asm-i386/mpspec.h Thu Oct 4 18:42:54 2001
+++ linux-2.4.13/include/asm-i386/mpspec.h Thu Oct 25 14:31:12 2001
@@ -16,7 +16,13 @@
/*
* a maximum of 16 APICs with the current APIC ID architecture.
*/
+#ifdef CONFIG_MULTIQUAD
+#define MAX_APICS 256
+#else /* !CONFIG_MULTIQUAD */
#define MAX_APICS 16
+#endif /* CONFIG_MULTIQUAD */
+
+#define MAX_MPC_ENTRY 1024

struct intel_mp_floating
{
@@ -55,6 +61,7 @@
#define MP_IOAPIC 2
#define MP_INTSRC 3
#define MP_LINTSRC 4
+#define MP_TRANSLATION 192

struct mpc_config_processor
{
@@ -144,6 +151,27 @@
unsigned char mpc_destapiclint;
};

+struct mp_config_oemtable
+{
+ char oem_signature[4];
+#define MPC_OEM_SIGNATURE "_OEM"
+ unsigned short oem_length; /* Size of table */
+ char oem_rev; /* 0x01 */
+ char oem_checksum;
+ char mpc_oem[8];
+};
+
+struct mpc_config_translation
+{
+ unsigned char mpc_type;
+ unsigned char trans_len;
+ unsigned char trans_type;
+ unsigned char trans_quad;
+ unsigned char trans_global;
+ unsigned char trans_local;
+ unsigned short trans_reserved;
+};
+
/*
* Default configurations
*
@@ -156,7 +184,12 @@
* 7 2 CPU MCA+PCI
*/

-#define MAX_IRQ_SOURCES 256
+#ifdef CONFIG_MULTIQUAD
+#define MAX_IRQ_SOURCES 512
+#else /* !CONFIG_MULTIQUAD */
+#define MAX_IRQ_SOURCES 128
+#endif /* CONFIG_MULTIQUAD */
+
#define MAX_MP_BUSSES 32
enum mp_bustype {
MP_BUS_ISA = 1,




2001-10-26 16:01:26

by Randy.Dunlap

[permalink] [raw]
Subject: Re: Patch to read/parse the MPC oem tables

Hi Martin-

Overall this looks like a mostly-clean patch.

Questions and comments below.

~Randy


"Martin J. Bligh" wrote:
>
> This patch will parse the OEM extensions to the mps tables
> (if present). This gives me a mapping to tell which device
> lies in which NUMA node (the current code just guesses).

So these extensions are OEM-specific, not part of the MP spec,
right?


> Patch is against 2.4.13 - if it looks OK, please could you add it?

> diff -urN virgin-2.4.13/arch/i386/kernel/mpparse.c linux-2.4.13/arch/i386/kernel/mpparse.c
> --- virgin-2.4.13/arch/i386/kernel/mpparse.c Thu Oct 4 18:42:54 2001
> +++ linux-2.4.13/arch/i386/kernel/mpparse.c Thu Oct 25 10:13:18 2001
> @@ -118,18 +120,37 @@
> +static int mpc_record;
> +static struct mpc_config_translation *translation_table[MAX_MPC_ENTRY];

Could this array be __initdata or reduced in size some,
for people who don't need it? (more about this below)
E.g., I bet most people don't need this static 4 KB array.

Also, could the array of structs <mp_irqs and mp_ioapics> (in
mpparse.c) be made __initdata, so that they could be discarded
after init?
I tested this idea (without this patch, just by changing
mp_irqs[] and mp_ioapics[] to __initdata, and it booted OK,
and they are put into the .data.init section according to
objdump. Are there some other/different problems doing this, anyone?
OTOH, with a 16 GB system, you won't worry much about saving a few KB,
eh?


> @@ -286,6 +313,62 @@
>
> +static void __init MP_translation_info (struct mpc_config_translation *m)
> +{
> + printk("Translation: record %d, type %d, quad %d, global %d, local %d\n", mpc_record, m->trans_type,
> + m->trans_quad, m->trans_global, m->trans_local);
> +
> + if (mpc_record >= MAX_MPC_ENTRY)
> + printk("MAX_MPC_ENTRY exceeded!\n");

Add "else" here to keep from stepping out of the array bounds.

> + translation_table[mpc_record] = m; /* stash this for later */
> +}
> +
> +/*
> + * Read/parse the MPC oem tables
> + */
> +
> +static void __init smp_read_mpc_oem(struct mp_config_oemtable *oemtable, \
> + unsigned short oemsize)
> +{
> + int count = sizeof (*oemtable); /* the header size */
> + unsigned char *oemptr = ((unsigned char *)oemtable)+count;
> +
> + printk("Found an OEM MPC table at %08lx - parsing it ... \n", (u_long) oemtable);

BTW, "%p" prints pointers also, without casting them.

> + while (count < oemtable->oem_length) {
> + switch (*oemptr) {
> + case MP_TRANSLATION:
> + {
> + struct mpc_config_translation *m=
> + (struct mpc_config_translation *)oemptr;
> + MP_translation_info(m);
> + oemptr += sizeof(*m);
> + count += sizeof(*m);
> + ++mpc_record;
> + break;
> + }

> /*
> * Read/parse the MPC
> */
> @@ -330,6 +413,13 @@
> /* save the local APIC address, it might be non-default */
> mp_lapic_addr = mpc->mpc_lapic;
>
> + if (clustered_apic_mode && mpc->mpc_oemptr) {
> + /* We need to process the oem mpc tables to tell us which quad things are in ... */
> + mpc_record = 0;
> + smp_read_mpc_oem((struct mp_config_oemtable *) mpc->mpc_oemptr, mpc->mpc_oemsize);
> + mpc_record = 0;

What's this =0 for?

> @@ -381,7 +471,13 @@
> count+=sizeof(*m);
> break;
> }
> + default:
> + {
> + count = mpc->mpc_length;
> + break;
> + }
> }
> + ++mpc_record;

And what's this increment for?

> diff -urN virgin-2.4.13/include/asm-i386/mpspec.h linux-2.4.13/include/asm-i386/mpspec.h
> --- virgin-2.4.13/include/asm-i386/mpspec.h Thu Oct 4 18:42:54 2001
> +++ linux-2.4.13/include/asm-i386/mpspec.h Thu Oct 25 14:31:12 2001
> @@ -16,7 +16,13 @@
> /*
> * a maximum of 16 APICs with the current APIC ID architecture.
> */
> +#ifdef CONFIG_MULTIQUAD
> +#define MAX_APICS 256
> +#else /* !CONFIG_MULTIQUAD */
> #define MAX_APICS 16
> +#endif /* CONFIG_MULTIQUAD */
> +
> +#define MAX_MPC_ENTRY 1024

How about #defining MAX_MPC_ENTRY above here (depending on MULTIQUAD),
so that it can be smaller for non-MULTIQUAD targets?

> @@ -55,6 +61,7 @@
> #define MP_IOAPIC 2
> #define MP_INTSRC 3
> #define MP_LINTSRC 4
> +#define MP_TRANSLATION 192

Where does this value (192) come from, and the
mpc_config_oemtable and mpc_config_translation structs?
Not in the MP 1.4 spec, right? (yes, I searched)
So maybe some comment about it being used by IBM would
be good (or even qualified by CONFIG_MULTIQUAD somehow;
that would be easy in the .h file, but not so easy
in mpparse.c -- without being ugly).

Or is it some de facto standard?
Is it used by other large-systems manufacturers for the
same purpose?

> @@ -144,6 +151,27 @@
> +struct mp_config_oemtable
> +{
> + char oem_signature[4];
> +#define MPC_OEM_SIGNATURE "_OEM"
> + unsigned short oem_length; /* Size of table */
> + char oem_rev; /* 0x01 */
> + char oem_checksum;
> + char mpc_oem[8];
> +};
> +
> +struct mpc_config_translation
> +{
> + unsigned char mpc_type;
> + unsigned char trans_len;
> + unsigned char trans_type;
> + unsigned char trans_quad;
> + unsigned char trans_global;
> + unsigned char trans_local;
> + unsigned short trans_reserved;
> +};

2001-10-26 19:41:56

by Martin J. Bligh

[permalink] [raw]
Subject: Re: Patch to read/parse the MPC oem tables

>> This patch will parse the OEM extensions to the mps tables
>> (if present). This gives me a mapping to tell which device
>> lies in which NUMA node (the current code just guesses).
>
> So these extensions are OEM-specific, not part of the MP spec,
> right?

As I understand this, the concept of OEM extensions is inside
the MPS spec (there's a pointer for it inside the main table),
though what's actually contained therein is OEM specific.

>> +static int mpc_record;
>> +static struct mpc_config_translation *translation_table[MAX_MPC_ENTRY];
>
> Could this array be __initdata or reduced in size some,
> for people who don't need it? (more about this below)
> E.g., I bet most people don't need this static 4 KB array.

I originally had this under #ifdef, but that causes compilation
problems. __initdata is probably better - thanks. Tested it,
seems to work fine.

> Also, could the array of structs <mp_irqs and mp_ioapics> (in
> mpparse.c) be made __initdata, so that they could be discarded
> after init?

Probably, but they're used in lots of places, so it would take some
research to figure out all the possible combinations ;-) I'll leave that
possiblity for a seperate patch (the structures were there already
like this).

> Add "else" here to keep from stepping out of the array bounds.

Good point. Fixed.

> BTW, "%p" prints pointers also, without casting them.

OK, I'll try that.

>> + mpc_record = 0;
>
> What's this =0 for?
> ...
> And what's this increment for?

We go round the counter twice - the only way to match up the
records in the trans table to the records in the main table is by
their position. The first time we go round the counter (for the
trans table), we record everything in it's appropriate position in
the trans array, the next time, the counter is an index to fish
things back out of the trans array by.

Or, at least, that was the intent ;-) Seems to work.

>> +#define MAX_MPC_ENTRY 1024
>
> How about #defining MAX_MPC_ENTRY above here (depending on MULTIQUAD),
> so that it can be smaller for non-MULTIQUAD targets?

That would be slightly messy - it's meant to indicate the maximum
possible number of entries in the MPC table. With your __initdata
suggestion above, it doesn't matter anyway - all the mem we use
will be freed.

>> +#define MP_TRANSLATION 192
>
> Where does this value (192) come from, and the

Reverse engineering. You're right - a comment would be appropriate.
Fixed.

A patch is attached below to fix these issues, plus one other bit
of idiocy I found - I'd accidentally reduced the number of
MAX_IRQ_SOURCES (I think I pulled the change forward from
an older kernel, and dropped someone's fix. Oops).

----------------------------------------------------------------------------------------

diff -urN linux-2.4.13-mpparse.3/arch/i386/kernel/mpparse.c linux-2.4.13-mpparse.4/arch/i386/kernel/mpparse.c
--- linux-2.4.13-mpparse.3/arch/i386/kernel/mpparse.c Fri Oct 26 11:25:46 2001
+++ linux-2.4.13-mpparse.4/arch/i386/kernel/mpparse.c Fri Oct 26 10:52:29 2001
@@ -126,7 +126,7 @@
* doing this ....
*/
static int mpc_record;
-static struct mpc_config_translation *translation_table[MAX_MPC_ENTRY];
+static struct mpc_config_translation *translation_table[MAX_MPC_ENTRY] __initdata;

static void __init MP_processor_info (struct mpc_config_processor *m)
{
@@ -320,7 +320,8 @@

if (mpc_record >= MAX_MPC_ENTRY)
printk("MAX_MPC_ENTRY exceeded!\n");
- translation_table[mpc_record] = m; /* stash this for later */
+ else
+ translation_table[mpc_record] = m; /* stash this for later */
}

/*
@@ -333,7 +334,7 @@
int count = sizeof (*oemtable); /* the header size */
unsigned char *oemptr = ((unsigned char *)oemtable)+count;

- printk("Found an OEM MPC table at %08lx - parsing it ... \n", (u_long) oemtable);
+ printk("Found an OEM MPC table at %8p - parsing it ... \n", oemtable);
if (memcmp(oemtable->oem_signature,MPC_OEM_SIGNATURE,4))
{
printk("SMP mpc oemtable: bad signature [%c%c%c%c]!\n",
diff -urN linux-2.4.13-mpparse.3/include/asm-i386/mpspec.h linux-2.4.13-mpparse.4/include/asm-i386/mpspec.h
--- linux-2.4.13-mpparse.3/include/asm-i386/mpspec.h Fri Oct 26 11:25:46 2001
+++ linux-2.4.13-mpparse.4/include/asm-i386/mpspec.h Fri Oct 26 10:20:22 2001
@@ -61,7 +61,7 @@
#define MP_IOAPIC 2
#define MP_INTSRC 3
#define MP_LINTSRC 4
-#define MP_TRANSLATION 192
+#define MP_TRANSLATION 192 /* Used by IBM NUMA-Q to describe node locality */

struct mpc_config_processor
{
@@ -187,7 +187,7 @@
#ifdef CONFIG_MULTIQUAD
#define MAX_IRQ_SOURCES 512
#else /* !CONFIG_MULTIQUAD */
-#define MAX_IRQ_SOURCES 128
+#define MAX_IRQ_SOURCES 256
#endif /* CONFIG_MULTIQUAD */

#define MAX_MP_BUSSES 32




2001-10-26 21:33:58

by Randy.Dunlap

[permalink] [raw]
Subject: Re: Patch to read/parse the MPC oem tables

"Martin J. Bligh" wrote:
>
> >> This patch will parse the OEM extensions to the mps tables
> >> (if present). This gives me a mapping to tell which device
> >> lies in which NUMA node (the current code just guesses).
> >
> > So these extensions are OEM-specific, not part of the MP spec,
> > right?
>
> As I understand this, the concept of OEM extensions is inside
> the MPS spec (there's a pointer for it inside the main table),
> though what's actually contained therein is OEM specific.

Right/agreed. I didn't mean to imply that OEM extensions were
new, just that their contents are OEM-specific and not part
of the spec.

> > Also, could the array of structs <mp_irqs and mp_ioapics> (in
> > mpparse.c) be made __initdata, so that they could be discarded
> > after init?
>
> Probably, but they're used in lots of places, so it would take some
> research to figure out all the possible combinations ;-) I'll leave that
> possiblity for a seperate patch (the structures were there already
> like this).

Right. :)

> A patch is attached below to fix these issues, plus one other bit
> of idiocy I found - I'd accidentally reduced the number of
> MAX_IRQ_SOURCES (I think I pulled the change forward from
> an older kernel, and dropped someone's fix. Oops).

Looks sane to me.

Thanks,
~Randy