2024-04-04 15:15:20

by Yazen Ghannam

[permalink] [raw]
Subject: [PATCH v2 04/16] x86/mce/amd: Look up bank type by IPID

Scalable MCA systems use values within the MCA_IPID register to describe
a bank's type. Other information is not needed.

Currently, the bank types are cached during boot and this information is
used during boot and run time. The cached values are per-CPU and
per-bank. The boot path needs the cached values, but this should be
removed. The run time path does not need the cached values.

Determine a Scalable MCA bank's type using only the MCA_IPID values.

Keep old code until init path is cleaned up.

Signed-off-by: Yazen Ghannam <[email protected]>
---

Notes:
Link:
https://lkml.kernel.org/r/[email protected]

v1->v2:
* Include bitops started in dropped patches. (Yazen)
* Update all users of smca_get_bank_type(). (Yazen)

arch/x86/include/asm/mce.h | 4 +-
arch/x86/kernel/cpu/mce/amd.c | 99 ++++++++++++++++++++++---
drivers/edac/amd64_edac.c | 2 +-
drivers/edac/mce_amd.c | 2 +-
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 2 +-
5 files changed, 94 insertions(+), 15 deletions(-)

diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index de3118305838..adad99bac567 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -59,8 +59,6 @@
* - TCC bit is present in MCx_STATUS.
*/
#define MCI_CONFIG_MCAX 0x1
-#define MCI_IPID_MCATYPE 0xFFFF0000
-#define MCI_IPID_HWID 0xFFF

/*
* Note that the full MCACOD field of IA32_MCi_STATUS MSR is
@@ -342,7 +340,7 @@ extern int mce_threshold_create_device(unsigned int cpu);
extern int mce_threshold_remove_device(unsigned int cpu);

void mce_amd_feature_init(struct cpuinfo_x86 *c);
-enum smca_bank_types smca_get_bank_type(unsigned int cpu, unsigned int bank);
+enum smca_bank_types smca_get_bank_type(u64 ipid);
#else

static inline int mce_threshold_create_device(unsigned int cpu) { return 0; };
diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c
index bc78e751dfcc..c76bc158b6b6 100644
--- a/arch/x86/kernel/cpu/mce/amd.c
+++ b/arch/x86/kernel/cpu/mce/amd.c
@@ -7,6 +7,7 @@
*
* All MC4_MISCi registers are shared between cores on a node.
*/
+#include <linux/bitfield.h>
#include <linux/interrupt.h>
#include <linux/notifier.h>
#include <linux/kobject.h>
@@ -51,6 +52,10 @@
#define DEF_INT_TYPE_APIC 0x2

/* Scalable MCA: */
+#define MCI_IPID_MCATYPE GENMASK_ULL(47, 44)
+#define MCI_IPID_HWID GENMASK_ULL(43, 32)
+#define MCI_IPID_MCATYPE_OLD 0xFFFF0000
+#define MCI_IPID_HWID_OLD 0xFFF

/* Threshold LVT offset is at MSR0xC0000410[15:12] */
#define SMCA_THR_LVT_OFF 0xF000
@@ -131,7 +136,7 @@ static const char *smca_get_name(enum smca_bank_types t)
return smca_names[t];
}

-enum smca_bank_types smca_get_bank_type(unsigned int cpu, unsigned int bank)
+static enum smca_bank_types smca_get_bank_type_old(unsigned int cpu, unsigned int bank)
{
struct smca_bank *b;

@@ -144,9 +149,8 @@ enum smca_bank_types smca_get_bank_type(unsigned int cpu, unsigned int bank)

return b->hwid->bank_type;
}
-EXPORT_SYMBOL_GPL(smca_get_bank_type);

-static const struct smca_hwid smca_hwid_mcatypes[] = {
+static const struct smca_hwid smca_hwid_mcatypes_old[] = {
/* { bank_type, hwid_mcatype } */

/* Reserved type */
@@ -210,6 +214,83 @@ static const struct smca_hwid smca_hwid_mcatypes[] = {
{ SMCA_GMI_PHY, HWID_MCATYPE(0x269, 0x0) },
};

+/* Keep sorted first by HWID then by McaType. */
+static const u32 smca_hwid_mcatypes[] = {
+ /* Reserved type */
+ [SMCA_RESERVED] = HWID_MCATYPE(0x00, 0x0),
+
+ /* System Management Unit MCA type */
+ [SMCA_SMU] = HWID_MCATYPE(0x01, 0x0),
+ [SMCA_SMU_V2] = HWID_MCATYPE(0x01, 0x1),
+
+ /* Microprocessor 5 Unit MCA type */
+ [SMCA_MP5] = HWID_MCATYPE(0x01, 0x2),
+
+ /* MPDMA MCA type */
+ [SMCA_MPDMA] = HWID_MCATYPE(0x01, 0x3),
+
+ /* Parameter Block MCA type */
+ [SMCA_PB] = HWID_MCATYPE(0x05, 0x0),
+
+ /* Northbridge IO Unit MCA type */
+ [SMCA_NBIO] = HWID_MCATYPE(0x18, 0x0),
+
+ /* Data Fabric MCA types */
+ [SMCA_CS] = HWID_MCATYPE(0x2E, 0x0),
+ [SMCA_PIE] = HWID_MCATYPE(0x2E, 0x1),
+ [SMCA_CS_V2] = HWID_MCATYPE(0x2E, 0x2),
+
+ /* PCI Express Unit MCA type */
+ [SMCA_PCIE] = HWID_MCATYPE(0x46, 0x0),
+ [SMCA_PCIE_V2] = HWID_MCATYPE(0x46, 0x1),
+
+ [SMCA_XGMI_PCS] = HWID_MCATYPE(0x50, 0x0),
+ [SMCA_NBIF] = HWID_MCATYPE(0x6C, 0x0),
+ [SMCA_SHUB] = HWID_MCATYPE(0x80, 0x0),
+
+ /* Unified Memory Controller MCA type */
+ [SMCA_UMC] = HWID_MCATYPE(0x96, 0x0),
+ [SMCA_UMC_V2] = HWID_MCATYPE(0x96, 0x1),
+
+ [SMCA_SATA] = HWID_MCATYPE(0xA8, 0x0),
+ [SMCA_USB] = HWID_MCATYPE(0xAA, 0x0),
+
+ /* ZN Core (HWID=0xB0) MCA types */
+ [SMCA_LS] = HWID_MCATYPE(0xB0, 0x0),
+ [SMCA_IF] = HWID_MCATYPE(0xB0, 0x1),
+ [SMCA_L2_CACHE] = HWID_MCATYPE(0xB0, 0x2),
+ [SMCA_DE] = HWID_MCATYPE(0xB0, 0x3),
+ /* HWID 0xB0 MCATYPE 0x4 is Reserved */
+ [SMCA_EX] = HWID_MCATYPE(0xB0, 0x5),
+ [SMCA_FP] = HWID_MCATYPE(0xB0, 0x6),
+ [SMCA_L3_CACHE] = HWID_MCATYPE(0xB0, 0x7),
+ [SMCA_LS_V2] = HWID_MCATYPE(0xB0, 0x10),
+
+ /* Platform Security Processor MCA type */
+ [SMCA_PSP] = HWID_MCATYPE(0xFF, 0x0),
+ [SMCA_PSP_V2] = HWID_MCATYPE(0xFF, 0x1),
+
+ [SMCA_GMI_PCS] = HWID_MCATYPE(0x241, 0x0),
+ [SMCA_XGMI_PHY] = HWID_MCATYPE(0x259, 0x0),
+ [SMCA_WAFL_PHY] = HWID_MCATYPE(0x267, 0x0),
+ [SMCA_GMI_PHY] = HWID_MCATYPE(0x269, 0x0),
+};
+
+enum smca_bank_types smca_get_bank_type(u64 ipid)
+{
+ enum smca_bank_types type;
+ u32 hwid_mcatype = HWID_MCATYPE(FIELD_GET(MCI_IPID_HWID, ipid),
+ FIELD_GET(MCI_IPID_MCATYPE, ipid));
+
+ for (type = 0; type < ARRAY_SIZE(smca_hwid_mcatypes); type++) {
+ if (hwid_mcatype == smca_hwid_mcatypes[type])
+ return type;
+ }
+
+ return N_SMCA_BANK_TYPES;
+}
+EXPORT_SYMBOL_GPL(smca_get_bank_type);
+
/*
* In SMCA enabled processors, we can have multiple banks for a given IP type.
* So to define a unique name for each bank, we use a temp c-string to append
@@ -310,11 +391,11 @@ static void smca_configure(unsigned int bank, unsigned int cpu)
return;
}

- hwid_mcatype = HWID_MCATYPE(high & MCI_IPID_HWID,
- (high & MCI_IPID_MCATYPE) >> 16);
+ hwid_mcatype = HWID_MCATYPE(high & MCI_IPID_HWID_OLD,
+ (high & MCI_IPID_MCATYPE_OLD) >> 16);

- for (i = 0; i < ARRAY_SIZE(smca_hwid_mcatypes); i++) {
- s_hwid = &smca_hwid_mcatypes[i];
+ for (i = 0; i < ARRAY_SIZE(smca_hwid_mcatypes_old); i++) {
+ s_hwid = &smca_hwid_mcatypes_old[i];

if (hwid_mcatype == s_hwid->hwid_mcatype) {
this_cpu_ptr(smca_banks)[bank].hwid = s_hwid;
@@ -724,7 +805,7 @@ static bool smca_mce_is_memory_error(struct mce *m)
if (XEC(m->status, 0x3f))
return false;

- bank_type = smca_get_bank_type(m->extcpu, m->bank);
+ bank_type = smca_get_bank_type(m->ipid);

return bank_type == SMCA_UMC || bank_type == SMCA_UMC_V2;
}
@@ -1097,7 +1178,7 @@ static const char *get_name(unsigned int cpu, unsigned int bank, struct threshol
return th_names[bank];
}

- bank_type = smca_get_bank_type(cpu, bank);
+ bank_type = smca_get_bank_type_old(cpu, bank);
if (bank_type >= N_SMCA_BANK_TYPES)
return NULL;

diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c
index 1f3520d76861..4b3764ea7c59 100644
--- a/drivers/edac/amd64_edac.c
+++ b/drivers/edac/amd64_edac.c
@@ -1041,7 +1041,7 @@ static int fixup_node_id(int node_id, struct mce *m)
/* MCA_IPID[InstanceIdHi] give the AMD Node ID for the bank. */
u8 nid = (m->ipid >> 44) & 0xF;

- if (smca_get_bank_type(m->extcpu, m->bank) != SMCA_UMC_V2)
+ if (smca_get_bank_type(m->ipid) != SMCA_UMC_V2)
return node_id;

/* Nodes below the GPU base node are CPU nodes and don't need a fixup. */
diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c
index 8130c3dc64da..e02af5da1ec2 100644
--- a/drivers/edac/mce_amd.c
+++ b/drivers/edac/mce_amd.c
@@ -731,7 +731,7 @@ static const char *smca_get_long_name(enum smca_bank_types t)
/* Decode errors according to Scalable MCA specification */
static void decode_smca_error(struct mce *m)
{
- enum smca_bank_types bank_type = smca_get_bank_type(m->extcpu, m->bank);
+ enum smca_bank_types bank_type = smca_get_bank_type(m->ipid);
u8 xec = XEC(m->status, xec_mask);

if (bank_type >= N_SMCA_BANK_TYPES)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 8ebab6f22e5a..c543600b759b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -3546,7 +3546,7 @@ static int amdgpu_bad_page_notifier(struct notifier_block *nb,
* and error occurred in DramECC (Extended error code = 0) then only
* process the error, else bail out.
*/
- if (!m || !((smca_get_bank_type(m->extcpu, m->bank) == SMCA_UMC_V2) &&
+ if (!m || !((smca_get_bank_type(m->ipid) == SMCA_UMC_V2) &&
(XEC(m->status, 0x3f) == 0x0)))
return NOTIFY_DONE;

--
2.34.1



2024-04-23 17:12:10

by Borislav Petkov

[permalink] [raw]
Subject: Re: [PATCH v2 04/16] x86/mce/amd: Look up bank type by IPID

On Thu, Apr 04, 2024 at 10:13:47AM -0500, Yazen Ghannam wrote:
> Scalable MCA systems use values within the MCA_IPID register to describe
> a bank's type. Other information is not needed.
>
> Currently, the bank types are cached during boot and this information is
> used during boot and run time. The cached values are per-CPU and
> per-bank. The boot path needs the cached values, but this should be
> removed. The run time path does not need the cached values.
>
> Determine a Scalable MCA bank's type using only the MCA_IPID values.

Lemme get this straight: You want to switch it all to use this new array
smca_hwid_mcatypes[] to lookup HWIDs?

If so, where is the patch which removes the _old thing?

> arch/x86/include/asm/mce.h | 4 +-
> arch/x86/kernel/cpu/mce/amd.c | 99 ++++++++++++++++++++++---
> drivers/edac/amd64_edac.c | 2 +-
> drivers/edac/mce_amd.c | 2 +-
> drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 2 +-

$ ./scripts/get_maintainer.pl -f drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
Alex Deucher <[email protected]> (supporter:RADEON and AMDGPU DRM DRIVERS,commit_signer:91/89=100%)
"Christian König" <[email protected]> (supporter:RADEON and AMDGPU DRM DRIVERS)
"Pan, Xinhui" <[email protected]> (supporter:RADEON and AMDGPU DRM DRIVERS)
David Airlie <[email protected]> (maintainer:DRM DRIVERS
..

You need to CC folks on changes touching their area...

--
Regards/Gruss,
Boris.

https://people.kernel.org/tglx/notes-about-netiquette

2024-04-23 19:17:04

by Yazen Ghannam

[permalink] [raw]
Subject: Re: [PATCH v2 04/16] x86/mce/amd: Look up bank type by IPID

On 4/23/2024 1:06 PM, Borislav Petkov wrote:
> On Thu, Apr 04, 2024 at 10:13:47AM -0500, Yazen Ghannam wrote:
>> Scalable MCA systems use values within the MCA_IPID register to describe
>> a bank's type. Other information is not needed.
>>
>> Currently, the bank types are cached during boot and this information is
>> used during boot and run time. The cached values are per-CPU and
>> per-bank. The boot path needs the cached values, but this should be
>> removed. The run time path does not need the cached values.
>>
>> Determine a Scalable MCA bank's type using only the MCA_IPID values.
>
> Lemme get this straight: You want to switch it all to use this new array
> smca_hwid_mcatypes[] to lookup HWIDs?
>
> If so, where is the patch which removes the _old thing?
>

It would be part of a follow up set.

Here's where it would finally be removed after a bunch of refactoring.

https://github.com/AMDESE/linux/commit/291c85ae22277

>> arch/x86/include/asm/mce.h | 4 +-
>> arch/x86/kernel/cpu/mce/amd.c | 99 ++++++++++++++++++++++---
>> drivers/edac/amd64_edac.c | 2 +-
>> drivers/edac/mce_amd.c | 2 +-
>> drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 2 +-
>
> $ ./scripts/get_maintainer.pl -f drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> Alex Deucher <[email protected]> (supporter:RADEON and AMDGPU DRM DRIVERS,commit_signer:91/89=100%)
> "Christian König" <[email protected]> (supporter:RADEON and AMDGPU DRM DRIVERS)
> "Pan, Xinhui" <[email protected]> (supporter:RADEON and AMDGPU DRM DRIVERS)
> David Airlie <[email protected]> (maintainer:DRM DRIVERS
> ...
>
> You need to CC folks on changes touching their area...
>

Yep, sorry I missed that. The same is true for the wrapper struct patches
later in the set. They touch all places with MCE notifiers.

Thanks,
Yazen