Some Sapphire Rapids workstations' absent memory controllers
still appear as PCIe devices that fool the i10nm_edac driver
and result in "shift exponet -66 is negative" call traces
from skx_get_dimm_info().
Skip the absent memory controllers to avoid the call traces.
Reported-by: Kai-Heng Feng <[email protected]>
Closes: https://lore.kernel.org/linux-edac/CAAd53p41Ku1m1rapeqb1xtD+kKuk+BaUW=dumuoF0ZO3GhFjFA@mail.gmail.com/T/#m5de16dce60a8c836ec235868c7c16e3fefad0cc2
Reported-by: Koba Ko <[email protected]>
Closes: https://lore.kernel.org/linux-edac/SA1PR11MB71305B71CCCC3D9305835202892AA@SA1PR11MB7130.namprd11.prod.outlook.com/T/#t
Fixes: d4dc89d069aa ("EDAC, i10nm: Add a driver for Intel 10nm server processors")
Signed-off-by: Qiuxu Zhuo <[email protected]>
---
drivers/edac/i10nm_base.c | 54 +++++++++++++++++++++++++++++++++++----
1 file changed, 49 insertions(+), 5 deletions(-)
diff --git a/drivers/edac/i10nm_base.c b/drivers/edac/i10nm_base.c
index a897b6aff368..349ff6cfb379 100644
--- a/drivers/edac/i10nm_base.c
+++ b/drivers/edac/i10nm_base.c
@@ -658,13 +658,49 @@ static struct pci_dev *get_ddr_munit(struct skx_dev *d, int i, u32 *offset, unsi
return mdev;
}
+/**
+ * i10nm_imc_absent() - Check whether the memory controller @imc is absent
+ *
+ * @imc : The pointer to the structure of memory controller EDAC device.
+ *
+ * RETURNS : true if the memory controller EDAC device is absent, false otherwise.
+ */
+static bool i10nm_imc_absent(struct skx_imc *imc)
+{
+ u32 mcmtr;
+ int i;
+
+ switch (res_cfg->type) {
+ case SPR:
+ for (i = 0; i < res_cfg->ddr_chan_num; i++) {
+ mcmtr = I10NM_GET_MCMTR(imc, i);
+ edac_dbg(1, "ch%d mcmtr reg %x\n", i, mcmtr);
+ if (mcmtr != ~0)
+ return false;
+ }
+
+ /*
+ * Some workstations' absent memory controllers still
+ * appear as PCIe devices, misleading the EDAC driver.
+ * By observing that the MMIO registers of these absent
+ * memory controllers consistently hold the value of ~0.
+ *
+ * We identify a memory controller as absent by checking
+ * if its MMIO register "mcmtr" == ~0 in all its channels.
+ */
+ return true;
+ default:
+ return false;
+ }
+}
+
static int i10nm_get_ddr_munits(void)
{
struct pci_dev *mdev;
void __iomem *mbase;
unsigned long size;
struct skx_dev *d;
- int i, j = 0;
+ int i, lmc, j = 0;
u32 reg, off;
u64 base;
@@ -690,7 +726,7 @@ static int i10nm_get_ddr_munits(void)
edac_dbg(2, "socket%d mmio base 0x%llx (reg 0x%x)\n",
j++, base, reg);
- for (i = 0; i < res_cfg->ddr_imc_num; i++) {
+ for (lmc = 0, i = 0; i < res_cfg->ddr_imc_num; i++) {
mdev = get_ddr_munit(d, i, &off, &size);
if (i == 0 && !mdev) {
@@ -700,8 +736,6 @@ static int i10nm_get_ddr_munits(void)
if (!mdev)
continue;
- d->imc[i].mdev = mdev;
-
edac_dbg(2, "mc%d mmio base 0x%llx size 0x%lx (reg 0x%x)\n",
i, base + off, size, reg);
@@ -712,7 +746,17 @@ static int i10nm_get_ddr_munits(void)
return -ENODEV;
}
- d->imc[i].mbase = mbase;
+ d->imc[lmc].mbase = mbase;
+ if (i10nm_imc_absent(&d->imc[lmc])) {
+ pci_dev_put(mdev);
+ iounmap(mbase);
+ d->imc[lmc].mbase = NULL;
+ edac_dbg(2, "Skip absent mc%d\n", i);
+ continue;
+ } else {
+ d->imc[lmc].mdev = mdev;
+ lmc++;
+ }
}
}
--
2.17.1
On Thu, Jul 6, 2023 at 9:46 PM Qiuxu Zhuo <[email protected]> wrote:
>
> Some Sapphire Rapids workstations' absent memory controllers
> still appear as PCIe devices that fool the i10nm_edac driver
> and result in "shift exponet -66 is negative" call traces
> from skx_get_dimm_info().
>
> Skip the absent memory controllers to avoid the call traces.
>
> Reported-by: Kai-Heng Feng <[email protected]>
> Closes: https://lore.kernel.org/linux-edac/CAAd53p41Ku1m1rapeqb1xtD+kKuk+BaUW=dumuoF0ZO3GhFjFA@mail.gmail.com/T/#m5de16dce60a8c836ec235868c7c16e3fefad0cc2
> Reported-by: Koba Ko <[email protected]>
> Closes: https://lore.kernel.org/linux-edac/SA1PR11MB71305B71CCCC3D9305835202892AA@SA1PR11MB7130.namprd11.prod.outlook.com/T/#t
> Fixes: d4dc89d069aa ("EDAC, i10nm: Add a driver for Intel 10nm server processors")
> Signed-off-by: Qiuxu Zhuo <[email protected]>
Tested-by: Kai-Heng Feng <[email protected]>
> ---
> drivers/edac/i10nm_base.c | 54 +++++++++++++++++++++++++++++++++++----
> 1 file changed, 49 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/edac/i10nm_base.c b/drivers/edac/i10nm_base.c
> index a897b6aff368..349ff6cfb379 100644
> --- a/drivers/edac/i10nm_base.c
> +++ b/drivers/edac/i10nm_base.c
> @@ -658,13 +658,49 @@ static struct pci_dev *get_ddr_munit(struct skx_dev *d, int i, u32 *offset, unsi
> return mdev;
> }
>
> +/**
> + * i10nm_imc_absent() - Check whether the memory controller @imc is absent
> + *
> + * @imc : The pointer to the structure of memory controller EDAC device.
> + *
> + * RETURNS : true if the memory controller EDAC device is absent, false otherwise.
> + */
> +static bool i10nm_imc_absent(struct skx_imc *imc)
> +{
> + u32 mcmtr;
> + int i;
> +
> + switch (res_cfg->type) {
> + case SPR:
> + for (i = 0; i < res_cfg->ddr_chan_num; i++) {
> + mcmtr = I10NM_GET_MCMTR(imc, i);
> + edac_dbg(1, "ch%d mcmtr reg %x\n", i, mcmtr);
> + if (mcmtr != ~0)
> + return false;
> + }
> +
> + /*
> + * Some workstations' absent memory controllers still
> + * appear as PCIe devices, misleading the EDAC driver.
> + * By observing that the MMIO registers of these absent
> + * memory controllers consistently hold the value of ~0.
> + *
> + * We identify a memory controller as absent by checking
> + * if its MMIO register "mcmtr" == ~0 in all its channels.
> + */
> + return true;
> + default:
> + return false;
> + }
> +}
> +
> static int i10nm_get_ddr_munits(void)
> {
> struct pci_dev *mdev;
> void __iomem *mbase;
> unsigned long size;
> struct skx_dev *d;
> - int i, j = 0;
> + int i, lmc, j = 0;
> u32 reg, off;
> u64 base;
>
> @@ -690,7 +726,7 @@ static int i10nm_get_ddr_munits(void)
> edac_dbg(2, "socket%d mmio base 0x%llx (reg 0x%x)\n",
> j++, base, reg);
>
> - for (i = 0; i < res_cfg->ddr_imc_num; i++) {
> + for (lmc = 0, i = 0; i < res_cfg->ddr_imc_num; i++) {
> mdev = get_ddr_munit(d, i, &off, &size);
>
> if (i == 0 && !mdev) {
> @@ -700,8 +736,6 @@ static int i10nm_get_ddr_munits(void)
> if (!mdev)
> continue;
>
> - d->imc[i].mdev = mdev;
> -
> edac_dbg(2, "mc%d mmio base 0x%llx size 0x%lx (reg 0x%x)\n",
> i, base + off, size, reg);
>
> @@ -712,7 +746,17 @@ static int i10nm_get_ddr_munits(void)
> return -ENODEV;
> }
>
> - d->imc[i].mbase = mbase;
> + d->imc[lmc].mbase = mbase;
> + if (i10nm_imc_absent(&d->imc[lmc])) {
> + pci_dev_put(mdev);
> + iounmap(mbase);
> + d->imc[lmc].mbase = NULL;
> + edac_dbg(2, "Skip absent mc%d\n", i);
> + continue;
> + } else {
> + d->imc[lmc].mdev = mdev;
> + lmc++;
> + }
> }
> }
>
> --
> 2.17.1
>
> From: Kai-Heng Feng <[email protected]>
> ...
> Subject: Re: [PATCH 1/1] EDAC/i10nm: Skip the absent memory controllers
>
> On Thu, Jul 6, 2023 at 9:46 PM Qiuxu Zhuo <[email protected]> wrote:
> >
> > Some Sapphire Rapids workstations' absent memory controllers still
> > appear as PCIe devices that fool the i10nm_edac driver and result in
> > "shift exponet -66 is negative" call traces from skx_get_dimm_info().
> >
> > Skip the absent memory controllers to avoid the call traces.
> >
> > Reported-by: Kai-Heng Feng <[email protected]>
> ...
> Tested-by: Kai-Heng Feng <[email protected]>
Thanks for the testing feedback.
-Qiuxu
Some Sapphire Rapids workstations' absent memory controllers
still appear as PCIe devices that fool the i10nm_edac driver
and result in "shift exponent -66 is negative" call traces
from skx_get_dimm_info().
Skip the absent memory controllers to avoid the call traces.
Reported-by: Kai-Heng Feng <[email protected]>
Closes: https://lore.kernel.org/linux-edac/CAAd53p41Ku1m1rapeqb1xtD+kKuk+BaUW=dumuoF0ZO3GhFjFA@mail.gmail.com/T/#m5de16dce60a8c836ec235868c7c16e3fefad0cc2
Tested-by: Kai-Heng Feng <[email protected]>
Reported-by: Koba Ko <[email protected]>
Closes: https://lore.kernel.org/linux-edac/SA1PR11MB71305B71CCCC3D9305835202892AA@SA1PR11MB7130.namprd11.prod.outlook.com/T/#t
Tested-by: Koba Ko <[email protected]>
Fixes: d4dc89d069aa ("EDAC, i10nm: Add a driver for Intel 10nm server processors")
Signed-off-by: Qiuxu Zhuo <[email protected]>
---
v1->v2:
- No function changes.
- s/exponet/exponent/ in the commit message.
- Add two tags of "Tested-by".
drivers/edac/i10nm_base.c | 54 +++++++++++++++++++++++++++++++++++----
1 file changed, 49 insertions(+), 5 deletions(-)
diff --git a/drivers/edac/i10nm_base.c b/drivers/edac/i10nm_base.c
index a897b6aff368..349ff6cfb379 100644
--- a/drivers/edac/i10nm_base.c
+++ b/drivers/edac/i10nm_base.c
@@ -658,13 +658,49 @@ static struct pci_dev *get_ddr_munit(struct skx_dev *d, int i, u32 *offset, unsi
return mdev;
}
+/**
+ * i10nm_imc_absent() - Check whether the memory controller @imc is absent
+ *
+ * @imc : The pointer to the structure of memory controller EDAC device.
+ *
+ * RETURNS : true if the memory controller EDAC device is absent, false otherwise.
+ */
+static bool i10nm_imc_absent(struct skx_imc *imc)
+{
+ u32 mcmtr;
+ int i;
+
+ switch (res_cfg->type) {
+ case SPR:
+ for (i = 0; i < res_cfg->ddr_chan_num; i++) {
+ mcmtr = I10NM_GET_MCMTR(imc, i);
+ edac_dbg(1, "ch%d mcmtr reg %x\n", i, mcmtr);
+ if (mcmtr != ~0)
+ return false;
+ }
+
+ /*
+ * Some workstations' absent memory controllers still
+ * appear as PCIe devices, misleading the EDAC driver.
+ * By observing that the MMIO registers of these absent
+ * memory controllers consistently hold the value of ~0.
+ *
+ * We identify a memory controller as absent by checking
+ * if its MMIO register "mcmtr" == ~0 in all its channels.
+ */
+ return true;
+ default:
+ return false;
+ }
+}
+
static int i10nm_get_ddr_munits(void)
{
struct pci_dev *mdev;
void __iomem *mbase;
unsigned long size;
struct skx_dev *d;
- int i, j = 0;
+ int i, lmc, j = 0;
u32 reg, off;
u64 base;
@@ -690,7 +726,7 @@ static int i10nm_get_ddr_munits(void)
edac_dbg(2, "socket%d mmio base 0x%llx (reg 0x%x)\n",
j++, base, reg);
- for (i = 0; i < res_cfg->ddr_imc_num; i++) {
+ for (lmc = 0, i = 0; i < res_cfg->ddr_imc_num; i++) {
mdev = get_ddr_munit(d, i, &off, &size);
if (i == 0 && !mdev) {
@@ -700,8 +736,6 @@ static int i10nm_get_ddr_munits(void)
if (!mdev)
continue;
- d->imc[i].mdev = mdev;
-
edac_dbg(2, "mc%d mmio base 0x%llx size 0x%lx (reg 0x%x)\n",
i, base + off, size, reg);
@@ -712,7 +746,17 @@ static int i10nm_get_ddr_munits(void)
return -ENODEV;
}
- d->imc[i].mbase = mbase;
+ d->imc[lmc].mbase = mbase;
+ if (i10nm_imc_absent(&d->imc[lmc])) {
+ pci_dev_put(mdev);
+ iounmap(mbase);
+ d->imc[lmc].mbase = NULL;
+ edac_dbg(2, "Skip absent mc%d\n", i);
+ continue;
+ } else {
+ d->imc[lmc].mdev = mdev;
+ lmc++;
+ }
}
}
--
2.17.1
> Some Sapphire Rapids workstations' absent memory controllers
> still appear as PCIe devices that fool the i10nm_edac driver
> and result in "shift exponent -66 is negative" call traces
> from skx_get_dimm_info().
>
> Skip the absent memory controllers to avoid the call traces.
Applied to RAS tree for next merge window.
Thanks
-Tony