2022-08-31 08:05:49

by Justin He

[permalink] [raw]
Subject: [PATCH v4 5/8] EDAC/ghes: Make ghes_edac a proper module to remove the dependency on ghes

Commit dc4e8c07e9e2 ("ACPI: APEI: explicit init of HEST and GHES in
apci_init()") introduced a bug that ghes_edac_register() would be invoked
before edac_init(). Because at that time, the bus "edac" hadn't been even
registered, this created sysfs /devices/mc0 instead of
/sys/devices/system/edac/mc/mc0 on an Ampere eMag server.

To remove the dependency of ghes_edac on ghes, make it a proper module. Use
a list to save the probing devices in ghes_probe(), and defer the
ghes_edac_register() to module_init() of the new ghes_edac module by
iterating over the devices list.

Co-developed-by: Borislav Petkov <[email protected]>
Signed-off-by: Borislav Petkov <[email protected]>
Signed-off-by: Jia He <[email protected]>
Fixes: dc4e8c07e9e2 ("ACPI: APEI: explicit init of HEST and GHES in apci_init()")
Cc: Shuai Xue <[email protected]>
---
drivers/acpi/apei/ghes.c | 22 ++++++++++++++--
drivers/edac/Kconfig | 4 +--
drivers/edac/ghes_edac.c | 56 ++++++++++++++++++++++++----------------
include/acpi/ghes.h | 24 +++++------------
4 files changed, 62 insertions(+), 44 deletions(-)

diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index d798a58fbbd9..d11e3b2e5345 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -130,6 +130,9 @@ static bool ghes_present;
static LIST_HEAD(ghes_hed);
static DEFINE_MUTEX(ghes_list_mutex);

+static LIST_HEAD(ghes_devs);
+static DEFINE_MUTEX(ghes_devs_mutex);
+
/*
* Because the memory area used to transfer hardware error information
* from BIOS to Linux can be determined only in NMI, IRQ or timer
@@ -1388,7 +1391,11 @@ static int ghes_probe(struct platform_device *ghes_dev)

platform_set_drvdata(ghes_dev, ghes);

- ghes_edac_register(ghes, &ghes_dev->dev);
+ ghes->dev = &ghes_dev->dev;
+
+ mutex_lock(&ghes_devs_mutex);
+ list_add_tail(&ghes->elist, &ghes_devs);
+ mutex_unlock(&ghes_devs_mutex);

ghes_present = true;

@@ -1454,7 +1461,9 @@ static int ghes_remove(struct platform_device *ghes_dev)

ghes_fini(ghes);

- ghes_edac_unregister(ghes);
+ mutex_lock(&ghes_devs_mutex);
+ list_del(&ghes->elist);
+ mutex_unlock(&ghes_devs_mutex);

kfree(ghes);

@@ -1540,6 +1549,15 @@ bool ghes_edac_preferred(void)
}
EXPORT_SYMBOL_GPL(ghes_edac_preferred);

+struct list_head *ghes_get_devices(void)
+{
+ if (!ghes_edac_preferred())
+ return NULL;
+
+ return &ghes_devs;
+}
+EXPORT_SYMBOL_GPL(ghes_get_devices);
+
void ghes_register_report_chain(struct notifier_block *nb)
{
atomic_notifier_chain_register(&ghes_report_chain, nb);
diff --git a/drivers/edac/Kconfig b/drivers/edac/Kconfig
index 17562cf1fe97..df45db81858b 100644
--- a/drivers/edac/Kconfig
+++ b/drivers/edac/Kconfig
@@ -53,8 +53,8 @@ config EDAC_DECODE_MCE
has been initialized.

config EDAC_GHES
- bool "Output ACPI APEI/GHES BIOS detected errors via EDAC"
- depends on ACPI_APEI_GHES && (EDAC=y)
+ tristate "Output ACPI APEI/GHES BIOS detected errors via EDAC"
+ depends on ACPI_APEI_GHES
select UEFI_CPER
help
Not all machines support hardware-driven error report. Some of those
diff --git a/drivers/edac/ghes_edac.c b/drivers/edac/ghes_edac.c
index f716d5d8f1df..ec654222e227 100644
--- a/drivers/edac/ghes_edac.c
+++ b/drivers/edac/ghes_edac.c
@@ -56,6 +56,8 @@ static DEFINE_SPINLOCK(ghes_lock);

static bool system_scanned;

+static struct list_head *ghes_devs;
+
/* Memory Device - Type 17 of SMBIOS spec */
struct memdev_dmi_entry {
u8 type;
@@ -383,34 +385,15 @@ static struct notifier_block ghes_edac_mem_err_nb = {
.priority = 0,
};

-/*
- * Known systems that are safe to enable this module.
- */
-static struct acpi_platform_list plat_list[] = {
- {"HPE ", "Server ", 0, ACPI_SIG_FADT, all_versions},
- { } /* End */
-};
-
-int ghes_edac_register(struct ghes *ghes, struct device *dev)
+static int ghes_edac_register(struct device *dev)
{
bool fake = false;
struct mem_ctl_info *mci;
struct ghes_pvt *pvt;
struct edac_mc_layer layers[1];
unsigned long flags;
- int idx = -1;
int rc = 0;

- if (IS_ENABLED(CONFIG_X86)) {
- /* Check if safe to enable on this system */
- idx = acpi_match_platform_list(plat_list);
- if (!ghes_edac_force_load && idx < 0)
- return -ENODEV;
- } else {
- ghes_edac_force_load = true;
- idx = 0;
- }
-
/* finish another registration/unregistration instance first */
mutex_lock(&ghes_reg_mutex);

@@ -454,7 +437,7 @@ int ghes_edac_register(struct ghes *ghes, struct device *dev)
pr_info("This system has a very crappy BIOS: It doesn't even list the DIMMS.\n");
pr_info("Its SMBIOS info is wrong. It is doubtful that the error report would\n");
pr_info("work on such system. Use this driver with caution\n");
- } else if (idx < 0) {
+ } else if (ghes_edac_force_load) {
pr_info("This EDAC driver relies on BIOS to enumerate memory and get error reports.\n");
pr_info("Unfortunately, not all BIOSes reflect the memory layout correctly.\n");
pr_info("So, the end result of using this driver varies from vendor to vendor.\n");
@@ -526,7 +509,7 @@ int ghes_edac_register(struct ghes *ghes, struct device *dev)
return rc;
}

-void ghes_edac_unregister(struct ghes *ghes)
+static void ghes_edac_unregister(struct ghes *ghes)
{
struct mem_ctl_info *mci;
unsigned long flags;
@@ -562,3 +545,32 @@ void ghes_edac_unregister(struct ghes *ghes)
unlock:
mutex_unlock(&ghes_reg_mutex);
}
+
+static int __init ghes_edac_init(void)
+{
+ struct ghes *g, *g_tmp;
+
+ ghes_devs = ghes_get_devices();
+ if (!ghes_devs)
+ return -ENODEV;
+
+ list_for_each_entry_safe(g, g_tmp, ghes_devs, elist) {
+ ghes_edac_register(g->dev);
+ }
+
+ return 0;
+}
+module_init(ghes_edac_init);
+
+static void __exit ghes_edac_exit(void)
+{
+ struct ghes *g, *g_tmp;
+
+ list_for_each_entry_safe(g, g_tmp, ghes_devs, elist) {
+ ghes_edac_unregister(g);
+ }
+}
+module_exit(ghes_edac_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Output ACPI APEI/GHES BIOS detected errors module via EDAC");
diff --git a/include/acpi/ghes.h b/include/acpi/ghes.h
index 7ce91c0ff3eb..08f12dde99ae 100644
--- a/include/acpi/ghes.h
+++ b/include/acpi/ghes.h
@@ -27,6 +27,8 @@ struct ghes {
struct timer_list timer;
unsigned int irq;
};
+ struct device *dev;
+ struct list_head elist;
};

struct ghes_estatus_node {
@@ -69,30 +71,16 @@ int ghes_register_vendor_record_notifier(struct notifier_block *nb);
* @nb: pointer to the notifier_block structure of the vendor record handler.
*/
void ghes_unregister_vendor_record_notifier(struct notifier_block *nb);
-#endif
-
-int ghes_estatus_pool_init(int num_ghes);
-
-/* From drivers/edac/ghes_edac.c */
-
-#ifdef CONFIG_EDAC_GHES
-int ghes_edac_register(struct ghes *ghes, struct device *dev);
-
-void ghes_edac_unregister(struct ghes *ghes);

+struct list_head *ghes_get_devices(void);
bool ghes_edac_preferred(void);
#else
-static inline int ghes_edac_register(struct ghes *ghes, struct device *dev)
-{
- return -ENODEV;
-}
-
-static inline void ghes_edac_unregister(struct ghes *ghes)
-{
-}
+static inline struct list_head *ghes_get_devices(void) { return NULL; }
static bool ghes_edac_preferred(void) { return false; }
#endif

+int ghes_estatus_pool_init(int num_ghes);
+
static inline int acpi_hest_get_version(struct acpi_hest_generic_data *gdata)
{
return gdata->revision >> 8;
--
2.25.1


2022-09-02 17:39:22

by Kani, Toshimitsu

[permalink] [raw]
Subject: RE: [PATCH v4 5/8] EDAC/ghes: Make ghes_edac a proper module to remove the dependency on ghes

On Wednesday, August 31, 2022 1:40 AM, Jia He wrote:
> @@ -454,7 +437,7 @@ int ghes_edac_register(struct ghes *ghes, struct
> device *dev)
> pr_info("This system has a very crappy BIOS: It doesn't even
> list the DIMMS.\n");
> pr_info("Its SMBIOS info is wrong. It is doubtful that the error
> report would\n");
> pr_info("work on such system. Use this driver with
> caution\n");
> - } else if (idx < 0) {
> + } else if (ghes_edac_force_load) {

This change causes the following messages to start showing up on Arm.
Is that what you intend to do?

The messages can be avoided by not setting the force flag on Arm unconditionally.
This will need some change to the flag check in ghes_edac_unregister() though.

> pr_info("This EDAC driver relies on BIOS to enumerate
> memory and get error reports.\n");
> pr_info("Unfortunately, not all BIOSes reflect the memory
> layout correctly.\n");
> pr_info("So, the end result of using this driver varies from
> vendor to vendor.\n");

Toshi

2022-09-05 15:14:29

by Justin He

[permalink] [raw]
Subject: RE: [PATCH v4 5/8] EDAC/ghes: Make ghes_edac a proper module to remove the dependency on ghes

Hi Kani
As per your review for previous patches, I will update the commit log in the next version.
Please see my comments below:

> -----Original Message-----
> > @@ -454,7 +437,7 @@ int ghes_edac_register(struct ghes *ghes, struct
> > device *dev)
> > pr_info("This system has a very crappy BIOS: It doesn't even list
> > the DIMMS.\n");
> > pr_info("Its SMBIOS info is wrong. It is doubtful that the error
> > report would\n");
> > pr_info("work on such system. Use this driver with caution\n");
> > - } else if (idx < 0) {
> > + } else if (ghes_edac_force_load) {
>
> This change causes the following messages to start showing up on Arm.
> Is that what you intend to do?
>
No

> The messages can be avoided by not setting the force flag on Arm
> unconditionally.
> This will need some change to the flag check in ghes_edac_unregister() though.
Yes, looks like I can remove the ghes_edac_force_load check in the ghes_edac_unregister()
since it had been checked in the ghes_edac_init