Subject: [PATCH 2/2] EDAC: add ARM Cortex A15 L2 internal asynchronous error detection driver

This driver adds support for L2 internal asynchronous error detection
caused by L2 RAM double-bit ECC error or illegal writes to the
Interrupt Controller memory-map region on the Cortex A15.

Signed-off-by: Wladislav Wiebe <[email protected]>
---
MAINTAINERS | 1 +
drivers/edac/Kconfig | 11 +++
drivers/edac/Makefile | 1 +
drivers/edac/cortex_a15_l2_async_edac.c | 134 ++++++++++++++++++++++++++++++++
4 files changed, 147 insertions(+)
create mode 100644 drivers/edac/cortex_a15_l2_async_edac.c

diff --git a/MAINTAINERS b/MAINTAINERS
index 0796ad6e6490..84dc501b2582 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1100,6 +1100,7 @@ L: [email protected]
L: [email protected] (moderated for non-subscribers)
S: Supported
F: Documentation/devicetree/bindings/edac/cortex_a15_l2_async_edac.txt
+F: drivers/edac/cortex_a15_l2_async_edac.c

ARM INTEGRATOR, VERSATILE AND REALVIEW SUPPORT
M: Linus Walleij <[email protected]>
diff --git a/drivers/edac/Kconfig b/drivers/edac/Kconfig
index 41c9ccdd20d6..8722203948e0 100644
--- a/drivers/edac/Kconfig
+++ b/drivers/edac/Kconfig
@@ -475,4 +475,15 @@ config EDAC_QCOM
For debugging issues having to do with stability and overall system
health, you should probably say 'Y' here.

+config EDAC_CORTEX_A15_L2_ASYNC
+ tristate "Cortex A15 ASYNC L2 & illegal GIC write error detection"
+ depends on ARM
+ help
+ Support for L2 internal asynchronous error detection caused by L2 RAM
+ double-bit ECC error or illegal writes to the Interrupt Controller
+ memory-map region on the Cortex A15.
+
+ This driver works in interrupt mode triggered by the nINTERRIRQ and
+ reports only uncorrectable errors.
+
endif # EDAC
diff --git a/drivers/edac/Makefile b/drivers/edac/Makefile
index 716096d08ea0..12d15cf5ff4e 100644
--- a/drivers/edac/Makefile
+++ b/drivers/edac/Makefile
@@ -78,3 +78,4 @@ obj-$(CONFIG_EDAC_SYNOPSYS) += synopsys_edac.o
obj-$(CONFIG_EDAC_XGENE) += xgene_edac.o
obj-$(CONFIG_EDAC_TI) += ti_edac.o
obj-$(CONFIG_EDAC_QCOM) += qcom_edac.o
+obj-$(CONFIG_EDAC_CORTEX_A15_L2_ASYNC) += cortex_a15_l2_async_edac.o
diff --git a/drivers/edac/cortex_a15_l2_async_edac.c b/drivers/edac/cortex_a15_l2_async_edac.c
new file mode 100644
index 000000000000..26252568e961
--- /dev/null
+++ b/drivers/edac/cortex_a15_l2_async_edac.c
@@ -0,0 +1,134 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2018 Nokia Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/module.h>
+#include <linux/interrupt.h>
+#include <linux/platform_device.h>
+#include <linux/of.h>
+
+#include "edac_module.h"
+
+#define DRIVER_NAME "cortex_a15_l2_async_edac"
+
+#define L2ECTLR_L2_ASYNC_ERR BIT(30)
+
+static irqreturn_t cortex_a15_l2_async_edac_err_handler(int irq, void *dev_id)
+{
+ struct edac_device_ctl_info *dci = dev_id;
+ u32 status = 0;
+
+ /*
+ * Read and clear L2ECTLR L2 ASYNC error bit caused by INTERRIRQ.
+ * Reason could be a L2 RAM double-bit ECC error or illegal writes
+ * to the Interrupt Controller memory-map region.
+ */
+ asm("mrc p15, 1, %0, c9, c0, 3" : "=r" (status));
+ if (status & L2ECTLR_L2_ASYNC_ERR) {
+ status &= ~L2ECTLR_L2_ASYNC_ERR;
+ asm("mcr p15, 1, %0, c9, c0, 3" : : "r" (status));
+ edac_printk(KERN_EMERG, DRIVER_NAME,
+ "L2 internal asynchronous error occurred!\n");
+ edac_device_handle_ue(dci, 0, 0, dci->ctl_name);
+
+ return IRQ_HANDLED;
+ }
+
+ return IRQ_NONE;
+}
+
+static int cortex_a15_l2_async_edac_probe(struct platform_device *pdev)
+{
+ struct edac_device_ctl_info *dci;
+ struct device_node *np = pdev->dev.of_node;
+ char *ctl_name = (char *)np->name;
+ int i = 0, ret = 0, err_irq = 0, irq_count = 0;
+
+ /* We can have multiple CPU clusters with one INTERRIRQ per cluster */
+ irq_count = platform_irq_count(pdev);
+ if (irq_count < 0) {
+ edac_printk(KERN_ERR, DRIVER_NAME,
+ "No L2 ASYNC error IRQ found!\n");
+ return -EINVAL;
+ }
+
+ dci = edac_device_alloc_ctl_info(0, ctl_name, 1, ctl_name,
+ irq_count, 0, NULL, 0,
+ edac_device_alloc_index());
+ if (!dci)
+ return -ENOMEM;
+
+ dci->dev = &pdev->dev;
+ dci->mod_name = DRIVER_NAME;
+ dci->ctl_name = ctl_name;
+ dci->dev_name = dev_name(&pdev->dev);
+ platform_set_drvdata(pdev, dci);
+
+ if (edac_device_add_device(dci))
+ goto err;
+
+ for (i = 0; i < irq_count; i++) {
+ err_irq = platform_get_irq(pdev, i);
+ ret = devm_request_irq(&pdev->dev, err_irq,
+ cortex_a15_l2_async_edac_err_handler, 0,
+ dev_name(&pdev->dev), dci);
+
+ if (ret < 0) {
+ edac_printk(KERN_ERR, DRIVER_NAME,
+ "Failed to register L2 ASYNC error IRQ %d\n",
+ err_irq);
+ goto err2;
+ }
+ }
+
+ return 0;
+err2:
+ edac_device_del_device(&pdev->dev);
+err:
+ edac_device_free_ctl_info(dci);
+
+ return ret;
+}
+
+static int cortex_a15_l2_async_edac_remove(struct platform_device *pdev)
+{
+ struct edac_device_ctl_info *dci = platform_get_drvdata(pdev);
+
+ edac_device_del_device(&pdev->dev);
+ edac_device_free_ctl_info(dci);
+
+ return 0;
+}
+
+static const struct of_device_id cortex_a15_l2_async_edac_of_match[] = {
+ { .compatible = "arm,cortex-a15-l2-async-edac", },
+ {},
+};
+MODULE_DEVICE_TABLE(of, cortex_a15_l2_async_edac_of_match);
+
+static struct platform_driver cortex_a15_l2_async_edac_driver = {
+ .probe = cortex_a15_l2_async_edac_probe,
+ .remove = cortex_a15_l2_async_edac_remove,
+ .driver = {
+ .name = DRIVER_NAME,
+ .of_match_table = cortex_a15_l2_async_edac_of_match,
+ },
+};
+module_platform_driver(cortex_a15_l2_async_edac_driver);
+
+MODULE_AUTHOR("Wladislav Wiebe <[email protected]>");
+MODULE_DESCRIPTION("ARM Cortex A15 L2 internal asynchronous error detection");
+MODULE_LICENSE("GPL v2");
--
2.16.1


2019-01-08 10:43:39

by Borislav Petkov

[permalink] [raw]
Subject: Re: [PATCH 2/2] EDAC: add ARM Cortex A15 L2 internal asynchronous error detection driver

+ James and leaving in the rest for reference.

So the first thing to figure out here is how generic is this and if
so, to make it a cortex_a15_edac.c driver which contains all the RAS
functionality for A15. Definitely not an EDAC driver per functional unit
but rather per vendor or even ARM core.

James?

On Tue, Jan 08, 2019 at 08:10:45AM +0000, Wiebe, Wladislav (Nokia - DE/Ulm) wrote:
> This driver adds support for L2 internal asynchronous error detection
> caused by L2 RAM double-bit ECC error or illegal writes to the
> Interrupt Controller memory-map region on the Cortex A15.
>
> Signed-off-by: Wladislav Wiebe <[email protected]>
> ---
> MAINTAINERS | 1 +
> drivers/edac/Kconfig | 11 +++
> drivers/edac/Makefile | 1 +
> drivers/edac/cortex_a15_l2_async_edac.c | 134 ++++++++++++++++++++++++++++++++
> 4 files changed, 147 insertions(+)
> create mode 100644 drivers/edac/cortex_a15_l2_async_edac.c
>
> diff --git a/MAINTAINERS b/MAINTAINERS
> index 0796ad6e6490..84dc501b2582 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -1100,6 +1100,7 @@ L: [email protected]
> L: [email protected] (moderated for non-subscribers)
> S: Supported
> F: Documentation/devicetree/bindings/edac/cortex_a15_l2_async_edac.txt
> +F: drivers/edac/cortex_a15_l2_async_edac.c
>
> ARM INTEGRATOR, VERSATILE AND REALVIEW SUPPORT
> M: Linus Walleij <[email protected]>
> diff --git a/drivers/edac/Kconfig b/drivers/edac/Kconfig
> index 41c9ccdd20d6..8722203948e0 100644
> --- a/drivers/edac/Kconfig
> +++ b/drivers/edac/Kconfig
> @@ -475,4 +475,15 @@ config EDAC_QCOM
> For debugging issues having to do with stability and overall system
> health, you should probably say 'Y' here.
>
> +config EDAC_CORTEX_A15_L2_ASYNC
> + tristate "Cortex A15 ASYNC L2 & illegal GIC write error detection"
> + depends on ARM
> + help
> + Support for L2 internal asynchronous error detection caused by L2 RAM
> + double-bit ECC error or illegal writes to the Interrupt Controller
> + memory-map region on the Cortex A15.
> +
> + This driver works in interrupt mode triggered by the nINTERRIRQ and
> + reports only uncorrectable errors.
> +
> endif # EDAC
> diff --git a/drivers/edac/Makefile b/drivers/edac/Makefile
> index 716096d08ea0..12d15cf5ff4e 100644
> --- a/drivers/edac/Makefile
> +++ b/drivers/edac/Makefile
> @@ -78,3 +78,4 @@ obj-$(CONFIG_EDAC_SYNOPSYS) += synopsys_edac.o
> obj-$(CONFIG_EDAC_XGENE) += xgene_edac.o
> obj-$(CONFIG_EDAC_TI) += ti_edac.o
> obj-$(CONFIG_EDAC_QCOM) += qcom_edac.o
> +obj-$(CONFIG_EDAC_CORTEX_A15_L2_ASYNC) += cortex_a15_l2_async_edac.o
> diff --git a/drivers/edac/cortex_a15_l2_async_edac.c b/drivers/edac/cortex_a15_l2_async_edac.c
> new file mode 100644
> index 000000000000..26252568e961
> --- /dev/null
> +++ b/drivers/edac/cortex_a15_l2_async_edac.c
> @@ -0,0 +1,134 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Copyright (C) 2018 Nokia Corporation
> + *
> + * This program is free software; you can redistribute it and/or modify it
> + * under the terms and conditions of the GNU General Public License,
> + * version 2, as published by the Free Software Foundation.
> + *
> + * This program is distributed in the hope it will be useful, but WITHOUT
> + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
> + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
> + * more details.
> + *
> + * You should have received a copy of the GNU General Public License along with
> + * this program. If not, see <http://www.gnu.org/licenses/>.
> + */
> +
> +#include <linux/module.h>
> +#include <linux/interrupt.h>
> +#include <linux/platform_device.h>
> +#include <linux/of.h>
> +
> +#include "edac_module.h"
> +
> +#define DRIVER_NAME "cortex_a15_l2_async_edac"
> +
> +#define L2ECTLR_L2_ASYNC_ERR BIT(30)
> +
> +static irqreturn_t cortex_a15_l2_async_edac_err_handler(int irq, void *dev_id)
> +{
> + struct edac_device_ctl_info *dci = dev_id;
> + u32 status = 0;
> +
> + /*
> + * Read and clear L2ECTLR L2 ASYNC error bit caused by INTERRIRQ.
> + * Reason could be a L2 RAM double-bit ECC error or illegal writes
> + * to the Interrupt Controller memory-map region.
> + */
> + asm("mrc p15, 1, %0, c9, c0, 3" : "=r" (status));
> + if (status & L2ECTLR_L2_ASYNC_ERR) {
> + status &= ~L2ECTLR_L2_ASYNC_ERR;
> + asm("mcr p15, 1, %0, c9, c0, 3" : : "r" (status));
> + edac_printk(KERN_EMERG, DRIVER_NAME,
> + "L2 internal asynchronous error occurred!\n");
> + edac_device_handle_ue(dci, 0, 0, dci->ctl_name);
> +
> + return IRQ_HANDLED;
> + }
> +
> + return IRQ_NONE;
> +}
> +
> +static int cortex_a15_l2_async_edac_probe(struct platform_device *pdev)
> +{
> + struct edac_device_ctl_info *dci;
> + struct device_node *np = pdev->dev.of_node;
> + char *ctl_name = (char *)np->name;
> + int i = 0, ret = 0, err_irq = 0, irq_count = 0;
> +
> + /* We can have multiple CPU clusters with one INTERRIRQ per cluster */
> + irq_count = platform_irq_count(pdev);
> + if (irq_count < 0) {
> + edac_printk(KERN_ERR, DRIVER_NAME,
> + "No L2 ASYNC error IRQ found!\n");
> + return -EINVAL;
> + }
> +
> + dci = edac_device_alloc_ctl_info(0, ctl_name, 1, ctl_name,
> + irq_count, 0, NULL, 0,
> + edac_device_alloc_index());
> + if (!dci)
> + return -ENOMEM;
> +
> + dci->dev = &pdev->dev;
> + dci->mod_name = DRIVER_NAME;
> + dci->ctl_name = ctl_name;
> + dci->dev_name = dev_name(&pdev->dev);
> + platform_set_drvdata(pdev, dci);
> +
> + if (edac_device_add_device(dci))
> + goto err;
> +
> + for (i = 0; i < irq_count; i++) {
> + err_irq = platform_get_irq(pdev, i);
> + ret = devm_request_irq(&pdev->dev, err_irq,
> + cortex_a15_l2_async_edac_err_handler, 0,
> + dev_name(&pdev->dev), dci);
> +
> + if (ret < 0) {
> + edac_printk(KERN_ERR, DRIVER_NAME,
> + "Failed to register L2 ASYNC error IRQ %d\n",
> + err_irq);
> + goto err2;
> + }
> + }
> +
> + return 0;
> +err2:
> + edac_device_del_device(&pdev->dev);
> +err:
> + edac_device_free_ctl_info(dci);
> +
> + return ret;
> +}
> +
> +static int cortex_a15_l2_async_edac_remove(struct platform_device *pdev)
> +{
> + struct edac_device_ctl_info *dci = platform_get_drvdata(pdev);
> +
> + edac_device_del_device(&pdev->dev);
> + edac_device_free_ctl_info(dci);
> +
> + return 0;
> +}
> +
> +static const struct of_device_id cortex_a15_l2_async_edac_of_match[] = {
> + { .compatible = "arm,cortex-a15-l2-async-edac", },
> + {},
> +};
> +MODULE_DEVICE_TABLE(of, cortex_a15_l2_async_edac_of_match);
> +
> +static struct platform_driver cortex_a15_l2_async_edac_driver = {
> + .probe = cortex_a15_l2_async_edac_probe,
> + .remove = cortex_a15_l2_async_edac_remove,
> + .driver = {
> + .name = DRIVER_NAME,
> + .of_match_table = cortex_a15_l2_async_edac_of_match,
> + },
> +};
> +module_platform_driver(cortex_a15_l2_async_edac_driver);
> +
> +MODULE_AUTHOR("Wladislav Wiebe <[email protected]>");
> +MODULE_DESCRIPTION("ARM Cortex A15 L2 internal asynchronous error detection");
> +MODULE_LICENSE("GPL v2");
> --
> 2.16.1

--
Regards/Gruss,
Boris.

Good mailing practices for 400: avoid top-posting and trim the reply.

2019-01-08 17:59:29

by James Morse

[permalink] [raw]
Subject: Re: [PATCH 2/2] EDAC: add ARM Cortex A15 L2 internal asynchronous error detection driver

Hi Boris, Wladislav,

On 08/01/2019 10:42, Borislav Petkov wrote:
> + James and leaving in the rest for reference.

(thanks!)

> So the first thing to figure out here is how generic is this and if
> so, to make it a cortex_a15_edac.c driver which contains all the RAS
> functionality for A15. Definitely not an EDAC driver per functional unit
> but rather per vendor or even ARM core.

This is implementation-defined/specific-to-A15 and is documented in the TRM [0].
(On the 'all the RAS functionality for A15' front: there are two more registers:
L2MERRSR and CPUMERRSR. These are both accessible from the normal-world, and
don't appear to need enabling.)


But we have the usual pre-v8.2 problems, and in addition cluster-interrupts, as
this signal might be per-cluster, or it might be combined.

Wladislav, I'm afraid we've had a few attempts at pre-8.2 EDAC drivers, the
below list of problems is what we've learnt along the way. The upshot is that
before the architected RAS extensions, the expectation is firmware will handle
all this, as its difficult for the OS to deal with.


My first question is how useful is a 'something bad happened' edac event? Before
the v8.2 extensions with its classification of errors, we don't know anything more.

The usual suspects are, (partly taken from the thread at [1]):
* A15 exists in big/little configurations. We need to know which CPUs are A15.
* We need to know we aren't running under a hypervisor, (a hypervisor can trap
accesses to these imp-def register, KVM does).
* Nothing else should be clearing these bits, e.g. secure-world software, or
another CPU.
* Secure-world needs to enable write-access to L2ECTLR, and we need to
know its done it. This needs doing on every CPU, and needs to not 'go missing'
over cpu-hotplug or cpu-idle.

These are things that don't naturally live in the DT.


The new-one is these cluster-interrupts: How do we know which set of CPUs each
interrupt goes with? What happens if user-space tries to rebalance them?

Another SoC with A15 may combine all the cluster-interrupts into a single
'something bad happened' interrupt. Done like this, we would need to cross-call
to the other CPUs when we take an interrupt - which is not something we can do.

Is this a level or edge interrupt? Is it necessary to clear that bit in the
register to lower the interrupt line?
The TRM talks about 'pending L2 internal asynchronous error', pending makes me
suspect this is at least possible. If it is, a level-interrupt to one CPU, that
can only be cleared by another leads to deadlock.


Thanks,

James

> On Tue, Jan 08, 2019 at 08:10:45AM +0000, Wiebe, Wladislav (Nokia - DE/Ulm) wrote:
>> This driver adds support for L2 internal asynchronous error detection
>> caused by L2 RAM double-bit ECC error or illegal writes to the
>> Interrupt Controller memory-map region on the Cortex A15.

>> diff --git a/drivers/edac/cortex_a15_l2_async_edac.c b/drivers/edac/cortex_a15_l2_async_edac.c
>> new file mode 100644
>> index 000000000000..26252568e961
>> --- /dev/null
>> +++ b/drivers/edac/cortex_a15_l2_async_edac.c
>> @@ -0,0 +1,134 @@
>> +// SPDX-License-Identifier: GPL-2.0
>> +/*
>> + * Copyright (C) 2018 Nokia Corporation

(boiler plate not needed with the SPDX header)

>> + */
>> +
>> +#include <linux/module.h>
>> +#include <linux/interrupt.h>
>> +#include <linux/platform_device.h>
>> +#include <linux/of.h>
>> +
>> +#include "edac_module.h"
>> +
>> +#define DRIVER_NAME "cortex_a15_l2_async_edac"
>> +
>> +#define L2ECTLR_L2_ASYNC_ERR BIT(30)
>> +
>> +static irqreturn_t cortex_a15_l2_async_edac_err_handler(int irq, void *dev_id)
>> +{
>> + struct edac_device_ctl_info *dci = dev_id;
>> + u32 status = 0;
>> +
>> + /*
>> + * Read and clear L2ECTLR L2 ASYNC error bit caused by INTERRIRQ.
>> + * Reason could be a L2 RAM double-bit ECC error or illegal writes
>> + * to the Interrupt Controller memory-map region.
>> + */
>> + asm("mrc p15, 1, %0, c9, c0, 3" : "=r" (status));

"L2 internal asynchronous error caused by L2 RAM double-bit ECC error" doesn't
tell us if a CPU consumed the error, or if the error has caused a write to go
missing. Without the classification, this means 'something bad happened'.

I'd prefer to panic() when we see one of these. I'd like it even more if
firmware rebooted for us.


>> + if (status & L2ECTLR_L2_ASYNC_ERR) {
>> + status &= ~L2ECTLR_L2_ASYNC_ERR;
>> + asm("mcr p15, 1, %0, c9, c0, 3" : : "r" (status));

4.3.49 "L2 Extended Control Register" of the A15 TRM says this field can be
read-only/write-ignored for the normal world if NSACR.NS_L2ERR is 0.

How do we know if firmware has set this bit on all CPUs? We can't clear the
error otherwise.


>> + edac_printk(KERN_EMERG, DRIVER_NAME,
>> + "L2 internal asynchronous error occurred!\n");
>> + edac_device_handle_ue(dci, 0, 0, dci->ctl_name);

>> +
>> + return IRQ_HANDLED;
>> + }
>> +
>> + return IRQ_NONE;
>> +}
>> +
>> +static int cortex_a15_l2_async_edac_probe(struct platform_device *pdev)
>> +{
>> + struct edac_device_ctl_info *dci;
>> + struct device_node *np = pdev->dev.of_node;
>> + char *ctl_name = (char *)np->name;
>> + int i = 0, ret = 0, err_irq = 0, irq_count = 0;
>> +
>> + /* We can have multiple CPU clusters with one INTERRIRQ per cluster */

Surely this an integration choice?

You're accessing the cluster through a cpu register in the handler, what happens
if the interrupt is delivered to the wrong cluster?
How do we know which interrupt maps to which cluster?
How do we stop user-space 'balancing' the interrupts?


>> + irq_count = platform_irq_count(pdev);
>> + if (irq_count < 0) {
>> + edac_printk(KERN_ERR, DRIVER_NAME,
>> + "No L2 ASYNC error IRQ found!\n");
>> + return -EINVAL;
>> + }
>> +
>> + dci = edac_device_alloc_ctl_info(0, ctl_name, 1, ctl_name,
>> + irq_count, 0, NULL, 0,
>> + edac_device_alloc_index());
>> + if (!dci)
>> + return -ENOMEM;
>> +
>> + dci->dev = &pdev->dev;
>> + dci->mod_name = DRIVER_NAME;
>> + dci->ctl_name = ctl_name;
>> + dci->dev_name = dev_name(&pdev->dev);
>> + platform_set_drvdata(pdev, dci);
>> +
>> + if (edac_device_add_device(dci))
>> + goto err;
>> +
>> + for (i = 0; i < irq_count; i++) {
>> + err_irq = platform_get_irq(pdev, i);
>> + ret = devm_request_irq(&pdev->dev, err_irq,
>> + cortex_a15_l2_async_edac_err_handler, 0,
>> + dev_name(&pdev->dev), dci);
>> +
>> + if (ret < 0) {
>> + edac_printk(KERN_ERR, DRIVER_NAME,
>> + "Failed to register L2 ASYNC error IRQ %d\n",
>> + err_irq);
>> + goto err2;
>> + }
>> + }
>> +
>> + return 0;
>> +err2:
>> + edac_device_del_device(&pdev->dev);
>> +err:
>> + edac_device_free_ctl_info(dci);
>> +
>> + return ret;
>> +}


[0]
http://infocenter.arm.com/help/topic/com.arm.doc.ddi0438c/DDI0438C_cortex_a15_r2p0_trm.pdf
[1] https://lore.kernel.org/lkml/[email protected]/

2019-01-08 18:13:50

by Greg Kroah-Hartman

[permalink] [raw]
Subject: Re: [PATCH 2/2] EDAC: add ARM Cortex A15 L2 internal asynchronous error detection driver

On Tue, Jan 08, 2019 at 05:57:24PM +0000, James Morse wrote:
> >> diff --git a/drivers/edac/cortex_a15_l2_async_edac.c b/drivers/edac/cortex_a15_l2_async_edac.c
> >> new file mode 100644
> >> index 000000000000..26252568e961
> >> --- /dev/null
> >> +++ b/drivers/edac/cortex_a15_l2_async_edac.c
> >> @@ -0,0 +1,134 @@
> >> +// SPDX-License-Identifier: GPL-2.0
> >> +/*
> >> + * Copyright (C) 2018 Nokia Corporation
>
> (boiler plate not needed with the SPDX header)

Not true at all. Copyright is not the same thing as an SPDX license
identifier in any way, shape, or form. As-is, this is just fine.

Now I will not go into the issues where a "Copyright..." line really
means nothing at all anymore, as lawyers like to cargo-cult worse than
programmers do, so it's good to have those lines there just to make them
happy :)

thanks,

greg k-h

2019-01-09 10:08:44

by James Morse

[permalink] [raw]
Subject: Re: [PATCH 2/2] EDAC: add ARM Cortex A15 L2 internal asynchronous error detection driver

Hi Greg,

On 01/08/2019 06:12 PM, [email protected] wrote:
> On Tue, Jan 08, 2019 at 05:57:24PM +0000, James Morse wrote:
>>>> diff --git a/drivers/edac/cortex_a15_l2_async_edac.c b/drivers/edac/cortex_a15_l2_async_edac.c
>>>> new file mode 100644
>>>> index 000000000000..26252568e961
>>>> --- /dev/null
>>>> +++ b/drivers/edac/cortex_a15_l2_async_edac.c
>>>> @@ -0,0 +1,134 @@
>>>> +// SPDX-License-Identifier: GPL-2.0
>>>> +/*
>>>> + * Copyright (C) 2018 Nokia Corporation
>>
>> (boiler plate not needed with the SPDX header)
>
> Not true at all. Copyright is not the same thing as an SPDX license
> identifier in any way, shape, or form. As-is, this is just fine.


This looks funny because I removed the GPL license text that was here,
but left this comment when trimming the reply. I should have left a few
lines of it for context.

(Originally:
https://lore.kernel.org/lkml/[email protected]/T/#u)


Thanks,

James

Subject: RE: [PATCH 2/2] EDAC: add ARM Cortex A15 L2 internal asynchronous error detection driver

Hi James,

first of all thanks a lot for the constructive and fast feedback!

> -----Original Message-----
> From: James Morse <[email protected]>
> Sent: Tuesday, January 08, 2019 6:57 PM
>
> Hi Boris, Wladislav,
>
> On 08/01/2019 10:42, Borislav Petkov wrote:
> > + James and leaving in the rest for reference.
>
> (thanks!)
>
> > So the first thing to figure out here is how generic is this and if
> > so, to make it a cortex_a15_edac.c driver which contains all the RAS
> > functionality for A15. Definitely not an EDAC driver per functional
> > unit but rather per vendor or even ARM core.
>
> This is implementation-defined/specific-to-A15 and is documented in the
> TRM [0].
> (On the 'all the RAS functionality for A15' front: there are two more registers:
> L2MERRSR and CPUMERRSR. These are both accessible from the normal-
> world, and don't appear to need enabling.)
>
>
> But we have the usual pre-v8.2 problems, and in addition cluster-interrupts,
> as this signal might be per-cluster, or it might be combined.
>
> Wladislav, I'm afraid we've had a few attempts at pre-8.2 EDAC drivers, the
> below list of problems is what we've learnt along the way. The upshot is that
> before the architected RAS extensions, the expectation is firmware will
> handle all this, as its difficult for the OS to deal with.
>
>
> My first question is how useful is a 'something bad happened' edac event?

We experienced sometimes random user-space crashes where we didn't
expect a bug in the application code. If there would be a notification
by such edac event, we would at least know that something bad happened before.

> Before the v8.2 extensions with its classification of errors, we don't know
> anything more.
>
> The usual suspects are, (partly taken from the thread at [1]):
> * A15 exists in big/little configurations. We need to know which CPUs are
> A15.
> * We need to know we aren't running under a hypervisor, (a hypervisor can
> trap
> accesses to these imp-def register, KVM does).
> * Nothing else should be clearing these bits, e.g. secure-world software, or
> another CPU.
> * Secure-world needs to enable write-access to L2ECTLR, and we need to
> know its done it. This needs doing on every CPU, and needs to not 'go
> missing'
> over cpu-hotplug or cpu-idle.
>
> These are things that don't naturally live in the DT.
>
>
> The new-one is these cluster-interrupts: How do we know which set of CPUs
> each interrupt goes with? What happens if user-space tries to rebalance
> them?

Valid question - so far, I didn't consider this case.

> Another SoC with A15 may combine all the cluster-interrupts into a single
> 'something bad happened' interrupt. Done like this, we would need to cross-
> call to the other CPUs when we take an interrupt - which is not something we
> can do.
>
> Is this a level or edge interrupt? Is it necessary to clear that bit in the register
> to lower the interrupt line?
> The TRM talks about 'pending L2 internal asynchronous error', pending
> makes me suspect this is at least possible. If it is, a level-interrupt to one
> CPU, that can only be cleared by another leads to deadlock.
>
>
> Thanks,
>
> James
>
> > On Tue, Jan 08, 2019 at 08:10:45AM +0000, Wiebe, Wladislav (Nokia -
> DE/Ulm) wrote:
> >> This driver adds support for L2 internal asynchronous error detection
> >> caused by L2 RAM double-bit ECC error or illegal writes to the
> >> Interrupt Controller memory-map region on the Cortex A15.
>
> >> diff --git a/drivers/edac/cortex_a15_l2_async_edac.c
> >> b/drivers/edac/cortex_a15_l2_async_edac.c
> >> new file mode 100644
> >> index 000000000000..26252568e961
> >> --- /dev/null
> >> +++ b/drivers/edac/cortex_a15_l2_async_edac.c
> >> @@ -0,0 +1,134 @@
> >> +// SPDX-License-Identifier: GPL-2.0
> >> +/*
> >> + * Copyright (C) 2018 Nokia Corporation
>
> (boiler plate not needed with the SPDX header)
>
> >> + */
> >> +
> >> +#include <linux/module.h>
> >> +#include <linux/interrupt.h>
> >> +#include <linux/platform_device.h>
> >> +#include <linux/of.h>
> >> +
> >> +#include "edac_module.h"
> >> +
> >> +#define DRIVER_NAME "cortex_a15_l2_async_edac"
> >> +
> >> +#define L2ECTLR_L2_ASYNC_ERR BIT(30)
> >> +
> >> +static irqreturn_t cortex_a15_l2_async_edac_err_handler(int irq,
> >> +void *dev_id) {
> >> + struct edac_device_ctl_info *dci = dev_id;
> >> + u32 status = 0;
> >> +
> >> + /*
> >> + * Read and clear L2ECTLR L2 ASYNC error bit caused by INTERRIRQ.
> >> + * Reason could be a L2 RAM double-bit ECC error or illegal writes
> >> + * to the Interrupt Controller memory-map region.
> >> + */
> >> + asm("mrc p15, 1, %0, c9, c0, 3" : "=r" (status));
>
> "L2 internal asynchronous error caused by L2 RAM double-bit ECC error"
> doesn't tell us if a CPU consumed the error, or if the error has caused a write
> to go missing. Without the classification, this means 'something bad
> happened'.
>
> I'd prefer to panic() when we see one of these. I'd like it even more if
> firmware rebooted for us.

The EDAC subsystem allows to configure a panic() from userspace/sysfs.
So we can be flexible at this point I think.

>
> >> + if (status & L2ECTLR_L2_ASYNC_ERR) {
> >> + status &= ~L2ECTLR_L2_ASYNC_ERR;
> >> + asm("mcr p15, 1, %0, c9, c0, 3" : : "r" (status));
>
> 4.3.49 "L2 Extended Control Register" of the A15 TRM says this field can be
> read-only/write-ignored for the normal world if NSACR.NS_L2ERR is 0.
>
> How do we know if firmware has set this bit on all CPUs? We can't clear the
> error otherwise.

Valid point!

>
> >> + edac_printk(KERN_EMERG, DRIVER_NAME,
> >> + "L2 internal asynchronous error occurred!\n");
> >> + edac_device_handle_ue(dci, 0, 0, dci->ctl_name);
>
> >> +
> >> + return IRQ_HANDLED;
> >> + }
> >> +
> >> + return IRQ_NONE;
> >> +}
> >> +
> >> +static int cortex_a15_l2_async_edac_probe(struct platform_device
> >> +*pdev) {
> >> + struct edac_device_ctl_info *dci;
> >> + struct device_node *np = pdev->dev.of_node;
> >> + char *ctl_name = (char *)np->name;
> >> + int i = 0, ret = 0, err_irq = 0, irq_count = 0;
> >> +
> >> + /* We can have multiple CPU clusters with one INTERRIRQ per cluster
> >> +*/
>
> Surely this an integration choice?
>
> You're accessing the cluster through a cpu register in the handler, what
> happens if the interrupt is delivered to the wrong cluster?
> How do we know which interrupt maps to which cluster?
> How do we stop user-space 'balancing' the interrupts?

You are right, based on all your inputs I think we can stop using this driver
as generic A15 solution (at least I would need more time to do
the refactoring considering all points you stated and experienced already).

Thanks a lot!

- Wladislav

2019-01-11 20:58:37

by James Morse

[permalink] [raw]
Subject: Re: [PATCH 2/2] EDAC: add ARM Cortex A15 L2 internal asynchronous error detection driver

Hi Wladislav,

On 09/01/2019 14:44, Wiebe, Wladislav (Nokia - DE/Ulm) wrote:
>> From: James Morse <[email protected]>
>> Sent: Tuesday, January 08, 2019 6:57 PM

>> On 08/01/2019 10:42, Borislav Petkov wrote:
>>> So the first thing to figure out here is how generic is this and if
>>> so, to make it a cortex_a15_edac.c driver which contains all the RAS
>>> functionality for A15. Definitely not an EDAC driver per functional
>>> unit but rather per vendor or even ARM core.
>>
>> This is implementation-defined/specific-to-A15 and is documented in the
>> TRM [0].
>> (On the 'all the RAS functionality for A15' front: there are two more registers:
>> L2MERRSR and CPUMERRSR. These are both accessible from the normal-
>> world, and don't appear to need enabling.)

After I sent this it occurred to me the core can't know about errors in the L3
cache (if there is one) or the memory-controller. These may have edac/ras
abilities, but they are selected by the soc integrator, so could be per soc.
This goes against Boris's no-per-functional-unit edac drivers. If we had to pick
one out of that set, I think the memory-controller is most useful as DRAM is the
most likely to be affected by errors.


>> But we have the usual pre-v8.2 problems, and in addition cluster-interrupts,
>> as this signal might be per-cluster, or it might be combined.
>>
>> Wladislav, I'm afraid we've had a few attempts at pre-8.2 EDAC drivers, the
>> below list of problems is what we've learnt along the way. The upshot is that
>> before the architected RAS extensions, the expectation is firmware will
>> handle all this, as its difficult for the OS to deal with.
>>
>>
>> My first question is how useful is a 'something bad happened' edac event?
>
> We experienced sometimes random user-space crashes where we didn't
> expect a bug in the application code. If there would be a notification
> by such edac event,

Sure, but we always have to assume its the worst case: an uncontained error (to
use the v8.2 terms). A write has gone somewhere it shouldn't, we can't trust
memory anymore.

> we would at least know that something bad happened before.

>>> On Tue, Jan 08, 2019 at 08:10:45AM +0000, Wiebe, Wladislav (Nokia -
>> DE/Ulm) wrote:
>>>> This driver adds support for L2 internal asynchronous error detection
>>>> caused by L2 RAM double-bit ECC error or illegal writes to the
>>>> Interrupt Controller memory-map region on the Cortex A15.
>>
>>>> diff --git a/drivers/edac/cortex_a15_l2_async_edac.c
>>>> b/drivers/edac/cortex_a15_l2_async_edac.c
>>>> new file mode 100644
>>>> index 000000000000..26252568e961
>>>> --- /dev/null
>>>> +++ b/drivers/edac/cortex_a15_l2_async_edac.c
>>>> @@ -0,0 +1,134 @@

>>>> +static int cortex_a15_l2_async_edac_probe(struct platform_device
>>>> +*pdev) {
>>>> + struct edac_device_ctl_info *dci;
>>>> + struct device_node *np = pdev->dev.of_node;
>>>> + char *ctl_name = (char *)np->name;
>>>> + int i = 0, ret = 0, err_irq = 0, irq_count = 0;
>>>> +
>>>> + /* We can have multiple CPU clusters with one INTERRIRQ per cluster
>>>> +*/
>>
>> Surely this an integration choice?
>>
>> You're accessing the cluster through a cpu register in the handler, what
>> happens if the interrupt is delivered to the wrong cluster?
>> How do we know which interrupt maps to which cluster?
>> How do we stop user-space 'balancing' the interrupts?
>
> You are right, based on all your inputs I think we can stop using this driver
> as generic A15 solution

Handling this interrupt in firmware is probably the best for your soc. For a
generic a15 driver in the kernel, we would have to consider 'no interrupt',
(e.g. the interrupt is wired to some other SCP/BMC thing). Once we've got
polling code for these registers, we may as well always use it.


Thanks,

James

2019-01-11 21:00:22

by Borislav Petkov

[permalink] [raw]
Subject: Re: [PATCH 2/2] EDAC: add ARM Cortex A15 L2 internal asynchronous error detection driver

On Fri, Jan 11, 2019 at 06:11:04PM +0000, James Morse wrote:
> After I sent this it occurred to me the core can't know about errors in the L3
> cache (if there is one) or the memory-controller. These may have edac/ras
> abilities, but they are selected by the soc integrator, so could be per soc.
> This goes against Boris's no-per-functional-unit edac drivers. If we had to pick
> one out of that set, I think the memory-controller is most useful as DRAM is the
> most likely to be affected by errors.

We have similar "designs" already :)

Memory controller driver drivers/edac/fsl_ddr_edac.c gets linked together with:

mpc85xx_edac_mod-y := fsl_ddr_edac.o mpc85xx_edac.o
obj-$(CONFIG_EDAC_MPC85XX) += mpc85xx_edac_mod.o

layerscape_edac_mod-y := fsl_ddr_edac.o layerscape_edac.o
obj-$(CONFIG_EDAC_LAYERSCAPE) += layerscape_edac_mod.o

--
Regards/Gruss,
Boris.

Good mailing practices for 400: avoid top-posting and trim the reply.