Currently the GHES code only calls into the AER driver for
recoverable type errors. This is incorrect because errors of
other severities do not get logged by the AER driver and do not
get exposed to user space via the AER trace event. So, call
into the AER driver for PCIe errors regardless of the severity.
Signed-off-by: Tyler Baicar <[email protected]>
---
drivers/acpi/apei/ghes.c | 22 +++++++++++++++++-----
1 file changed, 17 insertions(+), 5 deletions(-)
diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index 839c3d5..15dbf65 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -458,14 +458,26 @@ static void ghes_handle_memory_failure(struct acpi_hest_generic_data *gdata, int
#endif
}
-static void ghes_handle_aer(struct acpi_hest_generic_data *gdata, int sev, int sec_sev)
+/*
+ * PCIe AER errors need to be sent to the AER driver for reporting and
+ * recovery. The GHES severities map to the following AER severities and
+ * require the following handling:
+ *
+ * GHES_SEV_CORRECTABLE -> AER_CORRECTABLE
+ * These need to be reported by the AER driver but no recovery is
+ * necessary.
+ * GHES_SEV_RECOVERABLE -> AER_NONFATAL
+ * GHES_SEV_RECOVERABLE && CPER_SEC_RESET -> AER_FATAL
+ * These both need to be reported and recovered from by the AER driver.
+ * GHES_SEV_PANIC does not make it to this handling since the kernel must
+ * panic.
+ */
+static void ghes_handle_aer(struct acpi_hest_generic_data *gdata)
{
#ifdef CONFIG_ACPI_APEI_PCIEAER
struct cper_sec_pcie *pcie_err = acpi_hest_get_payload(gdata);
- if (sev == GHES_SEV_RECOVERABLE &&
- sec_sev == GHES_SEV_RECOVERABLE &&
- pcie_err->validation_bits & CPER_PCIE_VALID_DEVICE_ID &&
+ if (pcie_err->validation_bits & CPER_PCIE_VALID_DEVICE_ID &&
pcie_err->validation_bits & CPER_PCIE_VALID_AER_INFO) {
unsigned int devfn;
int aer_severity;
@@ -519,7 +531,7 @@ static void ghes_do_proc(struct ghes *ghes,
ghes_handle_memory_failure(gdata, sev);
}
else if (guid_equal(sec_type, &CPER_SEC_PCIE)) {
- ghes_handle_aer(gdata, sev, sec_sev);
+ ghes_handle_aer(gdata);
}
else if (guid_equal(sec_type, &CPER_SEC_PROC_ARM)) {
struct cper_sec_proc_arm *err = acpi_hest_get_payload(gdata);
--
Qualcomm Datacenter Technologies, Inc. as an affiliate of Qualcomm Technologies, Inc.
Qualcomm Technologies, Inc. is a member of the Code Aurora Forum,
a Linux Foundation Collaborative Project.
From 1585284357487869231@xxx Tue Nov 28 04:56:15 +0000 2017
X-GM-THRID: 1585215873126639897
X-Gmail-Labels: Inbox,Category Forums,HistoricalUnread
On 11/13/2017 7:36 AM, Dongdong Liu wrote:
>
> 在 2017/11/9 3:13, Tyler Baicar 写道:
>> Currently the GHES code only calls into the AER driver for
>> recoverable type errors. This is incorrect because errors of
>> other severities do not get logged by the AER driver and do not
>> get exposed to user space via the AER trace event. So, call
>> into the AER driver for PCIe errors regardless of the severity
>
> It will also call do_recovery() regardless of the severity for AER correctable
> errors.
> Correctable errors include those error conditions where hardware can recover
> without any loss of information.
> Hardware corrects these errors and software intervention is not required.
> So we'd better modify the code as below.
> diff --git a/drivers/pci/pcie/aer/aerdrv_core.c
> b/drivers/pci/pcie/aer/aerdrv_core.c
> index 7448052..a7f77549 100644
> --- a/drivers/pci/pcie/aer/aerdrv_core.c
> +++ b/drivers/pci/pcie/aer/aerdrv_core.c
> @@ -633,7 +633,8 @@ static void aer_recover_work_func(struct work_struct *work)
> continue;
> }
> cper_print_aer(pdev, entry.severity, entry.regs);
> - do_recovery(pdev, entry.severity);
> + if(entry.severity != AER_CORRECTABLE)
> + do_recovery(pdev, entry.severity);
> pci_dev_put(pdev);
> }
> }
Hello Dongdong,
Yes, I have a patch for this that needs to be picked up.
https://lkml.org/lkml/2017/8/28/848
Thanks,
Tyler
--
Qualcomm Datacenter Technologies, Inc. as an affiliate of Qualcomm Technologies, Inc.
Qualcomm Technologies, Inc. is a member of the Code Aurora Forum,
a Linux Foundation Collaborative Project.
From 1583954444323732921@xxx Mon Nov 13 12:37:51 +0000 2017
X-GM-THRID: 1583526487899117441
X-Gmail-Labels: Inbox,Category Forums,HistoricalUnread
在 2017/11/9 3:13, Tyler Baicar 写道:
> Currently the GHES code only calls into the AER driver for
> recoverable type errors. This is incorrect because errors of
> other severities do not get logged by the AER driver and do not
> get exposed to user space via the AER trace event. So, call
> into the AER driver for PCIe errors regardless of the severity
It will also call do_recovery() regardless of the severity for AER correctable errors.
Correctable errors include those error conditions where hardware can recover without any loss of information.
Hardware corrects these errors and software intervention is not required.
So we'd better modify the code as below.
diff --git a/drivers/pci/pcie/aer/aerdrv_core.c b/drivers/pci/pcie/aer/aerdrv_core.c
index 7448052..a7f77549 100644
--- a/drivers/pci/pcie/aer/aerdrv_core.c
+++ b/drivers/pci/pcie/aer/aerdrv_core.c
@@ -633,7 +633,8 @@ static void aer_recover_work_func(struct work_struct *work)
continue;
}
cper_print_aer(pdev, entry.severity, entry.regs);
- do_recovery(pdev, entry.severity);
+ if(entry.severity != AER_CORRECTABLE)
+ do_recovery(pdev, entry.severity);
pci_dev_put(pdev);
}
}
Thanks,
Dongdong
>
> Signed-off-by: Tyler Baicar <[email protected]>
> ---
> drivers/acpi/apei/ghes.c | 8 +++-----
> 1 file changed, 3 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
> index 839c3d5..bb65fa6 100644
> --- a/drivers/acpi/apei/ghes.c
> +++ b/drivers/acpi/apei/ghes.c
> @@ -458,14 +458,12 @@ static void ghes_handle_memory_failure(struct acpi_hest_generic_data *gdata, int
> #endif
> }
>
> -static void ghes_handle_aer(struct acpi_hest_generic_data *gdata, int sev, int sec_sev)
> +static void ghes_handle_aer(struct acpi_hest_generic_data *gdata)
> {
> #ifdef CONFIG_ACPI_APEI_PCIEAER
> struct cper_sec_pcie *pcie_err = acpi_hest_get_payload(gdata);
>
> - if (sev == GHES_SEV_RECOVERABLE &&
> - sec_sev == GHES_SEV_RECOVERABLE &&
> - pcie_err->validation_bits & CPER_PCIE_VALID_DEVICE_ID &&
> + if (pcie_err->validation_bits & CPER_PCIE_VALID_DEVICE_ID &&
> pcie_err->validation_bits & CPER_PCIE_VALID_AER_INFO) {
> unsigned int devfn;
> int aer_severity;
> @@ -519,7 +517,7 @@ static void ghes_do_proc(struct ghes *ghes,
> ghes_handle_memory_failure(gdata, sev);
> }
> else if (guid_equal(sec_type, &CPER_SEC_PCIE)) {
> - ghes_handle_aer(gdata, sev, sec_sev);
> + ghes_handle_aer(gdata);
> }
> else if (guid_equal(sec_type, &CPER_SEC_PROC_ARM)) {
> struct cper_sec_proc_arm *err = acpi_hest_get_payload(gdata);
>
From 1583610482121280575@xxx Thu Nov 09 17:30:43 +0000 2017
X-GM-THRID: 1583526487899117441
X-Gmail-Labels: Inbox,Category Forums,HistoricalUnread
On Thu, Nov 09, 2017 at 10:14:35AM -0500, Tyler Baicar wrote:
> On 11/9/2017 4:46 AM, Borislav Petkov wrote:
> > On Wed, Nov 08, 2017 at 12:13:12PM -0700, Tyler Baicar wrote:
> > > Currently the GHES code only calls into the AER driver for
> > > recoverable type errors. This is incorrect because errors of
> > > other severities do not get logged by the AER driver and do not
> > > get exposed to user space via the AER trace event. So, call
> > > into the AER driver for PCIe errors regardless of the severity
> > >
> > > Signed-off-by: Tyler Baicar <[email protected]>
> > > ---
> > > drivers/acpi/apei/ghes.c | 8 +++-----
> > > 1 file changed, 3 insertions(+), 5 deletions(-)
> > >
> > > diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
> > > index 839c3d5..bb65fa6 100644
> > > --- a/drivers/acpi/apei/ghes.c
> > > +++ b/drivers/acpi/apei/ghes.c
> > > @@ -458,14 +458,12 @@ static void ghes_handle_memory_failure(struct acpi_hest_generic_data *gdata, int
> > > #endif
> > > }
> > Where did the explanatory comment go?
> >
> > +/*
> > + * PCIe AER errors need to be sent to the AER driver for reporting and
> > + * recovery. The GHES severities map to the following AER severities and
> > + * require the following handling:
> > + *
> > + * GHES_SEV_CORRECTABLE -> AER_CORRECTABLE
> > + * These need to be reported by the AER driver but no recovery is
> > + * necessary.
> > + * GHES_SEV_RECOVERABLE -> AER_NONFATAL
> > + * GHES_SEV_RECOVERABLE && CPER_SEC_RESET -> AER_FATAL
> > + * These both need to be reported and recovered from by the AER driver.
> > + * GHES_SEV_PANIC does not make it to this handling since the kernel must
> > + * panic.
> > + */
> >
> > <--- ???
> Updated patch including the comment:
When you decide to do the reckless thing of pasting a patch into
thunderbird on *windoze*, first send it to yourself only and try
applying it.
Because I see this:
[boris@pd: ~/kernel/linux> test-apply.sh /tmp/tbaicar.02
checking file drivers/acpi/apei/ghes.c
patch: **** malformed patch at line 64: @@ -519,7 +531,7 @@ static void ghes_do_proc(struct ghes *ghes,
Not good.
--
Regards/Gruss,
Boris.
SUSE Linux GmbH, GF: Felix Imendörffer, Jane Smithard, Graham Norton, HRB 21284 (AG Nürnberg)
--
From 1583601970801560793@xxx Thu Nov 09 15:15:26 +0000 2017
X-GM-THRID: 1583526487899117441
X-Gmail-Labels: Inbox,Category Forums,HistoricalUnread
On 11/9/2017 4:46 AM, Borislav Petkov wrote:
> On Wed, Nov 08, 2017 at 12:13:12PM -0700, Tyler Baicar wrote:
>> Currently the GHES code only calls into the AER driver for
>> recoverable type errors. This is incorrect because errors of
>> other severities do not get logged by the AER driver and do not
>> get exposed to user space via the AER trace event. So, call
>> into the AER driver for PCIe errors regardless of the severity
>>
>> Signed-off-by: Tyler Baicar <[email protected]>
>> ---
>> drivers/acpi/apei/ghes.c | 8 +++-----
>> 1 file changed, 3 insertions(+), 5 deletions(-)
>>
>> diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
>> index 839c3d5..bb65fa6 100644
>> --- a/drivers/acpi/apei/ghes.c
>> +++ b/drivers/acpi/apei/ghes.c
>> @@ -458,14 +458,12 @@ static void ghes_handle_memory_failure(struct acpi_hest_generic_data *gdata, int
>> #endif
>> }
> Where did the explanatory comment go?
>
> +/*
> + * PCIe AER errors need to be sent to the AER driver for reporting and
> + * recovery. The GHES severities map to the following AER severities and
> + * require the following handling:
> + *
> + * GHES_SEV_CORRECTABLE -> AER_CORRECTABLE
> + * These need to be reported by the AER driver but no recovery is
> + * necessary.
> + * GHES_SEV_RECOVERABLE -> AER_NONFATAL
> + * GHES_SEV_RECOVERABLE && CPER_SEC_RESET -> AER_FATAL
> + * These both need to be reported and recovered from by the AER driver.
> + * GHES_SEV_PANIC does not make it to this handling since the kernel must
> + * panic.
> + */
>
> <--- ???
Updated patch including the comment:
Currently the GHES code only calls into the AER driver for
recoverable type errors. This is incorrect because errors of
other severities do not get logged by the AER driver and do not
get exposed to user space via the AER trace event. So, call
into the AER driver for PCIe errors regardless of the severity.
Signed-off-by: Tyler Baicar <[email protected]>
---
drivers/acpi/apei/ghes.c | 22 +++++++++++++++++-----
1 file changed, 17 insertions(+), 5 deletions(-)
diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index 839c3d5..15dbf65 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -458,14 +458,26 @@ static void ghes_handle_memory_failure(struct acpi_hest_generic_data *gdata, int
#endif
}
-static void ghes_handle_aer(struct acpi_hest_generic_data *gdata, int sev, int sec_sev)
+/*
+ * PCIe AER errors need to be sent to the AER driver for reporting and
+ * recovery. The GHES severities map to the following AER severities and
+ * require the following handling:
+ *
+ * GHES_SEV_CORRECTABLE -> AER_CORRECTABLE
+ * These need to be reported by the AER driver but no recovery is
+ * necessary.
+ * GHES_SEV_RECOVERABLE -> AER_NONFATAL
+ * GHES_SEV_RECOVERABLE && CPER_SEC_RESET -> AER_FATAL
+ * These both need to be reported and recovered from by the AER driver.
+ * GHES_SEV_PANIC does not make it to this handling since the kernel must
+ * panic.
+ */
+static void ghes_handle_aer(struct acpi_hest_generic_data *gdata)
{
#ifdef CONFIG_ACPI_APEI_PCIEAER
struct cper_sec_pcie *pcie_err = acpi_hest_get_payload(gdata);
- if (sev == GHES_SEV_RECOVERABLE &&
- sec_sev == GHES_SEV_RECOVERABLE &&
- pcie_err->validation_bits & CPER_PCIE_VALID_DEVICE_ID &&
+ if (pcie_err->validation_bits & CPER_PCIE_VALID_DEVICE_ID &&
pcie_err->validation_bits & CPER_PCIE_VALID_AER_INFO) {
unsigned int devfn;
int aer_severity;
@@ -519,7 +531,7 @@ static void ghes_do_proc(struct ghes *ghes,
ghes_handle_memory_failure(gdata, sev);
}
else if (guid_equal(sec_type, &CPER_SEC_PCIE)) {
- ghes_handle_aer(gdata, sev, sec_sev);
+ ghes_handle_aer(gdata);
}
else if (guid_equal(sec_type, &CPER_SEC_PROC_ARM)) {
struct cper_sec_proc_arm *err = acpi_hest_get_payload(gdata);
--
Thanks,
Tyler
--
Qualcomm Datacenter Technologies, Inc. as an affiliate of Qualcomm Technologies, Inc.
Qualcomm Technologies, Inc. is a member of the Code Aurora Forum,
a Linux Foundation Collaborative Project.
From 1583601163965597455@xxx Thu Nov 09 15:02:37 +0000 2017
X-GM-THRID: 1583526487899117441
X-Gmail-Labels: Inbox,Category Forums,HistoricalUnread
On Thu, Nov 09, 2017 at 09:37:45AM -0500, Tyler Baicar wrote:
> Ah, forgot to put that back in. I'll send an update shortly.
Just the one patch which needs updating pls, as a reply to the
respective message.
Thx.
--
Regards/Gruss,
Boris.
SUSE Linux GmbH, GF: Felix Imendörffer, Jane Smithard, Graham Norton, HRB 21284 (AG Nürnberg)
--
From 1583599660900042919@xxx Thu Nov 09 14:38:44 +0000 2017
X-GM-THRID: 1583526487899117441
X-Gmail-Labels: Inbox,Category Forums,HistoricalUnread
On 11/9/2017 4:46 AM, Borislav Petkov wrote:
> On Wed, Nov 08, 2017 at 12:13:12PM -0700, Tyler Baicar wrote:
>> Currently the GHES code only calls into the AER driver for
>> recoverable type errors. This is incorrect because errors of
>> other severities do not get logged by the AER driver and do not
>> get exposed to user space via the AER trace event. So, call
>> into the AER driver for PCIe errors regardless of the severity
>>
>> Signed-off-by: Tyler Baicar <[email protected]>
>> ---
>> drivers/acpi/apei/ghes.c | 8 +++-----
>> 1 file changed, 3 insertions(+), 5 deletions(-)
>>
>> diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
>> index 839c3d5..bb65fa6 100644
>> --- a/drivers/acpi/apei/ghes.c
>> +++ b/drivers/acpi/apei/ghes.c
>> @@ -458,14 +458,12 @@ static void ghes_handle_memory_failure(struct acpi_hest_generic_data *gdata, int
>> #endif
>> }
> Where did the explanatory comment go?
Ah, forgot to put that back in. I'll send an update shortly.
Thanks,
Tyler
--
Qualcomm Datacenter Technologies, Inc. as an affiliate of Qualcomm Technologies, Inc.
Qualcomm Technologies, Inc. is a member of the Code Aurora Forum,
a Linux Foundation Collaborative Project.
From 1583581366548807704@xxx Thu Nov 09 09:47:57 +0000 2017
X-GM-THRID: 1583526487899117441
X-Gmail-Labels: Inbox,Category Forums,HistoricalUnread
On Wed, Nov 08, 2017 at 12:13:12PM -0700, Tyler Baicar wrote:
> Currently the GHES code only calls into the AER driver for
> recoverable type errors. This is incorrect because errors of
> other severities do not get logged by the AER driver and do not
> get exposed to user space via the AER trace event. So, call
> into the AER driver for PCIe errors regardless of the severity
>
> Signed-off-by: Tyler Baicar <[email protected]>
> ---
> drivers/acpi/apei/ghes.c | 8 +++-----
> 1 file changed, 3 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
> index 839c3d5..bb65fa6 100644
> --- a/drivers/acpi/apei/ghes.c
> +++ b/drivers/acpi/apei/ghes.c
> @@ -458,14 +458,12 @@ static void ghes_handle_memory_failure(struct acpi_hest_generic_data *gdata, int
> #endif
> }
Where did the explanatory comment go?
+/*
+ * PCIe AER errors need to be sent to the AER driver for reporting and
+ * recovery. The GHES severities map to the following AER severities and
+ * require the following handling:
+ *
+ * GHES_SEV_CORRECTABLE -> AER_CORRECTABLE
+ * These need to be reported by the AER driver but no recovery is
+ * necessary.
+ * GHES_SEV_RECOVERABLE -> AER_NONFATAL
+ * GHES_SEV_RECOVERABLE && CPER_SEC_RESET -> AER_FATAL
+ * These both need to be reported and recovered from by the AER driver.
+ * GHES_SEV_PANIC does not make it to this handling since the kernel must
+ * panic.
+ */
<--- ???
> -static void ghes_handle_aer(struct acpi_hest_generic_data *gdata, int sev, int sec_sev)
> +static void ghes_handle_aer(struct acpi_hest_generic_data *gdata)
> {
> #ifdef CONFIG_ACPI_APEI_PCIEAER
> struct cper_sec_pcie *pcie_err = acpi_hest_get_payload(gdata);
>
> - if (sev == GHES_SEV_RECOVERABLE &&
> - sec_sev == GHES_SEV_RECOVERABLE &&
> - pcie_err->validation_bits & CPER_PCIE_VALID_DEVICE_ID &&
> + if (pcie_err->validation_bits & CPER_PCIE_VALID_DEVICE_ID &&
> pcie_err->validation_bits & CPER_PCIE_VALID_AER_INFO) {
> unsigned int devfn;
> int aer_severity;
> @@ -519,7 +517,7 @@ static void ghes_do_proc(struct ghes *ghes,
> ghes_handle_memory_failure(gdata, sev);
> }
> else if (guid_equal(sec_type, &CPER_SEC_PCIE)) {
> - ghes_handle_aer(gdata, sev, sec_sev);
> + ghes_handle_aer(gdata);
> }
> else if (guid_equal(sec_type, &CPER_SEC_PROC_ARM)) {
> struct cper_sec_proc_arm *err = acpi_hest_get_payload(gdata);
> --
--
Regards/Gruss,
Boris.
SUSE Linux GmbH, GF: Felix Imendörffer, Jane Smithard, Graham Norton, HRB 21284 (AG Nürnberg)
--
From 1583526487899117441@xxx Wed Nov 08 19:15:40 +0000 2017
X-GM-THRID: 1583526487899117441
X-Gmail-Labels: Inbox,Category Forums,HistoricalUnread