2015-07-21 23:36:59

by Zhang, Jonathan Zhixiong

[permalink] [raw]
Subject: [PATCH 0/2] process vendor proprietary CPER error section

From: "Jonathan (Zhixiong) Zhang" <[email protected]>

UEFI spec allows for non-standard (eg. vendor proprietary) error
section in CPER (COmmon Platform Error Record), as defined in section
N2.3 of UEFI version 2.5.

If section Type field of Generic Error Data Entry matches with one
of the GUID as defined in include/linux/cper.h, print out the raw data
in dmesg buffer, and also add a tracepoint for reporting such error.

Jonathan (Zhixiong) Zhang (2):
efi: parse vendor proprietary CPER section
ras: acpi/apei: trace event for vendor proprietary CPER section

drivers/firmware/efi/cper.c | 61 +++++++++++++++++++++++++++++++++++++++++++--
include/linux/cper.h | 4 +++
drivers/acpi/apei/ghes.c | 29 +++++++++++++++++++++++++++--
drivers/ras/ras.c | 1 +
include/ras/ras_event.h | 30 ++++++++++++++++++++++++++++++
5 files changed, 121 insertions(+), 4 deletions(-)

--
The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum,
a Linux Foundation Collaborative Project


2015-07-21 23:37:10

by Zhang, Jonathan Zhixiong

[permalink] [raw]
Subject: [PATCH 1/2] efi: parse vendor proprietary CPER section

From: "Jonathan (Zhixiong) Zhang" <[email protected]>

UEFI spec allows for non-standard section in Common Platform Error
Record. This is defined in section N.2.3 of UEFI version 2.5.

If Section Type field of Generic Error Data Entry indicates a
non-standard section (eg. matchs a vendor proprietary GUID as defined
in include/linux/cper.h), print out the raw data in hex in dmesg
buffer. Data length is taken from Error Data length field of
Generic Error Data Entry.

Following is a sample output from dmesg:
{1}[Hardware Error]: Hardware error from APEI Generic Hardware Error Source: 2
{1}[Hardware Error]: It has been corrected by h/w and requires no further action
{1}[Hardware Error]: event severity: corrected
{1}[Hardware Error]: Error 0, type: corrected
{1}[Hardware Error]: fru_id: 00000000-0000-0000-0000-000000000000
{1}[Hardware Error]: fru_text:
{1}[Hardware Error]: section_type: Qualcomm Technologies Inc. proprietary error
{1}[Hardware Error]: section length: 88
{1}[Hardware Error]: 00000000: 01002011 20150416 01000001 00000002
{1}[Hardware Error]: 00000010: 5f434345 525f4543 0000574d 00000000
{1}[Hardware Error]: 00000020: 00000000 00000000 00000000 00000000
{1}[Hardware Error]: 00000030: 00000000 00000000 00000000 00000000
{1}[Hardware Error]: 00000040: 00000000 00000000 fe800000 00000000
{1}[Hardware Error]: 00000050: 00000004 5f434345

---
checkpatch.pl gives following warnings on this patch:
WARNING: printk() should include KERN_ facility level
This is a false warning as pfx parameter includes KERN_ facility
level. Also such practice is consistent with the rest of the file.
---

Change-Id: I9a5bb6039beef1c0a36097765268b382e6a28498
Signed-off-by: Jonathan (Zhixiong) Zhang <[email protected]>
---
drivers/firmware/efi/cper.c | 61 +++++++++++++++++++++++++++++++++++++++++++--
include/linux/cper.h | 4 +++
2 files changed, 63 insertions(+), 2 deletions(-)

diff --git a/drivers/firmware/efi/cper.c b/drivers/firmware/efi/cper.c
index 4fd9961d552e..29af8846ffd1 100644
--- a/drivers/firmware/efi/cper.c
+++ b/drivers/firmware/efi/cper.c
@@ -32,12 +32,50 @@
#include <linux/acpi.h>
#include <linux/pci.h>
#include <linux/aer.h>
+#include <linux/printk.h>

#define INDENT_SP " "

+#define ROW_SIZE 16
+#define GROUP_SIZE 4
+
+struct sec_vendor {
+ uuid_le guid;
+ const char *name;
+};
+
+static struct sec_vendor sec_vendors[] = {
+ {CPER_SEC_QTI_ERR, "Qualcomm Technologies Inc. proprietary error"},
+ {NULL_UUID_LE, NULL},
+};
+
static char rcd_decode_str[CPER_REC_LEN];

/*
+ * In case of CPER error record, there should be only two message
+ * levels: KERN_ERR and KERN_WARNING
+ */
+static const char *cper_kern_level(const char *pfx)
+{
+ switch (printk_get_level(pfx)) {
+ case '3': return KERN_ERR;
+ case '4': return KERN_WARNING;
+ default: return KERN_DEFAULT;
+ }
+}
+
+/*
+ * cper_print_hex - print hex from a binary buffer
+ * @pfx: prefix for each line, including log level and prefix string
+ * @buf: buffer pointer
+ * @len: size of buffer
+ */
+#define cper_print_hex(pfx, buf, len) \
+ print_hex_dump(cper_kern_level(pfx), printk_skip_level(pfx), \
+ DUMP_PREFIX_OFFSET, ROW_SIZE, GROUP_SIZE, \
+ buf, len, 0)
+
+/*
* CPER record ID need to be unique even after reboot, because record
* ID is used as index for ERST storage, while CPER records from
* multiple boot may co-exist in ERST.
@@ -379,6 +417,12 @@ static void cper_print_pcie(const char *pfx, const struct cper_sec_pcie *pcie,
pfx, pcie->bridge.secondary_status, pcie->bridge.control);
}

+static void cper_print_vendor(const char *pfx, __u8 *vendor_err, __u32 len)
+{
+ printk("%ssection length: %d\n", pfx, len);
+ cper_print_hex(pfx, vendor_err, len);
+}
+
static void cper_estatus_print_section(
const char *pfx, const struct acpi_hest_generic_data *gdata, int sec_no)
{
@@ -416,9 +460,22 @@ static void cper_estatus_print_section(
cper_print_pcie(newpfx, pcie, gdata);
else
goto err_section_too_small;
- } else
+ } else {
+ int i;
+ __u8 *vendor_err = (void *)(gdata + 1);
+
+ for (i = 0; uuid_le_cmp(sec_vendors[i].guid,
+ NULL_UUID_LE); i++) {
+ if (!uuid_le_cmp(*sec_type, sec_vendors[i].guid)) {
+ printk("%ssection_type: %s", newpfx,
+ sec_vendors[i].name);
+ cper_print_vendor(newpfx, vendor_err,
+ gdata->error_data_length);
+ return;
+ }
+ }
printk("%s""section type: unknown, %pUl\n", newpfx, sec_type);
-
+ }
return;

err_section_too_small:
diff --git a/include/linux/cper.h b/include/linux/cper.h
index 76abba4b238e..2bb38cc1219e 100644
--- a/include/linux/cper.h
+++ b/include/linux/cper.h
@@ -210,6 +210,10 @@ enum {
#define CPER_SEC_DMAR_IOMMU \
UUID_LE(0x036F84E1, 0x7F37, 0x428c, 0xA7, 0x9E, 0x57, 0x5F, \
0xDF, 0xAA, 0x84, 0xEC)
+/* Qualcomm Technologies Inc. Proprietary Error */
+#define CPER_SEC_QTI_ERR \
+ UUID_LE(0xD2E2621C, 0xF936, 0x468D, 0x0D, 0x84, 0x15, 0xA4, \
+ 0xED, 0x01, 0x5C, 0x8B)

#define CPER_PROC_VALID_TYPE 0x0001
#define CPER_PROC_VALID_ISA 0x0002
--
The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum,
a Linux Foundation Collaborative Project

2015-07-21 23:37:08

by Zhang, Jonathan Zhixiong

[permalink] [raw]
Subject: [PATCH 2/2] ras: acpi/apei: trace event for vendor proprietary CPER section

From: "Jonathan (Zhixiong) Zhang" <[email protected]>

Trace event is generated when hardware detected a hardware error
event, which is of non-standard section as defined in UEFI
spec appendix "Common Platform Error Record" (section N.2.3 of
UEFI version 2.5).

The trace buffer contains length of error data and raw error data
in hex.

Following is a sample output of "perf script":
_________swapper_____0_[000]___133.521441:_ras:vendor_event:_len=88_raw=11_20_0
0_01_16_04_15_20_01_00_00_01_02_00_00_00_45_43_43_5f_43_45_5f_52_4d_57_00_00_00
_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00
00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_80_fe_00_00_00_00_04_0
0_00_00_45_43_43_5f

Change-Id: Ic8661310133b0ae51f6b299cdde3cd0fa5517464
Signed-off-by: Jonathan (Zhixiong) Zhang <[email protected]>
---
drivers/acpi/apei/ghes.c | 29 +++++++++++++++++++++++++++--
drivers/ras/ras.c | 1 +
include/ras/ras_event.h | 30 ++++++++++++++++++++++++++++++
3 files changed, 58 insertions(+), 2 deletions(-)

diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index 67fc948da17a..03114d27d218 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -53,9 +53,15 @@
#include <acpi/ghes.h>
#include <acpi/apei.h>
#include <asm/tlbflush.h>
+#include <ras/ras_event.h>

#include "apei-internal.h"

+static uuid_le sec_vendor_uuids[] = {
+ CPER_SEC_QTI_ERR,
+ NULL_UUID_LE,
+};
+
#define GHES_PFX "GHES: "

#define GHES_ESTATUS_MAX_SIZE 65536
@@ -440,11 +446,14 @@ static void ghes_do_proc(struct ghes *ghes,
{
int sev, sec_sev;
struct acpi_hest_generic_data *gdata;
+ uuid_le sec_type;

sev = ghes_severity(estatus->error_severity);
apei_estatus_for_each_section(estatus, gdata) {
sec_sev = ghes_severity(gdata->error_severity);
- if (!uuid_le_cmp(*(uuid_le *)gdata->section_type,
+ sec_type = *(uuid_le *)gdata->section_type;
+
+ if (!uuid_le_cmp(sec_type,
CPER_SEC_PLATFORM_MEM)) {
struct cper_sec_mem_err *mem_err;
mem_err = (struct cper_sec_mem_err *)(gdata+1);
@@ -454,7 +463,7 @@ static void ghes_do_proc(struct ghes *ghes,
ghes_handle_memory_failure(gdata, sev);
}
#ifdef CONFIG_ACPI_APEI_PCIEAER
- else if (!uuid_le_cmp(*(uuid_le *)gdata->section_type,
+ else if (!uuid_le_cmp(sec_type,
CPER_SEC_PCIE)) {
struct cper_sec_pcie *pcie_err;
pcie_err = (struct cper_sec_pcie *)(gdata+1);
@@ -486,6 +495,22 @@ static void ghes_do_proc(struct ghes *ghes,

}
#endif
+ else {
+ int i;
+
+ for (i = 0; uuid_le_cmp(sec_vendor_uuids[i],
+ NULL_UUID_LE); i++) {
+ if (!uuid_le_cmp(sec_type,
+ sec_vendor_uuids[i])) {
+ const void *vendor_err;
+
+ vendor_err = gdata + 1;
+ trace_vendor_event(vendor_err,
+ gdata->error_data_length);
+ break;
+ }
+ }
+ }
}
}

diff --git a/drivers/ras/ras.c b/drivers/ras/ras.c
index b67dd362b7b6..a656ff131249 100644
--- a/drivers/ras/ras.c
+++ b/drivers/ras/ras.c
@@ -27,3 +27,4 @@ subsys_initcall(ras_init);
EXPORT_TRACEPOINT_SYMBOL_GPL(extlog_mem_event);
#endif
EXPORT_TRACEPOINT_SYMBOL_GPL(mc_event);
+EXPORT_TRACEPOINT_SYMBOL_GPL(vendor_event);
diff --git a/include/ras/ras_event.h b/include/ras/ras_event.h
index 79abb9c71772..a3038653f72d 100644
--- a/include/ras/ras_event.h
+++ b/include/ras/ras_event.h
@@ -161,6 +161,36 @@ TRACE_EVENT(mc_event,
);

/*
+ * Vendor Proprietary Events Report
+ *
+ * Those event is generated when hardware detected a hardware
+ * error event, which is of non-standard section as defined
+ * in UEFI spec appendix "Common Platform Error Record".
+ *
+ */
+TRACE_EVENT(vendor_event,
+
+ TP_PROTO(const u8 *err,
+ const u32 len),
+
+ TP_ARGS(err, len),
+
+ TP_STRUCT__entry(
+ __field(u32, len)
+ __dynamic_array(u8, buf, len)
+ ),
+
+ TP_fast_assign(
+ __entry->len = len;
+ memcpy(__get_dynamic_array(buf), err, len);
+ ),
+
+ TP_printk("len=%d raw=%s",
+ __entry->len,
+ __print_hex(__get_dynamic_array(buf), __entry->len))
+);
+
+/*
* PCIe AER Trace event
*
* These events are generated when hardware detects a corrected or
--
The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum,
a Linux Foundation Collaborative Project

2015-07-22 07:30:32

by Borislav Petkov

[permalink] [raw]
Subject: Re: [PATCH 1/2] efi: parse vendor proprietary CPER section

On Tue, Jul 21, 2015 at 04:36:46PM -0700, Jonathan (Zhixiong) Zhang wrote:
> From: "Jonathan (Zhixiong) Zhang" <[email protected]>
>
> UEFI spec allows for non-standard section in Common Platform Error
> Record. This is defined in section N.2.3 of UEFI version 2.5.
>
> If Section Type field of Generic Error Data Entry indicates a
> non-standard section (eg. matchs a vendor proprietary GUID as defined
> in include/linux/cper.h), print out the raw data in hex in dmesg
> buffer. Data length is taken from Error Data length field of
> Generic Error Data Entry.
>
> Following is a sample output from dmesg:
> {1}[Hardware Error]: Hardware error from APEI Generic Hardware Error Source: 2
> {1}[Hardware Error]: It has been corrected by h/w and requires no further action
> {1}[Hardware Error]: event severity: corrected
> {1}[Hardware Error]: Error 0, type: corrected
> {1}[Hardware Error]: fru_id: 00000000-0000-0000-0000-000000000000
> {1}[Hardware Error]: fru_text:
> {1}[Hardware Error]: section_type: Qualcomm Technologies Inc. proprietary error
> {1}[Hardware Error]: section length: 88
> {1}[Hardware Error]: 00000000: 01002011 20150416 01000001 00000002
> {1}[Hardware Error]: 00000010: 5f434345 525f4543 0000574d 00000000
> {1}[Hardware Error]: 00000020: 00000000 00000000 00000000 00000000
> {1}[Hardware Error]: 00000030: 00000000 00000000 00000000 00000000
> {1}[Hardware Error]: 00000040: 00000000 00000000 fe800000 00000000
> {1}[Hardware Error]: 00000050: 00000004 5f434345
>
> ---
> checkpatch.pl gives following warnings on this patch:
> WARNING: printk() should include KERN_ facility level
> This is a false warning as pfx parameter includes KERN_ facility
> level. Also such practice is consistent with the rest of the file.
> ---
>
> Change-Id: I9a5bb6039beef1c0a36097765268b382e6a28498
> Signed-off-by: Jonathan (Zhixiong) Zhang <[email protected]>
> ---
> drivers/firmware/efi/cper.c | 61 +++++++++++++++++++++++++++++++++++++++++++--
> include/linux/cper.h | 4 +++
> 2 files changed, 63 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/firmware/efi/cper.c b/drivers/firmware/efi/cper.c
> index 4fd9961d552e..29af8846ffd1 100644
> --- a/drivers/firmware/efi/cper.c
> +++ b/drivers/firmware/efi/cper.c
> @@ -32,12 +32,50 @@
> #include <linux/acpi.h>
> #include <linux/pci.h>
> #include <linux/aer.h>
> +#include <linux/printk.h>
>
> #define INDENT_SP " "
>
> +#define ROW_SIZE 16
> +#define GROUP_SIZE 4
> +
> +struct sec_vendor {
> + uuid_le guid;
> + const char *name;
> +};
> +
> +static struct sec_vendor sec_vendors[] = {
> + {CPER_SEC_QTI_ERR, "Qualcomm Technologies Inc. proprietary error"},
> + {NULL_UUID_LE, NULL},
> +};

No, no vendor-specific stuff like that. This becomes a nightmare to
maintain...

> +
> static char rcd_decode_str[CPER_REC_LEN];
>
> /*
> + * In case of CPER error record, there should be only two message
> + * levels: KERN_ERR and KERN_WARNING
> + */
> +static const char *cper_kern_level(const char *pfx)
> +{
> + switch (printk_get_level(pfx)) {
> + case '3': return KERN_ERR;
> + case '4': return KERN_WARNING;
> + default: return KERN_DEFAULT;
> + }


We already pass down const char *pfx - no need for retranslation.

> +
> +/*
> + * cper_print_hex - print hex from a binary buffer
> + * @pfx: prefix for each line, including log level and prefix string
> + * @buf: buffer pointer
> + * @len: size of buffer
> + */
> +#define cper_print_hex(pfx, buf, len) \
> + print_hex_dump(cper_kern_level(pfx), printk_skip_level(pfx), \
> + DUMP_PREFIX_OFFSET, ROW_SIZE, GROUP_SIZE, \
> + buf, len, 0)
> +
> +/*
> * CPER record ID need to be unique even after reboot, because record
> * ID is used as index for ERST storage, while CPER records from
> * multiple boot may co-exist in ERST.
> @@ -379,6 +417,12 @@ static void cper_print_pcie(const char *pfx, const struct cper_sec_pcie *pcie,
> pfx, pcie->bridge.secondary_status, pcie->bridge.control);
> }
>
> +static void cper_print_vendor(const char *pfx, __u8 *vendor_err, __u32 len)
> +{
> + printk("%ssection length: %d\n", pfx, len);
> + cper_print_hex(pfx, vendor_err, len);
> +}
> +
> static void cper_estatus_print_section(
> const char *pfx, const struct acpi_hest_generic_data *gdata, int sec_no)
> {
> @@ -416,9 +460,22 @@ static void cper_estatus_print_section(
> cper_print_pcie(newpfx, pcie, gdata);
> else
> goto err_section_too_small;
> - } else
> + } else {
> + int i;
> + __u8 *vendor_err = (void *)(gdata + 1);
> +
> + for (i = 0; uuid_le_cmp(sec_vendors[i].guid,
> + NULL_UUID_LE); i++) {
> + if (!uuid_le_cmp(*sec_type, sec_vendors[i].guid)) {
> + printk("%ssection_type: %s", newpfx,
> + sec_vendors[i].name);
> + cper_print_vendor(newpfx, vendor_err,
> + gdata->error_data_length);
> + return;
> + }
> + }
> printk("%s""section type: unknown, %pUl\n", newpfx, sec_type);

... you simply dump the section in binary if none of the standard
UUIDs match. And you can then turn the "unknown" message into a
vendor-specific one.

--
Regards/Gruss,
Boris.

ECO tip #101: Trim your mails when you reply.
--

2015-07-22 07:36:37

by Borislav Petkov

[permalink] [raw]
Subject: Re: [PATCH 2/2] ras: acpi/apei: trace event for vendor proprietary CPER section

On Tue, Jul 21, 2015 at 04:36:47PM -0700, Jonathan (Zhixiong) Zhang wrote:
> From: "Jonathan (Zhixiong) Zhang" <[email protected]>
>
> Trace event is generated when hardware detected a hardware error
> event, which is of non-standard section as defined in UEFI
> spec appendix "Common Platform Error Record" (section N.2.3 of
> UEFI version 2.5).
>
> The trace buffer contains length of error data and raw error data
> in hex.
>
> Following is a sample output of "perf script":
> _________swapper_____0_[000]___133.521441:_ras:vendor_event:_len=88_raw=11_20_0
> 0_01_16_04_15_20_01_00_00_01_02_00_00_00_45_43_43_5f_43_45_5f_52_4d_57_00_00_00
> _00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00
> 00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_80_fe_00_00_00_00_04_0
> 0_00_00_45_43_43_5f
>
> Change-Id: Ic8661310133b0ae51f6b299cdde3cd0fa5517464
> Signed-off-by: Jonathan (Zhixiong) Zhang <[email protected]>
> ---
> drivers/acpi/apei/ghes.c | 29 +++++++++++++++++++++++++++--
> drivers/ras/ras.c | 1 +
> include/ras/ras_event.h | 30 ++++++++++++++++++++++++++++++
> 3 files changed, 58 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
> index 67fc948da17a..03114d27d218 100644
> --- a/drivers/acpi/apei/ghes.c
> +++ b/drivers/acpi/apei/ghes.c
> @@ -53,9 +53,15 @@
> #include <acpi/ghes.h>
> #include <acpi/apei.h>
> #include <asm/tlbflush.h>
> +#include <ras/ras_event.h>
>
> #include "apei-internal.h"
>
> +static uuid_le sec_vendor_uuids[] = {
> + CPER_SEC_QTI_ERR,
> + NULL_UUID_LE,
> +};
> +
> #define GHES_PFX "GHES: "
>
> #define GHES_ESTATUS_MAX_SIZE 65536
> @@ -440,11 +446,14 @@ static void ghes_do_proc(struct ghes *ghes,
> {
> int sev, sec_sev;
> struct acpi_hest_generic_data *gdata;
> + uuid_le sec_type;
>
> sev = ghes_severity(estatus->error_severity);
> apei_estatus_for_each_section(estatus, gdata) {
> sec_sev = ghes_severity(gdata->error_severity);
> - if (!uuid_le_cmp(*(uuid_le *)gdata->section_type,
> + sec_type = *(uuid_le *)gdata->section_type;
> +
> + if (!uuid_le_cmp(sec_type,
> CPER_SEC_PLATFORM_MEM)) {

No need to break lines like that - 80 cols rule is superceded by common
sense.

> struct cper_sec_mem_err *mem_err;
> mem_err = (struct cper_sec_mem_err *)(gdata+1);
> @@ -454,7 +463,7 @@ static void ghes_do_proc(struct ghes *ghes,
> ghes_handle_memory_failure(gdata, sev);
> }
> #ifdef CONFIG_ACPI_APEI_PCIEAER
> - else if (!uuid_le_cmp(*(uuid_le *)gdata->section_type,
> + else if (!uuid_le_cmp(sec_type,
> CPER_SEC_PCIE)) {
> struct cper_sec_pcie *pcie_err;
> pcie_err = (struct cper_sec_pcie *)(gdata+1);
> @@ -486,6 +495,22 @@ static void ghes_do_proc(struct ghes *ghes,
>
> }
> #endif
> + else {

If you return in the cases above, you can save yourself this last else
and an intentation level...

> + int i;
> +
> + for (i = 0; uuid_le_cmp(sec_vendor_uuids[i],
> + NULL_UUID_LE); i++) {

... and not break statements like that...

> + if (!uuid_le_cmp(sec_type,
> + sec_vendor_uuids[i])) {

... and like that.

> + const void *vendor_err;
> +
> + vendor_err = gdata + 1;
> + trace_vendor_event(vendor_err,
> + gdata->error_data_length);
> + break;
> + }
> + }
> + }
> }
> }
>
> diff --git a/drivers/ras/ras.c b/drivers/ras/ras.c
> index b67dd362b7b6..a656ff131249 100644
> --- a/drivers/ras/ras.c
> +++ b/drivers/ras/ras.c
> @@ -27,3 +27,4 @@ subsys_initcall(ras_init);
> EXPORT_TRACEPOINT_SYMBOL_GPL(extlog_mem_event);
> #endif
> EXPORT_TRACEPOINT_SYMBOL_GPL(mc_event);
> +EXPORT_TRACEPOINT_SYMBOL_GPL(vendor_event);
> diff --git a/include/ras/ras_event.h b/include/ras/ras_event.h
> index 79abb9c71772..a3038653f72d 100644
> --- a/include/ras/ras_event.h
> +++ b/include/ras/ras_event.h
> @@ -161,6 +161,36 @@ TRACE_EVENT(mc_event,
> );
>
> /*
> + * Vendor Proprietary Events Report
> + *
> + * Those event is generated when hardware detected a hardware
> + * error event, which is of non-standard section as defined
> + * in UEFI spec appendix "Common Platform Error Record".
> + *
> + */
> +TRACE_EVENT(vendor_event,

This should be something more generic like "data_event" or
"raw_data_event" which can be used as a catch-all for all non-standard
formats the UEFI insanity spec will come up with in the future.

--
Regards/Gruss,
Boris.

ECO tip #101: Trim your mails when you reply.
--

2015-07-22 18:11:17

by Zhang, Jonathan Zhixiong

[permalink] [raw]
Subject: Re: [PATCH 1/2] efi: parse vendor proprietary CPER section

Thank you Borislav for your help.

On 7/22/2015 12:30 AM, Borislav Petkov wrote:
> On Tue, Jul 21, 2015 at 04:36:46PM -0700, Jonathan (Zhixiong) Zhang wrote:
>> From: "Jonathan (Zhixiong) Zhang" <[email protected]>
>>
>> UEFI spec allows for non-standard section in Common Platform Error
>> Record. This is defined in section N.2.3 of UEFI version 2.5.
>>
>> If Section Type field of Generic Error Data Entry indicates a
>> non-standard section (eg. matchs a vendor proprietary GUID as defined
>> in include/linux/cper.h), print out the raw data in hex in dmesg
>> buffer. Data length is taken from Error Data length field of
>> Generic Error Data Entry.
>>
>> Following is a sample output from dmesg:
>> {1}[Hardware Error]: Hardware error from APEI Generic Hardware Error Source: 2
>> {1}[Hardware Error]: It has been corrected by h/w and requires no further action
>> {1}[Hardware Error]: event severity: corrected
>> {1}[Hardware Error]: Error 0, type: corrected
>> {1}[Hardware Error]: fru_id: 00000000-0000-0000-0000-000000000000
>> {1}[Hardware Error]: fru_text:
>> {1}[Hardware Error]: section_type: Qualcomm Technologies Inc. proprietary error
>> {1}[Hardware Error]: section length: 88
>> {1}[Hardware Error]: 00000000: 01002011 20150416 01000001 00000002
>> {1}[Hardware Error]: 00000010: 5f434345 525f4543 0000574d 00000000
>> {1}[Hardware Error]: 00000020: 00000000 00000000 00000000 00000000
>> {1}[Hardware Error]: 00000030: 00000000 00000000 00000000 00000000
>> {1}[Hardware Error]: 00000040: 00000000 00000000 fe800000 00000000
>> {1}[Hardware Error]: 00000050: 00000004 5f434345
>>
>> ---
>> checkpatch.pl gives following warnings on this patch:
>> WARNING: printk() should include KERN_ facility level
>> This is a false warning as pfx parameter includes KERN_ facility
>> level. Also such practice is consistent with the rest of the file.
>> ---
>>
>> Change-Id: I9a5bb6039beef1c0a36097765268b382e6a28498
>> Signed-off-by: Jonathan (Zhixiong) Zhang <[email protected]>
>> ---
>> drivers/firmware/efi/cper.c | 61 +++++++++++++++++++++++++++++++++++++++++++--
>> include/linux/cper.h | 4 +++
>> 2 files changed, 63 insertions(+), 2 deletions(-)
>>
>> diff --git a/drivers/firmware/efi/cper.c b/drivers/firmware/efi/cper.c
>> index 4fd9961d552e..29af8846ffd1 100644
>> --- a/drivers/firmware/efi/cper.c
>> +++ b/drivers/firmware/efi/cper.c
>> @@ -32,12 +32,50 @@
>> #include <linux/acpi.h>
>> #include <linux/pci.h>
>> #include <linux/aer.h>
>> +#include <linux/printk.h>
>>
>> #define INDENT_SP " "
>>
>> +#define ROW_SIZE 16
>> +#define GROUP_SIZE 4
>> +
>> +struct sec_vendor {
>> + uuid_le guid;
>> + const char *name;
>> +};
>> +
>> +static struct sec_vendor sec_vendors[] = {
>> + {CPER_SEC_QTI_ERR, "Qualcomm Technologies Inc. proprietary error"},
>> + {NULL_UUID_LE, NULL},
>> +};
>
> No, no vendor-specific stuff like that. This becomes a nightmare to
> maintain...
Got it. I will take the feedback and print data of "unknown" sections
as is.
>
>> +
>> static char rcd_decode_str[CPER_REC_LEN];
>>
>> /*
>> + * In case of CPER error record, there should be only two message
>> + * levels: KERN_ERR and KERN_WARNING
>> + */
>> +static const char *cper_kern_level(const char *pfx)
>> +{
>> + switch (printk_get_level(pfx)) {
>> + case '3': return KERN_ERR;
>> + case '4': return KERN_WARNING;
>> + default: return KERN_DEFAULT;
>> + }
>
>
> We already pass down const char *pfx - no need for retranslation.
The reason I did re-translation is because print_hex_dump() asks for two
parameters, one for level, another for prefix string. In our case, pfx
already includes both of them. Since print_hex_dump() simply
concatenates level and prefix string together, I will not do the
re-translation, but pass pfx as the level parameter, and pass empty
string as the prefix string parameter to print_hex_dump().
>
>> +
>> +/*
>> + * cper_print_hex - print hex from a binary buffer
>> + * @pfx: prefix for each line, including log level and prefix string
>> + * @buf: buffer pointer
>> + * @len: size of buffer
>> + */
>> +#define cper_print_hex(pfx, buf, len) \
>> + print_hex_dump(cper_kern_level(pfx), printk_skip_level(pfx), \
>> + DUMP_PREFIX_OFFSET, ROW_SIZE, GROUP_SIZE, \
>> + buf, len, 0)
>> +
>> +/*
>> * CPER record ID need to be unique even after reboot, because record
>> * ID is used as index for ERST storage, while CPER records from
>> * multiple boot may co-exist in ERST.
>> @@ -379,6 +417,12 @@ static void cper_print_pcie(const char *pfx, const struct cper_sec_pcie *pcie,
>> pfx, pcie->bridge.secondary_status, pcie->bridge.control);
>> }
>>
>> +static void cper_print_vendor(const char *pfx, __u8 *vendor_err, __u32 len)
>> +{
>> + printk("%ssection length: %d\n", pfx, len);
>> + cper_print_hex(pfx, vendor_err, len);
>> +}
>> +
>> static void cper_estatus_print_section(
>> const char *pfx, const struct acpi_hest_generic_data *gdata, int sec_no)
>> {
>> @@ -416,9 +460,22 @@ static void cper_estatus_print_section(
>> cper_print_pcie(newpfx, pcie, gdata);
>> else
>> goto err_section_too_small;
>> - } else
>> + } else {
>> + int i;
>> + __u8 *vendor_err = (void *)(gdata + 1);
>> +
>> + for (i = 0; uuid_le_cmp(sec_vendors[i].guid,
>> + NULL_UUID_LE); i++) {
>> + if (!uuid_le_cmp(*sec_type, sec_vendors[i].guid)) {
>> + printk("%ssection_type: %s", newpfx,
>> + sec_vendors[i].name);
>> + cper_print_vendor(newpfx, vendor_err,
>> + gdata->error_data_length);
>> + return;
>> + }
>> + }
>> printk("%s""section type: unknown, %pUl\n", newpfx, sec_type);
>
> ... you simply dump the section in binary if none of the standard
> UUIDs match. And you can then turn the "unknown" message into a
> vendor-specific one.
Sure.

--
Jonathan (Zhixiong) Zhang
The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum,
a Linux Foundation Collaborative Project