This series adds decoding for the CXL Protocol Errors Common Platform
Error Record.
Smita Koralahalli (2):
efi/cper, cxl: Decode CXL Protocol Error Section
efi/cper, cxl: Decode CXL Error Log
drivers/firmware/efi/Makefile | 2 +-
drivers/firmware/efi/cper.c | 9 +++
drivers/firmware/efi/cper_cxl.c | 108 ++++++++++++++++++++++++++++++++
drivers/firmware/efi/cper_cxl.h | 58 +++++++++++++++++
include/linux/cxl_err.h | 21 +++++++
5 files changed, 197 insertions(+), 1 deletion(-)
create mode 100644 drivers/firmware/efi/cper_cxl.c
create mode 100644 drivers/firmware/efi/cper_cxl.h
create mode 100644 include/linux/cxl_err.h
--
2.17.1
Print the CXL Error Log field as found in CXL Protocol Error Section.
The CXL RAS Capability structure will be reused by OS First Handling
and the duplication/appropriate placement will be addressed eventually.
Signed-off-by: Smita Koralahalli <[email protected]>
---
drivers/firmware/efi/cper_cxl.c | 21 +++++++++++++++++++++
include/linux/cxl_err.h | 21 +++++++++++++++++++++
2 files changed, 42 insertions(+)
create mode 100644 include/linux/cxl_err.h
diff --git a/drivers/firmware/efi/cper_cxl.c b/drivers/firmware/efi/cper_cxl.c
index e5f48f0de1a4..c3d1d0770aef 100644
--- a/drivers/firmware/efi/cper_cxl.c
+++ b/drivers/firmware/efi/cper_cxl.c
@@ -8,6 +8,7 @@
*/
#include <linux/cper.h>
+#include <linux/cxl_err.h>
#include "cper_cxl.h"
#define PROT_ERR_VALID_AGENT_TYPE BIT_ULL(0)
@@ -16,6 +17,7 @@
#define PROT_ERR_VALID_SERIAL_NUMBER BIT_ULL(3)
#define PROT_ERR_VALID_CAPABILITY BIT_ULL(4)
#define PROT_ERR_VALID_DVSEC BIT_ULL(5)
+#define PROT_ERR_VALID_ERROR_LOG BIT_ULL(6)
static const char * const prot_err_agent_type_strs[] = {
"Restricted CXL Device",
@@ -84,4 +86,23 @@ void cper_print_prot_err(const char *pfx, const struct cper_sec_prot_err *prot_e
break;
}
}
+
+ if (prot_err->valid_bits & PROT_ERR_VALID_ERROR_LOG) {
+ size_t size = sizeof(*prot_err) + prot_err->dvsec_len;
+ struct ras_capability_regs *cxl_ras;
+
+ pr_info("%s Error log length: 0x%04x\n", pfx, prot_err->err_len);
+
+ pr_info("%s CXL Error Log:\n", pfx);
+ cxl_ras = (struct ras_capability_regs *)((long)prot_err + size);
+ pr_info("%s cxl_ras_uncor_status: 0x%08x, cxl_ras_uncor_mask: 0x%08x\n",
+ pfx, cxl_ras->uncor_status, cxl_ras->uncor_mask);
+ pr_info("%s cxl_ras_uncor_severity: 0x%08x\n", pfx,
+ cxl_ras->uncor_severity);
+ pr_info("%s cxl_ras_cor_status: 0x%08x, cxl_ras_cor_mask: 0x%08x\n",
+ pfx, cxl_ras->cor_status, cxl_ras->cor_mask);
+ pr_info("%s Header Log Registers:\n", pfx);
+ print_hex_dump(pfx, "", DUMP_PREFIX_OFFSET, 16, 4, cxl_ras->header_log,
+ sizeof(cxl_ras->header_log), 0);
+ }
}
diff --git a/include/linux/cxl_err.h b/include/linux/cxl_err.h
new file mode 100644
index 000000000000..c89dbb6c286f
--- /dev/null
+++ b/include/linux/cxl_err.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2022 Advanced Micro Devices, Inc.
+ *
+ * Author: Smita Koralahalli <[email protected]>
+ */
+
+#ifndef LINUX_CXL_ERR_H
+#define LINUX_CXL_ERR_H
+
+struct ras_capability_regs {
+ u32 uncor_status;
+ u32 uncor_mask;
+ u32 uncor_severity;
+ u32 cor_status;
+ u32 cor_mask;
+ u32 cap_control;
+ u32 header_log[16];
+};
+
+#endif //__CXL_ERR_
--
2.17.1
On Fri, 7 Oct 2022 21:17:14 +0000
Smita Koralahalli <[email protected]> wrote:
> Print the CXL Error Log field as found in CXL Protocol Error Section.
>
> The CXL RAS Capability structure will be reused by OS First Handling
> and the duplication/appropriate placement will be addressed eventually.
>
> Signed-off-by: Smita Koralahalli <[email protected]>
Ah. This clearly answers at least a few comments from my patch one review.
I should have read on!
> ---
> drivers/firmware/efi/cper_cxl.c | 21 +++++++++++++++++++++
> include/linux/cxl_err.h | 21 +++++++++++++++++++++
> 2 files changed, 42 insertions(+)
> create mode 100644 include/linux/cxl_err.h
>
> diff --git a/drivers/firmware/efi/cper_cxl.c b/drivers/firmware/efi/cper_cxl.c
> index e5f48f0de1a4..c3d1d0770aef 100644
> --- a/drivers/firmware/efi/cper_cxl.c
> +++ b/drivers/firmware/efi/cper_cxl.c
> @@ -8,6 +8,7 @@
> */
>
> #include <linux/cper.h>
> +#include <linux/cxl_err.h>
> #include "cper_cxl.h"
>
> #define PROT_ERR_VALID_AGENT_TYPE BIT_ULL(0)
> @@ -16,6 +17,7 @@
> #define PROT_ERR_VALID_SERIAL_NUMBER BIT_ULL(3)
> #define PROT_ERR_VALID_CAPABILITY BIT_ULL(4)
> #define PROT_ERR_VALID_DVSEC BIT_ULL(5)
> +#define PROT_ERR_VALID_ERROR_LOG BIT_ULL(6)
>
> static const char * const prot_err_agent_type_strs[] = {
> "Restricted CXL Device",
> @@ -84,4 +86,23 @@ void cper_print_prot_err(const char *pfx, const struct cper_sec_prot_err *prot_e
> break;
> }
> }
> +
> + if (prot_err->valid_bits & PROT_ERR_VALID_ERROR_LOG) {
> + size_t size = sizeof(*prot_err) + prot_err->dvsec_len;
> + struct ras_capability_regs *cxl_ras;
> +
> + pr_info("%s Error log length: 0x%04x\n", pfx, prot_err->err_len);
> +
> + pr_info("%s CXL Error Log:\n", pfx);
> + cxl_ras = (struct ras_capability_regs *)((long)prot_err + size);
> + pr_info("%s cxl_ras_uncor_status: 0x%08x, cxl_ras_uncor_mask: 0x%08x\n",
> + pfx, cxl_ras->uncor_status, cxl_ras->uncor_mask);
Is it worth splitting these up, so that we get a human readable line with the
individual fields broken out?
> + pr_info("%s cxl_ras_uncor_severity: 0x%08x\n", pfx,
> + cxl_ras->uncor_severity);
> + pr_info("%s cxl_ras_cor_status: 0x%08x, cxl_ras_cor_mask: 0x%08x\n",
> + pfx, cxl_ras->cor_status, cxl_ras->cor_mask);
Not outputting the cap_control register? Some of that might be useful.
> + pr_info("%s Header Log Registers:\n", pfx);
> + print_hex_dump(pfx, "", DUMP_PREFIX_OFFSET, 16, 4, cxl_ras->header_log,
> + sizeof(cxl_ras->header_log), 0);
> + }
> }
> diff --git a/include/linux/cxl_err.h b/include/linux/cxl_err.h
> new file mode 100644
> index 000000000000..c89dbb6c286f
> --- /dev/null
> +++ b/include/linux/cxl_err.h
> @@ -0,0 +1,21 @@
> +/* SPDX-License-Identifier: GPL-2.0-only */
> +/*
> + * Copyright (C) 2022 Advanced Micro Devices, Inc.
> + *
> + * Author: Smita Koralahalli <[email protected]>
> + */
> +
> +#ifndef LINUX_CXL_ERR_H
> +#define LINUX_CXL_ERR_H
> +
> +struct ras_capability_regs {
CXL r3.0 Spec reference plus prefix it with cxl_
Agreed with your comment at the top. Some discussion needed on where to
put this - or whether to delay figuring that out until a later stage.
> + u32 uncor_status;
> + u32 uncor_mask;
> + u32 uncor_severity;
> + u32 cor_status;
> + u32 cor_mask;
> + u32 cap_control;
> + u32 header_log[16];
> +};
> +
> +#endif //__CXL_ERR_
On 10/10/2022 7:34 AM, Jonathan Cameron wrote:
> On Fri, 7 Oct 2022 21:17:14 +0000
> Smita Koralahalli <[email protected]> wrote:
>
>
>> +
>> + if (prot_err->valid_bits & PROT_ERR_VALID_ERROR_LOG) {
>> + size_t size = sizeof(*prot_err) + prot_err->dvsec_len;
>> + struct ras_capability_regs *cxl_ras;
>> +
>> + pr_info("%s Error log length: 0x%04x\n", pfx, prot_err->err_len);
>> +
>> + pr_info("%s CXL Error Log:\n", pfx);
>> + cxl_ras = (struct ras_capability_regs *)((long)prot_err + size);
>> + pr_info("%s cxl_ras_uncor_status: 0x%08x, cxl_ras_uncor_mask: 0x%08x\n",
>> + pfx, cxl_ras->uncor_status, cxl_ras->uncor_mask);
> Is it worth splitting these up, so that we get a human readable line with the
> individual fields broken out?
Will do.
>
>> + pr_info("%s cxl_ras_uncor_severity: 0x%08x\n", pfx,
>> + cxl_ras->uncor_severity);
>> + pr_info("%s cxl_ras_cor_status: 0x%08x, cxl_ras_cor_mask: 0x%08x\n",
>> + pfx, cxl_ras->cor_status, cxl_ras->cor_mask);
> Not outputting the cap_control register? Some of that might be useful.
Will add.
>
>> + pr_info("%s Header Log Registers:\n", pfx);
>> + print_hex_dump(pfx, "", DUMP_PREFIX_OFFSET, 16, 4, cxl_ras->header_log,
>> + sizeof(cxl_ras->header_log), 0);
>> + }
>> }
>> diff --git a/include/linux/cxl_err.h b/include/linux/cxl_err.h
>> new file mode 100644
>> index 000000000000..c89dbb6c286f
>> --- /dev/null
>> +++ b/include/linux/cxl_err.h
>> @@ -0,0 +1,21 @@
>> +/* SPDX-License-Identifier: GPL-2.0-only */
>> +/*
>> + * Copyright (C) 2022 Advanced Micro Devices, Inc.
>> + *
>> + * Author: Smita Koralahalli <[email protected]>
>> + */
>> +
>> +#ifndef LINUX_CXL_ERR_H
>> +#define LINUX_CXL_ERR_H
>> +
>> +struct ras_capability_regs {
> CXL r3.0 Spec reference plus prefix it with cxl_
Okay.
>
> Agreed with your comment at the top. Some discussion needed on where to
> put this - or whether to delay figuring that out until a later stage.
Yes, more discussion needed here though!
Thanks,
Smita
>
>> + u32 uncor_status;
>> + u32 uncor_mask;
>> + u32 uncor_severity;
>> + u32 cor_status;
>> + u32 cor_mask;
>> + u32 cap_control;
>> + u32 header_log[16];
>> +};
>> +
>> +#endif //__CXL_ERR_
Hi Smita,
Smita Koralahalli wrote:
> This series adds decoding for the CXL Protocol Errors Common Platform
> Error Record.
Be sure to copy Ard Biesheuvel <[email protected]>, added, on
drivers/firmware/efi/ patches.
Along those lines, drivers/cxl/ developers have an idea of what is
contained in the new CXL protocol error records and why Linux might want
to decode them, others from outside drivers/cxl/ might not. It always
helps to have a small summary of the benefit to end users of the
motivation to apply a patch set.
>
> Smita Koralahalli (2):
> efi/cper, cxl: Decode CXL Protocol Error Section
> efi/cper, cxl: Decode CXL Error Log
>
> drivers/firmware/efi/Makefile | 2 +-
> drivers/firmware/efi/cper.c | 9 +++
> drivers/firmware/efi/cper_cxl.c | 108 ++++++++++++++++++++++++++++++++
> drivers/firmware/efi/cper_cxl.h | 58 +++++++++++++++++
> include/linux/cxl_err.h | 21 +++++++
> 5 files changed, 197 insertions(+), 1 deletion(-)
I notice no updates for the trace events in ghes_do_proc(), is that next
in your queue? That's ok to be a follow-on after v2.
Hi Dan,
On 10/21/2022 3:18 PM, Dan Williams wrote:
> Hi Smita,
>
> Smita Koralahalli wrote:
>> This series adds decoding for the CXL Protocol Errors Common Platform
>> Error Record.
> Be sure to copy Ard Biesheuvel <[email protected]>, added, on
> drivers/firmware/efi/ patches.
>
> Along those lines, drivers/cxl/ developers have an idea of what is
> contained in the new CXL protocol error records and why Linux might want
> to decode them, others from outside drivers/cxl/ might not. It always
> helps to have a small summary of the benefit to end users of the
> motivation to apply a patch set.
Sure, will include in my v2.
>
>> Smita Koralahalli (2):
>> efi/cper, cxl: Decode CXL Protocol Error Section
>> efi/cper, cxl: Decode CXL Error Log
>>
>> drivers/firmware/efi/Makefile | 2 +-
>> drivers/firmware/efi/cper.c | 9 +++
>> drivers/firmware/efi/cper_cxl.c | 108 ++++++++++++++++++++++++++++++++
>> drivers/firmware/efi/cper_cxl.h | 58 +++++++++++++++++
>> include/linux/cxl_err.h | 21 +++++++
>> 5 files changed, 197 insertions(+), 1 deletion(-)
> I notice no updates for the trace events in ghes_do_proc(), is that next
> in your queue? That's ok to be a follow-on after v2.
Sorry, if I haven't understood this right. Are you implying about the
"handling"
of cxl memory errors in ghes_do_proc() or is it just copying of CPER
entries to
tracepoints?
Thanks,
Smita
Smita Koralahalli wrote:
> Hi Dan,
>
> On 10/21/2022 3:18 PM, Dan Williams wrote:
> > Hi Smita,
> >
> > Smita Koralahalli wrote:
> >> This series adds decoding for the CXL Protocol Errors Common Platform
> >> Error Record.
> > Be sure to copy Ard Biesheuvel <[email protected]>, added, on
> > drivers/firmware/efi/ patches.
> >
> > Along those lines, drivers/cxl/ developers have an idea of what is
> > contained in the new CXL protocol error records and why Linux might want
> > to decode them, others from outside drivers/cxl/ might not. It always
> > helps to have a small summary of the benefit to end users of the
> > motivation to apply a patch set.
>
> Sure, will include in my v2.
>
> >
> >> Smita Koralahalli (2):
> >> efi/cper, cxl: Decode CXL Protocol Error Section
> >> efi/cper, cxl: Decode CXL Error Log
> >>
> >> drivers/firmware/efi/Makefile | 2 +-
> >> drivers/firmware/efi/cper.c | 9 +++
> >> drivers/firmware/efi/cper_cxl.c | 108 ++++++++++++++++++++++++++++++++
> >> drivers/firmware/efi/cper_cxl.h | 58 +++++++++++++++++
> >> include/linux/cxl_err.h | 21 +++++++
> >> 5 files changed, 197 insertions(+), 1 deletion(-)
> > I notice no updates for the trace events in ghes_do_proc(), is that next
> > in your queue? That's ok to be a follow-on after v2.
>
> Sorry, if I haven't understood this right. Are you implying about the
> "handling"
> of cxl memory errors in ghes_do_proc() or is it just copying of CPER
> entries to
> tracepoints?
Right now ghes_do_proc() will let the CXL CPER records fall through to
log_non_standard_event(). Are you planning to add trace event decode
there for CPER_SEC_CXL_PROT_ERR records?
I am not sure if the CXL CPER to trace record conversion belongs there,
or somewhere closer to trace_aer_event() invocations since the CXL
protocol errors are effectively an extenstion of PCI AER events.
On 10/25/2022 5:11 PM, Dan Williams wrote:
> Smita Koralahalli wrote:
>> Hi Dan,
>>
>> On 10/21/2022 3:18 PM, Dan Williams wrote:
>>> Hi Smita,
>>>
>>> Smita Koralahalli wrote:
>>>> This series adds decoding for the CXL Protocol Errors Common Platform
>>>> Error Record.
>>> Be sure to copy Ard Biesheuvel <[email protected]>, added, on
>>> drivers/firmware/efi/ patches.
>>>
>>> Along those lines, drivers/cxl/ developers have an idea of what is
>>> contained in the new CXL protocol error records and why Linux might want
>>> to decode them, others from outside drivers/cxl/ might not. It always
>>> helps to have a small summary of the benefit to end users of the
>>> motivation to apply a patch set.
>> Sure, will include in my v2.
>>
>>>> Smita Koralahalli (2):
>>>> efi/cper, cxl: Decode CXL Protocol Error Section
>>>> efi/cper, cxl: Decode CXL Error Log
>>>>
>>>> drivers/firmware/efi/Makefile | 2 +-
>>>> drivers/firmware/efi/cper.c | 9 +++
>>>> drivers/firmware/efi/cper_cxl.c | 108 ++++++++++++++++++++++++++++++++
>>>> drivers/firmware/efi/cper_cxl.h | 58 +++++++++++++++++
>>>> include/linux/cxl_err.h | 21 +++++++
>>>> 5 files changed, 197 insertions(+), 1 deletion(-)
>>> I notice no updates for the trace events in ghes_do_proc(), is that next
>>> in your queue? That's ok to be a follow-on after v2.
>> Sorry, if I haven't understood this right. Are you implying about the
>> "handling"
>> of cxl memory errors in ghes_do_proc() or is it just copying of CPER
>> entries to
>> tracepoints?
> Right now ghes_do_proc() will let the CXL CPER records fall through to
> log_non_standard_event(). Are you planning to add trace event decode
> there for CPER_SEC_CXL_PROT_ERR records?
Thanks! Yeah its a good idea to add. I did not think about this before.
I will send this as a separate patchset after v2.
I think with this cxl cper trace event support and Ira's patchset which
traces
specific event record types via Get Event Record, we can start the userspace
handling probably in rasdaemon?
>
> I am not sure if the CXL CPER to trace record conversion belongs there,
> or somewhere closer to trace_aer_event() invocations since the CXL
> protocol errors are effectively an extenstion of PCI AER events.
Right, I will keep it simple in v1 and get the comments about the
placement..
Thanks,
Smita
> On Oct 26, 2022, at 12:31 PM, Smita Koralahalli <[email protected]> wrote:
>
> On 10/25/2022 5:11 PM, Dan Williams wrote:
>> Smita Koralahalli wrote:
>>> Hi Dan,
>>>
>>> On 10/21/2022 3:18 PM, Dan Williams wrote:
>>>> Hi Smita,
>>>>
>>>> Smita Koralahalli wrote:
>>>>> This series adds decoding for the CXL Protocol Errors Common Platform
>>>>> Error Record.
>>>> Be sure to copy Ard Biesheuvel <[email protected]>, added, on
>>>> drivers/firmware/efi/ patches.
>>>>
>>>> Along those lines, drivers/cxl/ developers have an idea of what is
>>>> contained in the new CXL protocol error records and why Linux might want
>>>> to decode them, others from outside drivers/cxl/ might not. It always
>>>> helps to have a small summary of the benefit to end users of the
>>>> motivation to apply a patch set.
>>> Sure, will include in my v2.
>>>
>>>>> Smita Koralahalli (2):
>>>>> efi/cper, cxl: Decode CXL Protocol Error Section
>>>>> efi/cper, cxl: Decode CXL Error Log
>>>>>
>>>>> drivers/firmware/efi/Makefile | 2 +-
>>>>> drivers/firmware/efi/cper.c | 9 +++
>>>>> drivers/firmware/efi/cper_cxl.c | 108 ++++++++++++++++++++++++++++++++
>>>>> drivers/firmware/efi/cper_cxl.h | 58 +++++++++++++++++
>>>>> include/linux/cxl_err.h | 21 +++++++
>>>>> 5 files changed, 197 insertions(+), 1 deletion(-)
>>>> I notice no updates for the trace events in ghes_do_proc(), is that next
>>>> in your queue? That's ok to be a follow-on after v2.
>>> Sorry, if I haven't understood this right. Are you implying about the
>>> "handling"
>>> of cxl memory errors in ghes_do_proc() or is it just copying of CPER
>>> entries to
>>> tracepoints?
>> Right now ghes_do_proc() will let the CXL CPER records fall through to
>> log_non_standard_event(). Are you planning to add trace event decode
>> there for CPER_SEC_CXL_PROT_ERR records?
>
> Thanks! Yeah its a good idea to add. I did not think about this before.
> I will send this as a separate patchset after v2.
>
> I think with this cxl cper trace event support and Ira's patchset which traces
> specific event record types via Get Event Record, we can start the userspace
> handling probably in rasdaemon?
Yes, I think this makes sense. rasdaemon could aggregate data and provide user
with full picture:
* Memory errors from both processor attached memory and CXL memory.
* CXL protocol errors.
* CXL device errors.
Such errors may be handled either firmware first or OS first.
>
>>
>> I am not sure if the CXL CPER to trace record conversion belongs there,
>> or somewhere closer to trace_aer_event() invocations since the CXL
>> protocol errors are effectively an extenstion of PCI AER events.
>
> Right, I will keep it simple in v1 and get the comments about the placement..
>
> Thanks,
> Smita
>
>
>
Jonathan Zhang (Infra) wrote:
>
>
> > On Oct 26, 2022, at 12:31 PM, Smita Koralahalli <[email protected]> wrote:
> >
> > On 10/25/2022 5:11 PM, Dan Williams wrote:
> >> Smita Koralahalli wrote:
> >>> Hi Dan,
> >>>
> >>> On 10/21/2022 3:18 PM, Dan Williams wrote:
> >>>> Hi Smita,
> >>>>
> >>>> Smita Koralahalli wrote:
> >>>>> This series adds decoding for the CXL Protocol Errors Common Platform
> >>>>> Error Record.
> >>>> Be sure to copy Ard Biesheuvel <[email protected]>, added, on
> >>>> drivers/firmware/efi/ patches.
> >>>>
> >>>> Along those lines, drivers/cxl/ developers have an idea of what is
> >>>> contained in the new CXL protocol error records and why Linux might want
> >>>> to decode them, others from outside drivers/cxl/ might not. It always
> >>>> helps to have a small summary of the benefit to end users of the
> >>>> motivation to apply a patch set.
> >>> Sure, will include in my v2.
> >>>
> >>>>> Smita Koralahalli (2):
> >>>>> efi/cper, cxl: Decode CXL Protocol Error Section
> >>>>> efi/cper, cxl: Decode CXL Error Log
> >>>>>
> >>>>> drivers/firmware/efi/Makefile | 2 +-
> >>>>> drivers/firmware/efi/cper.c | 9 +++
> >>>>> drivers/firmware/efi/cper_cxl.c | 108 ++++++++++++++++++++++++++++++++
> >>>>> drivers/firmware/efi/cper_cxl.h | 58 +++++++++++++++++
> >>>>> include/linux/cxl_err.h | 21 +++++++
> >>>>> 5 files changed, 197 insertions(+), 1 deletion(-)
> >>>> I notice no updates for the trace events in ghes_do_proc(), is that next
> >>>> in your queue? That's ok to be a follow-on after v2.
> >>> Sorry, if I haven't understood this right. Are you implying about the
> >>> "handling"
> >>> of cxl memory errors in ghes_do_proc() or is it just copying of CPER
> >>> entries to
> >>> tracepoints?
> >> Right now ghes_do_proc() will let the CXL CPER records fall through to
> >> log_non_standard_event(). Are you planning to add trace event decode
> >> there for CPER_SEC_CXL_PROT_ERR records?
> >
> > Thanks! Yeah its a good idea to add. I did not think about this before.
> > I will send this as a separate patchset after v2.
> >
> > I think with this cxl cper trace event support and Ira's patchset which traces
> > specific event record types via Get Event Record, we can start the userspace
> > handling probably in rasdaemon?
> Yes, I think this makes sense. rasdaemon could aggregate data and provide user
> with full picture:
> * Memory errors from both processor attached memory and CXL memory.
> * CXL protocol errors.
> * CXL device errors.
> Such errors may be handled either firmware first or OS first.
I have no concerns about rasdaemon subscribing to CXL RAS events, but
the nice thing about trace-events is that any number of subscribers can
attach to the event stream. So I expect cxl-cli to have a monitor of
these CXL specific events and that does not preclude rasdaemon from
also incorporating CXL events into its event list.
On Fri, 28 Oct 2022 13:46:22 -0700
Dan Williams <[email protected]> wrote:
> Jonathan Zhang (Infra) wrote:
> >
> >
> > > On Oct 26, 2022, at 12:31 PM, Smita Koralahalli <[email protected]> wrote:
> > >
> > > On 10/25/2022 5:11 PM, Dan Williams wrote:
> > >> Smita Koralahalli wrote:
> > >>> Hi Dan,
> > >>>
> > >>> On 10/21/2022 3:18 PM, Dan Williams wrote:
> > >>>> Hi Smita,
> > >>>>
> > >>>> Smita Koralahalli wrote:
> > >>>>> This series adds decoding for the CXL Protocol Errors Common Platform
> > >>>>> Error Record.
> > >>>> Be sure to copy Ard Biesheuvel <[email protected]>, added, on
> > >>>> drivers/firmware/efi/ patches.
> > >>>>
> > >>>> Along those lines, drivers/cxl/ developers have an idea of what is
> > >>>> contained in the new CXL protocol error records and why Linux might want
> > >>>> to decode them, others from outside drivers/cxl/ might not. It always
> > >>>> helps to have a small summary of the benefit to end users of the
> > >>>> motivation to apply a patch set.
> > >>> Sure, will include in my v2.
> > >>>
> > >>>>> Smita Koralahalli (2):
> > >>>>> efi/cper, cxl: Decode CXL Protocol Error Section
> > >>>>> efi/cper, cxl: Decode CXL Error Log
> > >>>>>
> > >>>>> drivers/firmware/efi/Makefile | 2 +-
> > >>>>> drivers/firmware/efi/cper.c | 9 +++
> > >>>>> drivers/firmware/efi/cper_cxl.c | 108 ++++++++++++++++++++++++++++++++
> > >>>>> drivers/firmware/efi/cper_cxl.h | 58 +++++++++++++++++
> > >>>>> include/linux/cxl_err.h | 21 +++++++
> > >>>>> 5 files changed, 197 insertions(+), 1 deletion(-)
> > >>>> I notice no updates for the trace events in ghes_do_proc(), is that next
> > >>>> in your queue? That's ok to be a follow-on after v2.
> > >>> Sorry, if I haven't understood this right. Are you implying about the
> > >>> "handling"
> > >>> of cxl memory errors in ghes_do_proc() or is it just copying of CPER
> > >>> entries to
> > >>> tracepoints?
> > >> Right now ghes_do_proc() will let the CXL CPER records fall through to
> > >> log_non_standard_event(). Are you planning to add trace event decode
> > >> there for CPER_SEC_CXL_PROT_ERR records?
> > >
> > > Thanks! Yeah its a good idea to add. I did not think about this before.
> > > I will send this as a separate patchset after v2.
> > >
> > > I think with this cxl cper trace event support and Ira's patchset which traces
> > > specific event record types via Get Event Record, we can start the userspace
> > > handling probably in rasdaemon?
> > Yes, I think this makes sense. rasdaemon could aggregate data and provide user
> > with full picture:
> > * Memory errors from both processor attached memory and CXL memory.
> > * CXL protocol errors.
> > * CXL device errors.
> > Such errors may be handled either firmware first or OS first.
>
> I have no concerns about rasdaemon subscribing to CXL RAS events, but
> the nice thing about trace-events is that any number of subscribers can
> attach to the event stream. So I expect cxl-cli to have a monitor of
> these CXL specific events and that does not preclude rasdaemon from
> also incorporating CXL events into its event list.
FYI, we posted some poison list RAS daemon patches a while back.
https://lore.kernel.org/all/[email protected]/
Absolutely agree we'll want all the rest of these once kernel patches are in
(and hence we know the tracepoints definitions are stable)
If anyone is working on the RASdaemon side of things, shout on the list as
I'd rather not see duplication of effort.
Jonathan