2024-01-02 15:10:26

by Smita Koralahalli

[permalink] [raw]
Subject: [PATCH 0/4] acpi/ghes, cper, cxl: Trace FW-First CXL Protocol Errors

This patchset adds trace event support for FW-First Protocol Errors.

This series depends on:
https://lore.kernel.org/linux-cxl/[email protected]

Smita Koralahalli (4):
acpi/ghes, cxl: Create a common CXL struct to handle different CXL
CPER records
efi/cper, cxl: Make definitions and structures global
acpi/ghes, efi/cper: Recognize and process CXL Protocol Errors.
acpi/ghes, cxl/pci: Trace FW-First CXL Protocol Errors

drivers/acpi/apei/ghes.c | 26 +++++++++++++-
drivers/cxl/core/pci.c | 46 ++++++++++++++++++++++++
drivers/cxl/cxlpci.h | 3 ++
drivers/cxl/pci.c | 13 ++++---
drivers/firmware/efi/cper_cxl.c | 63 +++++++++++++++++++++++++++------
drivers/firmware/efi/cper_cxl.h | 7 ++--
include/linux/cper.h | 4 +++
include/linux/cxl-event.h | 24 ++++++++++++-
8 files changed, 164 insertions(+), 22 deletions(-)

--
2.17.1



2024-01-02 15:10:47

by Smita Koralahalli

[permalink] [raw]
Subject: [PATCH 2/4] efi/cper, cxl: Make definitions and structures global

In preparation to add tracepoint support, move protocol error UUID
definition to a common location and make CXL RAS capability struct
global for use across different modules.

Signed-off-by: Smita Koralahalli <[email protected]>
---
drivers/firmware/efi/cper_cxl.c | 11 -----------
drivers/firmware/efi/cper_cxl.h | 7 ++-----
include/linux/cper.h | 4 ++++
include/linux/cxl-event.h | 11 +++++++++++
4 files changed, 17 insertions(+), 16 deletions(-)

diff --git a/drivers/firmware/efi/cper_cxl.c b/drivers/firmware/efi/cper_cxl.c
index a55771b99a97..4fd8d783993e 100644
--- a/drivers/firmware/efi/cper_cxl.c
+++ b/drivers/firmware/efi/cper_cxl.c
@@ -18,17 +18,6 @@
#define PROT_ERR_VALID_DVSEC BIT_ULL(5)
#define PROT_ERR_VALID_ERROR_LOG BIT_ULL(6)

-/* CXL RAS Capability Structure, CXL v3.0 sec 8.2.4.16 */
-struct cxl_ras_capability_regs {
- u32 uncor_status;
- u32 uncor_mask;
- u32 uncor_severity;
- u32 cor_status;
- u32 cor_mask;
- u32 cap_control;
- u32 header_log[16];
-};
-
static const char * const prot_err_agent_type_strs[] = {
"Restricted CXL Device",
"Restricted CXL Host Downstream Port",
diff --git a/drivers/firmware/efi/cper_cxl.h b/drivers/firmware/efi/cper_cxl.h
index 86bfcf7909ec..6f8c00495708 100644
--- a/drivers/firmware/efi/cper_cxl.h
+++ b/drivers/firmware/efi/cper_cxl.h
@@ -7,14 +7,11 @@
* Author: Smita Koralahalli <[email protected]>
*/

+#include <linux/cxl-event.h>
+
#ifndef LINUX_CPER_CXL_H
#define LINUX_CPER_CXL_H

-/* CXL Protocol Error Section */
-#define CPER_SEC_CXL_PROT_ERR \
- GUID_INIT(0x80B9EFB4, 0x52B5, 0x4DE3, 0xA7, 0x77, 0x68, 0x78, \
- 0x4B, 0x77, 0x10, 0x48)
-
#pragma pack(1)

/* Compute Express Link Protocol Error Section, UEFI v2.10 sec N.2.13 */
diff --git a/include/linux/cper.h b/include/linux/cper.h
index c1a7dc325121..2cbf0a93785a 100644
--- a/include/linux/cper.h
+++ b/include/linux/cper.h
@@ -89,6 +89,10 @@ enum {
#define CPER_NOTIFY_DMAR \
GUID_INIT(0x667DD791, 0xC6B3, 0x4c27, 0x8A, 0x6B, 0x0F, 0x8E, \
0x72, 0x2D, 0xEB, 0x41)
+/* CXL Protocol Error Section */
+#define CPER_SEC_CXL_PROT_ERR \
+ GUID_INIT(0x80B9EFB4, 0x52B5, 0x4DE3, 0xA7, 0x77, 0x68, 0x78, \
+ 0x4B, 0x77, 0x10, 0x48)

/*
* Flags bits definitions for flags in struct cper_record_header
diff --git a/include/linux/cxl-event.h b/include/linux/cxl-event.h
index afa71ee0437c..90d8390a73cb 100644
--- a/include/linux/cxl-event.h
+++ b/include/linux/cxl-event.h
@@ -141,6 +141,17 @@ struct cxl_cper_event_rec {
union cxl_event event;
} __packed;

+/* CXL RAS Capability Structure, CXL v3.0 sec 8.2.4.16 */
+struct cxl_ras_capability_regs {
+ u32 uncor_status;
+ u32 uncor_mask;
+ u32 uncor_severity;
+ u32 cor_status;
+ u32 cor_mask;
+ u32 cap_control;
+ u32 header_log[16];
+};
+
struct cxl_cper_rec_data {
struct cxl_cper_event_rec rec;
};
--
2.17.1


2024-01-02 15:10:50

by Smita Koralahalli

[permalink] [raw]
Subject: [PATCH 1/4] acpi/ghes, cxl: Create a common CXL struct to handle different CXL CPER records

Currently defined cxl_cper_callback interface between CXL subsystem and
GHES module is just confined to handling CXL Component errors only.

Extend this callback to process CXL Protocol errors as well. Achieve
by defining a new struct cxl_cper_rec_data to include cxl_cper_event_rec
and other fields of CXL protocol errors which will be defined in future
patches.

Signed-off-by: Smita Koralahalli <[email protected]>
---
drivers/acpi/apei/ghes.c | 6 +++++-
drivers/cxl/pci.c | 8 ++++----
include/linux/cxl-event.h | 6 +++++-
3 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index aed465d2fd68..d6a85fbc0a8b 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -693,6 +693,10 @@ static cxl_cper_callback cper_callback;
static void cxl_cper_post_event(enum cxl_event_type event_type,
struct cxl_cper_event_rec *rec)
{
+ struct cxl_cper_rec_data data;
+
+ data.rec = *(struct cxl_cper_event_rec *)rec;
+
if (rec->hdr.length <= sizeof(rec->hdr) ||
rec->hdr.length > sizeof(*rec)) {
pr_err(FW_WARN "CXL CPER Invalid section length (%u)\n",
@@ -707,7 +711,7 @@ static void cxl_cper_post_event(enum cxl_event_type event_type,

guard(rwsem_read)(&cxl_cper_rw_sem);
if (cper_callback)
- cper_callback(event_type, rec);
+ cper_callback(event_type, &data);
}

int cxl_cper_register_callback(cxl_cper_callback callback)
diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c
index b14237f824cf..7edbd53357e5 100644
--- a/drivers/cxl/pci.c
+++ b/drivers/cxl/pci.c
@@ -972,9 +972,9 @@ static struct pci_driver cxl_pci_driver = {

#define CXL_EVENT_HDR_FLAGS_REC_SEVERITY GENMASK(1, 0)
static void cxl_cper_event_call(enum cxl_event_type ev_type,
- struct cxl_cper_event_rec *rec)
+ struct cxl_cper_rec_data *data)
{
- struct cper_cxl_event_devid *device_id = &rec->hdr.device_id;
+ struct cper_cxl_event_devid *device_id = &data->rec.hdr.device_id;
struct pci_dev *pdev __free(pci_dev_put) = NULL;
enum cxl_event_log_type log_type;
struct cxl_dev_state *cxlds;
@@ -996,11 +996,11 @@ static void cxl_cper_event_call(enum cxl_event_type ev_type,
return;

/* Fabricate a log type */
- hdr_flags = get_unaligned_le24(rec->event.generic.hdr.flags);
+ hdr_flags = get_unaligned_le24(data->rec.event.generic.hdr.flags);
log_type = FIELD_GET(CXL_EVENT_HDR_FLAGS_REC_SEVERITY, hdr_flags);

cxl_event_trace_record(cxlds->cxlmd, log_type, ev_type,
- &uuid_null, &rec->event);
+ &uuid_null, &data->rec.event);
}

static int __init cxl_pci_driver_init(void)
diff --git a/include/linux/cxl-event.h b/include/linux/cxl-event.h
index 17eadee819b6..afa71ee0437c 100644
--- a/include/linux/cxl-event.h
+++ b/include/linux/cxl-event.h
@@ -141,8 +141,12 @@ struct cxl_cper_event_rec {
union cxl_event event;
} __packed;

+struct cxl_cper_rec_data {
+ struct cxl_cper_event_rec rec;
+};
+
typedef void (*cxl_cper_callback)(enum cxl_event_type type,
- struct cxl_cper_event_rec *rec);
+ struct cxl_cper_rec_data *data);

#ifdef CONFIG_ACPI_APEI_GHES
int cxl_cper_register_callback(cxl_cper_callback callback);
--
2.17.1


2024-01-02 15:11:16

by Smita Koralahalli

[permalink] [raw]
Subject: [PATCH 3/4] acpi/ghes, efi/cper: Recognize and process CXL Protocol Errors.

UEFI v2.10 section N.2.13 defines a CPER record for CXL Protocol errors.

Add GHES support to detect CXL CPER Protocol record and cache error
severity, device_id, serial number and a pointer to CXL RAS capability
struct in struct cxl_cper_rec_data.

Signed-off-by: Smita Koralahalli <[email protected]>
---
drivers/acpi/apei/ghes.c | 15 ++++++++++
drivers/firmware/efi/cper_cxl.c | 52 +++++++++++++++++++++++++++++++++
include/linux/cxl-event.h | 6 ++++
3 files changed, 73 insertions(+)

diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index d6a85fbc0a8b..6471584b2e79 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -714,6 +714,18 @@ static void cxl_cper_post_event(enum cxl_event_type event_type,
cper_callback(event_type, &data);
}

+void cxl_cper_handle_prot_err(struct acpi_hest_generic_data *gdata)
+{
+ struct cxl_cper_rec_data data;
+
+ memset(&data, 0, sizeof(data));
+
+ if (cxl_cper_handle_prot_err_info(gdata, &data))
+ return;
+
+ data.severity = gdata->error_severity;
+}
+
int cxl_cper_register_callback(cxl_cper_callback callback)
{
guard(rwsem_write)(&cxl_cper_rw_sem);
@@ -768,6 +780,9 @@ static bool ghes_do_proc(struct ghes *ghes,
else if (guid_equal(sec_type, &CPER_SEC_PROC_ARM)) {
queued = ghes_handle_arm_hw_error(gdata, sev);
}
+ else if (guid_equal(sec_type, &CPER_SEC_CXL_PROT_ERR)) {
+ cxl_cper_handle_prot_err(gdata);
+ }
else if (guid_equal(sec_type, &CPER_SEC_CXL_GEN_MEDIA_GUID)) {
struct cxl_cper_event_rec *rec = acpi_hest_get_payload(gdata);

diff --git a/drivers/firmware/efi/cper_cxl.c b/drivers/firmware/efi/cper_cxl.c
index 4fd8d783993e..3bc0b9f28c9e 100644
--- a/drivers/firmware/efi/cper_cxl.c
+++ b/drivers/firmware/efi/cper_cxl.c
@@ -8,6 +8,7 @@
*/

#include <linux/cper.h>
+#include <acpi/ghes.h>
#include "cper_cxl.h"

#define PROT_ERR_VALID_AGENT_TYPE BIT_ULL(0)
@@ -176,3 +177,54 @@ void cper_print_prot_err(const char *pfx, const struct cper_sec_prot_err *prot_e
sizeof(cxl_ras->header_log), 0);
}
}
+
+int cxl_cper_handle_prot_err_info(struct acpi_hest_generic_data *gdata,
+ struct cxl_cper_rec_data *data)
+{
+ struct cper_sec_prot_err *prot_err = acpi_hest_get_payload(gdata);
+ struct cper_cxl_event_devid *device_id = &data->rec.hdr.device_id;
+ struct cper_cxl_event_sn *dev_serial_num = &data->rec.hdr.dev_serial_num;
+ size_t size = sizeof(*prot_err) + prot_err->dvsec_len;
+
+ if (!(prot_err->valid_bits & PROT_ERR_VALID_ERROR_LOG)) {
+ pr_err(FW_WARN "Not a valid protocol error log\n");
+ return -EINVAL;
+ }
+
+ if (!(prot_err->valid_bits & PROT_ERR_VALID_DEVICE_ID)) {
+ pr_err(FW_WARN "Not a valid Device ID\n");
+ return -EINVAL;
+ }
+
+ /*
+ * The device ID or agent address is only valid for CXL 1.1 device,
+ * CXL 2.0 device, CXL 2.0 Logical device, CXL 2.0 Fabric Manager
+ * Managed Logical Device, CXL Root Port, CXL Downstream Switch
+ * Port, or CXL Upstream Switch Port.
+ */
+ if (prot_err->agent_type <= 0x7 && prot_err->agent_type != RCH_DP) {
+ device_id->segment_num = prot_err->agent_addr.segment;
+ device_id->bus_num = prot_err->agent_addr.bus;
+ device_id->device_num = prot_err->agent_addr.device;
+ device_id->func_num = prot_err->agent_addr.function;
+ } else {
+ pr_err(FW_WARN "Not a valid agent type\n");
+ return -EINVAL;
+ }
+
+ /*
+ * The device serial number is only valid for CXL 1.1 device, CXL
+ * 2.0 device, CXL 2.0 Logical device, or CXL 2.0 Fabric Manager
+ * Managed Logical Device.
+ */
+ if (!(prot_err->valid_bits & PROT_ERR_VALID_SERIAL_NUMBER) ||
+ prot_err->agent_type > 0x4 || prot_err->agent_type == RCH_DP)
+ pr_warn(FW_WARN "No valid serial number present\n");
+
+ dev_serial_num->lower_dw = prot_err->dev_serial_num.lower_dw;
+ dev_serial_num->upper_dw = prot_err->dev_serial_num.upper_dw;
+
+ data->cxl_ras = (struct cxl_ras_capability_regs *)((long)prot_err + size);
+
+ return 0;
+}
diff --git a/include/linux/cxl-event.h b/include/linux/cxl-event.h
index 90d8390a73cb..7ba2dfd6619e 100644
--- a/include/linux/cxl-event.h
+++ b/include/linux/cxl-event.h
@@ -154,11 +154,17 @@ struct cxl_ras_capability_regs {

struct cxl_cper_rec_data {
struct cxl_cper_event_rec rec;
+ struct cxl_ras_capability_regs *cxl_ras;
+ int severity;
};

typedef void (*cxl_cper_callback)(enum cxl_event_type type,
struct cxl_cper_rec_data *data);

+struct acpi_hest_generic_data;
+int cxl_cper_handle_prot_err_info(struct acpi_hest_generic_data *gdata,
+ struct cxl_cper_rec_data *data);
+
#ifdef CONFIG_ACPI_APEI_GHES
int cxl_cper_register_callback(cxl_cper_callback callback);
int cxl_cper_unregister_callback(cxl_cper_callback callback);
--
2.17.1


2024-01-02 15:11:26

by Smita Koralahalli

[permalink] [raw]
Subject: [PATCH 4/4] acpi/ghes, cxl/pci: Trace FW-First CXL Protocol Errors

When PCIe AER is in FW-First, OS should process CXL Protocol errors from
CPER records. These CPER records obtained from GHES module, will rely on
a registered callback to be notified to the CXL subsystem in order to be
processed.

Call the existing cxl_cper_callback to notify the CXL subsystem on a
Protocol error.

The defined trace events cxl_aer_uncorrectable_error and
cxl_aer_correctable_error currently trace native CXL AER errors. Reuse
them to trace FW-First Protocol Errors.

Signed-off-by: Smita Koralahalli <[email protected]>
---
drivers/acpi/apei/ghes.c | 5 +++++
drivers/cxl/core/pci.c | 46 +++++++++++++++++++++++++++++++++++++++
drivers/cxl/cxlpci.h | 3 +++
drivers/cxl/pci.c | 5 +++++
include/linux/cxl-event.h | 1 +
5 files changed, 60 insertions(+)

diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index 6471584b2e79..217494c7c884 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -716,6 +716,7 @@ static void cxl_cper_post_event(enum cxl_event_type event_type,

void cxl_cper_handle_prot_err(struct acpi_hest_generic_data *gdata)
{
+ enum cxl_event_type event_type = CXL_CPER_EVENT_PROT_ERR;
struct cxl_cper_rec_data data;

memset(&data, 0, sizeof(data));
@@ -724,6 +725,10 @@ void cxl_cper_handle_prot_err(struct acpi_hest_generic_data *gdata)
return;

data.severity = gdata->error_severity;
+
+ guard(rwsem_read)(&cxl_cper_rw_sem);
+ if (cper_callback)
+ cper_callback(event_type, &data);
}

int cxl_cper_register_callback(cxl_cper_callback callback)
diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c
index 37e1652afbc7..da516982a625 100644
--- a/drivers/cxl/core/pci.c
+++ b/drivers/cxl/core/pci.c
@@ -6,6 +6,7 @@
#include <linux/pci.h>
#include <linux/pci-doe.h>
#include <linux/aer.h>
+#include <linux/cper.h>
#include <cxlpci.h>
#include <cxlmem.h>
#include <cxl.h>
@@ -836,6 +837,51 @@ void cxl_setup_parent_dport(struct device *host, struct cxl_dport *dport)
}
EXPORT_SYMBOL_NS_GPL(cxl_setup_parent_dport, CXL);

+#define CXL_AER_UNCORRECTABLE 0
+#define CXL_AER_CORRECTABLE 1
+
+int cper_severity_cxl_aer(int cper_severity)
+{
+ switch (cper_severity) {
+ case CPER_SEV_RECOVERABLE:
+ case CPER_SEV_FATAL:
+ return CXL_AER_UNCORRECTABLE;
+ default:
+ return CXL_AER_CORRECTABLE;
+ }
+}
+
+void cxl_prot_err_trace_record(struct cxl_dev_state *cxlds,
+ struct cxl_cper_rec_data *data)
+{
+ struct cper_cxl_event_sn *dev_serial_num = &data->rec.hdr.dev_serial_num;
+ u32 status, fe;
+ int severity;
+
+ severity = cper_severity_cxl_aer(data->severity);
+
+ cxlds->serial = (((u64)dev_serial_num->upper_dw << 32) |
+ dev_serial_num->lower_dw);
+
+ if (severity == CXL_AER_CORRECTABLE) {
+ status = data->cxl_ras->cor_status & ~data->cxl_ras->cor_mask;
+
+ trace_cxl_aer_correctable_error(cxlds->cxlmd, status);
+ } else {
+ status = data->cxl_ras->uncor_status & ~data->cxl_ras->uncor_mask;
+
+ if (hweight32(status) > 1)
+ fe = BIT(FIELD_GET(CXL_RAS_CAP_CONTROL_FE_MASK,
+ data->cxl_ras->cap_control));
+ else
+ fe = status;
+
+ trace_cxl_aer_uncorrectable_error(cxlds->cxlmd, status, fe,
+ data->cxl_ras->header_log);
+ }
+}
+EXPORT_SYMBOL_NS_GPL(cxl_prot_err_trace_record, CXL);
+
static void cxl_handle_rdport_cor_ras(struct cxl_dev_state *cxlds,
struct cxl_dport *dport)
{
diff --git a/drivers/cxl/cxlpci.h b/drivers/cxl/cxlpci.h
index 0fa4799ea316..462f1f9e82b0 100644
--- a/drivers/cxl/cxlpci.h
+++ b/drivers/cxl/cxlpci.h
@@ -93,4 +93,7 @@ void read_cdat_data(struct cxl_port *port);
void cxl_cor_error_detected(struct pci_dev *pdev);
pci_ers_result_t cxl_error_detected(struct pci_dev *pdev,
pci_channel_state_t state);
+struct cxl_cper_rec_data;
+void cxl_prot_err_trace_record(struct cxl_dev_state *cxlds,
+ struct cxl_cper_rec_data *data);
#endif /* __CXL_PCI_H__ */
diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c
index 7edbd53357e5..2b9f4dbf06c9 100644
--- a/drivers/cxl/pci.c
+++ b/drivers/cxl/pci.c
@@ -995,6 +995,11 @@ static void cxl_cper_event_call(enum cxl_event_type ev_type,
if (!cxlds)
return;

+ if (ev_type == CXL_CPER_EVENT_PROT_ERR) {
+ cxl_prot_err_trace_record(cxlds, data);
+ return;
+ }
+
/* Fabricate a log type */
hdr_flags = get_unaligned_le24(data->rec.event.generic.hdr.flags);
log_type = FIELD_GET(CXL_EVENT_HDR_FLAGS_REC_SEVERITY, hdr_flags);
diff --git a/include/linux/cxl-event.h b/include/linux/cxl-event.h
index 7ba2dfd6619e..b4558f206f59 100644
--- a/include/linux/cxl-event.h
+++ b/include/linux/cxl-event.h
@@ -113,6 +113,7 @@ enum cxl_event_type {
CXL_CPER_EVENT_GEN_MEDIA,
CXL_CPER_EVENT_DRAM,
CXL_CPER_EVENT_MEM_MODULE,
+ CXL_CPER_EVENT_PROT_ERR,
};

#define CPER_CXL_DEVICE_ID_VALID BIT(0)
--
2.17.1


2024-01-02 16:23:30

by Ira Weiny

[permalink] [raw]
Subject: Re: [PATCH 1/4] acpi/ghes, cxl: Create a common CXL struct to handle different CXL CPER records

Smita Koralahalli wrote:
> Currently defined cxl_cper_callback interface between CXL subsystem and
> GHES module is just confined to handling CXL Component errors only.
>
> Extend this callback to process CXL Protocol errors as well. Achieve
> by defining a new struct cxl_cper_rec_data to include cxl_cper_event_rec
> and other fields of CXL protocol errors which will be defined in future
> patches.
>
> Signed-off-by: Smita Koralahalli <[email protected]>

[snip]

>
> static int __init cxl_pci_driver_init(void)
> diff --git a/include/linux/cxl-event.h b/include/linux/cxl-event.h
> index 17eadee819b6..afa71ee0437c 100644
> --- a/include/linux/cxl-event.h
> +++ b/include/linux/cxl-event.h
> @@ -141,8 +141,12 @@ struct cxl_cper_event_rec {
> union cxl_event event;
> } __packed;
>
> +struct cxl_cper_rec_data {
> + struct cxl_cper_event_rec rec;

NIT: I would call this something like event to distinguish it from other
record data.

Other than that this seems reasonable to me.

Reviewed-by: Ira Weiny <[email protected]>

[snip]

2024-01-02 16:30:47

by Ira Weiny

[permalink] [raw]
Subject: Re: [PATCH 2/4] efi/cper, cxl: Make definitions and structures global

Smita Koralahalli wrote:
> In preparation to add tracepoint support, move protocol error UUID
> definition to a common location and make CXL RAS capability struct
> global for use across different modules.
>
> Signed-off-by: Smita Koralahalli <[email protected]>

[snip]

> diff --git a/drivers/firmware/efi/cper_cxl.h b/drivers/firmware/efi/cper_cxl.h
> index 86bfcf7909ec..6f8c00495708 100644
> --- a/drivers/firmware/efi/cper_cxl.h
> +++ b/drivers/firmware/efi/cper_cxl.h
> @@ -7,14 +7,11 @@
> * Author: Smita Koralahalli <[email protected]>
> */
>
> +#include <linux/cxl-event.h>
> +
> #ifndef LINUX_CPER_CXL_H
> #define LINUX_CPER_CXL_H
>
> -/* CXL Protocol Error Section */
> -#define CPER_SEC_CXL_PROT_ERR \
> - GUID_INIT(0x80B9EFB4, 0x52B5, 0x4DE3, 0xA7, 0x77, 0x68, 0x78, \
> - 0x4B, 0x77, 0x10, 0x48)
> -
> #pragma pack(1)
>
> /* Compute Express Link Protocol Error Section, UEFI v2.10 sec N.2.13 */
> diff --git a/include/linux/cper.h b/include/linux/cper.h
> index c1a7dc325121..2cbf0a93785a 100644
> --- a/include/linux/cper.h
> +++ b/include/linux/cper.h
> @@ -89,6 +89,10 @@ enum {
> #define CPER_NOTIFY_DMAR \
> GUID_INIT(0x667DD791, 0xC6B3, 0x4c27, 0x8A, 0x6B, 0x0F, 0x8E, \
> 0x72, 0x2D, 0xEB, 0x41)
> +/* CXL Protocol Error Section */
> +#define CPER_SEC_CXL_PROT_ERR \
> + GUID_INIT(0x80B9EFB4, 0x52B5, 0x4DE3, 0xA7, 0x77, 0x68, 0x78, \
> + 0x4B, 0x77, 0x10, 0x48)

Is this shared with code outside of GHES? I did not need my GUID defines
outside of ghes.c and further becuase the events are defined as UUID's I
chose to keep the GUID definition as local as possible to ghes.c.

Can you do the same with this define?

The rest looks good,
Ira

[snip]

2024-01-02 17:58:53

by Ira Weiny

[permalink] [raw]
Subject: Re: [PATCH 3/4] acpi/ghes, efi/cper: Recognize and process CXL Protocol Errors.

Smita Koralahalli wrote:
> UEFI v2.10 section N.2.13 defines a CPER record for CXL Protocol errors.
>
> Add GHES support to detect CXL CPER Protocol record and cache error
> severity, device_id, serial number and a pointer to CXL RAS capability
> struct in struct cxl_cper_rec_data.
>
> Signed-off-by: Smita Koralahalli <[email protected]>
> ---
> drivers/acpi/apei/ghes.c | 15 ++++++++++
> drivers/firmware/efi/cper_cxl.c | 52 +++++++++++++++++++++++++++++++++
> include/linux/cxl-event.h | 6 ++++
> 3 files changed, 73 insertions(+)
>
> diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
> index d6a85fbc0a8b..6471584b2e79 100644
> --- a/drivers/acpi/apei/ghes.c
> +++ b/drivers/acpi/apei/ghes.c
> @@ -714,6 +714,18 @@ static void cxl_cper_post_event(enum cxl_event_type event_type,
> cper_callback(event_type, &data);
> }
>
> +void cxl_cper_handle_prot_err(struct acpi_hest_generic_data *gdata)
> +{
> + struct cxl_cper_rec_data data;
> +
> + memset(&data, 0, sizeof(data));

Does this need to be done? Could cxl_cper_handle_prot_err_info() use
something like this initially?

*data = (struct cxl_cper_rec_data) {
.rec.hdr.dev_serial_num.lower_dw = prot_err->dev_serial_num.lower_dw;
.rec.hdr.dev_serial_num.upper_dw = prot_err->dev_serial_num.upper_dw;
};

The serial number is always set even if it is not valid.

> +
> + if (cxl_cper_handle_prot_err_info(gdata, &data))
> + return;
> +
> + data.severity = gdata->error_severity;
> +}
> +
> int cxl_cper_register_callback(cxl_cper_callback callback)
> {
> guard(rwsem_write)(&cxl_cper_rw_sem);
> @@ -768,6 +780,9 @@ static bool ghes_do_proc(struct ghes *ghes,
> else if (guid_equal(sec_type, &CPER_SEC_PROC_ARM)) {
> queued = ghes_handle_arm_hw_error(gdata, sev);
> }
> + else if (guid_equal(sec_type, &CPER_SEC_CXL_PROT_ERR)) {
> + cxl_cper_handle_prot_err(gdata);
> + }
> else if (guid_equal(sec_type, &CPER_SEC_CXL_GEN_MEDIA_GUID)) {
> struct cxl_cper_event_rec *rec = acpi_hest_get_payload(gdata);
>
> diff --git a/drivers/firmware/efi/cper_cxl.c b/drivers/firmware/efi/cper_cxl.c
> index 4fd8d783993e..3bc0b9f28c9e 100644
> --- a/drivers/firmware/efi/cper_cxl.c
> +++ b/drivers/firmware/efi/cper_cxl.c
> @@ -8,6 +8,7 @@
> */
>
> #include <linux/cper.h>
> +#include <acpi/ghes.h>
> #include "cper_cxl.h"
>
> #define PROT_ERR_VALID_AGENT_TYPE BIT_ULL(0)
> @@ -176,3 +177,54 @@ void cper_print_prot_err(const char *pfx, const struct cper_sec_prot_err *prot_e
> sizeof(cxl_ras->header_log), 0);
> }
> }
> +
> +int cxl_cper_handle_prot_err_info(struct acpi_hest_generic_data *gdata,
> + struct cxl_cper_rec_data *data)
> +{
> + struct cper_sec_prot_err *prot_err = acpi_hest_get_payload(gdata);
> + struct cper_cxl_event_devid *device_id = &data->rec.hdr.device_id;
> + struct cper_cxl_event_sn *dev_serial_num = &data->rec.hdr.dev_serial_num;
> + size_t size = sizeof(*prot_err) + prot_err->dvsec_len;
> +
> + if (!(prot_err->valid_bits & PROT_ERR_VALID_ERROR_LOG)) {
> + pr_err(FW_WARN "Not a valid protocol error log\n");
> + return -EINVAL;
> + }
> +
> + if (!(prot_err->valid_bits & PROT_ERR_VALID_DEVICE_ID)) {
> + pr_err(FW_WARN "Not a valid Device ID\n");
> + return -EINVAL;
> + }
> +
> + /*
> + * The device ID or agent address is only valid for CXL 1.1 device,
> + * CXL 2.0 device, CXL 2.0 Logical device, CXL 2.0 Fabric Manager
> + * Managed Logical Device, CXL Root Port, CXL Downstream Switch
> + * Port, or CXL Upstream Switch Port.
> + */
> + if (prot_err->agent_type <= 0x7 && prot_err->agent_type != RCH_DP) {
> + device_id->segment_num = prot_err->agent_addr.segment;
> + device_id->bus_num = prot_err->agent_addr.bus;
> + device_id->device_num = prot_err->agent_addr.device;
> + device_id->func_num = prot_err->agent_addr.function;
> + } else {
> + pr_err(FW_WARN "Not a valid agent type\n");
> + return -EINVAL;
> + }
> +
> + /*
> + * The device serial number is only valid for CXL 1.1 device, CXL
> + * 2.0 device, CXL 2.0 Logical device, or CXL 2.0 Fabric Manager
> + * Managed Logical Device.
> + */
> + if (!(prot_err->valid_bits & PROT_ERR_VALID_SERIAL_NUMBER) ||
> + prot_err->agent_type > 0x4 || prot_err->agent_type == RCH_DP)
> + pr_warn(FW_WARN "No valid serial number present\n");
> +
> + dev_serial_num->lower_dw = prot_err->dev_serial_num.lower_dw;
> + dev_serial_num->upper_dw = prot_err->dev_serial_num.upper_dw;
> +
> + data->cxl_ras = (struct cxl_ras_capability_regs *)((long)prot_err + size);

I think this is ok now because the cxl trace code processes the data
before returning (in next patch). But I'm a bit leary about passing a
pointer to data controlled by the acpi sub-system. At some point the
event may get cached to be processed by another thread and it might be
better to copy the struct here.

> +
> + return 0;
> +}
> diff --git a/include/linux/cxl-event.h b/include/linux/cxl-event.h
> index 90d8390a73cb..7ba2dfd6619e 100644
> --- a/include/linux/cxl-event.h
> +++ b/include/linux/cxl-event.h
> @@ -154,11 +154,17 @@ struct cxl_ras_capability_regs {
>
> struct cxl_cper_rec_data {
> struct cxl_cper_event_rec rec;
> + struct cxl_ras_capability_regs *cxl_ras;
> + int severity;

NIT: When I saw this without any addition to event type I wondered if the
series would bisect. I see it does because the record is not sent until
the next patch. But I wonder if the 2 patches would be best reversed.

Also, should cxl_ras + severity be put in a protocol error sub-struct?
Does severity ever apply to event records?

Ira

> };
>
> typedef void (*cxl_cper_callback)(enum cxl_event_type type,
> struct cxl_cper_rec_data *data);
>
> +struct acpi_hest_generic_data;
> +int cxl_cper_handle_prot_err_info(struct acpi_hest_generic_data *gdata,
> + struct cxl_cper_rec_data *data);
> +
> #ifdef CONFIG_ACPI_APEI_GHES
> int cxl_cper_register_callback(cxl_cper_callback callback);
> int cxl_cper_unregister_callback(cxl_cper_callback callback);
> --
> 2.17.1
>



2024-01-02 20:27:51

by Ira Weiny

[permalink] [raw]
Subject: Re: [PATCH 4/4] acpi/ghes, cxl/pci: Trace FW-First CXL Protocol Errors

Smita Koralahalli wrote:
> When PCIe AER is in FW-First, OS should process CXL Protocol errors from
> CPER records. These CPER records obtained from GHES module, will rely on
> a registered callback to be notified to the CXL subsystem in order to be
> processed.
>
> Call the existing cxl_cper_callback to notify the CXL subsystem on a
> Protocol error.
>
> The defined trace events cxl_aer_uncorrectable_error and
> cxl_aer_correctable_error currently trace native CXL AER errors. Reuse
> them to trace FW-First Protocol Errors.
>
> Signed-off-by: Smita Koralahalli <[email protected]>

[snip]

> int cxl_cper_register_callback(cxl_cper_callback callback)
> diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c
> index 37e1652afbc7..da516982a625 100644
> --- a/drivers/cxl/core/pci.c
> +++ b/drivers/cxl/core/pci.c
> @@ -6,6 +6,7 @@
> #include <linux/pci.h>
> #include <linux/pci-doe.h>
> #include <linux/aer.h>
> +#include <linux/cper.h>
> #include <cxlpci.h>
> #include <cxlmem.h>
> #include <cxl.h>
> @@ -836,6 +837,51 @@ void cxl_setup_parent_dport(struct device *host, struct cxl_dport *dport)
> }
> EXPORT_SYMBOL_NS_GPL(cxl_setup_parent_dport, CXL);
>
> +#define CXL_AER_UNCORRECTABLE 0
> +#define CXL_AER_CORRECTABLE 1

Better defined as an enum?

> +
> +int cper_severity_cxl_aer(int cper_severity)

My gut says that it would be better to hide this conversion in the
GHES/CPER code and send a more generic defined CXL_AER_* severity through.

> +{
> + switch (cper_severity) {
> + case CPER_SEV_RECOVERABLE:
> + case CPER_SEV_FATAL:
> + return CXL_AER_UNCORRECTABLE;
> + default:
> + return CXL_AER_CORRECTABLE;
> + }
> +}
> +
> +void cxl_prot_err_trace_record(struct cxl_dev_state *cxlds,
> + struct cxl_cper_rec_data *data)
> +{
> + struct cper_cxl_event_sn *dev_serial_num = &data->rec.hdr.dev_serial_num;
> + u32 status, fe;
> + int severity;
> +
> + severity = cper_severity_cxl_aer(data->severity);
> +
> + cxlds->serial = (((u64)dev_serial_num->upper_dw << 32) |
> + dev_serial_num->lower_dw);

This permanently overwrites the serial number read from PCI...

If the serial number does not match up or was not valid (per the check in
the previous patch) lets add a warning.

AFAICT they should match.

Ira

[snip]

2024-01-03 20:04:25

by Smita Koralahalli

[permalink] [raw]
Subject: Re: [PATCH 1/4] acpi/ghes, cxl: Create a common CXL struct to handle different CXL CPER records

On 1/2/2024 8:23 AM, Ira Weiny wrote:
> Smita Koralahalli wrote:
>> Currently defined cxl_cper_callback interface between CXL subsystem and
>> GHES module is just confined to handling CXL Component errors only.
>>
>> Extend this callback to process CXL Protocol errors as well. Achieve
>> by defining a new struct cxl_cper_rec_data to include cxl_cper_event_rec
>> and other fields of CXL protocol errors which will be defined in future
>> patches.
>>
>> Signed-off-by: Smita Koralahalli <[email protected]>
>
> [snip]
>
>>
>> static int __init cxl_pci_driver_init(void)
>> diff --git a/include/linux/cxl-event.h b/include/linux/cxl-event.h
>> index 17eadee819b6..afa71ee0437c 100644
>> --- a/include/linux/cxl-event.h
>> +++ b/include/linux/cxl-event.h
>> @@ -141,8 +141,12 @@ struct cxl_cper_event_rec {
>> union cxl_event event;
>> } __packed;
>>
>> +struct cxl_cper_rec_data {
>> + struct cxl_cper_event_rec rec;
>
> NIT: I would call this something like event to distinguish it from other
> record data.

What do you think of the below?

struct cxl_cper_event_info {
struct cxl_cper_event_rec rec;
struct cxl_cper_prot_err {
struct cxl_ras_capability_regs cxl_ras;
int severity;
} p_err;
};

Addressed changing to sub-struct and copying the struct rather than
pointer comments in patch 3..

>
> Other than that this seems reasonable to me.
>
> Reviewed-by: Ira Weiny <[email protected]>
>
> [snip]
>

2024-01-03 20:19:47

by Smita Koralahalli

[permalink] [raw]
Subject: Re: [PATCH 2/4] efi/cper, cxl: Make definitions and structures global

On 1/2/2024 8:30 AM, Ira Weiny wrote:
> Smita Koralahalli wrote:
>> In preparation to add tracepoint support, move protocol error UUID
>> definition to a common location and make CXL RAS capability struct
>> global for use across different modules.
>>
>> Signed-off-by: Smita Koralahalli <[email protected]>
>
> [snip]
>
>> diff --git a/drivers/firmware/efi/cper_cxl.h b/drivers/firmware/efi/cper_cxl.h
>> index 86bfcf7909ec..6f8c00495708 100644
>> --- a/drivers/firmware/efi/cper_cxl.h
>> +++ b/drivers/firmware/efi/cper_cxl.h
>> @@ -7,14 +7,11 @@
>> * Author: Smita Koralahalli <[email protected]>
>> */
>>
>> +#include <linux/cxl-event.h>
>> +
>> #ifndef LINUX_CPER_CXL_H
>> #define LINUX_CPER_CXL_H
>>
>> -/* CXL Protocol Error Section */
>> -#define CPER_SEC_CXL_PROT_ERR \
>> - GUID_INIT(0x80B9EFB4, 0x52B5, 0x4DE3, 0xA7, 0x77, 0x68, 0x78, \
>> - 0x4B, 0x77, 0x10, 0x48)
>> -
>> #pragma pack(1)
>>
>> /* Compute Express Link Protocol Error Section, UEFI v2.10 sec N.2.13 */
>> diff --git a/include/linux/cper.h b/include/linux/cper.h
>> index c1a7dc325121..2cbf0a93785a 100644
>> --- a/include/linux/cper.h
>> +++ b/include/linux/cper.h
>> @@ -89,6 +89,10 @@ enum {
>> #define CPER_NOTIFY_DMAR \
>> GUID_INIT(0x667DD791, 0xC6B3, 0x4c27, 0x8A, 0x6B, 0x0F, 0x8E, \
>> 0x72, 0x2D, 0xEB, 0x41)
>> +/* CXL Protocol Error Section */
>> +#define CPER_SEC_CXL_PROT_ERR \
>> + GUID_INIT(0x80B9EFB4, 0x52B5, 0x4DE3, 0xA7, 0x77, 0x68, 0x78, \
>> + 0x4B, 0x77, 0x10, 0x48)
>
> Is this shared with code outside of GHES? I did not need my GUID defines
> outside of ghes.c and further becuase the events are defined as UUID's I
> chose to keep the GUID definition as local as possible to ghes.c.
>
> Can you do the same with this define?

Actually, it is shared with efi/cper.
https://elixir.bootlin.com/linux/v6.7-rc8/source/drivers/firmware/efi/cper.c#L602

But this would be something to look into. Should we continue to support
logging from efi/cper or just confine it to ghes..

If we just log it from ghes similar to component events, we might loose
error records from RCH Downstream Port and other agent_types which do
not log device_ids. Also, I'm not sure how useful are other fields in
protocol error CPER, the ones like Capability struct and DVSEC len etc
as the tracepoints doesn't log all of them.

Thanks,
Smita

>
> The rest looks good,
> Ira
>
> [snip]
>

2024-01-03 20:36:14

by Ira Weiny

[permalink] [raw]
Subject: Re: [PATCH 1/4] acpi/ghes, cxl: Create a common CXL struct to handle different CXL CPER records

Smita Koralahalli wrote:
> On 1/2/2024 8:23 AM, Ira Weiny wrote:
> > Smita Koralahalli wrote:

[snip]

> >> diff --git a/include/linux/cxl-event.h b/include/linux/cxl-event.h
> >> index 17eadee819b6..afa71ee0437c 100644
> >> --- a/include/linux/cxl-event.h
> >> +++ b/include/linux/cxl-event.h
> >> @@ -141,8 +141,12 @@ struct cxl_cper_event_rec {
> >> union cxl_event event;
> >> } __packed;
> >>
> >> +struct cxl_cper_rec_data {
> >> + struct cxl_cper_event_rec rec;
> >
> > NIT: I would call this something like event to distinguish it from other
> > record data.
>
> What do you think of the below?
>
> struct cxl_cper_event_info {
> struct cxl_cper_event_rec rec;
> struct cxl_cper_prot_err {
> struct cxl_ras_capability_regs cxl_ras;
> int severity;
> } p_err;
> };
>
> Addressed changing to sub-struct and copying the struct rather than
> pointer comments in patch 3..

That is much better. Thanks!
Ira

2024-01-03 21:12:23

by Smita Koralahalli

[permalink] [raw]
Subject: Re: [PATCH 3/4] acpi/ghes, efi/cper: Recognize and process CXL Protocol Errors.

On 1/2/2024 9:58 AM, Ira Weiny wrote:
> Smita Koralahalli wrote:
>> UEFI v2.10 section N.2.13 defines a CPER record for CXL Protocol errors.
>>
>> Add GHES support to detect CXL CPER Protocol record and cache error
>> severity, device_id, serial number and a pointer to CXL RAS capability
>> struct in struct cxl_cper_rec_data.
>>
>> Signed-off-by: Smita Koralahalli <[email protected]>
>> ---
>> drivers/acpi/apei/ghes.c | 15 ++++++++++
>> drivers/firmware/efi/cper_cxl.c | 52 +++++++++++++++++++++++++++++++++
>> include/linux/cxl-event.h | 6 ++++
>> 3 files changed, 73 insertions(+)
>>
>> diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
>> index d6a85fbc0a8b..6471584b2e79 100644
>> --- a/drivers/acpi/apei/ghes.c
>> +++ b/drivers/acpi/apei/ghes.c
>> @@ -714,6 +714,18 @@ static void cxl_cper_post_event(enum cxl_event_type event_type,
>> cper_callback(event_type, &data);
>> }
>>
>> +void cxl_cper_handle_prot_err(struct acpi_hest_generic_data *gdata)
>> +{
>> + struct cxl_cper_rec_data data;
>> +
>> + memset(&data, 0, sizeof(data));
>
> Does this need to be done? Could cxl_cper_handle_prot_err_info() use
> something like this initially?
>
> *data = (struct cxl_cper_rec_data) {
> .rec.hdr.dev_serial_num.lower_dw = prot_err->dev_serial_num.lower_dw;
> .rec.hdr.dev_serial_num.upper_dw = prot_err->dev_serial_num.upper_dw;
> };
>
> The serial number is always set even if it is not valid.

Ok will copy serial number at the beginning of
cxl_cper_handle_prot_err_info() and remove memset.

>
>> +
>> + if (cxl_cper_handle_prot_err_info(gdata, &data))
>> + return;
>> +
>> + data.severity = gdata->error_severity;
>> +}
>> +
>> int cxl_cper_register_callback(cxl_cper_callback callback)
>> {
>> guard(rwsem_write)(&cxl_cper_rw_sem);
>> @@ -768,6 +780,9 @@ static bool ghes_do_proc(struct ghes *ghes,
>> else if (guid_equal(sec_type, &CPER_SEC_PROC_ARM)) {
>> queued = ghes_handle_arm_hw_error(gdata, sev);
>> }
>> + else if (guid_equal(sec_type, &CPER_SEC_CXL_PROT_ERR)) {
>> + cxl_cper_handle_prot_err(gdata);
>> + }
>> else if (guid_equal(sec_type, &CPER_SEC_CXL_GEN_MEDIA_GUID)) {
>> struct cxl_cper_event_rec *rec = acpi_hest_get_payload(gdata);
>>
>> diff --git a/drivers/firmware/efi/cper_cxl.c b/drivers/firmware/efi/cper_cxl.c
>> index 4fd8d783993e..3bc0b9f28c9e 100644
>> --- a/drivers/firmware/efi/cper_cxl.c
>> +++ b/drivers/firmware/efi/cper_cxl.c
>> @@ -8,6 +8,7 @@
>> */
>>
>> #include <linux/cper.h>
>> +#include <acpi/ghes.h>
>> #include "cper_cxl.h"
>>
>> #define PROT_ERR_VALID_AGENT_TYPE BIT_ULL(0)
>> @@ -176,3 +177,54 @@ void cper_print_prot_err(const char *pfx, const struct cper_sec_prot_err *prot_e
>> sizeof(cxl_ras->header_log), 0);
>> }
>> }
>> +
>> +int cxl_cper_handle_prot_err_info(struct acpi_hest_generic_data *gdata,
>> + struct cxl_cper_rec_data *data)
>> +{
>> + struct cper_sec_prot_err *prot_err = acpi_hest_get_payload(gdata);
>> + struct cper_cxl_event_devid *device_id = &data->rec.hdr.device_id;
>> + struct cper_cxl_event_sn *dev_serial_num = &data->rec.hdr.dev_serial_num;
>> + size_t size = sizeof(*prot_err) + prot_err->dvsec_len;
>> +
>> + if (!(prot_err->valid_bits & PROT_ERR_VALID_ERROR_LOG)) {
>> + pr_err(FW_WARN "Not a valid protocol error log\n");
>> + return -EINVAL;
>> + }
>> +
>> + if (!(prot_err->valid_bits & PROT_ERR_VALID_DEVICE_ID)) {
>> + pr_err(FW_WARN "Not a valid Device ID\n");
>> + return -EINVAL;
>> + }
>> +
>> + /*
>> + * The device ID or agent address is only valid for CXL 1.1 device,
>> + * CXL 2.0 device, CXL 2.0 Logical device, CXL 2.0 Fabric Manager
>> + * Managed Logical Device, CXL Root Port, CXL Downstream Switch
>> + * Port, or CXL Upstream Switch Port.
>> + */
>> + if (prot_err->agent_type <= 0x7 && prot_err->agent_type != RCH_DP) {
>> + device_id->segment_num = prot_err->agent_addr.segment;
>> + device_id->bus_num = prot_err->agent_addr.bus;
>> + device_id->device_num = prot_err->agent_addr.device;
>> + device_id->func_num = prot_err->agent_addr.function;
>> + } else {
>> + pr_err(FW_WARN "Not a valid agent type\n");
>> + return -EINVAL;
>> + }
>> +
>> + /*
>> + * The device serial number is only valid for CXL 1.1 device, CXL
>> + * 2.0 device, CXL 2.0 Logical device, or CXL 2.0 Fabric Manager
>> + * Managed Logical Device.
>> + */
>> + if (!(prot_err->valid_bits & PROT_ERR_VALID_SERIAL_NUMBER) ||
>> + prot_err->agent_type > 0x4 || prot_err->agent_type == RCH_DP)
>> + pr_warn(FW_WARN "No valid serial number present\n");
>> +
>> + dev_serial_num->lower_dw = prot_err->dev_serial_num.lower_dw;
>> + dev_serial_num->upper_dw = prot_err->dev_serial_num.upper_dw;
>> +
>> + data->cxl_ras = (struct cxl_ras_capability_regs *)((long)prot_err + size);
>
> I think this is ok now because the cxl trace code processes the data
> before returning (in next patch). But I'm a bit leary about passing a
> pointer to data controlled by the acpi sub-system. At some point the
> event may get cached to be processed by another thread and it might be
> better to copy the struct here.

Ok will make the changes. Rewrote the struct in patch 1.

>
>> +
>> + return 0;
>> +}
>> diff --git a/include/linux/cxl-event.h b/include/linux/cxl-event.h
>> index 90d8390a73cb..7ba2dfd6619e 100644
>> --- a/include/linux/cxl-event.h
>> +++ b/include/linux/cxl-event.h
>> @@ -154,11 +154,17 @@ struct cxl_ras_capability_regs {
>>
>> struct cxl_cper_rec_data {
>> struct cxl_cper_event_rec rec;
>> + struct cxl_ras_capability_regs *cxl_ras;
>> + int severity;
>
> NIT: When I saw this without any addition to event type I wondered if the
> series would bisect. I see it does because the record is not sent until
> the next patch. But I wonder if the 2 patches would be best reversed.

You mean to say patch 4 to be 3 and 3 to be 4?

And since I haven't used severity yet, fix it by declaring severity to
where it is used..

>
> Also, should cxl_ras + severity be put in a protocol error sub-struct?
> Does severity ever apply to event records?

No, not in any of the component event records. Only place is in "Event
Record Flags" in Common Event Record Format (Table 8-42).

Addressed in patch 1 to make a sub-struct.

Thanks,
Smita

>
> Ira
>
>> };
>>
>> typedef void (*cxl_cper_callback)(enum cxl_event_type type,
>> struct cxl_cper_rec_data *data);
>>
>> +struct acpi_hest_generic_data;
>> +int cxl_cper_handle_prot_err_info(struct acpi_hest_generic_data *gdata,
>> + struct cxl_cper_rec_data *data);
>> +
>> #ifdef CONFIG_ACPI_APEI_GHES
>> int cxl_cper_register_callback(cxl_cper_callback callback);
>> int cxl_cper_unregister_callback(cxl_cper_callback callback);
>> --
>> 2.17.1
>>
>
>

2024-01-03 21:14:08

by Smita Koralahalli

[permalink] [raw]
Subject: Re: [PATCH 4/4] acpi/ghes, cxl/pci: Trace FW-First CXL Protocol Errors

On 1/2/2024 12:27 PM, Ira Weiny wrote:
> Smita Koralahalli wrote:
>> When PCIe AER is in FW-First, OS should process CXL Protocol errors from
>> CPER records. These CPER records obtained from GHES module, will rely on
>> a registered callback to be notified to the CXL subsystem in order to be
>> processed.
>>
>> Call the existing cxl_cper_callback to notify the CXL subsystem on a
>> Protocol error.
>>
>> The defined trace events cxl_aer_uncorrectable_error and
>> cxl_aer_correctable_error currently trace native CXL AER errors. Reuse
>> them to trace FW-First Protocol Errors.
>>
>> Signed-off-by: Smita Koralahalli <[email protected]>
>
> [snip]
>
>> int cxl_cper_register_callback(cxl_cper_callback callback)
>> diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c
>> index 37e1652afbc7..da516982a625 100644
>> --- a/drivers/cxl/core/pci.c
>> +++ b/drivers/cxl/core/pci.c
>> @@ -6,6 +6,7 @@
>> #include <linux/pci.h>
>> #include <linux/pci-doe.h>
>> #include <linux/aer.h>
>> +#include <linux/cper.h>
>> #include <cxlpci.h>
>> #include <cxlmem.h>
>> #include <cxl.h>
>> @@ -836,6 +837,51 @@ void cxl_setup_parent_dport(struct device *host, struct cxl_dport *dport)
>> }
>> EXPORT_SYMBOL_NS_GPL(cxl_setup_parent_dport, CXL);
>>
>> +#define CXL_AER_UNCORRECTABLE 0
>> +#define CXL_AER_CORRECTABLE 1
>
> Better defined as an enum?

Will change.

>
>> +
>> +int cper_severity_cxl_aer(int cper_severity)
>
> My gut says that it would be better to hide this conversion in the
> GHES/CPER code and send a more generic defined CXL_AER_* severity through.

Ok will change.

>
>> +{
>> + switch (cper_severity) {
>> + case CPER_SEV_RECOVERABLE:
>> + case CPER_SEV_FATAL:
>> + return CXL_AER_UNCORRECTABLE;
>> + default:
>> + return CXL_AER_CORRECTABLE;
>> + }
>> +}
>> +
>> +void cxl_prot_err_trace_record(struct cxl_dev_state *cxlds,
>> + struct cxl_cper_rec_data *data)
>> +{
>> + struct cper_cxl_event_sn *dev_serial_num = &data->rec.hdr.dev_serial_num;
>> + u32 status, fe;
>> + int severity;
>> +
>> + severity = cper_severity_cxl_aer(data->severity);
>> +
>> + cxlds->serial = (((u64)dev_serial_num->upper_dw << 32) |
>> + dev_serial_num->lower_dw);
>
> This permanently overwrites the serial number read from PCI...
>
> If the serial number does not match up or was not valid (per the check in
> the previous patch) lets add a warning.

Sure will add.

Thanks,
Smita

>
> AFAICT they should match.
>
> Ira
>
> [snip]
>

2024-01-03 22:32:45

by Ira Weiny

[permalink] [raw]
Subject: Re: [PATCH 2/4] efi/cper, cxl: Make definitions and structures global

Smita Koralahalli wrote:
> On 1/2/2024 8:30 AM, Ira Weiny wrote:
> > Smita Koralahalli wrote:
> >> In preparation to add tracepoint support, move protocol error UUID
> >> definition to a common location and make CXL RAS capability struct
> >> global for use across different modules.
> >>
> >> Signed-off-by: Smita Koralahalli <[email protected]>
> >
> > [snip]
> >
> >> diff --git a/drivers/firmware/efi/cper_cxl.h b/drivers/firmware/efi/cper_cxl.h
> >> index 86bfcf7909ec..6f8c00495708 100644
> >> --- a/drivers/firmware/efi/cper_cxl.h
> >> +++ b/drivers/firmware/efi/cper_cxl.h
> >> @@ -7,14 +7,11 @@
> >> * Author: Smita Koralahalli <[email protected]>
> >> */
> >>
> >> +#include <linux/cxl-event.h>
> >> +
> >> #ifndef LINUX_CPER_CXL_H
> >> #define LINUX_CPER_CXL_H
> >>
> >> -/* CXL Protocol Error Section */
> >> -#define CPER_SEC_CXL_PROT_ERR \
> >> - GUID_INIT(0x80B9EFB4, 0x52B5, 0x4DE3, 0xA7, 0x77, 0x68, 0x78, \
> >> - 0x4B, 0x77, 0x10, 0x48)
> >> -
> >> #pragma pack(1)
> >>
> >> /* Compute Express Link Protocol Error Section, UEFI v2.10 sec N.2.13 */
> >> diff --git a/include/linux/cper.h b/include/linux/cper.h
> >> index c1a7dc325121..2cbf0a93785a 100644
> >> --- a/include/linux/cper.h
> >> +++ b/include/linux/cper.h
> >> @@ -89,6 +89,10 @@ enum {
> >> #define CPER_NOTIFY_DMAR \
> >> GUID_INIT(0x667DD791, 0xC6B3, 0x4c27, 0x8A, 0x6B, 0x0F, 0x8E, \
> >> 0x72, 0x2D, 0xEB, 0x41)
> >> +/* CXL Protocol Error Section */
> >> +#define CPER_SEC_CXL_PROT_ERR \
> >> + GUID_INIT(0x80B9EFB4, 0x52B5, 0x4DE3, 0xA7, 0x77, 0x68, 0x78, \
> >> + 0x4B, 0x77, 0x10, 0x48)
> >
> > Is this shared with code outside of GHES? I did not need my GUID defines
> > outside of ghes.c and further becuase the events are defined as UUID's I
> > chose to keep the GUID definition as local as possible to ghes.c.
> >
> > Can you do the same with this define?
>
> Actually, it is shared with efi/cper.
> https://elixir.bootlin.com/linux/v6.7-rc8/source/drivers/firmware/efi/cper.c#L602

Ah ok.

>
> But this would be something to look into. Should we continue to support
> logging from efi/cper or just confine it to ghes..

I missed that you were not removing the efi/cper print. I kind of thought
that was part of the series.

>
> If we just log it from ghes similar to component events, we might loose
> error records from RCH Downstream Port and other agent_types which do
> not log device_ids.

That is a good reason to keep the efi/cper print AFAICS.

> Also, I'm not sure how useful are other fields in
> protocol error CPER, the ones like Capability struct and DVSEC len etc
> as the tracepoints doesn't log all of them.

I'm not sure about their importance but if they are important I would say
they should be added to the tracepoint.

Ira

2024-01-03 22:33:02

by Ira Weiny

[permalink] [raw]
Subject: Re: [PATCH 3/4] acpi/ghes, efi/cper: Recognize and process CXL Protocol Errors.

Smita Koralahalli wrote:
> On 1/2/2024 9:58 AM, Ira Weiny wrote:
> > Smita Koralahalli wrote:

[snip]

> >> +
> >> + return 0;
> >> +}
> >> diff --git a/include/linux/cxl-event.h b/include/linux/cxl-event.h
> >> index 90d8390a73cb..7ba2dfd6619e 100644
> >> --- a/include/linux/cxl-event.h
> >> +++ b/include/linux/cxl-event.h
> >> @@ -154,11 +154,17 @@ struct cxl_ras_capability_regs {
> >>
> >> struct cxl_cper_rec_data {
> >> struct cxl_cper_event_rec rec;
> >> + struct cxl_ras_capability_regs *cxl_ras;
> >> + int severity;
> >
> > NIT: When I saw this without any addition to event type I wondered if the
> > series would bisect. I see it does because the record is not sent until
> > the next patch. But I wonder if the 2 patches would be best reversed.
>
> You mean to say patch 4 to be 3 and 3 to be 4?
>
> And since I haven't used severity yet, fix it by declaring severity to
> where it is used..

Yes. What you have is not technically wrong but it threw me in review.

>
> >
> > Also, should cxl_ras + severity be put in a protocol error sub-struct?
> > Does severity ever apply to event records?
>
> No, not in any of the component event records. Only place is in "Event
> Record Flags" in Common Event Record Format (Table 8-42).
>
> Addressed in patch 1 to make a sub-struct.

Thanks!
Ira