This patchset adds trace event support for FW-First Protocol Errors.
Reworked the patchset to reuse the work item written by Ira for handling
CPER events.
Smita Koralahalli (4):
efi/cper, cxl: Make definitions and structures global
acpi/ghes, efi/cper: Recognize and process CXL Protocol Errors.
acpi/ghes, cxl/pci: Trace FW-First CXL Protocol Errors
cxl/pci: Define a common function get_cxl_dev()
drivers/acpi/apei/ghes.c | 24 ++++++++++
drivers/cxl/core/pci.c | 24 ++++++++++
drivers/cxl/cxlpci.h | 3 ++
drivers/cxl/pci.c | 60 ++++++++++++++++++-------
drivers/firmware/efi/cper_cxl.c | 77 ++++++++++++++++++++++++++++-----
drivers/firmware/efi/cper_cxl.h | 7 +--
include/linux/cper.h | 4 ++
include/linux/cxl-event.h | 38 ++++++++++++++++
8 files changed, 206 insertions(+), 31 deletions(-)
--
2.17.1
In preparation to add tracepoint support, move protocol error UUID
definition to a common location and make CXL RAS capability struct
global for use across different modules.
Signed-off-by: Smita Koralahalli <[email protected]>
---
drivers/firmware/efi/cper_cxl.c | 11 -----------
drivers/firmware/efi/cper_cxl.h | 7 ++-----
include/linux/cper.h | 4 ++++
include/linux/cxl-event.h | 11 +++++++++++
4 files changed, 17 insertions(+), 16 deletions(-)
diff --git a/drivers/firmware/efi/cper_cxl.c b/drivers/firmware/efi/cper_cxl.c
index a55771b99a97..4fd8d783993e 100644
--- a/drivers/firmware/efi/cper_cxl.c
+++ b/drivers/firmware/efi/cper_cxl.c
@@ -18,17 +18,6 @@
#define PROT_ERR_VALID_DVSEC BIT_ULL(5)
#define PROT_ERR_VALID_ERROR_LOG BIT_ULL(6)
-/* CXL RAS Capability Structure, CXL v3.0 sec 8.2.4.16 */
-struct cxl_ras_capability_regs {
- u32 uncor_status;
- u32 uncor_mask;
- u32 uncor_severity;
- u32 cor_status;
- u32 cor_mask;
- u32 cap_control;
- u32 header_log[16];
-};
-
static const char * const prot_err_agent_type_strs[] = {
"Restricted CXL Device",
"Restricted CXL Host Downstream Port",
diff --git a/drivers/firmware/efi/cper_cxl.h b/drivers/firmware/efi/cper_cxl.h
index 86bfcf7909ec..6f8c00495708 100644
--- a/drivers/firmware/efi/cper_cxl.h
+++ b/drivers/firmware/efi/cper_cxl.h
@@ -7,14 +7,11 @@
* Author: Smita Koralahalli <[email protected]>
*/
+#include <linux/cxl-event.h>
+
#ifndef LINUX_CPER_CXL_H
#define LINUX_CPER_CXL_H
-/* CXL Protocol Error Section */
-#define CPER_SEC_CXL_PROT_ERR \
- GUID_INIT(0x80B9EFB4, 0x52B5, 0x4DE3, 0xA7, 0x77, 0x68, 0x78, \
- 0x4B, 0x77, 0x10, 0x48)
-
#pragma pack(1)
/* Compute Express Link Protocol Error Section, UEFI v2.10 sec N.2.13 */
diff --git a/include/linux/cper.h b/include/linux/cper.h
index 265b0f8fc0b3..5c6d4d5b9975 100644
--- a/include/linux/cper.h
+++ b/include/linux/cper.h
@@ -89,6 +89,10 @@ enum {
#define CPER_NOTIFY_DMAR \
GUID_INIT(0x667DD791, 0xC6B3, 0x4c27, 0x8A, 0x6B, 0x0F, 0x8E, \
0x72, 0x2D, 0xEB, 0x41)
+/* CXL Protocol Error Section */
+#define CPER_SEC_CXL_PROT_ERR \
+ GUID_INIT(0x80B9EFB4, 0x52B5, 0x4DE3, 0xA7, 0x77, 0x68, 0x78, \
+ 0x4B, 0x77, 0x10, 0x48)
/* CXL Event record UUIDs are formatted as GUIDs and reported in section type */
/*
diff --git a/include/linux/cxl-event.h b/include/linux/cxl-event.h
index 60b25020281f..f11e52ff565a 100644
--- a/include/linux/cxl-event.h
+++ b/include/linux/cxl-event.h
@@ -154,6 +154,17 @@ struct cxl_cper_event_rec {
union cxl_event event;
} __packed;
+/* CXL RAS Capability Structure, CXL v3.0 sec 8.2.4.16 */
+struct cxl_ras_capability_regs {
+ u32 uncor_status;
+ u32 uncor_mask;
+ u32 uncor_severity;
+ u32 cor_status;
+ u32 cor_mask;
+ u32 cap_control;
+ u32 header_log[16];
+};
+
struct cxl_cper_work_data {
enum cxl_event_type event_type;
struct cxl_cper_event_rec rec;
--
2.17.1
When PCIe AER is in FW-First, OS should process CXL Protocol errors from
CPER records.
Reuse the existing work queue cxl_cper_work registered with GHES to notify
the CXL subsystem on a Protocol error.
The defined trace events cxl_aer_uncorrectable_error and
cxl_aer_correctable_error currently trace native CXL AER errors. Reuse
them to trace FW-First Protocol Errors.
Signed-off-by: Smita Koralahalli <[email protected]>
---
drivers/acpi/apei/ghes.c | 14 ++++++++++++++
drivers/cxl/core/pci.c | 24 ++++++++++++++++++++++++
drivers/cxl/cxlpci.h | 3 +++
drivers/cxl/pci.c | 34 ++++++++++++++++++++++++++++++++--
include/linux/cxl-event.h | 1 +
5 files changed, 74 insertions(+), 2 deletions(-)
diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index 1a58032770ee..a31bd91e9475 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -723,6 +723,20 @@ static void cxl_cper_handle_prot_err(struct acpi_hest_generic_data *gdata)
if (cxl_cper_handle_prot_err_info(gdata, &wd.p_err))
return;
+
+ guard(spinlock_irqsave)(&cxl_cper_work_lock);
+
+ if (!cxl_cper_work)
+ return;
+
+ wd.event_type = CXL_CPER_EVENT_PROT_ERR;
+
+ if (!kfifo_put(&cxl_cper_fifo, wd)) {
+ pr_err_ratelimited("CXL CPER kfifo overflow\n");
+ return;
+ }
+
+ schedule_work(cxl_cper_work);
}
int cxl_cper_register_work(struct work_struct *work)
diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c
index 0df09bd79408..ef9438cb1dd6 100644
--- a/drivers/cxl/core/pci.c
+++ b/drivers/cxl/core/pci.c
@@ -686,6 +686,30 @@ void read_cdat_data(struct cxl_port *port)
}
EXPORT_SYMBOL_NS_GPL(read_cdat_data, CXL);
+void cxl_trace_prot_err(struct cxl_dev_state *cxlds,
+ struct cxl_cper_prot_err *p_err)
+{
+ u32 status, fe;
+
+ if (p_err->severity == CXL_AER_CORRECTABLE) {
+ status = p_err->cxl_ras.cor_status & ~p_err->cxl_ras.cor_mask;
+
+ trace_cxl_aer_correctable_error(cxlds->cxlmd, status);
+ } else {
+ status = p_err->cxl_ras.uncor_status & ~p_err->cxl_ras.uncor_mask;
+
+ if (hweight32(status) > 1)
+ fe = BIT(FIELD_GET(CXL_RAS_CAP_CONTROL_FE_MASK,
+ p_err->cxl_ras.cap_control));
+ else
+ fe = status;
+
+ trace_cxl_aer_uncorrectable_error(cxlds->cxlmd, status, fe,
+ p_err->cxl_ras.header_log);
+ }
+}
+EXPORT_SYMBOL_NS_GPL(cxl_trace_prot_err, CXL);
+
static void __cxl_handle_cor_ras(struct cxl_dev_state *cxlds,
void __iomem *ras_base)
{
diff --git a/drivers/cxl/cxlpci.h b/drivers/cxl/cxlpci.h
index 93992a1c8eec..0ba3215786e1 100644
--- a/drivers/cxl/cxlpci.h
+++ b/drivers/cxl/cxlpci.h
@@ -130,4 +130,7 @@ void read_cdat_data(struct cxl_port *port);
void cxl_cor_error_detected(struct pci_dev *pdev);
pci_ers_result_t cxl_error_detected(struct pci_dev *pdev,
pci_channel_state_t state);
+struct cxl_cper_prot_err;
+void cxl_trace_prot_err(struct cxl_dev_state *cxlds,
+ struct cxl_cper_prot_err *p_err);
#endif /* __CXL_PCI_H__ */
diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c
index 74876c9835e8..3e3c36983686 100644
--- a/drivers/cxl/pci.c
+++ b/drivers/cxl/pci.c
@@ -1011,12 +1011,42 @@ static void cxl_handle_cper_event(enum cxl_event_type ev_type,
&uuid_null, &rec->event);
}
+static void cxl_handle_prot_err(struct cxl_cper_prot_err *p_err)
+{
+ struct pci_dev *pdev __free(pci_dev_put) = NULL;
+ struct cxl_dev_state *cxlds;
+ unsigned int devfn;
+
+ devfn = PCI_DEVFN(p_err->device, p_err->function);
+ pdev = pci_get_domain_bus_and_slot(p_err->segment,
+ p_err->bus, devfn);
+ if (!pdev)
+ return;
+
+ guard(device)(&pdev->dev);
+ if (pdev->driver != &cxl_pci_driver)
+ return;
+
+ cxlds = pci_get_drvdata(pdev);
+ if (!cxlds)
+ return;
+
+ if (((u64)p_err->upper_dw << 32 | p_err->lower_dw) != cxlds->serial)
+ pr_warn("CPER-reported device serial number does not match expected value\n");
+
+ cxl_trace_prot_err(cxlds, p_err);
+}
+
static void cxl_cper_work_fn(struct work_struct *work)
{
struct cxl_cper_work_data wd;
- while (cxl_cper_kfifo_get(&wd))
- cxl_handle_cper_event(wd.event_type, &wd.rec);
+ while (cxl_cper_kfifo_get(&wd)) {
+ if (wd.event_type == CXL_CPER_EVENT_PROT_ERR)
+ cxl_handle_prot_err(&wd.p_err);
+ else
+ cxl_handle_cper_event(wd.event_type, &wd.rec);
+ }
}
static DECLARE_WORK(cxl_cper_work, cxl_cper_work_fn);
diff --git a/include/linux/cxl-event.h b/include/linux/cxl-event.h
index 9c7b69e076a0..5562844df850 100644
--- a/include/linux/cxl-event.h
+++ b/include/linux/cxl-event.h
@@ -122,6 +122,7 @@ struct cxl_event_record_raw {
} __packed;
enum cxl_event_type {
+ CXL_CPER_EVENT_PROT_ERR,
CXL_CPER_EVENT_GENERIC,
CXL_CPER_EVENT_GEN_MEDIA,
CXL_CPER_EVENT_DRAM,
--
2.17.1
Refactor computation of cxlds to a common function get_cxl_dev() and reuse
the function in both cxl_handle_cper_event() and cxl_handle_prot_err().
Signed-off-by: Smita Koralahalli <[email protected]>
---
drivers/cxl/pci.c | 52 +++++++++++++++++++++++------------------------
1 file changed, 26 insertions(+), 26 deletions(-)
diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c
index 3e3c36983686..26e65e5b68cb 100644
--- a/drivers/cxl/pci.c
+++ b/drivers/cxl/pci.c
@@ -974,32 +974,43 @@ static struct pci_driver cxl_pci_driver = {
},
};
+static struct cxl_dev_state *get_cxl_dev(u16 segment, u8 bus, u8 device,
+ u8 function)
+{
+ struct pci_dev *pdev __free(pci_dev_put) = NULL;
+ struct cxl_dev_state *cxlds;
+ unsigned int devfn;
+
+ devfn = PCI_DEVFN(device, function);
+ pdev = pci_get_domain_bus_and_slot(segment, bus, devfn);
+
+ if (!pdev)
+ return NULL;
+
+ guard(device)(&pdev->dev);
+ if (pdev->driver != &cxl_pci_driver)
+ return NULL;
+
+ cxlds = pci_get_drvdata(pdev);
+
+ return cxlds;
+}
+
#define CXL_EVENT_HDR_FLAGS_REC_SEVERITY GENMASK(1, 0)
static void cxl_handle_cper_event(enum cxl_event_type ev_type,
struct cxl_cper_event_rec *rec)
{
struct cper_cxl_event_devid *device_id = &rec->hdr.device_id;
- struct pci_dev *pdev __free(pci_dev_put) = NULL;
enum cxl_event_log_type log_type;
struct cxl_dev_state *cxlds;
- unsigned int devfn;
u32 hdr_flags;
pr_debug("CPER event %d for device %u:%u:%u.%u\n", ev_type,
device_id->segment_num, device_id->bus_num,
device_id->device_num, device_id->func_num);
- devfn = PCI_DEVFN(device_id->device_num, device_id->func_num);
- pdev = pci_get_domain_bus_and_slot(device_id->segment_num,
- device_id->bus_num, devfn);
- if (!pdev)
- return;
-
- guard(device)(&pdev->dev);
- if (pdev->driver != &cxl_pci_driver)
- return;
-
- cxlds = pci_get_drvdata(pdev);
+ cxlds = get_cxl_dev(device_id->segment_num, device_id->bus_num,
+ device_id->device_num, device_id->func_num);
if (!cxlds)
return;
@@ -1013,21 +1024,10 @@ static void cxl_handle_cper_event(enum cxl_event_type ev_type,
static void cxl_handle_prot_err(struct cxl_cper_prot_err *p_err)
{
- struct pci_dev *pdev __free(pci_dev_put) = NULL;
struct cxl_dev_state *cxlds;
- unsigned int devfn;
- devfn = PCI_DEVFN(p_err->device, p_err->function);
- pdev = pci_get_domain_bus_and_slot(p_err->segment,
- p_err->bus, devfn);
- if (!pdev)
- return;
-
- guard(device)(&pdev->dev);
- if (pdev->driver != &cxl_pci_driver)
- return;
-
- cxlds = pci_get_drvdata(pdev);
+ cxlds = get_cxl_dev(p_err->segment, p_err->bus,
+ p_err->device, p_err->function);
if (!cxlds)
return;
--
2.17.1
On 5/22/24 8:08 AM, Smita Koralahalli wrote:
> In preparation to add tracepoint support, move protocol error UUID
> definition to a common location and make CXL RAS capability struct
> global for use across different modules.
>
> Signed-off-by: Smita Koralahalli <[email protected]>
Reviewed-by: Dave Jiang <[email protected]>
> ---
> drivers/firmware/efi/cper_cxl.c | 11 -----------
> drivers/firmware/efi/cper_cxl.h | 7 ++-----
> include/linux/cper.h | 4 ++++
> include/linux/cxl-event.h | 11 +++++++++++
> 4 files changed, 17 insertions(+), 16 deletions(-)
>
> diff --git a/drivers/firmware/efi/cper_cxl.c b/drivers/firmware/efi/cper_cxl.c
> index a55771b99a97..4fd8d783993e 100644
> --- a/drivers/firmware/efi/cper_cxl.c
> +++ b/drivers/firmware/efi/cper_cxl.c
> @@ -18,17 +18,6 @@
> #define PROT_ERR_VALID_DVSEC BIT_ULL(5)
> #define PROT_ERR_VALID_ERROR_LOG BIT_ULL(6)
>
> -/* CXL RAS Capability Structure, CXL v3.0 sec 8.2.4.16 */
> -struct cxl_ras_capability_regs {
> - u32 uncor_status;
> - u32 uncor_mask;
> - u32 uncor_severity;
> - u32 cor_status;
> - u32 cor_mask;
> - u32 cap_control;
> - u32 header_log[16];
> -};
> -
> static const char * const prot_err_agent_type_strs[] = {
> "Restricted CXL Device",
> "Restricted CXL Host Downstream Port",
> diff --git a/drivers/firmware/efi/cper_cxl.h b/drivers/firmware/efi/cper_cxl.h
> index 86bfcf7909ec..6f8c00495708 100644
> --- a/drivers/firmware/efi/cper_cxl.h
> +++ b/drivers/firmware/efi/cper_cxl.h
> @@ -7,14 +7,11 @@
> * Author: Smita Koralahalli <[email protected]>
> */
>
> +#include <linux/cxl-event.h>
> +
> #ifndef LINUX_CPER_CXL_H
> #define LINUX_CPER_CXL_H
>
> -/* CXL Protocol Error Section */
> -#define CPER_SEC_CXL_PROT_ERR \
> - GUID_INIT(0x80B9EFB4, 0x52B5, 0x4DE3, 0xA7, 0x77, 0x68, 0x78, \
> - 0x4B, 0x77, 0x10, 0x48)
> -
> #pragma pack(1)
>
> /* Compute Express Link Protocol Error Section, UEFI v2.10 sec N.2.13 */
> diff --git a/include/linux/cper.h b/include/linux/cper.h
> index 265b0f8fc0b3..5c6d4d5b9975 100644
> --- a/include/linux/cper.h
> +++ b/include/linux/cper.h
> @@ -89,6 +89,10 @@ enum {
> #define CPER_NOTIFY_DMAR \
> GUID_INIT(0x667DD791, 0xC6B3, 0x4c27, 0x8A, 0x6B, 0x0F, 0x8E, \
> 0x72, 0x2D, 0xEB, 0x41)
> +/* CXL Protocol Error Section */
> +#define CPER_SEC_CXL_PROT_ERR \
> + GUID_INIT(0x80B9EFB4, 0x52B5, 0x4DE3, 0xA7, 0x77, 0x68, 0x78, \
> + 0x4B, 0x77, 0x10, 0x48)
>
> /* CXL Event record UUIDs are formatted as GUIDs and reported in section type */
> /*
> diff --git a/include/linux/cxl-event.h b/include/linux/cxl-event.h
> index 60b25020281f..f11e52ff565a 100644
> --- a/include/linux/cxl-event.h
> +++ b/include/linux/cxl-event.h
> @@ -154,6 +154,17 @@ struct cxl_cper_event_rec {
> union cxl_event event;
> } __packed;
>
> +/* CXL RAS Capability Structure, CXL v3.0 sec 8.2.4.16 */
> +struct cxl_ras_capability_regs {
> + u32 uncor_status;
> + u32 uncor_mask;
> + u32 uncor_severity;
> + u32 cor_status;
> + u32 cor_mask;
> + u32 cap_control;
> + u32 header_log[16];
> +};
> +
> struct cxl_cper_work_data {
> enum cxl_event_type event_type;
> struct cxl_cper_event_rec rec;
On 5/22/24 8:08 AM, Smita Koralahalli wrote:
> When PCIe AER is in FW-First, OS should process CXL Protocol errors from
> CPER records.
>
> Reuse the existing work queue cxl_cper_work registered with GHES to notify
> the CXL subsystem on a Protocol error.
>
> The defined trace events cxl_aer_uncorrectable_error and
> cxl_aer_correctable_error currently trace native CXL AER errors. Reuse
> them to trace FW-First Protocol Errors.
>
> Signed-off-by: Smita Koralahalli <[email protected]>
> ---
> drivers/acpi/apei/ghes.c | 14 ++++++++++++++
> drivers/cxl/core/pci.c | 24 ++++++++++++++++++++++++
> drivers/cxl/cxlpci.h | 3 +++
> drivers/cxl/pci.c | 34 ++++++++++++++++++++++++++++++++--
> include/linux/cxl-event.h | 1 +
> 5 files changed, 74 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
> index 1a58032770ee..a31bd91e9475 100644
> --- a/drivers/acpi/apei/ghes.c
> +++ b/drivers/acpi/apei/ghes.c
> @@ -723,6 +723,20 @@ static void cxl_cper_handle_prot_err(struct acpi_hest_generic_data *gdata)
>
> if (cxl_cper_handle_prot_err_info(gdata, &wd.p_err))
> return;
> +
> + guard(spinlock_irqsave)(&cxl_cper_work_lock);
> +
> + if (!cxl_cper_work)
> + return;
> +
> + wd.event_type = CXL_CPER_EVENT_PROT_ERR;
> +
> + if (!kfifo_put(&cxl_cper_fifo, wd)) {
> + pr_err_ratelimited("CXL CPER kfifo overflow\n");
> + return;
> + }
> +
> + schedule_work(cxl_cper_work);
> }
>
> int cxl_cper_register_work(struct work_struct *work)
> diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c
> index 0df09bd79408..ef9438cb1dd6 100644
> --- a/drivers/cxl/core/pci.c
> +++ b/drivers/cxl/core/pci.c
> @@ -686,6 +686,30 @@ void read_cdat_data(struct cxl_port *port)
> }
> EXPORT_SYMBOL_NS_GPL(read_cdat_data, CXL);
>
> +void cxl_trace_prot_err(struct cxl_dev_state *cxlds,
> + struct cxl_cper_prot_err *p_err)
> +{
> + u32 status, fe;
> +
> + if (p_err->severity == CXL_AER_CORRECTABLE) {
> + status = p_err->cxl_ras.cor_status & ~p_err->cxl_ras.cor_mask;
> +
> + trace_cxl_aer_correctable_error(cxlds->cxlmd, status);
> + } else {
> + status = p_err->cxl_ras.uncor_status & ~p_err->cxl_ras.uncor_mask;
> +
> + if (hweight32(status) > 1)
> + fe = BIT(FIELD_GET(CXL_RAS_CAP_CONTROL_FE_MASK,
> + p_err->cxl_ras.cap_control));
> + else
> + fe = status;
> +
> + trace_cxl_aer_uncorrectable_error(cxlds->cxlmd, status, fe,
> + p_err->cxl_ras.header_log);
> + }
> +}
> +EXPORT_SYMBOL_NS_GPL(cxl_trace_prot_err, CXL);
> +
> static void __cxl_handle_cor_ras(struct cxl_dev_state *cxlds,
> void __iomem *ras_base)
> {
> diff --git a/drivers/cxl/cxlpci.h b/drivers/cxl/cxlpci.h
> index 93992a1c8eec..0ba3215786e1 100644
> --- a/drivers/cxl/cxlpci.h
> +++ b/drivers/cxl/cxlpci.h
> @@ -130,4 +130,7 @@ void read_cdat_data(struct cxl_port *port);
> void cxl_cor_error_detected(struct pci_dev *pdev);
> pci_ers_result_t cxl_error_detected(struct pci_dev *pdev,
> pci_channel_state_t state);
> +struct cxl_cper_prot_err;
> +void cxl_trace_prot_err(struct cxl_dev_state *cxlds,
> + struct cxl_cper_prot_err *p_err);
> #endif /* __CXL_PCI_H__ */
> diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c
> index 74876c9835e8..3e3c36983686 100644
> --- a/drivers/cxl/pci.c
> +++ b/drivers/cxl/pci.c
> @@ -1011,12 +1011,42 @@ static void cxl_handle_cper_event(enum cxl_event_type ev_type,
> &uuid_null, &rec->event);
> }
>
> +static void cxl_handle_prot_err(struct cxl_cper_prot_err *p_err)
> +{
> + struct pci_dev *pdev __free(pci_dev_put) = NULL;
> + struct cxl_dev_state *cxlds;
> + unsigned int devfn;
> +
> + devfn = PCI_DEVFN(p_err->device, p_err->function);
> + pdev = pci_get_domain_bus_and_slot(p_err->segment,
> + p_err->bus, devfn);
> + if (!pdev)
> + return;
> +
> + guard(device)(&pdev->dev);
> + if (pdev->driver != &cxl_pci_driver)
> + return;
> +
> + cxlds = pci_get_drvdata(pdev);
> + if (!cxlds)
> + return;
> +
> + if (((u64)p_err->upper_dw << 32 | p_err->lower_dw) != cxlds->serial)
> + pr_warn("CPER-reported device serial number does not match expected value\n");
Given that we are operating on a device, perhaps dev_warn(&pdev->dev, ...) may be better served.
DJ
> +
> + cxl_trace_prot_err(cxlds, p_err);
> +}
> +
> static void cxl_cper_work_fn(struct work_struct *work)
> {
> struct cxl_cper_work_data wd;
>
> - while (cxl_cper_kfifo_get(&wd))
> - cxl_handle_cper_event(wd.event_type, &wd.rec);
> + while (cxl_cper_kfifo_get(&wd)) {
> + if (wd.event_type == CXL_CPER_EVENT_PROT_ERR)
> + cxl_handle_prot_err(&wd.p_err);
> + else
> + cxl_handle_cper_event(wd.event_type, &wd.rec);
> + }
> }
> static DECLARE_WORK(cxl_cper_work, cxl_cper_work_fn);
>
> diff --git a/include/linux/cxl-event.h b/include/linux/cxl-event.h
> index 9c7b69e076a0..5562844df850 100644
> --- a/include/linux/cxl-event.h
> +++ b/include/linux/cxl-event.h
> @@ -122,6 +122,7 @@ struct cxl_event_record_raw {
> } __packed;
>
> enum cxl_event_type {
> + CXL_CPER_EVENT_PROT_ERR,
> CXL_CPER_EVENT_GENERIC,
> CXL_CPER_EVENT_GEN_MEDIA,
> CXL_CPER_EVENT_DRAM,
On 5/22/24 8:08 AM, Smita Koralahalli wrote:
> Refactor computation of cxlds to a common function get_cxl_dev() and reuse
> the function in both cxl_handle_cper_event() and cxl_handle_prot_err().
I think just introduce the function where you open coded it instead of adding code and then deleting them in the same series.
>
> Signed-off-by: Smita Koralahalli <[email protected]>
> ---
> drivers/cxl/pci.c | 52 +++++++++++++++++++++++------------------------
> 1 file changed, 26 insertions(+), 26 deletions(-)
>
> diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c
> index 3e3c36983686..26e65e5b68cb 100644
> --- a/drivers/cxl/pci.c
> +++ b/drivers/cxl/pci.c
> @@ -974,32 +974,43 @@ static struct pci_driver cxl_pci_driver = {
> },
> };
>
> +static struct cxl_dev_state *get_cxl_dev(u16 segment, u8 bus, u8 device,
> + u8 function)
get_cxlds() or get_cxl_device_state() would be better.
DJ
> +{
> + struct pci_dev *pdev __free(pci_dev_put) = NULL;
> + struct cxl_dev_state *cxlds;
> + unsigned int devfn;
> +
> + devfn = PCI_DEVFN(device, function);
> + pdev = pci_get_domain_bus_and_slot(segment, bus, devfn);
> +
> + if (!pdev)
> + return NULL;
> +
> + guard(device)(&pdev->dev);
> + if (pdev->driver != &cxl_pci_driver)
> + return NULL;
> +
> + cxlds = pci_get_drvdata(pdev);
> +
> + return cxlds;
> +}
> +
> #define CXL_EVENT_HDR_FLAGS_REC_SEVERITY GENMASK(1, 0)
> static void cxl_handle_cper_event(enum cxl_event_type ev_type,
> struct cxl_cper_event_rec *rec)
> {
> struct cper_cxl_event_devid *device_id = &rec->hdr.device_id;
> - struct pci_dev *pdev __free(pci_dev_put) = NULL;
> enum cxl_event_log_type log_type;
> struct cxl_dev_state *cxlds;
> - unsigned int devfn;
> u32 hdr_flags;
>
> pr_debug("CPER event %d for device %u:%u:%u.%u\n", ev_type,
> device_id->segment_num, device_id->bus_num,
> device_id->device_num, device_id->func_num);
>
> - devfn = PCI_DEVFN(device_id->device_num, device_id->func_num);
> - pdev = pci_get_domain_bus_and_slot(device_id->segment_num,
> - device_id->bus_num, devfn);
> - if (!pdev)
> - return;
> -
> - guard(device)(&pdev->dev);
> - if (pdev->driver != &cxl_pci_driver)
> - return;
> -
> - cxlds = pci_get_drvdata(pdev);
> + cxlds = get_cxl_dev(device_id->segment_num, device_id->bus_num,
> + device_id->device_num, device_id->func_num);
> if (!cxlds)
> return;
>
> @@ -1013,21 +1024,10 @@ static void cxl_handle_cper_event(enum cxl_event_type ev_type,
>
> static void cxl_handle_prot_err(struct cxl_cper_prot_err *p_err)
> {
> - struct pci_dev *pdev __free(pci_dev_put) = NULL;
> struct cxl_dev_state *cxlds;
> - unsigned int devfn;
>
> - devfn = PCI_DEVFN(p_err->device, p_err->function);
> - pdev = pci_get_domain_bus_and_slot(p_err->segment,
> - p_err->bus, devfn);
> - if (!pdev)
> - return;
> -
> - guard(device)(&pdev->dev);
> - if (pdev->driver != &cxl_pci_driver)
> - return;
> -
> - cxlds = pci_get_drvdata(pdev);
> + cxlds = get_cxl_dev(p_err->segment, p_err->bus,
> + p_err->device, p_err->function);
> if (!cxlds)
> return;
>
On Wed, May 22, 2024 at 03:08:36PM +0000, Smita Koralahalli wrote:
> In preparation to add tracepoint support, move protocol error UUID
> definition to a common location and make CXL RAS capability struct
> global for use across different modules.
Reviewed-by: Alison Schofield <[email protected]>
>
> Signed-off-by: Smita Koralahalli <[email protected]>
> ---
> drivers/firmware/efi/cper_cxl.c | 11 -----------
> drivers/firmware/efi/cper_cxl.h | 7 ++-----
> include/linux/cper.h | 4 ++++
> include/linux/cxl-event.h | 11 +++++++++++
> 4 files changed, 17 insertions(+), 16 deletions(-)
>
snip to end
On Wed, May 22, 2024 at 03:08:38PM +0000, Smita Koralahalli wrote:
> When PCIe AER is in FW-First, OS should process CXL Protocol errors from
> CPER records.
>
> Reuse the existing work queue cxl_cper_work registered with GHES to notify
> the CXL subsystem on a Protocol error.
>
> The defined trace events cxl_aer_uncorrectable_error and
> cxl_aer_correctable_error currently trace native CXL AER errors. Reuse
> them to trace FW-First Protocol Errors.
Will the trace log differentiate between errors reported in FW-First
versus Native mode? Wondering if that bit of info needs to be logged
or is discoverable elsewhere.
Otherwise, LGTM,
Reviewed-by: Alison Schofield <[email protected]>
>
> Signed-off-by: Smita Koralahalli <[email protected]>
> ---
> drivers/acpi/apei/ghes.c | 14 ++++++++++++++
> drivers/cxl/core/pci.c | 24 ++++++++++++++++++++++++
> drivers/cxl/cxlpci.h | 3 +++
> drivers/cxl/pci.c | 34 ++++++++++++++++++++++++++++++++--
> include/linux/cxl-event.h | 1 +
> 5 files changed, 74 insertions(+), 2 deletions(-)
snip
On Wed, May 22, 2024 at 03:08:39PM +0000, Smita Koralahalli wrote:
> Refactor computation of cxlds to a common function get_cxl_dev() and reuse
> the function in both cxl_handle_cper_event() and cxl_handle_prot_err().
>
> Signed-off-by: Smita Koralahalli <[email protected]>
> ---
> drivers/cxl/pci.c | 52 +++++++++++++++++++++++------------------------
> 1 file changed, 26 insertions(+), 26 deletions(-)
>
> diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c
> index 3e3c36983686..26e65e5b68cb 100644
> --- a/drivers/cxl/pci.c
> +++ b/drivers/cxl/pci.c
> @@ -974,32 +974,43 @@ static struct pci_driver cxl_pci_driver = {
> },
> };
>
> +static struct cxl_dev_state *get_cxl_dev(u16 segment, u8 bus, u8 device,
> + u8 function)
> +{
> + struct pci_dev *pdev __free(pci_dev_put) = NULL;
> + struct cxl_dev_state *cxlds;
> + unsigned int devfn;
> +
> + devfn = PCI_DEVFN(device, function);
> + pdev = pci_get_domain_bus_and_slot(segment, bus, devfn);
> +
> + if (!pdev)
> + return NULL;
> +
> + guard(device)(&pdev->dev);
> + if (pdev->driver != &cxl_pci_driver)
> + return NULL;
> +
> + cxlds = pci_get_drvdata(pdev);
> +
> + return cxlds;
This can be:
return pci_get_drvdata(pdev);
> +}
snip
On Wed, May 22, 2024 at 11:05:49AM -0700, Dave Jiang wrote:
>
>
> On 5/22/24 8:08 AM, Smita Koralahalli wrote:
> > When PCIe AER is in FW-First, OS should process CXL Protocol errors from
> > CPER records.
> >
> > Reuse the existing work queue cxl_cper_work registered with GHES to notify
> > the CXL subsystem on a Protocol error.
> >
> > The defined trace events cxl_aer_uncorrectable_error and
> > cxl_aer_correctable_error currently trace native CXL AER errors. Reuse
> > them to trace FW-First Protocol Errors.
> >
> > Signed-off-by: Smita Koralahalli <[email protected]>
> > ---
> > drivers/acpi/apei/ghes.c | 14 ++++++++++++++
> > drivers/cxl/core/pci.c | 24 ++++++++++++++++++++++++
> > drivers/cxl/cxlpci.h | 3 +++
> > drivers/cxl/pci.c | 34 ++++++++++++++++++++++++++++++++--
> > include/linux/cxl-event.h | 1 +
> > 5 files changed, 74 insertions(+), 2 deletions(-)
> >
> > diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
> > index 1a58032770ee..a31bd91e9475 100644
> > --- a/drivers/acpi/apei/ghes.c
> > +++ b/drivers/acpi/apei/ghes.c
> > @@ -723,6 +723,20 @@ static void cxl_cper_handle_prot_err(struct acpi_hest_generic_data *gdata)
> >
> > if (cxl_cper_handle_prot_err_info(gdata, &wd.p_err))
> > return;
> > +
> > + guard(spinlock_irqsave)(&cxl_cper_work_lock);
> > +
> > + if (!cxl_cper_work)
> > + return;
> > +
> > + wd.event_type = CXL_CPER_EVENT_PROT_ERR;
> > +
> > + if (!kfifo_put(&cxl_cper_fifo, wd)) {
> > + pr_err_ratelimited("CXL CPER kfifo overflow\n");
> > + return;
> > + }
> > +
> > + schedule_work(cxl_cper_work);
> > }
> >
> > int cxl_cper_register_work(struct work_struct *work)
> > diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c
> > index 0df09bd79408..ef9438cb1dd6 100644
> > --- a/drivers/cxl/core/pci.c
> > +++ b/drivers/cxl/core/pci.c
> > @@ -686,6 +686,30 @@ void read_cdat_data(struct cxl_port *port)
> > }
> > EXPORT_SYMBOL_NS_GPL(read_cdat_data, CXL);
> >
> > +void cxl_trace_prot_err(struct cxl_dev_state *cxlds,
> > + struct cxl_cper_prot_err *p_err)
> > +{
> > + u32 status, fe;
> > +
> > + if (p_err->severity == CXL_AER_CORRECTABLE) {
> > + status = p_err->cxl_ras.cor_status & ~p_err->cxl_ras.cor_mask;
> > +
> > + trace_cxl_aer_correctable_error(cxlds->cxlmd, status);
> > + } else {
> > + status = p_err->cxl_ras.uncor_status & ~p_err->cxl_ras.uncor_mask;
> > +
> > + if (hweight32(status) > 1)
> > + fe = BIT(FIELD_GET(CXL_RAS_CAP_CONTROL_FE_MASK,
> > + p_err->cxl_ras.cap_control));
> > + else
> > + fe = status;
> > +
> > + trace_cxl_aer_uncorrectable_error(cxlds->cxlmd, status, fe,
> > + p_err->cxl_ras.header_log);
> > + }
> > +}
> > +EXPORT_SYMBOL_NS_GPL(cxl_trace_prot_err, CXL);
> > +
> > static void __cxl_handle_cor_ras(struct cxl_dev_state *cxlds,
> > void __iomem *ras_base)
> > {
> > diff --git a/drivers/cxl/cxlpci.h b/drivers/cxl/cxlpci.h
> > index 93992a1c8eec..0ba3215786e1 100644
> > --- a/drivers/cxl/cxlpci.h
> > +++ b/drivers/cxl/cxlpci.h
> > @@ -130,4 +130,7 @@ void read_cdat_data(struct cxl_port *port);
> > void cxl_cor_error_detected(struct pci_dev *pdev);
> > pci_ers_result_t cxl_error_detected(struct pci_dev *pdev,
> > pci_channel_state_t state);
> > +struct cxl_cper_prot_err;
> > +void cxl_trace_prot_err(struct cxl_dev_state *cxlds,
> > + struct cxl_cper_prot_err *p_err);
> > #endif /* __CXL_PCI_H__ */
> > diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c
> > index 74876c9835e8..3e3c36983686 100644
> > --- a/drivers/cxl/pci.c
> > +++ b/drivers/cxl/pci.c
> > @@ -1011,12 +1011,42 @@ static void cxl_handle_cper_event(enum cxl_event_type ev_type,
> > &uuid_null, &rec->event);
> > }
> >
> > +static void cxl_handle_prot_err(struct cxl_cper_prot_err *p_err)
> > +{
> > + struct pci_dev *pdev __free(pci_dev_put) = NULL;
> > + struct cxl_dev_state *cxlds;
> > + unsigned int devfn;
> > +
> > + devfn = PCI_DEVFN(p_err->device, p_err->function);
> > + pdev = pci_get_domain_bus_and_slot(p_err->segment,
> > + p_err->bus, devfn);
> > + if (!pdev)
> > + return;
> > +
> > + guard(device)(&pdev->dev);
> > + if (pdev->driver != &cxl_pci_driver)
> > + return;
> > +
> > + cxlds = pci_get_drvdata(pdev);
> > + if (!cxlds)
> > + return;
> > +
> > + if (((u64)p_err->upper_dw << 32 | p_err->lower_dw) != cxlds->serial)
> > + pr_warn("CPER-reported device serial number does not match expected value\n");
>
> Given that we are operating on a device, perhaps dev_warn(&pdev->dev, ...) may be better served.
>
> DJ
Good point. Providing the dev lets the user look up the serial number,
meaning this message doesn't need to include an 'expected' but not found
value.
-- Alison
> > +
> > + cxl_trace_prot_err(cxlds, p_err);
> > +}
> > +
> > static void cxl_cper_work_fn(struct work_struct *work)
> > {
> > struct cxl_cper_work_data wd;
> >
> > - while (cxl_cper_kfifo_get(&wd))
> > - cxl_handle_cper_event(wd.event_type, &wd.rec);
> > + while (cxl_cper_kfifo_get(&wd)) {
> > + if (wd.event_type == CXL_CPER_EVENT_PROT_ERR)
> > + cxl_handle_prot_err(&wd.p_err);
> > + else
> > + cxl_handle_cper_event(wd.event_type, &wd.rec);
> > + }
> > }
> > static DECLARE_WORK(cxl_cper_work, cxl_cper_work_fn);
> >
> > diff --git a/include/linux/cxl-event.h b/include/linux/cxl-event.h
> > index 9c7b69e076a0..5562844df850 100644
> > --- a/include/linux/cxl-event.h
> > +++ b/include/linux/cxl-event.h
> > @@ -122,6 +122,7 @@ struct cxl_event_record_raw {
> > } __packed;
> >
> > enum cxl_event_type {
> > + CXL_CPER_EVENT_PROT_ERR,
> > CXL_CPER_EVENT_GENERIC,
> > CXL_CPER_EVENT_GEN_MEDIA,
> > CXL_CPER_EVENT_DRAM,
On 5/22/2024 11:05 AM, Dave Jiang wrote:
>
>
> On 5/22/24 8:08 AM, Smita Koralahalli wrote:
>> When PCIe AER is in FW-First, OS should process CXL Protocol errors from
>> CPER records.
>>
>> Reuse the existing work queue cxl_cper_work registered with GHES to notify
>> the CXL subsystem on a Protocol error.
>>
>> The defined trace events cxl_aer_uncorrectable_error and
>> cxl_aer_correctable_error currently trace native CXL AER errors. Reuse
>> them to trace FW-First Protocol Errors.
>>
>> Signed-off-by: Smita Koralahalli <[email protected]>
>> ---
>> drivers/acpi/apei/ghes.c | 14 ++++++++++++++
>> drivers/cxl/core/pci.c | 24 ++++++++++++++++++++++++
>> drivers/cxl/cxlpci.h | 3 +++
>> drivers/cxl/pci.c | 34 ++++++++++++++++++++++++++++++++--
>> include/linux/cxl-event.h | 1 +
>> 5 files changed, 74 insertions(+), 2 deletions(-)
>>
>> diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
>> index 1a58032770ee..a31bd91e9475 100644
>> --- a/drivers/acpi/apei/ghes.c
>> +++ b/drivers/acpi/apei/ghes.c
>> @@ -723,6 +723,20 @@ static void cxl_cper_handle_prot_err(struct acpi_hest_generic_data *gdata)
>>
>> if (cxl_cper_handle_prot_err_info(gdata, &wd.p_err))
>> return;
>> +
>> + guard(spinlock_irqsave)(&cxl_cper_work_lock);
>> +
>> + if (!cxl_cper_work)
>> + return;
>> +
>> + wd.event_type = CXL_CPER_EVENT_PROT_ERR;
>> +
>> + if (!kfifo_put(&cxl_cper_fifo, wd)) {
>> + pr_err_ratelimited("CXL CPER kfifo overflow\n");
>> + return;
>> + }
>> +
>> + schedule_work(cxl_cper_work);
>> }
>>
>> int cxl_cper_register_work(struct work_struct *work)
>> diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c
>> index 0df09bd79408..ef9438cb1dd6 100644
>> --- a/drivers/cxl/core/pci.c
>> +++ b/drivers/cxl/core/pci.c
>> @@ -686,6 +686,30 @@ void read_cdat_data(struct cxl_port *port)
>> }
>> EXPORT_SYMBOL_NS_GPL(read_cdat_data, CXL);
>>
>> +void cxl_trace_prot_err(struct cxl_dev_state *cxlds,
>> + struct cxl_cper_prot_err *p_err)
>> +{
>> + u32 status, fe;
>> +
>> + if (p_err->severity == CXL_AER_CORRECTABLE) {
>> + status = p_err->cxl_ras.cor_status & ~p_err->cxl_ras.cor_mask;
>> +
>> + trace_cxl_aer_correctable_error(cxlds->cxlmd, status);
>> + } else {
>> + status = p_err->cxl_ras.uncor_status & ~p_err->cxl_ras.uncor_mask;
>> +
>> + if (hweight32(status) > 1)
>> + fe = BIT(FIELD_GET(CXL_RAS_CAP_CONTROL_FE_MASK,
>> + p_err->cxl_ras.cap_control));
>> + else
>> + fe = status;
>> +
>> + trace_cxl_aer_uncorrectable_error(cxlds->cxlmd, status, fe,
>> + p_err->cxl_ras.header_log);
>> + }
>> +}
>> +EXPORT_SYMBOL_NS_GPL(cxl_trace_prot_err, CXL);
>> +
>> static void __cxl_handle_cor_ras(struct cxl_dev_state *cxlds,
>> void __iomem *ras_base)
>> {
>> diff --git a/drivers/cxl/cxlpci.h b/drivers/cxl/cxlpci.h
>> index 93992a1c8eec..0ba3215786e1 100644
>> --- a/drivers/cxl/cxlpci.h
>> +++ b/drivers/cxl/cxlpci.h
>> @@ -130,4 +130,7 @@ void read_cdat_data(struct cxl_port *port);
>> void cxl_cor_error_detected(struct pci_dev *pdev);
>> pci_ers_result_t cxl_error_detected(struct pci_dev *pdev,
>> pci_channel_state_t state);
>> +struct cxl_cper_prot_err;
>> +void cxl_trace_prot_err(struct cxl_dev_state *cxlds,
>> + struct cxl_cper_prot_err *p_err);
>> #endif /* __CXL_PCI_H__ */
>> diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c
>> index 74876c9835e8..3e3c36983686 100644
>> --- a/drivers/cxl/pci.c
>> +++ b/drivers/cxl/pci.c
>> @@ -1011,12 +1011,42 @@ static void cxl_handle_cper_event(enum cxl_event_type ev_type,
>> &uuid_null, &rec->event);
>> }
>>
>> +static void cxl_handle_prot_err(struct cxl_cper_prot_err *p_err)
>> +{
>> + struct pci_dev *pdev __free(pci_dev_put) = NULL;
>> + struct cxl_dev_state *cxlds;
>> + unsigned int devfn;
>> +
>> + devfn = PCI_DEVFN(p_err->device, p_err->function);
>> + pdev = pci_get_domain_bus_and_slot(p_err->segment,
>> + p_err->bus, devfn);
>> + if (!pdev)
>> + return;
>> +
>> + guard(device)(&pdev->dev);
>> + if (pdev->driver != &cxl_pci_driver)
>> + return;
>> +
>> + cxlds = pci_get_drvdata(pdev);
>> + if (!cxlds)
>> + return;
>> +
>> + if (((u64)p_err->upper_dw << 32 | p_err->lower_dw) != cxlds->serial)
>> + pr_warn("CPER-reported device serial number does not match expected value\n");
>
> Given that we are operating on a device, perhaps dev_warn(&pdev->dev, ...) may be better served.
Will fix.
Thanks,
Smita
[snip]
On 5/22/2024 12:42 PM, Dave Jiang wrote:
>
>
> On 5/22/24 8:08 AM, Smita Koralahalli wrote:
>> Refactor computation of cxlds to a common function get_cxl_dev() and reuse
>> the function in both cxl_handle_cper_event() and cxl_handle_prot_err().
>
> I think just introduce the function where you open coded it instead of adding code and then deleting them in the same series.
Okay refactor first, then add trace support. Will change.
>>
>> Signed-off-by: Smita Koralahalli <[email protected]>
>> ---
>> drivers/cxl/pci.c | 52 +++++++++++++++++++++++------------------------
>> 1 file changed, 26 insertions(+), 26 deletions(-)
>>
>> diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c
>> index 3e3c36983686..26e65e5b68cb 100644
>> --- a/drivers/cxl/pci.c
>> +++ b/drivers/cxl/pci.c
>> @@ -974,32 +974,43 @@ static struct pci_driver cxl_pci_driver = {
>> },
>> };
>>
>> +static struct cxl_dev_state *get_cxl_dev(u16 segment, u8 bus, u8 device,
>> + u8 function)
>
> get_cxlds() or get_cxl_device_state() would be better.
Okay will change
>
> DJ
>
Thanks
Smita
>> +{
>> + struct pci_dev *pdev __free(pci_dev_put) = NULL;
>> + struct cxl_dev_state *cxlds;
>> + unsigned int devfn;
>> +
>> + devfn = PCI_DEVFN(device, function);
>> + pdev = pci_get_domain_bus_and_slot(segment, bus, devfn);
>> +
>> + if (!pdev)
>> + return NULL;
>> +
>> + guard(device)(&pdev->dev);
>> + if (pdev->driver != &cxl_pci_driver)
>> + return NULL;
>> +
>> + cxlds = pci_get_drvdata(pdev);
>> +
>> + return cxlds;
>> +}
>> +
>> #define CXL_EVENT_HDR_FLAGS_REC_SEVERITY GENMASK(1, 0)
>> static void cxl_handle_cper_event(enum cxl_event_type ev_type,
>> struct cxl_cper_event_rec *rec)
>> {
>> struct cper_cxl_event_devid *device_id = &rec->hdr.device_id;
>> - struct pci_dev *pdev __free(pci_dev_put) = NULL;
>> enum cxl_event_log_type log_type;
>> struct cxl_dev_state *cxlds;
>> - unsigned int devfn;
>> u32 hdr_flags;
>>
>> pr_debug("CPER event %d for device %u:%u:%u.%u\n", ev_type,
>> device_id->segment_num, device_id->bus_num,
>> device_id->device_num, device_id->func_num);
>>
>> - devfn = PCI_DEVFN(device_id->device_num, device_id->func_num);
>> - pdev = pci_get_domain_bus_and_slot(device_id->segment_num,
>> - device_id->bus_num, devfn);
>> - if (!pdev)
>> - return;
>> -
>> - guard(device)(&pdev->dev);
>> - if (pdev->driver != &cxl_pci_driver)
>> - return;
>> -
>> - cxlds = pci_get_drvdata(pdev);
>> + cxlds = get_cxl_dev(device_id->segment_num, device_id->bus_num,
>> + device_id->device_num, device_id->func_num);
>> if (!cxlds)
>> return;
>>
>> @@ -1013,21 +1024,10 @@ static void cxl_handle_cper_event(enum cxl_event_type ev_type,
>>
>> static void cxl_handle_prot_err(struct cxl_cper_prot_err *p_err)
>> {
>> - struct pci_dev *pdev __free(pci_dev_put) = NULL;
>> struct cxl_dev_state *cxlds;
>> - unsigned int devfn;
>>
>> - devfn = PCI_DEVFN(p_err->device, p_err->function);
>> - pdev = pci_get_domain_bus_and_slot(p_err->segment,
>> - p_err->bus, devfn);
>> - if (!pdev)
>> - return;
>> -
>> - guard(device)(&pdev->dev);
>> - if (pdev->driver != &cxl_pci_driver)
>> - return;
>> -
>> - cxlds = pci_get_drvdata(pdev);
>> + cxlds = get_cxl_dev(p_err->segment, p_err->bus,
>> + p_err->device, p_err->function);
>> if (!cxlds)
>> return;
>>
On 5/22/2024 5:22 PM, Alison Schofield wrote:
> On Wed, May 22, 2024 at 03:08:38PM +0000, Smita Koralahalli wrote:
>> When PCIe AER is in FW-First, OS should process CXL Protocol errors from
>> CPER records.
>>
>> Reuse the existing work queue cxl_cper_work registered with GHES to notify
>> the CXL subsystem on a Protocol error.
>>
>> The defined trace events cxl_aer_uncorrectable_error and
>> cxl_aer_correctable_error currently trace native CXL AER errors. Reuse
>> them to trace FW-First Protocol Errors.
>
> Will the trace log differentiate between errors reported in FW-First
> versus Native mode? Wondering if that bit of info needs to be logged
> or is discoverable elsewhere.
No, the trace log won't differentiate currently.
But just a side note, FW-First also logs errors in dmesg. I'm not sure
if going forward, we would still continue to log errors in dmesg. But I
feel it might be needed so that we don't miss errors from RCH Downstream
Port or hexdump of unrecognized agent types.
Thanks
Smita
>
> Otherwise, LGTM,
> Reviewed-by: Alison Schofield <[email protected]>
>
>
>>
>> Signed-off-by: Smita Koralahalli <[email protected]>
>> ---
>> drivers/acpi/apei/ghes.c | 14 ++++++++++++++
>> drivers/cxl/core/pci.c | 24 ++++++++++++++++++++++++
>> drivers/cxl/cxlpci.h | 3 +++
>> drivers/cxl/pci.c | 34 ++++++++++++++++++++++++++++++++--
>> include/linux/cxl-event.h | 1 +
>> 5 files changed, 74 insertions(+), 2 deletions(-)
>
> snip
>
>
On Wed, 22 May 2024 15:08:36 +0000
Smita Koralahalli <[email protected]> wrote:
> In preparation to add tracepoint support, move protocol error UUID
> definition to a common location and make CXL RAS capability struct
> global for use across different modules.
>
> Signed-off-by: Smita Koralahalli <[email protected]>
FWIW given already well reviewed.
Reviewed-by: Jonathan Cameron <[email protected]>
On Wed, 22 May 2024 15:08:39 +0000
Smita Koralahalli <[email protected]> wrote:
> Refactor computation of cxlds to a common function get_cxl_dev() and reuse
> the function in both cxl_handle_cper_event() and cxl_handle_prot_err().
>
> Signed-off-by: Smita Koralahalli <[email protected]>
> ---
> drivers/cxl/pci.c | 52 +++++++++++++++++++++++------------------------
> 1 file changed, 26 insertions(+), 26 deletions(-)
>
> diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c
> index 3e3c36983686..26e65e5b68cb 100644
> --- a/drivers/cxl/pci.c
> +++ b/drivers/cxl/pci.c
> @@ -974,32 +974,43 @@ static struct pci_driver cxl_pci_driver = {
> },
> };
>
> +static struct cxl_dev_state *get_cxl_dev(u16 segment, u8 bus, u8 device,
> + u8 function)
I'd not expect a function with this name to return anything other
than a struct device *
get_cxl_devstate() maybe?
> +{
> + struct pci_dev *pdev __free(pci_dev_put) = NULL;
> + struct cxl_dev_state *cxlds;
> + unsigned int devfn;
> +
> + devfn = PCI_DEVFN(device, function);
Might as well do
unsigned int devfn = PCI_DEVFN(device, function);
> + pdev = pci_get_domain_bus_and_slot(segment, bus, devfn);
struct pci_dev *pdev __free(pci_dev_put) =
pci_get_domain_bus_and_slot(segment, bus, devfn);
see fwctl thread for a reference to Linus expressing a preference for
keeping construct and destructor definitions together even when
that means relaxing c conventions that we are so used to!
Obviously this is copied from below, but might as well tidy it up
whilst here.
However, do the devfn change above and it is in the definitions block...
> +
> + if (!pdev)
> + return NULL;
> +
> + guard(device)(&pdev->dev);
> + if (pdev->driver != &cxl_pci_driver)
> + return NULL;
> +
> + cxlds = pci_get_drvdata(pdev);
> +
> + return cxlds;
return pci_get_drvdata(pdev);
I think the function is small enough having cxlds just so it's obvious
what is being returned is unnecessary.
> +}
> +
> #define CXL_EVENT_HDR_FLAGS_REC_SEVERITY GENMASK(1, 0)
> static void cxl_handle_cper_event(enum cxl_event_type ev_type,
> struct cxl_cper_event_rec *rec)
> {
> struct cper_cxl_event_devid *device_id = &rec->hdr.device_id;
> - struct pci_dev *pdev __free(pci_dev_put) = NULL;
> enum cxl_event_log_type log_type;
> struct cxl_dev_state *cxlds;
> - unsigned int devfn;
> u32 hdr_flags;
>
> pr_debug("CPER event %d for device %u:%u:%u.%u\n", ev_type,
> device_id->segment_num, device_id->bus_num,
> device_id->device_num, device_id->func_num);
>
> - devfn = PCI_DEVFN(device_id->device_num, device_id->func_num);
> - pdev = pci_get_domain_bus_and_slot(device_id->segment_num,
> - device_id->bus_num, devfn);
> - if (!pdev)
> - return;
> -
> - guard(device)(&pdev->dev);
> - if (pdev->driver != &cxl_pci_driver)
> - return;
> -
> - cxlds = pci_get_drvdata(pdev);
> + cxlds = get_cxl_dev(device_id->segment_num, device_id->bus_num,
> + device_id->device_num, device_id->func_num);
> if (!cxlds)
> return;
>
> @@ -1013,21 +1024,10 @@ static void cxl_handle_cper_event(enum cxl_event_type ev_type,
>
> static void cxl_handle_prot_err(struct cxl_cper_prot_err *p_err)
> {
> - struct pci_dev *pdev __free(pci_dev_put) = NULL;
> struct cxl_dev_state *cxlds;
> - unsigned int devfn;
>
> - devfn = PCI_DEVFN(p_err->device, p_err->function);
> - pdev = pci_get_domain_bus_and_slot(p_err->segment,
> - p_err->bus, devfn);
> - if (!pdev)
> - return;
> -
> - guard(device)(&pdev->dev);
> - if (pdev->driver != &cxl_pci_driver)
> - return;
> -
> - cxlds = pci_get_drvdata(pdev);
> + cxlds = get_cxl_dev(p_err->segment, p_err->bus,
> + p_err->device, p_err->function);
> if (!cxlds)
> return;
>
On 6/7/2024 8:14 AM, Jonathan Cameron wrote:
> On Wed, 22 May 2024 15:08:36 +0000
> Smita Koralahalli <[email protected]> wrote:
>
>> In preparation to add tracepoint support, move protocol error UUID
>> definition to a common location and make CXL RAS capability struct
>> global for use across different modules.
>>
>> Signed-off-by: Smita Koralahalli <[email protected]>
> FWIW given already well reviewed.
> Reviewed-by: Jonathan Cameron <[email protected]>
>
Sorry I missed your reviewed by tag. Thanks for reviewing my patches!
Thanks
Smita
Smita Koralahalli wrote:
> When PCIe AER is in FW-First, OS should process CXL Protocol errors from
> CPER records.
>
> Reuse the existing work queue cxl_cper_work registered with GHES to notify
> the CXL subsystem on a Protocol error.
>
> The defined trace events cxl_aer_uncorrectable_error and
> cxl_aer_correctable_error currently trace native CXL AER errors. Reuse
> them to trace FW-First Protocol Errors.
>
> Signed-off-by: Smita Koralahalli <[email protected]>
> ---
> drivers/acpi/apei/ghes.c | 14 ++++++++++++++
> drivers/cxl/core/pci.c | 24 ++++++++++++++++++++++++
> drivers/cxl/cxlpci.h | 3 +++
> drivers/cxl/pci.c | 34 ++++++++++++++++++++++++++++++++--
> include/linux/cxl-event.h | 1 +
> 5 files changed, 74 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
> index 1a58032770ee..a31bd91e9475 100644
> --- a/drivers/acpi/apei/ghes.c
> +++ b/drivers/acpi/apei/ghes.c
> @@ -723,6 +723,20 @@ static void cxl_cper_handle_prot_err(struct acpi_hest_generic_data *gdata)
>
> if (cxl_cper_handle_prot_err_info(gdata, &wd.p_err))
> return;
> +
> + guard(spinlock_irqsave)(&cxl_cper_work_lock);
> +
> + if (!cxl_cper_work)
> + return;
> +
> + wd.event_type = CXL_CPER_EVENT_PROT_ERR;
> +
> + if (!kfifo_put(&cxl_cper_fifo, wd)) {
> + pr_err_ratelimited("CXL CPER kfifo overflow\n");
> + return;
> + }
> +
> + schedule_work(cxl_cper_work);
This seems wrong to unconditionally schedule the cxl_pci driver to look
at potentially "non-device" errors. With Terry's upcoming CXL switch
port error handling there will be a native path for those errors, but
until that arrives, I see no point in this code trying to convey
root/switch port errors to the endpoint driver.
> }
>
> int cxl_cper_register_work(struct work_struct *work)
> diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c
> index 0df09bd79408..ef9438cb1dd6 100644
> --- a/drivers/cxl/core/pci.c
> +++ b/drivers/cxl/core/pci.c
> @@ -686,6 +686,30 @@ void read_cdat_data(struct cxl_port *port)
> }
> EXPORT_SYMBOL_NS_GPL(read_cdat_data, CXL);
>
> +void cxl_trace_prot_err(struct cxl_dev_state *cxlds,
> + struct cxl_cper_prot_err *p_err)
> +{
> + u32 status, fe;
> +
> + if (p_err->severity == CXL_AER_CORRECTABLE) {
> + status = p_err->cxl_ras.cor_status & ~p_err->cxl_ras.cor_mask;
> +
> + trace_cxl_aer_correctable_error(cxlds->cxlmd, status);
> + } else {
> + status = p_err->cxl_ras.uncor_status & ~p_err->cxl_ras.uncor_mask;
> +
> + if (hweight32(status) > 1)
> + fe = BIT(FIELD_GET(CXL_RAS_CAP_CONTROL_FE_MASK,
> + p_err->cxl_ras.cap_control));
> + else
> + fe = status;
> +
> + trace_cxl_aer_uncorrectable_error(cxlds->cxlmd, status, fe,
> + p_err->cxl_ras.header_log);
> + }
> +}
> +EXPORT_SYMBOL_NS_GPL(cxl_trace_prot_err, CXL);
> +
> static void __cxl_handle_cor_ras(struct cxl_dev_state *cxlds,
> void __iomem *ras_base)
> {
> diff --git a/drivers/cxl/cxlpci.h b/drivers/cxl/cxlpci.h
> index 93992a1c8eec..0ba3215786e1 100644
> --- a/drivers/cxl/cxlpci.h
> +++ b/drivers/cxl/cxlpci.h
> @@ -130,4 +130,7 @@ void read_cdat_data(struct cxl_port *port);
> void cxl_cor_error_detected(struct pci_dev *pdev);
> pci_ers_result_t cxl_error_detected(struct pci_dev *pdev,
> pci_channel_state_t state);
> +struct cxl_cper_prot_err;
> +void cxl_trace_prot_err(struct cxl_dev_state *cxlds,
> + struct cxl_cper_prot_err *p_err);
> #endif /* __CXL_PCI_H__ */
> diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c
> index 74876c9835e8..3e3c36983686 100644
> --- a/drivers/cxl/pci.c
> +++ b/drivers/cxl/pci.c
> @@ -1011,12 +1011,42 @@ static void cxl_handle_cper_event(enum cxl_event_type ev_type,
> &uuid_null, &rec->event);
> }
>
> +static void cxl_handle_prot_err(struct cxl_cper_prot_err *p_err)
Can we call this variable @rec instead of @p_err? The data being passed
is CPER data which is a "record" structure.
> +{
> + struct pci_dev *pdev __free(pci_dev_put) = NULL;
> + struct cxl_dev_state *cxlds;
> + unsigned int devfn;
> +
> + devfn = PCI_DEVFN(p_err->device, p_err->function);
> + pdev = pci_get_domain_bus_and_slot(p_err->segment,
> + p_err->bus, devfn);
> + if (!pdev)
> + return;
> +
> + guard(device)(&pdev->dev);
> + if (pdev->driver != &cxl_pci_driver)
> + return;
> +
> + cxlds = pci_get_drvdata(pdev);
> + if (!cxlds)
> + return;
> +
> + if (((u64)p_err->upper_dw << 32 | p_err->lower_dw) != cxlds->serial)
> + pr_warn("CPER-reported device serial number does not match expected value\n");
Not much the end user can do about this warning, I would say skip this
message, or make it a pci_dbg() because matching by BDF should be
sufficient.
> +
> + cxl_trace_prot_err(cxlds, p_err);
> +}
> +
> static void cxl_cper_work_fn(struct work_struct *work)
> {
> struct cxl_cper_work_data wd;
>
> - while (cxl_cper_kfifo_get(&wd))
> - cxl_handle_cper_event(wd.event_type, &wd.rec);
> + while (cxl_cper_kfifo_get(&wd)) {
> + if (wd.event_type == CXL_CPER_EVENT_PROT_ERR)
> + cxl_handle_prot_err(&wd.p_err);
> + else
> + cxl_handle_cper_event(wd.event_type, &wd.rec);
> + }
> }
> static DECLARE_WORK(cxl_cper_work, cxl_cper_work_fn);
>
> diff --git a/include/linux/cxl-event.h b/include/linux/cxl-event.h
> index 9c7b69e076a0..5562844df850 100644
> --- a/include/linux/cxl-event.h
> +++ b/include/linux/cxl-event.h
> @@ -122,6 +122,7 @@ struct cxl_event_record_raw {
> } __packed;
>
> enum cxl_event_type {
> + CXL_CPER_EVENT_PROT_ERR,
> CXL_CPER_EVENT_GENERIC,
> CXL_CPER_EVENT_GEN_MEDIA,
> CXL_CPER_EVENT_DRAM,
> --
> 2.17.1
>
Hi Dan,
On 6/11/2024 5:07 PM, Dan Williams wrote:
> Smita Koralahalli wrote:
>> When PCIe AER is in FW-First, OS should process CXL Protocol errors from
>> CPER records.
>>
>> Reuse the existing work queue cxl_cper_work registered with GHES to notify
>> the CXL subsystem on a Protocol error.
>>
>> The defined trace events cxl_aer_uncorrectable_error and
>> cxl_aer_correctable_error currently trace native CXL AER errors. Reuse
>> them to trace FW-First Protocol Errors.
>>
>> Signed-off-by: Smita Koralahalli <[email protected]>
>> ---
>> drivers/acpi/apei/ghes.c | 14 ++++++++++++++
>> drivers/cxl/core/pci.c | 24 ++++++++++++++++++++++++
>> drivers/cxl/cxlpci.h | 3 +++
>> drivers/cxl/pci.c | 34 ++++++++++++++++++++++++++++++++--
>> include/linux/cxl-event.h | 1 +
>> 5 files changed, 74 insertions(+), 2 deletions(-)
>>
>> diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
>> index 1a58032770ee..a31bd91e9475 100644
>> --- a/drivers/acpi/apei/ghes.c
>> +++ b/drivers/acpi/apei/ghes.c
>> @@ -723,6 +723,20 @@ static void cxl_cper_handle_prot_err(struct acpi_hest_generic_data *gdata)
>>
>> if (cxl_cper_handle_prot_err_info(gdata, &wd.p_err))
>> return;
>> +
>> + guard(spinlock_irqsave)(&cxl_cper_work_lock);
>> +
>> + if (!cxl_cper_work)
>> + return;
>> +
>> + wd.event_type = CXL_CPER_EVENT_PROT_ERR;
>> +
>> + if (!kfifo_put(&cxl_cper_fifo, wd)) {
>> + pr_err_ratelimited("CXL CPER kfifo overflow\n");
>> + return;
>> + }
>> +
>> + schedule_work(cxl_cper_work);
>
> This seems wrong to unconditionally schedule the cxl_pci driver to look
> at potentially "non-device" errors. With Terry's upcoming CXL switch
> port error handling there will be a native path for those errors, but
> until that arrives, I see no point in this code trying to convey
> root/switch port errors to the endpoint driver.
I see okay. What are your recommendations on this? Just confine it to
CXL RCD, CXL SLD and CXL LD? And then extend it to ports once Terry
sends patches?
Also, I'm not sure about FMLD. Should we just drop it as of now?
>
>> }
>>
>> int cxl_cper_register_work(struct work_struct *work)
>> diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c
>> index 0df09bd79408..ef9438cb1dd6 100644
>> --- a/drivers/cxl/core/pci.c
>> +++ b/drivers/cxl/core/pci.c
>> @@ -686,6 +686,30 @@ void read_cdat_data(struct cxl_port *port)
>> }
>> EXPORT_SYMBOL_NS_GPL(read_cdat_data, CXL);
>>
>> +void cxl_trace_prot_err(struct cxl_dev_state *cxlds,
>> + struct cxl_cper_prot_err *p_err)
>> +{
>> + u32 status, fe;
>> +
>> + if (p_err->severity == CXL_AER_CORRECTABLE) {
>> + status = p_err->cxl_ras.cor_status & ~p_err->cxl_ras.cor_mask;
>> +
>> + trace_cxl_aer_correctable_error(cxlds->cxlmd, status);
>> + } else {
>> + status = p_err->cxl_ras.uncor_status & ~p_err->cxl_ras.uncor_mask;
>> +
>> + if (hweight32(status) > 1)
>> + fe = BIT(FIELD_GET(CXL_RAS_CAP_CONTROL_FE_MASK,
>> + p_err->cxl_ras.cap_control));
>> + else
>> + fe = status;
>> +
>> + trace_cxl_aer_uncorrectable_error(cxlds->cxlmd, status, fe,
>> + p_err->cxl_ras.header_log);
>> + }
>> +}
>> +EXPORT_SYMBOL_NS_GPL(cxl_trace_prot_err, CXL);
>> +
>> static void __cxl_handle_cor_ras(struct cxl_dev_state *cxlds,
>> void __iomem *ras_base)
>> {
>> diff --git a/drivers/cxl/cxlpci.h b/drivers/cxl/cxlpci.h
>> index 93992a1c8eec..0ba3215786e1 100644
>> --- a/drivers/cxl/cxlpci.h
>> +++ b/drivers/cxl/cxlpci.h
>> @@ -130,4 +130,7 @@ void read_cdat_data(struct cxl_port *port);
>> void cxl_cor_error_detected(struct pci_dev *pdev);
>> pci_ers_result_t cxl_error_detected(struct pci_dev *pdev,
>> pci_channel_state_t state);
>> +struct cxl_cper_prot_err;
>> +void cxl_trace_prot_err(struct cxl_dev_state *cxlds,
>> + struct cxl_cper_prot_err *p_err);
>> #endif /* __CXL_PCI_H__ */
>> diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c
>> index 74876c9835e8..3e3c36983686 100644
>> --- a/drivers/cxl/pci.c
>> +++ b/drivers/cxl/pci.c
>> @@ -1011,12 +1011,42 @@ static void cxl_handle_cper_event(enum cxl_event_type ev_type,
>> &uuid_null, &rec->event);
>> }
>>
>> +static void cxl_handle_prot_err(struct cxl_cper_prot_err *p_err)
>
> Can we call this variable @rec instead of @p_err? The data being passed
> is CPER data which is a "record" structure.
Will change.
>
>> +{
>> + struct pci_dev *pdev __free(pci_dev_put) = NULL;
>> + struct cxl_dev_state *cxlds;
>> + unsigned int devfn;
>> +
>> + devfn = PCI_DEVFN(p_err->device, p_err->function);
>> + pdev = pci_get_domain_bus_and_slot(p_err->segment,
>> + p_err->bus, devfn);
>> + if (!pdev)
>> + return;
>> +
>> + guard(device)(&pdev->dev);
>> + if (pdev->driver != &cxl_pci_driver)
>> + return;
>> +
>> + cxlds = pci_get_drvdata(pdev);
>> + if (!cxlds)
>> + return;
>> +
>> + if (((u64)p_err->upper_dw << 32 | p_err->lower_dw) != cxlds->serial)
>> + pr_warn("CPER-reported device serial number does not match expected value\n");
>
> Not much the end user can do about this warning, I would say skip this
> message, or make it a pci_dbg() because matching by BDF should be
> sufficient.
Will skip this message.
Thanks
Smita
>
>> +
>> + cxl_trace_prot_err(cxlds, p_err);
>> +}
>> +
>> static void cxl_cper_work_fn(struct work_struct *work)
>> {
>> struct cxl_cper_work_data wd;
>>
>> - while (cxl_cper_kfifo_get(&wd))
>> - cxl_handle_cper_event(wd.event_type, &wd.rec);
>> + while (cxl_cper_kfifo_get(&wd)) {
>> + if (wd.event_type == CXL_CPER_EVENT_PROT_ERR)
>> + cxl_handle_prot_err(&wd.p_err);
>> + else
>> + cxl_handle_cper_event(wd.event_type, &wd.rec);
>> + }
>> }
>> static DECLARE_WORK(cxl_cper_work, cxl_cper_work_fn);
>>
>> diff --git a/include/linux/cxl-event.h b/include/linux/cxl-event.h
>> index 9c7b69e076a0..5562844df850 100644
>> --- a/include/linux/cxl-event.h
>> +++ b/include/linux/cxl-event.h
>> @@ -122,6 +122,7 @@ struct cxl_event_record_raw {
>> } __packed;
>>
>> enum cxl_event_type {
>> + CXL_CPER_EVENT_PROT_ERR,
>> CXL_CPER_EVENT_GENERIC,
>> CXL_CPER_EVENT_GEN_MEDIA,
>> CXL_CPER_EVENT_DRAM,
>> --
>> 2.17.1
>>
>
>