UEFI v2.10 section N.2.13 defines a CPER record for CXL Protocol errors.
Add GHES support to detect CXL CPER Protocol Error Record and Cache Error
Severity, Device ID, Device Serial number and CXL RAS capability struct in
struct cxl_cper_prot_err. Include this struct as a member of struct
cxl_cper_work_data.
Signed-off-by: Smita Koralahalli <[email protected]>
---
drivers/acpi/apei/ghes.c | 10 +++++
drivers/firmware/efi/cper_cxl.c | 66 +++++++++++++++++++++++++++++++++
include/linux/cxl-event.h | 26 +++++++++++++
3 files changed, 102 insertions(+)
diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index 623cc0cb4a65..1a58032770ee 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -717,6 +717,14 @@ static void cxl_cper_post_event(enum cxl_event_type event_type,
schedule_work(cxl_cper_work);
}
+static void cxl_cper_handle_prot_err(struct acpi_hest_generic_data *gdata)
+{
+ struct cxl_cper_work_data wd;
+
+ if (cxl_cper_handle_prot_err_info(gdata, &wd.p_err))
+ return;
+}
+
int cxl_cper_register_work(struct work_struct *work)
{
if (cxl_cper_work)
@@ -791,6 +799,8 @@ static bool ghes_do_proc(struct ghes *ghes,
struct cxl_cper_event_rec *rec = acpi_hest_get_payload(gdata);
cxl_cper_post_event(CXL_CPER_EVENT_MEM_MODULE, rec);
+ } else if (guid_equal(sec_type, &CPER_SEC_CXL_PROT_ERR)) {
+ cxl_cper_handle_prot_err(gdata);
} else {
void *err = acpi_hest_get_payload(gdata);
diff --git a/drivers/firmware/efi/cper_cxl.c b/drivers/firmware/efi/cper_cxl.c
index 4fd8d783993e..03b9839f3b73 100644
--- a/drivers/firmware/efi/cper_cxl.c
+++ b/drivers/firmware/efi/cper_cxl.c
@@ -8,6 +8,7 @@
*/
#include <linux/cper.h>
+#include <acpi/ghes.h>
#include "cper_cxl.h"
#define PROT_ERR_VALID_AGENT_TYPE BIT_ULL(0)
@@ -44,6 +45,17 @@ enum {
USP, /* CXL Upstream Switch Port */
};
+static enum cxl_aer_err_type cper_severity_cxl_aer(int cper_severity)
+{
+ switch (cper_severity) {
+ case CPER_SEV_RECOVERABLE:
+ case CPER_SEV_FATAL:
+ return CXL_AER_UNCORRECTABLE;
+ default:
+ return CXL_AER_CORRECTABLE;
+ }
+}
+
void cper_print_prot_err(const char *pfx, const struct cper_sec_prot_err *prot_err)
{
if (prot_err->valid_bits & PROT_ERR_VALID_AGENT_TYPE)
@@ -176,3 +188,57 @@ void cper_print_prot_err(const char *pfx, const struct cper_sec_prot_err *prot_e
sizeof(cxl_ras->header_log), 0);
}
}
+
+int cxl_cper_handle_prot_err_info(struct acpi_hest_generic_data *gdata,
+ struct cxl_cper_prot_err *p_err)
+{
+ struct cper_sec_prot_err *prot_err = acpi_hest_get_payload(gdata);
+ u8 *dvsec_start, *cap_start;
+
+ if (!(prot_err->valid_bits & PROT_ERR_VALID_DEVICE_ID)) {
+ pr_err(FW_WARN "No Device ID\n");
+ return -EINVAL;
+ }
+
+ /*
+ * The device ID or agent address is required for CXL RCD, CXL
+ * SLD, CXL LD, CXL Fabric Manager Managed LD, CXL Root Port,
+ * CXL Downstream Switch Port and CXL Upstream Switch Port.
+ */
+ if (prot_err->agent_type <= 0x7 && prot_err->agent_type != RCH_DP) {
+ p_err->segment = prot_err->agent_addr.segment;
+ p_err->bus = prot_err->agent_addr.bus;
+ p_err->device = prot_err->agent_addr.device;
+ p_err->function = prot_err->agent_addr.function;
+ } else {
+ pr_err(FW_WARN "Invalid agent type\n");
+ return -EINVAL;
+ }
+
+ if (!(prot_err->valid_bits & PROT_ERR_VALID_ERROR_LOG)) {
+ pr_err(FW_WARN "Invalid Protocol Error log\n");
+ return -EINVAL;
+ }
+
+ dvsec_start = (u8 *)(prot_err + 1);
+ cap_start = dvsec_start + prot_err->dvsec_len;
+ p_err->cxl_ras = *(struct cxl_ras_capability_regs *)cap_start;
+
+ /*
+ * Set device serial number unconditionally.
+ *
+ * Print a warning message if it is not valid. The device serial
+ * number is required for CXL RCD, CXL SLD, CXL LD and CXL Fabric
+ * Manager Managed LD.
+ */
+ if (!(prot_err->valid_bits & PROT_ERR_VALID_SERIAL_NUMBER) ||
+ prot_err->agent_type > 0x4 || prot_err->agent_type == RCH_DP)
+ pr_warn(FW_WARN "No Device Serial number\n");
+
+ p_err->lower_dw = prot_err->dev_serial_num.lower_dw;
+ p_err->upper_dw = prot_err->dev_serial_num.upper_dw;
+
+ p_err->severity = cper_severity_cxl_aer(gdata->error_severity);
+
+ return 0;
+}
diff --git a/include/linux/cxl-event.h b/include/linux/cxl-event.h
index f11e52ff565a..9c7b69e076a0 100644
--- a/include/linux/cxl-event.h
+++ b/include/linux/cxl-event.h
@@ -165,11 +165,37 @@ struct cxl_ras_capability_regs {
u32 header_log[16];
};
+enum cxl_aer_err_type {
+ CXL_AER_UNCORRECTABLE,
+ CXL_AER_CORRECTABLE,
+};
+
+struct cxl_cper_prot_err {
+ struct cxl_ras_capability_regs cxl_ras;
+
+ /* Device ID */
+ u8 function;
+ u8 device;
+ u8 bus;
+ u16 segment;
+
+ /* Device Serial Number */
+ u32 lower_dw;
+ u32 upper_dw;
+
+ int severity;
+};
+
struct cxl_cper_work_data {
enum cxl_event_type event_type;
struct cxl_cper_event_rec rec;
+ struct cxl_cper_prot_err p_err;
};
+struct acpi_hest_generic_data;
+int cxl_cper_handle_prot_err_info(struct acpi_hest_generic_data *gdata,
+ struct cxl_cper_prot_err *p_err);
+
#ifdef CONFIG_ACPI_APEI_GHES
int cxl_cper_register_work(struct work_struct *work);
int cxl_cper_unregister_work(struct work_struct *work);
--
2.17.1
On 5/22/24 8:08 AM, Smita Koralahalli wrote:
> UEFI v2.10 section N.2.13 defines a CPER record for CXL Protocol errors.
>
> Add GHES support to detect CXL CPER Protocol Error Record and Cache Error
> Severity, Device ID, Device Serial number and CXL RAS capability struct in
> struct cxl_cper_prot_err. Include this struct as a member of struct
> cxl_cper_work_data.
>
> Signed-off-by: Smita Koralahalli <[email protected]>
> ---
> drivers/acpi/apei/ghes.c | 10 +++++
> drivers/firmware/efi/cper_cxl.c | 66 +++++++++++++++++++++++++++++++++
> include/linux/cxl-event.h | 26 +++++++++++++
> 3 files changed, 102 insertions(+)
>
> diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
> index 623cc0cb4a65..1a58032770ee 100644
> --- a/drivers/acpi/apei/ghes.c
> +++ b/drivers/acpi/apei/ghes.c
> @@ -717,6 +717,14 @@ static void cxl_cper_post_event(enum cxl_event_type event_type,
> schedule_work(cxl_cper_work);
> }
>
> +static void cxl_cper_handle_prot_err(struct acpi_hest_generic_data *gdata)
> +{
> + struct cxl_cper_work_data wd;
> +
> + if (cxl_cper_handle_prot_err_info(gdata, &wd.p_err))
> + return;
> +}
> +
> int cxl_cper_register_work(struct work_struct *work)
> {
> if (cxl_cper_work)
> @@ -791,6 +799,8 @@ static bool ghes_do_proc(struct ghes *ghes,
> struct cxl_cper_event_rec *rec = acpi_hest_get_payload(gdata);
>
> cxl_cper_post_event(CXL_CPER_EVENT_MEM_MODULE, rec);
> + } else if (guid_equal(sec_type, &CPER_SEC_CXL_PROT_ERR)) {
> + cxl_cper_handle_prot_err(gdata);
> } else {
> void *err = acpi_hest_get_payload(gdata);
>
> diff --git a/drivers/firmware/efi/cper_cxl.c b/drivers/firmware/efi/cper_cxl.c
> index 4fd8d783993e..03b9839f3b73 100644
> --- a/drivers/firmware/efi/cper_cxl.c
> +++ b/drivers/firmware/efi/cper_cxl.c
> @@ -8,6 +8,7 @@
> */
>
> #include <linux/cper.h>
> +#include <acpi/ghes.h>
> #include "cper_cxl.h"
>
> #define PROT_ERR_VALID_AGENT_TYPE BIT_ULL(0)
> @@ -44,6 +45,17 @@ enum {
> USP, /* CXL Upstream Switch Port */
> };
>
> +static enum cxl_aer_err_type cper_severity_cxl_aer(int cper_severity)
> +{
> + switch (cper_severity) {
> + case CPER_SEV_RECOVERABLE:
> + case CPER_SEV_FATAL:
> + return CXL_AER_UNCORRECTABLE;
> + default:
> + return CXL_AER_CORRECTABLE;
> + }
> +}
> +
> void cper_print_prot_err(const char *pfx, const struct cper_sec_prot_err *prot_err)
> {
> if (prot_err->valid_bits & PROT_ERR_VALID_AGENT_TYPE)
> @@ -176,3 +188,57 @@ void cper_print_prot_err(const char *pfx, const struct cper_sec_prot_err *prot_e
> sizeof(cxl_ras->header_log), 0);
> }
> }
> +
> +int cxl_cper_handle_prot_err_info(struct acpi_hest_generic_data *gdata,
> + struct cxl_cper_prot_err *p_err)
> +{
> + struct cper_sec_prot_err *prot_err = acpi_hest_get_payload(gdata);
> + u8 *dvsec_start, *cap_start;
> +
> + if (!(prot_err->valid_bits & PROT_ERR_VALID_DEVICE_ID)) {
> + pr_err(FW_WARN "No Device ID\n");
> + return -EINVAL;
> + }
> +
> + /*
> + * The device ID or agent address is required for CXL RCD, CXL
> + * SLD, CXL LD, CXL Fabric Manager Managed LD, CXL Root Port,
> + * CXL Downstream Switch Port and CXL Upstream Switch Port.
> + */
> + if (prot_err->agent_type <= 0x7 && prot_err->agent_type != RCH_DP) {
Perhaps define an enum CXL_AGENT_TYPE_MAX instead of 0x7 magic number? Otherwise if a new type is introduced, it would break this code.
> + p_err->segment = prot_err->agent_addr.segment;
> + p_err->bus = prot_err->agent_addr.bus;
> + p_err->device = prot_err->agent_addr.device;
> + p_err->function = prot_err->agent_addr.function;
> + } else {
> + pr_err(FW_WARN "Invalid agent type\n");
> + return -EINVAL;
> + }
Up to you if you want to do this or not, but maybe:
if (prot_err->agent_type >= CXL_AGENT_TYPE_MAX || prot_err->agent_type == RCH_DP) {
pr_warn(...);
return -EINVAL;
}
p_err->segment = ...;
p_err->bus = ...;
...
Although perhaps a helper function cxl_cper_valid_agent_type() that checks invalid agent type by checking the valid_bits, the agent_type boundary, and if agent_type != RCH_DP?
> +
> + if (!(prot_err->valid_bits & PROT_ERR_VALID_ERROR_LOG)) {
> + pr_err(FW_WARN "Invalid Protocol Error log\n");
> + return -EINVAL;
> + }
> +
> + dvsec_start = (u8 *)(prot_err + 1);
> + cap_start = dvsec_start + prot_err->dvsec_len;
> + p_err->cxl_ras = *(struct cxl_ras_capability_regs *)cap_start;
> +
> + /*
> + * Set device serial number unconditionally.
> + *
> + * Print a warning message if it is not valid. The device serial
> + * number is required for CXL RCD, CXL SLD, CXL LD and CXL Fabric
> + * Manager Managed LD.
> + */
> + if (!(prot_err->valid_bits & PROT_ERR_VALID_SERIAL_NUMBER) ||
> + prot_err->agent_type > 0x4 || prot_err->agent_type == RCH_DP)
prot_err->agent_type > FM_LD? Although maybe it would be a clearer read if a helper function is defined to identify the agent types such as cxl_cper_prot_err_serial_needed() or cxl_cper_prot_agent_type_device() and with it a switch statement to explicitly identify all the agent types that require serial number. If a future device is defined, the > 0x4 logic may break.
DJ
> + pr_warn(FW_WARN "No Device Serial number\n");
> +
> + p_err->lower_dw = prot_err->dev_serial_num.lower_dw;
> + p_err->upper_dw = prot_err->dev_serial_num.upper_dw;
> +
> + p_err->severity = cper_severity_cxl_aer(gdata->error_severity);
> +
> + return 0;
> +}
> diff --git a/include/linux/cxl-event.h b/include/linux/cxl-event.h
> index f11e52ff565a..9c7b69e076a0 100644
> --- a/include/linux/cxl-event.h
> +++ b/include/linux/cxl-event.h
> @@ -165,11 +165,37 @@ struct cxl_ras_capability_regs {
> u32 header_log[16];
> };
>
> +enum cxl_aer_err_type {
> + CXL_AER_UNCORRECTABLE,
> + CXL_AER_CORRECTABLE,
> +};
> +
> +struct cxl_cper_prot_err {
> + struct cxl_ras_capability_regs cxl_ras;
> +
> + /* Device ID */
> + u8 function;
> + u8 device;
> + u8 bus;
> + u16 segment;
> +
> + /* Device Serial Number */
> + u32 lower_dw;
> + u32 upper_dw;
> +
> + int severity;
> +};
> +
> struct cxl_cper_work_data {
> enum cxl_event_type event_type;
> struct cxl_cper_event_rec rec;
> + struct cxl_cper_prot_err p_err;
> };
>
> +struct acpi_hest_generic_data;
> +int cxl_cper_handle_prot_err_info(struct acpi_hest_generic_data *gdata,
> + struct cxl_cper_prot_err *p_err);
> +
> #ifdef CONFIG_ACPI_APEI_GHES
> int cxl_cper_register_work(struct work_struct *work);
> int cxl_cper_unregister_work(struct work_struct *work);
On Wed, May 22, 2024 at 03:08:37PM +0000, Smita Koralahalli wrote:
> UEFI v2.10 section N.2.13 defines a CPER record for CXL Protocol errors.
>
> Add GHES support to detect CXL CPER Protocol Error Record and Cache Error
> Severity, Device ID, Device Serial number and CXL RAS capability struct in
> struct cxl_cper_prot_err. Include this struct as a member of struct
> cxl_cper_work_data.
>
> Signed-off-by: Smita Koralahalli <[email protected]>
> ---
> drivers/acpi/apei/ghes.c | 10 +++++
> drivers/firmware/efi/cper_cxl.c | 66 +++++++++++++++++++++++++++++++++
> include/linux/cxl-event.h | 26 +++++++++++++
> 3 files changed, 102 insertions(+)
>
> diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
> index 623cc0cb4a65..1a58032770ee 100644
> --- a/drivers/acpi/apei/ghes.c
> +++ b/drivers/acpi/apei/ghes.c
> @@ -717,6 +717,14 @@ static void cxl_cper_post_event(enum cxl_event_type event_type,
> schedule_work(cxl_cper_work);
> }
>
> +static void cxl_cper_handle_prot_err(struct acpi_hest_generic_data *gdata)
> +{
> + struct cxl_cper_work_data wd;
> +
> + if (cxl_cper_handle_prot_err_info(gdata, &wd.p_err))
> + return;
> +}
> +
> int cxl_cper_register_work(struct work_struct *work)
> {
> if (cxl_cper_work)
> @@ -791,6 +799,8 @@ static bool ghes_do_proc(struct ghes *ghes,
> struct cxl_cper_event_rec *rec = acpi_hest_get_payload(gdata);
>
> cxl_cper_post_event(CXL_CPER_EVENT_MEM_MODULE, rec);
> + } else if (guid_equal(sec_type, &CPER_SEC_CXL_PROT_ERR)) {
> + cxl_cper_handle_prot_err(gdata);
> } else {
> void *err = acpi_hest_get_payload(gdata);
>
> diff --git a/drivers/firmware/efi/cper_cxl.c b/drivers/firmware/efi/cper_cxl.c
> index 4fd8d783993e..03b9839f3b73 100644
> --- a/drivers/firmware/efi/cper_cxl.c
> +++ b/drivers/firmware/efi/cper_cxl.c
> @@ -8,6 +8,7 @@
> */
>
> #include <linux/cper.h>
> +#include <acpi/ghes.h>
> #include "cper_cxl.h"
>
> #define PROT_ERR_VALID_AGENT_TYPE BIT_ULL(0)
> @@ -44,6 +45,17 @@ enum {
> USP, /* CXL Upstream Switch Port */
> };
>
> +static enum cxl_aer_err_type cper_severity_cxl_aer(int cper_severity)
> +{
> + switch (cper_severity) {
> + case CPER_SEV_RECOVERABLE:
> + case CPER_SEV_FATAL:
> + return CXL_AER_UNCORRECTABLE;
> + default:
> + return CXL_AER_CORRECTABLE;
> + }
> +}
> +
> void cper_print_prot_err(const char *pfx, const struct cper_sec_prot_err *prot_err)
> {
> if (prot_err->valid_bits & PROT_ERR_VALID_AGENT_TYPE)
> @@ -176,3 +188,57 @@ void cper_print_prot_err(const char *pfx, const struct cper_sec_prot_err *prot_e
> sizeof(cxl_ras->header_log), 0);
> }
> }
> +
> +int cxl_cper_handle_prot_err_info(struct acpi_hest_generic_data *gdata,
> + struct cxl_cper_prot_err *p_err)
> +{
> + struct cper_sec_prot_err *prot_err = acpi_hest_get_payload(gdata);
> + u8 *dvsec_start, *cap_start;
> +
> + if (!(prot_err->valid_bits & PROT_ERR_VALID_DEVICE_ID)) {
> + pr_err(FW_WARN "No Device ID\n");
> + return -EINVAL;
> + }
> +
> + /*
> + * The device ID or agent address is required for CXL RCD, CXL
> + * SLD, CXL LD, CXL Fabric Manager Managed LD, CXL Root Port,
> + * CXL Downstream Switch Port and CXL Upstream Switch Port.
> + */
> + if (prot_err->agent_type <= 0x7 && prot_err->agent_type != RCH_DP) {
For this check against agent_type, and the similar one below, would a boolean
array indexed by the agent type work? That would avoid the <= 0x7 and > 0x4
below. It seems one array would suffice for this case, but naming it isn't obvious
to me. Maybe it'll be to you.
Something similar to what is done for prot_err_agent_type_strs[]
static const bool agent_requires_id_address_serial[] = {
true, /* RDC */
false, /* RCH_DP */
.
.
.
};
> + p_err->segment = prot_err->agent_addr.segment;
> + p_err->bus = prot_err->agent_addr.bus;
> + p_err->device = prot_err->agent_addr.device;
> + p_err->function = prot_err->agent_addr.function;
> + } else {
> + pr_err(FW_WARN "Invalid agent type\n");
> + return -EINVAL;
> + }
> +
> + if (!(prot_err->valid_bits & PROT_ERR_VALID_ERROR_LOG)) {
> + pr_err(FW_WARN "Invalid Protocol Error log\n");
> + return -EINVAL;
> + }
> +
> + dvsec_start = (u8 *)(prot_err + 1);
> + cap_start = dvsec_start + prot_err->dvsec_len;
> + p_err->cxl_ras = *(struct cxl_ras_capability_regs *)cap_start;
> +
> + /*
> + * Set device serial number unconditionally.
> + *
> + * Print a warning message if it is not valid. The device serial
> + * number is required for CXL RCD, CXL SLD, CXL LD and CXL Fabric
> + * Manager Managed LD.
> + */
> + if (!(prot_err->valid_bits & PROT_ERR_VALID_SERIAL_NUMBER) ||
> + prot_err->agent_type > 0x4 || prot_err->agent_type == RCH_DP)
then this also can be replaced with
agent_requires_id_address_serial[prot_err->agent_type]
-- Alison
> + pr_warn(FW_WARN "No Device Serial number\n");
> +
> + p_err->lower_dw = prot_err->dev_serial_num.lower_dw;
> + p_err->upper_dw = prot_err->dev_serial_num.upper_dw;
> +
> + p_err->severity = cper_severity_cxl_aer(gdata->error_severity);
> +
> + return 0;
> +}
snip
Hi Dave,
On 5/22/2024 10:59 AM, Dave Jiang wrote:
>
>
> On 5/22/24 8:08 AM, Smita Koralahalli wrote:
>> UEFI v2.10 section N.2.13 defines a CPER record for CXL Protocol errors.
>>
>> Add GHES support to detect CXL CPER Protocol Error Record and Cache Error
>> Severity, Device ID, Device Serial number and CXL RAS capability struct in
>> struct cxl_cper_prot_err. Include this struct as a member of struct
>> cxl_cper_work_data.
>>
>> Signed-off-by: Smita Koralahalli <[email protected]>
>> ---
>> drivers/acpi/apei/ghes.c | 10 +++++
>> drivers/firmware/efi/cper_cxl.c | 66 +++++++++++++++++++++++++++++++++
>> include/linux/cxl-event.h | 26 +++++++++++++
>> 3 files changed, 102 insertions(+)
>>
[snip]
>> + * The device ID or agent address is required for CXL RCD, CXL
>> + * SLD, CXL LD, CXL Fabric Manager Managed LD, CXL Root Port,
>> + * CXL Downstream Switch Port and CXL Upstream Switch Port.
>> + */
>> + if (prot_err->agent_type <= 0x7 && prot_err->agent_type != RCH_DP) {
>
> Perhaps define an enum CXL_AGENT_TYPE_MAX instead of 0x7 magic number? Otherwise if a new type is introduced, it would break this code.
Agreed. I will define a boolean array indexed by agent type as suggested
by Alison. That would avoid all these comparisons and not worry about
breaking code in future.
>
>> + p_err->segment = prot_err->agent_addr.segment;
>> + p_err->bus = prot_err->agent_addr.bus;
>> + p_err->device = prot_err->agent_addr.device;
>> + p_err->function = prot_err->agent_addr.function;
>> + } else {
>> + pr_err(FW_WARN "Invalid agent type\n");
>> + return -EINVAL;
>> + }
>
> Up to you if you want to do this or not, but maybe:
>
> if (prot_err->agent_type >= CXL_AGENT_TYPE_MAX || prot_err->agent_type == RCH_DP) {
> pr_warn(...);
> return -EINVAL;
> }
>
> p_err->segment = ...;
> p_err->bus = ...;
Noted.
> ...
>
> Although perhaps a helper function cxl_cper_valid_agent_type() that checks invalid agent type by checking the valid_bits, the agent_type boundary, and if agent_type != RCH_DP?
Okay.
>> +
>> + if (!(prot_err->valid_bits & PROT_ERR_VALID_ERROR_LOG)) {
>> + pr_err(FW_WARN "Invalid Protocol Error log\n");
>> + return -EINVAL;
>> + }
>> +
>> + dvsec_start = (u8 *)(prot_err + 1);
>> + cap_start = dvsec_start + prot_err->dvsec_len;
>> + p_err->cxl_ras = *(struct cxl_ras_capability_regs *)cap_start;
>> +
>> + /*
>> + * Set device serial number unconditionally.
>> + *
>> + * Print a warning message if it is not valid. The device serial
>> + * number is required for CXL RCD, CXL SLD, CXL LD and CXL Fabric
>> + * Manager Managed LD.
>> + */
>> + if (!(prot_err->valid_bits & PROT_ERR_VALID_SERIAL_NUMBER) ||
>> + prot_err->agent_type > 0x4 || prot_err->agent_type == RCH_DP)
>
> prot_err->agent_type > FM_LD? Although maybe it would be a clearer read if a helper function is defined to identify the agent types such as cxl_cper_prot_err_serial_needed() or cxl_cper_prot_agent_type_device() and with it a switch statement to explicitly identify all the agent types that require serial number. If a future device is defined, the > 0x4 logic may break.
Probably helper function is not required if boolean array is defined?
What do you think?
Thanks,
Smita
[snip]
Hi Alison,
On 5/22/2024 5:03 PM, Alison Schofield wrote:
> On Wed, May 22, 2024 at 03:08:37PM +0000, Smita Koralahalli wrote:
>> UEFI v2.10 section N.2.13 defines a CPER record for CXL Protocol errors.
>>
>> Add GHES support to detect CXL CPER Protocol Error Record and Cache Error
>> Severity, Device ID, Device Serial number and CXL RAS capability struct in
>> struct cxl_cper_prot_err. Include this struct as a member of struct
>> cxl_cper_work_data.
>>
>> Signed-off-by: Smita Koralahalli <[email protected]>
>> ---
>> drivers/acpi/apei/ghes.c | 10 +++++
>> drivers/firmware/efi/cper_cxl.c | 66 +++++++++++++++++++++++++++++++++
>> include/linux/cxl-event.h | 26 +++++++++++++
>> 3 files changed, 102 insertions(+)
>>
>> diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
>> index 623cc0cb4a65..1a58032770ee 100644
>> --- a/drivers/acpi/apei/ghes.c
>> +++ b/drivers/acpi/apei/ghes.c
>> @@ -717,6 +717,14 @@ static void cxl_cper_post_event(enum cxl_event_type event_type,
>> schedule_work(cxl_cper_work);
>> }
>>
>> +static void cxl_cper_handle_prot_err(struct acpi_hest_generic_data *gdata)
>> +{
>> + struct cxl_cper_work_data wd;
>> +
>> + if (cxl_cper_handle_prot_err_info(gdata, &wd.p_err))
>> + return;
>> +}
>> +
>> int cxl_cper_register_work(struct work_struct *work)
>> {
>> if (cxl_cper_work)
>> @@ -791,6 +799,8 @@ static bool ghes_do_proc(struct ghes *ghes,
>> struct cxl_cper_event_rec *rec = acpi_hest_get_payload(gdata);
>>
>> cxl_cper_post_event(CXL_CPER_EVENT_MEM_MODULE, rec);
>> + } else if (guid_equal(sec_type, &CPER_SEC_CXL_PROT_ERR)) {
>> + cxl_cper_handle_prot_err(gdata);
>> } else {
>> void *err = acpi_hest_get_payload(gdata);
>>
>> diff --git a/drivers/firmware/efi/cper_cxl.c b/drivers/firmware/efi/cper_cxl.c
>> index 4fd8d783993e..03b9839f3b73 100644
>> --- a/drivers/firmware/efi/cper_cxl.c
>> +++ b/drivers/firmware/efi/cper_cxl.c
>> @@ -8,6 +8,7 @@
>> */
>>
>> #include <linux/cper.h>
>> +#include <acpi/ghes.h>
>> #include "cper_cxl.h"
>>
>> #define PROT_ERR_VALID_AGENT_TYPE BIT_ULL(0)
>> @@ -44,6 +45,17 @@ enum {
>> USP, /* CXL Upstream Switch Port */
>> };
>>
>> +static enum cxl_aer_err_type cper_severity_cxl_aer(int cper_severity)
>> +{
>> + switch (cper_severity) {
>> + case CPER_SEV_RECOVERABLE:
>> + case CPER_SEV_FATAL:
>> + return CXL_AER_UNCORRECTABLE;
>> + default:
>> + return CXL_AER_CORRECTABLE;
>> + }
>> +}
>> +
>> void cper_print_prot_err(const char *pfx, const struct cper_sec_prot_err *prot_err)
>> {
>> if (prot_err->valid_bits & PROT_ERR_VALID_AGENT_TYPE)
>> @@ -176,3 +188,57 @@ void cper_print_prot_err(const char *pfx, const struct cper_sec_prot_err *prot_e
>> sizeof(cxl_ras->header_log), 0);
>> }
>> }
>> +
>> +int cxl_cper_handle_prot_err_info(struct acpi_hest_generic_data *gdata,
>> + struct cxl_cper_prot_err *p_err)
>> +{
>> + struct cper_sec_prot_err *prot_err = acpi_hest_get_payload(gdata);
>> + u8 *dvsec_start, *cap_start;
>> +
>> + if (!(prot_err->valid_bits & PROT_ERR_VALID_DEVICE_ID)) {
>> + pr_err(FW_WARN "No Device ID\n");
>> + return -EINVAL;
>> + }
>> +
>> + /*
>> + * The device ID or agent address is required for CXL RCD, CXL
>> + * SLD, CXL LD, CXL Fabric Manager Managed LD, CXL Root Port,
>> + * CXL Downstream Switch Port and CXL Upstream Switch Port.
>> + */
>> + if (prot_err->agent_type <= 0x7 && prot_err->agent_type != RCH_DP) {
>
> For this check against agent_type, and the similar one below, would a boolean
> array indexed by the agent type work? That would avoid the <= 0x7 and > 0x4
> below. It seems one array would suffice for this case, but naming it isn't obvious
> to me. Maybe it'll be to you.
>
> Something similar to what is done for prot_err_agent_type_strs[]
>
> static const bool agent_requires_id_address_serial[] = {
> true, /* RDC */
> false, /* RCH_DP */
> .
> .
> .
> };
>
>
Noted. Will implement it this way!
Thanks
Smita
>> + p_err->segment = prot_err->agent_addr.segment;
>> + p_err->bus = prot_err->agent_addr.bus;
>> + p_err->device = prot_err->agent_addr.device;
>> + p_err->function = prot_err->agent_addr.function;
>> + } else {
>> + pr_err(FW_WARN "Invalid agent type\n");
>> + return -EINVAL;
>> + }
>> +
>> + if (!(prot_err->valid_bits & PROT_ERR_VALID_ERROR_LOG)) {
>> + pr_err(FW_WARN "Invalid Protocol Error log\n");
>> + return -EINVAL;
>> + }
>> +
>> + dvsec_start = (u8 *)(prot_err + 1);
>> + cap_start = dvsec_start + prot_err->dvsec_len;
>> + p_err->cxl_ras = *(struct cxl_ras_capability_regs *)cap_start;
>> +
>> + /*
>> + * Set device serial number unconditionally.
>> + *
>> + * Print a warning message if it is not valid. The device serial
>> + * number is required for CXL RCD, CXL SLD, CXL LD and CXL Fabric
>> + * Manager Managed LD.
>> + */
>> + if (!(prot_err->valid_bits & PROT_ERR_VALID_SERIAL_NUMBER) ||
>> + prot_err->agent_type > 0x4 || prot_err->agent_type == RCH_DP)
>
> then this also can be replaced with
> agent_requires_id_address_serial[prot_err->agent_type]
>
>
> -- Alison
>
>
>> + pr_warn(FW_WARN "No Device Serial number\n");
>> +
>> + p_err->lower_dw = prot_err->dev_serial_num.lower_dw;
>> + p_err->upper_dw = prot_err->dev_serial_num.upper_dw;
>> +
>> + p_err->severity = cper_severity_cxl_aer(gdata->error_severity);
>> +
>> + return 0;
>> +}
>
> snip
>
On 5/23/24 2:19 PM, Smita Koralahalli wrote:
> Hi Dave,
>
> On 5/22/2024 10:59 AM, Dave Jiang wrote:
>>
>>
>> On 5/22/24 8:08 AM, Smita Koralahalli wrote:
>>> UEFI v2.10 section N.2.13 defines a CPER record for CXL Protocol errors.
>>>
>>> Add GHES support to detect CXL CPER Protocol Error Record and Cache Error
>>> Severity, Device ID, Device Serial number and CXL RAS capability struct in
>>> struct cxl_cper_prot_err. Include this struct as a member of struct
>>> cxl_cper_work_data.
>>>
>>> Signed-off-by: Smita Koralahalli <[email protected]>
>>> ---
>>> drivers/acpi/apei/ghes.c | 10 +++++
>>> drivers/firmware/efi/cper_cxl.c | 66 +++++++++++++++++++++++++++++++++
>>> include/linux/cxl-event.h | 26 +++++++++++++
>>> 3 files changed, 102 insertions(+)
>>>
>
> [snip]
>
>
>>> + * The device ID or agent address is required for CXL RCD, CXL
>>> + * SLD, CXL LD, CXL Fabric Manager Managed LD, CXL Root Port,
>>> + * CXL Downstream Switch Port and CXL Upstream Switch Port.
>>> + */
>>> + if (prot_err->agent_type <= 0x7 && prot_err->agent_type != RCH_DP) {
>>
>> Perhaps define an enum CXL_AGENT_TYPE_MAX instead of 0x7 magic number? Otherwise if a new type is introduced, it would break this code.
>
> Agreed. I will define a boolean array indexed by agent type as suggested by Alison. That would avoid all these comparisons and not worry about breaking code in future.
>
>>
>>> + p_err->segment = prot_err->agent_addr.segment;
>>> + p_err->bus = prot_err->agent_addr.bus;
>>> + p_err->device = prot_err->agent_addr.device;
>>> + p_err->function = prot_err->agent_addr.function;
>>> + } else {
>>> + pr_err(FW_WARN "Invalid agent type\n");
>>> + return -EINVAL;
>>> + }
>>
>> Up to you if you want to do this or not, but maybe:
>>
>> if (prot_err->agent_type >= CXL_AGENT_TYPE_MAX || prot_err->agent_type == RCH_DP) {
>> pr_warn(...);
>> return -EINVAL;
>> }
>>
>> p_err->segment = ...;
>> p_err->bus = ...;
>
> Noted.
>
>> ...
>>
>> Although perhaps a helper function cxl_cper_valid_agent_type() that checks invalid agent type by checking the valid_bits, the agent_type boundary, and if agent_type != RCH_DP?
>
> Okay.
>
>>> +
>>> + if (!(prot_err->valid_bits & PROT_ERR_VALID_ERROR_LOG)) {
>>> + pr_err(FW_WARN "Invalid Protocol Error log\n");
>>> + return -EINVAL;
>>> + }
>>> +
>>> + dvsec_start = (u8 *)(prot_err + 1);
>>> + cap_start = dvsec_start + prot_err->dvsec_len;
>>> + p_err->cxl_ras = *(struct cxl_ras_capability_regs *)cap_start;
>>> +
>>> + /*
>>> + * Set device serial number unconditionally.
>>> + *
>>> + * Print a warning message if it is not valid. The device serial
>>> + * number is required for CXL RCD, CXL SLD, CXL LD and CXL Fabric
>>> + * Manager Managed LD.
>>> + */
>>> + if (!(prot_err->valid_bits & PROT_ERR_VALID_SERIAL_NUMBER) ||
>>> + prot_err->agent_type > 0x4 || prot_err->agent_type == RCH_DP)
>>
>> prot_err->agent_type > FM_LD? Although maybe it would be a clearer read if a helper function is defined to identify the agent types such as cxl_cper_prot_err_serial_needed() or cxl_cper_prot_agent_type_device() and with it a switch statement to explicitly identify all the agent types that require serial number. If a future device is defined, the > 0x4 logic may break.
>
> Probably helper function is not required if boolean array is defined? What do you think?
That works for me. My main concern is to clarify the code and remove possibility of breakage from future changes.
>
> Thanks,
> Smita
>
> [snip]
On Thu, 23 May 2024 14:21:40 -0700
Smita Koralahalli <[email protected]> wrote:
> Hi Alison,
>
> On 5/22/2024 5:03 PM, Alison Schofield wrote:
> > On Wed, May 22, 2024 at 03:08:37PM +0000, Smita Koralahalli wrote:
> >> UEFI v2.10 section N.2.13 defines a CPER record for CXL Protocol errors.
> >>
> >> Add GHES support to detect CXL CPER Protocol Error Record and Cache Error
> >> Severity, Device ID, Device Serial number and CXL RAS capability struct in
> >> struct cxl_cper_prot_err. Include this struct as a member of struct
> >> cxl_cper_work_data.
> >>
> >> Signed-off-by: Smita Koralahalli <[email protected]>
> >> ---
> >> drivers/acpi/apei/ghes.c | 10 +++++
> >> drivers/firmware/efi/cper_cxl.c | 66 +++++++++++++++++++++++++++++++++
> >> include/linux/cxl-event.h | 26 +++++++++++++
> >> 3 files changed, 102 insertions(+)
> >>
> >> diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
> >> index 623cc0cb4a65..1a58032770ee 100644
> >> --- a/drivers/acpi/apei/ghes.c
> >> +++ b/drivers/acpi/apei/ghes.c
> >> @@ -717,6 +717,14 @@ static void cxl_cper_post_event(enum cxl_event_type event_type,
> >> schedule_work(cxl_cper_work);
> >> }
> >>
> >> +static void cxl_cper_handle_prot_err(struct acpi_hest_generic_data *gdata)
> >> +{
> >> + struct cxl_cper_work_data wd;
> >> +
> >> + if (cxl_cper_handle_prot_err_info(gdata, &wd.p_err))
> >> + return;
> >> +}
> >> +
> >> int cxl_cper_register_work(struct work_struct *work)
> >> {
> >> if (cxl_cper_work)
> >> @@ -791,6 +799,8 @@ static bool ghes_do_proc(struct ghes *ghes,
> >> struct cxl_cper_event_rec *rec = acpi_hest_get_payload(gdata);
> >>
> >> cxl_cper_post_event(CXL_CPER_EVENT_MEM_MODULE, rec);
> >> + } else if (guid_equal(sec_type, &CPER_SEC_CXL_PROT_ERR)) {
> >> + cxl_cper_handle_prot_err(gdata);
> >> } else {
> >> void *err = acpi_hest_get_payload(gdata);
> >>
> >> diff --git a/drivers/firmware/efi/cper_cxl.c b/drivers/firmware/efi/cper_cxl.c
> >> index 4fd8d783993e..03b9839f3b73 100644
> >> --- a/drivers/firmware/efi/cper_cxl.c
> >> +++ b/drivers/firmware/efi/cper_cxl.c
> >> @@ -8,6 +8,7 @@
> >> */
> >>
> >> #include <linux/cper.h>
> >> +#include <acpi/ghes.h>
> >> #include "cper_cxl.h"
> >>
> >> #define PROT_ERR_VALID_AGENT_TYPE BIT_ULL(0)
> >> @@ -44,6 +45,17 @@ enum {
> >> USP, /* CXL Upstream Switch Port */
> >> };
> >>
> >> +static enum cxl_aer_err_type cper_severity_cxl_aer(int cper_severity)
> >> +{
> >> + switch (cper_severity) {
> >> + case CPER_SEV_RECOVERABLE:
> >> + case CPER_SEV_FATAL:
> >> + return CXL_AER_UNCORRECTABLE;
> >> + default:
> >> + return CXL_AER_CORRECTABLE;
> >> + }
> >> +}
> >> +
> >> void cper_print_prot_err(const char *pfx, const struct cper_sec_prot_err *prot_err)
> >> {
> >> if (prot_err->valid_bits & PROT_ERR_VALID_AGENT_TYPE)
> >> @@ -176,3 +188,57 @@ void cper_print_prot_err(const char *pfx, const struct cper_sec_prot_err *prot_e
> >> sizeof(cxl_ras->header_log), 0);
> >> }
> >> }
> >> +
> >> +int cxl_cper_handle_prot_err_info(struct acpi_hest_generic_data *gdata,
> >> + struct cxl_cper_prot_err *p_err)
> >> +{
> >> + struct cper_sec_prot_err *prot_err = acpi_hest_get_payload(gdata);
> >> + u8 *dvsec_start, *cap_start;
> >> +
> >> + if (!(prot_err->valid_bits & PROT_ERR_VALID_DEVICE_ID)) {
> >> + pr_err(FW_WARN "No Device ID\n");
> >> + return -EINVAL;
> >> + }
> >> +
> >> + /*
> >> + * The device ID or agent address is required for CXL RCD, CXL
> >> + * SLD, CXL LD, CXL Fabric Manager Managed LD, CXL Root Port,
> >> + * CXL Downstream Switch Port and CXL Upstream Switch Port.
> >> + */
> >> + if (prot_err->agent_type <= 0x7 && prot_err->agent_type != RCH_DP) {
> >
> > For this check against agent_type, and the similar one below, would a boolean
> > array indexed by the agent type work? That would avoid the <= 0x7 and > 0x4
> > below. It seems one array would suffice for this case, but naming it isn't obvious
> > to me. Maybe it'll be to you.
> >
> > Something similar to what is done for prot_err_agent_type_strs[]
> >
> > static const bool agent_requires_id_address_serial[] = {
> > true, /* RDC */
> > false, /* RCH_DP */
[RCD] = false,
etc rather than comments would be neater.
Given two similar things already. Maybe time for a little structure.
//with better name than this
struct agent_reqs {
bool sn;
bool sbdf;
};
static const agent_reqs agent_reqs[] = {
[RCD] = { .sn = false, .sbdf = true, },
};
etc.
Maybe just bring the the string in as well
struct agent_info {
const char *string;
bool req_sn;
bool req_sbdf;
};
static const agent_info agent_info[] = {
[RD] = {
.string = "Restricted CXL Device",
.req_sn = false,
.req_sbdf = true,
},
};
Values made up, but hopefully conveys that moving to having
all the data in one place makes it harder to forget stuff
for new entries etc.
> > .
> > .
> > .
> > };
> >
> >
>
> Noted. Will implement it this way!
>
> Thanks
> Smita
>
> >> + p_err->segment = prot_err->agent_addr.segment;
> >> + p_err->bus = prot_err->agent_addr.bus;
> >> + p_err->device = prot_err->agent_addr.device;
> >> + p_err->function = prot_err->agent_addr.function;
> >> + } else {
> >> + pr_err(FW_WARN "Invalid agent type\n");
> >> + return -EINVAL;
> >> + }
> >> +
> >> + if (!(prot_err->valid_bits & PROT_ERR_VALID_ERROR_LOG)) {
> >> + pr_err(FW_WARN "Invalid Protocol Error log\n");
> >> + return -EINVAL;
> >> + }
> >> +
> >> + dvsec_start = (u8 *)(prot_err + 1);
> >> + cap_start = dvsec_start + prot_err->dvsec_len;
> >> + p_err->cxl_ras = *(struct cxl_ras_capability_regs *)cap_start;
> >> +
> >> + /*
> >> + * Set device serial number unconditionally.
> >> + *
> >> + * Print a warning message if it is not valid. The device serial
> >> + * number is required for CXL RCD, CXL SLD, CXL LD and CXL Fabric
> >> + * Manager Managed LD.
> >> + */
> >> + if (!(prot_err->valid_bits & PROT_ERR_VALID_SERIAL_NUMBER) ||
> >> + prot_err->agent_type > 0x4 || prot_err->agent_type == RCH_DP)
> >
> > then this also can be replaced with
> > agent_requires_id_address_serial[prot_err->agent_type]
> >
> >
> > -- Alison
> >
> >
> >> + pr_warn(FW_WARN "No Device Serial number\n");
> >> +
> >> + p_err->lower_dw = prot_err->dev_serial_num.lower_dw;
> >> + p_err->upper_dw = prot_err->dev_serial_num.upper_dw;
> >> +
> >> + p_err->severity = cper_severity_cxl_aer(gdata->error_severity);
> >> +
> >> + return 0;
> >> +}
> >
> > snip
> >
On 6/7/2024 8:26 AM, Jonathan Cameron wrote:
> On Thu, 23 May 2024 14:21:40 -0700
> Smita Koralahalli <[email protected]> wrote:
>
>> Hi Alison,
>>
>> On 5/22/2024 5:03 PM, Alison Schofield wrote:
>>> On Wed, May 22, 2024 at 03:08:37PM +0000, Smita Koralahalli wrote:
>>>> UEFI v2.10 section N.2.13 defines a CPER record for CXL Protocol errors.
>>>>
>>>> Add GHES support to detect CXL CPER Protocol Error Record and Cache Error
>>>> Severity, Device ID, Device Serial number and CXL RAS capability struct in
>>>> struct cxl_cper_prot_err. Include this struct as a member of struct
>>>> cxl_cper_work_data.
>>>>
>>>> Signed-off-by: Smita Koralahalli <[email protected]>
[snip]
>>>> + * The device ID or agent address is required for CXL RCD, CXL
>>>> + * SLD, CXL LD, CXL Fabric Manager Managed LD, CXL Root Port,
>>>> + * CXL Downstream Switch Port and CXL Upstream Switch Port.
>>>> + */
>>>> + if (prot_err->agent_type <= 0x7 && prot_err->agent_type != RCH_DP) {
>>>
>>> For this check against agent_type, and the similar one below, would a boolean
>>> array indexed by the agent type work? That would avoid the <= 0x7 and > 0x4
>>> below. It seems one array would suffice for this case, but naming it isn't obvious
>>> to me. Maybe it'll be to you.
>>>
>>> Something similar to what is done for prot_err_agent_type_strs[]
>>>
>>> static const bool agent_requires_id_address_serial[] = {
>>> true, /* RDC */
>>> false, /* RCH_DP */
> [RCD] = false,
>
> etc rather than comments would be neater.
> Given two similar things already. Maybe time for a little structure.
>
> //with better name than this
> struct agent_reqs {
> bool sn;
> bool sbdf;
> };
>
> static const agent_reqs agent_reqs[] = {
> [RCD] = { .sn = false, .sbdf = true, },
> };
>
> etc.
>
> Maybe just bring the the string in as well
>
> struct agent_info {
> const char *string;
> bool req_sn;
> bool req_sbdf;
> };
>
> static const agent_info agent_info[] = {
> [RD] = {
> .string = "Restricted CXL Device",
> .req_sn = false,
> .req_sbdf = true,
> },
> };
>
> Values made up, but hopefully conveys that moving to having
> all the data in one place makes it harder to forget stuff
> for new entries etc.
>
Okay. I was considering 2D array without naming before. Something like:
static const bool agent_requires_id_address[][] = {
/* Device_ID, Serial_num */
{true, true}, /* RCD */
{false, false}, /* RCH_DP */
Let me change to array of structures if naming helps.
Thanks,
Smita
[snip]