2022-09-29 09:48:51

by Quan Nguyen

[permalink] [raw]
Subject: [PATCH v9 3/9] misc: smpro-errmon: Add Ampere's SMpro error monitor driver

This commit adds Ampere's SMpro error monitor driver for monitoring
and reporting RAS-related errors as reported by SMpro co-processor
found on Ampere's Altra processor family.

Signed-off-by: Quan Nguyen <[email protected]>
---
Changes in v9:
+ Fix ugly static struct define [Greg]
+ Remove unused defines and update documentation [Quan]
+ Add minor refactor code [Quan]
+ Fix messy goto [Greg]
+ Update SPDX licence [Greg]
+ Use ATTRIBUTE_GROUPS() [Greg]
+ Use dev_groups instead of sysfs_create_group() to avoid
racing issue with user space [Greg]
+ Refactor code to fix unnecessary initialization issue [Quan]
+ Refactor code to avoid clever encoding issue [Quan]
+ Separate error_[smpro|pmpro] to error_* and warn_* [Quan]
+ Add minor code refactor [Quan]

Changes in v8:
+ Update wording for SMPRO_ERRMON on Kconfig file [Quan]
+ Avoid uninitialized variable use [kernel test robot]
+ Switch to use sysfs_emit() [Greg]
+ Make sysfs to return single value [Greg]
+ Change errors_* sysfs to error_* [Quan]
+ Add overflow_[core|mem|pcie|other]_[ce|ue] sysfs to report
overflow status of each type of HW errors [Quan]
+ Add some minor refactor [Quan]

Changes in v7:
+ Remove regmap_acquire/release_lock(), read_i2c_block_data() [Quan]
+ Use regmap_noinc_read() instead of errmon_read_block() [Quan]
+ Validate number of errors before read [Quan]
+ Fix wrong return type of *_show() function [kernel test robot]
+ Adjust patch order to avoid dependence with smpro-mfd [Lee Jones]
+ Use pointer instead of stack memory [Quan]

Changes in v6:
+ First introduced in v6 [Quan]

drivers/misc/Kconfig | 12 +
drivers/misc/Makefile | 1 +
drivers/misc/smpro-errmon.c | 529 ++++++++++++++++++++++++++++++++++++
3 files changed, 542 insertions(+)
create mode 100644 drivers/misc/smpro-errmon.c

diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig
index 358ad56f6524..b9ceee949dab 100644
--- a/drivers/misc/Kconfig
+++ b/drivers/misc/Kconfig
@@ -176,6 +176,18 @@ config SGI_XP
this feature will allow for direct communication between SSIs
based on a network adapter and DMA messaging.

+config SMPRO_ERRMON
+ tristate "Ampere Computing SMPro error monitor driver"
+ depends on MFD_SMPRO || COMPILE_TEST
+ help
+ Say Y here to get support for the SMpro error monitor function
+ provided by Ampere Computing's Altra and Altra Max SoCs. Upon
+ loading, the driver creates sysfs files which can be used to gather
+ multiple HW error data reported via read and write system calls.
+
+ To compile this driver as a module, say M here. The driver will be
+ called smpro-errmon.
+
config CS5535_MFGPT
tristate "CS5535/CS5536 Geode Multi-Function General Purpose Timer (MFGPT) support"
depends on MFD_CS5535
diff --git a/drivers/misc/Makefile b/drivers/misc/Makefile
index ac9b3e757ba1..bbe24d4511a3 100644
--- a/drivers/misc/Makefile
+++ b/drivers/misc/Makefile
@@ -23,6 +23,7 @@ obj-$(CONFIG_ENCLOSURE_SERVICES) += enclosure.o
obj-$(CONFIG_KGDB_TESTS) += kgdbts.o
obj-$(CONFIG_SGI_XP) += sgi-xp/
obj-$(CONFIG_SGI_GRU) += sgi-gru/
+obj-$(CONFIG_SMPRO_ERRMON) += smpro-errmon.o
obj-$(CONFIG_CS5535_MFGPT) += cs5535-mfgpt.o
obj-$(CONFIG_GEHC_ACHC) += gehc-achc.o
obj-$(CONFIG_HP_ILO) += hpilo.o
diff --git a/drivers/misc/smpro-errmon.c b/drivers/misc/smpro-errmon.c
new file mode 100644
index 000000000000..d1431d419aa4
--- /dev/null
+++ b/drivers/misc/smpro-errmon.c
@@ -0,0 +1,529 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Ampere Computing SoC's SMpro Error Monitoring Driver
+ *
+ * Copyright (c) 2022, Ampere Computing LLC
+ *
+ */
+
+#include <linux/i2c.h>
+#include <linux/mod_devicetable.h>
+#include <linux/module.h>
+#include <linux/platform_device.h>
+#include <linux/regmap.h>
+
+/* GPI RAS Error Registers */
+#define GPI_RAS_ERR 0x7E
+
+/* Core and L2C Error Registers */
+#define CORE_CE_ERR_CNT 0x80
+#define CORE_CE_ERR_LEN 0x81
+#define CORE_CE_ERR_DATA 0x82
+#define CORE_UE_ERR_CNT 0x83
+#define CORE_UE_ERR_LEN 0x84
+#define CORE_UE_ERR_DATA 0x85
+
+/* Memory Error Registers */
+#define MEM_CE_ERR_CNT 0x90
+#define MEM_CE_ERR_LEN 0x91
+#define MEM_CE_ERR_DATA 0x92
+#define MEM_UE_ERR_CNT 0x93
+#define MEM_UE_ERR_LEN 0x94
+#define MEM_UE_ERR_DATA 0x95
+
+/* RAS Error/Warning Registers */
+#define ERR_SMPRO_TYPE 0xA0
+#define ERR_PMPRO_TYPE 0xA1
+#define ERR_SMPRO_INFO_LO 0xA2
+#define ERR_SMPRO_INFO_HI 0xA3
+#define ERR_SMPRO_DATA_LO 0xA4
+#define ERR_SMPRO_DATA_HI 0xA5
+#define WARN_SMPRO_INFO_LO 0xAA
+#define WARN_SMPRO_INFO_HI 0xAB
+#define ERR_PMPRO_INFO_LO 0xA6
+#define ERR_PMPRO_INFO_HI 0xA7
+#define ERR_PMPRO_DATA_LO 0xA8
+#define ERR_PMPRO_DATA_HI 0xA9
+#define WARN_PMPRO_INFO_LO 0xAC
+#define WARN_PMPRO_INFO_HI 0xAD
+
+/* PCIE Error Registers */
+#define PCIE_CE_ERR_CNT 0xC0
+#define PCIE_CE_ERR_LEN 0xC1
+#define PCIE_CE_ERR_DATA 0xC2
+#define PCIE_UE_ERR_CNT 0xC3
+#define PCIE_UE_ERR_LEN 0xC4
+#define PCIE_UE_ERR_DATA 0xC5
+
+/* Other Error Registers */
+#define OTHER_CE_ERR_CNT 0xD0
+#define OTHER_CE_ERR_LEN 0xD1
+#define OTHER_CE_ERR_DATA 0xD2
+#define OTHER_UE_ERR_CNT 0xD8
+#define OTHER_UE_ERR_LEN 0xD9
+#define OTHER_UE_ERR_DATA 0xDA
+
+/* Event Data Registers */
+#define VRD_WARN_FAULT_EVENT_DATA 0x78
+#define VRD_HOT_EVENT_DATA 0x79
+#define DIMM_HOT_EVENT_DATA 0x7A
+
+#define MAX_READ_BLOCK_LENGTH 48
+
+#define RAS_SMPRO_ERR 0
+#define RAS_PMPRO_ERR 1
+
+enum RAS_48BYTES_ERR_TYPES {
+ CORE_CE_ERR,
+ CORE_UE_ERR,
+ MEM_CE_ERR,
+ MEM_UE_ERR,
+ PCIE_CE_ERR,
+ PCIE_UE_ERR,
+ OTHER_CE_ERR,
+ OTHER_UE_ERR,
+ NUM_48BYTES_ERR_TYPE,
+};
+
+struct smpro_error_hdr {
+ u8 count; /* Address of the register holding the number of RAS errors */
+ u8 len; /* Address of the register holding the number of data bytes */
+ u8 data; /* Address of the register the (up to 48-byte) error data is read from */
+ u8 max_cnt; /* Max num of errors accepted for this type (a limit, not a register) */
+};
+
+/*
+ * Per error type (indexed by enum RAS_48BYTES_ERR_TYPES): the addresses of
+ * the registers used to get the error count, the length of the error data
+ * and the error data itself, plus the maximum error count accepted for
+ * that type.
+ */
+static struct smpro_error_hdr smpro_error_table[] = {
+ [CORE_CE_ERR] = {
+ .count = CORE_CE_ERR_CNT,
+ .len = CORE_CE_ERR_LEN,
+ .data = CORE_CE_ERR_DATA,
+ .max_cnt = 32
+ },
+ [CORE_UE_ERR] = {
+ .count = CORE_UE_ERR_CNT,
+ .len = CORE_UE_ERR_LEN,
+ .data = CORE_UE_ERR_DATA,
+ .max_cnt = 32
+ },
+ [MEM_CE_ERR] = {
+ .count = MEM_CE_ERR_CNT,
+ .len = MEM_CE_ERR_LEN,
+ .data = MEM_CE_ERR_DATA,
+ .max_cnt = 16
+ },
+ [MEM_UE_ERR] = {
+ .count = MEM_UE_ERR_CNT,
+ .len = MEM_UE_ERR_LEN,
+ .data = MEM_UE_ERR_DATA,
+ .max_cnt = 16
+ },
+ [PCIE_CE_ERR] = {
+ .count = PCIE_CE_ERR_CNT,
+ .len = PCIE_CE_ERR_LEN,
+ .data = PCIE_CE_ERR_DATA,
+ .max_cnt = 96
+ },
+ [PCIE_UE_ERR] = {
+ .count = PCIE_UE_ERR_CNT,
+ .len = PCIE_UE_ERR_LEN,
+ .data = PCIE_UE_ERR_DATA,
+ .max_cnt = 96
+ },
+ [OTHER_CE_ERR] = {
+ .count = OTHER_CE_ERR_CNT,
+ .len = OTHER_CE_ERR_LEN,
+ .data = OTHER_CE_ERR_DATA,
+ .max_cnt = 8
+ },
+ [OTHER_UE_ERR] = {
+ .count = OTHER_UE_ERR_CNT,
+ .len = OTHER_UE_ERR_LEN,
+ .data = OTHER_UE_ERR_DATA,
+ .max_cnt = 8
+ },
+};
+
+/*
+ * List of SCP registers which are used to get
+ * one type of RAS Internal errors.
+ * Every field holds a register address, not a register value.
+ */
+struct smpro_int_error_hdr {
+ u8 type; /* error type register; bits 0..2 are tested and written back to clear */
+ u8 info_l; /* error info, low register */
+ u8 info_h; /* error info, high register */
+ u8 data_l; /* error data, low register */
+ u8 data_h; /* error data, high register */
+ u8 warn_l; /* warning info, low register */
+ u8 warn_h; /* warning info, high register */
+};
+
+static struct smpro_int_error_hdr list_smpro_int_error_hdr[] = {
+ [RAS_SMPRO_ERR] = {
+ .type = ERR_SMPRO_TYPE,
+ .info_l = ERR_SMPRO_INFO_LO,
+ .info_h = ERR_SMPRO_INFO_HI,
+ .data_l = ERR_SMPRO_DATA_LO,
+ .data_h = ERR_SMPRO_DATA_HI,
+ .warn_l = WARN_SMPRO_INFO_LO,
+ .warn_h = WARN_SMPRO_INFO_HI,
+ },
+ [RAS_PMPRO_ERR] = {
+ .type = ERR_PMPRO_TYPE,
+ .info_l = ERR_PMPRO_INFO_LO,
+ .info_h = ERR_PMPRO_INFO_HI,
+ .data_l = ERR_PMPRO_DATA_LO,
+ .data_h = ERR_PMPRO_DATA_HI,
+ .warn_l = WARN_PMPRO_INFO_LO,
+ .warn_h = WARN_PMPRO_INFO_HI,
+ },
+};
+
+struct smpro_errmon {
+ struct regmap *regmap;
+};
+
+enum EVENT_TYPES {
+ VRD_WARN_FAULT_EVENT,
+ VRD_HOT_EVENT,
+ DIMM_HOT_EVENT,
+ NUM_EVENTS_TYPE,
+};
+
+/* Address of the event data register for each event, indexed by enum EVENT_TYPES */
+static u8 smpro_event_table[NUM_EVENTS_TYPE] = {
+ VRD_WARN_FAULT_EVENT_DATA,
+ VRD_HOT_EVENT_DATA,
+ DIMM_HOT_EVENT_DATA,
+};
+
+/*
+ * Read the event data register selected by @channel (an index into
+ * smpro_event_table), clear the reported event and emit the value that
+ * was read as hex into @buf.
+ *
+ * Returns the number of bytes emitted, or the error returned by regmap if
+ * the read or the clearing write fails. When the clearing write fails the
+ * event stays pending in the register, so no data is lost by reporting the
+ * error instead of the value.
+ */
+static ssize_t smpro_event_data_read(struct device *dev,
+				     struct device_attribute *da, char *buf,
+				     int channel)
+{
+	struct smpro_errmon *errmon = dev_get_drvdata(dev);
+	/* regmap_read() stores into an unsigned int, and %x expects one */
+	unsigned int event_data;
+	int ret;
+
+	ret = regmap_read(errmon->regmap, smpro_event_table[channel], &event_data);
+	if (ret)
+		return ret;
+
+	/* Clear event after read by writing back the value that was read */
+	if (event_data != 0) {
+		ret = regmap_write(errmon->regmap, smpro_event_table[channel], event_data);
+		if (ret)
+			return ret;
+	}
+
+	return sysfs_emit(buf, "%04x\n", event_data);
+}
+
+/*
+ * Report the overflow status of the error type selected by @channel (an
+ * index into smpro_error_table).
+ *
+ * Emits "1" when bit 8 of the error count register is set and "0"
+ * otherwise. Returns the number of bytes emitted, or the error returned by
+ * regmap_read().
+ */
+static ssize_t smpro_overflow_data_read(struct device *dev, struct device_attribute *da,
+					char *buf, int channel)
+{
+	struct smpro_errmon *errmon = dev_get_drvdata(dev);
+	struct smpro_error_hdr *err_info = &smpro_error_table[channel];
+	/* regmap_read() takes an unsigned int *, not a signed one */
+	unsigned int err_count;
+	int ret;
+
+	ret = regmap_read(errmon->regmap, err_info->count, &err_count);
+	if (ret)
+		return ret;
+
+	/* Bit 8 indicates the overflow status */
+	return sysfs_emit(buf, "%d\n", (err_count & BIT(8)) ? 1 : 0);
+}
+
+/*
+ * Read one error record of the type selected by @channel (an index into
+ * smpro_error_table), clear it, and emit it into @buf as a hex string.
+ *
+ * Returns 0 (empty output) when the count register reports no error, when
+ * the count exceeds the per-type maximum, or when the reported data length
+ * is zero. Returns the regmap error if any register access fails.
+ * Otherwise returns the number of bytes emitted.
+ */
+static ssize_t smpro_error_data_read(struct device *dev, struct device_attribute *da,
+				     char *buf, int channel)
+{
+	struct smpro_errmon *errmon = dev_get_drvdata(dev);
+	unsigned char err_data[MAX_READ_BLOCK_LENGTH];
+	struct smpro_error_hdr *err_info = &smpro_error_table[channel];
+	/* regmap_read() takes an unsigned int *, not a signed one */
+	unsigned int err_count, err_length;
+	int ret;
+
+	/*
+	 * Check the read result before touching err_count: on failure
+	 * regmap_read() may leave it uninitialized.
+	 */
+	ret = regmap_read(errmon->regmap, err_info->count, &err_count);
+	if (ret)
+		return ret;
+
+	/* Error count is the low byte */
+	err_count &= 0xff;
+	if (!err_count || err_count > err_info->max_cnt)
+		return 0;
+
+	ret = regmap_read(errmon->regmap, err_info->len, &err_length);
+	if (ret)
+		return ret;
+	if (!err_length)
+		return 0;
+
+	/* Never read more than the on-stack buffer can hold */
+	if (err_length > MAX_READ_BLOCK_LENGTH)
+		err_length = MAX_READ_BLOCK_LENGTH;
+
+	/* Zero first so a short record is padded with zeroes in the output */
+	memset(err_data, 0x00, MAX_READ_BLOCK_LENGTH);
+	ret = regmap_noinc_read(errmon->regmap, err_info->data, err_data, err_length);
+	if (ret < 0)
+		return ret;
+
+	/* clear the error */
+	ret = regmap_write(errmon->regmap, err_info->count, 0x100);
+	if (ret)
+		return ret;
+	/*
+	 * The output of Core/Memory/PCIe/Others UE/CE errors follows the format
+	 * specified in section 5.8.1 CE/UE Error Data record in
+	 * Altra SOC BMC Interface specification.
+	 */
+	return sysfs_emit(buf, "%*phN\n", MAX_READ_BLOCK_LENGTH, err_data);
+}
+
+/*
+ * Read and clear one pending internal error of the SMpro or PMpro
+ * (selected by @channel, RAS_SMPRO_ERR or RAS_PMPRO_ERR).
+ *
+ * Output format:
+ * <4-byte hex value of error info><4-byte hex value of error extensive data>
+ * Where:
+ * + error info : The error information
+ * + error data : Extensive data (32 bits)
+ * Reference to section 5.10 RAS Internal Error Register Definition in
+ * Altra SOC BMC Interface specification
+ *
+ * Returns 0 (empty output) when GPI_RAS_ERR does not flag this channel or
+ * when neither bit 1 nor bit 2 is set in the type register; returns the
+ * regmap error if a register access fails.
+ *
+ * NOTE(review): the code emits sizeof(err) bytes, i.e. four unsigned int
+ * values (info_h, info_l, data_h, data_l) dumped in memory order. That is
+ * wider than the two 4-byte values described above and depends on CPU
+ * byte order - confirm the intended sysfs format against the spec.
+ */
+static ssize_t smpro_internal_err_read(struct device *dev, struct device_attribute *da,
+ char *buf, int channel)
+{
+ struct smpro_errmon *errmon = dev_get_drvdata(dev);
+ struct smpro_int_error_hdr *err_info;
+ unsigned int err[4] = { 0 };
+ unsigned int err_type;
+ unsigned int val;
+ int ret;
+
+ /* read error status */
+ ret = regmap_read(errmon->regmap, GPI_RAS_ERR, &val);
+ if (ret)
+ return ret;
+
+ if ((channel == RAS_SMPRO_ERR && !(val & BIT(0))) ||
+ (channel == RAS_PMPRO_ERR && !(val & BIT(1))))
+ return 0;
+
+ err_info = &list_smpro_int_error_hdr[channel];
+ ret = regmap_read(errmon->regmap, err_info->type, &val);
+ if (ret)
+ return ret;
+
+ err_type = (val & BIT(1)) ? BIT(1) :
+ (val & BIT(2)) ? BIT(2) : 0;
+
+ if (!err_type)
+ return 0;
+
+ ret = regmap_read(errmon->regmap, err_info->info_l, err + 1);
+ if (ret)
+ return ret;
+
+ ret = regmap_read(errmon->regmap, err_info->info_h, err);
+ if (ret)
+ return ret;
+
+ if (err_type & BIT(2)) {
+ /* Error with data type */
+ ret = regmap_read(errmon->regmap, err_info->data_l, err + 3);
+ if (ret)
+ return ret;
+
+ ret = regmap_read(errmon->regmap, err_info->data_h, err + 2);
+ if (ret)
+ return ret;
+ }
+
+ /* clear the read errors */
+ ret = regmap_write(errmon->regmap, err_info->type, err_type);
+ if (ret)
+ return ret;
+
+ return sysfs_emit(buf, "%*phN\n", (int)sizeof(err), err);
+}
+
+/*
+ * Read and clear a pending internal warning of the SMpro or PMpro
+ * (selected by @channel, RAS_SMPRO_ERR or RAS_PMPRO_ERR).
+ *
+ * Output format:
+ * <4-byte hex value of warning info>
+ * Reference to section 5.10 RAS Internal Error Register Definition in
+ * Altra SOC BMC Interface specification
+ *
+ * Nothing is emitted (return value 0) when GPI_RAS_ERR does not flag the
+ * channel or when bit 0 of the type register is clear. A failing register
+ * access is reported with the regmap error code.
+ */
+static ssize_t smpro_internal_warn_read(struct device *dev, struct device_attribute *da,
+					char *buf, int channel)
+{
+	struct smpro_errmon *errmon = dev_get_drvdata(dev);
+	struct smpro_int_error_hdr *hdr;
+	unsigned int warn[2] = { 0 };
+	unsigned int status;
+	int ret;
+
+	/* Is anything pending for this channel at all? */
+	ret = regmap_read(errmon->regmap, GPI_RAS_ERR, &status);
+	if (ret)
+		return ret;
+
+	if (channel == RAS_SMPRO_ERR && !(status & BIT(0)))
+		return 0;
+	if (channel == RAS_PMPRO_ERR && !(status & BIT(1)))
+		return 0;
+
+	/* Bit 0 of the type register marks a warning */
+	hdr = &list_smpro_int_error_hdr[channel];
+	ret = regmap_read(errmon->regmap, hdr->type, &status);
+	if (ret)
+		return ret;
+	if (!(status & BIT(0)))
+		return 0;
+
+	/* Low register first, stored after the high one in the output buffer */
+	ret = regmap_read(errmon->regmap, hdr->warn_l, &warn[1]);
+	if (!ret)
+		ret = regmap_read(errmon->regmap, hdr->warn_h, &warn[0]);
+	/* clear the warning only once both halves were read */
+	if (!ret)
+		ret = regmap_write(errmon->regmap, hdr->type, BIT(0));
+	if (ret)
+		return ret;
+
+	return sysfs_emit(buf, "%*phN\n", (int)sizeof(warn), warn);
+}
+
+#define ERROR_OVERFLOW_RO(_error, _index) \
+ static ssize_t overflow_##_error##_show(struct device *dev, \
+ struct device_attribute *da, \
+ char *buf) \
+ { \
+ return smpro_overflow_data_read(dev, da, buf, _index); \
+ } \
+ static DEVICE_ATTR_RO(overflow_##_error)
+
+ERROR_OVERFLOW_RO(core_ce, CORE_CE_ERR);
+ERROR_OVERFLOW_RO(core_ue, CORE_UE_ERR);
+ERROR_OVERFLOW_RO(mem_ce, MEM_CE_ERR);
+ERROR_OVERFLOW_RO(mem_ue, MEM_UE_ERR);
+ERROR_OVERFLOW_RO(pcie_ce, PCIE_CE_ERR);
+ERROR_OVERFLOW_RO(pcie_ue, PCIE_UE_ERR);
+ERROR_OVERFLOW_RO(other_ce, OTHER_CE_ERR);
+ERROR_OVERFLOW_RO(other_ue, OTHER_UE_ERR);
+
+#define ERROR_RO(_error, _index) \
+ static ssize_t error_##_error##_show(struct device *dev, \
+ struct device_attribute *da, \
+ char *buf) \
+ { \
+ return smpro_error_data_read(dev, da, buf, _index); \
+ } \
+ static DEVICE_ATTR_RO(error_##_error)
+
+ERROR_RO(core_ce, CORE_CE_ERR);
+ERROR_RO(core_ue, CORE_UE_ERR);
+ERROR_RO(mem_ce, MEM_CE_ERR);
+ERROR_RO(mem_ue, MEM_UE_ERR);
+ERROR_RO(pcie_ce, PCIE_CE_ERR);
+ERROR_RO(pcie_ue, PCIE_UE_ERR);
+ERROR_RO(other_ce, OTHER_CE_ERR);
+ERROR_RO(other_ue, OTHER_UE_ERR);
+
+static ssize_t error_smpro_show(struct device *dev, struct device_attribute *da, char *buf)
+{
+ return smpro_internal_err_read(dev, da, buf, RAS_SMPRO_ERR);
+}
+static DEVICE_ATTR_RO(error_smpro);
+
+static ssize_t error_pmpro_show(struct device *dev, struct device_attribute *da, char *buf)
+{
+ return smpro_internal_err_read(dev, da, buf, RAS_PMPRO_ERR);
+}
+static DEVICE_ATTR_RO(error_pmpro);
+
+static ssize_t warn_smpro_show(struct device *dev, struct device_attribute *da, char *buf)
+{
+ return smpro_internal_warn_read(dev, da, buf, RAS_SMPRO_ERR);
+}
+static DEVICE_ATTR_RO(warn_smpro);
+
+static ssize_t warn_pmpro_show(struct device *dev, struct device_attribute *da, char *buf)
+{
+ return smpro_internal_warn_read(dev, da, buf, RAS_PMPRO_ERR);
+}
+static DEVICE_ATTR_RO(warn_pmpro);
+
+#define EVENT_RO(_event, _index) \
+ static ssize_t event_##_event##_show(struct device *dev, \
+ struct device_attribute *da, \
+ char *buf) \
+ { \
+ return smpro_event_data_read(dev, da, buf, _index); \
+ } \
+ static DEVICE_ATTR_RO(event_##_event)
+
+EVENT_RO(vrd_warn_fault, VRD_WARN_FAULT_EVENT);
+EVENT_RO(vrd_hot, VRD_HOT_EVENT);
+EVENT_RO(dimm_hot, DIMM_HOT_EVENT);
+
+static struct attribute *smpro_errmon_attrs[] = {
+ &dev_attr_overflow_core_ce.attr,
+ &dev_attr_overflow_core_ue.attr,
+ &dev_attr_overflow_mem_ce.attr,
+ &dev_attr_overflow_mem_ue.attr,
+ &dev_attr_overflow_pcie_ce.attr,
+ &dev_attr_overflow_pcie_ue.attr,
+ &dev_attr_overflow_other_ce.attr,
+ &dev_attr_overflow_other_ue.attr,
+ &dev_attr_error_core_ce.attr,
+ &dev_attr_error_core_ue.attr,
+ &dev_attr_error_mem_ce.attr,
+ &dev_attr_error_mem_ue.attr,
+ &dev_attr_error_pcie_ce.attr,
+ &dev_attr_error_pcie_ue.attr,
+ &dev_attr_error_other_ce.attr,
+ &dev_attr_error_other_ue.attr,
+ &dev_attr_error_smpro.attr,
+ &dev_attr_error_pmpro.attr,
+ &dev_attr_warn_smpro.attr,
+ &dev_attr_warn_pmpro.attr,
+ &dev_attr_event_vrd_warn_fault.attr,
+ &dev_attr_event_vrd_hot.attr,
+ &dev_attr_event_dimm_hot.attr,
+ NULL
+};
+
+ATTRIBUTE_GROUPS(smpro_errmon);
+
+/*
+ * Allocate the driver state, attach it to the platform device and look up
+ * the regmap registered by the parent device.
+ *
+ * Returns 0 on success, -ENOMEM if the allocation fails, or -ENODEV if the
+ * parent has no regmap.
+ */
+static int smpro_errmon_probe(struct platform_device *pdev)
+{
+	struct device *dev = &pdev->dev;
+	struct smpro_errmon *errmon;
+
+	errmon = devm_kzalloc(dev, sizeof(*errmon), GFP_KERNEL);
+	if (!errmon)
+		return -ENOMEM;
+
+	platform_set_drvdata(pdev, errmon);
+
+	errmon->regmap = dev_get_regmap(dev->parent, NULL);
+
+	return errmon->regmap ? 0 : -ENODEV;
+}
+
+static struct platform_driver smpro_errmon_driver = {
+ .probe = smpro_errmon_probe,
+ .driver = {
+ .name = "smpro-errmon",
+ .dev_groups = smpro_errmon_groups,
+ },
+};
+
+module_platform_driver(smpro_errmon_driver);
+
+MODULE_AUTHOR("Tung Nguyen <[email protected]>");
+MODULE_AUTHOR("Thinh Pham <[email protected]>");
+MODULE_AUTHOR("Hoang Nguyen <[email protected]>");
+MODULE_AUTHOR("Thu Nguyen <[email protected]>");
+MODULE_AUTHOR("Quan Nguyen <[email protected]>");
+MODULE_DESCRIPTION("Ampere Altra SMpro driver");
+MODULE_LICENSE("GPL");
--
2.35.1


2022-09-29 10:17:43

by Greg Kroah-Hartman

[permalink] [raw]
Subject: Re: [PATCH v9 3/9] misc: smpro-errmon: Add Ampere's SMpro error monitor driver

On Thu, Sep 29, 2022 at 04:43:15PM +0700, Quan Nguyen wrote:
> This commit adds Ampere's SMpro error monitor driver for monitoring
> and reporting RAS-related errors as reported by SMpro co-processor
> found on Ampere's Altra processor family.
>
> Signed-off-by: Quan Nguyen <[email protected]>
> ---
> Changes in v9:
> + Fix ugly static struct define [Greg]
> + Remove unused defines and update documentation [Quan]
> + Add minor refactor code [Quan]
> + Fix messy goto [Greg]
> + Update SPDX licence [Greg]
> + Use ATTRIBUTE_GROUPS() [Greg]
> + Use dev_groups instead of sysfs_create_group() to avoid
> racing issue with user space [Greg]
> + Refactor code to fix unnecessary initialization issue [Quan]
> + Refactor code to avoid clever encoding issue [Quan]
> + Separate error_[smpro|pmpro] to error_* and warn_* [Quan]
> + Add minor code refactor [Quan]
>
> Changes in v8:
> + Update wording for SMPRO_ERRMON on Kconfig file [Quan]
> + Avoid uninitialized variable use [kernel test robot]
> + Switch to use sysfs_emit() [Greg]
> + Make sysfs to return single value [Greg]
> + Change errors_* sysfs to error_* [Quan]
> + Add overflow_[core|mem|pcie|other]_[ce|ue] sysfs to report
> overflow status of each type of HW errors [Quan]
> + Add some minor refactor [Quan]
>
> Changes in v7:
> + Remove regmap_acquire/release_lock(), read_i2c_block_data() [Quan]
> + Use regmap_noinc_read() instead of errmon_read_block() [Quan]
> + Validate number of errors before read [Quan]
> + Fix wrong return type of *_show() function [kernel test robot]
> + Adjust patch order to avoid dependence with smpro-mfd [Lee Jones]
> + Use pointer instead of stack memory [Quan]
>
> Changes in v6:
> + First introduced in v6 [Quan]
>
> drivers/misc/Kconfig | 12 +
> drivers/misc/Makefile | 1 +
> drivers/misc/smpro-errmon.c | 529 ++++++++++++++++++++++++++++++++++++
> 3 files changed, 542 insertions(+)
> create mode 100644 drivers/misc/smpro-errmon.c
>
> diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig
> index 358ad56f6524..b9ceee949dab 100644
> --- a/drivers/misc/Kconfig
> +++ b/drivers/misc/Kconfig
> @@ -176,6 +176,18 @@ config SGI_XP
> this feature will allow for direct communication between SSIs
> based on a network adapter and DMA messaging.
>
> +config SMPRO_ERRMON
> + tristate "Ampere Computing SMPro error monitor driver"
> + depends on MFD_SMPRO || COMPILE_TEST
> + help
> + Say Y here to get support for the SMpro error monitor function
> + provided by Ampere Computing's Altra and Altra Max SoCs. Upon
> + loading, the driver creates sysfs files which can be use to gather
> + multiple HW error data reported via read and write system calls.
> +
> + To compile this driver as a module, say M here. The driver will be
> + called smpro-errmon.
> +
> config CS5535_MFGPT
> tristate "CS5535/CS5536 Geode Multi-Function General Purpose Timer (MFGPT) support"
> depends on MFD_CS5535
> diff --git a/drivers/misc/Makefile b/drivers/misc/Makefile
> index ac9b3e757ba1..bbe24d4511a3 100644
> --- a/drivers/misc/Makefile
> +++ b/drivers/misc/Makefile
> @@ -23,6 +23,7 @@ obj-$(CONFIG_ENCLOSURE_SERVICES) += enclosure.o
> obj-$(CONFIG_KGDB_TESTS) += kgdbts.o
> obj-$(CONFIG_SGI_XP) += sgi-xp/
> obj-$(CONFIG_SGI_GRU) += sgi-gru/
> +obj-$(CONFIG_SMPRO_ERRMON) += smpro-errmon.o
> obj-$(CONFIG_CS5535_MFGPT) += cs5535-mfgpt.o
> obj-$(CONFIG_GEHC_ACHC) += gehc-achc.o
> obj-$(CONFIG_HP_ILO) += hpilo.o
> diff --git a/drivers/misc/smpro-errmon.c b/drivers/misc/smpro-errmon.c
> new file mode 100644
> index 000000000000..d1431d419aa4
> --- /dev/null
> +++ b/drivers/misc/smpro-errmon.c
> @@ -0,0 +1,529 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + * Ampere Computing SoC's SMpro Error Monitoring Driver
> + *
> + * Copyright (c) 2022, Ampere Computing LLC
> + *
> + */
> +
> +#include <linux/i2c.h>
> +#include <linux/mod_devicetable.h>
> +#include <linux/module.h>
> +#include <linux/platform_device.h>
> +#include <linux/regmap.h>
> +
> +/* GPI RAS Error Registers */
> +#define GPI_RAS_ERR 0x7E
> +
> +/* Core and L2C Error Registers */
> +#define CORE_CE_ERR_CNT 0x80
> +#define CORE_CE_ERR_LEN 0x81
> +#define CORE_CE_ERR_DATA 0x82
> +#define CORE_UE_ERR_CNT 0x83
> +#define CORE_UE_ERR_LEN 0x84
> +#define CORE_UE_ERR_DATA 0x85
> +
> +/* Memory Error Registers */
> +#define MEM_CE_ERR_CNT 0x90
> +#define MEM_CE_ERR_LEN 0x91
> +#define MEM_CE_ERR_DATA 0x92
> +#define MEM_UE_ERR_CNT 0x93
> +#define MEM_UE_ERR_LEN 0x94
> +#define MEM_UE_ERR_DATA 0x95
> +
> +/* RAS Error/Warning Registers */
> +#define ERR_SMPRO_TYPE 0xA0
> +#define ERR_PMPRO_TYPE 0xA1
> +#define ERR_SMPRO_INFO_LO 0xA2
> +#define ERR_SMPRO_INFO_HI 0xA3
> +#define ERR_SMPRO_DATA_LO 0xA4
> +#define ERR_SMPRO_DATA_HI 0xA5
> +#define WARN_SMPRO_INFO_LO 0xAA
> +#define WARN_SMPRO_INFO_HI 0xAB
> +#define ERR_PMPRO_INFO_LO 0xA6
> +#define ERR_PMPRO_INFO_HI 0xA7
> +#define ERR_PMPRO_DATA_LO 0xA8
> +#define ERR_PMPRO_DATA_HI 0xA9
> +#define WARN_PMPRO_INFO_LO 0xAC
> +#define WARN_PMPRO_INFO_HI 0xAD
> +
> +/* PCIE Error Registers */
> +#define PCIE_CE_ERR_CNT 0xC0
> +#define PCIE_CE_ERR_LEN 0xC1
> +#define PCIE_CE_ERR_DATA 0xC2
> +#define PCIE_UE_ERR_CNT 0xC3
> +#define PCIE_UE_ERR_LEN 0xC4
> +#define PCIE_UE_ERR_DATA 0xC5
> +
> +/* Other Error Registers */
> +#define OTHER_CE_ERR_CNT 0xD0
> +#define OTHER_CE_ERR_LEN 0xD1
> +#define OTHER_CE_ERR_DATA 0xD2
> +#define OTHER_UE_ERR_CNT 0xD8
> +#define OTHER_UE_ERR_LEN 0xD9
> +#define OTHER_UE_ERR_DATA 0xDA
> +
> +/* Event Data Registers */
> +#define VRD_WARN_FAULT_EVENT_DATA 0x78
> +#define VRD_HOT_EVENT_DATA 0x79
> +#define DIMM_HOT_EVENT_DATA 0x7A
> +
> +#define MAX_READ_BLOCK_LENGTH 48
> +
> +#define RAS_SMPRO_ERR 0
> +#define RAS_PMPRO_ERR 1
> +
> +enum RAS_48BYTES_ERR_TYPES {
> + CORE_CE_ERR,
> + CORE_UE_ERR,
> + MEM_CE_ERR,
> + MEM_UE_ERR,
> + PCIE_CE_ERR,
> + PCIE_UE_ERR,
> + OTHER_CE_ERR,
> + OTHER_UE_ERR,
> + NUM_48BYTES_ERR_TYPE,
> +};
> +
> +struct smpro_error_hdr {
> + u8 count; /* Number of the RAS errors */
> + u8 len; /* Number of data bytes */
> + u8 data; /* Start of 48-byte data */
> + u8 max_cnt; /* Max num of errors */
> +};
> +
> +/*
> + * Included Address of registers to get Count, Length of data and Data
> + * of the 48 bytes error data
> + */
> +static struct smpro_error_hdr smpro_error_table[] = {
> + [CORE_CE_ERR] = {
> + .count = CORE_CE_ERR_CNT,
> + .len = CORE_CE_ERR_LEN,
> + .data = CORE_CE_ERR_DATA,
> + .max_cnt = 32
> + },
> + [CORE_UE_ERR] = {
> + .count = CORE_UE_ERR_CNT,
> + .len = CORE_UE_ERR_LEN,
> + .data = CORE_UE_ERR_DATA,
> + .max_cnt = 32
> + },
> + [MEM_CE_ERR] = {
> + .count = MEM_CE_ERR_CNT,
> + .len = MEM_CE_ERR_LEN,
> + .data = MEM_CE_ERR_DATA,
> + .max_cnt = 16
> + },
> + [MEM_UE_ERR] = {
> + .count = MEM_UE_ERR_CNT,
> + .len = MEM_UE_ERR_LEN,
> + .data = MEM_UE_ERR_DATA,
> + .max_cnt = 16
> + },
> + [PCIE_CE_ERR] = {
> + .count = PCIE_CE_ERR_CNT,
> + .len = PCIE_CE_ERR_LEN,
> + .data = PCIE_CE_ERR_DATA,
> + .max_cnt = 96
> + },
> + [PCIE_UE_ERR] = {
> + .count = PCIE_UE_ERR_CNT,
> + .len = PCIE_UE_ERR_LEN,
> + .data = PCIE_UE_ERR_DATA,
> + .max_cnt = 96
> + },
> + [OTHER_CE_ERR] = {
> + .count = OTHER_CE_ERR_CNT,
> + .len = OTHER_CE_ERR_LEN,
> + .data = OTHER_CE_ERR_DATA,
> + .max_cnt = 8
> + },
> + [OTHER_UE_ERR] = {
> + .count = OTHER_UE_ERR_CNT,
> + .len = OTHER_UE_ERR_LEN,
> + .data = OTHER_UE_ERR_DATA,
> + .max_cnt = 8
> + },
> +};
> +
> +/*
> + * List of SCP registers which are used to get
> + * one type of RAS Internal errors.
> + */
> +struct smpro_int_error_hdr {
> + u8 type;
> + u8 info_l;
> + u8 info_h;
> + u8 data_l;
> + u8 data_h;
> + u8 warn_l;
> + u8 warn_h;
> +};
> +
> +static struct smpro_int_error_hdr list_smpro_int_error_hdr[] = {
> + [RAS_SMPRO_ERR] = {
> + .type = ERR_SMPRO_TYPE,
> + .info_l = ERR_SMPRO_INFO_LO,
> + .info_h = ERR_SMPRO_INFO_HI,
> + .data_l = ERR_SMPRO_DATA_LO,
> + .data_h = ERR_SMPRO_DATA_HI,
> + .warn_l = WARN_SMPRO_INFO_LO,
> + .warn_h = WARN_SMPRO_INFO_HI,
> + },
> + [RAS_PMPRO_ERR] = {
> + .type = ERR_PMPRO_TYPE,
> + .info_l = ERR_PMPRO_INFO_LO,
> + .info_h = ERR_PMPRO_INFO_HI,
> + .data_l = ERR_PMPRO_DATA_LO,
> + .data_h = ERR_PMPRO_DATA_HI,
> + .warn_l = WARN_PMPRO_INFO_LO,
> + .warn_h = WARN_PMPRO_INFO_HI,
> + },
> +};
> +
> +struct smpro_errmon {
> + struct regmap *regmap;
> +};
> +
> +enum EVENT_TYPES {
> + VRD_WARN_FAULT_EVENT,
> + VRD_HOT_EVENT,
> + DIMM_HOT_EVENT,
> + NUM_EVENTS_TYPE,
> +};
> +
> +/* Included Address of event source and data registers */
> +static u8 smpro_event_table[NUM_EVENTS_TYPE] = {
> + VRD_WARN_FAULT_EVENT_DATA,
> + VRD_HOT_EVENT_DATA,
> + DIMM_HOT_EVENT_DATA,
> +};
> +
> +static ssize_t smpro_event_data_read(struct device *dev,
> + struct device_attribute *da, char *buf,
> + int channel)
> +{
> + struct smpro_errmon *errmon = dev_get_drvdata(dev);
> + s32 event_data;
> + int ret;
> +
> + ret = regmap_read(errmon->regmap, smpro_event_table[channel], &event_data);
> + if (ret)
> + return ret;
> + /* Clear event after read */
> + if (event_data != 0)
> + regmap_write(errmon->regmap, smpro_event_table[channel], event_data);
> +
> + return sysfs_emit(buf, "%04x\n", event_data);
> +}
> +
> +static ssize_t smpro_overflow_data_read(struct device *dev, struct device_attribute *da,
> + char *buf, int channel)
> +{
> + struct smpro_errmon *errmon = dev_get_drvdata(dev);
> + struct smpro_error_hdr *err_info;
> + s32 err_count;
> + int ret;
> +
> + err_info = &smpro_error_table[channel];
> +
> + ret = regmap_read(errmon->regmap, err_info->count, &err_count);
> + if (ret)
> + return ret;
> +
> + /* Bit 8 indicates the overflow status */
> + return sysfs_emit(buf, "%d\n", (err_count & BIT(8)) ? 1 : 0);
> +}

Where is the Documentation/ABI/ entry for this field?

Please put that in the same commit so that it is easier to validate that
you really did document everything properly.

thanks,

greg k-h

2022-10-06 07:53:25

by Quan Nguyen

[permalink] [raw]
Subject: Re: [PATCH v9 3/9] misc: smpro-errmon: Add Ampere's SMpro error monitor driver



On 29/09/2022 16:53, Greg Kroah-Hartman wrote:
> On Thu, Sep 29, 2022 at 04:43:15PM +0700, Quan Nguyen wrote:
>> This commit adds Ampere's SMpro error monitor driver for monitoring
>> and reporting RAS-related errors as reported by SMpro co-processor
>> found on Ampere's Altra processor family.
>>
>> Signed-off-by: Quan Nguyen <[email protected]>
>> ---
>> Changes in v9:
>> + Fix ugly static struct define [Greg]
>> + Remove unused defines and update documentation [Quan]
>> + Add minor refactor code [Quan]
>> + Fix messy goto [Greg]
>> + Update SPDX licence [Greg]
>> + Use ATTRIBUTE_GROUPS() [Greg]
>> + Use dev_groups instead of sysfs_create_group() to avoid
>> racing issue with user space [Greg]
>> + Refactor code to fix unnecessary initialization issue [Quan]
>> + Refactor code to avoid clever encoding issue [Quan]
>> + Separate error_[smpro|pmpro] to error_* and warn_* [Quan]
>> + Add minor code refactor [Quan]
>>
>> Changes in v8:
>> + Update wording for SMPRO_ERRMON on Kconfig file [Quan]
>> + Avoid uninitialized variable use [kernel test robot]
>> + Switch to use sysfs_emit() [Greg]
>> + Make sysfs to return single value [Greg]
>> + Change errors_* sysfs to error_* [Quan]
>> + Add overflow_[core|mem|pcie|other]_[ce|ue] sysfs to report
>> overflow status of each type of HW errors [Quan]
>> + Add some minor refactor [Quan]
>>
>> Changes in v7:
>> + Remove regmap_acquire/release_lock(), read_i2c_block_data() [Quan]
>> + Use regmap_noinc_read() instead of errmon_read_block() [Quan]
>> + Validate number of errors before read [Quan]
>> + Fix wrong return type of *_show() function [kernel test robot]
>> + Adjust patch order to avoid dependence with smpro-mfd [Lee Jones]
>> + Use pointer instead of stack memory [Quan]
>>
>> Changes in v6:
>> + First introduced in v6 [Quan]
>>
>> drivers/misc/Kconfig | 12 +
>> drivers/misc/Makefile | 1 +
>> drivers/misc/smpro-errmon.c | 529 ++++++++++++++++++++++++++++++++++++
>> 3 files changed, 542 insertions(+)
>> create mode 100644 drivers/misc/smpro-errmon.c
>>
>> diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig
>> index 358ad56f6524..b9ceee949dab 100644
>> --- a/drivers/misc/Kconfig
>> +++ b/drivers/misc/Kconfig
>> @@ -176,6 +176,18 @@ config SGI_XP
>> this feature will allow for direct communication between SSIs
>> based on a network adapter and DMA messaging.
>>
>> +config SMPRO_ERRMON
>> + tristate "Ampere Computing SMPro error monitor driver"
>> + depends on MFD_SMPRO || COMPILE_TEST
>> + help
>> + Say Y here to get support for the SMpro error monitor function
>> + provided by Ampere Computing's Altra and Altra Max SoCs. Upon
>> + loading, the driver creates sysfs files which can be use to gather
>> + multiple HW error data reported via read and write system calls.
>> +
>> + To compile this driver as a module, say M here. The driver will be
>> + called smpro-errmon.
>> +
>> config CS5535_MFGPT
>> tristate "CS5535/CS5536 Geode Multi-Function General Purpose Timer (MFGPT) support"
>> depends on MFD_CS5535
>> diff --git a/drivers/misc/Makefile b/drivers/misc/Makefile
>> index ac9b3e757ba1..bbe24d4511a3 100644
>> --- a/drivers/misc/Makefile
>> +++ b/drivers/misc/Makefile
>> @@ -23,6 +23,7 @@ obj-$(CONFIG_ENCLOSURE_SERVICES) += enclosure.o
>> obj-$(CONFIG_KGDB_TESTS) += kgdbts.o
>> obj-$(CONFIG_SGI_XP) += sgi-xp/
>> obj-$(CONFIG_SGI_GRU) += sgi-gru/
>> +obj-$(CONFIG_SMPRO_ERRMON) += smpro-errmon.o
>> obj-$(CONFIG_CS5535_MFGPT) += cs5535-mfgpt.o
>> obj-$(CONFIG_GEHC_ACHC) += gehc-achc.o
>> obj-$(CONFIG_HP_ILO) += hpilo.o
>> diff --git a/drivers/misc/smpro-errmon.c b/drivers/misc/smpro-errmon.c
>> new file mode 100644
>> index 000000000000..d1431d419aa4
>> --- /dev/null
>> +++ b/drivers/misc/smpro-errmon.c
>> @@ -0,0 +1,529 @@
>> +// SPDX-License-Identifier: GPL-2.0-only
>> +/*
>> + * Ampere Computing SoC's SMpro Error Monitoring Driver
>> + *
>> + * Copyright (c) 2022, Ampere Computing LLC
>> + *
>> + */
>> +
>> +#include <linux/i2c.h>
>> +#include <linux/mod_devicetable.h>
>> +#include <linux/module.h>
>> +#include <linux/platform_device.h>
>> +#include <linux/regmap.h>
>> +
>> +/* GPI RAS Error Registers */
>> +#define GPI_RAS_ERR 0x7E
>> +
>> +/* Core and L2C Error Registers */
>> +#define CORE_CE_ERR_CNT 0x80
>> +#define CORE_CE_ERR_LEN 0x81
>> +#define CORE_CE_ERR_DATA 0x82
>> +#define CORE_UE_ERR_CNT 0x83
>> +#define CORE_UE_ERR_LEN 0x84
>> +#define CORE_UE_ERR_DATA 0x85
>> +
>> +/* Memory Error Registers */
>> +#define MEM_CE_ERR_CNT 0x90
>> +#define MEM_CE_ERR_LEN 0x91
>> +#define MEM_CE_ERR_DATA 0x92
>> +#define MEM_UE_ERR_CNT 0x93
>> +#define MEM_UE_ERR_LEN 0x94
>> +#define MEM_UE_ERR_DATA 0x95
>> +
>> +/* RAS Error/Warning Registers */
>> +#define ERR_SMPRO_TYPE 0xA0
>> +#define ERR_PMPRO_TYPE 0xA1
>> +#define ERR_SMPRO_INFO_LO 0xA2
>> +#define ERR_SMPRO_INFO_HI 0xA3
>> +#define ERR_SMPRO_DATA_LO 0xA4
>> +#define ERR_SMPRO_DATA_HI 0xA5
>> +#define WARN_SMPRO_INFO_LO 0xAA
>> +#define WARN_SMPRO_INFO_HI 0xAB
>> +#define ERR_PMPRO_INFO_LO 0xA6
>> +#define ERR_PMPRO_INFO_HI 0xA7
>> +#define ERR_PMPRO_DATA_LO 0xA8
>> +#define ERR_PMPRO_DATA_HI 0xA9
>> +#define WARN_PMPRO_INFO_LO 0xAC
>> +#define WARN_PMPRO_INFO_HI 0xAD
>> +
>> +/* PCIE Error Registers */
>> +#define PCIE_CE_ERR_CNT 0xC0
>> +#define PCIE_CE_ERR_LEN 0xC1
>> +#define PCIE_CE_ERR_DATA 0xC2
>> +#define PCIE_UE_ERR_CNT 0xC3
>> +#define PCIE_UE_ERR_LEN 0xC4
>> +#define PCIE_UE_ERR_DATA 0xC5
>> +
>> +/* Other Error Registers */
>> +#define OTHER_CE_ERR_CNT 0xD0
>> +#define OTHER_CE_ERR_LEN 0xD1
>> +#define OTHER_CE_ERR_DATA 0xD2
>> +#define OTHER_UE_ERR_CNT 0xD8
>> +#define OTHER_UE_ERR_LEN 0xD9
>> +#define OTHER_UE_ERR_DATA 0xDA
>> +
>> +/* Event Data Registers */
>> +#define VRD_WARN_FAULT_EVENT_DATA 0x78
>> +#define VRD_HOT_EVENT_DATA 0x79
>> +#define DIMM_HOT_EVENT_DATA 0x7A
>> +
>> +#define MAX_READ_BLOCK_LENGTH 48
>> +
>> +#define RAS_SMPRO_ERR 0
>> +#define RAS_PMPRO_ERR 1
>> +
>> +enum RAS_48BYTES_ERR_TYPES {
>> + CORE_CE_ERR,
>> + CORE_UE_ERR,
>> + MEM_CE_ERR,
>> + MEM_UE_ERR,
>> + PCIE_CE_ERR,
>> + PCIE_UE_ERR,
>> + OTHER_CE_ERR,
>> + OTHER_UE_ERR,
>> + NUM_48BYTES_ERR_TYPE,
>> +};
>> +
>> +struct smpro_error_hdr {
>> + u8 count; /* Number of the RAS errors */
>> + u8 len; /* Number of data bytes */
>> + u8 data; /* Start of 48-byte data */
>> + u8 max_cnt; /* Max num of errors */
>> +};
>> +
>> +/*
>> + * Included Address of registers to get Count, Length of data and Data
>> + * of the 48 bytes error data
>> + */
>> +static struct smpro_error_hdr smpro_error_table[] = {
>> + [CORE_CE_ERR] = {
>> + .count = CORE_CE_ERR_CNT,
>> + .len = CORE_CE_ERR_LEN,
>> + .data = CORE_CE_ERR_DATA,
>> + .max_cnt = 32
>> + },
>> + [CORE_UE_ERR] = {
>> + .count = CORE_UE_ERR_CNT,
>> + .len = CORE_UE_ERR_LEN,
>> + .data = CORE_UE_ERR_DATA,
>> + .max_cnt = 32
>> + },
>> + [MEM_CE_ERR] = {
>> + .count = MEM_CE_ERR_CNT,
>> + .len = MEM_CE_ERR_LEN,
>> + .data = MEM_CE_ERR_DATA,
>> + .max_cnt = 16
>> + },
>> + [MEM_UE_ERR] = {
>> + .count = MEM_UE_ERR_CNT,
>> + .len = MEM_UE_ERR_LEN,
>> + .data = MEM_UE_ERR_DATA,
>> + .max_cnt = 16
>> + },
>> + [PCIE_CE_ERR] = {
>> + .count = PCIE_CE_ERR_CNT,
>> + .len = PCIE_CE_ERR_LEN,
>> + .data = PCIE_CE_ERR_DATA,
>> + .max_cnt = 96
>> + },
>> + [PCIE_UE_ERR] = {
>> + .count = PCIE_UE_ERR_CNT,
>> + .len = PCIE_UE_ERR_LEN,
>> + .data = PCIE_UE_ERR_DATA,
>> + .max_cnt = 96
>> + },
>> + [OTHER_CE_ERR] = {
>> + .count = OTHER_CE_ERR_CNT,
>> + .len = OTHER_CE_ERR_LEN,
>> + .data = OTHER_CE_ERR_DATA,
>> + .max_cnt = 8
>> + },
>> + [OTHER_UE_ERR] = {
>> + .count = OTHER_UE_ERR_CNT,
>> + .len = OTHER_UE_ERR_LEN,
>> + .data = OTHER_UE_ERR_DATA,
>> + .max_cnt = 8
>> + },
>> +};
>> +
>> +/*
>> + * List of SCP registers which are used to get
>> + * one type of RAS Internal errors.
>> + */
>> +struct smpro_int_error_hdr {
>> + u8 type;
>> + u8 info_l;
>> + u8 info_h;
>> + u8 data_l;
>> + u8 data_h;
>> + u8 warn_l;
>> + u8 warn_h;
>> +};
>> +
>> +static struct smpro_int_error_hdr list_smpro_int_error_hdr[] = {
>> + [RAS_SMPRO_ERR] = {
>> + .type = ERR_SMPRO_TYPE,
>> + .info_l = ERR_SMPRO_INFO_LO,
>> + .info_h = ERR_SMPRO_INFO_HI,
>> + .data_l = ERR_SMPRO_DATA_LO,
>> + .data_h = ERR_SMPRO_DATA_HI,
>> + .warn_l = WARN_SMPRO_INFO_LO,
>> + .warn_h = WARN_SMPRO_INFO_HI,
>> + },
>> + [RAS_PMPRO_ERR] = {
>> + .type = ERR_PMPRO_TYPE,
>> + .info_l = ERR_PMPRO_INFO_LO,
>> + .info_h = ERR_PMPRO_INFO_HI,
>> + .data_l = ERR_PMPRO_DATA_LO,
>> + .data_h = ERR_PMPRO_DATA_HI,
>> + .warn_l = WARN_PMPRO_INFO_LO,
>> + .warn_h = WARN_PMPRO_INFO_HI,
>> + },
>> +};
>> +
>> +struct smpro_errmon {
>> + struct regmap *regmap;
>> +};
>> +
>> +enum EVENT_TYPES {
>> + VRD_WARN_FAULT_EVENT,
>> + VRD_HOT_EVENT,
>> + DIMM_HOT_EVENT,
>> + NUM_EVENTS_TYPE,
>> +};
>> +
>> +/* Included Address of event source and data registers */
>> +static u8 smpro_event_table[NUM_EVENTS_TYPE] = {
>> + VRD_WARN_FAULT_EVENT_DATA,
>> + VRD_HOT_EVENT_DATA,
>> + DIMM_HOT_EVENT_DATA,
>> +};
>> +
>> +static ssize_t smpro_event_data_read(struct device *dev,
>> + struct device_attribute *da, char *buf,
>> + int channel)
>> +{
>> + struct smpro_errmon *errmon = dev_get_drvdata(dev);
>> + s32 event_data;
>> + int ret;
>> +
>> + ret = regmap_read(errmon->regmap, smpro_event_table[channel], &event_data);
>> + if (ret)
>> + return ret;
>> + /* Clear event after read */
>> + if (event_data != 0)
>> + regmap_write(errmon->regmap, smpro_event_table[channel], event_data);
>> +
>> + return sysfs_emit(buf, "%04x\n", event_data);
>> +}
>> +
>> +static ssize_t smpro_overflow_data_read(struct device *dev, struct device_attribute *da,
>> + char *buf, int channel)
>> +{
>> + struct smpro_errmon *errmon = dev_get_drvdata(dev);
>> + struct smpro_error_hdr *err_info;
>> + s32 err_count;
>> + int ret;
>> +
>> + err_info = &smpro_error_table[channel];
>> +
>> + ret = regmap_read(errmon->regmap, err_info->count, &err_count);
>> + if (ret)
>> + return ret;
>> +
>> + /* Bit 8 indicates the overflow status */
>> + return sysfs_emit(buf, "%d\n", (err_count & BIT(8)) ? 1 : 0);
>> +}
>
> Where is the Documentation/ABI/ entry for this field?
>
It is documented under the entry for the
"/sys/bus/platform/devices/smpro-errmon.*/overflow_[core|mem|pcie|other]_[ce|ue]"
in the [9/9] patch.

> Please put that in the same commit so that it is easier to validate that
> you really did document everything properly.
>

I'll follow this suggestion in the next version.

Thank you for the reviews,
- Quan