This patchset adds support for handling Deferred Errors (DEs) and also
cleans up the code for log_ue/log_ce and the corresponding APIs.
Kamati Srinivas (2):
edac: Modify sysfs enabled values log_ue, log_ce to bool
edac: Add support to handle DE (Deferred Errors)
drivers/edac/edac_device.c | 57 ++++++++++++++++++++++++++++++--
drivers/edac/edac_device.h | 34 +++++++++++++++++--
drivers/edac/edac_device_sysfs.c | 50 ++++++++++++++++++++++++----
3 files changed, 130 insertions(+), 11 deletions(-)
--
2.17.1
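Use kstrtobool() since simple_strtoul() is deprecated.
Also change the return type of edac_device_get_log_ue() and
edac_device_get_log_ce() from int to bool.
Note that input which kstrtobool() does not recognize is now rejected
with an error instead of being parsed as a number.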
Signed-off-by: Kamati Srinivas <[email protected]>
---
drivers/edac/edac_device.c | 4 ++--
drivers/edac/edac_device.h | 4 ++--
drivers/edac/edac_device_sysfs.c | 12 ++++++------
3 files changed, 10 insertions(+), 10 deletions(-)
diff --git a/drivers/edac/edac_device.c b/drivers/edac/edac_device.c
index 8c4d947fb848..ddfa094d0f3a 100644
--- a/drivers/edac/edac_device.c
+++ b/drivers/edac/edac_device.c
@@ -539,12 +539,12 @@ struct edac_device_ctl_info *edac_device_del_device(struct device *dev)
}
EXPORT_SYMBOL_GPL(edac_device_del_device);
-static inline int edac_device_get_log_ce(struct edac_device_ctl_info *edac_dev)
+static inline bool edac_device_get_log_ce(struct edac_device_ctl_info *edac_dev)
{
return edac_dev->log_ce;
}
-static inline int edac_device_get_log_ue(struct edac_device_ctl_info *edac_dev)
+static inline bool edac_device_get_log_ue(struct edac_device_ctl_info *edac_dev)
{
return edac_dev->log_ue;
}
diff --git a/drivers/edac/edac_device.h b/drivers/edac/edac_device.h
index c4c0e0bdce14..e1645332eaa2 100644
--- a/drivers/edac/edac_device.h
+++ b/drivers/edac/edac_device.h
@@ -157,8 +157,8 @@ struct edac_device_ctl_info {
int dev_idx;
/* Per instance controls for this edac_device */
- int log_ue; /* boolean for logging UEs */
- int log_ce; /* boolean for logging CEs */
+ bool log_ue; /* boolean for logging UEs */
+ bool log_ce; /* boolean for logging CEs */
int panic_on_ue; /* boolean for panic'ing on an UE */
unsigned poll_msec; /* number of milliseconds to poll interval */
unsigned long delay; /* number of jiffies for poll_msec */
diff --git a/drivers/edac/edac_device_sysfs.c b/drivers/edac/edac_device_sysfs.c
index 5e7593753799..51a3a90d7404 100644
--- a/drivers/edac/edac_device_sysfs.c
+++ b/drivers/edac/edac_device_sysfs.c
@@ -39,10 +39,10 @@ static ssize_t edac_device_ctl_log_ue_store(struct edac_device_ctl_info
*ctl_info, const char *data,
size_t count)
{
- /* if parameter is zero, turn off flag, if non-zero turn on flag */
- ctl_info->log_ue = (simple_strtoul(data, NULL, 0) != 0);
+ int ret;
- return count;
+ ret = kstrtobool(data, &ctl_info->log_ue);
+ return ret ? ret : count;
}
/* 'log_ce' */
@@ -56,10 +56,10 @@ static ssize_t edac_device_ctl_log_ce_store(struct edac_device_ctl_info
*ctl_info, const char *data,
size_t count)
{
- /* if parameter is zero, turn off flag, if non-zero turn on flag */
- ctl_info->log_ce = (simple_strtoul(data, NULL, 0) != 0);
+ int ret;
- return count;
+ ret = kstrtobool(data, &ctl_info->log_ce);
+ return ret ? ret : count;
}
/* 'panic_on_ue' */
--
2.17.1
The EDAC subsystem doesn't handle DEs (Deferred Errors). Due to this
lack of support, all DEs are treated either as CEs (Corrected Errors)
or as UEs (Uncorrected Errors).
To solve this, add a log control, a counter and the associated sysfs
entries so that an EDAC driver can be configured to handle DEs.
Signed-off-by: Kamati Srinivas <[email protected]>
---
drivers/edac/edac_device.c | 53 +++++++++++++++++++++++++++++++-
drivers/edac/edac_device.h | 30 ++++++++++++++++++
drivers/edac/edac_device_sysfs.c | 38 +++++++++++++++++++++++
3 files changed, 120 insertions(+), 1 deletion(-)
diff --git a/drivers/edac/edac_device.c b/drivers/edac/edac_device.c
index ddfa094d0f3a..28c355d07304 100644
--- a/drivers/edac/edac_device.c
+++ b/drivers/edac/edac_device.c
@@ -131,9 +131,10 @@ struct edac_device_ctl_info *edac_device_alloc_ctl_info(
dev_ctl->instances = dev_inst;
dev_ctl->pvt_info = pvt;
- /* Default logging of CEs and UEs */
+ /* Default logging of CEs, UEs and DEs */
dev_ctl->log_ce = 1;
dev_ctl->log_ue = 1;
+ dev_ctl->log_de = 1;
/* Name of this edac device */
snprintf(dev_ctl->name,sizeof(dev_ctl->name),"%s",edac_device_name);
@@ -544,6 +545,11 @@ static inline bool edac_device_get_log_ce(struct edac_device_ctl_info *edac_dev)
return edac_dev->log_ce;
}
+static inline bool edac_device_get_log_de(struct edac_device_ctl_info *edac_dev)
+{
+ return edac_dev->log_de;
+}
+
static inline bool edac_device_get_log_ue(struct edac_device_ctl_info *edac_dev)
{
return edac_dev->log_ue;
@@ -601,6 +607,51 @@ void edac_device_handle_ce_count(struct edac_device_ctl_info *edac_dev,
}
EXPORT_SYMBOL_GPL(edac_device_handle_ce_count);
+void edac_device_handle_de_count(struct edac_device_ctl_info *edac_dev,
+ unsigned int count, int inst_nr, int block_nr,
+ const char *msg)
+{
+ struct edac_device_instance *instance;
+ struct edac_device_block *block = NULL;
+
+ if (!count)
+ return;
+
+ if ((inst_nr >= edac_dev->nr_instances) || (inst_nr < 0)) {
+ edac_device_printk(edac_dev, KERN_ERR,
+ "INTERNAL ERROR: 'instance' out of range (%d >= %d)\n",
+ inst_nr,
+ edac_dev->nr_instances);
+ return;
+ }
+
+ instance = edac_dev->instances + inst_nr;
+
+ if ((block_nr >= instance->nr_blocks) || (block_nr < 0)) {
+ edac_device_printk(edac_dev, KERN_ERR,
+ "INTERNAL ERROR: instance %d 'block'out of range (%d >= %d)\n",
+ inst_nr, block_nr,
+ instance->nr_blocks);
+ return;
+ }
+
+ if (instance->nr_blocks > 0) {
+ block = instance->blocks + block_nr;
+ block->counters.de_count += count;
+ }
+
+ /* Propagate the count up the 'totals' tree */
+ instance->counters.de_count += count;
+ edac_dev->counters.de_count += count;
+
+ if (edac_device_get_log_de(edac_dev))
+ edac_device_printk(edac_dev, KERN_WARNING,
+ "DE: %s instance: %s block: %s count: %d '%s'\n",
+ edac_dev->ctl_name, instance->name,
+ block ? block->name : "N/A", count, msg);
+}
+EXPORT_SYMBOL_GPL(edac_device_handle_de_count);
+
void edac_device_handle_ue_count(struct edac_device_ctl_info *edac_dev,
unsigned int count, int inst_nr, int block_nr,
const char *msg)
diff --git a/drivers/edac/edac_device.h b/drivers/edac/edac_device.h
index e1645332eaa2..883557bc0182 100644
--- a/drivers/edac/edac_device.h
+++ b/drivers/edac/edac_device.h
@@ -74,6 +74,7 @@
struct edac_device_counter {
u32 ue_count;
u32 ce_count;
+ u32 de_count;
};
/* forward reference */
@@ -159,6 +160,7 @@ struct edac_device_ctl_info {
/* Per instance controls for this edac_device */
bool log_ue; /* boolean for logging UEs */
bool log_ce; /* boolean for logging CEs */
+ bool log_de; /* boolean for logging DEs */
int panic_on_ue; /* boolean for panic'ing on an UE */
unsigned poll_msec; /* number of milliseconds to poll interval */
unsigned long delay; /* number of jiffies for poll_msec */
@@ -298,6 +300,19 @@ void edac_device_handle_ce_count(struct edac_device_ctl_info *edac_dev,
unsigned int count, int inst_nr, int block_nr,
const char *msg);
+/**
+ * edac_device_handle_de_count(): Log deferred errors.
+ *
+ * @edac_dev: pointer to struct &edac_device_ctl_info
+ * @count: number of errors to log
+ * @inst_nr: number of the instance where the DE happened
+ * @block_nr: number of the block where the DE happened
+ * @msg: message to be printed
+ */
+void edac_device_handle_de_count(struct edac_device_ctl_info *edac_dev,
+ unsigned int count, int inst_nr, int block_nr,
+ const char *msg);
+
/**
* Log uncorrectable errors.
*
@@ -341,6 +356,21 @@ edac_device_handle_ue(struct edac_device_ctl_info *edac_dev, int inst_nr,
edac_device_handle_ue_count(edac_dev, 1, inst_nr, block_nr, msg);
}
+/**
+ * edac_device_handle_de(): Log deferred error
+ *
+ * @edac_dev: pointer to struct &edac_device_ctl_info
+ * @inst_nr: number of the instance where the DE error happened
+ * @block_nr: number of the block where the DE error happened
+ * @msg: message to be printed
+ */
+static inline void
+edac_device_handle_de(struct edac_device_ctl_info *edac_dev, int inst_nr,
+ int block_nr, const char *msg)
+{
+ edac_device_handle_de_count(edac_dev, 1, inst_nr, block_nr, msg);
+}
+
/**
* edac_device_alloc_index: Allocate a unique device index number
*
diff --git a/drivers/edac/edac_device_sysfs.c b/drivers/edac/edac_device_sysfs.c
index 51a3a90d7404..76fc50ff8503 100644
--- a/drivers/edac/edac_device_sysfs.c
+++ b/drivers/edac/edac_device_sysfs.c
@@ -62,6 +62,23 @@ static ssize_t edac_device_ctl_log_ce_store(struct edac_device_ctl_info
return ret ? ret : count;
}
+/* 'log_de' */
+static ssize_t edac_device_ctl_log_de_show(struct edac_device_ctl_info
+ *ctl_info, char *data)
+{
+ return sprintf(data, "%u\n", ctl_info->log_de);
+}
+
+static ssize_t edac_device_ctl_log_de_store(struct edac_device_ctl_info
+ *ctl_info, const char *data,
+ size_t count)
+{
+ int ret;
+
+ ret = kstrtobool(data, &ctl_info->log_de);
+ return ret ? ret : count;
+}
+
/* 'panic_on_ue' */
static ssize_t edac_device_ctl_panic_on_ue_show(struct edac_device_ctl_info
*ctl_info, char *data)
@@ -156,6 +173,8 @@ CTL_INFO_ATTR(log_ue, S_IRUGO | S_IWUSR,
edac_device_ctl_log_ue_show, edac_device_ctl_log_ue_store);
CTL_INFO_ATTR(log_ce, S_IRUGO | S_IWUSR,
edac_device_ctl_log_ce_show, edac_device_ctl_log_ce_store);
+CTL_INFO_ATTR(log_de, S_IRUGO | S_IWUSR,
+ edac_device_ctl_log_de_show, edac_device_ctl_log_de_store);
CTL_INFO_ATTR(panic_on_ue, S_IRUGO | S_IWUSR,
edac_device_ctl_panic_on_ue_show,
edac_device_ctl_panic_on_ue_store);
@@ -167,6 +186,7 @@ static struct ctl_info_attribute *device_ctrl_attr[] = {
&attr_ctl_info_panic_on_ue,
&attr_ctl_info_log_ue,
&attr_ctl_info_log_ce,
+ &attr_ctl_info_log_de,
&attr_ctl_info_poll_msec,
NULL,
};
@@ -318,6 +338,12 @@ static ssize_t instance_ce_count_show(struct edac_device_instance *instance,
return sprintf(data, "%u\n", instance->counters.ce_count);
}
+static ssize_t instance_de_count_show(struct edac_device_instance *instance,
+ char *data)
+{
+ return sprintf(data, "%u\n", instance->counters.de_count);
+}
+
#define to_instance(k) container_of(k, struct edac_device_instance, kobj)
#define to_instance_attr(a) container_of(a,struct instance_attribute,attr)
@@ -387,11 +413,13 @@ static struct instance_attribute attr_instance_##_name = { \
*/
INSTANCE_ATTR(ce_count, S_IRUGO, instance_ce_count_show, NULL);
INSTANCE_ATTR(ue_count, S_IRUGO, instance_ue_count_show, NULL);
+INSTANCE_ATTR(de_count, S_IRUGO, instance_de_count_show, NULL);
/* list of edac_dev 'instance' attributes */
static struct instance_attribute *device_instance_attr[] = {
&attr_instance_ce_count,
&attr_instance_ue_count,
+ &attr_instance_de_count,
NULL,
};
@@ -427,6 +455,14 @@ static ssize_t block_ce_count_show(struct kobject *kobj,
return sprintf(data, "%u\n", block->counters.ce_count);
}
+static ssize_t block_de_count_show(struct kobject *kobj,
+ struct attribute *attr, char *data)
+{
+ struct edac_device_block *block = to_block(kobj);
+
+ return sprintf(data, "%u\n", block->counters.de_count);
+}
+
/* DEVICE block kobject release() function */
static void edac_device_ctrl_block_release(struct kobject *kobj)
{
@@ -485,11 +521,13 @@ static struct edac_dev_sysfs_block_attribute attr_block_##_name = { \
BLOCK_ATTR(ce_count, S_IRUGO, block_ce_count_show, NULL);
BLOCK_ATTR(ue_count, S_IRUGO, block_ue_count_show, NULL);
+BLOCK_ATTR(de_count, S_IRUGO, block_de_count_show, NULL);
/* list of edac_dev 'block' attributes */
static struct edac_dev_sysfs_block_attribute *device_block_attr[] = {
&attr_block_ce_count,
&attr_block_ue_count,
+ &attr_block_de_count,
NULL,
};
--
2.17.1
> From: Kamati Srinivas <[email protected]>
> ...
>
> This patchset adds support to handle Deferred error, also
It would be better if there is a use case showing some EDAC
driver(s) needs this newly added "Deferred Error" code
in the EDAC core.
-Qiuxu
> cleans up the code for log_ue/ce and corresponding APIs.
>
> Kamati Srinivas (2):
> edac: Modify sysfs enabled values log_ue, log_ce to bool
> edac: Add support to handle DE (Deferred Errors)
>
> drivers/edac/edac_device.c | 57 ++++++++++++++++++++++++++++++--
> drivers/edac/edac_device.h | 34 +++++++++++++++++--
> drivers/edac/edac_device_sysfs.c | 50 ++++++++++++++++++++++++----
> 3 files changed, 130 insertions(+), 11 deletions(-)
On 3/3/23 22:30, Qiuxu Zhuo wrote:
>> From: Kamati Srinivas <[email protected]>
>> ...
>>
>> This patchset adds support to handle Deferred error, also
>
> It would be better if there is a use case showing some EDAC
> driver(s) needs this newly added "Deferred Error" code
> in the EDAC core.
>
I agree. On AMD systems, "deferred" errors are uncorrectable errors that
do not require immediate action. This may be similar to "action
optional" errors. They are still uncorrectable errors and can be counted
as such.
Thanks,
Yazen