2019-09-05 11:18:03

by Hanna Hawa

[permalink] [raw]
Subject: [PATCH 1/1] edac: Add an API for edac device to report for multiple errors

Add an API for edac device to report multiple errors with same type.

Signed-off-by: Hanna Hawa <[email protected]>
---
drivers/edac/edac_device.c | 66 +++++++++++++++++++++++++++++---------
drivers/edac/edac_device.h | 31 ++++++++++++++++--
2 files changed, 79 insertions(+), 18 deletions(-)

diff --git a/drivers/edac/edac_device.c b/drivers/edac/edac_device.c
index 65cf2b9355c4..bf6a4fd9831b 100644
--- a/drivers/edac/edac_device.c
+++ b/drivers/edac/edac_device.c
@@ -555,12 +555,15 @@ static inline int edac_device_get_panic_on_ue(struct edac_device_ctl_info
return edac_dev->panic_on_ue;
}

-void edac_device_handle_ce(struct edac_device_ctl_info *edac_dev,
- int inst_nr, int block_nr, const char *msg)
+static void __edac_device_handle_ce(struct edac_device_ctl_info *edac_dev,
+ u16 error_count, int inst_nr, int block_nr,
+ const char *msg)
{
struct edac_device_instance *instance;
struct edac_device_block *block = NULL;

+ WARN_ON(!error_count);
+
if ((inst_nr >= edac_dev->nr_instances) || (inst_nr < 0)) {
edac_device_printk(edac_dev, KERN_ERR,
"INTERNAL ERROR: 'instance' out of range "
@@ -582,27 +585,44 @@ void edac_device_handle_ce(struct edac_device_ctl_info *edac_dev,

if (instance->nr_blocks > 0) {
block = instance->blocks + block_nr;
- block->counters.ce_count++;
+ block->counters.ce_count += error_count;
}

/* Propagate the count up the 'totals' tree */
- instance->counters.ce_count++;
- edac_dev->counters.ce_count++;
+ instance->counters.ce_count += error_count;
+ edac_dev->counters.ce_count += error_count;

if (edac_device_get_log_ce(edac_dev))
edac_device_printk(edac_dev, KERN_WARNING,
- "CE: %s instance: %s block: %s '%s'\n",
+ "CE: %s instance: %s block: %s count: %d '%s'\n",
edac_dev->ctl_name, instance->name,
- block ? block->name : "N/A", msg);
+ block ? block->name : "N/A", error_count, msg);
+}
+
+void edac_device_handle_ce(struct edac_device_ctl_info *edac_dev,
+ int inst_nr, int block_nr, const char *msg)
+{
+ __edac_device_handle_ce(edac_dev, 1, inst_nr, block_nr, msg);
}
EXPORT_SYMBOL_GPL(edac_device_handle_ce);

-void edac_device_handle_ue(struct edac_device_ctl_info *edac_dev,
- int inst_nr, int block_nr, const char *msg)
+void edac_device_handle_ce_count(struct edac_device_ctl_info *edac_dev,
+ u16 error_count, int inst_nr, int block_nr,
+ const char *msg)
+{
+ __edac_device_handle_ce(edac_dev, error_count, inst_nr, block_nr, msg);
+}
+EXPORT_SYMBOL_GPL(edac_device_handle_ce_count);
+
+static void __edac_device_handle_ue(struct edac_device_ctl_info *edac_dev,
+ u16 error_count, int inst_nr, int block_nr,
+ const char *msg)
{
struct edac_device_instance *instance;
struct edac_device_block *block = NULL;

+ WARN_ON(!error_count);
+
if ((inst_nr >= edac_dev->nr_instances) || (inst_nr < 0)) {
edac_device_printk(edac_dev, KERN_ERR,
"INTERNAL ERROR: 'instance' out of range "
@@ -624,22 +644,36 @@ void edac_device_handle_ue(struct edac_device_ctl_info *edac_dev,

if (instance->nr_blocks > 0) {
block = instance->blocks + block_nr;
- block->counters.ue_count++;
+ block->counters.ue_count += error_count;
}

/* Propagate the count up the 'totals' tree */
- instance->counters.ue_count++;
- edac_dev->counters.ue_count++;
+ instance->counters.ue_count += error_count;
+ edac_dev->counters.ue_count += error_count;

if (edac_device_get_log_ue(edac_dev))
edac_device_printk(edac_dev, KERN_EMERG,
- "UE: %s instance: %s block: %s '%s'\n",
+ "UE: %s instance: %s block: %s count: %d '%s'\n",
edac_dev->ctl_name, instance->name,
- block ? block->name : "N/A", msg);
+ block ? block->name : "N/A", error_count, msg);

if (edac_device_get_panic_on_ue(edac_dev))
- panic("EDAC %s: UE instance: %s block %s '%s'\n",
+ panic("EDAC %s: UE instance: %s block %s count: %d '%s'\n",
edac_dev->ctl_name, instance->name,
- block ? block->name : "N/A", msg);
+ block ? block->name : "N/A", error_count, msg);
+}
+
+void edac_device_handle_ue(struct edac_device_ctl_info *edac_dev,
+ int inst_nr, int block_nr, const char *msg)
+{
+ __edac_device_handle_ue(edac_dev, 1, inst_nr, block_nr, msg);
}
EXPORT_SYMBOL_GPL(edac_device_handle_ue);
+
+void edac_device_handle_ue_count(struct edac_device_ctl_info *edac_dev,
+ u16 error_count, int inst_nr, int block_nr,
+ const char *msg)
+{
+ __edac_device_handle_ue(edac_dev, error_count, inst_nr, block_nr, msg);
+}
+EXPORT_SYMBOL_GPL(edac_device_handle_ue_count);
diff --git a/drivers/edac/edac_device.h b/drivers/edac/edac_device.h
index 1aaba74ae411..c8dc83eda64f 100644
--- a/drivers/edac/edac_device.h
+++ b/drivers/edac/edac_device.h
@@ -287,7 +287,7 @@ extern struct edac_device_ctl_info *edac_device_del_device(struct device *dev);

/**
* edac_device_handle_ue():
- * perform a common output and handling of an 'edac_dev' UE event
+ * perform a common output and handling of an 'edac_dev' single UE event
*
* @edac_dev: pointer to struct &edac_device_ctl_info
* @inst_nr: number of the instance where the UE error happened
@@ -298,7 +298,7 @@ extern void edac_device_handle_ue(struct edac_device_ctl_info *edac_dev,
int inst_nr, int block_nr, const char *msg);
/**
* edac_device_handle_ce():
- * perform a common output and handling of an 'edac_dev' CE event
+ * perform a common output and handling of an 'edac_dev' single CE event
*
* @edac_dev: pointer to struct &edac_device_ctl_info
* @inst_nr: number of the instance where the CE error happened
@@ -308,6 +308,33 @@ extern void edac_device_handle_ue(struct edac_device_ctl_info *edac_dev,
extern void edac_device_handle_ce(struct edac_device_ctl_info *edac_dev,
int inst_nr, int block_nr, const char *msg);

+/**
+ * edac_device_handle_ue_count():
+ * perform a common output and handling of an 'edac_dev'
+ *
+ * @edac_dev: pointer to struct &edac_device_ctl_info
+ * @error_count: number of errors of the same type
+ * @inst_nr: number of the instance where the UE error happened
+ * @block_nr: number of the block where the UE error happened
+ * @msg: message to be printed
+ */
+extern void edac_device_handle_ue_count(struct edac_device_ctl_info *edac_dev,
+ u16 error_count, int inst_nr,
+ int block_nr, const char *msg);
+/**
+ * edac_device_handle_ce_count():
+ * perform a common output and handling of an 'edac_dev'
+ *
+ * @edac_dev: pointer to struct &edac_device_ctl_info
+ * @error_count: number of errors of the same type
+ * @inst_nr: number of the instance where the CE error happened
+ * @block_nr: number of the block where the CE error happened
+ * @msg: message to be printed
+ */
+extern void edac_device_handle_ce_count(struct edac_device_ctl_info *edac_dev,
+ u16 error_count, int inst_nr,
+ int block_nr, const char *msg);
+
/**
* edac_device_alloc_index: Allocate a unique device index number
*
--
2.17.1


2019-09-05 11:59:48

by Robert Richter

[permalink] [raw]
Subject: Re: [PATCH 1/1] edac: Add an API for edac device to report for multiple errors

Hi Hanna,

thanks for the update. See below.

On 05.09.19 09:37:45, Hanna Hawa wrote:
> Add an API for edac device to report multiple errors with same type.
>
> Signed-off-by: Hanna Hawa <[email protected]>
> ---
> drivers/edac/edac_device.c | 66 +++++++++++++++++++++++++++++---------
> drivers/edac/edac_device.h | 31 ++++++++++++++++--
> 2 files changed, 79 insertions(+), 18 deletions(-)
>
> diff --git a/drivers/edac/edac_device.c b/drivers/edac/edac_device.c
> index 65cf2b9355c4..bf6a4fd9831b 100644
> --- a/drivers/edac/edac_device.c
> +++ b/drivers/edac/edac_device.c
> @@ -555,12 +555,15 @@ static inline int edac_device_get_panic_on_ue(struct edac_device_ctl_info
> return edac_dev->panic_on_ue;
> }
>
> -void edac_device_handle_ce(struct edac_device_ctl_info *edac_dev,
> - int inst_nr, int block_nr, const char *msg)
> +static void __edac_device_handle_ce(struct edac_device_ctl_info *edac_dev,
> + u16 error_count, int inst_nr, int block_nr,

Just curious, why u16, some register mask size? Maybe just use unsigned int?

I think the variable can be shortened to 'count', the meaning should
still be clear.

> + const char *msg)
> {
> struct edac_device_instance *instance;
> struct edac_device_block *block = NULL;
>
> + WARN_ON(!error_count);

Should return in this case.

Better use WARN_ON_ONCE() to avoid flooding.

The check should be moved to edac_device_handle_ce_count().

> +
> if ((inst_nr >= edac_dev->nr_instances) || (inst_nr < 0)) {
> edac_device_printk(edac_dev, KERN_ERR,
> "INTERNAL ERROR: 'instance' out of range "
> @@ -582,27 +585,44 @@ void edac_device_handle_ce(struct edac_device_ctl_info *edac_dev,
>
> if (instance->nr_blocks > 0) {
> block = instance->blocks + block_nr;
> - block->counters.ce_count++;
> + block->counters.ce_count += error_count;
> }
>
> /* Propagate the count up the 'totals' tree */
> - instance->counters.ce_count++;
> - edac_dev->counters.ce_count++;
> + instance->counters.ce_count += error_count;
> + edac_dev->counters.ce_count += error_count;
>
> if (edac_device_get_log_ce(edac_dev))
> edac_device_printk(edac_dev, KERN_WARNING,
> - "CE: %s instance: %s block: %s '%s'\n",
> + "CE: %s instance: %s block: %s count: %d '%s'\n",
> edac_dev->ctl_name, instance->name,
> - block ? block->name : "N/A", msg);
> + block ? block->name : "N/A", error_count, msg);
> +}
> +
> +void edac_device_handle_ce(struct edac_device_ctl_info *edac_dev,
> + int inst_nr, int block_nr, const char *msg)
> +{
> + __edac_device_handle_ce(edac_dev, 1, inst_nr, block_nr, msg);
> }
> EXPORT_SYMBOL_GPL(edac_device_handle_ce);

We could just export the __*() version of those functions and make
everything else inline in the header file? Though, better do this with
two patches to avoid an ABI breakage in case someone wants to backport
it. Let's see what others say here.

>
> -void edac_device_handle_ue(struct edac_device_ctl_info *edac_dev,
> - int inst_nr, int block_nr, const char *msg)
> +void edac_device_handle_ce_count(struct edac_device_ctl_info *edac_dev,
> + u16 error_count, int inst_nr, int block_nr,
> + const char *msg)
> +{
> + __edac_device_handle_ce(edac_dev, error_count, inst_nr, block_nr, msg);
> +}
> +EXPORT_SYMBOL_GPL(edac_device_handle_ce_count);
> +
> +static void __edac_device_handle_ue(struct edac_device_ctl_info *edac_dev,
> + u16 error_count, int inst_nr, int block_nr,
> + const char *msg)

All the above applies for this function too.

> {
> struct edac_device_instance *instance;
> struct edac_device_block *block = NULL;
>
> + WARN_ON(!error_count);
> +
> if ((inst_nr >= edac_dev->nr_instances) || (inst_nr < 0)) {
> edac_device_printk(edac_dev, KERN_ERR,
> "INTERNAL ERROR: 'instance' out of range "
> @@ -624,22 +644,36 @@ void edac_device_handle_ue(struct edac_device_ctl_info *edac_dev,
>
> if (instance->nr_blocks > 0) {
> block = instance->blocks + block_nr;
> - block->counters.ue_count++;
> + block->counters.ue_count += error_count;
> }
>
> /* Propagate the count up the 'totals' tree */
> - instance->counters.ue_count++;
> - edac_dev->counters.ue_count++;
> + instance->counters.ue_count += error_count;
> + edac_dev->counters.ue_count += error_count;
>
> if (edac_device_get_log_ue(edac_dev))
> edac_device_printk(edac_dev, KERN_EMERG,
> - "UE: %s instance: %s block: %s '%s'\n",
> + "UE: %s instance: %s block: %s count: %d '%s'\n",
> edac_dev->ctl_name, instance->name,
> - block ? block->name : "N/A", msg);
> + block ? block->name : "N/A", error_count, msg);
>
> if (edac_device_get_panic_on_ue(edac_dev))
> - panic("EDAC %s: UE instance: %s block %s '%s'\n",
> + panic("EDAC %s: UE instance: %s block %s count: %d '%s'\n",
> edac_dev->ctl_name, instance->name,
> - block ? block->name : "N/A", msg);
> + block ? block->name : "N/A", error_count, msg);
> +}
> +
> +void edac_device_handle_ue(struct edac_device_ctl_info *edac_dev,
> + int inst_nr, int block_nr, const char *msg)
> +{
> + __edac_device_handle_ue(edac_dev, 1, inst_nr, block_nr, msg);
> }
> EXPORT_SYMBOL_GPL(edac_device_handle_ue);
> +
> +void edac_device_handle_ue_count(struct edac_device_ctl_info *edac_dev,
> + u16 error_count, int inst_nr, int block_nr,
> + const char *msg)
> +{
> + __edac_device_handle_ue(edac_dev, error_count, inst_nr, block_nr, msg);
> +}
> +EXPORT_SYMBOL_GPL(edac_device_handle_ue_count);
> diff --git a/drivers/edac/edac_device.h b/drivers/edac/edac_device.h
> index 1aaba74ae411..c8dc83eda64f 100644
> --- a/drivers/edac/edac_device.h
> +++ b/drivers/edac/edac_device.h
> @@ -287,7 +287,7 @@ extern struct edac_device_ctl_info *edac_device_del_device(struct device *dev);
>
> /**
> * edac_device_handle_ue():
> - * perform a common output and handling of an 'edac_dev' UE event
> + * perform a common output and handling of an 'edac_dev' single UE event
> *
> * @edac_dev: pointer to struct &edac_device_ctl_info
> * @inst_nr: number of the instance where the UE error happened
> @@ -298,7 +298,7 @@ extern void edac_device_handle_ue(struct edac_device_ctl_info *edac_dev,
> int inst_nr, int block_nr, const char *msg);
> /**
> * edac_device_handle_ce():
> - * perform a common output and handling of an 'edac_dev' CE event
> + * perform a common output and handling of an 'edac_dev' single CE event
> *
> * @edac_dev: pointer to struct &edac_device_ctl_info
> * @inst_nr: number of the instance where the CE error happened
> @@ -308,6 +308,33 @@ extern void edac_device_handle_ue(struct edac_device_ctl_info *edac_dev,
> extern void edac_device_handle_ce(struct edac_device_ctl_info *edac_dev,
> int inst_nr, int block_nr, const char *msg);
>
> +/**
> + * edac_device_handle_ue_count():
> + * perform a common output and handling of an 'edac_dev'
> + *
> + * @edac_dev: pointer to struct &edac_device_ctl_info
> + * @error_count: number of errors of the same type
> + * @inst_nr: number of the instance where the UE error happened
> + * @block_nr: number of the block where the UE error happened
> + * @msg: message to be printed
> + */
> +extern void edac_device_handle_ue_count(struct edac_device_ctl_info *edac_dev,
> + u16 error_count, int inst_nr,
> + int block_nr, const char *msg);
> +/**
> + * edac_device_handle_ce_count():
> + * perform a common output and handling of an 'edac_dev'
> + *
> + * @edac_dev: pointer to struct &edac_device_ctl_info
> + * @error_count: number of errors of the same type
> + * @inst_nr: number of the instance where the CE error happened
> + * @block_nr: number of the block where the CE error happened
> + * @msg: message to be printed
> + */
> +extern void edac_device_handle_ce_count(struct edac_device_ctl_info *edac_dev,
> + u16 error_count, int inst_nr,
> + int block_nr, const char *msg);
> +

Looks otherwise good to me.

Thanks,

-Robert

> /**
> * edac_device_alloc_index: Allocate a unique device index number
> *
> --
> 2.17.1
>

2019-09-08 12:58:55

by Hanna Hawa

[permalink] [raw]
Subject: Re: [PATCH 1/1] edac: Add an API for edac device to report for multiple errors



On 9/5/2019 12:56 PM, Robert Richter wrote:
> Hi Hanna,
>
> thanks for the update. See below.
>
> On 05.09.19 09:37:45, Hanna Hawa wrote:
>> Add an API for edac device to report multiple errors with same type.
>>
>> Signed-off-by: Hanna Hawa <[email protected]>
>> ---
>> drivers/edac/edac_device.c | 66 +++++++++++++++++++++++++++++---------
>> drivers/edac/edac_device.h | 31 ++++++++++++++++--
>> 2 files changed, 79 insertions(+), 18 deletions(-)
>>
>> diff --git a/drivers/edac/edac_device.c b/drivers/edac/edac_device.c
>> index 65cf2b9355c4..bf6a4fd9831b 100644
>> --- a/drivers/edac/edac_device.c
>> +++ b/drivers/edac/edac_device.c
>> @@ -555,12 +555,15 @@ static inline int edac_device_get_panic_on_ue(struct edac_device_ctl_info
>> return edac_dev->panic_on_ue;
>> }
>>
>> -void edac_device_handle_ce(struct edac_device_ctl_info *edac_dev,
>> - int inst_nr, int block_nr, const char *msg)
>> +static void __edac_device_handle_ce(struct edac_device_ctl_info *edac_dev,
>> + u16 error_count, int inst_nr, int block_nr,
>
> Just curious, why u16, some register mask size? Maybe just use unsigned int?

Wanted to be aligned with edac MC.
I can change it to be u32.

>
> I think the variable can be shortened to 'count', the meaning should
> still be clear.

I think more clear to include 'error'.
maybe shorter name 'err_count'?

>
>> + const char *msg)
>> {
>> struct edac_device_instance *instance;
>> struct edac_device_block *block = NULL;
>>
>> + WARN_ON(!error_count);
>
> Should return in this case.
>
> Better use WARN_ON_ONCE() to avoid flooding.

In case of two drivers using this function with wrong error count, only
the first WARN_ON_ONCE will catch in this case, and other will miss
other wrong usage of other edac device drivers.

>
> The check should be moved to edac_device_handle_ce_count().

I'll move it to edac_device_handle_ce_count.

>
>> +
>> if ((inst_nr >= edac_dev->nr_instances) || (inst_nr < 0)) {
>> edac_device_printk(edac_dev, KERN_ERR,
>> "INTERNAL ERROR: 'instance' out of range "
>> @@ -582,27 +585,44 @@ void edac_device_handle_ce(struct edac_device_ctl_info *edac_dev,
>>
>> if (instance->nr_blocks > 0) {
>> block = instance->blocks + block_nr;
>> - block->counters.ce_count++;
>> + block->counters.ce_count += error_count;
>> }
>>
>> /* Propagate the count up the 'totals' tree */
>> - instance->counters.ce_count++;
>> - edac_dev->counters.ce_count++;
>> + instance->counters.ce_count += error_count;
>> + edac_dev->counters.ce_count += error_count;
>>
>> if (edac_device_get_log_ce(edac_dev))
>> edac_device_printk(edac_dev, KERN_WARNING,
>> - "CE: %s instance: %s block: %s '%s'\n",
>> + "CE: %s instance: %s block: %s count: %d '%s'\n",
>> edac_dev->ctl_name, instance->name,
>> - block ? block->name : "N/A", msg);
>> + block ? block->name : "N/A", error_count, msg);
>> +}
>> +
>> +void edac_device_handle_ce(struct edac_device_ctl_info *edac_dev,
>> + int inst_nr, int block_nr, const char *msg)
>> +{
>> + __edac_device_handle_ce(edac_dev, 1, inst_nr, block_nr, msg);
>> }
>> EXPORT_SYMBOL_GPL(edac_device_handle_ce);
>
> We could just export the __*() version of those functions and make
> everything else inline in the header file? Though, better do this with
> two patches to avoid an ABI breakage in case someone wants to backport
> it. Let's see what others say here.

Waiting for other reviewers.

>
>>
>> -void edac_device_handle_ue(struct edac_device_ctl_info *edac_dev,
>> - int inst_nr, int block_nr, const char *msg)
>> +void edac_device_handle_ce_count(struct edac_device_ctl_info *edac_dev,
>> + u16 error_count, int inst_nr, int block_nr,
>> + const char *msg)
>> +{
>> + __edac_device_handle_ce(edac_dev, error_count, inst_nr, block_nr, msg);
>> +}
>> +EXPORT_SYMBOL_GPL(edac_device_handle_ce_count);
>> +
>> +static void __edac_device_handle_ue(struct edac_device_ctl_info *edac_dev,
>> + u16 error_count, int inst_nr, int block_nr,
>> + const char *msg)
>
> All the above applies for this function too.
>
>> {
>> struct edac_device_instance *instance;
>> struct edac_device_block *block = NULL;
>>
>> + WARN_ON(!error_count);
>> +
>> if ((inst_nr >= edac_dev->nr_instances) || (inst_nr < 0)) {
>> edac_device_printk(edac_dev, KERN_ERR,
>> "INTERNAL ERROR: 'instance' out of range "
>> @@ -624,22 +644,36 @@ void edac_device_handle_ue(struct edac_device_ctl_info *edac_dev,
>>
>> if (instance->nr_blocks > 0) {
>> block = instance->blocks + block_nr;
>> - block->counters.ue_count++;
>> + block->counters.ue_count += error_count;
>> }
>>
>> /* Propagate the count up the 'totals' tree */
>> - instance->counters.ue_count++;
>> - edac_dev->counters.ue_count++;
>> + instance->counters.ue_count += error_count;
>> + edac_dev->counters.ue_count += error_count;
>>
>> if (edac_device_get_log_ue(edac_dev))
>> edac_device_printk(edac_dev, KERN_EMERG,
>> - "UE: %s instance: %s block: %s '%s'\n",
>> + "UE: %s instance: %s block: %s count: %d '%s'\n",
>> edac_dev->ctl_name, instance->name,
>> - block ? block->name : "N/A", msg);
>> + block ? block->name : "N/A", error_count, msg);
>>
>> if (edac_device_get_panic_on_ue(edac_dev))
>> - panic("EDAC %s: UE instance: %s block %s '%s'\n",
>> + panic("EDAC %s: UE instance: %s block %s count: %d '%s'\n",
>> edac_dev->ctl_name, instance->name,
>> - block ? block->name : "N/A", msg);
>> + block ? block->name : "N/A", error_count, msg);
>> +}
>> +
>> +void edac_device_handle_ue(struct edac_device_ctl_info *edac_dev,
>> + int inst_nr, int block_nr, const char *msg)
>> +{
>> + __edac_device_handle_ue(edac_dev, 1, inst_nr, block_nr, msg);
>> }
>> EXPORT_SYMBOL_GPL(edac_device_handle_ue);
>> +
>> +void edac_device_handle_ue_count(struct edac_device_ctl_info *edac_dev,
>> + u16 error_count, int inst_nr, int block_nr,
>> + const char *msg)
>> +{
>> + __edac_device_handle_ue(edac_dev, error_count, inst_nr, block_nr, msg);
>> +}
>> +EXPORT_SYMBOL_GPL(edac_device_handle_ue_count);
>> diff --git a/drivers/edac/edac_device.h b/drivers/edac/edac_device.h
>> index 1aaba74ae411..c8dc83eda64f 100644
>> --- a/drivers/edac/edac_device.h
>> +++ b/drivers/edac/edac_device.h
>> @@ -287,7 +287,7 @@ extern struct edac_device_ctl_info *edac_device_del_device(struct device *dev);
>>
>> /**
>> * edac_device_handle_ue():
>> - * perform a common output and handling of an 'edac_dev' UE event
>> + * perform a common output and handling of an 'edac_dev' single UE event
>> *
>> * @edac_dev: pointer to struct &edac_device_ctl_info
>> * @inst_nr: number of the instance where the UE error happened
>> @@ -298,7 +298,7 @@ extern void edac_device_handle_ue(struct edac_device_ctl_info *edac_dev,
>> int inst_nr, int block_nr, const char *msg);
>> /**
>> * edac_device_handle_ce():
>> - * perform a common output and handling of an 'edac_dev' CE event
>> + * perform a common output and handling of an 'edac_dev' single CE event
>> *
>> * @edac_dev: pointer to struct &edac_device_ctl_info
>> * @inst_nr: number of the instance where the CE error happened
>> @@ -308,6 +308,33 @@ extern void edac_device_handle_ue(struct edac_device_ctl_info *edac_dev,
>> extern void edac_device_handle_ce(struct edac_device_ctl_info *edac_dev,
>> int inst_nr, int block_nr, const char *msg);
>>
>> +/**
>> + * edac_device_handle_ue_count():
>> + * perform a common output and handling of an 'edac_dev'
>> + *
>> + * @edac_dev: pointer to struct &edac_device_ctl_info
>> + * @error_count: number of errors of the same type
>> + * @inst_nr: number of the instance where the UE error happened
>> + * @block_nr: number of the block where the UE error happened
>> + * @msg: message to be printed
>> + */
>> +extern void edac_device_handle_ue_count(struct edac_device_ctl_info *edac_dev,
>> + u16 error_count, int inst_nr,
>> + int block_nr, const char *msg);
>> +/**
>> + * edac_device_handle_ce_count():
>> + * perform a common output and handling of an 'edac_dev'
>> + *
>> + * @edac_dev: pointer to struct &edac_device_ctl_info
>> + * @error_count: number of errors of the same type
>> + * @inst_nr: number of the instance where the CE error happened
>> + * @block_nr: number of the block where the CE error happened
>> + * @msg: message to be printed
>> + */
>> +extern void edac_device_handle_ce_count(struct edac_device_ctl_info *edac_dev,
>> + u16 error_count, int inst_nr,
>> + int block_nr, const char *msg);
>> +
>
> Looks otherwise good to me.

Thanks!

Thanks,
Hanna

>
> Thanks,
>
> -Robert
>
>> /**
>> * edac_device_alloc_index: Allocate a unique device index number
>> *
>> --
>> 2.17.1
>>

2019-09-08 12:59:25

by Borislav Petkov

[permalink] [raw]
Subject: Re: [PATCH 1/1] edac: Add an API for edac device to report for multiple errors

On Sun, Sep 08, 2019 at 10:16:02AM +0200, Borislav Petkov wrote:
> On Sun, Sep 08, 2019 at 10:58:31AM +0300, Hawa, Hanna wrote:
> > > Better use WARN_ON_ONCE() to avoid flooding.
> >
> > In case of two drivers using this function with wrong error count, only the
> > first WARN_ON_ONCE will catch in this case, and other will miss other wrong
> > usage of other edac device drivers.
>
> The idea is to catch any driver using a 0 error count and fix it, not to
> flood dmesg. You want _ONCE.

... and you want to return early too, i.e.,

if (WARN_ON_ONCE(!error_count))
return;

Frankly, I'd even remove all the warning functionality and simply do

if (!error_count)
return;

but let's see how much it screams first.

--
Regards/Gruss,
Boris.

https://people.kernel.org/tglx/notes-about-netiquette

2019-09-09 08:19:20

by Borislav Petkov

[permalink] [raw]
Subject: Re: [PATCH 1/1] edac: Add an API for edac device to report for multiple errors

On Sun, Sep 08, 2019 at 10:58:31AM +0300, Hawa, Hanna wrote:
> > Better use WARN_ON_ONCE() to avoid flooding.
>
> In case of two drivers using this function with wrong error count, only the
> first WARN_ON_ONCE will catch in this case, and other will miss other wrong
> usage of other edac device drivers.

The idea is to catch any driver using a 0 error count and fix it, not to
flood dmesg. You want _ONCE.

--
Regards/Gruss,
Boris.

https://people.kernel.org/tglx/notes-about-netiquette

2019-09-10 18:56:26

by Hanna Hawa

[permalink] [raw]
Subject: Re: [PATCH 1/1] edac: Add an API for edac device to report for multiple errors

Hi

On 9/8/2019 11:35 AM, Borislav Petkov wrote:
> On Sun, Sep 08, 2019 at 10:16:02AM +0200, Borislav Petkov wrote:
>> On Sun, Sep 08, 2019 at 10:58:31AM +0300, Hawa, Hanna wrote:
>>>> Better use WARN_ON_ONCE() to avoid flooding.
>>>
>>> In case of two drivers using this function with wrong error count, only the
>>> first WARN_ON_ONCE will catch in this case, and other will miss other wrong
>>> usage of other edac device drivers.
>>
>> The idea is to catch any driver using a 0 error count and fix it, not to
>> flood dmesg. You want _ONCE.
>
> ... and you want to return early too, i.e.,
>
> if (WARN_ON_ONCE(!error_count))
> return;
>
> Frankly, I'd even remove all the warning functionality and simply do
>
> if (!error_count)
> return;

I'll keep it simple as you suggest and remove the warning functionality.

>
> but let's see how much it screams first.
>

Thanks,
Hanna

2019-09-10 19:01:34

by Robert Richter

[permalink] [raw]
Subject: Re: [PATCH 1/1] edac: Add an API for edac device to report for multiple errors

On 08.09.19 10:58:31, Hawa, Hanna wrote:
> On 9/5/2019 12:56 PM, Robert Richter wrote:

> > > diff --git a/drivers/edac/edac_device.c b/drivers/edac/edac_device.c
> > > index 65cf2b9355c4..bf6a4fd9831b 100644
> > > --- a/drivers/edac/edac_device.c
> > > +++ b/drivers/edac/edac_device.c
> > > @@ -555,12 +555,15 @@ static inline int edac_device_get_panic_on_ue(struct edac_device_ctl_info
> > > return edac_dev->panic_on_ue;
> > > }
> > > -void edac_device_handle_ce(struct edac_device_ctl_info *edac_dev,
> > > - int inst_nr, int block_nr, const char *msg)
> > > +static void __edac_device_handle_ce(struct edac_device_ctl_info *edac_dev,
> > > + u16 error_count, int inst_nr, int block_nr,
> >
> > Just curious, why u16, some register mask size? Maybe just use unsigned int?
>
> Wanted to be aligned with edac MC.
> I can change it to be u32.

There is no specific reason for u32 either. This code is generic for
many machines and compilers, so unsigned int seems to be the best
choice here to get an optimum.

> > I think the variable can be shortened to 'count', the meaning should
> > still be clear.
>
> I think more clear to include 'error'.
> maybe shorter name 'err_count'?

IMO 'count' is clear here, 'error' isn't. I prefer short names for
local variables and arguments.

> > > +void edac_device_handle_ce(struct edac_device_ctl_info *edac_dev,
> > > + int inst_nr, int block_nr, const char *msg)
> > > +{
> > > + __edac_device_handle_ce(edac_dev, 1, inst_nr, block_nr, msg);
> > > }
> > > EXPORT_SYMBOL_GPL(edac_device_handle_ce);
> >
> > We could just export the __*() version of those functions and make
> > everything else inline in the header file? Though, better do this with
> > two patches to avoid an ABI breakage in case someone wants to backport
> > it. Let's see what others say here.
>
> Waiting for other reviewers.

If no one else complains I would prefer moving to
__edac_device_handle_* as exported sympbol. But make this change in 2
patches to make backports easy, first add __edac_device_handle_*()
(and introduce the *_count() inline functions), then remove
edac_device_handle_*() entirely.

Thanks,

-Robert