The current ghes_edac driver does not update per-dimm error
counters when reporting memory errors, because there is no
platform-independent way to find DIMMs based on the error
information provided by firmware. This patch offers a solution
for platforms whose firmwares provide valid module handles
(SMBIOS type 17) in error records. In this case ghes_edac will
use the module handles to locate DIMMs and thus makes per-dimm
error reporting possible.
Signed-off-by: Fan Wu <[email protected]>
---
drivers/edac/ghes_edac.c | 36 +++++++++++++++++++++++++++++++++---
include/linux/edac.h | 2 ++
2 files changed, 35 insertions(+), 3 deletions(-)
diff --git a/drivers/edac/ghes_edac.c b/drivers/edac/ghes_edac.c
index 473aeec..db527f0 100644
--- a/drivers/edac/ghes_edac.c
+++ b/drivers/edac/ghes_edac.c
@@ -81,6 +81,26 @@ static void ghes_edac_count_dimms(const struct dmi_header *dh, void *arg)
(*num_dimm)++;
}
+static int ghes_edac_dimm_index(u16 handle)
+{
+ struct mem_ctl_info *mci;
+ int i;
+
+ if (!ghes_pvt)
+ return -1;
+
+ mci = ghes_pvt->mci;
+
+ if (!mci)
+ return -1;
+
+ for (i = 0; i < mci->tot_dimms; i++) {
+ if (mci->dimms[i]->smbios_handle == handle)
+ return i;
+ }
+ return -1;
+}
+
static void ghes_edac_dmidecode(const struct dmi_header *dh, void *arg)
{
struct ghes_edac_dimm_fill *dimm_fill = arg;
@@ -177,6 +197,8 @@ static void ghes_edac_dmidecode(const struct dmi_header *dh, void *arg)
entry->total_width, entry->data_width);
}
+ dimm->smbios_handle = entry->handle;
+
dimm_fill->count++;
}
}
@@ -327,12 +349,20 @@ void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err)
p += sprintf(p, "bit_pos:%d ", mem_err->bit_pos);
if (mem_err->validation_bits & CPER_MEM_VALID_MODULE_HANDLE) {
const char *bank = NULL, *device = NULL;
+ int index = -1;
+
dmi_memdev_name(mem_err->mem_dev_handle, &bank, &device);
+ p += sprintf(p, "DIMM DMI handle: 0x%.4x ",
+ mem_err->mem_dev_handle);
if (bank != NULL && device != NULL)
p += sprintf(p, "DIMM location:%s %s ", bank, device);
- else
- p += sprintf(p, "DIMM DMI handle: 0x%.4x ",
- mem_err->mem_dev_handle);
+
+ index = ghes_edac_dimm_index(mem_err->mem_dev_handle);
+ if (index >= 0) {
+ e->top_layer = index;
+ e->enable_per_layer_report = true;
+ }
+
}
if (p > e->location)
*(p - 1) = '\0';
diff --git a/include/linux/edac.h b/include/linux/edac.h
index bffb978..a45ce1f 100644
--- a/include/linux/edac.h
+++ b/include/linux/edac.h
@@ -451,6 +451,8 @@ struct dimm_info {
u32 nr_pages; /* number of pages on this dimm */
unsigned csrow, cschannel; /* Points to the old API data */
+
+ u16 smbios_handle; /* Handle for SMBIOS type 17 */
};
/**
--
2.7.4
On Wed, Aug 29, 2018 at 06:33:52PM +0000, Fan Wu wrote:
> The current ghes_edac driver does not update per-dimm error
> counters when reporting memory errors, because there is no
> platform-independent way to find DIMMs based on the error
> information provided by firmware. This patch offers a solution
> for platforms whose firmwares provide valid module handles
> (SMBIOS type 17) in error records. In this case ghes_edac will
> use the module handles to locate DIMMs and thus makes per-dimm
> error reporting possible.
>
> Signed-off-by: Fan Wu <[email protected]>
> ---
> drivers/edac/ghes_edac.c | 36 +++++++++++++++++++++++++++++++++---
> include/linux/edac.h | 2 ++
> 2 files changed, 35 insertions(+), 3 deletions(-)
If we're going to do this, it needs to be tested on an x86 box which loads
ghes_edac. Adding Toshi to Cc.
Otherwise it must remain ARM-specific.
> diff --git a/drivers/edac/ghes_edac.c b/drivers/edac/ghes_edac.c
> index 473aeec..db527f0 100644
> --- a/drivers/edac/ghes_edac.c
> +++ b/drivers/edac/ghes_edac.c
> @@ -81,6 +81,26 @@ static void ghes_edac_count_dimms(const struct dmi_header *dh, void *arg)
> (*num_dimm)++;
> }
>
> +static int ghes_edac_dimm_index(u16 handle)
get_dimm_smbios_handle()
> +{
> + struct mem_ctl_info *mci;
> + int i;
> +
> + if (!ghes_pvt)
> + return -1;
You don't need that test.
> +
> + mci = ghes_pvt->mci;
> +
> + if (!mci)
> + return -1;
Ditto.
> +
> + for (i = 0; i < mci->tot_dimms; i++) {
> + if (mci->dimms[i]->smbios_handle == handle)
> + return i;
> + }
> + return -1;
> +}
> +
> static void ghes_edac_dmidecode(const struct dmi_header *dh, void *arg)
> {
> struct ghes_edac_dimm_fill *dimm_fill = arg;
--
Regards/Gruss,
Boris.
ECO tip #101: Trim your mails when you reply.
--
Hi Fan,
On 29/08/18 19:33, Fan Wu wrote:
> The current ghes_edac driver does not update per-dimm error
> counters when reporting memory errors, because there is no
> platform-independent way to find DIMMs based on the error
> information provided by firmware.
I'd argue there is: its in the CPER records, we just didn't do anything useful
with the information in the past!
> This patch offers a solution
> for platforms whose firmwares provide valid module handles
> (SMBIOS type 17) in error records. In this case ghes_edac will
> use the module handles to locate DIMMs and thus makes per-dimm
> error reporting possible.
> diff --git a/drivers/edac/ghes_edac.c b/drivers/edac/ghes_edac.c
> index 473aeec..db527f0 100644
> --- a/drivers/edac/ghes_edac.c
> +++ b/drivers/edac/ghes_edac.c
> @@ -81,6 +81,26 @@ static void ghes_edac_count_dimms(const struct dmi_header *dh, void *arg)
> (*num_dimm)++;
> }
>
> +static int ghes_edac_dimm_index(u16 handle)
> +{
> + struct mem_ctl_info *mci;
> + int i;
> +
> + if (!ghes_pvt)
> + return -1;
ghes_edac_report_mem_error() already checked this, as its the only caller there
is no need to check it again.
> + mci = ghes_pvt->mci;
> +
> + if (!mci)
> + return -1;
Can this happen? ghes_edac_report_mem_error() would have dereferenced this already!
If you need the struct mem_ctl_info, you may as well pass it in as the only
caller has it to hand.
> +
> + for (i = 0; i < mci->tot_dimms; i++) {
> + if (mci->dimms[i]->smbios_handle == handle)
> + return i;
> + }
> + return -1;
> +}
> +
> static void ghes_edac_dmidecode(const struct dmi_header *dh, void *arg)
> {
> struct ghes_edac_dimm_fill *dimm_fill = arg;
> @@ -177,6 +197,8 @@ static void ghes_edac_dmidecode(const struct dmi_header *dh, void *arg)
> entry->total_width, entry->data_width);
> }
>
> + dimm->smbios_handle = entry->handle;
We aren't checking for duplicate handles, (e.g. they're all zero). I think this
is fine as chances are firmware on those systems won't set
CPER_MEM_VALID_MODULE_HANDLE. If it does, the handle it gives us is ambiguous,
and we pick a dimm, instead of whine-ing about broken firmware tables.
(I'm just drawing attention to it in case someone disagrees)
> dimm_fill->count++;
> }
> }
> @@ -327,12 +349,20 @@ void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err)
> p += sprintf(p, "bit_pos:%d ", mem_err->bit_pos);
> if (mem_err->validation_bits & CPER_MEM_VALID_MODULE_HANDLE) {
> const char *bank = NULL, *device = NULL;
> + int index = -1;
> +
> dmi_memdev_name(mem_err->mem_dev_handle, &bank, &device);
> + p += sprintf(p, "DIMM DMI handle: 0x%.4x ",
> + mem_err->mem_dev_handle);
> if (bank != NULL && device != NULL)
> p += sprintf(p, "DIMM location:%s %s ", bank, device);
> - else
> - p += sprintf(p, "DIMM DMI handle: 0x%.4x ",
> - mem_err->mem_dev_handle);
Why do we now print the handle every time? The handle is pretty meaningless, it
can only be used to find the location-strings, if we get those we print them
instead.
> + index = ghes_edac_dimm_index(mem_err->mem_dev_handle);
> + if (index >= 0) {
> + e->top_layer = index;
> + e->enable_per_layer_report = true;
> + }
> +
> }
> if (p > e->location)
> *(p - 1) = '\0';
Looks good to me!
Thanks,
James
Hi Boris,
> If we're going to do this, it needs to be tested on an x86 box which loads
> ghes_edac. Adding Toshi to Cc.
>
> Otherwise it must remain ARM-specific.
Toshi it would be great if you can help! I'll also test the change in x86 but
not sure if the firmware updates module_handle.
> > +static int ghes_edac_dimm_index(u16 handle)
>
> get_dimm_smbios_handle()
This function returns an index. So how about get_dimm_smbios_index()?
> > +{
> > + struct mem_ctl_info *mci;
> > + int i;
> > +
> > + if (!ghes_pvt)
> > + return -1;
>
> You don't need that test.
Will remove.
> > +
> > + mci = ghes_pvt->mci;
> > +
> > + if (!mci)
> > + return -1;
>
> Ditto.
Will remove
Thanks,
Fan
Hi James,
> > The current ghes_edac driver does not update per-dimm error counters
> > when reporting memory errors, because there is no platform-independent
> > way to find DIMMs based on the error information provided by firmware.
>
> I'd argue there is: its in the CPER records, we just didn't do anything useful
> with the information in the past!
Agreed. Will update the wording.
> > +static int ghes_edac_dimm_index(u16 handle) {
> > + struct mem_ctl_info *mci;
> > + int i;
> > +
> > + if (!ghes_pvt)
> > + return -1;
>
> ghes_edac_report_mem_error() already checked this, as its the only caller
> there is no need to check it again.
Will remove.
>
> > + mci = ghes_pvt->mci;
> > +
> > + if (!mci)
> > + return -1;
>
> Can this happen? ghes_edac_report_mem_error() would have
> dereferenced this already!
>
> If you need the struct mem_ctl_info, you may as well pass it in as the only
> caller has it to hand.
Will remove.
>
> > +
> > + for (i = 0; i < mci->tot_dimms; i++) {
> > + if (mci->dimms[i]->smbios_handle == handle)
> > + return i;
> > + }
> > + return -1;
> > +}
> > +
> > static void ghes_edac_dmidecode(const struct dmi_header *dh, void
> > *arg) {
> > struct ghes_edac_dimm_fill *dimm_fill = arg; @@ -177,6 +197,8 @@
> > static void ghes_edac_dmidecode(const struct dmi_header *dh, void *arg)
> > entry->total_width, entry->data_width);
> > }
> >
> > + dimm->smbios_handle = entry->handle;
>
> We aren't checking for duplicate handles, (e.g. they're all zero). I think this is
> fine as chances are firmware on those systems won't set
> CPER_MEM_VALID_MODULE_HANDLE. If it does, the handle it gives us is
> ambiguous, and we pick a dimm, instead of whine-ing about broken
> firmware tables.
>
> (I'm just drawing attention to it in case someone disagrees)
SBMIOS tables are typically automatically generated so chances for duplicate
handles are small.
>
> > dimm_fill->count++;
> > }
> > }
> > @@ -327,12 +349,20 @@ void ghes_edac_report_mem_error(int sev,
> struct cper_sec_mem_err *mem_err)
> > p += sprintf(p, "bit_pos:%d ", mem_err->bit_pos);
> > if (mem_err->validation_bits &
> CPER_MEM_VALID_MODULE_HANDLE) {
> > const char *bank = NULL, *device = NULL;
> > + int index = -1;
> > +
> > dmi_memdev_name(mem_err->mem_dev_handle, &bank,
> &device);
>
> > + p += sprintf(p, "DIMM DMI handle: 0x%.4x ",
> > + mem_err->mem_dev_handle);
> > if (bank != NULL && device != NULL)
> > p += sprintf(p, "DIMM location:%s %s ", bank, device);
> > - else
> > - p += sprintf(p, "DIMM DMI handle: 0x%.4x ",
> > - mem_err->mem_dev_handle);
>
> Why do we now print the handle every time? The handle is pretty
> meaningless, it can only be used to find the location-strings, if we get those
> we print them instead.
For ghes_edac the bank/device is informational, and nothing would go wrong
if the bank/device numbers are the same as another entry. But the handle
is now critical for DIMM lookup, thus pull it out.
Thanks!
Fan
On August 30, 2018 5:20:32 PM GMT+03:00, wufan <[email protected]> wrote:
>> > +static int ghes_edac_dimm_index(u16 handle)
>>
>> get_dimm_smbios_handle()
>
>This function returns an index. So how about get_dimm_smbios_index()?
Sure.
--
Sent from a small device: formatting sux and brevity is inevitable.
Hi Fan,
On 30/08/18 15:40, wufan wrote:
>>> @@ -327,12 +349,20 @@ void ghes_edac_report_mem_error(int sev,
>> struct cper_sec_mem_err *mem_err)
>>> p += sprintf(p, "bit_pos:%d ", mem_err->bit_pos);
>>> if (mem_err->validation_bits &
>> CPER_MEM_VALID_MODULE_HANDLE) {
>>> const char *bank = NULL, *device = NULL;
>>> + int index = -1;
>>> +
>>> dmi_memdev_name(mem_err->mem_dev_handle, &bank,
>> &device);
>>
>>> + p += sprintf(p, "DIMM DMI handle: 0x%.4x ",
>>> + mem_err->mem_dev_handle);
>>> if (bank != NULL && device != NULL)
>>> p += sprintf(p, "DIMM location:%s %s ", bank, device);
>>> - else
>>> - p += sprintf(p, "DIMM DMI handle: 0x%.4x ",
>>> - mem_err->mem_dev_handle);
>>
>> Why do we now print the handle every time? The handle is pretty
>> meaningless, it can only be used to find the location-strings, if we get those
>> we print them instead.
>
> For ghes_edac the bank/device is informational, and nothing would go wrong
> if the bank/device numbers are the same as another entry. But the handle
> is now critical for DIMM lookup, thus pull it out.
Is printing the handle to the kernel log critical?
I'd expect something collecting errors to read from sysfs, not dmesg. I thought
the whole point here was to update the per-dimm counters in sysfs.
Thanks,
James
Hi Boris,
On 30/08/18 11:43, Borislav Petkov wrote:
> On Wed, Aug 29, 2018 at 06:33:52PM +0000, Fan Wu wrote:
>> The current ghes_edac driver does not update per-dimm error
>> counters when reporting memory errors, because there is no
>> platform-independent way to find DIMMs based on the error
>> information provided by firmware. This patch offers a solution
>> for platforms whose firmwares provide valid module handles
>> (SMBIOS type 17) in error records. In this case ghes_edac will
>> use the module handles to locate DIMMs and thus makes per-dimm
>> error reporting possible.
> If we're going to do this, it needs to be tested on an x86 box which loads
> ghes_edac. Adding Toshi to Cc.
Good point, thanks.
> Otherwise it must remain ARM-specific.
Hmmm, that would be a shame.
This should only be a problem if HPE Servers set CPER_MEM_VALID_MODULE_HANDLE,
but don't actually provide module handles, or if firmware has a different idea
of what they are.
If firmware never sets CPER_MEM_VALID_MODULE_HANDLE, this patch shouldn't change
anything.
(Someone must have an x86 that sets CPER_MEM_VALID_MODULE_HANDLE, otherwise the
code wouldn't be there right?!)
Thanks,
James
Hi Zhengqiang,
On 29/08/18 19:33, Fan Wu wrote:
> The current ghes_edac driver does not update per-dimm error
> counters when reporting memory errors, because there is no
> platform-independent way to find DIMMs based on the error
> information provided by firmware. This patch offers a solution
> for platforms whose firmwares provide valid module handles
> (SMBIOS type 17) in error records. In this case ghes_edac will
> use the module handles to locate DIMMs and thus makes per-dimm
> error reporting possible.
Does your platform set CPER_MEM_VALID_MODULE_HANDLE in GHES Memory errors? If
so, any chance you could test this patch on your platform? [0]
(original patch: https://lore.kernel.org/patchwork/patch/978928/)
Thanks,
James
[0] https://marc.info/?l=linux-edac&m=152603960002324
> diff --git a/drivers/edac/ghes_edac.c b/drivers/edac/ghes_edac.c
> index 473aeec..db527f0 100644
> --- a/drivers/edac/ghes_edac.c
> +++ b/drivers/edac/ghes_edac.c
> @@ -81,6 +81,26 @@ static void ghes_edac_count_dimms(const struct dmi_header *dh, void *arg)
> (*num_dimm)++;
> }
>
> +static int ghes_edac_dimm_index(u16 handle)
> +{
> + struct mem_ctl_info *mci;
> + int i;
> +
> + if (!ghes_pvt)
> + return -1;
> +
> + mci = ghes_pvt->mci;
> +
> + if (!mci)
> + return -1;
> +
> + for (i = 0; i < mci->tot_dimms; i++) {
> + if (mci->dimms[i]->smbios_handle == handle)
> + return i;
> + }
> + return -1;
> +}
> +
> static void ghes_edac_dmidecode(const struct dmi_header *dh, void *arg)
> {
> struct ghes_edac_dimm_fill *dimm_fill = arg;
> @@ -177,6 +197,8 @@ static void ghes_edac_dmidecode(const struct dmi_header *dh, void *arg)
> entry->total_width, entry->data_width);
> }
>
> + dimm->smbios_handle = entry->handle;
> +
> dimm_fill->count++;
> }
> }
> @@ -327,12 +349,20 @@ void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err)
> p += sprintf(p, "bit_pos:%d ", mem_err->bit_pos);
> if (mem_err->validation_bits & CPER_MEM_VALID_MODULE_HANDLE) {
> const char *bank = NULL, *device = NULL;
> + int index = -1;
> +
> dmi_memdev_name(mem_err->mem_dev_handle, &bank, &device);
> + p += sprintf(p, "DIMM DMI handle: 0x%.4x ",
> + mem_err->mem_dev_handle);
> if (bank != NULL && device != NULL)
> p += sprintf(p, "DIMM location:%s %s ", bank, device);
> - else
> - p += sprintf(p, "DIMM DMI handle: 0x%.4x ",
> - mem_err->mem_dev_handle);
> +
> + index = ghes_edac_dimm_index(mem_err->mem_dev_handle);
> + if (index >= 0) {
> + e->top_layer = index;
> + e->enable_per_layer_report = true;
> + }
> +
> }
> if (p > e->location)
> *(p - 1) = '\0';
> diff --git a/include/linux/edac.h b/include/linux/edac.h
> index bffb978..a45ce1f 100644
> --- a/include/linux/edac.h
> +++ b/include/linux/edac.h
> @@ -451,6 +451,8 @@ struct dimm_info {
> u32 nr_pages; /* number of pages on this dimm */
>
> unsigned csrow, cschannel; /* Points to the old API data */
> +
> + u16 smbios_handle; /* Handle for SMBIOS type 17 */
> };
>
> /**
>
Hi James,
> > For ghes_edac the bank/device is informational, and nothing would go
> > wrong if the bank/device numbers are the same as another entry. But
> > the handle is now critical for DIMM lookup, thus pull it out.
>
> Is printing the handle to the kernel log critical?
>
> I'd expect something collecting errors to read from sysfs, not dmesg. I
> thought the whole point here was to update the per-dimm counters in sysfs.
No, printing out the handle is not critical. What I meant is because the
information is critical it would be nice to have it available for debugging
purpose. Otherwise it would be hard to find the handle if bank/device
is not accurate.
Thanks,
Fan
On Thu, Aug 30, 2018 at 12:32 PM, James Morse <[email protected]> wrote:
> Hi Fan,
>
> On 30/08/18 15:40, wufan wrote:
>>>> @@ -327,12 +349,20 @@ void ghes_edac_report_mem_error(int sev,
>>> struct cper_sec_mem_err *mem_err)
>>>> p += sprintf(p, "bit_pos:%d ", mem_err->bit_pos);
>>>> if (mem_err->validation_bits &
>>> CPER_MEM_VALID_MODULE_HANDLE) {
>>>> const char *bank = NULL, *device = NULL;
>>>> + int index = -1;
>>>> +
>>>> dmi_memdev_name(mem_err->mem_dev_handle, &bank,
>>> &device);
>>>
>>>> + p += sprintf(p, "DIMM DMI handle: 0x%.4x ",
>>>> + mem_err->mem_dev_handle);
>>>> if (bank != NULL && device != NULL)
>>>> p += sprintf(p, "DIMM location:%s %s ", bank, device);
>>>> - else
>>>> - p += sprintf(p, "DIMM DMI handle: 0x%.4x ",
>>>> - mem_err->mem_dev_handle);
>>>
>>> Why do we now print the handle every time? The handle is pretty
>>> meaningless, it can only be used to find the location-strings, if we get those
>>> we print them instead.
>>
>> For ghes_edac the bank/device is informational, and nothing would go wrong
>> if the bank/device numbers are the same as another entry. But the handle
>> is now critical for DIMM lookup, thus pull it out.
>
> Is printing the handle to the kernel log critical?
>
I don't see why we would need this print. The bank/device
print is enough to map what is shown in dmesg to an SMBIOS
entry if that's really needed.
Thanks,
Tyler
On 30/08/2018 17:34, James Morse wrote:
Hi James,
Zhengqiang no longer works on this topic, so I have cc'ed some more guys
who should be able to help.
John
> Hi Zhengqiang,
>
> On 29/08/18 19:33, Fan Wu wrote:
>> The current ghes_edac driver does not update per-dimm error
>> counters when reporting memory errors, because there is no
>> platform-independent way to find DIMMs based on the error
>> information provided by firmware. This patch offers a solution
>> for platforms whose firmwares provide valid module handles
>> (SMBIOS type 17) in error records. In this case ghes_edac will
>> use the module handles to locate DIMMs and thus makes per-dimm
>> error reporting possible.
>
> Does your platform set CPER_MEM_VALID_MODULE_HANDLE in GHES Memory errors? If
> so, any chance you could test this patch on your platform? [0]
> (original patch: https://lore.kernel.org/patchwork/patch/978928/)
>
> Thanks,
>
> James
>
> [0] https://marc.info/?l=linux-edac&m=152603960002324
>
>
>> diff --git a/drivers/edac/ghes_edac.c b/drivers/edac/ghes_edac.c
>> index 473aeec..db527f0 100644
>> --- a/drivers/edac/ghes_edac.c
>> +++ b/drivers/edac/ghes_edac.c
>> @@ -81,6 +81,26 @@ static void ghes_edac_count_dimms(const struct dmi_header *dh, void *arg)
>> (*num_dimm)++;
>> }
>>
>> +static int ghes_edac_dimm_index(u16 handle)
>> +{
>> + struct mem_ctl_info *mci;
>> + int i;
>> +
>> + if (!ghes_pvt)
>> + return -1;
>> +
>> + mci = ghes_pvt->mci;
>> +
>> + if (!mci)
>> + return -1;
>> +
>> + for (i = 0; i < mci->tot_dimms; i++) {
>> + if (mci->dimms[i]->smbios_handle == handle)
>> + return i;
>> + }
>> + return -1;
>> +}
>> +
>> static void ghes_edac_dmidecode(const struct dmi_header *dh, void *arg)
>> {
>> struct ghes_edac_dimm_fill *dimm_fill = arg;
>> @@ -177,6 +197,8 @@ static void ghes_edac_dmidecode(const struct dmi_header *dh, void *arg)
>> entry->total_width, entry->data_width);
>> }
>>
>> + dimm->smbios_handle = entry->handle;
>> +
>> dimm_fill->count++;
>> }
>> }
>> @@ -327,12 +349,20 @@ void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err)
>> p += sprintf(p, "bit_pos:%d ", mem_err->bit_pos);
>> if (mem_err->validation_bits & CPER_MEM_VALID_MODULE_HANDLE) {
>> const char *bank = NULL, *device = NULL;
>> + int index = -1;
>> +
>> dmi_memdev_name(mem_err->mem_dev_handle, &bank, &device);
>> + p += sprintf(p, "DIMM DMI handle: 0x%.4x ",
>> + mem_err->mem_dev_handle);
>> if (bank != NULL && device != NULL)
>> p += sprintf(p, "DIMM location:%s %s ", bank, device);
>> - else
>> - p += sprintf(p, "DIMM DMI handle: 0x%.4x ",
>> - mem_err->mem_dev_handle);
>> +
>> + index = ghes_edac_dimm_index(mem_err->mem_dev_handle);
>> + if (index >= 0) {
>> + e->top_layer = index;
>> + e->enable_per_layer_report = true;
>> + }
>> +
>> }
>> if (p > e->location)
>> *(p - 1) = '\0';
>> diff --git a/include/linux/edac.h b/include/linux/edac.h
>> index bffb978..a45ce1f 100644
>> --- a/include/linux/edac.h
>> +++ b/include/linux/edac.h
>> @@ -451,6 +451,8 @@ struct dimm_info {
>> u32 nr_pages; /* number of pages on this dimm */
>>
>> unsigned csrow, cschannel; /* Points to the old API data */
>> +
>> + u16 smbios_handle; /* Handle for SMBIOS type 17 */
>> };
>>
>> /**
>>
>
>
> .
>
Hi Tyler,
> > Is printing the handle to the kernel log critical?
> >
>
> I don't see why we would need this print. The bank/device print is enough to
> map what is shown in dmesg to an SMBIOS entry if that's really needed.
This change is mostly for convenience. I'll revert it since we have two votes
against it now.
Thanks,
Fan
On 2018/8/31 0:50, John Garry wrote:
> On 30/08/2018 17:34, James Morse wrote:
>
> Hi James,
>
> Zhengqiang no longer works on this topic, so I have cc'ed some more guys who should be able to help.
>
> John
>
>> Hi Zhengqiang,
>>
>> On 29/08/18 19:33, Fan Wu wrote:
>>> The current ghes_edac driver does not update per-dimm error
>>> counters when reporting memory errors, because there is no
>>> platform-independent way to find DIMMs based on the error
>>> information provided by firmware. This patch offers a solution
>>> for platforms whose firmwares provide valid module handles
>>> (SMBIOS type 17) in error records. In this case ghes_edac will
>>> use the module handles to locate DIMMs and thus makes per-dimm
>>> error reporting possible.
>>
>> Does your platform set CPER_MEM_VALID_MODULE_HANDLE in GHES Memory errors? If
>> so, any chance you could test this patch on your platform? [0]
>> (original patch: https://lore.kernel.org/patchwork/patch/978928/)
>>
Hi James,
Our platform do not set CPER_MEM_VALID_MODULE_HANDLE in GHES Memory errors.
Thanks,
tanxiaofei
>> Thanks,
>>
>> James
>>
>> [0] https://marc.info/?l=linux-edac&m=152603960002324
>>
>>
>>> diff --git a/drivers/edac/ghes_edac.c b/drivers/edac/ghes_edac.c
>>> index 473aeec..db527f0 100644
>>> --- a/drivers/edac/ghes_edac.c
>>> +++ b/drivers/edac/ghes_edac.c
>>> @@ -81,6 +81,26 @@ static void ghes_edac_count_dimms(const struct dmi_header *dh, void *arg)
>>> (*num_dimm)++;
>>> }
>>>
>>> +static int ghes_edac_dimm_index(u16 handle)
>>> +{
>>> + struct mem_ctl_info *mci;
>>> + int i;
>>> +
>>> + if (!ghes_pvt)
>>> + return -1;
>>> +
>>> + mci = ghes_pvt->mci;
>>> +
>>> + if (!mci)
>>> + return -1;
>>> +
>>> + for (i = 0; i < mci->tot_dimms; i++) {
>>> + if (mci->dimms[i]->smbios_handle == handle)
>>> + return i;
>>> + }
>>> + return -1;
>>> +}
>>> +
>>> static void ghes_edac_dmidecode(const struct dmi_header *dh, void *arg)
>>> {
>>> struct ghes_edac_dimm_fill *dimm_fill = arg;
>>> @@ -177,6 +197,8 @@ static void ghes_edac_dmidecode(const struct dmi_header *dh, void *arg)
>>> entry->total_width, entry->data_width);
>>> }
>>>
>>> + dimm->smbios_handle = entry->handle;
>>> +
>>> dimm_fill->count++;
>>> }
>>> }
>>> @@ -327,12 +349,20 @@ void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err)
>>> p += sprintf(p, "bit_pos:%d ", mem_err->bit_pos);
>>> if (mem_err->validation_bits & CPER_MEM_VALID_MODULE_HANDLE) {
>>> const char *bank = NULL, *device = NULL;
>>> + int index = -1;
>>> +
>>> dmi_memdev_name(mem_err->mem_dev_handle, &bank, &device);
>>> + p += sprintf(p, "DIMM DMI handle: 0x%.4x ",
>>> + mem_err->mem_dev_handle);
>>> if (bank != NULL && device != NULL)
>>> p += sprintf(p, "DIMM location:%s %s ", bank, device);
>>> - else
>>> - p += sprintf(p, "DIMM DMI handle: 0x%.4x ",
>>> - mem_err->mem_dev_handle);
>>> +
>>> + index = ghes_edac_dimm_index(mem_err->mem_dev_handle);
>>> + if (index >= 0) {
>>> + e->top_layer = index;
>>> + e->enable_per_layer_report = true;
>>> + }
>>> +
>>> }
>>> if (p > e->location)
>>> *(p - 1) = '\0';
>>> diff --git a/include/linux/edac.h b/include/linux/edac.h
>>> index bffb978..a45ce1f 100644
>>> --- a/include/linux/edac.h
>>> +++ b/include/linux/edac.h
>>> @@ -451,6 +451,8 @@ struct dimm_info {
>>> u32 nr_pages; /* number of pages on this dimm */
>>>
>>> unsigned csrow, cschannel; /* Points to the old API data */
>>> +
>>> + u16 smbios_handle; /* Handle for SMBIOS type 17 */
>>> };
>>>
>>> /**
>>>
>>
>>
>> .
>>
>
>
>
> .
>
--
best wishes
谭小飞
Thanks tanxiaofei!
Boris/James, are you OK to sign off, or you want to see more tests on
this patch?
Thanks,
Fan
> -----Original Message-----
> From: tanxiaofei <[email protected]>
> Sent: Friday, August 31, 2018 4:06 AM
>
> Hi James,
>
> Our platform do not set CPER_MEM_VALID_MODULE_HANDLE in GHES
> Memory errors.
>
> Thanks,
> tanxiaofei
On Mon, Sep 03, 2018 at 09:05:29AM -0600, wufan wrote:
> Thanks tanxiaofei!
>
> Boris/James, are you OK to sign off, or you want to see more tests on
> this patch?
Please do not top-post.
Now, I don't have any problems with it - I'm still sceptical as the
firmware is a stinking pile but if we cannot find a broken case, I guess
we can take this one for a spin and revert it if there's trouble down
the road.
Unless James has objections.
Then, you sent a v2 here:
https://lkml.kernel.org/r/[email protected]
and you've received a bunch of Reviewed-by's and Tested-by's and a
couple of new comments.
Incorporate *all* of those and send me a v3.
Thx.
--
Regards/Gruss,
Boris.
Good mailing practices for 400: avoid top-posting and trim the reply.