This patch set is a rework of the ghes_edac and edac_mc drivers. It
addresses issues found during code review and while working with the
code. The changes include:
* improve function interfaces and data structures to decrease
complexity such as number of function arguments, unused data, etc.
(#1, #2, #14, #17, #18, #19),
* add helper functions and factor out code (#3, #15, #16, #20)
* fix style issues found by checkpatch (#4)
* improve code readability (#5, #6, #7, #8)
* use of standard kernel macros (#9)
* code unification (#10, #11, #12)
* documentation updates (#13)
v2:
* rebased onto edac-for-next and ghes locking fix
* moved adding idx to struct dimm_info to patch "EDAC: Remove
EDAC_DIMM_OFF() macro"
* added patch: "EDAC, mc: Remove needless zero string termination"
* added Mauro's Reviewed-by: tags
* moved patches to the end:
EDAC, mc: Remove per layer counters
EDAC, mc: Split edac_mc_alloc() into smaller functions
EDAC, mc: Reorder functions edac_mc_alloc*()
EDAC, mc: Rework edac_raw_mc_handle_error() to use struct dimm_info
EDAC: Store error type in struct edac_raw_error_desc
EDAC, mc: Determine mci pointer from the error descriptor
EDAC, mc: Create new function edac_inc_csrow()
Robert Richter (20):
EDAC: Replace EDAC_DIMM_PTR() macro with edac_get_dimm() function
EDAC: Remove EDAC_DIMM_OFF() macro
EDAC: Introduce mci_for_each_dimm() iterator
EDAC, mc: Do not BUG_ON() in edac_mc_alloc()
EDAC, mc: Remove needless zero string termination
EDAC, mc: Reduce indentation level in edac_mc_handle_error()
EDAC, mc: Rename iterator variable to idx
EDAC: Remove misleading comment in struct edac_raw_error_desc
EDAC, ghes: Use standard kernel macros for page calculations
EDAC, ghes: Fix grain calculation
EDAC, ghes: Remove intermediate buffer pvt->detail_location
EDAC, ghes: Unify trace_mc_event() code with edac_mc driver
EDAC, Documentation: Describe CPER module definition and DIMM ranks
EDAC, mc: Remove per layer counters
EDAC, mc: Split edac_mc_alloc() into smaller functions
EDAC, mc: Reorder functions edac_mc_alloc*()
EDAC, mc: Rework edac_raw_mc_handle_error() to use struct dimm_info
EDAC: Store error type in struct edac_raw_error_desc
EDAC, mc: Determine mci pointer from the error descriptor
EDAC, mc: Create new function edac_inc_csrow()
Documentation/admin-guide/ras.rst | 31 +-
drivers/edac/edac_mc.c | 477 +++++++++++++++---------------
drivers/edac/edac_mc.h | 8 +-
drivers/edac/edac_mc_sysfs.c | 65 ++--
drivers/edac/ghes_edac.c | 56 ++--
drivers/edac/i10nm_base.c | 3 +-
drivers/edac/i3200_edac.c | 3 +-
drivers/edac/i5000_edac.c | 5 +-
drivers/edac/i5100_edac.c | 14 +-
drivers/edac/i5400_edac.c | 3 +-
drivers/edac/i7300_edac.c | 3 +-
drivers/edac/i7core_edac.c | 3 +-
drivers/edac/ie31200_edac.c | 7 +-
drivers/edac/pnd2_edac.c | 4 +-
drivers/edac/sb_edac.c | 2 +-
drivers/edac/skx_base.c | 3 +-
drivers/edac/ti_edac.c | 2 +-
include/linux/edac.h | 153 +++++-----
18 files changed, 401 insertions(+), 441 deletions(-)
--
2.20.1
There is no need to crash the system if edac_mc_alloc() is called with
invalid arguments; just warn and return NULL. The BUG_ON() would also
trigger a checkpatch warning when touching the code later, so fix it now.
Signed-off-by: Robert Richter <[email protected]>
Reviewed-by: Mauro Carvalho Chehab <[email protected]>
---
drivers/edac/edac_mc.c | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c
index c5240bb4c6c0..f2cbca77bc50 100644
--- a/drivers/edac/edac_mc.c
+++ b/drivers/edac/edac_mc.c
@@ -323,7 +323,9 @@ struct mem_ctl_info *edac_mc_alloc(unsigned int mc_num,
int i, j, row, chn, n, len;
bool per_rank = false;
- BUG_ON(n_layers > EDAC_MAX_LAYERS || n_layers == 0);
+ if (WARN_ON(n_layers > EDAC_MAX_LAYERS || n_layers == 0))
+ return NULL;
+
/*
* Calculate the total amount of dimms and csrows/cschannels while
* in the old API emulation mode
--
2.20.1
Introduce the mci_for_each_dimm() iterator. On each iteration it
provides a pointer to a struct dimm_info. This makes the declaration
and use of an index obsolete and avoids access to internal data of
struct mem_ctl_info (direct array access etc.).
Signed-off-by: Robert Richter <[email protected]>
Reviewed-by: Mauro Carvalho Chehab <[email protected]>
---
drivers/edac/edac_mc.c | 19 +++++++++++--------
drivers/edac/edac_mc_sysfs.c | 29 ++++++++++++-----------------
drivers/edac/ghes_edac.c | 9 +++++----
drivers/edac/i5100_edac.c | 13 +++++--------
include/linux/edac.h | 7 +++++++
5 files changed, 40 insertions(+), 37 deletions(-)
diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c
index 72088d49b03b..c5240bb4c6c0 100644
--- a/drivers/edac/edac_mc.c
+++ b/drivers/edac/edac_mc.c
@@ -145,15 +145,18 @@ static void edac_mc_dump_channel(struct rank_info *chan)
edac_dbg(4, " channel->dimm = %p\n", chan->dimm);
}
-static void edac_mc_dump_dimm(struct dimm_info *dimm, int number)
+static void edac_mc_dump_dimm(struct dimm_info *dimm)
{
char location[80];
+ if (!dimm->nr_pages)
+ return;
+
edac_dimm_info_location(dimm, location, sizeof(location));
edac_dbg(4, "%s%i: %smapped as virtual row %d, chan %d\n",
dimm->mci->csbased ? "rank" : "dimm",
- number, location, dimm->csrow, dimm->cschannel);
+ dimm->idx, location, dimm->csrow, dimm->cschannel);
edac_dbg(4, " dimm = %p\n", dimm);
edac_dbg(4, " dimm->label = '%s'\n", dimm->label);
edac_dbg(4, " dimm->nr_pages = 0x%x\n", dimm->nr_pages);
@@ -702,6 +705,7 @@ EXPORT_SYMBOL_GPL(edac_get_owner);
int edac_mc_add_mc_with_groups(struct mem_ctl_info *mci,
const struct attribute_group **groups)
{
+ struct dimm_info *dimm;
int ret = -EINVAL;
edac_dbg(0, "\n");
@@ -726,9 +730,9 @@ int edac_mc_add_mc_with_groups(struct mem_ctl_info *mci,
if (csrow->channels[j]->dimm->nr_pages)
edac_mc_dump_channel(csrow->channels[j]);
}
- for (i = 0; i < mci->tot_dimms; i++)
- if (mci->dimms[i]->nr_pages)
- edac_mc_dump_dimm(mci->dimms[i], i);
+
+ mci_for_each_dimm(mci, dimm)
+ edac_mc_dump_dimm(dimm);
}
#endif
mutex_lock(&mem_ctls_mutex);
@@ -1086,6 +1090,7 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type,
const char *msg,
const char *other_detail)
{
+ struct dimm_info *dimm;
char *p;
int row = -1, chan = -1;
int pos[EDAC_MAX_LAYERS] = { top_layer, mid_layer, low_layer };
@@ -1146,9 +1151,7 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type,
p = e->label;
*p = '\0';
- for (i = 0; i < mci->tot_dimms; i++) {
- struct dimm_info *dimm = mci->dimms[i];
-
+ mci_for_each_dimm(mci, dimm) {
if (top_layer >= 0 && top_layer != dimm->location[0])
continue;
if (mid_layer >= 0 && mid_layer != dimm->location[1])
diff --git a/drivers/edac/edac_mc_sysfs.c b/drivers/edac/edac_mc_sysfs.c
index 91e4c8f155af..0367554e7437 100644
--- a/drivers/edac/edac_mc_sysfs.c
+++ b/drivers/edac/edac_mc_sysfs.c
@@ -621,8 +621,7 @@ static const struct device_type dimm_attr_type = {
/* Create a DIMM object under specifed memory controller device */
static int edac_create_dimm_object(struct mem_ctl_info *mci,
- struct dimm_info *dimm,
- int index)
+ struct dimm_info *dimm)
{
int err;
dimm->mci = mci;
@@ -632,9 +631,9 @@ static int edac_create_dimm_object(struct mem_ctl_info *mci,
dimm->dev.parent = &mci->dev;
if (mci->csbased)
- dev_set_name(&dimm->dev, "rank%d", index);
+ dev_set_name(&dimm->dev, "rank%d", dimm->idx);
else
- dev_set_name(&dimm->dev, "dimm%d", index);
+ dev_set_name(&dimm->dev, "dimm%d", dimm->idx);
dev_set_drvdata(&dimm->dev, dimm);
pm_runtime_forbid(&mci->dev);
@@ -916,7 +915,8 @@ static const struct device_type mci_attr_type = {
int edac_create_sysfs_mci_device(struct mem_ctl_info *mci,
const struct attribute_group **groups)
{
- int i, err;
+ struct dimm_info *dimm;
+ int err;
/* get the /sys/devices/system/edac subsys reference */
mci->dev.type = &mci_attr_type;
@@ -940,13 +940,12 @@ int edac_create_sysfs_mci_device(struct mem_ctl_info *mci,
/*
* Create the dimm/rank devices
*/
- for (i = 0; i < mci->tot_dimms; i++) {
- struct dimm_info *dimm = mci->dimms[i];
+ mci_for_each_dimm(mci, dimm) {
/* Only expose populated DIMMs */
if (!dimm->nr_pages)
continue;
- err = edac_create_dimm_object(mci, dimm, i);
+ err = edac_create_dimm_object(mci, dimm);
if (err)
goto fail_unregister_dimm;
}
@@ -961,12 +960,9 @@ int edac_create_sysfs_mci_device(struct mem_ctl_info *mci,
return 0;
fail_unregister_dimm:
- for (i--; i >= 0; i--) {
- struct dimm_info *dimm = mci->dimms[i];
- if (!dimm->nr_pages)
- continue;
-
- device_unregister(&dimm->dev);
+ mci_for_each_dimm(mci, dimm) {
+ if (device_is_registered(&dimm->dev))
+ device_unregister(&dimm->dev);
}
device_unregister(&mci->dev);
@@ -978,7 +974,7 @@ int edac_create_sysfs_mci_device(struct mem_ctl_info *mci,
*/
void edac_remove_sysfs_mci_device(struct mem_ctl_info *mci)
{
- int i;
+ struct dimm_info *dimm;
edac_dbg(0, "\n");
@@ -989,8 +985,7 @@ void edac_remove_sysfs_mci_device(struct mem_ctl_info *mci)
edac_delete_csrow_objects(mci);
#endif
- for (i = 0; i < mci->tot_dimms; i++) {
- struct dimm_info *dimm = mci->dimms[i];
+ mci_for_each_dimm(mci, dimm) {
if (dimm->nr_pages == 0)
continue;
edac_dbg(1, "unregistering device %s\n", dev_name(&dimm->dev));
diff --git a/drivers/edac/ghes_edac.c b/drivers/edac/ghes_edac.c
index d92cd99081d2..af27f8063891 100644
--- a/drivers/edac/ghes_edac.c
+++ b/drivers/edac/ghes_edac.c
@@ -90,12 +90,13 @@ static void ghes_edac_count_dimms(const struct dmi_header *dh, void *arg)
static int get_dimm_smbios_index(struct mem_ctl_info *mci, u16 handle)
{
- int i;
+ struct dimm_info *dimm;
- for (i = 0; i < mci->tot_dimms; i++) {
- if (mci->dimms[i]->smbios_handle == handle)
- return i;
+ mci_for_each_dimm(mci, dimm) {
+ if (dimm->smbios_handle == handle)
+ return dimm->idx;
}
+
return -1;
}
diff --git a/drivers/edac/i5100_edac.c b/drivers/edac/i5100_edac.c
index 134586753311..0ddc41e47a96 100644
--- a/drivers/edac/i5100_edac.c
+++ b/drivers/edac/i5100_edac.c
@@ -846,20 +846,17 @@ static void i5100_init_interleaving(struct pci_dev *pdev,
static void i5100_init_csrows(struct mem_ctl_info *mci)
{
- int i;
struct i5100_priv *priv = mci->pvt_info;
+ struct dimm_info *dimm;
- for (i = 0; i < mci->tot_dimms; i++) {
- struct dimm_info *dimm;
- const unsigned long npages = i5100_npages(mci, i);
- const unsigned int chan = i5100_csrow_to_chan(mci, i);
- const unsigned int rank = i5100_csrow_to_rank(mci, i);
+ mci_for_each_dimm(mci, dimm) {
+ const unsigned long npages = i5100_npages(mci, dimm->idx);
+ const unsigned int chan = i5100_csrow_to_chan(mci, dimm->idx);
+ const unsigned int rank = i5100_csrow_to_rank(mci, dimm->idx);
if (!npages)
continue;
- dimm = edac_get_dimm(mci, chan, rank, 0);
-
dimm->nr_pages = npages;
dimm->grain = 32;
dimm->dtype = (priv->mtr[chan][rank].width == 4) ?
diff --git a/include/linux/edac.h b/include/linux/edac.h
index 79c5564ee314..8beb6e834be9 100644
--- a/include/linux/edac.h
+++ b/include/linux/edac.h
@@ -599,6 +599,13 @@ struct mem_ctl_info {
u16 fake_inject_count;
};
+#define mci_for_each_dimm(mci, dimm) \
+ for ((dimm) = (mci)->dimms[0]; \
+ (dimm); \
+ (dimm) = (dimm)->idx + 1 < (mci)->tot_dimms \
+ ? (mci)->dimms[(dimm)->idx + 1] \
+ : NULL)
+
/**
* edac_get_dimm_by_index - Get DIMM info from a memory controller
* given by an index
--
2.20.1
Reduce the indentation level in edac_mc_handle_error() a bit by using
continue. No functional changes.
Signed-off-by: Robert Richter <[email protected]>
Reviewed-by: Mauro Carvalho Chehab <[email protected]>
---
drivers/edac/edac_mc.c | 57 +++++++++++++++++++++---------------------
1 file changed, 29 insertions(+), 28 deletions(-)
diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c
index 8bfe76d1bdf1..3dc1c5afabce 100644
--- a/drivers/edac/edac_mc.c
+++ b/drivers/edac/edac_mc.c
@@ -1171,36 +1171,37 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type,
* channel/memory controller/... may be affected.
* Also, don't show errors for empty DIMM slots.
*/
- if (e->enable_per_layer_report && dimm->nr_pages) {
- if (n_labels >= EDAC_MAX_LABELS) {
- e->enable_per_layer_report = false;
- break;
- }
- n_labels++;
- if (p != e->label) {
- strcpy(p, OTHER_LABEL);
- p += strlen(OTHER_LABEL);
- }
- strcpy(p, dimm->label);
- p += strlen(p);
+ if (!e->enable_per_layer_report || !dimm->nr_pages)
+ continue;
- /*
- * get csrow/channel of the DIMM, in order to allow
- * incrementing the compat API counters
- */
- edac_dbg(4, "%s csrows map: (%d,%d)\n",
- mci->csbased ? "rank" : "dimm",
- dimm->csrow, dimm->cschannel);
- if (row == -1)
- row = dimm->csrow;
- else if (row >= 0 && row != dimm->csrow)
- row = -2;
-
- if (chan == -1)
- chan = dimm->cschannel;
- else if (chan >= 0 && chan != dimm->cschannel)
- chan = -2;
+ if (n_labels >= EDAC_MAX_LABELS) {
+ e->enable_per_layer_report = false;
+ break;
+ }
+ n_labels++;
+ if (p != e->label) {
+ strcpy(p, OTHER_LABEL);
+ p += strlen(OTHER_LABEL);
}
+ strcpy(p, dimm->label);
+ p += strlen(p);
+
+ /*
+ * get csrow/channel of the DIMM, in order to allow
+ * incrementing the compat API counters
+ */
+ edac_dbg(4, "%s csrows map: (%d,%d)\n",
+ mci->csbased ? "rank" : "dimm",
+ dimm->csrow, dimm->cschannel);
+ if (row == -1)
+ row = dimm->csrow;
+ else if (row >= 0 && row != dimm->csrow)
+ row = -2;
+
+ if (chan == -1)
+ chan = dimm->cschannel;
+ else if (chan >= 0 && chan != dimm->cschannel)
+ chan = -2;
}
if (!e->enable_per_layer_report) {
--
2.20.1
Rename the iterator variable to idx. The name is handier, especially
when searching for it in the code.
Signed-off-by: Robert Richter <[email protected]>
Reviewed-by: Mauro Carvalho Chehab <[email protected]>
---
drivers/edac/edac_mc.c | 12 ++++++------
1 file changed, 6 insertions(+), 6 deletions(-)
diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c
index 3dc1c5afabce..f76252b7a787 100644
--- a/drivers/edac/edac_mc.c
+++ b/drivers/edac/edac_mc.c
@@ -330,14 +330,14 @@ struct mem_ctl_info *edac_mc_alloc(unsigned int mc_num,
* Calculate the total amount of dimms and csrows/cschannels while
* in the old API emulation mode
*/
- for (i = 0; i < n_layers; i++) {
- tot_dimms *= layers[i].size;
- if (layers[i].is_virt_csrow)
- tot_csrows *= layers[i].size;
+ for (idx = 0; idx < n_layers; idx++) {
+ tot_dimms *= layers[idx].size;
+ if (layers[idx].is_virt_csrow)
+ tot_csrows *= layers[idx].size;
else
- tot_channels *= layers[i].size;
+ tot_channels *= layers[idx].size;
- if (layers[i].type == EDAC_MC_LAYER_CHIP_SELECT)
+ if (layers[idx].type == EDAC_MC_LAYER_CHIP_SELECT)
per_rank = true;
}
--
2.20.1
Use standard macros for page calculations.
Signed-off-by: Robert Richter <[email protected]>
Reviewed-by: James Morse <[email protected]>
Reviewed-by: Mauro Carvalho Chehab <[email protected]>
---
drivers/edac/ghes_edac.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/drivers/edac/ghes_edac.c b/drivers/edac/ghes_edac.c
index af27f8063891..944c9d2f0eef 100644
--- a/drivers/edac/ghes_edac.c
+++ b/drivers/edac/ghes_edac.c
@@ -319,8 +319,8 @@ void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err)
/* Error address */
if (mem_err->validation_bits & CPER_MEM_VALID_PA) {
- e->page_frame_number = mem_err->physical_addr >> PAGE_SHIFT;
- e->offset_in_page = mem_err->physical_addr & ~PAGE_MASK;
+ e->page_frame_number = PHYS_PFN(mem_err->physical_addr);
+ e->offset_in_page = offset_in_page(mem_err->physical_addr);
}
/* Error grain */
--
2.20.1
detail_location[] is used to collect two location strings so they can
be passed as one to trace_mc_event(). Instead of having an extra copy
step, assemble the location string in other_detail[] from the
beginning.
Using other_detail[] to call trace_mc_event() is now the same as in
edac_mc.c and code can be unified.
Signed-off-by: Robert Richter <[email protected]>
Reviewed-by: James Morse <[email protected]>
Reviewed-by: Mauro Carvalho Chehab <[email protected]>
---
drivers/edac/ghes_edac.c | 9 ++++-----
1 file changed, 4 insertions(+), 5 deletions(-)
diff --git a/drivers/edac/ghes_edac.c b/drivers/edac/ghes_edac.c
index 4acb456c20f7..cb1ab44361f0 100644
--- a/drivers/edac/ghes_edac.c
+++ b/drivers/edac/ghes_edac.c
@@ -21,8 +21,7 @@ struct ghes_edac_pvt {
struct mem_ctl_info *mci;
/* Buffers for the error handling routine */
- char detail_location[240];
- char other_detail[160];
+ char other_detail[400];
char msg[80];
};
@@ -369,6 +368,8 @@ void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err)
/* All other fields are mapped on e->other_detail */
p = pvt->other_detail;
+ p += snprintf(p, sizeof(pvt->other_detail),
+ "APEI location: %s ", e->location);
if (mem_err->validation_bits & CPER_MEM_VALID_ERROR_STATUS) {
u64 status = mem_err->error_status;
@@ -449,12 +450,10 @@ void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err)
grain_bits = fls_long(e->grain - 1);
/* Generate the trace event */
- snprintf(pvt->detail_location, sizeof(pvt->detail_location),
- "APEI location: %s %s", e->location, e->other_detail);
trace_mc_event(type, e->msg, e->label, e->error_count,
mci->mc_idx, e->top_layer, e->mid_layer, e->low_layer,
(e->page_frame_number << PAGE_SHIFT) | e->offset_in_page,
- grain_bits, e->syndrome, pvt->detail_location);
+ grain_bits, e->syndrome, e->other_detail);
edac_raw_mc_handle_error(type, mci, e);
unlock:
--
2.20.1
The current code to convert a physical address mask to a grain
(defined as granularity in bytes) is:
e->grain = ~(mem_err->physical_addr_mask & ~PAGE_MASK);
This is broken in several ways:
1) It calculates wrong grain values. E.g., a physical address mask
of ~0xfff should give a grain of 0x1000. Even without considering
PAGE_MASK, there is an off-by-one. Things get worse when the mask is
also filtered with ~PAGE_MASK: the result is a grain with the upper
bits set. In the example it even evaluates to ~0.
2) The grain does not depend on and is unrelated to the kernel's
page-size. The page-size only matters when unmapping memory in
memory_failure(). Smaller grains are wrongly rounded up to the
page-size. On architectures with a configurable page-size (e.g. arm64)
this could round up to the even bigger page-size of the hypervisor.
Fix this with:
e->grain = ~mem_err->physical_addr_mask + 1;
The grain_bits are defined as:
grain = 1 << grain_bits;
Change also the grain_bits calculation accordingly, it is the same
formula as in edac_mc.c now and the code can be unified.
The value in ->physical_addr_mask coming from firmware is assumed to
be contiguous, but this is not sanity-checked. However, in case the
mask is non-contiguous, a conversion to grain_bits effectively
converts the grain bit mask to a power of 2 by rounding up.
Suggested-by: James Morse <[email protected]>
Signed-off-by: Robert Richter <[email protected]>
Reviewed-by: Mauro Carvalho Chehab <[email protected]>
---
drivers/edac/ghes_edac.c | 10 ++++++++--
1 file changed, 8 insertions(+), 2 deletions(-)
diff --git a/drivers/edac/ghes_edac.c b/drivers/edac/ghes_edac.c
index 944c9d2f0eef..4acb456c20f7 100644
--- a/drivers/edac/ghes_edac.c
+++ b/drivers/edac/ghes_edac.c
@@ -230,6 +230,7 @@ void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err)
/* Cleans the error report buffer */
memset(e, 0, sizeof (*e));
e->error_count = 1;
+ e->grain = 1;
strcpy(e->label, "unknown label");
e->msg = pvt->msg;
e->other_detail = pvt->other_detail;
@@ -325,7 +326,7 @@ void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err)
/* Error grain */
if (mem_err->validation_bits & CPER_MEM_VALID_PA_MASK)
- e->grain = ~(mem_err->physical_addr_mask & ~PAGE_MASK);
+ e->grain = ~mem_err->physical_addr_mask + 1;
/* Memory error location, mapped on e->location */
p = e->location;
@@ -441,8 +442,13 @@ void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err)
if (p > pvt->other_detail)
*(p - 1) = '\0';
+ /* Sanity-check driver-supplied grain value. */
+ if (WARN_ON_ONCE(!e->grain))
+ e->grain = 1;
+
+ grain_bits = fls_long(e->grain - 1);
+
/* Generate the trace event */
- grain_bits = fls_long(e->grain);
snprintf(pvt->detail_location, sizeof(pvt->detail_location),
"APEI location: %s %s", e->location, e->other_detail);
trace_mc_event(type, e->msg, e->label, e->error_count,
--
2.20.1
Update on CPER DIMM naming convention and DIMM ranks.
Signed-off-by: Robert Richter <[email protected]>
Reviewed-by: Mauro Carvalho Chehab <[email protected]>
---
Documentation/admin-guide/ras.rst | 31 +++++++++++++++++++------------
1 file changed, 19 insertions(+), 12 deletions(-)
diff --git a/Documentation/admin-guide/ras.rst b/Documentation/admin-guide/ras.rst
index 2b20f5f7380d..26e02a59f0f4 100644
--- a/Documentation/admin-guide/ras.rst
+++ b/Documentation/admin-guide/ras.rst
@@ -330,9 +330,12 @@ There can be multiple csrows and multiple channels.
.. [#f4] Nowadays, the term DIMM (Dual In-line Memory Module) is widely
used to refer to a memory module, although there are other memory
- packaging alternatives, like SO-DIMM, SIMM, etc. Along this document,
- and inside the EDAC system, the term "dimm" is used for all memory
- modules, even when they use a different kind of packaging.
+ packaging alternatives, like SO-DIMM, SIMM, etc. The UEFI
+ specification (Version 2.7) defines a memory module in the Common
+ Platform Error Record (CPER) section to be an SMBIOS Memory Device
+ (Type 17). Along this document, and inside the EDAC system, the term
+ "dimm" is used for all memory modules, even when they use a
+ different kind of packaging.
Memory controllers allow for several csrows, with 8 csrows being a
typical value. Yet, the actual number of csrows depends on the layout of
@@ -349,12 +352,14 @@ controllers. The following example will assume 2 channels:
| | ``ch0`` | ``ch1`` |
+============+===========+===========+
| ``csrow0`` | DIMM_A0 | DIMM_B0 |
- +------------+ | |
- | ``csrow1`` | | |
+ | | rank0 | rank0 |
+ +------------+ - | - |
+ | ``csrow1`` | rank1 | rank1 |
+------------+-----------+-----------+
| ``csrow2`` | DIMM_A1 | DIMM_B1 |
- +------------+ | |
- | ``csrow3`` | | |
+ | | rank0 | rank0 |
+ +------------+ - | - |
+ | ``csrow3`` | rank1 | rank1 |
+------------+-----------+-----------+
In the above example, there are 4 physical slots on the motherboard
@@ -374,11 +379,13 @@ which the memory DIMM is placed. Thus, when 1 DIMM is placed in each
Channel, the csrows cross both DIMMs.
Memory DIMMs come single or dual "ranked". A rank is a populated csrow.
-Thus, 2 single ranked DIMMs, placed in slots DIMM_A0 and DIMM_B0 above
-will have just one csrow (csrow0). csrow1 will be empty. On the other
-hand, when 2 dual ranked DIMMs are similarly placed, then both csrow0
-and csrow1 will be populated. The pattern repeats itself for csrow2 and
-csrow3.
+In the example above 2 dual ranked DIMMs are similarly placed. Thus,
+both csrow0 and csrow1 are populated. On the other hand, when 2 single
+ranked DIMMs are placed in slots DIMM_A0 and DIMM_B0, then they will
+have just one csrow (csrow0) and csrow1 will be empty. The pattern
+repeats itself for csrow2 and csrow3. Also note that some memory
+controllers don't have any logic to identify the memory module, see
+``rankX`` directories below.
The representation of the above is reflected in the directory
tree in EDAC's sysfs interface. Starting in directory
--
2.20.1
The code in ghes_edac.c and edac_mc.c for grain_bits calculation and
calling trace_mc_event() is now the same. Move it to a single location
in edac_raw_mc_handle_error().
The only difference is that ghes_edac.c lacks the IS_ENABLED(CONFIG_RAS)
check, but it is needed for ghes too.
Signed-off-by: Robert Richter <[email protected]>
Reviewed-by: Mauro Carvalho Chehab <[email protected]>
---
drivers/edac/edac_mc.c | 30 +++++++++++++++---------------
drivers/edac/ghes_edac.c | 13 -------------
2 files changed, 15 insertions(+), 28 deletions(-)
diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c
index f76252b7a787..b6032f51338e 100644
--- a/drivers/edac/edac_mc.c
+++ b/drivers/edac/edac_mc.c
@@ -1057,6 +1057,21 @@ void edac_raw_mc_handle_error(const enum hw_event_mc_err_type type,
{
char detail[80];
int pos[EDAC_MAX_LAYERS] = { e->top_layer, e->mid_layer, e->low_layer };
+ u8 grain_bits;
+
+ /* Sanity-check driver-supplied grain value. */
+ if (WARN_ON_ONCE(!e->grain))
+ e->grain = 1;
+
+ grain_bits = fls_long(e->grain - 1);
+
+ /* Report the error via the trace interface */
+ if (IS_ENABLED(CONFIG_RAS))
+ trace_mc_event(type, e->msg, e->label, e->error_count,
+ mci->mc_idx, e->top_layer, e->mid_layer,
+ e->low_layer,
+ (e->page_frame_number << PAGE_SHIFT) | e->offset_in_page,
+ grain_bits, e->syndrome, e->other_detail);
/* Memory type dependent details about the error */
if (type == HW_EVENT_ERR_CORRECTED) {
@@ -1097,7 +1112,6 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type,
int row = -1, chan = -1;
int pos[EDAC_MAX_LAYERS] = { top_layer, mid_layer, low_layer };
int i, n_labels = 0;
- u8 grain_bits;
struct edac_raw_error_desc *e = &mci->error_desc;
edac_dbg(3, "MC%d\n", mci->mc_idx);
@@ -1235,20 +1249,6 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type,
if (p > e->location)
*(p - 1) = '\0';
- /* Sanity-check driver-supplied grain value. */
- if (WARN_ON_ONCE(!e->grain))
- e->grain = 1;
-
- grain_bits = fls_long(e->grain - 1);
-
- /* Report the error via the trace interface */
- if (IS_ENABLED(CONFIG_RAS))
- trace_mc_event(type, e->msg, e->label, e->error_count,
- mci->mc_idx, e->top_layer, e->mid_layer,
- e->low_layer,
- (e->page_frame_number << PAGE_SHIFT) | e->offset_in_page,
- grain_bits, e->syndrome, e->other_detail);
-
edac_raw_mc_handle_error(type, mci, e);
}
EXPORT_SYMBOL_GPL(edac_mc_handle_error);
diff --git a/drivers/edac/ghes_edac.c b/drivers/edac/ghes_edac.c
index cb1ab44361f0..725b9c58c028 100644
--- a/drivers/edac/ghes_edac.c
+++ b/drivers/edac/ghes_edac.c
@@ -207,7 +207,6 @@ void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err)
struct ghes_edac_pvt *pvt;
unsigned long flags;
char *p;
- u8 grain_bits;
/*
* We can do the locking below because GHES defers error processing
@@ -443,18 +442,6 @@ void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err)
if (p > pvt->other_detail)
*(p - 1) = '\0';
- /* Sanity-check driver-supplied grain value. */
- if (WARN_ON_ONCE(!e->grain))
- e->grain = 1;
-
- grain_bits = fls_long(e->grain - 1);
-
- /* Generate the trace event */
- trace_mc_event(type, e->msg, e->label, e->error_count,
- mci->mc_idx, e->top_layer, e->mid_layer, e->low_layer,
- (e->page_frame_number << PAGE_SHIFT) | e->offset_in_page,
- grain_bits, e->syndrome, e->other_detail);
-
edac_raw_mc_handle_error(type, mci, e);
unlock:
spin_unlock_irqrestore(&ghes_lock, flags);
--
2.20.1
The error handling functions have the pos[] array argument for
determining the dimm handle. Rework those functions to use the dimm
handle directly.
Signed-off-by: Robert Richter <[email protected]>
Reviewed-by: Mauro Carvalho Chehab <[email protected]>
---
drivers/edac/edac_mc.c | 28 +++++++++++++---------------
drivers/edac/edac_mc.h | 2 ++
drivers/edac/ghes_edac.c | 6 +++++-
3 files changed, 20 insertions(+), 16 deletions(-)
diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c
index bbe37af487c3..e4a11218009b 100644
--- a/drivers/edac/edac_mc.c
+++ b/drivers/edac/edac_mc.c
@@ -925,11 +925,9 @@ const char *edac_layer_name[] = {
EXPORT_SYMBOL_GPL(edac_layer_name);
static void edac_inc_ce_error(struct mem_ctl_info *mci,
- const int pos[EDAC_MAX_LAYERS],
+ struct dimm_info *dimm,
const u16 count)
{
- struct dimm_info *dimm = edac_get_dimm(mci, pos[0], pos[1], pos[2]);
-
mci->ce_mc += count;
if (dimm)
@@ -939,11 +937,9 @@ static void edac_inc_ce_error(struct mem_ctl_info *mci,
}
static void edac_inc_ue_error(struct mem_ctl_info *mci,
- const int pos[EDAC_MAX_LAYERS],
- const u16 count)
+ struct dimm_info *dimm,
+ const u16 count)
{
- struct dimm_info *dimm = edac_get_dimm(mci, pos[0], pos[1], pos[2]);
-
mci->ue_mc += count;
if (dimm)
@@ -953,8 +949,8 @@ static void edac_inc_ue_error(struct mem_ctl_info *mci,
}
static void edac_ce_error(struct mem_ctl_info *mci,
+ struct dimm_info *dimm,
const u16 error_count,
- const int pos[EDAC_MAX_LAYERS],
const char *msg,
const char *location,
const char *label,
@@ -982,7 +978,7 @@ static void edac_ce_error(struct mem_ctl_info *mci,
error_count, msg, msg_aux, label,
location, detail);
}
- edac_inc_ce_error(mci, pos, error_count);
+ edac_inc_ce_error(mci, dimm, error_count);
if (mci->scrub_mode == SCRUB_SW_SRC) {
/*
@@ -1006,8 +1002,8 @@ static void edac_ce_error(struct mem_ctl_info *mci,
}
static void edac_ue_error(struct mem_ctl_info *mci,
+ struct dimm_info *dimm,
const u16 error_count,
- const int pos[EDAC_MAX_LAYERS],
const char *msg,
const char *location,
const char *label,
@@ -1041,15 +1037,15 @@ static void edac_ue_error(struct mem_ctl_info *mci,
msg, msg_aux, label, location, detail);
}
- edac_inc_ue_error(mci, pos, error_count);
+ edac_inc_ue_error(mci, dimm, error_count);
}
void edac_raw_mc_handle_error(const enum hw_event_mc_err_type type,
struct mem_ctl_info *mci,
+ struct dimm_info *dimm,
struct edac_raw_error_desc *e)
{
char detail[80];
- int pos[EDAC_MAX_LAYERS] = { e->top_layer, e->mid_layer, e->low_layer };
u8 grain_bits;
/* Sanity-check driver-supplied grain value. */
@@ -1072,7 +1068,7 @@ void edac_raw_mc_handle_error(const enum hw_event_mc_err_type type,
"page:0x%lx offset:0x%lx grain:%ld syndrome:0x%lx",
e->page_frame_number, e->offset_in_page,
e->grain, e->syndrome);
- edac_ce_error(mci, e->error_count, pos, e->msg, e->location,
+ edac_ce_error(mci, dimm, e->error_count, e->msg, e->location,
e->label, detail, e->other_detail,
e->page_frame_number, e->offset_in_page, e->grain);
} else {
@@ -1080,7 +1076,7 @@ void edac_raw_mc_handle_error(const enum hw_event_mc_err_type type,
"page:0x%lx offset:0x%lx grain:%ld",
e->page_frame_number, e->offset_in_page, e->grain);
- edac_ue_error(mci, e->error_count, pos, e->msg, e->location,
+ edac_ue_error(mci, dimm, e->error_count, e->msg, e->location,
e->label, detail, e->other_detail);
}
@@ -1244,6 +1240,8 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type,
if (p > e->location)
*(p - 1) = '\0';
- edac_raw_mc_handle_error(type, mci, e);
+ dimm = edac_get_dimm(mci, top_layer, mid_layer, low_layer);
+
+ edac_raw_mc_handle_error(type, mci, dimm, e);
}
EXPORT_SYMBOL_GPL(edac_mc_handle_error);
diff --git a/drivers/edac/edac_mc.h b/drivers/edac/edac_mc.h
index 02aac5c61d00..2c3e2fbcedc4 100644
--- a/drivers/edac/edac_mc.h
+++ b/drivers/edac/edac_mc.h
@@ -214,6 +214,7 @@ extern int edac_mc_find_csrow_by_page(struct mem_ctl_info *mci,
*
* @type: severity of the error (CE/UE/Fatal)
* @mci: a struct mem_ctl_info pointer
+ * @dimm: a struct dimm_info pointer
* @e: error description
*
* This raw function is used internally by edac_mc_handle_error(). It should
@@ -222,6 +223,7 @@ extern int edac_mc_find_csrow_by_page(struct mem_ctl_info *mci,
*/
void edac_raw_mc_handle_error(const enum hw_event_mc_err_type type,
struct mem_ctl_info *mci,
+ struct dimm_info *dimm,
struct edac_raw_error_desc *e);
/**
diff --git a/drivers/edac/ghes_edac.c b/drivers/edac/ghes_edac.c
index 74017da1f72c..6eebaf28e31c 100644
--- a/drivers/edac/ghes_edac.c
+++ b/drivers/edac/ghes_edac.c
@@ -201,6 +201,7 @@ static void ghes_edac_dmidecode(const struct dmi_header *dh, void *arg)
void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err)
{
+ struct dimm_info *dimm;
enum hw_event_mc_err_type type;
struct edac_raw_error_desc *e;
struct mem_ctl_info *mci;
@@ -439,7 +440,10 @@ void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err)
if (p > pvt->other_detail)
*(p - 1) = '\0';
- edac_raw_mc_handle_error(type, mci, e);
+ dimm = edac_get_dimm_by_index(mci, e->top_layer);
+
+ edac_raw_mc_handle_error(type, mci, dimm, e);
+
unlock:
spin_unlock_irqrestore(&ghes_lock, flags);
}
--
2.20.1
Have a separate function to count errors in csrow/channel. This better
separates code and reduces the indentation level. No functional
changes.
Signed-off-by: Robert Richter <[email protected]>
---
drivers/edac/edac_mc.c | 40 +++++++++++++++++++++++++---------------
1 file changed, 25 insertions(+), 15 deletions(-)
diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c
index 93eac968678e..34b740036ffc 100644
--- a/drivers/edac/edac_mc.c
+++ b/drivers/edac/edac_mc.c
@@ -1045,6 +1045,26 @@ static struct mem_ctl_info *error_desc_to_mci(struct edac_raw_error_desc *e)
return container_of(e, struct mem_ctl_info, error_desc);
}
+static void edac_inc_csrow(struct edac_raw_error_desc *e, int row, int chan)
+{
+ struct mem_ctl_info *mci = error_desc_to_mci(e);
+ u16 count = e->error_count;
+ enum hw_event_mc_err_type type = e->type;
+
+ if (row < 0)
+ return;
+
+ edac_dbg(4, "csrow/channel to increment: (%d,%d)\n", row, chan);
+
+ if (type == HW_EVENT_ERR_CORRECTED) {
+ mci->csrows[row]->ce_count += count;
+ if (chan >= 0)
+ mci->csrows[row]->channels[chan]->ce_count += count;
+ } else {
+ mci->csrows[row]->ue_count += count;
+ }
+}
+
void edac_raw_mc_handle_error(struct edac_raw_error_desc *e,
struct dimm_info *dimm)
{
@@ -1214,22 +1234,12 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type,
chan = -2;
}
- if (any_memory) {
+ if (any_memory)
strcpy(e->label, "any memory");
- } else {
- edac_dbg(4, "csrow/channel to increment: (%d,%d)\n", row, chan);
- if (p == e->label)
- strcpy(e->label, "unknown memory");
- if (type == HW_EVENT_ERR_CORRECTED) {
- if (row >= 0) {
- mci->csrows[row]->ce_count += error_count;
- if (chan >= 0)
- mci->csrows[row]->channels[chan]->ce_count += error_count;
- }
- } else
- if (row >= 0)
- mci->csrows[row]->ue_count += error_count;
- }
+ else if (!*e->label)
+ strcpy(e->label, "unknown memory");
+
+ edac_inc_csrow(e, row, chan);
/* Fill the RAM location data */
p = e->location;
--
2.20.1
Store the error type in struct edac_raw_error_desc. This makes the
type parameter of edac_raw_mc_handle_error() obsolete.
Signed-off-by: Robert Richter <[email protected]>
---
drivers/edac/edac_mc.c | 10 +++++-----
drivers/edac/edac_mc.h | 4 +---
drivers/edac/ghes_edac.c | 11 +++++------
include/linux/edac.h | 1 +
4 files changed, 12 insertions(+), 14 deletions(-)
diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c
index e4a11218009b..7b4f5e98dfe8 100644
--- a/drivers/edac/edac_mc.c
+++ b/drivers/edac/edac_mc.c
@@ -1040,8 +1040,7 @@ static void edac_ue_error(struct mem_ctl_info *mci,
edac_inc_ue_error(mci, dimm, error_count);
}
-void edac_raw_mc_handle_error(const enum hw_event_mc_err_type type,
- struct mem_ctl_info *mci,
+void edac_raw_mc_handle_error(struct mem_ctl_info *mci,
struct dimm_info *dimm,
struct edac_raw_error_desc *e)
{
@@ -1056,14 +1055,14 @@ void edac_raw_mc_handle_error(const enum hw_event_mc_err_type type,
/* Report the error via the trace interface */
if (IS_ENABLED(CONFIG_RAS))
- trace_mc_event(type, e->msg, e->label, e->error_count,
+ trace_mc_event(e->type, e->msg, e->label, e->error_count,
mci->mc_idx, e->top_layer, e->mid_layer,
e->low_layer,
(e->page_frame_number << PAGE_SHIFT) | e->offset_in_page,
grain_bits, e->syndrome, e->other_detail);
/* Memory type dependent details about the error */
- if (type == HW_EVENT_ERR_CORRECTED) {
+ if (e->type == HW_EVENT_ERR_CORRECTED) {
snprintf(detail, sizeof(detail),
"page:0x%lx offset:0x%lx grain:%ld syndrome:0x%lx",
e->page_frame_number, e->offset_in_page,
@@ -1109,6 +1108,7 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type,
/* Fills the error report buffer */
memset(e, 0, sizeof (*e));
e->error_count = error_count;
+ e->type = type;
e->top_layer = top_layer;
e->mid_layer = mid_layer;
e->low_layer = low_layer;
@@ -1242,6 +1242,6 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type,
dimm = edac_get_dimm(mci, top_layer, mid_layer, low_layer);
- edac_raw_mc_handle_error(type, mci, dimm, e);
+ edac_raw_mc_handle_error(mci, dimm, e);
}
EXPORT_SYMBOL_GPL(edac_mc_handle_error);
diff --git a/drivers/edac/edac_mc.h b/drivers/edac/edac_mc.h
index 2c3e2fbcedc4..a8f1b5b5e873 100644
--- a/drivers/edac/edac_mc.h
+++ b/drivers/edac/edac_mc.h
@@ -212,7 +212,6 @@ extern int edac_mc_find_csrow_by_page(struct mem_ctl_info *mci,
* edac_raw_mc_handle_error() - Reports a memory event to userspace without
* doing anything to discover the error location.
*
- * @type: severity of the error (CE/UE/Fatal)
* @mci: a struct mem_ctl_info pointer
* @dimm: a struct dimm_info pointer
* @e: error description
@@ -221,8 +220,7 @@ extern int edac_mc_find_csrow_by_page(struct mem_ctl_info *mci,
* only be called directly when the hardware error come directly from BIOS,
* like in the case of APEI GHES driver.
*/
-void edac_raw_mc_handle_error(const enum hw_event_mc_err_type type,
- struct mem_ctl_info *mci,
+void edac_raw_mc_handle_error(struct mem_ctl_info *mci,
struct dimm_info *dimm,
struct edac_raw_error_desc *e);
diff --git a/drivers/edac/ghes_edac.c b/drivers/edac/ghes_edac.c
index 6eebaf28e31c..7d325d70d6d3 100644
--- a/drivers/edac/ghes_edac.c
+++ b/drivers/edac/ghes_edac.c
@@ -202,7 +202,6 @@ static void ghes_edac_dmidecode(const struct dmi_header *dh, void *arg)
void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err)
{
struct dimm_info *dimm;
- enum hw_event_mc_err_type type;
struct edac_raw_error_desc *e;
struct mem_ctl_info *mci;
struct ghes_edac_pvt *pvt;
@@ -241,17 +240,17 @@ void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err)
switch (sev) {
case GHES_SEV_CORRECTED:
- type = HW_EVENT_ERR_CORRECTED;
+ e->type = HW_EVENT_ERR_CORRECTED;
break;
case GHES_SEV_RECOVERABLE:
- type = HW_EVENT_ERR_UNCORRECTED;
+ e->type = HW_EVENT_ERR_UNCORRECTED;
break;
case GHES_SEV_PANIC:
- type = HW_EVENT_ERR_FATAL;
+ e->type = HW_EVENT_ERR_FATAL;
break;
default:
case GHES_SEV_NO:
- type = HW_EVENT_ERR_INFO;
+ e->type = HW_EVENT_ERR_INFO;
}
edac_dbg(1, "error validation_bits: 0x%08llx\n",
@@ -442,7 +441,7 @@ void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err)
dimm = edac_get_dimm_by_index(mci, e->top_layer);
- edac_raw_mc_handle_error(type, mci, dimm, e);
+ edac_raw_mc_handle_error(mci, dimm, e);
unlock:
spin_unlock_irqrestore(&ghes_lock, flags);
diff --git a/include/linux/edac.h b/include/linux/edac.h
index 4d9673954856..587c53b87fdf 100644
--- a/include/linux/edac.h
+++ b/include/linux/edac.h
@@ -463,6 +463,7 @@ struct edac_raw_error_desc {
long grain;
u16 error_count;
+ enum hw_event_mc_err_type type;
int top_layer;
int mid_layer;
int low_layer;
--
2.20.1
Each struct mci has its own error descriptor. Create a function
error_desc_to_mci() to determine the corresponding mci from an error
descriptor. This simplifies the parameter list of
edac_raw_mc_handle_error() as the mci pointer does not need to be
passed any longer.
While at it, reorder parameters of edac_raw_mc_handle_error().
Signed-off-by: Robert Richter <[email protected]>
Reviewed-by: Mauro Carvalho Chehab <[email protected]>
---
drivers/edac/edac_mc.c | 13 +++++++++----
drivers/edac/edac_mc.h | 8 +++-----
drivers/edac/ghes_edac.c | 2 +-
3 files changed, 13 insertions(+), 10 deletions(-)
diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c
index 7b4f5e98dfe8..93eac968678e 100644
--- a/drivers/edac/edac_mc.c
+++ b/drivers/edac/edac_mc.c
@@ -1040,10 +1040,15 @@ static void edac_ue_error(struct mem_ctl_info *mci,
edac_inc_ue_error(mci, dimm, error_count);
}
-void edac_raw_mc_handle_error(struct mem_ctl_info *mci,
- struct dimm_info *dimm,
- struct edac_raw_error_desc *e)
+static struct mem_ctl_info *error_desc_to_mci(struct edac_raw_error_desc *e)
+{
+ return container_of(e, struct mem_ctl_info, error_desc);
+}
+
+void edac_raw_mc_handle_error(struct edac_raw_error_desc *e,
+ struct dimm_info *dimm)
{
+ struct mem_ctl_info *mci = error_desc_to_mci(e);
char detail[80];
u8 grain_bits;
@@ -1242,6 +1247,6 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type,
dimm = edac_get_dimm(mci, top_layer, mid_layer, low_layer);
- edac_raw_mc_handle_error(mci, dimm, e);
+ edac_raw_mc_handle_error(e, dimm);
}
EXPORT_SYMBOL_GPL(edac_mc_handle_error);
diff --git a/drivers/edac/edac_mc.h b/drivers/edac/edac_mc.h
index a8f1b5b5e873..3b01d5d9d7bc 100644
--- a/drivers/edac/edac_mc.h
+++ b/drivers/edac/edac_mc.h
@@ -212,17 +212,15 @@ extern int edac_mc_find_csrow_by_page(struct mem_ctl_info *mci,
* edac_raw_mc_handle_error() - Reports a memory event to userspace without
* doing anything to discover the error location.
*
- * @mci: a struct mem_ctl_info pointer
- * @dimm: a struct dimm_info pointer
* @e: error description
+ * @dimm: a struct dimm_info pointer
*
* This raw function is used internally by edac_mc_handle_error(). It should
* only be called directly when the hardware error come directly from BIOS,
* like in the case of APEI GHES driver.
*/
-void edac_raw_mc_handle_error(struct mem_ctl_info *mci,
- struct dimm_info *dimm,
- struct edac_raw_error_desc *e);
+void edac_raw_mc_handle_error(struct edac_raw_error_desc *e,
+ struct dimm_info *dimm);
/**
* edac_mc_handle_error() - Reports a memory event to userspace.
diff --git a/drivers/edac/ghes_edac.c b/drivers/edac/ghes_edac.c
index 7d325d70d6d3..c1bcfdbd6f82 100644
--- a/drivers/edac/ghes_edac.c
+++ b/drivers/edac/ghes_edac.c
@@ -441,7 +441,7 @@ void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err)
dimm = edac_get_dimm_by_index(mci, e->top_layer);
- edac_raw_mc_handle_error(mci, dimm, e);
+ edac_raw_mc_handle_error(e, dimm);
unlock:
spin_unlock_irqrestore(&ghes_lock, flags);
--
2.20.1
Looking at how mci->{ue,ce}_per_layer[EDAC_MAX_LAYERS] is used, it
turns out that only the leaves in the memory hierarchy are consumed
(in sysfs), but not the intermediate layers, e.g.:
count = dimm->mci->ce_per_layer[dimm->mci->n_layers-1][dimm->idx];
These unused counters only add complexity, so remove them. The error
counter values are directly stored in struct dimm_info now.
Signed-off-by: Robert Richter <[email protected]>
---
drivers/edac/edac_mc.c | 106 ++++++++++++-----------------------
drivers/edac/edac_mc_sysfs.c | 20 +++----
drivers/edac/ghes_edac.c | 5 +-
include/linux/edac.h | 7 +--
4 files changed, 47 insertions(+), 91 deletions(-)
diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c
index b6032f51338e..dfc17c565d8f 100644
--- a/drivers/edac/edac_mc.c
+++ b/drivers/edac/edac_mc.c
@@ -315,12 +315,11 @@ struct mem_ctl_info *edac_mc_alloc(unsigned int mc_num,
struct csrow_info *csr;
struct rank_info *chan;
struct dimm_info *dimm;
- u32 *ce_per_layer[EDAC_MAX_LAYERS], *ue_per_layer[EDAC_MAX_LAYERS];
unsigned int pos[EDAC_MAX_LAYERS];
- unsigned int idx, size, tot_dimms = 1, count = 1;
- unsigned int tot_csrows = 1, tot_channels = 1, tot_errcount = 0;
+ unsigned int idx, size, tot_dimms = 1;
+ unsigned int tot_csrows = 1, tot_channels = 1;
void *pvt, *p, *ptr = NULL;
- int i, j, row, chn, n, len;
+ int j, row, chn, n, len;
bool per_rank = false;
if (WARN_ON(n_layers > EDAC_MAX_LAYERS || n_layers == 0))
@@ -346,19 +345,10 @@ struct mem_ctl_info *edac_mc_alloc(unsigned int mc_num,
* stringent as what the compiler would provide if we could simply
* hardcode everything into a single struct.
*/
- mci = edac_align_ptr(&ptr, sizeof(*mci), 1);
- layer = edac_align_ptr(&ptr, sizeof(*layer), n_layers);
- for (i = 0; i < n_layers; i++) {
- count *= layers[i].size;
- edac_dbg(4, "errcount layer %d size %d\n", i, count);
- ce_per_layer[i] = edac_align_ptr(&ptr, sizeof(u32), count);
- ue_per_layer[i] = edac_align_ptr(&ptr, sizeof(u32), count);
- tot_errcount += 2 * count;
- }
-
- edac_dbg(4, "allocating %d error counters\n", tot_errcount);
- pvt = edac_align_ptr(&ptr, sz_pvt, 1);
- size = ((unsigned long)pvt) + sz_pvt;
+ mci = edac_align_ptr(&ptr, sizeof(*mci), 1);
+ layer = edac_align_ptr(&ptr, sizeof(*layer), n_layers);
+ pvt = edac_align_ptr(&ptr, sz_pvt, 1);
+ size = ((unsigned long)pvt) + sz_pvt;
edac_dbg(1, "allocating %u bytes for mci data (%d %s, %d csrows/channels)\n",
size,
@@ -374,10 +364,6 @@ struct mem_ctl_info *edac_mc_alloc(unsigned int mc_num,
* rather than an imaginary chunk of memory located at address 0.
*/
layer = (struct edac_mc_layer *)(((char *)mci) + ((unsigned long)layer));
- for (i = 0; i < n_layers; i++) {
- mci->ce_per_layer[i] = (u32 *)((char *)mci + ((unsigned long)ce_per_layer[i]));
- mci->ue_per_layer[i] = (u32 *)((char *)mci + ((unsigned long)ue_per_layer[i]));
- }
pvt = sz_pvt ? (((char *)mci) + ((unsigned long)pvt)) : NULL;
/* setup index and various internal pointers */
@@ -908,53 +894,31 @@ const char *edac_layer_name[] = {
EXPORT_SYMBOL_GPL(edac_layer_name);
static void edac_inc_ce_error(struct mem_ctl_info *mci,
- bool enable_per_layer_report,
const int pos[EDAC_MAX_LAYERS],
const u16 count)
{
- int i, index = 0;
+ struct dimm_info *dimm = edac_get_dimm(mci, pos[0], pos[1], pos[2]);
mci->ce_mc += count;
- if (!enable_per_layer_report) {
+ if (dimm)
+ dimm->ce_count += count;
+ else
mci->ce_noinfo_count += count;
- return;
- }
-
- for (i = 0; i < mci->n_layers; i++) {
- if (pos[i] < 0)
- break;
- index += pos[i];
- mci->ce_per_layer[i][index] += count;
-
- if (i < mci->n_layers - 1)
- index *= mci->layers[i + 1].size;
- }
}
static void edac_inc_ue_error(struct mem_ctl_info *mci,
- bool enable_per_layer_report,
const int pos[EDAC_MAX_LAYERS],
const u16 count)
{
- int i, index = 0;
+ struct dimm_info *dimm = edac_get_dimm(mci, pos[0], pos[1], pos[2]);
mci->ue_mc += count;
- if (!enable_per_layer_report) {
+ if (dimm)
+ dimm->ue_count += count;
+ else
mci->ue_noinfo_count += count;
- return;
- }
-
- for (i = 0; i < mci->n_layers; i++) {
- if (pos[i] < 0)
- break;
- index += pos[i];
- mci->ue_per_layer[i][index] += count;
-
- if (i < mci->n_layers - 1)
- index *= mci->layers[i + 1].size;
- }
}
static void edac_ce_error(struct mem_ctl_info *mci,
@@ -965,7 +929,6 @@ static void edac_ce_error(struct mem_ctl_info *mci,
const char *label,
const char *detail,
const char *other_detail,
- const bool enable_per_layer_report,
const unsigned long page_frame_number,
const unsigned long offset_in_page,
long grain)
@@ -988,7 +951,7 @@ static void edac_ce_error(struct mem_ctl_info *mci,
error_count, msg, msg_aux, label,
location, detail);
}
- edac_inc_ce_error(mci, enable_per_layer_report, pos, error_count);
+ edac_inc_ce_error(mci, pos, error_count);
if (mci->scrub_mode == SCRUB_SW_SRC) {
/*
@@ -1018,8 +981,7 @@ static void edac_ue_error(struct mem_ctl_info *mci,
const char *location,
const char *label,
const char *detail,
- const char *other_detail,
- const bool enable_per_layer_report)
+ const char *other_detail)
{
char *msg_aux = "";
@@ -1048,7 +1010,7 @@ static void edac_ue_error(struct mem_ctl_info *mci,
msg, msg_aux, label, location, detail);
}
- edac_inc_ue_error(mci, enable_per_layer_report, pos, error_count);
+ edac_inc_ue_error(mci, pos, error_count);
}
void edac_raw_mc_handle_error(const enum hw_event_mc_err_type type,
@@ -1079,16 +1041,16 @@ void edac_raw_mc_handle_error(const enum hw_event_mc_err_type type,
"page:0x%lx offset:0x%lx grain:%ld syndrome:0x%lx",
e->page_frame_number, e->offset_in_page,
e->grain, e->syndrome);
- edac_ce_error(mci, e->error_count, pos, e->msg, e->location, e->label,
- detail, e->other_detail, e->enable_per_layer_report,
+ edac_ce_error(mci, e->error_count, pos, e->msg, e->location,
+ e->label, detail, e->other_detail,
e->page_frame_number, e->offset_in_page, e->grain);
} else {
snprintf(detail, sizeof(detail),
"page:0x%lx offset:0x%lx grain:%ld",
e->page_frame_number, e->offset_in_page, e->grain);
- edac_ue_error(mci, e->error_count, pos, e->msg, e->location, e->label,
- detail, e->other_detail, e->enable_per_layer_report);
+ edac_ue_error(mci, e->error_count, pos, e->msg, e->location,
+ e->label, detail, e->other_detail);
}
@@ -1113,6 +1075,7 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type,
int pos[EDAC_MAX_LAYERS] = { top_layer, mid_layer, low_layer };
int i, n_labels = 0;
struct edac_raw_error_desc *e = &mci->error_desc;
+ bool any_memory = true;
edac_dbg(3, "MC%d\n", mci->mc_idx);
@@ -1130,9 +1093,9 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type,
/*
* Check if the event report is consistent and if the memory
- * location is known. If it is known, enable_per_layer_report will be
- * true, the DIMM(s) label info will be filled and the per-layer
- * error counters will be incremented.
+ * location is known. If it is known, the DIMM(s) label info
+ * will be filled and the DIMM's error counters will be
+ * incremented.
*/
for (i = 0; i < mci->n_layers; i++) {
if (pos[i] >= (int)mci->layers[i].size) {
@@ -1150,7 +1113,7 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type,
pos[i] = -1;
}
if (pos[i] >= 0)
- e->enable_per_layer_report = true;
+ any_memory = false;
}
/*
@@ -1180,16 +1143,17 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type,
e->grain = dimm->grain;
/*
- * If the error is memory-controller wide, there's no need to
- * seek for the affected DIMMs because the whole
- * channel/memory controller/... may be affected.
- * Also, don't show errors for empty DIMM slots.
+ * If the error is memory-controller wide, there's no
+ * need to seek for the affected DIMMs because the
+ * whole channel/memory controller/... may be
+ * affected. Also, don't show errors for empty DIMM
+ * slots.
*/
- if (!e->enable_per_layer_report || !dimm->nr_pages)
+ if (any_memory || !dimm->nr_pages)
continue;
if (n_labels >= EDAC_MAX_LABELS) {
- e->enable_per_layer_report = false;
+ any_memory = true;
break;
}
n_labels++;
@@ -1218,7 +1182,7 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type,
chan = -2;
}
- if (!e->enable_per_layer_report) {
+ if (any_memory) {
strcpy(e->label, "any memory");
} else {
edac_dbg(4, "csrow/channel to increment: (%d,%d)\n", row, chan);
diff --git a/drivers/edac/edac_mc_sysfs.c b/drivers/edac/edac_mc_sysfs.c
index 0367554e7437..8682df2f7f4f 100644
--- a/drivers/edac/edac_mc_sysfs.c
+++ b/drivers/edac/edac_mc_sysfs.c
@@ -556,10 +556,8 @@ static ssize_t dimmdev_ce_count_show(struct device *dev,
char *data)
{
struct dimm_info *dimm = to_dimm(dev);
- u32 count;
- count = dimm->mci->ce_per_layer[dimm->mci->n_layers-1][dimm->idx];
- return sprintf(data, "%u\n", count);
+ return sprintf(data, "%u\n", dimm->ce_count);
}
static ssize_t dimmdev_ue_count_show(struct device *dev,
@@ -567,10 +565,8 @@ static ssize_t dimmdev_ue_count_show(struct device *dev,
char *data)
{
struct dimm_info *dimm = to_dimm(dev);
- u32 count;
- count = dimm->mci->ue_per_layer[dimm->mci->n_layers-1][dimm->idx];
- return sprintf(data, "%u\n", count);
+ return sprintf(data, "%u\n", dimm->ue_count);
}
/* dimm/rank attribute files */
@@ -666,7 +662,9 @@ static ssize_t mci_reset_counters_store(struct device *dev,
const char *data, size_t count)
{
struct mem_ctl_info *mci = to_mci(dev);
- int cnt, row, chan, i;
+ struct dimm_info *dimm;
+ int row, chan;
+
mci->ue_mc = 0;
mci->ce_mc = 0;
mci->ue_noinfo_count = 0;
@@ -682,11 +680,9 @@ static ssize_t mci_reset_counters_store(struct device *dev,
ri->channels[chan]->ce_count = 0;
}
- cnt = 1;
- for (i = 0; i < mci->n_layers; i++) {
- cnt *= mci->layers[i].size;
- memset(mci->ce_per_layer[i], 0, cnt * sizeof(u32));
- memset(mci->ue_per_layer[i], 0, cnt * sizeof(u32));
+ mci_for_each_dimm(mci, dimm) {
+ dimm->ue_count = 0;
+ dimm->ce_count = 0;
}
mci->start_time = jiffies;
diff --git a/drivers/edac/ghes_edac.c b/drivers/edac/ghes_edac.c
index 725b9c58c028..74017da1f72c 100644
--- a/drivers/edac/ghes_edac.c
+++ b/drivers/edac/ghes_edac.c
@@ -356,11 +356,8 @@ void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err)
mem_err->mem_dev_handle);
index = get_dimm_smbios_index(mci, mem_err->mem_dev_handle);
- if (index >= 0) {
+ if (index >= 0)
e->top_layer = index;
- e->enable_per_layer_report = true;
- }
-
}
if (p > e->location)
*(p - 1) = '\0';
diff --git a/include/linux/edac.h b/include/linux/edac.h
index 67be279abd11..4d9673954856 100644
--- a/include/linux/edac.h
+++ b/include/linux/edac.h
@@ -383,6 +383,9 @@ struct dimm_info {
unsigned int csrow, cschannel; /* Points to the old API data */
u16 smbios_handle; /* Handle for SMBIOS type 17 */
+
+ u32 ce_count;
+ u32 ue_count;
};
/**
@@ -453,8 +456,6 @@ struct errcount_attribute_data {
* @location: location of the error
* @label: label of the affected DIMM(s)
* @other_detail: other driver-specific detail about the error
- * @enable_per_layer_report: if false, the error affects all layers
- * (typically, a memory controller error)
*/
struct edac_raw_error_desc {
char location[LOCATION_SIZE];
@@ -470,7 +471,6 @@ struct edac_raw_error_desc {
unsigned long syndrome;
const char *msg;
const char *other_detail;
- bool enable_per_layer_report;
};
/* MEMORY controller information structure
@@ -560,7 +560,6 @@ struct mem_ctl_info {
*/
u32 ce_noinfo_count, ue_noinfo_count;
u32 ue_mc, ce_mc;
- u32 *ce_per_layer[EDAC_MAX_LAYERS], *ue_per_layer[EDAC_MAX_LAYERS];
struct completion complete;
--
2.20.1
Reorder the newly created functions edac_mc_alloc_csrows() and
edac_mc_alloc_dimms() and move them before edac_mc_alloc(). No further
code changes.
Signed-off-by: Robert Richter <[email protected]>
Reviewed-by: Mauro Carvalho Chehab <[email protected]>
---
drivers/edac/edac_mc.c | 171 ++++++++++++++++++++---------------------
1 file changed, 84 insertions(+), 87 deletions(-)
diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c
index 042a4504bb7f..bbe37af487c3 100644
--- a/drivers/edac/edac_mc.c
+++ b/drivers/edac/edac_mc.c
@@ -305,93 +305,6 @@ static void _edac_mc_free(struct mem_ctl_info *mci)
kfree(mci);
}
-static int edac_mc_alloc_csrows(struct mem_ctl_info *mci);
-static int edac_mc_alloc_dimms(struct mem_ctl_info *mci);
-
-struct mem_ctl_info *edac_mc_alloc(unsigned int mc_num,
- unsigned int n_layers,
- struct edac_mc_layer *layers,
- unsigned int sz_pvt)
-{
- struct mem_ctl_info *mci;
- struct edac_mc_layer *layer;
- unsigned int idx, size, tot_dimms = 1;
- unsigned int tot_csrows = 1, tot_channels = 1;
- void *pvt, *ptr = NULL;
- bool per_rank = false;
-
- if (WARN_ON(n_layers > EDAC_MAX_LAYERS || n_layers == 0))
- return NULL;
-
- /*
- * Calculate the total amount of dimms and csrows/cschannels while
- * in the old API emulation mode
- */
- for (idx = 0; idx < n_layers; idx++) {
- tot_dimms *= layers[idx].size;
- if (layers[idx].is_virt_csrow)
- tot_csrows *= layers[idx].size;
- else
- tot_channels *= layers[idx].size;
-
- if (layers[idx].type == EDAC_MC_LAYER_CHIP_SELECT)
- per_rank = true;
- }
-
- /* Figure out the offsets of the various items from the start of an mc
- * structure. We want the alignment of each item to be at least as
- * stringent as what the compiler would provide if we could simply
- * hardcode everything into a single struct.
- */
- mci = edac_align_ptr(&ptr, sizeof(*mci), 1);
- layer = edac_align_ptr(&ptr, sizeof(*layer), n_layers);
- pvt = edac_align_ptr(&ptr, sz_pvt, 1);
- size = ((unsigned long)pvt) + sz_pvt;
-
- edac_dbg(1, "allocating %u bytes for mci data (%d %s, %d csrows/channels)\n",
- size,
- tot_dimms,
- per_rank ? "ranks" : "dimms",
- tot_csrows * tot_channels);
-
- mci = kzalloc(size, GFP_KERNEL);
- if (mci == NULL)
- return NULL;
-
- /* Adjust pointers so they point within the memory we just allocated
- * rather than an imaginary chunk of memory located at address 0.
- */
- layer = (struct edac_mc_layer *)(((char *)mci) + ((unsigned long)layer));
- pvt = sz_pvt ? (((char *)mci) + ((unsigned long)pvt)) : NULL;
-
- /* setup index and various internal pointers */
- mci->mc_idx = mc_num;
- mci->tot_dimms = tot_dimms;
- mci->pvt_info = pvt;
- mci->n_layers = n_layers;
- mci->layers = layer;
- memcpy(mci->layers, layers, sizeof(*layer) * n_layers);
- mci->nr_csrows = tot_csrows;
- mci->num_cschannel = tot_channels;
- mci->csbased = per_rank;
-
- if (edac_mc_alloc_csrows(mci))
- goto error;
-
- if (edac_mc_alloc_dimms(mci))
- goto error;
-
- mci->op_state = OP_ALLOC;
-
- return mci;
-
-error:
- _edac_mc_free(mci);
-
- return NULL;
-}
-EXPORT_SYMBOL_GPL(edac_mc_alloc);
-
static int edac_mc_alloc_csrows(struct mem_ctl_info *mci)
{
unsigned int tot_csrows = mci->nr_csrows;
@@ -520,6 +433,90 @@ static int edac_mc_alloc_dimms(struct mem_ctl_info *mci)
return 0;
}
+struct mem_ctl_info *edac_mc_alloc(unsigned int mc_num,
+ unsigned int n_layers,
+ struct edac_mc_layer *layers,
+ unsigned int sz_pvt)
+{
+ struct mem_ctl_info *mci;
+ struct edac_mc_layer *layer;
+ unsigned int idx, size, tot_dimms = 1;
+ unsigned int tot_csrows = 1, tot_channels = 1;
+ void *pvt, *ptr = NULL;
+ bool per_rank = false;
+
+ if (WARN_ON(n_layers > EDAC_MAX_LAYERS || n_layers == 0))
+ return NULL;
+
+ /*
+ * Calculate the total amount of dimms and csrows/cschannels while
+ * in the old API emulation mode
+ */
+ for (idx = 0; idx < n_layers; idx++) {
+ tot_dimms *= layers[idx].size;
+ if (layers[idx].is_virt_csrow)
+ tot_csrows *= layers[idx].size;
+ else
+ tot_channels *= layers[idx].size;
+
+ if (layers[idx].type == EDAC_MC_LAYER_CHIP_SELECT)
+ per_rank = true;
+ }
+
+ /* Figure out the offsets of the various items from the start of an mc
+ * structure. We want the alignment of each item to be at least as
+ * stringent as what the compiler would provide if we could simply
+ * hardcode everything into a single struct.
+ */
+ mci = edac_align_ptr(&ptr, sizeof(*mci), 1);
+ layer = edac_align_ptr(&ptr, sizeof(*layer), n_layers);
+ pvt = edac_align_ptr(&ptr, sz_pvt, 1);
+ size = ((unsigned long)pvt) + sz_pvt;
+
+ edac_dbg(1, "allocating %u bytes for mci data (%d %s, %d csrows/channels)\n",
+ size,
+ tot_dimms,
+ per_rank ? "ranks" : "dimms",
+ tot_csrows * tot_channels);
+
+ mci = kzalloc(size, GFP_KERNEL);
+ if (mci == NULL)
+ return NULL;
+
+ /* Adjust pointers so they point within the memory we just allocated
+ * rather than an imaginary chunk of memory located at address 0.
+ */
+ layer = (struct edac_mc_layer *)(((char *)mci) + ((unsigned long)layer));
+ pvt = sz_pvt ? (((char *)mci) + ((unsigned long)pvt)) : NULL;
+
+ /* setup index and various internal pointers */
+ mci->mc_idx = mc_num;
+ mci->tot_dimms = tot_dimms;
+ mci->pvt_info = pvt;
+ mci->n_layers = n_layers;
+ mci->layers = layer;
+ memcpy(mci->layers, layers, sizeof(*layer) * n_layers);
+ mci->nr_csrows = tot_csrows;
+ mci->num_cschannel = tot_channels;
+ mci->csbased = per_rank;
+
+ if (edac_mc_alloc_csrows(mci))
+ goto error;
+
+ if (edac_mc_alloc_dimms(mci))
+ goto error;
+
+ mci->op_state = OP_ALLOC;
+
+ return mci;
+
+error:
+ _edac_mc_free(mci);
+
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(edac_mc_alloc);
+
void edac_mc_free(struct mem_ctl_info *mci)
{
edac_dbg(1, "\n");
--
2.20.1
There has never been a function edac_raw_error_desc_clean(), and in
ghes_edac_report_mem_error() the whole struct is zeroed, including the
string arrays. Remove that comment.
Signed-off-by: Robert Richter <[email protected]>
Reviewed-by: Mauro Carvalho Chehab <[email protected]>
---
include/linux/edac.h | 5 -----
1 file changed, 5 deletions(-)
diff --git a/include/linux/edac.h b/include/linux/edac.h
index 8beb6e834be9..67be279abd11 100644
--- a/include/linux/edac.h
+++ b/include/linux/edac.h
@@ -457,15 +457,10 @@ struct errcount_attribute_data {
* (typically, a memory controller error)
*/
struct edac_raw_error_desc {
- /*
- * NOTE: everything before grain won't be cleaned by
- * edac_raw_error_desc_clean()
- */
char location[LOCATION_SIZE];
char label[(EDAC_MC_LABEL_LEN + 1 + sizeof(OTHER_LABEL)) * EDAC_MAX_LABELS];
long grain;
- /* the vars below and grain will be cleaned on every new error report */
u16 error_count;
int top_layer;
int mid_layer;
--
2.20.1
edac_mc_alloc() is huge. Factor out code by moving it to the two new
functions edac_mc_alloc_csrows() and edac_mc_alloc_dimms(). To ease
review, the code is left in place for now and not yet moved within the
file.
Signed-off-by: Robert Richter <[email protected]>
Reviewed-by: Mauro Carvalho Chehab <[email protected]>
---
drivers/edac/edac_mc.c | 104 +++++++++++++++++++++++++++--------------
1 file changed, 69 insertions(+), 35 deletions(-)
diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c
index dfc17c565d8f..042a4504bb7f 100644
--- a/drivers/edac/edac_mc.c
+++ b/drivers/edac/edac_mc.c
@@ -305,6 +305,9 @@ static void _edac_mc_free(struct mem_ctl_info *mci)
kfree(mci);
}
+static int edac_mc_alloc_csrows(struct mem_ctl_info *mci);
+static int edac_mc_alloc_dimms(struct mem_ctl_info *mci);
+
struct mem_ctl_info *edac_mc_alloc(unsigned int mc_num,
unsigned int n_layers,
struct edac_mc_layer *layers,
@@ -312,14 +315,9 @@ struct mem_ctl_info *edac_mc_alloc(unsigned int mc_num,
{
struct mem_ctl_info *mci;
struct edac_mc_layer *layer;
- struct csrow_info *csr;
- struct rank_info *chan;
- struct dimm_info *dimm;
- unsigned int pos[EDAC_MAX_LAYERS];
unsigned int idx, size, tot_dimms = 1;
unsigned int tot_csrows = 1, tot_channels = 1;
- void *pvt, *p, *ptr = NULL;
- int j, row, chn, n, len;
+ void *pvt, *ptr = NULL;
bool per_rank = false;
if (WARN_ON(n_layers > EDAC_MAX_LAYERS || n_layers == 0))
@@ -377,16 +375,43 @@ struct mem_ctl_info *edac_mc_alloc(unsigned int mc_num,
mci->num_cschannel = tot_channels;
mci->csbased = per_rank;
+ if (edac_mc_alloc_csrows(mci))
+ goto error;
+
+ if (edac_mc_alloc_dimms(mci))
+ goto error;
+
+ mci->op_state = OP_ALLOC;
+
+ return mci;
+
+error:
+ _edac_mc_free(mci);
+
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(edac_mc_alloc);
+
+static int edac_mc_alloc_csrows(struct mem_ctl_info *mci)
+{
+ unsigned int tot_csrows = mci->nr_csrows;
+ unsigned int tot_channels = mci->num_cschannel;
+ unsigned int row, chn;
+
/*
* Alocate and fill the csrow/channels structs
*/
mci->csrows = kcalloc(tot_csrows, sizeof(*mci->csrows), GFP_KERNEL);
if (!mci->csrows)
- goto error;
+ return -ENOMEM;
+
for (row = 0; row < tot_csrows; row++) {
+ struct csrow_info *csr;
+
csr = kzalloc(sizeof(**mci->csrows), GFP_KERNEL);
if (!csr)
- goto error;
+ return -ENOMEM;
+
mci->csrows[row] = csr;
csr->csrow_idx = row;
csr->mci = mci;
@@ -394,34 +419,51 @@ struct mem_ctl_info *edac_mc_alloc(unsigned int mc_num,
csr->channels = kcalloc(tot_channels, sizeof(*csr->channels),
GFP_KERNEL);
if (!csr->channels)
- goto error;
+ return -ENOMEM;
for (chn = 0; chn < tot_channels; chn++) {
+ struct rank_info *chan;
+
chan = kzalloc(sizeof(**csr->channels), GFP_KERNEL);
if (!chan)
- goto error;
+ return -ENOMEM;
+
csr->channels[chn] = chan;
chan->chan_idx = chn;
chan->csrow = csr;
}
}
+ return 0;
+}
+
+static int edac_mc_alloc_dimms(struct mem_ctl_info *mci)
+{
+ void *p;
+ unsigned int pos[EDAC_MAX_LAYERS];
+ unsigned int row, chn, idx;
+ int layer;
+
/*
* Allocate and fill the dimm structs
*/
- mci->dimms = kcalloc(tot_dimms, sizeof(*mci->dimms), GFP_KERNEL);
+ mci->dimms = kcalloc(mci->tot_dimms, sizeof(*mci->dimms), GFP_KERNEL);
if (!mci->dimms)
- goto error;
+ return -ENOMEM;
memset(&pos, 0, sizeof(pos));
row = 0;
chn = 0;
- for (idx = 0; idx < tot_dimms; idx++) {
+ for (idx = 0; idx < mci->tot_dimms; idx++) {
+ struct dimm_info *dimm;
+ struct rank_info *chan;
+ int n, len;
+
chan = mci->csrows[row]->channels[chn];
dimm = kzalloc(sizeof(**mci->dimms), GFP_KERNEL);
if (!dimm)
- goto error;
+ return -ENOMEM;
mci->dimms[idx] = dimm;
dimm->mci = mci;
dimm->idx = idx;
@@ -431,16 +473,16 @@ struct mem_ctl_info *edac_mc_alloc(unsigned int mc_num,
*/
len = sizeof(dimm->label);
p = dimm->label;
- n = snprintf(p, len, "mc#%u", mc_num);
+ n = snprintf(p, len, "mc#%u", mci->mc_idx);
p += n;
len -= n;
- for (j = 0; j < n_layers; j++) {
+ for (layer = 0; layer < mci->n_layers; layer++) {
n = snprintf(p, len, "%s#%u",
- edac_layer_name[layers[j].type],
- pos[j]);
+ edac_layer_name[mci->layers[layer].type],
+ pos[layer]);
p += n;
len -= n;
- dimm->location[j] = pos[j];
+ dimm->location[layer] = pos[layer];
if (len <= 0)
break;
@@ -452,39 +494,31 @@ struct mem_ctl_info *edac_mc_alloc(unsigned int mc_num,
dimm->cschannel = chn;
/* Increment csrow location */
- if (layers[0].is_virt_csrow) {
+ if (mci->layers[0].is_virt_csrow) {
chn++;
- if (chn == tot_channels) {
+ if (chn == mci->num_cschannel) {
chn = 0;
row++;
}
} else {
row++;
- if (row == tot_csrows) {
+ if (row == mci->nr_csrows) {
row = 0;
chn++;
}
}
/* Increment dimm location */
- for (j = n_layers - 1; j >= 0; j--) {
- pos[j]++;
- if (pos[j] < layers[j].size)
+ for (layer = mci->n_layers - 1; layer >= 0; layer--) {
+ pos[layer]++;
+ if (pos[layer] < mci->layers[layer].size)
break;
- pos[j] = 0;
+ pos[layer] = 0;
}
}
- mci->op_state = OP_ALLOC;
-
- return mci;
-
-error:
- _edac_mc_free(mci);
-
- return NULL;
+ return 0;
}
-EXPORT_SYMBOL_GPL(edac_mc_alloc);
void edac_mc_free(struct mem_ctl_info *mci)
{
--
2.20.1
strcpy() has already written the terminating NUL and strlen() has been
used to advance the pointer to it, so the buffer is already
zero-terminated at p. Remove the needless zero string termination.
Suggested-by: Joe Perches <[email protected]>
Signed-off-by: Robert Richter <[email protected]>
---
drivers/edac/edac_mc.c | 1 -
1 file changed, 1 deletion(-)
diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c
index f2cbca77bc50..8bfe76d1bdf1 100644
--- a/drivers/edac/edac_mc.c
+++ b/drivers/edac/edac_mc.c
@@ -1183,7 +1183,6 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type,
}
strcpy(p, dimm->label);
p += strlen(p);
- *p = '\0';
/*
* get csrow/channel of the DIMM, in order to allow
--
2.20.1
The EDAC_DIMM_PTR() macro takes 3 arguments from struct mem_ctl_info.
Clean up this interface to only pass the mci struct and replace this
macro with the new function edac_get_dimm().
Also introduce the edac_get_dimm_by_index() function for later use.
It returns a DIMM pointer given only its index in the memory
controller's DIMM array. This is useful if the DIMM's position within
the layers of the memory controller or the exact size of the layers is
unknown.
Small style changes were made to some hunks after applying the
semantic patch.
Semantic patch used:
@@ expression mci, a, b,c; @@
-EDAC_DIMM_PTR(mci->layers, mci->dimms, mci->n_layers, a, b, c)
+edac_get_dimm(mci, a, b, c)
Signed-off-by: Robert Richter <[email protected]>
---
drivers/edac/ghes_edac.c | 7 +--
drivers/edac/i10nm_base.c | 3 +-
drivers/edac/i3200_edac.c | 3 +-
drivers/edac/i5000_edac.c | 5 +--
drivers/edac/i5100_edac.c | 3 +-
drivers/edac/i5400_edac.c | 3 +-
drivers/edac/i7300_edac.c | 3 +-
drivers/edac/i7core_edac.c | 3 +-
drivers/edac/ie31200_edac.c | 7 +--
drivers/edac/pnd2_edac.c | 4 +-
drivers/edac/sb_edac.c | 2 +-
drivers/edac/skx_base.c | 3 +-
drivers/edac/ti_edac.c | 2 +-
include/linux/edac.h | 88 ++++++++++++++++++++++++-------------
14 files changed, 74 insertions(+), 62 deletions(-)
diff --git a/drivers/edac/ghes_edac.c b/drivers/edac/ghes_edac.c
index 5da85ef7966d..d92cd99081d2 100644
--- a/drivers/edac/ghes_edac.c
+++ b/drivers/edac/ghes_edac.c
@@ -106,9 +106,7 @@ static void ghes_edac_dmidecode(const struct dmi_header *dh, void *arg)
if (dh->type == DMI_ENTRY_MEM_DEVICE) {
struct memdev_dmi_entry *entry = (struct memdev_dmi_entry *)dh;
- struct dimm_info *dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms,
- mci->n_layers,
- dimm_fill->count, 0, 0);
+ struct dimm_info *dimm = edac_get_dimm(mci, dimm_fill->count, 0, 0);
u16 rdr_mask = BIT(7) | BIT(13);
if (entry->size == 0xffff) {
@@ -543,8 +541,7 @@ int ghes_edac_register(struct ghes *ghes, struct device *dev)
dimm_fill.mci = mci;
dmi_walk(ghes_edac_dmidecode, &dimm_fill);
} else {
- struct dimm_info *dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms,
- mci->n_layers, 0, 0, 0);
+ struct dimm_info *dimm = edac_get_dimm(mci, 0, 0, 0);
dimm->nr_pages = 1;
dimm->grain = 128;
diff --git a/drivers/edac/i10nm_base.c b/drivers/edac/i10nm_base.c
index c370d5457e6b..059eccf0582b 100644
--- a/drivers/edac/i10nm_base.c
+++ b/drivers/edac/i10nm_base.c
@@ -154,8 +154,7 @@ static int i10nm_get_dimm_config(struct mem_ctl_info *mci)
ndimms = 0;
for (j = 0; j < I10NM_NUM_DIMMS; j++) {
- dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms,
- mci->n_layers, i, j, 0);
+ dimm = edac_get_dimm(mci, i, j, 0);
mtr = I10NM_GET_DIMMMTR(imc, i, j);
mcddrtcfg = I10NM_GET_MCDDRTCFG(imc, i, j);
edac_dbg(1, "dimmmtr 0x%x mcddrtcfg 0x%x (mc%d ch%d dimm%d)\n",
diff --git a/drivers/edac/i3200_edac.c b/drivers/edac/i3200_edac.c
index 299b441647cd..432b375a4075 100644
--- a/drivers/edac/i3200_edac.c
+++ b/drivers/edac/i3200_edac.c
@@ -392,8 +392,7 @@ static int i3200_probe1(struct pci_dev *pdev, int dev_idx)
unsigned long nr_pages;
for (j = 0; j < nr_channels; j++) {
- struct dimm_info *dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms,
- mci->n_layers, i, j, 0);
+ struct dimm_info *dimm = edac_get_dimm(mci, i, j, 0);
nr_pages = drb_to_nr_pages(drbs, stacked, j, i);
if (nr_pages == 0)
diff --git a/drivers/edac/i5000_edac.c b/drivers/edac/i5000_edac.c
index 078a7351bf05..1a6f69c859ab 100644
--- a/drivers/edac/i5000_edac.c
+++ b/drivers/edac/i5000_edac.c
@@ -1275,9 +1275,8 @@ static int i5000_init_csrows(struct mem_ctl_info *mci)
if (!MTR_DIMMS_PRESENT(mtr))
continue;
- dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms, mci->n_layers,
- channel / MAX_BRANCHES,
- channel % MAX_BRANCHES, slot);
+ dimm = edac_get_dimm(mci, channel / MAX_BRANCHES,
+ channel % MAX_BRANCHES, slot);
csrow_megs = pvt->dimm_info[slot][channel].megabytes;
dimm->grain = 8;
diff --git a/drivers/edac/i5100_edac.c b/drivers/edac/i5100_edac.c
index 12bebecb203b..134586753311 100644
--- a/drivers/edac/i5100_edac.c
+++ b/drivers/edac/i5100_edac.c
@@ -858,8 +858,7 @@ static void i5100_init_csrows(struct mem_ctl_info *mci)
if (!npages)
continue;
- dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms, mci->n_layers,
- chan, rank, 0);
+ dimm = edac_get_dimm(mci, chan, rank, 0);
dimm->nr_pages = npages;
dimm->grain = 32;
diff --git a/drivers/edac/i5400_edac.c b/drivers/edac/i5400_edac.c
index 8c86c6fd7da7..f131c05ade9f 100644
--- a/drivers/edac/i5400_edac.c
+++ b/drivers/edac/i5400_edac.c
@@ -1187,8 +1187,7 @@ static int i5400_init_dimms(struct mem_ctl_info *mci)
if (!MTR_DIMMS_PRESENT(mtr))
continue;
- dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms, mci->n_layers,
- channel / 2, channel % 2, slot);
+ dimm = edac_get_dimm(mci, channel / 2, channel % 2, slot);
size_mb = pvt->dimm_info[slot][channel].megabytes;
diff --git a/drivers/edac/i7300_edac.c b/drivers/edac/i7300_edac.c
index 447d357c7a67..2e9bbe56cde9 100644
--- a/drivers/edac/i7300_edac.c
+++ b/drivers/edac/i7300_edac.c
@@ -794,8 +794,7 @@ static int i7300_init_csrows(struct mem_ctl_info *mci)
for (ch = 0; ch < max_channel; ch++) {
int channel = to_channel(ch, branch);
- dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms,
- mci->n_layers, branch, ch, slot);
+ dimm = edac_get_dimm(mci, branch, ch, slot);
dinfo = &pvt->dimm_info[slot][channel];
diff --git a/drivers/edac/i7core_edac.c b/drivers/edac/i7core_edac.c
index a71cca6eeb33..b3135b208f9a 100644
--- a/drivers/edac/i7core_edac.c
+++ b/drivers/edac/i7core_edac.c
@@ -585,8 +585,7 @@ static int get_dimm_config(struct mem_ctl_info *mci)
if (!DIMM_PRESENT(dimm_dod[j]))
continue;
- dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms, mci->n_layers,
- i, j, 0);
+ dimm = edac_get_dimm(mci, i, j, 0);
banks = numbank(MC_DOD_NUMBANK(dimm_dod[j]));
ranks = numrank(MC_DOD_NUMRANK(dimm_dod[j]));
rows = numrow(MC_DOD_NUMROW(dimm_dod[j]));
diff --git a/drivers/edac/ie31200_edac.c b/drivers/edac/ie31200_edac.c
index d26300f9cb07..4f65073f230b 100644
--- a/drivers/edac/ie31200_edac.c
+++ b/drivers/edac/ie31200_edac.c
@@ -490,9 +490,7 @@ static int ie31200_probe1(struct pci_dev *pdev, int dev_idx)
if (dimm_info[j][i].dual_rank) {
nr_pages = nr_pages / 2;
- dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms,
- mci->n_layers, (i * 2) + 1,
- j, 0);
+ dimm = edac_get_dimm(mci, (i * 2) + 1, j, 0);
dimm->nr_pages = nr_pages;
edac_dbg(0, "set nr pages: 0x%lx\n", nr_pages);
dimm->grain = 8; /* just a guess */
@@ -503,8 +501,7 @@ static int ie31200_probe1(struct pci_dev *pdev, int dev_idx)
dimm->dtype = DEV_UNKNOWN;
dimm->edac_mode = EDAC_UNKNOWN;
}
- dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms,
- mci->n_layers, i * 2, j, 0);
+ dimm = edac_get_dimm(mci, i * 2, j, 0);
dimm->nr_pages = nr_pages;
edac_dbg(0, "set nr pages: 0x%lx\n", nr_pages);
dimm->grain = 8; /* same guess */
diff --git a/drivers/edac/pnd2_edac.c b/drivers/edac/pnd2_edac.c
index b1193be1ef1d..933f7722b893 100644
--- a/drivers/edac/pnd2_edac.c
+++ b/drivers/edac/pnd2_edac.c
@@ -1231,7 +1231,7 @@ static void apl_get_dimm_config(struct mem_ctl_info *mci)
if (!(chan_mask & BIT(i)))
continue;
- dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms, mci->n_layers, i, 0, 0);
+ dimm = edac_get_dimm(mci, i, 0, 0);
if (!dimm) {
edac_dbg(0, "No allocated DIMM for channel %d\n", i);
continue;
@@ -1311,7 +1311,7 @@ static void dnv_get_dimm_config(struct mem_ctl_info *mci)
if (!ranks_of_dimm[j])
continue;
- dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms, mci->n_layers, i, j, 0);
+ dimm = edac_get_dimm(mci, i, j, 0);
if (!dimm) {
edac_dbg(0, "No allocated DIMM for channel %d DIMM %d\n", i, j);
continue;
diff --git a/drivers/edac/sb_edac.c b/drivers/edac/sb_edac.c
index a2fd39d330d6..4957e8ee1879 100644
--- a/drivers/edac/sb_edac.c
+++ b/drivers/edac/sb_edac.c
@@ -1621,7 +1621,7 @@ static int __populate_dimms(struct mem_ctl_info *mci,
}
for (j = 0; j < max_dimms_per_channel; j++) {
- dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms, mci->n_layers, i, j, 0);
+ dimm = edac_get_dimm(mci, i, j, 0);
if (pvt->info.type == KNIGHTS_LANDING) {
pci_read_config_dword(pvt->knl.pci_channel[i],
knl_mtr_reg, &mtr);
diff --git a/drivers/edac/skx_base.c b/drivers/edac/skx_base.c
index a8853e724d1f..83545b4facb7 100644
--- a/drivers/edac/skx_base.c
+++ b/drivers/edac/skx_base.c
@@ -189,8 +189,7 @@ static int skx_get_dimm_config(struct mem_ctl_info *mci)
pci_read_config_dword(imc->chan[i].cdev, 0x8C, &amap);
pci_read_config_dword(imc->chan[i].cdev, 0x400, &mcddrtcfg);
for (j = 0; j < SKX_NUM_DIMMS; j++) {
- dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms,
- mci->n_layers, i, j, 0);
+ dimm = edac_get_dimm(mci, i, j, 0);
pci_read_config_dword(imc->chan[i].cdev,
0x80 + 4 * j, &mtr);
if (IS_DIMM_PRESENT(mtr)) {
diff --git a/drivers/edac/ti_edac.c b/drivers/edac/ti_edac.c
index 6ac26d1b929f..8be3e89a510e 100644
--- a/drivers/edac/ti_edac.c
+++ b/drivers/edac/ti_edac.c
@@ -135,7 +135,7 @@ static void ti_edac_setup_dimm(struct mem_ctl_info *mci, u32 type)
u32 val;
u32 memsize;
- dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms, mci->n_layers, 0, 0, 0);
+ dimm = edac_get_dimm(mci, 0, 0, 0);
val = ti_edac_readl(edac, EMIF_SDRAM_CONFIG);
diff --git a/include/linux/edac.h b/include/linux/edac.h
index c19483b90079..696bfb684d92 100644
--- a/include/linux/edac.h
+++ b/include/linux/edac.h
@@ -403,37 +403,6 @@ struct edac_mc_layer {
__i; \
})
-/**
- * EDAC_DIMM_PTR - Macro responsible to get a pointer inside a pointer array
- * for the element given by [layer0,layer1,layer2] position
- *
- * @layers: a struct edac_mc_layer array, describing how many elements
- * were allocated for each layer
- * @var: name of the var where we want to get the pointer
- * (like mci->dimms)
- * @nlayers: Number of layers at the @layers array
- * @layer0: layer0 position
- * @layer1: layer1 position. Unused if n_layers < 2
- * @layer2: layer2 position. Unused if n_layers < 3
- *
- * For 1 layer, this macro returns "var[layer0]";
- *
- * For 2 layers, this macro is similar to allocate a bi-dimensional array
- * and to return "var[layer0][layer1]";
- *
- * For 3 layers, this macro is similar to allocate a tri-dimensional array
- * and to return "var[layer0][layer1][layer2]";
- */
-#define EDAC_DIMM_PTR(layers, var, nlayers, layer0, layer1, layer2) ({ \
- typeof(*var) __p; \
- int ___i = EDAC_DIMM_OFF(layers, nlayers, layer0, layer1, layer2); \
- if (___i < 0) \
- __p = NULL; \
- else \
- __p = (var)[___i]; \
- __p; \
-})
-
struct dimm_info {
struct device dev;
@@ -669,4 +638,61 @@ struct mem_ctl_info {
bool fake_inject_ue;
u16 fake_inject_count;
};
+
+/**
+ * edac_get_dimm_by_index - Get DIMM info from a memory controller
+ * given by an index
+ *
+ * @mci: a struct mem_ctl_info
+ * @index: index in the memory controller's DIMM array
+ *
+ * Returns a struct dimm_info* or NULL on failure.
+ */
+static inline struct dimm_info *
+edac_get_dimm_by_index(struct mem_ctl_info *mci, int index)
+{
+ if (index < 0 || index >= mci->tot_dimms)
+ return NULL;
+
+ return mci->dimms[index];
+}
+
+/**
+ * edac_get_dimm - Get DIMM info from a memory controller given by
+ * [layer0,layer1,layer2] position
+ *
+ * @mci: a struct mem_ctl_info
+ * @layer0: layer0 position
+ * @layer1: layer1 position. Unused if n_layers < 2
+ * @layer2: layer2 position. Unused if n_layers < 3
+ *
+ * For 1 layer, this function returns "dimms[layer0]";
+ *
+ * For 2 layers, this function is similar to allocate a bi-dimensional
+ * array and to return "dimms[layer0][layer1]";
+ *
+ * For 3 layers, this function is similar to allocate a tri-dimensional array
+ * and to return "dimms[layer0][layer1][layer2]";
+ */
+static inline struct dimm_info *edac_get_dimm(struct mem_ctl_info *mci,
+ int layer0, int layer1, int layer2)
+{
+ int index;
+
+ if (layer0 < 0
+ || (mci->n_layers > 1 && layer1 < 0)
+ || (mci->n_layers > 2 && layer2 < 0))
+ return NULL;
+
+ index = layer0;
+
+ if (mci->n_layers > 1)
+ index = index * mci->layers[1].size + layer1;
+
+ if (mci->n_layers > 2)
+ index = index * mci->layers[2].size + layer2;
+
+ return edac_get_dimm_by_index(mci, index);
+}
+
#endif
--
2.20.1
Em Wed, 6 Nov 2019 09:33:02 +0000
Robert Richter <[email protected]> escreveu:
> The EDAC_DIMM_PTR() macro takes 3 arguments from struct mem_ctl_info.
> Clean up this interface to only pass the mci struct and replace this
> macro with the new function edac_get_dimm().
>
> Also introduce the edac_get_dimm_by_index() function for later use.
> This allows it to get a dimm pointer only by a given index. This can
> be useful if the dimm's position within the layers of the memory
> controller or the exact size of the layers are unknown.
>
> Small style changes made for some hunks after applying the semantic
> patch.
>
> Semantic patch used:
>
> @@ expression mci, a, b,c; @@
>
> -EDAC_DIMM_PTR(mci->layers, mci->dimms, mci->n_layers, a, b, c)
> +edac_get_dimm(mci, a, b, c)
>
> Signed-off-by: Robert Richter <[email protected]>
Reviewed-by: Mauro Carvalho Chehab <[email protected]>
> ---
> drivers/edac/ghes_edac.c | 7 +--
> drivers/edac/i10nm_base.c | 3 +-
> drivers/edac/i3200_edac.c | 3 +-
> drivers/edac/i5000_edac.c | 5 +--
> drivers/edac/i5100_edac.c | 3 +-
> drivers/edac/i5400_edac.c | 3 +-
> drivers/edac/i7300_edac.c | 3 +-
> drivers/edac/i7core_edac.c | 3 +-
> drivers/edac/ie31200_edac.c | 7 +--
> drivers/edac/pnd2_edac.c | 4 +-
> drivers/edac/sb_edac.c | 2 +-
> drivers/edac/skx_base.c | 3 +-
> drivers/edac/ti_edac.c | 2 +-
> include/linux/edac.h | 88 ++++++++++++++++++++++++-------------
> 14 files changed, 74 insertions(+), 62 deletions(-)
>
> diff --git a/drivers/edac/ghes_edac.c b/drivers/edac/ghes_edac.c
> index 5da85ef7966d..d92cd99081d2 100644
> --- a/drivers/edac/ghes_edac.c
> +++ b/drivers/edac/ghes_edac.c
> @@ -106,9 +106,7 @@ static void ghes_edac_dmidecode(const struct dmi_header *dh, void *arg)
>
> if (dh->type == DMI_ENTRY_MEM_DEVICE) {
> struct memdev_dmi_entry *entry = (struct memdev_dmi_entry *)dh;
> - struct dimm_info *dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms,
> - mci->n_layers,
> - dimm_fill->count, 0, 0);
> + struct dimm_info *dimm = edac_get_dimm(mci, dimm_fill->count, 0, 0);
> u16 rdr_mask = BIT(7) | BIT(13);
>
> if (entry->size == 0xffff) {
> @@ -543,8 +541,7 @@ int ghes_edac_register(struct ghes *ghes, struct device *dev)
> dimm_fill.mci = mci;
> dmi_walk(ghes_edac_dmidecode, &dimm_fill);
> } else {
> - struct dimm_info *dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms,
> - mci->n_layers, 0, 0, 0);
> + struct dimm_info *dimm = edac_get_dimm(mci, 0, 0, 0);
>
> dimm->nr_pages = 1;
> dimm->grain = 128;
> diff --git a/drivers/edac/i10nm_base.c b/drivers/edac/i10nm_base.c
> index c370d5457e6b..059eccf0582b 100644
> --- a/drivers/edac/i10nm_base.c
> +++ b/drivers/edac/i10nm_base.c
> @@ -154,8 +154,7 @@ static int i10nm_get_dimm_config(struct mem_ctl_info *mci)
>
> ndimms = 0;
> for (j = 0; j < I10NM_NUM_DIMMS; j++) {
> - dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms,
> - mci->n_layers, i, j, 0);
> + dimm = edac_get_dimm(mci, i, j, 0);
> mtr = I10NM_GET_DIMMMTR(imc, i, j);
> mcddrtcfg = I10NM_GET_MCDDRTCFG(imc, i, j);
> edac_dbg(1, "dimmmtr 0x%x mcddrtcfg 0x%x (mc%d ch%d dimm%d)\n",
> diff --git a/drivers/edac/i3200_edac.c b/drivers/edac/i3200_edac.c
> index 299b441647cd..432b375a4075 100644
> --- a/drivers/edac/i3200_edac.c
> +++ b/drivers/edac/i3200_edac.c
> @@ -392,8 +392,7 @@ static int i3200_probe1(struct pci_dev *pdev, int dev_idx)
> unsigned long nr_pages;
>
> for (j = 0; j < nr_channels; j++) {
> - struct dimm_info *dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms,
> - mci->n_layers, i, j, 0);
> + struct dimm_info *dimm = edac_get_dimm(mci, i, j, 0);
>
> nr_pages = drb_to_nr_pages(drbs, stacked, j, i);
> if (nr_pages == 0)
> diff --git a/drivers/edac/i5000_edac.c b/drivers/edac/i5000_edac.c
> index 078a7351bf05..1a6f69c859ab 100644
> --- a/drivers/edac/i5000_edac.c
> +++ b/drivers/edac/i5000_edac.c
> @@ -1275,9 +1275,8 @@ static int i5000_init_csrows(struct mem_ctl_info *mci)
> if (!MTR_DIMMS_PRESENT(mtr))
> continue;
>
> - dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms, mci->n_layers,
> - channel / MAX_BRANCHES,
> - channel % MAX_BRANCHES, slot);
> + dimm = edac_get_dimm(mci, channel / MAX_BRANCHES,
> + channel % MAX_BRANCHES, slot);
>
> csrow_megs = pvt->dimm_info[slot][channel].megabytes;
> dimm->grain = 8;
> diff --git a/drivers/edac/i5100_edac.c b/drivers/edac/i5100_edac.c
> index 12bebecb203b..134586753311 100644
> --- a/drivers/edac/i5100_edac.c
> +++ b/drivers/edac/i5100_edac.c
> @@ -858,8 +858,7 @@ static void i5100_init_csrows(struct mem_ctl_info *mci)
> if (!npages)
> continue;
>
> - dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms, mci->n_layers,
> - chan, rank, 0);
> + dimm = edac_get_dimm(mci, chan, rank, 0);
>
> dimm->nr_pages = npages;
> dimm->grain = 32;
> diff --git a/drivers/edac/i5400_edac.c b/drivers/edac/i5400_edac.c
> index 8c86c6fd7da7..f131c05ade9f 100644
> --- a/drivers/edac/i5400_edac.c
> +++ b/drivers/edac/i5400_edac.c
> @@ -1187,8 +1187,7 @@ static int i5400_init_dimms(struct mem_ctl_info *mci)
> if (!MTR_DIMMS_PRESENT(mtr))
> continue;
>
> - dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms, mci->n_layers,
> - channel / 2, channel % 2, slot);
> + dimm = edac_get_dimm(mci, channel / 2, channel % 2, slot);
>
> size_mb = pvt->dimm_info[slot][channel].megabytes;
>
> diff --git a/drivers/edac/i7300_edac.c b/drivers/edac/i7300_edac.c
> index 447d357c7a67..2e9bbe56cde9 100644
> --- a/drivers/edac/i7300_edac.c
> +++ b/drivers/edac/i7300_edac.c
> @@ -794,8 +794,7 @@ static int i7300_init_csrows(struct mem_ctl_info *mci)
> for (ch = 0; ch < max_channel; ch++) {
> int channel = to_channel(ch, branch);
>
> - dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms,
> - mci->n_layers, branch, ch, slot);
> + dimm = edac_get_dimm(mci, branch, ch, slot);
>
> dinfo = &pvt->dimm_info[slot][channel];
>
> diff --git a/drivers/edac/i7core_edac.c b/drivers/edac/i7core_edac.c
> index a71cca6eeb33..b3135b208f9a 100644
> --- a/drivers/edac/i7core_edac.c
> +++ b/drivers/edac/i7core_edac.c
> @@ -585,8 +585,7 @@ static int get_dimm_config(struct mem_ctl_info *mci)
> if (!DIMM_PRESENT(dimm_dod[j]))
> continue;
>
> - dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms, mci->n_layers,
> - i, j, 0);
> + dimm = edac_get_dimm(mci, i, j, 0);
> banks = numbank(MC_DOD_NUMBANK(dimm_dod[j]));
> ranks = numrank(MC_DOD_NUMRANK(dimm_dod[j]));
> rows = numrow(MC_DOD_NUMROW(dimm_dod[j]));
> diff --git a/drivers/edac/ie31200_edac.c b/drivers/edac/ie31200_edac.c
> index d26300f9cb07..4f65073f230b 100644
> --- a/drivers/edac/ie31200_edac.c
> +++ b/drivers/edac/ie31200_edac.c
> @@ -490,9 +490,7 @@ static int ie31200_probe1(struct pci_dev *pdev, int dev_idx)
>
> if (dimm_info[j][i].dual_rank) {
> nr_pages = nr_pages / 2;
> - dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms,
> - mci->n_layers, (i * 2) + 1,
> - j, 0);
> + dimm = edac_get_dimm(mci, (i * 2) + 1, j, 0);
> dimm->nr_pages = nr_pages;
> edac_dbg(0, "set nr pages: 0x%lx\n", nr_pages);
> dimm->grain = 8; /* just a guess */
> @@ -503,8 +501,7 @@ static int ie31200_probe1(struct pci_dev *pdev, int dev_idx)
> dimm->dtype = DEV_UNKNOWN;
> dimm->edac_mode = EDAC_UNKNOWN;
> }
> - dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms,
> - mci->n_layers, i * 2, j, 0);
> + dimm = edac_get_dimm(mci, i * 2, j, 0);
> dimm->nr_pages = nr_pages;
> edac_dbg(0, "set nr pages: 0x%lx\n", nr_pages);
> dimm->grain = 8; /* same guess */
> diff --git a/drivers/edac/pnd2_edac.c b/drivers/edac/pnd2_edac.c
> index b1193be1ef1d..933f7722b893 100644
> --- a/drivers/edac/pnd2_edac.c
> +++ b/drivers/edac/pnd2_edac.c
> @@ -1231,7 +1231,7 @@ static void apl_get_dimm_config(struct mem_ctl_info *mci)
> if (!(chan_mask & BIT(i)))
> continue;
>
> - dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms, mci->n_layers, i, 0, 0);
> + dimm = edac_get_dimm(mci, i, 0, 0);
> if (!dimm) {
> edac_dbg(0, "No allocated DIMM for channel %d\n", i);
> continue;
> @@ -1311,7 +1311,7 @@ static void dnv_get_dimm_config(struct mem_ctl_info *mci)
> if (!ranks_of_dimm[j])
> continue;
>
> - dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms, mci->n_layers, i, j, 0);
> + dimm = edac_get_dimm(mci, i, j, 0);
> if (!dimm) {
> edac_dbg(0, "No allocated DIMM for channel %d DIMM %d\n", i, j);
> continue;
> diff --git a/drivers/edac/sb_edac.c b/drivers/edac/sb_edac.c
> index a2fd39d330d6..4957e8ee1879 100644
> --- a/drivers/edac/sb_edac.c
> +++ b/drivers/edac/sb_edac.c
> @@ -1621,7 +1621,7 @@ static int __populate_dimms(struct mem_ctl_info *mci,
> }
>
> for (j = 0; j < max_dimms_per_channel; j++) {
> - dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms, mci->n_layers, i, j, 0);
> + dimm = edac_get_dimm(mci, i, j, 0);
> if (pvt->info.type == KNIGHTS_LANDING) {
> pci_read_config_dword(pvt->knl.pci_channel[i],
> knl_mtr_reg, &mtr);
> diff --git a/drivers/edac/skx_base.c b/drivers/edac/skx_base.c
> index a8853e724d1f..83545b4facb7 100644
> --- a/drivers/edac/skx_base.c
> +++ b/drivers/edac/skx_base.c
> @@ -189,8 +189,7 @@ static int skx_get_dimm_config(struct mem_ctl_info *mci)
> pci_read_config_dword(imc->chan[i].cdev, 0x8C, &amap);
> pci_read_config_dword(imc->chan[i].cdev, 0x400, &mcddrtcfg);
> for (j = 0; j < SKX_NUM_DIMMS; j++) {
> - dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms,
> - mci->n_layers, i, j, 0);
> + dimm = edac_get_dimm(mci, i, j, 0);
> pci_read_config_dword(imc->chan[i].cdev,
> 0x80 + 4 * j, &mtr);
> if (IS_DIMM_PRESENT(mtr)) {
> diff --git a/drivers/edac/ti_edac.c b/drivers/edac/ti_edac.c
> index 6ac26d1b929f..8be3e89a510e 100644
> --- a/drivers/edac/ti_edac.c
> +++ b/drivers/edac/ti_edac.c
> @@ -135,7 +135,7 @@ static void ti_edac_setup_dimm(struct mem_ctl_info *mci, u32 type)
> u32 val;
> u32 memsize;
>
> - dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms, mci->n_layers, 0, 0, 0);
> + dimm = edac_get_dimm(mci, 0, 0, 0);
>
> val = ti_edac_readl(edac, EMIF_SDRAM_CONFIG);
>
> diff --git a/include/linux/edac.h b/include/linux/edac.h
> index c19483b90079..696bfb684d92 100644
> --- a/include/linux/edac.h
> +++ b/include/linux/edac.h
> @@ -403,37 +403,6 @@ struct edac_mc_layer {
> __i; \
> })
>
> -/**
> - * EDAC_DIMM_PTR - Macro responsible to get a pointer inside a pointer array
> - * for the element given by [layer0,layer1,layer2] position
> - *
> - * @layers: a struct edac_mc_layer array, describing how many elements
> - * were allocated for each layer
> - * @var: name of the var where we want to get the pointer
> - * (like mci->dimms)
> - * @nlayers: Number of layers at the @layers array
> - * @layer0: layer0 position
> - * @layer1: layer1 position. Unused if n_layers < 2
> - * @layer2: layer2 position. Unused if n_layers < 3
> - *
> - * For 1 layer, this macro returns "var[layer0]";
> - *
> - * For 2 layers, this macro is similar to allocate a bi-dimensional array
> - * and to return "var[layer0][layer1]";
> - *
> - * For 3 layers, this macro is similar to allocate a tri-dimensional array
> - * and to return "var[layer0][layer1][layer2]";
> - */
> -#define EDAC_DIMM_PTR(layers, var, nlayers, layer0, layer1, layer2) ({ \
> - typeof(*var) __p; \
> - int ___i = EDAC_DIMM_OFF(layers, nlayers, layer0, layer1, layer2); \
> - if (___i < 0) \
> - __p = NULL; \
> - else \
> - __p = (var)[___i]; \
> - __p; \
> -})
> -
> struct dimm_info {
> struct device dev;
>
> @@ -669,4 +638,61 @@ struct mem_ctl_info {
> bool fake_inject_ue;
> u16 fake_inject_count;
> };
> +
> +/**
> + * edac_get_dimm_by_index - Get DIMM info from a memory controller
> + * given by an index
> + *
> + * @mci: a struct mem_ctl_info
> + * @index: index in the memory controller's DIMM array
> + *
> + * Returns a struct dimm_info* or NULL on failure.
> + */
> +static inline struct dimm_info *
> +edac_get_dimm_by_index(struct mem_ctl_info *mci, int index)
> +{
> + if (index < 0 || index >= mci->tot_dimms)
> + return NULL;
> +
> + return mci->dimms[index];
> +}
> +
> +/**
> + * edac_get_dimm - Get DIMM info from a memory controller given by
> + * [layer0,layer1,layer2] position
> + *
> + * @mci: a struct mem_ctl_info
> + * @layer0: layer0 position
> + * @layer1: layer1 position. Unused if n_layers < 2
> + * @layer2: layer2 position. Unused if n_layers < 3
> + *
> + * For 1 layer, this function returns "dimms[layer0]";
> + *
> + * For 2 layers, this function is similar to allocate a bi-dimensional
> + * array and to return "dimms[layer0][layer1]";
> + *
> + * For 3 layers, this function is similar to allocate a tri-dimensional array
> + * and to return "dimms[layer0][layer1][layer2]";
> + */
> +static inline struct dimm_info *edac_get_dimm(struct mem_ctl_info *mci,
> + int layer0, int layer1, int layer2)
> +{
> + int index;
> +
> + if (layer0 < 0
> + || (mci->n_layers > 1 && layer1 < 0)
> + || (mci->n_layers > 2 && layer2 < 0))
> + return NULL;
> +
> + index = layer0;
> +
> + if (mci->n_layers > 1)
> + index = index * mci->layers[1].size + layer1;
> +
> + if (mci->n_layers > 2)
> + index = index * mci->layers[2].size + layer2;
> +
> + return edac_get_dimm_by_index(mci, index);
> +}
> +
> #endif
Cheers,
Mauro
Em Wed, 6 Nov 2019 09:33:11 +0000
Robert Richter <[email protected]> escreveu:
> Since this is a string already and strlen() has been used to advance
> the pointer, the end of the buffer is already zero terminated. Remove
> the needless zero string termination.
>
> Suggested-by: Joe Perches <[email protected]>
> Signed-off-by: Robert Richter <[email protected]>
Reviewed-by: Mauro Carvalho Chehab <[email protected]>
> ---
> drivers/edac/edac_mc.c | 1 -
> 1 file changed, 1 deletion(-)
>
> diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c
> index f2cbca77bc50..8bfe76d1bdf1 100644
> --- a/drivers/edac/edac_mc.c
> +++ b/drivers/edac/edac_mc.c
> @@ -1183,7 +1183,6 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type,
> }
> strcpy(p, dimm->label);
> p += strlen(p);
> - *p = '\0';
>
> /*
> * get csrow/channel of the DIMM, in order to allow
Cheers,
Mauro
Em Wed, 6 Nov 2019 09:33:41 +0000
Robert Richter <[email protected]> escreveu:
> Store the error type in struct edac_raw_error_desc. This makes the
> type parameter of edac_raw_mc_handle_error() obsolete.
>
> Signed-off-by: Robert Richter <[email protected]>
Reviewed-by: Mauro Carvalho Chehab <[email protected]>
> ---
> drivers/edac/edac_mc.c | 10 +++++-----
> drivers/edac/edac_mc.h | 4 +---
> drivers/edac/ghes_edac.c | 11 +++++------
> include/linux/edac.h | 1 +
> 4 files changed, 12 insertions(+), 14 deletions(-)
>
> diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c
> index e4a11218009b..7b4f5e98dfe8 100644
> --- a/drivers/edac/edac_mc.c
> +++ b/drivers/edac/edac_mc.c
> @@ -1040,8 +1040,7 @@ static void edac_ue_error(struct mem_ctl_info *mci,
> edac_inc_ue_error(mci, dimm, error_count);
> }
>
> -void edac_raw_mc_handle_error(const enum hw_event_mc_err_type type,
> - struct mem_ctl_info *mci,
> +void edac_raw_mc_handle_error(struct mem_ctl_info *mci,
> struct dimm_info *dimm,
> struct edac_raw_error_desc *e)
> {
> @@ -1056,14 +1055,14 @@ void edac_raw_mc_handle_error(const enum hw_event_mc_err_type type,
>
> /* Report the error via the trace interface */
> if (IS_ENABLED(CONFIG_RAS))
> - trace_mc_event(type, e->msg, e->label, e->error_count,
> + trace_mc_event(e->type, e->msg, e->label, e->error_count,
> mci->mc_idx, e->top_layer, e->mid_layer,
> e->low_layer,
> (e->page_frame_number << PAGE_SHIFT) | e->offset_in_page,
> grain_bits, e->syndrome, e->other_detail);
>
> /* Memory type dependent details about the error */
> - if (type == HW_EVENT_ERR_CORRECTED) {
> + if (e->type == HW_EVENT_ERR_CORRECTED) {
> snprintf(detail, sizeof(detail),
> "page:0x%lx offset:0x%lx grain:%ld syndrome:0x%lx",
> e->page_frame_number, e->offset_in_page,
> @@ -1109,6 +1108,7 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type,
> /* Fills the error report buffer */
> memset(e, 0, sizeof (*e));
> e->error_count = error_count;
> + e->type = type;
> e->top_layer = top_layer;
> e->mid_layer = mid_layer;
> e->low_layer = low_layer;
> @@ -1242,6 +1242,6 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type,
>
> dimm = edac_get_dimm(mci, top_layer, mid_layer, low_layer);
>
> - edac_raw_mc_handle_error(type, mci, dimm, e);
> + edac_raw_mc_handle_error(mci, dimm, e);
> }
> EXPORT_SYMBOL_GPL(edac_mc_handle_error);
> diff --git a/drivers/edac/edac_mc.h b/drivers/edac/edac_mc.h
> index 2c3e2fbcedc4..a8f1b5b5e873 100644
> --- a/drivers/edac/edac_mc.h
> +++ b/drivers/edac/edac_mc.h
> @@ -212,7 +212,6 @@ extern int edac_mc_find_csrow_by_page(struct mem_ctl_info *mci,
> * edac_raw_mc_handle_error() - Reports a memory event to userspace without
> * doing anything to discover the error location.
> *
> - * @type: severity of the error (CE/UE/Fatal)
> * @mci: a struct mem_ctl_info pointer
> * @dimm: a struct dimm_info pointer
> * @e: error description
> @@ -221,8 +220,7 @@ extern int edac_mc_find_csrow_by_page(struct mem_ctl_info *mci,
> * only be called directly when the hardware error come directly from BIOS,
> * like in the case of APEI GHES driver.
> */
> -void edac_raw_mc_handle_error(const enum hw_event_mc_err_type type,
> - struct mem_ctl_info *mci,
> +void edac_raw_mc_handle_error(struct mem_ctl_info *mci,
> struct dimm_info *dimm,
> struct edac_raw_error_desc *e);
>
> diff --git a/drivers/edac/ghes_edac.c b/drivers/edac/ghes_edac.c
> index 6eebaf28e31c..7d325d70d6d3 100644
> --- a/drivers/edac/ghes_edac.c
> +++ b/drivers/edac/ghes_edac.c
> @@ -202,7 +202,6 @@ static void ghes_edac_dmidecode(const struct dmi_header *dh, void *arg)
> void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err)
> {
> struct dimm_info *dimm;
> - enum hw_event_mc_err_type type;
> struct edac_raw_error_desc *e;
> struct mem_ctl_info *mci;
> struct ghes_edac_pvt *pvt;
> @@ -241,17 +240,17 @@ void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err)
>
> switch (sev) {
> case GHES_SEV_CORRECTED:
> - type = HW_EVENT_ERR_CORRECTED;
> + e->type = HW_EVENT_ERR_CORRECTED;
> break;
> case GHES_SEV_RECOVERABLE:
> - type = HW_EVENT_ERR_UNCORRECTED;
> + e->type = HW_EVENT_ERR_UNCORRECTED;
> break;
> case GHES_SEV_PANIC:
> - type = HW_EVENT_ERR_FATAL;
> + e->type = HW_EVENT_ERR_FATAL;
> break;
> default:
> case GHES_SEV_NO:
> - type = HW_EVENT_ERR_INFO;
> + e->type = HW_EVENT_ERR_INFO;
> }
>
> edac_dbg(1, "error validation_bits: 0x%08llx\n",
> @@ -442,7 +441,7 @@ void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err)
>
> dimm = edac_get_dimm_by_index(mci, e->top_layer);
>
> - edac_raw_mc_handle_error(type, mci, dimm, e);
> + edac_raw_mc_handle_error(mci, dimm, e);
>
> unlock:
> spin_unlock_irqrestore(&ghes_lock, flags);
> diff --git a/include/linux/edac.h b/include/linux/edac.h
> index 4d9673954856..587c53b87fdf 100644
> --- a/include/linux/edac.h
> +++ b/include/linux/edac.h
> @@ -463,6 +463,7 @@ struct edac_raw_error_desc {
> long grain;
>
> u16 error_count;
> + enum hw_event_mc_err_type type;
> int top_layer;
> int mid_layer;
> int low_layer;
Cheers,
Mauro
Em Wed, 6 Nov 2019 09:33:46 +0000
Robert Richter <[email protected]> escreveu:
> Have a separate function to count errors in csrow/channel. This better
> separates code and reduces the indentation level. No functional
> changes.
>
> Signed-off-by: Robert Richter <[email protected]>
Reviewed-by: Mauro Carvalho Chehab <[email protected]>
> ---
> drivers/edac/edac_mc.c | 40 +++++++++++++++++++++++++---------------
> 1 file changed, 25 insertions(+), 15 deletions(-)
>
> diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c
> index 93eac968678e..34b740036ffc 100644
> --- a/drivers/edac/edac_mc.c
> +++ b/drivers/edac/edac_mc.c
> @@ -1045,6 +1045,26 @@ static struct mem_ctl_info *error_desc_to_mci(struct edac_raw_error_desc *e)
> return container_of(e, struct mem_ctl_info, error_desc);
> }
>
> +static void edac_inc_csrow(struct edac_raw_error_desc *e, int row, int chan)
> +{
> + struct mem_ctl_info *mci = error_desc_to_mci(e);
> + u16 count = e->error_count;
> + enum hw_event_mc_err_type type = e->type;
> +
> + if (row < 0)
> + return;
> +
> + edac_dbg(4, "csrow/channel to increment: (%d,%d)\n", row, chan);
> +
> + if (type == HW_EVENT_ERR_CORRECTED) {
> + mci->csrows[row]->ce_count += count;
> + if (chan >= 0)
> + mci->csrows[row]->channels[chan]->ce_count += count;
> + } else {
> + mci->csrows[row]->ue_count += count;
> + }
> +}
> +
> void edac_raw_mc_handle_error(struct edac_raw_error_desc *e,
> struct dimm_info *dimm)
> {
> @@ -1214,22 +1234,12 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type,
> chan = -2;
> }
>
> - if (any_memory) {
> + if (any_memory)
> strcpy(e->label, "any memory");
> - } else {
> - edac_dbg(4, "csrow/channel to increment: (%d,%d)\n", row, chan);
> - if (p == e->label)
> - strcpy(e->label, "unknown memory");
> - if (type == HW_EVENT_ERR_CORRECTED) {
> - if (row >= 0) {
> - mci->csrows[row]->ce_count += error_count;
> - if (chan >= 0)
> - mci->csrows[row]->channels[chan]->ce_count += error_count;
> - }
> - } else
> - if (row >= 0)
> - mci->csrows[row]->ue_count += error_count;
> - }
> + else if (!*e->label)
> + strcpy(e->label, "unknown memory");
> +
> + edac_inc_csrow(e, row, chan);
>
> /* Fill the RAM location data */
> p = e->location;
Cheers,
Mauro
Em Wed, 6 Nov 2019 09:33:32 +0000
Robert Richter <[email protected]> escreveu:
> Looking at how mci->{ue,ce}_per_layer[EDAC_MAX_LAYERS] is used, it
> turns out that only the leaves in the memory hierarchy are consumed
> (in sysfs), but not the intermediate layers, e.g.:
>
> count = dimm->mci->ce_per_layer[dimm->mci->n_layers-1][dimm->idx];
>
> These unused counters only add complexity, remove them. The error
> counter values are directly stored in struct dimm_info now.
I guess this patch will cause trouble with some memory controllers.
The problem is that, depending on the memory type and how many bits
are wrong, it may not be technically possible to pinpoint an error
to a single DIMM.
I mean, the memory controller can be, for instance, grouping
DIMM1 and DIMM2. If just one bit is in error, it is possible to
assign it to either DIMM1 or DIMM2, but if there are multiple bits
wrong, most ECC codes won't allow pinpointing whether the error occurred
at DIMM1 or at DIMM2.
All we know is that the layer has an error.
So, assigning the error to the dimm struct seems plain wrong to me.
>
> Signed-off-by: Robert Richter <[email protected]>
> ---
> drivers/edac/edac_mc.c | 106 ++++++++++++-----------------------
> drivers/edac/edac_mc_sysfs.c | 20 +++----
> drivers/edac/ghes_edac.c | 5 +-
> include/linux/edac.h | 7 +--
> 4 files changed, 47 insertions(+), 91 deletions(-)
>
> diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c
> index b6032f51338e..dfc17c565d8f 100644
> --- a/drivers/edac/edac_mc.c
> +++ b/drivers/edac/edac_mc.c
> @@ -315,12 +315,11 @@ struct mem_ctl_info *edac_mc_alloc(unsigned int mc_num,
> struct csrow_info *csr;
> struct rank_info *chan;
> struct dimm_info *dimm;
> - u32 *ce_per_layer[EDAC_MAX_LAYERS], *ue_per_layer[EDAC_MAX_LAYERS];
> unsigned int pos[EDAC_MAX_LAYERS];
> - unsigned int idx, size, tot_dimms = 1, count = 1;
> - unsigned int tot_csrows = 1, tot_channels = 1, tot_errcount = 0;
> + unsigned int idx, size, tot_dimms = 1;
> + unsigned int tot_csrows = 1, tot_channels = 1;
> void *pvt, *p, *ptr = NULL;
> - int i, j, row, chn, n, len;
> + int j, row, chn, n, len;
> bool per_rank = false;
>
> if (WARN_ON(n_layers > EDAC_MAX_LAYERS || n_layers == 0))
> @@ -346,19 +345,10 @@ struct mem_ctl_info *edac_mc_alloc(unsigned int mc_num,
> * stringent as what the compiler would provide if we could simply
> * hardcode everything into a single struct.
> */
> - mci = edac_align_ptr(&ptr, sizeof(*mci), 1);
> - layer = edac_align_ptr(&ptr, sizeof(*layer), n_layers);
> - for (i = 0; i < n_layers; i++) {
> - count *= layers[i].size;
> - edac_dbg(4, "errcount layer %d size %d\n", i, count);
> - ce_per_layer[i] = edac_align_ptr(&ptr, sizeof(u32), count);
> - ue_per_layer[i] = edac_align_ptr(&ptr, sizeof(u32), count);
> - tot_errcount += 2 * count;
> - }
> -
> - edac_dbg(4, "allocating %d error counters\n", tot_errcount);
> - pvt = edac_align_ptr(&ptr, sz_pvt, 1);
> - size = ((unsigned long)pvt) + sz_pvt;
> + mci = edac_align_ptr(&ptr, sizeof(*mci), 1);
> + layer = edac_align_ptr(&ptr, sizeof(*layer), n_layers);
> + pvt = edac_align_ptr(&ptr, sz_pvt, 1);
> + size = ((unsigned long)pvt) + sz_pvt;
>
> edac_dbg(1, "allocating %u bytes for mci data (%d %s, %d csrows/channels)\n",
> size,
> @@ -374,10 +364,6 @@ struct mem_ctl_info *edac_mc_alloc(unsigned int mc_num,
> * rather than an imaginary chunk of memory located at address 0.
> */
> layer = (struct edac_mc_layer *)(((char *)mci) + ((unsigned long)layer));
> - for (i = 0; i < n_layers; i++) {
> - mci->ce_per_layer[i] = (u32 *)((char *)mci + ((unsigned long)ce_per_layer[i]));
> - mci->ue_per_layer[i] = (u32 *)((char *)mci + ((unsigned long)ue_per_layer[i]));
> - }
> pvt = sz_pvt ? (((char *)mci) + ((unsigned long)pvt)) : NULL;
>
> /* setup index and various internal pointers */
> @@ -908,53 +894,31 @@ const char *edac_layer_name[] = {
> EXPORT_SYMBOL_GPL(edac_layer_name);
>
> static void edac_inc_ce_error(struct mem_ctl_info *mci,
> - bool enable_per_layer_report,
> const int pos[EDAC_MAX_LAYERS],
> const u16 count)
> {
> - int i, index = 0;
> + struct dimm_info *dimm = edac_get_dimm(mci, pos[0], pos[1], pos[2]);
>
> mci->ce_mc += count;
>
> - if (!enable_per_layer_report) {
> + if (dimm)
> + dimm->ce_count += count;
> + else
> mci->ce_noinfo_count += count;
> - return;
> - }
> -
> - for (i = 0; i < mci->n_layers; i++) {
> - if (pos[i] < 0)
> - break;
> - index += pos[i];
> - mci->ce_per_layer[i][index] += count;
> -
> - if (i < mci->n_layers - 1)
> - index *= mci->layers[i + 1].size;
> - }
> }
>
> static void edac_inc_ue_error(struct mem_ctl_info *mci,
> - bool enable_per_layer_report,
> const int pos[EDAC_MAX_LAYERS],
> const u16 count)
> {
> - int i, index = 0;
> + struct dimm_info *dimm = edac_get_dimm(mci, pos[0], pos[1], pos[2]);
>
> mci->ue_mc += count;
>
> - if (!enable_per_layer_report) {
> + if (dimm)
> + dimm->ue_count += count;
> + else
> mci->ue_noinfo_count += count;
> - return;
> - }
> -
> - for (i = 0; i < mci->n_layers; i++) {
> - if (pos[i] < 0)
> - break;
> - index += pos[i];
> - mci->ue_per_layer[i][index] += count;
> -
> - if (i < mci->n_layers - 1)
> - index *= mci->layers[i + 1].size;
> - }
> }
>
> static void edac_ce_error(struct mem_ctl_info *mci,
> @@ -965,7 +929,6 @@ static void edac_ce_error(struct mem_ctl_info *mci,
> const char *label,
> const char *detail,
> const char *other_detail,
> - const bool enable_per_layer_report,
> const unsigned long page_frame_number,
> const unsigned long offset_in_page,
> long grain)
> @@ -988,7 +951,7 @@ static void edac_ce_error(struct mem_ctl_info *mci,
> error_count, msg, msg_aux, label,
> location, detail);
> }
> - edac_inc_ce_error(mci, enable_per_layer_report, pos, error_count);
> + edac_inc_ce_error(mci, pos, error_count);
>
> if (mci->scrub_mode == SCRUB_SW_SRC) {
> /*
> @@ -1018,8 +981,7 @@ static void edac_ue_error(struct mem_ctl_info *mci,
> const char *location,
> const char *label,
> const char *detail,
> - const char *other_detail,
> - const bool enable_per_layer_report)
> + const char *other_detail)
> {
> char *msg_aux = "";
>
> @@ -1048,7 +1010,7 @@ static void edac_ue_error(struct mem_ctl_info *mci,
> msg, msg_aux, label, location, detail);
> }
>
> - edac_inc_ue_error(mci, enable_per_layer_report, pos, error_count);
> + edac_inc_ue_error(mci, pos, error_count);
> }
>
> void edac_raw_mc_handle_error(const enum hw_event_mc_err_type type,
> @@ -1079,16 +1041,16 @@ void edac_raw_mc_handle_error(const enum hw_event_mc_err_type type,
> "page:0x%lx offset:0x%lx grain:%ld syndrome:0x%lx",
> e->page_frame_number, e->offset_in_page,
> e->grain, e->syndrome);
> - edac_ce_error(mci, e->error_count, pos, e->msg, e->location, e->label,
> - detail, e->other_detail, e->enable_per_layer_report,
> + edac_ce_error(mci, e->error_count, pos, e->msg, e->location,
> + e->label, detail, e->other_detail,
> e->page_frame_number, e->offset_in_page, e->grain);
> } else {
> snprintf(detail, sizeof(detail),
> "page:0x%lx offset:0x%lx grain:%ld",
> e->page_frame_number, e->offset_in_page, e->grain);
>
> - edac_ue_error(mci, e->error_count, pos, e->msg, e->location, e->label,
> - detail, e->other_detail, e->enable_per_layer_report);
> + edac_ue_error(mci, e->error_count, pos, e->msg, e->location,
> + e->label, detail, e->other_detail);
> }
>
>
> @@ -1113,6 +1075,7 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type,
> int pos[EDAC_MAX_LAYERS] = { top_layer, mid_layer, low_layer };
> int i, n_labels = 0;
> struct edac_raw_error_desc *e = &mci->error_desc;
> + bool any_memory = true;
>
> edac_dbg(3, "MC%d\n", mci->mc_idx);
>
> @@ -1130,9 +1093,9 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type,
>
> /*
> * Check if the event report is consistent and if the memory
> - * location is known. If it is known, enable_per_layer_report will be
> - * true, the DIMM(s) label info will be filled and the per-layer
> - * error counters will be incremented.
> + * location is known. If it is known, the DIMM(s) label info
> + * will be filled and the DIMM's error counters will be
> + * incremented.
> */
> for (i = 0; i < mci->n_layers; i++) {
> if (pos[i] >= (int)mci->layers[i].size) {
> @@ -1150,7 +1113,7 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type,
> pos[i] = -1;
> }
> if (pos[i] >= 0)
> - e->enable_per_layer_report = true;
> + any_memory = false;
> }
>
> /*
> @@ -1180,16 +1143,17 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type,
> e->grain = dimm->grain;
>
> /*
> - * If the error is memory-controller wide, there's no need to
> - * seek for the affected DIMMs because the whole
> - * channel/memory controller/... may be affected.
> - * Also, don't show errors for empty DIMM slots.
> + * If the error is memory-controller wide, there's no
> + * need to seek for the affected DIMMs because the
> + * whole channel/memory controller/... may be
> + * affected. Also, don't show errors for empty DIMM
> + * slots.
> */
> - if (!e->enable_per_layer_report || !dimm->nr_pages)
> + if (any_memory || !dimm->nr_pages)
> continue;
>
> if (n_labels >= EDAC_MAX_LABELS) {
> - e->enable_per_layer_report = false;
> + any_memory = true;
> break;
> }
> n_labels++;
> @@ -1218,7 +1182,7 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type,
> chan = -2;
> }
>
> - if (!e->enable_per_layer_report) {
> + if (any_memory) {
> strcpy(e->label, "any memory");
> } else {
> edac_dbg(4, "csrow/channel to increment: (%d,%d)\n", row, chan);
> diff --git a/drivers/edac/edac_mc_sysfs.c b/drivers/edac/edac_mc_sysfs.c
> index 0367554e7437..8682df2f7f4f 100644
> --- a/drivers/edac/edac_mc_sysfs.c
> +++ b/drivers/edac/edac_mc_sysfs.c
> @@ -556,10 +556,8 @@ static ssize_t dimmdev_ce_count_show(struct device *dev,
> char *data)
> {
> struct dimm_info *dimm = to_dimm(dev);
> - u32 count;
>
> - count = dimm->mci->ce_per_layer[dimm->mci->n_layers-1][dimm->idx];
> - return sprintf(data, "%u\n", count);
> + return sprintf(data, "%u\n", dimm->ce_count);
> }
>
> static ssize_t dimmdev_ue_count_show(struct device *dev,
> @@ -567,10 +565,8 @@ static ssize_t dimmdev_ue_count_show(struct device *dev,
> char *data)
> {
> struct dimm_info *dimm = to_dimm(dev);
> - u32 count;
>
> - count = dimm->mci->ue_per_layer[dimm->mci->n_layers-1][dimm->idx];
> - return sprintf(data, "%u\n", count);
> + return sprintf(data, "%u\n", dimm->ue_count);
> }
>
> /* dimm/rank attribute files */
> @@ -666,7 +662,9 @@ static ssize_t mci_reset_counters_store(struct device *dev,
> const char *data, size_t count)
> {
> struct mem_ctl_info *mci = to_mci(dev);
> - int cnt, row, chan, i;
> + struct dimm_info *dimm;
> + int row, chan;
> +
> mci->ue_mc = 0;
> mci->ce_mc = 0;
> mci->ue_noinfo_count = 0;
> @@ -682,11 +680,9 @@ static ssize_t mci_reset_counters_store(struct device *dev,
> ri->channels[chan]->ce_count = 0;
> }
>
> - cnt = 1;
> - for (i = 0; i < mci->n_layers; i++) {
> - cnt *= mci->layers[i].size;
> - memset(mci->ce_per_layer[i], 0, cnt * sizeof(u32));
> - memset(mci->ue_per_layer[i], 0, cnt * sizeof(u32));
> + mci_for_each_dimm(mci, dimm) {
> + dimm->ue_count = 0;
> + dimm->ce_count = 0;
> }
>
> mci->start_time = jiffies;
> diff --git a/drivers/edac/ghes_edac.c b/drivers/edac/ghes_edac.c
> index 725b9c58c028..74017da1f72c 100644
> --- a/drivers/edac/ghes_edac.c
> +++ b/drivers/edac/ghes_edac.c
> @@ -356,11 +356,8 @@ void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err)
> mem_err->mem_dev_handle);
>
> index = get_dimm_smbios_index(mci, mem_err->mem_dev_handle);
> - if (index >= 0) {
> + if (index >= 0)
> e->top_layer = index;
> - e->enable_per_layer_report = true;
> - }
> -
> }
> if (p > e->location)
> *(p - 1) = '\0';
> diff --git a/include/linux/edac.h b/include/linux/edac.h
> index 67be279abd11..4d9673954856 100644
> --- a/include/linux/edac.h
> +++ b/include/linux/edac.h
> @@ -383,6 +383,9 @@ struct dimm_info {
> unsigned int csrow, cschannel; /* Points to the old API data */
>
> u16 smbios_handle; /* Handle for SMBIOS type 17 */
> +
> + u32 ce_count;
> + u32 ue_count;
> };
>
> /**
> @@ -453,8 +456,6 @@ struct errcount_attribute_data {
> * @location: location of the error
> * @label: label of the affected DIMM(s)
> * @other_detail: other driver-specific detail about the error
> - * @enable_per_layer_report: if false, the error affects all layers
> - * (typically, a memory controller error)
> */
> struct edac_raw_error_desc {
> char location[LOCATION_SIZE];
> @@ -470,7 +471,6 @@ struct edac_raw_error_desc {
> unsigned long syndrome;
> const char *msg;
> const char *other_detail;
> - bool enable_per_layer_report;
> };
>
> /* MEMORY controller information structure
> @@ -560,7 +560,6 @@ struct mem_ctl_info {
> */
> u32 ce_noinfo_count, ue_noinfo_count;
> u32 ue_mc, ce_mc;
> - u32 *ce_per_layer[EDAC_MAX_LAYERS], *ue_per_layer[EDAC_MAX_LAYERS];
>
> struct completion complete;
>
Cheers,
Mauro
On Wed, Nov 06, 2019 at 09:33:11AM +0000, Robert Richter wrote:
> Since this is a string already and strlen() has been used to advance
> the pointer, the end of the buffer is already zero terminated. Remove
> the needless zero string termination.
Changed that to:
"The e string to which this is pointing has already been cleared
earlier in the function, so remove the needless zero string termination."
--
Regards/Gruss,
Boris.
https://people.kernel.org/tglx/notes-about-netiquette
On Wed, Nov 06, 2019 at 09:33:16AM +0000, Robert Richter wrote:
> Rename iterator variable to idx. The name is more handy, esp. when
> searching it in the code.
>
> Signed-off-by: Robert Richter <[email protected]>
> Reviewed-by: Mauro Carvalho Chehab <[email protected]>
> ---
> drivers/edac/edac_mc.c | 12 ++++++------
> 1 file changed, 6 insertions(+), 6 deletions(-)
>
> diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c
> index 3dc1c5afabce..f76252b7a787 100644
> --- a/drivers/edac/edac_mc.c
> +++ b/drivers/edac/edac_mc.c
> @@ -330,14 +330,14 @@ struct mem_ctl_info *edac_mc_alloc(unsigned int mc_num,
> * Calculate the total amount of dimms and csrows/cschannels while
> * in the old API emulation mode
> */
> - for (i = 0; i < n_layers; i++) {
> - tot_dimms *= layers[i].size;
> - if (layers[i].is_virt_csrow)
> - tot_csrows *= layers[i].size;
> + for (idx = 0; idx < n_layers; idx++) {
> + tot_dimms *= layers[idx].size;
> + if (layers[idx].is_virt_csrow)
> + tot_csrows *= layers[idx].size;
> else
> - tot_channels *= layers[i].size;
> + tot_channels *= layers[idx].size;
>
> - if (layers[i].type == EDAC_MC_LAYER_CHIP_SELECT)
> + if (layers[idx].type == EDAC_MC_LAYER_CHIP_SELECT)
> per_rank = true;
> }
>
> --
Merging that one with 2/20 where you already are converting to "idx" in
the same function.
--
Regards/Gruss,
Boris.
https://people.kernel.org/tglx/notes-about-netiquette
On Wed, Nov 06, 2019 at 09:33:07AM +0000, Robert Richter wrote:
> @@ -702,6 +705,7 @@ EXPORT_SYMBOL_GPL(edac_get_owner);
> int edac_mc_add_mc_with_groups(struct mem_ctl_info *mci,
> const struct attribute_group **groups)
> {
> + struct dimm_info *dimm;
> int ret = -EINVAL;
> edac_dbg(0, "\n");
>
drivers/edac/edac_mc.c: In function ‘edac_mc_add_mc_with_groups’:
drivers/edac/edac_mc.c:711:20: warning: unused variable ‘dimm’ [-Wunused-variable]
struct dimm_info *dimm;
^~~~
Pushing the declaration into the #ifdef CONFIG_EDAC_DEBUG block.
--
Regards/Gruss,
Boris.
https://people.kernel.org/tglx/notes-about-netiquette
On Wed, Nov 06, 2019 at 09:32:58AM +0000, Robert Richter wrote:
> Robert Richter (20):
> EDAC: Replace EDAC_DIMM_PTR() macro with edac_get_dimm() function
> EDAC: Remove EDAC_DIMM_OFF() macro
> EDAC: Introduce mci_for_each_dimm() iterator
> EDAC, mc: Do not BUG_ON() in edac_mc_alloc()
> EDAC, mc: Remove needless zero string termination
> EDAC, mc: Reduce indentation level in edac_mc_handle_error()
> EDAC, mc: Rename iterator variable to idx
> EDAC: Remove misleading comment in struct edac_raw_error_desc
> EDAC, ghes: Use standard kernel macros for page calculations
> EDAC, ghes: Fix grain calculation
> EDAC, ghes: Remove intermediate buffer pvt->detail_location
> EDAC, ghes: Unify trace_mc_event() code with edac_mc driver
> EDAC, Documentation: Describe CPER module definition and DIMM ranks
Queued up to here.
Thx.
--
Regards/Gruss,
Boris.
https://people.kernel.org/tglx/notes-about-netiquette
On 10.11.19 15:51:05, Borislav Petkov wrote:
> On Wed, Nov 06, 2019 at 09:32:58AM +0000, Robert Richter wrote:
> > Robert Richter (20):
> > EDAC: Replace EDAC_DIMM_PTR() macro with edac_get_dimm() function
> > EDAC: Remove EDAC_DIMM_OFF() macro
> > EDAC: Introduce mci_for_each_dimm() iterator
> > EDAC, mc: Do not BUG_ON() in edac_mc_alloc()
> > EDAC, mc: Remove needless zero string termination
> > EDAC, mc: Reduce indentation level in edac_mc_handle_error()
> > EDAC, mc: Rename iterator variable to idx
> > EDAC: Remove misleading comment in struct edac_raw_error_desc
> > EDAC, ghes: Use standard kernel macros for page calculations
> > EDAC, ghes: Fix grain calculation
> > EDAC, ghes: Remove intermediate buffer pvt->detail_location
> > EDAC, ghes: Unify trace_mc_event() code with edac_mc driver
> > EDAC, Documentation: Describe CPER module definition and DIMM ranks
>
> Queued up to here.
Thanks Boris. I am preparing a v3 with the remaining patches rebased
onto edac-for-next.
Thanks,
-Robert
Hi Mauro,
On 09.11.19 08:40:56, Mauro Carvalho Chehab wrote:
> Em Wed, 6 Nov 2019 09:33:32 +0000
> Robert Richter <[email protected]> escreveu:
>
> > Looking at how mci->{ue,ce}_per_layer[EDAC_MAX_LAYERS] is used, it
> > turns out that only the leaves in the memory hierarchy are consumed
> > (in sysfs), but not the intermediate layers, e.g.:
> >
> > count = dimm->mci->ce_per_layer[dimm->mci->n_layers-1][dimm->idx];
First of all, this is the only user where ce_per_layer[][] is
*read*. Note that n_layers is a constant value per
mci. Thus we could also convert this without any change of
functionality to:
count = dimm->mci->ce_counts[dimm->idx];
We can also remove the code that writes counter values to inner
layers, those values are never read.
The above is nothing else than storing the count per DIMM, which can
be converted to the following by just adding the count to struct
dimm_info:
count = dimm->ce_count;
Same applies to ue_count.
As we have the counts in struct dimm_info now, we no longer need to
allocate the {ue,ce}_counts arrays and can remove their allocation and
release code, including everything around it.
> >
> > These unused counters only add complexity, remove them. The error
> > counter values are directly stored in struct dimm_info now.
>
> I guess this patch will cause troubles with some memory controllers.
>
> The problem is that, depending on the memory type and how many bits
> are wrong, it may not be technically possible to pinpoint an error
> to a single DIMM.
If a DIMM cannot be identified, the MC has one or more of the pos
values (pos[0] to pos[mci->n_layers-1]) unset (negative values). The
count of the leaf layer (mci->ce_per_layer[mci->n_layers-1][index]) is
not written then. See below in function edac_inc_ce_error().
>
> I mean, the memory controller can be, for instance, grouping
> DIMM1 and DIMM2. If there's just one bit errored, it is possible to
> assign it to either DIMM1 or DIMM2, but if there are multiple bits
> wrong, most ECC codes won't allow to pinpoint if the error ocurred
> at DIMM1 or at DIMM2.
An error would not be counted for any DIMM then.
>
> All we know is that the layer has an error.
Right, but this has no effect on the DIMM error counters.
It only affects the csrow/channel counters. The code for this did
not change, see edac_mc_handle_error().
>
> So, assigning the error to the dimm struct seems plain wrong to me.
I think this is the code in question for you:
> > static void edac_inc_ce_error(struct mem_ctl_info *mci,
> > - bool enable_per_layer_report,
> > const int pos[EDAC_MAX_LAYERS],
> > const u16 count)
> > {
> > - int i, index = 0;
> > + struct dimm_info *dimm = edac_get_dimm(mci, pos[0], pos[1], pos[2]);
> >
> > mci->ce_mc += count;
> >
> > - if (!enable_per_layer_report) {
> > + if (dimm)
> > + dimm->ce_count += count;
> > + else
> > mci->ce_noinfo_count += count;
> > - return;
> > - }
> > -
> > - for (i = 0; i < mci->n_layers; i++) {
> > - if (pos[i] < 0)
> > - break;
> > - index += pos[i];
> > - mci->ce_per_layer[i][index] += count;
No value written here if pos[] < 0.
> > -
> > - if (i < mci->n_layers - 1)
> > - index *= mci->layers[i + 1].size;
> > - }
So in an intermediate step the for loop could be converted to:
dimm = edac_get_dimm(mci, pos[0], pos[1], pos[2]);
if (dimm)
mci->ce_per_layer[mci->n_layers - 1][dimm->idx] += count;
No change in functionality, right?
> > }
I hope this explains what this patch does.
It looks sane to me, please review again. If you still think it is
broken, give me an example.
Thanks,
-Robert