This patch set is a rework of the ghes_edac and edac_mc drivers. It
addresses issues found during code review and while working with the
code. The changes include:
* improve function interfaces and data structures to decrease
complexity such as number of function arguments, unused data, etc.
(#1, #2, #6, #10, #12, #13),
* add helper functions and factor out code (#3, #8, #9, #14)
* fix style issues found by checkpatch (#4)
* improve code readability (#5, #7, #11)
* use standard kernel macros (#15)
* code unification (#16, #17, #18)
* documentation updates (#19)
This series contains some patches that have been posted and reviewed
earlier here:
https://lore.kernel.org/patchwork/cover/1093488/
Review comments have been addressed and the following changes have
been made to those patches:
* rebased onto ras:edac-for-next (v5.4-rc1 based)
* reworded patch descriptions
* dropped hunk that only reorders code in "EDAC, ghes: Remove
pvt->detail_location string"
* updated description for edac_get_dimm_by_index()
* updated description for edac_get_dimm()
* small style changes in "EDAC: Replace EDAC_DIMM_PTR() macro with
edac_get_dimm() function"
* use unsigned type for dimm->idx to avoid need for negative range
check
* fixed mci_for_each_dimm() loop in i5100_init_csrows()
* fixed off-by-one in mci_for_each_dimm()
Robert Richter (19):
EDAC: Replace EDAC_DIMM_PTR() macro with edac_get_dimm() function
EDAC: Remove EDAC_DIMM_OFF() macro
EDAC: Introduce mci_for_each_dimm() iterator
EDAC, mc: Do not BUG_ON() in edac_mc_alloc()
EDAC, mc: Reduce indentation level in edac_mc_handle_error()
EDAC, mc: Remove per layer counters
EDAC, mc: Rename iterator variable to idx
EDAC, mc: Split edac_mc_alloc() into smaller functions
EDAC, mc: Reorder functions edac_mc_alloc*()
EDAC, mc: Rework edac_raw_mc_handle_error() to use struct dimm_info
EDAC: Remove misleading comment in struct edac_raw_error_desc
EDAC: Store error type in struct edac_raw_error_desc
EDAC, mc: Determine mci pointer from the error descriptor
EDAC, mc: Create new function edac_inc_csrow()
EDAC, ghes: Use standard kernel macros for page calculations
EDAC, ghes: Fix grain calculation
EDAC, ghes: Remove intermediate buffer pvt->detail_location
EDAC, ghes: Unify trace_mc_event() code with edac_mc driver
EDAC, Documentation: Describe CPER module definition and DIMM ranks
Documentation/admin-guide/ras.rst | 31 +-
drivers/edac/edac_mc.c | 478 +++++++++++++++---------------
drivers/edac/edac_mc.h | 8 +-
drivers/edac/edac_mc_sysfs.c | 65 ++--
drivers/edac/ghes_edac.c | 55 ++--
drivers/edac/i10nm_base.c | 3 +-
drivers/edac/i3200_edac.c | 3 +-
drivers/edac/i5000_edac.c | 5 +-
drivers/edac/i5100_edac.c | 14 +-
drivers/edac/i5400_edac.c | 3 +-
drivers/edac/i7300_edac.c | 3 +-
drivers/edac/i7core_edac.c | 3 +-
drivers/edac/ie31200_edac.c | 7 +-
drivers/edac/pnd2_edac.c | 4 +-
drivers/edac/sb_edac.c | 2 +-
drivers/edac/skx_base.c | 3 +-
drivers/edac/ti_edac.c | 2 +-
include/linux/edac.h | 153 +++++-----
18 files changed, 401 insertions(+), 441 deletions(-)
--
2.20.1
There is no need to crash the system if edac_mc_alloc() is called with
invalid arguments; just warn and return NULL. The BUG_ON() would also
cause a checkpatch warning when touching the code later, so fix it now.
Signed-off-by: Robert Richter <[email protected]>
---
drivers/edac/edac_mc.c | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c
index c5240bb4c6c0..f2cbca77bc50 100644
--- a/drivers/edac/edac_mc.c
+++ b/drivers/edac/edac_mc.c
@@ -323,7 +323,9 @@ struct mem_ctl_info *edac_mc_alloc(unsigned int mc_num,
int i, j, row, chn, n, len;
bool per_rank = false;
- BUG_ON(n_layers > EDAC_MAX_LAYERS || n_layers == 0);
+ if (WARN_ON(n_layers > EDAC_MAX_LAYERS || n_layers == 0))
+ return NULL;
+
/*
* Calculate the total amount of dimms and csrows/cschannels while
* in the old API emulation mode
--
2.20.1
Introduce the mci_for_each_dimm() iterator. It returns a pointer to a
struct dimm_info. This makes the declaration and use of an index
obsolete and avoids access to internal data of struct mem_ctl_info
(direct array access, etc.).
Signed-off-by: Robert Richter <[email protected]>
---
drivers/edac/edac_mc.c | 19 +++++++++++--------
drivers/edac/edac_mc_sysfs.c | 29 ++++++++++++-----------------
drivers/edac/ghes_edac.c | 8 ++++----
drivers/edac/i5100_edac.c | 13 +++++--------
include/linux/edac.h | 7 +++++++
5 files changed, 39 insertions(+), 37 deletions(-)
diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c
index 72088d49b03b..c5240bb4c6c0 100644
--- a/drivers/edac/edac_mc.c
+++ b/drivers/edac/edac_mc.c
@@ -145,15 +145,18 @@ static void edac_mc_dump_channel(struct rank_info *chan)
edac_dbg(4, " channel->dimm = %p\n", chan->dimm);
}
-static void edac_mc_dump_dimm(struct dimm_info *dimm, int number)
+static void edac_mc_dump_dimm(struct dimm_info *dimm)
{
char location[80];
+ if (!dimm->nr_pages)
+ return;
+
edac_dimm_info_location(dimm, location, sizeof(location));
edac_dbg(4, "%s%i: %smapped as virtual row %d, chan %d\n",
dimm->mci->csbased ? "rank" : "dimm",
- number, location, dimm->csrow, dimm->cschannel);
+ dimm->idx, location, dimm->csrow, dimm->cschannel);
edac_dbg(4, " dimm = %p\n", dimm);
edac_dbg(4, " dimm->label = '%s'\n", dimm->label);
edac_dbg(4, " dimm->nr_pages = 0x%x\n", dimm->nr_pages);
@@ -702,6 +705,7 @@ EXPORT_SYMBOL_GPL(edac_get_owner);
int edac_mc_add_mc_with_groups(struct mem_ctl_info *mci,
const struct attribute_group **groups)
{
+ struct dimm_info *dimm;
int ret = -EINVAL;
edac_dbg(0, "\n");
@@ -726,9 +730,9 @@ int edac_mc_add_mc_with_groups(struct mem_ctl_info *mci,
if (csrow->channels[j]->dimm->nr_pages)
edac_mc_dump_channel(csrow->channels[j]);
}
- for (i = 0; i < mci->tot_dimms; i++)
- if (mci->dimms[i]->nr_pages)
- edac_mc_dump_dimm(mci->dimms[i], i);
+
+ mci_for_each_dimm(mci, dimm)
+ edac_mc_dump_dimm(dimm);
}
#endif
mutex_lock(&mem_ctls_mutex);
@@ -1086,6 +1090,7 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type,
const char *msg,
const char *other_detail)
{
+ struct dimm_info *dimm;
char *p;
int row = -1, chan = -1;
int pos[EDAC_MAX_LAYERS] = { top_layer, mid_layer, low_layer };
@@ -1146,9 +1151,7 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type,
p = e->label;
*p = '\0';
- for (i = 0; i < mci->tot_dimms; i++) {
- struct dimm_info *dimm = mci->dimms[i];
-
+ mci_for_each_dimm(mci, dimm) {
if (top_layer >= 0 && top_layer != dimm->location[0])
continue;
if (mid_layer >= 0 && mid_layer != dimm->location[1])
diff --git a/drivers/edac/edac_mc_sysfs.c b/drivers/edac/edac_mc_sysfs.c
index 91e4c8f155af..0367554e7437 100644
--- a/drivers/edac/edac_mc_sysfs.c
+++ b/drivers/edac/edac_mc_sysfs.c
@@ -621,8 +621,7 @@ static const struct device_type dimm_attr_type = {
/* Create a DIMM object under specifed memory controller device */
static int edac_create_dimm_object(struct mem_ctl_info *mci,
- struct dimm_info *dimm,
- int index)
+ struct dimm_info *dimm)
{
int err;
dimm->mci = mci;
@@ -632,9 +631,9 @@ static int edac_create_dimm_object(struct mem_ctl_info *mci,
dimm->dev.parent = &mci->dev;
if (mci->csbased)
- dev_set_name(&dimm->dev, "rank%d", index);
+ dev_set_name(&dimm->dev, "rank%d", dimm->idx);
else
- dev_set_name(&dimm->dev, "dimm%d", index);
+ dev_set_name(&dimm->dev, "dimm%d", dimm->idx);
dev_set_drvdata(&dimm->dev, dimm);
pm_runtime_forbid(&mci->dev);
@@ -916,7 +915,8 @@ static const struct device_type mci_attr_type = {
int edac_create_sysfs_mci_device(struct mem_ctl_info *mci,
const struct attribute_group **groups)
{
- int i, err;
+ struct dimm_info *dimm;
+ int err;
/* get the /sys/devices/system/edac subsys reference */
mci->dev.type = &mci_attr_type;
@@ -940,13 +940,12 @@ int edac_create_sysfs_mci_device(struct mem_ctl_info *mci,
/*
* Create the dimm/rank devices
*/
- for (i = 0; i < mci->tot_dimms; i++) {
- struct dimm_info *dimm = mci->dimms[i];
+ mci_for_each_dimm(mci, dimm) {
/* Only expose populated DIMMs */
if (!dimm->nr_pages)
continue;
- err = edac_create_dimm_object(mci, dimm, i);
+ err = edac_create_dimm_object(mci, dimm);
if (err)
goto fail_unregister_dimm;
}
@@ -961,12 +960,9 @@ int edac_create_sysfs_mci_device(struct mem_ctl_info *mci,
return 0;
fail_unregister_dimm:
- for (i--; i >= 0; i--) {
- struct dimm_info *dimm = mci->dimms[i];
- if (!dimm->nr_pages)
- continue;
-
- device_unregister(&dimm->dev);
+ mci_for_each_dimm(mci, dimm) {
+ if (device_is_registered(&dimm->dev))
+ device_unregister(&dimm->dev);
}
device_unregister(&mci->dev);
@@ -978,7 +974,7 @@ int edac_create_sysfs_mci_device(struct mem_ctl_info *mci,
*/
void edac_remove_sysfs_mci_device(struct mem_ctl_info *mci)
{
- int i;
+ struct dimm_info *dimm;
edac_dbg(0, "\n");
@@ -989,8 +985,7 @@ void edac_remove_sysfs_mci_device(struct mem_ctl_info *mci)
edac_delete_csrow_objects(mci);
#endif
- for (i = 0; i < mci->tot_dimms; i++) {
- struct dimm_info *dimm = mci->dimms[i];
+ mci_for_each_dimm(mci, dimm) {
if (dimm->nr_pages == 0)
continue;
edac_dbg(1, "unregistering device %s\n", dev_name(&dimm->dev));
diff --git a/drivers/edac/ghes_edac.c b/drivers/edac/ghes_edac.c
index fb31e80dfe94..842080d7b33a 100644
--- a/drivers/edac/ghes_edac.c
+++ b/drivers/edac/ghes_edac.c
@@ -82,11 +82,11 @@ static void ghes_edac_count_dimms(const struct dmi_header *dh, void *arg)
static int get_dimm_smbios_index(u16 handle)
{
struct mem_ctl_info *mci = ghes_pvt->mci;
- int i;
+ struct dimm_info *dimm;
- for (i = 0; i < mci->tot_dimms; i++) {
- if (mci->dimms[i]->smbios_handle == handle)
- return i;
+ mci_for_each_dimm(mci, dimm) {
+ if (dimm->smbios_handle == handle)
+ return dimm->idx;
}
return -1;
}
diff --git a/drivers/edac/i5100_edac.c b/drivers/edac/i5100_edac.c
index 134586753311..0ddc41e47a96 100644
--- a/drivers/edac/i5100_edac.c
+++ b/drivers/edac/i5100_edac.c
@@ -846,20 +846,17 @@ static void i5100_init_interleaving(struct pci_dev *pdev,
static void i5100_init_csrows(struct mem_ctl_info *mci)
{
- int i;
struct i5100_priv *priv = mci->pvt_info;
+ struct dimm_info *dimm;
- for (i = 0; i < mci->tot_dimms; i++) {
- struct dimm_info *dimm;
- const unsigned long npages = i5100_npages(mci, i);
- const unsigned int chan = i5100_csrow_to_chan(mci, i);
- const unsigned int rank = i5100_csrow_to_rank(mci, i);
+ mci_for_each_dimm(mci, dimm) {
+ const unsigned long npages = i5100_npages(mci, dimm->idx);
+ const unsigned int chan = i5100_csrow_to_chan(mci, dimm->idx);
+ const unsigned int rank = i5100_csrow_to_rank(mci, dimm->idx);
if (!npages)
continue;
- dimm = edac_get_dimm(mci, chan, rank, 0);
-
dimm->nr_pages = npages;
dimm->grain = 32;
dimm->dtype = (priv->mtr[chan][rank].width == 4) ?
diff --git a/include/linux/edac.h b/include/linux/edac.h
index 79c5564ee314..8beb6e834be9 100644
--- a/include/linux/edac.h
+++ b/include/linux/edac.h
@@ -599,6 +599,13 @@ struct mem_ctl_info {
u16 fake_inject_count;
};
+#define mci_for_each_dimm(mci, dimm) \
+ for ((dimm) = (mci)->dimms[0]; \
+ (dimm); \
+ (dimm) = (dimm)->idx + 1 < (mci)->tot_dimms \
+ ? (mci)->dimms[(dimm)->idx + 1] \
+ : NULL)
+
/**
* edac_get_dimm_by_index - Get DIMM info from a memory controller
* given by an index
--
2.20.1
The EDAC_DIMM_OFF() macro takes 5 arguments to get the DIMM's index.
This can be much simplified now as the offset is already stored in
struct dimm_info. Use it directly and completely remove the
EDAC_DIMM_OFF() macro.
Another advantage is that edac_mc_alloc() could be used even if the
exact size of the layers is unknown. Only the number of DIMMs would be
needed.
Signed-off-by: Robert Richter <[email protected]>
---
drivers/edac/edac_mc.c | 15 +++++--------
drivers/edac/edac_mc_sysfs.c | 20 ++++--------------
include/linux/edac.h | 41 ------------------------------------
3 files changed, 9 insertions(+), 67 deletions(-)
diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c
index 3d45adb5957f..72088d49b03b 100644
--- a/drivers/edac/edac_mc.c
+++ b/drivers/edac/edac_mc.c
@@ -314,10 +314,10 @@ struct mem_ctl_info *edac_mc_alloc(unsigned int mc_num,
struct dimm_info *dimm;
u32 *ce_per_layer[EDAC_MAX_LAYERS], *ue_per_layer[EDAC_MAX_LAYERS];
unsigned int pos[EDAC_MAX_LAYERS];
- unsigned int size, tot_dimms = 1, count = 1;
+ unsigned int idx, size, tot_dimms = 1, count = 1;
unsigned int tot_csrows = 1, tot_channels = 1, tot_errcount = 0;
void *pvt, *p, *ptr = NULL;
- int i, j, row, chn, n, len, off;
+ int i, j, row, chn, n, len;
bool per_rank = false;
BUG_ON(n_layers > EDAC_MAX_LAYERS || n_layers == 0);
@@ -425,20 +425,15 @@ struct mem_ctl_info *edac_mc_alloc(unsigned int mc_num,
memset(&pos, 0, sizeof(pos));
row = 0;
chn = 0;
- for (i = 0; i < tot_dimms; i++) {
+ for (idx = 0; idx < tot_dimms; idx++) {
chan = mci->csrows[row]->channels[chn];
- off = EDAC_DIMM_OFF(layer, n_layers, pos[0], pos[1], pos[2]);
- if (off < 0 || off >= tot_dimms) {
- edac_mc_printk(mci, KERN_ERR, "EDAC core bug: EDAC_DIMM_OFF is trying to do an illegal data access\n");
- goto error;
- }
dimm = kzalloc(sizeof(**mci->dimms), GFP_KERNEL);
if (!dimm)
goto error;
- mci->dimms[off] = dimm;
+ mci->dimms[idx] = dimm;
dimm->mci = mci;
- dimm->idx = off;
+ dimm->idx = idx;
/*
* Copy DIMM location and initialize it.
diff --git a/drivers/edac/edac_mc_sysfs.c b/drivers/edac/edac_mc_sysfs.c
index 32d016f1ecd1..91e4c8f155af 100644
--- a/drivers/edac/edac_mc_sysfs.c
+++ b/drivers/edac/edac_mc_sysfs.c
@@ -557,14 +557,8 @@ static ssize_t dimmdev_ce_count_show(struct device *dev,
{
struct dimm_info *dimm = to_dimm(dev);
u32 count;
- int off;
-
- off = EDAC_DIMM_OFF(dimm->mci->layers,
- dimm->mci->n_layers,
- dimm->location[0],
- dimm->location[1],
- dimm->location[2]);
- count = dimm->mci->ce_per_layer[dimm->mci->n_layers-1][off];
+
+ count = dimm->mci->ce_per_layer[dimm->mci->n_layers-1][dimm->idx];
return sprintf(data, "%u\n", count);
}
@@ -574,14 +568,8 @@ static ssize_t dimmdev_ue_count_show(struct device *dev,
{
struct dimm_info *dimm = to_dimm(dev);
u32 count;
- int off;
-
- off = EDAC_DIMM_OFF(dimm->mci->layers,
- dimm->mci->n_layers,
- dimm->location[0],
- dimm->location[1],
- dimm->location[2]);
- count = dimm->mci->ue_per_layer[dimm->mci->n_layers-1][off];
+
+ count = dimm->mci->ue_per_layer[dimm->mci->n_layers-1][dimm->idx];
return sprintf(data, "%u\n", count);
}
diff --git a/include/linux/edac.h b/include/linux/edac.h
index 7f9804438589..79c5564ee314 100644
--- a/include/linux/edac.h
+++ b/include/linux/edac.h
@@ -362,47 +362,6 @@ struct edac_mc_layer {
*/
#define EDAC_MAX_LAYERS 3
-/**
- * EDAC_DIMM_OFF - Macro responsible to get a pointer offset inside a pointer
- * array for the element given by [layer0,layer1,layer2]
- * position
- *
- * @layers: a struct edac_mc_layer array, describing how many elements
- * were allocated for each layer
- * @nlayers: Number of layers at the @layers array
- * @layer0: layer0 position
- * @layer1: layer1 position. Unused if n_layers < 2
- * @layer2: layer2 position. Unused if n_layers < 3
- *
- * For 1 layer, this macro returns "var[layer0] - var";
- *
- * For 2 layers, this macro is similar to allocate a bi-dimensional array
- * and to return "var[layer0][layer1] - var";
- *
- * For 3 layers, this macro is similar to allocate a tri-dimensional array
- * and to return "var[layer0][layer1][layer2] - var".
- *
- * A loop could be used here to make it more generic, but, as we only have
- * 3 layers, this is a little faster.
- *
- * By design, layers can never be 0 or more than 3. If that ever happens,
- * a NULL is returned, causing an OOPS during the memory allocation routine,
- * with would point to the developer that he's doing something wrong.
- */
-#define EDAC_DIMM_OFF(layers, nlayers, layer0, layer1, layer2) ({ \
- int __i; \
- if ((nlayers) == 1) \
- __i = layer0; \
- else if ((nlayers) == 2) \
- __i = (layer1) + ((layers[1]).size * (layer0)); \
- else if ((nlayers) == 3) \
- __i = (layer2) + ((layers[2]).size * ((layer1) + \
- ((layers[1]).size * (layer0)))); \
- else \
- __i = -EINVAL; \
- __i; \
-})
-
struct dimm_info {
struct device dev;
--
2.20.1
The EDAC_DIMM_PTR() macro takes 3 arguments from struct mem_ctl_info.
Clean up this interface to only pass the mci struct and replace this
macro with the new function edac_get_dimm().
Also introduce the edac_get_dimm_by_index() function for later use.
It allows getting a DIMM pointer from an index alone. This can be
useful if the DIMM's position within the layers of the memory
controller or the exact size of the layers are unknown.
Small style changes made for some hunks after applying the semantic
patch.
Semantic patch used:
@@ expression mci, a, b,c; @@
-EDAC_DIMM_PTR(mci->layers, mci->dimms, mci->n_layers, a, b, c)
+edac_get_dimm(mci, a, b, c)
Signed-off-by: Robert Richter <[email protected]>
---
drivers/edac/edac_mc.c | 1 +
drivers/edac/ghes_edac.c | 7 +--
drivers/edac/i10nm_base.c | 3 +-
drivers/edac/i3200_edac.c | 3 +-
drivers/edac/i5000_edac.c | 5 +-
drivers/edac/i5100_edac.c | 3 +-
drivers/edac/i5400_edac.c | 3 +-
drivers/edac/i7300_edac.c | 3 +-
drivers/edac/i7core_edac.c | 3 +-
drivers/edac/ie31200_edac.c | 7 +--
drivers/edac/pnd2_edac.c | 4 +-
drivers/edac/sb_edac.c | 2 +-
drivers/edac/skx_base.c | 3 +-
drivers/edac/ti_edac.c | 2 +-
include/linux/edac.h | 92 ++++++++++++++++++++++++-------------
15 files changed, 79 insertions(+), 62 deletions(-)
diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c
index e6fd079783bd..3d45adb5957f 100644
--- a/drivers/edac/edac_mc.c
+++ b/drivers/edac/edac_mc.c
@@ -438,6 +438,7 @@ struct mem_ctl_info *edac_mc_alloc(unsigned int mc_num,
goto error;
mci->dimms[off] = dimm;
dimm->mci = mci;
+ dimm->idx = off;
/*
* Copy DIMM location and initialize it.
diff --git a/drivers/edac/ghes_edac.c b/drivers/edac/ghes_edac.c
index d413a0bdc9ad..fb31e80dfe94 100644
--- a/drivers/edac/ghes_edac.c
+++ b/drivers/edac/ghes_edac.c
@@ -98,9 +98,7 @@ static void ghes_edac_dmidecode(const struct dmi_header *dh, void *arg)
if (dh->type == DMI_ENTRY_MEM_DEVICE) {
struct memdev_dmi_entry *entry = (struct memdev_dmi_entry *)dh;
- struct dimm_info *dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms,
- mci->n_layers,
- dimm_fill->count, 0, 0);
+ struct dimm_info *dimm = edac_get_dimm(mci, dimm_fill->count, 0, 0);
u16 rdr_mask = BIT(7) | BIT(13);
if (entry->size == 0xffff) {
@@ -527,8 +525,7 @@ int ghes_edac_register(struct ghes *ghes, struct device *dev)
dimm_fill.mci = mci;
dmi_walk(ghes_edac_dmidecode, &dimm_fill);
} else {
- struct dimm_info *dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms,
- mci->n_layers, 0, 0, 0);
+ struct dimm_info *dimm = edac_get_dimm(mci, 0, 0, 0);
dimm->nr_pages = 1;
dimm->grain = 128;
diff --git a/drivers/edac/i10nm_base.c b/drivers/edac/i10nm_base.c
index c370d5457e6b..059eccf0582b 100644
--- a/drivers/edac/i10nm_base.c
+++ b/drivers/edac/i10nm_base.c
@@ -154,8 +154,7 @@ static int i10nm_get_dimm_config(struct mem_ctl_info *mci)
ndimms = 0;
for (j = 0; j < I10NM_NUM_DIMMS; j++) {
- dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms,
- mci->n_layers, i, j, 0);
+ dimm = edac_get_dimm(mci, i, j, 0);
mtr = I10NM_GET_DIMMMTR(imc, i, j);
mcddrtcfg = I10NM_GET_MCDDRTCFG(imc, i, j);
edac_dbg(1, "dimmmtr 0x%x mcddrtcfg 0x%x (mc%d ch%d dimm%d)\n",
diff --git a/drivers/edac/i3200_edac.c b/drivers/edac/i3200_edac.c
index 299b441647cd..432b375a4075 100644
--- a/drivers/edac/i3200_edac.c
+++ b/drivers/edac/i3200_edac.c
@@ -392,8 +392,7 @@ static int i3200_probe1(struct pci_dev *pdev, int dev_idx)
unsigned long nr_pages;
for (j = 0; j < nr_channels; j++) {
- struct dimm_info *dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms,
- mci->n_layers, i, j, 0);
+ struct dimm_info *dimm = edac_get_dimm(mci, i, j, 0);
nr_pages = drb_to_nr_pages(drbs, stacked, j, i);
if (nr_pages == 0)
diff --git a/drivers/edac/i5000_edac.c b/drivers/edac/i5000_edac.c
index 078a7351bf05..1a6f69c859ab 100644
--- a/drivers/edac/i5000_edac.c
+++ b/drivers/edac/i5000_edac.c
@@ -1275,9 +1275,8 @@ static int i5000_init_csrows(struct mem_ctl_info *mci)
if (!MTR_DIMMS_PRESENT(mtr))
continue;
- dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms, mci->n_layers,
- channel / MAX_BRANCHES,
- channel % MAX_BRANCHES, slot);
+ dimm = edac_get_dimm(mci, channel / MAX_BRANCHES,
+ channel % MAX_BRANCHES, slot);
csrow_megs = pvt->dimm_info[slot][channel].megabytes;
dimm->grain = 8;
diff --git a/drivers/edac/i5100_edac.c b/drivers/edac/i5100_edac.c
index 12bebecb203b..134586753311 100644
--- a/drivers/edac/i5100_edac.c
+++ b/drivers/edac/i5100_edac.c
@@ -858,8 +858,7 @@ static void i5100_init_csrows(struct mem_ctl_info *mci)
if (!npages)
continue;
- dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms, mci->n_layers,
- chan, rank, 0);
+ dimm = edac_get_dimm(mci, chan, rank, 0);
dimm->nr_pages = npages;
dimm->grain = 32;
diff --git a/drivers/edac/i5400_edac.c b/drivers/edac/i5400_edac.c
index 8c86c6fd7da7..f131c05ade9f 100644
--- a/drivers/edac/i5400_edac.c
+++ b/drivers/edac/i5400_edac.c
@@ -1187,8 +1187,7 @@ static int i5400_init_dimms(struct mem_ctl_info *mci)
if (!MTR_DIMMS_PRESENT(mtr))
continue;
- dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms, mci->n_layers,
- channel / 2, channel % 2, slot);
+ dimm = edac_get_dimm(mci, channel / 2, channel % 2, slot);
size_mb = pvt->dimm_info[slot][channel].megabytes;
diff --git a/drivers/edac/i7300_edac.c b/drivers/edac/i7300_edac.c
index 447d357c7a67..2e9bbe56cde9 100644
--- a/drivers/edac/i7300_edac.c
+++ b/drivers/edac/i7300_edac.c
@@ -794,8 +794,7 @@ static int i7300_init_csrows(struct mem_ctl_info *mci)
for (ch = 0; ch < max_channel; ch++) {
int channel = to_channel(ch, branch);
- dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms,
- mci->n_layers, branch, ch, slot);
+ dimm = edac_get_dimm(mci, branch, ch, slot);
dinfo = &pvt->dimm_info[slot][channel];
diff --git a/drivers/edac/i7core_edac.c b/drivers/edac/i7core_edac.c
index a71cca6eeb33..b3135b208f9a 100644
--- a/drivers/edac/i7core_edac.c
+++ b/drivers/edac/i7core_edac.c
@@ -585,8 +585,7 @@ static int get_dimm_config(struct mem_ctl_info *mci)
if (!DIMM_PRESENT(dimm_dod[j]))
continue;
- dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms, mci->n_layers,
- i, j, 0);
+ dimm = edac_get_dimm(mci, i, j, 0);
banks = numbank(MC_DOD_NUMBANK(dimm_dod[j]));
ranks = numrank(MC_DOD_NUMRANK(dimm_dod[j]));
rows = numrow(MC_DOD_NUMROW(dimm_dod[j]));
diff --git a/drivers/edac/ie31200_edac.c b/drivers/edac/ie31200_edac.c
index d26300f9cb07..4f65073f230b 100644
--- a/drivers/edac/ie31200_edac.c
+++ b/drivers/edac/ie31200_edac.c
@@ -490,9 +490,7 @@ static int ie31200_probe1(struct pci_dev *pdev, int dev_idx)
if (dimm_info[j][i].dual_rank) {
nr_pages = nr_pages / 2;
- dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms,
- mci->n_layers, (i * 2) + 1,
- j, 0);
+ dimm = edac_get_dimm(mci, (i * 2) + 1, j, 0);
dimm->nr_pages = nr_pages;
edac_dbg(0, "set nr pages: 0x%lx\n", nr_pages);
dimm->grain = 8; /* just a guess */
@@ -503,8 +501,7 @@ static int ie31200_probe1(struct pci_dev *pdev, int dev_idx)
dimm->dtype = DEV_UNKNOWN;
dimm->edac_mode = EDAC_UNKNOWN;
}
- dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms,
- mci->n_layers, i * 2, j, 0);
+ dimm = edac_get_dimm(mci, i * 2, j, 0);
dimm->nr_pages = nr_pages;
edac_dbg(0, "set nr pages: 0x%lx\n", nr_pages);
dimm->grain = 8; /* same guess */
diff --git a/drivers/edac/pnd2_edac.c b/drivers/edac/pnd2_edac.c
index b1193be1ef1d..933f7722b893 100644
--- a/drivers/edac/pnd2_edac.c
+++ b/drivers/edac/pnd2_edac.c
@@ -1231,7 +1231,7 @@ static void apl_get_dimm_config(struct mem_ctl_info *mci)
if (!(chan_mask & BIT(i)))
continue;
- dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms, mci->n_layers, i, 0, 0);
+ dimm = edac_get_dimm(mci, i, 0, 0);
if (!dimm) {
edac_dbg(0, "No allocated DIMM for channel %d\n", i);
continue;
@@ -1311,7 +1311,7 @@ static void dnv_get_dimm_config(struct mem_ctl_info *mci)
if (!ranks_of_dimm[j])
continue;
- dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms, mci->n_layers, i, j, 0);
+ dimm = edac_get_dimm(mci, i, j, 0);
if (!dimm) {
edac_dbg(0, "No allocated DIMM for channel %d DIMM %d\n", i, j);
continue;
diff --git a/drivers/edac/sb_edac.c b/drivers/edac/sb_edac.c
index a2fd39d330d6..4957e8ee1879 100644
--- a/drivers/edac/sb_edac.c
+++ b/drivers/edac/sb_edac.c
@@ -1621,7 +1621,7 @@ static int __populate_dimms(struct mem_ctl_info *mci,
}
for (j = 0; j < max_dimms_per_channel; j++) {
- dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms, mci->n_layers, i, j, 0);
+ dimm = edac_get_dimm(mci, i, j, 0);
if (pvt->info.type == KNIGHTS_LANDING) {
pci_read_config_dword(pvt->knl.pci_channel[i],
knl_mtr_reg, &mtr);
diff --git a/drivers/edac/skx_base.c b/drivers/edac/skx_base.c
index 0fcf3785e8f3..8895eda365ff 100644
--- a/drivers/edac/skx_base.c
+++ b/drivers/edac/skx_base.c
@@ -177,8 +177,7 @@ static int skx_get_dimm_config(struct mem_ctl_info *mci)
pci_read_config_dword(imc->chan[i].cdev, 0x8C, &amap);
pci_read_config_dword(imc->chan[i].cdev, 0x400, &mcddrtcfg);
for (j = 0; j < SKX_NUM_DIMMS; j++) {
- dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms,
- mci->n_layers, i, j, 0);
+ dimm = edac_get_dimm(mci, i, j, 0);
pci_read_config_dword(imc->chan[i].cdev,
0x80 + 4 * j, &mtr);
if (IS_DIMM_PRESENT(mtr)) {
diff --git a/drivers/edac/ti_edac.c b/drivers/edac/ti_edac.c
index 6ac26d1b929f..8be3e89a510e 100644
--- a/drivers/edac/ti_edac.c
+++ b/drivers/edac/ti_edac.c
@@ -135,7 +135,7 @@ static void ti_edac_setup_dimm(struct mem_ctl_info *mci, u32 type)
u32 val;
u32 memsize;
- dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms, mci->n_layers, 0, 0, 0);
+ dimm = edac_get_dimm(mci, 0, 0, 0);
val = ti_edac_readl(edac, EMIF_SDRAM_CONFIG);
diff --git a/include/linux/edac.h b/include/linux/edac.h
index c19483b90079..7f9804438589 100644
--- a/include/linux/edac.h
+++ b/include/linux/edac.h
@@ -403,37 +403,6 @@ struct edac_mc_layer {
__i; \
})
-/**
- * EDAC_DIMM_PTR - Macro responsible to get a pointer inside a pointer array
- * for the element given by [layer0,layer1,layer2] position
- *
- * @layers: a struct edac_mc_layer array, describing how many elements
- * were allocated for each layer
- * @var: name of the var where we want to get the pointer
- * (like mci->dimms)
- * @nlayers: Number of layers at the @layers array
- * @layer0: layer0 position
- * @layer1: layer1 position. Unused if n_layers < 2
- * @layer2: layer2 position. Unused if n_layers < 3
- *
- * For 1 layer, this macro returns "var[layer0]";
- *
- * For 2 layers, this macro is similar to allocate a bi-dimensional array
- * and to return "var[layer0][layer1]";
- *
- * For 3 layers, this macro is similar to allocate a tri-dimensional array
- * and to return "var[layer0][layer1][layer2]";
- */
-#define EDAC_DIMM_PTR(layers, var, nlayers, layer0, layer1, layer2) ({ \
- typeof(*var) __p; \
- int ___i = EDAC_DIMM_OFF(layers, nlayers, layer0, layer1, layer2); \
- if (___i < 0) \
- __p = NULL; \
- else \
- __p = (var)[___i]; \
- __p; \
-})
-
struct dimm_info {
struct device dev;
@@ -443,6 +412,7 @@ struct dimm_info {
unsigned int location[EDAC_MAX_LAYERS];
struct mem_ctl_info *mci; /* the parent */
+ unsigned int idx; /* index within the parent dimm array */
u32 grain; /* granularity of reported error in bytes */
enum dev_type dtype; /* memory device type */
@@ -669,4 +639,64 @@ struct mem_ctl_info {
bool fake_inject_ue;
u16 fake_inject_count;
};
+
+/**
+ * edac_get_dimm_by_index - Get DIMM info from a memory controller
+ * given by an index
+ *
+ * @mci: a struct mem_ctl_info
+ * @index: index in the memory controller's DIMM array
+ *
+ * Returns a struct dimm_info* or NULL on failure.
+ */
+static inline struct dimm_info *
+edac_get_dimm_by_index(struct mem_ctl_info *mci, int index)
+{
+ if (index < 0 || index >= mci->tot_dimms)
+ return NULL;
+
+ if (WARN_ON_ONCE(mci->dimms[index]->idx != index))
+ return NULL;
+
+ return mci->dimms[index];
+}
+
+/**
+ * edac_get_dimm - Get DIMM info from a memory controller given by
+ * [layer0,layer1,layer2] position
+ *
+ * @mci: a struct mem_ctl_info
+ * @layer0: layer0 position
+ * @layer1: layer1 position. Unused if n_layers < 2
+ * @layer2: layer2 position. Unused if n_layers < 3
+ *
+ * For 1 layer, this function returns "dimms[layer0]";
+ *
+ * For 2 layers, this function is similar to allocate a bi-dimensional
+ * array and to return "dimms[layer0][layer1]";
+ *
+ * For 3 layers, this function is similar to allocate a tri-dimensional array
+ * and to return "dimms[layer0][layer1][layer2]";
+ */
+static inline struct dimm_info *edac_get_dimm(struct mem_ctl_info *mci,
+ int layer0, int layer1, int layer2)
+{
+ int index;
+
+ if (layer0 < 0
+ || (mci->n_layers > 1 && layer1 < 0)
+ || (mci->n_layers > 2 && layer2 < 0))
+ return NULL;
+
+ index = layer0;
+
+ if (mci->n_layers > 1)
+ index = index * mci->layers[1].size + layer1;
+
+ if (mci->n_layers > 2)
+ index = index * mci->layers[2].size + layer2;
+
+ return edac_get_dimm_by_index(mci, index);
+}
+
#endif
--
2.20.1
The error handling functions have the pos[] array argument for
determining the dimm handle. Rework those functions to use the dimm
handle directly.
Signed-off-by: Robert Richter <[email protected]>
---
drivers/edac/edac_mc.c | 28 +++++++++++++---------------
drivers/edac/edac_mc.h | 2 ++
drivers/edac/ghes_edac.c | 6 +++++-
3 files changed, 20 insertions(+), 16 deletions(-)
diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c
index 6d880cf4d599..cdfb383f7a35 100644
--- a/drivers/edac/edac_mc.c
+++ b/drivers/edac/edac_mc.c
@@ -925,11 +925,9 @@ const char *edac_layer_name[] = {
EXPORT_SYMBOL_GPL(edac_layer_name);
static void edac_inc_ce_error(struct mem_ctl_info *mci,
- const int pos[EDAC_MAX_LAYERS],
+ struct dimm_info *dimm,
const u16 count)
{
- struct dimm_info *dimm = edac_get_dimm(mci, pos[0], pos[1], pos[2]);
-
mci->ce_mc += count;
if (dimm)
@@ -939,11 +937,9 @@ static void edac_inc_ce_error(struct mem_ctl_info *mci,
}
static void edac_inc_ue_error(struct mem_ctl_info *mci,
- const int pos[EDAC_MAX_LAYERS],
- const u16 count)
+ struct dimm_info *dimm,
+ const u16 count)
{
- struct dimm_info *dimm = edac_get_dimm(mci, pos[0], pos[1], pos[2]);
-
mci->ue_mc += count;
if (dimm)
@@ -953,8 +949,8 @@ static void edac_inc_ue_error(struct mem_ctl_info *mci,
}
static void edac_ce_error(struct mem_ctl_info *mci,
+ struct dimm_info *dimm,
const u16 error_count,
- const int pos[EDAC_MAX_LAYERS],
const char *msg,
const char *location,
const char *label,
@@ -982,7 +978,7 @@ static void edac_ce_error(struct mem_ctl_info *mci,
error_count, msg, msg_aux, label,
location, detail);
}
- edac_inc_ce_error(mci, pos, error_count);
+ edac_inc_ce_error(mci, dimm, error_count);
if (mci->scrub_mode == SCRUB_SW_SRC) {
/*
@@ -1006,8 +1002,8 @@ static void edac_ce_error(struct mem_ctl_info *mci,
}
static void edac_ue_error(struct mem_ctl_info *mci,
+ struct dimm_info *dimm,
const u16 error_count,
- const int pos[EDAC_MAX_LAYERS],
const char *msg,
const char *location,
const char *label,
@@ -1041,15 +1037,15 @@ static void edac_ue_error(struct mem_ctl_info *mci,
msg, msg_aux, label, location, detail);
}
- edac_inc_ue_error(mci, pos, error_count);
+ edac_inc_ue_error(mci, dimm, error_count);
}
void edac_raw_mc_handle_error(const enum hw_event_mc_err_type type,
struct mem_ctl_info *mci,
+ struct dimm_info *dimm,
struct edac_raw_error_desc *e)
{
char detail[80];
- int pos[EDAC_MAX_LAYERS] = { e->top_layer, e->mid_layer, e->low_layer };
/* Memory type dependent details about the error */
if (type == HW_EVENT_ERR_CORRECTED) {
@@ -1057,7 +1053,7 @@ void edac_raw_mc_handle_error(const enum hw_event_mc_err_type type,
"page:0x%lx offset:0x%lx grain:%ld syndrome:0x%lx",
e->page_frame_number, e->offset_in_page,
e->grain, e->syndrome);
- edac_ce_error(mci, e->error_count, pos, e->msg, e->location,
+ edac_ce_error(mci, dimm, e->error_count, e->msg, e->location,
e->label, detail, e->other_detail,
e->page_frame_number, e->offset_in_page, e->grain);
} else {
@@ -1065,7 +1061,7 @@ void edac_raw_mc_handle_error(const enum hw_event_mc_err_type type,
"page:0x%lx offset:0x%lx grain:%ld",
e->page_frame_number, e->offset_in_page, e->grain);
- edac_ue_error(mci, e->error_count, pos, e->msg, e->location,
+ edac_ue_error(mci, dimm, e->error_count, e->msg, e->location,
e->label, detail, e->other_detail);
}
@@ -1245,6 +1241,8 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type,
(e->page_frame_number << PAGE_SHIFT) | e->offset_in_page,
grain_bits, e->syndrome, e->other_detail);
- edac_raw_mc_handle_error(type, mci, e);
+ dimm = edac_get_dimm(mci, top_layer, mid_layer, low_layer);
+
+ edac_raw_mc_handle_error(type, mci, dimm, e);
}
EXPORT_SYMBOL_GPL(edac_mc_handle_error);
diff --git a/drivers/edac/edac_mc.h b/drivers/edac/edac_mc.h
index 02aac5c61d00..2c3e2fbcedc4 100644
--- a/drivers/edac/edac_mc.h
+++ b/drivers/edac/edac_mc.h
@@ -214,6 +214,7 @@ extern int edac_mc_find_csrow_by_page(struct mem_ctl_info *mci,
*
* @type: severity of the error (CE/UE/Fatal)
* @mci: a struct mem_ctl_info pointer
+ * @dimm: a struct dimm_info pointer
* @e: error description
*
* This raw function is used internally by edac_mc_handle_error(). It should
@@ -222,6 +223,7 @@ extern int edac_mc_find_csrow_by_page(struct mem_ctl_info *mci,
*/
void edac_raw_mc_handle_error(const enum hw_event_mc_err_type type,
struct mem_ctl_info *mci,
+ struct dimm_info *dimm,
struct edac_raw_error_desc *e);
/**
diff --git a/drivers/edac/ghes_edac.c b/drivers/edac/ghes_edac.c
index e0b90c6d7d63..4f5721cf4380 100644
--- a/drivers/edac/ghes_edac.c
+++ b/drivers/edac/ghes_edac.c
@@ -193,6 +193,7 @@ static void ghes_edac_dmidecode(const struct dmi_header *dh, void *arg)
void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err)
{
+ struct dimm_info *dimm;
enum hw_event_mc_err_type type;
struct edac_raw_error_desc *e;
struct mem_ctl_info *mci;
@@ -437,7 +438,10 @@ void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err)
(e->page_frame_number << PAGE_SHIFT) | e->offset_in_page,
grain_bits, e->syndrome, pvt->detail_location);
- edac_raw_mc_handle_error(type, mci, e);
+ dimm = edac_get_dimm_by_index(mci, e->top_layer);
+
+ edac_raw_mc_handle_error(type, mci, dimm, e);
+
spin_unlock_irqrestore(&ghes_lock, flags);
}
--
2.20.1
Reduce the indentation level in edac_mc_handle_error() a bit by using
continue. No functional changes.
Signed-off-by: Robert Richter <[email protected]>
---
drivers/edac/edac_mc.c | 59 +++++++++++++++++++++---------------------
1 file changed, 30 insertions(+), 29 deletions(-)
diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c
index f2cbca77bc50..45b02bb31964 100644
--- a/drivers/edac/edac_mc.c
+++ b/drivers/edac/edac_mc.c
@@ -1171,37 +1171,38 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type,
* channel/memory controller/... may be affected.
* Also, don't show errors for empty DIMM slots.
*/
- if (e->enable_per_layer_report && dimm->nr_pages) {
- if (n_labels >= EDAC_MAX_LABELS) {
- e->enable_per_layer_report = false;
- break;
- }
- n_labels++;
- if (p != e->label) {
- strcpy(p, OTHER_LABEL);
- p += strlen(OTHER_LABEL);
- }
- strcpy(p, dimm->label);
- p += strlen(p);
- *p = '\0';
+ if (!e->enable_per_layer_report || !dimm->nr_pages)
+ continue;
- /*
- * get csrow/channel of the DIMM, in order to allow
- * incrementing the compat API counters
- */
- edac_dbg(4, "%s csrows map: (%d,%d)\n",
- mci->csbased ? "rank" : "dimm",
- dimm->csrow, dimm->cschannel);
- if (row == -1)
- row = dimm->csrow;
- else if (row >= 0 && row != dimm->csrow)
- row = -2;
-
- if (chan == -1)
- chan = dimm->cschannel;
- else if (chan >= 0 && chan != dimm->cschannel)
- chan = -2;
+ if (n_labels >= EDAC_MAX_LABELS) {
+ e->enable_per_layer_report = false;
+ break;
+ }
+ n_labels++;
+ if (p != e->label) {
+ strcpy(p, OTHER_LABEL);
+ p += strlen(OTHER_LABEL);
}
+ strcpy(p, dimm->label);
+ p += strlen(p);
+ *p = '\0';
+
+ /*
+ * get csrow/channel of the DIMM, in order to allow
+ * incrementing the compat API counters
+ */
+ edac_dbg(4, "%s csrows map: (%d,%d)\n",
+ mci->csbased ? "rank" : "dimm",
+ dimm->csrow, dimm->cschannel);
+ if (row == -1)
+ row = dimm->csrow;
+ else if (row >= 0 && row != dimm->csrow)
+ row = -2;
+
+ if (chan == -1)
+ chan = dimm->cschannel;
+ else if (chan >= 0 && chan != dimm->cschannel)
+ chan = -2;
}
if (!e->enable_per_layer_report) {
--
2.20.1
Use standard macros for page calculations.
Reviewed-by: James Morse <[email protected]>
Signed-off-by: Robert Richter <[email protected]>
---
drivers/edac/ghes_edac.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/drivers/edac/ghes_edac.c b/drivers/edac/ghes_edac.c
index 8078d4ec9631..851aad92e42d 100644
--- a/drivers/edac/ghes_edac.c
+++ b/drivers/edac/ghes_edac.c
@@ -309,8 +309,8 @@ void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err)
/* Error address */
if (mem_err->validation_bits & CPER_MEM_VALID_PA) {
- e->page_frame_number = mem_err->physical_addr >> PAGE_SHIFT;
- e->offset_in_page = mem_err->physical_addr & ~PAGE_MASK;
+ e->page_frame_number = PHYS_PFN(mem_err->physical_addr);
+ e->offset_in_page = offset_in_page(mem_err->physical_addr);
}
/* Error grain */
--
2.20.1
detail_location[] is used to collect two location strings so they can
be passed as one to trace_mc_event(). Instead of having an extra copy
step, assemble the location string in other_detail[] from the
beginning.
Using other_detail[] to call trace_mc_event() is now the same as in
edac_mc.c and code can be unified.
Reviewed-by: James Morse <[email protected]>
Signed-off-by: Robert Richter <[email protected]>
---
drivers/edac/ghes_edac.c | 9 ++++-----
1 file changed, 4 insertions(+), 5 deletions(-)
diff --git a/drivers/edac/ghes_edac.c b/drivers/edac/ghes_edac.c
index 97242cf18a88..8d9d3c4dbebb 100644
--- a/drivers/edac/ghes_edac.c
+++ b/drivers/edac/ghes_edac.c
@@ -21,8 +21,7 @@ struct ghes_edac_pvt {
struct mem_ctl_info *mci;
/* Buffers for the error handling routine */
- char detail_location[240];
- char other_detail[160];
+ char other_detail[400];
char msg[80];
};
@@ -356,6 +355,8 @@ void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err)
/* All other fields are mapped on e->other_detail */
p = pvt->other_detail;
+ p += snprintf(p, sizeof(pvt->other_detail),
+ "APEI location: %s ", e->location);
if (mem_err->validation_bits & CPER_MEM_VALID_ERROR_STATUS) {
u64 status = mem_err->error_status;
@@ -436,12 +437,10 @@ void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err)
grain_bits = fls_long(e->grain - 1);
/* Generate the trace event */
- snprintf(pvt->detail_location, sizeof(pvt->detail_location),
- "APEI location: %s %s", e->location, e->other_detail);
trace_mc_event(e->type, e->msg, e->label, e->error_count,
mci->mc_idx, e->top_layer, e->mid_layer, e->low_layer,
(e->page_frame_number << PAGE_SHIFT) | e->offset_in_page,
- grain_bits, e->syndrome, pvt->detail_location);
+ grain_bits, e->syndrome, e->other_detail);
dimm = edac_get_dimm_by_index(mci, e->top_layer);
--
2.20.1
The code in ghes_edac.c and edac_mc.c for grain_bits calculation and
calling trace_mc_event() is now the same. Move it to a single location
in edac_raw_mc_handle_error().
The only difference is the missing IS_ENABLED(CONFIG_RAS) switch, but
this is needed for ghes too.
Signed-off-by: Robert Richter <[email protected]>
---
drivers/edac/edac_mc.c | 30 +++++++++++++++---------------
drivers/edac/ghes_edac.c | 13 -------------
2 files changed, 15 insertions(+), 28 deletions(-)
diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c
index 3779204c0e21..e6c6e40dc760 100644
--- a/drivers/edac/edac_mc.c
+++ b/drivers/edac/edac_mc.c
@@ -1070,6 +1070,21 @@ void edac_raw_mc_handle_error(struct edac_raw_error_desc *e,
{
struct mem_ctl_info *mci = error_desc_to_mci(e);
char detail[80];
+ u8 grain_bits;
+
+ /* Sanity-check driver-supplied grain value. */
+ if (WARN_ON_ONCE(!e->grain))
+ e->grain = 1;
+
+ grain_bits = fls_long(e->grain - 1);
+
+ /* Report the error via the trace interface */
+ if (IS_ENABLED(CONFIG_RAS))
+ trace_mc_event(e->type, e->msg, e->label, e->error_count,
+ mci->mc_idx, e->top_layer, e->mid_layer,
+ e->low_layer,
+ (e->page_frame_number << PAGE_SHIFT) | e->offset_in_page,
+ grain_bits, e->syndrome, e->other_detail);
/* Memory type dependent details about the error */
if (e->type == HW_EVENT_ERR_CORRECTED) {
@@ -1110,7 +1125,6 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type,
int row = -1, chan = -1;
int pos[EDAC_MAX_LAYERS] = { top_layer, mid_layer, low_layer };
int i, n_labels = 0;
- u8 grain_bits;
struct edac_raw_error_desc *e = &mci->error_desc;
bool any_memory = true;
@@ -1242,20 +1256,6 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type,
if (p > e->location)
*(p - 1) = '\0';
- /* Sanity-check driver-supplied grain value. */
- if (WARN_ON_ONCE(!e->grain))
- e->grain = 1;
-
- grain_bits = fls_long(e->grain - 1);
-
- /* Report the error via the trace interface */
- if (IS_ENABLED(CONFIG_RAS))
- trace_mc_event(type, e->msg, e->label, e->error_count,
- mci->mc_idx, e->top_layer, e->mid_layer,
- e->low_layer,
- (e->page_frame_number << PAGE_SHIFT) | e->offset_in_page,
- grain_bits, e->syndrome, e->other_detail);
-
dimm = edac_get_dimm(mci, top_layer, mid_layer, low_layer);
edac_raw_mc_handle_error(e, dimm);
diff --git a/drivers/edac/ghes_edac.c b/drivers/edac/ghes_edac.c
index 8d9d3c4dbebb..17d5b22fe000 100644
--- a/drivers/edac/ghes_edac.c
+++ b/drivers/edac/ghes_edac.c
@@ -198,7 +198,6 @@ void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err)
struct ghes_edac_pvt *pvt = ghes_pvt;
unsigned long flags;
char *p;
- u8 grain_bits;
if (!pvt)
return;
@@ -430,18 +429,6 @@ void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err)
if (p > pvt->other_detail)
*(p - 1) = '\0';
- /* Sanity-check driver-supplied grain value. */
- if (WARN_ON_ONCE(!e->grain))
- e->grain = 1;
-
- grain_bits = fls_long(e->grain - 1);
-
- /* Generate the trace event */
- trace_mc_event(e->type, e->msg, e->label, e->error_count,
- mci->mc_idx, e->top_layer, e->mid_layer, e->low_layer,
- (e->page_frame_number << PAGE_SHIFT) | e->offset_in_page,
- grain_bits, e->syndrome, e->other_detail);
-
dimm = edac_get_dimm_by_index(mci, e->top_layer);
edac_raw_mc_handle_error(e, dimm);
--
2.20.1
The current code to convert a physical address mask to a grain
(defined as granularity in bytes) is:
e->grain = ~(mem_err->physical_addr_mask & ~PAGE_MASK);
This is broken in several ways:
1) It calculates wrong grain values. E.g., a physical address mask
of ~0xfff should give a grain of 0x1000. Even without considering
PAGE_MASK, there is an off-by-one. Things get worse when the mask is
also filtered with ~PAGE_MASK: this yields a grain with the upper
bits set. In the example it even evaluates to ~0.
2) The grain does not depend on and is unrelated to the kernel's
page-size. The page-size only matters when unmapping memory in
memory_failure(). Smaller grains are wrongly rounded up to the
page-size. On architectures with a configurable page-size (e.g.
arm64) this could even round up to the bigger page-size of the
hypervisor.
Fix this with:
e->grain = ~mem_err->physical_addr_mask + 1;
The grain_bits are defined as:
grain = 1 << grain_bits;
Change also the grain_bits calculation accordingly, it is the same
formula as in edac_mc.c now and the code can be unified.
The value in ->physical_addr_mask coming from firmware is assumed to
be contiguous, but this is not sanity-checked. However, in case the
mask is non-contiguous, a conversion to grain_bits effectively
converts the grain bit mask to a power of 2 by rounding up.
Suggested-by: James Morse <[email protected]>
Signed-off-by: Robert Richter <[email protected]>
---
drivers/edac/ghes_edac.c | 10 ++++++++--
1 file changed, 8 insertions(+), 2 deletions(-)
diff --git a/drivers/edac/ghes_edac.c b/drivers/edac/ghes_edac.c
index 851aad92e42d..97242cf18a88 100644
--- a/drivers/edac/ghes_edac.c
+++ b/drivers/edac/ghes_edac.c
@@ -220,6 +220,7 @@ void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err)
/* Cleans the error report buffer */
memset(e, 0, sizeof (*e));
e->error_count = 1;
+ e->grain = 1;
strcpy(e->label, "unknown label");
e->msg = pvt->msg;
e->other_detail = pvt->other_detail;
@@ -315,7 +316,7 @@ void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err)
/* Error grain */
if (mem_err->validation_bits & CPER_MEM_VALID_PA_MASK)
- e->grain = ~(mem_err->physical_addr_mask & ~PAGE_MASK);
+ e->grain = ~mem_err->physical_addr_mask + 1;
/* Memory error location, mapped on e->location */
p = e->location;
@@ -428,8 +429,13 @@ void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err)
if (p > pvt->other_detail)
*(p - 1) = '\0';
+ /* Sanity-check driver-supplied grain value. */
+ if (WARN_ON_ONCE(!e->grain))
+ e->grain = 1;
+
+ grain_bits = fls_long(e->grain - 1);
+
/* Generate the trace event */
- grain_bits = fls_long(e->grain);
snprintf(pvt->detail_location, sizeof(pvt->detail_location),
"APEI location: %s %s", e->location, e->other_detail);
trace_mc_event(e->type, e->msg, e->label, e->error_count,
--
2.20.1
Update on CPER DIMM naming convention and DIMM ranks.
Signed-off-by: Robert Richter <[email protected]>
---
Documentation/admin-guide/ras.rst | 31 +++++++++++++++++++------------
1 file changed, 19 insertions(+), 12 deletions(-)
diff --git a/Documentation/admin-guide/ras.rst b/Documentation/admin-guide/ras.rst
index 2b20f5f7380d..26e02a59f0f4 100644
--- a/Documentation/admin-guide/ras.rst
+++ b/Documentation/admin-guide/ras.rst
@@ -330,9 +330,12 @@ There can be multiple csrows and multiple channels.
.. [#f4] Nowadays, the term DIMM (Dual In-line Memory Module) is widely
used to refer to a memory module, although there are other memory
- packaging alternatives, like SO-DIMM, SIMM, etc. Along this document,
- and inside the EDAC system, the term "dimm" is used for all memory
- modules, even when they use a different kind of packaging.
+ packaging alternatives, like SO-DIMM, SIMM, etc. The UEFI
+ specification (Version 2.7) defines a memory module in the Common
+ Platform Error Record (CPER) section to be an SMBIOS Memory Device
+ (Type 17). Along this document, and inside the EDAC system, the term
+ "dimm" is used for all memory modules, even when they use a
+ different kind of packaging.
Memory controllers allow for several csrows, with 8 csrows being a
typical value. Yet, the actual number of csrows depends on the layout of
@@ -349,12 +352,14 @@ controllers. The following example will assume 2 channels:
| | ``ch0`` | ``ch1`` |
+============+===========+===========+
| ``csrow0`` | DIMM_A0 | DIMM_B0 |
- +------------+ | |
- | ``csrow1`` | | |
+ | | rank0 | rank0 |
+ +------------+ - | - |
+ | ``csrow1`` | rank1 | rank1 |
+------------+-----------+-----------+
| ``csrow2`` | DIMM_A1 | DIMM_B1 |
- +------------+ | |
- | ``csrow3`` | | |
+ | | rank0 | rank0 |
+ +------------+ - | - |
+ | ``csrow3`` | rank1 | rank1 |
+------------+-----------+-----------+
In the above example, there are 4 physical slots on the motherboard
@@ -374,11 +379,13 @@ which the memory DIMM is placed. Thus, when 1 DIMM is placed in each
Channel, the csrows cross both DIMMs.
Memory DIMMs come single or dual "ranked". A rank is a populated csrow.
-Thus, 2 single ranked DIMMs, placed in slots DIMM_A0 and DIMM_B0 above
-will have just one csrow (csrow0). csrow1 will be empty. On the other
-hand, when 2 dual ranked DIMMs are similarly placed, then both csrow0
-and csrow1 will be populated. The pattern repeats itself for csrow2 and
-csrow3.
+In the example above 2 dual ranked DIMMs are similarly placed. Thus,
+both csrow0 and csrow1 are populated. On the other hand, when 2 single
+ranked DIMMs are placed in slots DIMM_A0 and DIMM_B0, then they will
+have just one csrow (csrow0) and csrow1 will be empty. The pattern
+repeats itself for csrow2 and csrow3. Also note that some memory
+controllers don't have any logic to identify the memory module, see
+``rankX`` directories below.
The representation of the above is reflected in the directory
tree in EDAC's sysfs interface. Starting in directory
--
2.20.1
Have a separate function to count errors in csrow/channel. This better
separates code and reduces the indentation level. No functional
changes.
Signed-off-by: Robert Richter <[email protected]>
---
drivers/edac/edac_mc.c | 40 +++++++++++++++++++++++++---------------
1 file changed, 25 insertions(+), 15 deletions(-)
diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c
index 9e8c5716a8c0..3779204c0e21 100644
--- a/drivers/edac/edac_mc.c
+++ b/drivers/edac/edac_mc.c
@@ -1045,6 +1045,26 @@ static struct mem_ctl_info *error_desc_to_mci(struct edac_raw_error_desc *e)
return container_of(e, struct mem_ctl_info, error_desc);
}
+static void edac_inc_csrow(struct edac_raw_error_desc *e, int row, int chan)
+{
+ struct mem_ctl_info *mci = error_desc_to_mci(e);
+ u16 count = e->error_count;
+ enum hw_event_mc_err_type type = e->type;
+
+ if (row < 0)
+ return;
+
+ edac_dbg(4, "csrow/channel to increment: (%d,%d)\n", row, chan);
+
+ if (type == HW_EVENT_ERR_CORRECTED) {
+ mci->csrows[row]->ce_count += count;
+ if (chan >= 0)
+ mci->csrows[row]->channels[chan]->ce_count += count;
+ } else {
+ mci->csrows[row]->ue_count += count;
+ }
+}
+
void edac_raw_mc_handle_error(struct edac_raw_error_desc *e,
struct dimm_info *dimm)
{
@@ -1201,22 +1221,12 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type,
chan = -2;
}
- if (any_memory) {
+ if (any_memory)
strcpy(e->label, "any memory");
- } else {
- edac_dbg(4, "csrow/channel to increment: (%d,%d)\n", row, chan);
- if (p == e->label)
- strcpy(e->label, "unknown memory");
- if (type == HW_EVENT_ERR_CORRECTED) {
- if (row >= 0) {
- mci->csrows[row]->ce_count += error_count;
- if (chan >= 0)
- mci->csrows[row]->channels[chan]->ce_count += error_count;
- }
- } else
- if (row >= 0)
- mci->csrows[row]->ue_count += error_count;
- }
+ else if (!*e->label)
+ strcpy(e->label, "unknown memory");
+
+ edac_inc_csrow(e, row, chan);
/* Fill the RAM location data */
p = e->location;
--
2.20.1
Store the error type in struct edac_raw_error_desc. This makes the
type parameter of edac_raw_mc_handle_error() obsolete.
Signed-off-by: Robert Richter <[email protected]>
---
drivers/edac/edac_mc.c | 8 ++++----
drivers/edac/edac_mc.h | 4 +---
drivers/edac/ghes_edac.c | 13 ++++++-------
include/linux/edac.h | 1 +
4 files changed, 12 insertions(+), 14 deletions(-)
diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c
index cdfb383f7a35..ca206854b8ee 100644
--- a/drivers/edac/edac_mc.c
+++ b/drivers/edac/edac_mc.c
@@ -1040,15 +1040,14 @@ static void edac_ue_error(struct mem_ctl_info *mci,
edac_inc_ue_error(mci, dimm, error_count);
}
-void edac_raw_mc_handle_error(const enum hw_event_mc_err_type type,
- struct mem_ctl_info *mci,
+void edac_raw_mc_handle_error(struct mem_ctl_info *mci,
struct dimm_info *dimm,
struct edac_raw_error_desc *e)
{
char detail[80];
/* Memory type dependent details about the error */
- if (type == HW_EVENT_ERR_CORRECTED) {
+ if (e->type == HW_EVENT_ERR_CORRECTED) {
snprintf(detail, sizeof(detail),
"page:0x%lx offset:0x%lx grain:%ld syndrome:0x%lx",
e->page_frame_number, e->offset_in_page,
@@ -1095,6 +1094,7 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type,
/* Fills the error report buffer */
memset(e, 0, sizeof (*e));
e->error_count = error_count;
+ e->type = type;
e->top_layer = top_layer;
e->mid_layer = mid_layer;
e->low_layer = low_layer;
@@ -1243,6 +1243,6 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type,
dimm = edac_get_dimm(mci, top_layer, mid_layer, low_layer);
- edac_raw_mc_handle_error(type, mci, dimm, e);
+ edac_raw_mc_handle_error(mci, dimm, e);
}
EXPORT_SYMBOL_GPL(edac_mc_handle_error);
diff --git a/drivers/edac/edac_mc.h b/drivers/edac/edac_mc.h
index 2c3e2fbcedc4..a8f1b5b5e873 100644
--- a/drivers/edac/edac_mc.h
+++ b/drivers/edac/edac_mc.h
@@ -212,7 +212,6 @@ extern int edac_mc_find_csrow_by_page(struct mem_ctl_info *mci,
* edac_raw_mc_handle_error() - Reports a memory event to userspace without
* doing anything to discover the error location.
*
- * @type: severity of the error (CE/UE/Fatal)
* @mci: a struct mem_ctl_info pointer
* @dimm: a struct dimm_info pointer
* @e: error description
@@ -221,8 +220,7 @@ extern int edac_mc_find_csrow_by_page(struct mem_ctl_info *mci,
* only be called directly when the hardware error come directly from BIOS,
* like in the case of APEI GHES driver.
*/
-void edac_raw_mc_handle_error(const enum hw_event_mc_err_type type,
- struct mem_ctl_info *mci,
+void edac_raw_mc_handle_error(struct mem_ctl_info *mci,
struct dimm_info *dimm,
struct edac_raw_error_desc *e);
diff --git a/drivers/edac/ghes_edac.c b/drivers/edac/ghes_edac.c
index 4f5721cf4380..1db1c012bed9 100644
--- a/drivers/edac/ghes_edac.c
+++ b/drivers/edac/ghes_edac.c
@@ -194,7 +194,6 @@ static void ghes_edac_dmidecode(const struct dmi_header *dh, void *arg)
void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err)
{
struct dimm_info *dimm;
- enum hw_event_mc_err_type type;
struct edac_raw_error_desc *e;
struct mem_ctl_info *mci;
struct ghes_edac_pvt *pvt = ghes_pvt;
@@ -232,17 +231,17 @@ void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err)
switch (sev) {
case GHES_SEV_CORRECTED:
- type = HW_EVENT_ERR_CORRECTED;
+ e->type = HW_EVENT_ERR_CORRECTED;
break;
case GHES_SEV_RECOVERABLE:
- type = HW_EVENT_ERR_UNCORRECTED;
+ e->type = HW_EVENT_ERR_UNCORRECTED;
break;
case GHES_SEV_PANIC:
- type = HW_EVENT_ERR_FATAL;
+ e->type = HW_EVENT_ERR_FATAL;
break;
default:
case GHES_SEV_NO:
- type = HW_EVENT_ERR_INFO;
+ e->type = HW_EVENT_ERR_INFO;
}
edac_dbg(1, "error validation_bits: 0x%08llx\n",
@@ -433,14 +432,14 @@ void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err)
grain_bits = fls_long(e->grain);
snprintf(pvt->detail_location, sizeof(pvt->detail_location),
"APEI location: %s %s", e->location, e->other_detail);
- trace_mc_event(type, e->msg, e->label, e->error_count,
+ trace_mc_event(e->type, e->msg, e->label, e->error_count,
mci->mc_idx, e->top_layer, e->mid_layer, e->low_layer,
(e->page_frame_number << PAGE_SHIFT) | e->offset_in_page,
grain_bits, e->syndrome, pvt->detail_location);
dimm = edac_get_dimm_by_index(mci, e->top_layer);
- edac_raw_mc_handle_error(type, mci, dimm, e);
+ edac_raw_mc_handle_error(mci, dimm, e);
spin_unlock_irqrestore(&ghes_lock, flags);
}
diff --git a/include/linux/edac.h b/include/linux/edac.h
index 4d9673954856..587c53b87fdf 100644
--- a/include/linux/edac.h
+++ b/include/linux/edac.h
@@ -463,6 +463,7 @@ struct edac_raw_error_desc {
long grain;
u16 error_count;
+ enum hw_event_mc_err_type type;
int top_layer;
int mid_layer;
int low_layer;
--
2.20.1
Each struct mci has its own error descriptor. Create a function
error_desc_to_mci() to determine the corresponding mci from an error
descriptor. This simplifies the parameter list of
edac_raw_mc_handle_error() as the mci pointer does not need to be
passed any longer.
While at it, reorder parameters of edac_raw_mc_handle_error().
Signed-off-by: Robert Richter <[email protected]>
---
drivers/edac/edac_mc.c | 13 +++++++++----
drivers/edac/edac_mc.h | 8 +++-----
drivers/edac/ghes_edac.c | 2 +-
3 files changed, 13 insertions(+), 10 deletions(-)
diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c
index ca206854b8ee..9e8c5716a8c0 100644
--- a/drivers/edac/edac_mc.c
+++ b/drivers/edac/edac_mc.c
@@ -1040,10 +1040,15 @@ static void edac_ue_error(struct mem_ctl_info *mci,
edac_inc_ue_error(mci, dimm, error_count);
}
-void edac_raw_mc_handle_error(struct mem_ctl_info *mci,
- struct dimm_info *dimm,
- struct edac_raw_error_desc *e)
+static struct mem_ctl_info *error_desc_to_mci(struct edac_raw_error_desc *e)
+{
+ return container_of(e, struct mem_ctl_info, error_desc);
+}
+
+void edac_raw_mc_handle_error(struct edac_raw_error_desc *e,
+ struct dimm_info *dimm)
{
+ struct mem_ctl_info *mci = error_desc_to_mci(e);
char detail[80];
/* Memory type dependent details about the error */
@@ -1243,6 +1248,6 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type,
dimm = edac_get_dimm(mci, top_layer, mid_layer, low_layer);
- edac_raw_mc_handle_error(mci, dimm, e);
+ edac_raw_mc_handle_error(e, dimm);
}
EXPORT_SYMBOL_GPL(edac_mc_handle_error);
diff --git a/drivers/edac/edac_mc.h b/drivers/edac/edac_mc.h
index a8f1b5b5e873..3b01d5d9d7bc 100644
--- a/drivers/edac/edac_mc.h
+++ b/drivers/edac/edac_mc.h
@@ -212,17 +212,15 @@ extern int edac_mc_find_csrow_by_page(struct mem_ctl_info *mci,
* edac_raw_mc_handle_error() - Reports a memory event to userspace without
* doing anything to discover the error location.
*
- * @mci: a struct mem_ctl_info pointer
- * @dimm: a struct dimm_info pointer
* @e: error description
+ * @dimm: a struct dimm_info pointer
*
* This raw function is used internally by edac_mc_handle_error(). It should
* only be called directly when the hardware error come directly from BIOS,
* like in the case of APEI GHES driver.
*/
-void edac_raw_mc_handle_error(struct mem_ctl_info *mci,
- struct dimm_info *dimm,
- struct edac_raw_error_desc *e);
+void edac_raw_mc_handle_error(struct edac_raw_error_desc *e,
+ struct dimm_info *dimm);
/**
* edac_mc_handle_error() - Reports a memory event to userspace.
diff --git a/drivers/edac/ghes_edac.c b/drivers/edac/ghes_edac.c
index 1db1c012bed9..8078d4ec9631 100644
--- a/drivers/edac/ghes_edac.c
+++ b/drivers/edac/ghes_edac.c
@@ -439,7 +439,7 @@ void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err)
dimm = edac_get_dimm_by_index(mci, e->top_layer);
- edac_raw_mc_handle_error(mci, dimm, e);
+ edac_raw_mc_handle_error(e, dimm);
spin_unlock_irqrestore(&ghes_lock, flags);
}
--
2.20.1
There has never been a function edac_raw_error_desc_clean(), and in
ghes_edac_report_mem_error() the whole struct is zeroed, including
the string arrays. Remove that comment.
Signed-off-by: Robert Richter <[email protected]>
---
include/linux/edac.h | 5 -----
1 file changed, 5 deletions(-)
diff --git a/include/linux/edac.h b/include/linux/edac.h
index 8e72222e50b0..4d9673954856 100644
--- a/include/linux/edac.h
+++ b/include/linux/edac.h
@@ -458,15 +458,10 @@ struct errcount_attribute_data {
* @other_detail: other driver-specific detail about the error
*/
struct edac_raw_error_desc {
- /*
- * NOTE: everything before grain won't be cleaned by
- * edac_raw_error_desc_clean()
- */
char location[LOCATION_SIZE];
char label[(EDAC_MC_LABEL_LEN + 1 + sizeof(OTHER_LABEL)) * EDAC_MAX_LABELS];
long grain;
- /* the vars below and grain will be cleaned on every new error report */
u16 error_count;
int top_layer;
int mid_layer;
--
2.20.1
edac_mc_alloc() is huge. Factor out code into the two new functions
edac_mc_alloc_csrows() and edac_mc_alloc_dimms(). To keep the diff
easy to review, the code stays in place for now (using forward
declarations); the functions are reordered in a follow-up patch.
Signed-off-by: Robert Richter <[email protected]>
---
drivers/edac/edac_mc.c | 104 +++++++++++++++++++++++++++--------------
1 file changed, 69 insertions(+), 35 deletions(-)
diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c
index a893f793be8a..0db504cb3419 100644
--- a/drivers/edac/edac_mc.c
+++ b/drivers/edac/edac_mc.c
@@ -305,6 +305,9 @@ static void _edac_mc_free(struct mem_ctl_info *mci)
kfree(mci);
}
+static int edac_mc_alloc_csrows(struct mem_ctl_info *mci);
+static int edac_mc_alloc_dimms(struct mem_ctl_info *mci);
+
struct mem_ctl_info *edac_mc_alloc(unsigned int mc_num,
unsigned int n_layers,
struct edac_mc_layer *layers,
@@ -312,14 +315,9 @@ struct mem_ctl_info *edac_mc_alloc(unsigned int mc_num,
{
struct mem_ctl_info *mci;
struct edac_mc_layer *layer;
- struct csrow_info *csr;
- struct rank_info *chan;
- struct dimm_info *dimm;
- unsigned int pos[EDAC_MAX_LAYERS];
unsigned int idx, size, tot_dimms = 1;
unsigned int tot_csrows = 1, tot_channels = 1;
- void *pvt, *p, *ptr = NULL;
- int j, row, chn, n, len;
+ void *pvt, *ptr = NULL;
bool per_rank = false;
if (WARN_ON(n_layers > EDAC_MAX_LAYERS || n_layers == 0))
@@ -377,16 +375,43 @@ struct mem_ctl_info *edac_mc_alloc(unsigned int mc_num,
mci->num_cschannel = tot_channels;
mci->csbased = per_rank;
+ if (edac_mc_alloc_csrows(mci))
+ goto error;
+
+ if (edac_mc_alloc_dimms(mci))
+ goto error;
+
+ mci->op_state = OP_ALLOC;
+
+ return mci;
+
+error:
+ _edac_mc_free(mci);
+
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(edac_mc_alloc);
+
+static int edac_mc_alloc_csrows(struct mem_ctl_info *mci)
+{
+ unsigned int tot_csrows = mci->nr_csrows;
+ unsigned int tot_channels = mci->num_cschannel;
+ unsigned int row, chn;
+
/*
* Alocate and fill the csrow/channels structs
*/
mci->csrows = kcalloc(tot_csrows, sizeof(*mci->csrows), GFP_KERNEL);
if (!mci->csrows)
- goto error;
+ return -ENOMEM;
+
for (row = 0; row < tot_csrows; row++) {
+ struct csrow_info *csr;
+
csr = kzalloc(sizeof(**mci->csrows), GFP_KERNEL);
if (!csr)
- goto error;
+ return -ENOMEM;
+
mci->csrows[row] = csr;
csr->csrow_idx = row;
csr->mci = mci;
@@ -394,34 +419,51 @@ struct mem_ctl_info *edac_mc_alloc(unsigned int mc_num,
csr->channels = kcalloc(tot_channels, sizeof(*csr->channels),
GFP_KERNEL);
if (!csr->channels)
- goto error;
+ return -ENOMEM;
for (chn = 0; chn < tot_channels; chn++) {
+ struct rank_info *chan;
+
chan = kzalloc(sizeof(**csr->channels), GFP_KERNEL);
if (!chan)
- goto error;
+ return -ENOMEM;
+
csr->channels[chn] = chan;
chan->chan_idx = chn;
chan->csrow = csr;
}
}
+ return 0;
+}
+
+static int edac_mc_alloc_dimms(struct mem_ctl_info *mci)
+{
+ void *p;
+ unsigned int pos[EDAC_MAX_LAYERS];
+ unsigned int row, chn, idx;
+ int layer;
+
/*
* Allocate and fill the dimm structs
*/
- mci->dimms = kcalloc(tot_dimms, sizeof(*mci->dimms), GFP_KERNEL);
+ mci->dimms = kcalloc(mci->tot_dimms, sizeof(*mci->dimms), GFP_KERNEL);
if (!mci->dimms)
- goto error;
+ return -ENOMEM;
memset(&pos, 0, sizeof(pos));
row = 0;
chn = 0;
- for (idx = 0; idx < tot_dimms; idx++) {
+ for (idx = 0; idx < mci->tot_dimms; idx++) {
+ struct dimm_info *dimm;
+ struct rank_info *chan;
+ int n, len;
+
chan = mci->csrows[row]->channels[chn];
dimm = kzalloc(sizeof(**mci->dimms), GFP_KERNEL);
if (!dimm)
- goto error;
+ return -ENOMEM;
mci->dimms[idx] = dimm;
dimm->mci = mci;
dimm->idx = idx;
@@ -431,16 +473,16 @@ struct mem_ctl_info *edac_mc_alloc(unsigned int mc_num,
*/
len = sizeof(dimm->label);
p = dimm->label;
- n = snprintf(p, len, "mc#%u", mc_num);
+ n = snprintf(p, len, "mc#%u", mci->mc_idx);
p += n;
len -= n;
- for (j = 0; j < n_layers; j++) {
+ for (layer = 0; layer < mci->n_layers; layer++) {
n = snprintf(p, len, "%s#%u",
- edac_layer_name[layers[j].type],
- pos[j]);
+ edac_layer_name[mci->layers[layer].type],
+ pos[layer]);
p += n;
len -= n;
- dimm->location[j] = pos[j];
+ dimm->location[layer] = pos[layer];
if (len <= 0)
break;
@@ -452,39 +494,31 @@ struct mem_ctl_info *edac_mc_alloc(unsigned int mc_num,
dimm->cschannel = chn;
/* Increment csrow location */
- if (layers[0].is_virt_csrow) {
+ if (mci->layers[0].is_virt_csrow) {
chn++;
- if (chn == tot_channels) {
+ if (chn == mci->num_cschannel) {
chn = 0;
row++;
}
} else {
row++;
- if (row == tot_csrows) {
+ if (row == mci->nr_csrows) {
row = 0;
chn++;
}
}
/* Increment dimm location */
- for (j = n_layers - 1; j >= 0; j--) {
- pos[j]++;
- if (pos[j] < layers[j].size)
+ for (layer = mci->n_layers - 1; layer >= 0; layer--) {
+ pos[layer]++;
+ if (pos[layer] < mci->layers[layer].size)
break;
- pos[j] = 0;
+ pos[layer] = 0;
}
}
- mci->op_state = OP_ALLOC;
-
- return mci;
-
-error:
- _edac_mc_free(mci);
-
- return NULL;
+ return 0;
}
-EXPORT_SYMBOL_GPL(edac_mc_alloc);
void edac_mc_free(struct mem_ctl_info *mci)
{
--
2.20.1
Rename the iterator variable to idx. The name is handier, especially
when searching for it in the code.
Signed-off-by: Robert Richter <[email protected]>
---
drivers/edac/edac_mc.c | 14 +++++++-------
1 file changed, 7 insertions(+), 7 deletions(-)
diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c
index c1e142643006..a893f793be8a 100644
--- a/drivers/edac/edac_mc.c
+++ b/drivers/edac/edac_mc.c
@@ -319,7 +319,7 @@ struct mem_ctl_info *edac_mc_alloc(unsigned int mc_num,
unsigned int idx, size, tot_dimms = 1;
unsigned int tot_csrows = 1, tot_channels = 1;
void *pvt, *p, *ptr = NULL;
- int i, j, row, chn, n, len;
+ int j, row, chn, n, len;
bool per_rank = false;
if (WARN_ON(n_layers > EDAC_MAX_LAYERS || n_layers == 0))
@@ -329,14 +329,14 @@ struct mem_ctl_info *edac_mc_alloc(unsigned int mc_num,
* Calculate the total amount of dimms and csrows/cschannels while
* in the old API emulation mode
*/
- for (i = 0; i < n_layers; i++) {
- tot_dimms *= layers[i].size;
- if (layers[i].is_virt_csrow)
- tot_csrows *= layers[i].size;
+ for (idx = 0; idx < n_layers; idx++) {
+ tot_dimms *= layers[idx].size;
+ if (layers[idx].is_virt_csrow)
+ tot_csrows *= layers[idx].size;
else
- tot_channels *= layers[i].size;
+ tot_channels *= layers[idx].size;
- if (layers[i].type == EDAC_MC_LAYER_CHIP_SELECT)
+ if (layers[idx].type == EDAC_MC_LAYER_CHIP_SELECT)
per_rank = true;
}
--
2.20.1
Looking at how mci->{ue,ce}_per_layer[EDAC_MAX_LAYERS] is used, it
turns out that only the leaves in the memory hierarchy are consumed
(in sysfs), but not the intermediate layers, e.g.:
count = dimm->mci->ce_per_layer[dimm->mci->n_layers-1][dimm->idx];
These unused counters only add complexity, remove them. The error
counter values are directly stored in struct dimm_info now.
Signed-off-by: Robert Richter <[email protected]>
---
drivers/edac/edac_mc.c | 104 ++++++++++++-----------------------
drivers/edac/edac_mc_sysfs.c | 20 +++----
drivers/edac/ghes_edac.c | 5 +-
include/linux/edac.h | 7 +--
4 files changed, 46 insertions(+), 90 deletions(-)
diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c
index 45b02bb31964..c1e142643006 100644
--- a/drivers/edac/edac_mc.c
+++ b/drivers/edac/edac_mc.c
@@ -315,10 +315,9 @@ struct mem_ctl_info *edac_mc_alloc(unsigned int mc_num,
struct csrow_info *csr;
struct rank_info *chan;
struct dimm_info *dimm;
- u32 *ce_per_layer[EDAC_MAX_LAYERS], *ue_per_layer[EDAC_MAX_LAYERS];
unsigned int pos[EDAC_MAX_LAYERS];
- unsigned int idx, size, tot_dimms = 1, count = 1;
- unsigned int tot_csrows = 1, tot_channels = 1, tot_errcount = 0;
+ unsigned int idx, size, tot_dimms = 1;
+ unsigned int tot_csrows = 1, tot_channels = 1;
void *pvt, *p, *ptr = NULL;
int i, j, row, chn, n, len;
bool per_rank = false;
@@ -346,19 +345,10 @@ struct mem_ctl_info *edac_mc_alloc(unsigned int mc_num,
* stringent as what the compiler would provide if we could simply
* hardcode everything into a single struct.
*/
- mci = edac_align_ptr(&ptr, sizeof(*mci), 1);
- layer = edac_align_ptr(&ptr, sizeof(*layer), n_layers);
- for (i = 0; i < n_layers; i++) {
- count *= layers[i].size;
- edac_dbg(4, "errcount layer %d size %d\n", i, count);
- ce_per_layer[i] = edac_align_ptr(&ptr, sizeof(u32), count);
- ue_per_layer[i] = edac_align_ptr(&ptr, sizeof(u32), count);
- tot_errcount += 2 * count;
- }
-
- edac_dbg(4, "allocating %d error counters\n", tot_errcount);
- pvt = edac_align_ptr(&ptr, sz_pvt, 1);
- size = ((unsigned long)pvt) + sz_pvt;
+ mci = edac_align_ptr(&ptr, sizeof(*mci), 1);
+ layer = edac_align_ptr(&ptr, sizeof(*layer), n_layers);
+ pvt = edac_align_ptr(&ptr, sz_pvt, 1);
+ size = ((unsigned long)pvt) + sz_pvt;
edac_dbg(1, "allocating %u bytes for mci data (%d %s, %d csrows/channels)\n",
size,
@@ -374,10 +364,6 @@ struct mem_ctl_info *edac_mc_alloc(unsigned int mc_num,
* rather than an imaginary chunk of memory located at address 0.
*/
layer = (struct edac_mc_layer *)(((char *)mci) + ((unsigned long)layer));
- for (i = 0; i < n_layers; i++) {
- mci->ce_per_layer[i] = (u32 *)((char *)mci + ((unsigned long)ce_per_layer[i]));
- mci->ue_per_layer[i] = (u32 *)((char *)mci + ((unsigned long)ue_per_layer[i]));
- }
pvt = sz_pvt ? (((char *)mci) + ((unsigned long)pvt)) : NULL;
/* setup index and various internal pointers */
@@ -908,53 +894,31 @@ const char *edac_layer_name[] = {
EXPORT_SYMBOL_GPL(edac_layer_name);
static void edac_inc_ce_error(struct mem_ctl_info *mci,
- bool enable_per_layer_report,
const int pos[EDAC_MAX_LAYERS],
const u16 count)
{
- int i, index = 0;
+ struct dimm_info *dimm = edac_get_dimm(mci, pos[0], pos[1], pos[2]);
mci->ce_mc += count;
- if (!enable_per_layer_report) {
+ if (dimm)
+ dimm->ce_count += count;
+ else
mci->ce_noinfo_count += count;
- return;
- }
-
- for (i = 0; i < mci->n_layers; i++) {
- if (pos[i] < 0)
- break;
- index += pos[i];
- mci->ce_per_layer[i][index] += count;
-
- if (i < mci->n_layers - 1)
- index *= mci->layers[i + 1].size;
- }
}
static void edac_inc_ue_error(struct mem_ctl_info *mci,
- bool enable_per_layer_report,
const int pos[EDAC_MAX_LAYERS],
const u16 count)
{
- int i, index = 0;
+ struct dimm_info *dimm = edac_get_dimm(mci, pos[0], pos[1], pos[2]);
mci->ue_mc += count;
- if (!enable_per_layer_report) {
+ if (dimm)
+ dimm->ue_count += count;
+ else
mci->ue_noinfo_count += count;
- return;
- }
-
- for (i = 0; i < mci->n_layers; i++) {
- if (pos[i] < 0)
- break;
- index += pos[i];
- mci->ue_per_layer[i][index] += count;
-
- if (i < mci->n_layers - 1)
- index *= mci->layers[i + 1].size;
- }
}
static void edac_ce_error(struct mem_ctl_info *mci,
@@ -965,7 +929,6 @@ static void edac_ce_error(struct mem_ctl_info *mci,
const char *label,
const char *detail,
const char *other_detail,
- const bool enable_per_layer_report,
const unsigned long page_frame_number,
const unsigned long offset_in_page,
long grain)
@@ -988,7 +951,7 @@ static void edac_ce_error(struct mem_ctl_info *mci,
error_count, msg, msg_aux, label,
location, detail);
}
- edac_inc_ce_error(mci, enable_per_layer_report, pos, error_count);
+ edac_inc_ce_error(mci, pos, error_count);
if (mci->scrub_mode == SCRUB_SW_SRC) {
/*
@@ -1018,8 +981,7 @@ static void edac_ue_error(struct mem_ctl_info *mci,
const char *location,
const char *label,
const char *detail,
- const char *other_detail,
- const bool enable_per_layer_report)
+ const char *other_detail)
{
char *msg_aux = "";
@@ -1048,7 +1010,7 @@ static void edac_ue_error(struct mem_ctl_info *mci,
msg, msg_aux, label, location, detail);
}
- edac_inc_ue_error(mci, enable_per_layer_report, pos, error_count);
+ edac_inc_ue_error(mci, pos, error_count);
}
void edac_raw_mc_handle_error(const enum hw_event_mc_err_type type,
@@ -1064,16 +1026,16 @@ void edac_raw_mc_handle_error(const enum hw_event_mc_err_type type,
"page:0x%lx offset:0x%lx grain:%ld syndrome:0x%lx",
e->page_frame_number, e->offset_in_page,
e->grain, e->syndrome);
- edac_ce_error(mci, e->error_count, pos, e->msg, e->location, e->label,
- detail, e->other_detail, e->enable_per_layer_report,
+ edac_ce_error(mci, e->error_count, pos, e->msg, e->location,
+ e->label, detail, e->other_detail,
e->page_frame_number, e->offset_in_page, e->grain);
} else {
snprintf(detail, sizeof(detail),
"page:0x%lx offset:0x%lx grain:%ld",
e->page_frame_number, e->offset_in_page, e->grain);
- edac_ue_error(mci, e->error_count, pos, e->msg, e->location, e->label,
- detail, e->other_detail, e->enable_per_layer_report);
+ edac_ue_error(mci, e->error_count, pos, e->msg, e->location,
+ e->label, detail, e->other_detail);
}
@@ -1099,6 +1061,7 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type,
int i, n_labels = 0;
u8 grain_bits;
struct edac_raw_error_desc *e = &mci->error_desc;
+ bool any_memory = true;
edac_dbg(3, "MC%d\n", mci->mc_idx);
@@ -1116,9 +1079,9 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type,
/*
* Check if the event report is consistent and if the memory
- * location is known. If it is known, enable_per_layer_report will be
- * true, the DIMM(s) label info will be filled and the per-layer
- * error counters will be incremented.
+ * location is known. If it is known, the DIMM(s) label info
+ * will be filled and the DIMM's error counters will be
+ * incremented.
*/
for (i = 0; i < mci->n_layers; i++) {
if (pos[i] >= (int)mci->layers[i].size) {
@@ -1136,7 +1099,7 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type,
pos[i] = -1;
}
if (pos[i] >= 0)
- e->enable_per_layer_report = true;
+ any_memory = false;
}
/*
@@ -1166,16 +1129,17 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type,
e->grain = dimm->grain;
/*
- * If the error is memory-controller wide, there's no need to
- * seek for the affected DIMMs because the whole
- * channel/memory controller/... may be affected.
- * Also, don't show errors for empty DIMM slots.
+ * If the error is memory-controller wide, there's no
+ * need to seek for the affected DIMMs because the
+ * whole channel/memory controller/... may be
+ * affected. Also, don't show errors for empty DIMM
+ * slots.
*/
- if (!e->enable_per_layer_report || !dimm->nr_pages)
+ if (any_memory || !dimm->nr_pages)
continue;
if (n_labels >= EDAC_MAX_LABELS) {
- e->enable_per_layer_report = false;
+ any_memory = true;
break;
}
n_labels++;
@@ -1205,7 +1169,7 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type,
chan = -2;
}
- if (!e->enable_per_layer_report) {
+ if (any_memory) {
strcpy(e->label, "any memory");
} else {
edac_dbg(4, "csrow/channel to increment: (%d,%d)\n", row, chan);
diff --git a/drivers/edac/edac_mc_sysfs.c b/drivers/edac/edac_mc_sysfs.c
index 0367554e7437..8682df2f7f4f 100644
--- a/drivers/edac/edac_mc_sysfs.c
+++ b/drivers/edac/edac_mc_sysfs.c
@@ -556,10 +556,8 @@ static ssize_t dimmdev_ce_count_show(struct device *dev,
char *data)
{
struct dimm_info *dimm = to_dimm(dev);
- u32 count;
- count = dimm->mci->ce_per_layer[dimm->mci->n_layers-1][dimm->idx];
- return sprintf(data, "%u\n", count);
+ return sprintf(data, "%u\n", dimm->ce_count);
}
static ssize_t dimmdev_ue_count_show(struct device *dev,
@@ -567,10 +565,8 @@ static ssize_t dimmdev_ue_count_show(struct device *dev,
char *data)
{
struct dimm_info *dimm = to_dimm(dev);
- u32 count;
- count = dimm->mci->ue_per_layer[dimm->mci->n_layers-1][dimm->idx];
- return sprintf(data, "%u\n", count);
+ return sprintf(data, "%u\n", dimm->ue_count);
}
/* dimm/rank attribute files */
@@ -666,7 +662,9 @@ static ssize_t mci_reset_counters_store(struct device *dev,
const char *data, size_t count)
{
struct mem_ctl_info *mci = to_mci(dev);
- int cnt, row, chan, i;
+ struct dimm_info *dimm;
+ int row, chan;
+
mci->ue_mc = 0;
mci->ce_mc = 0;
mci->ue_noinfo_count = 0;
@@ -682,11 +680,9 @@ static ssize_t mci_reset_counters_store(struct device *dev,
ri->channels[chan]->ce_count = 0;
}
- cnt = 1;
- for (i = 0; i < mci->n_layers; i++) {
- cnt *= mci->layers[i].size;
- memset(mci->ce_per_layer[i], 0, cnt * sizeof(u32));
- memset(mci->ue_per_layer[i], 0, cnt * sizeof(u32));
+ mci_for_each_dimm(mci, dimm) {
+ dimm->ue_count = 0;
+ dimm->ce_count = 0;
}
mci->start_time = jiffies;
diff --git a/drivers/edac/ghes_edac.c b/drivers/edac/ghes_edac.c
index 842080d7b33a..e0b90c6d7d63 100644
--- a/drivers/edac/ghes_edac.c
+++ b/drivers/edac/ghes_edac.c
@@ -347,11 +347,8 @@ void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err)
mem_err->mem_dev_handle);
index = get_dimm_smbios_index(mem_err->mem_dev_handle);
- if (index >= 0) {
+ if (index >= 0)
e->top_layer = index;
- e->enable_per_layer_report = true;
- }
-
}
if (p > e->location)
*(p - 1) = '\0';
diff --git a/include/linux/edac.h b/include/linux/edac.h
index 8beb6e834be9..8e72222e50b0 100644
--- a/include/linux/edac.h
+++ b/include/linux/edac.h
@@ -383,6 +383,9 @@ struct dimm_info {
unsigned int csrow, cschannel; /* Points to the old API data */
u16 smbios_handle; /* Handle for SMBIOS type 17 */
+
+ u32 ce_count;
+ u32 ue_count;
};
/**
@@ -453,8 +456,6 @@ struct errcount_attribute_data {
* @location: location of the error
* @label: label of the affected DIMM(s)
* @other_detail: other driver-specific detail about the error
- * @enable_per_layer_report: if false, the error affects all layers
- * (typically, a memory controller error)
*/
struct edac_raw_error_desc {
/*
@@ -475,7 +476,6 @@ struct edac_raw_error_desc {
unsigned long syndrome;
const char *msg;
const char *other_detail;
- bool enable_per_layer_report;
};
/* MEMORY controller information structure
@@ -565,7 +565,6 @@ struct mem_ctl_info {
*/
u32 ce_noinfo_count, ue_noinfo_count;
u32 ue_mc, ce_mc;
- u32 *ce_per_layer[EDAC_MAX_LAYERS], *ue_per_layer[EDAC_MAX_LAYERS];
struct completion complete;
--
2.20.1
Reorder the newly created functions edac_mc_alloc_csrows() and
edac_mc_alloc_dimms() and move them before edac_mc_alloc(). No further
code changes.
Signed-off-by: Robert Richter <[email protected]>
---
drivers/edac/edac_mc.c | 171 ++++++++++++++++++++---------------------
1 file changed, 84 insertions(+), 87 deletions(-)
diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c
index 0db504cb3419..6d880cf4d599 100644
--- a/drivers/edac/edac_mc.c
+++ b/drivers/edac/edac_mc.c
@@ -305,93 +305,6 @@ static void _edac_mc_free(struct mem_ctl_info *mci)
kfree(mci);
}
-static int edac_mc_alloc_csrows(struct mem_ctl_info *mci);
-static int edac_mc_alloc_dimms(struct mem_ctl_info *mci);
-
-struct mem_ctl_info *edac_mc_alloc(unsigned int mc_num,
- unsigned int n_layers,
- struct edac_mc_layer *layers,
- unsigned int sz_pvt)
-{
- struct mem_ctl_info *mci;
- struct edac_mc_layer *layer;
- unsigned int idx, size, tot_dimms = 1;
- unsigned int tot_csrows = 1, tot_channels = 1;
- void *pvt, *ptr = NULL;
- bool per_rank = false;
-
- if (WARN_ON(n_layers > EDAC_MAX_LAYERS || n_layers == 0))
- return NULL;
-
- /*
- * Calculate the total amount of dimms and csrows/cschannels while
- * in the old API emulation mode
- */
- for (idx = 0; idx < n_layers; idx++) {
- tot_dimms *= layers[idx].size;
- if (layers[idx].is_virt_csrow)
- tot_csrows *= layers[idx].size;
- else
- tot_channels *= layers[idx].size;
-
- if (layers[idx].type == EDAC_MC_LAYER_CHIP_SELECT)
- per_rank = true;
- }
-
- /* Figure out the offsets of the various items from the start of an mc
- * structure. We want the alignment of each item to be at least as
- * stringent as what the compiler would provide if we could simply
- * hardcode everything into a single struct.
- */
- mci = edac_align_ptr(&ptr, sizeof(*mci), 1);
- layer = edac_align_ptr(&ptr, sizeof(*layer), n_layers);
- pvt = edac_align_ptr(&ptr, sz_pvt, 1);
- size = ((unsigned long)pvt) + sz_pvt;
-
- edac_dbg(1, "allocating %u bytes for mci data (%d %s, %d csrows/channels)\n",
- size,
- tot_dimms,
- per_rank ? "ranks" : "dimms",
- tot_csrows * tot_channels);
-
- mci = kzalloc(size, GFP_KERNEL);
- if (mci == NULL)
- return NULL;
-
- /* Adjust pointers so they point within the memory we just allocated
- * rather than an imaginary chunk of memory located at address 0.
- */
- layer = (struct edac_mc_layer *)(((char *)mci) + ((unsigned long)layer));
- pvt = sz_pvt ? (((char *)mci) + ((unsigned long)pvt)) : NULL;
-
- /* setup index and various internal pointers */
- mci->mc_idx = mc_num;
- mci->tot_dimms = tot_dimms;
- mci->pvt_info = pvt;
- mci->n_layers = n_layers;
- mci->layers = layer;
- memcpy(mci->layers, layers, sizeof(*layer) * n_layers);
- mci->nr_csrows = tot_csrows;
- mci->num_cschannel = tot_channels;
- mci->csbased = per_rank;
-
- if (edac_mc_alloc_csrows(mci))
- goto error;
-
- if (edac_mc_alloc_dimms(mci))
- goto error;
-
- mci->op_state = OP_ALLOC;
-
- return mci;
-
-error:
- _edac_mc_free(mci);
-
- return NULL;
-}
-EXPORT_SYMBOL_GPL(edac_mc_alloc);
-
static int edac_mc_alloc_csrows(struct mem_ctl_info *mci)
{
unsigned int tot_csrows = mci->nr_csrows;
@@ -520,6 +433,90 @@ static int edac_mc_alloc_dimms(struct mem_ctl_info *mci)
return 0;
}
+struct mem_ctl_info *edac_mc_alloc(unsigned int mc_num,
+ unsigned int n_layers,
+ struct edac_mc_layer *layers,
+ unsigned int sz_pvt)
+{
+ struct mem_ctl_info *mci;
+ struct edac_mc_layer *layer;
+ unsigned int idx, size, tot_dimms = 1;
+ unsigned int tot_csrows = 1, tot_channels = 1;
+ void *pvt, *ptr = NULL;
+ bool per_rank = false;
+
+ if (WARN_ON(n_layers > EDAC_MAX_LAYERS || n_layers == 0))
+ return NULL;
+
+ /*
+ * Calculate the total amount of dimms and csrows/cschannels while
+ * in the old API emulation mode
+ */
+ for (idx = 0; idx < n_layers; idx++) {
+ tot_dimms *= layers[idx].size;
+ if (layers[idx].is_virt_csrow)
+ tot_csrows *= layers[idx].size;
+ else
+ tot_channels *= layers[idx].size;
+
+ if (layers[idx].type == EDAC_MC_LAYER_CHIP_SELECT)
+ per_rank = true;
+ }
+
+ /* Figure out the offsets of the various items from the start of an mc
+ * structure. We want the alignment of each item to be at least as
+ * stringent as what the compiler would provide if we could simply
+ * hardcode everything into a single struct.
+ */
+ mci = edac_align_ptr(&ptr, sizeof(*mci), 1);
+ layer = edac_align_ptr(&ptr, sizeof(*layer), n_layers);
+ pvt = edac_align_ptr(&ptr, sz_pvt, 1);
+ size = ((unsigned long)pvt) + sz_pvt;
+
+ edac_dbg(1, "allocating %u bytes for mci data (%d %s, %d csrows/channels)\n",
+ size,
+ tot_dimms,
+ per_rank ? "ranks" : "dimms",
+ tot_csrows * tot_channels);
+
+ mci = kzalloc(size, GFP_KERNEL);
+ if (mci == NULL)
+ return NULL;
+
+ /* Adjust pointers so they point within the memory we just allocated
+ * rather than an imaginary chunk of memory located at address 0.
+ */
+ layer = (struct edac_mc_layer *)(((char *)mci) + ((unsigned long)layer));
+ pvt = sz_pvt ? (((char *)mci) + ((unsigned long)pvt)) : NULL;
+
+ /* setup index and various internal pointers */
+ mci->mc_idx = mc_num;
+ mci->tot_dimms = tot_dimms;
+ mci->pvt_info = pvt;
+ mci->n_layers = n_layers;
+ mci->layers = layer;
+ memcpy(mci->layers, layers, sizeof(*layer) * n_layers);
+ mci->nr_csrows = tot_csrows;
+ mci->num_cschannel = tot_channels;
+ mci->csbased = per_rank;
+
+ if (edac_mc_alloc_csrows(mci))
+ goto error;
+
+ if (edac_mc_alloc_dimms(mci))
+ goto error;
+
+ mci->op_state = OP_ALLOC;
+
+ return mci;
+
+error:
+ _edac_mc_free(mci);
+
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(edac_mc_alloc);
+
void edac_mc_free(struct mem_ctl_info *mci)
{
edac_dbg(1, "\n");
--
2.20.1
On 10.10.19 20:25:01, Robert Richter wrote:
> This patch set is a rework of the ghes_edac and edac_mc driver. It
> addresses issues found during code review and while working with the
> code. The changes include:
Sorry for the:
Content-Transfer-Encoding: quoted-printable
I definitely sent it out 8bit, but thanks to office365 this was
converted (IMO this is according to RFC but still a pain).
I will switch to another account for sending patches in the future.
-Robert
On Thu, 2019-10-10 at 20:25 +0000, Robert Richter wrote:
> Reduce the indentation level in edac_mc_handle_error() a bit by using
> continue. No functional changes.
Seems fine, but trivially below:
> diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c
[]
> @@ -1171,37 +1171,38 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type,
[]
> + strcpy(p, dimm->label);
> + p += strlen(p);
> + *p = '\0';
This *p = '\0' is unnecessary as the strcpy already did that.
On 10.10.19 15:10:53, Joe Perches wrote:
> On Thu, 2019-10-10 at 20:25 +0000, Robert Richter wrote:
> > Reduce the indentation level in edac_mc_handle_error() a bit by using
> > continue. No functional changes.
>
> Seems fine, but trivially below:
>
> > diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c
> []
> > @@ -1171,37 +1171,38 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type,
> []
> > + strcpy(p, dimm->label);
> > + p += strlen(p);
> > + *p = '\0';
>
> This *p = '\0' is unnecessary as the strcpy already did that.
Other changes than the indentation are not the aim of this patch.
However, while I am at it, and if there are no objections, I could
include this trivial change in the patch in the next version of the
series.
Thanks for review.
-Robert
Em Thu, 10 Oct 2019 20:25:05 +0000
Robert Richter <[email protected]> escreveu:
> The EDAC_DIMM_PTR() macro takes 3 arguments from struct mem_ctl_info.
> Clean up this interface to only pass the mci struct and replace this
> macro with the new function edac_get_dimm().
>
> Also introduce the edac_get_dimm_by_index() function for later use.
> This allows it to get a dimm pointer only by a given index. This can
> be useful if the dimm's position within the layers of the memory
> controller or the exact size of the layers are unknown.
>
> Small style changes made for some hunks after applying the semantic
> patch.
>
> Semantic patch used:
>
> @@ expression mci, a, b,c; @@
>
> -EDAC_DIMM_PTR(mci->layers, mci->dimms, mci->n_layers, a, b, c)
> +edac_get_dimm(mci, a, b, c)
>
> Signed-off-by: Robert Richter <[email protected]>
> ---
> drivers/edac/edac_mc.c | 1 +
> drivers/edac/ghes_edac.c | 7 +--
> drivers/edac/i10nm_base.c | 3 +-
> drivers/edac/i3200_edac.c | 3 +-
> drivers/edac/i5000_edac.c | 5 +-
> drivers/edac/i5100_edac.c | 3 +-
> drivers/edac/i5400_edac.c | 3 +-
> drivers/edac/i7300_edac.c | 3 +-
> drivers/edac/i7core_edac.c | 3 +-
> drivers/edac/ie31200_edac.c | 7 +--
> drivers/edac/pnd2_edac.c | 4 +-
> drivers/edac/sb_edac.c | 2 +-
> drivers/edac/skx_base.c | 3 +-
> drivers/edac/ti_edac.c | 2 +-
> include/linux/edac.h | 92 ++++++++++++++++++++++++-------------
> 15 files changed, 79 insertions(+), 62 deletions(-)
>
> diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c
> index e6fd079783bd..3d45adb5957f 100644
> --- a/drivers/edac/edac_mc.c
> +++ b/drivers/edac/edac_mc.c
> @@ -438,6 +438,7 @@ struct mem_ctl_info *edac_mc_alloc(unsigned int mc_num,
> goto error;
> mci->dimms[off] = dimm;
> dimm->mci = mci;
> + dimm->idx = off;
See my comments about this below. IMO, such change doesn't belong
to this patch.
>
> /*
> * Copy DIMM location and initialize it.
> diff --git a/drivers/edac/ghes_edac.c b/drivers/edac/ghes_edac.c
> index d413a0bdc9ad..fb31e80dfe94 100644
> --- a/drivers/edac/ghes_edac.c
> +++ b/drivers/edac/ghes_edac.c
> @@ -98,9 +98,7 @@ static void ghes_edac_dmidecode(const struct dmi_header *dh, void *arg)
>
> if (dh->type == DMI_ENTRY_MEM_DEVICE) {
> struct memdev_dmi_entry *entry = (struct memdev_dmi_entry *)dh;
> - struct dimm_info *dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms,
> - mci->n_layers,
> - dimm_fill->count, 0, 0);
> + struct dimm_info *dimm = edac_get_dimm(mci, dimm_fill->count, 0, 0);
> u16 rdr_mask = BIT(7) | BIT(13);
>
> if (entry->size == 0xffff) {
> @@ -527,8 +525,7 @@ int ghes_edac_register(struct ghes *ghes, struct device *dev)
> dimm_fill.mci = mci;
> dmi_walk(ghes_edac_dmidecode, &dimm_fill);
> } else {
> - struct dimm_info *dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms,
> - mci->n_layers, 0, 0, 0);
> + struct dimm_info *dimm = edac_get_dimm(mci, 0, 0, 0);
>
> dimm->nr_pages = 1;
> dimm->grain = 128;
> diff --git a/drivers/edac/i10nm_base.c b/drivers/edac/i10nm_base.c
> index c370d5457e6b..059eccf0582b 100644
> --- a/drivers/edac/i10nm_base.c
> +++ b/drivers/edac/i10nm_base.c
> @@ -154,8 +154,7 @@ static int i10nm_get_dimm_config(struct mem_ctl_info *mci)
>
> ndimms = 0;
> for (j = 0; j < I10NM_NUM_DIMMS; j++) {
> - dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms,
> - mci->n_layers, i, j, 0);
> + dimm = edac_get_dimm(mci, i, j, 0);
> mtr = I10NM_GET_DIMMMTR(imc, i, j);
> mcddrtcfg = I10NM_GET_MCDDRTCFG(imc, i, j);
> edac_dbg(1, "dimmmtr 0x%x mcddrtcfg 0x%x (mc%d ch%d dimm%d)\n",
> diff --git a/drivers/edac/i3200_edac.c b/drivers/edac/i3200_edac.c
> index 299b441647cd..432b375a4075 100644
> --- a/drivers/edac/i3200_edac.c
> +++ b/drivers/edac/i3200_edac.c
> @@ -392,8 +392,7 @@ static int i3200_probe1(struct pci_dev *pdev, int dev_idx)
> unsigned long nr_pages;
>
> for (j = 0; j < nr_channels; j++) {
> - struct dimm_info *dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms,
> - mci->n_layers, i, j, 0);
> + struct dimm_info *dimm = edac_get_dimm(mci, i, j, 0);
>
> nr_pages = drb_to_nr_pages(drbs, stacked, j, i);
> if (nr_pages == 0)
> diff --git a/drivers/edac/i5000_edac.c b/drivers/edac/i5000_edac.c
> index 078a7351bf05..1a6f69c859ab 100644
> --- a/drivers/edac/i5000_edac.c
> +++ b/drivers/edac/i5000_edac.c
> @@ -1275,9 +1275,8 @@ static int i5000_init_csrows(struct mem_ctl_info *mci)
> if (!MTR_DIMMS_PRESENT(mtr))
> continue;
>
> - dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms, mci->n_layers,
> - channel / MAX_BRANCHES,
> - channel % MAX_BRANCHES, slot);
> + dimm = edac_get_dimm(mci, channel / MAX_BRANCHES,
> + channel % MAX_BRANCHES, slot);
>
> csrow_megs = pvt->dimm_info[slot][channel].megabytes;
> dimm->grain = 8;
> diff --git a/drivers/edac/i5100_edac.c b/drivers/edac/i5100_edac.c
> index 12bebecb203b..134586753311 100644
> --- a/drivers/edac/i5100_edac.c
> +++ b/drivers/edac/i5100_edac.c
> @@ -858,8 +858,7 @@ static void i5100_init_csrows(struct mem_ctl_info *mci)
> if (!npages)
> continue;
>
> - dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms, mci->n_layers,
> - chan, rank, 0);
> + dimm = edac_get_dimm(mci, chan, rank, 0);
>
> dimm->nr_pages = npages;
> dimm->grain = 32;
> diff --git a/drivers/edac/i5400_edac.c b/drivers/edac/i5400_edac.c
> index 8c86c6fd7da7..f131c05ade9f 100644
> --- a/drivers/edac/i5400_edac.c
> +++ b/drivers/edac/i5400_edac.c
> @@ -1187,8 +1187,7 @@ static int i5400_init_dimms(struct mem_ctl_info *mci)
> if (!MTR_DIMMS_PRESENT(mtr))
> continue;
>
> - dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms, mci->n_layers,
> - channel / 2, channel % 2, slot);
> + dimm = edac_get_dimm(mci, channel / 2, channel % 2, slot);
>
> size_mb = pvt->dimm_info[slot][channel].megabytes;
>
> diff --git a/drivers/edac/i7300_edac.c b/drivers/edac/i7300_edac.c
> index 447d357c7a67..2e9bbe56cde9 100644
> --- a/drivers/edac/i7300_edac.c
> +++ b/drivers/edac/i7300_edac.c
> @@ -794,8 +794,7 @@ static int i7300_init_csrows(struct mem_ctl_info *mci)
> for (ch = 0; ch < max_channel; ch++) {
> int channel = to_channel(ch, branch);
>
> - dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms,
> - mci->n_layers, branch, ch, slot);
> + dimm = edac_get_dimm(mci, branch, ch, slot);
>
> dinfo = &pvt->dimm_info[slot][channel];
>
> diff --git a/drivers/edac/i7core_edac.c b/drivers/edac/i7core_edac.c
> index a71cca6eeb33..b3135b208f9a 100644
> --- a/drivers/edac/i7core_edac.c
> +++ b/drivers/edac/i7core_edac.c
> @@ -585,8 +585,7 @@ static int get_dimm_config(struct mem_ctl_info *mci)
> if (!DIMM_PRESENT(dimm_dod[j]))
> continue;
>
> - dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms, mci->n_layers,
> - i, j, 0);
> + dimm = edac_get_dimm(mci, i, j, 0);
> banks = numbank(MC_DOD_NUMBANK(dimm_dod[j]));
> ranks = numrank(MC_DOD_NUMRANK(dimm_dod[j]));
> rows = numrow(MC_DOD_NUMROW(dimm_dod[j]));
> diff --git a/drivers/edac/ie31200_edac.c b/drivers/edac/ie31200_edac.c
> index d26300f9cb07..4f65073f230b 100644
> --- a/drivers/edac/ie31200_edac.c
> +++ b/drivers/edac/ie31200_edac.c
> @@ -490,9 +490,7 @@ static int ie31200_probe1(struct pci_dev *pdev, int dev_idx)
>
> if (dimm_info[j][i].dual_rank) {
> nr_pages = nr_pages / 2;
> - dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms,
> - mci->n_layers, (i * 2) + 1,
> - j, 0);
> + dimm = edac_get_dimm(mci, (i * 2) + 1, j, 0);
> dimm->nr_pages = nr_pages;
> edac_dbg(0, "set nr pages: 0x%lx\n", nr_pages);
> dimm->grain = 8; /* just a guess */
> @@ -503,8 +501,7 @@ static int ie31200_probe1(struct pci_dev *pdev, int dev_idx)
> dimm->dtype = DEV_UNKNOWN;
> dimm->edac_mode = EDAC_UNKNOWN;
> }
> - dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms,
> - mci->n_layers, i * 2, j, 0);
> + dimm = edac_get_dimm(mci, i * 2, j, 0);
> dimm->nr_pages = nr_pages;
> edac_dbg(0, "set nr pages: 0x%lx\n", nr_pages);
> dimm->grain = 8; /* same guess */
> diff --git a/drivers/edac/pnd2_edac.c b/drivers/edac/pnd2_edac.c
> index b1193be1ef1d..933f7722b893 100644
> --- a/drivers/edac/pnd2_edac.c
> +++ b/drivers/edac/pnd2_edac.c
> @@ -1231,7 +1231,7 @@ static void apl_get_dimm_config(struct mem_ctl_info *mci)
> if (!(chan_mask & BIT(i)))
> continue;
>
> - dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms, mci->n_layers, i, 0, 0);
> + dimm = edac_get_dimm(mci, i, 0, 0);
> if (!dimm) {
> edac_dbg(0, "No allocated DIMM for channel %d\n", i);
> continue;
> @@ -1311,7 +1311,7 @@ static void dnv_get_dimm_config(struct mem_ctl_info *mci)
> if (!ranks_of_dimm[j])
> continue;
>
> - dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms, mci->n_layers, i, j, 0);
> + dimm = edac_get_dimm(mci, i, j, 0);
> if (!dimm) {
> edac_dbg(0, "No allocated DIMM for channel %d DIMM %d\n", i, j);
> continue;
> diff --git a/drivers/edac/sb_edac.c b/drivers/edac/sb_edac.c
> index a2fd39d330d6..4957e8ee1879 100644
> --- a/drivers/edac/sb_edac.c
> +++ b/drivers/edac/sb_edac.c
> @@ -1621,7 +1621,7 @@ static int __populate_dimms(struct mem_ctl_info *mci,
> }
>
> for (j = 0; j < max_dimms_per_channel; j++) {
> - dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms, mci->n_layers, i, j, 0);
> + dimm = edac_get_dimm(mci, i, j, 0);
> if (pvt->info.type == KNIGHTS_LANDING) {
> pci_read_config_dword(pvt->knl.pci_channel[i],
> knl_mtr_reg, &mtr);
> diff --git a/drivers/edac/skx_base.c b/drivers/edac/skx_base.c
> index 0fcf3785e8f3..8895eda365ff 100644
> --- a/drivers/edac/skx_base.c
> +++ b/drivers/edac/skx_base.c
> @@ -177,8 +177,7 @@ static int skx_get_dimm_config(struct mem_ctl_info *mci)
> pci_read_config_dword(imc->chan[i].cdev, 0x8C, &amap);
> pci_read_config_dword(imc->chan[i].cdev, 0x400, &mcddrtcfg);
> for (j = 0; j < SKX_NUM_DIMMS; j++) {
> - dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms,
> - mci->n_layers, i, j, 0);
> + dimm = edac_get_dimm(mci, i, j, 0);
> pci_read_config_dword(imc->chan[i].cdev,
> 0x80 + 4 * j, &mtr);
> if (IS_DIMM_PRESENT(mtr)) {
> diff --git a/drivers/edac/ti_edac.c b/drivers/edac/ti_edac.c
> index 6ac26d1b929f..8be3e89a510e 100644
> --- a/drivers/edac/ti_edac.c
> +++ b/drivers/edac/ti_edac.c
> @@ -135,7 +135,7 @@ static void ti_edac_setup_dimm(struct mem_ctl_info *mci, u32 type)
> u32 val;
> u32 memsize;
>
> - dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms, mci->n_layers, 0, 0, 0);
> + dimm = edac_get_dimm(mci, 0, 0, 0);
>
> val = ti_edac_readl(edac, EMIF_SDRAM_CONFIG);
>
> diff --git a/include/linux/edac.h b/include/linux/edac.h
> index c19483b90079..7f9804438589 100644
> --- a/include/linux/edac.h
> +++ b/include/linux/edac.h
> @@ -403,37 +403,6 @@ struct edac_mc_layer {
> __i; \
> })
>
> -/**
> - * EDAC_DIMM_PTR - Macro responsible to get a pointer inside a pointer array
> - * for the element given by [layer0,layer1,layer2] position
> - *
> - * @layers: a struct edac_mc_layer array, describing how many elements
> - * were allocated for each layer
> - * @var: name of the var where we want to get the pointer
> - * (like mci->dimms)
> - * @nlayers: Number of layers at the @layers array
> - * @layer0: layer0 position
> - * @layer1: layer1 position. Unused if n_layers < 2
> - * @layer2: layer2 position. Unused if n_layers < 3
> - *
> - * For 1 layer, this macro returns "var[layer0]";
> - *
> - * For 2 layers, this macro is similar to allocate a bi-dimensional array
> - * and to return "var[layer0][layer1]";
> - *
> - * For 3 layers, this macro is similar to allocate a tri-dimensional array
> - * and to return "var[layer0][layer1][layer2]";
> - */
> -#define EDAC_DIMM_PTR(layers, var, nlayers, layer0, layer1, layer2) ({ \
> - typeof(*var) __p; \
> - int ___i = EDAC_DIMM_OFF(layers, nlayers, layer0, layer1, layer2); \
> - if (___i < 0) \
> - __p = NULL; \
> - else \
> - __p = (var)[___i]; \
> - __p; \
> -})
> -
> struct dimm_info {
> struct device dev;
>
> @@ -443,6 +412,7 @@ struct dimm_info {
> unsigned int location[EDAC_MAX_LAYERS];
>
> struct mem_ctl_info *mci; /* the parent */
> + unsigned int idx; /* index within the parent dimm array */
Same comment here. This doesn't belong to this patch.
>
> u32 grain; /* granularity of reported error in bytes */
> enum dev_type dtype; /* memory device type */
> @@ -669,4 +639,64 @@ struct mem_ctl_info {
> bool fake_inject_ue;
> u16 fake_inject_count;
> };
> +
> +/**
> + * edac_get_dimm_by_index - Get DIMM info from a memory controller
> + * given by an index
> + *
> + * @mci: a struct mem_ctl_info
> + * @index: index in the memory controller's DIMM array
> + *
> + * Returns a struct dimm_info* or NULL on failure.
> + */
> +static inline struct dimm_info *
> +edac_get_dimm_by_index(struct mem_ctl_info *mci, int index)
> +{
> + if (index < 0 || index >= mci->tot_dimms)
> + return NULL;
> +
> + if (WARN_ON_ONCE(mci->dimms[index]->idx != index))
> + return NULL;
As far as I noticed, the only place you're using the new
dimm_info.idx field is here...
It sounds like you want to add some sanity check.
If I get it right, the addition of the index doesn't belong to
this patch, as it is unrelated to the macro replacement.
Also, it is not clear why you added it. Are there any bug
with would be caught by this extra check?
Or all you want to do is to add some magic number to double
check if the struct got corrupted? If so, I would initialize
it with:
dimm->idx = off + MAGIC_NUMBER;
and, at the check, do the same math, as it would be a lot less
likely to have a random magic number on an address than small
numbers like 0.
Anyway, if you have a good reason to add this idx, please place
it in a separate patch, with a proper description of why
it is needed.
> +
> + return mci->dimms[index];
> +}
> +
> +/**
> + * edac_get_dimm - Get DIMM info from a memory controller given by
> + * [layer0,layer1,layer2] position
> + *
> + * @mci: a struct mem_ctl_info
> + * @layer0: layer0 position
> + * @layer1: layer1 position. Unused if n_layers < 2
> + * @layer2: layer2 position. Unused if n_layers < 3
> + *
> + * For 1 layer, this function returns "dimms[layer0]";
> + *
> + * For 2 layers, this function is similar to allocate a bi-dimensional
> + * array and to return "dimms[layer0][layer1]";
> + *
> + * For 3 layers, this function is similar to allocate a tri-dimensional array
> + * and to return "dimms[layer0][layer1][layer2]";
> + */
> +static inline struct dimm_info *edac_get_dimm(struct mem_ctl_info *mci,
> + int layer0, int layer1, int layer2)
> +{
> + int index;
> +
> + if (layer0 < 0
> + || (mci->n_layers > 1 && layer1 < 0)
> + || (mci->n_layers > 2 && layer2 < 0))
> + return NULL;
> +
> + index = layer0;
> +
> + if (mci->n_layers > 1)
> + index = index * mci->layers[1].size + layer1;
> +
> + if (mci->n_layers > 2)
> + index = index * mci->layers[2].size + layer2;
> +
> + return edac_get_dimm_by_index(mci, index);
> +}
> +
> #endif
Thanks,
Mauro
Em Thu, 10 Oct 2019 20:25:07 +0000
Robert Richter <[email protected]> escreveu:
> The EDAC_DIMM_OFF() macro takes 5 arguments to get the DIMM's index.
> This can be much simplified now as the offset is already stored in
> struct dimm_info. Use it directly and completely remove the
> EDAC_DIMM_OFF() macro.
Ah, now I understand why you added the dimm idx field. One more reason
to split it into a separate patch (or perhaps merge it here).
>
> Another advantage is that edac_mc_alloc() could be used even if the
> exact size of the layers is unknown. Only the number of DIMMs would be
> needed.
>
> Signed-off-by: Robert Richter <[email protected]>
> ---
> drivers/edac/edac_mc.c | 15 +++++--------
> drivers/edac/edac_mc_sysfs.c | 20 ++++--------------
> include/linux/edac.h | 41 ------------------------------------
> 3 files changed, 9 insertions(+), 67 deletions(-)
>
> diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c
> index 3d45adb5957f..72088d49b03b 100644
> --- a/drivers/edac/edac_mc.c
> +++ b/drivers/edac/edac_mc.c
> @@ -314,10 +314,10 @@ struct mem_ctl_info *edac_mc_alloc(unsigned int mc_num,
> struct dimm_info *dimm;
> u32 *ce_per_layer[EDAC_MAX_LAYERS], *ue_per_layer[EDAC_MAX_LAYERS];
> unsigned int pos[EDAC_MAX_LAYERS];
> - unsigned int size, tot_dimms = 1, count = 1;
> + unsigned int idx, size, tot_dimms = 1, count = 1;
> unsigned int tot_csrows = 1, tot_channels = 1, tot_errcount = 0;
> void *pvt, *p, *ptr = NULL;
> - int i, j, row, chn, n, len, off;
> + int i, j, row, chn, n, len;
> bool per_rank = false;
>
> BUG_ON(n_layers > EDAC_MAX_LAYERS || n_layers == 0);
> @@ -425,20 +425,15 @@ struct mem_ctl_info *edac_mc_alloc(unsigned int mc_num,
> memset(&pos, 0, sizeof(pos));
> row = 0;
> chn = 0;
> - for (i = 0; i < tot_dimms; i++) {
> + for (idx = 0; idx < tot_dimms; idx++) {
> chan = mci->csrows[row]->channels[chn];
> - off = EDAC_DIMM_OFF(layer, n_layers, pos[0], pos[1], pos[2]);
> - if (off < 0 || off >= tot_dimms) {
> - edac_mc_printk(mci, KERN_ERR, "EDAC core bug: EDAC_DIMM_OFF is trying to do an illegal data access\n");
> - goto error;
> - }
>
> dimm = kzalloc(sizeof(**mci->dimms), GFP_KERNEL);
> if (!dimm)
> goto error;
> - mci->dimms[off] = dimm;
> + mci->dimms[idx] = dimm;
> dimm->mci = mci;
> - dimm->idx = off;
> + dimm->idx = idx;
>
> /*
> * Copy DIMM location and initialize it.
> diff --git a/drivers/edac/edac_mc_sysfs.c b/drivers/edac/edac_mc_sysfs.c
> index 32d016f1ecd1..91e4c8f155af 100644
> --- a/drivers/edac/edac_mc_sysfs.c
> +++ b/drivers/edac/edac_mc_sysfs.c
> @@ -557,14 +557,8 @@ static ssize_t dimmdev_ce_count_show(struct device *dev,
> {
> struct dimm_info *dimm = to_dimm(dev);
> u32 count;
> - int off;
> -
> - off = EDAC_DIMM_OFF(dimm->mci->layers,
> - dimm->mci->n_layers,
> - dimm->location[0],
> - dimm->location[1],
> - dimm->location[2]);
> - count = dimm->mci->ce_per_layer[dimm->mci->n_layers-1][off];
> +
> + count = dimm->mci->ce_per_layer[dimm->mci->n_layers-1][dimm->idx];
> return sprintf(data, "%u\n", count);
> }
>
> @@ -574,14 +568,8 @@ static ssize_t dimmdev_ue_count_show(struct device *dev,
> {
> struct dimm_info *dimm = to_dimm(dev);
> u32 count;
> - int off;
> -
> - off = EDAC_DIMM_OFF(dimm->mci->layers,
> - dimm->mci->n_layers,
> - dimm->location[0],
> - dimm->location[1],
> - dimm->location[2]);
> - count = dimm->mci->ue_per_layer[dimm->mci->n_layers-1][off];
> +
> + count = dimm->mci->ue_per_layer[dimm->mci->n_layers-1][dimm->idx];
> return sprintf(data, "%u\n", count);
> }
>
> diff --git a/include/linux/edac.h b/include/linux/edac.h
> index 7f9804438589..79c5564ee314 100644
> --- a/include/linux/edac.h
> +++ b/include/linux/edac.h
> @@ -362,47 +362,6 @@ struct edac_mc_layer {
> */
> #define EDAC_MAX_LAYERS 3
>
> -/**
> - * EDAC_DIMM_OFF - Macro responsible to get a pointer offset inside a pointer
> - * array for the element given by [layer0,layer1,layer2]
> - * position
> - *
> - * @layers: a struct edac_mc_layer array, describing how many elements
> - * were allocated for each layer
> - * @nlayers: Number of layers at the @layers array
> - * @layer0: layer0 position
> - * @layer1: layer1 position. Unused if n_layers < 2
> - * @layer2: layer2 position. Unused if n_layers < 3
> - *
> - * For 1 layer, this macro returns "var[layer0] - var";
> - *
> - * For 2 layers, this macro is similar to allocate a bi-dimensional array
> - * and to return "var[layer0][layer1] - var";
> - *
> - * For 3 layers, this macro is similar to allocate a tri-dimensional array
> - * and to return "var[layer0][layer1][layer2] - var".
> - *
> - * A loop could be used here to make it more generic, but, as we only have
> - * 3 layers, this is a little faster.
> - *
> - * By design, layers can never be 0 or more than 3. If that ever happens,
> - * a NULL is returned, causing an OOPS during the memory allocation routine,
> - * with would point to the developer that he's doing something wrong.
> - */
> -#define EDAC_DIMM_OFF(layers, nlayers, layer0, layer1, layer2) ({ \
> - int __i; \
> - if ((nlayers) == 1) \
> - __i = layer0; \
> - else if ((nlayers) == 2) \
> - __i = (layer1) + ((layers[1]).size * (layer0)); \
> - else if ((nlayers) == 3) \
> - __i = (layer2) + ((layers[2]).size * ((layer1) + \
> - ((layers[1]).size * (layer0)))); \
> - else \
> - __i = -EINVAL; \
> - __i; \
> -})
> -
> struct dimm_info {
> struct device dev;
>
Thanks,
Mauro
Em Thu, 10 Oct 2019 20:25:10 +0000
Robert Richter <[email protected]> escreveu:
> Introduce the mci_for_each_dimm() iterator. It returns a pointer to a
> struct dimm_info. This makes the declaration and use of an index
> obsolete and avoids access to internal data of struct mci (direct
> array access etc).
>
> Signed-off-by: Robert Richter <[email protected]>
Reviewed-by: Mauro Carvalho Chehab <[email protected]>
> ---
> drivers/edac/edac_mc.c | 19 +++++++++++--------
> drivers/edac/edac_mc_sysfs.c | 29 ++++++++++++-----------------
> drivers/edac/ghes_edac.c | 8 ++++----
> drivers/edac/i5100_edac.c | 13 +++++--------
> include/linux/edac.h | 7 +++++++
> 5 files changed, 39 insertions(+), 37 deletions(-)
>
> diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c
> index 72088d49b03b..c5240bb4c6c0 100644
> --- a/drivers/edac/edac_mc.c
> +++ b/drivers/edac/edac_mc.c
> @@ -145,15 +145,18 @@ static void edac_mc_dump_channel(struct rank_info *chan)
> edac_dbg(4, " channel->dimm = %p\n", chan->dimm);
> }
>
> -static void edac_mc_dump_dimm(struct dimm_info *dimm, int number)
> +static void edac_mc_dump_dimm(struct dimm_info *dimm)
> {
> char location[80];
>
> + if (!dimm->nr_pages)
> + return;
> +
> edac_dimm_info_location(dimm, location, sizeof(location));
>
> edac_dbg(4, "%s%i: %smapped as virtual row %d, chan %d\n",
> dimm->mci->csbased ? "rank" : "dimm",
> - number, location, dimm->csrow, dimm->cschannel);
> + dimm->idx, location, dimm->csrow, dimm->cschannel);
> edac_dbg(4, " dimm = %p\n", dimm);
> edac_dbg(4, " dimm->label = '%s'\n", dimm->label);
> edac_dbg(4, " dimm->nr_pages = 0x%x\n", dimm->nr_pages);
> @@ -702,6 +705,7 @@ EXPORT_SYMBOL_GPL(edac_get_owner);
> int edac_mc_add_mc_with_groups(struct mem_ctl_info *mci,
> const struct attribute_group **groups)
> {
> + struct dimm_info *dimm;
> int ret = -EINVAL;
> edac_dbg(0, "\n");
>
> @@ -726,9 +730,9 @@ int edac_mc_add_mc_with_groups(struct mem_ctl_info *mci,
> if (csrow->channels[j]->dimm->nr_pages)
> edac_mc_dump_channel(csrow->channels[j]);
> }
> - for (i = 0; i < mci->tot_dimms; i++)
> - if (mci->dimms[i]->nr_pages)
> - edac_mc_dump_dimm(mci->dimms[i], i);
> +
> + mci_for_each_dimm(mci, dimm)
> + edac_mc_dump_dimm(dimm);
> }
> #endif
> mutex_lock(&mem_ctls_mutex);
> @@ -1086,6 +1090,7 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type,
> const char *msg,
> const char *other_detail)
> {
> + struct dimm_info *dimm;
> char *p;
> int row = -1, chan = -1;
> int pos[EDAC_MAX_LAYERS] = { top_layer, mid_layer, low_layer };
> @@ -1146,9 +1151,7 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type,
> p = e->label;
> *p = '\0';
>
> - for (i = 0; i < mci->tot_dimms; i++) {
> - struct dimm_info *dimm = mci->dimms[i];
> -
> + mci_for_each_dimm(mci, dimm) {
> if (top_layer >= 0 && top_layer != dimm->location[0])
> continue;
> if (mid_layer >= 0 && mid_layer != dimm->location[1])
> diff --git a/drivers/edac/edac_mc_sysfs.c b/drivers/edac/edac_mc_sysfs.c
> index 91e4c8f155af..0367554e7437 100644
> --- a/drivers/edac/edac_mc_sysfs.c
> +++ b/drivers/edac/edac_mc_sysfs.c
> @@ -621,8 +621,7 @@ static const struct device_type dimm_attr_type = {
>
> /* Create a DIMM object under specifed memory controller device */
> static int edac_create_dimm_object(struct mem_ctl_info *mci,
> - struct dimm_info *dimm,
> - int index)
> + struct dimm_info *dimm)
> {
> int err;
> dimm->mci = mci;
> @@ -632,9 +631,9 @@ static int edac_create_dimm_object(struct mem_ctl_info *mci,
>
> dimm->dev.parent = &mci->dev;
> if (mci->csbased)
> - dev_set_name(&dimm->dev, "rank%d", index);
> + dev_set_name(&dimm->dev, "rank%d", dimm->idx);
> else
> - dev_set_name(&dimm->dev, "dimm%d", index);
> + dev_set_name(&dimm->dev, "dimm%d", dimm->idx);
> dev_set_drvdata(&dimm->dev, dimm);
> pm_runtime_forbid(&mci->dev);
>
> @@ -916,7 +915,8 @@ static const struct device_type mci_attr_type = {
> int edac_create_sysfs_mci_device(struct mem_ctl_info *mci,
> const struct attribute_group **groups)
> {
> - int i, err;
> + struct dimm_info *dimm;
> + int err;
>
> /* get the /sys/devices/system/edac subsys reference */
> mci->dev.type = &mci_attr_type;
> @@ -940,13 +940,12 @@ int edac_create_sysfs_mci_device(struct mem_ctl_info *mci,
> /*
> * Create the dimm/rank devices
> */
> - for (i = 0; i < mci->tot_dimms; i++) {
> - struct dimm_info *dimm = mci->dimms[i];
> + mci_for_each_dimm(mci, dimm) {
> /* Only expose populated DIMMs */
> if (!dimm->nr_pages)
> continue;
>
> - err = edac_create_dimm_object(mci, dimm, i);
> + err = edac_create_dimm_object(mci, dimm);
> if (err)
> goto fail_unregister_dimm;
> }
> @@ -961,12 +960,9 @@ int edac_create_sysfs_mci_device(struct mem_ctl_info *mci,
> return 0;
>
> fail_unregister_dimm:
> - for (i--; i >= 0; i--) {
> - struct dimm_info *dimm = mci->dimms[i];
> - if (!dimm->nr_pages)
> - continue;
> -
> - device_unregister(&dimm->dev);
> + mci_for_each_dimm(mci, dimm) {
> + if (device_is_registered(&dimm->dev))
> + device_unregister(&dimm->dev);
> }
> device_unregister(&mci->dev);
>
> @@ -978,7 +974,7 @@ int edac_create_sysfs_mci_device(struct mem_ctl_info *mci,
> */
> void edac_remove_sysfs_mci_device(struct mem_ctl_info *mci)
> {
> - int i;
> + struct dimm_info *dimm;
>
> edac_dbg(0, "\n");
>
> @@ -989,8 +985,7 @@ void edac_remove_sysfs_mci_device(struct mem_ctl_info *mci)
> edac_delete_csrow_objects(mci);
> #endif
>
> - for (i = 0; i < mci->tot_dimms; i++) {
> - struct dimm_info *dimm = mci->dimms[i];
> + mci_for_each_dimm(mci, dimm) {
> if (dimm->nr_pages == 0)
> continue;
> edac_dbg(1, "unregistering device %s\n", dev_name(&dimm->dev));
> diff --git a/drivers/edac/ghes_edac.c b/drivers/edac/ghes_edac.c
> index fb31e80dfe94..842080d7b33a 100644
> --- a/drivers/edac/ghes_edac.c
> +++ b/drivers/edac/ghes_edac.c
> @@ -82,11 +82,11 @@ static void ghes_edac_count_dimms(const struct dmi_header *dh, void *arg)
> static int get_dimm_smbios_index(u16 handle)
> {
> struct mem_ctl_info *mci = ghes_pvt->mci;
> - int i;
> + struct dimm_info *dimm;
>
> - for (i = 0; i < mci->tot_dimms; i++) {
> - if (mci->dimms[i]->smbios_handle == handle)
> - return i;
> + mci_for_each_dimm(mci, dimm) {
> + if (dimm->smbios_handle == handle)
> + return dimm->idx;
> }
> return -1;
> }
> diff --git a/drivers/edac/i5100_edac.c b/drivers/edac/i5100_edac.c
> index 134586753311..0ddc41e47a96 100644
> --- a/drivers/edac/i5100_edac.c
> +++ b/drivers/edac/i5100_edac.c
> @@ -846,20 +846,17 @@ static void i5100_init_interleaving(struct pci_dev *pdev,
>
> static void i5100_init_csrows(struct mem_ctl_info *mci)
> {
> - int i;
> struct i5100_priv *priv = mci->pvt_info;
> + struct dimm_info *dimm;
>
> - for (i = 0; i < mci->tot_dimms; i++) {
> - struct dimm_info *dimm;
> - const unsigned long npages = i5100_npages(mci, i);
> - const unsigned int chan = i5100_csrow_to_chan(mci, i);
> - const unsigned int rank = i5100_csrow_to_rank(mci, i);
> + mci_for_each_dimm(mci, dimm) {
> + const unsigned long npages = i5100_npages(mci, dimm->idx);
> + const unsigned int chan = i5100_csrow_to_chan(mci, dimm->idx);
> + const unsigned int rank = i5100_csrow_to_rank(mci, dimm->idx);
>
> if (!npages)
> continue;
>
> - dimm = edac_get_dimm(mci, chan, rank, 0);
> -
> dimm->nr_pages = npages;
> dimm->grain = 32;
> dimm->dtype = (priv->mtr[chan][rank].width == 4) ?
> diff --git a/include/linux/edac.h b/include/linux/edac.h
> index 79c5564ee314..8beb6e834be9 100644
> --- a/include/linux/edac.h
> +++ b/include/linux/edac.h
> @@ -599,6 +599,13 @@ struct mem_ctl_info {
> u16 fake_inject_count;
> };
>
> +#define mci_for_each_dimm(mci, dimm) \
> + for ((dimm) = (mci)->dimms[0]; \
> + (dimm); \
> + (dimm) = (dimm)->idx + 1 < (mci)->tot_dimms \
> + ? (mci)->dimms[(dimm)->idx + 1] \
> + : NULL)
> +
> /**
> * edac_get_dimm_by_index - Get DIMM info from a memory controller
> * given by an index
Thanks,
Mauro
Em Thu, 10 Oct 2019 20:25:14 +0000
Robert Richter <[email protected]> escreveu:
> Reduce the indentation level in edac_mc_handle_error() a bit by using
> continue. No functional changes.
>
> Signed-off-by: Robert Richter <[email protected]>
Reviewed-by: Mauro Carvalho Chehab <[email protected]>
> ---
> drivers/edac/edac_mc.c | 59 +++++++++++++++++++++---------------------
> 1 file changed, 30 insertions(+), 29 deletions(-)
>
> diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c
> index f2cbca77bc50..45b02bb31964 100644
> --- a/drivers/edac/edac_mc.c
> +++ b/drivers/edac/edac_mc.c
> @@ -1171,37 +1171,38 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type,
> * channel/memory controller/... may be affected.
> * Also, don't show errors for empty DIMM slots.
> */
> - if (e->enable_per_layer_report && dimm->nr_pages) {
> - if (n_labels >= EDAC_MAX_LABELS) {
> - e->enable_per_layer_report = false;
> - break;
> - }
> - n_labels++;
> - if (p != e->label) {
> - strcpy(p, OTHER_LABEL);
> - p += strlen(OTHER_LABEL);
> - }
> - strcpy(p, dimm->label);
> - p += strlen(p);
> - *p = '\0';
> + if (!e->enable_per_layer_report || !dimm->nr_pages)
> + continue;
>
> - /*
> - * get csrow/channel of the DIMM, in order to allow
> - * incrementing the compat API counters
> - */
> - edac_dbg(4, "%s csrows map: (%d,%d)\n",
> - mci->csbased ? "rank" : "dimm",
> - dimm->csrow, dimm->cschannel);
> - if (row == -1)
> - row = dimm->csrow;
> - else if (row >= 0 && row != dimm->csrow)
> - row = -2;
> -
> - if (chan == -1)
> - chan = dimm->cschannel;
> - else if (chan >= 0 && chan != dimm->cschannel)
> - chan = -2;
> + if (n_labels >= EDAC_MAX_LABELS) {
> + e->enable_per_layer_report = false;
> + break;
> + }
> + n_labels++;
> + if (p != e->label) {
> + strcpy(p, OTHER_LABEL);
> + p += strlen(OTHER_LABEL);
> }
> + strcpy(p, dimm->label);
> + p += strlen(p);
> + *p = '\0';
> +
> + /*
> + * get csrow/channel of the DIMM, in order to allow
> + * incrementing the compat API counters
> + */
> + edac_dbg(4, "%s csrows map: (%d,%d)\n",
> + mci->csbased ? "rank" : "dimm",
> + dimm->csrow, dimm->cschannel);
> + if (row == -1)
> + row = dimm->csrow;
> + else if (row >= 0 && row != dimm->csrow)
> + row = -2;
> +
> + if (chan == -1)
> + chan = dimm->cschannel;
> + else if (chan >= 0 && chan != dimm->cschannel)
> + chan = -2;
> }
>
> if (!e->enable_per_layer_report) {
Thanks,
Mauro
Em Thu, 10 Oct 2019 20:25:12 +0000
Robert Richter <[email protected]> escreveu:
> No need to crash the system in case edac_mc_alloc() is called with
> invalid arguments, just warn and return. This would cause a checkpatch
> warning when touching the code later, so just fix it.
>
> Signed-off-by: Robert Richter <[email protected]>
Reviewed-by: Mauro Carvalho Chehab <[email protected]>
> ---
> drivers/edac/edac_mc.c | 4 +++-
> 1 file changed, 3 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c
> index c5240bb4c6c0..f2cbca77bc50 100644
> --- a/drivers/edac/edac_mc.c
> +++ b/drivers/edac/edac_mc.c
> @@ -323,7 +323,9 @@ struct mem_ctl_info *edac_mc_alloc(unsigned int mc_num,
> int i, j, row, chn, n, len;
> bool per_rank = false;
>
> - BUG_ON(n_layers > EDAC_MAX_LAYERS || n_layers == 0);
> + if (WARN_ON(n_layers > EDAC_MAX_LAYERS || n_layers == 0))
> + return NULL;
> +
> /*
> * Calculate the total amount of dimms and csrows/cschannels while
> * in the old API emulation mode
Thanks,
Mauro
Em Thu, 10 Oct 2019 15:10:53 -0700
Joe Perches <[email protected]> escreveu:
> On Thu, 2019-10-10 at 20:25 +0000, Robert Richter wrote:
> > Reduce the indentation level in edac_mc_handle_error() a bit by using
> > continue. No functional changes.
>
> Seems fine, but trivially below:
>
> > diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c
> []
> > @@ -1171,37 +1171,38 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type,
> []
> > + strcpy(p, dimm->label);
> > + p += strlen(p);
> > + *p = '\0';
>
> This *p = '\0' is unnecessary as the strcpy already did that.
True, but better to put it in a separate patch, as it makes it
easier to review if you don't mix code de-indent with changes.
Also, maybe there are other occurrences of this pattern.
Thanks,
Mauro
Em Thu, 10 Oct 2019 20:25:18 +0000
Robert Richter <[email protected]> escreveu:
> Rename iterator variable to idx. The name is more handy, esp. when
> searching it in the code.
>
> Signed-off-by: Robert Richter <[email protected]>
Reviewed-by: Mauro Carvalho Chehab <[email protected]>
> ---
> drivers/edac/edac_mc.c | 14 +++++++-------
> 1 file changed, 7 insertions(+), 7 deletions(-)
>
> diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c
> index c1e142643006..a893f793be8a 100644
> --- a/drivers/edac/edac_mc.c
> +++ b/drivers/edac/edac_mc.c
> @@ -319,7 +319,7 @@ struct mem_ctl_info *edac_mc_alloc(unsigned int mc_num,
> unsigned int idx, size, tot_dimms = 1;
> unsigned int tot_csrows = 1, tot_channels = 1;
> void *pvt, *p, *ptr = NULL;
> - int i, j, row, chn, n, len;
> + int j, row, chn, n, len;
> bool per_rank = false;
>
> if (WARN_ON(n_layers > EDAC_MAX_LAYERS || n_layers == 0))
> @@ -329,14 +329,14 @@ struct mem_ctl_info *edac_mc_alloc(unsigned int mc_num,
> * Calculate the total amount of dimms and csrows/cschannels while
> * in the old API emulation mode
> */
> - for (i = 0; i < n_layers; i++) {
> - tot_dimms *= layers[i].size;
> - if (layers[i].is_virt_csrow)
> - tot_csrows *= layers[i].size;
> + for (idx = 0; idx < n_layers; idx++) {
> + tot_dimms *= layers[idx].size;
> + if (layers[idx].is_virt_csrow)
> + tot_csrows *= layers[idx].size;
> else
> - tot_channels *= layers[i].size;
> + tot_channels *= layers[idx].size;
>
> - if (layers[i].type == EDAC_MC_LAYER_CHIP_SELECT)
> + if (layers[idx].type == EDAC_MC_LAYER_CHIP_SELECT)
> per_rank = true;
> }
>
Thanks,
Mauro
Em Thu, 10 Oct 2019 20:25:16 +0000
Robert Richter <[email protected]> escreveu:
> Looking at how mci->{ue,ce}_per_layer[EDAC_MAX_LAYERS] is used, it
> turns out that only the leaves in the memory hierarchy are consumed
> (in sysfs), but not the intermediate layers, e.g.:
>
> count = dimm->mci->ce_per_layer[dimm->mci->n_layers-1][dimm->idx];
>
> These unused counters only add complexity, remove them. The error
> counter values are directly stored in struct dimm_info now.
Hmm... not sure if this patch is correct. I remember that there are some
border cases on some drivers (maybe the 3-layer drivers used together
with RDIMM memory controllers?) where some errors are not associated
with a specific DIMM, but, instead, are related to a problem on the memory
bus.
Also, depending on how the memory controllers are organized[1], the ECC
logic groups memory in DIMM pairs. So, when an error occurs, it may be
either at DIMM1 or DIMM2.
[1] On Intel, this happens with pre-Nehalem memory controllers.
Due to that, storing errors in the dimm struct sounds wrong, as the
error may affect multiple DIMMs or even the entire layer.
>
> Signed-off-by: Robert Richter <[email protected]>
> ---
> drivers/edac/edac_mc.c | 104 ++++++++++++-----------------------
> drivers/edac/edac_mc_sysfs.c | 20 +++----
> drivers/edac/ghes_edac.c | 5 +-
> include/linux/edac.h | 7 +--
> 4 files changed, 46 insertions(+), 90 deletions(-)
>
> diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c
> index 45b02bb31964..c1e142643006 100644
> --- a/drivers/edac/edac_mc.c
> +++ b/drivers/edac/edac_mc.c
> @@ -315,10 +315,9 @@ struct mem_ctl_info *edac_mc_alloc(unsigned int mc_num,
> struct csrow_info *csr;
> struct rank_info *chan;
> struct dimm_info *dimm;
> - u32 *ce_per_layer[EDAC_MAX_LAYERS], *ue_per_layer[EDAC_MAX_LAYERS];
> unsigned int pos[EDAC_MAX_LAYERS];
> - unsigned int idx, size, tot_dimms = 1, count = 1;
> - unsigned int tot_csrows = 1, tot_channels = 1, tot_errcount = 0;
> + unsigned int idx, size, tot_dimms = 1;
> + unsigned int tot_csrows = 1, tot_channels = 1;
> void *pvt, *p, *ptr = NULL;
> int i, j, row, chn, n, len;
> bool per_rank = false;
> @@ -346,19 +345,10 @@ struct mem_ctl_info *edac_mc_alloc(unsigned int mc_num,
> * stringent as what the compiler would provide if we could simply
> * hardcode everything into a single struct.
> */
> - mci = edac_align_ptr(&ptr, sizeof(*mci), 1);
> - layer = edac_align_ptr(&ptr, sizeof(*layer), n_layers);
> - for (i = 0; i < n_layers; i++) {
> - count *= layers[i].size;
> - edac_dbg(4, "errcount layer %d size %d\n", i, count);
> - ce_per_layer[i] = edac_align_ptr(&ptr, sizeof(u32), count);
> - ue_per_layer[i] = edac_align_ptr(&ptr, sizeof(u32), count);
> - tot_errcount += 2 * count;
> - }
> -
> - edac_dbg(4, "allocating %d error counters\n", tot_errcount);
> - pvt = edac_align_ptr(&ptr, sz_pvt, 1);
> - size = ((unsigned long)pvt) + sz_pvt;
> + mci = edac_align_ptr(&ptr, sizeof(*mci), 1);
> + layer = edac_align_ptr(&ptr, sizeof(*layer), n_layers);
> + pvt = edac_align_ptr(&ptr, sz_pvt, 1);
> + size = ((unsigned long)pvt) + sz_pvt;
>
> edac_dbg(1, "allocating %u bytes for mci data (%d %s, %d csrows/channels)\n",
> size,
> @@ -374,10 +364,6 @@ struct mem_ctl_info *edac_mc_alloc(unsigned int mc_num,
> * rather than an imaginary chunk of memory located at address 0.
> */
> layer = (struct edac_mc_layer *)(((char *)mci) + ((unsigned long)layer));
> - for (i = 0; i < n_layers; i++) {
> - mci->ce_per_layer[i] = (u32 *)((char *)mci + ((unsigned long)ce_per_layer[i]));
> - mci->ue_per_layer[i] = (u32 *)((char *)mci + ((unsigned long)ue_per_layer[i]));
> - }
> pvt = sz_pvt ? (((char *)mci) + ((unsigned long)pvt)) : NULL;
>
> /* setup index and various internal pointers */
> @@ -908,53 +894,31 @@ const char *edac_layer_name[] = {
> EXPORT_SYMBOL_GPL(edac_layer_name);
>
> static void edac_inc_ce_error(struct mem_ctl_info *mci,
> - bool enable_per_layer_report,
> const int pos[EDAC_MAX_LAYERS],
> const u16 count)
> {
> - int i, index = 0;
> + struct dimm_info *dimm = edac_get_dimm(mci, pos[0], pos[1], pos[2]);
>
> mci->ce_mc += count;
>
> - if (!enable_per_layer_report) {
> + if (dimm)
> + dimm->ce_count += count;
> + else
> mci->ce_noinfo_count += count;
> - return;
> - }
> -
> - for (i = 0; i < mci->n_layers; i++) {
> - if (pos[i] < 0)
> - break;
> - index += pos[i];
> - mci->ce_per_layer[i][index] += count;
> -
> - if (i < mci->n_layers - 1)
> - index *= mci->layers[i + 1].size;
> - }
> }
>
> static void edac_inc_ue_error(struct mem_ctl_info *mci,
> - bool enable_per_layer_report,
> const int pos[EDAC_MAX_LAYERS],
> const u16 count)
> {
> - int i, index = 0;
> + struct dimm_info *dimm = edac_get_dimm(mci, pos[0], pos[1], pos[2]);
>
> mci->ue_mc += count;
>
> - if (!enable_per_layer_report) {
> + if (dimm)
> + dimm->ue_count += count;
> + else
> mci->ue_noinfo_count += count;
> - return;
> - }
> -
> - for (i = 0; i < mci->n_layers; i++) {
> - if (pos[i] < 0)
> - break;
> - index += pos[i];
> - mci->ue_per_layer[i][index] += count;
> -
> - if (i < mci->n_layers - 1)
> - index *= mci->layers[i + 1].size;
> - }
> }
>
> static void edac_ce_error(struct mem_ctl_info *mci,
> @@ -965,7 +929,6 @@ static void edac_ce_error(struct mem_ctl_info *mci,
> const char *label,
> const char *detail,
> const char *other_detail,
> - const bool enable_per_layer_report,
> const unsigned long page_frame_number,
> const unsigned long offset_in_page,
> long grain)
> @@ -988,7 +951,7 @@ static void edac_ce_error(struct mem_ctl_info *mci,
> error_count, msg, msg_aux, label,
> location, detail);
> }
> - edac_inc_ce_error(mci, enable_per_layer_report, pos, error_count);
> + edac_inc_ce_error(mci, pos, error_count);
>
> if (mci->scrub_mode == SCRUB_SW_SRC) {
> /*
> @@ -1018,8 +981,7 @@ static void edac_ue_error(struct mem_ctl_info *mci,
> const char *location,
> const char *label,
> const char *detail,
> - const char *other_detail,
> - const bool enable_per_layer_report)
> + const char *other_detail)
> {
> char *msg_aux = "";
>
> @@ -1048,7 +1010,7 @@ static void edac_ue_error(struct mem_ctl_info *mci,
> msg, msg_aux, label, location, detail);
> }
>
> - edac_inc_ue_error(mci, enable_per_layer_report, pos, error_count);
> + edac_inc_ue_error(mci, pos, error_count);
> }
>
> void edac_raw_mc_handle_error(const enum hw_event_mc_err_type type,
> @@ -1064,16 +1026,16 @@ void edac_raw_mc_handle_error(const enum hw_event_mc_err_type type,
> "page:0x%lx offset:0x%lx grain:%ld syndrome:0x%lx",
> e->page_frame_number, e->offset_in_page,
> e->grain, e->syndrome);
> - edac_ce_error(mci, e->error_count, pos, e->msg, e->location, e->label,
> - detail, e->other_detail, e->enable_per_layer_report,
> + edac_ce_error(mci, e->error_count, pos, e->msg, e->location,
> + e->label, detail, e->other_detail,
> e->page_frame_number, e->offset_in_page, e->grain);
> } else {
> snprintf(detail, sizeof(detail),
> "page:0x%lx offset:0x%lx grain:%ld",
> e->page_frame_number, e->offset_in_page, e->grain);
>
> - edac_ue_error(mci, e->error_count, pos, e->msg, e->location, e->label,
> - detail, e->other_detail, e->enable_per_layer_report);
> + edac_ue_error(mci, e->error_count, pos, e->msg, e->location,
> + e->label, detail, e->other_detail);
> }
>
>
> @@ -1099,6 +1061,7 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type,
> int i, n_labels = 0;
> u8 grain_bits;
> struct edac_raw_error_desc *e = &mci->error_desc;
> + bool any_memory = true;
>
> edac_dbg(3, "MC%d\n", mci->mc_idx);
>
> @@ -1116,9 +1079,9 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type,
>
> /*
> * Check if the event report is consistent and if the memory
> - * location is known. If it is known, enable_per_layer_report will be
> - * true, the DIMM(s) label info will be filled and the per-layer
> - * error counters will be incremented.
> + * location is known. If it is known, the DIMM(s) label info
> + * will be filled and the DIMM's error counters will be
> + * incremented.
> */
> for (i = 0; i < mci->n_layers; i++) {
> if (pos[i] >= (int)mci->layers[i].size) {
> @@ -1136,7 +1099,7 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type,
> pos[i] = -1;
> }
> if (pos[i] >= 0)
> - e->enable_per_layer_report = true;
> + any_memory = false;
> }
>
> /*
> @@ -1166,16 +1129,17 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type,
> e->grain = dimm->grain;
>
> /*
> - * If the error is memory-controller wide, there's no need to
> - * seek for the affected DIMMs because the whole
> - * channel/memory controller/... may be affected.
> - * Also, don't show errors for empty DIMM slots.
> + * If the error is memory-controller wide, there's no
> + * need to seek for the affected DIMMs because the
> + * whole channel/memory controller/... may be
> + * affected. Also, don't show errors for empty DIMM
> + * slots.
> */
> - if (!e->enable_per_layer_report || !dimm->nr_pages)
> + if (any_memory || !dimm->nr_pages)
> continue;
>
> if (n_labels >= EDAC_MAX_LABELS) {
> - e->enable_per_layer_report = false;
> + any_memory = true;
> break;
> }
> n_labels++;
> @@ -1205,7 +1169,7 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type,
> chan = -2;
> }
>
> - if (!e->enable_per_layer_report) {
> + if (any_memory) {
> strcpy(e->label, "any memory");
> } else {
> edac_dbg(4, "csrow/channel to increment: (%d,%d)\n", row, chan);
> diff --git a/drivers/edac/edac_mc_sysfs.c b/drivers/edac/edac_mc_sysfs.c
> index 0367554e7437..8682df2f7f4f 100644
> --- a/drivers/edac/edac_mc_sysfs.c
> +++ b/drivers/edac/edac_mc_sysfs.c
> @@ -556,10 +556,8 @@ static ssize_t dimmdev_ce_count_show(struct device *dev,
> char *data)
> {
> struct dimm_info *dimm = to_dimm(dev);
> - u32 count;
>
> - count = dimm->mci->ce_per_layer[dimm->mci->n_layers-1][dimm->idx];
> - return sprintf(data, "%u\n", count);
> + return sprintf(data, "%u\n", dimm->ce_count);
> }
>
> static ssize_t dimmdev_ue_count_show(struct device *dev,
> @@ -567,10 +565,8 @@ static ssize_t dimmdev_ue_count_show(struct device *dev,
> char *data)
> {
> struct dimm_info *dimm = to_dimm(dev);
> - u32 count;
>
> - count = dimm->mci->ue_per_layer[dimm->mci->n_layers-1][dimm->idx];
> - return sprintf(data, "%u\n", count);
> + return sprintf(data, "%u\n", dimm->ue_count);
> }
>
> /* dimm/rank attribute files */
> @@ -666,7 +662,9 @@ static ssize_t mci_reset_counters_store(struct device *dev,
> const char *data, size_t count)
> {
> struct mem_ctl_info *mci = to_mci(dev);
> - int cnt, row, chan, i;
> + struct dimm_info *dimm;
> + int row, chan;
> +
> mci->ue_mc = 0;
> mci->ce_mc = 0;
> mci->ue_noinfo_count = 0;
> @@ -682,11 +680,9 @@ static ssize_t mci_reset_counters_store(struct device *dev,
> ri->channels[chan]->ce_count = 0;
> }
>
> - cnt = 1;
> - for (i = 0; i < mci->n_layers; i++) {
> - cnt *= mci->layers[i].size;
> - memset(mci->ce_per_layer[i], 0, cnt * sizeof(u32));
> - memset(mci->ue_per_layer[i], 0, cnt * sizeof(u32));
> + mci_for_each_dimm(mci, dimm) {
> + dimm->ue_count = 0;
> + dimm->ce_count = 0;
> }
>
> mci->start_time = jiffies;
> diff --git a/drivers/edac/ghes_edac.c b/drivers/edac/ghes_edac.c
> index 842080d7b33a..e0b90c6d7d63 100644
> --- a/drivers/edac/ghes_edac.c
> +++ b/drivers/edac/ghes_edac.c
> @@ -347,11 +347,8 @@ void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err)
> mem_err->mem_dev_handle);
>
> index = get_dimm_smbios_index(mem_err->mem_dev_handle);
> - if (index >= 0) {
> + if (index >= 0)
> e->top_layer = index;
> - e->enable_per_layer_report = true;
> - }
> -
> }
> if (p > e->location)
> *(p - 1) = '\0';
> diff --git a/include/linux/edac.h b/include/linux/edac.h
> index 8beb6e834be9..8e72222e50b0 100644
> --- a/include/linux/edac.h
> +++ b/include/linux/edac.h
> @@ -383,6 +383,9 @@ struct dimm_info {
> unsigned int csrow, cschannel; /* Points to the old API data */
>
> u16 smbios_handle; /* Handle for SMBIOS type 17 */
> +
> + u32 ce_count;
> + u32 ue_count;
> };
>
> /**
> @@ -453,8 +456,6 @@ struct errcount_attribute_data {
> * @location: location of the error
> * @label: label of the affected DIMM(s)
> * @other_detail: other driver-specific detail about the error
> - * @enable_per_layer_report: if false, the error affects all layers
> - * (typically, a memory controller error)
> */
> struct edac_raw_error_desc {
> /*
> @@ -475,7 +476,6 @@ struct edac_raw_error_desc {
> unsigned long syndrome;
> const char *msg;
> const char *other_detail;
> - bool enable_per_layer_report;
> };
>
> /* MEMORY controller information structure
> @@ -565,7 +565,6 @@ struct mem_ctl_info {
> */
> u32 ce_noinfo_count, ue_noinfo_count;
> u32 ue_mc, ce_mc;
> - u32 *ce_per_layer[EDAC_MAX_LAYERS], *ue_per_layer[EDAC_MAX_LAYERS];
>
> struct completion complete;
>
Thanks,
Mauro
Em Thu, 10 Oct 2019 20:25:22 +0000
Robert Richter <[email protected]> escreveu:
> Reorder the newly created functions edac_mc_alloc_csrows() and
> edac_mc_alloc_dimms() and move them before edac_mc_alloc(). No further
> code changes.
>
> Signed-off-by: Robert Richter <[email protected]>
Reviewed-by: Mauro Carvalho Chehab <[email protected]>
> ---
> drivers/edac/edac_mc.c | 171 ++++++++++++++++++++---------------------
> 1 file changed, 84 insertions(+), 87 deletions(-)
>
> diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c
> index 0db504cb3419..6d880cf4d599 100644
> --- a/drivers/edac/edac_mc.c
> +++ b/drivers/edac/edac_mc.c
> @@ -305,93 +305,6 @@ static void _edac_mc_free(struct mem_ctl_info *mci)
> kfree(mci);
> }
>
> -static int edac_mc_alloc_csrows(struct mem_ctl_info *mci);
> -static int edac_mc_alloc_dimms(struct mem_ctl_info *mci);
> -
> -struct mem_ctl_info *edac_mc_alloc(unsigned int mc_num,
> - unsigned int n_layers,
> - struct edac_mc_layer *layers,
> - unsigned int sz_pvt)
> -{
> - struct mem_ctl_info *mci;
> - struct edac_mc_layer *layer;
> - unsigned int idx, size, tot_dimms = 1;
> - unsigned int tot_csrows = 1, tot_channels = 1;
> - void *pvt, *ptr = NULL;
> - bool per_rank = false;
> -
> - if (WARN_ON(n_layers > EDAC_MAX_LAYERS || n_layers == 0))
> - return NULL;
> -
> - /*
> - * Calculate the total amount of dimms and csrows/cschannels while
> - * in the old API emulation mode
> - */
> - for (idx = 0; idx < n_layers; idx++) {
> - tot_dimms *= layers[idx].size;
> - if (layers[idx].is_virt_csrow)
> - tot_csrows *= layers[idx].size;
> - else
> - tot_channels *= layers[idx].size;
> -
> - if (layers[idx].type == EDAC_MC_LAYER_CHIP_SELECT)
> - per_rank = true;
> - }
> -
> - /* Figure out the offsets of the various items from the start of an mc
> - * structure. We want the alignment of each item to be at least as
> - * stringent as what the compiler would provide if we could simply
> - * hardcode everything into a single struct.
> - */
> - mci = edac_align_ptr(&ptr, sizeof(*mci), 1);
> - layer = edac_align_ptr(&ptr, sizeof(*layer), n_layers);
> - pvt = edac_align_ptr(&ptr, sz_pvt, 1);
> - size = ((unsigned long)pvt) + sz_pvt;
> -
> - edac_dbg(1, "allocating %u bytes for mci data (%d %s, %d csrows/channels)\n",
> - size,
> - tot_dimms,
> - per_rank ? "ranks" : "dimms",
> - tot_csrows * tot_channels);
> -
> - mci = kzalloc(size, GFP_KERNEL);
> - if (mci == NULL)
> - return NULL;
> -
> - /* Adjust pointers so they point within the memory we just allocated
> - * rather than an imaginary chunk of memory located at address 0.
> - */
> - layer = (struct edac_mc_layer *)(((char *)mci) + ((unsigned long)layer));
> - pvt = sz_pvt ? (((char *)mci) + ((unsigned long)pvt)) : NULL;
> -
> - /* setup index and various internal pointers */
> - mci->mc_idx = mc_num;
> - mci->tot_dimms = tot_dimms;
> - mci->pvt_info = pvt;
> - mci->n_layers = n_layers;
> - mci->layers = layer;
> - memcpy(mci->layers, layers, sizeof(*layer) * n_layers);
> - mci->nr_csrows = tot_csrows;
> - mci->num_cschannel = tot_channels;
> - mci->csbased = per_rank;
> -
> - if (edac_mc_alloc_csrows(mci))
> - goto error;
> -
> - if (edac_mc_alloc_dimms(mci))
> - goto error;
> -
> - mci->op_state = OP_ALLOC;
> -
> - return mci;
> -
> -error:
> - _edac_mc_free(mci);
> -
> - return NULL;
> -}
> -EXPORT_SYMBOL_GPL(edac_mc_alloc);
> -
> static int edac_mc_alloc_csrows(struct mem_ctl_info *mci)
> {
> unsigned int tot_csrows = mci->nr_csrows;
> @@ -520,6 +433,90 @@ static int edac_mc_alloc_dimms(struct mem_ctl_info *mci)
> return 0;
> }
>
> +struct mem_ctl_info *edac_mc_alloc(unsigned int mc_num,
> + unsigned int n_layers,
> + struct edac_mc_layer *layers,
> + unsigned int sz_pvt)
> +{
> + struct mem_ctl_info *mci;
> + struct edac_mc_layer *layer;
> + unsigned int idx, size, tot_dimms = 1;
> + unsigned int tot_csrows = 1, tot_channels = 1;
> + void *pvt, *ptr = NULL;
> + bool per_rank = false;
> +
> + if (WARN_ON(n_layers > EDAC_MAX_LAYERS || n_layers == 0))
> + return NULL;
> +
> + /*
> + * Calculate the total amount of dimms and csrows/cschannels while
> + * in the old API emulation mode
> + */
> + for (idx = 0; idx < n_layers; idx++) {
> + tot_dimms *= layers[idx].size;
> + if (layers[idx].is_virt_csrow)
> + tot_csrows *= layers[idx].size;
> + else
> + tot_channels *= layers[idx].size;
> +
> + if (layers[idx].type == EDAC_MC_LAYER_CHIP_SELECT)
> + per_rank = true;
> + }
> +
> + /* Figure out the offsets of the various items from the start of an mc
> + * structure. We want the alignment of each item to be at least as
> + * stringent as what the compiler would provide if we could simply
> + * hardcode everything into a single struct.
> + */
> + mci = edac_align_ptr(&ptr, sizeof(*mci), 1);
> + layer = edac_align_ptr(&ptr, sizeof(*layer), n_layers);
> + pvt = edac_align_ptr(&ptr, sz_pvt, 1);
> + size = ((unsigned long)pvt) + sz_pvt;
> +
> + edac_dbg(1, "allocating %u bytes for mci data (%d %s, %d csrows/channels)\n",
> + size,
> + tot_dimms,
> + per_rank ? "ranks" : "dimms",
> + tot_csrows * tot_channels);
> +
> + mci = kzalloc(size, GFP_KERNEL);
> + if (mci == NULL)
> + return NULL;
> +
> + /* Adjust pointers so they point within the memory we just allocated
> + * rather than an imaginary chunk of memory located at address 0.
> + */
> + layer = (struct edac_mc_layer *)(((char *)mci) + ((unsigned long)layer));
> + pvt = sz_pvt ? (((char *)mci) + ((unsigned long)pvt)) : NULL;
> +
> + /* setup index and various internal pointers */
> + mci->mc_idx = mc_num;
> + mci->tot_dimms = tot_dimms;
> + mci->pvt_info = pvt;
> + mci->n_layers = n_layers;
> + mci->layers = layer;
> + memcpy(mci->layers, layers, sizeof(*layer) * n_layers);
> + mci->nr_csrows = tot_csrows;
> + mci->num_cschannel = tot_channels;
> + mci->csbased = per_rank;
> +
> + if (edac_mc_alloc_csrows(mci))
> + goto error;
> +
> + if (edac_mc_alloc_dimms(mci))
> + goto error;
> +
> + mci->op_state = OP_ALLOC;
> +
> + return mci;
> +
> +error:
> + _edac_mc_free(mci);
> +
> + return NULL;
> +}
> +EXPORT_SYMBOL_GPL(edac_mc_alloc);
> +
> void edac_mc_free(struct mem_ctl_info *mci)
> {
> edac_dbg(1, "\n");
Thanks,
Mauro
Em Thu, 10 Oct 2019 20:25:20 +0000
Robert Richter <[email protected]> escreveu:
> edac_mc_alloc() is huge. Factor out code by moving it to the two new
> functions edac_mc_alloc_csrows() and edac_mc_alloc_dimms(). Do not
> move code yet for better review.
>
> Signed-off-by: Robert Richter <[email protected]>
Reviewed-by: Mauro Carvalho Chehab <[email protected]>
> ---
> drivers/edac/edac_mc.c | 104 +++++++++++++++++++++++++++--------------
> 1 file changed, 69 insertions(+), 35 deletions(-)
>
> diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c
> index a893f793be8a..0db504cb3419 100644
> --- a/drivers/edac/edac_mc.c
> +++ b/drivers/edac/edac_mc.c
> @@ -305,6 +305,9 @@ static void _edac_mc_free(struct mem_ctl_info *mci)
> kfree(mci);
> }
>
> +static int edac_mc_alloc_csrows(struct mem_ctl_info *mci);
> +static int edac_mc_alloc_dimms(struct mem_ctl_info *mci);
> +
> struct mem_ctl_info *edac_mc_alloc(unsigned int mc_num,
> unsigned int n_layers,
> struct edac_mc_layer *layers,
> @@ -312,14 +315,9 @@ struct mem_ctl_info *edac_mc_alloc(unsigned int mc_num,
> {
> struct mem_ctl_info *mci;
> struct edac_mc_layer *layer;
> - struct csrow_info *csr;
> - struct rank_info *chan;
> - struct dimm_info *dimm;
> - unsigned int pos[EDAC_MAX_LAYERS];
> unsigned int idx, size, tot_dimms = 1;
> unsigned int tot_csrows = 1, tot_channels = 1;
> - void *pvt, *p, *ptr = NULL;
> - int j, row, chn, n, len;
> + void *pvt, *ptr = NULL;
> bool per_rank = false;
>
> if (WARN_ON(n_layers > EDAC_MAX_LAYERS || n_layers == 0))
> @@ -377,16 +375,43 @@ struct mem_ctl_info *edac_mc_alloc(unsigned int mc_num,
> mci->num_cschannel = tot_channels;
> mci->csbased = per_rank;
>
> + if (edac_mc_alloc_csrows(mci))
> + goto error;
> +
> + if (edac_mc_alloc_dimms(mci))
> + goto error;
> +
> + mci->op_state = OP_ALLOC;
> +
> + return mci;
> +
> +error:
> + _edac_mc_free(mci);
> +
> + return NULL;
> +}
> +EXPORT_SYMBOL_GPL(edac_mc_alloc);
> +
> +static int edac_mc_alloc_csrows(struct mem_ctl_info *mci)
> +{
> + unsigned int tot_csrows = mci->nr_csrows;
> + unsigned int tot_channels = mci->num_cschannel;
> + unsigned int row, chn;
> +
> /*
> * Alocate and fill the csrow/channels structs
> */
> mci->csrows = kcalloc(tot_csrows, sizeof(*mci->csrows), GFP_KERNEL);
> if (!mci->csrows)
> - goto error;
> + return -ENOMEM;
> +
> for (row = 0; row < tot_csrows; row++) {
> + struct csrow_info *csr;
> +
> csr = kzalloc(sizeof(**mci->csrows), GFP_KERNEL);
> if (!csr)
> - goto error;
> + return -ENOMEM;
> +
> mci->csrows[row] = csr;
> csr->csrow_idx = row;
> csr->mci = mci;
> @@ -394,34 +419,51 @@ struct mem_ctl_info *edac_mc_alloc(unsigned int mc_num,
> csr->channels = kcalloc(tot_channels, sizeof(*csr->channels),
> GFP_KERNEL);
> if (!csr->channels)
> - goto error;
> + return -ENOMEM;
>
> for (chn = 0; chn < tot_channels; chn++) {
> + struct rank_info *chan;
> +
> chan = kzalloc(sizeof(**csr->channels), GFP_KERNEL);
> if (!chan)
> - goto error;
> + return -ENOMEM;
> +
> csr->channels[chn] = chan;
> chan->chan_idx = chn;
> chan->csrow = csr;
> }
> }
>
> + return 0;
> +}
> +
> +static int edac_mc_alloc_dimms(struct mem_ctl_info *mci)
> +{
> + void *p;
> + unsigned int pos[EDAC_MAX_LAYERS];
> + unsigned int row, chn, idx;
> + int layer;
> +
> /*
> * Allocate and fill the dimm structs
> */
> - mci->dimms = kcalloc(tot_dimms, sizeof(*mci->dimms), GFP_KERNEL);
> + mci->dimms = kcalloc(mci->tot_dimms, sizeof(*mci->dimms), GFP_KERNEL);
> if (!mci->dimms)
> - goto error;
> + return -ENOMEM;
>
> memset(&pos, 0, sizeof(pos));
> row = 0;
> chn = 0;
> - for (idx = 0; idx < tot_dimms; idx++) {
> + for (idx = 0; idx < mci->tot_dimms; idx++) {
> + struct dimm_info *dimm;
> + struct rank_info *chan;
> + int n, len;
> +
> chan = mci->csrows[row]->channels[chn];
>
> dimm = kzalloc(sizeof(**mci->dimms), GFP_KERNEL);
> if (!dimm)
> - goto error;
> + return -ENOMEM;
> mci->dimms[idx] = dimm;
> dimm->mci = mci;
> dimm->idx = idx;
> @@ -431,16 +473,16 @@ struct mem_ctl_info *edac_mc_alloc(unsigned int mc_num,
> */
> len = sizeof(dimm->label);
> p = dimm->label;
> - n = snprintf(p, len, "mc#%u", mc_num);
> + n = snprintf(p, len, "mc#%u", mci->mc_idx);
> p += n;
> len -= n;
> - for (j = 0; j < n_layers; j++) {
> + for (layer = 0; layer < mci->n_layers; layer++) {
> n = snprintf(p, len, "%s#%u",
> - edac_layer_name[layers[j].type],
> - pos[j]);
> + edac_layer_name[mci->layers[layer].type],
> + pos[layer]);
> p += n;
> len -= n;
> - dimm->location[j] = pos[j];
> + dimm->location[layer] = pos[layer];
>
> if (len <= 0)
> break;
> @@ -452,39 +494,31 @@ struct mem_ctl_info *edac_mc_alloc(unsigned int mc_num,
> dimm->cschannel = chn;
>
> /* Increment csrow location */
> - if (layers[0].is_virt_csrow) {
> + if (mci->layers[0].is_virt_csrow) {
> chn++;
> - if (chn == tot_channels) {
> + if (chn == mci->num_cschannel) {
> chn = 0;
> row++;
> }
> } else {
> row++;
> - if (row == tot_csrows) {
> + if (row == mci->nr_csrows) {
> row = 0;
> chn++;
> }
> }
>
> /* Increment dimm location */
> - for (j = n_layers - 1; j >= 0; j--) {
> - pos[j]++;
> - if (pos[j] < layers[j].size)
> + for (layer = mci->n_layers - 1; layer >= 0; layer--) {
> + pos[layer]++;
> + if (pos[layer] < mci->layers[layer].size)
> break;
> - pos[j] = 0;
> + pos[layer] = 0;
> }
> }
>
> - mci->op_state = OP_ALLOC;
> -
> - return mci;
> -
> -error:
> - _edac_mc_free(mci);
> -
> - return NULL;
> + return 0;
> }
> -EXPORT_SYMBOL_GPL(edac_mc_alloc);
>
> void edac_mc_free(struct mem_ctl_info *mci)
> {
Thanks,
Mauro
Em Thu, 10 Oct 2019 20:25:24 +0000
Robert Richter <[email protected]> escreveu:
> The error handling functions have the pos[] array argument for
> determining the dimm handle. Rework those functions to use the dimm
> handle directly.
>
> Signed-off-by: Robert Richter <[email protected]>
Reviewed-by: Mauro Carvalho Chehab <[email protected]>
> ---
> drivers/edac/edac_mc.c | 28 +++++++++++++---------------
> drivers/edac/edac_mc.h | 2 ++
> drivers/edac/ghes_edac.c | 6 +++++-
> 3 files changed, 20 insertions(+), 16 deletions(-)
>
> diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c
> index 6d880cf4d599..cdfb383f7a35 100644
> --- a/drivers/edac/edac_mc.c
> +++ b/drivers/edac/edac_mc.c
> @@ -925,11 +925,9 @@ const char *edac_layer_name[] = {
> EXPORT_SYMBOL_GPL(edac_layer_name);
>
> static void edac_inc_ce_error(struct mem_ctl_info *mci,
> - const int pos[EDAC_MAX_LAYERS],
> + struct dimm_info *dimm,
> const u16 count)
> {
> - struct dimm_info *dimm = edac_get_dimm(mci, pos[0], pos[1], pos[2]);
> -
> mci->ce_mc += count;
>
> if (dimm)
> @@ -939,11 +937,9 @@ static void edac_inc_ce_error(struct mem_ctl_info *mci,
> }
>
> static void edac_inc_ue_error(struct mem_ctl_info *mci,
> - const int pos[EDAC_MAX_LAYERS],
> - const u16 count)
> + struct dimm_info *dimm,
> + const u16 count)
> {
> - struct dimm_info *dimm = edac_get_dimm(mci, pos[0], pos[1], pos[2]);
> -
> mci->ue_mc += count;
>
> if (dimm)
> @@ -953,8 +949,8 @@ static void edac_inc_ue_error(struct mem_ctl_info *mci,
> }
>
> static void edac_ce_error(struct mem_ctl_info *mci,
> + struct dimm_info *dimm,
> const u16 error_count,
> - const int pos[EDAC_MAX_LAYERS],
> const char *msg,
> const char *location,
> const char *label,
> @@ -982,7 +978,7 @@ static void edac_ce_error(struct mem_ctl_info *mci,
> error_count, msg, msg_aux, label,
> location, detail);
> }
> - edac_inc_ce_error(mci, pos, error_count);
> + edac_inc_ce_error(mci, dimm, error_count);
>
> if (mci->scrub_mode == SCRUB_SW_SRC) {
> /*
> @@ -1006,8 +1002,8 @@ static void edac_ce_error(struct mem_ctl_info *mci,
> }
>
> static void edac_ue_error(struct mem_ctl_info *mci,
> + struct dimm_info *dimm,
> const u16 error_count,
> - const int pos[EDAC_MAX_LAYERS],
> const char *msg,
> const char *location,
> const char *label,
> @@ -1041,15 +1037,15 @@ static void edac_ue_error(struct mem_ctl_info *mci,
> msg, msg_aux, label, location, detail);
> }
>
> - edac_inc_ue_error(mci, pos, error_count);
> + edac_inc_ue_error(mci, dimm, error_count);
> }
>
> void edac_raw_mc_handle_error(const enum hw_event_mc_err_type type,
> struct mem_ctl_info *mci,
> + struct dimm_info *dimm,
> struct edac_raw_error_desc *e)
> {
> char detail[80];
> - int pos[EDAC_MAX_LAYERS] = { e->top_layer, e->mid_layer, e->low_layer };
>
> /* Memory type dependent details about the error */
> if (type == HW_EVENT_ERR_CORRECTED) {
> @@ -1057,7 +1053,7 @@ void edac_raw_mc_handle_error(const enum hw_event_mc_err_type type,
> "page:0x%lx offset:0x%lx grain:%ld syndrome:0x%lx",
> e->page_frame_number, e->offset_in_page,
> e->grain, e->syndrome);
> - edac_ce_error(mci, e->error_count, pos, e->msg, e->location,
> + edac_ce_error(mci, dimm, e->error_count, e->msg, e->location,
> e->label, detail, e->other_detail,
> e->page_frame_number, e->offset_in_page, e->grain);
> } else {
> @@ -1065,7 +1061,7 @@ void edac_raw_mc_handle_error(const enum hw_event_mc_err_type type,
> "page:0x%lx offset:0x%lx grain:%ld",
> e->page_frame_number, e->offset_in_page, e->grain);
>
> - edac_ue_error(mci, e->error_count, pos, e->msg, e->location,
> + edac_ue_error(mci, dimm, e->error_count, e->msg, e->location,
> e->label, detail, e->other_detail);
> }
>
> @@ -1245,6 +1241,8 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type,
> (e->page_frame_number << PAGE_SHIFT) | e->offset_in_page,
> grain_bits, e->syndrome, e->other_detail);
>
> - edac_raw_mc_handle_error(type, mci, e);
> + dimm = edac_get_dimm(mci, top_layer, mid_layer, low_layer);
> +
> + edac_raw_mc_handle_error(type, mci, dimm, e);
> }
> EXPORT_SYMBOL_GPL(edac_mc_handle_error);
> diff --git a/drivers/edac/edac_mc.h b/drivers/edac/edac_mc.h
> index 02aac5c61d00..2c3e2fbcedc4 100644
> --- a/drivers/edac/edac_mc.h
> +++ b/drivers/edac/edac_mc.h
> @@ -214,6 +214,7 @@ extern int edac_mc_find_csrow_by_page(struct mem_ctl_info *mci,
> *
> * @type: severity of the error (CE/UE/Fatal)
> * @mci: a struct mem_ctl_info pointer
> + * @dimm: a struct dimm_info pointer
> * @e: error description
> *
> * This raw function is used internally by edac_mc_handle_error(). It should
> @@ -222,6 +223,7 @@ extern int edac_mc_find_csrow_by_page(struct mem_ctl_info *mci,
> */
> void edac_raw_mc_handle_error(const enum hw_event_mc_err_type type,
> struct mem_ctl_info *mci,
> + struct dimm_info *dimm,
> struct edac_raw_error_desc *e);
>
> /**
> diff --git a/drivers/edac/ghes_edac.c b/drivers/edac/ghes_edac.c
> index e0b90c6d7d63..4f5721cf4380 100644
> --- a/drivers/edac/ghes_edac.c
> +++ b/drivers/edac/ghes_edac.c
> @@ -193,6 +193,7 @@ static void ghes_edac_dmidecode(const struct dmi_header *dh, void *arg)
>
> void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err)
> {
> + struct dimm_info *dimm;
> enum hw_event_mc_err_type type;
> struct edac_raw_error_desc *e;
> struct mem_ctl_info *mci;
> @@ -437,7 +438,10 @@ void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err)
> (e->page_frame_number << PAGE_SHIFT) | e->offset_in_page,
> grain_bits, e->syndrome, pvt->detail_location);
>
> - edac_raw_mc_handle_error(type, mci, e);
> + dimm = edac_get_dimm_by_index(mci, e->top_layer);
> +
> + edac_raw_mc_handle_error(type, mci, dimm, e);
> +
> spin_unlock_irqrestore(&ghes_lock, flags);
> }
>
Thanks,
Mauro
On Fri, 2019-10-11 at 07:20 -0300, Mauro Carvalho Chehab wrote:
> Em Thu, 10 Oct 2019 15:10:53 -0700
> Joe Perches <[email protected]> escreveu:
>
> > On Thu, 2019-10-10 at 20:25 +0000, Robert Richter wrote:
> > > Reduce the indentation level in edac_mc_handle_error() a bit by using
> > > continue. No functional changes.
> >
> > Seems fine, but trivially below:
> >
> > > diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c
> > []
> > > @@ -1171,37 +1171,38 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type,
> > []
> > > + strcpy(p, dimm->label);
> > > + p += strlen(p);
> > > + *p = '\0';
> >
> > This *p = '\0' is unnecessary as the strcpy already did that.
>
> True, but better to put it in a separate patch, as it makes it
> easier to review if you don't mix code de-indent with changes.
>
> Also, maybe there are other occurrences of this pattern.
Maybe 80 or so uses of paired
strcpy(foo, bar);
strlen(foo)
where another function like stpcpy, which doesn't exist
in the kernel, could have been used.
Em Thu, 10 Oct 2019 20:25:26 +0000
Robert Richter <[email protected]> escreveu:
> There has never been a function edac_raw_error_desc_clean(), and in
> function ghes_edac_report_mem_error() the whole struct is zero'ed
> including the string arrays. Remove that comment.
>
> Signed-off-by: Robert Richter <[email protected]>
Reviewed-by: Mauro Carvalho Chehab <[email protected]>
> ---
> include/linux/edac.h | 5 -----
> 1 file changed, 5 deletions(-)
>
> diff --git a/include/linux/edac.h b/include/linux/edac.h
> index 8e72222e50b0..4d9673954856 100644
> --- a/include/linux/edac.h
> +++ b/include/linux/edac.h
> @@ -458,15 +458,10 @@ struct errcount_attribute_data {
> * @other_detail: other driver-specific detail about the error
> */
> struct edac_raw_error_desc {
> - /*
> - * NOTE: everything before grain won't be cleaned by
> - * edac_raw_error_desc_clean()
> - */
> char location[LOCATION_SIZE];
> char label[(EDAC_MC_LABEL_LEN + 1 + sizeof(OTHER_LABEL)) * EDAC_MAX_LABELS];
> long grain;
>
> - /* the vars below and grain will be cleaned on every new error report */
> u16 error_count;
> int top_layer;
> int mid_layer;
Thanks,
Mauro
Em Thu, 10 Oct 2019 20:25:29 +0000
Robert Richter <[email protected]> escreveu:
> Store the error type in struct edac_raw_error_desc. This makes the
> type parameter of edac_raw_mc_handle_error() obsolete.
I don't see much gain from this change, but whatever works best for
ghes.
>
> Signed-off-by: Robert Richter <[email protected]>
> ---
> drivers/edac/edac_mc.c | 8 ++++----
> drivers/edac/edac_mc.h | 4 +---
> drivers/edac/ghes_edac.c | 13 ++++++-------
> include/linux/edac.h | 1 +
> 4 files changed, 12 insertions(+), 14 deletions(-)
>
> diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c
> index cdfb383f7a35..ca206854b8ee 100644
> --- a/drivers/edac/edac_mc.c
> +++ b/drivers/edac/edac_mc.c
> @@ -1040,15 +1040,14 @@ static void edac_ue_error(struct mem_ctl_info *mci,
> edac_inc_ue_error(mci, dimm, error_count);
> }
>
> -void edac_raw_mc_handle_error(const enum hw_event_mc_err_type type,
> - struct mem_ctl_info *mci,
> +void edac_raw_mc_handle_error(struct mem_ctl_info *mci,
> struct dimm_info *dimm,
> struct edac_raw_error_desc *e)
> {
> char detail[80];
>
> /* Memory type dependent details about the error */
> - if (type == HW_EVENT_ERR_CORRECTED) {
> + if (e->type == HW_EVENT_ERR_CORRECTED) {
> snprintf(detail, sizeof(detail),
> "page:0x%lx offset:0x%lx grain:%ld syndrome:0x%lx",
> e->page_frame_number, e->offset_in_page,
> @@ -1095,6 +1094,7 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type,
> /* Fills the error report buffer */
> memset(e, 0, sizeof (*e));
> e->error_count = error_count;
> + e->type = type;
> e->top_layer = top_layer;
> e->mid_layer = mid_layer;
> e->low_layer = low_layer;
> @@ -1243,6 +1243,6 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type,
>
> dimm = edac_get_dimm(mci, top_layer, mid_layer, low_layer);
>
> - edac_raw_mc_handle_error(type, mci, dimm, e);
> + edac_raw_mc_handle_error(mci, dimm, e);
> }
> EXPORT_SYMBOL_GPL(edac_mc_handle_error);
> diff --git a/drivers/edac/edac_mc.h b/drivers/edac/edac_mc.h
> index 2c3e2fbcedc4..a8f1b5b5e873 100644
> --- a/drivers/edac/edac_mc.h
> +++ b/drivers/edac/edac_mc.h
> @@ -212,7 +212,6 @@ extern int edac_mc_find_csrow_by_page(struct mem_ctl_info *mci,
> * edac_raw_mc_handle_error() - Reports a memory event to userspace without
> * doing anything to discover the error location.
> *
> - * @type: severity of the error (CE/UE/Fatal)
> * @mci: a struct mem_ctl_info pointer
> * @dimm: a struct dimm_info pointer
> * @e: error description
> @@ -221,8 +220,7 @@ extern int edac_mc_find_csrow_by_page(struct mem_ctl_info *mci,
> * only be called directly when the hardware error come directly from BIOS,
> * like in the case of APEI GHES driver.
> */
> -void edac_raw_mc_handle_error(const enum hw_event_mc_err_type type,
> - struct mem_ctl_info *mci,
> +void edac_raw_mc_handle_error(struct mem_ctl_info *mci,
> struct dimm_info *dimm,
> struct edac_raw_error_desc *e);
>
> diff --git a/drivers/edac/ghes_edac.c b/drivers/edac/ghes_edac.c
> index 4f5721cf4380..1db1c012bed9 100644
> --- a/drivers/edac/ghes_edac.c
> +++ b/drivers/edac/ghes_edac.c
> @@ -194,7 +194,6 @@ static void ghes_edac_dmidecode(const struct dmi_header *dh, void *arg)
> void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err)
> {
> struct dimm_info *dimm;
> - enum hw_event_mc_err_type type;
> struct edac_raw_error_desc *e;
> struct mem_ctl_info *mci;
> struct ghes_edac_pvt *pvt = ghes_pvt;
> @@ -232,17 +231,17 @@ void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err)
>
> switch (sev) {
> case GHES_SEV_CORRECTED:
> - type = HW_EVENT_ERR_CORRECTED;
> + e->type = HW_EVENT_ERR_CORRECTED;
> break;
> case GHES_SEV_RECOVERABLE:
> - type = HW_EVENT_ERR_UNCORRECTED;
> + e->type = HW_EVENT_ERR_UNCORRECTED;
> break;
> case GHES_SEV_PANIC:
> - type = HW_EVENT_ERR_FATAL;
> + e->type = HW_EVENT_ERR_FATAL;
> break;
> default:
> case GHES_SEV_NO:
> - type = HW_EVENT_ERR_INFO;
> + e->type = HW_EVENT_ERR_INFO;
> }
>
> edac_dbg(1, "error validation_bits: 0x%08llx\n",
> @@ -433,14 +432,14 @@ void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err)
> grain_bits = fls_long(e->grain);
> snprintf(pvt->detail_location, sizeof(pvt->detail_location),
> "APEI location: %s %s", e->location, e->other_detail);
> - trace_mc_event(type, e->msg, e->label, e->error_count,
> + trace_mc_event(e->type, e->msg, e->label, e->error_count,
> mci->mc_idx, e->top_layer, e->mid_layer, e->low_layer,
> (e->page_frame_number << PAGE_SHIFT) | e->offset_in_page,
> grain_bits, e->syndrome, pvt->detail_location);
>
> dimm = edac_get_dimm_by_index(mci, e->top_layer);
>
> - edac_raw_mc_handle_error(type, mci, dimm, e);
> + edac_raw_mc_handle_error(mci, dimm, e);
>
> spin_unlock_irqrestore(&ghes_lock, flags);
> }
> diff --git a/include/linux/edac.h b/include/linux/edac.h
> index 4d9673954856..587c53b87fdf 100644
> --- a/include/linux/edac.h
> +++ b/include/linux/edac.h
> @@ -463,6 +463,7 @@ struct edac_raw_error_desc {
> long grain;
>
> u16 error_count;
> + enum hw_event_mc_err_type type;
> int top_layer;
> int mid_layer;
> int low_layer;
Thanks,
Mauro
Em Thu, 10 Oct 2019 20:25:31 +0000
Robert Richter <[email protected]> escreveu:
> Each struct mci has its own error descriptor. Create a function
> error_desc_to_mci() to determine the corresponding mci from an error
> descriptor. This eases the parameter list of
> edac_raw_mc_handle_error() as the mci pointer does not need to be
> passed any longer.
>
> While at it, reorder parameters of edac_raw_mc_handle_error().
>
> Signed-off-by: Robert Richter <[email protected]>
Reviewed-by: Mauro Carvalho Chehab <[email protected]>
> ---
> drivers/edac/edac_mc.c | 13 +++++++++----
> drivers/edac/edac_mc.h | 8 +++-----
> drivers/edac/ghes_edac.c | 2 +-
> 3 files changed, 13 insertions(+), 10 deletions(-)
>
> diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c
> index ca206854b8ee..9e8c5716a8c0 100644
> --- a/drivers/edac/edac_mc.c
> +++ b/drivers/edac/edac_mc.c
> @@ -1040,10 +1040,15 @@ static void edac_ue_error(struct mem_ctl_info *mci,
> edac_inc_ue_error(mci, dimm, error_count);
> }
>
> -void edac_raw_mc_handle_error(struct mem_ctl_info *mci,
> - struct dimm_info *dimm,
> - struct edac_raw_error_desc *e)
> +static struct mem_ctl_info *error_desc_to_mci(struct edac_raw_error_desc *e)
> +{
> + return container_of(e, struct mem_ctl_info, error_desc);
> +}
> +
> +void edac_raw_mc_handle_error(struct edac_raw_error_desc *e,
> + struct dimm_info *dimm)
> {
> + struct mem_ctl_info *mci = error_desc_to_mci(e);
> char detail[80];
>
> /* Memory type dependent details about the error */
> @@ -1243,6 +1248,6 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type,
>
> dimm = edac_get_dimm(mci, top_layer, mid_layer, low_layer);
>
> - edac_raw_mc_handle_error(mci, dimm, e);
> + edac_raw_mc_handle_error(e, dimm);
> }
> EXPORT_SYMBOL_GPL(edac_mc_handle_error);
> diff --git a/drivers/edac/edac_mc.h b/drivers/edac/edac_mc.h
> index a8f1b5b5e873..3b01d5d9d7bc 100644
> --- a/drivers/edac/edac_mc.h
> +++ b/drivers/edac/edac_mc.h
> @@ -212,17 +212,15 @@ extern int edac_mc_find_csrow_by_page(struct mem_ctl_info *mci,
> * edac_raw_mc_handle_error() - Reports a memory event to userspace without
> * doing anything to discover the error location.
> *
> - * @mci: a struct mem_ctl_info pointer
> - * @dimm: a struct dimm_info pointer
> * @e: error description
> + * @dimm: a struct dimm_info pointer
> *
> * This raw function is used internally by edac_mc_handle_error(). It should
> * only be called directly when the hardware error come directly from BIOS,
> * like in the case of APEI GHES driver.
> */
> -void edac_raw_mc_handle_error(struct mem_ctl_info *mci,
> - struct dimm_info *dimm,
> - struct edac_raw_error_desc *e);
> +void edac_raw_mc_handle_error(struct edac_raw_error_desc *e,
> + struct dimm_info *dimm);
>
> /**
> * edac_mc_handle_error() - Reports a memory event to userspace.
> diff --git a/drivers/edac/ghes_edac.c b/drivers/edac/ghes_edac.c
> index 1db1c012bed9..8078d4ec9631 100644
> --- a/drivers/edac/ghes_edac.c
> +++ b/drivers/edac/ghes_edac.c
> @@ -439,7 +439,7 @@ void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err)
>
> dimm = edac_get_dimm_by_index(mci, e->top_layer);
>
> - edac_raw_mc_handle_error(mci, dimm, e);
> + edac_raw_mc_handle_error(e, dimm);
>
> spin_unlock_irqrestore(&ghes_lock, flags);
> }
Thanks,
Mauro
Em Thu, 10 Oct 2019 20:25:33 +0000
Robert Richter <[email protected]> escreveu:
> Have a separate function to count errors in csrow/channel. This better
> separates code and reduces the indentation level. No functional
> changes.
This one assumes patch 06/19, which I'm not sure is correct.
>
> Signed-off-by: Robert Richter <[email protected]>
> ---
> drivers/edac/edac_mc.c | 40 +++++++++++++++++++++++++---------------
> 1 file changed, 25 insertions(+), 15 deletions(-)
>
> diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c
> index 9e8c5716a8c0..3779204c0e21 100644
> --- a/drivers/edac/edac_mc.c
> +++ b/drivers/edac/edac_mc.c
> @@ -1045,6 +1045,26 @@ static struct mem_ctl_info *error_desc_to_mci(struct edac_raw_error_desc *e)
> return container_of(e, struct mem_ctl_info, error_desc);
> }
>
> +static void edac_inc_csrow(struct edac_raw_error_desc *e, int row, int chan)
> +{
> + struct mem_ctl_info *mci = error_desc_to_mci(e);
> + u16 count = e->error_count;
> + enum hw_event_mc_err_type type = e->type;
> +
> + if (row < 0)
> + return;
> +
> + edac_dbg(4, "csrow/channel to increment: (%d,%d)\n", row, chan);
> +
> + if (type == HW_EVENT_ERR_CORRECTED) {
> + mci->csrows[row]->ce_count += count;
> + if (chan >= 0)
> + mci->csrows[row]->channels[chan]->ce_count += count;
> + } else {
> + mci->csrows[row]->ue_count += count;
> + }
> +}
> +
> void edac_raw_mc_handle_error(struct edac_raw_error_desc *e,
> struct dimm_info *dimm)
> {
> @@ -1201,22 +1221,12 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type,
> chan = -2;
> }
>
> - if (any_memory) {
> + if (any_memory)
> strcpy(e->label, "any memory");
> - } else {
> - edac_dbg(4, "csrow/channel to increment: (%d,%d)\n", row, chan);
> - if (p == e->label)
> - strcpy(e->label, "unknown memory");
> - if (type == HW_EVENT_ERR_CORRECTED) {
> - if (row >= 0) {
> - mci->csrows[row]->ce_count += error_count;
> - if (chan >= 0)
> - mci->csrows[row]->channels[chan]->ce_count += error_count;
> - }
> - } else
> - if (row >= 0)
> - mci->csrows[row]->ue_count += error_count;
> - }
> + else if (!*e->label)
> + strcpy(e->label, "unknown memory");
> +
> + edac_inc_csrow(e, row, chan);
>
> /* Fill the RAM location data */
> p = e->location;
Thanks,
Mauro
Em Thu, 10 Oct 2019 20:25:35 +0000
Robert Richter <[email protected]> escreveu:
> Use standard macros for page calculations.
>
> Reviewed-by: James Morse <[email protected]>
> Signed-off-by: Robert Richter <[email protected]>
Reviewed-by: Mauro Carvalho Chehab <[email protected]>
> ---
> drivers/edac/ghes_edac.c | 4 ++--
> 1 file changed, 2 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/edac/ghes_edac.c b/drivers/edac/ghes_edac.c
> index 8078d4ec9631..851aad92e42d 100644
> --- a/drivers/edac/ghes_edac.c
> +++ b/drivers/edac/ghes_edac.c
> @@ -309,8 +309,8 @@ void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err)
>
> /* Error address */
> if (mem_err->validation_bits & CPER_MEM_VALID_PA) {
> - e->page_frame_number = mem_err->physical_addr >> PAGE_SHIFT;
> - e->offset_in_page = mem_err->physical_addr & ~PAGE_MASK;
> + e->page_frame_number = PHYS_PFN(mem_err->physical_addr);
> + e->offset_in_page = offset_in_page(mem_err->physical_addr);
> }
>
> /* Error grain */
Thanks,
Mauro
Em Thu, 10 Oct 2019 20:25:38 +0000
Robert Richter <[email protected]> escreveu:
> detail_location[] is used to collect two location strings so they can
> be passed as one to trace_mc_event(). Instead of having an extra copy
> step, assemble the location string in other_detail[] from the
> beginning.
>
> Using other_detail[] to call trace_mc_event() is now the same as in
> edac_mc.c and code can be unified.
>
> Reviewed-by: James Morse <[email protected]>
> Signed-off-by: Robert Richter <[email protected]>
Reviewed-by: Mauro Carvalho Chehab <[email protected]>
> ---
> drivers/edac/ghes_edac.c | 9 ++++-----
> 1 file changed, 4 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/edac/ghes_edac.c b/drivers/edac/ghes_edac.c
> index 97242cf18a88..8d9d3c4dbebb 100644
> --- a/drivers/edac/ghes_edac.c
> +++ b/drivers/edac/ghes_edac.c
> @@ -21,8 +21,7 @@ struct ghes_edac_pvt {
> struct mem_ctl_info *mci;
>
> /* Buffers for the error handling routine */
> - char detail_location[240];
> - char other_detail[160];
> + char other_detail[400];
> char msg[80];
> };
>
> @@ -356,6 +355,8 @@ void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err)
>
> /* All other fields are mapped on e->other_detail */
> p = pvt->other_detail;
> + p += snprintf(p, sizeof(pvt->other_detail),
> + "APEI location: %s ", e->location);
> if (mem_err->validation_bits & CPER_MEM_VALID_ERROR_STATUS) {
> u64 status = mem_err->error_status;
>
> @@ -436,12 +437,10 @@ void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err)
> grain_bits = fls_long(e->grain - 1);
>
> /* Generate the trace event */
> - snprintf(pvt->detail_location, sizeof(pvt->detail_location),
> - "APEI location: %s %s", e->location, e->other_detail);
> trace_mc_event(e->type, e->msg, e->label, e->error_count,
> mci->mc_idx, e->top_layer, e->mid_layer, e->low_layer,
> (e->page_frame_number << PAGE_SHIFT) | e->offset_in_page,
> - grain_bits, e->syndrome, pvt->detail_location);
> + grain_bits, e->syndrome, e->other_detail);
>
> dimm = edac_get_dimm_by_index(mci, e->top_layer);
>
Thanks,
Mauro
Em Thu, 10 Oct 2019 20:25:37 +0000
Robert Richter <[email protected]> escreveu:
> The current code to convert a physical address mask to a grain
> (defined as granularity in bytes) is:
>
> e->grain = ~(mem_err->physical_addr_mask & ~PAGE_MASK);
>
> This is broken in several ways:
>
> 1) It calculates to wrong grain values. E.g., a physical address mask
> of ~0xfff should give a grain of 0x1000. Without considering
> PAGE_MASK, there is an off-by-one. Things are worse when also
> filtering it with ~PAGE_MASK. This will calculate to a grain with the
> upper bits set. In the example it even calculates to ~0.
>
> 2) The grain does not depend on and is unrelated to the kernel's
> page-size. The page-size only matters when unmapping memory in
> memory_failure(). Smaller grains are wrongly rounded up to the
> page-size, on architectures with a configurable page-size (e.g. arm64)
> this could round up to the even bigger page-size of the hypervisor.
>
> Fix this with:
>
> e->grain = ~mem_err->physical_addr_mask + 1;
>
> The grain_bits are defined as:
>
> grain = 1 << grain_bits;
>
> Change also the grain_bits calculation accordingly, it is the same
> formula as in edac_mc.c now and the code can be unified.
>
> The value in ->physical_addr_mask coming from firmware is assumed to
> be contiguous, but this is not sanity-checked. However, in case the
> mask is non-contiguous, a conversion to grain_bits effectively
> converts the grain bit mask to a power of 2 by rounding up.
>
> Suggested-by: James Morse <[email protected]>
> Signed-off-by: Robert Richter <[email protected]>
Reviewed-by: Mauro Carvalho Chehab <[email protected]>
> ---
> drivers/edac/ghes_edac.c | 10 ++++++++--
> 1 file changed, 8 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/edac/ghes_edac.c b/drivers/edac/ghes_edac.c
> index 851aad92e42d..97242cf18a88 100644
> --- a/drivers/edac/ghes_edac.c
> +++ b/drivers/edac/ghes_edac.c
> @@ -220,6 +220,7 @@ void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err)
> /* Cleans the error report buffer */
> memset(e, 0, sizeof (*e));
> e->error_count = 1;
> + e->grain = 1;
> strcpy(e->label, "unknown label");
> e->msg = pvt->msg;
> e->other_detail = pvt->other_detail;
> @@ -315,7 +316,7 @@ void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err)
>
> /* Error grain */
> if (mem_err->validation_bits & CPER_MEM_VALID_PA_MASK)
> - e->grain = ~(mem_err->physical_addr_mask & ~PAGE_MASK);
> + e->grain = ~mem_err->physical_addr_mask + 1;
>
> /* Memory error location, mapped on e->location */
> p = e->location;
> @@ -428,8 +429,13 @@ void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err)
> if (p > pvt->other_detail)
> *(p - 1) = '\0';
>
> + /* Sanity-check driver-supplied grain value. */
> + if (WARN_ON_ONCE(!e->grain))
> + e->grain = 1;
> +
> + grain_bits = fls_long(e->grain - 1);
> +
> /* Generate the trace event */
> - grain_bits = fls_long(e->grain);
> snprintf(pvt->detail_location, sizeof(pvt->detail_location),
> "APEI location: %s %s", e->location, e->other_detail);
> trace_mc_event(e->type, e->msg, e->label, e->error_count,
Thanks,
Mauro
Em Thu, 10 Oct 2019 20:25:40 +0000
Robert Richter <[email protected]> escreveu:
> The code in ghes_edac.c and edac_mc.c for grain_bits calculation and
> calling trace_mc_event() is now the same. Move it to a single location
> in edac_raw_mc_handle_error().
>
> The only difference is the missing IS_ENABLED(CONFIG_RAS) switch, but
> this is needed for ghes too.
>
> Signed-off-by: Robert Richter <[email protected]>
Reviewed-by: Mauro Carvalho Chehab <[email protected]>
> ---
> drivers/edac/edac_mc.c | 30 +++++++++++++++---------------
> drivers/edac/ghes_edac.c | 13 -------------
> 2 files changed, 15 insertions(+), 28 deletions(-)
>
> diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c
> index 3779204c0e21..e6c6e40dc760 100644
> --- a/drivers/edac/edac_mc.c
> +++ b/drivers/edac/edac_mc.c
> @@ -1070,6 +1070,21 @@ void edac_raw_mc_handle_error(struct edac_raw_error_desc *e,
> {
> struct mem_ctl_info *mci = error_desc_to_mci(e);
> char detail[80];
> + u8 grain_bits;
> +
> + /* Sanity-check driver-supplied grain value. */
> + if (WARN_ON_ONCE(!e->grain))
> + e->grain = 1;
> +
> + grain_bits = fls_long(e->grain - 1);
> +
> + /* Report the error via the trace interface */
> + if (IS_ENABLED(CONFIG_RAS))
> + trace_mc_event(e->type, e->msg, e->label, e->error_count,
> + mci->mc_idx, e->top_layer, e->mid_layer,
> + e->low_layer,
> + (e->page_frame_number << PAGE_SHIFT) | e->offset_in_page,
> + grain_bits, e->syndrome, e->other_detail);
>
> /* Memory type dependent details about the error */
> if (e->type == HW_EVENT_ERR_CORRECTED) {
> @@ -1110,7 +1125,6 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type,
> int row = -1, chan = -1;
> int pos[EDAC_MAX_LAYERS] = { top_layer, mid_layer, low_layer };
> int i, n_labels = 0;
> - u8 grain_bits;
> struct edac_raw_error_desc *e = &mci->error_desc;
> bool any_memory = true;
>
> @@ -1242,20 +1256,6 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type,
> if (p > e->location)
> *(p - 1) = '\0';
>
> - /* Sanity-check driver-supplied grain value. */
> - if (WARN_ON_ONCE(!e->grain))
> - e->grain = 1;
> -
> - grain_bits = fls_long(e->grain - 1);
> -
> - /* Report the error via the trace interface */
> - if (IS_ENABLED(CONFIG_RAS))
> - trace_mc_event(type, e->msg, e->label, e->error_count,
> - mci->mc_idx, e->top_layer, e->mid_layer,
> - e->low_layer,
> - (e->page_frame_number << PAGE_SHIFT) | e->offset_in_page,
> - grain_bits, e->syndrome, e->other_detail);
> -
> dimm = edac_get_dimm(mci, top_layer, mid_layer, low_layer);
>
> edac_raw_mc_handle_error(e, dimm);
> diff --git a/drivers/edac/ghes_edac.c b/drivers/edac/ghes_edac.c
> index 8d9d3c4dbebb..17d5b22fe000 100644
> --- a/drivers/edac/ghes_edac.c
> +++ b/drivers/edac/ghes_edac.c
> @@ -198,7 +198,6 @@ void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err)
> struct ghes_edac_pvt *pvt = ghes_pvt;
> unsigned long flags;
> char *p;
> - u8 grain_bits;
>
> if (!pvt)
> return;
> @@ -430,18 +429,6 @@ void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err)
> if (p > pvt->other_detail)
> *(p - 1) = '\0';
>
> - /* Sanity-check driver-supplied grain value. */
> - if (WARN_ON_ONCE(!e->grain))
> - e->grain = 1;
> -
> - grain_bits = fls_long(e->grain - 1);
> -
> - /* Generate the trace event */
> - trace_mc_event(e->type, e->msg, e->label, e->error_count,
> - mci->mc_idx, e->top_layer, e->mid_layer, e->low_layer,
> - (e->page_frame_number << PAGE_SHIFT) | e->offset_in_page,
> - grain_bits, e->syndrome, e->other_detail);
> -
> dimm = edac_get_dimm_by_index(mci, e->top_layer);
>
> edac_raw_mc_handle_error(e, dimm);
Thanks,
Mauro
Em Thu, 10 Oct 2019 20:25:42 +0000
Robert Richter <[email protected]> escreveu:
> Update on CPER DIMM naming convention and DIMM ranks.
>
> Signed-off-by: Robert Richter <[email protected]>
Reviewed-by: Mauro Carvalho Chehab <[email protected]>
> ---
> Documentation/admin-guide/ras.rst | 31 +++++++++++++++++++------------
> 1 file changed, 19 insertions(+), 12 deletions(-)
>
> diff --git a/Documentation/admin-guide/ras.rst b/Documentation/admin-guide/ras.rst
> index 2b20f5f7380d..26e02a59f0f4 100644
> --- a/Documentation/admin-guide/ras.rst
> +++ b/Documentation/admin-guide/ras.rst
> @@ -330,9 +330,12 @@ There can be multiple csrows and multiple channels.
>
> .. [#f4] Nowadays, the term DIMM (Dual In-line Memory Module) is widely
> used to refer to a memory module, although there are other memory
> - packaging alternatives, like SO-DIMM, SIMM, etc. Along this document,
> - and inside the EDAC system, the term "dimm" is used for all memory
> - modules, even when they use a different kind of packaging.
> + packaging alternatives, like SO-DIMM, SIMM, etc. The UEFI
> + specification (Version 2.7) defines a memory module in the Common
> + Platform Error Record (CPER) section to be an SMBIOS Memory Device
> + (Type 17). Along this document, and inside the EDAC system, the term
> + "dimm" is used for all memory modules, even when they use a
> + different kind of packaging.
>
> Memory controllers allow for several csrows, with 8 csrows being a
> typical value. Yet, the actual number of csrows depends on the layout of
> @@ -349,12 +352,14 @@ controllers. The following example will assume 2 channels:
> | | ``ch0`` | ``ch1`` |
> +============+===========+===========+
> | ``csrow0`` | DIMM_A0 | DIMM_B0 |
> - +------------+ | |
> - | ``csrow1`` | | |
> + | | rank0 | rank0 |
> + +------------+ - | - |
> + | ``csrow1`` | rank1 | rank1 |
> +------------+-----------+-----------+
> | ``csrow2`` | DIMM_A1 | DIMM_B1 |
> - +------------+ | |
> - | ``csrow3`` | | |
> + | | rank0 | rank0 |
> + +------------+ - | - |
> + | ``csrow3`` | rank1 | rank1 |
> +------------+-----------+-----------+
>
> In the above example, there are 4 physical slots on the motherboard
> @@ -374,11 +379,13 @@ which the memory DIMM is placed. Thus, when 1 DIMM is placed in each
> Channel, the csrows cross both DIMMs.
>
> Memory DIMMs come single or dual "ranked". A rank is a populated csrow.
> -Thus, 2 single ranked DIMMs, placed in slots DIMM_A0 and DIMM_B0 above
> -will have just one csrow (csrow0). csrow1 will be empty. On the other
> -hand, when 2 dual ranked DIMMs are similarly placed, then both csrow0
> -and csrow1 will be populated. The pattern repeats itself for csrow2 and
> -csrow3.
> +In the example above 2 dual ranked DIMMs are similarly placed. Thus,
> +both csrow0 and csrow1 are populated. On the other hand, when 2 single
> +ranked DIMMs are placed in slots DIMM_A0 and DIMM_B0, then they will
> +have just one csrow (csrow0) and csrow1 will be empty. The pattern
> +repeats itself for csrow2 and csrow3. Also note that some memory
> +controller doesn't have any logic to identify the memory module, see
> +``rankX`` directories below.
>
> The representation of the above is reflected in the directory
> tree in EDAC's sysfs interface. Starting in directory
Thanks,
Mauro
Thanks for review.
On 11.10.19 07:09:03, Mauro Carvalho Chehab wrote:
> Em Thu, 10 Oct 2019 20:25:07 +0000
> Robert Richter <[email protected]> escreveu:
>
> > The EDAC_DIMM_OFF() macro takes 5 arguments to get the DIMM's index.
> > This can be much simplified now as the offset is already stored in
> > struct dimm_info. Use it directly and completely remove the
> > EDAC_DIMM_OFF() macro.
>
> Ah, now I understand why you added the dimm idx field. One more reason
> why to split it on a separate patch (or perhaps merge it here).
Right, it needs to be part of this patch.
-Robert
On 11.10.19 06:58:57, Mauro Carvalho Chehab wrote:
> Anyway, if you have a good reason to add this idx, please place
> it on a separate patch, with a proper description about why
> it is needed.
Right, see next patch. Will change.
-Robert
On 11.10.19 03:50:23, Joe Perches wrote:
> On Fri, 2019-10-11 at 07:20 -0300, Mauro Carvalho Chehab wrote:
> > Em Thu, 10 Oct 2019 15:10:53 -0700
> > Joe Perches <[email protected]> escreveu:
> >
> > > On Thu, 2019-10-10 at 20:25 +0000, Robert Richter wrote:
> > > > Reduce the indentation level in edac_mc_handle_error() a bit by using
> > > > continue. No functional changes.
> > >
> > > Seems fine, but trivially below:
> > >
> > > > diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c
> > > []
> > > > @@ -1171,37 +1171,38 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type,
> > > []
> > > > + strcpy(p, dimm->label);
> > > > + p += strlen(p);
> > > > + *p = '\0';
> > >
> > > This *p = '\0' is unnecessary as the strcpy already did that.
> >
> > True, but better to put it on a separate patch, as it makes
> > easier to review if you don't mix code de-indent with changes.
> >
> > Also, maybe there are other occurrences of this pattern.
>
> Maybe 80 or so uses of paired
>
> strcpy(foo, bar);
> strlen(foo)
>
> where another function like stpcpy, which doesn't exist
> in the kernel, could have been used.
Under drivers/edac/ I found this one place only.
So I could create a separate patch as a oneliner with that (trivial)
change?
Thanks,
-Robert
On Fri, 2019-10-11 at 12:08 +0000, Robert Richter wrote:
> On 11.10.19 03:50:23, Joe Perches wrote:
> > On Fri, 2019-10-11 at 07:20 -0300, Mauro Carvalho Chehab wrote:
> > > Em Thu, 10 Oct 2019 15:10:53 -0700 Joe Perches <[email protected]> escreveu:
> > > > On Thu, 2019-10-10 at 20:25 +0000, Robert Richter wrote:
> > > > > Reduce the indentation level in edac_mc_handle_error() a bit by using
> > > > > continue. No functional changes.
> > > >
> > > > Seems fine, but trivially below:
> > > >
> > > > > diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c
> > > > []
> > > > > @@ -1171,37 +1171,38 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type,
> > > > []
> > > > > + strcpy(p, dimm->label);
> > > > > + p += strlen(p);
> > > > > + *p = '\0';
> > > >
> > > > This *p = '\0' is unnecessary as the strcpy already did that.
> > >
> > > True, but better to put it on a separate patch, as it makes
> > > easier to review if you don't mix code de-indent with changes.
> > >
> > > Also, maybe there are other occurrences of this pattern.
> >
> > Maybe 80 or so uses of paired
> >
> > strcpy(foo, bar);
> > strlen(foo)
> >
> > where another function like stpcpy, which doesn't exist
> > in the kernel, could have been used.
>
> Under drivers/edac/ I found this one place only.
>
> So I could create a separate patch as a oneliner with that (trivial)
> change?
Of course yes.
On 11.10.19 07:40:31, Mauro Carvalho Chehab wrote:
> Em Thu, 10 Oct 2019 20:25:16 +0000
> Robert Richter <[email protected]> escreveu:
>
> > Looking at how mci->{ue,ce}_per_layer[EDAC_MAX_LAYERS] is used, it
> > turns out that only the leaves in the memory hierarchy are consumed
> > (in sysfs), but not the intermediate layers, e.g.:
> >
> > count = dimm->mci->ce_per_layer[dimm->mci->n_layers-1][dimm->idx];
> >
> > These unused counters only add complexity, remove them. The error
> > counter values are directly stored in struct dimm_info now.
>
> Hmm... not sure if this patch is correct. I remember that there are some
> border cases on some drivers (maybe the 3-layer drivers used together
> with RDIMM memory controllers?) where some errors are not associated
> to an specific dimm, but, instead, are related to a problem at the memory
> bus.
>
> Also, depending on how the memory controllers are organized[1], the ECC
> logic groups memory on DIMM pairs. So, when an error occur, it may be
> either at DIMM1 or DIMM2.
>
> [1] On Intel, this happens with pre-Nehalem memory controllers.
>
> Due to that, storing errors at the dimm struct sounds wrong, as the
> error may affect multiple DIMMs or even the entire layer.
That was my first thought too, but the counter values are not used at
all. The only exception is to provide *per-dimm* counters here:
{ce,ue}_per_layer[n_layers-1][dimm->idx].
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/drivers/edac/edac_mc_sysfs.c?id=4f5cafb5cb8471e54afdc9054d973535614f7675#n567
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/drivers/edac/edac_mc_sysfs.c?id=4f5cafb5cb8471e54afdc9054d973535614f7675#n584
The case you mentioned above is if the mc only sends parts of the
error location (with a top, mid or low layer missing). The dimm cannot
be identified then. In this case edac_mc_handle_error() tries to find
a unique row (+ channel information if available) and lists all possible
dimm labels in e->label. See:
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/drivers/edac/edac_mc.c?id=4f5cafb5cb8471e54afdc9054d973535614f7675#n1153
Thus, we see a counter increment for the row (and also the channel if it
can be identified), but this is counted only in the mci->csrows array,
which is not removed by this patch.
That said, the {ue,ce}_per_layer[] arrays can be removed while keeping
the same driver functionality, especially for the case you mentioned above.
-Robert
On 11.10.19 07:54:19, Mauro Carvalho Chehab wrote:
> Em Thu, 10 Oct 2019 20:25:29 +0000
> Robert Richter <[email protected]> escreveu:
>
> > Store the error type in struct edac_raw_error_desc. This makes the
> > type parameter of edac_raw_mc_handle_error() obsolete.
>
> I don't see much gain on this change, but whatever works best for
> ghes.
The error type clearly describes the error. It makes sense to keep it
in struct edac_raw_error_desc as the function interface of
edac_raw_mc_handle_error() becomes easier. There is no reason to have
a function argument for the type while all other error data is in
edac_raw_error_desc.
This change might look trivial, but this series contains many small
changes like this, and in the end the result is a reasonable change to
the function interface, which describes it much better:
void edac_raw_mc_handle_error(struct edac_raw_error_desc *e,
struct dimm_info *dimm);
... that reads as handle error described in e that affects this dimm.
-Robert
On 11.10.19 08:08:25, Mauro Carvalho Chehab wrote:
> Em Thu, 10 Oct 2019 20:25:33 +0000
> Robert Richter <[email protected]> escreveu:
>
> > Have a separate function to count errors in csrow/channel. This better
> > separates code and reduces the indentation level. No functional
> > changes.
>
> This one assumes patch 06/19, with I'm not sure if it is correct.
See also my answer there; the driver should still work as you expect
even with patch #6 applied. This patch is important to isolate the
csrow handling, which makes the code much more understandable and
readable. I hope there are no concerns with this patch.
Thank you for review.
-Robert
>
> >
> > Signed-off-by: Robert Richter <[email protected]>
> > ---
> > drivers/edac/edac_mc.c | 40 +++++++++++++++++++++++++---------------
> > 1 file changed, 25 insertions(+), 15 deletions(-)
> >
> > diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c
> > index 9e8c5716a8c0..3779204c0e21 100644
> > --- a/drivers/edac/edac_mc.c
> > +++ b/drivers/edac/edac_mc.c
> > @@ -1045,6 +1045,26 @@ static struct mem_ctl_info *error_desc_to_mci(struct edac_raw_error_desc *e)
> > return container_of(e, struct mem_ctl_info, error_desc);
> > }
> >
> > +static void edac_inc_csrow(struct edac_raw_error_desc *e, int row, int chan)
> > +{
> > + struct mem_ctl_info *mci = error_desc_to_mci(e);
> > + u16 count = e->error_count;
> > + enum hw_event_mc_err_type type = e->type;
> > +
> > + if (row < 0)
> > + return;
> > +
> > + edac_dbg(4, "csrow/channel to increment: (%d,%d)\n", row, chan);
> > +
> > + if (type == HW_EVENT_ERR_CORRECTED) {
> > + mci->csrows[row]->ce_count += count;
> > + if (chan >= 0)
> > + mci->csrows[row]->channels[chan]->ce_count += count;
> > + } else {
> > + mci->csrows[row]->ue_count += count;
> > + }
> > +}
> > +
> > void edac_raw_mc_handle_error(struct edac_raw_error_desc *e,
> > struct dimm_info *dimm)
> > {
> > @@ -1201,22 +1221,12 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type,
> > chan = -2;
> > }
> >
> > - if (any_memory) {
> > + if (any_memory)
> > strcpy(e->label, "any memory");
> > - } else {
> > - edac_dbg(4, "csrow/channel to increment: (%d,%d)\n", row, chan);
> > - if (p == e->label)
> > - strcpy(e->label, "unknown memory");
> > - if (type == HW_EVENT_ERR_CORRECTED) {
> > - if (row >= 0) {
> > - mci->csrows[row]->ce_count += error_count;
> > - if (chan >= 0)
> > - mci->csrows[row]->channels[chan]->ce_count += error_count;
> > - }
> > - } else
> > - if (row >= 0)
> > - mci->csrows[row]->ue_count += error_count;
> > - }
> > + else if (!*e->label)
> > + strcpy(e->label, "unknown memory");
> > +
> > + edac_inc_csrow(e, row, chan);
> >
> > /* Fill the RAM location data */
> > p = e->location;
>
>
>
> Thanks,
> Mauro
On 10.10.19 20:25:01, Robert Richter wrote:
> This patch set is a rework of the ghes_edac and edac_mc driver. It
> addresses issues found during code review and while working with the
> code. The changes include:
Thanks Mauro for your review of the whole series. I hope I could
address all your concerns with my answers.
-Robert