2019-04-18 03:42:26

by WANG Chao

[permalink] [raw]
Subject: [PATCH 1/3] RAS/CEC: fix __find_elem

A left over pfn (because we don't clear) at ca->array[n] can be a match
in __find_elem. Later it'd cause a memmove size overflow in del_elem.

Signed-off-by: WANG Chao <[email protected]>
---
drivers/ras/cec.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/ras/cec.c b/drivers/ras/cec.c
index 2d9ec378a8bc..2e0bf1269c31 100644
--- a/drivers/ras/cec.c
+++ b/drivers/ras/cec.c
@@ -206,7 +206,7 @@ static int __find_elem(struct ce_array *ca, u64 pfn, unsigned int *to)

this_pfn = PFN(ca->array[min]);

- if (this_pfn == pfn)
+ if (this_pfn == pfn && ca->n > min)
return min;

return -ENOKEY;
--
2.21.0


2019-04-18 03:43:39

by WANG Chao

[permalink] [raw]
Subject: [PATCH 2/3] RAS/CEC: make ces_entered smp safe

ces_entered should be put in a critical section to avoid race condition.

Signed-off-by: WANG Chao <[email protected]>
---
drivers/ras/cec.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/ras/cec.c b/drivers/ras/cec.c
index 2e0bf1269c31..702e4c02c713 100644
--- a/drivers/ras/cec.c
+++ b/drivers/ras/cec.c
@@ -286,10 +286,10 @@ int cec_add_elem(u64 pfn)
if (!ce_arr.array || ce_arr.disabled)
return -ENODEV;

- ca->ces_entered++;
-
mutex_lock(&ce_mutex);

+ ca->ces_entered++;
+
if (ca->n == MAX_ELEMS)
WARN_ON(!del_lru_elem_unlocked(ca));

--
2.21.0

2019-04-18 03:52:01

by WANG Chao

[permalink] [raw]
Subject: [PATCH 3/3] RAS/CEC: immediate soft-offline page when count_threshold == 1

count_threshol == 1 isn't working as expected. CEC only does soft
offline the second time the same pfn is hit by a correctable error.

Signed-off-by: WANG Chao <[email protected]>
---
drivers/ras/cec.c | 36 +++++++++++++++++++++---------------
1 file changed, 21 insertions(+), 15 deletions(-)

diff --git a/drivers/ras/cec.c b/drivers/ras/cec.c
index 702e4c02c713..ac879c45377c 100644
--- a/drivers/ras/cec.c
+++ b/drivers/ras/cec.c
@@ -272,7 +272,22 @@ static u64 __maybe_unused del_lru_elem(void)
return pfn;
}

+static void cec_valid_soft_offline(u64 pfn)
+{
+ if (!pfn_valid(pfn)) {
+ pr_warn("CEC: Invalid pfn: 0x%llx\n", pfn);
+ } else {
+ /* We have reached max count for this page, soft-offline it. */
+ pr_err("Soft-offlining pfn: 0x%llx\n", pfn);
+ memory_failure_queue(pfn, MF_SOFT_OFFLINE, &cec_chain);
+ ce_arr.pfns_poisoned++;
+ }
+}

+/*
+ * Return a >0 value to denote that we've reached the offlining
+ * threshold.
+ */
int cec_add_elem(u64 pfn)
{
struct ce_array *ca = &ce_arr;
@@ -295,6 +310,11 @@ int cec_add_elem(u64 pfn)

ret = find_elem(ca, pfn, &to);
if (ret < 0) {
+ if (count_threshold == 1) {
+ cec_valid_soft_offline(pfn);
+ ret = 1;
+ goto unlock;
+ }
/*
* Shift range [to-end] to make room for one more element.
*/
@@ -320,23 +340,9 @@ int cec_add_elem(u64 pfn)

ret = 0;
} else {
- u64 pfn = ca->array[to] >> PAGE_SHIFT;
-
- if (!pfn_valid(pfn)) {
- pr_warn("CEC: Invalid pfn: 0x%llx\n", pfn);
- } else {
- /* We have reached max count for this page, soft-offline it. */
- pr_err("Soft-offlining pfn: 0x%llx\n", pfn);
- memory_failure_queue(pfn, MF_SOFT_OFFLINE);
- ca->pfns_poisoned++;
- }
-
+ cec_valid_soft_offline(pfn);
del_elem(ca, to);

- /*
- * Return a >0 value to denote that we've reached the offlining
- * threshold.
- */
ret = 1;

goto unlock;
--
2.21.0

Subject: [tip:ras/core] RAS/CEC: Increment cec_entered under the mutex lock

Commit-ID: 06e0fe2d8e9178bda874a75083bc13647fbf983f
Gitweb: https://git.kernel.org/tip/06e0fe2d8e9178bda874a75083bc13647fbf983f
Author: WANG Chao <chao.wang () ucloud ! cn>
AuthorDate: Thu, 18 Apr 2019 03:41:14 +0000
Committer: Borislav Petkov <[email protected]>
CommitDate: Sat, 20 Apr 2019 12:13:13 +0200

RAS/CEC: Increment cec_entered under the mutex lock

Modify ->cec_entered in the critical section of the mutex.

Signed-off-by: WANG Chao <[email protected]>
Signed-off-by: Borislav Petkov <[email protected]>
Cc: Tony Luck <[email protected]>
Cc: linux-edac <[email protected]>
Link: https://lkml.kernel.org/r/[email protected]
---
drivers/ras/cec.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/ras/cec.c b/drivers/ras/cec.c
index 2d9ec378a8bc..88e4f3ff0cb8 100644
--- a/drivers/ras/cec.c
+++ b/drivers/ras/cec.c
@@ -286,10 +286,10 @@ int cec_add_elem(u64 pfn)
if (!ce_arr.array || ce_arr.disabled)
return -ENODEV;

- ca->ces_entered++;
-
mutex_lock(&ce_mutex);

+ ca->ces_entered++;
+
if (ca->n == MAX_ELEMS)
WARN_ON(!del_lru_elem_unlocked(ca));

Subject: [tip:ras/core] RAS/CEC: Increment cec_entered under the mutex lock

Commit-ID: 09cbd2197e9291d6a3d3f42873f06ca1f388c1a4
Gitweb: https://git.kernel.org/tip/09cbd2197e9291d6a3d3f42873f06ca1f388c1a4
Author: WANG Chao <[email protected]>
AuthorDate: Thu, 18 Apr 2019 03:41:14 +0000
Committer: Borislav Petkov <[email protected]>
CommitDate: Sat, 20 Apr 2019 12:16:52 +0200

RAS/CEC: Increment cec_entered under the mutex lock

Modify ->cec_entered in the critical section of the mutex.

Signed-off-by: WANG Chao <[email protected]>
Signed-off-by: Borislav Petkov <[email protected]>
Cc: Tony Luck <[email protected]>
Cc: linux-edac <[email protected]>
Link: https://lkml.kernel.org/r/[email protected]
---
drivers/ras/cec.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/ras/cec.c b/drivers/ras/cec.c
index 2d9ec378a8bc..88e4f3ff0cb8 100644
--- a/drivers/ras/cec.c
+++ b/drivers/ras/cec.c
@@ -286,10 +286,10 @@ int cec_add_elem(u64 pfn)
if (!ce_arr.array || ce_arr.disabled)
return -ENODEV;

- ca->ces_entered++;
-
mutex_lock(&ce_mutex);

+ ca->ces_entered++;
+
if (ca->n == MAX_ELEMS)
WARN_ON(!del_lru_elem_unlocked(ca));

2019-04-20 11:58:34

by Borislav Petkov

[permalink] [raw]
Subject: Re: [PATCH 3/3] RAS/CEC: immediate soft-offline page when count_threshold == 1

On Thu, Apr 18, 2019 at 11:41:15AM +0800, WANG Chao wrote:
> count_threshol == 1 isn't working as expected. CEC only does soft
> offline the second time the same pfn is hit by a correctable error.

So this?

---
diff --git a/drivers/ras/cec.c b/drivers/ras/cec.c
index b3c377ddf340..750a427e1a73 100644
--- a/drivers/ras/cec.c
+++ b/drivers/ras/cec.c
@@ -333,6 +333,7 @@ int cec_add_elem(u64 pfn)

mutex_lock(&ce_mutex);

+ /* Array full, free the LRU slot. */
if (ca->n == MAX_ELEMS)
WARN_ON(!del_lru_elem_unlocked(ca));

@@ -346,14 +347,9 @@ int cec_add_elem(u64 pfn)
(void *)&ca->array[to],
(ca->n - to) * sizeof(u64));

- ca->array[to] = (pfn << PAGE_SHIFT) |
- (DECAY_MASK << COUNT_BITS) | 1;
+ ca->array[to] = (pfn << PAGE_SHIFT) | 1;

ca->n++;
-
- ret = 0;
-
- goto decay;
}

count = COUNT(ca->array[to]);
@@ -386,7 +382,6 @@ int cec_add_elem(u64 pfn)
goto unlock;
}

-decay:
ca->decay_count++;

if (ca->decay_count >= CLEAN_ELEMS)

--
Regards/Gruss,
Boris.

Good mailing practices for 400: avoid top-posting and trim the reply.

2019-04-24 02:52:52

by WANG Chao

[permalink] [raw]
Subject: Re: [PATCH 3/3] RAS/CEC: immediate soft-offline page when count_threshold == 1

On 04/20/19 at 01:57P, Borislav Petkov wrote:
> On Thu, Apr 18, 2019 at 11:41:15AM +0800, WANG Chao wrote:
> > count_threshol == 1 isn't working as expected. CEC only does soft
> > offline the second time the same pfn is hit by a correctable error.
>
> So this?
>
> ---
> diff --git a/drivers/ras/cec.c b/drivers/ras/cec.c
> index b3c377ddf340..750a427e1a73 100644
> --- a/drivers/ras/cec.c
> +++ b/drivers/ras/cec.c
> @@ -333,6 +333,7 @@ int cec_add_elem(u64 pfn)
>
> mutex_lock(&ce_mutex);
>
> + /* Array full, free the LRU slot. */
> if (ca->n == MAX_ELEMS)
> WARN_ON(!del_lru_elem_unlocked(ca));
>
> @@ -346,14 +347,9 @@ int cec_add_elem(u64 pfn)
> (void *)&ca->array[to],
> (ca->n - to) * sizeof(u64));
>
> - ca->array[to] = (pfn << PAGE_SHIFT) |
> - (DECAY_MASK << COUNT_BITS) | 1;
> + ca->array[to] = (pfn << PAGE_SHIFT) | 1;
>
> ca->n++;
> -
> - ret = 0;
> -
> - goto decay;
> }
>
> count = COUNT(ca->array[to]);
> @@ -386,7 +382,6 @@ int cec_add_elem(u64 pfn)
> goto unlock;
> }
>
> -decay:
> ca->decay_count++;
>
> if (ca->decay_count >= CLEAN_ELEMS)

It looks good to me. Thanks for a better fix.

2019-04-24 10:34:54

by Borislav Petkov

[permalink] [raw]
Subject: Re: [PATCH 3/3] RAS/CEC: immediate soft-offline page when count_threshold == 1

On Wed, Apr 24, 2019 at 10:43:04AM +0800, WANG Chao wrote:
> It looks good to me. Thanks for a better fix.

Latest version:

https://git.kernel.org/pub/scm/linux/kernel/git/bp/bp.git/commit/?h=tip-ras-core-cec&id=aad216775348c4aaf467069c2e5fbf7ff6c27695

I'll post soon after I've hammered more on this thing.

Thx.

--
Regards/Gruss,
Boris.

Good mailing practices for 400: avoid top-posting and trim the reply.

2019-04-25 08:46:24

by WANG Chao

[permalink] [raw]
Subject: Re: [PATCH 1/3] RAS/CEC: fix __find_elem

On 04/18/19 at 11:41P, WANG Chao wrote:
> A left over pfn (because we don't clear) at ca->array[n] can be a match
> in __find_elem. Later it'd cause a memmove size overflow in del_elem.
>
> Signed-off-by: WANG Chao <[email protected]>
> ---
> drivers/ras/cec.c | 2 +-
> 1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/drivers/ras/cec.c b/drivers/ras/cec.c
> index 2d9ec378a8bc..2e0bf1269c31 100644
> --- a/drivers/ras/cec.c
> +++ b/drivers/ras/cec.c
> @@ -206,7 +206,7 @@ static int __find_elem(struct ce_array *ca, u64 pfn, unsigned int *to)
>
> this_pfn = PFN(ca->array[min]);
>
> - if (this_pfn == pfn)
> + if (this_pfn == pfn && ca->n > min)
> return min;
>
> return -ENOKEY;

Any thought on this one?

2019-04-25 08:49:04

by WANG Chao

[permalink] [raw]
Subject: Re: [PATCH 1/3] RAS/CEC: fix __find_elem

On 04/25/19 at 03:56P, WANG Chao wrote:
> On 04/18/19 at 11:41P, WANG Chao wrote:
> > A left over pfn (because we don't clear) at ca->array[n] can be a match
> > in __find_elem. Later it'd cause a memmove size overflow in del_elem.
> >
> > Signed-off-by: WANG Chao <[email protected]>
> > ---
> > drivers/ras/cec.c | 2 +-
> > 1 file changed, 1 insertion(+), 1 deletion(-)
> >
> > diff --git a/drivers/ras/cec.c b/drivers/ras/cec.c
> > index 2d9ec378a8bc..2e0bf1269c31 100644
> > --- a/drivers/ras/cec.c
> > +++ b/drivers/ras/cec.c
> > @@ -206,7 +206,7 @@ static int __find_elem(struct ce_array *ca, u64 pfn, unsigned int *to)
> >
> > this_pfn = PFN(ca->array[min]);
> >
> > - if (this_pfn == pfn)
> > + if (this_pfn == pfn && ca->n > min)
> > return min;
> >
> > return -ENOKEY;
>
> Any thought on this one?

Aha, I see there's another fix queued. Thanks.

Subject: [tip:ras/core] RAS/CEC: Check count_threshold unconditionally

Commit-ID: de0e0624d86ff9fc512dedb297f8978698abf21a
Gitweb: https://git.kernel.org/tip/de0e0624d86ff9fc512dedb297f8978698abf21a
Author: Borislav Petkov <[email protected]>
AuthorDate: Sat, 20 Apr 2019 14:06:37 +0200
Committer: Borislav Petkov <[email protected]>
CommitDate: Sat, 8 Jun 2019 17:33:10 +0200

RAS/CEC: Check count_threshold unconditionally

The count_threshold should be checked unconditionally, after insertion
too, so that a count_threshold value of 1 can cause an immediate
offlining. I.e., offline the page on the *first* error encountered.

Add comments to make it clear what cec_add_elem() does, while at it.

Reported-by: WANG Chao <[email protected]>
Signed-off-by: Borislav Petkov <[email protected]>
Cc: Tony Luck <[email protected]>
Cc: [email protected]
Link: https://lkml.kernel.org/r/[email protected]
---
drivers/ras/cec.c | 27 ++++++++++-----------------
1 file changed, 10 insertions(+), 17 deletions(-)

diff --git a/drivers/ras/cec.c b/drivers/ras/cec.c
index f5795adc5a6e..73a975c26f9f 100644
--- a/drivers/ras/cec.c
+++ b/drivers/ras/cec.c
@@ -294,6 +294,7 @@ int cec_add_elem(u64 pfn)

ca->ces_entered++;

+ /* Array full, free the LRU slot. */
if (ca->n == MAX_ELEMS)
WARN_ON(!del_lru_elem_unlocked(ca));

@@ -306,24 +307,17 @@ int cec_add_elem(u64 pfn)
(void *)&ca->array[to],
(ca->n - to) * sizeof(u64));

- ca->array[to] = (pfn << PAGE_SHIFT) |
- (DECAY_MASK << COUNT_BITS) | 1;
-
+ ca->array[to] = pfn << PAGE_SHIFT;
ca->n++;
-
- ret = 0;
-
- goto decay;
}

- count = COUNT(ca->array[to]);
-
- if (count < count_threshold) {
- ca->array[to] |= (DECAY_MASK << COUNT_BITS);
- ca->array[to]++;
+ /* Add/refresh element generation and increment count */
+ ca->array[to] |= DECAY_MASK << COUNT_BITS;
+ ca->array[to]++;

- ret = 0;
- } else {
+ /* Check action threshold and soft-offline, if reached. */
+ count = COUNT(ca->array[to]);
+ if (count >= count_threshold) {
u64 pfn = ca->array[to] >> PAGE_SHIFT;

if (!pfn_valid(pfn)) {
@@ -338,15 +332,14 @@ int cec_add_elem(u64 pfn)
del_elem(ca, to);

/*
- * Return a >0 value to denote that we've reached the offlining
- * threshold.
+ * Return a >0 value to callers, to denote that we've reached
+ * the offlining threshold.
*/
ret = 1;

goto unlock;
}

-decay:
ca->decay_count++;

if (ca->decay_count >= CLEAN_ELEMS)