2014-10-27 04:24:55

by Ian Munsie

[permalink] [raw]
Subject: [PATCH] CXL: Fix PSL error due to duplicate segment table entries

From: Ian Munsie <[email protected]>

In certain circumstances the PSL can send an interrupt for a segment
miss that the kernel has already handled. This can happen if multiple
translations for the same segment are queued in the PSL before the
kernel has restarted the first translation.

The CXL driver did not expect this situation and did not check if a
segment had already been handled. This could cause a duplicate segment
table entry which in turn caused a PSL error taking down the card.

This patch fixes the issue by checking for existing entries in the
segment table that match the segment it is trying to insert to avoid
inserting duplicate entries.

Some of the code has been refactored to simplify it - the segment table
hash has been moved from cxl_load_segment to find_free_sste where it is
used and we have disabled the secondary hash in the segment table to
reduce the number of entries that need to be tested from 16 to 8. Due to
the large segment sizes we use it is extremely unlikely that the
secondary hash would ever have been used in practice, so this should not
have any negative impacts and may even improve performance.

copro_calculate_slb will now mask the ESID by the correct mask for 1T vs
256M segments. This has no effect by itself as the extra bits were
ignored, but it makes debugging the segment table entries easier and
means that we can directly compare the ESID values for duplicates
without needing to worry about masking in the comparison.

Signed-off-by: Ian Munsie <[email protected]>
---
arch/powerpc/mm/copro_fault.c | 3 +-
drivers/misc/cxl/fault.c | 73 ++++++++++++++++++++++---------------------
drivers/misc/cxl/native.c | 4 +--
3 files changed, 41 insertions(+), 39 deletions(-)

diff --git a/arch/powerpc/mm/copro_fault.c b/arch/powerpc/mm/copro_fault.c
index 0f9939e..5a236f0 100644
--- a/arch/powerpc/mm/copro_fault.c
+++ b/arch/powerpc/mm/copro_fault.c
@@ -99,8 +99,6 @@ int copro_calculate_slb(struct mm_struct *mm, u64 ea, struct copro_slb *slb)
u64 vsid;
int psize, ssize;

- slb->esid = (ea & ESID_MASK) | SLB_ESID_V;
-
switch (REGION_ID(ea)) {
case USER_REGION_ID:
pr_devel("%s: 0x%llx -- USER_REGION_ID\n", __func__, ea);
@@ -133,6 +131,7 @@ int copro_calculate_slb(struct mm_struct *mm, u64 ea, struct copro_slb *slb)
vsid |= mmu_psize_defs[psize].sllp |
((ssize == MMU_SEGSIZE_1T) ? SLB_VSID_B_1T : 0);

+ slb->esid = (ea & (ssize == MMU_SEGSIZE_1T ? ESID_MASK_1T : ESID_MASK)) | SLB_ESID_V;
slb->vsid = vsid;

return 0;
diff --git a/drivers/misc/cxl/fault.c b/drivers/misc/cxl/fault.c
index 69506eb..421cfd6 100644
--- a/drivers/misc/cxl/fault.c
+++ b/drivers/misc/cxl/fault.c
@@ -21,60 +21,63 @@

#include "cxl.h"

-static struct cxl_sste* find_free_sste(struct cxl_sste *primary_group,
- bool sec_hash,
- struct cxl_sste *secondary_group,
- unsigned int *lru)
+static bool sste_matches(struct cxl_sste *sste, struct copro_slb *slb)
{
- unsigned int i, entry;
- struct cxl_sste *sste, *group = primary_group;
-
- for (i = 0; i < 2; i++) {
- for (entry = 0; entry < 8; entry++) {
- sste = group + entry;
- if (!(be64_to_cpu(sste->esid_data) & SLB_ESID_V))
- return sste;
- }
- if (!sec_hash)
- break;
- group = secondary_group;
+ return ((sste->vsid_data == cpu_to_be64(slb->vsid)) &&
+ (sste->esid_data == cpu_to_be64(slb->esid)));
+}
+
+/* This finds a free SSTE and checks to see if it's already in table */
+static struct cxl_sste* find_free_sste(struct cxl_context *ctx,
+ struct copro_slb *slb)
+{
+ struct cxl_sste *primary, *sste, *ret = NULL;
+ unsigned int mask = (ctx->sst_size >> 7) - 1; /* SSTP0[SegTableSize] */
+ unsigned int entry;
+ unsigned int hash;
+
+ if (slb->vsid & SLB_VSID_B_1T)
+ hash = (slb->esid >> SID_SHIFT_1T) & mask;
+ else /* 256M */
+ hash = (slb->esid >> SID_SHIFT) & mask;
+
+ primary = ctx->sstp + (hash << 3);
+ sste = primary;
+
+ for (entry = 0; entry < 8; entry++) {
+ if (!ret && !(be64_to_cpu(sste->esid_data) & SLB_ESID_V))
+ ret = sste;
+ if (sste_matches(sste, slb))
+ return NULL;
+ sste++;
}
+ if (ret)
+ return ret;
+
/* Nothing free, select an entry to cast out */
- if (sec_hash && (*lru & 0x8))
- sste = secondary_group + (*lru & 0x7);
- else
- sste = primary_group + (*lru & 0x7);
- *lru = (*lru + 1) & 0xf;
+ ret = primary + ctx->sst_lru;
+ ctx->sst_lru = (ctx->sst_lru + 1) & 0x7;

- return sste;
+ return ret;
}

static void cxl_load_segment(struct cxl_context *ctx, struct copro_slb *slb)
{
/* mask is the group index, we search primary and secondary here. */
- unsigned int mask = (ctx->sst_size >> 7)-1; /* SSTP0[SegTableSize] */
- bool sec_hash = 1;
struct cxl_sste *sste;
- unsigned int hash;
unsigned long flags;

-
- sec_hash = !!(cxl_p1n_read(ctx->afu, CXL_PSL_SR_An) & CXL_PSL_SR_An_SC);
-
- if (slb->vsid & SLB_VSID_B_1T)
- hash = (slb->esid >> SID_SHIFT_1T) & mask;
- else /* 256M */
- hash = (slb->esid >> SID_SHIFT) & mask;
-
spin_lock_irqsave(&ctx->sste_lock, flags);
- sste = find_free_sste(ctx->sstp + (hash << 3), sec_hash,
- ctx->sstp + ((~hash & mask) << 3), &ctx->sst_lru);
+ sste = find_free_sste(ctx, slb);
+ if (!sste)
+ goto out_unlock;

pr_devel("CXL Populating SST[%li]: %#llx %#llx\n",
sste - ctx->sstp, slb->vsid, slb->esid);

sste->vsid_data = cpu_to_be64(slb->vsid);
sste->esid_data = cpu_to_be64(slb->esid);
+out_unlock:
spin_unlock_irqrestore(&ctx->sste_lock, flags);
}

diff --git a/drivers/misc/cxl/native.c b/drivers/misc/cxl/native.c
index 623286a..d47532e 100644
--- a/drivers/misc/cxl/native.c
+++ b/drivers/misc/cxl/native.c
@@ -417,7 +417,7 @@ static int attach_afu_directed(struct cxl_context *ctx, u64 wed, u64 amr)
ctx->elem->haurp = 0; /* disable */
ctx->elem->sdr = cpu_to_be64(mfspr(SPRN_SDR1));

- sr = CXL_PSL_SR_An_SC;
+ sr = 0;
if (ctx->master)
sr |= CXL_PSL_SR_An_MP;
if (mfspr(SPRN_LPCR) & LPCR_TC)
@@ -508,7 +508,7 @@ static int attach_dedicated(struct cxl_context *ctx, u64 wed, u64 amr)
u64 sr;
int rc;

- sr = CXL_PSL_SR_An_SC;
+ sr = 0;
set_endian(sr);
if (ctx->master)
sr |= CXL_PSL_SR_An_MP;
--
2.1.1


2014-10-27 06:41:04

by Michael Ellerman

[permalink] [raw]
Subject: Re: CXL: Fix PSL error due to duplicate segment table entries

On Mon, 2014-27-10 at 04:24:35 UTC, Ian Munsie wrote:
> From: Ian Munsie <[email protected]>
>
> In certain circumstances the PSL can send an interrupt for a segment

Define PSL before using it please.

> miss that the kernel has already handled. This can happen if multiple
> translations for the same segment are queued in the PSL before the
> kernel has restarted the first translation.
>
> The CXL driver did not expect this situation and did not check if a

does not and does not, you haven't patched it yet.

> segment had already been handled. This could cause a duplicate segment
> table entry which in turn caused a PSL error taking down the card.
>
> This patch fixes the issue by checking for existing entries in the
> segment table that match the segment it is trying to insert to avoid
> inserting duplicate entries.
>
> Some of the code has been refactored to simplify it - the segment table
> hash has been moved from cxl_load_segment to find_free_sste where it is

Any reason that's not a separate patch?

> used and we have disabled the secondary hash in the segment table to
> reduce the number of entries that need to be tested from 16 to 8. Due to
> the large segment sizes we use it is extremely unlikely that the
> secondary hash would ever have been used in practice, so this should not
> have any negative impacts and may even improve performance.

Any reason that's not a separate patch?

> copro_calculate_slb will now mask the ESID by the correct mask for 1T vs

Didn't, but will after this patch?

> 256M segments. This has no effect by itself as the extra bits were
> ignored, but it makes debugging the segment table entries easier and
> means that we can directly compare the ESID values for duplicates
> without needing to worry about masking in the comparison.

Separate patch?

cheers

2014-10-27 14:39:07

by Aneesh Kumar K.V

[permalink] [raw]
Subject: Re: [PATCH] CXL: Fix PSL error due to duplicate segment table entries

Ian Munsie <[email protected]> writes:

> From: Ian Munsie <[email protected]>
>
> In certain circumstances the PSL can send an interrupt for a segment
> miss that the kernel has already handled. This can happen if multiple
> translations for the same segment are queued in the PSL before the
> kernel has restarted the first translation.
>
> The CXL driver did not expect this situation and did not check if a
> segment had already been handled. This could cause a duplicate segment
> table entry which in turn caused a PSL error taking down the card.
>
> This patch fixes the issue by checking for existing entries in the
> segment table that match the segment it is trying to insert to avoid
> inserting duplicate entries.
>
> Some of the code has been refactored to simplify it - the segment table
> hash has been moved from cxl_load_segment to find_free_sste where it is
> used and we have disabled the secondary hash in the segment table to
> reduce the number of entries that need to be tested from 16 to 8. Due to
> the large segment sizes we use it is extremely unlikely that the
> secondary hash would ever have been used in practice, so this should not
> have any negative impacts and may even improve performance.
>
> copro_calculate_slb will now mask the ESID by the correct mask for 1T vs
> 256M segments. This has no effect by itself as the extra bits were
> ignored, but it makes debugging the segment table entries easier and
> means that we can directly compare the ESID values for duplicates
> without needing to worry about masking in the comparison.
>
> Signed-off-by: Ian Munsie <[email protected]>


I guess you are missing too many fixes in one patch.

1) One cleanup
2) Fix for masking ea correctly
3) And fix for not erroring out when a slb is already in the slb cache.


> ---
> arch/powerpc/mm/copro_fault.c | 3 +-
> drivers/misc/cxl/fault.c | 73 ++++++++++++++++++++++---------------------
> drivers/misc/cxl/native.c | 4 +--
> 3 files changed, 41 insertions(+), 39 deletions(-)
>
> diff --git a/arch/powerpc/mm/copro_fault.c b/arch/powerpc/mm/copro_fault.c
> index 0f9939e..5a236f0 100644
> --- a/arch/powerpc/mm/copro_fault.c
> +++ b/arch/powerpc/mm/copro_fault.c
> @@ -99,8 +99,6 @@ int copro_calculate_slb(struct mm_struct *mm, u64 ea, struct copro_slb *slb)
> u64 vsid;
> int psize, ssize;
>
> - slb->esid = (ea & ESID_MASK) | SLB_ESID_V;
> -
> switch (REGION_ID(ea)) {
> case USER_REGION_ID:
> pr_devel("%s: 0x%llx -- USER_REGION_ID\n", __func__, ea);
> @@ -133,6 +131,7 @@ int copro_calculate_slb(struct mm_struct *mm, u64 ea, struct copro_slb *slb)
> vsid |= mmu_psize_defs[psize].sllp |
> ((ssize == MMU_SEGSIZE_1T) ? SLB_VSID_B_1T : 0);
>
> + slb->esid = (ea & (ssize == MMU_SEGSIZE_1T ? ESID_MASK_1T : ESID_MASK)) | SLB_ESID_V;
> slb->vsid = vsid;
>
> return 0;
> diff --git a/drivers/misc/cxl/fault.c b/drivers/misc/cxl/fault.c
> index 69506eb..421cfd6 100644
> --- a/drivers/misc/cxl/fault.c
> +++ b/drivers/misc/cxl/fault.c
> @@ -21,60 +21,63 @@
>
> #include "cxl.h"
>
> -static struct cxl_sste* find_free_sste(struct cxl_sste *primary_group,
> - bool sec_hash,
> - struct cxl_sste *secondary_group,
> - unsigned int *lru)
> +static bool sste_matches(struct cxl_sste *sste, struct copro_slb *slb)
> {
> - unsigned int i, entry;
> - struct cxl_sste *sste, *group = primary_group;
> -
> - for (i = 0; i < 2; i++) {
> - for (entry = 0; entry < 8; entry++) {
> - sste = group + entry;
> - if (!(be64_to_cpu(sste->esid_data) & SLB_ESID_V))
> - return sste;
> - }
> - if (!sec_hash)
> - break;
> - group = secondary_group;
> + return ((sste->vsid_data == cpu_to_be64(slb->vsid)) &&
> + (sste->esid_data == cpu_to_be64(slb->esid)));
> +}
> +
> +/* This finds a free SSTE and checks to see if it's already in table */
> +static struct cxl_sste* find_free_sste(struct cxl_context *ctx,
> + struct copro_slb *slb)

the name is confusing. If you want to keep the name, can you also
specify that it return NULL, if it finds a matching entry. IIUC that
is the real part of the fix for the problem mentioned ?

> +{
> + struct cxl_sste *primary, *sste, *ret = NULL;
> + unsigned int mask = (ctx->sst_size >> 7) - 1; /* SSTP0[SegTableSize] */
> + unsigned int entry;
> + unsigned int hash;
> +
> + if (slb->vsid & SLB_VSID_B_1T)
> + hash = (slb->esid >> SID_SHIFT_1T) & mask;
> + else /* 256M */
> + hash = (slb->esid >> SID_SHIFT) & mask;
> +
> + primary = ctx->sstp + (hash << 3);
> + sste = primary;
> +
> + for (entry = 0; entry < 8; entry++) {
> + if (!ret && !(be64_to_cpu(sste->esid_data) & SLB_ESID_V))
> + ret = sste;
> + if (sste_matches(sste, slb))
> + return NULL;
> + sste++;
> }
> + if (ret)
> + return ret;
> +
> /* Nothing free, select an entry to cast out */
> - if (sec_hash && (*lru & 0x8))
> - sste = secondary_group + (*lru & 0x7);
> - else
> - sste = primary_group + (*lru & 0x7);
> - *lru = (*lru + 1) & 0xf;
> + ret = primary + ctx->sst_lru;
> + ctx->sst_lru = (ctx->sst_lru + 1) & 0x7;
>
> - return sste;
> + return ret;
> }
>
> static void cxl_load_segment(struct cxl_context *ctx, struct copro_slb *slb)
> {
> /* mask is the group index, we search primary and secondary here. */
> - unsigned int mask = (ctx->sst_size >> 7)-1; /* SSTP0[SegTableSize] */
> - bool sec_hash = 1;
> struct cxl_sste *sste;
> - unsigned int hash;
> unsigned long flags;
>
> -
> - sec_hash = !!(cxl_p1n_read(ctx->afu, CXL_PSL_SR_An) & CXL_PSL_SR_An_SC);
> -
> - if (slb->vsid & SLB_VSID_B_1T)
> - hash = (slb->esid >> SID_SHIFT_1T) & mask;
> - else /* 256M */
> - hash = (slb->esid >> SID_SHIFT) & mask;
> -
> spin_lock_irqsave(&ctx->sste_lock, flags);
> - sste = find_free_sste(ctx->sstp + (hash << 3), sec_hash,
> - ctx->sstp + ((~hash & mask) << 3), &ctx->sst_lru);
> + sste = find_free_sste(ctx, slb);
> + if (!sste)
> + goto out_unlock;
>
> pr_devel("CXL Populating SST[%li]: %#llx %#llx\n",
> sste - ctx->sstp, slb->vsid, slb->esid);
>
> sste->vsid_data = cpu_to_be64(slb->vsid);
> sste->esid_data = cpu_to_be64(slb->esid);
> +out_unlock:
> spin_unlock_irqrestore(&ctx->sste_lock, flags);
> }
>
> diff --git a/drivers/misc/cxl/native.c b/drivers/misc/cxl/native.c
> index 623286a..d47532e 100644
> --- a/drivers/misc/cxl/native.c
> +++ b/drivers/misc/cxl/native.c
> @@ -417,7 +417,7 @@ static int attach_afu_directed(struct cxl_context *ctx, u64 wed, u64 amr)
> ctx->elem->haurp = 0; /* disable */
> ctx->elem->sdr = cpu_to_be64(mfspr(SPRN_SDR1));
>
> - sr = CXL_PSL_SR_An_SC;
> + sr = 0;
> if (ctx->master)
> sr |= CXL_PSL_SR_An_MP;
> if (mfspr(SPRN_LPCR) & LPCR_TC)
> @@ -508,7 +508,7 @@ static int attach_dedicated(struct cxl_context *ctx, u64 wed, u64 amr)
> u64 sr;
> int rc;
>
> - sr = CXL_PSL_SR_An_SC;
> + sr = 0;

What is this change about ?

> set_endian(sr);
> if (ctx->master)
> sr |= CXL_PSL_SR_An_MP;
> --
> 2.1.1
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/

2014-10-28 00:17:52

by Ian Munsie

[permalink] [raw]
Subject: Re: CXL: Fix PSL error due to duplicate segment table entries

Excerpts from Michael Ellerman's message of 2014-10-27 17:41:00 +1100:
> On Mon, 2014-27-10 at 04:24:35 UTC, Ian Munsie wrote:
> > From: Ian Munsie <[email protected]>
> >
> > In certain circumstances the PSL can send an interrupt for a segment
>
> Define PSL before using it please.

ok

> > The CXL driver did not expect this situation and did not check if a
>
> does not and does not, you haven't patched it yet.

ok

> > Some of the code has been refactored to simplify it - the segment table
> > hash has been moved from cxl_load_segment to find_free_sste where it is
>
> Any reason that's not a separate patch?

ok, I'll split it.

> > used and we have disabled the secondary hash in the segment table to
> > reduce the number of entries that need to be tested from 16 to 8. Due to
> > the large segment sizes we use it is extremely unlikely that the
> > secondary hash would ever have been used in practice, so this should not
> > have any negative impacts and may even improve performance.
>
> Any reason that's not a separate patch?

ok, I'll split it.

> > copro_calculate_slb will now mask the ESID by the correct mask for 1T vs
>
> Didn't, but will after this patch?

ok, will reword

> > 256M segments. This has no effect by itself as the extra bits were
> > ignored, but it makes debugging the segment table entries easier and
> > means that we can directly compare the ESID values for duplicates
> > without needing to worry about masking in the comparison.
>
> Separate patch?

ok, I'll split it.

Cheers,
-Ian

2014-10-28 00:20:22

by Ian Munsie

[permalink] [raw]
Subject: Re: [PATCH] CXL: Fix PSL error due to duplicate segment table entries

Excerpts from Aneesh Kumar K.V's message of 2014-10-28 01:38:41 +1100:
> I guess you are missing too many fixes in one patch.
>
> 1) One cleanup
> 2) Fix for masking ea correctly
> 3) And fix for not erroring out when a slb is already in the slb cache.

ok, I'll split it up

> > +/* This finds a free SSTE and checks to see if it's already in table */
> > +static struct cxl_sste* find_free_sste(struct cxl_context *ctx,
> > + struct copro_slb *slb)
>
> the name is confusing. If you want to keep the name, can you also
> specify that it return NULL, if it finds a matching entry. IIUC that
> is the real part of the fix for the problem mentioned ?

Good point.

> > - sr = CXL_PSL_SR_An_SC;
> > + sr = 0;
>
> What is this change about ?

That tells the PSL not to use the secondary hash since we are no longer
filling out any entries using it. I'll clarify that in the commit
message when I split this out.

Cheers,
-Ian