2021-01-05 21:45:58

by Christian König

[permalink] [raw]
Subject: Re: [PATCH v2] drm/amdgpu: Add check to prevenet IH overflow

Am 05.01.21 um 17:06 schrieb Defang Bo:
> Similar to commit <b82175750131>("drm/amdgpu: fix IH overflow on Vega10 v2").
> When an ring buffer overflow happens the appropriate bit is set in the WPTR
> register which is also written back to memory. But clearing the bit in the
> WPTR doesn't trigger another memory writeback.
>
> So what can happen is that we end up processing the buffer overflow over and
> over again because the bit is never cleared. Resulting in a random system
> lockup because of an infinite loop in an interrupt handler.
>
> Signed-off-by: Defang Bo <[email protected]>

Reviewed-by: Christian König <[email protected]>

Thanks for the help,
Christian.

> ---
> Changes since v1:
> - Modify the subject and replace the wrong register.
> ---
> ---
> drivers/gpu/drm/amd/amdgpu/cz_ih.c | 39 +++++++++++++++++++++------------
> drivers/gpu/drm/amd/amdgpu/iceland_ih.c | 36 +++++++++++++++++++-----------
> drivers/gpu/drm/amd/amdgpu/tonga_ih.c | 37 ++++++++++++++++++++-----------
> 3 files changed, 72 insertions(+), 40 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/cz_ih.c b/drivers/gpu/drm/amd/amdgpu/cz_ih.c
> index 1dca0cabc326..65361afb21ab 100644
> --- a/drivers/gpu/drm/amd/amdgpu/cz_ih.c
> +++ b/drivers/gpu/drm/amd/amdgpu/cz_ih.c
> @@ -190,22 +190,33 @@ static u32 cz_ih_get_wptr(struct amdgpu_device *adev,
> struct amdgpu_ih_ring *ih)
> {
> u32 wptr, tmp;
> -
> +
> wptr = le32_to_cpu(*ih->wptr_cpu);
>
> - if (REG_GET_FIELD(wptr, IH_RB_WPTR, RB_OVERFLOW)) {
> - wptr = REG_SET_FIELD(wptr, IH_RB_WPTR, RB_OVERFLOW, 0);
> - /* When a ring buffer overflow happen start parsing interrupt
> - * from the last not overwritten vector (wptr + 16). Hopefully
> - * this should allow us to catchup.
> - */
> - dev_warn(adev->dev, "IH ring buffer overflow (0x%08X, 0x%08X, 0x%08X)\n",
> - wptr, ih->rptr, (wptr + 16) & ih->ptr_mask);
> - ih->rptr = (wptr + 16) & ih->ptr_mask;
> - tmp = RREG32(mmIH_RB_CNTL);
> - tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 1);
> - WREG32(mmIH_RB_CNTL, tmp);
> - }
> + if (!REG_GET_FIELD(wptr, IH_RB_WPTR, RB_OVERFLOW))
> + goto out;
> +
> + /* Double check that the overflow wasn't already cleared. */
> + wptr = RREG32(mmIH_RB_WPTR);
> +
> + if (!REG_GET_FIELD(wptr, IH_RB_WPTR, RB_OVERFLOW))
> + goto out;
> +
> + wptr = REG_SET_FIELD(wptr, IH_RB_WPTR, RB_OVERFLOW, 0);
> +
> + /* When a ring buffer overflow happen start parsing interrupt
> + * from the last not overwritten vector (wptr + 16). Hopefully
> + * this should allow us to catchup.
> + */
> + dev_warn(adev->dev, "IH ring buffer overflow (0x%08X, 0x%08X, 0x%08X)\n",
> + wptr, ih->rptr, (wptr + 16) & ih->ptr_mask);
> + ih->rptr = (wptr + 16) & ih->ptr_mask;
> + tmp = RREG32(mmIH_RB_CNTL);
> + tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 1);
> + WREG32(mmIH_RB_CNTL, tmp);
> +
> +
> +out:
> return (wptr & ih->ptr_mask);
> }
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/iceland_ih.c b/drivers/gpu/drm/amd/amdgpu/iceland_ih.c
> index a13dd9a51149..8e4dae8addb9 100644
> --- a/drivers/gpu/drm/amd/amdgpu/iceland_ih.c
> +++ b/drivers/gpu/drm/amd/amdgpu/iceland_ih.c
> @@ -193,19 +193,29 @@ static u32 iceland_ih_get_wptr(struct amdgpu_device *adev,
>
> wptr = le32_to_cpu(*ih->wptr_cpu);
>
> - if (REG_GET_FIELD(wptr, IH_RB_WPTR, RB_OVERFLOW)) {
> - wptr = REG_SET_FIELD(wptr, IH_RB_WPTR, RB_OVERFLOW, 0);
> - /* When a ring buffer overflow happen start parsing interrupt
> - * from the last not overwritten vector (wptr + 16). Hopefully
> - * this should allow us to catchup.
> - */
> - dev_warn(adev->dev, "IH ring buffer overflow (0x%08X, 0x%08X, 0x%08X)\n",
> - wptr, ih->rptr, (wptr + 16) & ih->ptr_mask);
> - ih->rptr = (wptr + 16) & ih->ptr_mask;
> - tmp = RREG32(mmIH_RB_CNTL);
> - tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 1);
> - WREG32(mmIH_RB_CNTL, tmp);
> - }
> + if (!REG_GET_FIELD(wptr, IH_RB_WPTR, RB_OVERFLOW))
> + goto out;
> +
> + /* Double check that the overflow wasn't already cleared. */
> + wptr = RREG32(mmIH_RB_WPTR);
> +
> + if (!REG_GET_FIELD(wptr, IH_RB_WPTR, RB_OVERFLOW))
> + goto out;
> +
> + wptr = REG_SET_FIELD(wptr, IH_RB_WPTR, RB_OVERFLOW, 0);
> + /* When a ring buffer overflow happen start parsing interrupt
> + * from the last not overwritten vector (wptr + 16). Hopefully
> + * this should allow us to catchup.
> + */
> + dev_warn(adev->dev, "IH ring buffer overflow (0x%08X, 0x%08X, 0x%08X)\n",
> + wptr, ih->rptr, (wptr + 16) & ih->ptr_mask);
> + ih->rptr = (wptr + 16) & ih->ptr_mask;
> + tmp = RREG32(mmIH_RB_CNTL);
> + tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 1);
> + WREG32(mmIH_RB_CNTL, tmp);
> +
> +
> +out:
> return (wptr & ih->ptr_mask);
> }
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/tonga_ih.c b/drivers/gpu/drm/amd/amdgpu/tonga_ih.c
> index e40140bf6699..2ba1ce323b6d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/tonga_ih.c
> +++ b/drivers/gpu/drm/amd/amdgpu/tonga_ih.c
> @@ -195,19 +195,30 @@ static u32 tonga_ih_get_wptr(struct amdgpu_device *adev,
>
> wptr = le32_to_cpu(*ih->wptr_cpu);
>
> - if (REG_GET_FIELD(wptr, IH_RB_WPTR, RB_OVERFLOW)) {
> - wptr = REG_SET_FIELD(wptr, IH_RB_WPTR, RB_OVERFLOW, 0);
> - /* When a ring buffer overflow happen start parsing interrupt
> - * from the last not overwritten vector (wptr + 16). Hopefully
> - * this should allow us to catchup.
> - */
> - dev_warn(adev->dev, "IH ring buffer overflow (0x%08X, 0x%08X, 0x%08X)\n",
> - wptr, ih->rptr, (wptr + 16) & ih->ptr_mask);
> - ih->rptr = (wptr + 16) & ih->ptr_mask;
> - tmp = RREG32(mmIH_RB_CNTL);
> - tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 1);
> - WREG32(mmIH_RB_CNTL, tmp);
> - }
> + if (!REG_GET_FIELD(wptr, IH_RB_WPTR, RB_OVERFLOW))
> + goto out;
> +
> + /* Double check that the overflow wasn't already cleared. */
> + wptr = RREG32(mmIH_RB_WPTR);
> +
> + if (!REG_GET_FIELD(wptr, IH_RB_WPTR, RB_OVERFLOW))
> + goto out;
> +
> + wptr = REG_SET_FIELD(wptr, IH_RB_WPTR, RB_OVERFLOW, 0);
> +
> + /* When a ring buffer overflow happen start parsing interrupt
> + * from the last not overwritten vector (wptr + 16). Hopefully
> + * this should allow us to catchup.
> + */
> +
> + dev_warn(adev->dev, "IH ring buffer overflow (0x%08X, 0x%08X, 0x%08X)\n",
> + wptr, ih->rptr, (wptr + 16) & ih->ptr_mask);
> + ih->rptr = (wptr + 16) & ih->ptr_mask;
> + tmp = RREG32(mmIH_RB_CNTL);
> + tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 1);
> + WREG32(mmIH_RB_CNTL, tmp);
> +
> +out:
> return (wptr & ih->ptr_mask);
> }
>


2021-01-05 22:42:48

by Alex Deucher

[permalink] [raw]
Subject: Re: [PATCH v2] drm/amdgpu: Add check to prevenet IH overflow

On Tue, Jan 5, 2021 at 11:49 AM Christian König
<[email protected]> wrote:
>
> Am 05.01.21 um 17:06 schrieb Defang Bo:
> > Similar to commit <b82175750131>("drm/amdgpu: fix IH overflow on Vega10 v2").
> > When an ring buffer overflow happens the appropriate bit is set in the WPTR
> > register which is also written back to memory. But clearing the bit in the
> > WPTR doesn't trigger another memory writeback.
> >
> > So what can happen is that we end up processing the buffer overflow over and
> > over again because the bit is never cleared. Resulting in a random system
> > lockup because of an infinite loop in an interrupt handler.
> >
> > Signed-off-by: Defang Bo <[email protected]>
>
> Reviewed-by: Christian König <[email protected]>
>
> Thanks for the help,
> Christian.

Applied. Thanks!

Alex


>
> > ---
> > Changes since v1:
> > - Modify the subject and replace the wrong register.
> > ---
> > ---
> > drivers/gpu/drm/amd/amdgpu/cz_ih.c | 39 +++++++++++++++++++++------------
> > drivers/gpu/drm/amd/amdgpu/iceland_ih.c | 36 +++++++++++++++++++-----------
> > drivers/gpu/drm/amd/amdgpu/tonga_ih.c | 37 ++++++++++++++++++++-----------
> > 3 files changed, 72 insertions(+), 40 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/cz_ih.c b/drivers/gpu/drm/amd/amdgpu/cz_ih.c
> > index 1dca0cabc326..65361afb21ab 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/cz_ih.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/cz_ih.c
> > @@ -190,22 +190,33 @@ static u32 cz_ih_get_wptr(struct amdgpu_device *adev,
> > struct amdgpu_ih_ring *ih)
> > {
> > u32 wptr, tmp;
> > -
> > +
> > wptr = le32_to_cpu(*ih->wptr_cpu);
> >
> > - if (REG_GET_FIELD(wptr, IH_RB_WPTR, RB_OVERFLOW)) {
> > - wptr = REG_SET_FIELD(wptr, IH_RB_WPTR, RB_OVERFLOW, 0);
> > - /* When a ring buffer overflow happen start parsing interrupt
> > - * from the last not overwritten vector (wptr + 16). Hopefully
> > - * this should allow us to catchup.
> > - */
> > - dev_warn(adev->dev, "IH ring buffer overflow (0x%08X, 0x%08X, 0x%08X)\n",
> > - wptr, ih->rptr, (wptr + 16) & ih->ptr_mask);
> > - ih->rptr = (wptr + 16) & ih->ptr_mask;
> > - tmp = RREG32(mmIH_RB_CNTL);
> > - tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 1);
> > - WREG32(mmIH_RB_CNTL, tmp);
> > - }
> > + if (!REG_GET_FIELD(wptr, IH_RB_WPTR, RB_OVERFLOW))
> > + goto out;
> > +
> > + /* Double check that the overflow wasn't already cleared. */
> > + wptr = RREG32(mmIH_RB_WPTR);
> > +
> > + if (!REG_GET_FIELD(wptr, IH_RB_WPTR, RB_OVERFLOW))
> > + goto out;
> > +
> > + wptr = REG_SET_FIELD(wptr, IH_RB_WPTR, RB_OVERFLOW, 0);
> > +
> > + /* When a ring buffer overflow happen start parsing interrupt
> > + * from the last not overwritten vector (wptr + 16). Hopefully
> > + * this should allow us to catchup.
> > + */
> > + dev_warn(adev->dev, "IH ring buffer overflow (0x%08X, 0x%08X, 0x%08X)\n",
> > + wptr, ih->rptr, (wptr + 16) & ih->ptr_mask);
> > + ih->rptr = (wptr + 16) & ih->ptr_mask;
> > + tmp = RREG32(mmIH_RB_CNTL);
> > + tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 1);
> > + WREG32(mmIH_RB_CNTL, tmp);
> > +
> > +
> > +out:
> > return (wptr & ih->ptr_mask);
> > }
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/iceland_ih.c b/drivers/gpu/drm/amd/amdgpu/iceland_ih.c
> > index a13dd9a51149..8e4dae8addb9 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/iceland_ih.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/iceland_ih.c
> > @@ -193,19 +193,29 @@ static u32 iceland_ih_get_wptr(struct amdgpu_device *adev,
> >
> > wptr = le32_to_cpu(*ih->wptr_cpu);
> >
> > - if (REG_GET_FIELD(wptr, IH_RB_WPTR, RB_OVERFLOW)) {
> > - wptr = REG_SET_FIELD(wptr, IH_RB_WPTR, RB_OVERFLOW, 0);
> > - /* When a ring buffer overflow happen start parsing interrupt
> > - * from the last not overwritten vector (wptr + 16). Hopefully
> > - * this should allow us to catchup.
> > - */
> > - dev_warn(adev->dev, "IH ring buffer overflow (0x%08X, 0x%08X, 0x%08X)\n",
> > - wptr, ih->rptr, (wptr + 16) & ih->ptr_mask);
> > - ih->rptr = (wptr + 16) & ih->ptr_mask;
> > - tmp = RREG32(mmIH_RB_CNTL);
> > - tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 1);
> > - WREG32(mmIH_RB_CNTL, tmp);
> > - }
> > + if (!REG_GET_FIELD(wptr, IH_RB_WPTR, RB_OVERFLOW))
> > + goto out;
> > +
> > + /* Double check that the overflow wasn't already cleared. */
> > + wptr = RREG32(mmIH_RB_WPTR);
> > +
> > + if (!REG_GET_FIELD(wptr, IH_RB_WPTR, RB_OVERFLOW))
> > + goto out;
> > +
> > + wptr = REG_SET_FIELD(wptr, IH_RB_WPTR, RB_OVERFLOW, 0);
> > + /* When a ring buffer overflow happen start parsing interrupt
> > + * from the last not overwritten vector (wptr + 16). Hopefully
> > + * this should allow us to catchup.
> > + */
> > + dev_warn(adev->dev, "IH ring buffer overflow (0x%08X, 0x%08X, 0x%08X)\n",
> > + wptr, ih->rptr, (wptr + 16) & ih->ptr_mask);
> > + ih->rptr = (wptr + 16) & ih->ptr_mask;
> > + tmp = RREG32(mmIH_RB_CNTL);
> > + tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 1);
> > + WREG32(mmIH_RB_CNTL, tmp);
> > +
> > +
> > +out:
> > return (wptr & ih->ptr_mask);
> > }
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/tonga_ih.c b/drivers/gpu/drm/amd/amdgpu/tonga_ih.c
> > index e40140bf6699..2ba1ce323b6d 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/tonga_ih.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/tonga_ih.c
> > @@ -195,19 +195,30 @@ static u32 tonga_ih_get_wptr(struct amdgpu_device *adev,
> >
> > wptr = le32_to_cpu(*ih->wptr_cpu);
> >
> > - if (REG_GET_FIELD(wptr, IH_RB_WPTR, RB_OVERFLOW)) {
> > - wptr = REG_SET_FIELD(wptr, IH_RB_WPTR, RB_OVERFLOW, 0);
> > - /* When a ring buffer overflow happen start parsing interrupt
> > - * from the last not overwritten vector (wptr + 16). Hopefully
> > - * this should allow us to catchup.
> > - */
> > - dev_warn(adev->dev, "IH ring buffer overflow (0x%08X, 0x%08X, 0x%08X)\n",
> > - wptr, ih->rptr, (wptr + 16) & ih->ptr_mask);
> > - ih->rptr = (wptr + 16) & ih->ptr_mask;
> > - tmp = RREG32(mmIH_RB_CNTL);
> > - tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 1);
> > - WREG32(mmIH_RB_CNTL, tmp);
> > - }
> > + if (!REG_GET_FIELD(wptr, IH_RB_WPTR, RB_OVERFLOW))
> > + goto out;
> > +
> > + /* Double check that the overflow wasn't already cleared. */
> > + wptr = RREG32(mmIH_RB_WPTR);
> > +
> > + if (!REG_GET_FIELD(wptr, IH_RB_WPTR, RB_OVERFLOW))
> > + goto out;
> > +
> > + wptr = REG_SET_FIELD(wptr, IH_RB_WPTR, RB_OVERFLOW, 0);
> > +
> > + /* When a ring buffer overflow happen start parsing interrupt
> > + * from the last not overwritten vector (wptr + 16). Hopefully
> > + * this should allow us to catchup.
> > + */
> > +
> > + dev_warn(adev->dev, "IH ring buffer overflow (0x%08X, 0x%08X, 0x%08X)\n",
> > + wptr, ih->rptr, (wptr + 16) & ih->ptr_mask);
> > + ih->rptr = (wptr + 16) & ih->ptr_mask;
> > + tmp = RREG32(mmIH_RB_CNTL);
> > + tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 1);
> > + WREG32(mmIH_RB_CNTL, tmp);
> > +
> > +out:
> > return (wptr & ih->ptr_mask);
> > }
> >
>
> _______________________________________________
> amd-gfx mailing list
> [email protected]
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx