2022-03-02 23:28:55

by Akhil P Oommen

[permalink] [raw]
Subject: [PATCH v1 00/10] Support for GMU coredump and some related improvements

Major enhancement in this series is the support for a minimal gmu coredump
which can be captured inline instead of through our usual recover worker. It
is helpful in the case of gmu errors during gpu wake-up/suspend path and
helps to capture a snapshot of gmu before we do a suspend. I had to introduce
a lock to synchronize the crashstate because the runtime-suspend can happen
from an asynchronous RPM thread.

Apart from this, there are some improvements to gracefully handle gmu
errors by propagating the error back to the parent or by retrying. There are
also a few patches to fix some trivial bugs in the related code.


Akhil P Oommen (10):
drm/msm/a6xx: Add helper to check smmu is stalled
drm/msm/a6xx: Send NMI to gmu when it is hung
drm/msm/a6xx: Avoid gmu lock in pm ops
drm/msm/a6xx: Enhance debugging of gmu faults
drm/msm: Do recovery on hw_init failure
drm/msm/a6xx: Propagate OOB set error
drm/msm/adreno: Retry on gpu resume failure
drm/msm/a6xx: Remove clk votes on failure
drm/msm: Remove pm_runtime_get() from msm_job_run()
drm/msm/a6xx: Free gmu_debug crashstate bo

drivers/gpu/drm/msm/adreno/a6xx_gmu.c | 89 +++++++++++++++++++++++------
drivers/gpu/drm/msm/adreno/a6xx_gmu.h | 1 +
drivers/gpu/drm/msm/adreno/a6xx_gpu.c | 31 +++++++---
drivers/gpu/drm/msm/adreno/a6xx_gpu.h | 4 +-
drivers/gpu/drm/msm/adreno/a6xx_gpu_state.c | 79 +++++++++++++++++++++----
drivers/gpu/drm/msm/adreno/adreno_device.c | 10 +++-
drivers/gpu/drm/msm/adreno/adreno_gpu.c | 10 +++-
drivers/gpu/drm/msm/adreno/adreno_gpu.h | 2 +
drivers/gpu/drm/msm/msm_gpu.c | 28 ++++++++-
drivers/gpu/drm/msm/msm_gpu.h | 11 ++--
drivers/gpu/drm/msm/msm_ringbuffer.c | 4 --
11 files changed, 218 insertions(+), 51 deletions(-)

--
2.7.4


2022-03-02 23:57:37

by Akhil P Oommen

[permalink] [raw]
Subject: [PATCH v1 03/10] drm/msm/a6xx: Avoid gmu lock in pm ops

We don't really need the gmu lock in the runtime pm ops because these
operations are already serialized, both with each other and with the other
paths where we take this lock.
This patch will help to simplify the locking order when we introduce
crashstate_lock in the upcoming patch.

Signed-off-by: Akhil P Oommen <[email protected]>
---

drivers/gpu/drm/msm/adreno/a6xx_gpu.c | 4 ----
1 file changed, 4 deletions(-)

diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
index 3faf551..8c3cb31 100644
--- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
+++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
@@ -1530,9 +1530,7 @@ static int a6xx_pm_resume(struct msm_gpu *gpu)

trace_msm_gpu_resume(0);

- mutex_lock(&a6xx_gpu->gmu.lock);
ret = a6xx_gmu_resume(a6xx_gpu);
- mutex_unlock(&a6xx_gpu->gmu.lock);
if (ret)
return ret;

@@ -1555,9 +1553,7 @@ static int a6xx_pm_suspend(struct msm_gpu *gpu)

msm_devfreq_suspend(gpu);

- mutex_lock(&a6xx_gpu->gmu.lock);
ret = a6xx_gmu_stop(a6xx_gpu);
- mutex_unlock(&a6xx_gpu->gmu.lock);
if (ret)
return ret;

--
2.7.4

2022-03-03 00:02:20

by Akhil P Oommen

[permalink] [raw]
Subject: [PATCH v1 08/10] drm/msm/a6xx: Remove clk votes on failure

Remove vote on clks on gpu resume failure.

Signed-off-by: Akhil P Oommen <[email protected]>
---

drivers/gpu/drm/msm/adreno/a6xx_gmu.c | 1 +
1 file changed, 1 insertion(+)

diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gmu.c b/drivers/gpu/drm/msm/adreno/a6xx_gmu.c
index 66ae509..e90359f 100644
--- a/drivers/gpu/drm/msm/adreno/a6xx_gmu.c
+++ b/drivers/gpu/drm/msm/adreno/a6xx_gmu.c
@@ -1033,6 +1033,7 @@ int a6xx_gmu_resume(struct a6xx_gpu *a6xx_gpu)
disable_irq(gmu->gmu_irq);
a6xx_gmu_inline_coredump(gmu);
a6xx_rpmh_stop(gmu);
+ clk_bulk_disable_unprepare(gmu->nr_clocks, gmu->clocks);
pm_runtime_put(gmu->gxpd);
pm_runtime_put(gmu->dev);
}
--
2.7.4

2022-03-03 00:29:51

by Akhil P Oommen

[permalink] [raw]
Subject: [PATCH v1 02/10] drm/msm/a6xx: Send NMI to gmu when it is hung

While capturing gmu state, first send an NMI to gmu when it is hung.
This helps to move gmu to a safe state.

Signed-off-by: Akhil P Oommen <[email protected]>
---

drivers/gpu/drm/msm/adreno/a6xx_gmu.c | 37 +++++++++++++++++++++++++++++
drivers/gpu/drm/msm/adreno/a6xx_gmu.h | 1 +
drivers/gpu/drm/msm/adreno/a6xx_gpu_state.c | 14 ++++++++++-
3 files changed, 51 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gmu.c b/drivers/gpu/drm/msm/adreno/a6xx_gmu.c
index 3e325e2..f208a81 100644
--- a/drivers/gpu/drm/msm/adreno/a6xx_gmu.c
+++ b/drivers/gpu/drm/msm/adreno/a6xx_gmu.c
@@ -14,6 +14,44 @@
#include "msm_gpu_trace.h"
#include "msm_mmu.h"

+/*
+ * Raise an NMI on the GMU so that it saves its internal state to ddr.
+ *
+ * Nothing is done when GX is on and the SMMU is stalled, or when any of the
+ * 0xE00 bits are already set in GMU_CM3_FW_INIT_RESULT (reset already active).
+ */
+void a6xx_gmu_send_nmi(struct a6xx_gmu *gmu)
+{
+	struct a6xx_gpu *a6xx_gpu = container_of(gmu, struct a6xx_gpu, gmu);
+	struct adreno_gpu *adreno_gpu = &a6xx_gpu->base;
+	struct msm_gpu *gpu = &adreno_gpu->base;
+	u32 val;
+
+	if (a6xx_gmu_gx_is_on(gmu) && a6xx_is_smmu_stalled(gpu)) {
+		DRM_DEV_ERROR(gmu->dev,
+			"Skipping GMU NMI since SMMU is stalled\n");
+		/* Actually skip: without this the NMI was still sent */
+		return;
+	}
+
+	/* Don't retrigger NMI if gmu reset is already active */
+	val = gmu_read(gmu, REG_A6XX_GMU_CM3_FW_INIT_RESULT);
+	if (val & 0xE00)
+		return;
+
+	/* Mask all interrupts from GMU first */
+	gmu_write(gmu, REG_A6XX_GMU_GMU2HOST_INTR_MASK, 0xFFFFFFFF);
+
+	/* Trigger NMI to make gmu save its internal state to ddr */
+	val = gmu_read(gmu, REG_A6XX_GMU_CM3_CFG);
+	gmu_write(gmu, REG_A6XX_GMU_CM3_CFG, val | BIT(9));
+
+	/* Barrier to ensure write is posted before we proceed */
+	wmb();
+
+	/* Small delay to ensure state copy to ddr is complete at GMU */
+	udelay(200);
+}
+
static void a6xx_gmu_fault(struct a6xx_gmu *gmu)
{
struct a6xx_gpu *a6xx_gpu = container_of(gmu, struct a6xx_gpu, gmu);
@@ -790,6 +821,12 @@ static int a6xx_gmu_fw_start(struct a6xx_gmu *gmu, unsigned int state)
gmu_write(gmu, REG_A6XX_GMU_CM3_FW_INIT_RESULT, 0);
gmu_write(gmu, REG_A6XX_GMU_CM3_BOOT_CONFIG, 0x02);

+ /*
+ * Make sure that the NMI bit is cleared by configuring the reset value
+ * here
+ */
+ gmu_write(gmu, REG_A6XX_GMU_CM3_CFG, 0x4052);
+
/* Write the iova of the HFI table */
gmu_write(gmu, REG_A6XX_GMU_HFI_QTBL_ADDR, gmu->hfi.iova);
gmu_write(gmu, REG_A6XX_GMU_HFI_QTBL_INFO, 1);
diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gmu.h b/drivers/gpu/drm/msm/adreno/a6xx_gmu.h
index 84bd516..4228ec1 100644
--- a/drivers/gpu/drm/msm/adreno/a6xx_gmu.h
+++ b/drivers/gpu/drm/msm/adreno/a6xx_gmu.h
@@ -186,5 +186,6 @@ int a6xx_hfi_set_freq(struct a6xx_gmu *gmu, int index);

bool a6xx_gmu_gx_is_on(struct a6xx_gmu *gmu);
bool a6xx_gmu_sptprac_is_on(struct a6xx_gmu *gmu);
+void a6xx_gmu_send_nmi(struct a6xx_gmu *gmu);

#endif
diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu_state.c b/drivers/gpu/drm/msm/adreno/a6xx_gpu_state.c
index 7de9d2f..09b2ff0 100644
--- a/drivers/gpu/drm/msm/adreno/a6xx_gpu_state.c
+++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu_state.c
@@ -964,6 +964,23 @@ static void a6xx_get_indexed_registers(struct msm_gpu *gpu,
a6xx_state->nr_indexed_regs = count;
}

+/*
+ * Capture the GMU registers into a6xx_state. When the GMU is flagged as
+ * hung, send it an NMI first and only then read the registers.
+ */
+static void a6xx_get_gmu_state(struct msm_gpu *gpu,
+		struct a6xx_gpu_state *a6xx_state)
+{
+	struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
+	struct a6xx_gpu *a6xx_gpu = to_a6xx_gpu(adreno_gpu);
+	struct a6xx_gmu *gmu = &a6xx_gpu->gmu;
+
+	if (gmu->hung)
+		a6xx_gmu_send_nmi(gmu);
+
+	a6xx_get_gmu_registers(gpu, a6xx_state);
+}
+
struct msm_gpu_state *a6xx_gpu_state_get(struct msm_gpu *gpu)
{
struct a6xx_crashdumper _dumper = { 0 }, *dumper = NULL;
@@ -980,7 +992,7 @@ struct msm_gpu_state *a6xx_gpu_state_get(struct msm_gpu *gpu)
/* Get the generic state from the adreno core */
adreno_gpu_state_get(gpu, &a6xx_state->base);

- a6xx_get_gmu_registers(gpu, a6xx_state);
+ a6xx_get_gmu_state(gpu, a6xx_state);

a6xx_state->gmu_log = a6xx_snapshot_gmu_bo(a6xx_state, &a6xx_gpu->gmu.log);
a6xx_state->gmu_hfi = a6xx_snapshot_gmu_bo(a6xx_state, &a6xx_gpu->gmu.hfi);
--
2.7.4

2022-03-08 23:32:24

by kernel test robot

[permalink] [raw]
Subject: Re: [PATCH v1 02/10] drm/msm/a6xx: Send NMI to gmu when it is hung

Hi Akhil,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on drm/drm-next]
[also build test WARNING on drm-intel/for-linux-next drm-tip/drm-tip drm-exynos/exynos-drm-next tegra-drm/drm/tegra/for-next v5.17-rc7 next-20220308]
[cannot apply to airlied/drm-next]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url: https://github.com/0day-ci/linux/commits/Akhil-P-Oommen/Support-for-GMU-coredump-and-some-related-improvements/20220303-013028
base: git://anongit.freedesktop.org/drm/drm drm-next
config: riscv-randconfig-r042-20220307 (https://download.01.org/0day-ci/archive/20220308/[email protected]/config)
compiler: clang version 15.0.0 (https://github.com/llvm/llvm-project d271fc04d5b97b12e6b797c6067d3c96a8d7470e)
reproduce (this is a W=1 build):
wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
chmod +x ~/bin/make.cross
# install riscv cross compiling tool for clang build
# apt-get install binutils-riscv64-linux-gnu
# https://github.com/0day-ci/linux/commit/23953efc645803299a93f178e9a32f2ae97dae39
git remote add linux-review https://github.com/0day-ci/linux
git fetch --no-tags linux-review Akhil-P-Oommen/Support-for-GMU-coredump-and-some-related-improvements/20220303-013028
git checkout 23953efc645803299a93f178e9a32f2ae97dae39
# save the config file to linux build tree
mkdir build_dir
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross W=1 O=build_dir ARCH=riscv SHELL=/bin/bash drivers/gpu/drm/msm/

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <[email protected]>

All warnings (new ones prefixed by >>):

>> drivers/gpu/drm/msm/adreno/a6xx_gpu_state.c:967:6: warning: no previous prototype for function 'a6xx_get_gmu_state' [-Wmissing-prototypes]
void a6xx_get_gmu_state(struct msm_gpu *gpu, struct a6xx_gpu_state *a6xx_state)
^
drivers/gpu/drm/msm/adreno/a6xx_gpu_state.c:967:1: note: declare 'static' if the function is not intended to be used outside of this translation unit
void a6xx_get_gmu_state(struct msm_gpu *gpu, struct a6xx_gpu_state *a6xx_state)
^
static
1 warning generated.


vim +/a6xx_get_gmu_state +967 drivers/gpu/drm/msm/adreno/a6xx_gpu_state.c

966
> 967 void a6xx_get_gmu_state(struct msm_gpu *gpu, struct a6xx_gpu_state *a6xx_state)
968 {
969 struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
970 struct a6xx_gpu *a6xx_gpu = to_a6xx_gpu(adreno_gpu);
971 struct a6xx_gmu *gmu = &a6xx_gpu->gmu;
972
973 if (gmu->hung)
974 a6xx_gmu_send_nmi(gmu);
975
976 a6xx_get_gmu_registers(gpu, a6xx_state);
977 }
978

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/[email protected]