2020-06-14 06:50:16

by Navid Emamdoost

[permalink] [raw]
Subject: [PATCH] drm/etnaviv: fix ref count leak via pm_runtime_get_sync

in etnaviv_gpu_submit, etnaviv_gpu_recover_hang, etnaviv_gpu_debugfs,
and etnaviv_gpu_init the call to pm_runtime_get_sync increments the
counter even in case of failure, leading to incorrect ref count.
In case of failure, decrement the ref count before returning.

Signed-off-by: Navid Emamdoost <[email protected]>
---
drivers/gpu/drm/etnaviv/etnaviv_gpu.c | 14 ++++++++++----
1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/etnaviv/etnaviv_gpu.c b/drivers/gpu/drm/etnaviv/etnaviv_gpu.c
index a31eeff2b297..16f5bc65771a 100644
--- a/drivers/gpu/drm/etnaviv/etnaviv_gpu.c
+++ b/drivers/gpu/drm/etnaviv/etnaviv_gpu.c
@@ -722,7 +722,7 @@ int etnaviv_gpu_init(struct etnaviv_gpu *gpu)
ret = pm_runtime_get_sync(gpu->dev);
if (ret < 0) {
dev_err(gpu->dev, "Failed to enable GPU power domain\n");
- return ret;
+ goto pm_put;
}

etnaviv_hw_identify(gpu);
@@ -819,6 +819,7 @@ int etnaviv_gpu_init(struct etnaviv_gpu *gpu)

fail:
pm_runtime_mark_last_busy(gpu->dev);
+pm_put:
pm_runtime_put_autosuspend(gpu->dev);

return ret;
@@ -859,7 +860,7 @@ int etnaviv_gpu_debugfs(struct etnaviv_gpu *gpu, struct seq_file *m)

ret = pm_runtime_get_sync(gpu->dev);
if (ret < 0)
- return ret;
+ goto pm_put;

dma_lo = gpu_read(gpu, VIVS_FE_DMA_LOW);
dma_hi = gpu_read(gpu, VIVS_FE_DMA_HIGH);
@@ -1003,6 +1004,7 @@ int etnaviv_gpu_debugfs(struct etnaviv_gpu *gpu, struct seq_file *m)
ret = 0;

pm_runtime_mark_last_busy(gpu->dev);
+pm_put:
pm_runtime_put_autosuspend(gpu->dev);

return ret;
@@ -1016,7 +1018,7 @@ void etnaviv_gpu_recover_hang(struct etnaviv_gpu *gpu)
dev_err(gpu->dev, "recover hung GPU!\n");

if (pm_runtime_get_sync(gpu->dev) < 0)
- return;
+ goto pm_put;

mutex_lock(&gpu->lock);

@@ -1035,6 +1037,7 @@ void etnaviv_gpu_recover_hang(struct etnaviv_gpu *gpu)

mutex_unlock(&gpu->lock);
pm_runtime_mark_last_busy(gpu->dev);
+pm_put:
pm_runtime_put_autosuspend(gpu->dev);
}

@@ -1308,8 +1311,10 @@ struct dma_fence *etnaviv_gpu_submit(struct etnaviv_gem_submit *submit)

if (!submit->runtime_resumed) {
ret = pm_runtime_get_sync(gpu->dev);
- if (ret < 0)
+ if (ret < 0) {
+ pm_runtime_put(gpu->dev);
return NULL;
+ }
submit->runtime_resumed = true;
}

@@ -1326,6 +1331,7 @@ struct dma_fence *etnaviv_gpu_submit(struct etnaviv_gem_submit *submit)
ret = event_alloc(gpu, nr_events, event);
if (ret) {
DRM_ERROR("no free events\n");
+ pm_runtime_put(gpu->dev);
return NULL;
}

--
2.17.1


2020-06-14 09:50:31

by Andy Shevchenko

[permalink] [raw]
Subject: Re: [PATCH] drm/etnaviv: fix ref count leak via pm_runtime_get_sync

On Sun, Jun 14, 2020 at 9:48 AM Navid Emamdoost
<[email protected]> wrote:

...

> + if (ret < 0) {

> + pm_runtime_put(gpu->dev);

Please, in all your patches fix this to be _put_noidle(). We wouldn't
bear the flag day of fixing these parts again.
Yes, I know that *now* behaviour is the same, but calling put here is
slightly inconsistent.

...

> + pm_runtime_put(gpu->dev);

--
With Best Regards,
Andy Shevchenko

2020-06-15 06:15:21

by Navid Emamdoost

[permalink] [raw]
Subject: [PATCH v2] drm/etnaviv: fix ref count leak via pm_runtime_get_sync

in etnaviv_gpu_submit, etnaviv_gpu_recover_hang, etnaviv_gpu_debugfs,
and etnaviv_gpu_init the call to pm_runtime_get_sync increments the
counter even in case of failure, leading to incorrect ref count.
In case of failure, decrement the ref count before returning.

Signed-off-by: Navid Emamdoost <[email protected]>
---
Changes in v2:
- replace pm_runtime_put with pm_runtime_put_noidle
---
drivers/gpu/drm/etnaviv/etnaviv_gpu.c | 14 ++++++++++----
1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/etnaviv/etnaviv_gpu.c b/drivers/gpu/drm/etnaviv/etnaviv_gpu.c
index a31eeff2b297..7c9f3f9ba123 100644
--- a/drivers/gpu/drm/etnaviv/etnaviv_gpu.c
+++ b/drivers/gpu/drm/etnaviv/etnaviv_gpu.c
@@ -722,7 +722,7 @@ int etnaviv_gpu_init(struct etnaviv_gpu *gpu)
ret = pm_runtime_get_sync(gpu->dev);
if (ret < 0) {
dev_err(gpu->dev, "Failed to enable GPU power domain\n");
- return ret;
+ goto pm_put;
}

etnaviv_hw_identify(gpu);
@@ -819,6 +819,7 @@ int etnaviv_gpu_init(struct etnaviv_gpu *gpu)

fail:
pm_runtime_mark_last_busy(gpu->dev);
+pm_put:
pm_runtime_put_autosuspend(gpu->dev);

return ret;
@@ -859,7 +860,7 @@ int etnaviv_gpu_debugfs(struct etnaviv_gpu *gpu, struct seq_file *m)

ret = pm_runtime_get_sync(gpu->dev);
if (ret < 0)
- return ret;
+ goto pm_put;

dma_lo = gpu_read(gpu, VIVS_FE_DMA_LOW);
dma_hi = gpu_read(gpu, VIVS_FE_DMA_HIGH);
@@ -1003,6 +1004,7 @@ int etnaviv_gpu_debugfs(struct etnaviv_gpu *gpu, struct seq_file *m)
ret = 0;

pm_runtime_mark_last_busy(gpu->dev);
+pm_put:
pm_runtime_put_autosuspend(gpu->dev);

return ret;
@@ -1016,7 +1018,7 @@ void etnaviv_gpu_recover_hang(struct etnaviv_gpu *gpu)
dev_err(gpu->dev, "recover hung GPU!\n");

if (pm_runtime_get_sync(gpu->dev) < 0)
- return;
+ goto pm_put;

mutex_lock(&gpu->lock);

@@ -1035,6 +1037,7 @@ void etnaviv_gpu_recover_hang(struct etnaviv_gpu *gpu)

mutex_unlock(&gpu->lock);
pm_runtime_mark_last_busy(gpu->dev);
+pm_put:
pm_runtime_put_autosuspend(gpu->dev);
}

@@ -1308,8 +1311,10 @@ struct dma_fence *etnaviv_gpu_submit(struct etnaviv_gem_submit *submit)

if (!submit->runtime_resumed) {
ret = pm_runtime_get_sync(gpu->dev);
- if (ret < 0)
+ if (ret < 0) {
+ pm_runtime_put_noidle(gpu->dev);
return NULL;
+ }
submit->runtime_resumed = true;
}

@@ -1326,6 +1331,7 @@ struct dma_fence *etnaviv_gpu_submit(struct etnaviv_gem_submit *submit)
ret = event_alloc(gpu, nr_events, event);
if (ret) {
DRM_ERROR("no free events\n");
+ pm_runtime_put_noidle(gpu->dev);
return NULL;
}

--
2.17.1

2020-06-15 06:18:27

by Navid Emamdoost

[permalink] [raw]
Subject: Re: [PATCH] drm/etnaviv: fix ref count leak via pm_runtime_get_sync

On Sun, Jun 14, 2020 at 4:48 AM Andy Shevchenko
<[email protected]> wrote:
>
> On Sun, Jun 14, 2020 at 9:48 AM Navid Emamdoost
> <[email protected]> wrote:
>
> ...
>
> > + if (ret < 0) {
>
> > + pm_runtime_put(gpu->dev);
>
> Please, in all your patches fix this to be _put_noidle(). We wouldn't
> bear the flag day of fixing these parts again.
> Yes, I know that *now* behaviour is the same, but calling put here is
> slightly inconsistent.

v2 is sent.

>
> ...
>
> > + pm_runtime_put(gpu->dev);
>
> --
> With Best Regards,
> Andy Shevchenko



--
Navid.

2020-06-15 16:02:17

by Markus Elfring

[permalink] [raw]
Subject: Re: [PATCH v2] drm/etnaviv: fix ref count leak via pm_runtime_get_sync


> In case of failure, decrement the ref count before returning.

Can it be nicer to use the term “reference count” here?

Will the tag “Fixes” become helpful for the commit message?



> +++ b/drivers/gpu/drm/etnaviv/etnaviv_gpu.c

> @@ -1326,6 +1331,7 @@ struct dma_fence *etnaviv_gpu_submit(struct etnaviv_gem_submit *submit)
> ret = event_alloc(gpu, nr_events, event);
> if (ret) {
> DRM_ERROR("no free events\n");
> + pm_runtime_put_noidle(gpu->dev);
> return NULL;
> }

I suggest to move a bit of exception handling code to the end of
this function implementation so that it can be better reused after
the addition of a jump target like “put_runtime”.

Regards,
Markus

2020-06-17 09:56:05

by Lucas Stach

[permalink] [raw]
Subject: Re: [PATCH v2] drm/etnaviv: fix ref count leak via pm_runtime_get_sync

Hi Navid,

Am Montag, den 15.06.2020, 01:12 -0500 schrieb Navid Emamdoost:
> in etnaviv_gpu_submit, etnaviv_gpu_recover_hang, etnaviv_gpu_debugfs,
> and etnaviv_gpu_init the call to pm_runtime_get_sync increments the
> counter even in case of failure, leading to incorrect ref count.
> In case of failure, decrement the ref count before returning.

While that change is correct with the current API, may I ask the
question why the way this API works is considered reasonable? A API
call that fails, but still changes internal state and expects the
caller to clean up the mess it not really what I would consider fool-
proof API design. Is there a specific reason why it is done this way
and not handled internally?

Regards,
Lucas

> Signed-off-by: Navid Emamdoost <[email protected]>
> ---
> Changes in v2:
> - replace pm_runtime_put with pm_runtime_put_noidle
> ---
> drivers/gpu/drm/etnaviv/etnaviv_gpu.c | 14 ++++++++++----
> 1 file changed, 10 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/gpu/drm/etnaviv/etnaviv_gpu.c b/drivers/gpu/drm/etnaviv/etnaviv_gpu.c
> index a31eeff2b297..7c9f3f9ba123 100644
> --- a/drivers/gpu/drm/etnaviv/etnaviv_gpu.c
> +++ b/drivers/gpu/drm/etnaviv/etnaviv_gpu.c
> @@ -722,7 +722,7 @@ int etnaviv_gpu_init(struct etnaviv_gpu *gpu)
> ret = pm_runtime_get_sync(gpu->dev);
> if (ret < 0) {
> dev_err(gpu->dev, "Failed to enable GPU power domain\n");
> - return ret;
> + goto pm_put;
> }
>
> etnaviv_hw_identify(gpu);
> @@ -819,6 +819,7 @@ int etnaviv_gpu_init(struct etnaviv_gpu *gpu)
>
> fail:
> pm_runtime_mark_last_busy(gpu->dev);
> +pm_put:
> pm_runtime_put_autosuspend(gpu->dev);
>
> return ret;
> @@ -859,7 +860,7 @@ int etnaviv_gpu_debugfs(struct etnaviv_gpu *gpu, struct seq_file *m)
>
> ret = pm_runtime_get_sync(gpu->dev);
> if (ret < 0)
> - return ret;
> + goto pm_put;
>
> dma_lo = gpu_read(gpu, VIVS_FE_DMA_LOW);
> dma_hi = gpu_read(gpu, VIVS_FE_DMA_HIGH);
> @@ -1003,6 +1004,7 @@ int etnaviv_gpu_debugfs(struct etnaviv_gpu *gpu, struct seq_file *m)
> ret = 0;
>
> pm_runtime_mark_last_busy(gpu->dev);
> +pm_put:
> pm_runtime_put_autosuspend(gpu->dev);
>
> return ret;
> @@ -1016,7 +1018,7 @@ void etnaviv_gpu_recover_hang(struct etnaviv_gpu *gpu)
> dev_err(gpu->dev, "recover hung GPU!\n");
>
> if (pm_runtime_get_sync(gpu->dev) < 0)
> - return;
> + goto pm_put;
>
> mutex_lock(&gpu->lock);
>
> @@ -1035,6 +1037,7 @@ void etnaviv_gpu_recover_hang(struct etnaviv_gpu *gpu)
>
> mutex_unlock(&gpu->lock);
> pm_runtime_mark_last_busy(gpu->dev);
> +pm_put:
> pm_runtime_put_autosuspend(gpu->dev);
> }
>
> @@ -1308,8 +1311,10 @@ struct dma_fence *etnaviv_gpu_submit(struct etnaviv_gem_submit *submit)
>
> if (!submit->runtime_resumed) {
> ret = pm_runtime_get_sync(gpu->dev);
> - if (ret < 0)
> + if (ret < 0) {
> + pm_runtime_put_noidle(gpu->dev);
> return NULL;
> + }
> submit->runtime_resumed = true;
> }
>
> @@ -1326,6 +1331,7 @@ struct dma_fence *etnaviv_gpu_submit(struct etnaviv_gem_submit *submit)
> ret = event_alloc(gpu, nr_events, event);
> if (ret) {
> DRM_ERROR("no free events\n");
> + pm_runtime_put_noidle(gpu->dev);
> return NULL;
> }
>

2020-06-17 18:01:42

by Navid Emamdoost

[permalink] [raw]
Subject: Re: [PATCH v2] drm/etnaviv: fix ref count leak via pm_runtime_get_sync

Hi Lucas,


On Wed, Jun 17, 2020 at 4:53 AM Lucas Stach <[email protected]> wrote:
>
> Hi Navid,
>
> Am Montag, den 15.06.2020, 01:12 -0500 schrieb Navid Emamdoost:
> > in etnaviv_gpu_submit, etnaviv_gpu_recover_hang, etnaviv_gpu_debugfs,
> > and etnaviv_gpu_init the call to pm_runtime_get_sync increments the
> > counter even in case of failure, leading to incorrect ref count.
> > In case of failure, decrement the ref count before returning.
>
> While that change is correct with the current API, may I ask the
> question why the way this API works is considered reasonable? A API
> call that fails, but still changes internal state and expects the
> caller to clean up the mess it not really what I would consider fool-
> proof API design. Is there a specific reason why it is done this way
> and not handled internally?

I share the same concern with you on the way this API is working now.
To the best of my knowledge, there are ongoing discussions on this
issue:

https://lkml.org/lkml/2020/6/14/76
https://patchwork.ozlabs.org/project/linux-tegra/patch/[email protected]/

>
> Regards,
> Lucas
>
> > Signed-off-by: Navid Emamdoost <[email protected]>
> > ---
> > Changes in v2:
> > - replace pm_runtime_put with pm_runtime_put_noidle
> > ---
> > drivers/gpu/drm/etnaviv/etnaviv_gpu.c | 14 ++++++++++----
> > 1 file changed, 10 insertions(+), 4 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/etnaviv/etnaviv_gpu.c b/drivers/gpu/drm/etnaviv/etnaviv_gpu.c
> > index a31eeff2b297..7c9f3f9ba123 100644
> > --- a/drivers/gpu/drm/etnaviv/etnaviv_gpu.c
> > +++ b/drivers/gpu/drm/etnaviv/etnaviv_gpu.c
> > @@ -722,7 +722,7 @@ int etnaviv_gpu_init(struct etnaviv_gpu *gpu)
> > ret = pm_runtime_get_sync(gpu->dev);
> > if (ret < 0) {
> > dev_err(gpu->dev, "Failed to enable GPU power domain\n");
> > - return ret;
> > + goto pm_put;
> > }
> >
> > etnaviv_hw_identify(gpu);
> > @@ -819,6 +819,7 @@ int etnaviv_gpu_init(struct etnaviv_gpu *gpu)
> >
> > fail:
> > pm_runtime_mark_last_busy(gpu->dev);
> > +pm_put:
> > pm_runtime_put_autosuspend(gpu->dev);
> >
> > return ret;
> > @@ -859,7 +860,7 @@ int etnaviv_gpu_debugfs(struct etnaviv_gpu *gpu, struct seq_file *m)
> >
> > ret = pm_runtime_get_sync(gpu->dev);
> > if (ret < 0)
> > - return ret;
> > + goto pm_put;
> >
> > dma_lo = gpu_read(gpu, VIVS_FE_DMA_LOW);
> > dma_hi = gpu_read(gpu, VIVS_FE_DMA_HIGH);
> > @@ -1003,6 +1004,7 @@ int etnaviv_gpu_debugfs(struct etnaviv_gpu *gpu, struct seq_file *m)
> > ret = 0;
> >
> > pm_runtime_mark_last_busy(gpu->dev);
> > +pm_put:
> > pm_runtime_put_autosuspend(gpu->dev);
> >
> > return ret;
> > @@ -1016,7 +1018,7 @@ void etnaviv_gpu_recover_hang(struct etnaviv_gpu *gpu)
> > dev_err(gpu->dev, "recover hung GPU!\n");
> >
> > if (pm_runtime_get_sync(gpu->dev) < 0)
> > - return;
> > + goto pm_put;
> >
> > mutex_lock(&gpu->lock);
> >
> > @@ -1035,6 +1037,7 @@ void etnaviv_gpu_recover_hang(struct etnaviv_gpu *gpu)
> >
> > mutex_unlock(&gpu->lock);
> > pm_runtime_mark_last_busy(gpu->dev);
> > +pm_put:
> > pm_runtime_put_autosuspend(gpu->dev);
> > }
> >
> > @@ -1308,8 +1311,10 @@ struct dma_fence *etnaviv_gpu_submit(struct etnaviv_gem_submit *submit)
> >
> > if (!submit->runtime_resumed) {
> > ret = pm_runtime_get_sync(gpu->dev);
> > - if (ret < 0)
> > + if (ret < 0) {
> > + pm_runtime_put_noidle(gpu->dev);
> > return NULL;
> > + }
> > submit->runtime_resumed = true;
> > }
> >
> > @@ -1326,6 +1331,7 @@ struct dma_fence *etnaviv_gpu_submit(struct etnaviv_gem_submit *submit)
> > ret = event_alloc(gpu, nr_events, event);
> > if (ret) {
> > DRM_ERROR("no free events\n");
> > + pm_runtime_put_noidle(gpu->dev);
> > return NULL;
> > }
> >
>


--
Navid.