2023-06-16 14:59:28

by Arnd Bergmann

[permalink] [raw]
Subject: [PATCH 1/2] media: verisilicon: fix excessive stack usage

From: Arnd Bergmann <[email protected]>

In some configurations, gcc decides not to inline the register accessor functions,
which in turn leads to lots of temporary hantro_reg structures on the stack that
cannot be eliminated because they escape into an uninlined function:

drivers/media/platform/verisilicon/rockchip_vpu981_hw_av1_dec.c:1022:1: warning: the frame size of 1112 bytes is larger than 1024 bytes [-Wframe-larger-than=]

Mark all of these as __always_inline so the compiler is able to completely
eliminate the temporary structures instead, which brings the stack usage
back down to just the normal local variables.

Reported-by: kernel test robot <[email protected]>
Closes: https://lore.kernel.org/oe-kbuild-all/[email protected]/
Fixes: 727a400686a2c ("media: verisilicon: Add Rockchip AV1 decoder")
Signed-off-by: Arnd Bergmann <[email protected]>
---
drivers/media/platform/verisilicon/hantro.h | 22 ++++++++++-----------
1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/drivers/media/platform/verisilicon/hantro.h b/drivers/media/platform/verisilicon/hantro.h
index 6523ffb748812..6c5e56ce5b351 100644
--- a/drivers/media/platform/verisilicon/hantro.h
+++ b/drivers/media/platform/verisilicon/hantro.h
@@ -370,26 +370,26 @@ extern int hantro_debug;
pr_err("%s:%d: " fmt, __func__, __LINE__, ##args)

/* Structure access helpers. */
-static inline struct hantro_ctx *fh_to_ctx(struct v4l2_fh *fh)
+static __always_inline struct hantro_ctx *fh_to_ctx(struct v4l2_fh *fh)
{
return container_of(fh, struct hantro_ctx, fh);
}

/* Register accessors. */
-static inline void vepu_write_relaxed(struct hantro_dev *vpu,
+static __always_inline void vepu_write_relaxed(struct hantro_dev *vpu,
u32 val, u32 reg)
{
vpu_debug(6, "0x%04x = 0x%08x\n", reg / 4, val);
writel_relaxed(val, vpu->enc_base + reg);
}

-static inline void vepu_write(struct hantro_dev *vpu, u32 val, u32 reg)
+static __always_inline void vepu_write(struct hantro_dev *vpu, u32 val, u32 reg)
{
vpu_debug(6, "0x%04x = 0x%08x\n", reg / 4, val);
writel(val, vpu->enc_base + reg);
}

-static inline u32 vepu_read(struct hantro_dev *vpu, u32 reg)
+static __always_inline u32 vepu_read(struct hantro_dev *vpu, u32 reg)
{
u32 val = readl(vpu->enc_base + reg);

@@ -397,27 +397,27 @@ static inline u32 vepu_read(struct hantro_dev *vpu, u32 reg)
return val;
}

-static inline void vdpu_write_relaxed(struct hantro_dev *vpu,
+static __always_inline void vdpu_write_relaxed(struct hantro_dev *vpu,
u32 val, u32 reg)
{
vpu_debug(6, "0x%04x = 0x%08x\n", reg / 4, val);
writel_relaxed(val, vpu->dec_base + reg);
}

-static inline void vdpu_write(struct hantro_dev *vpu, u32 val, u32 reg)
+static __always_inline void vdpu_write(struct hantro_dev *vpu, u32 val, u32 reg)
{
vpu_debug(6, "0x%04x = 0x%08x\n", reg / 4, val);
writel(val, vpu->dec_base + reg);
}

-static inline void hantro_write_addr(struct hantro_dev *vpu,
+static __always_inline void hantro_write_addr(struct hantro_dev *vpu,
unsigned long offset,
dma_addr_t addr)
{
vdpu_write(vpu, addr & 0xffffffff, offset);
}

-static inline u32 vdpu_read(struct hantro_dev *vpu, u32 reg)
+static __always_inline u32 vdpu_read(struct hantro_dev *vpu, u32 reg)
{
u32 val = readl(vpu->dec_base + reg);

@@ -425,7 +425,7 @@ static inline u32 vdpu_read(struct hantro_dev *vpu, u32 reg)
return val;
}

-static inline u32 vdpu_read_mask(struct hantro_dev *vpu,
+static __always_inline u32 vdpu_read_mask(struct hantro_dev *vpu,
const struct hantro_reg *reg,
u32 val)
{
@@ -437,14 +437,14 @@ static inline u32 vdpu_read_mask(struct hantro_dev *vpu,
return v;
}

-static inline void hantro_reg_write(struct hantro_dev *vpu,
+static __always_inline void hantro_reg_write(struct hantro_dev *vpu,
const struct hantro_reg *reg,
u32 val)
{
vdpu_write_relaxed(vpu, vdpu_read_mask(vpu, reg, val), reg->base);
}

-static inline void hantro_reg_write_s(struct hantro_dev *vpu,
+static __always_inline void hantro_reg_write_s(struct hantro_dev *vpu,
const struct hantro_reg *reg,
u32 val)
{
--
2.39.2



2023-06-16 15:01:59

by Arnd Bergmann

[permalink] [raw]
Subject: [PATCH 2/2] media: verisilicon: change confusingly named relaxed register access

From: Arnd Bergmann <[email protected]>

The register abstraction has wrappers around both the normal writel()
and its writel_relaxed() counterpart, but this has led to a lot of users
ending up with the relaxed version.

There is sometimes a need to intentionally pick the relaxed accessor for
performance critical functions, but I noticed that each hantro_reg_write()
call also contains a non-relaxed readl(), which is typically much more
expensive than a writel, so there is little benefit here but an added
risk of missing a serialization against DMA.

To make this behave like other interfaces, use the normal accessor by
default and only provide the relaxed version as an alternative for
performance critical code. hantro_postproc.c is the only place that
used both the relaxed and normal writel, but this does not seem
critical either, so change it all to the normal ones.

Signed-off-by: Arnd Bergmann <[email protected]>
---
I did not look whether there is an actual bug here, just noticed this
when I debugged the excessive stack usage.
---
drivers/media/platform/verisilicon/hantro.h | 6 +++---
drivers/media/platform/verisilicon/hantro_postproc.c | 12 ++++++------
2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/drivers/media/platform/verisilicon/hantro.h b/drivers/media/platform/verisilicon/hantro.h
index 6c5e56ce5b351..a481d957fef93 100644
--- a/drivers/media/platform/verisilicon/hantro.h
+++ b/drivers/media/platform/verisilicon/hantro.h
@@ -441,14 +441,14 @@ static __always_inline void hantro_reg_write(struct hantro_dev *vpu,
const struct hantro_reg *reg,
u32 val)
{
- vdpu_write_relaxed(vpu, vdpu_read_mask(vpu, reg, val), reg->base);
+ vdpu_write(vpu, vdpu_read_mask(vpu, reg, val), reg->base);
}

-static __always_inline void hantro_reg_write_s(struct hantro_dev *vpu,
+static __always_inline void hantro_reg_write_relaxed(struct hantro_dev *vpu,
const struct hantro_reg *reg,
u32 val)
{
- vdpu_write(vpu, vdpu_read_mask(vpu, reg, val), reg->base);
+ vdpu_write_relaxed(vpu, vdpu_read_mask(vpu, reg, val), reg->base);
}

void *hantro_get_ctrl(struct hantro_ctx *ctx, u32 id);
diff --git a/drivers/media/platform/verisilicon/hantro_postproc.c b/drivers/media/platform/verisilicon/hantro_postproc.c
index c977d64105b18..0224ff68ab3fc 100644
--- a/drivers/media/platform/verisilicon/hantro_postproc.c
+++ b/drivers/media/platform/verisilicon/hantro_postproc.c
@@ -21,11 +21,11 @@
val); \
}

-#define HANTRO_PP_REG_WRITE_S(vpu, reg_name, val) \
+#define HANTRO_PP_REG_WRITE_RELAXED(vpu, reg_name, val) \
{ \
- hantro_reg_write_s(vpu, \
- &hantro_g1_postproc_regs.reg_name, \
- val); \
+ hantro_reg_write_relaxed(vpu, \
+ &hantro_g1_postproc_regs.reg_name, \
+ val); \
}

#define VPU_PP_IN_YUYV 0x0
@@ -72,7 +72,7 @@ static void hantro_postproc_g1_enable(struct hantro_ctx *ctx)
dma_addr_t dst_dma;

/* Turn on pipeline mode. Must be done first. */
- HANTRO_PP_REG_WRITE_S(vpu, pipeline_en, 0x1);
+ HANTRO_PP_REG_WRITE(vpu, pipeline_en, 0x1);

src_pp_fmt = VPU_PP_IN_NV12;

@@ -242,7 +242,7 @@ static void hantro_postproc_g1_disable(struct hantro_ctx *ctx)
{
struct hantro_dev *vpu = ctx->dev;

- HANTRO_PP_REG_WRITE_S(vpu, pipeline_en, 0x0);
+ HANTRO_PP_REG_WRITE(vpu, pipeline_en, 0x0);
}

static void hantro_postproc_g2_disable(struct hantro_ctx *ctx)
--
2.39.2


2023-06-19 14:59:06

by Nicolas Dufresne

[permalink] [raw]
Subject: Re: [PATCH 2/2] media: verisilicon: change confusingly named relaxed register access

Hi Arnd,

Le vendredi 16 juin 2023 à 16:48 +0200, Arnd Bergmann a écrit :
> From: Arnd Bergmann <[email protected]>
>
> The register abstraction has wrappers around both the normal writel()
> and its writel_relaxed() counterpart, but this has led to a lot of users
> ending up with the relaxed version.
>
> There is sometimes a need to intentionally pick the relaxed accessor for
> performance critical functions, but I noticed that each hantro_reg_write()
> call also contains a non-relaxed readl(), which is typically much more
> expensive than a writel, so there is little benefit here but an added
> risk of missing a serialization against DMA.
>
> To make this behave like other interfaces, use the normal accessor by
> default and only provide the relaxed version as an alternative for
> performance critical code. hantro_postproc.c is the only place that
> used both the relaxed and normal writel, but this does not seem
> cricital either, so change it all to the normal ones.

In this text you speak about potential performance side effects of the existing code
and of your changes, but it's all left very vague and theoretical. Have you done any
measurements? Do you need help with the matter?

regards,
Nicolas

>
> Signed-off-by: Arnd Bergmann <[email protected]>
> ---
> I did not look whether there is an actual bug here, just noticed this
> when I debugged the excessive stack usage.
> ---
> drivers/media/platform/verisilicon/hantro.h | 6 +++---
> drivers/media/platform/verisilicon/hantro_postproc.c | 12 ++++++------
> 2 files changed, 9 insertions(+), 9 deletions(-)
>
> diff --git a/drivers/media/platform/verisilicon/hantro.h b/drivers/media/platform/verisilicon/hantro.h
> index 6c5e56ce5b351..a481d957fef93 100644
> --- a/drivers/media/platform/verisilicon/hantro.h
> +++ b/drivers/media/platform/verisilicon/hantro.h
> @@ -441,14 +441,14 @@ static __always_inline void hantro_reg_write(struct hantro_dev *vpu,
> const struct hantro_reg *reg,
> u32 val)
> {
> - vdpu_write_relaxed(vpu, vdpu_read_mask(vpu, reg, val), reg->base);
> + vdpu_write(vpu, vdpu_read_mask(vpu, reg, val), reg->base);
> }
>
> -static __always_inline void hantro_reg_write_s(struct hantro_dev *vpu,
> +static __always_inline void hantro_reg_write_relaxed(struct hantro_dev *vpu,
> const struct hantro_reg *reg,
> u32 val)
> {
> - vdpu_write(vpu, vdpu_read_mask(vpu, reg, val), reg->base);
> + vdpu_write_relaxed(vpu, vdpu_read_mask(vpu, reg, val), reg->base);
> }
>
> void *hantro_get_ctrl(struct hantro_ctx *ctx, u32 id);
> diff --git a/drivers/media/platform/verisilicon/hantro_postproc.c b/drivers/media/platform/verisilicon/hantro_postproc.c
> index c977d64105b18..0224ff68ab3fc 100644
> --- a/drivers/media/platform/verisilicon/hantro_postproc.c
> +++ b/drivers/media/platform/verisilicon/hantro_postproc.c
> @@ -21,11 +21,11 @@
> val); \
> }
>
> -#define HANTRO_PP_REG_WRITE_S(vpu, reg_name, val) \
> +#define HANTRO_PP_REG_WRITE_RELAXED(vpu, reg_name, val) \
> { \
> - hantro_reg_write_s(vpu, \
> - &hantro_g1_postproc_regs.reg_name, \
> - val); \
> + hantro_reg_write_relaxed(vpu, \
> + &hantro_g1_postproc_regs.reg_name, \
> + val); \
> }
>
> #define VPU_PP_IN_YUYV 0x0
> @@ -72,7 +72,7 @@ static void hantro_postproc_g1_enable(struct hantro_ctx *ctx)
> dma_addr_t dst_dma;
>
> /* Turn on pipeline mode. Must be done first. */
> - HANTRO_PP_REG_WRITE_S(vpu, pipeline_en, 0x1);
> + HANTRO_PP_REG_WRITE(vpu, pipeline_en, 0x1);
>
> src_pp_fmt = VPU_PP_IN_NV12;
>
> @@ -242,7 +242,7 @@ static void hantro_postproc_g1_disable(struct hantro_ctx *ctx)
> {
> struct hantro_dev *vpu = ctx->dev;
>
> - HANTRO_PP_REG_WRITE_S(vpu, pipeline_en, 0x0);
> + HANTRO_PP_REG_WRITE(vpu, pipeline_en, 0x0);
> }
>
> static void hantro_postproc_g2_disable(struct hantro_ctx *ctx)


2023-06-19 15:00:47

by Arnd Bergmann

[permalink] [raw]
Subject: Re: [PATCH 2/2] media: verisilicon: change confusingly named relaxed register access

On Mon, Jun 19, 2023, at 16:41, Nicolas Dufresne wrote:
> Le vendredi 16 juin 2023 à 16:48 +0200, Arnd Bergmann a écrit :
>> From: Arnd Bergmann <[email protected]>
>>
>> The register abstraction has wrappers around both the normal writel()
>> and its writel_relaxed() counterpart, but this has led to a lot of users
>> ending up with the relaxed version.
>>
>> There is sometimes a need to intentionally pick the relaxed accessor for
>> performance critical functions, but I noticed that each hantro_reg_write()
>> call also contains a non-relaxed readl(), which is typically much more
>> expensive than a writel, so there is little benefit here but an added
>> risk of missing a serialization against DMA.
>>
>> To make this behave like other interfaces, use the normal accessor by
>> default and only provide the relaxed version as an alternative for
>> performance critical code. hantro_postproc.c is the only place that
>> used both the relaxed and normal writel, but this does not seem
>> cricital either, so change it all to the normal ones.
>
> In this text you spoke about potential performance side effects of existing code
> and your changes, but its left all very vague and theoretical. Have you done any
> measurement ? Do you need help with the manner ?

I don't have this hardware and have not done any measurements.
Obviously the only point of using relaxed accessors is to
improve performance in critical code paths, but from the way they
are used here it seems that this was instead just an accident
and nobody else did any comparisons either.

My guess would be that if one wanted to speed up the register
access, a better way would be to use a regmap cache to avoid
reading registers when the contents are already known.

Arnd

2023-06-19 19:08:03

by Nicolas Dufresne

[permalink] [raw]
Subject: Re: [PATCH 2/2] media: verisilicon: change confusingly named relaxed register access

Le lundi 19 juin 2023 à 16:49 +0200, Arnd Bergmann a écrit :
> On Mon, Jun 19, 2023, at 16:41, Nicolas Dufresne wrote:
> > Le vendredi 16 juin 2023 à 16:48 +0200, Arnd Bergmann a écrit :
> > > From: Arnd Bergmann <[email protected]>
> > >
> > > The register abstraction has wrappers around both the normal writel()
> > > and its writel_relaxed() counterpart, but this has led to a lot of users
> > > ending up with the relaxed version.
> > >
> > > There is sometimes a need to intentionally pick the relaxed accessor for
> > > performance critical functions, but I noticed that each hantro_reg_write()
> > > call also contains a non-relaxed readl(), which is typically much more
> > > expensive than a writel, so there is little benefit here but an added
> > > risk of missing a serialization against DMA.
> > >
> > > To make this behave like other interfaces, use the normal accessor by
> > > default and only provide the relaxed version as an alternative for
> > > performance critical code. hantro_postproc.c is the only place that
> > > used both the relaxed and normal writel, but this does not seem
> > > cricital either, so change it all to the normal ones.
> >
> > In this text you spoke about potential performance side effects of existing code
> > and your changes, but its left all very vague and theoretical. Have you done any
> > measurement ? Do you need help with the manner ?
>
> I don't have this hardware and have not done any measurements.
> Obviously the only point of using relaxed accessors is to
> improve performance in critical code paths, but from the way they
> are used here it seems that this was instead just an accident
> and nobody else did any comparisons either.
>
> My guess would be that if one wanted to speed up the register
> access, a better way would be to use a regmap cache to avoid
> reading registers when the contents are already known.

All I know is that for the majority of registers when programming stateless
codecs, each 32-bit word of registers is fully written to; the read value is
not always meaningful (it's a value from the last time the HW was triggered) and
should be ignored, so it is better not to read it. As for regmap, there are folks who
have reported regmap to be completely overkill for this type of hardware.

This discussion highlights my concern, which is that this specific patch should
require a Tested-by before being merged. A clearer note saying that this patch
has not been tested would have helped.

regards,
Nicolas

>
> Arnd


2023-06-19 19:57:52

by Arnd Bergmann

[permalink] [raw]
Subject: Re: [PATCH 2/2] media: verisilicon: change confusingly named relaxed register access

On Mon, Jun 19, 2023, at 20:29, Nicolas Dufresne wrote:
> Le lundi 19 juin 2023 à 16:49 +0200, Arnd Bergmann a écrit :
>> >
>> > In this text you spoke about potential performance side effects of existing code
>> > and your changes, but its left all very vague and theoretical. Have you done any
>> > measurement ? Do you need help with the manner ?
>>
>> I don't have this hardware and have not done any measurements.
>> Obviously the only point of using relaxed accessors is to
>> improve performance in critical code paths, but from the way they
>> are used here it seems that this was instead just an accident
>> and nobody else did any comparisons either.
>>
>> My guess would be that if one wanted to speed up the register
>> access, a better way would be to use a regmap cache to avoid
>> reading registers when the contents are already known.
>
> All I know is that for the majority of registers when programming stateless
> codecs, each 32bit word of registers are fully written too, the read value is
> not always meaningful (its a value from last time the HW has been triggered) and
> should be ignored, so better to not do that. As for regmap, there is folks that
> have reported regmap to be completely overkill for this type of hardware.

Right, most likely neither the cache nor avoiding the readl() is necessary,
and that was exactly my point to start with: don't add potentially dangerous
microoptimizations like relaxed accessors unless the obvious optimizations
are also needed and used.

Obviously, testing my patch would still be a good idea before applying it.

Arnd

2023-06-20 08:19:03

by Benjamin Gaignard

[permalink] [raw]
Subject: Re: [PATCH 2/2] media: verisilicon: change confusingly named relaxed register access


Le 19/06/2023 à 21:26, Arnd Bergmann a écrit :
> On Mon, Jun 19, 2023, at 20:29, Nicolas Dufresne wrote:
>> Le lundi 19 juin 2023 à 16:49 +0200, Arnd Bergmann a écrit :
>>>> In this text you spoke about potential performance side effects of existing code
>>>> and your changes, but its left all very vague and theoretical. Have you done any
>>>> measurement ? Do you need help with the manner ?
>>> I don't have this hardware and have not done any measurements.
>>> Obviously the only point of using relaxed accessors is to
>>> improve performance in critical code paths, but from the way they
>>> are used here it seems that this was instead just an accident
>>> and nobody else did any comparisons either.
>>>
>>> My guess would be that if one wanted to speed up the register
>>> access, a better way would be to use a regmap cache to avoid
>>> reading registers when the contents are already known.
>> All I know is that for the majority of registers when programming stateless
>> codecs, each 32bit word of registers are fully written too, the read value is
>> not always meaningful (its a value from last time the HW has been triggered) and
>> should be ignored, so better to not do that. As for regmap, there is folks that
>> have reported regmap to be completely overkill for this type of hardware.
> Right, most likely neither the cache nor avoiding the readl() is necessary,
> and that was exactly my point to start with: don't add potentially dangerous
> microoptimizations like relaxed accessors unless the obvious optimizations
> are also needed and used.
>
> Obviously, testing my patch would still be a good idea before applying it.

I have tested the patches on IMX8M (HEVC decoder) and RK3588 (AV1 decoder).
I noticed no regressions or problems; conformance test scores remain identical.

For both patches:
Tested-by: Benjamin Gaignard <[email protected]>

Thanks for the patches,
Benjamin

>
> Arnd

2023-06-21 14:57:10

by Nicolas Dufresne

[permalink] [raw]
Subject: Re: [PATCH 1/2] media: verisilicon: fix excessive stack usage

Hi,

thank you.

Le vendredi 16 juin 2023 à 16:48 +0200, Arnd Bergmann a écrit :
> From: Arnd Bergmann <[email protected]>
>
> In some configurations, gcc decides not to inline the register accessor functions,
> which in turn leads to lots of temporary hantro_reg structures on the stack that
> cannot be eliminated because they escape into an uninlined function:
>
> drivers/media/platform/verisilicon/rockchip_vpu981_hw_av1_dec.c:1022:1: warning: the frame size of 1112 bytes is larger than 1024 bytes [-Wframe-larger-than=]
>
> Mark all of these as __always_inline so the compiler is able to completely
> eliminate the temporary structures instead, which brings the stack usage
> back down to just the normal local variables.

This is falling into compiler bug territory, though I see no harm in forcing
these to inline, as in the old days these would have been macros anyway.

>
> Reported-by: kernel test robot <[email protected]>
> Closes: https://lore.kernel.org/oe-kbuild-all/[email protected]/
> Fixes: 727a400686a2c ("media: verisilicon: Add Rockchip AV1 decoder")
> Signed-off-by: Arnd Bergmann <[email protected]>

Thanks again,

Reviewed-by: Nicolas Dufresne <[email protected]>

> ---
> drivers/media/platform/verisilicon/hantro.h | 22 ++++++++++-----------
> 1 file changed, 11 insertions(+), 11 deletions(-)
>
> diff --git a/drivers/media/platform/verisilicon/hantro.h b/drivers/media/platform/verisilicon/hantro.h
> index 6523ffb748812..6c5e56ce5b351 100644
> --- a/drivers/media/platform/verisilicon/hantro.h
> +++ b/drivers/media/platform/verisilicon/hantro.h
> @@ -370,26 +370,26 @@ extern int hantro_debug;
> pr_err("%s:%d: " fmt, __func__, __LINE__, ##args)
>
> /* Structure access helpers. */
> -static inline struct hantro_ctx *fh_to_ctx(struct v4l2_fh *fh)
> +static __always_inline struct hantro_ctx *fh_to_ctx(struct v4l2_fh *fh)
> {
> return container_of(fh, struct hantro_ctx, fh);
> }
>
> /* Register accessors. */
> -static inline void vepu_write_relaxed(struct hantro_dev *vpu,
> +static __always_inline void vepu_write_relaxed(struct hantro_dev *vpu,
> u32 val, u32 reg)
> {
> vpu_debug(6, "0x%04x = 0x%08x\n", reg / 4, val);
> writel_relaxed(val, vpu->enc_base + reg);
> }
>
> -static inline void vepu_write(struct hantro_dev *vpu, u32 val, u32 reg)
> +static __always_inline void vepu_write(struct hantro_dev *vpu, u32 val, u32 reg)
> {
> vpu_debug(6, "0x%04x = 0x%08x\n", reg / 4, val);
> writel(val, vpu->enc_base + reg);
> }
>
> -static inline u32 vepu_read(struct hantro_dev *vpu, u32 reg)
> +static __always_inline u32 vepu_read(struct hantro_dev *vpu, u32 reg)
> {
> u32 val = readl(vpu->enc_base + reg);
>
> @@ -397,27 +397,27 @@ static inline u32 vepu_read(struct hantro_dev *vpu, u32 reg)
> return val;
> }
>
> -static inline void vdpu_write_relaxed(struct hantro_dev *vpu,
> +static __always_inline void vdpu_write_relaxed(struct hantro_dev *vpu,
> u32 val, u32 reg)
> {
> vpu_debug(6, "0x%04x = 0x%08x\n", reg / 4, val);
> writel_relaxed(val, vpu->dec_base + reg);
> }
>
> -static inline void vdpu_write(struct hantro_dev *vpu, u32 val, u32 reg)
> +static __always_inline void vdpu_write(struct hantro_dev *vpu, u32 val, u32 reg)
> {
> vpu_debug(6, "0x%04x = 0x%08x\n", reg / 4, val);
> writel(val, vpu->dec_base + reg);
> }
>
> -static inline void hantro_write_addr(struct hantro_dev *vpu,
> +static __always_inline void hantro_write_addr(struct hantro_dev *vpu,
> unsigned long offset,
> dma_addr_t addr)
> {
> vdpu_write(vpu, addr & 0xffffffff, offset);
> }
>
> -static inline u32 vdpu_read(struct hantro_dev *vpu, u32 reg)
> +static __always_inline u32 vdpu_read(struct hantro_dev *vpu, u32 reg)
> {
> u32 val = readl(vpu->dec_base + reg);
>
> @@ -425,7 +425,7 @@ static inline u32 vdpu_read(struct hantro_dev *vpu, u32 reg)
> return val;
> }
>
> -static inline u32 vdpu_read_mask(struct hantro_dev *vpu,
> +static __always_inline u32 vdpu_read_mask(struct hantro_dev *vpu,
> const struct hantro_reg *reg,
> u32 val)
> {
> @@ -437,14 +437,14 @@ static inline u32 vdpu_read_mask(struct hantro_dev *vpu,
> return v;
> }
>
> -static inline void hantro_reg_write(struct hantro_dev *vpu,
> +static __always_inline void hantro_reg_write(struct hantro_dev *vpu,
> const struct hantro_reg *reg,
> u32 val)
> {
> vdpu_write_relaxed(vpu, vdpu_read_mask(vpu, reg, val), reg->base);
> }
>
> -static inline void hantro_reg_write_s(struct hantro_dev *vpu,
> +static __always_inline void hantro_reg_write_s(struct hantro_dev *vpu,
> const struct hantro_reg *reg,
> u32 val)
> {


2023-06-28 18:44:37

by Nathan Chancellor

[permalink] [raw]
Subject: Re: [PATCH 1/2] media: verisilicon: fix excessive stack usage

On Fri, Jun 16, 2023 at 04:48:47PM +0200, Arnd Bergmann wrote:
> From: Arnd Bergmann <[email protected]>
>
> In some configurations, gcc decides not to inline the register accessor functions,
> which in turn leads to lots of temporary hantro_reg structures on the stack that
> cannot be eliminated because they escape into an uninlined function:
>
> drivers/media/platform/verisilicon/rockchip_vpu981_hw_av1_dec.c:1022:1: warning: the frame size of 1112 bytes is larger than 1024 bytes [-Wframe-larger-than=]
>
> Mark all of these as __always_inline so the compiler is able to completely
> eliminate the temporary structures instead, which brings the stack usage
> back down to just the normal local variables.
>
> Reported-by: kernel test robot <[email protected]>
> Closes: https://lore.kernel.org/oe-kbuild-all/[email protected]/
> Fixes: 727a400686a2c ("media: verisilicon: Add Rockchip AV1 decoder")
> Signed-off-by: Arnd Bergmann <[email protected]>

For what it's worth, this patch massively helps with avoiding a warning
with clang 16.x and older, for presumably a similar reason, since this
happens with allmodconfig, which turns on a bunch of sanitizers.

https://github.com/ClangBuiltLinux/linux/issues/1875

Before this change:

drivers/media/platform/verisilicon/rockchip_vpu981_hw_av1_dec.c:2097:5: error: stack frame size (2096) exceeds limit (2048) in 'rockchip_vpu981_av1_dec_run' [-Werror,-Wframe-larger-than]
int rockchip_vpu981_av1_dec_run(struct hantro_ctx *ctx)
^
238/2096 (11.35%) spills, 1858/2096 (88.65%) variables

After this change:

drivers/media/platform/verisilicon/rockchip_vpu981_hw_av1_dec.c:2097:5: error: stack frame size (496) exceeds limit (200) in 'rockchip_vpu981_av1_dec_run' [-Werror,-Wframe-larger-than]
int rockchip_vpu981_av1_dec_run(struct hantro_ctx *ctx)
^
265/496 (53.43%) spills, 231/496 (46.57%) variables

If this could be picked up either before the 6.5 media pull goes out or
at some point during the -rc cycle, that would be great!

Tested-by: Nathan Chancellor <[email protected]>

> ---
> drivers/media/platform/verisilicon/hantro.h | 22 ++++++++++-----------
> 1 file changed, 11 insertions(+), 11 deletions(-)
>
> diff --git a/drivers/media/platform/verisilicon/hantro.h b/drivers/media/platform/verisilicon/hantro.h
> index 6523ffb748812..6c5e56ce5b351 100644
> --- a/drivers/media/platform/verisilicon/hantro.h
> +++ b/drivers/media/platform/verisilicon/hantro.h
> @@ -370,26 +370,26 @@ extern int hantro_debug;
> pr_err("%s:%d: " fmt, __func__, __LINE__, ##args)
>
> /* Structure access helpers. */
> -static inline struct hantro_ctx *fh_to_ctx(struct v4l2_fh *fh)
> +static __always_inline struct hantro_ctx *fh_to_ctx(struct v4l2_fh *fh)
> {
> return container_of(fh, struct hantro_ctx, fh);
> }
>
> /* Register accessors. */
> -static inline void vepu_write_relaxed(struct hantro_dev *vpu,
> +static __always_inline void vepu_write_relaxed(struct hantro_dev *vpu,
> u32 val, u32 reg)
> {
> vpu_debug(6, "0x%04x = 0x%08x\n", reg / 4, val);
> writel_relaxed(val, vpu->enc_base + reg);
> }
>
> -static inline void vepu_write(struct hantro_dev *vpu, u32 val, u32 reg)
> +static __always_inline void vepu_write(struct hantro_dev *vpu, u32 val, u32 reg)
> {
> vpu_debug(6, "0x%04x = 0x%08x\n", reg / 4, val);
> writel(val, vpu->enc_base + reg);
> }
>
> -static inline u32 vepu_read(struct hantro_dev *vpu, u32 reg)
> +static __always_inline u32 vepu_read(struct hantro_dev *vpu, u32 reg)
> {
> u32 val = readl(vpu->enc_base + reg);
>
> @@ -397,27 +397,27 @@ static inline u32 vepu_read(struct hantro_dev *vpu, u32 reg)
> return val;
> }
>
> -static inline void vdpu_write_relaxed(struct hantro_dev *vpu,
> +static __always_inline void vdpu_write_relaxed(struct hantro_dev *vpu,
> u32 val, u32 reg)
> {
> vpu_debug(6, "0x%04x = 0x%08x\n", reg / 4, val);
> writel_relaxed(val, vpu->dec_base + reg);
> }
>
> -static inline void vdpu_write(struct hantro_dev *vpu, u32 val, u32 reg)
> +static __always_inline void vdpu_write(struct hantro_dev *vpu, u32 val, u32 reg)
> {
> vpu_debug(6, "0x%04x = 0x%08x\n", reg / 4, val);
> writel(val, vpu->dec_base + reg);
> }
>
> -static inline void hantro_write_addr(struct hantro_dev *vpu,
> +static __always_inline void hantro_write_addr(struct hantro_dev *vpu,
> unsigned long offset,
> dma_addr_t addr)
> {
> vdpu_write(vpu, addr & 0xffffffff, offset);
> }
>
> -static inline u32 vdpu_read(struct hantro_dev *vpu, u32 reg)
> +static __always_inline u32 vdpu_read(struct hantro_dev *vpu, u32 reg)
> {
> u32 val = readl(vpu->dec_base + reg);
>
> @@ -425,7 +425,7 @@ static inline u32 vdpu_read(struct hantro_dev *vpu, u32 reg)
> return val;
> }
>
> -static inline u32 vdpu_read_mask(struct hantro_dev *vpu,
> +static __always_inline u32 vdpu_read_mask(struct hantro_dev *vpu,
> const struct hantro_reg *reg,
> u32 val)
> {
> @@ -437,14 +437,14 @@ static inline u32 vdpu_read_mask(struct hantro_dev *vpu,
> return v;
> }
>
> -static inline void hantro_reg_write(struct hantro_dev *vpu,
> +static __always_inline void hantro_reg_write(struct hantro_dev *vpu,
> const struct hantro_reg *reg,
> u32 val)
> {
> vdpu_write_relaxed(vpu, vdpu_read_mask(vpu, reg, val), reg->base);
> }
>
> -static inline void hantro_reg_write_s(struct hantro_dev *vpu,
> +static __always_inline void hantro_reg_write_s(struct hantro_dev *vpu,
> const struct hantro_reg *reg,
> u32 val)
> {
> --
> 2.39.2
>

2023-06-29 07:59:37

by Hans Verkuil

[permalink] [raw]
Subject: Re: [PATCH 1/2] media: verisilicon: fix excessive stack usage

Hi all,

On 6/28/23 20:26, Nathan Chancellor wrote:
> On Fri, Jun 16, 2023 at 04:48:47PM +0200, Arnd Bergmann wrote:
>> From: Arnd Bergmann <[email protected]>
>>
>> In some configurations, gcc decides not to inline the register accessor functions,
>> which in turn leads to lots of temporary hantro_reg structures on the stack that
>> cannot be eliminated because they escape into an uninlined function:
>>
>> drivers/media/platform/verisilicon/rockchip_vpu981_hw_av1_dec.c:1022:1: warning: the frame size of 1112 bytes is larger than 1024 bytes [-Wframe-larger-than=]
>>
>> Mark all of these as __always_inline so the compiler is able to completely
>> eliminate the temporary structures instead, which brings the stack usage
>> back down to just the normal local variables.
>>
>> Reported-by: kernel test robot <[email protected]>
>> Closes: https://lore.kernel.org/oe-kbuild-all/[email protected]/
>> Fixes: 727a400686a2c ("media: verisilicon: Add Rockchip AV1 decoder")
>> Signed-off-by: Arnd Bergmann <[email protected]>
>
> For what it's worth, this patch massively helps with avoiding a warning
> with clang 16.x and older, for presumably a similar reason, since this
> happens with allmodconfig, which turns on a bunch of sanitizers.
>
> https://github.com/ClangBuiltLinux/linux/issues/1875
>
> Before this change:
>
> drivers/media/platform/verisilicon/rockchip_vpu981_hw_av1_dec.c:2097:5: error: stack frame size (2096) exceeds limit (2048) in 'rockchip_vpu981_av1_dec_run' [-Werror,-Wframe-larger-than]
> int rockchip_vpu981_av1_dec_run(struct hantro_ctx *ctx)
> ^
> 238/2096 (11.35%) spills, 1858/2096 (88.65%) variables
>
> After this change:
>
> drivers/media/platform/verisilicon/rockchip_vpu981_hw_av1_dec.c:2097:5: error: stack frame size (496) exceeds limit (200) in 'rockchip_vpu981_av1_dec_run' [-Werror,-Wframe-larger-than]
> int rockchip_vpu981_av1_dec_run(struct hantro_ctx *ctx)
> ^
> 265/496 (53.43%) spills, 231/496 (46.57%) variables
>
> If this could be picked up either before the 6.5 media pull goes out or
> at some point during the -rc cycle, that would be great!

Once the merge window closes I'll make a PR to get it in 6.5.

Regards,

Hans

>
> Tested-by: Nathan Chancellor <[email protected]>
>
>> ---
>> drivers/media/platform/verisilicon/hantro.h | 22 ++++++++++-----------
>> 1 file changed, 11 insertions(+), 11 deletions(-)
>>
>> diff --git a/drivers/media/platform/verisilicon/hantro.h b/drivers/media/platform/verisilicon/hantro.h
>> index 6523ffb748812..6c5e56ce5b351 100644
>> --- a/drivers/media/platform/verisilicon/hantro.h
>> +++ b/drivers/media/platform/verisilicon/hantro.h
>> @@ -370,26 +370,26 @@ extern int hantro_debug;
>> pr_err("%s:%d: " fmt, __func__, __LINE__, ##args)
>>
>> /* Structure access helpers. */
>> -static inline struct hantro_ctx *fh_to_ctx(struct v4l2_fh *fh)
>> +static __always_inline struct hantro_ctx *fh_to_ctx(struct v4l2_fh *fh)
>> {
>> return container_of(fh, struct hantro_ctx, fh);
>> }
>>
>> /* Register accessors. */
>> -static inline void vepu_write_relaxed(struct hantro_dev *vpu,
>> +static __always_inline void vepu_write_relaxed(struct hantro_dev *vpu,
>> u32 val, u32 reg)
>> {
>> vpu_debug(6, "0x%04x = 0x%08x\n", reg / 4, val);
>> writel_relaxed(val, vpu->enc_base + reg);
>> }
>>
>> -static inline void vepu_write(struct hantro_dev *vpu, u32 val, u32 reg)
>> +static __always_inline void vepu_write(struct hantro_dev *vpu, u32 val, u32 reg)
>> {
>> vpu_debug(6, "0x%04x = 0x%08x\n", reg / 4, val);
>> writel(val, vpu->enc_base + reg);
>> }
>>
>> -static inline u32 vepu_read(struct hantro_dev *vpu, u32 reg)
>> +static __always_inline u32 vepu_read(struct hantro_dev *vpu, u32 reg)
>> {
>> u32 val = readl(vpu->enc_base + reg);
>>
>> @@ -397,27 +397,27 @@ static inline u32 vepu_read(struct hantro_dev *vpu, u32 reg)
>> return val;
>> }
>>
>> -static inline void vdpu_write_relaxed(struct hantro_dev *vpu,
>> +static __always_inline void vdpu_write_relaxed(struct hantro_dev *vpu,
>> u32 val, u32 reg)
>> {
>> vpu_debug(6, "0x%04x = 0x%08x\n", reg / 4, val);
>> writel_relaxed(val, vpu->dec_base + reg);
>> }
>>
>> -static inline void vdpu_write(struct hantro_dev *vpu, u32 val, u32 reg)
>> +static __always_inline void vdpu_write(struct hantro_dev *vpu, u32 val, u32 reg)
>> {
>> vpu_debug(6, "0x%04x = 0x%08x\n", reg / 4, val);
>> writel(val, vpu->dec_base + reg);
>> }
>>
>> -static inline void hantro_write_addr(struct hantro_dev *vpu,
>> +static __always_inline void hantro_write_addr(struct hantro_dev *vpu,
>> unsigned long offset,
>> dma_addr_t addr)
>> {
>> vdpu_write(vpu, addr & 0xffffffff, offset);
>> }
>>
>> -static inline u32 vdpu_read(struct hantro_dev *vpu, u32 reg)
>> +static __always_inline u32 vdpu_read(struct hantro_dev *vpu, u32 reg)
>> {
>> u32 val = readl(vpu->dec_base + reg);
>>
>> @@ -425,7 +425,7 @@ static inline u32 vdpu_read(struct hantro_dev *vpu, u32 reg)
>> return val;
>> }
>>
>> -static inline u32 vdpu_read_mask(struct hantro_dev *vpu,
>> +static __always_inline u32 vdpu_read_mask(struct hantro_dev *vpu,
>> const struct hantro_reg *reg,
>> u32 val)
>> {
>> @@ -437,14 +437,14 @@ static inline u32 vdpu_read_mask(struct hantro_dev *vpu,
>> return v;
>> }
>>
>> -static inline void hantro_reg_write(struct hantro_dev *vpu,
>> +static __always_inline void hantro_reg_write(struct hantro_dev *vpu,
>> const struct hantro_reg *reg,
>> u32 val)
>> {
>> vdpu_write_relaxed(vpu, vdpu_read_mask(vpu, reg, val), reg->base);
>> }
>>
>> -static inline void hantro_reg_write_s(struct hantro_dev *vpu,
>> +static __always_inline void hantro_reg_write_s(struct hantro_dev *vpu,
>> const struct hantro_reg *reg,
>> u32 val)
>> {
>> --
>> 2.39.2
>>