2022-09-22 06:27:57

by Samuel Holland

[permalink] [raw]
Subject: [PATCH] riscv: Fix build with CONFIG_CC_OPTIMIZE_FOR_SIZE=y

commit 8eb060e10185 ("arch/riscv: add Zihintpause support") broke
building with CONFIG_CC_OPTIMIZE_FOR_SIZE enabled (gcc 11.1.0):

CC arch/riscv/kernel/vdso/vgettimeofday.o
In file included from <command-line>:
./arch/riscv/include/asm/jump_label.h: In function 'cpu_relax':
././include/linux/compiler_types.h:285:33: warning: 'asm' operand 0 probably does not match constraints
285 | #define asm_volatile_goto(x...) asm goto(x)
| ^~~
./arch/riscv/include/asm/jump_label.h:41:9: note: in expansion of macro 'asm_volatile_goto'
41 | asm_volatile_goto(
| ^~~~~~~~~~~~~~~~~
././include/linux/compiler_types.h:285:33: error: impossible constraint in 'asm'
285 | #define asm_volatile_goto(x...) asm goto(x)
| ^~~
./arch/riscv/include/asm/jump_label.h:41:9: note: in expansion of macro 'asm_volatile_goto'
41 | asm_volatile_goto(
| ^~~~~~~~~~~~~~~~~
make[1]: *** [scripts/Makefile.build:249: arch/riscv/kernel/vdso/vgettimeofday.o] Error 1
make: *** [arch/riscv/Makefile:128: vdso_prepare] Error 2

Having a static branch in cpu_relax() is problematic because that
function is widely inlined, including in some quite complex functions
like in the VDSO. A quick measurement shows this static branch is
responsible by itself for around 40% of the jump table.

Drop the static branch, which ends up being the same number of
instructions anyway. If Zihintpause is supported, we trade the nop from
the static branch for a div. If Zihintpause is unsupported, we trade the
jump from the static branch for (what gets interpreted as) a nop.

Fixes: 8eb060e10185 ("arch/riscv: add Zihintpause support")
Signed-off-by: Samuel Holland <[email protected]>
---

arch/riscv/include/asm/hwcap.h | 3 ---
arch/riscv/include/asm/vdso/processor.h | 25 ++++++++++---------------
2 files changed, 10 insertions(+), 18 deletions(-)

diff --git a/arch/riscv/include/asm/hwcap.h b/arch/riscv/include/asm/hwcap.h
index 6f59ec64175e..b21d46e68386 100644
--- a/arch/riscv/include/asm/hwcap.h
+++ b/arch/riscv/include/asm/hwcap.h
@@ -68,7 +68,6 @@ enum riscv_isa_ext_id {
*/
enum riscv_isa_ext_key {
RISCV_ISA_EXT_KEY_FPU, /* For 'F' and 'D' */
- RISCV_ISA_EXT_KEY_ZIHINTPAUSE,
RISCV_ISA_EXT_KEY_MAX,
};

@@ -88,8 +87,6 @@ static __always_inline int riscv_isa_ext2key(int num)
return RISCV_ISA_EXT_KEY_FPU;
case RISCV_ISA_EXT_d:
return RISCV_ISA_EXT_KEY_FPU;
- case RISCV_ISA_EXT_ZIHINTPAUSE:
- return RISCV_ISA_EXT_KEY_ZIHINTPAUSE;
default:
return -EINVAL;
}
diff --git a/arch/riscv/include/asm/vdso/processor.h b/arch/riscv/include/asm/vdso/processor.h
index 1e4f8b4aef79..789bdb8211a2 100644
--- a/arch/riscv/include/asm/vdso/processor.h
+++ b/arch/riscv/include/asm/vdso/processor.h
@@ -4,30 +4,25 @@

#ifndef __ASSEMBLY__

-#include <linux/jump_label.h>
#include <asm/barrier.h>
-#include <asm/hwcap.h>

static inline void cpu_relax(void)
{
- if (!static_branch_likely(&riscv_isa_ext_keys[RISCV_ISA_EXT_KEY_ZIHINTPAUSE])) {
#ifdef __riscv_muldiv
- int dummy;
- /* In lieu of a halt instruction, induce a long-latency stall. */
- __asm__ __volatile__ ("div %0, %0, zero" : "=r" (dummy));
+ int dummy;
+ /* In lieu of a halt instruction, induce a long-latency stall. */
+ __asm__ __volatile__ ("div %0, %0, zero" : "=r" (dummy));
#endif
- } else {
- /*
- * Reduce instruction retirement.
- * This assumes the PC changes.
- */
+ /*
+ * Reduce instruction retirement.
+ * This assumes the PC changes.
+ */
#ifdef __riscv_zihintpause
- __asm__ __volatile__ ("pause");
+ __asm__ __volatile__ ("pause");
#else
- /* Encoding of the pause instruction */
- __asm__ __volatile__ (".4byte 0x100000F");
+ /* Encoding of the pause instruction */
+ __asm__ __volatile__ (".4byte 0x100000F");
#endif
- }
barrier();
}

--
2.35.1


2022-09-22 13:58:23

by Conor Dooley

[permalink] [raw]
Subject: Re: [PATCH] riscv: Fix build with CONFIG_CC_OPTIMIZE_FOR_SIZE=y

On Thu, Sep 22, 2022 at 01:09:58AM -0500, Samuel Holland wrote:
> commit 8eb060e10185 ("arch/riscv: add Zihintpause support") broke
> building with CONFIG_CC_OPTIMIZE_FOR_SIZE enabled (gcc 11.1.0):

clang-15 chucks a fit too..
Reviewed-by: Conor Dooley <[email protected]>

>
> CC arch/riscv/kernel/vdso/vgettimeofday.o
> In file included from <command-line>:
> ./arch/riscv/include/asm/jump_label.h: In function 'cpu_relax':
> ././include/linux/compiler_types.h:285:33: warning: 'asm' operand 0 probably does not match constraints
> 285 | #define asm_volatile_goto(x...) asm goto(x)
> | ^~~
> ./arch/riscv/include/asm/jump_label.h:41:9: note: in expansion of macro 'asm_volatile_goto'
> 41 | asm_volatile_goto(
> | ^~~~~~~~~~~~~~~~~
> ././include/linux/compiler_types.h:285:33: error: impossible constraint in 'asm'
> 285 | #define asm_volatile_goto(x...) asm goto(x)
> | ^~~
> ./arch/riscv/include/asm/jump_label.h:41:9: note: in expansion of macro 'asm_volatile_goto'
> 41 | asm_volatile_goto(
> | ^~~~~~~~~~~~~~~~~
> make[1]: *** [scripts/Makefile.build:249: arch/riscv/kernel/vdso/vgettimeofday.o] Error 1
> make: *** [arch/riscv/Makefile:128: vdso_prepare] Error 2
>
> Having a static branch in cpu_relax() is problematic because that
> function is widely inlined, including in some quite complex functions
> like in the VDSO. A quick measurement shows this static branch is
> responsible by itself for around 40% of the jump table.
>
> Drop the static branch, which ends up being the same number of
> instructions anyway. If Zihintpause is supported, we trade the nop from
> the static branch for a div. If Zihintpause is unsupported, we trade the
> jump from the static branch for (what gets interpreted as) a nop.
>
> Fixes: 8eb060e10185 ("arch/riscv: add Zihintpause support")
> Signed-off-by: Samuel Holland <[email protected]>
> ---
>
> arch/riscv/include/asm/hwcap.h | 3 ---
> arch/riscv/include/asm/vdso/processor.h | 25 ++++++++++---------------
> 2 files changed, 10 insertions(+), 18 deletions(-)
>
> diff --git a/arch/riscv/include/asm/hwcap.h b/arch/riscv/include/asm/hwcap.h
> index 6f59ec64175e..b21d46e68386 100644
> --- a/arch/riscv/include/asm/hwcap.h
> +++ b/arch/riscv/include/asm/hwcap.h
> @@ -68,7 +68,6 @@ enum riscv_isa_ext_id {
> */
> enum riscv_isa_ext_key {
> RISCV_ISA_EXT_KEY_FPU, /* For 'F' and 'D' */
> - RISCV_ISA_EXT_KEY_ZIHINTPAUSE,
> RISCV_ISA_EXT_KEY_MAX,
> };
>
> @@ -88,8 +87,6 @@ static __always_inline int riscv_isa_ext2key(int num)
> return RISCV_ISA_EXT_KEY_FPU;
> case RISCV_ISA_EXT_d:
> return RISCV_ISA_EXT_KEY_FPU;
> - case RISCV_ISA_EXT_ZIHINTPAUSE:
> - return RISCV_ISA_EXT_KEY_ZIHINTPAUSE;
> default:
> return -EINVAL;
> }
> diff --git a/arch/riscv/include/asm/vdso/processor.h b/arch/riscv/include/asm/vdso/processor.h
> index 1e4f8b4aef79..789bdb8211a2 100644
> --- a/arch/riscv/include/asm/vdso/processor.h
> +++ b/arch/riscv/include/asm/vdso/processor.h
> @@ -4,30 +4,25 @@
>
> #ifndef __ASSEMBLY__
>
> -#include <linux/jump_label.h>
> #include <asm/barrier.h>
> -#include <asm/hwcap.h>
>
> static inline void cpu_relax(void)
> {
> - if (!static_branch_likely(&riscv_isa_ext_keys[RISCV_ISA_EXT_KEY_ZIHINTPAUSE])) {
> #ifdef __riscv_muldiv
> - int dummy;
> - /* In lieu of a halt instruction, induce a long-latency stall. */
> - __asm__ __volatile__ ("div %0, %0, zero" : "=r" (dummy));
> + int dummy;
> + /* In lieu of a halt instruction, induce a long-latency stall. */
> + __asm__ __volatile__ ("div %0, %0, zero" : "=r" (dummy));
> #endif
> - } else {
> - /*
> - * Reduce instruction retirement.
> - * This assumes the PC changes.
> - */
> + /*
> + * Reduce instruction retirement.
> + * This assumes the PC changes.
> + */
> #ifdef __riscv_zihintpause
> - __asm__ __volatile__ ("pause");
> + __asm__ __volatile__ ("pause");
> #else
> - /* Encoding of the pause instruction */
> - __asm__ __volatile__ (".4byte 0x100000F");
> + /* Encoding of the pause instruction */
> + __asm__ __volatile__ (".4byte 0x100000F");
> #endif
> - }
> barrier();
> }
>
> --
> 2.35.1
>
>
> _______________________________________________
> linux-riscv mailing list
> [email protected]
> http://lists.infradead.org/mailman/listinfo/linux-riscv

2022-09-22 16:05:39

by Jessica Clarke

[permalink] [raw]
Subject: Re: [PATCH] riscv: Fix build with CONFIG_CC_OPTIMIZE_FOR_SIZE=y

On 22 Sept 2022, at 16:45, Heiko Stuebner <[email protected]> wrote:
>
> Am Donnerstag, 22. September 2022, 08:09:58 CEST schrieb Samuel Holland:
>> commit 8eb060e10185 ("arch/riscv: add Zihintpause support") broke
>> building with CONFIG_CC_OPTIMIZE_FOR_SIZE enabled (gcc 11.1.0):
>>
>> CC arch/riscv/kernel/vdso/vgettimeofday.o
>> In file included from <command-line>:
>> ./arch/riscv/include/asm/jump_label.h: In function 'cpu_relax':
>> ././include/linux/compiler_types.h:285:33: warning: 'asm' operand 0 probably does not match constraints
>> 285 | #define asm_volatile_goto(x...) asm goto(x)
>> | ^~~
>> ./arch/riscv/include/asm/jump_label.h:41:9: note: in expansion of macro 'asm_volatile_goto'
>> 41 | asm_volatile_goto(
>> | ^~~~~~~~~~~~~~~~~
>> ././include/linux/compiler_types.h:285:33: error: impossible constraint in 'asm'
>> 285 | #define asm_volatile_goto(x...) asm goto(x)
>> | ^~~
>> ./arch/riscv/include/asm/jump_label.h:41:9: note: in expansion of macro 'asm_volatile_goto'
>> 41 | asm_volatile_goto(
>> | ^~~~~~~~~~~~~~~~~
>> make[1]: *** [scripts/Makefile.build:249: arch/riscv/kernel/vdso/vgettimeofday.o] Error 1
>> make: *** [arch/riscv/Makefile:128: vdso_prepare] Error 2
>>
>> Having a static branch in cpu_relax() is problematic because that
>> function is widely inlined, including in some quite complex functions
>> like in the VDSO. A quick measurement shows this static branch is
>> responsible by itself for around 40% of the jump table.
>>
>> Drop the static branch, which ends up being the same number of
>> instructions anyway. If Zihintpause is supported, we trade the nop from
>> the static branch for a div. If Zihintpause is unsupported, we trade the
>> jump from the static branch for (what gets interpreted as) a nop.
>>
>> Fixes: 8eb060e10185 ("arch/riscv: add Zihintpause support")
>> Signed-off-by: Samuel Holland <[email protected]>
>> ---
>>
>> arch/riscv/include/asm/hwcap.h | 3 ---
>> arch/riscv/include/asm/vdso/processor.h | 25 ++++++++++---------------
>> 2 files changed, 10 insertions(+), 18 deletions(-)
>>
>> diff --git a/arch/riscv/include/asm/hwcap.h b/arch/riscv/include/asm/hwcap.h
>> index 6f59ec64175e..b21d46e68386 100644
>> --- a/arch/riscv/include/asm/hwcap.h
>> +++ b/arch/riscv/include/asm/hwcap.h
>> @@ -68,7 +68,6 @@ enum riscv_isa_ext_id {
>> */
>> enum riscv_isa_ext_key {
>> RISCV_ISA_EXT_KEY_FPU, /* For 'F' and 'D' */
>> - RISCV_ISA_EXT_KEY_ZIHINTPAUSE,
>> RISCV_ISA_EXT_KEY_MAX,
>> };
>>
>> @@ -88,8 +87,6 @@ static __always_inline int riscv_isa_ext2key(int num)
>> return RISCV_ISA_EXT_KEY_FPU;
>> case RISCV_ISA_EXT_d:
>> return RISCV_ISA_EXT_KEY_FPU;
>> - case RISCV_ISA_EXT_ZIHINTPAUSE:
>> - return RISCV_ISA_EXT_KEY_ZIHINTPAUSE;
>> default:
>> return -EINVAL;
>> }
>> diff --git a/arch/riscv/include/asm/vdso/processor.h b/arch/riscv/include/asm/vdso/processor.h
>> index 1e4f8b4aef79..789bdb8211a2 100644
>> --- a/arch/riscv/include/asm/vdso/processor.h
>> +++ b/arch/riscv/include/asm/vdso/processor.h
>> @@ -4,30 +4,25 @@
>>
>> #ifndef __ASSEMBLY__
>>
>> -#include <linux/jump_label.h>
>> #include <asm/barrier.h>
>> -#include <asm/hwcap.h>
>>
>> static inline void cpu_relax(void)
>> {
>> - if (!static_branch_likely(&riscv_isa_ext_keys[RISCV_ISA_EXT_KEY_ZIHINTPAUSE])) {
>> #ifdef __riscv_muldiv
>> - int dummy;
>> - /* In lieu of a halt instruction, induce a long-latency stall. */
>> - __asm__ __volatile__ ("div %0, %0, zero" : "=r" (dummy));
>> + int dummy;
>> + /* In lieu of a halt instruction, induce a long-latency stall. */
>> + __asm__ __volatile__ ("div %0, %0, zero" : "=r" (dummy));
>> #endif
>> - } else {
>> - /*
>> - * Reduce instruction retirement.
>> - * This assumes the PC changes.
>> - */
>> + /*
>> + * Reduce instruction retirement.
>> + * This assumes the PC changes.
>> + */
>> #ifdef __riscv_zihintpause
>> - __asm__ __volatile__ ("pause");
>> + __asm__ __volatile__ ("pause");
>> #else
>> - /* Encoding of the pause instruction */
>> - __asm__ __volatile__ (".4byte 0x100000F");
>> + /* Encoding of the pause instruction */
>> + __asm__ __volatile__ (".4byte 0x100000F");
>> #endif
>
> hmm, though before this part of the code was only ever accessed
> when the zhintpause extension was really available on the running
> machine while now the pause instruction is called every time.
>
> So I'm just wondering, can't this run into some "illegal instruction"
> thingy on machines not supporting the extension?

No. The encoding for pause was deliberately chosen to be one of the
“useless” encodings of fence, with the hope that existing
microarchitectures might take a while to execute it and thus it would
still function as a slow-running instruction. It’s somewhat
questionable whether the div is even needed, the worst that happens is
cpu_relax isn’t very relaxed and you spin a bit faster. Any
implementations where that’s true probably also don’t have fancy
clock/power management anyway, and div isn’t going to be a low-power
operation so the only real effect is likely hammering on contended
atomics a bit more, and who cares about that on the low core count
systems we have today.

Jess

2022-09-22 16:30:46

by Heiko Stübner

[permalink] [raw]
Subject: Re: [PATCH] riscv: Fix build with CONFIG_CC_OPTIMIZE_FOR_SIZE=y

Am Donnerstag, 22. September 2022, 08:09:58 CEST schrieb Samuel Holland:
> commit 8eb060e10185 ("arch/riscv: add Zihintpause support") broke
> building with CONFIG_CC_OPTIMIZE_FOR_SIZE enabled (gcc 11.1.0):
>
> CC arch/riscv/kernel/vdso/vgettimeofday.o
> In file included from <command-line>:
> ./arch/riscv/include/asm/jump_label.h: In function 'cpu_relax':
> ././include/linux/compiler_types.h:285:33: warning: 'asm' operand 0 probably does not match constraints
> 285 | #define asm_volatile_goto(x...) asm goto(x)
> | ^~~
> ./arch/riscv/include/asm/jump_label.h:41:9: note: in expansion of macro 'asm_volatile_goto'
> 41 | asm_volatile_goto(
> | ^~~~~~~~~~~~~~~~~
> ././include/linux/compiler_types.h:285:33: error: impossible constraint in 'asm'
> 285 | #define asm_volatile_goto(x...) asm goto(x)
> | ^~~
> ./arch/riscv/include/asm/jump_label.h:41:9: note: in expansion of macro 'asm_volatile_goto'
> 41 | asm_volatile_goto(
> | ^~~~~~~~~~~~~~~~~
> make[1]: *** [scripts/Makefile.build:249: arch/riscv/kernel/vdso/vgettimeofday.o] Error 1
> make: *** [arch/riscv/Makefile:128: vdso_prepare] Error 2
>
> Having a static branch in cpu_relax() is problematic because that
> function is widely inlined, including in some quite complex functions
> like in the VDSO. A quick measurement shows this static branch is
> responsible by itself for around 40% of the jump table.
>
> Drop the static branch, which ends up being the same number of
> instructions anyway. If Zihintpause is supported, we trade the nop from
> the static branch for a div. If Zihintpause is unsupported, we trade the
> jump from the static branch for (what gets interpreted as) a nop.
>
> Fixes: 8eb060e10185 ("arch/riscv: add Zihintpause support")
> Signed-off-by: Samuel Holland <[email protected]>
> ---
>
> arch/riscv/include/asm/hwcap.h | 3 ---
> arch/riscv/include/asm/vdso/processor.h | 25 ++++++++++---------------
> 2 files changed, 10 insertions(+), 18 deletions(-)
>
> diff --git a/arch/riscv/include/asm/hwcap.h b/arch/riscv/include/asm/hwcap.h
> index 6f59ec64175e..b21d46e68386 100644
> --- a/arch/riscv/include/asm/hwcap.h
> +++ b/arch/riscv/include/asm/hwcap.h
> @@ -68,7 +68,6 @@ enum riscv_isa_ext_id {
> */
> enum riscv_isa_ext_key {
> RISCV_ISA_EXT_KEY_FPU, /* For 'F' and 'D' */
> - RISCV_ISA_EXT_KEY_ZIHINTPAUSE,
> RISCV_ISA_EXT_KEY_MAX,
> };
>
> @@ -88,8 +87,6 @@ static __always_inline int riscv_isa_ext2key(int num)
> return RISCV_ISA_EXT_KEY_FPU;
> case RISCV_ISA_EXT_d:
> return RISCV_ISA_EXT_KEY_FPU;
> - case RISCV_ISA_EXT_ZIHINTPAUSE:
> - return RISCV_ISA_EXT_KEY_ZIHINTPAUSE;
> default:
> return -EINVAL;
> }
> diff --git a/arch/riscv/include/asm/vdso/processor.h b/arch/riscv/include/asm/vdso/processor.h
> index 1e4f8b4aef79..789bdb8211a2 100644
> --- a/arch/riscv/include/asm/vdso/processor.h
> +++ b/arch/riscv/include/asm/vdso/processor.h
> @@ -4,30 +4,25 @@
>
> #ifndef __ASSEMBLY__
>
> -#include <linux/jump_label.h>
> #include <asm/barrier.h>
> -#include <asm/hwcap.h>
>
> static inline void cpu_relax(void)
> {
> - if (!static_branch_likely(&riscv_isa_ext_keys[RISCV_ISA_EXT_KEY_ZIHINTPAUSE])) {
> #ifdef __riscv_muldiv
> - int dummy;
> - /* In lieu of a halt instruction, induce a long-latency stall. */
> - __asm__ __volatile__ ("div %0, %0, zero" : "=r" (dummy));
> + int dummy;
> + /* In lieu of a halt instruction, induce a long-latency stall. */
> + __asm__ __volatile__ ("div %0, %0, zero" : "=r" (dummy));
> #endif
> - } else {
> - /*
> - * Reduce instruction retirement.
> - * This assumes the PC changes.
> - */
> + /*
> + * Reduce instruction retirement.
> + * This assumes the PC changes.
> + */
> #ifdef __riscv_zihintpause
> - __asm__ __volatile__ ("pause");
> + __asm__ __volatile__ ("pause");
> #else
> - /* Encoding of the pause instruction */
> - __asm__ __volatile__ (".4byte 0x100000F");
> + /* Encoding of the pause instruction */
> + __asm__ __volatile__ (".4byte 0x100000F");
> #endif

hmm, though before this part of the code was only ever accessed
when the zhintpause extension was really available on the running
machine while now the pause instruction is called every time.

So I'm just wondering, can't this run into some "illegal instruction"
thingy on machines not supporting the extension?

Heiko

> - }
> barrier();
> }
>
>




2022-09-23 07:26:27

by Heiko Stübner

[permalink] [raw]
Subject: Re: [PATCH] riscv: Fix build with CONFIG_CC_OPTIMIZE_FOR_SIZE=y

Hi,

Am Donnerstag, 22. September 2022, 17:52:46 CEST schrieb Jessica Clarke:
> On 22 Sept 2022, at 16:45, Heiko Stuebner <[email protected]> wrote:
> >
> > Am Donnerstag, 22. September 2022, 08:09:58 CEST schrieb Samuel Holland:
> >> commit 8eb060e10185 ("arch/riscv: add Zihintpause support") broke
> >> building with CONFIG_CC_OPTIMIZE_FOR_SIZE enabled (gcc 11.1.0):
> >>
> >> CC arch/riscv/kernel/vdso/vgettimeofday.o
> >> In file included from <command-line>:
> >> ./arch/riscv/include/asm/jump_label.h: In function 'cpu_relax':
> >> ././include/linux/compiler_types.h:285:33: warning: 'asm' operand 0 probably does not match constraints
> >> 285 | #define asm_volatile_goto(x...) asm goto(x)
> >> | ^~~
> >> ./arch/riscv/include/asm/jump_label.h:41:9: note: in expansion of macro 'asm_volatile_goto'
> >> 41 | asm_volatile_goto(
> >> | ^~~~~~~~~~~~~~~~~
> >> ././include/linux/compiler_types.h:285:33: error: impossible constraint in 'asm'
> >> 285 | #define asm_volatile_goto(x...) asm goto(x)
> >> | ^~~
> >> ./arch/riscv/include/asm/jump_label.h:41:9: note: in expansion of macro 'asm_volatile_goto'
> >> 41 | asm_volatile_goto(
> >> | ^~~~~~~~~~~~~~~~~
> >> make[1]: *** [scripts/Makefile.build:249: arch/riscv/kernel/vdso/vgettimeofday.o] Error 1
> >> make: *** [arch/riscv/Makefile:128: vdso_prepare] Error 2
> >>
> >> Having a static branch in cpu_relax() is problematic because that
> >> function is widely inlined, including in some quite complex functions
> >> like in the VDSO. A quick measurement shows this static branch is
> >> responsible by itself for around 40% of the jump table.
> >>
> >> Drop the static branch, which ends up being the same number of
> >> instructions anyway. If Zihintpause is supported, we trade the nop from
> >> the static branch for a div. If Zihintpause is unsupported, we trade the
> >> jump from the static branch for (what gets interpreted as) a nop.
> >>
> >> Fixes: 8eb060e10185 ("arch/riscv: add Zihintpause support")
> >> Signed-off-by: Samuel Holland <[email protected]>
> >> ---
> >>
> >> arch/riscv/include/asm/hwcap.h | 3 ---
> >> arch/riscv/include/asm/vdso/processor.h | 25 ++++++++++---------------
> >> 2 files changed, 10 insertions(+), 18 deletions(-)
> >>
> >> diff --git a/arch/riscv/include/asm/hwcap.h b/arch/riscv/include/asm/hwcap.h
> >> index 6f59ec64175e..b21d46e68386 100644
> >> --- a/arch/riscv/include/asm/hwcap.h
> >> +++ b/arch/riscv/include/asm/hwcap.h
> >> @@ -68,7 +68,6 @@ enum riscv_isa_ext_id {
> >> */
> >> enum riscv_isa_ext_key {
> >> RISCV_ISA_EXT_KEY_FPU, /* For 'F' and 'D' */
> >> - RISCV_ISA_EXT_KEY_ZIHINTPAUSE,
> >> RISCV_ISA_EXT_KEY_MAX,
> >> };
> >>
> >> @@ -88,8 +87,6 @@ static __always_inline int riscv_isa_ext2key(int num)
> >> return RISCV_ISA_EXT_KEY_FPU;
> >> case RISCV_ISA_EXT_d:
> >> return RISCV_ISA_EXT_KEY_FPU;
> >> - case RISCV_ISA_EXT_ZIHINTPAUSE:
> >> - return RISCV_ISA_EXT_KEY_ZIHINTPAUSE;
> >> default:
> >> return -EINVAL;
> >> }
> >> diff --git a/arch/riscv/include/asm/vdso/processor.h b/arch/riscv/include/asm/vdso/processor.h
> >> index 1e4f8b4aef79..789bdb8211a2 100644
> >> --- a/arch/riscv/include/asm/vdso/processor.h
> >> +++ b/arch/riscv/include/asm/vdso/processor.h
> >> @@ -4,30 +4,25 @@
> >>
> >> #ifndef __ASSEMBLY__
> >>
> >> -#include <linux/jump_label.h>
> >> #include <asm/barrier.h>
> >> -#include <asm/hwcap.h>
> >>
> >> static inline void cpu_relax(void)
> >> {
> >> - if (!static_branch_likely(&riscv_isa_ext_keys[RISCV_ISA_EXT_KEY_ZIHINTPAUSE])) {
> >> #ifdef __riscv_muldiv
> >> - int dummy;
> >> - /* In lieu of a halt instruction, induce a long-latency stall. */
> >> - __asm__ __volatile__ ("div %0, %0, zero" : "=r" (dummy));
> >> + int dummy;
> >> + /* In lieu of a halt instruction, induce a long-latency stall. */
> >> + __asm__ __volatile__ ("div %0, %0, zero" : "=r" (dummy));
> >> #endif
> >> - } else {
> >> - /*
> >> - * Reduce instruction retirement.
> >> - * This assumes the PC changes.
> >> - */
> >> + /*
> >> + * Reduce instruction retirement.
> >> + * This assumes the PC changes.
> >> + */
> >> #ifdef __riscv_zihintpause
> >> - __asm__ __volatile__ ("pause");
> >> + __asm__ __volatile__ ("pause");
> >> #else
> >> - /* Encoding of the pause instruction */
> >> - __asm__ __volatile__ (".4byte 0x100000F");
> >> + /* Encoding of the pause instruction */
> >> + __asm__ __volatile__ (".4byte 0x100000F");
> >> #endif
> >
> > hmm, though before this part of the code was only ever accessed
> > when the zhintpause extension was really available on the running
> > machine while now the pause instruction is called every time.
> >
> > So I'm just wondering, can't this run into some "illegal instruction"
> > thingy on machines not supporting the extension?
>
> No. The encoding for pause was deliberately chosen to be one of the
> “useless” encodings of fence, with the hope that existing
> microarchitectures might take a while to execute it and thus it would
> still function as a slow-running instruction. It’s somewhat
> questionable whether the div is even needed, the worst that happens is
> cpu_relax isn’t very relaxed and you spin a bit faster. Any
> implementations where that’s true probably also don’t have fancy
> clock/power management anyway, and div isn’t going to be a low-power
> operation so the only real effect is likely hammering on contended
> atomics a bit more, and who cares about that on the low core count
> systems we have today.

thanks a lot for that explanation, which made things a lot clearer.

So as you said, dropping the div part might make the function even smaller,
though somehow part of me would want to add some sort of comment to
the function for when the next developer stumbles over the unconditional
use of pause :-) .


Heiko


2022-09-23 18:24:44

by Atish Patra

[permalink] [raw]
Subject: Re: [PATCH] riscv: Fix build with CONFIG_CC_OPTIMIZE_FOR_SIZE=y

On Fri, Sep 23, 2022 at 12:18 AM Heiko Stuebner <[email protected]> wrote:
>
> Hi,
>
> Am Donnerstag, 22. September 2022, 17:52:46 CEST schrieb Jessica Clarke:
> > On 22 Sept 2022, at 16:45, Heiko Stuebner <[email protected]> wrote:
> > >
> > > Am Donnerstag, 22. September 2022, 08:09:58 CEST schrieb Samuel Holland:
> > >> commit 8eb060e10185 ("arch/riscv: add Zihintpause support") broke
> > >> building with CONFIG_CC_OPTIMIZE_FOR_SIZE enabled (gcc 11.1.0):
> > >>
> > >> CC arch/riscv/kernel/vdso/vgettimeofday.o
> > >> In file included from <command-line>:
> > >> ./arch/riscv/include/asm/jump_label.h: In function 'cpu_relax':
> > >> ././include/linux/compiler_types.h:285:33: warning: 'asm' operand 0 probably does not match constraints
> > >> 285 | #define asm_volatile_goto(x...) asm goto(x)
> > >> | ^~~
> > >> ./arch/riscv/include/asm/jump_label.h:41:9: note: in expansion of macro 'asm_volatile_goto'
> > >> 41 | asm_volatile_goto(
> > >> | ^~~~~~~~~~~~~~~~~
> > >> ././include/linux/compiler_types.h:285:33: error: impossible constraint in 'asm'
> > >> 285 | #define asm_volatile_goto(x...) asm goto(x)
> > >> | ^~~
> > >> ./arch/riscv/include/asm/jump_label.h:41:9: note: in expansion of macro 'asm_volatile_goto'
> > >> 41 | asm_volatile_goto(
> > >> | ^~~~~~~~~~~~~~~~~
> > >> make[1]: *** [scripts/Makefile.build:249: arch/riscv/kernel/vdso/vgettimeofday.o] Error 1
> > >> make: *** [arch/riscv/Makefile:128: vdso_prepare] Error 2
> > >>
> > >> Having a static branch in cpu_relax() is problematic because that
> > >> function is widely inlined, including in some quite complex functions
> > >> like in the VDSO. A quick measurement shows this static branch is
> > >> responsible by itself for around 40% of the jump table.
> > >>
> > >> Drop the static branch, which ends up being the same number of
> > >> instructions anyway. If Zihintpause is supported, we trade the nop from
> > >> the static branch for a div. If Zihintpause is unsupported, we trade the
> > >> jump from the static branch for (what gets interpreted as) a nop.
> > >>
> > >> Fixes: 8eb060e10185 ("arch/riscv: add Zihintpause support")
> > >> Signed-off-by: Samuel Holland <[email protected]>
> > >> ---
> > >>
> > >> arch/riscv/include/asm/hwcap.h | 3 ---
> > >> arch/riscv/include/asm/vdso/processor.h | 25 ++++++++++---------------
> > >> 2 files changed, 10 insertions(+), 18 deletions(-)
> > >>
> > >> diff --git a/arch/riscv/include/asm/hwcap.h b/arch/riscv/include/asm/hwcap.h
> > >> index 6f59ec64175e..b21d46e68386 100644
> > >> --- a/arch/riscv/include/asm/hwcap.h
> > >> +++ b/arch/riscv/include/asm/hwcap.h
> > >> @@ -68,7 +68,6 @@ enum riscv_isa_ext_id {
> > >> */
> > >> enum riscv_isa_ext_key {
> > >> RISCV_ISA_EXT_KEY_FPU, /* For 'F' and 'D' */
> > >> - RISCV_ISA_EXT_KEY_ZIHINTPAUSE,
> > >> RISCV_ISA_EXT_KEY_MAX,
> > >> };
> > >>
> > >> @@ -88,8 +87,6 @@ static __always_inline int riscv_isa_ext2key(int num)
> > >> return RISCV_ISA_EXT_KEY_FPU;
> > >> case RISCV_ISA_EXT_d:
> > >> return RISCV_ISA_EXT_KEY_FPU;
> > >> - case RISCV_ISA_EXT_ZIHINTPAUSE:
> > >> - return RISCV_ISA_EXT_KEY_ZIHINTPAUSE;
> > >> default:
> > >> return -EINVAL;
> > >> }
> > >> diff --git a/arch/riscv/include/asm/vdso/processor.h b/arch/riscv/include/asm/vdso/processor.h
> > >> index 1e4f8b4aef79..789bdb8211a2 100644
> > >> --- a/arch/riscv/include/asm/vdso/processor.h
> > >> +++ b/arch/riscv/include/asm/vdso/processor.h
> > >> @@ -4,30 +4,25 @@
> > >>
> > >> #ifndef __ASSEMBLY__
> > >>
> > >> -#include <linux/jump_label.h>
> > >> #include <asm/barrier.h>
> > >> -#include <asm/hwcap.h>
> > >>
> > >> static inline void cpu_relax(void)
> > >> {
> > >> - if (!static_branch_likely(&riscv_isa_ext_keys[RISCV_ISA_EXT_KEY_ZIHINTPAUSE])) {
> > >> #ifdef __riscv_muldiv
> > >> - int dummy;
> > >> - /* In lieu of a halt instruction, induce a long-latency stall. */
> > >> - __asm__ __volatile__ ("div %0, %0, zero" : "=r" (dummy));
> > >> + int dummy;
> > >> + /* In lieu of a halt instruction, induce a long-latency stall. */
> > >> + __asm__ __volatile__ ("div %0, %0, zero" : "=r" (dummy));
> > >> #endif
> > >> - } else {
> > >> - /*
> > >> - * Reduce instruction retirement.
> > >> - * This assumes the PC changes.
> > >> - */
> > >> + /*
> > >> + * Reduce instruction retirement.
> > >> + * This assumes the PC changes.
> > >> + */
> > >> #ifdef __riscv_zihintpause
> > >> - __asm__ __volatile__ ("pause");
> > >> + __asm__ __volatile__ ("pause");
> > >> #else
> > >> - /* Encoding of the pause instruction */
> > >> - __asm__ __volatile__ (".4byte 0x100000F");
> > >> + /* Encoding of the pause instruction */
> > >> + __asm__ __volatile__ (".4byte 0x100000F");
> > >> #endif
> > >
> > > hmm, though before this part of the code was only ever accessed
> > > when the zhintpause extension was really available on the running
> > > machine while now the pause instruction is called every time.
> > >
> > > So I'm just wondering, can't this run into some "illegal instruction"
> > > thingy on machines not supporting the extension?
> >
> > No. The encoding for pause was deliberately chosen to be one of the
> > “useless” encodings of fence, with the hope that existing
> > microarchitectures might take a while to execute it and thus it would
> > still function as a slow-running instruction. It’s somewhat
> > questionable whether the div is even needed, the worst that happens is
> > cpu_relax isn’t very relaxed and you spin a bit faster. Any
> > implementations where that’s true probably also don’t have fancy
> > clock/power management anyway, and div isn’t going to be a low-power
> > operation so the only real effect is likely hammering on contended
> > atomics a bit more, and who cares about that on the low core count
> > systems we have today.
>
> thanks a lot for that explanation, which made things a lot clearer.
>
> So as you said, dropping the div part might make the function even smaller,
> though somehow part of me would want to add some sort of comment to
> the function for when the next developer stumbles over the unconditional
> use of pause :-) .
>

I agree. If that's what microarch will do, we can drop div altogether.
Though microarch may be treated as nop even if it is undesirable.
IIRC, the div was introduced for the rocket chip which would induce a
long latency stall with div instruction (zero as operands).

Does any other core or newer rocket chip actually induce a latency
stall with div instruction ?
If not, it is equivalent to NOP as well. We can definitely remove the div.
The only cores affected will be the older rocket core.

Tagging some folks to understand what their core does.

@Paul Walmsley @Guo Ren @Conor Dooley ?

(Please add anybody who may have an insight to execution flow on
existing Linux capable cores)

>
> Heiko
>
>
>
> _______________________________________________
> linux-riscv mailing list
> [email protected]
> http://lists.infradead.org/mailman/listinfo/linux-riscv



--
Regards,
Atish

2022-09-25 00:05:10

by Conor Dooley

[permalink] [raw]
Subject: Re: [PATCH] riscv: Fix build with CONFIG_CC_OPTIMIZE_FOR_SIZE=y

On Fri, Sep 23, 2022 at 11:01:28AM -0700, Atish Patra wrote:
> On Fri, Sep 23, 2022 at 12:18 AM Heiko Stuebner <[email protected]> wrote:
> >
> > Hi,
> >
> > Am Donnerstag, 22. September 2022, 17:52:46 CEST schrieb Jessica Clarke:
> > > On 22 Sept 2022, at 16:45, Heiko Stuebner <[email protected]> wrote:
> > > >
> > > > Am Donnerstag, 22. September 2022, 08:09:58 CEST schrieb Samuel Holland:
> > > >> commit 8eb060e10185 ("arch/riscv: add Zihintpause support") broke
> > > >> building with CONFIG_CC_OPTIMIZE_FOR_SIZE enabled (gcc 11.1.0):
> > > >>
> > > >> CC arch/riscv/kernel/vdso/vgettimeofday.o
> > > >> In file included from <command-line>:
> > > >> ./arch/riscv/include/asm/jump_label.h: In function 'cpu_relax':
> > > >> ././include/linux/compiler_types.h:285:33: warning: 'asm' operand 0 probably does not match constraints
> > > >> 285 | #define asm_volatile_goto(x...) asm goto(x)
> > > >> | ^~~
> > > >> ./arch/riscv/include/asm/jump_label.h:41:9: note: in expansion of macro 'asm_volatile_goto'
> > > >> 41 | asm_volatile_goto(
> > > >> | ^~~~~~~~~~~~~~~~~
> > > >> ././include/linux/compiler_types.h:285:33: error: impossible constraint in 'asm'
> > > >> 285 | #define asm_volatile_goto(x...) asm goto(x)
> > > >> | ^~~
> > > >> ./arch/riscv/include/asm/jump_label.h:41:9: note: in expansion of macro 'asm_volatile_goto'
> > > >> 41 | asm_volatile_goto(
> > > >> | ^~~~~~~~~~~~~~~~~
> > > >> make[1]: *** [scripts/Makefile.build:249: arch/riscv/kernel/vdso/vgettimeofday.o] Error 1
> > > >> make: *** [arch/riscv/Makefile:128: vdso_prepare] Error 2
> > > >>
> > > >> Having a static branch in cpu_relax() is problematic because that
> > > >> function is widely inlined, including in some quite complex functions
> > > >> like in the VDSO. A quick measurement shows this static branch is
> > > >> responsible by itself for around 40% of the jump table.
> > > >>
> > > >> Drop the static branch, which ends up being the same number of
> > > >> instructions anyway. If Zihintpause is supported, we trade the nop from
> > > >> the static branch for a div. If Zihintpause is unsupported, we trade the
> > > >> jump from the static branch for (what gets interpreted as) a nop.
> > > >>
> > > >> Fixes: 8eb060e10185 ("arch/riscv: add Zihintpause support")
> > > >> Signed-off-by: Samuel Holland <[email protected]>
> > > >> ---
> > > >>
> > > >> arch/riscv/include/asm/hwcap.h | 3 ---
> > > >> arch/riscv/include/asm/vdso/processor.h | 25 ++++++++++---------------
> > > >> 2 files changed, 10 insertions(+), 18 deletions(-)
> > > >>
> > > >> diff --git a/arch/riscv/include/asm/hwcap.h b/arch/riscv/include/asm/hwcap.h
> > > >> index 6f59ec64175e..b21d46e68386 100644
> > > >> --- a/arch/riscv/include/asm/hwcap.h
> > > >> +++ b/arch/riscv/include/asm/hwcap.h
> > > >> @@ -68,7 +68,6 @@ enum riscv_isa_ext_id {
> > > >> */
> > > >> enum riscv_isa_ext_key {
> > > >> RISCV_ISA_EXT_KEY_FPU, /* For 'F' and 'D' */
> > > >> - RISCV_ISA_EXT_KEY_ZIHINTPAUSE,
> > > >> RISCV_ISA_EXT_KEY_MAX,
> > > >> };
> > > >>
> > > >> @@ -88,8 +87,6 @@ static __always_inline int riscv_isa_ext2key(int num)
> > > >> return RISCV_ISA_EXT_KEY_FPU;
> > > >> case RISCV_ISA_EXT_d:
> > > >> return RISCV_ISA_EXT_KEY_FPU;
> > > >> - case RISCV_ISA_EXT_ZIHINTPAUSE:
> > > >> - return RISCV_ISA_EXT_KEY_ZIHINTPAUSE;
> > > >> default:
> > > >> return -EINVAL;
> > > >> }
> > > >> diff --git a/arch/riscv/include/asm/vdso/processor.h b/arch/riscv/include/asm/vdso/processor.h
> > > >> index 1e4f8b4aef79..789bdb8211a2 100644
> > > >> --- a/arch/riscv/include/asm/vdso/processor.h
> > > >> +++ b/arch/riscv/include/asm/vdso/processor.h
> > > >> @@ -4,30 +4,25 @@
> > > >>
> > > >> #ifndef __ASSEMBLY__
> > > >>
> > > >> -#include <linux/jump_label.h>
> > > >> #include <asm/barrier.h>
> > > >> -#include <asm/hwcap.h>
> > > >>
> > > >> static inline void cpu_relax(void)
> > > >> {
> > > >> - if (!static_branch_likely(&riscv_isa_ext_keys[RISCV_ISA_EXT_KEY_ZIHINTPAUSE])) {
> > > >> #ifdef __riscv_muldiv
> > > >> - int dummy;
> > > >> - /* In lieu of a halt instruction, induce a long-latency stall. */
> > > >> - __asm__ __volatile__ ("div %0, %0, zero" : "=r" (dummy));
> > > >> + int dummy;
> > > >> + /* In lieu of a halt instruction, induce a long-latency stall. */
> > > >> + __asm__ __volatile__ ("div %0, %0, zero" : "=r" (dummy));
> > > >> #endif
> > > >> - } else {
> > > >> - /*
> > > >> - * Reduce instruction retirement.
> > > >> - * This assumes the PC changes.
> > > >> - */
> > > >> + /*
> > > >> + * Reduce instruction retirement.
> > > >> + * This assumes the PC changes.
> > > >> + */
> > > >> #ifdef __riscv_zihintpause
> > > >> - __asm__ __volatile__ ("pause");
> > > >> + __asm__ __volatile__ ("pause");
> > > >> #else
> > > >> - /* Encoding of the pause instruction */
> > > >> - __asm__ __volatile__ (".4byte 0x100000F");
> > > >> + /* Encoding of the pause instruction */
> > > >> + __asm__ __volatile__ (".4byte 0x100000F");
> > > >> #endif
> > > >
> > > > hmm, though before this part of the code was only ever accessed
> > > > when the zhintpause extension was really available on the running
> > > > machine while now the pause instruction is called every time.
> > > >
> > > > So I'm just wondering, can't this run into some "illegal instruction"
> > > > thingy on machines not supporting the extension?
> > >
> > > No. The encoding for pause was deliberately chosen to be one of the
> > > “useless” encodings of fence, with the hope that existing
> > > microarchitectures might take a while to execute it and thus it would
> > > still function as a slow-running instruction. It’s somewhat
> > > questionable whether the div is even needed, the worst that happens is
> > > cpu_relax isn’t very relaxed and you spin a bit faster. Any
> > > implementations where that’s true probably also don’t have fancy
> > > clock/power management anyway, and div isn’t going to be a low-power
> > > operation so the only real effect is likely hammering on contended
> > > atomics a bit more, and who cares about that on the low core count
> > > systems we have today.
> >
> > thanks a lot for that explanation, which made things a lot clearer.
> >
> > So as you said, dropping the div part might make the function even smaller,
> > though somehow part of me would want to add some sort of comment to
> > the function for when the next developer stumbles over the unconditional
> > use of pause :-) .
> >
>
> I agree. If that's what microarch will do, we can drop div altogether.
> Though microarch may be treated as nop even if it is undesirable.
> IIRC, the div was introduced for the rocket chip which would induce a
> long latency stall with div instruction (zero as operands).
>
> Does any other core or newer rocket chip actually induce a latency
> stall with div instruction ?
> If not, it is equivalent to NOP as well. We can definitely remove the div.
> The only cores affected will be the older rocket core.
>
> Tagging some folks to understand what their core does.
>
> @Paul Walmsley @Guo Ren @Conor Dooley ?

I am no microarch expert by _any_ stretch of the imagination, but
from a quick experiment it looks like the u54s on PolarFire SoC behave
in the same way, and div w/ zero operands does in fact take significantly
longer than regular division (looks to be about 3x).

Hope that's helpful,
Conor.

(I just did a quick check of what pretty much amounted to a bunch of
div a5,zero,zero in a row versus div a5,a5,a5)

>
> (Please add anybody who may have an insight to execution flow on
> existing Linux capable cores)
>
> >
> > Heiko
> >
> >
> >
> > _______________________________________________
> > linux-riscv mailing list
> > [email protected]
> > http://lists.infradead.org/mailman/listinfo/linux-riscv
>
>
>
> --
> Regards,
> Atish
>
> _______________________________________________
> linux-riscv mailing list
> [email protected]
> http://lists.infradead.org/mailman/listinfo/linux-riscv

2022-09-28 08:35:59

by Atish Patra

[permalink] [raw]
Subject: Re: [PATCH] riscv: Fix build with CONFIG_CC_OPTIMIZE_FOR_SIZE=y

On Sat, Sep 24, 2022 at 4:15 PM Conor Dooley <[email protected]> wrote:
>
> On Fri, Sep 23, 2022 at 11:01:28AM -0700, Atish Patra wrote:
> > On Fri, Sep 23, 2022 at 12:18 AM Heiko Stuebner <[email protected]> wrote:
> > >
> > > Hi,
> > >
> > > Am Donnerstag, 22. September 2022, 17:52:46 CEST schrieb Jessica Clarke:
> > > > On 22 Sept 2022, at 16:45, Heiko Stuebner <[email protected]> wrote:
> > > > >
> > > > > Am Donnerstag, 22. September 2022, 08:09:58 CEST schrieb Samuel Holland:
> > > > >> commit 8eb060e10185 ("arch/riscv: add Zihintpause support") broke
> > > > >> building with CONFIG_CC_OPTIMIZE_FOR_SIZE enabled (gcc 11.1.0):
> > > > >>
> > > > >> CC arch/riscv/kernel/vdso/vgettimeofday.o
> > > > >> In file included from <command-line>:
> > > > >> ./arch/riscv/include/asm/jump_label.h: In function 'cpu_relax':
> > > > >> ././include/linux/compiler_types.h:285:33: warning: 'asm' operand 0 probably does not match constraints
> > > > >> 285 | #define asm_volatile_goto(x...) asm goto(x)
> > > > >> | ^~~
> > > > >> ./arch/riscv/include/asm/jump_label.h:41:9: note: in expansion of macro 'asm_volatile_goto'
> > > > >> 41 | asm_volatile_goto(
> > > > >> | ^~~~~~~~~~~~~~~~~
> > > > >> ././include/linux/compiler_types.h:285:33: error: impossible constraint in 'asm'
> > > > >> 285 | #define asm_volatile_goto(x...) asm goto(x)
> > > > >> | ^~~
> > > > >> ./arch/riscv/include/asm/jump_label.h:41:9: note: in expansion of macro 'asm_volatile_goto'
> > > > >> 41 | asm_volatile_goto(
> > > > >> | ^~~~~~~~~~~~~~~~~
> > > > >> make[1]: *** [scripts/Makefile.build:249: arch/riscv/kernel/vdso/vgettimeofday.o] Error 1
> > > > >> make: *** [arch/riscv/Makefile:128: vdso_prepare] Error 2
> > > > >>
> > > > >> Having a static branch in cpu_relax() is problematic because that
> > > > >> function is widely inlined, including in some quite complex functions
> > > > >> like in the VDSO. A quick measurement shows this static branch is
> > > > >> responsible by itself for around 40% of the jump table.
> > > > >>
> > > > >> Drop the static branch, which ends up being the same number of
> > > > >> instructions anyway. If Zihintpause is supported, we trade the nop from
> > > > >> the static branch for a div. If Zihintpause is unsupported, we trade the
> > > > >> jump from the static branch for (what gets interpreted as) a nop.
> > > > >>
> > > > >> Fixes: 8eb060e10185 ("arch/riscv: add Zihintpause support")
> > > > >> Signed-off-by: Samuel Holland <[email protected]>
> > > > >> ---
> > > > >>
> > > > >> arch/riscv/include/asm/hwcap.h | 3 ---
> > > > >> arch/riscv/include/asm/vdso/processor.h | 25 ++++++++++---------------
> > > > >> 2 files changed, 10 insertions(+), 18 deletions(-)
> > > > >>
> > > > >> diff --git a/arch/riscv/include/asm/hwcap.h b/arch/riscv/include/asm/hwcap.h
> > > > >> index 6f59ec64175e..b21d46e68386 100644
> > > > >> --- a/arch/riscv/include/asm/hwcap.h
> > > > >> +++ b/arch/riscv/include/asm/hwcap.h
> > > > >> @@ -68,7 +68,6 @@ enum riscv_isa_ext_id {
> > > > >> */
> > > > >> enum riscv_isa_ext_key {
> > > > >> RISCV_ISA_EXT_KEY_FPU, /* For 'F' and 'D' */
> > > > >> - RISCV_ISA_EXT_KEY_ZIHINTPAUSE,
> > > > >> RISCV_ISA_EXT_KEY_MAX,
> > > > >> };
> > > > >>
> > > > >> @@ -88,8 +87,6 @@ static __always_inline int riscv_isa_ext2key(int num)
> > > > >> return RISCV_ISA_EXT_KEY_FPU;
> > > > >> case RISCV_ISA_EXT_d:
> > > > >> return RISCV_ISA_EXT_KEY_FPU;
> > > > >> - case RISCV_ISA_EXT_ZIHINTPAUSE:
> > > > >> - return RISCV_ISA_EXT_KEY_ZIHINTPAUSE;
> > > > >> default:
> > > > >> return -EINVAL;
> > > > >> }
> > > > >> diff --git a/arch/riscv/include/asm/vdso/processor.h b/arch/riscv/include/asm/vdso/processor.h
> > > > >> index 1e4f8b4aef79..789bdb8211a2 100644
> > > > >> --- a/arch/riscv/include/asm/vdso/processor.h
> > > > >> +++ b/arch/riscv/include/asm/vdso/processor.h
> > > > >> @@ -4,30 +4,25 @@
> > > > >>
> > > > >> #ifndef __ASSEMBLY__
> > > > >>
> > > > >> -#include <linux/jump_label.h>
> > > > >> #include <asm/barrier.h>
> > > > >> -#include <asm/hwcap.h>
> > > > >>
> > > > >> static inline void cpu_relax(void)
> > > > >> {
> > > > >> - if (!static_branch_likely(&riscv_isa_ext_keys[RISCV_ISA_EXT_KEY_ZIHINTPAUSE])) {
> > > > >> #ifdef __riscv_muldiv
> > > > >> - int dummy;
> > > > >> - /* In lieu of a halt instruction, induce a long-latency stall. */
> > > > >> - __asm__ __volatile__ ("div %0, %0, zero" : "=r" (dummy));
> > > > >> + int dummy;
> > > > >> + /* In lieu of a halt instruction, induce a long-latency stall. */
> > > > >> + __asm__ __volatile__ ("div %0, %0, zero" : "=r" (dummy));
> > > > >> #endif
> > > > >> - } else {
> > > > >> - /*
> > > > >> - * Reduce instruction retirement.
> > > > >> - * This assumes the PC changes.
> > > > >> - */
> > > > >> + /*
> > > > >> + * Reduce instruction retirement.
> > > > >> + * This assumes the PC changes.
> > > > >> + */
> > > > >> #ifdef __riscv_zihintpause
> > > > >> - __asm__ __volatile__ ("pause");
> > > > >> + __asm__ __volatile__ ("pause");
> > > > >> #else
> > > > >> - /* Encoding of the pause instruction */
> > > > >> - __asm__ __volatile__ (".4byte 0x100000F");
> > > > >> + /* Encoding of the pause instruction */
> > > > >> + __asm__ __volatile__ (".4byte 0x100000F");
> > > > >> #endif
> > > > >
> > > > > hmm, though before this part of the code was only ever accessed
> > > > > when the zhintpause extension was really available on the running
> > > > > machine while now the pause instruction is called every time.
> > > > >
> > > > > So I'm just wondering, can't this run into some "illegal instruction"
> > > > > thingy on machines not supporting the extension?
> > > >
> > > > No. The encoding for pause was deliberately chosen to be one of the
> > > > “useless” encodings of fence, with the hope that existing
> > > > microarchitectures might take a while to execute it and thus it would
> > > > still function as a slow-running instruction. It’s somewhat
> > > > questionable whether the div is even needed, the worst that happens is
> > > > cpu_relax isn’t very relaxed and you spin a bit faster. Any
> > > > implementations where that’s true probably also don’t have fancy
> > > > clock/power management anyway, and div isn’t going to be a low-power
> > > > operation so the only real effect is likely hammering on contended
> > > > atomics a bit more, and who cares about that on the low core count
> > > > systems we have today.
> > >
> > > thanks a lot for that explanation, which made things a lot clearer.
> > >
> > > So as you said, dropping the div part might make the function even smaller,
> > > though somehow part of me would want to add some sort of comment to
> > > the function for when the next developer stumbles over the unconditional
> > > use of pause :-) .
> > >
> >
> > I agree. If that's what microarch will do, we can drop div altogether.
> > Though microarch may be treated as nop even if it is undesirable.
> > IIRC, the div was introduced for the rocket chip which would induce a
> > long latency stall with div instruction (zero as operands).
> >
> > Does any other core or newer rocket chip actually induce a latency
> > stall with div instruction ?
> > If not, it is equivalent to NOP as well. We can definitely remove the div.
> > The only cores affected will be the older rocket core.
> >
> > Tagging some folks to understand what their core does.
> >
> > @Paul Walmsley @Guo Ren @Conor Dooley ?
>
> I am no microarch expert by _any_ stretch of the imagination, but
> from a quick experiment it looks like the u54s on PolarFire SoC behave
> in the same way, and div w/ zero operands does in fact take significantly
> longer than regular division (looks to be about 3x).
>

Thanks. Do you have any data on how much the "pause" instruction takes.
I understand that it is not available in these cores. Just wanted to
understand if microarchitecture
actually takes a while executing the useless encoding as pointed out by Jessica.

If that's the case, we can remove the div instruction altogether.
Otherwise, this patch will cause some performance regression
for existing SoC (HiFive unleashed has the same core. Not sure about
unmatched though).
This needs to be documented at least.

> Hope that's helpful,
> Conor.
>
> (I just did a quick check of what pretty much amounted to a bunch of
> div a5,zero,zero in a row versus div a5,a5,a5)
>
> >
> > (Please add anybody who may have an insight to execution flow on
> > existing Linux capable cores)
> >
> > >
> > > Heiko
> > >
> > >
> > >
> > > _______________________________________________
> > > linux-riscv mailing list
> > > [email protected]
> > > http://lists.infradead.org/mailman/listinfo/linux-riscv
> >
> >
> >
> > --
> > Regards,
> > Atish
> >
> > _______________________________________________
> > linux-riscv mailing list
> > [email protected]
> > http://lists.infradead.org/mailman/listinfo/linux-riscv



--
Regards,
Atish

2022-09-28 21:56:37

by Conor Dooley

[permalink] [raw]
Subject: Re: [PATCH] riscv: Fix build with CONFIG_CC_OPTIMIZE_FOR_SIZE=y

On Wed, Sep 28, 2022 at 12:21:55AM -0700, Atish Patra wrote:
> On Sat, Sep 24, 2022 at 4:15 PM Conor Dooley <[email protected]> wrote:
> >
> > On Fri, Sep 23, 2022 at 11:01:28AM -0700, Atish Patra wrote:
> > > On Fri, Sep 23, 2022 at 12:18 AM Heiko Stuebner <[email protected]> wrote:
> > > >
> > > > Hi,
> > > >
> > > > Am Donnerstag, 22. September 2022, 17:52:46 CEST schrieb Jessica Clarke:
> > > > > On 22 Sept 2022, at 16:45, Heiko Stuebner <[email protected]> wrote:
> > > > > >
> > > > > > Am Donnerstag, 22. September 2022, 08:09:58 CEST schrieb Samuel Holland:
> > > > > >> commit 8eb060e10185 ("arch/riscv: add Zihintpause support") broke
> > > > > >> building with CONFIG_CC_OPTIMIZE_FOR_SIZE enabled (gcc 11.1.0):
> > > > > >>
> > > > > >> CC arch/riscv/kernel/vdso/vgettimeofday.o
> > > > > >> In file included from <command-line>:
> > > > > >> ./arch/riscv/include/asm/jump_label.h: In function 'cpu_relax':
> > > > > >> ././include/linux/compiler_types.h:285:33: warning: 'asm' operand 0 probably does not match constraints
> > > > > >> 285 | #define asm_volatile_goto(x...) asm goto(x)
> > > > > >> | ^~~
> > > > > >> ./arch/riscv/include/asm/jump_label.h:41:9: note: in expansion of macro 'asm_volatile_goto'
> > > > > >> 41 | asm_volatile_goto(
> > > > > >> | ^~~~~~~~~~~~~~~~~
> > > > > >> ././include/linux/compiler_types.h:285:33: error: impossible constraint in 'asm'
> > > > > >> 285 | #define asm_volatile_goto(x...) asm goto(x)
> > > > > >> | ^~~
> > > > > >> ./arch/riscv/include/asm/jump_label.h:41:9: note: in expansion of macro 'asm_volatile_goto'
> > > > > >> 41 | asm_volatile_goto(
> > > > > >> | ^~~~~~~~~~~~~~~~~
> > > > > >> make[1]: *** [scripts/Makefile.build:249: arch/riscv/kernel/vdso/vgettimeofday.o] Error 1
> > > > > >> make: *** [arch/riscv/Makefile:128: vdso_prepare] Error 2
> > > > > >>
> > > > > >> Having a static branch in cpu_relax() is problematic because that
> > > > > >> function is widely inlined, including in some quite complex functions
> > > > > >> like in the VDSO. A quick measurement shows this static branch is
> > > > > >> responsible by itself for around 40% of the jump table.
> > > > > >>
> > > > > >> Drop the static branch, which ends up being the same number of
> > > > > >> instructions anyway. If Zihintpause is supported, we trade the nop from
> > > > > >> the static branch for a div. If Zihintpause is unsupported, we trade the
> > > > > >> jump from the static branch for (what gets interpreted as) a nop.
> > > > > >>
> > > > > >> Fixes: 8eb060e10185 ("arch/riscv: add Zihintpause support")
> > > > > >> Signed-off-by: Samuel Holland <[email protected]>
> > > > > >> ---
> > > > > >>
> > > > > >> arch/riscv/include/asm/hwcap.h | 3 ---
> > > > > >> arch/riscv/include/asm/vdso/processor.h | 25 ++++++++++---------------
> > > > > >> 2 files changed, 10 insertions(+), 18 deletions(-)
> > > > > >>
> > > > > >> diff --git a/arch/riscv/include/asm/hwcap.h b/arch/riscv/include/asm/hwcap.h
> > > > > >> index 6f59ec64175e..b21d46e68386 100644
> > > > > >> --- a/arch/riscv/include/asm/hwcap.h
> > > > > >> +++ b/arch/riscv/include/asm/hwcap.h
> > > > > >> @@ -68,7 +68,6 @@ enum riscv_isa_ext_id {
> > > > > >> */
> > > > > >> enum riscv_isa_ext_key {
> > > > > >> RISCV_ISA_EXT_KEY_FPU, /* For 'F' and 'D' */
> > > > > >> - RISCV_ISA_EXT_KEY_ZIHINTPAUSE,
> > > > > >> RISCV_ISA_EXT_KEY_MAX,
> > > > > >> };
> > > > > >>
> > > > > >> @@ -88,8 +87,6 @@ static __always_inline int riscv_isa_ext2key(int num)
> > > > > >> return RISCV_ISA_EXT_KEY_FPU;
> > > > > >> case RISCV_ISA_EXT_d:
> > > > > >> return RISCV_ISA_EXT_KEY_FPU;
> > > > > >> - case RISCV_ISA_EXT_ZIHINTPAUSE:
> > > > > >> - return RISCV_ISA_EXT_KEY_ZIHINTPAUSE;
> > > > > >> default:
> > > > > >> return -EINVAL;
> > > > > >> }
> > > > > >> diff --git a/arch/riscv/include/asm/vdso/processor.h b/arch/riscv/include/asm/vdso/processor.h
> > > > > >> index 1e4f8b4aef79..789bdb8211a2 100644
> > > > > >> --- a/arch/riscv/include/asm/vdso/processor.h
> > > > > >> +++ b/arch/riscv/include/asm/vdso/processor.h
> > > > > >> @@ -4,30 +4,25 @@
> > > > > >>
> > > > > >> #ifndef __ASSEMBLY__
> > > > > >>
> > > > > >> -#include <linux/jump_label.h>
> > > > > >> #include <asm/barrier.h>
> > > > > >> -#include <asm/hwcap.h>
> > > > > >>
> > > > > >> static inline void cpu_relax(void)
> > > > > >> {
> > > > > >> - if (!static_branch_likely(&riscv_isa_ext_keys[RISCV_ISA_EXT_KEY_ZIHINTPAUSE])) {
> > > > > >> #ifdef __riscv_muldiv
> > > > > >> - int dummy;
> > > > > >> - /* In lieu of a halt instruction, induce a long-latency stall. */
> > > > > >> - __asm__ __volatile__ ("div %0, %0, zero" : "=r" (dummy));
> > > > > >> + int dummy;
> > > > > >> + /* In lieu of a halt instruction, induce a long-latency stall. */
> > > > > >> + __asm__ __volatile__ ("div %0, %0, zero" : "=r" (dummy));
> > > > > >> #endif
> > > > > >> - } else {
> > > > > >> - /*
> > > > > >> - * Reduce instruction retirement.
> > > > > >> - * This assumes the PC changes.
> > > > > >> - */
> > > > > >> + /*
> > > > > >> + * Reduce instruction retirement.
> > > > > >> + * This assumes the PC changes.
> > > > > >> + */
> > > > > >> #ifdef __riscv_zihintpause
> > > > > >> - __asm__ __volatile__ ("pause");
> > > > > >> + __asm__ __volatile__ ("pause");
> > > > > >> #else
> > > > > >> - /* Encoding of the pause instruction */
> > > > > >> - __asm__ __volatile__ (".4byte 0x100000F");
> > > > > >> + /* Encoding of the pause instruction */
> > > > > >> + __asm__ __volatile__ (".4byte 0x100000F");
> > > > > >> #endif
> > > > > >
> > > > > > hmm, though before this part of the code was only ever accessed
> > > > > > when the zhintpause extension was really available on the running
> > > > > > machine while now the pause instruction is called every time.
> > > > > >
> > > > > > So I'm just wondering, can't this run into some "illegal instruction"
> > > > > > thingy on machines not supporting the extension?
> > > > >
> > > > > No. The encoding for pause was deliberately chosen to be one of the
> > > > > “useless” encodings of fence, with the hope that existing
> > > > > microarchitectures might take a while to execute it and thus it would
> > > > > still function as a slow-running instruction. It’s somewhat
> > > > > questionable whether the div is even needed, the worst that happens is
> > > > > cpu_relax isn’t very relaxed and you spin a bit faster. Any
> > > > > implementations where that’s true probably also don’t have fancy
> > > > > clock/power management anyway, and div isn’t going to be a low-power
> > > > > operation so the only real effect is likely hammering on contended
> > > > > atomics a bit more, and who cares about that on the low core count
> > > > > systems we have today.
> > > >
> > > > thanks a lot for that explanation, which made things a lot clearer.
> > > >
> > > > So as you said, dropping the div part might make the function even smaller,
> > > > though somehow part of me would want to add some sort of comment to
> > > > the function for when the next developer stumbles over the unconditional
> > > > use of pause :-) .
> > > >
> > >
> > > I agree. If that's what microarch will do, we can drop div altogether.
> > > Though microarch may be treated as nop even if it is undesirable.
> > > IIRC, the div was introduced for the rocket chip which would induce a
> > > long latency stall with div instruction (zero as operands).
> > >
> > > Does any other core or newer rocket chip actually induce a latency
> > > stall with div instruction ?
> > > If not, it is equivalent to NOP as well. We can definitely remove the div.
> > > The only cores affected will be the older rocket core.
> > >
> > > Tagging some folks to understand what their core does.
> > >
> > > @Paul Walmsley @Guo Ren @Conor Dooley ?
> >
> > I am no microarch expert by _any_ stretch of the imagination, but
> > from a quick experiment it looks like the u54s on PolarFire SoC behave
> > in the same way, and div w/ zero operands does in fact take significantly
> > longer than regular division (looks to be about 3x).
> >
>
> Thanks. Do you have any data on how much the "pause" instruction takes.

So these numbers you may consider as being pulled out of a magic hat
as all I am doing is reading the counters from userspace and there is
some variance etc. Plus the fact that I just started hacking at some
existing code I had lying around as I'm pretty snowed under at the
moment.

Doing the following takes about 70 cycles on both a PolarFire SoC and an
unmatched:
long divisor = 2, dividend = 100000, dest;
asm("div %0, zero, zero" : "=r" (dest));
and equates to:
sd a5,-48(s0)
div a5,zero,zero

Clocking in at about 40 cycles is some actual divisions, I just did the
following a dozen times, doing a trivial computation:
long divisor = 2, dividend = 100000, dest;
asm("div %0, %1, %2" : "=r" (dividend) : "r" (dividend), "r" (divisor))

ie, a load of the following:
sd a5,-48(s0)
ld a5,-48(s0)
ld a4,-40(s0)
div a5,a5,a4

So clearly the div w/ zero args makes a difference...

On PolarFire SoC, `0x100000F` takes approx 6 cycles. On my unmatched, it
takes approx 40. Again, I just had an asm block & called the instruction
a number times and took the average - here it was 48 times.

Take the actual numbers with a fist full of salt, but at least the
relative numbers should be of some use to you.

Hope that's somewhat helpful, maybe next week I can do something a
little more useful for you...

Conor.

> I understand that it is not available in these cores. Just wanted to
> understand if microarchitecture
> actually takes a while executing the useless encoding as pointed out by Jessica.
>
> If that's the case, we can remove the div instruction altogether.
> Otherwise, this patch will cause some performance regression
> for existing SoC (HiFive unleashed has the same core. Not sure about
> unmatched though).
> This needs to be documented at least.
>
> > Hope that's helpful,
> > Conor.
> >
> > (I just did a quick check of what pretty much amounted to a bunch of
> > div a5,zero,zero in a row versus div a5,a5,a5)
> >
> > >
> > > (Please add anybody who may have an insight to execution flow on
> > > existing Linux capable cores)
> > >
> > > >
> > > > Heiko
> > > >
> > > >
> > > >
> > > > _______________________________________________
> > > > linux-riscv mailing list
> > > > [email protected]
> > > > http://lists.infradead.org/mailman/listinfo/linux-riscv
> > >
> > >
> > >
> > > --
> > > Regards,
> > > Atish
> > >
> > > _______________________________________________
> > > linux-riscv mailing list
> > > [email protected]
> > > http://lists.infradead.org/mailman/listinfo/linux-riscv
>
>
>
> --
> Regards,
> Atish
>
> _______________________________________________
> linux-riscv mailing list
> [email protected]
> http://lists.infradead.org/mailman/listinfo/linux-riscv

2022-09-29 04:07:00

by Atish Patra

[permalink] [raw]
Subject: Re: [PATCH] riscv: Fix build with CONFIG_CC_OPTIMIZE_FOR_SIZE=y

On Wed, Sep 28, 2022 at 2:16 PM Conor Dooley <[email protected]> wrote:
>
> On Wed, Sep 28, 2022 at 12:21:55AM -0700, Atish Patra wrote:
> > On Sat, Sep 24, 2022 at 4:15 PM Conor Dooley <[email protected]> wrote:
> > >
> > > On Fri, Sep 23, 2022 at 11:01:28AM -0700, Atish Patra wrote:
> > > > On Fri, Sep 23, 2022 at 12:18 AM Heiko Stuebner <[email protected]> wrote:
> > > > >
> > > > > Hi,
> > > > >
> > > > > Am Donnerstag, 22. September 2022, 17:52:46 CEST schrieb Jessica Clarke:
> > > > > > On 22 Sept 2022, at 16:45, Heiko Stuebner <[email protected]> wrote:
> > > > > > >
> > > > > > > Am Donnerstag, 22. September 2022, 08:09:58 CEST schrieb Samuel Holland:
> > > > > > >> commit 8eb060e10185 ("arch/riscv: add Zihintpause support") broke
> > > > > > >> building with CONFIG_CC_OPTIMIZE_FOR_SIZE enabled (gcc 11.1.0):
> > > > > > >>
> > > > > > >> CC arch/riscv/kernel/vdso/vgettimeofday.o
> > > > > > >> In file included from <command-line>:
> > > > > > >> ./arch/riscv/include/asm/jump_label.h: In function 'cpu_relax':
> > > > > > >> ././include/linux/compiler_types.h:285:33: warning: 'asm' operand 0 probably does not match constraints
> > > > > > >> 285 | #define asm_volatile_goto(x...) asm goto(x)
> > > > > > >> | ^~~
> > > > > > >> ./arch/riscv/include/asm/jump_label.h:41:9: note: in expansion of macro 'asm_volatile_goto'
> > > > > > >> 41 | asm_volatile_goto(
> > > > > > >> | ^~~~~~~~~~~~~~~~~
> > > > > > >> ././include/linux/compiler_types.h:285:33: error: impossible constraint in 'asm'
> > > > > > >> 285 | #define asm_volatile_goto(x...) asm goto(x)
> > > > > > >> | ^~~
> > > > > > >> ./arch/riscv/include/asm/jump_label.h:41:9: note: in expansion of macro 'asm_volatile_goto'
> > > > > > >> 41 | asm_volatile_goto(
> > > > > > >> | ^~~~~~~~~~~~~~~~~
> > > > > > >> make[1]: *** [scripts/Makefile.build:249: arch/riscv/kernel/vdso/vgettimeofday.o] Error 1
> > > > > > >> make: *** [arch/riscv/Makefile:128: vdso_prepare] Error 2
> > > > > > >>
> > > > > > >> Having a static branch in cpu_relax() is problematic because that
> > > > > > >> function is widely inlined, including in some quite complex functions
> > > > > > >> like in the VDSO. A quick measurement shows this static branch is
> > > > > > >> responsible by itself for around 40% of the jump table.
> > > > > > >>
> > > > > > >> Drop the static branch, which ends up being the same number of
> > > > > > >> instructions anyway. If Zihintpause is supported, we trade the nop from
> > > > > > >> the static branch for a div. If Zihintpause is unsupported, we trade the
> > > > > > >> jump from the static branch for (what gets interpreted as) a nop.
> > > > > > >>
> > > > > > >> Fixes: 8eb060e10185 ("arch/riscv: add Zihintpause support")
> > > > > > >> Signed-off-by: Samuel Holland <[email protected]>
> > > > > > >> ---
> > > > > > >>
> > > > > > >> arch/riscv/include/asm/hwcap.h | 3 ---
> > > > > > >> arch/riscv/include/asm/vdso/processor.h | 25 ++++++++++---------------
> > > > > > >> 2 files changed, 10 insertions(+), 18 deletions(-)
> > > > > > >>
> > > > > > >> diff --git a/arch/riscv/include/asm/hwcap.h b/arch/riscv/include/asm/hwcap.h
> > > > > > >> index 6f59ec64175e..b21d46e68386 100644
> > > > > > >> --- a/arch/riscv/include/asm/hwcap.h
> > > > > > >> +++ b/arch/riscv/include/asm/hwcap.h
> > > > > > >> @@ -68,7 +68,6 @@ enum riscv_isa_ext_id {
> > > > > > >> */
> > > > > > >> enum riscv_isa_ext_key {
> > > > > > >> RISCV_ISA_EXT_KEY_FPU, /* For 'F' and 'D' */
> > > > > > >> - RISCV_ISA_EXT_KEY_ZIHINTPAUSE,
> > > > > > >> RISCV_ISA_EXT_KEY_MAX,
> > > > > > >> };
> > > > > > >>
> > > > > > >> @@ -88,8 +87,6 @@ static __always_inline int riscv_isa_ext2key(int num)
> > > > > > >> return RISCV_ISA_EXT_KEY_FPU;
> > > > > > >> case RISCV_ISA_EXT_d:
> > > > > > >> return RISCV_ISA_EXT_KEY_FPU;
> > > > > > >> - case RISCV_ISA_EXT_ZIHINTPAUSE:
> > > > > > >> - return RISCV_ISA_EXT_KEY_ZIHINTPAUSE;
> > > > > > >> default:
> > > > > > >> return -EINVAL;
> > > > > > >> }
> > > > > > >> diff --git a/arch/riscv/include/asm/vdso/processor.h b/arch/riscv/include/asm/vdso/processor.h
> > > > > > >> index 1e4f8b4aef79..789bdb8211a2 100644
> > > > > > >> --- a/arch/riscv/include/asm/vdso/processor.h
> > > > > > >> +++ b/arch/riscv/include/asm/vdso/processor.h
> > > > > > >> @@ -4,30 +4,25 @@
> > > > > > >>
> > > > > > >> #ifndef __ASSEMBLY__
> > > > > > >>
> > > > > > >> -#include <linux/jump_label.h>
> > > > > > >> #include <asm/barrier.h>
> > > > > > >> -#include <asm/hwcap.h>
> > > > > > >>
> > > > > > >> static inline void cpu_relax(void)
> > > > > > >> {
> > > > > > >> - if (!static_branch_likely(&riscv_isa_ext_keys[RISCV_ISA_EXT_KEY_ZIHINTPAUSE])) {
> > > > > > >> #ifdef __riscv_muldiv
> > > > > > >> - int dummy;
> > > > > > >> - /* In lieu of a halt instruction, induce a long-latency stall. */
> > > > > > >> - __asm__ __volatile__ ("div %0, %0, zero" : "=r" (dummy));
> > > > > > >> + int dummy;
> > > > > > >> + /* In lieu of a halt instruction, induce a long-latency stall. */
> > > > > > >> + __asm__ __volatile__ ("div %0, %0, zero" : "=r" (dummy));
> > > > > > >> #endif
> > > > > > >> - } else {
> > > > > > >> - /*
> > > > > > >> - * Reduce instruction retirement.
> > > > > > >> - * This assumes the PC changes.
> > > > > > >> - */
> > > > > > >> + /*
> > > > > > >> + * Reduce instruction retirement.
> > > > > > >> + * This assumes the PC changes.
> > > > > > >> + */
> > > > > > >> #ifdef __riscv_zihintpause
> > > > > > >> - __asm__ __volatile__ ("pause");
> > > > > > >> + __asm__ __volatile__ ("pause");
> > > > > > >> #else
> > > > > > >> - /* Encoding of the pause instruction */
> > > > > > >> - __asm__ __volatile__ (".4byte 0x100000F");
> > > > > > >> + /* Encoding of the pause instruction */
> > > > > > >> + __asm__ __volatile__ (".4byte 0x100000F");
> > > > > > >> #endif
> > > > > > >
> > > > > > > hmm, though before this part of the code was only ever accessed
> > > > > > > when the zhintpause extension was really available on the running
> > > > > > > machine while now the pause instruction is called every time.
> > > > > > >
> > > > > > > So I'm just wondering, can't this run into some "illegal instruction"
> > > > > > > thingy on machines not supporting the extension?
> > > > > >
> > > > > > No. The encoding for pause was deliberately chosen to be one of the
> > > > > > “useless” encodings of fence, with the hope that existing
> > > > > > microarchitectures might take a while to execute it and thus it would
> > > > > > still function as a slow-running instruction. It’s somewhat
> > > > > > questionable whether the div is even needed, the worst that happens is
> > > > > > cpu_relax isn’t very relaxed and you spin a bit faster. Any
> > > > > > implementations where that’s true probably also don’t have fancy
> > > > > > clock/power management anyway, and div isn’t going to be a low-power
> > > > > > operation so the only real effect is likely hammering on contended
> > > > > > atomics a bit more, and who cares about that on the low core count
> > > > > > systems we have today.
> > > > >
> > > > > thanks a lot for that explanation, which made things a lot clearer.
> > > > >
> > > > > So as you said, dropping the div part might make the function even smaller,
> > > > > though somehow part of me would want to add some sort of comment to
> > > > > the function for when the next developer stumbles over the unconditional
> > > > > use of pause :-) .
> > > > >
> > > >
> > > > I agree. If that's what microarch will do, we can drop div altogether.
> > > > Though microarch may be treated as nop even if it is undesirable.
> > > > IIRC, the div was introduced for the rocket chip which would induce a
> > > > long latency stall with div instruction (zero as operands).
> > > >
> > > > Does any other core or newer rocket chip actually induce a latency
> > > > stall with div instruction ?
> > > > If not, it is equivalent to NOP as well. We can definitely remove the div.
> > > > The only cores affected will be the older rocket core.
> > > >
> > > > Tagging some folks to understand what their core does.
> > > >
> > > > @Paul Walmsley @Guo Ren @Conor Dooley ?
> > >
> > > I am no microarch expert by _any_ stretch of the imagination, but
> > > from a quick experiment it looks like the u54s on PolarFire SoC behave
> > > in the same way, and div w/ zero operands does in fact take significantly
> > > longer than regular division (looks to be about 3x).
> > >
> >
> > Thanks. Do you have any data on how much the "pause" instruction takes.
>
> So these numbers you may consider as being pulled out of a magic hat
> as all I am doing is reading the counters from userspace and there is
> some variance etc. Plus the fact that I just started hacking at some
> existing code I had lying around as I'm pretty snowed under at the
> moment.
>
> Doing the following takes about 70 cycles on both a PolarFire SoC and an
> unmatched:
> long divisor = 2, dividend = 100000, dest;
> asm("div %0, zero, zero" : "=r" (dest));
> and equates to:
> sd a5,-48(s0)
> div a5,zero,zero
>
> Clocking in at about 40 cycles is some actual divisions, I just did the
> following a dozen times, doing a trivial computation:
> long divisor = 2, dividend = 100000, dest;
> asm("div %0, %1, %2" : "=r" (dividend) : "r" (dividend), "r" (divisor))
>
> ie, a load of the following:
> sd a5,-48(s0)
> ld a5,-48(s0)
> ld a4,-40(s0)
> div a5,a5,a4
>
> So clearly the div w/ zero args makes a difference...
>
> On PolarFire SoC, `0x100000F` takes approx 6 cycles. On my unmatched, it
> takes approx 40. Again, I just had an asm block & called the instruction
> a number times and took the average - here it was 48 times.
>
> Take the actual numbers with a fist full of salt, but at least the
> relative numbers should be of some use to you.
>
> Hope that's somewhat helpful, maybe next week I can do something a
> little more useful for you...
>

Thanks. It would be good to understand what happens when "pause" is
executed on these boards ?

> Conor.
>
> > I understand that it is not available in these cores. Just wanted to
> > understand if microarchitecture
> > actually takes a while executing the useless encoding as pointed out by Jessica.
> >
> > If that's the case, we can remove the div instruction altogether.
> > Otherwise, this patch will cause some performance regression
> > for existing SoC (HiFive unleashed has the same core. Not sure about
> > unmatched though).
> > This needs to be documented at least.
> >
> > > Hope that's helpful,
> > > Conor.
> > >
> > > (I just did a quick check of what pretty much amounted to a bunch of
> > > div a5,zero,zero in a row versus div a5,a5,a5)
> > >
> > > >
> > > > (Please add anybody who may have an insight to execution flow on
> > > > existing Linux capable cores)
> > > >
> > > > >
> > > > > Heiko
> > > > >
> > > > >
> > > > >
> > > > > _______________________________________________
> > > > > linux-riscv mailing list
> > > > > [email protected]
> > > > > http://lists.infradead.org/mailman/listinfo/linux-riscv
> > > >
> > > >
> > > >
> > > > --
> > > > Regards,
> > > > Atish
> > > >
> > > > _______________________________________________
> > > > linux-riscv mailing list
> > > > [email protected]
> > > > http://lists.infradead.org/mailman/listinfo/linux-riscv
> >
> >
> >
> > --
> > Regards,
> > Atish
> >
> > _______________________________________________
> > linux-riscv mailing list
> > [email protected]
> > http://lists.infradead.org/mailman/listinfo/linux-riscv



--
Regards,
Atish

2022-10-01 20:29:51

by Conor Dooley

[permalink] [raw]
Subject: Re: [PATCH] riscv: Fix build with CONFIG_CC_OPTIMIZE_FOR_SIZE=y

On Wed, Sep 28, 2022 at 08:26:01PM -0700, Atish Patra wrote:
> On Wed, Sep 28, 2022 at 2:16 PM Conor Dooley <[email protected]> wrote:
> >
> > On Wed, Sep 28, 2022 at 12:21:55AM -0700, Atish Patra wrote:
> > > On Sat, Sep 24, 2022 at 4:15 PM Conor Dooley <[email protected]> wrote:
> > > >
> > > > On Fri, Sep 23, 2022 at 11:01:28AM -0700, Atish Patra wrote:
> > > > > On Fri, Sep 23, 2022 at 12:18 AM Heiko Stuebner <[email protected]> wrote:
> > > > > >
> > > > > > Hi,
> > > > > >
> > > > > > Am Donnerstag, 22. September 2022, 17:52:46 CEST schrieb Jessica Clarke:
> > > > > > > On 22 Sept 2022, at 16:45, Heiko Stuebner <[email protected]> wrote:
> > > > > > > >
> > > > > > > > Am Donnerstag, 22. September 2022, 08:09:58 CEST schrieb Samuel Holland:
> > > > > > > >> commit 8eb060e10185 ("arch/riscv: add Zihintpause support") broke
> > > > > > > >> building with CONFIG_CC_OPTIMIZE_FOR_SIZE enabled (gcc 11.1.0):
> > > > > > > >>
> > > > > > > >> CC arch/riscv/kernel/vdso/vgettimeofday.o
> > > > > > > >> In file included from <command-line>:
> > > > > > > >> ./arch/riscv/include/asm/jump_label.h: In function 'cpu_relax':
> > > > > > > >> ././include/linux/compiler_types.h:285:33: warning: 'asm' operand 0 probably does not match constraints
> > > > > > > >> 285 | #define asm_volatile_goto(x...) asm goto(x)
> > > > > > > >> | ^~~
> > > > > > > >> ./arch/riscv/include/asm/jump_label.h:41:9: note: in expansion of macro 'asm_volatile_goto'
> > > > > > > >> 41 | asm_volatile_goto(
> > > > > > > >> | ^~~~~~~~~~~~~~~~~
> > > > > > > >> ././include/linux/compiler_types.h:285:33: error: impossible constraint in 'asm'
> > > > > > > >> 285 | #define asm_volatile_goto(x...) asm goto(x)
> > > > > > > >> | ^~~
> > > > > > > >> ./arch/riscv/include/asm/jump_label.h:41:9: note: in expansion of macro 'asm_volatile_goto'
> > > > > > > >> 41 | asm_volatile_goto(
> > > > > > > >> | ^~~~~~~~~~~~~~~~~
> > > > > > > >> make[1]: *** [scripts/Makefile.build:249: arch/riscv/kernel/vdso/vgettimeofday.o] Error 1
> > > > > > > >> make: *** [arch/riscv/Makefile:128: vdso_prepare] Error 2
> > > > > > > >>
> > > > > > > >> Having a static branch in cpu_relax() is problematic because that
> > > > > > > >> function is widely inlined, including in some quite complex functions
> > > > > > > >> like in the VDSO. A quick measurement shows this static branch is
> > > > > > > >> responsible by itself for around 40% of the jump table.
> > > > > > > >>
> > > > > > > >> Drop the static branch, which ends up being the same number of
> > > > > > > >> instructions anyway. If Zihintpause is supported, we trade the nop from
> > > > > > > >> the static branch for a div. If Zihintpause is unsupported, we trade the
> > > > > > > >> jump from the static branch for (what gets interpreted as) a nop.
> > > > > > > >>
> > > > > > > >> Fixes: 8eb060e10185 ("arch/riscv: add Zihintpause support")
> > > > > > > >> Signed-off-by: Samuel Holland <[email protected]>
> > > > > > > >> ---
> > > > > > > >>
> > > > > > > >> arch/riscv/include/asm/hwcap.h | 3 ---
> > > > > > > >> arch/riscv/include/asm/vdso/processor.h | 25 ++++++++++---------------
> > > > > > > >> 2 files changed, 10 insertions(+), 18 deletions(-)
> > > > > > > >>
> > > > > > > >> diff --git a/arch/riscv/include/asm/hwcap.h b/arch/riscv/include/asm/hwcap.h
> > > > > > > >> index 6f59ec64175e..b21d46e68386 100644
> > > > > > > >> --- a/arch/riscv/include/asm/hwcap.h
> > > > > > > >> +++ b/arch/riscv/include/asm/hwcap.h
> > > > > > > >> @@ -68,7 +68,6 @@ enum riscv_isa_ext_id {
> > > > > > > >> */
> > > > > > > >> enum riscv_isa_ext_key {
> > > > > > > >> RISCV_ISA_EXT_KEY_FPU, /* For 'F' and 'D' */
> > > > > > > >> - RISCV_ISA_EXT_KEY_ZIHINTPAUSE,
> > > > > > > >> RISCV_ISA_EXT_KEY_MAX,
> > > > > > > >> };
> > > > > > > >>
> > > > > > > >> @@ -88,8 +87,6 @@ static __always_inline int riscv_isa_ext2key(int num)
> > > > > > > >> return RISCV_ISA_EXT_KEY_FPU;
> > > > > > > >> case RISCV_ISA_EXT_d:
> > > > > > > >> return RISCV_ISA_EXT_KEY_FPU;
> > > > > > > >> - case RISCV_ISA_EXT_ZIHINTPAUSE:
> > > > > > > >> - return RISCV_ISA_EXT_KEY_ZIHINTPAUSE;
> > > > > > > >> default:
> > > > > > > >> return -EINVAL;
> > > > > > > >> }
> > > > > > > >> diff --git a/arch/riscv/include/asm/vdso/processor.h b/arch/riscv/include/asm/vdso/processor.h
> > > > > > > >> index 1e4f8b4aef79..789bdb8211a2 100644
> > > > > > > >> --- a/arch/riscv/include/asm/vdso/processor.h
> > > > > > > >> +++ b/arch/riscv/include/asm/vdso/processor.h
> > > > > > > >> @@ -4,30 +4,25 @@
> > > > > > > >>
> > > > > > > >> #ifndef __ASSEMBLY__
> > > > > > > >>
> > > > > > > >> -#include <linux/jump_label.h>
> > > > > > > >> #include <asm/barrier.h>
> > > > > > > >> -#include <asm/hwcap.h>
> > > > > > > >>
> > > > > > > >> static inline void cpu_relax(void)
> > > > > > > >> {
> > > > > > > >> - if (!static_branch_likely(&riscv_isa_ext_keys[RISCV_ISA_EXT_KEY_ZIHINTPAUSE])) {
> > > > > > > >> #ifdef __riscv_muldiv
> > > > > > > >> - int dummy;
> > > > > > > >> - /* In lieu of a halt instruction, induce a long-latency stall. */
> > > > > > > >> - __asm__ __volatile__ ("div %0, %0, zero" : "=r" (dummy));
> > > > > > > >> + int dummy;
> > > > > > > >> + /* In lieu of a halt instruction, induce a long-latency stall. */
> > > > > > > >> + __asm__ __volatile__ ("div %0, %0, zero" : "=r" (dummy));
> > > > > > > >> #endif
> > > > > > > >> - } else {
> > > > > > > >> - /*
> > > > > > > >> - * Reduce instruction retirement.
> > > > > > > >> - * This assumes the PC changes.
> > > > > > > >> - */
> > > > > > > >> + /*
> > > > > > > >> + * Reduce instruction retirement.
> > > > > > > >> + * This assumes the PC changes.
> > > > > > > >> + */
> > > > > > > >> #ifdef __riscv_zihintpause
> > > > > > > >> - __asm__ __volatile__ ("pause");
> > > > > > > >> + __asm__ __volatile__ ("pause");
> > > > > > > >> #else
> > > > > > > >> - /* Encoding of the pause instruction */
> > > > > > > >> - __asm__ __volatile__ (".4byte 0x100000F");
> > > > > > > >> + /* Encoding of the pause instruction */
> > > > > > > >> + __asm__ __volatile__ (".4byte 0x100000F");
> > > > > > > >> #endif
> > > > > > > >
> > > > > > > > hmm, though before this part of the code was only ever accessed
> > > > > > > > when the zhintpause extension was really available on the running
> > > > > > > > machine while now the pause instruction is called every time.
> > > > > > > >
> > > > > > > > So I'm just wondering, can't this run into some "illegal instruction"
> > > > > > > > thingy on machines not supporting the extension?
> > > > > > >
> > > > > > > No. The encoding for pause was deliberately chosen to be one of the
> > > > > > > “useless” encodings of fence, with the hope that existing
> > > > > > > microarchitectures might take a while to execute it and thus it would
> > > > > > > still function as a slow-running instruction. It’s somewhat
> > > > > > > questionable whether the div is even needed, the worst that happens is
> > > > > > > cpu_relax isn’t very relaxed and you spin a bit faster. Any
> > > > > > > implementations where that’s true probably also don’t have fancy
> > > > > > > clock/power management anyway, and div isn’t going to be a low-power
> > > > > > > operation so the only real effect is likely hammering on contended
> > > > > > > atomics a bit more, and who cares about that on the low core count
> > > > > > > systems we have today.
> > > > > >
> > > > > > thanks a lot for that explanation, which made things a lot clearer.
> > > > > >
> > > > > > So as you said, dropping the div part might make the function even smaller,
> > > > > > though somehow part of me would want to add some sort of comment to
> > > > > > the function for when the next developer stumbles over the unconditional
> > > > > > use of pause :-) .
> > > > > >
> > > > >
> > > > > I agree. If that's what microarch will do, we can drop div altogether.
> > > > > Though microarch may be treated as nop even if it is undesirable.
> > > > > IIRC, the div was introduced for the rocket chip which would induce a
> > > > > long latency stall with div instruction (zero as operands).
> > > > >
> > > > > Does any other core or newer rocket chip actually induce a latency
> > > > > stall with div instruction ?
> > > > > If not, it is equivalent to NOP as well. We can definitely remove the div.
> > > > > The only cores affected will be the older rocket core.
> > > > >
> > > > > Tagging some folks to understand what their core does.
> > > > >
> > > > > @Paul Walmsley @Guo Ren @Conor Dooley ?
> > > >
> > > > I am no microarch expert by _any_ stretch of the imagination, but
> > > > from a quick experiment it looks like the u54s on PolarFire SoC behave
> > > > in the same way, and div w/ zero operands does in fact take significantly
> > > > longer than regular division (looks to be about 3x).
> > > >
> > >
> > > Thanks. Do you have any data on how much the "pause" instruction takes.
> >
> > So these numbers you may consider as being pulled out of a magic hat
> > as all I am doing is reading the counters from userspace and there is
> > some variance etc. Plus the fact that I just started hacking at some
> > existing code I had lying around as I'm pretty snowed under at the
> > moment.
> >
> > Doing the following takes about 70 cycles on both a PolarFire SoC and an
> > unmatched:
> > long divisor = 2, dividend = 100000, dest;
> > asm("div %0, zero, zero" : "=r" (dest));
> > and equates to:
> > sd a5,-48(s0)
> > div a5,zero,zero
> >
> > Clocking in at about 40 cycles is some actual divisions, I just did the
> > following a dozen times, doing a trivial computation:
> > long divisor = 2, dividend = 100000, dest;
> > asm("div %0, %1, %2" : "=r" (dividend) : "r" (dividend), "r" (divisor))
> >
> > ie, a load of the following:
> > sd a5,-48(s0)
> > ld a5,-48(s0)
> > ld a4,-40(s0)
> > div a5,a5,a4
> >
> > So clearly the div w/ zero args makes a difference...
> >
> > On PolarFire SoC, `0x100000F` takes approx 6 cycles. On my unmatched, it
> > takes approx 40. Again, I just had an asm block & called the instruction
> > a number times and took the average - here it was 48 times.
> >
> > Take the actual numbers with a fist full of salt, but at least the
> > relative numbers should be of some use to you.
> >
> > Hope that's somewhat helpful, maybe next week I can do something a
> > little more useful for you...
> >
>
> Thanks. It would be good to understand what happens when "pause" is
> executed on these boards ?

The actual pause instruction? uhh, so with the usual "I don't know what
I am doing" disclaimer, I ran each of the .insn and pause instruction 48
times in a row and checked the time elapsed via rdcycle & then ran that
c program 1000 times in a bash loop. Got the below, the insns were run
first and then the pauses.
insn pause
min 2.3 3.2
max 9.5 10.6
avg 27.0 29.1
5% 2.9 4.2
95% 18.1 19.1

Swapping the pause & insn order around made a minor difference, but not
enough to report on. I'd be very wary of drawing any real conclusions
from this data, but at least both are roughly similar (and certainly not
even close to doing the div w/ zero args.

Again, hope that is helpful?
Conor.

>
> > Conor.
> >
> > > I understand that it is not available in these cores. Just wanted to
> > > understand if microarchitecture
> > > actually takes a while executing the useless encoding as pointed out by Jessica.
> > >
> > > If that's the case, we can remove the div instruction altogether.
> > > Otherwise, this patch will cause some performance regression
> > > for existing SoC (HiFive unleashed has the same core. Not sure about
> > > unmatched though).
> > > This needs to be documented at least.
> > >
> > > > Hope that's helpful,
> > > > Conor.
> > > >
> > > > (I just did a quick check of what pretty much amounted to a bunch of
> > > > div a5,zero,zero in a row versus div a5,a5,a5)
> > > >
> > > > >
> > > > > (Please add anybody who may have an insight to execution flow on
> > > > > existing Linux capable cores)
> > > > >

2022-10-04 17:21:43

by Atish Patra

[permalink] [raw]
Subject: Re: [PATCH] riscv: Fix build with CONFIG_CC_OPTIMIZE_FOR_SIZE=y

On Sat, Oct 1, 2022 at 1:13 PM Conor Dooley <[email protected]> wrote:
>
> On Wed, Sep 28, 2022 at 08:26:01PM -0700, Atish Patra wrote:
> > On Wed, Sep 28, 2022 at 2:16 PM Conor Dooley <[email protected]> wrote:
> > >
> > > On Wed, Sep 28, 2022 at 12:21:55AM -0700, Atish Patra wrote:
> > > > On Sat, Sep 24, 2022 at 4:15 PM Conor Dooley <[email protected]> wrote:
> > > > >
> > > > > On Fri, Sep 23, 2022 at 11:01:28AM -0700, Atish Patra wrote:
> > > > > > On Fri, Sep 23, 2022 at 12:18 AM Heiko Stuebner <[email protected]> wrote:
> > > > > > >
> > > > > > > Hi,
> > > > > > >
> > > > > > > Am Donnerstag, 22. September 2022, 17:52:46 CEST schrieb Jessica Clarke:
> > > > > > > > On 22 Sept 2022, at 16:45, Heiko Stuebner <[email protected]> wrote:
> > > > > > > > >
> > > > > > > > > Am Donnerstag, 22. September 2022, 08:09:58 CEST schrieb Samuel Holland:
> > > > > > > > >> commit 8eb060e10185 ("arch/riscv: add Zihintpause support") broke
> > > > > > > > >> building with CONFIG_CC_OPTIMIZE_FOR_SIZE enabled (gcc 11.1.0):
> > > > > > > > >>
> > > > > > > > >> CC arch/riscv/kernel/vdso/vgettimeofday.o
> > > > > > > > >> In file included from <command-line>:
> > > > > > > > >> ./arch/riscv/include/asm/jump_label.h: In function 'cpu_relax':
> > > > > > > > >> ././include/linux/compiler_types.h:285:33: warning: 'asm' operand 0 probably does not match constraints
> > > > > > > > >> 285 | #define asm_volatile_goto(x...) asm goto(x)
> > > > > > > > >> | ^~~
> > > > > > > > >> ./arch/riscv/include/asm/jump_label.h:41:9: note: in expansion of macro 'asm_volatile_goto'
> > > > > > > > >> 41 | asm_volatile_goto(
> > > > > > > > >> | ^~~~~~~~~~~~~~~~~
> > > > > > > > >> ././include/linux/compiler_types.h:285:33: error: impossible constraint in 'asm'
> > > > > > > > >> 285 | #define asm_volatile_goto(x...) asm goto(x)
> > > > > > > > >> | ^~~
> > > > > > > > >> ./arch/riscv/include/asm/jump_label.h:41:9: note: in expansion of macro 'asm_volatile_goto'
> > > > > > > > >> 41 | asm_volatile_goto(
> > > > > > > > >> | ^~~~~~~~~~~~~~~~~
> > > > > > > > >> make[1]: *** [scripts/Makefile.build:249: arch/riscv/kernel/vdso/vgettimeofday.o] Error 1
> > > > > > > > >> make: *** [arch/riscv/Makefile:128: vdso_prepare] Error 2
> > > > > > > > >>
> > > > > > > > >> Having a static branch in cpu_relax() is problematic because that
> > > > > > > > >> function is widely inlined, including in some quite complex functions
> > > > > > > > >> like in the VDSO. A quick measurement shows this static branch is
> > > > > > > > >> responsible by itself for around 40% of the jump table.
> > > > > > > > >>
> > > > > > > > >> Drop the static branch, which ends up being the same number of
> > > > > > > > >> instructions anyway. If Zihintpause is supported, we trade the nop from
> > > > > > > > >> the static branch for a div. If Zihintpause is unsupported, we trade the
> > > > > > > > >> jump from the static branch for (what gets interpreted as) a nop.
> > > > > > > > >>
> > > > > > > > >> Fixes: 8eb060e10185 ("arch/riscv: add Zihintpause support")
> > > > > > > > >> Signed-off-by: Samuel Holland <[email protected]>
> > > > > > > > >> ---
> > > > > > > > >>
> > > > > > > > >> arch/riscv/include/asm/hwcap.h | 3 ---
> > > > > > > > >> arch/riscv/include/asm/vdso/processor.h | 25 ++++++++++---------------
> > > > > > > > >> 2 files changed, 10 insertions(+), 18 deletions(-)
> > > > > > > > >>
> > > > > > > > >> diff --git a/arch/riscv/include/asm/hwcap.h b/arch/riscv/include/asm/hwcap.h
> > > > > > > > >> index 6f59ec64175e..b21d46e68386 100644
> > > > > > > > >> --- a/arch/riscv/include/asm/hwcap.h
> > > > > > > > >> +++ b/arch/riscv/include/asm/hwcap.h
> > > > > > > > >> @@ -68,7 +68,6 @@ enum riscv_isa_ext_id {
> > > > > > > > >> */
> > > > > > > > >> enum riscv_isa_ext_key {
> > > > > > > > >> RISCV_ISA_EXT_KEY_FPU, /* For 'F' and 'D' */
> > > > > > > > >> - RISCV_ISA_EXT_KEY_ZIHINTPAUSE,
> > > > > > > > >> RISCV_ISA_EXT_KEY_MAX,
> > > > > > > > >> };
> > > > > > > > >>
> > > > > > > > >> @@ -88,8 +87,6 @@ static __always_inline int riscv_isa_ext2key(int num)
> > > > > > > > >> return RISCV_ISA_EXT_KEY_FPU;
> > > > > > > > >> case RISCV_ISA_EXT_d:
> > > > > > > > >> return RISCV_ISA_EXT_KEY_FPU;
> > > > > > > > >> - case RISCV_ISA_EXT_ZIHINTPAUSE:
> > > > > > > > >> - return RISCV_ISA_EXT_KEY_ZIHINTPAUSE;
> > > > > > > > >> default:
> > > > > > > > >> return -EINVAL;
> > > > > > > > >> }
> > > > > > > > >> diff --git a/arch/riscv/include/asm/vdso/processor.h b/arch/riscv/include/asm/vdso/processor.h
> > > > > > > > >> index 1e4f8b4aef79..789bdb8211a2 100644
> > > > > > > > >> --- a/arch/riscv/include/asm/vdso/processor.h
> > > > > > > > >> +++ b/arch/riscv/include/asm/vdso/processor.h
> > > > > > > > >> @@ -4,30 +4,25 @@
> > > > > > > > >>
> > > > > > > > >> #ifndef __ASSEMBLY__
> > > > > > > > >>
> > > > > > > > >> -#include <linux/jump_label.h>
> > > > > > > > >> #include <asm/barrier.h>
> > > > > > > > >> -#include <asm/hwcap.h>
> > > > > > > > >>
> > > > > > > > >> static inline void cpu_relax(void)
> > > > > > > > >> {
> > > > > > > > >> - if (!static_branch_likely(&riscv_isa_ext_keys[RISCV_ISA_EXT_KEY_ZIHINTPAUSE])) {
> > > > > > > > >> #ifdef __riscv_muldiv
> > > > > > > > >> - int dummy;
> > > > > > > > >> - /* In lieu of a halt instruction, induce a long-latency stall. */
> > > > > > > > >> - __asm__ __volatile__ ("div %0, %0, zero" : "=r" (dummy));
> > > > > > > > >> + int dummy;
> > > > > > > > >> + /* In lieu of a halt instruction, induce a long-latency stall. */
> > > > > > > > >> + __asm__ __volatile__ ("div %0, %0, zero" : "=r" (dummy));
> > > > > > > > >> #endif
> > > > > > > > >> - } else {
> > > > > > > > >> - /*
> > > > > > > > >> - * Reduce instruction retirement.
> > > > > > > > >> - * This assumes the PC changes.
> > > > > > > > >> - */
> > > > > > > > >> + /*
> > > > > > > > >> + * Reduce instruction retirement.
> > > > > > > > >> + * This assumes the PC changes.
> > > > > > > > >> + */
> > > > > > > > >> #ifdef __riscv_zihintpause
> > > > > > > > >> - __asm__ __volatile__ ("pause");
> > > > > > > > >> + __asm__ __volatile__ ("pause");
> > > > > > > > >> #else
> > > > > > > > >> - /* Encoding of the pause instruction */
> > > > > > > > >> - __asm__ __volatile__ (".4byte 0x100000F");
> > > > > > > > >> + /* Encoding of the pause instruction */
> > > > > > > > >> + __asm__ __volatile__ (".4byte 0x100000F");
> > > > > > > > >> #endif
> > > > > > > > >
> > > > > > > > > hmm, though before this part of the code was only ever accessed
> > > > > > > > > when the zhintpause extension was really available on the running
> > > > > > > > > machine while now the pause instruction is called every time.
> > > > > > > > >
> > > > > > > > > So I'm just wondering, can't this run into some "illegal instruction"
> > > > > > > > > thingy on machines not supporting the extension?
> > > > > > > >
> > > > > > > > No. The encoding for pause was deliberately chosen to be one of the
> > > > > > > > “useless” encodings of fence, with the hope that existing
> > > > > > > > microarchitectures might take a while to execute it and thus it would
> > > > > > > > still function as a slow-running instruction. It’s somewhat
> > > > > > > > questionable whether the div is even needed, the worst that happens is
> > > > > > > > cpu_relax isn’t very relaxed and you spin a bit faster. Any
> > > > > > > > implementations where that’s true probably also don’t have fancy
> > > > > > > > clock/power management anyway, and div isn’t going to be a low-power
> > > > > > > > operation so the only real effect is likely hammering on contended
> > > > > > > > atomics a bit more, and who cares about that on the low core count
> > > > > > > > systems we have today.
> > > > > > >
> > > > > > > thanks a lot for that explanation, which made things a lot clearer.
> > > > > > >
> > > > > > > So as you said, dropping the div part might make the function even smaller,
> > > > > > > though somehow part of me would want to add some sort of comment to
> > > > > > > the function for when the next developer stumbles over the unconditional
> > > > > > > use of pause :-) .
> > > > > > >
> > > > > >
> > > > > > I agree. If that's what microarch will do, we can drop div altogether.
> > > > > > Though microarch may be treated as nop even if it is undesirable.
> > > > > > IIRC, the div was introduced for the rocket chip which would induce a
> > > > > > long latency stall with div instruction (zero as operands).
> > > > > >
> > > > > > Does any other core or newer rocket chip actually induce a latency
> > > > > > stall with div instruction ?
> > > > > > If not, it is equivalent to NOP as well. We can definitely remove the div.
> > > > > > The only cores affected will be the older rocket core.
> > > > > >
> > > > > > Tagging some folks to understand what their core does.
> > > > > >
> > > > > > @Paul Walmsley @Guo Ren @Conor Dooley ?
> > > > >
> > > > > I am no microarch expert by _any_ stretch of the imagination, but
> > > > > from a quick experiment it looks like the u54s on PolarFire SoC behave
> > > > > in the same way, and div w/ zero operands does in fact take significantly
> > > > > longer than regular division (looks to be about 3x).
> > > > >
> > > >
> > > > Thanks. Do you have any data on how much the "pause" instruction takes.
> > >
> > > So these numbers you may consider as being pulled out of a magic hat
> > > as all I am doing is reading the counters from userspace and there is
> > > some variance etc. Plus the fact that I just started hacking at some
> > > existing code I had lying around as I'm pretty snowed under at the
> > > moment.
> > >
> > > Doing the following takes about 70 cycles on both a PolarFire SoC and an
> > > unmatched:
> > > long divisor = 2, dividend = 100000, dest;
> > > asm("div %0, zero, zero" : "=r" (dest));
> > > and equates to:
> > > sd a5,-48(s0)
> > > div a5,zero,zero
> > >
> > > Clocking in at about 40 cycles is some actual divisions, I just did the
> > > following a dozen times, doing a trivial computation:
> > > long divisor = 2, dividend = 100000, dest;
> > > asm("div %0, %1, %2" : "=r" (dividend) : "r" (dividend), "r" (divisor))
> > >
> > > ie, a load of the following:
> > > sd a5,-48(s0)
> > > ld a5,-48(s0)
> > > ld a4,-40(s0)
> > > div a5,a5,a4
> > >
> > > So clearly the div w/ zero args makes a difference...
> > >
> > > On PolarFire SoC, `0x100000F` takes approx 6 cycles. On my unmatched, it
> > > takes approx 40. Again, I just had an asm block & called the instruction
> > > a number times and took the average - here it was 48 times.
> > >
> > > Take the actual numbers with a fist full of salt, but at least the
> > > relative numbers should be of some use to you.
> > >
> > > Hope that's somewhat helpful, maybe next week I can do something a
> > > little more useful for you...
> > >
> >
> > Thanks. It would be good to understand what happens when "pause" is
> > executed on these boards ?
>
> The actual pause instruction? uhh, so with the usual "I don't know what
> I am doing" disclaimer, I ran each of the .insn and pause instruction 48
> times in a row and checked the time elapsed via rdcycle & then ran that
> c program 1000 times in a bash loop. Got the below, the insns were run
> first and then the pauses.
> insn pause
> min 2.3 3.2
> max 9.5 10.6
> avg 27.0 29.1
> 5% 2.9 4.2
> 95% 18.1 19.1
>
> Swapping the pause & insn order around made a minor difference, but not
> enough to report on. I'd be very wary of drawing any real conclusions
> from this data, but at least both are roughly similar (and certainly not
> even close to doing the div w/ zero args.
>

Yeah. That's what I was expecting. So we can't drop the div for now. Otherwise,
the existing hardware(don't support Zhintpause) suffers by spinning faster.

Thanks for running the experiments.

> Again, hope that is helpful?
> Conor.
>
> >
> > > Conor.
> > >
> > > > I understand that it is not available in these cores. Just wanted to
> > > > understand if microarchitecture
> > > > actually takes a while executing the useless encoding as pointed out by Jessica.
> > > >
> > > > If that's the case, we can remove the div instruction altogether.
> > > > Otherwise, this patch will cause some performance regression
> > > > for existing SoC (HiFive unleashed has the same core. Not sure about
> > > > unmatched though).
> > > > This needs to be documented at least.
> > > >
> > > > > Hope that's helpful,
> > > > > Conor.
> > > > >
> > > > > (I just did a quick check of what pretty much amounted to a bunch of
> > > > > div a5,zero,zero in a row versus div a5,a5,a5)
> > > > >
> > > > > >
> > > > > > (Please add anybody who may have an insight to execution flow on
> > > > > > existing Linux capable cores)
> > > > > >
>


--
Regards,
Atish

2022-10-04 17:22:22

by Conor Dooley

[permalink] [raw]
Subject: Re: [PATCH] riscv: Fix build with CONFIG_CC_OPTIMIZE_FOR_SIZE=y

On Tue, Oct 04, 2022 at 09:52:41AM -0700, Atish Patra wrote:
> On Sat, Oct 1, 2022 at 1:13 PM Conor Dooley <[email protected]> wrote:

> > > Thanks. It would be good to understand what happens when "pause" is
> > > executed on these boards ?
> >
> > The actual pause instruction? uhh, so with the usual "I don't know what
> > I am doing" disclaimer, I ran each of the .insn and pause instruction 48
> > times in a row and checked the time elapsed via rdcycle & then ran that
> > c program 1000 times in a bash loop. Got the below, the insns were run
> > first and then the pauses.
> > insn pause
> > min 2.3 3.2
> > max 9.5 10.6
> > avg 27.0 29.1
> > 5% 2.9 4.2
> > 95% 18.1 19.1
> >
> > Swapping the pause & insn order around made a minor difference, but not
> > enough to report on. I'd be very wary of drawing any real conclusions
> > from this data, but at least both are roughly similar (and certainly not
> > even close to doing the div w/ zero args.
> >
>
> Yeah. That's what I was expecting. So we can't drop the div for now. Otherwise,
> the existing hardware(don't support Zhintpause) suffers by spinning faster.
>
> Thanks for running the experiments.

I've lost track, does that mean the patch is okay as, is or needs to be
changed? The former, right?

Thanks,
Conor.

2022-10-04 17:46:27

by Jessica Clarke

[permalink] [raw]
Subject: Re: [PATCH] riscv: Fix build with CONFIG_CC_OPTIMIZE_FOR_SIZE=y

On 4 Oct 2022, at 17:52, Atish Patra <[email protected]> wrote:
>
> On Sat, Oct 1, 2022 at 1:13 PM Conor Dooley <[email protected]> wrote:
>>
>> On Wed, Sep 28, 2022 at 08:26:01PM -0700, Atish Patra wrote:
>>> On Wed, Sep 28, 2022 at 2:16 PM Conor Dooley <[email protected]> wrote:
>>>>
>>>> On Wed, Sep 28, 2022 at 12:21:55AM -0700, Atish Patra wrote:
>>>>> On Sat, Sep 24, 2022 at 4:15 PM Conor Dooley <[email protected]> wrote:
>>>>>>
>>>>>> On Fri, Sep 23, 2022 at 11:01:28AM -0700, Atish Patra wrote:
>>>>>>> On Fri, Sep 23, 2022 at 12:18 AM Heiko Stuebner <[email protected]> wrote:
>>>>>>>>
>>>>>>>> Hi,
>>>>>>>>
>>>>>>>> Am Donnerstag, 22. September 2022, 17:52:46 CEST schrieb Jessica Clarke:
>>>>>>>>> On 22 Sept 2022, at 16:45, Heiko Stuebner <[email protected]> wrote:
>>>>>>>>>>
>>>>>>>>>> Am Donnerstag, 22. September 2022, 08:09:58 CEST schrieb Samuel Holland:
>>>>>>>>>>> commit 8eb060e10185 ("arch/riscv: add Zihintpause support") broke
>>>>>>>>>>> building with CONFIG_CC_OPTIMIZE_FOR_SIZE enabled (gcc 11.1.0):
>>>>>>>>>>>
>>>>>>>>>>> CC arch/riscv/kernel/vdso/vgettimeofday.o
>>>>>>>>>>> In file included from <command-line>:
>>>>>>>>>>> ./arch/riscv/include/asm/jump_label.h: In function 'cpu_relax':
>>>>>>>>>>> ././include/linux/compiler_types.h:285:33: warning: 'asm' operand 0 probably does not match constraints
>>>>>>>>>>> 285 | #define asm_volatile_goto(x...) asm goto(x)
>>>>>>>>>>> | ^~~
>>>>>>>>>>> ./arch/riscv/include/asm/jump_label.h:41:9: note: in expansion of macro 'asm_volatile_goto'
>>>>>>>>>>> 41 | asm_volatile_goto(
>>>>>>>>>>> | ^~~~~~~~~~~~~~~~~
>>>>>>>>>>> ././include/linux/compiler_types.h:285:33: error: impossible constraint in 'asm'
>>>>>>>>>>> 285 | #define asm_volatile_goto(x...) asm goto(x)
>>>>>>>>>>> | ^~~
>>>>>>>>>>> ./arch/riscv/include/asm/jump_label.h:41:9: note: in expansion of macro 'asm_volatile_goto'
>>>>>>>>>>> 41 | asm_volatile_goto(
>>>>>>>>>>> | ^~~~~~~~~~~~~~~~~
>>>>>>>>>>> make[1]: *** [scripts/Makefile.build:249: arch/riscv/kernel/vdso/vgettimeofday.o] Error 1
>>>>>>>>>>> make: *** [arch/riscv/Makefile:128: vdso_prepare] Error 2
>>>>>>>>>>>
>>>>>>>>>>> Having a static branch in cpu_relax() is problematic because that
>>>>>>>>>>> function is widely inlined, including in some quite complex functions
>>>>>>>>>>> like in the VDSO. A quick measurement shows this static branch is
>>>>>>>>>>> responsible by itself for around 40% of the jump table.
>>>>>>>>>>>
>>>>>>>>>>> Drop the static branch, which ends up being the same number of
>>>>>>>>>>> instructions anyway. If Zihintpause is supported, we trade the nop from
>>>>>>>>>>> the static branch for a div. If Zihintpause is unsupported, we trade the
>>>>>>>>>>> jump from the static branch for (what gets interpreted as) a nop.
>>>>>>>>>>>
>>>>>>>>>>> Fixes: 8eb060e10185 ("arch/riscv: add Zihintpause support")
>>>>>>>>>>> Signed-off-by: Samuel Holland <[email protected]>
>>>>>>>>>>> ---
>>>>>>>>>>>
>>>>>>>>>>> arch/riscv/include/asm/hwcap.h | 3 ---
>>>>>>>>>>> arch/riscv/include/asm/vdso/processor.h | 25 ++++++++++---------------
>>>>>>>>>>> 2 files changed, 10 insertions(+), 18 deletions(-)
>>>>>>>>>>>
>>>>>>>>>>> diff --git a/arch/riscv/include/asm/hwcap.h b/arch/riscv/include/asm/hwcap.h
>>>>>>>>>>> index 6f59ec64175e..b21d46e68386 100644
>>>>>>>>>>> --- a/arch/riscv/include/asm/hwcap.h
>>>>>>>>>>> +++ b/arch/riscv/include/asm/hwcap.h
>>>>>>>>>>> @@ -68,7 +68,6 @@ enum riscv_isa_ext_id {
>>>>>>>>>>> */
>>>>>>>>>>> enum riscv_isa_ext_key {
>>>>>>>>>>> RISCV_ISA_EXT_KEY_FPU, /* For 'F' and 'D' */
>>>>>>>>>>> - RISCV_ISA_EXT_KEY_ZIHINTPAUSE,
>>>>>>>>>>> RISCV_ISA_EXT_KEY_MAX,
>>>>>>>>>>> };
>>>>>>>>>>>
>>>>>>>>>>> @@ -88,8 +87,6 @@ static __always_inline int riscv_isa_ext2key(int num)
>>>>>>>>>>> return RISCV_ISA_EXT_KEY_FPU;
>>>>>>>>>>> case RISCV_ISA_EXT_d:
>>>>>>>>>>> return RISCV_ISA_EXT_KEY_FPU;
>>>>>>>>>>> - case RISCV_ISA_EXT_ZIHINTPAUSE:
>>>>>>>>>>> - return RISCV_ISA_EXT_KEY_ZIHINTPAUSE;
>>>>>>>>>>> default:
>>>>>>>>>>> return -EINVAL;
>>>>>>>>>>> }
>>>>>>>>>>> diff --git a/arch/riscv/include/asm/vdso/processor.h b/arch/riscv/include/asm/vdso/processor.h
>>>>>>>>>>> index 1e4f8b4aef79..789bdb8211a2 100644
>>>>>>>>>>> --- a/arch/riscv/include/asm/vdso/processor.h
>>>>>>>>>>> +++ b/arch/riscv/include/asm/vdso/processor.h
>>>>>>>>>>> @@ -4,30 +4,25 @@
>>>>>>>>>>>
>>>>>>>>>>> #ifndef __ASSEMBLY__
>>>>>>>>>>>
>>>>>>>>>>> -#include <linux/jump_label.h>
>>>>>>>>>>> #include <asm/barrier.h>
>>>>>>>>>>> -#include <asm/hwcap.h>
>>>>>>>>>>>
>>>>>>>>>>> static inline void cpu_relax(void)
>>>>>>>>>>> {
>>>>>>>>>>> - if (!static_branch_likely(&riscv_isa_ext_keys[RISCV_ISA_EXT_KEY_ZIHINTPAUSE])) {
>>>>>>>>>>> #ifdef __riscv_muldiv
>>>>>>>>>>> - int dummy;
>>>>>>>>>>> - /* In lieu of a halt instruction, induce a long-latency stall. */
>>>>>>>>>>> - __asm__ __volatile__ ("div %0, %0, zero" : "=r" (dummy));
>>>>>>>>>>> + int dummy;
>>>>>>>>>>> + /* In lieu of a halt instruction, induce a long-latency stall. */
>>>>>>>>>>> + __asm__ __volatile__ ("div %0, %0, zero" : "=r" (dummy));
>>>>>>>>>>> #endif
>>>>>>>>>>> - } else {
>>>>>>>>>>> - /*
>>>>>>>>>>> - * Reduce instruction retirement.
>>>>>>>>>>> - * This assumes the PC changes.
>>>>>>>>>>> - */
>>>>>>>>>>> + /*
>>>>>>>>>>> + * Reduce instruction retirement.
>>>>>>>>>>> + * This assumes the PC changes.
>>>>>>>>>>> + */
>>>>>>>>>>> #ifdef __riscv_zihintpause
>>>>>>>>>>> - __asm__ __volatile__ ("pause");
>>>>>>>>>>> + __asm__ __volatile__ ("pause");
>>>>>>>>>>> #else
>>>>>>>>>>> - /* Encoding of the pause instruction */
>>>>>>>>>>> - __asm__ __volatile__ (".4byte 0x100000F");
>>>>>>>>>>> + /* Encoding of the pause instruction */
>>>>>>>>>>> + __asm__ __volatile__ (".4byte 0x100000F");
>>>>>>>>>>> #endif
>>>>>>>>>>
>>>>>>>>>> hmm, though before this part of the code was only ever accessed
>>>>>>>>>> when the zhintpause extension was really available on the running
>>>>>>>>>> machine while now the pause instruction is called every time.
>>>>>>>>>>
>>>>>>>>>> So I'm just wondering, can't this run into some "illegal instruction"
>>>>>>>>>> thingy on machines not supporting the extension?
>>>>>>>>>
>>>>>>>>> No. The encoding for pause was deliberately chosen to be one of the
>>>>>>>>> “useless” encodings of fence, with the hope that existing
>>>>>>>>> microarchitectures might take a while to execute it and thus it would
>>>>>>>>> still function as a slow-running instruction. It’s somewhat
>>>>>>>>> questionable whether the div is even needed, the worst that happens is
>>>>>>>>> cpu_relax isn’t very relaxed and you spin a bit faster. Any
>>>>>>>>> implementations where that’s true probably also don’t have fancy
>>>>>>>>> clock/power management anyway, and div isn’t going to be a low-power
>>>>>>>>> operation so the only real effect is likely hammering on contended
>>>>>>>>> atomics a bit more, and who cares about that on the low core count
>>>>>>>>> systems we have today.
>>>>>>>>
>>>>>>>> thanks a lot for that explanation, which made things a lot clearer.
>>>>>>>>
>>>>>>>> So as you said, dropping the div part might make the function even smaller,
>>>>>>>> though somehow part of me would want to add some sort of comment to
>>>>>>>> the function for when the next developer stumbles over the unconditional
>>>>>>>> use of pause :-) .
>>>>>>>>
>>>>>>>
>>>>>>> I agree. If that's what microarch will do, we can drop div altogether.
>>>>>>> Though microarch may be treated as nop even if it is undesirable.
>>>>>>> IIRC, the div was introduced for the rocket chip which would induce a
>>>>>>> long latency stall with div instruction (zero as operands).
>>>>>>>
>>>>>>> Does any other core or newer rocket chip actually induce a latency
>>>>>>> stall with div instruction ?
>>>>>>> If not, it is equivalent to NOP as well. We can definitely remove the div.
>>>>>>> The only cores affected will be the older rocket core.
>>>>>>>
>>>>>>> Tagging some folks to understand what their core does.
>>>>>>>
>>>>>>> @Paul Walmsley @Guo Ren @Conor Dooley ?
>>>>>>
>>>>>> I am no microarch expert by _any_ stretch of the imagination, but
>>>>>> from a quick experiment it looks like the u54s on PolarFire SoC behave
>>>>>> in the same way, and div w/ zero operands does in fact take significantly
>>>>>> longer than regular division (looks to be about 3x).
>>>>>>
>>>>>
>>>>> Thanks. Do you have any data on how much the "pause" instruction takes.
>>>>
>>>> So these numbers you may consider as being pulled out of a magic hat
>>>> as all I am doing is reading the counters from userspace and there is
>>>> some variance etc. Plus the fact that I just started hacking at some
>>>> existing code I had lying around as I'm pretty snowed under at the
>>>> moment.
>>>>
>>>> Doing the following takes about 70 cycles on both a PolarFire SoC and an
>>>> unmatched:
>>>> long divisor = 2, dividend = 100000, dest;
>>>> asm("div %0, zero, zero" : "=r" (dest));
>>>> and equates to:
>>>> sd a5,-48(s0)
>>>> div a5,zero,zero
>>>>
>>>> Clocking in at about 40 cycles is some actual divisions, I just did the
>>>> following a dozen times, doing a trivial computation:
>>>> long divisor = 2, dividend = 100000, dest;
>>>> asm("div %0, %1, %2" : "=r" (dividend) : "r" (dividend), "r" (divisor))
>>>>
>>>> ie, a load of the following:
>>>> sd a5,-48(s0)
>>>> ld a5,-48(s0)
>>>> ld a4,-40(s0)
>>>> div a5,a5,a4
>>>>
>>>> So clearly the div w/ zero args makes a difference...
>>>>
>>>> On PolarFire SoC, `0x100000F` takes approx 6 cycles. On my unmatched, it
>>>> takes approx 40. Again, I just had an asm block & called the instruction
>>>> a number times and took the average - here it was 48 times.
>>>>
>>>> Take the actual numbers with a fist full of salt, but at least the
>>>> relative numbers should be of some use to you.
>>>>
>>>> Hope that's somewhat helpful, maybe next week I can do something a
>>>> little more useful for you...
>>>>
>>>
>>> Thanks. It would be good to understand what happens when "pause" is
>>> executed on these boards ?
>>
>> The actual pause instruction? uhh, so with the usual "I don't know what
>> I am doing" disclaimer, I ran each of the .insn and pause instruction 48
>> times in a row and checked the time elapsed via rdcycle & then ran that
>> c program 1000 times in a bash loop. Got the below, the insns were run
>> first and then the pauses.
>> insn pause
>> min 2.3 3.2
>> max 9.5 10.6
>> avg 27.0 29.1
>> 5% 2.9 4.2
>> 95% 18.1 19.1
>>
>> Swapping the pause & insn order around made a minor difference, but not
>> enough to report on. I'd be very wary of drawing any real conclusions
>> from this data, but at least both are roughly similar (and certainly not
>> even close to doing the div w/ zero args.
>>
>
> Yeah. That's what I was expecting. So we can't drop the div for now. Otherwise,
> the existing hardware(don't support Zhintpause) suffers by spinning faster.

But does that actually matter in practice? If it doesn’t noticeable
affect performance then you don’t need to keep the div. There are a lot
of architectures that even just define cpu_relax() as barrier().

Jess

> Thanks for running the experiments.
>
>> Again, hope that is helpful?
>> Conor.
>>
>>>
>>>> Conor.
>>>>
>>>>> I understand that it is not available in these cores. Just wanted to
>>>>> understand if microarchitecture
>>>>> actually takes a while executing the useless encoding as pointed out by Jessica.
>>>>>
>>>>> If that's the case, we can remove the div instruction altogether.
>>>>> Otherwise, this patch will cause some performance regression
>>>>> for existing SoC (HiFive unleashed has the same core. Not sure about
>>>>> unmatched though).
>>>>> This needs to be documented at least.
>>>>>
>>>>>> Hope that's helpful,
>>>>>> Conor.
>>>>>>
>>>>>> (I just did a quick check of what pretty much amounted to a bunch of
>>>>>> div a5,zero,zero in a row versus div a5,a5,a5)
>>>>>>
>>>>>>>
>>>>>>> (Please add anybody who may have an insight to execution flow on
>>>>>>> existing Linux capable cores)
>>>>>>>
>>
>
>
> --
> Regards,
> Atish

2022-10-05 00:45:27

by Guo Ren

[permalink] [raw]
Subject: Re: [PATCH] riscv: Fix build with CONFIG_CC_OPTIMIZE_FOR_SIZE=y

On Wed, Oct 5, 2022 at 1:24 AM Jessica Clarke <[email protected]> wrote:
>
> On 4 Oct 2022, at 17:52, Atish Patra <[email protected]> wrote:
> >
> > On Sat, Oct 1, 2022 at 1:13 PM Conor Dooley <[email protected]> wrote:
> >>
> >> On Wed, Sep 28, 2022 at 08:26:01PM -0700, Atish Patra wrote:
> >>> On Wed, Sep 28, 2022 at 2:16 PM Conor Dooley <[email protected]> wrote:
> >>>>
> >>>> On Wed, Sep 28, 2022 at 12:21:55AM -0700, Atish Patra wrote:
> >>>>> On Sat, Sep 24, 2022 at 4:15 PM Conor Dooley <[email protected]> wrote:
> >>>>>>
> >>>>>> On Fri, Sep 23, 2022 at 11:01:28AM -0700, Atish Patra wrote:
> >>>>>>> On Fri, Sep 23, 2022 at 12:18 AM Heiko Stuebner <[email protected]> wrote:
> >>>>>>>>
> >>>>>>>> Hi,
> >>>>>>>>
> >>>>>>>> Am Donnerstag, 22. September 2022, 17:52:46 CEST schrieb Jessica Clarke:
> >>>>>>>>> On 22 Sept 2022, at 16:45, Heiko Stuebner <[email protected]> wrote:
> >>>>>>>>>>
> >>>>>>>>>> Am Donnerstag, 22. September 2022, 08:09:58 CEST schrieb Samuel Holland:
> >>>>>>>>>>> commit 8eb060e10185 ("arch/riscv: add Zihintpause support") broke
> >>>>>>>>>>> building with CONFIG_CC_OPTIMIZE_FOR_SIZE enabled (gcc 11.1.0):
> >>>>>>>>>>>
> >>>>>>>>>>> CC arch/riscv/kernel/vdso/vgettimeofday.o
> >>>>>>>>>>> In file included from <command-line>:
> >>>>>>>>>>> ./arch/riscv/include/asm/jump_label.h: In function 'cpu_relax':
> >>>>>>>>>>> ././include/linux/compiler_types.h:285:33: warning: 'asm' operand 0 probably does not match constraints
> >>>>>>>>>>> 285 | #define asm_volatile_goto(x...) asm goto(x)
> >>>>>>>>>>> | ^~~
> >>>>>>>>>>> ./arch/riscv/include/asm/jump_label.h:41:9: note: in expansion of macro 'asm_volatile_goto'
> >>>>>>>>>>> 41 | asm_volatile_goto(
> >>>>>>>>>>> | ^~~~~~~~~~~~~~~~~
> >>>>>>>>>>> ././include/linux/compiler_types.h:285:33: error: impossible constraint in 'asm'
> >>>>>>>>>>> 285 | #define asm_volatile_goto(x...) asm goto(x)
> >>>>>>>>>>> | ^~~
> >>>>>>>>>>> ./arch/riscv/include/asm/jump_label.h:41:9: note: in expansion of macro 'asm_volatile_goto'
> >>>>>>>>>>> 41 | asm_volatile_goto(
> >>>>>>>>>>> | ^~~~~~~~~~~~~~~~~
> >>>>>>>>>>> make[1]: *** [scripts/Makefile.build:249: arch/riscv/kernel/vdso/vgettimeofday.o] Error 1
> >>>>>>>>>>> make: *** [arch/riscv/Makefile:128: vdso_prepare] Error 2
> >>>>>>>>>>>
> >>>>>>>>>>> Having a static branch in cpu_relax() is problematic because that
> >>>>>>>>>>> function is widely inlined, including in some quite complex functions
> >>>>>>>>>>> like in the VDSO. A quick measurement shows this static branch is
> >>>>>>>>>>> responsible by itself for around 40% of the jump table.
> >>>>>>>>>>>
> >>>>>>>>>>> Drop the static branch, which ends up being the same number of
> >>>>>>>>>>> instructions anyway. If Zihintpause is supported, we trade the nop from
> >>>>>>>>>>> the static branch for a div. If Zihintpause is unsupported, we trade the
> >>>>>>>>>>> jump from the static branch for (what gets interpreted as) a nop.
> >>>>>>>>>>>
> >>>>>>>>>>> Fixes: 8eb060e10185 ("arch/riscv: add Zihintpause support")
> >>>>>>>>>>> Signed-off-by: Samuel Holland <[email protected]>
> >>>>>>>>>>> ---
> >>>>>>>>>>>
> >>>>>>>>>>> arch/riscv/include/asm/hwcap.h | 3 ---
> >>>>>>>>>>> arch/riscv/include/asm/vdso/processor.h | 25 ++++++++++---------------
> >>>>>>>>>>> 2 files changed, 10 insertions(+), 18 deletions(-)
> >>>>>>>>>>>
> >>>>>>>>>>> diff --git a/arch/riscv/include/asm/hwcap.h b/arch/riscv/include/asm/hwcap.h
> >>>>>>>>>>> index 6f59ec64175e..b21d46e68386 100644
> >>>>>>>>>>> --- a/arch/riscv/include/asm/hwcap.h
> >>>>>>>>>>> +++ b/arch/riscv/include/asm/hwcap.h
> >>>>>>>>>>> @@ -68,7 +68,6 @@ enum riscv_isa_ext_id {
> >>>>>>>>>>> */
> >>>>>>>>>>> enum riscv_isa_ext_key {
> >>>>>>>>>>> RISCV_ISA_EXT_KEY_FPU, /* For 'F' and 'D' */
> >>>>>>>>>>> - RISCV_ISA_EXT_KEY_ZIHINTPAUSE,
> >>>>>>>>>>> RISCV_ISA_EXT_KEY_MAX,
> >>>>>>>>>>> };
> >>>>>>>>>>>
> >>>>>>>>>>> @@ -88,8 +87,6 @@ static __always_inline int riscv_isa_ext2key(int num)
> >>>>>>>>>>> return RISCV_ISA_EXT_KEY_FPU;
> >>>>>>>>>>> case RISCV_ISA_EXT_d:
> >>>>>>>>>>> return RISCV_ISA_EXT_KEY_FPU;
> >>>>>>>>>>> - case RISCV_ISA_EXT_ZIHINTPAUSE:
> >>>>>>>>>>> - return RISCV_ISA_EXT_KEY_ZIHINTPAUSE;
> >>>>>>>>>>> default:
> >>>>>>>>>>> return -EINVAL;
> >>>>>>>>>>> }
> >>>>>>>>>>> diff --git a/arch/riscv/include/asm/vdso/processor.h b/arch/riscv/include/asm/vdso/processor.h
> >>>>>>>>>>> index 1e4f8b4aef79..789bdb8211a2 100644
> >>>>>>>>>>> --- a/arch/riscv/include/asm/vdso/processor.h
> >>>>>>>>>>> +++ b/arch/riscv/include/asm/vdso/processor.h
> >>>>>>>>>>> @@ -4,30 +4,25 @@
> >>>>>>>>>>>
> >>>>>>>>>>> #ifndef __ASSEMBLY__
> >>>>>>>>>>>
> >>>>>>>>>>> -#include <linux/jump_label.h>
> >>>>>>>>>>> #include <asm/barrier.h>
> >>>>>>>>>>> -#include <asm/hwcap.h>
> >>>>>>>>>>>
> >>>>>>>>>>> static inline void cpu_relax(void)
> >>>>>>>>>>> {
> >>>>>>>>>>> - if (!static_branch_likely(&riscv_isa_ext_keys[RISCV_ISA_EXT_KEY_ZIHINTPAUSE])) {
> >>>>>>>>>>> #ifdef __riscv_muldiv
> >>>>>>>>>>> - int dummy;
> >>>>>>>>>>> - /* In lieu of a halt instruction, induce a long-latency stall. */
> >>>>>>>>>>> - __asm__ __volatile__ ("div %0, %0, zero" : "=r" (dummy));
> >>>>>>>>>>> + int dummy;
> >>>>>>>>>>> + /* In lieu of a halt instruction, induce a long-latency stall. */
> >>>>>>>>>>> + __asm__ __volatile__ ("div %0, %0, zero" : "=r" (dummy));
> >>>>>>>>>>> #endif
> >>>>>>>>>>> - } else {
> >>>>>>>>>>> - /*
> >>>>>>>>>>> - * Reduce instruction retirement.
> >>>>>>>>>>> - * This assumes the PC changes.
> >>>>>>>>>>> - */
> >>>>>>>>>>> + /*
> >>>>>>>>>>> + * Reduce instruction retirement.
> >>>>>>>>>>> + * This assumes the PC changes.
> >>>>>>>>>>> + */
> >>>>>>>>>>> #ifdef __riscv_zihintpause
> >>>>>>>>>>> - __asm__ __volatile__ ("pause");
> >>>>>>>>>>> + __asm__ __volatile__ ("pause");
> >>>>>>>>>>> #else
> >>>>>>>>>>> - /* Encoding of the pause instruction */
> >>>>>>>>>>> - __asm__ __volatile__ (".4byte 0x100000F");
> >>>>>>>>>>> + /* Encoding of the pause instruction */
> >>>>>>>>>>> + __asm__ __volatile__ (".4byte 0x100000F");
> >>>>>>>>>>> #endif
> >>>>>>>>>>
> >>>>>>>>>> hmm, though before this part of the code was only ever accessed
> >>>>>>>>>> when the zhintpause extension was really available on the running
> >>>>>>>>>> machine while now the pause instruction is called every time.
> >>>>>>>>>>
> >>>>>>>>>> So I'm just wondering, can't this run into some "illegal instruction"
> >>>>>>>>>> thingy on machines not supporting the extension?
> >>>>>>>>>
> >>>>>>>>> No. The encoding for pause was deliberately chosen to be one of the
> >>>>>>>>> “useless” encodings of fence, with the hope that existing
> >>>>>>>>> microarchitectures might take a while to execute it and thus it would
> >>>>>>>>> still function as a slow-running instruction. It’s somewhat
> >>>>>>>>> questionable whether the div is even needed, the worst that happens is
> >>>>>>>>> cpu_relax isn’t very relaxed and you spin a bit faster. Any
> >>>>>>>>> implementations where that’s true probably also don’t have fancy
> >>>>>>>>> clock/power management anyway, and div isn’t going to be a low-power
> >>>>>>>>> operation so the only real effect is likely hammering on contended
> >>>>>>>>> atomics a bit more, and who cares about that on the low core count
> >>>>>>>>> systems we have today.
> >>>>>>>>
> >>>>>>>> thanks a lot for that explanation, which made things a lot clearer.
> >>>>>>>>
> >>>>>>>> So as you said, dropping the div part might make the function even smaller,
> >>>>>>>> though somehow part of me would want to add some sort of comment to
> >>>>>>>> the function for when the next developer stumbles over the unconditional
> >>>>>>>> use of pause :-) .
> >>>>>>>>
> >>>>>>>
> >>>>>>> I agree. If that's what microarch will do, we can drop div altogether.
> >>>>>>> Though microarch may be treated as nop even if it is undesirable.
> >>>>>>> IIRC, the div was introduced for the rocket chip which would induce a
> >>>>>>> long latency stall with div instruction (zero as operands).
> >>>>>>>
> >>>>>>> Does any other core or newer rocket chip actually induce a latency
> >>>>>>> stall with div instruction ?
> >>>>>>> If not, it is equivalent to NOP as well. We can definitely remove the div.
> >>>>>>> The only cores affected will be the older rocket core.
> >>>>>>>
> >>>>>>> Tagging some folks to understand what their core does.
> >>>>>>>
> >>>>>>> @Paul Walmsley @Guo Ren @Conor Dooley ?
> >>>>>>
> >>>>>> I am no microarch expert by _any_ stretch of the imagination, but
> >>>>>> from a quick experiment it looks like the u54s on PolarFire SoC behave
> >>>>>> in the same way, and div w/ zero operands does in fact take significantly
> >>>>>> longer than regular division (looks to be about 3x).
> >>>>>>
> >>>>>
> >>>>> Thanks. Do you have any data on how much the "pause" instruction takes.
> >>>>
> >>>> So these numbers you may consider as being pulled out of a magic hat
> >>>> as all I am doing is reading the counters from userspace and there is
> >>>> some variance etc. Plus the fact that I just started hacking at some
> >>>> existing code I had lying around as I'm pretty snowed under at the
> >>>> moment.
> >>>>
> >>>> Doing the following takes about 70 cycles on both a PolarFire SoC and an
> >>>> unmatched:
> >>>> long divisor = 2, dividend = 100000, dest;
> >>>> asm("div %0, zero, zero" : "=r" (dest));
> >>>> and equates to:
> >>>> sd a5,-48(s0)
> >>>> div a5,zero,zero
> >>>>
> >>>> Clocking in at about 40 cycles is some actual divisions, I just did the
> >>>> following a dozen times, doing a trivial computation:
> >>>> long divisor = 2, dividend = 100000, dest;
> >>>> asm("div %0, %1, %2" : "=r" (dividend) : "r" (dividend), "r" (divisor))
> >>>>
> >>>> ie, a load of the following:
> >>>> sd a5,-48(s0)
> >>>> ld a5,-48(s0)
> >>>> ld a4,-40(s0)
> >>>> div a5,a5,a4
> >>>>
> >>>> So clearly the div w/ zero args makes a difference...
> >>>>
> >>>> On PolarFire SoC, `0x100000F` takes approx 6 cycles. On my unmatched, it
> >>>> takes approx 40. Again, I just had an asm block & called the instruction
> >>>> a number times and took the average - here it was 48 times.
> >>>>
> >>>> Take the actual numbers with a fist full of salt, but at least the
> >>>> relative numbers should be of some use to you.
> >>>>
> >>>> Hope that's somewhat helpful, maybe next week I can do something a
> >>>> little more useful for you...
> >>>>
> >>>
> >>> Thanks. It would be good to understand what happens when "pause" is
> >>> executed on these boards ?
> >>
> >> The actual pause instruction? uhh, so with the usual "I don't know what
> >> I am doing" disclaimer, I ran each of the .insn and pause instruction 48
> >> times in a row and checked the time elapsed via rdcycle & then ran that
> >> c program 1000 times in a bash loop. Got the below, the insns were run
> >> first and then the pauses.
> >> insn pause
> >> min 2.3 3.2
> >> max 9.5 10.6
> >> avg 27.0 29.1
> >> 5% 2.9 4.2
> >> 95% 18.1 19.1
> >>
> >> Swapping the pause & insn order around made a minor difference, but not
> >> enough to report on. I'd be very wary of drawing any real conclusions
> >> from this data, but at least both are roughly similar (and certainly not
> >> even close to doing the div w/ zero args.
> >>
> >
> > Yeah. That's what I was expecting. So we can't drop the div for now. Otherwise,
> > the existing hardware(don't support Zhintpause) suffers by spinning faster.
>
> But does that actually matter in practice? If it doesn’t noticeable
> affect performance then you don’t need to keep the div. There are a lot
> of architectures that even just define cpu_relax() as barrier().
Div is not semantic accurate for standard code, it should be in
vendors' errata. I agree to leave nop as default and put a pause
instead after the feature is detected.

>
> Jess
>
> > Thanks for running the experiments.
> >
> >> Again, hope that is helpful?
> >> Conor.
> >>
> >>>
> >>>> Conor.
> >>>>
> >>>>> I understand that it is not available in these cores. Just wanted to
> >>>>> understand if microarchitecture
> >>>>> actually takes a while executing the useless encoding as pointed out by Jessica.
> >>>>>
> >>>>> If that's the case, we can remove the div instruction altogether.
> >>>>> Otherwise, this patch will cause some performance regression
> >>>>> for existing SoC (HiFive unleashed has the same core. Not sure about
> >>>>> unmatched though).
> >>>>> This needs to be documented at least.
> >>>>>
> >>>>>> Hope that's helpful,
> >>>>>> Conor.
> >>>>>>
> >>>>>> (I just did a quick check of what pretty much amounted to a bunch of
> >>>>>> div a5,zero,zero in a row versus div a5,a5,a5)
> >>>>>>
> >>>>>>>
> >>>>>>> (Please add anybody who may have an insight to execution flow on
> >>>>>>> existing Linux capable cores)
> >>>>>>>
> >>
> >
> >
> > --
> > Regards,
> > Atish
>


--
Best Regards
Guo Ren

2022-10-05 02:35:21

by Guo Ren

[permalink] [raw]
Subject: Re: [PATCH] riscv: Fix build with CONFIG_CC_OPTIMIZE_FOR_SIZE=y

,

On Wed, Oct 5, 2022 at 9:01 AM Jessica Clarke <[email protected]> wrote:
>
> On 5 Oct 2022, at 01:38, Guo Ren <[email protected]> wrote:
> >
> > On Wed, Oct 5, 2022 at 1:24 AM Jessica Clarke <[email protected]> wrote:
> >>
> >> On 4 Oct 2022, at 17:52, Atish Patra <[email protected]> wrote:
> >>>
> >>> On Sat, Oct 1, 2022 at 1:13 PM Conor Dooley <[email protected]> wrote:
> >>>>
> >>>> On Wed, Sep 28, 2022 at 08:26:01PM -0700, Atish Patra wrote:
> >>>>> On Wed, Sep 28, 2022 at 2:16 PM Conor Dooley <[email protected]> wrote:
> >>>>>>
> >>>>>> On Wed, Sep 28, 2022 at 12:21:55AM -0700, Atish Patra wrote:
> >>>>>>> On Sat, Sep 24, 2022 at 4:15 PM Conor Dooley <[email protected]> wrote:
> >>>>>>>>
> >>>>>>>> On Fri, Sep 23, 2022 at 11:01:28AM -0700, Atish Patra wrote:
> >>>>>>>>> On Fri, Sep 23, 2022 at 12:18 AM Heiko Stuebner <[email protected]> wrote:
> >>>>>>>>>>
> >>>>>>>>>> Hi,
> >>>>>>>>>>
> >>>>>>>>>> Am Donnerstag, 22. September 2022, 17:52:46 CEST schrieb Jessica Clarke:
> >>>>>>>>>>> On 22 Sept 2022, at 16:45, Heiko Stuebner <[email protected]> wrote:
> >>>>>>>>>>>>
> >>>>>>>>>>>> Am Donnerstag, 22. September 2022, 08:09:58 CEST schrieb Samuel Holland:
> >>>>>>>>>>>>> commit 8eb060e10185 ("arch/riscv: add Zihintpause support") broke
> >>>>>>>>>>>>> building with CONFIG_CC_OPTIMIZE_FOR_SIZE enabled (gcc 11.1.0):
> >>>>>>>>>>>>>
> >>>>>>>>>>>>> CC arch/riscv/kernel/vdso/vgettimeofday.o
> >>>>>>>>>>>>> In file included from <command-line>:
> >>>>>>>>>>>>> ./arch/riscv/include/asm/jump_label.h: In function 'cpu_relax':
> >>>>>>>>>>>>> ././include/linux/compiler_types.h:285:33: warning: 'asm' operand 0 probably does not match constraints
> >>>>>>>>>>>>> 285 | #define asm_volatile_goto(x...) asm goto(x)
> >>>>>>>>>>>>> | ^~~
> >>>>>>>>>>>>> ./arch/riscv/include/asm/jump_label.h:41:9: note: in expansion of macro 'asm_volatile_goto'
> >>>>>>>>>>>>> 41 | asm_volatile_goto(
> >>>>>>>>>>>>> | ^~~~~~~~~~~~~~~~~
> >>>>>>>>>>>>> ././include/linux/compiler_types.h:285:33: error: impossible constraint in 'asm'
> >>>>>>>>>>>>> 285 | #define asm_volatile_goto(x...) asm goto(x)
> >>>>>>>>>>>>> | ^~~
> >>>>>>>>>>>>> ./arch/riscv/include/asm/jump_label.h:41:9: note: in expansion of macro 'asm_volatile_goto'
> >>>>>>>>>>>>> 41 | asm_volatile_goto(
> >>>>>>>>>>>>> | ^~~~~~~~~~~~~~~~~
> >>>>>>>>>>>>> make[1]: *** [scripts/Makefile.build:249: arch/riscv/kernel/vdso/vgettimeofday.o] Error 1
> >>>>>>>>>>>>> make: *** [arch/riscv/Makefile:128: vdso_prepare] Error 2
> >>>>>>>>>>>>>
> >>>>>>>>>>>>> Having a static branch in cpu_relax() is problematic because that
> >>>>>>>>>>>>> function is widely inlined, including in some quite complex functions
> >>>>>>>>>>>>> like in the VDSO. A quick measurement shows this static branch is
> >>>>>>>>>>>>> responsible by itself for around 40% of the jump table.
> >>>>>>>>>>>>>
> >>>>>>>>>>>>> Drop the static branch, which ends up being the same number of
> >>>>>>>>>>>>> instructions anyway. If Zihintpause is supported, we trade the nop from
> >>>>>>>>>>>>> the static branch for a div. If Zihintpause is unsupported, we trade the
> >>>>>>>>>>>>> jump from the static branch for (what gets interpreted as) a nop.
> >>>>>>>>>>>>>
> >>>>>>>>>>>>> Fixes: 8eb060e10185 ("arch/riscv: add Zihintpause support")
> >>>>>>>>>>>>> Signed-off-by: Samuel Holland <[email protected]>
> >>>>>>>>>>>>> ---
> >>>>>>>>>>>>>
> >>>>>>>>>>>>> arch/riscv/include/asm/hwcap.h | 3 ---
> >>>>>>>>>>>>> arch/riscv/include/asm/vdso/processor.h | 25 ++++++++++---------------
> >>>>>>>>>>>>> 2 files changed, 10 insertions(+), 18 deletions(-)
> >>>>>>>>>>>>>
> >>>>>>>>>>>>> diff --git a/arch/riscv/include/asm/hwcap.h b/arch/riscv/include/asm/hwcap.h
> >>>>>>>>>>>>> index 6f59ec64175e..b21d46e68386 100644
> >>>>>>>>>>>>> --- a/arch/riscv/include/asm/hwcap.h
> >>>>>>>>>>>>> +++ b/arch/riscv/include/asm/hwcap.h
> >>>>>>>>>>>>> @@ -68,7 +68,6 @@ enum riscv_isa_ext_id {
> >>>>>>>>>>>>> */
> >>>>>>>>>>>>> enum riscv_isa_ext_key {
> >>>>>>>>>>>>> RISCV_ISA_EXT_KEY_FPU, /* For 'F' and 'D' */
> >>>>>>>>>>>>> - RISCV_ISA_EXT_KEY_ZIHINTPAUSE,
> >>>>>>>>>>>>> RISCV_ISA_EXT_KEY_MAX,
> >>>>>>>>>>>>> };
> >>>>>>>>>>>>>
> >>>>>>>>>>>>> @@ -88,8 +87,6 @@ static __always_inline int riscv_isa_ext2key(int num)
> >>>>>>>>>>>>> return RISCV_ISA_EXT_KEY_FPU;
> >>>>>>>>>>>>> case RISCV_ISA_EXT_d:
> >>>>>>>>>>>>> return RISCV_ISA_EXT_KEY_FPU;
> >>>>>>>>>>>>> - case RISCV_ISA_EXT_ZIHINTPAUSE:
> >>>>>>>>>>>>> - return RISCV_ISA_EXT_KEY_ZIHINTPAUSE;
> >>>>>>>>>>>>> default:
> >>>>>>>>>>>>> return -EINVAL;
> >>>>>>>>>>>>> }
> >>>>>>>>>>>>> diff --git a/arch/riscv/include/asm/vdso/processor.h b/arch/riscv/include/asm/vdso/processor.h
> >>>>>>>>>>>>> index 1e4f8b4aef79..789bdb8211a2 100644
> >>>>>>>>>>>>> --- a/arch/riscv/include/asm/vdso/processor.h
> >>>>>>>>>>>>> +++ b/arch/riscv/include/asm/vdso/processor.h
> >>>>>>>>>>>>> @@ -4,30 +4,25 @@
> >>>>>>>>>>>>>
> >>>>>>>>>>>>> #ifndef __ASSEMBLY__
> >>>>>>>>>>>>>
> >>>>>>>>>>>>> -#include <linux/jump_label.h>
> >>>>>>>>>>>>> #include <asm/barrier.h>
> >>>>>>>>>>>>> -#include <asm/hwcap.h>
> >>>>>>>>>>>>>
> >>>>>>>>>>>>> static inline void cpu_relax(void)
> >>>>>>>>>>>>> {
> >>>>>>>>>>>>> - if (!static_branch_likely(&riscv_isa_ext_keys[RISCV_ISA_EXT_KEY_ZIHINTPAUSE])) {
> >>>>>>>>>>>>> #ifdef __riscv_muldiv
> >>>>>>>>>>>>> - int dummy;
> >>>>>>>>>>>>> - /* In lieu of a halt instruction, induce a long-latency stall. */
> >>>>>>>>>>>>> - __asm__ __volatile__ ("div %0, %0, zero" : "=r" (dummy));
> >>>>>>>>>>>>> + int dummy;
> >>>>>>>>>>>>> + /* In lieu of a halt instruction, induce a long-latency stall. */
> >>>>>>>>>>>>> + __asm__ __volatile__ ("div %0, %0, zero" : "=r" (dummy));
> >>>>>>>>>>>>> #endif
> >>>>>>>>>>>>> - } else {
> >>>>>>>>>>>>> - /*
> >>>>>>>>>>>>> - * Reduce instruction retirement.
> >>>>>>>>>>>>> - * This assumes the PC changes.
> >>>>>>>>>>>>> - */
> >>>>>>>>>>>>> + /*
> >>>>>>>>>>>>> + * Reduce instruction retirement.
> >>>>>>>>>>>>> + * This assumes the PC changes.
> >>>>>>>>>>>>> + */
> >>>>>>>>>>>>> #ifdef __riscv_zihintpause
> >>>>>>>>>>>>> - __asm__ __volatile__ ("pause");
> >>>>>>>>>>>>> + __asm__ __volatile__ ("pause");
> >>>>>>>>>>>>> #else
> >>>>>>>>>>>>> - /* Encoding of the pause instruction */
> >>>>>>>>>>>>> - __asm__ __volatile__ (".4byte 0x100000F");
> >>>>>>>>>>>>> + /* Encoding of the pause instruction */
> >>>>>>>>>>>>> + __asm__ __volatile__ (".4byte 0x100000F");
> >>>>>>>>>>>>> #endif
> >>>>>>>>>>>>
> >>>>>>>>>>>> hmm, though before this part of the code was only ever accessed
> >>>>>>>>>>>> when the zhintpause extension was really available on the running
> >>>>>>>>>>>> machine while now the pause instruction is called every time.
> >>>>>>>>>>>>
> >>>>>>>>>>>> So I'm just wondering, can't this run into some "illegal instruction"
> >>>>>>>>>>>> thingy on machines not supporting the extension?
> >>>>>>>>>>>
> >>>>>>>>>>> No. The encoding for pause was deliberately chosen to be one of the
> >>>>>>>>>>> “useless” encodings of fence, with the hope that existing
> >>>>>>>>>>> microarchitectures might take a while to execute it and thus it would
> >>>>>>>>>>> still function as a slow-running instruction. It’s somewhat
> >>>>>>>>>>> questionable whether the div is even needed, the worst that happens is
> >>>>>>>>>>> cpu_relax isn’t very relaxed and you spin a bit faster. Any
> >>>>>>>>>>> implementations where that’s true probably also don’t have fancy
> >>>>>>>>>>> clock/power management anyway, and div isn’t going to be a low-power
> >>>>>>>>>>> operation so the only real effect is likely hammering on contended
> >>>>>>>>>>> atomics a bit more, and who cares about that on the low core count
> >>>>>>>>>>> systems we have today.
> >>>>>>>>>>
> >>>>>>>>>> thanks a lot for that explanation, which made things a lot clearer.
> >>>>>>>>>>
> >>>>>>>>>> So as you said, dropping the div part might make the function even smaller,
> >>>>>>>>>> though somehow part of me would want to add some sort of comment to
> >>>>>>>>>> the function for when the next developer stumbles over the unconditional
> >>>>>>>>>> use of pause :-) .
> >>>>>>>>>>
> >>>>>>>>>
> >>>>>>>>> I agree. If that's what microarch will do, we can drop div altogether.
> >>>>>>>>> Though microarch may be treated as nop even if it is undesirable.
> >>>>>>>>> IIRC, the div was introduced for the rocket chip which would induce a
> >>>>>>>>> long latency stall with div instruction (zero as operands).
> >>>>>>>>>
> >>>>>>>>> Does any other core or newer rocket chip actually induce a latency
> >>>>>>>>> stall with div instruction ?
> >>>>>>>>> If not, it is equivalent to NOP as well. We can definitely remove the div.
> >>>>>>>>> The only cores affected will be the older rocket core.
> >>>>>>>>>
> >>>>>>>>> Tagging some folks to understand what their core does.
> >>>>>>>>>
> >>>>>>>>> @Paul Walmsley @Guo Ren @Conor Dooley ?
> >>>>>>>>
> >>>>>>>> I am no microarch expert by _any_ stretch of the imagination, but
> >>>>>>>> from a quick experiment it looks like the u54s on PolarFire SoC behave
> >>>>>>>> in the same way, and div w/ zero operands does in fact take significantly
> >>>>>>>> longer than regular division (looks to be about 3x).
> >>>>>>>>
> >>>>>>>
> >>>>>>> Thanks. Do you have any data on how much the "pause" instruction takes.
> >>>>>>
> >>>>>> So these numbers you may consider as being pulled out of a magic hat
> >>>>>> as all I am doing is reading the counters from userspace and there is
> >>>>>> some variance etc. Plus the fact that I just started hacking at some
> >>>>>> existing code I had lying around as I'm pretty snowed under at the
> >>>>>> moment.
> >>>>>>
> >>>>>> Doing the following takes about 70 cycles on both a PolarFire SoC and an
> >>>>>> unmatched:
> >>>>>> long divisor = 2, dividend = 100000, dest;
> >>>>>> asm("div %0, zero, zero" : "=r" (dest));
> >>>>>> and equates to:
> >>>>>> sd a5,-48(s0)
> >>>>>> div a5,zero,zero
> >>>>>>
> >>>>>> Clocking in at about 40 cycles is some actual divisions, I just did the
> >>>>>> following a dozen times, doing a trivial computation:
> >>>>>> long divisor = 2, dividend = 100000, dest;
> >>>>>> asm("div %0, %1, %2" : "=r" (dividend) : "r" (dividend), "r" (divisor))
> >>>>>>
> >>>>>> ie, a load of the following:
> >>>>>> sd a5,-48(s0)
> >>>>>> ld a5,-48(s0)
> >>>>>> ld a4,-40(s0)
> >>>>>> div a5,a5,a4
> >>>>>>
> >>>>>> So clearly the div w/ zero args makes a difference...
> >>>>>>
> >>>>>> On PolarFire SoC, `0x100000F` takes approx 6 cycles. On my unmatched, it
> >>>>>> takes approx 40. Again, I just had an asm block & called the instruction
> >>>>>> a number times and took the average - here it was 48 times.
> >>>>>>
> >>>>>> Take the actual numbers with a fist full of salt, but at least the
> >>>>>> relative numbers should be of some use to you.
> >>>>>>
> >>>>>> Hope that's somewhat helpful, maybe next week I can do something a
> >>>>>> little more useful for you...
> >>>>>>
> >>>>>
> >>>>> Thanks. It would be good to understand what happens when "pause" is
> >>>>> executed on these boards ?
> >>>>
> >>>> The actual pause instruction? uhh, so with the usual "I don't know what
> >>>> I am doing" disclaimer, I ran each of the .insn and pause instruction 48
> >>>> times in a row and checked the time elapsed via rdcycle & then ran that
> >>>> c program 1000 times in a bash loop. Got the below, the insns were run
> >>>> first and then the pauses.
> >>>> insn pause
> >>>> min 2.3 3.2
> >>>> max 9.5 10.6
> >>>> avg 27.0 29.1
> >>>> 5% 2.9 4.2
> >>>> 95% 18.1 19.1
> >>>>
> >>>> Swapping the pause & insn order around made a minor difference, but not
> >>>> enough to report on. I'd be very wary of drawing any real conclusions
> >>>> from this data, but at least both are roughly similar (and certainly not
> >>>> even close to doing the div w/ zero args.
> >>>>
> >>>
> >>> Yeah. That's what I was expecting. So we can't drop the div for now. Otherwise,
> >>> the existing hardware(don't support Zhintpause) suffers by spinning faster.
> >>
> >> But does that actually matter in practice? If it doesn’t noticeable
> >> affect performance then you don’t need to keep the div. There are a lot
> >> of architectures that even just define cpu_relax() as barrier().
> > Div is not semantic accurate for standard code, it should be in
> > vendors' errata. I agree to leave nop as default and put a pause
> > instead after the feature is detected.
>
> Nobody’s suggesting a literal nop instruction, that would be worse than
> either div or pause. It’s always safe to execute pause, the question is
> just whether on existing systems that don’t implement Zihintpause it
> gets executed too quickly such that performance is degraded due to
> spinning more aggressively.
Why do you ensure pause can't be an illegal instruction in some old machine?
Why do you ensure div could save power for all microarchitectures?

nop (default) -> div/<other instructions> (moved into vendor errata)
-> pause (when ZiHintPause feature detected)

>
> Jess
>
> >>
> >> Jess
> >>
> >>> Thanks for running the experiments.
> >>>
> >>>> Again, hope that is helpful?
> >>>> Conor.
> >>>>
> >>>>>
> >>>>>> Conor.
> >>>>>>
> >>>>>>> I understand that it is not available in these cores. Just wanted to
> >>>>>>> understand if microarchitecture
> >>>>>>> actually takes a while executing the useless encoding as pointed out by Jessica.
> >>>>>>>
> >>>>>>> If that's the case, we can remove the div instruction altogether.
> >>>>>>> Otherwise, this patch will cause some performance regression
> >>>>>>> for existing SoC (HiFive unleashed has the same core. Not sure about
> >>>>>>> unmatched though).
> >>>>>>> This needs to be documented at least.
> >>>>>>>
> >>>>>>>> Hope that's helpful,
> >>>>>>>> Conor.
> >>>>>>>>
> >>>>>>>> (I just did a quick check of what pretty much amounted to a bunch of
> >>>>>>>> div a5,zero,zero in a row versus div a5,a5,a5)
> >>>>>>>>
> >>>>>>>>>
> >>>>>>>>> (Please add anybody who may have an insight to execution flow on
> >>>>>>>>> existing Linux capable cores)
> >>>>>>>>>
> >>>>
> >>>
> >>>
> >>> --
> >>> Regards,
> >>> Atish
> >>
> >
> >
> > --
> > Best Regards
> > Guo Ren
>


--
Best Regards
Guo Ren

2022-10-05 02:36:33

by Jessica Clarke

[permalink] [raw]
Subject: Re: [PATCH] riscv: Fix build with CONFIG_CC_OPTIMIZE_FOR_SIZE=y

On 5 Oct 2022, at 01:38, Guo Ren <[email protected]> wrote:
>
> On Wed, Oct 5, 2022 at 1:24 AM Jessica Clarke <[email protected]> wrote:
>>
>> On 4 Oct 2022, at 17:52, Atish Patra <[email protected]> wrote:
>>>
>>> On Sat, Oct 1, 2022 at 1:13 PM Conor Dooley <[email protected]> wrote:
>>>>
>>>> On Wed, Sep 28, 2022 at 08:26:01PM -0700, Atish Patra wrote:
>>>>> On Wed, Sep 28, 2022 at 2:16 PM Conor Dooley <[email protected]> wrote:
>>>>>>
>>>>>> On Wed, Sep 28, 2022 at 12:21:55AM -0700, Atish Patra wrote:
>>>>>>> On Sat, Sep 24, 2022 at 4:15 PM Conor Dooley <[email protected]> wrote:
>>>>>>>>
>>>>>>>> On Fri, Sep 23, 2022 at 11:01:28AM -0700, Atish Patra wrote:
>>>>>>>>> On Fri, Sep 23, 2022 at 12:18 AM Heiko Stuebner <[email protected]> wrote:
>>>>>>>>>>
>>>>>>>>>> Hi,
>>>>>>>>>>
>>>>>>>>>> Am Donnerstag, 22. September 2022, 17:52:46 CEST schrieb Jessica Clarke:
>>>>>>>>>>> On 22 Sept 2022, at 16:45, Heiko Stuebner <[email protected]> wrote:
>>>>>>>>>>>>
>>>>>>>>>>>> Am Donnerstag, 22. September 2022, 08:09:58 CEST schrieb Samuel Holland:
>>>>>>>>>>>>> commit 8eb060e10185 ("arch/riscv: add Zihintpause support") broke
>>>>>>>>>>>>> building with CONFIG_CC_OPTIMIZE_FOR_SIZE enabled (gcc 11.1.0):
>>>>>>>>>>>>>
>>>>>>>>>>>>> CC arch/riscv/kernel/vdso/vgettimeofday.o
>>>>>>>>>>>>> In file included from <command-line>:
>>>>>>>>>>>>> ./arch/riscv/include/asm/jump_label.h: In function 'cpu_relax':
>>>>>>>>>>>>> ././include/linux/compiler_types.h:285:33: warning: 'asm' operand 0 probably does not match constraints
>>>>>>>>>>>>> 285 | #define asm_volatile_goto(x...) asm goto(x)
>>>>>>>>>>>>> | ^~~
>>>>>>>>>>>>> ./arch/riscv/include/asm/jump_label.h:41:9: note: in expansion of macro 'asm_volatile_goto'
>>>>>>>>>>>>> 41 | asm_volatile_goto(
>>>>>>>>>>>>> | ^~~~~~~~~~~~~~~~~
>>>>>>>>>>>>> ././include/linux/compiler_types.h:285:33: error: impossible constraint in 'asm'
>>>>>>>>>>>>> 285 | #define asm_volatile_goto(x...) asm goto(x)
>>>>>>>>>>>>> | ^~~
>>>>>>>>>>>>> ./arch/riscv/include/asm/jump_label.h:41:9: note: in expansion of macro 'asm_volatile_goto'
>>>>>>>>>>>>> 41 | asm_volatile_goto(
>>>>>>>>>>>>> | ^~~~~~~~~~~~~~~~~
>>>>>>>>>>>>> make[1]: *** [scripts/Makefile.build:249: arch/riscv/kernel/vdso/vgettimeofday.o] Error 1
>>>>>>>>>>>>> make: *** [arch/riscv/Makefile:128: vdso_prepare] Error 2
>>>>>>>>>>>>>
>>>>>>>>>>>>> Having a static branch in cpu_relax() is problematic because that
>>>>>>>>>>>>> function is widely inlined, including in some quite complex functions
>>>>>>>>>>>>> like in the VDSO. A quick measurement shows this static branch is
>>>>>>>>>>>>> responsible by itself for around 40% of the jump table.
>>>>>>>>>>>>>
>>>>>>>>>>>>> Drop the static branch, which ends up being the same number of
>>>>>>>>>>>>> instructions anyway. If Zihintpause is supported, we trade the nop from
>>>>>>>>>>>>> the static branch for a div. If Zihintpause is unsupported, we trade the
>>>>>>>>>>>>> jump from the static branch for (what gets interpreted as) a nop.
>>>>>>>>>>>>>
>>>>>>>>>>>>> Fixes: 8eb060e10185 ("arch/riscv: add Zihintpause support")
>>>>>>>>>>>>> Signed-off-by: Samuel Holland <[email protected]>
>>>>>>>>>>>>> ---
>>>>>>>>>>>>>
>>>>>>>>>>>>> arch/riscv/include/asm/hwcap.h | 3 ---
>>>>>>>>>>>>> arch/riscv/include/asm/vdso/processor.h | 25 ++++++++++---------------
>>>>>>>>>>>>> 2 files changed, 10 insertions(+), 18 deletions(-)
>>>>>>>>>>>>>
>>>>>>>>>>>>> diff --git a/arch/riscv/include/asm/hwcap.h b/arch/riscv/include/asm/hwcap.h
>>>>>>>>>>>>> index 6f59ec64175e..b21d46e68386 100644
>>>>>>>>>>>>> --- a/arch/riscv/include/asm/hwcap.h
>>>>>>>>>>>>> +++ b/arch/riscv/include/asm/hwcap.h
>>>>>>>>>>>>> @@ -68,7 +68,6 @@ enum riscv_isa_ext_id {
>>>>>>>>>>>>> */
>>>>>>>>>>>>> enum riscv_isa_ext_key {
>>>>>>>>>>>>> RISCV_ISA_EXT_KEY_FPU, /* For 'F' and 'D' */
>>>>>>>>>>>>> - RISCV_ISA_EXT_KEY_ZIHINTPAUSE,
>>>>>>>>>>>>> RISCV_ISA_EXT_KEY_MAX,
>>>>>>>>>>>>> };
>>>>>>>>>>>>>
>>>>>>>>>>>>> @@ -88,8 +87,6 @@ static __always_inline int riscv_isa_ext2key(int num)
>>>>>>>>>>>>> return RISCV_ISA_EXT_KEY_FPU;
>>>>>>>>>>>>> case RISCV_ISA_EXT_d:
>>>>>>>>>>>>> return RISCV_ISA_EXT_KEY_FPU;
>>>>>>>>>>>>> - case RISCV_ISA_EXT_ZIHINTPAUSE:
>>>>>>>>>>>>> - return RISCV_ISA_EXT_KEY_ZIHINTPAUSE;
>>>>>>>>>>>>> default:
>>>>>>>>>>>>> return -EINVAL;
>>>>>>>>>>>>> }
>>>>>>>>>>>>> diff --git a/arch/riscv/include/asm/vdso/processor.h b/arch/riscv/include/asm/vdso/processor.h
>>>>>>>>>>>>> index 1e4f8b4aef79..789bdb8211a2 100644
>>>>>>>>>>>>> --- a/arch/riscv/include/asm/vdso/processor.h
>>>>>>>>>>>>> +++ b/arch/riscv/include/asm/vdso/processor.h
>>>>>>>>>>>>> @@ -4,30 +4,25 @@
>>>>>>>>>>>>>
>>>>>>>>>>>>> #ifndef __ASSEMBLY__
>>>>>>>>>>>>>
>>>>>>>>>>>>> -#include <linux/jump_label.h>
>>>>>>>>>>>>> #include <asm/barrier.h>
>>>>>>>>>>>>> -#include <asm/hwcap.h>
>>>>>>>>>>>>>
>>>>>>>>>>>>> static inline void cpu_relax(void)
>>>>>>>>>>>>> {
>>>>>>>>>>>>> - if (!static_branch_likely(&riscv_isa_ext_keys[RISCV_ISA_EXT_KEY_ZIHINTPAUSE])) {
>>>>>>>>>>>>> #ifdef __riscv_muldiv
>>>>>>>>>>>>> - int dummy;
>>>>>>>>>>>>> - /* In lieu of a halt instruction, induce a long-latency stall. */
>>>>>>>>>>>>> - __asm__ __volatile__ ("div %0, %0, zero" : "=r" (dummy));
>>>>>>>>>>>>> + int dummy;
>>>>>>>>>>>>> + /* In lieu of a halt instruction, induce a long-latency stall. */
>>>>>>>>>>>>> + __asm__ __volatile__ ("div %0, %0, zero" : "=r" (dummy));
>>>>>>>>>>>>> #endif
>>>>>>>>>>>>> - } else {
>>>>>>>>>>>>> - /*
>>>>>>>>>>>>> - * Reduce instruction retirement.
>>>>>>>>>>>>> - * This assumes the PC changes.
>>>>>>>>>>>>> - */
>>>>>>>>>>>>> + /*
>>>>>>>>>>>>> + * Reduce instruction retirement.
>>>>>>>>>>>>> + * This assumes the PC changes.
>>>>>>>>>>>>> + */
>>>>>>>>>>>>> #ifdef __riscv_zihintpause
>>>>>>>>>>>>> - __asm__ __volatile__ ("pause");
>>>>>>>>>>>>> + __asm__ __volatile__ ("pause");
>>>>>>>>>>>>> #else
>>>>>>>>>>>>> - /* Encoding of the pause instruction */
>>>>>>>>>>>>> - __asm__ __volatile__ (".4byte 0x100000F");
>>>>>>>>>>>>> + /* Encoding of the pause instruction */
>>>>>>>>>>>>> + __asm__ __volatile__ (".4byte 0x100000F");
>>>>>>>>>>>>> #endif
>>>>>>>>>>>>
>>>>>>>>>>>> hmm, though before this part of the code was only ever accessed
>>>>>>>>>>>> when the zhintpause extension was really available on the running
>>>>>>>>>>>> machine while now the pause instruction is called every time.
>>>>>>>>>>>>
>>>>>>>>>>>> So I'm just wondering, can't this run into some "illegal instruction"
>>>>>>>>>>>> thingy on machines not supporting the extension?
>>>>>>>>>>>
>>>>>>>>>>> No. The encoding for pause was deliberately chosen to be one of the
>>>>>>>>>>> “useless” encodings of fence, with the hope that existing
>>>>>>>>>>> microarchitectures might take a while to execute it and thus it would
>>>>>>>>>>> still function as a slow-running instruction. It’s somewhat
>>>>>>>>>>> questionable whether the div is even needed, the worst that happens is
>>>>>>>>>>> cpu_relax isn’t very relaxed and you spin a bit faster. Any
>>>>>>>>>>> implementations where that’s true probably also don’t have fancy
>>>>>>>>>>> clock/power management anyway, and div isn’t going to be a low-power
>>>>>>>>>>> operation so the only real effect is likely hammering on contended
>>>>>>>>>>> atomics a bit more, and who cares about that on the low core count
>>>>>>>>>>> systems we have today.
>>>>>>>>>>
>>>>>>>>>> thanks a lot for that explanation, which made things a lot clearer.
>>>>>>>>>>
>>>>>>>>>> So as you said, dropping the div part might make the function even smaller,
>>>>>>>>>> though somehow part of me would want to add some sort of comment to
>>>>>>>>>> the function for when the next developer stumbles over the unconditional
>>>>>>>>>> use of pause :-) .
>>>>>>>>>>
>>>>>>>>>
>>>>>>>>> I agree. If that's what microarch will do, we can drop div altogether.
>>>>>>>>> Though microarch may be treated as nop even if it is undesirable.
>>>>>>>>> IIRC, the div was introduced for the rocket chip which would induce a
>>>>>>>>> long latency stall with div instruction (zero as operands).
>>>>>>>>>
>>>>>>>>> Does any other core or newer rocket chip actually induce a latency
>>>>>>>>> stall with div instruction ?
>>>>>>>>> If not, it is equivalent to NOP as well. We can definitely remove the div.
>>>>>>>>> The only cores affected will be the older rocket core.
>>>>>>>>>
>>>>>>>>> Tagging some folks to understand what their core does.
>>>>>>>>>
>>>>>>>>> @Paul Walmsley @Guo Ren @Conor Dooley ?
>>>>>>>>
>>>>>>>> I am no microarch expert by _any_ stretch of the imagination, but
>>>>>>>> from a quick experiment it looks like the u54s on PolarFire SoC behave
>>>>>>>> in the same way, and div w/ zero operands does in fact take significantly
>>>>>>>> longer than regular division (looks to be about 3x).
>>>>>>>>
>>>>>>>
>>>>>>> Thanks. Do you have any data on how much the "pause" instruction takes.
>>>>>>
>>>>>> So these numbers you may consider as being pulled out of a magic hat
>>>>>> as all I am doing is reading the counters from userspace and there is
>>>>>> some variance etc. Plus the fact that I just started hacking at some
>>>>>> existing code I had lying around as I'm pretty snowed under at the
>>>>>> moment.
>>>>>>
>>>>>> Doing the following takes about 70 cycles on both a PolarFire SoC and an
>>>>>> unmatched:
>>>>>> long divisor = 2, dividend = 100000, dest;
>>>>>> asm("div %0, zero, zero" : "=r" (dest));
>>>>>> and equates to:
>>>>>> sd a5,-48(s0)
>>>>>> div a5,zero,zero
>>>>>>
>>>>>> Clocking in at about 40 cycles is some actual divisions, I just did the
>>>>>> following a dozen times, doing a trivial computation:
>>>>>> long divisor = 2, dividend = 100000, dest;
>>>>>> asm("div %0, %1, %2" : "=r" (dividend) : "r" (dividend), "r" (divisor))
>>>>>>
>>>>>> ie, a load of the following:
>>>>>> sd a5,-48(s0)
>>>>>> ld a5,-48(s0)
>>>>>> ld a4,-40(s0)
>>>>>> div a5,a5,a4
>>>>>>
>>>>>> So clearly the div w/ zero args makes a difference...
>>>>>>
>>>>>> On PolarFire SoC, `0x100000F` takes approx 6 cycles. On my unmatched, it
>>>>>> takes approx 40. Again, I just had an asm block & called the instruction
>>>>>> a number times and took the average - here it was 48 times.
>>>>>>
>>>>>> Take the actual numbers with a fist full of salt, but at least the
>>>>>> relative numbers should be of some use to you.
>>>>>>
>>>>>> Hope that's somewhat helpful, maybe next week I can do something a
>>>>>> little more useful for you...
>>>>>>
>>>>>
>>>>> Thanks. It would be good to understand what happens when "pause" is
>>>>> executed on these boards ?
>>>>
>>>> The actual pause instruction? uhh, so with the usual "I don't know what
>>>> I am doing" disclaimer, I ran each of the .insn and pause instruction 48
>>>> times in a row and checked the time elapsed via rdcycle & then ran that
>>>> c program 1000 times in a bash loop. Got the below, the insns were run
>>>> first and then the pauses.
>>>> insn pause
>>>> min 2.3 3.2
>>>> max 9.5 10.6
>>>> avg 27.0 29.1
>>>> 5% 2.9 4.2
>>>> 95% 18.1 19.1
>>>>
>>>> Swapping the pause & insn order around made a minor difference, but not
>>>> enough to report on. I'd be very wary of drawing any real conclusions
>>>> from this data, but at least both are roughly similar (and certainly not
>>>> even close to doing the div w/ zero args.
>>>>
>>>
>>> Yeah. That's what I was expecting. So we can't drop the div for now. Otherwise,
>>> the existing hardware(don't support Zhintpause) suffers by spinning faster.
>>
>> But does that actually matter in practice? If it doesn’t noticeable
>> affect performance then you don’t need to keep the div. There are a lot
>> of architectures that even just define cpu_relax() as barrier().
> Div is not semantic accurate for standard code, it should be in
> vendors' errata. I agree to leave nop as default and put a pause
> instead after the feature is detected.

Nobody’s suggesting a literal nop instruction, that would be worse than
either div or pause. It’s always safe to execute pause, the question is
just whether on existing systems that don’t implement Zihintpause it
gets executed too quickly such that performance is degraded due to
spinning more aggressively.

Jess

>>
>> Jess
>>
>>> Thanks for running the experiments.
>>>
>>>> Again, hope that is helpful?
>>>> Conor.
>>>>
>>>>>
>>>>>> Conor.
>>>>>>
>>>>>>> I understand that it is not available in these cores. Just wanted to
>>>>>>> understand if microarchitecture
>>>>>>> actually takes a while executing the useless encoding as pointed out by Jessica.
>>>>>>>
>>>>>>> If that's the case, we can remove the div instruction altogether.
>>>>>>> Otherwise, this patch will cause some performance regression
>>>>>>> for existing SoC (HiFive unleashed has the same core. Not sure about
>>>>>>> unmatched though).
>>>>>>> This needs to be documented at least.
>>>>>>>
>>>>>>>> Hope that's helpful,
>>>>>>>> Conor.
>>>>>>>>
>>>>>>>> (I just did a quick check of what pretty much amounted to a bunch of
>>>>>>>> div a5,zero,zero in a row versus div a5,a5,a5)
>>>>>>>>
>>>>>>>>>
>>>>>>>>> (Please add anybody who may have an insight to execution flow on
>>>>>>>>> existing Linux capable cores)
>>>>>>>>>
>>>>
>>>
>>>
>>> --
>>> Regards,
>>> Atish
>>
>
>
> --
> Best Regards
> Guo Ren

2022-10-05 14:35:37

by Jessica Clarke

[permalink] [raw]
Subject: Re: [PATCH] riscv: Fix build with CONFIG_CC_OPTIMIZE_FOR_SIZE=y

On 5 Oct 2022, at 02:40, Guo Ren <[email protected]> wrote:
>
> ,
>
> On Wed, Oct 5, 2022 at 9:01 AM Jessica Clarke <[email protected]> wrote:
>>
>> On 5 Oct 2022, at 01:38, Guo Ren <[email protected]> wrote:
>>>
>>> On Wed, Oct 5, 2022 at 1:24 AM Jessica Clarke <[email protected]> wrote:
>>>>
>>>> On 4 Oct 2022, at 17:52, Atish Patra <[email protected]> wrote:
>>>>>
>>>>> On Sat, Oct 1, 2022 at 1:13 PM Conor Dooley <[email protected]> wrote:
>>>>>>
>>>>>> On Wed, Sep 28, 2022 at 08:26:01PM -0700, Atish Patra wrote:
>>>>>>> On Wed, Sep 28, 2022 at 2:16 PM Conor Dooley <[email protected]> wrote:
>>>>>>>>
>>>>>>>> On Wed, Sep 28, 2022 at 12:21:55AM -0700, Atish Patra wrote:
>>>>>>>>> On Sat, Sep 24, 2022 at 4:15 PM Conor Dooley <[email protected]> wrote:
>>>>>>>>>>
>>>>>>>>>> On Fri, Sep 23, 2022 at 11:01:28AM -0700, Atish Patra wrote:
>>>>>>>>>>> On Fri, Sep 23, 2022 at 12:18 AM Heiko Stuebner <[email protected]> wrote:
>>>>>>>>>>>>
>>>>>>>>>>>> Hi,
>>>>>>>>>>>>
>>>>>>>>>>>> Am Donnerstag, 22. September 2022, 17:52:46 CEST schrieb Jessica Clarke:
>>>>>>>>>>>>> On 22 Sept 2022, at 16:45, Heiko Stuebner <[email protected]> wrote:
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> Am Donnerstag, 22. September 2022, 08:09:58 CEST schrieb Samuel Holland:
>>>>>>>>>>>>>>> commit 8eb060e10185 ("arch/riscv: add Zihintpause support") broke
>>>>>>>>>>>>>>> building with CONFIG_CC_OPTIMIZE_FOR_SIZE enabled (gcc 11.1.0):
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> CC arch/riscv/kernel/vdso/vgettimeofday.o
>>>>>>>>>>>>>>> In file included from <command-line>:
>>>>>>>>>>>>>>> ./arch/riscv/include/asm/jump_label.h: In function 'cpu_relax':
>>>>>>>>>>>>>>> ././include/linux/compiler_types.h:285:33: warning: 'asm' operand 0 probably does not match constraints
>>>>>>>>>>>>>>> 285 | #define asm_volatile_goto(x...) asm goto(x)
>>>>>>>>>>>>>>> | ^~~
>>>>>>>>>>>>>>> ./arch/riscv/include/asm/jump_label.h:41:9: note: in expansion of macro 'asm_volatile_goto'
>>>>>>>>>>>>>>> 41 | asm_volatile_goto(
>>>>>>>>>>>>>>> | ^~~~~~~~~~~~~~~~~
>>>>>>>>>>>>>>> ././include/linux/compiler_types.h:285:33: error: impossible constraint in 'asm'
>>>>>>>>>>>>>>> 285 | #define asm_volatile_goto(x...) asm goto(x)
>>>>>>>>>>>>>>> | ^~~
>>>>>>>>>>>>>>> ./arch/riscv/include/asm/jump_label.h:41:9: note: in expansion of macro 'asm_volatile_goto'
>>>>>>>>>>>>>>> 41 | asm_volatile_goto(
>>>>>>>>>>>>>>> | ^~~~~~~~~~~~~~~~~
>>>>>>>>>>>>>>> make[1]: *** [scripts/Makefile.build:249: arch/riscv/kernel/vdso/vgettimeofday.o] Error 1
>>>>>>>>>>>>>>> make: *** [arch/riscv/Makefile:128: vdso_prepare] Error 2
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> Having a static branch in cpu_relax() is problematic because that
>>>>>>>>>>>>>>> function is widely inlined, including in some quite complex functions
>>>>>>>>>>>>>>> like in the VDSO. A quick measurement shows this static branch is
>>>>>>>>>>>>>>> responsible by itself for around 40% of the jump table.
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> Drop the static branch, which ends up being the same number of
>>>>>>>>>>>>>>> instructions anyway. If Zihintpause is supported, we trade the nop from
>>>>>>>>>>>>>>> the static branch for a div. If Zihintpause is unsupported, we trade the
>>>>>>>>>>>>>>> jump from the static branch for (what gets interpreted as) a nop.
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> Fixes: 8eb060e10185 ("arch/riscv: add Zihintpause support")
>>>>>>>>>>>>>>> Signed-off-by: Samuel Holland <[email protected]>
>>>>>>>>>>>>>>> ---
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> arch/riscv/include/asm/hwcap.h | 3 ---
>>>>>>>>>>>>>>> arch/riscv/include/asm/vdso/processor.h | 25 ++++++++++---------------
>>>>>>>>>>>>>>> 2 files changed, 10 insertions(+), 18 deletions(-)
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> diff --git a/arch/riscv/include/asm/hwcap.h b/arch/riscv/include/asm/hwcap.h
>>>>>>>>>>>>>>> index 6f59ec64175e..b21d46e68386 100644
>>>>>>>>>>>>>>> --- a/arch/riscv/include/asm/hwcap.h
>>>>>>>>>>>>>>> +++ b/arch/riscv/include/asm/hwcap.h
>>>>>>>>>>>>>>> @@ -68,7 +68,6 @@ enum riscv_isa_ext_id {
>>>>>>>>>>>>>>> */
>>>>>>>>>>>>>>> enum riscv_isa_ext_key {
>>>>>>>>>>>>>>> RISCV_ISA_EXT_KEY_FPU, /* For 'F' and 'D' */
>>>>>>>>>>>>>>> - RISCV_ISA_EXT_KEY_ZIHINTPAUSE,
>>>>>>>>>>>>>>> RISCV_ISA_EXT_KEY_MAX,
>>>>>>>>>>>>>>> };
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> @@ -88,8 +87,6 @@ static __always_inline int riscv_isa_ext2key(int num)
>>>>>>>>>>>>>>> return RISCV_ISA_EXT_KEY_FPU;
>>>>>>>>>>>>>>> case RISCV_ISA_EXT_d:
>>>>>>>>>>>>>>> return RISCV_ISA_EXT_KEY_FPU;
>>>>>>>>>>>>>>> - case RISCV_ISA_EXT_ZIHINTPAUSE:
>>>>>>>>>>>>>>> - return RISCV_ISA_EXT_KEY_ZIHINTPAUSE;
>>>>>>>>>>>>>>> default:
>>>>>>>>>>>>>>> return -EINVAL;
>>>>>>>>>>>>>>> }
>>>>>>>>>>>>>>> diff --git a/arch/riscv/include/asm/vdso/processor.h b/arch/riscv/include/asm/vdso/processor.h
>>>>>>>>>>>>>>> index 1e4f8b4aef79..789bdb8211a2 100644
>>>>>>>>>>>>>>> --- a/arch/riscv/include/asm/vdso/processor.h
>>>>>>>>>>>>>>> +++ b/arch/riscv/include/asm/vdso/processor.h
>>>>>>>>>>>>>>> @@ -4,30 +4,25 @@
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> #ifndef __ASSEMBLY__
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> -#include <linux/jump_label.h>
>>>>>>>>>>>>>>> #include <asm/barrier.h>
>>>>>>>>>>>>>>> -#include <asm/hwcap.h>
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> static inline void cpu_relax(void)
>>>>>>>>>>>>>>> {
>>>>>>>>>>>>>>> - if (!static_branch_likely(&riscv_isa_ext_keys[RISCV_ISA_EXT_KEY_ZIHINTPAUSE])) {
>>>>>>>>>>>>>>> #ifdef __riscv_muldiv
>>>>>>>>>>>>>>> - int dummy;
>>>>>>>>>>>>>>> - /* In lieu of a halt instruction, induce a long-latency stall. */
>>>>>>>>>>>>>>> - __asm__ __volatile__ ("div %0, %0, zero" : "=r" (dummy));
>>>>>>>>>>>>>>> + int dummy;
>>>>>>>>>>>>>>> + /* In lieu of a halt instruction, induce a long-latency stall. */
>>>>>>>>>>>>>>> + __asm__ __volatile__ ("div %0, %0, zero" : "=r" (dummy));
>>>>>>>>>>>>>>> #endif
>>>>>>>>>>>>>>> - } else {
>>>>>>>>>>>>>>> - /*
>>>>>>>>>>>>>>> - * Reduce instruction retirement.
>>>>>>>>>>>>>>> - * This assumes the PC changes.
>>>>>>>>>>>>>>> - */
>>>>>>>>>>>>>>> + /*
>>>>>>>>>>>>>>> + * Reduce instruction retirement.
>>>>>>>>>>>>>>> + * This assumes the PC changes.
>>>>>>>>>>>>>>> + */
>>>>>>>>>>>>>>> #ifdef __riscv_zihintpause
>>>>>>>>>>>>>>> - __asm__ __volatile__ ("pause");
>>>>>>>>>>>>>>> + __asm__ __volatile__ ("pause");
>>>>>>>>>>>>>>> #else
>>>>>>>>>>>>>>> - /* Encoding of the pause instruction */
>>>>>>>>>>>>>>> - __asm__ __volatile__ (".4byte 0x100000F");
>>>>>>>>>>>>>>> + /* Encoding of the pause instruction */
>>>>>>>>>>>>>>> + __asm__ __volatile__ (".4byte 0x100000F");
>>>>>>>>>>>>>>> #endif
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> hmm, though before this part of the code was only ever accessed
>>>>>>>>>>>>>> when the zhintpause extension was really available on the running
>>>>>>>>>>>>>> machine while now the pause instruction is called every time.
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> So I'm just wondering, can't this run into some "illegal instruction"
>>>>>>>>>>>>>> thingy on machines not supporting the extension?
>>>>>>>>>>>>>
>>>>>>>>>>>>> No. The encoding for pause was deliberately chosen to be one of the
>>>>>>>>>>>>> “useless” encodings of fence, with the hope that existing
>>>>>>>>>>>>> microarchitectures might take a while to execute it and thus it would
>>>>>>>>>>>>> still function as a slow-running instruction. It’s somewhat
>>>>>>>>>>>>> questionable whether the div is even needed, the worst that happens is
>>>>>>>>>>>>> cpu_relax isn’t very relaxed and you spin a bit faster. Any
>>>>>>>>>>>>> implementations where that’s true probably also don’t have fancy
>>>>>>>>>>>>> clock/power management anyway, and div isn’t going to be a low-power
>>>>>>>>>>>>> operation so the only real effect is likely hammering on contended
>>>>>>>>>>>>> atomics a bit more, and who cares about that on the low core count
>>>>>>>>>>>>> systems we have today.
>>>>>>>>>>>>
>>>>>>>>>>>> thanks a lot for that explanation, which made things a lot clearer.
>>>>>>>>>>>>
>>>>>>>>>>>> So as you said, dropping the div part might make the function even smaller,
>>>>>>>>>>>> though somehow part of me would want to add some sort of comment to
>>>>>>>>>>>> the function for when the next developer stumbles over the unconditional
>>>>>>>>>>>> use of pause :-) .
>>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>> I agree. If that's what microarch will do, we can drop div altogether.
>>>>>>>>>>> Though microarch may be treated as nop even if it is undesirable.
>>>>>>>>>>> IIRC, the div was introduced for the rocket chip which would induce a
>>>>>>>>>>> long latency stall with div instruction (zero as operands).
>>>>>>>>>>>
>>>>>>>>>>> Does any other core or newer rocket chip actually induce a latency
>>>>>>>>>>> stall with div instruction ?
>>>>>>>>>>> If not, it is equivalent to NOP as well. We can definitely remove the div.
>>>>>>>>>>> The only cores affected will be the older rocket core.
>>>>>>>>>>>
>>>>>>>>>>> Tagging some folks to understand what their core does.
>>>>>>>>>>>
>>>>>>>>>>> @Paul Walmsley @Guo Ren @Conor Dooley ?
>>>>>>>>>>
>>>>>>>>>> I am no microarch expert by _any_ stretch of the imagination, but
>>>>>>>>>> from a quick experiment it looks like the u54s on PolarFire SoC behave
>>>>>>>>>> in the same way, and div w/ zero operands does in fact take significantly
>>>>>>>>>> longer than regular division (looks to be about 3x).
>>>>>>>>>>
>>>>>>>>>
>>>>>>>>> Thanks. Do you have any data on how much the "pause" instruction takes.
>>>>>>>>
>>>>>>>> So these numbers you may consider as being pulled out of a magic hat
>>>>>>>> as all I am doing is reading the counters from userspace and there is
>>>>>>>> some variance etc. Plus the fact that I just started hacking at some
>>>>>>>> existing code I had lying around as I'm pretty snowed under at the
>>>>>>>> moment.
>>>>>>>>
>>>>>>>> Doing the following takes about 70 cycles on both a PolarFire SoC and an
>>>>>>>> unmatched:
>>>>>>>> long divisor = 2, dividend = 100000, dest;
>>>>>>>> asm("div %0, zero, zero" : "=r" (dest));
>>>>>>>> and equates to:
>>>>>>>> sd a5,-48(s0)
>>>>>>>> div a5,zero,zero
>>>>>>>>
>>>>>>>> Clocking in at about 40 cycles is some actual divisions, I just did the
>>>>>>>> following a dozen times, doing a trivial computation:
>>>>>>>> long divisor = 2, dividend = 100000, dest;
>>>>>>>> asm("div %0, %1, %2" : "=r" (dividend) : "r" (dividend), "r" (divisor))
>>>>>>>>
>>>>>>>> ie, a load of the following:
>>>>>>>> sd a5,-48(s0)
>>>>>>>> ld a5,-48(s0)
>>>>>>>> ld a4,-40(s0)
>>>>>>>> div a5,a5,a4
>>>>>>>>
>>>>>>>> So clearly the div w/ zero args makes a difference...
>>>>>>>>
>>>>>>>> On PolarFire SoC, `0x100000F` takes approx 6 cycles. On my unmatched, it
>>>>>>>> takes approx 40. Again, I just had an asm block & called the instruction
>>>>>>>> a number times and took the average - here it was 48 times.
>>>>>>>>
>>>>>>>> Take the actual numbers with a fist full of salt, but at least the
>>>>>>>> relative numbers should be of some use to you.
>>>>>>>>
>>>>>>>> Hope that's somewhat helpful, maybe next week I can do something a
>>>>>>>> little more useful for you...
>>>>>>>>
>>>>>>>
>>>>>>> Thanks. It would be good to understand what happens when "pause" is
>>>>>>> executed on these boards ?
>>>>>>
>>>>>> The actual pause instruction? uhh, so with the usual "I don't know what
>>>>>> I am doing" disclaimer, I ran each of the .insn and pause instruction 48
>>>>>> times in a row and checked the time elapsed via rdcycle & then ran that
>>>>>> c program 1000 times in a bash loop. Got the below, the insns were run
>>>>>> first and then the pauses.
>>>>>> insn pause
>>>>>> min 2.3 3.2
>>>>>> max 9.5 10.6
>>>>>> avg 27.0 29.1
>>>>>> 5% 2.9 4.2
>>>>>> 95% 18.1 19.1
>>>>>>
>>>>>> Swapping the pause & insn order around made a minor difference, but not
>>>>>> enough to report on. I'd be very wary of drawing any real conclusions
>>>>>> from this data, but at least both are roughly similar (and certainly not
>>>>>> even close to doing the div w/ zero args.
>>>>>>
>>>>>
>>>>> Yeah. That's what I was expecting. So we can't drop the div for now. Otherwise,
>>>>> the existing hardware(don't support Zhintpause) suffers by spinning faster.
>>>>
>>>> But does that actually matter in practice? If it doesn’t noticeable
>>>> affect performance then you don’t need to keep the div. There are a lot
>>>> of architectures that even just define cpu_relax() as barrier().
>>> Div is not semantic accurate for standard code, it should be in
>>> vendors' errata. I agree to leave nop as default and put a pause
>>> instead after the feature is detected.
>>
>> Nobody’s suggesting a literal nop instruction, that would be worse than
>> either div or pause. It’s always safe to execute pause, the question is
>> just whether on existing systems that don’t implement Zihintpause it
>> gets executed too quickly such that performance is degraded due to
>> spinning more aggressively.
>
> Why do you ensure pause can't be an illegal instruction in some old machine?

Because that’s how it’s defined; it uses one of the many hints
(instructions that aren’t a canonical nop but are defined to behave
like one in terms of architectural side-effects) from RV32I/RV64I.

> Why do you ensure div could save power for all microarchitectures?

I don’t. In fact it almost certainly won’t make the core enter a low
power state. It will just help reduce the amount of memory traffic by
taking a while to execute. I would rather not use div at all.

Jess

> nop (default) -> div/<other instructions> (moved into vendor errata)
> -> pause (when ZiHintPause feature detected)
>
>>
>> Jess
>>
>>>>
>>>> Jess
>>>>
>>>>> Thanks for running the experiments.
>>>>>
>>>>>> Again, hope that is helpful?
>>>>>> Conor.
>>>>>>
>>>>>>>
>>>>>>>> Conor.
>>>>>>>>
>>>>>>>>> I understand that it is not available in these cores. Just wanted to
>>>>>>>>> understand if microarchitecture
>>>>>>>>> actually takes a while executing the useless encoding as pointed out by Jessica.
>>>>>>>>>
>>>>>>>>> If that's the case, we can remove the div instruction altogether.
>>>>>>>>> Otherwise, this patch will cause some performance regression
>>>>>>>>> for existing SoC (HiFive unleashed has the same core. Not sure about
>>>>>>>>> unmatched though).
>>>>>>>>> This needs to be documented at least.
>>>>>>>>>
>>>>>>>>>> Hope that's helpful,
>>>>>>>>>> Conor.
>>>>>>>>>>
>>>>>>>>>> (I just did a quick check of what pretty much amounted to a bunch of
>>>>>>>>>> div a5,zero,zero in a row versus div a5,a5,a5)
>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>> (Please add anybody who may have an insight to execution flow on
>>>>>>>>>>> existing Linux capable cores)
>>>>>>>>>>>
>>>>>>
>>>>>
>>>>>
>>>>> --
>>>>> Regards,
>>>>> Atish
>>>>
>>>
>>>
>>> --
>>> Best Regards
>>> Guo Ren
>>
>
>
> --
> Best Regards
> Guo Ren

2022-10-06 07:15:54

by Jisheng Zhang

[permalink] [raw]
Subject: Re: [PATCH] riscv: Fix build with CONFIG_CC_OPTIMIZE_FOR_SIZE=y

On Thu, Sep 22, 2022 at 01:09:58AM -0500, Samuel Holland wrote:
> commit 8eb060e10185 ("arch/riscv: add Zihintpause support") broke
> building with CONFIG_CC_OPTIMIZE_FOR_SIZE enabled (gcc 11.1.0):
>
> CC arch/riscv/kernel/vdso/vgettimeofday.o
> In file included from <command-line>:
> ./arch/riscv/include/asm/jump_label.h: In function 'cpu_relax':
> ././include/linux/compiler_types.h:285:33: warning: 'asm' operand 0 probably does not match constraints
> 285 | #define asm_volatile_goto(x...) asm goto(x)
> | ^~~
> ./arch/riscv/include/asm/jump_label.h:41:9: note: in expansion of macro 'asm_volatile_goto'
> 41 | asm_volatile_goto(
> | ^~~~~~~~~~~~~~~~~
> ././include/linux/compiler_types.h:285:33: error: impossible constraint in 'asm'
> 285 | #define asm_volatile_goto(x...) asm goto(x)
> | ^~~
> ./arch/riscv/include/asm/jump_label.h:41:9: note: in expansion of macro 'asm_volatile_goto'
> 41 | asm_volatile_goto(
> | ^~~~~~~~~~~~~~~~~
> make[1]: *** [scripts/Makefile.build:249: arch/riscv/kernel/vdso/vgettimeofday.o] Error 1
> make: *** [arch/riscv/Makefile:128: vdso_prepare] Error 2
>
> Having a static branch in cpu_relax() is problematic because that
> function is widely inlined, including in some quite complex functions
> like in the VDSO. A quick measurement shows this static branch is
> responsible by itself for around 40% of the jump table.
>
> Drop the static branch, which ends up being the same number of
> instructions anyway. If Zihintpause is supported, we trade the nop from
> the static branch for a div. If Zihintpause is unsupported, we trade the
> jump from the static branch for (what gets interpreted as) a nop.

Hi Samuel,

I'm not sure whether it's correct to remove static branch usage from
cpu_relax, but your report inspired my patch of constifying arguments
of arch_static_branch() and arch_static_branch_jump() [1]. Could you
please also test it?

Thanks very much

[1]https://lore.kernel.org/linux-riscv/[email protected]/T/#u

>
> Fixes: 8eb060e10185 ("arch/riscv: add Zihintpause support")
> Signed-off-by: Samuel Holland <[email protected]>
> ---
>
> arch/riscv/include/asm/hwcap.h | 3 ---
> arch/riscv/include/asm/vdso/processor.h | 25 ++++++++++---------------
> 2 files changed, 10 insertions(+), 18 deletions(-)
>
> diff --git a/arch/riscv/include/asm/hwcap.h b/arch/riscv/include/asm/hwcap.h
> index 6f59ec64175e..b21d46e68386 100644
> --- a/arch/riscv/include/asm/hwcap.h
> +++ b/arch/riscv/include/asm/hwcap.h
> @@ -68,7 +68,6 @@ enum riscv_isa_ext_id {
> */
> enum riscv_isa_ext_key {
> RISCV_ISA_EXT_KEY_FPU, /* For 'F' and 'D' */
> - RISCV_ISA_EXT_KEY_ZIHINTPAUSE,
> RISCV_ISA_EXT_KEY_MAX,
> };
>
> @@ -88,8 +87,6 @@ static __always_inline int riscv_isa_ext2key(int num)
> return RISCV_ISA_EXT_KEY_FPU;
> case RISCV_ISA_EXT_d:
> return RISCV_ISA_EXT_KEY_FPU;
> - case RISCV_ISA_EXT_ZIHINTPAUSE:
> - return RISCV_ISA_EXT_KEY_ZIHINTPAUSE;
> default:
> return -EINVAL;
> }
> diff --git a/arch/riscv/include/asm/vdso/processor.h b/arch/riscv/include/asm/vdso/processor.h
> index 1e4f8b4aef79..789bdb8211a2 100644
> --- a/arch/riscv/include/asm/vdso/processor.h
> +++ b/arch/riscv/include/asm/vdso/processor.h
> @@ -4,30 +4,25 @@
>
> #ifndef __ASSEMBLY__
>
> -#include <linux/jump_label.h>
> #include <asm/barrier.h>
> -#include <asm/hwcap.h>
>
> static inline void cpu_relax(void)
> {
> - if (!static_branch_likely(&riscv_isa_ext_keys[RISCV_ISA_EXT_KEY_ZIHINTPAUSE])) {
> #ifdef __riscv_muldiv
> - int dummy;
> - /* In lieu of a halt instruction, induce a long-latency stall. */
> - __asm__ __volatile__ ("div %0, %0, zero" : "=r" (dummy));
> + int dummy;
> + /* In lieu of a halt instruction, induce a long-latency stall. */
> + __asm__ __volatile__ ("div %0, %0, zero" : "=r" (dummy));
> #endif
> - } else {
> - /*
> - * Reduce instruction retirement.
> - * This assumes the PC changes.
> - */
> + /*
> + * Reduce instruction retirement.
> + * This assumes the PC changes.
> + */
> #ifdef __riscv_zihintpause
> - __asm__ __volatile__ ("pause");
> + __asm__ __volatile__ ("pause");
> #else
> - /* Encoding of the pause instruction */
> - __asm__ __volatile__ (".4byte 0x100000F");
> + /* Encoding of the pause instruction */
> + __asm__ __volatile__ (".4byte 0x100000F");
> #endif
> - }
> barrier();
> }
>
> --
> 2.35.1
>