LinuxLists.cc - [PATCH V2 0/3] riscv: Add Zicbop & prefetchw support

[permalink] [raw]

Subject: [PATCH V2 1/3] riscv: Add Zicbop instruction definitions & cpufeature

From: Guo Ren <[email protected]>

Cache-block prefetch instructions are HINTs to the hardware to
indicate that software intends to perform a particular type of
memory access in the near future. This patch adds prefetch.i,
prefetch.r and prefetch.w instruction definitions by
RISCV_ISA_EXT_ZICBOP cpufeature.

Signed-off-by: Guo Ren <[email protected]>
Signed-off-by: Guo Ren <[email protected]>
---
arch/riscv/Kconfig | 15 ++++++++
arch/riscv/include/asm/hwcap.h | 1 +
arch/riscv/include/asm/insn-def.h | 60 +++++++++++++++++++++++++++++++
arch/riscv/kernel/cpufeature.c | 1 +
4 files changed, 77 insertions(+)

diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index 24c1799e2ec4..fcbd417d65ea 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -579,6 +579,21 @@ config RISCV_ISA_ZICBOZ

If you don't know what to do here, say Y.

+config RISCV_ISA_ZICBOP
+ bool "Zicbop extension support for cache block prefetch"
+ depends on MMU
+ depends on RISCV_ALTERNATIVE
+ default y
+ help
+ Adds support to dynamically detect the presence of the ZICBOP
+ extension (Cache Block Prefetch Operations) and enable its
+ usage.
+
+ The Zicbop extension can be used to prefetch cache block for
+ read/write fetch.
+
+ If you don't know what to do here, say Y.
+
config TOOLCHAIN_HAS_ZIHINTPAUSE
bool
default y
diff --git a/arch/riscv/include/asm/hwcap.h b/arch/riscv/include/asm/hwcap.h
index 06d30526ef3b..77d3b6ee25ab 100644
--- a/arch/riscv/include/asm/hwcap.h
+++ b/arch/riscv/include/asm/hwcap.h
@@ -57,6 +57,7 @@
#define RISCV_ISA_EXT_ZIHPM 42
#define RISCV_ISA_EXT_SMSTATEEN 43
#define RISCV_ISA_EXT_ZICOND 44
+#define RISCV_ISA_EXT_ZICBOP 45

#define RISCV_ISA_EXT_MAX 64

diff --git a/arch/riscv/include/asm/insn-def.h b/arch/riscv/include/asm/insn-def.h
index e27179b26086..bbda350a63bf 100644
--- a/arch/riscv/include/asm/insn-def.h
+++ b/arch/riscv/include/asm/insn-def.h
@@ -18,6 +18,13 @@
#define INSN_I_RD_SHIFT 7
#define INSN_I_OPCODE_SHIFT 0

+#define INSN_S_SIMM7_SHIFT 25
+#define INSN_S_RS2_SHIFT 20
+#define INSN_S_RS1_SHIFT 15
+#define INSN_S_FUNC3_SHIFT 12
+#define INSN_S_SIMM5_SHIFT 7
+#define INSN_S_OPCODE_SHIFT 0
+
#ifdef __ASSEMBLY__

#ifdef CONFIG_AS_HAS_INSN
@@ -30,6 +37,10 @@
.insn i \opcode, \func3, \rd, \rs1, \simm12
.endm

+ .macro insn_s, opcode, func3, rs2, simm12, rs1
+ .insn s \opcode, \func3, \rs2, \simm12(\rs1)
+ .endm
+
#else

#include <asm/gpr-num.h>
@@ -51,10 +62,20 @@
(\simm12 << INSN_I_SIMM12_SHIFT))
.endm

+ .macro insn_s, opcode, func3, rs2, simm12, rs1
+ .4byte ((\opcode << INSN_S_OPCODE_SHIFT) | \
+ (\func3 << INSN_S_FUNC3_SHIFT) | \
+ (.L__gpr_num_\rs2 << INSN_S_RS2_SHIFT) | \
+ (.L__gpr_num_\rs1 << INSN_S_RS1_SHIFT) | \
+ ((\simm12 & 0x1f) << INSN_S_SIMM5_SHIFT) | \
+ (((\simm12 >> 5) & 0x7f) << INSN_S_SIMM7_SHIFT))
+ .endm
+
#endif

#define __INSN_R(...) insn_r __VA_ARGS__
#define __INSN_I(...) insn_i __VA_ARGS__
+#define __INSN_S(...) insn_s __VA_ARGS__

#else /* ! __ASSEMBLY__ */

@@ -66,6 +87,9 @@
#define __INSN_I(opcode, func3, rd, rs1, simm12) \
".insn i " opcode ", " func3 ", " rd ", " rs1 ", " simm12 "\n"

+#define __INSN_S(opcode, func3, rs2, simm12, rs1) \
+ ".insn s " opcode ", " func3 ", " rs2 ", " simm12 "(" rs1 ")\n"
+
#else

#include <linux/stringify.h>
@@ -92,12 +116,26 @@
" (\\simm12 << " __stringify(INSN_I_SIMM12_SHIFT) "))\n" \
" .endm\n"

+#define DEFINE_INSN_S \
+ __DEFINE_ASM_GPR_NUMS \
+" .macro insn_s, opcode, func3, rs2, simm12, rs1\n" \
+" .4byte ((\\opcode << " __stringify(INSN_S_OPCODE_SHIFT) ") |" \
+" (\\func3 << " __stringify(INSN_S_FUNC3_SHIFT) ") |" \
+" (.L__gpr_num_\\rs2 << " __stringify(INSN_S_RS2_SHIFT) ") |" \
+" (.L__gpr_num_\\rs1 << " __stringify(INSN_S_RS1_SHIFT) ") |" \
+" ((\\simm12 & 0x1f) << " __stringify(INSN_S_SIMM5_SHIFT) ") |" \
+" (((\\simm12 >> 5) & 0x7f) << " __stringify(INSN_S_SIMM7_SHIFT) "))\n" \
+" .endm\n"
+
#define UNDEFINE_INSN_R \
" .purgem insn_r\n"

#define UNDEFINE_INSN_I \
" .purgem insn_i\n"

+#define UNDEFINE_INSN_S \
+" .purgem insn_s\n"
+
#define __INSN_R(opcode, func3, func7, rd, rs1, rs2) \
DEFINE_INSN_R \
"insn_r " opcode ", " func3 ", " func7 ", " rd ", " rs1 ", " rs2 "\n" \
@@ -108,6 +146,11 @@
"insn_i " opcode ", " func3 ", " rd ", " rs1 ", " simm12 "\n" \
UNDEFINE_INSN_I

+#define __INSN_S(opcode, func3, rs2, simm12, rs1) \
+ DEFINE_INSN_S \
+ "insn_s " opcode ", " func3 ", " rs2 ", " simm12 ", " rs1 "\n" \
+ UNDEFINE_INSN_S
+
#endif

#endif /* ! __ASSEMBLY__ */
@@ -120,6 +163,10 @@
__INSN_I(RV_##opcode, RV_##func3, RV_##rd, \
RV_##rs1, RV_##simm12)

+#define INSN_S(opcode, func3, rs2, simm12, rs1) \
+ __INSN_S(RV_##opcode, RV_##func3, RV_##rs2, \
+ RV_##simm12, RV_##rs1)
+
#define RV_OPCODE(v) __ASM_STR(v)
#define RV_FUNC3(v) __ASM_STR(v)
#define RV_FUNC7(v) __ASM_STR(v)
@@ -133,6 +180,7 @@
#define RV___RS2(v) __RV_REG(v)

#define RV_OPCODE_MISC_MEM RV_OPCODE(15)
+#define RV_OPCODE_OP_IMM RV_OPCODE(19)
#define RV_OPCODE_SYSTEM RV_OPCODE(115)

#define HFENCE_VVMA(vaddr, asid) \
@@ -196,4 +244,16 @@
INSN_I(OPCODE_MISC_MEM, FUNC3(2), __RD(0), \
RS1(base), SIMM12(4))

+#define CBO_PREFETCH_I(base, offset) \
+ INSN_S(OPCODE_OP_IMM, FUNC3(6), __RS2(0), \
+ SIMM12(offset), RS1(base))
+
+#define CBO_PREFETCH_R(base, offset) \
+ INSN_S(OPCODE_OP_IMM, FUNC3(6), __RS2(1), \
+ SIMM12(offset), RS1(base))
+
+#define CBO_PREFETCH_W(base, offset) \
+ INSN_S(OPCODE_OP_IMM, FUNC3(6), __RS2(3), \
+ SIMM12(offset), RS1(base))
+
#endif /* __ASM_INSN_DEF_H */
diff --git a/arch/riscv/kernel/cpufeature.c b/arch/riscv/kernel/cpufeature.c
index b3785ffc1570..bdb02b066041 100644
--- a/arch/riscv/kernel/cpufeature.c
+++ b/arch/riscv/kernel/cpufeature.c
@@ -168,6 +168,7 @@ const struct riscv_isa_ext_data riscv_isa_ext[] = {
__RISCV_ISA_EXT_DATA(h, RISCV_ISA_EXT_h),
__RISCV_ISA_EXT_DATA(zicbom, RISCV_ISA_EXT_ZICBOM),
__RISCV_ISA_EXT_DATA(zicboz, RISCV_ISA_EXT_ZICBOZ),
+ __RISCV_ISA_EXT_DATA(zicbop, RISCV_ISA_EXT_ZICBOP),
__RISCV_ISA_EXT_DATA(zicntr, RISCV_ISA_EXT_ZICNTR),
__RISCV_ISA_EXT_DATA(zicond, RISCV_ISA_EXT_ZICOND),
__RISCV_ISA_EXT_DATA(zicsr, RISCV_ISA_EXT_ZICSR),
--
2.40.1

2023-12-31 08:31:13

[permalink] [raw]

Subject: [PATCH V2 2/3] riscv: Add ARCH_HAS_PRETCHW support with Zibop

From: Guo Ren <[email protected]>

Enable Linux prefetchw primitive with Zibop cpufeature, which preloads
cache line into L1 cache for the next write operation.

Signed-off-by: Guo Ren <[email protected]>
Signed-off-by: Guo Ren <[email protected]>
---
arch/riscv/include/asm/processor.h | 16 ++++++++++++++++
1 file changed, 16 insertions(+)

diff --git a/arch/riscv/include/asm/processor.h b/arch/riscv/include/asm/processor.h
index f19f861cda54..8d3a2ab37678 100644
--- a/arch/riscv/include/asm/processor.h
+++ b/arch/riscv/include/asm/processor.h
@@ -13,6 +13,9 @@
#include <vdso/processor.h>

#include <asm/ptrace.h>
+#include <asm/insn-def.h>
+#include <asm/alternative-macros.h>
+#include <asm/hwcap.h>

#ifdef CONFIG_64BIT
#define DEFAULT_MAP_WINDOW (UL(1) << (MMAP_VA_BITS - 1))
@@ -106,6 +109,19 @@ static inline void arch_thread_struct_whitelist(unsigned long *offset,
#define KSTK_EIP(tsk) (task_pt_regs(tsk)->epc)
#define KSTK_ESP(tsk) (task_pt_regs(tsk)->sp)

+#ifdef CONFIG_RISCV_ISA_ZICBOP
+#define ARCH_HAS_PREFETCHW
+
+#define PREFETCHW_ASM(x) \
+ ALTERNATIVE(__nops(1), CBO_PREFETCH_W(x, 0), 0, \
+ RISCV_ISA_EXT_ZICBOP, CONFIG_RISCV_ISA_ZICBOP)
+
+
+static inline void prefetchw(const void *x)
+{
+ __asm__ __volatile__(PREFETCHW_ASM(%0) : : "r" (x) : "memory");
+}
+#endif /* CONFIG_RISCV_ISA_ZICBOP */

/* Do necessary setup to start up a newly executed thread. */
extern void start_thread(struct pt_regs *regs,
--
2.40.1

2023-12-31 08:31:42

[permalink] [raw]

Subject: [PATCH V2 3/3] riscv: xchg: Prefetch the destination word for sc.w

From: Guo Ren <[email protected]>

The cost of changing a cacheline from shared to exclusive state can be
significant, especially when this is triggered by an exclusive store,
since it may result in having to retry the transaction.

This patch makes use of prefetch.w to prefetch cachelines for write
prior to lr/sc loops when using the xchg_small atomic routine.

This patch is inspired by commit: 0ea366f5e1b6 ("arm64: atomics:
prefetch the destination word for write prior to stxr").

Signed-off-by: Guo Ren <[email protected]>
Signed-off-by: Guo Ren <[email protected]>
---
arch/riscv/include/asm/cmpxchg.h | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/arch/riscv/include/asm/cmpxchg.h b/arch/riscv/include/asm/cmpxchg.h
index 26cea2395aae..d7b9d7951f08 100644
--- a/arch/riscv/include/asm/cmpxchg.h
+++ b/arch/riscv/include/asm/cmpxchg.h
@@ -10,6 +10,7 @@

#include <asm/barrier.h>
#include <asm/fence.h>
+#include <asm/processor.h>

#define __arch_xchg_masked(prepend, append, r, p, n) \
({ \
@@ -23,6 +24,7 @@
\
__asm__ __volatile__ ( \
prepend \
+ PREFETCHW_ASM(%5) \
"0: lr.w %0, %2\n" \
" and %1, %0, %z4\n" \
" or %1, %1, %z3\n" \
@@ -30,7 +32,7 @@
" bnez %1, 0b\n" \
append \
: "=&r" (__retx), "=&r" (__rc), "+A" (*(__ptr32b)) \
- : "rJ" (__newx), "rJ" (~__mask) \
+ : "rJ" (__newx), "rJ" (~__mask), "rJ" (__ptr32b) \
: "memory"); \
\
r = (__typeof__(*(p)))((__retx & __mask) >> __s); \
--
2.40.1

2024-01-01 02:29:41

[permalink] [raw]

Subject: Re: [PATCH V2 2/3] riscv: Add ARCH_HAS_PRETCHW support with Zibop

On Sun, Dec 31, 2023 at 4:30 PM <[email protected]> wrote:
>
> From: Guo Ren <[email protected]>
>
> Enable Linux prefetchw primitive with Zibop cpufeature, which preloads
> cache line into L1 cache for the next write operation.
>
> Signed-off-by: Guo Ren <[email protected]>
> Signed-off-by: Guo Ren <[email protected]>
> ---
> arch/riscv/include/asm/processor.h | 16 ++++++++++++++++
> 1 file changed, 16 insertions(+)
>
> diff --git a/arch/riscv/include/asm/processor.h b/arch/riscv/include/asm/processor.h
> index f19f861cda54..8d3a2ab37678 100644
> --- a/arch/riscv/include/asm/processor.h
> +++ b/arch/riscv/include/asm/processor.h
> @@ -13,6 +13,9 @@
> #include <vdso/processor.h>
>
> #include <asm/ptrace.h>
> +#include <asm/insn-def.h>
> +#include <asm/alternative-macros.h>
> +#include <asm/hwcap.h>
>
> #ifdef CONFIG_64BIT
> #define DEFAULT_MAP_WINDOW (UL(1) << (MMAP_VA_BITS - 1))
> @@ -106,6 +109,19 @@ static inline void arch_thread_struct_whitelist(unsigned long *offset,
> #define KSTK_EIP(tsk) (task_pt_regs(tsk)->epc)
> #define KSTK_ESP(tsk) (task_pt_regs(tsk)->sp)
>
> +#ifdef CONFIG_RISCV_ISA_ZICBOP
> +#define ARCH_HAS_PREFETCHW
> +
> +#define PREFETCHW_ASM(x) \
> + ALTERNATIVE(__nops(1), CBO_PREFETCH_W(x, 0), 0, \
> + RISCV_ISA_EXT_ZICBOP, CONFIG_RISCV_ISA_ZICBOP)
The PREFETCHW_ASM(x) definition should be out of "ifdef
CONFIG_RISCV_ISA_ZICBOP... #endif", because xchg_small may use this
macro without CONFIG_RISCV_ISA_ZICBOP.

> +
> +
> +static inline void prefetchw(const void *x)
> +{
> + __asm__ __volatile__(PREFETCHW_ASM(%0) : : "r" (x) : "memory");
> +}
> +#endif /* CONFIG_RISCV_ISA_ZICBOP */
>
> /* Do necessary setup to start up a newly executed thread. */
> extern void start_thread(struct pt_regs *regs,
> --
> 2.40.1
>

--
Best Regards
Guo Ren

2024-01-02 10:33:06

[permalink] [raw]

Subject: Re: [PATCH V2 1/3] riscv: Add Zicbop instruction definitions & cpufeature

On Sun, Dec 31, 2023 at 03:29:51AM -0500, [email protected] wrote:
> From: Guo Ren <[email protected]>
>
> Cache-block prefetch instructions are HINTs to the hardware to
> indicate that software intends to perform a particular type of
> memory access in the near future. This patch adds prefetch.i,
> prefetch.r and prefetch.w instruction definitions by
> RISCV_ISA_EXT_ZICBOP cpufeature.

It also adds S-type instruction encoding support which isn't mentioned.
Actually, it'd probably be best to put the new instruction encoding in
its own separate patch.

>
> Signed-off-by: Guo Ren <[email protected]>
> Signed-off-by: Guo Ren <[email protected]>
> ---
> arch/riscv/Kconfig | 15 ++++++++
> arch/riscv/include/asm/hwcap.h | 1 +
> arch/riscv/include/asm/insn-def.h | 60 +++++++++++++++++++++++++++++++
> arch/riscv/kernel/cpufeature.c | 1 +
> 4 files changed, 77 insertions(+)
>
> diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
> index 24c1799e2ec4..fcbd417d65ea 100644
> --- a/arch/riscv/Kconfig
> +++ b/arch/riscv/Kconfig
> @@ -579,6 +579,21 @@ config RISCV_ISA_ZICBOZ
>
> If you don't know what to do here, say Y.
>
> +config RISCV_ISA_ZICBOP
> + bool "Zicbop extension support for cache block prefetch"
> + depends on MMU
> + depends on RISCV_ALTERNATIVE
> + default y
> + help
> + Adds support to dynamically detect the presence of the ZICBOP
> + extension (Cache Block Prefetch Operations) and enable its
> + usage.
> +
> + The Zicbop extension can be used to prefetch cache block for

blocks

> + read/write fetch.
> +
> + If you don't know what to do here, say Y.
> +
> config TOOLCHAIN_HAS_ZIHINTPAUSE
> bool
> default y
> diff --git a/arch/riscv/include/asm/hwcap.h b/arch/riscv/include/asm/hwcap.h
> index 06d30526ef3b..77d3b6ee25ab 100644
> --- a/arch/riscv/include/asm/hwcap.h
> +++ b/arch/riscv/include/asm/hwcap.h
> @@ -57,6 +57,7 @@
> #define RISCV_ISA_EXT_ZIHPM 42
> #define RISCV_ISA_EXT_SMSTATEEN 43
> #define RISCV_ISA_EXT_ZICOND 44
> +#define RISCV_ISA_EXT_ZICBOP 45
>
> #define RISCV_ISA_EXT_MAX 64
>
> diff --git a/arch/riscv/include/asm/insn-def.h b/arch/riscv/include/asm/insn-def.h
> index e27179b26086..bbda350a63bf 100644
> --- a/arch/riscv/include/asm/insn-def.h
> +++ b/arch/riscv/include/asm/insn-def.h
> @@ -18,6 +18,13 @@
> #define INSN_I_RD_SHIFT 7
> #define INSN_I_OPCODE_SHIFT 0
>
> +#define INSN_S_SIMM7_SHIFT 25
> +#define INSN_S_RS2_SHIFT 20
> +#define INSN_S_RS1_SHIFT 15
> +#define INSN_S_FUNC3_SHIFT 12
> +#define INSN_S_SIMM5_SHIFT 7
> +#define INSN_S_OPCODE_SHIFT 0
> +
> #ifdef __ASSEMBLY__
>
> #ifdef CONFIG_AS_HAS_INSN
> @@ -30,6 +37,10 @@
> .insn i \opcode, \func3, \rd, \rs1, \simm12
> .endm
>
> + .macro insn_s, opcode, func3, rs2, simm12, rs1
> + .insn s \opcode, \func3, \rs2, \simm12(\rs1)
> + .endm
> +
> #else
>
> #include <asm/gpr-num.h>
> @@ -51,10 +62,20 @@
> (\simm12 << INSN_I_SIMM12_SHIFT))
> .endm
>
> + .macro insn_s, opcode, func3, rs2, simm12, rs1
> + .4byte ((\opcode << INSN_S_OPCODE_SHIFT) | \
> + (\func3 << INSN_S_FUNC3_SHIFT) | \
> + (.L__gpr_num_\rs2 << INSN_S_RS2_SHIFT) | \
> + (.L__gpr_num_\rs1 << INSN_S_RS1_SHIFT) | \
> + ((\simm12 & 0x1f) << INSN_S_SIMM5_SHIFT) | \
> + (((\simm12 >> 5) & 0x7f) << INSN_S_SIMM7_SHIFT))
> + .endm
> +
> #endif
>
> #define __INSN_R(...) insn_r __VA_ARGS__
> #define __INSN_I(...) insn_i __VA_ARGS__
> +#define __INSN_S(...) insn_s __VA_ARGS__
>
> #else /* ! __ASSEMBLY__ */
>
> @@ -66,6 +87,9 @@
> #define __INSN_I(opcode, func3, rd, rs1, simm12) \
> ".insn i " opcode ", " func3 ", " rd ", " rs1 ", " simm12 "\n"
>
> +#define __INSN_S(opcode, func3, rs2, simm12, rs1) \
> + ".insn s " opcode ", " func3 ", " rs2 ", " simm12 "(" rs1 ")\n"
> +
> #else
>
> #include <linux/stringify.h>
> @@ -92,12 +116,26 @@
> " (\\simm12 << " __stringify(INSN_I_SIMM12_SHIFT) "))\n" \
> " .endm\n"
>
> +#define DEFINE_INSN_S \
> + __DEFINE_ASM_GPR_NUMS \
> +" .macro insn_s, opcode, func3, rs2, simm12, rs1\n" \
> +" .4byte ((\\opcode << " __stringify(INSN_S_OPCODE_SHIFT) ") |" \
> +" (\\func3 << " __stringify(INSN_S_FUNC3_SHIFT) ") |" \
> +" (.L__gpr_num_\\rs2 << " __stringify(INSN_S_RS2_SHIFT) ") |" \
> +" (.L__gpr_num_\\rs1 << " __stringify(INSN_S_RS1_SHIFT) ") |" \
> +" ((\\simm12 & 0x1f) << " __stringify(INSN_S_SIMM5_SHIFT) ") |" \
> +" (((\\simm12 >> 5) & 0x7f) << " __stringify(INSN_S_SIMM7_SHIFT) "))\n" \
> +" .endm\n"
> +
> #define UNDEFINE_INSN_R \
> " .purgem insn_r\n"
>
> #define UNDEFINE_INSN_I \
> " .purgem insn_i\n"
>
> +#define UNDEFINE_INSN_S \
> +" .purgem insn_s\n"
> +
> #define __INSN_R(opcode, func3, func7, rd, rs1, rs2) \
> DEFINE_INSN_R \
> "insn_r " opcode ", " func3 ", " func7 ", " rd ", " rs1 ", " rs2 "\n" \
> @@ -108,6 +146,11 @@
> "insn_i " opcode ", " func3 ", " rd ", " rs1 ", " simm12 "\n" \
> UNDEFINE_INSN_I
>
> +#define __INSN_S(opcode, func3, rs2, simm12, rs1) \
> + DEFINE_INSN_S \
> + "insn_s " opcode ", " func3 ", " rs2 ", " simm12 ", " rs1 "\n" \
> + UNDEFINE_INSN_S
> +
> #endif
>
> #endif /* ! __ASSEMBLY__ */
> @@ -120,6 +163,10 @@
> __INSN_I(RV_##opcode, RV_##func3, RV_##rd, \
> RV_##rs1, RV_##simm12)
>
> +#define INSN_S(opcode, func3, rs2, simm12, rs1) \
> + __INSN_S(RV_##opcode, RV_##func3, RV_##rs2, \
> + RV_##simm12, RV_##rs1)
> +
> #define RV_OPCODE(v) __ASM_STR(v)
> #define RV_FUNC3(v) __ASM_STR(v)
> #define RV_FUNC7(v) __ASM_STR(v)
> @@ -133,6 +180,7 @@
> #define RV___RS2(v) __RV_REG(v)
>
> #define RV_OPCODE_MISC_MEM RV_OPCODE(15)
> +#define RV_OPCODE_OP_IMM RV_OPCODE(19)
> #define RV_OPCODE_SYSTEM RV_OPCODE(115)
>
> #define HFENCE_VVMA(vaddr, asid) \
> @@ -196,4 +244,16 @@
> INSN_I(OPCODE_MISC_MEM, FUNC3(2), __RD(0), \
> RS1(base), SIMM12(4))
>
> +#define CBO_PREFETCH_I(base, offset) \
> + INSN_S(OPCODE_OP_IMM, FUNC3(6), __RS2(0), \
> + SIMM12(offset), RS1(base))
> +
> +#define CBO_PREFETCH_R(base, offset) \
> + INSN_S(OPCODE_OP_IMM, FUNC3(6), __RS2(1), \
> + SIMM12(offset), RS1(base))
> +
> +#define CBO_PREFETCH_W(base, offset) \
> + INSN_S(OPCODE_OP_IMM, FUNC3(6), __RS2(3), \
> + SIMM12(offset), RS1(base))

Shouldn't we ensure the lower 5-bits of offset are zero by masking it?

> +
> #endif /* __ASM_INSN_DEF_H */
> diff --git a/arch/riscv/kernel/cpufeature.c b/arch/riscv/kernel/cpufeature.c
> index b3785ffc1570..bdb02b066041 100644
> --- a/arch/riscv/kernel/cpufeature.c
> +++ b/arch/riscv/kernel/cpufeature.c
> @@ -168,6 +168,7 @@ const struct riscv_isa_ext_data riscv_isa_ext[] = {
> __RISCV_ISA_EXT_DATA(h, RISCV_ISA_EXT_h),
> __RISCV_ISA_EXT_DATA(zicbom, RISCV_ISA_EXT_ZICBOM),
> __RISCV_ISA_EXT_DATA(zicboz, RISCV_ISA_EXT_ZICBOZ),
> + __RISCV_ISA_EXT_DATA(zicbop, RISCV_ISA_EXT_ZICBOP),

zicbop should be above zicboz (alphabetical)

> __RISCV_ISA_EXT_DATA(zicntr, RISCV_ISA_EXT_ZICNTR),
> __RISCV_ISA_EXT_DATA(zicond, RISCV_ISA_EXT_ZICOND),
> __RISCV_ISA_EXT_DATA(zicsr, RISCV_ISA_EXT_ZICSR),
> --
> 2.40.1
>

Thanks,
drew

2024-01-02 10:45:27

[permalink] [raw]

Subject: Re: [PATCH V2 2/3] riscv: Add ARCH_HAS_PRETCHW support with Zibop

s/Zibop/Zicbop/ <<<$SUBJECT

On Sun, Dec 31, 2023 at 03:29:52AM -0500, [email protected] wrote:
> From: Guo Ren <[email protected]>
>
> Enable Linux prefetchw primitive with Zibop cpufeature, which preloads

Also s/Zibop/Zicbop/ here

> cache line into L1 cache for the next write operation.
>
> Signed-off-by: Guo Ren <[email protected]>
> Signed-off-by: Guo Ren <[email protected]>
> ---
> arch/riscv/include/asm/processor.h | 16 ++++++++++++++++
> 1 file changed, 16 insertions(+)
>
> diff --git a/arch/riscv/include/asm/processor.h b/arch/riscv/include/asm/processor.h
> index f19f861cda54..8d3a2ab37678 100644
> --- a/arch/riscv/include/asm/processor.h
> +++ b/arch/riscv/include/asm/processor.h
> @@ -13,6 +13,9 @@
> #include <vdso/processor.h>
>
> #include <asm/ptrace.h>
> +#include <asm/insn-def.h>
> +#include <asm/alternative-macros.h>
> +#include <asm/hwcap.h>
>
> #ifdef CONFIG_64BIT
> #define DEFAULT_MAP_WINDOW (UL(1) << (MMAP_VA_BITS - 1))
> @@ -106,6 +109,19 @@ static inline void arch_thread_struct_whitelist(unsigned long *offset,
> #define KSTK_EIP(tsk) (task_pt_regs(tsk)->epc)
> #define KSTK_ESP(tsk) (task_pt_regs(tsk)->sp)
>
> +#ifdef CONFIG_RISCV_ISA_ZICBOP
> +#define ARCH_HAS_PREFETCHW
> +
> +#define PREFETCHW_ASM(x) \
> + ALTERNATIVE(__nops(1), CBO_PREFETCH_W(x, 0), 0, \
> + RISCV_ISA_EXT_ZICBOP, CONFIG_RISCV_ISA_ZICBOP)
> +
> +
> +static inline void prefetchw(const void *x)
> +{
> + __asm__ __volatile__(PREFETCHW_ASM(%0) : : "r" (x) : "memory");
> +}

Shouldn't we create an interface which exposes the offset input of
the instruction, allowing a sequence of calls to be unrolled? But
I guess that could be put off until there's a need for it.

> +#endif /* CONFIG_RISCV_ISA_ZICBOP */
>
> /* Do necessary setup to start up a newly executed thread. */
> extern void start_thread(struct pt_regs *regs,
> --
> 2.40.1
>

Thanks,
drew

2024-01-02 11:19:06

[permalink] [raw]

Subject: Re: [PATCH V2 3/3] riscv: xchg: Prefetch the destination word for sc.w

On Sun, Dec 31, 2023 at 03:29:53AM -0500, [email protected] wrote:
> From: Guo Ren <[email protected]>
>
> The cost of changing a cacheline from shared to exclusive state can be
> significant, especially when this is triggered by an exclusive store,
> since it may result in having to retry the transaction.
>
> This patch makes use of prefetch.w to prefetch cachelines for write
> prior to lr/sc loops when using the xchg_small atomic routine.
>
> This patch is inspired by commit: 0ea366f5e1b6 ("arm64: atomics:
> prefetch the destination word for write prior to stxr").
>
> Signed-off-by: Guo Ren <[email protected]>
> Signed-off-by: Guo Ren <[email protected]>
> ---
> arch/riscv/include/asm/cmpxchg.h | 4 +++-
> 1 file changed, 3 insertions(+), 1 deletion(-)
>
> diff --git a/arch/riscv/include/asm/cmpxchg.h b/arch/riscv/include/asm/cmpxchg.h
> index 26cea2395aae..d7b9d7951f08 100644
> --- a/arch/riscv/include/asm/cmpxchg.h
> +++ b/arch/riscv/include/asm/cmpxchg.h
> @@ -10,6 +10,7 @@
>
> #include <asm/barrier.h>
> #include <asm/fence.h>
> +#include <asm/processor.h>
>
> #define __arch_xchg_masked(prepend, append, r, p, n) \

Are you sure this is based on v6.7-rc7? Because I don't see this macro.

> ({ \
> @@ -23,6 +24,7 @@
> \
> __asm__ __volatile__ ( \
> prepend \
> + PREFETCHW_ASM(%5) \
> "0: lr.w %0, %2\n" \
> " and %1, %0, %z4\n" \
> " or %1, %1, %z3\n" \
> @@ -30,7 +32,7 @@
> " bnez %1, 0b\n" \
> append \
> : "=&r" (__retx), "=&r" (__rc), "+A" (*(__ptr32b)) \
> - : "rJ" (__newx), "rJ" (~__mask) \
> + : "rJ" (__newx), "rJ" (~__mask), "rJ" (__ptr32b) \

I'm pretty sure we don't want to allow the J constraint for __ptr32b.

> : "memory"); \
> \
> r = (__typeof__(*(p)))((__retx & __mask) >> __s); \
> --
> 2.40.1
>

Thanks,
drew

2024-01-03 06:13:24

[permalink] [raw]

Subject: Re: [PATCH V2 1/3] riscv: Add Zicbop instruction definitions & cpufeature

On Tue, Jan 2, 2024 at 6:32 PM Andrew Jones <[email protected]> wrote:
>
> On Sun, Dec 31, 2023 at 03:29:51AM -0500, [email protected] wrote:
> > From: Guo Ren <[email protected]>
> >
> > Cache-block prefetch instructions are HINTs to the hardware to
> > indicate that software intends to perform a particular type of
> > memory access in the near future. This patch adds prefetch.i,
> > prefetch.r and prefetch.w instruction definitions by
> > RISCV_ISA_EXT_ZICBOP cpufeature.
>
> It also adds S-type instruction encoding support which isn't mentioned.
> Actually, it'd probably be best to put the new instruction encoding in
> its own separate patch.
Okay, I would separate the instruction encoding patch in the next version.

>
> >
> > Signed-off-by: Guo Ren <[email protected]>
> > Signed-off-by: Guo Ren <[email protected]>
> > ---
> > arch/riscv/Kconfig | 15 ++++++++
> > arch/riscv/include/asm/hwcap.h | 1 +
> > arch/riscv/include/asm/insn-def.h | 60 +++++++++++++++++++++++++++++++
> > arch/riscv/kernel/cpufeature.c | 1 +
> > 4 files changed, 77 insertions(+)
> >
> > diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
> > index 24c1799e2ec4..fcbd417d65ea 100644
> > --- a/arch/riscv/Kconfig
> > +++ b/arch/riscv/Kconfig
> > @@ -579,6 +579,21 @@ config RISCV_ISA_ZICBOZ
> >
> > If you don't know what to do here, say Y.
> >
> > +config RISCV_ISA_ZICBOP
> > + bool "Zicbop extension support for cache block prefetch"
> > + depends on MMU
> > + depends on RISCV_ALTERNATIVE
> > + default y
> > + help
> > + Adds support to dynamically detect the presence of the ZICBOP
> > + extension (Cache Block Prefetch Operations) and enable its
> > + usage.
> > +
> > + The Zicbop extension can be used to prefetch cache block for
>
> blocks
>
> > + read/write fetch.
> > +
> > + If you don't know what to do here, say Y.
> > +
> > config TOOLCHAIN_HAS_ZIHINTPAUSE
> > bool
> > default y
> > diff --git a/arch/riscv/include/asm/hwcap.h b/arch/riscv/include/asm/hwcap.h
> > index 06d30526ef3b..77d3b6ee25ab 100644
> > --- a/arch/riscv/include/asm/hwcap.h
> > +++ b/arch/riscv/include/asm/hwcap.h
> > @@ -57,6 +57,7 @@
> > #define RISCV_ISA_EXT_ZIHPM 42
> > #define RISCV_ISA_EXT_SMSTATEEN 43
> > #define RISCV_ISA_EXT_ZICOND 44
> > +#define RISCV_ISA_EXT_ZICBOP 45
> >
> > #define RISCV_ISA_EXT_MAX 64
> >
> > diff --git a/arch/riscv/include/asm/insn-def.h b/arch/riscv/include/asm/insn-def.h
> > index e27179b26086..bbda350a63bf 100644
> > --- a/arch/riscv/include/asm/insn-def.h
> > +++ b/arch/riscv/include/asm/insn-def.h
> > @@ -18,6 +18,13 @@
> > #define INSN_I_RD_SHIFT 7
> > #define INSN_I_OPCODE_SHIFT 0
> >
> > +#define INSN_S_SIMM7_SHIFT 25
> > +#define INSN_S_RS2_SHIFT 20
> > +#define INSN_S_RS1_SHIFT 15
> > +#define INSN_S_FUNC3_SHIFT 12
> > +#define INSN_S_SIMM5_SHIFT 7
> > +#define INSN_S_OPCODE_SHIFT 0
> > +
> > #ifdef __ASSEMBLY__
> >
> > #ifdef CONFIG_AS_HAS_INSN
> > @@ -30,6 +37,10 @@
> > .insn i \opcode, \func3, \rd, \rs1, \simm12
> > .endm
> >
> > + .macro insn_s, opcode, func3, rs2, simm12, rs1
> > + .insn s \opcode, \func3, \rs2, \simm12(\rs1)
> > + .endm
> > +
> > #else
> >
> > #include <asm/gpr-num.h>
> > @@ -51,10 +62,20 @@
> > (\simm12 << INSN_I_SIMM12_SHIFT))
> > .endm
> >
> > + .macro insn_s, opcode, func3, rs2, simm12, rs1
> > + .4byte ((\opcode << INSN_S_OPCODE_SHIFT) | \
> > + (\func3 << INSN_S_FUNC3_SHIFT) | \
> > + (.L__gpr_num_\rs2 << INSN_S_RS2_SHIFT) | \
> > + (.L__gpr_num_\rs1 << INSN_S_RS1_SHIFT) | \
> > + ((\simm12 & 0x1f) << INSN_S_SIMM5_SHIFT) | \
> > + (((\simm12 >> 5) & 0x7f) << INSN_S_SIMM7_SHIFT))
> > + .endm
> > +
> > #endif
> >
> > #define __INSN_R(...) insn_r __VA_ARGS__
> > #define __INSN_I(...) insn_i __VA_ARGS__
> > +#define __INSN_S(...) insn_s __VA_ARGS__
> >
> > #else /* ! __ASSEMBLY__ */
> >
> > @@ -66,6 +87,9 @@
> > #define __INSN_I(opcode, func3, rd, rs1, simm12) \
> > ".insn i " opcode ", " func3 ", " rd ", " rs1 ", " simm12 "\n"
> >
> > +#define __INSN_S(opcode, func3, rs2, simm12, rs1) \
> > + ".insn s " opcode ", " func3 ", " rs2 ", " simm12 "(" rs1 ")\n"
> > +
> > #else
> >
> > #include <linux/stringify.h>
> > @@ -92,12 +116,26 @@
> > " (\\simm12 << " __stringify(INSN_I_SIMM12_SHIFT) "))\n" \
> > " .endm\n"
> >
> > +#define DEFINE_INSN_S \
> > + __DEFINE_ASM_GPR_NUMS \
> > +" .macro insn_s, opcode, func3, rs2, simm12, rs1\n" \
> > +" .4byte ((\\opcode << " __stringify(INSN_S_OPCODE_SHIFT) ") |" \
> > +" (\\func3 << " __stringify(INSN_S_FUNC3_SHIFT) ") |" \
> > +" (.L__gpr_num_\\rs2 << " __stringify(INSN_S_RS2_SHIFT) ") |" \
> > +" (.L__gpr_num_\\rs1 << " __stringify(INSN_S_RS1_SHIFT) ") |" \
> > +" ((\\simm12 & 0x1f) << " __stringify(INSN_S_SIMM5_SHIFT) ") |" \
> > +" (((\\simm12 >> 5) & 0x7f) << " __stringify(INSN_S_SIMM7_SHIFT) "))\n" \
> > +" .endm\n"
> > +
> > #define UNDEFINE_INSN_R \
> > " .purgem insn_r\n"
> >
> > #define UNDEFINE_INSN_I \
> > " .purgem insn_i\n"
> >
> > +#define UNDEFINE_INSN_S \
> > +" .purgem insn_s\n"
> > +
> > #define __INSN_R(opcode, func3, func7, rd, rs1, rs2) \
> > DEFINE_INSN_R \
> > "insn_r " opcode ", " func3 ", " func7 ", " rd ", " rs1 ", " rs2 "\n" \
> > @@ -108,6 +146,11 @@
> > "insn_i " opcode ", " func3 ", " rd ", " rs1 ", " simm12 "\n" \
> > UNDEFINE_INSN_I
> >
> > +#define __INSN_S(opcode, func3, rs2, simm12, rs1) \
> > + DEFINE_INSN_S \
> > + "insn_s " opcode ", " func3 ", " rs2 ", " simm12 ", " rs1 "\n" \
> > + UNDEFINE_INSN_S
> > +
> > #endif
> >
> > #endif /* ! __ASSEMBLY__ */
> > @@ -120,6 +163,10 @@
> > __INSN_I(RV_##opcode, RV_##func3, RV_##rd, \
> > RV_##rs1, RV_##simm12)
> >
> > +#define INSN_S(opcode, func3, rs2, simm12, rs1) \
> > + __INSN_S(RV_##opcode, RV_##func3, RV_##rs2, \
> > + RV_##simm12, RV_##rs1)
> > +
> > #define RV_OPCODE(v) __ASM_STR(v)
> > #define RV_FUNC3(v) __ASM_STR(v)
> > #define RV_FUNC7(v) __ASM_STR(v)
> > @@ -133,6 +180,7 @@
> > #define RV___RS2(v) __RV_REG(v)
> >
> > #define RV_OPCODE_MISC_MEM RV_OPCODE(15)
> > +#define RV_OPCODE_OP_IMM RV_OPCODE(19)
> > #define RV_OPCODE_SYSTEM RV_OPCODE(115)
> >
> > #define HFENCE_VVMA(vaddr, asid) \
> > @@ -196,4 +244,16 @@
> > INSN_I(OPCODE_MISC_MEM, FUNC3(2), __RD(0), \
> > RS1(base), SIMM12(4))
> >
> > +#define CBO_PREFETCH_I(base, offset) \
> > + INSN_S(OPCODE_OP_IMM, FUNC3(6), __RS2(0), \
> > + SIMM12(offset), RS1(base))
> > +
> > +#define CBO_PREFETCH_R(base, offset) \
> > + INSN_S(OPCODE_OP_IMM, FUNC3(6), __RS2(1), \
> > + SIMM12(offset), RS1(base))
> > +
> > +#define CBO_PREFETCH_W(base, offset) \
> > + INSN_S(OPCODE_OP_IMM, FUNC3(6), __RS2(3), \
> > + SIMM12(offset), RS1(base))
>
> Shouldn't we ensure the lower 5-bits of offset are zero by masking it?
The spec says:
"These instructions operate on the cache block whose effective address
is the sum of the base address specified in rs1 and the sign-extended
offset encoded in imm[11:0], where imm[4:0] shall equal 0b00000. The
effective address is translated into a corresponding physical address
by the appropriate translation mechanisms."

So, the user of prefetch.w should keep imm[4:0] zero. Just like the
patch has done, the whole imm[11:0] is zero.

>
> > +
> > #endif /* __ASM_INSN_DEF_H */
> > diff --git a/arch/riscv/kernel/cpufeature.c b/arch/riscv/kernel/cpufeature.c
> > index b3785ffc1570..bdb02b066041 100644
> > --- a/arch/riscv/kernel/cpufeature.c
> > +++ b/arch/riscv/kernel/cpufeature.c
> > @@ -168,6 +168,7 @@ const struct riscv_isa_ext_data riscv_isa_ext[] = {
> > __RISCV_ISA_EXT_DATA(h, RISCV_ISA_EXT_h),
> > __RISCV_ISA_EXT_DATA(zicbom, RISCV_ISA_EXT_ZICBOM),
> > __RISCV_ISA_EXT_DATA(zicboz, RISCV_ISA_EXT_ZICBOZ),
> > + __RISCV_ISA_EXT_DATA(zicbop, RISCV_ISA_EXT_ZICBOP),
>
> zicbop should be above zicboz (alphabetical)
Yes, I would correct it, next.

>
> > __RISCV_ISA_EXT_DATA(zicntr, RISCV_ISA_EXT_ZICNTR),
> > __RISCV_ISA_EXT_DATA(zicond, RISCV_ISA_EXT_ZICOND),
> > __RISCV_ISA_EXT_DATA(zicsr, RISCV_ISA_EXT_ZICSR),
> > --
> > 2.40.1
> >
>
> Thanks,
> drew

--
Best Regards
Guo Ren

2024-01-03 06:16:07

[permalink] [raw]

Subject: Re: [PATCH V2 3/3] riscv: xchg: Prefetch the destination word for sc.w

On Tue, Jan 2, 2024 at 7:19 PM Andrew Jones <[email protected]> wrote:
>
> On Sun, Dec 31, 2023 at 03:29:53AM -0500, [email protected] wrote:
> > From: Guo Ren <[email protected]>
> >
> > The cost of changing a cacheline from shared to exclusive state can be
> > significant, especially when this is triggered by an exclusive store,
> > since it may result in having to retry the transaction.
> >
> > This patch makes use of prefetch.w to prefetch cachelines for write
> > prior to lr/sc loops when using the xchg_small atomic routine.
> >
> > This patch is inspired by commit: 0ea366f5e1b6 ("arm64: atomics:
> > prefetch the destination word for write prior to stxr").
> >
> > Signed-off-by: Guo Ren <[email protected]>
> > Signed-off-by: Guo Ren <[email protected]>
> > ---
> > arch/riscv/include/asm/cmpxchg.h | 4 +++-
> > 1 file changed, 3 insertions(+), 1 deletion(-)
> >
> > diff --git a/arch/riscv/include/asm/cmpxchg.h b/arch/riscv/include/asm/cmpxchg.h
> > index 26cea2395aae..d7b9d7951f08 100644
> > --- a/arch/riscv/include/asm/cmpxchg.h
> > +++ b/arch/riscv/include/asm/cmpxchg.h
> > @@ -10,6 +10,7 @@
> >
> > #include <asm/barrier.h>
> > #include <asm/fence.h>
> > +#include <asm/processor.h>
> >
> > #define __arch_xchg_masked(prepend, append, r, p, n) \
>
> Are you sure this is based on v6.7-rc7? Because I don't see this macro.
Oh, it is based on Leobras' patches. I would remove it in the next of version.

>
> > ({ \
> > @@ -23,6 +24,7 @@
> > \
> > __asm__ __volatile__ ( \
> > prepend \
> > + PREFETCHW_ASM(%5) \
> > "0: lr.w %0, %2\n" \
> > " and %1, %0, %z4\n" \
> > " or %1, %1, %z3\n" \
> > @@ -30,7 +32,7 @@
> > " bnez %1, 0b\n" \
> > append \
> > : "=&r" (__retx), "=&r" (__rc), "+A" (*(__ptr32b)) \
> > - : "rJ" (__newx), "rJ" (~__mask) \
> > + : "rJ" (__newx), "rJ" (~__mask), "rJ" (__ptr32b) \
>
> I'm pretty sure we don't want to allow the J constraint for __ptr32b.
>
> > : "memory"); \
> > \
> > r = (__typeof__(*(p)))((__retx & __mask) >> __s); \
> > --
> > 2.40.1
> >
>
> Thanks,
> drew

--
Best Regards
Guo Ren

2024-01-03 06:21:39

[permalink] [raw]

Subject: Re: [PATCH V2 2/3] riscv: Add ARCH_HAS_PRETCHW support with Zibop

On Tue, Jan 2, 2024 at 6:45 PM Andrew Jones <[email protected]> wrote:
>
>
> s/Zibop/Zicbop/ <<<$SUBJECT
okay

>
> On Sun, Dec 31, 2023 at 03:29:52AM -0500, [email protected] wrote:
> > From: Guo Ren <[email protected]>
> >
> > Enable Linux prefetchw primitive with Zibop cpufeature, which preloads
>
> Also s/Zibop/Zicbop/ here
okay, thx.

>
> > cache line into L1 cache for the next write operation.
> >
> > Signed-off-by: Guo Ren <[email protected]>
> > Signed-off-by: Guo Ren <[email protected]>
> > ---
> > arch/riscv/include/asm/processor.h | 16 ++++++++++++++++
> > 1 file changed, 16 insertions(+)
> >
> > diff --git a/arch/riscv/include/asm/processor.h b/arch/riscv/include/asm/processor.h
> > index f19f861cda54..8d3a2ab37678 100644
> > --- a/arch/riscv/include/asm/processor.h
> > +++ b/arch/riscv/include/asm/processor.h
> > @@ -13,6 +13,9 @@
> > #include <vdso/processor.h>
> >
> > #include <asm/ptrace.h>
> > +#include <asm/insn-def.h>
> > +#include <asm/alternative-macros.h>
> > +#include <asm/hwcap.h>
> >
> > #ifdef CONFIG_64BIT
> > #define DEFAULT_MAP_WINDOW (UL(1) << (MMAP_VA_BITS - 1))
> > @@ -106,6 +109,19 @@ static inline void arch_thread_struct_whitelist(unsigned long *offset,
> > #define KSTK_EIP(tsk) (task_pt_regs(tsk)->epc)
> > #define KSTK_ESP(tsk) (task_pt_regs(tsk)->sp)
> >
> > +#ifdef CONFIG_RISCV_ISA_ZICBOP
> > +#define ARCH_HAS_PREFETCHW
> > +
> > +#define PREFETCHW_ASM(x) \
> > + ALTERNATIVE(__nops(1), CBO_PREFETCH_W(x, 0), 0, \
> > + RISCV_ISA_EXT_ZICBOP, CONFIG_RISCV_ISA_ZICBOP)
> > +
> > +
> > +static inline void prefetchw(const void *x)
> > +{
> > + __asm__ __volatile__(PREFETCHW_ASM(%0) : : "r" (x) : "memory");
> > +}
>
> Shouldn't we create an interface which exposes the offset input of
> the instruction, allowing a sequence of calls to be unrolled? But
> I guess that could be put off until there's a need for it.
I want to put it off until there's a user. Let's keep the whole
imm[11:0] zero for the current.

>
> > +#endif /* CONFIG_RISCV_ISA_ZICBOP */
> >
> > /* Do necessary setup to start up a newly executed thread. */
> > extern void start_thread(struct pt_regs *regs,
> > --
> > 2.40.1
> >
>
> Thanks,
> drew

--
Best Regards
Guo Ren

2024-01-03 06:50:04

[permalink] [raw]

Subject: Re: Re: [PATCH V2 1/3] riscv: Add Zicbop instruction definitions & cpufeature

On Wed, Jan 03, 2024 at 02:13:00PM +0800, Guo Ren wrote:
> On Tue, Jan 2, 2024 at 6:32 PM Andrew Jones <[email protected]> wrote:
> >
> > On Sun, Dec 31, 2023 at 03:29:51AM -0500, [email protected] wrote:
> > > From: Guo Ren <[email protected]>
> > >
> > > Cache-block prefetch instructions are HINTs to the hardware to
> > > indicate that software intends to perform a particular type of
> > > memory access in the near future. This patch adds prefetch.i,
> > > prefetch.r and prefetch.w instruction definitions by
> > > RISCV_ISA_EXT_ZICBOP cpufeature.
> >
> > It also adds S-type instruction encoding support which isn't mentioned.
> > Actually, it'd probably be best to put the new instruction encoding in
> > its own separate patch.
> Okay, I would separate the instruction encoding patch in the next version.
>
> >
> > >
> > > Signed-off-by: Guo Ren <[email protected]>
> > > Signed-off-by: Guo Ren <[email protected]>
> > > ---
> > > arch/riscv/Kconfig | 15 ++++++++
> > > arch/riscv/include/asm/hwcap.h | 1 +
> > > arch/riscv/include/asm/insn-def.h | 60 +++++++++++++++++++++++++++++++
> > > arch/riscv/kernel/cpufeature.c | 1 +
> > > 4 files changed, 77 insertions(+)
> > >
> > > diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
> > > index 24c1799e2ec4..fcbd417d65ea 100644
> > > --- a/arch/riscv/Kconfig
> > > +++ b/arch/riscv/Kconfig
> > > @@ -579,6 +579,21 @@ config RISCV_ISA_ZICBOZ
> > >
> > > If you don't know what to do here, say Y.
> > >
> > > +config RISCV_ISA_ZICBOP
> > > + bool "Zicbop extension support for cache block prefetch"
> > > + depends on MMU
> > > + depends on RISCV_ALTERNATIVE
> > > + default y
> > > + help
> > > + Adds support to dynamically detect the presence of the ZICBOP
> > > + extension (Cache Block Prefetch Operations) and enable its
> > > + usage.
> > > +
> > > + The Zicbop extension can be used to prefetch cache block for
> >
> > blocks
> >
> > > + read/write fetch.
> > > +
> > > + If you don't know what to do here, say Y.
> > > +
> > > config TOOLCHAIN_HAS_ZIHINTPAUSE
> > > bool
> > > default y
> > > diff --git a/arch/riscv/include/asm/hwcap.h b/arch/riscv/include/asm/hwcap.h
> > > index 06d30526ef3b..77d3b6ee25ab 100644
> > > --- a/arch/riscv/include/asm/hwcap.h
> > > +++ b/arch/riscv/include/asm/hwcap.h
> > > @@ -57,6 +57,7 @@
> > > #define RISCV_ISA_EXT_ZIHPM 42
> > > #define RISCV_ISA_EXT_SMSTATEEN 43
> > > #define RISCV_ISA_EXT_ZICOND 44
> > > +#define RISCV_ISA_EXT_ZICBOP 45
> > >
> > > #define RISCV_ISA_EXT_MAX 64
> > >
> > > diff --git a/arch/riscv/include/asm/insn-def.h b/arch/riscv/include/asm/insn-def.h
> > > index e27179b26086..bbda350a63bf 100644
> > > --- a/arch/riscv/include/asm/insn-def.h
> > > +++ b/arch/riscv/include/asm/insn-def.h
> > > @@ -18,6 +18,13 @@
> > > #define INSN_I_RD_SHIFT 7
> > > #define INSN_I_OPCODE_SHIFT 0
> > >
> > > +#define INSN_S_SIMM7_SHIFT 25
> > > +#define INSN_S_RS2_SHIFT 20
> > > +#define INSN_S_RS1_SHIFT 15
> > > +#define INSN_S_FUNC3_SHIFT 12
> > > +#define INSN_S_SIMM5_SHIFT 7
> > > +#define INSN_S_OPCODE_SHIFT 0
> > > +
> > > #ifdef __ASSEMBLY__
> > >
> > > #ifdef CONFIG_AS_HAS_INSN
> > > @@ -30,6 +37,10 @@
> > > .insn i \opcode, \func3, \rd, \rs1, \simm12
> > > .endm
> > >
> > > + .macro insn_s, opcode, func3, rs2, simm12, rs1
> > > + .insn s \opcode, \func3, \rs2, \simm12(\rs1)
> > > + .endm
> > > +
> > > #else
> > >
> > > #include <asm/gpr-num.h>
> > > @@ -51,10 +62,20 @@
> > > (\simm12 << INSN_I_SIMM12_SHIFT))
> > > .endm
> > >
> > > + .macro insn_s, opcode, func3, rs2, simm12, rs1
> > > + .4byte ((\opcode << INSN_S_OPCODE_SHIFT) | \
> > > + (\func3 << INSN_S_FUNC3_SHIFT) | \
> > > + (.L__gpr_num_\rs2 << INSN_S_RS2_SHIFT) | \
> > > + (.L__gpr_num_\rs1 << INSN_S_RS1_SHIFT) | \
> > > + ((\simm12 & 0x1f) << INSN_S_SIMM5_SHIFT) | \
> > > + (((\simm12 >> 5) & 0x7f) << INSN_S_SIMM7_SHIFT))
> > > + .endm
> > > +
> > > #endif
> > >
> > > #define __INSN_R(...) insn_r __VA_ARGS__
> > > #define __INSN_I(...) insn_i __VA_ARGS__
> > > +#define __INSN_S(...) insn_s __VA_ARGS__
> > >
> > > #else /* ! __ASSEMBLY__ */
> > >
> > > @@ -66,6 +87,9 @@
> > > #define __INSN_I(opcode, func3, rd, rs1, simm12) \
> > > ".insn i " opcode ", " func3 ", " rd ", " rs1 ", " simm12 "\n"
> > >
> > > +#define __INSN_S(opcode, func3, rs2, simm12, rs1) \
> > > + ".insn s " opcode ", " func3 ", " rs2 ", " simm12 "(" rs1 ")\n"
> > > +
> > > #else
> > >
> > > #include <linux/stringify.h>
> > > @@ -92,12 +116,26 @@
> > > " (\\simm12 << " __stringify(INSN_I_SIMM12_SHIFT) "))\n" \
> > > " .endm\n"
> > >
> > > +#define DEFINE_INSN_S \
> > > + __DEFINE_ASM_GPR_NUMS \
> > > +" .macro insn_s, opcode, func3, rs2, simm12, rs1\n" \
> > > +" .4byte ((\\opcode << " __stringify(INSN_S_OPCODE_SHIFT) ") |" \
> > > +" (\\func3 << " __stringify(INSN_S_FUNC3_SHIFT) ") |" \
> > > +" (.L__gpr_num_\\rs2 << " __stringify(INSN_S_RS2_SHIFT) ") |" \
> > > +" (.L__gpr_num_\\rs1 << " __stringify(INSN_S_RS1_SHIFT) ") |" \
> > > +" ((\\simm12 & 0x1f) << " __stringify(INSN_S_SIMM5_SHIFT) ") |" \
> > > +" (((\\simm12 >> 5) & 0x7f) << " __stringify(INSN_S_SIMM7_SHIFT) "))\n" \
> > > +" .endm\n"
> > > +
> > > #define UNDEFINE_INSN_R \
> > > " .purgem insn_r\n"
> > >
> > > #define UNDEFINE_INSN_I \
> > > " .purgem insn_i\n"
> > >
> > > +#define UNDEFINE_INSN_S \
> > > +" .purgem insn_s\n"
> > > +
> > > #define __INSN_R(opcode, func3, func7, rd, rs1, rs2) \
> > > DEFINE_INSN_R \
> > > "insn_r " opcode ", " func3 ", " func7 ", " rd ", " rs1 ", " rs2 "\n" \
> > > @@ -108,6 +146,11 @@
> > > "insn_i " opcode ", " func3 ", " rd ", " rs1 ", " simm12 "\n" \
> > > UNDEFINE_INSN_I
> > >
> > > +#define __INSN_S(opcode, func3, rs2, simm12, rs1) \
> > > + DEFINE_INSN_S \
> > > + "insn_s " opcode ", " func3 ", " rs2 ", " simm12 ", " rs1 "\n" \
> > > + UNDEFINE_INSN_S
> > > +
> > > #endif
> > >
> > > #endif /* ! __ASSEMBLY__ */
> > > @@ -120,6 +163,10 @@
> > > __INSN_I(RV_##opcode, RV_##func3, RV_##rd, \
> > > RV_##rs1, RV_##simm12)
> > >
> > > +#define INSN_S(opcode, func3, rs2, simm12, rs1) \
> > > + __INSN_S(RV_##opcode, RV_##func3, RV_##rs2, \
> > > + RV_##simm12, RV_##rs1)
> > > +
> > > #define RV_OPCODE(v) __ASM_STR(v)
> > > #define RV_FUNC3(v) __ASM_STR(v)
> > > #define RV_FUNC7(v) __ASM_STR(v)
> > > @@ -133,6 +180,7 @@
> > > #define RV___RS2(v) __RV_REG(v)
> > >
> > > #define RV_OPCODE_MISC_MEM RV_OPCODE(15)
> > > +#define RV_OPCODE_OP_IMM RV_OPCODE(19)
> > > #define RV_OPCODE_SYSTEM RV_OPCODE(115)
> > >
> > > #define HFENCE_VVMA(vaddr, asid) \
> > > @@ -196,4 +244,16 @@
> > > INSN_I(OPCODE_MISC_MEM, FUNC3(2), __RD(0), \
> > > RS1(base), SIMM12(4))
> > >
> > > +#define CBO_PREFETCH_I(base, offset) \
> > > + INSN_S(OPCODE_OP_IMM, FUNC3(6), __RS2(0), \
> > > + SIMM12(offset), RS1(base))
> > > +
> > > +#define CBO_PREFETCH_R(base, offset) \
> > > + INSN_S(OPCODE_OP_IMM, FUNC3(6), __RS2(1), \
> > > + SIMM12(offset), RS1(base))
> > > +
> > > +#define CBO_PREFETCH_W(base, offset) \
> > > + INSN_S(OPCODE_OP_IMM, FUNC3(6), __RS2(3), \
> > > + SIMM12(offset), RS1(base))
> >
> > Shouldn't we ensure the lower 5-bits of offset are zero by masking it?
> The spec says:
> "These instructions operate on the cache block whose effective address
> is the sum of the base address specified in rs1 and the sign-extended
> offset encoded in imm[11:0], where imm[4:0] shall equal 0b00000. The
> effective address is translated into a corresponding physical address
> by the appropriate translation mechanisms."
>
> So, the user of prefetch.w should keep imm[4:0] zero.

Yes, the user _should_ keep imm[4:0] zero. Unless we can validate at
compile time that all users have passed offsets with the lower 5-bits
set to zero, then I think we should mask them here, since I'd rather
not provide the user a footgun.

> Just like the
> patch has done, the whole imm[11:0] is zero.

That's just one possible use, and I think exposing the offset operand to
users makes sense for unrolled sequences of invocations, so I wouldn't
count on offset always being zero.

Thanks,
drew

2024-01-03 09:31:51

by Clément Léger

[permalink] [raw]

Subject: Re: [PATCH V2 1/3] riscv: Add Zicbop instruction definitions & cpufeature

On 31/12/2023 09:29, [email protected] wrote:
> From: Guo Ren <[email protected]>
>
> Cache-block prefetch instructions are HINTs to the hardware to
> indicate that software intends to perform a particular type of
> memory access in the near future. This patch adds prefetch.i,
> prefetch.r and prefetch.w instruction definitions by
> RISCV_ISA_EXT_ZICBOP cpufeature.
>
> Signed-off-by: Guo Ren <[email protected]>
> Signed-off-by: Guo Ren <[email protected]>
> ---
> arch/riscv/Kconfig | 15 ++++++++
> arch/riscv/include/asm/hwcap.h | 1 +
> arch/riscv/include/asm/insn-def.h | 60 +++++++++++++++++++++++++++++++
> arch/riscv/kernel/cpufeature.c | 1 +
> 4 files changed, 77 insertions(+)
>
> diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
> index 24c1799e2ec4..fcbd417d65ea 100644
> --- a/arch/riscv/Kconfig
> +++ b/arch/riscv/Kconfig
> @@ -579,6 +579,21 @@ config RISCV_ISA_ZICBOZ
>
> If you don't know what to do here, say Y.
>
> +config RISCV_ISA_ZICBOP
> + bool "Zicbop extension support for cache block prefetch"
> + depends on MMU
> + depends on RISCV_ALTERNATIVE
> + default y
> + help
> + Adds support to dynamically detect the presence of the ZICBOP
> + extension (Cache Block Prefetch Operations) and enable its
> + usage.
> +
> + The Zicbop extension can be used to prefetch cache block for
> + read/write fetch.
> +
> + If you don't know what to do here, say Y.
> +
> config TOOLCHAIN_HAS_ZIHINTPAUSE
> bool
> default y
> diff --git a/arch/riscv/include/asm/hwcap.h b/arch/riscv/include/asm/hwcap.h
> index 06d30526ef3b..77d3b6ee25ab 100644
> --- a/arch/riscv/include/asm/hwcap.h
> +++ b/arch/riscv/include/asm/hwcap.h
> @@ -57,6 +57,7 @@
> #define RISCV_ISA_EXT_ZIHPM 42
> #define RISCV_ISA_EXT_SMSTATEEN 43
> #define RISCV_ISA_EXT_ZICOND 44
> +#define RISCV_ISA_EXT_ZICBOP 45

Hi Guo,

Since you are adding support for the Zicbop extension, you could
probably also allow to probe it from userspace using hwprobe. Add a few
definitions to sys_riscv.c/hwprobe.h and it will be fine.

Thanks,

Clément

>
> #define RISCV_ISA_EXT_MAX 64
>
> diff --git a/arch/riscv/include/asm/insn-def.h b/arch/riscv/include/asm/insn-def.h
> index e27179b26086..bbda350a63bf 100644
> --- a/arch/riscv/include/asm/insn-def.h
> +++ b/arch/riscv/include/asm/insn-def.h
> @@ -18,6 +18,13 @@
> #define INSN_I_RD_SHIFT 7
> #define INSN_I_OPCODE_SHIFT 0
>
> +#define INSN_S_SIMM7_SHIFT 25
> +#define INSN_S_RS2_SHIFT 20
> +#define INSN_S_RS1_SHIFT 15
> +#define INSN_S_FUNC3_SHIFT 12
> +#define INSN_S_SIMM5_SHIFT 7
> +#define INSN_S_OPCODE_SHIFT 0
> +
> #ifdef __ASSEMBLY__
>
> #ifdef CONFIG_AS_HAS_INSN
> @@ -30,6 +37,10 @@
> .insn i \opcode, \func3, \rd, \rs1, \simm12
> .endm
>
> + .macro insn_s, opcode, func3, rs2, simm12, rs1
> + .insn s \opcode, \func3, \rs2, \simm12(\rs1)
> + .endm
> +
> #else
>
> #include <asm/gpr-num.h>
> @@ -51,10 +62,20 @@
> (\simm12 << INSN_I_SIMM12_SHIFT))
> .endm
>
> + .macro insn_s, opcode, func3, rs2, simm12, rs1
> + .4byte ((\opcode << INSN_S_OPCODE_SHIFT) | \
> + (\func3 << INSN_S_FUNC3_SHIFT) | \
> + (.L__gpr_num_\rs2 << INSN_S_RS2_SHIFT) | \
> + (.L__gpr_num_\rs1 << INSN_S_RS1_SHIFT) | \
> + ((\simm12 & 0x1f) << INSN_S_SIMM5_SHIFT) | \
> + (((\simm12 >> 5) & 0x7f) << INSN_S_SIMM7_SHIFT))
> + .endm
> +
> #endif
>
> #define __INSN_R(...) insn_r __VA_ARGS__
> #define __INSN_I(...) insn_i __VA_ARGS__
> +#define __INSN_S(...) insn_s __VA_ARGS__
>
> #else /* ! __ASSEMBLY__ */
>
> @@ -66,6 +87,9 @@
> #define __INSN_I(opcode, func3, rd, rs1, simm12) \
> ".insn i " opcode ", " func3 ", " rd ", " rs1 ", " simm12 "\n"
>
> +#define __INSN_S(opcode, func3, rs2, simm12, rs1) \
> + ".insn s " opcode ", " func3 ", " rs2 ", " simm12 "(" rs1 ")\n"
> +
> #else
>
> #include <linux/stringify.h>
> @@ -92,12 +116,26 @@
> " (\\simm12 << " __stringify(INSN_I_SIMM12_SHIFT) "))\n" \
> " .endm\n"
>
> +#define DEFINE_INSN_S \
> + __DEFINE_ASM_GPR_NUMS \
> +" .macro insn_s, opcode, func3, rs2, simm12, rs1\n" \
> +" .4byte ((\\opcode << " __stringify(INSN_S_OPCODE_SHIFT) ") |" \
> +" (\\func3 << " __stringify(INSN_S_FUNC3_SHIFT) ") |" \
> +" (.L__gpr_num_\\rs2 << " __stringify(INSN_S_RS2_SHIFT) ") |" \
> +" (.L__gpr_num_\\rs1 << " __stringify(INSN_S_RS1_SHIFT) ") |" \
> +" ((\\simm12 & 0x1f) << " __stringify(INSN_S_SIMM5_SHIFT) ") |" \
> +" (((\\simm12 >> 5) & 0x7f) << " __stringify(INSN_S_SIMM7_SHIFT) "))\n" \
> +" .endm\n"
> +
> #define UNDEFINE_INSN_R \
> " .purgem insn_r\n"
>
> #define UNDEFINE_INSN_I \
> " .purgem insn_i\n"
>
> +#define UNDEFINE_INSN_S \
> +" .purgem insn_s\n"
> +
> #define __INSN_R(opcode, func3, func7, rd, rs1, rs2) \
> DEFINE_INSN_R \
> "insn_r " opcode ", " func3 ", " func7 ", " rd ", " rs1 ", " rs2 "\n" \
> @@ -108,6 +146,11 @@
> "insn_i " opcode ", " func3 ", " rd ", " rs1 ", " simm12 "\n" \
> UNDEFINE_INSN_I
>
> +#define __INSN_S(opcode, func3, rs2, simm12, rs1) \
> + DEFINE_INSN_S \
> + "insn_s " opcode ", " func3 ", " rs2 ", " simm12 ", " rs1 "\n" \
> + UNDEFINE_INSN_S
> +
> #endif
>
> #endif /* ! __ASSEMBLY__ */
> @@ -120,6 +163,10 @@
> __INSN_I(RV_##opcode, RV_##func3, RV_##rd, \
> RV_##rs1, RV_##simm12)
>
> +#define INSN_S(opcode, func3, rs2, simm12, rs1) \
> + __INSN_S(RV_##opcode, RV_##func3, RV_##rs2, \
> + RV_##simm12, RV_##rs1)
> +
> #define RV_OPCODE(v) __ASM_STR(v)
> #define RV_FUNC3(v) __ASM_STR(v)
> #define RV_FUNC7(v) __ASM_STR(v)
> @@ -133,6 +180,7 @@
> #define RV___RS2(v) __RV_REG(v)
>
> #define RV_OPCODE_MISC_MEM RV_OPCODE(15)
> +#define RV_OPCODE_OP_IMM RV_OPCODE(19)
> #define RV_OPCODE_SYSTEM RV_OPCODE(115)
>
> #define HFENCE_VVMA(vaddr, asid) \
> @@ -196,4 +244,16 @@
> INSN_I(OPCODE_MISC_MEM, FUNC3(2), __RD(0), \
> RS1(base), SIMM12(4))
>
> +#define CBO_PREFETCH_I(base, offset) \
> + INSN_S(OPCODE_OP_IMM, FUNC3(6), __RS2(0), \
> + SIMM12(offset), RS1(base))
> +
> +#define CBO_PREFETCH_R(base, offset) \
> + INSN_S(OPCODE_OP_IMM, FUNC3(6), __RS2(1), \
> + SIMM12(offset), RS1(base))
> +
> +#define CBO_PREFETCH_W(base, offset) \
> + INSN_S(OPCODE_OP_IMM, FUNC3(6), __RS2(3), \
> + SIMM12(offset), RS1(base))
> +
> #endif /* __ASM_INSN_DEF_H */
> diff --git a/arch/riscv/kernel/cpufeature.c b/arch/riscv/kernel/cpufeature.c
> index b3785ffc1570..bdb02b066041 100644
> --- a/arch/riscv/kernel/cpufeature.c
> +++ b/arch/riscv/kernel/cpufeature.c
> @@ -168,6 +168,7 @@ const struct riscv_isa_ext_data riscv_isa_ext[] = {
> __RISCV_ISA_EXT_DATA(h, RISCV_ISA_EXT_h),
> __RISCV_ISA_EXT_DATA(zicbom, RISCV_ISA_EXT_ZICBOM),
> __RISCV_ISA_EXT_DATA(zicboz, RISCV_ISA_EXT_ZICBOZ),
> + __RISCV_ISA_EXT_DATA(zicbop, RISCV_ISA_EXT_ZICBOP),
> __RISCV_ISA_EXT_DATA(zicntr, RISCV_ISA_EXT_ZICNTR),
> __RISCV_ISA_EXT_DATA(zicond, RISCV_ISA_EXT_ZICOND),
> __RISCV_ISA_EXT_DATA(zicsr, RISCV_ISA_EXT_ZICSR),

2024-01-03 12:02:01

[permalink] [raw]

Subject: Re: Re: [PATCH V2 1/3] riscv: Add Zicbop instruction definitions & cpufeature

On Wed, Jan 03, 2024 at 10:31:37AM +0100, Cl?ment L?ger wrote:
>
>
> On 31/12/2023 09:29, [email protected] wrote:
> > From: Guo Ren <[email protected]>
> >
> > Cache-block prefetch instructions are HINTs to the hardware to
> > indicate that software intends to perform a particular type of
> > memory access in the near future. This patch adds prefetch.i,
> > prefetch.r and prefetch.w instruction definitions by
> > RISCV_ISA_EXT_ZICBOP cpufeature.
> >
> > Signed-off-by: Guo Ren <[email protected]>
> > Signed-off-by: Guo Ren <[email protected]>
> > ---
> > arch/riscv/Kconfig | 15 ++++++++
> > arch/riscv/include/asm/hwcap.h | 1 +
> > arch/riscv/include/asm/insn-def.h | 60 +++++++++++++++++++++++++++++++
> > arch/riscv/kernel/cpufeature.c | 1 +
> > 4 files changed, 77 insertions(+)
> >
> > diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
> > index 24c1799e2ec4..fcbd417d65ea 100644
> > --- a/arch/riscv/Kconfig
> > +++ b/arch/riscv/Kconfig
> > @@ -579,6 +579,21 @@ config RISCV_ISA_ZICBOZ
> >
> > If you don't know what to do here, say Y.
> >
> > +config RISCV_ISA_ZICBOP
> > + bool "Zicbop extension support for cache block prefetch"
> > + depends on MMU
> > + depends on RISCV_ALTERNATIVE
> > + default y
> > + help
> > + Adds support to dynamically detect the presence of the ZICBOP
> > + extension (Cache Block Prefetch Operations) and enable its
> > + usage.
> > +
> > + The Zicbop extension can be used to prefetch cache block for
> > + read/write fetch.
> > +
> > + If you don't know what to do here, say Y.
> > +
> > config TOOLCHAIN_HAS_ZIHINTPAUSE
> > bool
> > default y
> > diff --git a/arch/riscv/include/asm/hwcap.h b/arch/riscv/include/asm/hwcap.h
> > index 06d30526ef3b..77d3b6ee25ab 100644
> > --- a/arch/riscv/include/asm/hwcap.h
> > +++ b/arch/riscv/include/asm/hwcap.h
> > @@ -57,6 +57,7 @@
> > #define RISCV_ISA_EXT_ZIHPM 42
> > #define RISCV_ISA_EXT_SMSTATEEN 43
> > #define RISCV_ISA_EXT_ZICOND 44
> > +#define RISCV_ISA_EXT_ZICBOP 45
>
> Hi Guo,
>
> Since you are adding support for the Zicbop extension, you could
> probably also allow to probe it from userspace using hwprobe. Add a few
> definitions to sys_riscv.c/hwprobe.h and it will be fine.

To expose to userspace, we should also start parsing the block size,
so it can also be exposed to userspace. Starting to parse the block
size first requires that we decide we need to parse the block size
(see [1]).

[1] https://lore.kernel.org/all/[email protected]/

Thanks,
drew

>
> Thanks,
>
> Cl?ment
>
> >
> > #define RISCV_ISA_EXT_MAX 64
> >
> > diff --git a/arch/riscv/include/asm/insn-def.h b/arch/riscv/include/asm/insn-def.h
> > index e27179b26086..bbda350a63bf 100644
> > --- a/arch/riscv/include/asm/insn-def.h
> > +++ b/arch/riscv/include/asm/insn-def.h
> > @@ -18,6 +18,13 @@
> > #define INSN_I_RD_SHIFT 7
> > #define INSN_I_OPCODE_SHIFT 0
> >
> > +#define INSN_S_SIMM7_SHIFT 25
> > +#define INSN_S_RS2_SHIFT 20
> > +#define INSN_S_RS1_SHIFT 15
> > +#define INSN_S_FUNC3_SHIFT 12
> > +#define INSN_S_SIMM5_SHIFT 7
> > +#define INSN_S_OPCODE_SHIFT 0
> > +
> > #ifdef __ASSEMBLY__
> >
> > #ifdef CONFIG_AS_HAS_INSN
> > @@ -30,6 +37,10 @@
> > .insn i \opcode, \func3, \rd, \rs1, \simm12
> > .endm
> >
> > + .macro insn_s, opcode, func3, rs2, simm12, rs1
> > + .insn s \opcode, \func3, \rs2, \simm12(\rs1)
> > + .endm
> > +
> > #else
> >
> > #include <asm/gpr-num.h>
> > @@ -51,10 +62,20 @@
> > (\simm12 << INSN_I_SIMM12_SHIFT))
> > .endm
> >
> > + .macro insn_s, opcode, func3, rs2, simm12, rs1
> > + .4byte ((\opcode << INSN_S_OPCODE_SHIFT) | \
> > + (\func3 << INSN_S_FUNC3_SHIFT) | \
> > + (.L__gpr_num_\rs2 << INSN_S_RS2_SHIFT) | \
> > + (.L__gpr_num_\rs1 << INSN_S_RS1_SHIFT) | \
> > + ((\simm12 & 0x1f) << INSN_S_SIMM5_SHIFT) | \
> > + (((\simm12 >> 5) & 0x7f) << INSN_S_SIMM7_SHIFT))
> > + .endm
> > +
> > #endif
> >
> > #define __INSN_R(...) insn_r __VA_ARGS__
> > #define __INSN_I(...) insn_i __VA_ARGS__
> > +#define __INSN_S(...) insn_s __VA_ARGS__
> >
> > #else /* ! __ASSEMBLY__ */
> >
> > @@ -66,6 +87,9 @@
> > #define __INSN_I(opcode, func3, rd, rs1, simm12) \
> > ".insn i " opcode ", " func3 ", " rd ", " rs1 ", " simm12 "\n"
> >
> > +#define __INSN_S(opcode, func3, rs2, simm12, rs1) \
> > + ".insn s " opcode ", " func3 ", " rs2 ", " simm12 "(" rs1 ")\n"
> > +
> > #else
> >
> > #include <linux/stringify.h>
> > @@ -92,12 +116,26 @@
> > " (\\simm12 << " __stringify(INSN_I_SIMM12_SHIFT) "))\n" \
> > " .endm\n"
> >
> > +#define DEFINE_INSN_S \
> > + __DEFINE_ASM_GPR_NUMS \
> > +" .macro insn_s, opcode, func3, rs2, simm12, rs1\n" \
> > +" .4byte ((\\opcode << " __stringify(INSN_S_OPCODE_SHIFT) ") |" \
> > +" (\\func3 << " __stringify(INSN_S_FUNC3_SHIFT) ") |" \
> > +" (.L__gpr_num_\\rs2 << " __stringify(INSN_S_RS2_SHIFT) ") |" \
> > +" (.L__gpr_num_\\rs1 << " __stringify(INSN_S_RS1_SHIFT) ") |" \
> > +" ((\\simm12 & 0x1f) << " __stringify(INSN_S_SIMM5_SHIFT) ") |" \
> > +" (((\\simm12 >> 5) & 0x7f) << " __stringify(INSN_S_SIMM7_SHIFT) "))\n" \
> > +" .endm\n"
> > +
> > #define UNDEFINE_INSN_R \
> > " .purgem insn_r\n"
> >
> > #define UNDEFINE_INSN_I \
> > " .purgem insn_i\n"
> >
> > +#define UNDEFINE_INSN_S \
> > +" .purgem insn_s\n"
> > +
> > #define __INSN_R(opcode, func3, func7, rd, rs1, rs2) \
> > DEFINE_INSN_R \
> > "insn_r " opcode ", " func3 ", " func7 ", " rd ", " rs1 ", " rs2 "\n" \
> > @@ -108,6 +146,11 @@
> > "insn_i " opcode ", " func3 ", " rd ", " rs1 ", " simm12 "\n" \
> > UNDEFINE_INSN_I
> >
> > +#define __INSN_S(opcode, func3, rs2, simm12, rs1) \
> > + DEFINE_INSN_S \
> > + "insn_s " opcode ", " func3 ", " rs2 ", " simm12 ", " rs1 "\n" \
> > + UNDEFINE_INSN_S
> > +
> > #endif
> >
> > #endif /* ! __ASSEMBLY__ */
> > @@ -120,6 +163,10 @@
> > __INSN_I(RV_##opcode, RV_##func3, RV_##rd, \
> > RV_##rs1, RV_##simm12)
> >
> > +#define INSN_S(opcode, func3, rs2, simm12, rs1) \
> > + __INSN_S(RV_##opcode, RV_##func3, RV_##rs2, \
> > + RV_##simm12, RV_##rs1)
> > +
> > #define RV_OPCODE(v) __ASM_STR(v)
> > #define RV_FUNC3(v) __ASM_STR(v)
> > #define RV_FUNC7(v) __ASM_STR(v)
> > @@ -133,6 +180,7 @@
> > #define RV___RS2(v) __RV_REG(v)
> >
> > #define RV_OPCODE_MISC_MEM RV_OPCODE(15)
> > +#define RV_OPCODE_OP_IMM RV_OPCODE(19)
> > #define RV_OPCODE_SYSTEM RV_OPCODE(115)
> >
> > #define HFENCE_VVMA(vaddr, asid) \
> > @@ -196,4 +244,16 @@
> > INSN_I(OPCODE_MISC_MEM, FUNC3(2), __RD(0), \
> > RS1(base), SIMM12(4))
> >
> > +#define CBO_PREFETCH_I(base, offset) \
> > + INSN_S(OPCODE_OP_IMM, FUNC3(6), __RS2(0), \
> > + SIMM12(offset), RS1(base))
> > +
> > +#define CBO_PREFETCH_R(base, offset) \
> > + INSN_S(OPCODE_OP_IMM, FUNC3(6), __RS2(1), \
> > + SIMM12(offset), RS1(base))
> > +
> > +#define CBO_PREFETCH_W(base, offset) \
> > + INSN_S(OPCODE_OP_IMM, FUNC3(6), __RS2(3), \
> > + SIMM12(offset), RS1(base))
> > +
> > #endif /* __ASM_INSN_DEF_H */
> > diff --git a/arch/riscv/kernel/cpufeature.c b/arch/riscv/kernel/cpufeature.c
> > index b3785ffc1570..bdb02b066041 100644
> > --- a/arch/riscv/kernel/cpufeature.c
> > +++ b/arch/riscv/kernel/cpufeature.c
> > @@ -168,6 +168,7 @@ const struct riscv_isa_ext_data riscv_isa_ext[] = {
> > __RISCV_ISA_EXT_DATA(h, RISCV_ISA_EXT_h),
> > __RISCV_ISA_EXT_DATA(zicbom, RISCV_ISA_EXT_ZICBOM),
> > __RISCV_ISA_EXT_DATA(zicboz, RISCV_ISA_EXT_ZICBOZ),
> > + __RISCV_ISA_EXT_DATA(zicbop, RISCV_ISA_EXT_ZICBOP),
> > __RISCV_ISA_EXT_DATA(zicntr, RISCV_ISA_EXT_ZICNTR),
> > __RISCV_ISA_EXT_DATA(zicond, RISCV_ISA_EXT_ZICOND),
> > __RISCV_ISA_EXT_DATA(zicsr, RISCV_ISA_EXT_ZICSR),

2024-01-03 18:52:28

[permalink] [raw]

Subject: Re: [PATCH V2 1/3] riscv: Add Zicbop instruction definitions & cpufeature

On Sun, Dec 31, 2023 at 03:29:51AM -0500, [email protected] wrote:
> From: Guo Ren <[email protected]>
>
> Cache-block prefetch instructions are HINTs to the hardware to
> indicate that software intends to perform a particular type of
> memory access in the near future. This patch adds prefetch.i,
> prefetch.r and prefetch.w instruction definitions by
> RISCV_ISA_EXT_ZICBOP cpufeature.

Hi Guo Ren,

Here it would be nice to point a documentation for ZICBOP extension:
https://wiki.riscv.org/display/HOME/Recently+Ratified+Extensions

or having a nice link for:
https://drive.google.com/file/d/1jfzhNAk7viz4t2FLDZ5z4roA0LBggkfZ/view

>
> Signed-off-by: Guo Ren <[email protected]>
> Signed-off-by: Guo Ren <[email protected]>
> ---
> arch/riscv/Kconfig | 15 ++++++++
> arch/riscv/include/asm/hwcap.h | 1 +
> arch/riscv/include/asm/insn-def.h | 60 +++++++++++++++++++++++++++++++
> arch/riscv/kernel/cpufeature.c | 1 +
> 4 files changed, 77 insertions(+)
>
> diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
> index 24c1799e2ec4..fcbd417d65ea 100644
> --- a/arch/riscv/Kconfig
> +++ b/arch/riscv/Kconfig
> @@ -579,6 +579,21 @@ config RISCV_ISA_ZICBOZ
>
> If you don't know what to do here, say Y.
>
> +config RISCV_ISA_ZICBOP
> + bool "Zicbop extension support for cache block prefetch"
> + depends on MMU
> + depends on RISCV_ALTERNATIVE
> + default y
> + help
> + Adds support to dynamically detect the presence of the ZICBOP
> + extension (Cache Block Prefetch Operations) and enable its
> + usage.
> +
> + The Zicbop extension can be used to prefetch cache block for
> + read/write fetch.
> +
> + If you don't know what to do here, say Y.
> +

According to doc:
"The Zicbop extension defines a set of cache-block prefetch instructions:
PREFETCH.R, PREFETCH.W, and PREFETCH.I"

So above text seems ok

> config TOOLCHAIN_HAS_ZIHINTPAUSE
> bool
> default y
> diff --git a/arch/riscv/include/asm/hwcap.h b/arch/riscv/include/asm/hwcap.h
> index 06d30526ef3b..77d3b6ee25ab 100644
> --- a/arch/riscv/include/asm/hwcap.h
> +++ b/arch/riscv/include/asm/hwcap.h
> @@ -57,6 +57,7 @@
> #define RISCV_ISA_EXT_ZIHPM 42
> #define RISCV_ISA_EXT_SMSTATEEN 43
> #define RISCV_ISA_EXT_ZICOND 44
> +#define RISCV_ISA_EXT_ZICBOP 45

Is this number just in kernel code, or does it mean something in the RISC-V
documentation?

>
> #define RISCV_ISA_EXT_MAX 64
>
> diff --git a/arch/riscv/include/asm/insn-def.h b/arch/riscv/include/asm/insn-def.h
> index e27179b26086..bbda350a63bf 100644
> --- a/arch/riscv/include/asm/insn-def.h
> +++ b/arch/riscv/include/asm/insn-def.h
> @@ -18,6 +18,13 @@
> #define INSN_I_RD_SHIFT 7
> #define INSN_I_OPCODE_SHIFT 0
>
> +#define INSN_S_SIMM7_SHIFT 25
> +#define INSN_S_RS2_SHIFT 20
> +#define INSN_S_RS1_SHIFT 15
> +#define INSN_S_FUNC3_SHIFT 12
> +#define INSN_S_SIMM5_SHIFT 7
> +#define INSN_S_OPCODE_SHIFT 0
> +

The shifts seem correct for S-Type, but I would name the IMM defines in a
way we could understand where they fit in IMM:

INSN_S_SIMM5_SHIFT -> INSN_S_SIMM_0_4_SHIFT
INSN_S_SIMM7_SHIFT -> INSN_S_SIMM_5_11_SHIFT

What do you think?

> #ifdef __ASSEMBLY__
>
> #ifdef CONFIG_AS_HAS_INSN
> @@ -30,6 +37,10 @@
> .insn i \opcode, \func3, \rd, \rs1, \simm12
> .endm
>
> + .macro insn_s, opcode, func3, rs2, simm12, rs1
> + .insn s \opcode, \func3, \rs2, \simm12(\rs1)
> + .endm
> +
> #else
>
> #include <asm/gpr-num.h>
> @@ -51,10 +62,20 @@
> (\simm12 << INSN_I_SIMM12_SHIFT))
> .endm
>
> + .macro insn_s, opcode, func3, rs2, simm12, rs1
> + .4byte ((\opcode << INSN_S_OPCODE_SHIFT) | \
> + (\func3 << INSN_S_FUNC3_SHIFT) | \
> + (.L__gpr_num_\rs2 << INSN_S_RS2_SHIFT) | \
> + (.L__gpr_num_\rs1 << INSN_S_RS1_SHIFT) | \
> + ((\simm12 & 0x1f) << INSN_S_SIMM5_SHIFT) | \
> + (((\simm12 >> 5) & 0x7f) << INSN_S_SIMM7_SHIFT))
> + .endm
> +
> #endif
>
> #define __INSN_R(...) insn_r __VA_ARGS__
> #define __INSN_I(...) insn_i __VA_ARGS__
> +#define __INSN_S(...) insn_s __VA_ARGS__

As a curiosity: It's quite odd to have prefetch.{i,r,w} to be an S-Type
instruction, given this type was supposed to be for store instructions.

On prefetch.{i,r,w}:
31 24 19 14 11 6
imm[11:5] | PREFETCH_OP | rs1 | ORI | imm[4:0] | OP_IMM

For S-Type, we have:
31 24 19 14 11 6
imm[11:5] | rs1 | rs2 | funct3 | imm[4:0] | opcode

For I-Type, we have:
31 19 14 11 6
immm[11:0] | rs1 | funct3 | rd | opcode

I understand that there should be reasons for choosing S-type, but it
would make much more sense (as per instruction type, and as per parameters)
to go with I-Type.

(I understand this was done in HW, and in kernel code we have better choice
to encode it as S-Type, but I kind of find the S-Type choice odd)

>
> #else /* ! __ASSEMBLY__ */
>
> @@ -66,6 +87,9 @@
> #define __INSN_I(opcode, func3, rd, rs1, simm12) \
> ".insn i " opcode ", " func3 ", " rd ", " rs1 ", " simm12 "\n"
>
> +#define __INSN_S(opcode, func3, rs2, simm12, rs1) \
> + ".insn s " opcode ", " func3 ", " rs2 ", " simm12 "(" rs1 ")\n"
> +
> #else
>
> #include <linux/stringify.h>
> @@ -92,12 +116,26 @@
> " (\\simm12 << " __stringify(INSN_I_SIMM12_SHIFT) "))\n" \
> " .endm\n"
>
> +#define DEFINE_INSN_S \
> + __DEFINE_ASM_GPR_NUMS \
> +" .macro insn_s, opcode, func3, rs2, simm12, rs1\n" \
> +" .4byte ((\\opcode << " __stringify(INSN_S_OPCODE_SHIFT) ") |" \
> +" (\\func3 << " __stringify(INSN_S_FUNC3_SHIFT) ") |" \
> +" (.L__gpr_num_\\rs2 << " __stringify(INSN_S_RS2_SHIFT) ") |" \
> +" (.L__gpr_num_\\rs1 << " __stringify(INSN_S_RS1_SHIFT) ") |" \
> +" ((\\simm12 & 0x1f) << " __stringify(INSN_S_SIMM5_SHIFT) ") |" \
> +" (((\\simm12 >> 5) & 0x7f) << " __stringify(INSN_S_SIMM7_SHIFT) "))\n" \
> +" .endm\n"
> +
> #define UNDEFINE_INSN_R \
> " .purgem insn_r\n"
>
> #define UNDEFINE_INSN_I \
> " .purgem insn_i\n"
>
> +#define UNDEFINE_INSN_S \
> +" .purgem insn_s\n"
> +
> #define __INSN_R(opcode, func3, func7, rd, rs1, rs2) \
> DEFINE_INSN_R \
> "insn_r " opcode ", " func3 ", " func7 ", " rd ", " rs1 ", " rs2 "\n" \
> @@ -108,6 +146,11 @@
> "insn_i " opcode ", " func3 ", " rd ", " rs1 ", " simm12 "\n" \
> UNDEFINE_INSN_I
>
> +#define __INSN_S(opcode, func3, rs2, simm12, rs1) \
> + DEFINE_INSN_S \
> + "insn_s " opcode ", " func3 ", " rs2 ", " simm12 ", " rs1 "\n" \
> + UNDEFINE_INSN_S
> +
> #endif
>
> #endif /* ! __ASSEMBLY__ */
> @@ -120,6 +163,10 @@
> __INSN_I(RV_##opcode, RV_##func3, RV_##rd, \
> RV_##rs1, RV_##simm12)
>
> +#define INSN_S(opcode, func3, rs2, simm12, rs1) \
> + __INSN_S(RV_##opcode, RV_##func3, RV_##rs2, \
> + RV_##simm12, RV_##rs1)
> +

The defines above seem correct, but TBH I am not very used to review
.macro code.

> #define RV_OPCODE(v) __ASM_STR(v)
> #define RV_FUNC3(v) __ASM_STR(v)
> #define RV_FUNC7(v) __ASM_STR(v)
> @@ -133,6 +180,7 @@
> #define RV___RS2(v) __RV_REG(v)
>
> #define RV_OPCODE_MISC_MEM RV_OPCODE(15)
> +#define RV_OPCODE_OP_IMM RV_OPCODE(19)

Correct.

> #define RV_OPCODE_SYSTEM RV_OPCODE(115)
>
> #define HFENCE_VVMA(vaddr, asid) \
> @@ -196,4 +244,16 @@
> INSN_I(OPCODE_MISC_MEM, FUNC3(2), __RD(0), \
> RS1(base), SIMM12(4))
>
> +#define CBO_PREFETCH_I(base, offset) \
> + INSN_S(OPCODE_OP_IMM, FUNC3(6), __RS2(0), \
> + SIMM12(offset), RS1(base))
> +
> +#define CBO_PREFETCH_R(base, offset) \
> + INSN_S(OPCODE_OP_IMM, FUNC3(6), __RS2(1), \
> + SIMM12(offset), RS1(base))
> +
> +#define CBO_PREFETCH_W(base, offset) \
> + INSN_S(OPCODE_OP_IMM, FUNC3(6), __RS2(3), \
> + SIMM12(offset), RS1(base))
> +

For OP_IMM & FUNC3(6) we have ORI, right?
For ORI, rd will be at bytes 11:7, which in PREFETCH.{i,r,w} is
offset[4:0].

IIUC, when the cpu does not support ZICBOP, this should be fine as long as
rd = 0, since changes to r0 are disregarded.

In this case, we need to guarantee offset[4:0] = 0, or else we migth write
on an unrelated register. This can be noticed in ZICBOP documentation pages
21, 22, 23, as offset[4:0] is always [0 0 0 0 0].
(Google docs in first comment)

What we need here is something like:
+ enum {
+ PREFETCH_I,
+ PREFETCH_R,
+ PREFETCH_W,
+ }
+
+ #define CBO_PREFETCH(type, base, offset) \
+ INSN_S(OPCODE_OP_IMM, FUNC3(6), __RS2(type), \
+ SIMM12(offset & ~0x1f), RS1(base))

+ #define CBO_PREFETCH_I(base, offset) \
+ CBO_PREFETCH(PREFETCH_I, base, offset)
+
+ #define CBO_PREFETCH_R(base, offset) \
+ CBO_PREFETCH(PREFETCH_R, base, offset)
+
+ #define CBO_PREFETCH_W(base, offset) \
+ CBO_PREFETCH(PREFETCH_W, base, offset)
+

Maybe replacing 0x1f by some MASK macro, so it looks nicer.
(not sure how it's acceptable in asm, though).

The above would guarantee that we would never have CBO_PREFETCH_*() to mess
up any other register due to a unnoticed (base & 0x1f) != 0

Does that make sense?

> #endif /* __ASM_INSN_DEF_H */
> diff --git a/arch/riscv/kernel/cpufeature.c b/arch/riscv/kernel/cpufeature.c
> index b3785ffc1570..bdb02b066041 100644
> --- a/arch/riscv/kernel/cpufeature.c
> +++ b/arch/riscv/kernel/cpufeature.c
> @@ -168,6 +168,7 @@ const struct riscv_isa_ext_data riscv_isa_ext[] = {
> __RISCV_ISA_EXT_DATA(h, RISCV_ISA_EXT_h),
> __RISCV_ISA_EXT_DATA(zicbom, RISCV_ISA_EXT_ZICBOM),
> __RISCV_ISA_EXT_DATA(zicboz, RISCV_ISA_EXT_ZICBOZ),
> + __RISCV_ISA_EXT_DATA(zicbop, RISCV_ISA_EXT_ZICBOP),
> __RISCV_ISA_EXT_DATA(zicntr, RISCV_ISA_EXT_ZICNTR),
> __RISCV_ISA_EXT_DATA(zicond, RISCV_ISA_EXT_ZICOND),
> __RISCV_ISA_EXT_DATA(zicsr, RISCV_ISA_EXT_ZICSR),
> --
> 2.40.1
>

Apart from above suggestions, seems a nice change :)

I suggest splitting this patch into 2, though:
- Introducing S-Type instructions (plz point docs for reference)
- Introduce ZICBOP extension.

Thanks!
Leo

2024-01-03 19:05:27

[permalink] [raw]

Subject: Re: [PATCH V2 2/3] riscv: Add ARCH_HAS_PRETCHW support with Zibop

On Mon, Jan 01, 2024 at 10:29:21AM +0800, Guo Ren wrote:
> On Sun, Dec 31, 2023 at 4:30 PM <[email protected]> wrote:
> >
> > From: Guo Ren <[email protected]>
> >
> > Enable Linux prefetchw primitive with Zibop cpufeature, which preloads
> > cache line into L1 cache for the next write operation.
> >
> > Signed-off-by: Guo Ren <[email protected]>
> > Signed-off-by: Guo Ren <[email protected]>
> > ---
> > arch/riscv/include/asm/processor.h | 16 ++++++++++++++++
> > 1 file changed, 16 insertions(+)
> >
> > diff --git a/arch/riscv/include/asm/processor.h b/arch/riscv/include/asm/processor.h
> > index f19f861cda54..8d3a2ab37678 100644
> > --- a/arch/riscv/include/asm/processor.h
> > +++ b/arch/riscv/include/asm/processor.h
> > @@ -13,6 +13,9 @@
> > #include <vdso/processor.h>
> >
> > #include <asm/ptrace.h>
> > +#include <asm/insn-def.h>
> > +#include <asm/alternative-macros.h>
> > +#include <asm/hwcap.h>
> >
> > #ifdef CONFIG_64BIT
> > #define DEFAULT_MAP_WINDOW (UL(1) << (MMAP_VA_BITS - 1))
> > @@ -106,6 +109,19 @@ static inline void arch_thread_struct_whitelist(unsigned long *offset,
> > #define KSTK_EIP(tsk) (task_pt_regs(tsk)->epc)
> > #define KSTK_ESP(tsk) (task_pt_regs(tsk)->sp)
> >
> > +#ifdef CONFIG_RISCV_ISA_ZICBOP
> > +#define ARCH_HAS_PREFETCHW
> > +
> > +#define PREFETCHW_ASM(x) \
> > + ALTERNATIVE(__nops(1), CBO_PREFETCH_W(x, 0), 0, \
> > + RISCV_ISA_EXT_ZICBOP, CONFIG_RISCV_ISA_ZICBOP)
> The PREFETCHW_ASM(x) definition should be out of "ifdef
> CONFIG_RISCV_ISA_ZICBOP... #endif", because xchg_small may use this
> macro without CONFIG_RISCV_ISA_ZICBOP.
>

Agree :)

> > +
> > +
> > +static inline void prefetchw(const void *x)
> > +{
> > + __asm__ __volatile__(PREFETCHW_ASM(%0) : : "r" (x) : "memory");
> > +}
> > +#endif /* CONFIG_RISCV_ISA_ZICBOP */
> >
> > /* Do necessary setup to start up a newly executed thread. */
> > extern void start_thread(struct pt_regs *regs,
> > --
> > 2.40.1
> >
>
>
> --
> Best Regards
> Guo Ren
>

2024-01-03 19:07:15

[permalink] [raw]

Subject: Re: [PATCH V2 1/3] riscv: Add Zicbop instruction definitions & cpufeature

On Tue, Jan 02, 2024 at 11:32:44AM +0100, Andrew Jones wrote:
> On Sun, Dec 31, 2023 at 03:29:51AM -0500, [email protected] wrote:
> > From: Guo Ren <[email protected]>
> >
> > Cache-block prefetch instructions are HINTs to the hardware to
> > indicate that software intends to perform a particular type of
> > memory access in the near future. This patch adds prefetch.i,
> > prefetch.r and prefetch.w instruction definitions by
> > RISCV_ISA_EXT_ZICBOP cpufeature.
>
> It also adds S-type instruction encoding support which isn't mentioned.
> Actually, it'd probably be best to put the new instruction encoding in
> its own separate patch.
>
> >
> > Signed-off-by: Guo Ren <[email protected]>
> > Signed-off-by: Guo Ren <[email protected]>
> > ---
> > arch/riscv/Kconfig | 15 ++++++++
> > arch/riscv/include/asm/hwcap.h | 1 +
> > arch/riscv/include/asm/insn-def.h | 60 +++++++++++++++++++++++++++++++
> > arch/riscv/kernel/cpufeature.c | 1 +
> > 4 files changed, 77 insertions(+)
> >
> > diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
> > index 24c1799e2ec4..fcbd417d65ea 100644
> > --- a/arch/riscv/Kconfig
> > +++ b/arch/riscv/Kconfig
> > @@ -579,6 +579,21 @@ config RISCV_ISA_ZICBOZ
> >
> > If you don't know what to do here, say Y.
> >
> > +config RISCV_ISA_ZICBOP
> > + bool "Zicbop extension support for cache block prefetch"
> > + depends on MMU
> > + depends on RISCV_ALTERNATIVE
> > + default y
> > + help
> > + Adds support to dynamically detect the presence of the ZICBOP
> > + extension (Cache Block Prefetch Operations) and enable its
> > + usage.
> > +
> > + The Zicbop extension can be used to prefetch cache block for
>
> blocks
>
> > + read/write fetch.
> > +
> > + If you don't know what to do here, say Y.
> > +
> > config TOOLCHAIN_HAS_ZIHINTPAUSE
> > bool
> > default y
> > diff --git a/arch/riscv/include/asm/hwcap.h b/arch/riscv/include/asm/hwcap.h
> > index 06d30526ef3b..77d3b6ee25ab 100644
> > --- a/arch/riscv/include/asm/hwcap.h
> > +++ b/arch/riscv/include/asm/hwcap.h
> > @@ -57,6 +57,7 @@
> > #define RISCV_ISA_EXT_ZIHPM 42
> > #define RISCV_ISA_EXT_SMSTATEEN 43
> > #define RISCV_ISA_EXT_ZICOND 44
> > +#define RISCV_ISA_EXT_ZICBOP 45
> >
> > #define RISCV_ISA_EXT_MAX 64
> >
> > diff --git a/arch/riscv/include/asm/insn-def.h b/arch/riscv/include/asm/insn-def.h
> > index e27179b26086..bbda350a63bf 100644
> > --- a/arch/riscv/include/asm/insn-def.h
> > +++ b/arch/riscv/include/asm/insn-def.h
> > @@ -18,6 +18,13 @@
> > #define INSN_I_RD_SHIFT 7
> > #define INSN_I_OPCODE_SHIFT 0
> >
> > +#define INSN_S_SIMM7_SHIFT 25
> > +#define INSN_S_RS2_SHIFT 20
> > +#define INSN_S_RS1_SHIFT 15
> > +#define INSN_S_FUNC3_SHIFT 12
> > +#define INSN_S_SIMM5_SHIFT 7
> > +#define INSN_S_OPCODE_SHIFT 0
> > +
> > #ifdef __ASSEMBLY__
> >
> > #ifdef CONFIG_AS_HAS_INSN
> > @@ -30,6 +37,10 @@
> > .insn i \opcode, \func3, \rd, \rs1, \simm12
> > .endm
> >
> > + .macro insn_s, opcode, func3, rs2, simm12, rs1
> > + .insn s \opcode, \func3, \rs2, \simm12(\rs1)
> > + .endm
> > +
> > #else
> >
> > #include <asm/gpr-num.h>
> > @@ -51,10 +62,20 @@
> > (\simm12 << INSN_I_SIMM12_SHIFT))
> > .endm
> >
> > + .macro insn_s, opcode, func3, rs2, simm12, rs1
> > + .4byte ((\opcode << INSN_S_OPCODE_SHIFT) | \
> > + (\func3 << INSN_S_FUNC3_SHIFT) | \
> > + (.L__gpr_num_\rs2 << INSN_S_RS2_SHIFT) | \
> > + (.L__gpr_num_\rs1 << INSN_S_RS1_SHIFT) | \
> > + ((\simm12 & 0x1f) << INSN_S_SIMM5_SHIFT) | \
> > + (((\simm12 >> 5) & 0x7f) << INSN_S_SIMM7_SHIFT))
> > + .endm
> > +
> > #endif
> >
> > #define __INSN_R(...) insn_r __VA_ARGS__
> > #define __INSN_I(...) insn_i __VA_ARGS__
> > +#define __INSN_S(...) insn_s __VA_ARGS__
> >
> > #else /* ! __ASSEMBLY__ */
> >
> > @@ -66,6 +87,9 @@
> > #define __INSN_I(opcode, func3, rd, rs1, simm12) \
> > ".insn i " opcode ", " func3 ", " rd ", " rs1 ", " simm12 "\n"
> >
> > +#define __INSN_S(opcode, func3, rs2, simm12, rs1) \
> > + ".insn s " opcode ", " func3 ", " rs2 ", " simm12 "(" rs1 ")\n"
> > +
> > #else
> >
> > #include <linux/stringify.h>
> > @@ -92,12 +116,26 @@
> > " (\\simm12 << " __stringify(INSN_I_SIMM12_SHIFT) "))\n" \
> > " .endm\n"
> >
> > +#define DEFINE_INSN_S \
> > + __DEFINE_ASM_GPR_NUMS \
> > +" .macro insn_s, opcode, func3, rs2, simm12, rs1\n" \
> > +" .4byte ((\\opcode << " __stringify(INSN_S_OPCODE_SHIFT) ") |" \
> > +" (\\func3 << " __stringify(INSN_S_FUNC3_SHIFT) ") |" \
> > +" (.L__gpr_num_\\rs2 << " __stringify(INSN_S_RS2_SHIFT) ") |" \
> > +" (.L__gpr_num_\\rs1 << " __stringify(INSN_S_RS1_SHIFT) ") |" \
> > +" ((\\simm12 & 0x1f) << " __stringify(INSN_S_SIMM5_SHIFT) ") |" \
> > +" (((\\simm12 >> 5) & 0x7f) << " __stringify(INSN_S_SIMM7_SHIFT) "))\n" \
> > +" .endm\n"
> > +
> > #define UNDEFINE_INSN_R \
> > " .purgem insn_r\n"
> >
> > #define UNDEFINE_INSN_I \
> > " .purgem insn_i\n"
> >
> > +#define UNDEFINE_INSN_S \
> > +" .purgem insn_s\n"
> > +
> > #define __INSN_R(opcode, func3, func7, rd, rs1, rs2) \
> > DEFINE_INSN_R \
> > "insn_r " opcode ", " func3 ", " func7 ", " rd ", " rs1 ", " rs2 "\n" \
> > @@ -108,6 +146,11 @@
> > "insn_i " opcode ", " func3 ", " rd ", " rs1 ", " simm12 "\n" \
> > UNDEFINE_INSN_I
> >
> > +#define __INSN_S(opcode, func3, rs2, simm12, rs1) \
> > + DEFINE_INSN_S \
> > + "insn_s " opcode ", " func3 ", " rs2 ", " simm12 ", " rs1 "\n" \
> > + UNDEFINE_INSN_S
> > +
> > #endif
> >
> > #endif /* ! __ASSEMBLY__ */
> > @@ -120,6 +163,10 @@
> > __INSN_I(RV_##opcode, RV_##func3, RV_##rd, \
> > RV_##rs1, RV_##simm12)
> >
> > +#define INSN_S(opcode, func3, rs2, simm12, rs1) \
> > + __INSN_S(RV_##opcode, RV_##func3, RV_##rs2, \
> > + RV_##simm12, RV_##rs1)
> > +
> > #define RV_OPCODE(v) __ASM_STR(v)
> > #define RV_FUNC3(v) __ASM_STR(v)
> > #define RV_FUNC7(v) __ASM_STR(v)
> > @@ -133,6 +180,7 @@
> > #define RV___RS2(v) __RV_REG(v)
> >
> > #define RV_OPCODE_MISC_MEM RV_OPCODE(15)
> > +#define RV_OPCODE_OP_IMM RV_OPCODE(19)
> > #define RV_OPCODE_SYSTEM RV_OPCODE(115)
> >
> > #define HFENCE_VVMA(vaddr, asid) \
> > @@ -196,4 +244,16 @@
> > INSN_I(OPCODE_MISC_MEM, FUNC3(2), __RD(0), \
> > RS1(base), SIMM12(4))
> >
> > +#define CBO_PREFETCH_I(base, offset) \
> > + INSN_S(OPCODE_OP_IMM, FUNC3(6), __RS2(0), \
> > + SIMM12(offset), RS1(base))
> > +
> > +#define CBO_PREFETCH_R(base, offset) \
> > + INSN_S(OPCODE_OP_IMM, FUNC3(6), __RS2(1), \
> > + SIMM12(offset), RS1(base))
> > +
> > +#define CBO_PREFETCH_W(base, offset) \
> > + INSN_S(OPCODE_OP_IMM, FUNC3(6), __RS2(3), \
> > + SIMM12(offset), RS1(base))
>
> Shouldn't we ensure the lower 5-bits of offset are zero by masking it?

Note for my future self, read other reviews before doing my own :)

>
> > +
> > #endif /* __ASM_INSN_DEF_H */
> > diff --git a/arch/riscv/kernel/cpufeature.c b/arch/riscv/kernel/cpufeature.c
> > index b3785ffc1570..bdb02b066041 100644
> > --- a/arch/riscv/kernel/cpufeature.c
> > +++ b/arch/riscv/kernel/cpufeature.c
> > @@ -168,6 +168,7 @@ const struct riscv_isa_ext_data riscv_isa_ext[] = {
> > __RISCV_ISA_EXT_DATA(h, RISCV_ISA_EXT_h),
> > __RISCV_ISA_EXT_DATA(zicbom, RISCV_ISA_EXT_ZICBOM),
> > __RISCV_ISA_EXT_DATA(zicboz, RISCV_ISA_EXT_ZICBOZ),
> > + __RISCV_ISA_EXT_DATA(zicbop, RISCV_ISA_EXT_ZICBOP),
>
> zicbop should be above zicboz (alphabetical)
>
> > __RISCV_ISA_EXT_DATA(zicntr, RISCV_ISA_EXT_ZICNTR),
> > __RISCV_ISA_EXT_DATA(zicond, RISCV_ISA_EXT_ZICOND),
> > __RISCV_ISA_EXT_DATA(zicsr, RISCV_ISA_EXT_ZICSR),
> > --
> > 2.40.1
> >
>
> Thanks,
> drew
>

2024-01-03 19:30:34

[permalink] [raw]

Subject: Re: Re: [PATCH V2 1/3] riscv: Add Zicbop instruction definitions & cpufeature

On Wed, Jan 03, 2024 at 03:52:00PM -0300, Leonardo Bras wrote:
> On Sun, Dec 31, 2023 at 03:29:51AM -0500, [email protected] wrote:
> > From: Guo Ren <[email protected]>
> >
> > Cache-block prefetch instructions are HINTs to the hardware to
> > indicate that software intends to perform a particular type of
> > memory access in the near future. This patch adds prefetch.i,
> > prefetch.r and prefetch.w instruction definitions by
> > RISCV_ISA_EXT_ZICBOP cpufeature.
>
> Hi Guo Ren,
>
> Here it would be nice to point a documentation for ZICBOP extension:
> https://wiki.riscv.org/display/HOME/Recently+Ratified+Extensions
>
> or having a nice link for:
> https://drive.google.com/file/d/1jfzhNAk7viz4t2FLDZ5z4roA0LBggkfZ/view
>
> >
> > Signed-off-by: Guo Ren <[email protected]>
> > Signed-off-by: Guo Ren <[email protected]>
> > ---
> > arch/riscv/Kconfig | 15 ++++++++
> > arch/riscv/include/asm/hwcap.h | 1 +
> > arch/riscv/include/asm/insn-def.h | 60 +++++++++++++++++++++++++++++++
> > arch/riscv/kernel/cpufeature.c | 1 +
> > 4 files changed, 77 insertions(+)
> >
> > diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
> > index 24c1799e2ec4..fcbd417d65ea 100644
> > --- a/arch/riscv/Kconfig
> > +++ b/arch/riscv/Kconfig
> > @@ -579,6 +579,21 @@ config RISCV_ISA_ZICBOZ
> >
> > If you don't know what to do here, say Y.
> >
> > +config RISCV_ISA_ZICBOP
> > + bool "Zicbop extension support for cache block prefetch"
> > + depends on MMU
> > + depends on RISCV_ALTERNATIVE
> > + default y
> > + help
> > + Adds support to dynamically detect the presence of the ZICBOP
> > + extension (Cache Block Prefetch Operations) and enable its
> > + usage.
> > +
> > + The Zicbop extension can be used to prefetch cache block for
> > + read/write fetch.
> > +
> > + If you don't know what to do here, say Y.
> > +
>
> According to doc:
> "The Zicbop extension defines a set of cache-block prefetch instructions:
> PREFETCH.R, PREFETCH.W, and PREFETCH.I"
>
> So above text seems ok
>
>
> > config TOOLCHAIN_HAS_ZIHINTPAUSE
> > bool
> > default y
> > diff --git a/arch/riscv/include/asm/hwcap.h b/arch/riscv/include/asm/hwcap.h
> > index 06d30526ef3b..77d3b6ee25ab 100644
> > --- a/arch/riscv/include/asm/hwcap.h
> > +++ b/arch/riscv/include/asm/hwcap.h
> > @@ -57,6 +57,7 @@
> > #define RISCV_ISA_EXT_ZIHPM 42
> > #define RISCV_ISA_EXT_SMSTATEEN 43
> > #define RISCV_ISA_EXT_ZICOND 44
> > +#define RISCV_ISA_EXT_ZICBOP 45
>
> Is this number just in kernel code, or does it mean something in the RISC-V
> documentation?

kernel

>
> >
> > #define RISCV_ISA_EXT_MAX 64
> >
> > diff --git a/arch/riscv/include/asm/insn-def.h b/arch/riscv/include/asm/insn-def.h
> > index e27179b26086..bbda350a63bf 100644
> > --- a/arch/riscv/include/asm/insn-def.h
> > +++ b/arch/riscv/include/asm/insn-def.h
> > @@ -18,6 +18,13 @@
> > #define INSN_I_RD_SHIFT 7
> > #define INSN_I_OPCODE_SHIFT 0
> >
> > +#define INSN_S_SIMM7_SHIFT 25
> > +#define INSN_S_RS2_SHIFT 20
> > +#define INSN_S_RS1_SHIFT 15
> > +#define INSN_S_FUNC3_SHIFT 12
> > +#define INSN_S_SIMM5_SHIFT 7
> > +#define INSN_S_OPCODE_SHIFT 0
> > +
>
> The shifts seem correct for S-Type, but I would name the IMM defines in a
> way we could understand where they fit in IMM:
>
>
> INSN_S_SIMM5_SHIFT -> INSN_S_SIMM_0_4_SHIFT
> INSN_S_SIMM7_SHIFT -> INSN_S_SIMM_5_11_SHIFT
>
> What do you think?

I'm in favor of this suggestion, but then wonder if we don't need another
patch before this which renames INSN_I_SIMM12_SHIFT to
INSN_I_SIMM_0_11_SHIFT in order to keep things consistent.

>
>
> > #ifdef __ASSEMBLY__
> >
> > #ifdef CONFIG_AS_HAS_INSN
> > @@ -30,6 +37,10 @@
> > .insn i \opcode, \func3, \rd, \rs1, \simm12
> > .endm
> >
> > + .macro insn_s, opcode, func3, rs2, simm12, rs1
> > + .insn s \opcode, \func3, \rs2, \simm12(\rs1)
> > + .endm
> > +
> > #else
> >
> > #include <asm/gpr-num.h>
> > @@ -51,10 +62,20 @@
> > (\simm12 << INSN_I_SIMM12_SHIFT))
> > .endm
> >
> > + .macro insn_s, opcode, func3, rs2, simm12, rs1
> > + .4byte ((\opcode << INSN_S_OPCODE_SHIFT) | \
> > + (\func3 << INSN_S_FUNC3_SHIFT) | \
> > + (.L__gpr_num_\rs2 << INSN_S_RS2_SHIFT) | \
> > + (.L__gpr_num_\rs1 << INSN_S_RS1_SHIFT) | \
> > + ((\simm12 & 0x1f) << INSN_S_SIMM5_SHIFT) | \
> > + (((\simm12 >> 5) & 0x7f) << INSN_S_SIMM7_SHIFT))
> > + .endm
> > +
> > #endif
> >
> > #define __INSN_R(...) insn_r __VA_ARGS__
> > #define __INSN_I(...) insn_i __VA_ARGS__
> > +#define __INSN_S(...) insn_s __VA_ARGS__
>
> As a curiosity: It's quite odd to have prefetch.{i,r,w} to be an S-Type
> instruction, given this type was supposed to be for store instructions.
>
> On prefetch.{i,r,w}:
> 31 24 19 14 11 6
> imm[11:5] | PREFETCH_OP | rs1 | ORI | imm[4:0] | OP_IMM
>
> For S-Type, we have:
> 31 24 19 14 11 6
> imm[11:5] | rs1 | rs2 | funct3 | imm[4:0] | opcode
>
> For I-Type, we have:
> 31 19 14 11 6
> immm[11:0] | rs1 | funct3 | rd | opcode
>
> I understand that there should be reasons for choosing S-type, but it
> would make much more sense (as per instruction type, and as per parameters)
> to go with I-Type.
>
> (I understand this was done in HW, and in kernel code we have better choice
> to encode it as S-Type, but I kind of find the S-Type choice odd)

My speculation is that since cache block sizes will never be less than 32
bytes, it made more sense to use the S-type encoding space with imm[4:0]
hard coded to zero, allowing the I-Type encoding space to be reserved for
instructions which need arbitrary 12-bit immediates.

>
> >
> > #else /* ! __ASSEMBLY__ */
> >
> > @@ -66,6 +87,9 @@
> > #define __INSN_I(opcode, func3, rd, rs1, simm12) \
> > ".insn i " opcode ", " func3 ", " rd ", " rs1 ", " simm12 "\n"
> >
> > +#define __INSN_S(opcode, func3, rs2, simm12, rs1) \
> > + ".insn s " opcode ", " func3 ", " rs2 ", " simm12 "(" rs1 ")\n"
> > +
> > #else
> >
> > #include <linux/stringify.h>
> > @@ -92,12 +116,26 @@
> > " (\\simm12 << " __stringify(INSN_I_SIMM12_SHIFT) "))\n" \
> > " .endm\n"
> >
> > +#define DEFINE_INSN_S \
> > + __DEFINE_ASM_GPR_NUMS \
> > +" .macro insn_s, opcode, func3, rs2, simm12, rs1\n" \
> > +" .4byte ((\\opcode << " __stringify(INSN_S_OPCODE_SHIFT) ") |" \
> > +" (\\func3 << " __stringify(INSN_S_FUNC3_SHIFT) ") |" \
> > +" (.L__gpr_num_\\rs2 << " __stringify(INSN_S_RS2_SHIFT) ") |" \
> > +" (.L__gpr_num_\\rs1 << " __stringify(INSN_S_RS1_SHIFT) ") |" \
> > +" ((\\simm12 & 0x1f) << " __stringify(INSN_S_SIMM5_SHIFT) ") |" \
> > +" (((\\simm12 >> 5) & 0x7f) << " __stringify(INSN_S_SIMM7_SHIFT) "))\n" \
> > +" .endm\n"
> > +
> > #define UNDEFINE_INSN_R \
> > " .purgem insn_r\n"
> >
> > #define UNDEFINE_INSN_I \
> > " .purgem insn_i\n"
> >
> > +#define UNDEFINE_INSN_S \
> > +" .purgem insn_s\n"
> > +
> > #define __INSN_R(opcode, func3, func7, rd, rs1, rs2) \
> > DEFINE_INSN_R \
> > "insn_r " opcode ", " func3 ", " func7 ", " rd ", " rs1 ", " rs2 "\n" \
> > @@ -108,6 +146,11 @@
> > "insn_i " opcode ", " func3 ", " rd ", " rs1 ", " simm12 "\n" \
> > UNDEFINE_INSN_I
> >
> > +#define __INSN_S(opcode, func3, rs2, simm12, rs1) \
> > + DEFINE_INSN_S \
> > + "insn_s " opcode ", " func3 ", " rs2 ", " simm12 ", " rs1 "\n" \
> > + UNDEFINE_INSN_S
> > +
> > #endif
> >
> > #endif /* ! __ASSEMBLY__ */
> > @@ -120,6 +163,10 @@
> > __INSN_I(RV_##opcode, RV_##func3, RV_##rd, \
> > RV_##rs1, RV_##simm12)
> >
> > +#define INSN_S(opcode, func3, rs2, simm12, rs1) \
> > + __INSN_S(RV_##opcode, RV_##func3, RV_##rs2, \
> > + RV_##simm12, RV_##rs1)
> > +
>
> The defines above seem correct, but TBH I am not very used to review
> .macro code.
>
> > #define RV_OPCODE(v) __ASM_STR(v)
> > #define RV_FUNC3(v) __ASM_STR(v)
> > #define RV_FUNC7(v) __ASM_STR(v)
> > @@ -133,6 +180,7 @@
> > #define RV___RS2(v) __RV_REG(v)
> >
> > #define RV_OPCODE_MISC_MEM RV_OPCODE(15)
> > +#define RV_OPCODE_OP_IMM RV_OPCODE(19)
>
> Correct.
>
>
> > #define RV_OPCODE_SYSTEM RV_OPCODE(115)
> >
> > #define HFENCE_VVMA(vaddr, asid) \
> > @@ -196,4 +244,16 @@
> > INSN_I(OPCODE_MISC_MEM, FUNC3(2), __RD(0), \
> > RS1(base), SIMM12(4))
> >
> > +#define CBO_PREFETCH_I(base, offset) \
> > + INSN_S(OPCODE_OP_IMM, FUNC3(6), __RS2(0), \
> > + SIMM12(offset), RS1(base))
> > +
> > +#define CBO_PREFETCH_R(base, offset) \
> > + INSN_S(OPCODE_OP_IMM, FUNC3(6), __RS2(1), \
> > + SIMM12(offset), RS1(base))
> > +
> > +#define CBO_PREFETCH_W(base, offset) \
> > + INSN_S(OPCODE_OP_IMM, FUNC3(6), __RS2(3), \
> > + SIMM12(offset), RS1(base))
> > +
>
> For OP_IMM & FUNC3(6) we have ORI, right?
> For ORI, rd will be at bytes 11:7, which in PREFETCH.{i,r,w} is
> offset[4:0].
>
> IIUC, when the cpu does not support ZICBOP, this should be fine as long as
> rd = 0, since changes to r0 are disregarded.
>
> In this case, we need to guarantee offset[4:0] = 0, or else we migth write
> on an unrelated register. This can be noticed in ZICBOP documentation pages
> 21, 22, 23, as offset[4:0] is always [0 0 0 0 0].
> (Google docs in first comment)
>
> What we need here is something like:
> + enum {
> + PREFETCH_I,
> + PREFETCH_R,
> + PREFETCH_W,
> + }

Can't use enum. This header may be included in assembly.

> +
> + #define CBO_PREFETCH(type, base, offset) \
> + INSN_S(OPCODE_OP_IMM, FUNC3(6), __RS2(type), \
> + SIMM12(offset & ~0x1f), RS1(base))

Yes. I suggested we mask offset as well, but ideally we'd detect a caller
using an offset with nonzero lower 5 bits at compile time.

Thanks,
drew

>
> + #define CBO_PREFETCH_I(base, offset) \
> + CBO_PREFETCH(PREFETCH_I, base, offset)
> +
> + #define CBO_PREFETCH_R(base, offset) \
> + CBO_PREFETCH(PREFETCH_R, base, offset)
> +
> + #define CBO_PREFETCH_W(base, offset) \
> + CBO_PREFETCH(PREFETCH_W, base, offset)
> +
>
> Maybe replacing 0x1f by some MASK macro, so it looks nicer.
> (not sure how it's acceptable in asm, though).
>
> The above would guarantee that we would never have CBO_PREFETCH_*() to mess
> up any other register due to a unnoticed (base & 0x1f) != 0
>
> Does that make sense?
>
> > #endif /* __ASM_INSN_DEF_H */
> > diff --git a/arch/riscv/kernel/cpufeature.c b/arch/riscv/kernel/cpufeature.c
> > index b3785ffc1570..bdb02b066041 100644
> > --- a/arch/riscv/kernel/cpufeature.c
> > +++ b/arch/riscv/kernel/cpufeature.c
> > @@ -168,6 +168,7 @@ const struct riscv_isa_ext_data riscv_isa_ext[] = {
> > __RISCV_ISA_EXT_DATA(h, RISCV_ISA_EXT_h),
> > __RISCV_ISA_EXT_DATA(zicbom, RISCV_ISA_EXT_ZICBOM),
> > __RISCV_ISA_EXT_DATA(zicboz, RISCV_ISA_EXT_ZICBOZ),
> > + __RISCV_ISA_EXT_DATA(zicbop, RISCV_ISA_EXT_ZICBOP),
> > __RISCV_ISA_EXT_DATA(zicntr, RISCV_ISA_EXT_ZICNTR),
> > __RISCV_ISA_EXT_DATA(zicond, RISCV_ISA_EXT_ZICOND),
> > __RISCV_ISA_EXT_DATA(zicsr, RISCV_ISA_EXT_ZICSR),
> > --
> > 2.40.1
> >
>
> Apart from above suggestions, seems a nice change :)
>
> I suggest splitting this patch into 2, though:
> - Introducing S-Type instructions (plz point docs for reference)
> - Introduce ZICBOP extension.
>
> Thanks!
> Leo
>
>

2024-01-03 19:45:08

[permalink] [raw]

Subject: Re: Re: Re: [PATCH V2 1/3] riscv: Add Zicbop instruction definitions & cpufeature

On Wed, Jan 03, 2024 at 07:49:44AM +0100, Andrew Jones wrote:
> On Wed, Jan 03, 2024 at 02:13:00PM +0800, Guo Ren wrote:
> > On Tue, Jan 2, 2024 at 6:32 PM Andrew Jones <[email protected]> wrote:
> > >
> > > On Sun, Dec 31, 2023 at 03:29:51AM -0500, [email protected] wrote:
...
> > > > #define HFENCE_VVMA(vaddr, asid) \
> > > > @@ -196,4 +244,16 @@
> > > > INSN_I(OPCODE_MISC_MEM, FUNC3(2), __RD(0), \
> > > > RS1(base), SIMM12(4))
> > > >
> > > > +#define CBO_PREFETCH_I(base, offset) \
> > > > + INSN_S(OPCODE_OP_IMM, FUNC3(6), __RS2(0), \
> > > > + SIMM12(offset), RS1(base))
> > > > +
> > > > +#define CBO_PREFETCH_R(base, offset) \
> > > > + INSN_S(OPCODE_OP_IMM, FUNC3(6), __RS2(1), \
> > > > + SIMM12(offset), RS1(base))
> > > > +
> > > > +#define CBO_PREFETCH_W(base, offset) \
> > > > + INSN_S(OPCODE_OP_IMM, FUNC3(6), __RS2(3), \
> > > > + SIMM12(offset), RS1(base))
> > >
> > > Shouldn't we ensure the lower 5-bits of offset are zero by masking it?
> > The spec says:
> > "These instructions operate on the cache block whose effective address
> > is the sum of the base address specified in rs1 and the sign-extended
> > offset encoded in imm[11:0], where imm[4:0] shall equal 0b00000. The
> > effective address is translated into a corresponding physical address
> > by the appropriate translation mechanisms."
> >
> > So, the user of prefetch.w should keep imm[4:0] zero.
>
> Yes, the user _should_ keep imm[4:0] zero. Unless we can validate at
> compile time that all users have passed offsets with the lower 5-bits
> set to zero, then I think we should mask them here, since I'd rather
> not provide the user a footgun.
>
> > Just like the
> > patch has done, the whole imm[11:0] is zero.
>
> That's just one possible use, and I think exposing the offset operand to
> users makes sense for unrolled sequences of invocations, so I wouldn't
> count on offset always being zero.
>

Another thought on this line is that a base which isn't block size aligned
may not "work". The spec says

"""
...instruction indicates to hardware that the cache block whose effective
address is the sum of the base address specified in rs1 and the
sign-extended offset encoded in imm[11:0], where imm[4:0] equals
0b00000, is likely to be accessed...
"""

which implies we need an effective address which maps to a cache block.
However, unlike having a nonzero imm[4:0], I don't fear a problem with the
instruction if 'base' isn't block sized aligned, but the instruction might
not do anything.

I think we need to add DT parsing of riscv,cbop-block-size and then
use it to mask the base address in the callers of these macros. (But
that doesn't mean I don't think we still need to mask offset here.)

Thanks,
drew

2024-01-03 19:46:08

[permalink] [raw]

Subject: Re: [PATCH V2 3/3] riscv: xchg: Prefetch the destination word for sc.w

On Wed, Jan 03, 2024 at 02:15:45PM +0800, Guo Ren wrote:
> On Tue, Jan 2, 2024 at 7:19 PM Andrew Jones <[email protected]> wrote:
> >
> > On Sun, Dec 31, 2023 at 03:29:53AM -0500, [email protected] wrote:
> > > From: Guo Ren <[email protected]>
> > >
> > > The cost of changing a cacheline from shared to exclusive state can be
> > > significant, especially when this is triggered by an exclusive store,
> > > since it may result in having to retry the transaction.
> > >
> > > This patch makes use of prefetch.w to prefetch cachelines for write
> > > prior to lr/sc loops when using the xchg_small atomic routine.
> > >
> > > This patch is inspired by commit: 0ea366f5e1b6 ("arm64: atomics:
> > > prefetch the destination word for write prior to stxr").
> > >
> > > Signed-off-by: Guo Ren <[email protected]>
> > > Signed-off-by: Guo Ren <[email protected]>
> > > ---
> > > arch/riscv/include/asm/cmpxchg.h | 4 +++-
> > > 1 file changed, 3 insertions(+), 1 deletion(-)
> > >
> > > diff --git a/arch/riscv/include/asm/cmpxchg.h b/arch/riscv/include/asm/cmpxchg.h
> > > index 26cea2395aae..d7b9d7951f08 100644
> > > --- a/arch/riscv/include/asm/cmpxchg.h
> > > +++ b/arch/riscv/include/asm/cmpxchg.h
> > > @@ -10,6 +10,7 @@
> > >
> > > #include <asm/barrier.h>
> > > #include <asm/fence.h>
> > > +#include <asm/processor.h>
> > >
> > > #define __arch_xchg_masked(prepend, append, r, p, n) \
> >
> > Are you sure this is based on v6.7-rc7? Because I don't see this macro.
> Oh, it is based on Leobras' patches. I would remove it in the next of version.

I would say this next :)

>
> >
> > > ({ \
> > > @@ -23,6 +24,7 @@
> > > \
> > > __asm__ __volatile__ ( \
> > > prepend \
> > > + PREFETCHW_ASM(%5) \
> > > "0: lr.w %0, %2\n" \
> > > " and %1, %0, %z4\n" \
> > > " or %1, %1, %z3\n" \
> > > @@ -30,7 +32,7 @@
> > > " bnez %1, 0b\n" \
> > > append \
> > > : "=&r" (__retx), "=&r" (__rc), "+A" (*(__ptr32b)) \
> > > - : "rJ" (__newx), "rJ" (~__mask) \
> > > + : "rJ" (__newx), "rJ" (~__mask), "rJ" (__ptr32b) \
> >
> > I'm pretty sure we don't want to allow the J constraint for __ptr32b.
> >
> > > : "memory"); \
> > > \
> > > r = (__typeof__(*(p)))((__retx & __mask) >> __s); \
> > > --
> > > 2.40.1
> > >
> >
> > Thanks,
> > drew
>
>
>
> --
> Best Regards
> Guo Ren
>

Nice patch :)
Any reason it's not needed in __arch_cmpxchg_masked(), and __arch_cmpxchg() ?

Thanks!
Leo

2024-01-03 19:48:23

[permalink] [raw]

Subject: Re: [PATCH V2 1/3] riscv: Add Zicbop instruction definitions & cpufeature

On Sun, Dec 31, 2023 at 03:29:51AM -0500, [email protected] wrote:
...
> +#define CBO_PREFETCH_I(base, offset) \
> + INSN_S(OPCODE_OP_IMM, FUNC3(6), __RS2(0), \
> + SIMM12(offset), RS1(base))
> +
> +#define CBO_PREFETCH_R(base, offset) \
> + INSN_S(OPCODE_OP_IMM, FUNC3(6), __RS2(1), \
> + SIMM12(offset), RS1(base))
> +
> +#define CBO_PREFETCH_W(base, offset) \
> + INSN_S(OPCODE_OP_IMM, FUNC3(6), __RS2(3), \
> + SIMM12(offset), RS1(base))

These should just be named

PREFETCH_I
PREFETCH_R
PREFETCH_W

without the CBO_ prefix. The other CMO instructions we've added have the
CBO_ prefix because their actual instruction names are e.g. cbo.zero,
but the prefix instructions are not named that way.

Thanks,
drew

2024-01-03 19:57:00

[permalink] [raw]

Subject: Re: Re: [PATCH V2 2/3] riscv: Add ARCH_HAS_PRETCHW support with Zibop

On Wed, Jan 03, 2024 at 02:19:49PM +0800, Guo Ren wrote:
> On Tue, Jan 2, 2024 at 6:45 PM Andrew Jones <[email protected]> wrote:
> >
> >
> > s/Zibop/Zicbop/ <<<$SUBJECT
> okay
>
> >
> > On Sun, Dec 31, 2023 at 03:29:52AM -0500, [email protected] wrote:
> > > From: Guo Ren <[email protected]>
> > >
> > > Enable Linux prefetchw primitive with Zibop cpufeature, which preloads
> >
> > Also s/Zibop/Zicbop/ here
> okay, thx.
>
> >
> > > cache line into L1 cache for the next write operation.
> > >
> > > Signed-off-by: Guo Ren <[email protected]>
> > > Signed-off-by: Guo Ren <[email protected]>
> > > ---
> > > arch/riscv/include/asm/processor.h | 16 ++++++++++++++++
> > > 1 file changed, 16 insertions(+)
> > >
> > > diff --git a/arch/riscv/include/asm/processor.h b/arch/riscv/include/asm/processor.h
> > > index f19f861cda54..8d3a2ab37678 100644
> > > --- a/arch/riscv/include/asm/processor.h
> > > +++ b/arch/riscv/include/asm/processor.h
> > > @@ -13,6 +13,9 @@
> > > #include <vdso/processor.h>
> > >
> > > #include <asm/ptrace.h>
> > > +#include <asm/insn-def.h>
> > > +#include <asm/alternative-macros.h>
> > > +#include <asm/hwcap.h>
> > >
> > > #ifdef CONFIG_64BIT
> > > #define DEFAULT_MAP_WINDOW (UL(1) << (MMAP_VA_BITS - 1))
> > > @@ -106,6 +109,19 @@ static inline void arch_thread_struct_whitelist(unsigned long *offset,
> > > #define KSTK_EIP(tsk) (task_pt_regs(tsk)->epc)
> > > #define KSTK_ESP(tsk) (task_pt_regs(tsk)->sp)
> > >
> > > +#ifdef CONFIG_RISCV_ISA_ZICBOP
> > > +#define ARCH_HAS_PREFETCHW
> > > +
> > > +#define PREFETCHW_ASM(x) \
> > > + ALTERNATIVE(__nops(1), CBO_PREFETCH_W(x, 0), 0, \
> > > + RISCV_ISA_EXT_ZICBOP, CONFIG_RISCV_ISA_ZICBOP)
> > > +
> > > +
> > > +static inline void prefetchw(const void *x)
> > > +{
> > > + __asm__ __volatile__(PREFETCHW_ASM(%0) : : "r" (x) : "memory");
> > > +}
> >
> > Shouldn't we create an interface which exposes the offset input of
> > the instruction, allowing a sequence of calls to be unrolled? But
> > I guess that could be put off until there's a need for it.
> I want to put it off until there's a user. Let's keep the whole
> imm[11:0] zero for the current.

Yeah, my suggestion didn't make sense in this context anyway since we need
to match the interface in linux/prefetch.h. Considering linux/prefetch.h,
is there some reason we don't also add prefetch() at the same time?

Thanks,
drew

>
> >
> > > +#endif /* CONFIG_RISCV_ISA_ZICBOP */
> > >
> > > /* Do necessary setup to start up a newly executed thread. */
> > > extern void start_thread(struct pt_regs *regs,
> > > --
> > > 2.40.1
> > >
> >
> > Thanks,
> > drew
>
>
>
> --
> Best Regards
> Guo Ren

2024-01-03 20:34:16

[permalink] [raw]

Subject: Re: Re: [PATCH V2 1/3] riscv: Add Zicbop instruction definitions & cpufeature

On Wed, Jan 03, 2024 at 08:29:39PM +0100, Andrew Jones wrote:
> On Wed, Jan 03, 2024 at 03:52:00PM -0300, Leonardo Bras wrote:
> > On Sun, Dec 31, 2023 at 03:29:51AM -0500, [email protected] wrote:
> > > From: Guo Ren <[email protected]>
> > >
> > > Cache-block prefetch instructions are HINTs to the hardware to
> > > indicate that software intends to perform a particular type of
> > > memory access in the near future. This patch adds prefetch.i,
> > > prefetch.r and prefetch.w instruction definitions by
> > > RISCV_ISA_EXT_ZICBOP cpufeature.
> >
> > Hi Guo Ren,
> >
> > Here it would be nice to point a documentation for ZICBOP extension:
> > https://wiki.riscv.org/display/HOME/Recently+Ratified+Extensions
> >
> > or having a nice link for:
> > https://drive.google.com/file/d/1jfzhNAk7viz4t2FLDZ5z4roA0LBggkfZ/view
> >
> > >
> > > Signed-off-by: Guo Ren <[email protected]>
> > > Signed-off-by: Guo Ren <[email protected]>
> > > ---
> > > arch/riscv/Kconfig | 15 ++++++++
> > > arch/riscv/include/asm/hwcap.h | 1 +
> > > arch/riscv/include/asm/insn-def.h | 60 +++++++++++++++++++++++++++++++
> > > arch/riscv/kernel/cpufeature.c | 1 +
> > > 4 files changed, 77 insertions(+)
> > >
> > > diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
> > > index 24c1799e2ec4..fcbd417d65ea 100644
> > > --- a/arch/riscv/Kconfig
> > > +++ b/arch/riscv/Kconfig
> > > @@ -579,6 +579,21 @@ config RISCV_ISA_ZICBOZ
> > >
> > > If you don't know what to do here, say Y.
> > >
> > > +config RISCV_ISA_ZICBOP
> > > + bool "Zicbop extension support for cache block prefetch"
> > > + depends on MMU
> > > + depends on RISCV_ALTERNATIVE
> > > + default y
> > > + help
> > > + Adds support to dynamically detect the presence of the ZICBOP
> > > + extension (Cache Block Prefetch Operations) and enable its
> > > + usage.
> > > +
> > > + The Zicbop extension can be used to prefetch cache block for
> > > + read/write fetch.
> > > +
> > > + If you don't know what to do here, say Y.
> > > +
> >
> > According to doc:
> > "The Zicbop extension defines a set of cache-block prefetch instructions:
> > PREFETCH.R, PREFETCH.W, and PREFETCH.I"
> >
> > So above text seems ok
> >
> >
> > > config TOOLCHAIN_HAS_ZIHINTPAUSE
> > > bool
> > > default y
> > > diff --git a/arch/riscv/include/asm/hwcap.h b/arch/riscv/include/asm/hwcap.h
> > > index 06d30526ef3b..77d3b6ee25ab 100644
> > > --- a/arch/riscv/include/asm/hwcap.h
> > > +++ b/arch/riscv/include/asm/hwcap.h
> > > @@ -57,6 +57,7 @@
> > > #define RISCV_ISA_EXT_ZIHPM 42
> > > #define RISCV_ISA_EXT_SMSTATEEN 43
> > > #define RISCV_ISA_EXT_ZICOND 44
> > > +#define RISCV_ISA_EXT_ZICBOP 45
> >
> > Is this number just in kernel code, or does it mean something in the RISC-V
> > documentation?
>
> kernel

Thanks!

>
> >
> > >
> > > #define RISCV_ISA_EXT_MAX 64
> > >
> > > diff --git a/arch/riscv/include/asm/insn-def.h b/arch/riscv/include/asm/insn-def.h
> > > index e27179b26086..bbda350a63bf 100644
> > > --- a/arch/riscv/include/asm/insn-def.h
> > > +++ b/arch/riscv/include/asm/insn-def.h
> > > @@ -18,6 +18,13 @@
> > > #define INSN_I_RD_SHIFT 7
> > > #define INSN_I_OPCODE_SHIFT 0
> > >
> > > +#define INSN_S_SIMM7_SHIFT 25
> > > +#define INSN_S_RS2_SHIFT 20
> > > +#define INSN_S_RS1_SHIFT 15
> > > +#define INSN_S_FUNC3_SHIFT 12
> > > +#define INSN_S_SIMM5_SHIFT 7
> > > +#define INSN_S_OPCODE_SHIFT 0
> > > +
> >
> > The shifts seem correct for S-Type, but I would name the IMM defines in a
> > way we could understand where they fit in IMM:
> >
> >
> > INSN_S_SIMM5_SHIFT -> INSN_S_SIMM_0_4_SHIFT
> > INSN_S_SIMM7_SHIFT -> INSN_S_SIMM_5_11_SHIFT
> >
> > What do you think?
>
> I'm in favor of this suggestion, but then wonder if we don't need another
> patch before this which renames INSN_I_SIMM12_SHIFT to
> INSN_I_SIMM_0_11_SHIFT in order to keep things consistent.

Agree. If it's ok, I can provide a patch doing the rename on top of this
patchset.

>
> >
> >
> > > #ifdef __ASSEMBLY__
> > >
> > > #ifdef CONFIG_AS_HAS_INSN
> > > @@ -30,6 +37,10 @@
> > > .insn i \opcode, \func3, \rd, \rs1, \simm12
> > > .endm
> > >
> > > + .macro insn_s, opcode, func3, rs2, simm12, rs1
> > > + .insn s \opcode, \func3, \rs2, \simm12(\rs1)
> > > + .endm
> > > +
> > > #else
> > >
> > > #include <asm/gpr-num.h>
> > > @@ -51,10 +62,20 @@
> > > (\simm12 << INSN_I_SIMM12_SHIFT))
> > > .endm
> > >
> > > + .macro insn_s, opcode, func3, rs2, simm12, rs1
> > > + .4byte ((\opcode << INSN_S_OPCODE_SHIFT) | \
> > > + (\func3 << INSN_S_FUNC3_SHIFT) | \
> > > + (.L__gpr_num_\rs2 << INSN_S_RS2_SHIFT) | \
> > > + (.L__gpr_num_\rs1 << INSN_S_RS1_SHIFT) | \
> > > + ((\simm12 & 0x1f) << INSN_S_SIMM5_SHIFT) | \
> > > + (((\simm12 >> 5) & 0x7f) << INSN_S_SIMM7_SHIFT))
> > > + .endm
> > > +
> > > #endif
> > >
> > > #define __INSN_R(...) insn_r __VA_ARGS__
> > > #define __INSN_I(...) insn_i __VA_ARGS__
> > > +#define __INSN_S(...) insn_s __VA_ARGS__
> >
> > As a curiosity: It's quite odd to have prefetch.{i,r,w} to be an S-Type
> > instruction, given this type was supposed to be for store instructions.
> >
> > On prefetch.{i,r,w}:
> > 31 24 19 14 11 6
> > imm[11:5] | PREFETCH_OP | rs1 | ORI | imm[4:0] | OP_IMM
> >
> > For S-Type, we have:
> > 31 24 19 14 11 6
> > imm[11:5] | rs1 | rs2 | funct3 | imm[4:0] | opcode
> >
> > For I-Type, we have:
> > 31 19 14 11 6
> > immm[11:0] | rs1 | funct3 | rd | opcode
> >
> > I understand that there should be reasons for choosing S-type, but it
> > would make much more sense (as per instruction type, and as per parameters)
> > to go with I-Type.
> >
> > (I understand this was done in HW, and in kernel code we have better choice
> > to encode it as S-Type, but I kind of find the S-Type choice odd)
>
> My speculation is that since cache block sizes will never be less than 32
> bytes, it made more sense to use the S-type encoding space with imm[4:0]
> hard coded to zero, allowing the I-Type encoding space to be reserved for
> instructions which need arbitrary 12-bit immediates.

Yeah, this seems a good reason :)

>
> >
> > >
> > > #else /* ! __ASSEMBLY__ */
> > >
> > > @@ -66,6 +87,9 @@
> > > #define __INSN_I(opcode, func3, rd, rs1, simm12) \
> > > ".insn i " opcode ", " func3 ", " rd ", " rs1 ", " simm12 "\n"
> > >
> > > +#define __INSN_S(opcode, func3, rs2, simm12, rs1) \
> > > + ".insn s " opcode ", " func3 ", " rs2 ", " simm12 "(" rs1 ")\n"
> > > +
> > > #else
> > >
> > > #include <linux/stringify.h>
> > > @@ -92,12 +116,26 @@
> > > " (\\simm12 << " __stringify(INSN_I_SIMM12_SHIFT) "))\n" \
> > > " .endm\n"
> > >
> > > +#define DEFINE_INSN_S \
> > > + __DEFINE_ASM_GPR_NUMS \
> > > +" .macro insn_s, opcode, func3, rs2, simm12, rs1\n" \
> > > +" .4byte ((\\opcode << " __stringify(INSN_S_OPCODE_SHIFT) ") |" \
> > > +" (\\func3 << " __stringify(INSN_S_FUNC3_SHIFT) ") |" \
> > > +" (.L__gpr_num_\\rs2 << " __stringify(INSN_S_RS2_SHIFT) ") |" \
> > > +" (.L__gpr_num_\\rs1 << " __stringify(INSN_S_RS1_SHIFT) ") |" \
> > > +" ((\\simm12 & 0x1f) << " __stringify(INSN_S_SIMM5_SHIFT) ") |" \
> > > +" (((\\simm12 >> 5) & 0x7f) << " __stringify(INSN_S_SIMM7_SHIFT) "))\n" \
> > > +" .endm\n"
> > > +
> > > #define UNDEFINE_INSN_R \
> > > " .purgem insn_r\n"
> > >
> > > #define UNDEFINE_INSN_I \
> > > " .purgem insn_i\n"
> > >
> > > +#define UNDEFINE_INSN_S \
> > > +" .purgem insn_s\n"
> > > +
> > > #define __INSN_R(opcode, func3, func7, rd, rs1, rs2) \
> > > DEFINE_INSN_R \
> > > "insn_r " opcode ", " func3 ", " func7 ", " rd ", " rs1 ", " rs2 "\n" \
> > > @@ -108,6 +146,11 @@
> > > "insn_i " opcode ", " func3 ", " rd ", " rs1 ", " simm12 "\n" \
> > > UNDEFINE_INSN_I
> > >
> > > +#define __INSN_S(opcode, func3, rs2, simm12, rs1) \
> > > + DEFINE_INSN_S \
> > > + "insn_s " opcode ", " func3 ", " rs2 ", " simm12 ", " rs1 "\n" \
> > > + UNDEFINE_INSN_S
> > > +
> > > #endif
> > >
> > > #endif /* ! __ASSEMBLY__ */
> > > @@ -120,6 +163,10 @@
> > > __INSN_I(RV_##opcode, RV_##func3, RV_##rd, \
> > > RV_##rs1, RV_##simm12)
> > >
> > > +#define INSN_S(opcode, func3, rs2, simm12, rs1) \
> > > + __INSN_S(RV_##opcode, RV_##func3, RV_##rs2, \
> > > + RV_##simm12, RV_##rs1)
> > > +
> >
> > The defines above seem correct, but TBH I am not very used to review
> > .macro code.
> >
> > > #define RV_OPCODE(v) __ASM_STR(v)
> > > #define RV_FUNC3(v) __ASM_STR(v)
> > > #define RV_FUNC7(v) __ASM_STR(v)
> > > @@ -133,6 +180,7 @@
> > > #define RV___RS2(v) __RV_REG(v)
> > >
> > > #define RV_OPCODE_MISC_MEM RV_OPCODE(15)
> > > +#define RV_OPCODE_OP_IMM RV_OPCODE(19)
> >
> > Correct.
> >
> >
> > > #define RV_OPCODE_SYSTEM RV_OPCODE(115)
> > >
> > > #define HFENCE_VVMA(vaddr, asid) \
> > > @@ -196,4 +244,16 @@
> > > INSN_I(OPCODE_MISC_MEM, FUNC3(2), __RD(0), \
> > > RS1(base), SIMM12(4))
> > >
> > > +#define CBO_PREFETCH_I(base, offset) \
> > > + INSN_S(OPCODE_OP_IMM, FUNC3(6), __RS2(0), \
> > > + SIMM12(offset), RS1(base))
> > > +
> > > +#define CBO_PREFETCH_R(base, offset) \
> > > + INSN_S(OPCODE_OP_IMM, FUNC3(6), __RS2(1), \
> > > + SIMM12(offset), RS1(base))
> > > +
> > > +#define CBO_PREFETCH_W(base, offset) \
> > > + INSN_S(OPCODE_OP_IMM, FUNC3(6), __RS2(3), \
> > > + SIMM12(offset), RS1(base))
> > > +
> >
> > For OP_IMM & FUNC3(6) we have ORI, right?
> > For ORI, rd will be at bytes 11:7, which in PREFETCH.{i,r,w} is
> > offset[4:0].
> >
> > IIUC, when the cpu does not support ZICBOP, this should be fine as long as
> > rd = 0, since changes to r0 are disregarded.
> >
> > In this case, we need to guarantee offset[4:0] = 0, or else we migth write
> > on an unrelated register. This can be noticed in ZICBOP documentation pages
> > 21, 22, 23, as offset[4:0] is always [0 0 0 0 0].
> > (Google docs in first comment)
> >
> > What we need here is something like:
> > + enum {
> > + PREFETCH_I,
> > + PREFETCH_R,
> > + PREFETCH_W,
> > + }
>
> Can't use enum. This header may be included in assembly.

Oh, I suggest defines then, since it's better to make it clear instead of
using 0, 1, 3.

>
> > +
> > + #define CBO_PREFETCH(type, base, offset) \
> > + INSN_S(OPCODE_OP_IMM, FUNC3(6), __RS2(type), \
> > + SIMM12(offset & ~0x1f), RS1(base))
>
> Yes. I suggested we mask offset as well, but ideally we'd detect a caller
> using an offset with nonzero lower 5 bits at compile time.

I would suggest the compiler would take care of this, but I am not sure
about the assembly, since I am not sure if it gets any optimization.

I don't think we can detect a caller with non-zero offset at compile time,
since it will be used in locks which can be at (potentially) any place in
the block size. (if you have any idea though, please let me know :) )

On the other hand, we could create a S-Type macro which deliberately
ignores imm[4:0], like

+ INSN_S_TRUNCATE(OPCODE_OP_IMM, FUNC3(6), __RS2(3), \
+ SIMM12(offset), RS1(base))

Which saves the bits 11:5 of offset into imm[11:5], and zero-fill
imm[4:0], like

+#define DEFINE_INSN_S \
+ __DEFINE_ASM_GPR_NUMS \
+" .macro insn_s, opcode, func3, rs2, simm12, rs1\n" \
+" .4byte ((\\opcode << " __stringify(INSN_S_OPCODE_SHIFT) ") |" \
+" (\\func3 << " __stringify(INSN_S_FUNC3_SHIFT) ") |" \
+" (.L__gpr_num_\\rs2 << " __stringify(INSN_S_RS2_SHIFT) ") |" \
+" (.L__gpr_num_\\rs1 << " __stringify(INSN_S_RS1_SHIFT) ") |" \
+" (((\\simm12 >> 5) & 0x7f) << " __stringify(INSN_S_SIMM7_SHIFT) "))\n" \
+" .endm\n"
+

Does this make sense?

Thanks!
Leo

>
> Thanks,
> drew
>
> >
> > + #define CBO_PREFETCH_I(base, offset) \
> > + CBO_PREFETCH(PREFETCH_I, base, offset)
> > +
> > + #define CBO_PREFETCH_R(base, offset) \
> > + CBO_PREFETCH(PREFETCH_R, base, offset)
> > +
> > + #define CBO_PREFETCH_W(base, offset) \
> > + CBO_PREFETCH(PREFETCH_W, base, offset)
> > +
> >
> > Maybe replacing 0x1f by some MASK macro, so it looks nicer.
> > (not sure how it's acceptable in asm, though).
> >
> > The above would guarantee that we would never have CBO_PREFETCH_*() to mess
> > up any other register due to a unnoticed (base & 0x1f) != 0
> >
> > Does that make sense?
> >
> > > #endif /* __ASM_INSN_DEF_H */
> > > diff --git a/arch/riscv/kernel/cpufeature.c b/arch/riscv/kernel/cpufeature.c
> > > index b3785ffc1570..bdb02b066041 100644
> > > --- a/arch/riscv/kernel/cpufeature.c
> > > +++ b/arch/riscv/kernel/cpufeature.c
> > > @@ -168,6 +168,7 @@ const struct riscv_isa_ext_data riscv_isa_ext[] = {
> > > __RISCV_ISA_EXT_DATA(h, RISCV_ISA_EXT_h),
> > > __RISCV_ISA_EXT_DATA(zicbom, RISCV_ISA_EXT_ZICBOM),
> > > __RISCV_ISA_EXT_DATA(zicboz, RISCV_ISA_EXT_ZICBOZ),
> > > + __RISCV_ISA_EXT_DATA(zicbop, RISCV_ISA_EXT_ZICBOP),
> > > __RISCV_ISA_EXT_DATA(zicntr, RISCV_ISA_EXT_ZICNTR),
> > > __RISCV_ISA_EXT_DATA(zicond, RISCV_ISA_EXT_ZICOND),
> > > __RISCV_ISA_EXT_DATA(zicsr, RISCV_ISA_EXT_ZICSR),
> > > --
> > > 2.40.1
> > >
> >
> > Apart from above suggestions, seems a nice change :)
> >
> > I suggest splitting this patch into 2, though:
> > - Introducing S-Type instructions (plz point docs for reference)
> > - Introduce ZICBOP extension.
> >
> > Thanks!
> > Leo
> >
> >
>

2024-01-03 20:34:58

[permalink] [raw]

Subject: Re: [PATCH V2 1/3] riscv: Add Zicbop instruction definitions & cpufeature

On Wed, Jan 03, 2024 at 08:48:12PM +0100, Andrew Jones wrote:
> On Sun, Dec 31, 2023 at 03:29:51AM -0500, [email protected] wrote:
> ...
> > +#define CBO_PREFETCH_I(base, offset) \
> > + INSN_S(OPCODE_OP_IMM, FUNC3(6), __RS2(0), \
> > + SIMM12(offset), RS1(base))
> > +
> > +#define CBO_PREFETCH_R(base, offset) \
> > + INSN_S(OPCODE_OP_IMM, FUNC3(6), __RS2(1), \
> > + SIMM12(offset), RS1(base))
> > +
> > +#define CBO_PREFETCH_W(base, offset) \
> > + INSN_S(OPCODE_OP_IMM, FUNC3(6), __RS2(3), \
> > + SIMM12(offset), RS1(base))
>
> These should just be named
>
> PREFETCH_I
> PREFETCH_R
> PREFETCH_W
>
> without the CBO_ prefix. The other CMO instructions we've added have the
> CBO_ prefix because their actual instruction names are e.g. cbo.zero,
> but the prefix instructions are not named that way.
>
> Thanks,
> drew
>

Sure, that's ok this way :)

2024-01-04 01:25:13

[permalink] [raw]

Subject: Re: [PATCH V2 3/3] riscv: xchg: Prefetch the destination word for sc.w

On Thu, Jan 4, 2024 at 3:45 AM Leonardo Bras <[email protected]> wrote:
>
> On Wed, Jan 03, 2024 at 02:15:45PM +0800, Guo Ren wrote:
> > On Tue, Jan 2, 2024 at 7:19 PM Andrew Jones <[email protected]> wrote:
> > >
> > > On Sun, Dec 31, 2023 at 03:29:53AM -0500, [email protected] wrote:
> > > > From: Guo Ren <[email protected]>
> > > >
> > > > The cost of changing a cacheline from shared to exclusive state can be
> > > > significant, especially when this is triggered by an exclusive store,
> > > > since it may result in having to retry the transaction.
> > > >
> > > > This patch makes use of prefetch.w to prefetch cachelines for write
> > > > prior to lr/sc loops when using the xchg_small atomic routine.
> > > >
> > > > This patch is inspired by commit: 0ea366f5e1b6 ("arm64: atomics:
> > > > prefetch the destination word for write prior to stxr").
> > > >
> > > > Signed-off-by: Guo Ren <[email protected]>
> > > > Signed-off-by: Guo Ren <[email protected]>
> > > > ---
> > > > arch/riscv/include/asm/cmpxchg.h | 4 +++-
> > > > 1 file changed, 3 insertions(+), 1 deletion(-)
> > > >
> > > > diff --git a/arch/riscv/include/asm/cmpxchg.h b/arch/riscv/include/asm/cmpxchg.h
> > > > index 26cea2395aae..d7b9d7951f08 100644
> > > > --- a/arch/riscv/include/asm/cmpxchg.h
> > > > +++ b/arch/riscv/include/asm/cmpxchg.h
> > > > @@ -10,6 +10,7 @@
> > > >
> > > > #include <asm/barrier.h>
> > > > #include <asm/fence.h>
> > > > +#include <asm/processor.h>
> > > >
> > > > #define __arch_xchg_masked(prepend, append, r, p, n) \
> > >
> > > Are you sure this is based on v6.7-rc7? Because I don't see this macro.
> > Oh, it is based on Leobras' patches. I would remove it in the next of version.
>
> I would say this next :)
Thx for the grammar correction.

>
> >
> > >
> > > > ({ \
> > > > @@ -23,6 +24,7 @@
> > > > \
> > > > __asm__ __volatile__ ( \
> > > > prepend \
> > > > + PREFETCHW_ASM(%5) \
> > > > "0: lr.w %0, %2\n" \
> > > > " and %1, %0, %z4\n" \
> > > > " or %1, %1, %z3\n" \
> > > > @@ -30,7 +32,7 @@
> > > > " bnez %1, 0b\n" \
> > > > append \
> > > > : "=&r" (__retx), "=&r" (__rc), "+A" (*(__ptr32b)) \
> > > > - : "rJ" (__newx), "rJ" (~__mask) \
> > > > + : "rJ" (__newx), "rJ" (~__mask), "rJ" (__ptr32b) \
> > >
> > > I'm pretty sure we don't want to allow the J constraint for __ptr32b.
> > >
> > > > : "memory"); \
> > > > \
> > > > r = (__typeof__(*(p)))((__retx & __mask) >> __s); \
> > > > --
> > > > 2.40.1
> > > >
> > >
> > > Thanks,
> > > drew
> >
> >
> >
> > --
> > Best Regards
> > Guo Ren
> >
>
> Nice patch :)
> Any reason it's not needed in __arch_cmpxchg_masked(), and __arch_cmpxchg() ?
CAS is a conditional AMO, unlike xchg (Stand AMO). Arm64 is wrong, or
they have a problem with the hardware.

>
> Thanks!
> Leo
>

--
Best Regards
Guo Ren

2024-01-04 03:57:08

[permalink] [raw]

Subject: Re: [PATCH V2 3/3] riscv: xchg: Prefetch the destination word for sc.w

On Thu, Jan 04, 2024 at 09:24:40AM +0800, Guo Ren wrote:
> On Thu, Jan 4, 2024 at 3:45 AM Leonardo Bras <[email protected]> wrote:
> >
> > On Wed, Jan 03, 2024 at 02:15:45PM +0800, Guo Ren wrote:
> > > On Tue, Jan 2, 2024 at 7:19 PM Andrew Jones <[email protected]> wrote:
> > > >
> > > > On Sun, Dec 31, 2023 at 03:29:53AM -0500, [email protected] wrote:
> > > > > From: Guo Ren <[email protected]>
> > > > >
> > > > > The cost of changing a cacheline from shared to exclusive state can be
> > > > > significant, especially when this is triggered by an exclusive store,
> > > > > since it may result in having to retry the transaction.
> > > > >
> > > > > This patch makes use of prefetch.w to prefetch cachelines for write
> > > > > prior to lr/sc loops when using the xchg_small atomic routine.
> > > > >
> > > > > This patch is inspired by commit: 0ea366f5e1b6 ("arm64: atomics:
> > > > > prefetch the destination word for write prior to stxr").
> > > > >
> > > > > Signed-off-by: Guo Ren <[email protected]>
> > > > > Signed-off-by: Guo Ren <[email protected]>
> > > > > ---
> > > > > arch/riscv/include/asm/cmpxchg.h | 4 +++-
> > > > > 1 file changed, 3 insertions(+), 1 deletion(-)
> > > > >
> > > > > diff --git a/arch/riscv/include/asm/cmpxchg.h b/arch/riscv/include/asm/cmpxchg.h
> > > > > index 26cea2395aae..d7b9d7951f08 100644
> > > > > --- a/arch/riscv/include/asm/cmpxchg.h
> > > > > +++ b/arch/riscv/include/asm/cmpxchg.h
> > > > > @@ -10,6 +10,7 @@
> > > > >
> > > > > #include <asm/barrier.h>
> > > > > #include <asm/fence.h>
> > > > > +#include <asm/processor.h>
> > > > >
> > > > > #define __arch_xchg_masked(prepend, append, r, p, n) \
> > > >
> > > > Are you sure this is based on v6.7-rc7? Because I don't see this macro.
> > > Oh, it is based on Leobras' patches. I would remove it in the next of version.
> >
> > I would say this next :)
> Thx for the grammar correction.

Oh, I was not intending to correct grammar.
I just meant the next thing I would mention is that it was based on top of
my patchset instead of v6.7-rc7:

>
> >
> > >
> > > >
> > > > > ({ \
> > > > > @@ -23,6 +24,7 @@
> > > > > \
> > > > > __asm__ __volatile__ ( \
> > > > > prepend \
> > > > > + PREFETCHW_ASM(%5) \
> > > > > "0: lr.w %0, %2\n" \
> > > > > " and %1, %0, %z4\n" \
> > > > > " or %1, %1, %z3\n" \
> > > > > @@ -30,7 +32,7 @@
> > > > > " bnez %1, 0b\n" \
> > > > > append \
> > > > > : "=&r" (__retx), "=&r" (__rc), "+A" (*(__ptr32b)) \
> > > > > - : "rJ" (__newx), "rJ" (~__mask) \
> > > > > + : "rJ" (__newx), "rJ" (~__mask), "rJ" (__ptr32b) \
> > > >
> > > > I'm pretty sure we don't want to allow the J constraint for __ptr32b.
> > > >
> > > > > : "memory"); \
> > > > > \
> > > > > r = (__typeof__(*(p)))((__retx & __mask) >> __s); \
> > > > > --
> > > > > 2.40.1
> > > > >
> > > >
> > > > Thanks,
> > > > drew
> > >
> > >
> > >
> > > --
> > > Best Regards
> > > Guo Ren
> > >
> >
> > Nice patch :)
> > Any reason it's not needed in __arch_cmpxchg_masked(), and __arch_cmpxchg() ?
> CAS is a conditional AMO, unlike xchg (Stand AMO). Arm64 is wrong, or
> they have a problem with the hardware.

Sorry, I was unable to fully understand the reason here.

You suggest that the PREFETCH.W was inserted on xchg_masked because it will
always switch the variable (no compare, blind CAS), but not on cmpxchg.

Is this because cmpxchg will depend on a compare, and thus it does not
garantee a write? so it would be unwise to always prefetch cacheline
exclusiveness for this cpu, where shared state would be enough.
Is that correct?

Thanks!
Leo

>
> >
> > Thanks!
> > Leo
> >
>
>
> --
> Best Regards
> Guo Ren
>

2024-01-04 08:15:49

[permalink] [raw]

Subject: Re: [PATCH V2 3/3] riscv: xchg: Prefetch the destination word for sc.w

On Thu, Jan 4, 2024 at 11:56 AM Leonardo Bras <[email protected]> wrote:
>
> On Thu, Jan 04, 2024 at 09:24:40AM +0800, Guo Ren wrote:
> > On Thu, Jan 4, 2024 at 3:45 AM Leonardo Bras <[email protected]> wrote:
> > >
> > > On Wed, Jan 03, 2024 at 02:15:45PM +0800, Guo Ren wrote:
> > > > On Tue, Jan 2, 2024 at 7:19 PM Andrew Jones <[email protected]> wrote:
> > > > >
> > > > > On Sun, Dec 31, 2023 at 03:29:53AM -0500, [email protected] wrote:
> > > > > > From: Guo Ren <[email protected]>
> > > > > >
> > > > > > The cost of changing a cacheline from shared to exclusive state can be
> > > > > > significant, especially when this is triggered by an exclusive store,
> > > > > > since it may result in having to retry the transaction.
> > > > > >
> > > > > > This patch makes use of prefetch.w to prefetch cachelines for write
> > > > > > prior to lr/sc loops when using the xchg_small atomic routine.
> > > > > >
> > > > > > This patch is inspired by commit: 0ea366f5e1b6 ("arm64: atomics:
> > > > > > prefetch the destination word for write prior to stxr").
> > > > > >
> > > > > > Signed-off-by: Guo Ren <[email protected]>
> > > > > > Signed-off-by: Guo Ren <[email protected]>
> > > > > > ---
> > > > > > arch/riscv/include/asm/cmpxchg.h | 4 +++-
> > > > > > 1 file changed, 3 insertions(+), 1 deletion(-)
> > > > > >
> > > > > > diff --git a/arch/riscv/include/asm/cmpxchg.h b/arch/riscv/include/asm/cmpxchg.h
> > > > > > index 26cea2395aae..d7b9d7951f08 100644
> > > > > > --- a/arch/riscv/include/asm/cmpxchg.h
> > > > > > +++ b/arch/riscv/include/asm/cmpxchg.h
> > > > > > @@ -10,6 +10,7 @@
> > > > > >
> > > > > > #include <asm/barrier.h>
> > > > > > #include <asm/fence.h>
> > > > > > +#include <asm/processor.h>
> > > > > >
> > > > > > #define __arch_xchg_masked(prepend, append, r, p, n) \
> > > > >
> > > > > Are you sure this is based on v6.7-rc7? Because I don't see this macro.
> > > > Oh, it is based on Leobras' patches. I would remove it in the next of version.
> > >
> > > I would say this next :)
> > Thx for the grammar correction.
>
> Oh, I was not intending to correct grammar.
> I just meant the next thing I would mention is that it was based on top of
> my patchset instead of v6.7-rc7:
>
> >
> > >
> > > >
> > > > >
> > > > > > ({ \
> > > > > > @@ -23,6 +24,7 @@
> > > > > > \
> > > > > > __asm__ __volatile__ ( \
> > > > > > prepend \
> > > > > > + PREFETCHW_ASM(%5) \
> > > > > > "0: lr.w %0, %2\n" \
> > > > > > " and %1, %0, %z4\n" \
> > > > > > " or %1, %1, %z3\n" \
> > > > > > @@ -30,7 +32,7 @@
> > > > > > " bnez %1, 0b\n" \
> > > > > > append \
> > > > > > : "=&r" (__retx), "=&r" (__rc), "+A" (*(__ptr32b)) \
> > > > > > - : "rJ" (__newx), "rJ" (~__mask) \
> > > > > > + : "rJ" (__newx), "rJ" (~__mask), "rJ" (__ptr32b) \
> > > > >
> > > > > I'm pretty sure we don't want to allow the J constraint for __ptr32b.
> > > > >
> > > > > > : "memory"); \
> > > > > > \
> > > > > > r = (__typeof__(*(p)))((__retx & __mask) >> __s); \
> > > > > > --
> > > > > > 2.40.1
> > > > > >
> > > > >
> > > > > Thanks,
> > > > > drew
> > > >
> > > >
> > > >
> > > > --
> > > > Best Regards
> > > > Guo Ren
> > > >
> > >
> > > Nice patch :)
> > > Any reason it's not needed in __arch_cmpxchg_masked(), and __arch_cmpxchg() ?
> > CAS is a conditional AMO, unlike xchg (Stand AMO). Arm64 is wrong, or
> > they have a problem with the hardware.
>
> Sorry, I was unable to fully understand the reason here.
>
> You suggest that the PREFETCH.W was inserted on xchg_masked because it will
> always switch the variable (no compare, blind CAS), but not on cmpxchg.
>
> Is this because cmpxchg will depend on a compare, and thus it does not
> garantee a write? so it would be unwise to always prefetch cacheline
Yes, it has a comparison, so a store may not exist there.

> exclusiveness for this cpu, where shared state would be enough.
> Is that correct?
Yes, exclusiveness would invalidate other harts' cache lines.

>
> Thanks!
> Leo
>
>
> >
> > >
> > > Thanks!
> > > Leo
> > >
> >
> >
> > --
> > Best Regards
> > Guo Ren
> >
>

--
Best Regards
Guo Ren

2024-01-04 09:47:47

[permalink] [raw]

Subject: Re: Re: Re: [PATCH V2 1/3] riscv: Add Zicbop instruction definitions & cpufeature

On Wed, Jan 03, 2024 at 05:33:41PM -0300, Leonardo Bras wrote:
> On Wed, Jan 03, 2024 at 08:29:39PM +0100, Andrew Jones wrote:
> > On Wed, Jan 03, 2024 at 03:52:00PM -0300, Leonardo Bras wrote:
> > > On Sun, Dec 31, 2023 at 03:29:51AM -0500, [email protected] wrote:
...
> > > The shifts seem correct for S-Type, but I would name the IMM defines in a
> > > way we could understand where they fit in IMM:
> > >
> > >
> > > INSN_S_SIMM5_SHIFT -> INSN_S_SIMM_0_4_SHIFT
> > > INSN_S_SIMM7_SHIFT -> INSN_S_SIMM_5_11_SHIFT
> > >
> > > What do you think?
> >
> > I'm in favor of this suggestion, but then wonder if we don't need another
> > patch before this which renames INSN_I_SIMM12_SHIFT to
> > INSN_I_SIMM_0_11_SHIFT in order to keep things consistent.
>
> Agree. If it's ok, I can provide a patch doing the rename on top of this
> patchset.

The INSN_I change is only needed if we also take the new INSN_S shift
macros, so I think the INSN_I change should be part of this series.

BTW, I just noticed we wrote the numbers backwards. They should be

INSN_I_SIMM_11_0_SHIFT
INSN_S_SIMM_11_5_SHIFT
INSN_S_SIMM_4_0_SHIFT

> > > >
> > > > +#define CBO_PREFETCH_I(base, offset) \
> > > > + INSN_S(OPCODE_OP_IMM, FUNC3(6), __RS2(0), \
> > > > + SIMM12(offset), RS1(base))
> > > > +
> > > > +#define CBO_PREFETCH_R(base, offset) \
> > > > + INSN_S(OPCODE_OP_IMM, FUNC3(6), __RS2(1), \
> > > > + SIMM12(offset), RS1(base))
> > > > +
> > > > +#define CBO_PREFETCH_W(base, offset) \
> > > > + INSN_S(OPCODE_OP_IMM, FUNC3(6), __RS2(3), \
> > > > + SIMM12(offset), RS1(base))
> > > > +
> > >
> > > For OP_IMM & FUNC3(6) we have ORI, right?
> > > For ORI, rd will be at bytes 11:7, which in PREFETCH.{i,r,w} is
> > > offset[4:0].
> > >
> > > IIUC, when the cpu does not support ZICBOP, this should be fine as long as
> > > rd = 0, since changes to r0 are disregarded.
> > >
> > > In this case, we need to guarantee offset[4:0] = 0, or else we migth write
> > > on an unrelated register. This can be noticed in ZICBOP documentation pages
> > > 21, 22, 23, as offset[4:0] is always [0 0 0 0 0].
> > > (Google docs in first comment)
> > >
> > > What we need here is something like:
> > > + enum {
> > > + PREFETCH_I,
> > > + PREFETCH_R,
> > > + PREFETCH_W,
> > > + }
> >
> > Can't use enum. This header may be included in assembly.
>
> Oh, I suggest defines then, since it's better to make it clear instead of
> using 0, 1, 3.

I don't think we gain anything by adding another define in order to create
the instruction define. We have to review the number sooner or later. I'd
prefer we use the number inside the instruction define so we only need
to look one place, which is also consistent with how we use FUNC fields.

>
> >
> > > +
> > > + #define CBO_PREFETCH(type, base, offset) \
> > > + INSN_S(OPCODE_OP_IMM, FUNC3(6), __RS2(type), \
> > > + SIMM12(offset & ~0x1f), RS1(base))
> >
> > Yes. I suggested we mask offset as well, but ideally we'd detect a caller
> > using an offset with nonzero lower 5 bits at compile time.
>
> I would suggest the compiler would take care of this, but I am not sure
> about the assembly, since I am not sure if it gets any optimization.
>
> I don't think we can detect a caller with non-zero offset at compile time,
> since it will be used in locks which can be at (potentially) any place in
> the block size. (if you have any idea though, please let me know :) )
>
> On the other hand, we could create a S-Type macro which deliberately
> ignores imm[4:0], like
>
> + INSN_S_TRUNCATE(OPCODE_OP_IMM, FUNC3(6), __RS2(3), \
> + SIMM12(offset), RS1(base))
>
> Which saves the bits 11:5 of offset into imm[11:5], and zero-fill
> imm[4:0], like
>
> +#define DEFINE_INSN_S \
> + __DEFINE_ASM_GPR_NUMS \
> +" .macro insn_s, opcode, func3, rs2, simm12, rs1\n" \
> +" .4byte ((\\opcode << " __stringify(INSN_S_OPCODE_SHIFT) ") |" \
> +" (\\func3 << " __stringify(INSN_S_FUNC3_SHIFT) ") |" \
> +" (.L__gpr_num_\\rs2 << " __stringify(INSN_S_RS2_SHIFT) ") |" \
> +" (.L__gpr_num_\\rs1 << " __stringify(INSN_S_RS1_SHIFT) ") |" \
> +" (((\\simm12 >> 5) & 0x7f) << " __stringify(INSN_S_SIMM7_SHIFT) "))\n" \
> +" .endm\n"
> +
>
> Does this make sense?

If we create a special version of INSN_S, then I suggest we create one
where its two SIMM fields are independent and then define prefetch
instructions like this

#define PREFETCH_W(base, offset) \
INSN_S_SPLIT_IMM(OPCODE_OP_IMM, FUNC3(6), __RS2(3), \
SIMM_11_5(offset >> 5), SIMM_4_0(0), RS1(base))

which would allow simple review against the spec and potentially
support other instructions which use hard coded values in the
immediate fields.

But I'm not sure it's worth it. I think

#define PREFETCH_W(base, offset) \
INSN_S(OPCODE_OP_IMM, FUNC3(6), __RS2(3), \
SIMM12(offset & ~0x1f), RS1(base))

is also pretty easy to review against the spec and we don't have any other
instructions yet with other requirements for the immediates.

Thanks,
drew

2024-01-04 14:18:19

[permalink] [raw]

Subject: Re: [PATCH V2 3/3] riscv: xchg: Prefetch the destination word for sc.w

On Thu, Jan 04, 2024 at 04:14:27PM +0800, Guo Ren wrote:
> On Thu, Jan 4, 2024 at 11:56 AM Leonardo Bras <[email protected]> wrote:
> >
> > On Thu, Jan 04, 2024 at 09:24:40AM +0800, Guo Ren wrote:
> > > On Thu, Jan 4, 2024 at 3:45 AM Leonardo Bras <[email protected]> wrote:
> > > >
> > > > On Wed, Jan 03, 2024 at 02:15:45PM +0800, Guo Ren wrote:
> > > > > On Tue, Jan 2, 2024 at 7:19 PM Andrew Jones <[email protected]> wrote:
> > > > > >
> > > > > > On Sun, Dec 31, 2023 at 03:29:53AM -0500, [email protected] wrote:
> > > > > > > From: Guo Ren <[email protected]>
> > > > > > >
> > > > > > > The cost of changing a cacheline from shared to exclusive state can be
> > > > > > > significant, especially when this is triggered by an exclusive store,
> > > > > > > since it may result in having to retry the transaction.
> > > > > > >
> > > > > > > This patch makes use of prefetch.w to prefetch cachelines for write
> > > > > > > prior to lr/sc loops when using the xchg_small atomic routine.
> > > > > > >
> > > > > > > This patch is inspired by commit: 0ea366f5e1b6 ("arm64: atomics:
> > > > > > > prefetch the destination word for write prior to stxr").
> > > > > > >
> > > > > > > Signed-off-by: Guo Ren <[email protected]>
> > > > > > > Signed-off-by: Guo Ren <[email protected]>
> > > > > > > ---
> > > > > > > arch/riscv/include/asm/cmpxchg.h | 4 +++-
> > > > > > > 1 file changed, 3 insertions(+), 1 deletion(-)
> > > > > > >
> > > > > > > diff --git a/arch/riscv/include/asm/cmpxchg.h b/arch/riscv/include/asm/cmpxchg.h
> > > > > > > index 26cea2395aae..d7b9d7951f08 100644
> > > > > > > --- a/arch/riscv/include/asm/cmpxchg.h
> > > > > > > +++ b/arch/riscv/include/asm/cmpxchg.h
> > > > > > > @@ -10,6 +10,7 @@
> > > > > > >
> > > > > > > #include <asm/barrier.h>
> > > > > > > #include <asm/fence.h>
> > > > > > > +#include <asm/processor.h>
> > > > > > >
> > > > > > > #define __arch_xchg_masked(prepend, append, r, p, n) \
> > > > > >
> > > > > > Are you sure this is based on v6.7-rc7? Because I don't see this macro.
> > > > > Oh, it is based on Leobras' patches. I would remove it in the next of version.
> > > >
> > > > I would say this next :)
> > > Thx for the grammar correction.
> >
> > Oh, I was not intending to correct grammar.
> > I just meant the next thing I would mention is that it was based on top of
> > my patchset instead of v6.7-rc7:
> >
> > >
> > > >
> > > > >
> > > > > >
> > > > > > > ({ \
> > > > > > > @@ -23,6 +24,7 @@
> > > > > > > \
> > > > > > > __asm__ __volatile__ ( \
> > > > > > > prepend \
> > > > > > > + PREFETCHW_ASM(%5) \
> > > > > > > "0: lr.w %0, %2\n" \
> > > > > > > " and %1, %0, %z4\n" \
> > > > > > > " or %1, %1, %z3\n" \
> > > > > > > @@ -30,7 +32,7 @@
> > > > > > > " bnez %1, 0b\n" \
> > > > > > > append \
> > > > > > > : "=&r" (__retx), "=&r" (__rc), "+A" (*(__ptr32b)) \
> > > > > > > - : "rJ" (__newx), "rJ" (~__mask) \
> > > > > > > + : "rJ" (__newx), "rJ" (~__mask), "rJ" (__ptr32b) \
> > > > > >
> > > > > > I'm pretty sure we don't want to allow the J constraint for __ptr32b.
> > > > > >
> > > > > > > : "memory"); \
> > > > > > > \
> > > > > > > r = (__typeof__(*(p)))((__retx & __mask) >> __s); \
> > > > > > > --
> > > > > > > 2.40.1
> > > > > > >
> > > > > >
> > > > > > Thanks,
> > > > > > drew
> > > > >
> > > > >
> > > > >
> > > > > --
> > > > > Best Regards
> > > > > Guo Ren
> > > > >
> > > >
> > > > Nice patch :)
> > > > Any reason it's not needed in __arch_cmpxchg_masked(), and __arch_cmpxchg() ?
> > > CAS is a conditional AMO, unlike xchg (Stand AMO). Arm64 is wrong, or
> > > they have a problem with the hardware.
> >
> > Sorry, I was unable to fully understand the reason here.
> >
> > You suggest that the PREFETCH.W was inserted on xchg_masked because it will
> > always switch the variable (no compare, blind CAS), but not on cmpxchg.
> >
> > Is this because cmpxchg will depend on a compare, and thus it does not
> > garantee a write? so it would be unwise to always prefetch cacheline
> Yes, it has a comparison, so a store may not exist there.
>
> > exclusiveness for this cpu, where shared state would be enough.
> > Is that correct?
> Yes, exclusiveness would invalidate other harts' cache lines.

I see.

I recall a previous discussion on computer arch which stated that any LR
would require to get a cacheline in exclusive state for lr/sc to work, but
I went through the RISC-V lr/sc documentation and could not find any info
about its cacheline behavior.

If this stands correct, the PREFETCH.W could be useful before every lr,
right?
(maybe that's the case for arm64 that you mentioned before)

Thanks!
Leo

>
> >
> > Thanks!
> > Leo
> >
> >
> > >
> > > >
> > > > Thanks!
> > > > Leo
> > > >
> > >
> > >
> > > --
> > > Best Regards
> > > Guo Ren
> > >
> >
>
>
> --
> Best Regards
> Guo Ren
>

2024-01-04 15:07:22

[permalink] [raw]

Subject: Re: Re: Re: [PATCH V2 1/3] riscv: Add Zicbop instruction definitions & cpufeature

On Thu, Jan 04, 2024 at 10:47:34AM +0100, Andrew Jones wrote:
> On Wed, Jan 03, 2024 at 05:33:41PM -0300, Leonardo Bras wrote:
> > On Wed, Jan 03, 2024 at 08:29:39PM +0100, Andrew Jones wrote:
> > > On Wed, Jan 03, 2024 at 03:52:00PM -0300, Leonardo Bras wrote:
> > > > On Sun, Dec 31, 2023 at 03:29:51AM -0500, [email protected] wrote:
> ...
> > > > The shifts seem correct for S-Type, but I would name the IMM defines in a
> > > > way we could understand where they fit in IMM:
> > > >
> > > >
> > > > INSN_S_SIMM5_SHIFT -> INSN_S_SIMM_0_4_SHIFT
> > > > INSN_S_SIMM7_SHIFT -> INSN_S_SIMM_5_11_SHIFT
> > > >
> > > > What do you think?
> > >
> > > I'm in favor of this suggestion, but then wonder if we don't need another
> > > patch before this which renames INSN_I_SIMM12_SHIFT to
> > > INSN_I_SIMM_0_11_SHIFT in order to keep things consistent.
> >
> > Agree. If it's ok, I can provide a patch doing the rename on top of this
> > patchset.
>
> The INSN_I change is only needed if we also take the new INSN_S shift
> macros, so I think the INSN_I change should be part of this series.

Ok then,

>
> BTW, I just noticed we wrote the numbers backwards. They should be
>
> INSN_I_SIMM_11_0_SHIFT
> INSN_S_SIMM_11_5_SHIFT
> INSN_S_SIMM_4_0_SHIFT
>

That's right, so it matches ISA documentation :)

> > > > >
> > > > > +#define CBO_PREFETCH_I(base, offset) \
> > > > > + INSN_S(OPCODE_OP_IMM, FUNC3(6), __RS2(0), \
> > > > > + SIMM12(offset), RS1(base))
> > > > > +
> > > > > +#define CBO_PREFETCH_R(base, offset) \
> > > > > + INSN_S(OPCODE_OP_IMM, FUNC3(6), __RS2(1), \
> > > > > + SIMM12(offset), RS1(base))
> > > > > +
> > > > > +#define CBO_PREFETCH_W(base, offset) \
> > > > > + INSN_S(OPCODE_OP_IMM, FUNC3(6), __RS2(3), \
> > > > > + SIMM12(offset), RS1(base))
> > > > > +
> > > >
> > > > For OP_IMM & FUNC3(6) we have ORI, right?
> > > > For ORI, rd will be at bytes 11:7, which in PREFETCH.{i,r,w} is
> > > > offset[4:0].
> > > >
> > > > IIUC, when the cpu does not support ZICBOP, this should be fine as long as
> > > > rd = 0, since changes to r0 are disregarded.
> > > >
> > > > In this case, we need to guarantee offset[4:0] = 0, or else we migth write
> > > > on an unrelated register. This can be noticed in ZICBOP documentation pages
> > > > 21, 22, 23, as offset[4:0] is always [0 0 0 0 0].
> > > > (Google docs in first comment)
> > > >
> > > > What we need here is something like:
> > > > + enum {
> > > > + PREFETCH_I,
> > > > + PREFETCH_R,
> > > > + PREFETCH_W,
> > > > + }
> > >
> > > Can't use enum. This header may be included in assembly.
> >
> > Oh, I suggest defines then, since it's better to make it clear instead of
> > using 0, 1, 3.
>
> I don't think we gain anything by adding another define in order to create
> the instruction define. We have to review the number sooner or later. I'd
> prefer we use the number inside the instruction define so we only need
> to look one place, which is also consistent with how we use FUNC fields.
>

Sorry, I was unable to understand the reasoning.

If we are going to review the numbers sooner or later, would not it be
better to have the instruction define to have "PREFETCH_W" instead of a
number, and a unified list of defines for instructions.

This way we don't need to look into the code for 0's 1's and 3's, but
instead just replace the number in the define list.

What am I missing?

> >
> > >
> > > > +
> > > > + #define CBO_PREFETCH(type, base, offset) \
> > > > + INSN_S(OPCODE_OP_IMM, FUNC3(6), __RS2(type), \
> > > > + SIMM12(offset & ~0x1f), RS1(base))
> > >
> > > Yes. I suggested we mask offset as well, but ideally we'd detect a caller
> > > using an offset with nonzero lower 5 bits at compile time.
> >
> > I would suggest the compiler would take care of this, but I am not sure
> > about the assembly, since I am not sure if it gets any optimization.
> >
> > I don't think we can detect a caller with non-zero offset at compile time,
> > since it will be used in locks which can be at (potentially) any place in
> > the block size. (if you have any idea though, please let me know :) )
> >
> > On the other hand, we could create a S-Type macro which deliberately
> > ignores imm[4:0], like
> >
> > + INSN_S_TRUNCATE(OPCODE_OP_IMM, FUNC3(6), __RS2(3), \
> > + SIMM12(offset), RS1(base))
> >
> > Which saves the bits 11:5 of offset into imm[11:5], and zero-fill
> > imm[4:0], like
> >
> > +#define DEFINE_INSN_S \
> > + __DEFINE_ASM_GPR_NUMS \
> > +" .macro insn_s, opcode, func3, rs2, simm12, rs1\n" \
> > +" .4byte ((\\opcode << " __stringify(INSN_S_OPCODE_SHIFT) ") |" \
> > +" (\\func3 << " __stringify(INSN_S_FUNC3_SHIFT) ") |" \
> > +" (.L__gpr_num_\\rs2 << " __stringify(INSN_S_RS2_SHIFT) ") |" \
> > +" (.L__gpr_num_\\rs1 << " __stringify(INSN_S_RS1_SHIFT) ") |" \
> > +" (((\\simm12 >> 5) & 0x7f) << " __stringify(INSN_S_SIMM7_SHIFT) "))\n" \
> > +" .endm\n"
> > +
> >
> > Does this make sense?
>
> If we create a special version of INSN_S, then I suggest we create one
> where its two SIMM fields are independent and then define prefetch
> instructions like this
>
> #define PREFETCH_W(base, offset) \
> INSN_S_SPLIT_IMM(OPCODE_OP_IMM, FUNC3(6), __RS2(3), \
> SIMM_11_5(offset >> 5), SIMM_4_0(0), RS1(base))
>
> which would allow simple review against the spec and potentially
> support other instructions which use hard coded values in the
> immediate fields.
>

I agree, it looks better this way.

We could have:
INSN_S_SPLIT_IMM(OPCODE, FUNC3, RS1, RS2, SIMM_11_5, SIMM_4_0)

and implement INSN_S like:
#define INSN_S(OPCODE, FUNC3, RS1, RS2, SIMM_11_0) \
INSN_S_SPLIT_IMM(OPCODE, FUNC3, RS1, RS2, \
SIMM_11_0 >> 5, SIMM_11_0 & 0x1f)

This would avoid extra instructions in asm while not having duplicated
code.

> But I'm not sure it's worth it. I think
>
> #define PREFETCH_W(base, offset) \
> INSN_S(OPCODE_OP_IMM, FUNC3(6), __RS2(3), \
> SIMM12(offset & ~0x1f), RS1(base))
>
> is also pretty easy to review against the spec and we don't have any other
> instructions yet with other requirements for the immediates.
>

It makes sense, but I think having INSN_S being implemented with
INSN_S_SPLIT_IMM like suggested above would allow us to have the
benefits of a split version without the code duplication.

> Thanks,
> drew
>

Thanks!
Leo

2024-01-04 16:42:09

[permalink] [raw]

Subject: Re: Re: Re: Re: [PATCH V2 1/3] riscv: Add Zicbop instruction definitions & cpufeature

On Thu, Jan 04, 2024 at 12:03:57PM -0300, Leonardo Bras wrote:
...
> > > > > What we need here is something like:
> > > > > + enum {
> > > > > + PREFETCH_I,
> > > > > + PREFETCH_R,
> > > > > + PREFETCH_W,
> > > > > + }
> > > >
> > > > Can't use enum. This header may be included in assembly.
> > >
> > > Oh, I suggest defines then, since it's better to make it clear instead of
> > > using 0, 1, 3.
> >
> > I don't think we gain anything by adding another define in order to create
> > the instruction define. We have to review the number sooner or later. I'd
> > prefer we use the number inside the instruction define so we only need
> > to look one place, which is also consistent with how we use FUNC fields.
> >
>
> Sorry, I was unable to understand the reasoning.
>
> If we are going to review the numbers sooner or later, would not it be
> better to have the instruction define to have "PREFETCH_W" instead of a
> number, and a unified list of defines for instructions.
>
> This way we don't need to look into the code for 0's 1's and 3's, but
> instead just replace the number in the define list.
>
> What am I missing?

PREFETCH_W isn't defined as just 3, it's defined as
INSN_S(OPCODE_OP_IMM, FUNC3(6), __RS2(3), ...)

Adding a define (PREFETCH_W_RS2?) for the 3 just bloats the code and
requires reviewers of PREFETCH_W to go look up another define.
OPCODE_OP_IMM gets a define because it's used in multiple instructions,
but everything else in an instruction definition should be a number
exactly matching the spec, making it easy to review, or be an argument
passed into the instruction macro.

>
> > >
> > > >
> > > > > +
> > > > > + #define CBO_PREFETCH(type, base, offset) \
> > > > > + INSN_S(OPCODE_OP_IMM, FUNC3(6), __RS2(type), \
> > > > > + SIMM12(offset & ~0x1f), RS1(base))
> > > >
> > > > Yes. I suggested we mask offset as well, but ideally we'd detect a caller
> > > > using an offset with nonzero lower 5 bits at compile time.
> > >
> > > I would suggest the compiler would take care of this, but I am not sure
> > > about the assembly, since I am not sure if it gets any optimization.
> > >
> > > I don't think we can detect a caller with non-zero offset at compile time,
> > > since it will be used in locks which can be at (potentially) any place in
> > > the block size. (if you have any idea though, please let me know :) )

I forgot to reply to this before. The reason I think it may be possible to
validate offset at compile time is because it must be a constant, i.e.
__builtin_constant_p(offset) must return true. So maybe something like

static_assert(__builtin_constant_p(offset) && !(offset & 0x1f))

I'll try to find time to play with it.

> > >
> > > On the other hand, we could create a S-Type macro which deliberately
> > > ignores imm[4:0], like
> > >
> > > + INSN_S_TRUNCATE(OPCODE_OP_IMM, FUNC3(6), __RS2(3), \
> > > + SIMM12(offset), RS1(base))
> > >
> > > Which saves the bits 11:5 of offset into imm[11:5], and zero-fill
> > > imm[4:0], like
> > >
> > > +#define DEFINE_INSN_S \
> > > + __DEFINE_ASM_GPR_NUMS \
> > > +" .macro insn_s, opcode, func3, rs2, simm12, rs1\n" \
> > > +" .4byte ((\\opcode << " __stringify(INSN_S_OPCODE_SHIFT) ") |" \
> > > +" (\\func3 << " __stringify(INSN_S_FUNC3_SHIFT) ") |" \
> > > +" (.L__gpr_num_\\rs2 << " __stringify(INSN_S_RS2_SHIFT) ") |" \
> > > +" (.L__gpr_num_\\rs1 << " __stringify(INSN_S_RS1_SHIFT) ") |" \
> > > +" (((\\simm12 >> 5) & 0x7f) << " __stringify(INSN_S_SIMM7_SHIFT) "))\n" \
> > > +" .endm\n"
> > > +
> > >
> > > Does this make sense?
> >
> > If we create a special version of INSN_S, then I suggest we create one
> > where its two SIMM fields are independent and then define prefetch
> > instructions like this
> >
> > #define PREFETCH_W(base, offset) \
> > INSN_S_SPLIT_IMM(OPCODE_OP_IMM, FUNC3(6), __RS2(3), \
> > SIMM_11_5(offset >> 5), SIMM_4_0(0), RS1(base))
> >
> > which would allow simple review against the spec and potentially
> > support other instructions which use hard coded values in the
> > immediate fields.
> >
>
> I agree, it looks better this way.
>
> We could have:
> INSN_S_SPLIT_IMM(OPCODE, FUNC3, RS1, RS2, SIMM_11_5, SIMM_4_0)
>
> and implement INSN_S like:
> #define INSN_S(OPCODE, FUNC3, RS1, RS2, SIMM_11_0) \
> INSN_S_SPLIT_IMM(OPCODE, FUNC3, RS1, RS2, \
> SIMM_11_0 >> 5, SIMM_11_0 & 0x1f)

That won't work since SIMM_11_0 will be a string. Actually, with
stringification in mind, I don't think defining INSN_S_SPLIT_IMM()
is a good idea.

Thanks,
drew

2024-01-04 17:43:30

[permalink] [raw]

Subject: Re: Re: Re: Re: [PATCH V2 1/3] riscv: Add Zicbop instruction definitions & cpufeature

On Thu, Jan 04, 2024 at 05:40:50PM +0100, Andrew Jones wrote:
> On Thu, Jan 04, 2024 at 12:03:57PM -0300, Leonardo Bras wrote:
> ...
> > > > > > What we need here is something like:
> > > > > > + enum {
> > > > > > + PREFETCH_I,
> > > > > > + PREFETCH_R,
> > > > > > + PREFETCH_W,
> > > > > > + }
> > > > >
> > > > > Can't use enum. This header may be included in assembly.
> > > >
> > > > Oh, I suggest defines then, since it's better to make it clear instead of
> > > > using 0, 1, 3.
> > >
> > > I don't think we gain anything by adding another define in order to create
> > > the instruction define. We have to review the number sooner or later. I'd
> > > prefer we use the number inside the instruction define so we only need
> > > to look one place, which is also consistent with how we use FUNC fields.
> > >
> >
> > Sorry, I was unable to understand the reasoning.
> >
> > If we are going to review the numbers sooner or later, would not it be
> > better to have the instruction define to have "PREFETCH_W" instead of a
> > number, and a unified list of defines for instructions.
> >
> > This way we don't need to look into the code for 0's 1's and 3's, but
> > instead just replace the number in the define list.
> >
> > What am I missing?
>
> PREFETCH_W isn't defined as just 3, it's defined as
> INSN_S(OPCODE_OP_IMM, FUNC3(6), __RS2(3), ...)
>
> Adding a define (PREFETCH_W_RS2?) for the 3 just bloats the code and
> requires reviewers of PREFETCH_W to go look up another define.
> OPCODE_OP_IMM gets a define because it's used in multiple instructions,
> but everything else in an instruction definition should be a number
> exactly matching the spec, making it easy to review, or be an argument
> passed into the instruction macro.

Ok, I see your point now.
It's fine by me, then.

>
> >
> > > >
> > > > >
> > > > > > +
> > > > > > + #define CBO_PREFETCH(type, base, offset) \
> > > > > > + INSN_S(OPCODE_OP_IMM, FUNC3(6), __RS2(type), \
> > > > > > + SIMM12(offset & ~0x1f), RS1(base))
> > > > >
> > > > > Yes. I suggested we mask offset as well, but ideally we'd detect a caller
> > > > > using an offset with nonzero lower 5 bits at compile time.
> > > >
> > > > I would suggest the compiler would take care of this, but I am not sure
> > > > about the assembly, since I am not sure if it gets any optimization.
> > > >
> > > > I don't think we can detect a caller with non-zero offset at compile time,
> > > > since it will be used in locks which can be at (potentially) any place in
> > > > the block size. (if you have any idea though, please let me know :) )
>
> I forgot to reply to this before. The reason I think it may be possible to
> validate offset at compile time is because it must be a constant, i.e.
> __builtin_constant_p(offset) must return true. So maybe something like
>
> static_assert(__builtin_constant_p(offset) && !(offset & 0x1f))
>
> I'll try to find time to play with it.
>

Let me know if you find anything.

> > > >
> > > > On the other hand, we could create a S-Type macro which deliberately
> > > > ignores imm[4:0], like
> > > >
> > > > + INSN_S_TRUNCATE(OPCODE_OP_IMM, FUNC3(6), __RS2(3), \
> > > > + SIMM12(offset), RS1(base))
> > > >
> > > > Which saves the bits 11:5 of offset into imm[11:5], and zero-fill
> > > > imm[4:0], like
> > > >
> > > > +#define DEFINE_INSN_S \
> > > > + __DEFINE_ASM_GPR_NUMS \
> > > > +" .macro insn_s, opcode, func3, rs2, simm12, rs1\n" \
> > > > +" .4byte ((\\opcode << " __stringify(INSN_S_OPCODE_SHIFT) ") |" \
> > > > +" (\\func3 << " __stringify(INSN_S_FUNC3_SHIFT) ") |" \
> > > > +" (.L__gpr_num_\\rs2 << " __stringify(INSN_S_RS2_SHIFT) ") |" \
> > > > +" (.L__gpr_num_\\rs1 << " __stringify(INSN_S_RS1_SHIFT) ") |" \
> > > > +" (((\\simm12 >> 5) & 0x7f) << " __stringify(INSN_S_SIMM7_SHIFT) "))\n" \
> > > > +" .endm\n"
> > > > +
> > > >
> > > > Does this make sense?
> > >
> > > If we create a special version of INSN_S, then I suggest we create one
> > > where its two SIMM fields are independent and then define prefetch
> > > instructions like this
> > >
> > > #define PREFETCH_W(base, offset) \
> > > INSN_S_SPLIT_IMM(OPCODE_OP_IMM, FUNC3(6), __RS2(3), \
> > > SIMM_11_5(offset >> 5), SIMM_4_0(0), RS1(base))
> > >
> > > which would allow simple review against the spec and potentially
> > > support other instructions which use hard coded values in the
> > > immediate fields.
> > >
> >
> > I agree, it looks better this way.
> >
> > We could have:
> > INSN_S_SPLIT_IMM(OPCODE, FUNC3, RS1, RS2, SIMM_11_5, SIMM_4_0)
> >
> > and implement INSN_S like:
> > #define INSN_S(OPCODE, FUNC3, RS1, RS2, SIMM_11_0) \
> > INSN_S_SPLIT_IMM(OPCODE, FUNC3, RS1, RS2, \
> > SIMM_11_0 >> 5, SIMM_11_0 & 0x1f)
>
> That won't work since SIMM_11_0 will be a string. Actually, with
> stringification in mind, I don't think defining INSN_S_SPLIT_IMM()
> is a good idea.

I don't see how SIMM_11_0 will be a string here. Is this due to using it
on asm code?

I understand a user will call
---
PREFETCH_W(base, offset), which becomes:

INSN_S(OPCODE_OP_IMM, 6, base, 3, offset) , which becomes:

INSN_S_SPLIT_IMM(OPCODE_OP_IMM, FUNC3(6), RS1(base), RS2(3), \
SIMM_11_5(offset >> 5), SIMM_4_0(offset & 0x1f))
---

I don't see an issue here, the same wouldwork for every INSN_S()

Now suppose we make PREFETCH_W use SPLIT_IMM directly:
---
PREFETCH_W(base, offset), which becomes:

INSN_S_SPLIT_IMM(OPCODE_OP_IMM, FUNC3(6), RS1(base), RS2(3), \
SIMM_11_5(offset >> 5), SIMM_4_0(0))
---

I don't see how stringification gets in the way.

Thanks!
Leo

#define CBO_PREFETCH(type, base, offset) \
> > > > > > + INSN_S(OPCODE_OP_IMM, FUNC3(6), __RS2(type), \
> > > > > > + SIMM12(offset & ~0x1f), RS1(base))

>
> Thanks,
> drew
>

2024-01-05 01:13:44

[permalink] [raw]

Subject: Re: [PATCH V2 3/3] riscv: xchg: Prefetch the destination word for sc.w

On Thu, Jan 4, 2024 at 10:17 PM Leonardo Bras <[email protected]> wrote:
>
> On Thu, Jan 04, 2024 at 04:14:27PM +0800, Guo Ren wrote:
> > On Thu, Jan 4, 2024 at 11:56 AM Leonardo Bras <[email protected]> wrote:
> > >
> > > On Thu, Jan 04, 2024 at 09:24:40AM +0800, Guo Ren wrote:
> > > > On Thu, Jan 4, 2024 at 3:45 AM Leonardo Bras <[email protected]> wrote:
> > > > >
> > > > > On Wed, Jan 03, 2024 at 02:15:45PM +0800, Guo Ren wrote:
> > > > > > On Tue, Jan 2, 2024 at 7:19 PM Andrew Jones <[email protected]> wrote:
> > > > > > >
> > > > > > > On Sun, Dec 31, 2023 at 03:29:53AM -0500, [email protected] wrote:
> > > > > > > > From: Guo Ren <[email protected]>
> > > > > > > >
> > > > > > > > The cost of changing a cacheline from shared to exclusive state can be
> > > > > > > > significant, especially when this is triggered by an exclusive store,
> > > > > > > > since it may result in having to retry the transaction.
> > > > > > > >
> > > > > > > > This patch makes use of prefetch.w to prefetch cachelines for write
> > > > > > > > prior to lr/sc loops when using the xchg_small atomic routine.
> > > > > > > >
> > > > > > > > This patch is inspired by commit: 0ea366f5e1b6 ("arm64: atomics:
> > > > > > > > prefetch the destination word for write prior to stxr").
> > > > > > > >
> > > > > > > > Signed-off-by: Guo Ren <[email protected]>
> > > > > > > > Signed-off-by: Guo Ren <[email protected]>
> > > > > > > > ---
> > > > > > > > arch/riscv/include/asm/cmpxchg.h | 4 +++-
> > > > > > > > 1 file changed, 3 insertions(+), 1 deletion(-)
> > > > > > > >
> > > > > > > > diff --git a/arch/riscv/include/asm/cmpxchg.h b/arch/riscv/include/asm/cmpxchg.h
> > > > > > > > index 26cea2395aae..d7b9d7951f08 100644
> > > > > > > > --- a/arch/riscv/include/asm/cmpxchg.h
> > > > > > > > +++ b/arch/riscv/include/asm/cmpxchg.h
> > > > > > > > @@ -10,6 +10,7 @@
> > > > > > > >
> > > > > > > > #include <asm/barrier.h>
> > > > > > > > #include <asm/fence.h>
> > > > > > > > +#include <asm/processor.h>
> > > > > > > >
> > > > > > > > #define __arch_xchg_masked(prepend, append, r, p, n) \
> > > > > > >
> > > > > > > Are you sure this is based on v6.7-rc7? Because I don't see this macro.
> > > > > > Oh, it is based on Leobras' patches. I would remove it in the next of version.
> > > > >
> > > > > I would say this next :)
> > > > Thx for the grammar correction.
> > >
> > > Oh, I was not intending to correct grammar.
> > > I just meant the next thing I would mention is that it was based on top of
> > > my patchset instead of v6.7-rc7:
> > >
> > > >
> > > > >
> > > > > >
> > > > > > >
> > > > > > > > ({ \
> > > > > > > > @@ -23,6 +24,7 @@
> > > > > > > > \
> > > > > > > > __asm__ __volatile__ ( \
> > > > > > > > prepend \
> > > > > > > > + PREFETCHW_ASM(%5) \
> > > > > > > > "0: lr.w %0, %2\n" \
> > > > > > > > " and %1, %0, %z4\n" \
> > > > > > > > " or %1, %1, %z3\n" \
> > > > > > > > @@ -30,7 +32,7 @@
> > > > > > > > " bnez %1, 0b\n" \
> > > > > > > > append \
> > > > > > > > : "=&r" (__retx), "=&r" (__rc), "+A" (*(__ptr32b)) \
> > > > > > > > - : "rJ" (__newx), "rJ" (~__mask) \
> > > > > > > > + : "rJ" (__newx), "rJ" (~__mask), "rJ" (__ptr32b) \
> > > > > > >
> > > > > > > I'm pretty sure we don't want to allow the J constraint for __ptr32b.
> > > > > > >
> > > > > > > > : "memory"); \
> > > > > > > > \
> > > > > > > > r = (__typeof__(*(p)))((__retx & __mask) >> __s); \
> > > > > > > > --
> > > > > > > > 2.40.1
> > > > > > > >
> > > > > > >
> > > > > > > Thanks,
> > > > > > > drew
> > > > > >
> > > > > >
> > > > > >
> > > > > > --
> > > > > > Best Regards
> > > > > > Guo Ren
> > > > > >
> > > > >
> > > > > Nice patch :)
> > > > > Any reason it's not needed in __arch_cmpxchg_masked(), and __arch_cmpxchg() ?
> > > > CAS is a conditional AMO, unlike xchg (Stand AMO). Arm64 is wrong, or
> > > > they have a problem with the hardware.
> > >
> > > Sorry, I was unable to fully understand the reason here.
> > >
> > > You suggest that the PREFETCH.W was inserted on xchg_masked because it will
> > > always switch the variable (no compare, blind CAS), but not on cmpxchg.
> > >
> > > Is this because cmpxchg will depend on a compare, and thus it does not
> > > garantee a write? so it would be unwise to always prefetch cacheline
> > Yes, it has a comparison, so a store may not exist there.
> >
> > > exclusiveness for this cpu, where shared state would be enough.
> > > Is that correct?
> > Yes, exclusiveness would invalidate other harts' cache lines.
>
> I see.
>
> I recall a previous discussion on computer arch which stated that any LR
> would require to get a cacheline in exclusive state for lr/sc to work, but
> I went through the RISC-V lr/sc documentation and could not find any info
> about its cacheline behavior.

No, lr couldn't get a cacheline in exclusive, that would break the ISA design.
Think about "lr + wfe" pair.

>
> If this stands correct, the PREFETCH.W could be useful before every lr,
> right?
> (maybe that's the case for arm64 that you mentioned before)
The arm64 "lr + sc" cmpxchg version is not good, don't follow that.
They are moving to the LSE's cas instruction.

>
> Thanks!
> Leo
>
> >
> > >
> > > Thanks!
> > > Leo
> > >
> > >
> > > >
> > > > >
> > > > > Thanks!
> > > > > Leo
> > > > >
> > > >
> > > >
> > > > --
> > > > Best Regards
> > > > Guo Ren
> > > >
> > >
> >
> >
> > --
> > Best Regards
> > Guo Ren
> >
>

--
Best Regards
Guo Ren

2024-01-05 13:25:14

[permalink] [raw]

Subject: Re: Re: Re: Re: Re: [PATCH V2 1/3] riscv: Add Zicbop instruction definitions & cpufeature

On Thu, Jan 04, 2024 at 02:43:04PM -0300, Leonardo Bras wrote:
...
> > > > > I don't think we can detect a caller with non-zero offset at compile time,
> > > > > since it will be used in locks which can be at (potentially) any place in
> > > > > the block size. (if you have any idea though, please let me know :) )
> >
> > I forgot to reply to this before. The reason I think it may be possible to
> > validate offset at compile time is because it must be a constant, i.e.
> > __builtin_constant_p(offset) must return true. So maybe something like
> >
> > static_assert(__builtin_constant_p(offset) && !(offset & 0x1f))
> >
> > I'll try to find time to play with it.
> >
>
> Let me know if you find anything.

There's nothing we can do in this file (insn-def.h), other than maybe
masking, since all magic must happen at preprocessor time, other than
a tiny bit of constant arithmetic allowed at assembly time. For C, using
a wrapper, like patch 2 of this series introduces, we could add the
static assert above. I'll suggest that in patch 2, since I've already
thought it through, but it sort of feels like overkill to me.

>
> > > > >
> > > > > On the other hand, we could create a S-Type macro which deliberately
> > > > > ignores imm[4:0], like
> > > > >
> > > > > + INSN_S_TRUNCATE(OPCODE_OP_IMM, FUNC3(6), __RS2(3), \
> > > > > + SIMM12(offset), RS1(base))
> > > > >
> > > > > Which saves the bits 11:5 of offset into imm[11:5], and zero-fill
> > > > > imm[4:0], like
> > > > >
> > > > > +#define DEFINE_INSN_S \
> > > > > + __DEFINE_ASM_GPR_NUMS \
> > > > > +" .macro insn_s, opcode, func3, rs2, simm12, rs1\n" \
> > > > > +" .4byte ((\\opcode << " __stringify(INSN_S_OPCODE_SHIFT) ") |" \
> > > > > +" (\\func3 << " __stringify(INSN_S_FUNC3_SHIFT) ") |" \
> > > > > +" (.L__gpr_num_\\rs2 << " __stringify(INSN_S_RS2_SHIFT) ") |" \
> > > > > +" (.L__gpr_num_\\rs1 << " __stringify(INSN_S_RS1_SHIFT) ") |" \
> > > > > +" (((\\simm12 >> 5) & 0x7f) << " __stringify(INSN_S_SIMM7_SHIFT) "))\n" \
> > > > > +" .endm\n"
> > > > > +
> > > > >
> > > > > Does this make sense?
> > > >
> > > > If we create a special version of INSN_S, then I suggest we create one
> > > > where its two SIMM fields are independent and then define prefetch
> > > > instructions like this
> > > >
> > > > #define PREFETCH_W(base, offset) \
> > > > INSN_S_SPLIT_IMM(OPCODE_OP_IMM, FUNC3(6), __RS2(3), \
> > > > SIMM_11_5(offset >> 5), SIMM_4_0(0), RS1(base))
> > > >
> > > > which would allow simple review against the spec and potentially
> > > > support other instructions which use hard coded values in the
> > > > immediate fields.
> > > >
> > >
> > > I agree, it looks better this way.
> > >
> > > We could have:
> > > INSN_S_SPLIT_IMM(OPCODE, FUNC3, RS1, RS2, SIMM_11_5, SIMM_4_0)
> > >
> > > and implement INSN_S like:
> > > #define INSN_S(OPCODE, FUNC3, RS1, RS2, SIMM_11_0) \
> > > INSN_S_SPLIT_IMM(OPCODE, FUNC3, RS1, RS2, \
> > > SIMM_11_0 >> 5, SIMM_11_0 & 0x1f)
> >
> > That won't work since SIMM_11_0 will be a string. Actually, with
> > stringification in mind, I don't think defining INSN_S_SPLIT_IMM()
> > is a good idea.
>
> I don't see how SIMM_11_0 will be a string here. Is this due to using it
> on asm code?
>
> I understand a user will call
> ---
> PREFETCH_W(base, offset), which becomes:
>
> INSN_S(OPCODE_OP_IMM, 6, base, 3, offset) , which becomes:
>
> INSN_S_SPLIT_IMM(OPCODE_OP_IMM, FUNC3(6), RS1(base), RS2(3), \
> SIMM_11_5(offset >> 5), SIMM_4_0(offset & 0x1f))

The other annotations, like SIMM12, stringify their arguments. So, if
SIMM_11_5 and SIMM_4_0 also stringified, then it wouldn't be possible
to recombine them into a simm12 for the '.insn s' directive. I suppose
SIMM_11_5 and SIMM_4_0 could just expand their arguments without
stringifying. With that, along with throwing even more ugly at it, then
it is possible to get the end result we want, which is

- PREFETCH_* instructions are defined with annotations and have a
SIMM_4_0(0) in their definitions to explicitly point out that field

- the INSN_S definition still directly maps to the .insn s directive

I got that to work with this

#define __RV_SIMM(v) v
#define RV___SIMM_11_5(v) __RV_SIMM(v)
#define RV___SIMM_4_0(v) __RV_SIMM(v)

#define __INSN_S_SPLIT_IMM(opcode, func3, rs2, simm12, rs1) \
INSN_S(opcode, func3, rs2, SIMM12(simm12), rs1)

#define INSN_S_SPLIT_IMM(opcode, func3, rs2, simm_11_5, simm_4_0, rs1) \
__INSN_S_SPLIT_IMM(opcode, func3, rs2, (RV_##simm_11_5 << 5) | RV_##simm_4_0, rs1)

#define CBO_PREFETCH_W(base, offset) \
INSN_S_SPLIT_IMM(OPCODE_OP_IMM, FUNC3(6), __RS2(3), \
__SIMM_11_5((offset) >> 5), __SIMM_4_0(0), RS1(base))

But, again, I feel like it's probably overkill...

Thanks,
drew

2024-01-05 13:31:51

[permalink] [raw]

Subject: Re: Re: [PATCH V2 2/3] riscv: Add ARCH_HAS_PRETCHW support with Zibop

On Tue, Jan 02, 2024 at 11:45:08AM +0100, Andrew Jones wrote:
>
> s/Zibop/Zicbop/ <<<$SUBJECT
>
> On Sun, Dec 31, 2023 at 03:29:52AM -0500, [email protected] wrote:
> > From: Guo Ren <[email protected]>
> >
> > Enable Linux prefetchw primitive with Zibop cpufeature, which preloads
>
> Also s/Zibop/Zicbop/ here
>
> > cache line into L1 cache for the next write operation.
> >
> > Signed-off-by: Guo Ren <[email protected]>
> > Signed-off-by: Guo Ren <[email protected]>
> > ---
> > arch/riscv/include/asm/processor.h | 16 ++++++++++++++++
> > 1 file changed, 16 insertions(+)
> >
> > diff --git a/arch/riscv/include/asm/processor.h b/arch/riscv/include/asm/processor.h
> > index f19f861cda54..8d3a2ab37678 100644
> > --- a/arch/riscv/include/asm/processor.h
> > +++ b/arch/riscv/include/asm/processor.h
> > @@ -13,6 +13,9 @@
> > #include <vdso/processor.h>
> >
> > #include <asm/ptrace.h>
> > +#include <asm/insn-def.h>
> > +#include <asm/alternative-macros.h>
> > +#include <asm/hwcap.h>
> >
> > #ifdef CONFIG_64BIT
> > #define DEFAULT_MAP_WINDOW (UL(1) << (MMAP_VA_BITS - 1))
> > @@ -106,6 +109,19 @@ static inline void arch_thread_struct_whitelist(unsigned long *offset,
> > #define KSTK_EIP(tsk) (task_pt_regs(tsk)->epc)
> > #define KSTK_ESP(tsk) (task_pt_regs(tsk)->sp)
> >
> > +#ifdef CONFIG_RISCV_ISA_ZICBOP
> > +#define ARCH_HAS_PREFETCHW
> > +
> > +#define PREFETCHW_ASM(x) \
> > + ALTERNATIVE(__nops(1), CBO_PREFETCH_W(x, 0), 0, \
> > + RISCV_ISA_EXT_ZICBOP, CONFIG_RISCV_ISA_ZICBOP)
> > +
> > +
> > +static inline void prefetchw(const void *x)
> > +{
> > + __asm__ __volatile__(PREFETCHW_ASM(%0) : : "r" (x) : "memory");
> > +}
>
> Shouldn't we create an interface which exposes the offset input of
> the instruction, allowing a sequence of calls to be unrolled? But
> I guess that could be put off until there's a need for it.

If we did expose offset, then, because it must be constant and also must
only have bits 5-11 set, then we could add a static assert. Something like

#define prefetchw_offset(base, offset) \
({ \
static_assert(__builtin_constant_p(offset) && !(offset & ~GENMASK(11, 5))); \
__asm__ __volatile__(PREFETCHW_ASM(%0, %1) : : "r" (x), "I" (offset) : "memory"); \
})

Probably overkill though...

Thanks,
drew

2024-01-08 14:34:48

[permalink] [raw]

Subject: Re: Re: Re: Re: Re: [PATCH V2 1/3] riscv: Add Zicbop instruction definitions & cpufeature

On Fri, Jan 05, 2024 at 02:24:45PM +0100, Andrew Jones wrote:
> On Thu, Jan 04, 2024 at 02:43:04PM -0300, Leonardo Bras wrote:
> ...
> > > > > > I don't think we can detect a caller with non-zero offset at compile time,
> > > > > > since it will be used in locks which can be at (potentially) any place in
> > > > > > the block size. (if you have any idea though, please let me know :) )
> > >
> > > I forgot to reply to this before. The reason I think it may be possible to
> > > validate offset at compile time is because it must be a constant, i.e.
> > > __builtin_constant_p(offset) must return true. So maybe something like
> > >
> > > static_assert(__builtin_constant_p(offset) && !(offset & 0x1f))
> > >
> > > I'll try to find time to play with it.
> > >
> >
> > Let me know if you find anything.
>
> There's nothing we can do in this file (insn-def.h), other than maybe
> masking, since all magic must happen at preprocessor time, other than
> a tiny bit of constant arithmetic allowed at assembly time. For C, using
> a wrapper, like patch 2 of this series introduces, we could add the
> static assert above. I'll suggest that in patch 2, since I've already
> thought it through, but it sort of feels like overkill to me.

It makes sense.

>
> >
> > > > > >
> > > > > > On the other hand, we could create a S-Type macro which deliberately
> > > > > > ignores imm[4:0], like
> > > > > >
> > > > > > + INSN_S_TRUNCATE(OPCODE_OP_IMM, FUNC3(6), __RS2(3), \
> > > > > > + SIMM12(offset), RS1(base))
> > > > > >
> > > > > > Which saves the bits 11:5 of offset into imm[11:5], and zero-fill
> > > > > > imm[4:0], like
> > > > > >
> > > > > > +#define DEFINE_INSN_S \
> > > > > > + __DEFINE_ASM_GPR_NUMS \
> > > > > > +" .macro insn_s, opcode, func3, rs2, simm12, rs1\n" \
> > > > > > +" .4byte ((\\opcode << " __stringify(INSN_S_OPCODE_SHIFT) ") |" \
> > > > > > +" (\\func3 << " __stringify(INSN_S_FUNC3_SHIFT) ") |" \
> > > > > > +" (.L__gpr_num_\\rs2 << " __stringify(INSN_S_RS2_SHIFT) ") |" \
> > > > > > +" (.L__gpr_num_\\rs1 << " __stringify(INSN_S_RS1_SHIFT) ") |" \
> > > > > > +" (((\\simm12 >> 5) & 0x7f) << " __stringify(INSN_S_SIMM7_SHIFT) "))\n" \
> > > > > > +" .endm\n"
> > > > > > +
> > > > > >
> > > > > > Does this make sense?
> > > > >
> > > > > If we create a special version of INSN_S, then I suggest we create one
> > > > > where its two SIMM fields are independent and then define prefetch
> > > > > instructions like this
> > > > >
> > > > > #define PREFETCH_W(base, offset) \
> > > > > INSN_S_SPLIT_IMM(OPCODE_OP_IMM, FUNC3(6), __RS2(3), \
> > > > > SIMM_11_5(offset >> 5), SIMM_4_0(0), RS1(base))
> > > > >
> > > > > which would allow simple review against the spec and potentially
> > > > > support other instructions which use hard coded values in the
> > > > > immediate fields.
> > > > >
> > > >
> > > > I agree, it looks better this way.
> > > >
> > > > We could have:
> > > > INSN_S_SPLIT_IMM(OPCODE, FUNC3, RS1, RS2, SIMM_11_5, SIMM_4_0)
> > > >
> > > > and implement INSN_S like:
> > > > #define INSN_S(OPCODE, FUNC3, RS1, RS2, SIMM_11_0) \
> > > > INSN_S_SPLIT_IMM(OPCODE, FUNC3, RS1, RS2, \
> > > > SIMM_11_0 >> 5, SIMM_11_0 & 0x1f)
> > >
> > > That won't work since SIMM_11_0 will be a string. Actually, with
> > > stringification in mind, I don't think defining INSN_S_SPLIT_IMM()
> > > is a good idea.
> >
> > I don't see how SIMM_11_0 will be a string here. Is this due to using it
> > on asm code?
> >
> > I understand a user will call
> > ---
> > PREFETCH_W(base, offset), which becomes:
> >
> > INSN_S(OPCODE_OP_IMM, 6, base, 3, offset) , which becomes:
> >
> > INSN_S_SPLIT_IMM(OPCODE_OP_IMM, FUNC3(6), RS1(base), RS2(3), \
> > SIMM_11_5(offset >> 5), SIMM_4_0(offset & 0x1f))
>
> The other annotations, like SIMM12, stringify their arguments. So, if
> SIMM_11_5 and SIMM_4_0 also stringified, then it wouldn't be possible
> to recombine them into a simm12 for the '.insn s' directive. I suppose
> SIMM_11_5 and SIMM_4_0 could just expand their arguments without
> stringifying. With that, along with throwing even more ugly at it, then
> it is possible to get the end result we want, which is
>
> - PREFETCH_* instructions are defined with annotations and have a
> SIMM_4_0(0) in their definitions to explicitly point out that field
>
> - the INSN_S definition still directly maps to the .insn s directive
>
>
> I got that to work with this
>
> #define __RV_SIMM(v) v
> #define RV___SIMM_11_5(v) __RV_SIMM(v)
> #define RV___SIMM_4_0(v) __RV_SIMM(v)
>
> #define __INSN_S_SPLIT_IMM(opcode, func3, rs2, simm12, rs1) \
> INSN_S(opcode, func3, rs2, SIMM12(simm12), rs1)
>
> #define INSN_S_SPLIT_IMM(opcode, func3, rs2, simm_11_5, simm_4_0, rs1) \
> __INSN_S_SPLIT_IMM(opcode, func3, rs2, (RV_##simm_11_5 << 5) | RV_##simm_4_0, rs1)
>
> #define CBO_PREFETCH_W(base, offset) \
> INSN_S_SPLIT_IMM(OPCODE_OP_IMM, FUNC3(6), __RS2(3), \
> __SIMM_11_5((offset) >> 5), __SIMM_4_0(0), RS1(base))
>
>
> But, again, I feel like it's probably overkill...

I though our intention was to avoid the extra IMM masking in asm, while
keeping the 5 lower bits zeroed at all times.

But IIUC here you are writing a insn_s_split_imm in terms of a plain
insn_s, which guarantees the zeroed 5 lower bits but still does an
unnecessaty masking in asm. In fact, if we use the split version, it
concatenates the two IMM parts, to then split them again in order to build
the instruction.

In my suggestion above, we make INSN_S_SPLIT_IMM() the helper / standard
way to write an s-type and write a INSN_S() in terms of the SPLIT version.

This allows using the split version when we don't need one of the halfs,
thus avoiding a masking or a rotation. The full version just splits the
halfs and pass to the split version that directly builds the instruction.

Well, I think I was under the wrong impression that we wanted to avoid the
rotation and masking, but I just noticed that we are just dealing with
splitting the offset, which is supposed to be a constant during the
generation of the instruction, so we can just guarantee the value being
masked at no runtime cost.

So in the end we are just thinking on how it could look better to the user,
and maybe the split version is unnecessary if the user guarantees the
masking to be correct. But if we are going to have it, I suggest we do
INSN_S in terms of INSN_S_SPLIT_IMM() instead of the other way around.

Thanks!
Leo

>
> Thanks,
> drew
>

2024-01-08 15:25:43

[permalink] [raw]

Subject: Re: Re: Re: Re: Re: Re: [PATCH V2 1/3] riscv: Add Zicbop instruction definitions & cpufeature

On Mon, Jan 08, 2024 at 11:34:15AM -0300, Leonardo Bras wrote:
> On Fri, Jan 05, 2024 at 02:24:45PM +0100, Andrew Jones wrote:
> > On Thu, Jan 04, 2024 at 02:43:04PM -0300, Leonardo Bras wrote:
> > ...
> > > > > > > I don't think we can detect a caller with non-zero offset at compile time,
> > > > > > > since it will be used in locks which can be at (potentially) any place in
> > > > > > > the block size. (if you have any idea though, please let me know :) )
> > > >
> > > > I forgot to reply to this before. The reason I think it may be possible to
> > > > validate offset at compile time is because it must be a constant, i.e.
> > > > __builtin_constant_p(offset) must return true. So maybe something like
> > > >
> > > > static_assert(__builtin_constant_p(offset) && !(offset & 0x1f))
> > > >
> > > > I'll try to find time to play with it.
> > > >
> > >
> > > Let me know if you find anything.
> >
> > There's nothing we can do in this file (insn-def.h), other than maybe
> > masking, since all magic must happen at preprocessor time, other than
> > a tiny bit of constant arithmetic allowed at assembly time. For C, using
> > a wrapper, like patch 2 of this series introduces, we could add the
> > static assert above. I'll suggest that in patch 2, since I've already
> > thought it through, but it sort of feels like overkill to me.
>
> It makes sense.
>
> >
> > >
> > > > > > >
> > > > > > > On the other hand, we could create a S-Type macro which deliberately
> > > > > > > ignores imm[4:0], like
> > > > > > >
> > > > > > > + INSN_S_TRUNCATE(OPCODE_OP_IMM, FUNC3(6), __RS2(3), \
> > > > > > > + SIMM12(offset), RS1(base))
> > > > > > >
> > > > > > > Which saves the bits 11:5 of offset into imm[11:5], and zero-fill
> > > > > > > imm[4:0], like
> > > > > > >
> > > > > > > +#define DEFINE_INSN_S \
> > > > > > > + __DEFINE_ASM_GPR_NUMS \
> > > > > > > +" .macro insn_s, opcode, func3, rs2, simm12, rs1\n" \
> > > > > > > +" .4byte ((\\opcode << " __stringify(INSN_S_OPCODE_SHIFT) ") |" \
> > > > > > > +" (\\func3 << " __stringify(INSN_S_FUNC3_SHIFT) ") |" \
> > > > > > > +" (.L__gpr_num_\\rs2 << " __stringify(INSN_S_RS2_SHIFT) ") |" \
> > > > > > > +" (.L__gpr_num_\\rs1 << " __stringify(INSN_S_RS1_SHIFT) ") |" \
> > > > > > > +" (((\\simm12 >> 5) & 0x7f) << " __stringify(INSN_S_SIMM7_SHIFT) "))\n" \
> > > > > > > +" .endm\n"
> > > > > > > +
> > > > > > >
> > > > > > > Does this make sense?
> > > > > >
> > > > > > If we create a special version of INSN_S, then I suggest we create one
> > > > > > where its two SIMM fields are independent and then define prefetch
> > > > > > instructions like this
> > > > > >
> > > > > > #define PREFETCH_W(base, offset) \
> > > > > > INSN_S_SPLIT_IMM(OPCODE_OP_IMM, FUNC3(6), __RS2(3), \
> > > > > > SIMM_11_5(offset >> 5), SIMM_4_0(0), RS1(base))
> > > > > >
> > > > > > which would allow simple review against the spec and potentially
> > > > > > support other instructions which use hard coded values in the
> > > > > > immediate fields.
> > > > > >
> > > > >
> > > > > I agree, it looks better this way.
> > > > >
> > > > > We could have:
> > > > > INSN_S_SPLIT_IMM(OPCODE, FUNC3, RS1, RS2, SIMM_11_5, SIMM_4_0)
> > > > >
> > > > > and implement INSN_S like:
> > > > > #define INSN_S(OPCODE, FUNC3, RS1, RS2, SIMM_11_0) \
> > > > > INSN_S_SPLIT_IMM(OPCODE, FUNC3, RS1, RS2, \
> > > > > SIMM_11_0 >> 5, SIMM_11_0 & 0x1f)
> > > >
> > > > That won't work since SIMM_11_0 will be a string. Actually, with
> > > > stringification in mind, I don't think defining INSN_S_SPLIT_IMM()
> > > > is a good idea.
> > >
> > > I don't see how SIMM_11_0 will be a string here. Is this due to using it
> > > on asm code?
> > >
> > > I understand a user will call
> > > ---
> > > PREFETCH_W(base, offset), which becomes:
> > >
> > > INSN_S(OPCODE_OP_IMM, 6, base, 3, offset) , which becomes:
> > >
> > > INSN_S_SPLIT_IMM(OPCODE_OP_IMM, FUNC3(6), RS1(base), RS2(3), \
> > > SIMM_11_5(offset >> 5), SIMM_4_0(offset & 0x1f))
> >
> > The other annotations, like SIMM12, stringify their arguments. So, if
> > SIMM_11_5 and SIMM_4_0 also stringified, then it wouldn't be possible
> > to recombine them into a simm12 for the '.insn s' directive. I suppose
> > SIMM_11_5 and SIMM_4_0 could just expand their arguments without
> > stringifying. With that, along with throwing even more ugly at it, then
> > it is possible to get the end result we want, which is
> >
> > - PREFETCH_* instructions are defined with annotations and have a
> > SIMM_4_0(0) in their definitions to explicitly point out that field
> >
> > - the INSN_S definition still directly maps to the .insn s directive
> >
> >
> > I got that to work with this
> >
> > #define __RV_SIMM(v) v
> > #define RV___SIMM_11_5(v) __RV_SIMM(v)
> > #define RV___SIMM_4_0(v) __RV_SIMM(v)
> >
> > #define __INSN_S_SPLIT_IMM(opcode, func3, rs2, simm12, rs1) \
> > INSN_S(opcode, func3, rs2, SIMM12(simm12), rs1)
> >
> > #define INSN_S_SPLIT_IMM(opcode, func3, rs2, simm_11_5, simm_4_0, rs1) \
> > __INSN_S_SPLIT_IMM(opcode, func3, rs2, (RV_##simm_11_5 << 5) | RV_##simm_4_0, rs1)
> >
> > #define CBO_PREFETCH_W(base, offset) \
> > INSN_S_SPLIT_IMM(OPCODE_OP_IMM, FUNC3(6), __RS2(3), \
> > __SIMM_11_5((offset) >> 5), __SIMM_4_0(0), RS1(base))
> >
> >
> > But, again, I feel like it's probably overkill...
>
> I though our intention was to avoid the extra IMM masking in asm, while
> keeping the 5 lower bits zeroed at all times.
>
> But IIUC here you are writing a insn_s_split_imm in terms of a plain
> insn_s, which guarantees the zeroed 5 lower bits but still does an
> unnecessaty masking in asm. In fact, if we use the split version, it
> concatenates the two IMM parts, to then split them again in order to build
> the instruction.

That's backwards.

INSN_S should map to its namesake directive '.insn s', which takes the
immediate as a single simm12. simm7 and simm5 are only used in the
fallback path. Also, it's likely few instructions care about the split.
Other S-type instructions would want to treat the immediate as 12 bits,
so INSN_S should not be written in terms of INSN_S_SPLIT_IMM, since we'd
split a 12-bit immediate for those instructions just to have it merged
again for the .insn s directive.

>
> In my suggestion above, we make INSN_S_SPLIT_IMM() the helper / standard
> way to write an s-type and write a INSN_S() in terms of the SPLIT version.
>
> This allows using the split version when we don't need one of the halfs,
> thus avoiding a masking or a rotation. The full version just splits the
> halfs and pass to the split version that directly builds the instruction.
>
> Well, I think I was under the wrong impression that we wanted to avoid the
> rotation and masking, but I just noticed that we are just dealing with
> splitting the offset, which is supposed to be a constant during the
> generation of the instruction, so we can just guarantee the value being
> masked at no runtime cost.

Right, there's no reason to avoid the rotation and masking with respect to
performance, as it's all done at compile time. Avoiding the operations as
much as possible is nice, though, since they're ugly and, with macro
arguments, getting the parentheses right to ensure the correct order of
operations is error prone.

>
> So in the end we are just thinking on how it could look better to the user,
> and maybe the split version is unnecessary if the user guarantees the
> masking to be correct. But if we are going to have it, I suggest we do
> INSN_S in terms of INSN_S_SPLIT_IMM() instead of the other way around.

Yes, the goal was to have __SIM_4_0(0) for the prefetch instructions to
make it simple to review against the spec, but ((offset) & ~0x1f) is also
simple to review.

For the next revision of this series, I'd be happy with just the masking
or even to just leave it up to the callers as this version does.

Thanks,
drew

2024-01-08 16:14:54

[permalink] [raw]

Subject: Re: Re: Re: Re: Re: Re: [PATCH V2 1/3] riscv: Add Zicbop instruction definitions & cpufeature

On Mon, Jan 08, 2024 at 04:24:59PM +0100, Andrew Jones wrote:
> On Mon, Jan 08, 2024 at 11:34:15AM -0300, Leonardo Bras wrote:
> > On Fri, Jan 05, 2024 at 02:24:45PM +0100, Andrew Jones wrote:
> > > On Thu, Jan 04, 2024 at 02:43:04PM -0300, Leonardo Bras wrote:
> > > ...
> > > > > > > > I don't think we can detect a caller with non-zero offset at compile time,
> > > > > > > > since it will be used in locks which can be at (potentially) any place in
> > > > > > > > the block size. (if you have any idea though, please let me know :) )
> > > > >
> > > > > I forgot to reply to this before. The reason I think it may be possible to
> > > > > validate offset at compile time is because it must be a constant, i.e.
> > > > > __builtin_constant_p(offset) must return true. So maybe something like
> > > > >
> > > > > static_assert(__builtin_constant_p(offset) && !(offset & 0x1f))
> > > > >
> > > > > I'll try to find time to play with it.
> > > > >
> > > >
> > > > Let me know if you find anything.
> > >
> > > There's nothing we can do in this file (insn-def.h), other than maybe
> > > masking, since all magic must happen at preprocessor time, other than
> > > a tiny bit of constant arithmetic allowed at assembly time. For C, using
> > > a wrapper, like patch 2 of this series introduces, we could add the
> > > static assert above. I'll suggest that in patch 2, since I've already
> > > thought it through, but it sort of feels like overkill to me.
> >
> > It makes sense.
> >
> > >
> > > >
> > > > > > > >
> > > > > > > > On the other hand, we could create a S-Type macro which deliberately
> > > > > > > > ignores imm[4:0], like
> > > > > > > >
> > > > > > > > + INSN_S_TRUNCATE(OPCODE_OP_IMM, FUNC3(6), __RS2(3), \
> > > > > > > > + SIMM12(offset), RS1(base))
> > > > > > > >
> > > > > > > > Which saves the bits 11:5 of offset into imm[11:5], and zero-fill
> > > > > > > > imm[4:0], like
> > > > > > > >
> > > > > > > > +#define DEFINE_INSN_S \
> > > > > > > > + __DEFINE_ASM_GPR_NUMS \
> > > > > > > > +" .macro insn_s, opcode, func3, rs2, simm12, rs1\n" \
> > > > > > > > +" .4byte ((\\opcode << " __stringify(INSN_S_OPCODE_SHIFT) ") |" \
> > > > > > > > +" (\\func3 << " __stringify(INSN_S_FUNC3_SHIFT) ") |" \
> > > > > > > > +" (.L__gpr_num_\\rs2 << " __stringify(INSN_S_RS2_SHIFT) ") |" \
> > > > > > > > +" (.L__gpr_num_\\rs1 << " __stringify(INSN_S_RS1_SHIFT) ") |" \
> > > > > > > > +" (((\\simm12 >> 5) & 0x7f) << " __stringify(INSN_S_SIMM7_SHIFT) "))\n" \
> > > > > > > > +" .endm\n"
> > > > > > > > +
> > > > > > > >
> > > > > > > > Does this make sense?
> > > > > > >
> > > > > > > If we create a special version of INSN_S, then I suggest we create one
> > > > > > > where its two SIMM fields are independent and then define prefetch
> > > > > > > instructions like this
> > > > > > >
> > > > > > > #define PREFETCH_W(base, offset) \
> > > > > > > INSN_S_SPLIT_IMM(OPCODE_OP_IMM, FUNC3(6), __RS2(3), \
> > > > > > > SIMM_11_5(offset >> 5), SIMM_4_0(0), RS1(base))
> > > > > > >
> > > > > > > which would allow simple review against the spec and potentially
> > > > > > > support other instructions which use hard coded values in the
> > > > > > > immediate fields.
> > > > > > >
> > > > > >
> > > > > > I agree, it looks better this way.
> > > > > >
> > > > > > We could have:
> > > > > > INSN_S_SPLIT_IMM(OPCODE, FUNC3, RS1, RS2, SIMM_11_5, SIMM_4_0)
> > > > > >
> > > > > > and implement INSN_S like:
> > > > > > #define INSN_S(OPCODE, FUNC3, RS1, RS2, SIMM_11_0) \
> > > > > > INSN_S_SPLIT_IMM(OPCODE, FUNC3, RS1, RS2, \
> > > > > > SIMM_11_0 >> 5, SIMM_11_0 & 0x1f)
> > > > >
> > > > > That won't work since SIMM_11_0 will be a string. Actually, with
> > > > > stringification in mind, I don't think defining INSN_S_SPLIT_IMM()
> > > > > is a good idea.
> > > >
> > > > I don't see how SIMM_11_0 will be a string here. Is this due to using it
> > > > on asm code?
> > > >
> > > > I understand a user will call
> > > > ---
> > > > PREFETCH_W(base, offset), which becomes:
> > > >
> > > > INSN_S(OPCODE_OP_IMM, 6, base, 3, offset) , which becomes:
> > > >
> > > > INSN_S_SPLIT_IMM(OPCODE_OP_IMM, FUNC3(6), RS1(base), RS2(3), \
> > > > SIMM_11_5(offset >> 5), SIMM_4_0(offset & 0x1f))
> > >
> > > The other annotations, like SIMM12, stringify their arguments. So, if
> > > SIMM_11_5 and SIMM_4_0 also stringified, then it wouldn't be possible
> > > to recombine them into a simm12 for the '.insn s' directive. I suppose
> > > SIMM_11_5 and SIMM_4_0 could just expand their arguments without
> > > stringifying. With that, along with throwing even more ugly at it, then
> > > it is possible to get the end result we want, which is
> > >
> > > - PREFETCH_* instructions are defined with annotations and have a
> > > SIMM_4_0(0) in their definitions to explicitly point out that field
> > >
> > > - the INSN_S definition still directly maps to the .insn s directive
> > >
> > >
> > > I got that to work with this
> > >
> > > #define __RV_SIMM(v) v
> > > #define RV___SIMM_11_5(v) __RV_SIMM(v)
> > > #define RV___SIMM_4_0(v) __RV_SIMM(v)
> > >
> > > #define __INSN_S_SPLIT_IMM(opcode, func3, rs2, simm12, rs1) \
> > > INSN_S(opcode, func3, rs2, SIMM12(simm12), rs1)
> > >
> > > #define INSN_S_SPLIT_IMM(opcode, func3, rs2, simm_11_5, simm_4_0, rs1) \
> > > __INSN_S_SPLIT_IMM(opcode, func3, rs2, (RV_##simm_11_5 << 5) | RV_##simm_4_0, rs1)
> > >
> > > #define CBO_PREFETCH_W(base, offset) \
> > > INSN_S_SPLIT_IMM(OPCODE_OP_IMM, FUNC3(6), __RS2(3), \
> > > __SIMM_11_5((offset) >> 5), __SIMM_4_0(0), RS1(base))
> > >
> > >
> > > But, again, I feel like it's probably overkill...
> >
> > I though our intention was to avoid the extra IMM masking in asm, while
> > keeping the 5 lower bits zeroed at all times.
> >
> > But IIUC here you are writing a insn_s_split_imm in terms of a plain
> > insn_s, which guarantees the zeroed 5 lower bits but still does an
> > unnecessaty masking in asm. In fact, if we use the split version, it
> > concatenates the two IMM parts, to then split them again in order to build
> > the instruction.
>
> That's backwards.
>
> INSN_S should map to its namesake directive '.insn s', which takes the
> immediate as a single simm12. simm7 and simm5 are only used in the
> fallback path.

Wait, but is not the final instruction build with a split simm7 and simm5 ?

Oh, ok, we are probably creating an asm with mnemonics first, and leaving
it to the assembler to split the simm7 and simm5 from simm12. It does make
sense.

Sorry for the confusion, I was wrongly thinking on terms of final bytecode.

> Also, it's likely few instructions care about the split.
> Other S-type instructions would want to treat the immediate as 12 bits,
> so INSN_S should not be written in terms of INSN_S_SPLIT_IMM, since we'd
> split a 12-bit immediate for those instructions just to have it merged
> again for the .insn s directive.

Yeah, makes sense now.

>
> >
> > In my suggestion above, we make INSN_S_SPLIT_IMM() the helper / standard
> > way to write an s-type and write a INSN_S() in terms of the SPLIT version.
> >
> > This allows using the split version when we don't need one of the halfs,
> > thus avoiding a masking or a rotation. The full version just splits the
> > halfs and pass to the split version that directly builds the instruction.
> >
> > Well, I think I was under the wrong impression that we wanted to avoid the
> > rotation and masking, but I just noticed that we are just dealing with
> > splitting the offset, which is supposed to be a constant during the
> > generation of the instruction, so we can just guarantee the value being
> > masked at no runtime cost.
>
> Right, there's no reason to avoid the rotation and masking with respect to
> performance, as it's all done at compile time. Avoiding the operations as
> much as possible is nice, though, since they're ugly and, with macro
> arguments, getting the parentheses right to ensure the correct order of
> operations is error prone.
>
> >
> > So in the end we are just thinking on how it could look better to the user,
> > and maybe the split version is unnecessary if the user guarantees the
> > masking to be correct. But if we are going to have it, I suggest we do
> > INSN_S in terms of INSN_S_SPLIT_IMM() instead of the other way around.
>
> Yes, the goal was to have __SIM_4_0(0) for the prefetch instructions to
> make it simple to review against the spec, but ((offset) & ~0x1f) is also
> simple to review.

Agreed

>
> For the next revision of this series, I'd be happy with just the masking
> or even to just leave it up to the callers as this version does.
>
> Thanks,
> drew
>

Thank you!
Leo

2024-01-11 10:31:45

by Clément Léger

[permalink] [raw]

Subject: Re: [PATCH V2 1/3] riscv: Add Zicbop instruction definitions & cpufeature

On 03/01/2024 13:00, Andrew Jones wrote:
> On Wed, Jan 03, 2024 at 10:31:37AM +0100, Clément Léger wrote:
>>
>>
>> On 31/12/2023 09:29, [email protected] wrote:
>>> From: Guo Ren <[email protected]>
>>>
>>> Cache-block prefetch instructions are HINTs to the hardware to
>>> indicate that software intends to perform a particular type of
>>> memory access in the near future. This patch adds prefetch.i,
>>> prefetch.r and prefetch.w instruction definitions by
>>> RISCV_ISA_EXT_ZICBOP cpufeature.
>>>
>>> Signed-off-by: Guo Ren <[email protected]>
>>> Signed-off-by: Guo Ren <[email protected]>
>>> ---
>>> arch/riscv/Kconfig | 15 ++++++++
>>> arch/riscv/include/asm/hwcap.h | 1 +
>>> arch/riscv/include/asm/insn-def.h | 60 +++++++++++++++++++++++++++++++
>>> arch/riscv/kernel/cpufeature.c | 1 +
>>> 4 files changed, 77 insertions(+)
>>>
>>> diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
>>> index 24c1799e2ec4..fcbd417d65ea 100644
>>> --- a/arch/riscv/Kconfig
>>> +++ b/arch/riscv/Kconfig
>>> @@ -579,6 +579,21 @@ config RISCV_ISA_ZICBOZ
>>>
>>> If you don't know what to do here, say Y.
>>>
>>> +config RISCV_ISA_ZICBOP
>>> + bool "Zicbop extension support for cache block prefetch"
>>> + depends on MMU
>>> + depends on RISCV_ALTERNATIVE
>>> + default y
>>> + help
>>> + Adds support to dynamically detect the presence of the ZICBOP
>>> + extension (Cache Block Prefetch Operations) and enable its
>>> + usage.
>>> +
>>> + The Zicbop extension can be used to prefetch cache block for
>>> + read/write fetch.
>>> +
>>> + If you don't know what to do here, say Y.
>>> +
>>> config TOOLCHAIN_HAS_ZIHINTPAUSE
>>> bool
>>> default y
>>> diff --git a/arch/riscv/include/asm/hwcap.h b/arch/riscv/include/asm/hwcap.h
>>> index 06d30526ef3b..77d3b6ee25ab 100644
>>> --- a/arch/riscv/include/asm/hwcap.h
>>> +++ b/arch/riscv/include/asm/hwcap.h
>>> @@ -57,6 +57,7 @@
>>> #define RISCV_ISA_EXT_ZIHPM 42
>>> #define RISCV_ISA_EXT_SMSTATEEN 43
>>> #define RISCV_ISA_EXT_ZICOND 44
>>> +#define RISCV_ISA_EXT_ZICBOP 45
>>
>> Hi Guo,
>>
>> Since you are adding support for the Zicbop extension, you could
>> probably also allow to probe it from userspace using hwprobe. Add a few
>> definitions to sys_riscv.c/hwprobe.h and it will be fine.
>
> To expose to userspace, we should also start parsing the block size,
> so it can also be exposed to userspace. Starting to parse the block
> size first requires that we decide we need to parse the block size
> (see [1]).

Hi Andrew, thanks for the thread.

I read it (and the other ones that are related to it) and basically, it
seems there was a first decision (expose Zicbop block size indivudally)
due to the fact the specification did not mentioned anything specific
about clock sizes but then after that, there was a clarification in the
spec stating that Zicbop and Zicbom have the same block size so the
first decision was questioned again.

From a user coherency point of view, I think it would make more sense to
expose it individually in hwprobe so that zicboz, zicbop and zicbom
have their "own" block size (even though zicbop and zicbom would use the
same one). Moreover, it would allow us for future evolution easily
without breaking any userspace later if zicbop and zicbom block size are
decoupled.

Clément

>
> [1] https://lore.kernel.org/all/[email protected]/
>
> Thanks,
> drew
>
>
>>
>> Thanks,
>>
>> Clément
>>
>>>
>>> #define RISCV_ISA_EXT_MAX 64
>>>
>>> diff --git a/arch/riscv/include/asm/insn-def.h b/arch/riscv/include/asm/insn-def.h
>>> index e27179b26086..bbda350a63bf 100644
>>> --- a/arch/riscv/include/asm/insn-def.h
>>> +++ b/arch/riscv/include/asm/insn-def.h
>>> @@ -18,6 +18,13 @@
>>> #define INSN_I_RD_SHIFT 7
>>> #define INSN_I_OPCODE_SHIFT 0
>>>
>>> +#define INSN_S_SIMM7_SHIFT 25
>>> +#define INSN_S_RS2_SHIFT 20
>>> +#define INSN_S_RS1_SHIFT 15
>>> +#define INSN_S_FUNC3_SHIFT 12
>>> +#define INSN_S_SIMM5_SHIFT 7
>>> +#define INSN_S_OPCODE_SHIFT 0
>>> +
>>> #ifdef __ASSEMBLY__
>>>
>>> #ifdef CONFIG_AS_HAS_INSN
>>> @@ -30,6 +37,10 @@
>>> .insn i \opcode, \func3, \rd, \rs1, \simm12
>>> .endm
>>>
>>> + .macro insn_s, opcode, func3, rs2, simm12, rs1
>>> + .insn s \opcode, \func3, \rs2, \simm12(\rs1)
>>> + .endm
>>> +
>>> #else
>>>
>>> #include <asm/gpr-num.h>
>>> @@ -51,10 +62,20 @@
>>> (\simm12 << INSN_I_SIMM12_SHIFT))
>>> .endm
>>>
>>> + .macro insn_s, opcode, func3, rs2, simm12, rs1
>>> + .4byte ((\opcode << INSN_S_OPCODE_SHIFT) | \
>>> + (\func3 << INSN_S_FUNC3_SHIFT) | \
>>> + (.L__gpr_num_\rs2 << INSN_S_RS2_SHIFT) | \
>>> + (.L__gpr_num_\rs1 << INSN_S_RS1_SHIFT) | \
>>> + ((\simm12 & 0x1f) << INSN_S_SIMM5_SHIFT) | \
>>> + (((\simm12 >> 5) & 0x7f) << INSN_S_SIMM7_SHIFT))
>>> + .endm
>>> +
>>> #endif
>>>
>>> #define __INSN_R(...) insn_r __VA_ARGS__
>>> #define __INSN_I(...) insn_i __VA_ARGS__
>>> +#define __INSN_S(...) insn_s __VA_ARGS__
>>>
>>> #else /* ! __ASSEMBLY__ */
>>>
>>> @@ -66,6 +87,9 @@
>>> #define __INSN_I(opcode, func3, rd, rs1, simm12) \
>>> ".insn i " opcode ", " func3 ", " rd ", " rs1 ", " simm12 "\n"
>>>
>>> +#define __INSN_S(opcode, func3, rs2, simm12, rs1) \
>>> + ".insn s " opcode ", " func3 ", " rs2 ", " simm12 "(" rs1 ")\n"
>>> +
>>> #else
>>>
>>> #include <linux/stringify.h>
>>> @@ -92,12 +116,26 @@
>>> " (\\simm12 << " __stringify(INSN_I_SIMM12_SHIFT) "))\n" \
>>> " .endm\n"
>>>
>>> +#define DEFINE_INSN_S \
>>> + __DEFINE_ASM_GPR_NUMS \
>>> +" .macro insn_s, opcode, func3, rs2, simm12, rs1\n" \
>>> +" .4byte ((\\opcode << " __stringify(INSN_S_OPCODE_SHIFT) ") |" \
>>> +" (\\func3 << " __stringify(INSN_S_FUNC3_SHIFT) ") |" \
>>> +" (.L__gpr_num_\\rs2 << " __stringify(INSN_S_RS2_SHIFT) ") |" \
>>> +" (.L__gpr_num_\\rs1 << " __stringify(INSN_S_RS1_SHIFT) ") |" \
>>> +" ((\\simm12 & 0x1f) << " __stringify(INSN_S_SIMM5_SHIFT) ") |" \
>>> +" (((\\simm12 >> 5) & 0x7f) << " __stringify(INSN_S_SIMM7_SHIFT) "))\n" \
>>> +" .endm\n"
>>> +
>>> #define UNDEFINE_INSN_R \
>>> " .purgem insn_r\n"
>>>
>>> #define UNDEFINE_INSN_I \
>>> " .purgem insn_i\n"
>>>
>>> +#define UNDEFINE_INSN_S \
>>> +" .purgem insn_s\n"
>>> +
>>> #define __INSN_R(opcode, func3, func7, rd, rs1, rs2) \
>>> DEFINE_INSN_R \
>>> "insn_r " opcode ", " func3 ", " func7 ", " rd ", " rs1 ", " rs2 "\n" \
>>> @@ -108,6 +146,11 @@
>>> "insn_i " opcode ", " func3 ", " rd ", " rs1 ", " simm12 "\n" \
>>> UNDEFINE_INSN_I
>>>
>>> +#define __INSN_S(opcode, func3, rs2, simm12, rs1) \
>>> + DEFINE_INSN_S \
>>> + "insn_s " opcode ", " func3 ", " rs2 ", " simm12 ", " rs1 "\n" \
>>> + UNDEFINE_INSN_S
>>> +
>>> #endif
>>>
>>> #endif /* ! __ASSEMBLY__ */
>>> @@ -120,6 +163,10 @@
>>> __INSN_I(RV_##opcode, RV_##func3, RV_##rd, \
>>> RV_##rs1, RV_##simm12)
>>>
>>> +#define INSN_S(opcode, func3, rs2, simm12, rs1) \
>>> + __INSN_S(RV_##opcode, RV_##func3, RV_##rs2, \
>>> + RV_##simm12, RV_##rs1)
>>> +
>>> #define RV_OPCODE(v) __ASM_STR(v)
>>> #define RV_FUNC3(v) __ASM_STR(v)
>>> #define RV_FUNC7(v) __ASM_STR(v)
>>> @@ -133,6 +180,7 @@
>>> #define RV___RS2(v) __RV_REG(v)
>>>
>>> #define RV_OPCODE_MISC_MEM RV_OPCODE(15)
>>> +#define RV_OPCODE_OP_IMM RV_OPCODE(19)
>>> #define RV_OPCODE_SYSTEM RV_OPCODE(115)
>>>
>>> #define HFENCE_VVMA(vaddr, asid) \
>>> @@ -196,4 +244,16 @@
>>> INSN_I(OPCODE_MISC_MEM, FUNC3(2), __RD(0), \
>>> RS1(base), SIMM12(4))
>>>
>>> +#define CBO_PREFETCH_I(base, offset) \
>>> + INSN_S(OPCODE_OP_IMM, FUNC3(6), __RS2(0), \
>>> + SIMM12(offset), RS1(base))
>>> +
>>> +#define CBO_PREFETCH_R(base, offset) \
>>> + INSN_S(OPCODE_OP_IMM, FUNC3(6), __RS2(1), \
>>> + SIMM12(offset), RS1(base))
>>> +
>>> +#define CBO_PREFETCH_W(base, offset) \
>>> + INSN_S(OPCODE_OP_IMM, FUNC3(6), __RS2(3), \
>>> + SIMM12(offset), RS1(base))
>>> +
>>> #endif /* __ASM_INSN_DEF_H */
>>> diff --git a/arch/riscv/kernel/cpufeature.c b/arch/riscv/kernel/cpufeature.c
>>> index b3785ffc1570..bdb02b066041 100644
>>> --- a/arch/riscv/kernel/cpufeature.c
>>> +++ b/arch/riscv/kernel/cpufeature.c
>>> @@ -168,6 +168,7 @@ const struct riscv_isa_ext_data riscv_isa_ext[] = {
>>> __RISCV_ISA_EXT_DATA(h, RISCV_ISA_EXT_h),
>>> __RISCV_ISA_EXT_DATA(zicbom, RISCV_ISA_EXT_ZICBOM),
>>> __RISCV_ISA_EXT_DATA(zicboz, RISCV_ISA_EXT_ZICBOZ),
>>> + __RISCV_ISA_EXT_DATA(zicbop, RISCV_ISA_EXT_ZICBOP),
>>> __RISCV_ISA_EXT_DATA(zicntr, RISCV_ISA_EXT_ZICNTR),
>>> __RISCV_ISA_EXT_DATA(zicond, RISCV_ISA_EXT_ZICOND),
>>> __RISCV_ISA_EXT_DATA(zicsr, RISCV_ISA_EXT_ZICSR),

2024-01-11 10:46:29