Signed-off-by: Andrea Parri <[email protected]>
Suggested-by: Palmer Dabbelt <[email protected]>
---
For the MEMBARRIER maintainers: RISC-V does not have "core serializing
instructions", meaning that there is no occurrence of such a term in the
RISC-V ISA. The discussion and git history about the SYNC_CORE command
suggested the implementation below: a FENCE.I instruction "synchronizes
the instruction and data streams" [1] on a CPU; in litmus parlance,
(single-hart test)
CPU0
UPDATE text ;
FENCE.I ;
EXECUTE text ; /* <-- will execute the updated/new text */
(message-passing test)
CPU0 CPU1
UPDATE text | IF (flag) { ;
WMB | FENCE.I ;
SET flag | EXECUTE text ; /* execute the new text */
| } ;
(and many others, including "maybe"s! ;-) )
How do these remarks resonate with the semantics of "a core serializing
instruction" (to be issued before returning to user-space)?
RISC-V maintainers, am I missing some paths to user-space? (besides xRET)
Andrea
[1] https://github.com/riscv/riscv-isa-manual/blob/main/src/zifencei.adoc
.../sched/membarrier-sync-core/arch-support.txt | 2 +-
arch/riscv/Kconfig | 2 ++
arch/riscv/include/asm/sync_core.h | 15 +++++++++++++++
3 files changed, 18 insertions(+), 1 deletion(-)
create mode 100644 arch/riscv/include/asm/sync_core.h
diff --git a/Documentation/features/sched/membarrier-sync-core/arch-support.txt b/Documentation/features/sched/membarrier-sync-core/arch-support.txt
index 23260ca449468..a17117d76e6d8 100644
--- a/Documentation/features/sched/membarrier-sync-core/arch-support.txt
+++ b/Documentation/features/sched/membarrier-sync-core/arch-support.txt
@@ -44,7 +44,7 @@
| openrisc: | TODO |
| parisc: | TODO |
| powerpc: | ok |
- | riscv: | TODO |
+ | riscv: | ok |
| s390: | ok |
| sh: | TODO |
| sparc: | TODO |
diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index 4c07b9189c867..ed7ddaedc692e 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -27,6 +27,7 @@ config RISCV
select ARCH_HAS_GCOV_PROFILE_ALL
select ARCH_HAS_GIGANTIC_PAGE
select ARCH_HAS_KCOV
+ select ARCH_HAS_MEMBARRIER_SYNC_CORE
select ARCH_HAS_MMIOWB
select ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
select ARCH_HAS_PMEM_API
@@ -35,6 +36,7 @@ config RISCV
select ARCH_HAS_SET_MEMORY if MMU
select ARCH_HAS_STRICT_KERNEL_RWX if MMU && !XIP_KERNEL
select ARCH_HAS_STRICT_MODULE_RWX if MMU && !XIP_KERNEL
+ select ARCH_HAS_SYNC_CORE_BEFORE_USERMODE
select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST
select ARCH_HAS_UBSAN_SANITIZE_ALL
select ARCH_HAS_VDSO_DATA
diff --git a/arch/riscv/include/asm/sync_core.h b/arch/riscv/include/asm/sync_core.h
new file mode 100644
index 0000000000000..d3ec6ac47ac9b
--- /dev/null
+++ b/arch/riscv/include/asm/sync_core.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_RISCV_SYNC_CORE_H
+#define _ASM_RISCV_SYNC_CORE_H
+
+/*
+ * Ensure that a core serializing instruction is issued before returning
+ * to user-mode. RISC-V implements return to user-space through an xRET
+ * instruction, which is not core serializing.
+ */
+static inline void sync_core_before_usermode(void)
+{
+ asm volatile ("fence.i" ::: "memory");
+}
+
+#endif /* _ASM_RISCV_SYNC_CORE_H */
--
2.34.1
Adding Martin, Hans and Derek to the Cc: list,
Andrea
On Thu, Aug 03, 2023 at 06:01:11AM +0200, Andrea Parri wrote:
> Signed-off-by: Andrea Parri <[email protected]>
> Suggested-by: Palmer Dabbelt <[email protected]>
> ---
> For the MEMBARRIER maintainers: RISC-V does not have "core serializing
> instructions", meaning that there is no occurrence of such a term in the
> RISC-V ISA. The discussion and git history about the SYNC_CORE command
> suggested the implementation below: a FENCE.I instruction "synchronizes
> the instruction and data streams" [1] on a CPU; in litmus parlance,
>
> (single-hart test)
>
> CPU0
>
> UPDATE text ;
> FENCE.I ;
> EXECUTE text ; /* <-- will execute the updated/new text */
>
>
> (message-passing test)
>
> CPU0 CPU1
>
> UPDATE text | IF (flag) { ;
> WMB | FENCE.I ;
> SET flag | EXECUTE text ; /* execute the new text */
> | } ;
>
>
> (and many others, including "maybe"s! ;-) )
>
> How do these remarks resonate with the semantics of "a core serializing
> instruction" (to be issued before returning to user-space)?
>
> RISCV maintainers, I'm missing some paths to user-space? (besides xRET)
>
> Andrea
>
> [1] https://github.com/riscv/riscv-isa-manual/blob/main/src/zifencei.adoc
>
>
> .../sched/membarrier-sync-core/arch-support.txt | 2 +-
> arch/riscv/Kconfig | 2 ++
> arch/riscv/include/asm/sync_core.h | 15 +++++++++++++++
> 3 files changed, 18 insertions(+), 1 deletion(-)
> create mode 100644 arch/riscv/include/asm/sync_core.h
>
> diff --git a/Documentation/features/sched/membarrier-sync-core/arch-support.txt b/Documentation/features/sched/membarrier-sync-core/arch-support.txt
> index 23260ca449468..a17117d76e6d8 100644
> --- a/Documentation/features/sched/membarrier-sync-core/arch-support.txt
> +++ b/Documentation/features/sched/membarrier-sync-core/arch-support.txt
> @@ -44,7 +44,7 @@
> | openrisc: | TODO |
> | parisc: | TODO |
> | powerpc: | ok |
> - | riscv: | TODO |
> + | riscv: | ok |
> | s390: | ok |
> | sh: | TODO |
> | sparc: | TODO |
> diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
> index 4c07b9189c867..ed7ddaedc692e 100644
> --- a/arch/riscv/Kconfig
> +++ b/arch/riscv/Kconfig
> @@ -27,6 +27,7 @@ config RISCV
> select ARCH_HAS_GCOV_PROFILE_ALL
> select ARCH_HAS_GIGANTIC_PAGE
> select ARCH_HAS_KCOV
> + select ARCH_HAS_MEMBARRIER_SYNC_CORE
> select ARCH_HAS_MMIOWB
> select ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
> select ARCH_HAS_PMEM_API
> @@ -35,6 +36,7 @@ config RISCV
> select ARCH_HAS_SET_MEMORY if MMU
> select ARCH_HAS_STRICT_KERNEL_RWX if MMU && !XIP_KERNEL
> select ARCH_HAS_STRICT_MODULE_RWX if MMU && !XIP_KERNEL
> + select ARCH_HAS_SYNC_CORE_BEFORE_USERMODE
> select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST
> select ARCH_HAS_UBSAN_SANITIZE_ALL
> select ARCH_HAS_VDSO_DATA
> diff --git a/arch/riscv/include/asm/sync_core.h b/arch/riscv/include/asm/sync_core.h
> new file mode 100644
> index 0000000000000..d3ec6ac47ac9b
> --- /dev/null
> +++ b/arch/riscv/include/asm/sync_core.h
> @@ -0,0 +1,15 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef _ASM_RISCV_SYNC_CORE_H
> +#define _ASM_RISCV_SYNC_CORE_H
> +
> +/*
> + * Ensure that a core serializing instruction is issued before returning
> + * to user-mode. RISC-V implements return to user-space through an xRET
> + * instruction, which is not core serializing.
> + */
> +static inline void sync_core_before_usermode(void)
> +{
> + asm volatile ("fence.i" ::: "memory");
> +}
> +
> +#endif /* _ASM_RISCV_SYNC_CORE_H */
> --
> 2.34.1
>
On 8/3/23 11:45, Andrea Parri wrote:
> Adding Martin, Hans and Derek to the Cc: list,
>
> Andrea
>
>
> On Thu, Aug 03, 2023 at 06:01:11AM +0200, Andrea Parri wrote:
>> Signed-off-by: Andrea Parri <[email protected]>
>> Suggested-by: Palmer Dabbelt <[email protected]>
>> ---
>> For the MEMBARRIER maintainers: RISC-V does not have "core serializing
>> instructions", meaning that there is no occurrence of such a term in the
>> RISC-V ISA. The discussion and git history about the SYNC_CORE command
>> suggested the implementation below: a FENCE.I instruction "synchronizes
>> the instruction and data streams" [1] on a CPU; in litmus parlance,
>>
Can you double-check that riscv switch_mm() implies a fence.i or
equivalent on the CPU doing the switch_mm ?
AFAIR membarrier use of sync_core_before_usermode relies on switch_mm
issuing a core serializing instruction.
Thanks,
Mathieu
>> (single-hart test)
>>
>> CPU0
>>
>> UPDATE text ;
>> FENCE.I ;
>> EXECUTE text ; /* <-- will execute the updated/new text */
>>
>>
>> (message-passing test)
>>
>> CPU0 CPU1
>>
>> UPDATE text | IF (flag) { ;
>> WMB | FENCE.I ;
>> SET flag | EXECUTE text ; /* execute the new text */
>> | } ;
>>
>>
>> (and many others, including "maybe"s! ;-) )
>>
>> How do these remarks resonate with the semantics of "a core serializing
>> instruction" (to be issued before returning to user-space)?
>>
>> RISCV maintainers, I'm missing some paths to user-space? (besides xRET)
>>
>> Andrea
>>
>> [1] https://github.com/riscv/riscv-isa-manual/blob/main/src/zifencei.adoc
>>
>>
>> .../sched/membarrier-sync-core/arch-support.txt | 2 +-
>> arch/riscv/Kconfig | 2 ++
>> arch/riscv/include/asm/sync_core.h | 15 +++++++++++++++
>> 3 files changed, 18 insertions(+), 1 deletion(-)
>> create mode 100644 arch/riscv/include/asm/sync_core.h
>>
>> diff --git a/Documentation/features/sched/membarrier-sync-core/arch-support.txt b/Documentation/features/sched/membarrier-sync-core/arch-support.txt
>> index 23260ca449468..a17117d76e6d8 100644
>> --- a/Documentation/features/sched/membarrier-sync-core/arch-support.txt
>> +++ b/Documentation/features/sched/membarrier-sync-core/arch-support.txt
>> @@ -44,7 +44,7 @@
>> | openrisc: | TODO |
>> | parisc: | TODO |
>> | powerpc: | ok |
>> - | riscv: | TODO |
>> + | riscv: | ok |
>> | s390: | ok |
>> | sh: | TODO |
>> | sparc: | TODO |
>> diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
>> index 4c07b9189c867..ed7ddaedc692e 100644
>> --- a/arch/riscv/Kconfig
>> +++ b/arch/riscv/Kconfig
>> @@ -27,6 +27,7 @@ config RISCV
>> select ARCH_HAS_GCOV_PROFILE_ALL
>> select ARCH_HAS_GIGANTIC_PAGE
>> select ARCH_HAS_KCOV
>> + select ARCH_HAS_MEMBARRIER_SYNC_CORE
>> select ARCH_HAS_MMIOWB
>> select ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
>> select ARCH_HAS_PMEM_API
>> @@ -35,6 +36,7 @@ config RISCV
>> select ARCH_HAS_SET_MEMORY if MMU
>> select ARCH_HAS_STRICT_KERNEL_RWX if MMU && !XIP_KERNEL
>> select ARCH_HAS_STRICT_MODULE_RWX if MMU && !XIP_KERNEL
>> + select ARCH_HAS_SYNC_CORE_BEFORE_USERMODE
>> select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST
>> select ARCH_HAS_UBSAN_SANITIZE_ALL
>> select ARCH_HAS_VDSO_DATA
>> diff --git a/arch/riscv/include/asm/sync_core.h b/arch/riscv/include/asm/sync_core.h
>> new file mode 100644
>> index 0000000000000..d3ec6ac47ac9b
>> --- /dev/null
>> +++ b/arch/riscv/include/asm/sync_core.h
>> @@ -0,0 +1,15 @@
>> +/* SPDX-License-Identifier: GPL-2.0 */
>> +#ifndef _ASM_RISCV_SYNC_CORE_H
>> +#define _ASM_RISCV_SYNC_CORE_H
>> +
>> +/*
>> + * Ensure that a core serializing instruction is issued before returning
>> + * to user-mode. RISC-V implements return to user-space through an xRET
>> + * instruction, which is not core serializing.
>> + */
>> +static inline void sync_core_before_usermode(void)
>> +{
>> + asm volatile ("fence.i" ::: "memory");
>> +}
>> +
>> +#endif /* _ASM_RISCV_SYNC_CORE_H */
>> --
>> 2.34.1
>>
--
Mathieu Desnoyers
EfficiOS Inc.
https://www.efficios.com
> Can you double-check that riscv switch_mm() implies a fence.i or equivalent
> on the CPU doing the switch_mm ?
AFAICT, (riscv) switch_mm() does not guarantee that.
> AFAIR membarrier use of sync_core_before_usermode relies on switch_mm
> issuing a core serializing instruction.
I see. Thanks for the clarification.
BTW, the comment in __schedule() suggests that membarrier also relies on
switch_mm() issuing a full memory barrier: I don't think this holds.
Removing the "deferred icache flush" logic in switch_mm() - in favour of
a "plain" MB; FENCE.I - would meet both of these requirements.
Other ideas?
Andrea
On 8/3/23 20:16, Andrea Parri wrote:
>> Can you double-check that riscv switch_mm() implies a fence.i or equivalent
>> on the CPU doing the switch_mm ?
>
> AFAICT, (riscv) switch_mm() does not guarantee that.
>
>
>> AFAIR membarrier use of sync_core_before_usermode relies on switch_mm
>> issuing a core serializing instruction.
>
> I see. Thanks for the clarification.
>
> BTW, the comment in __schedule() suggests that membarrier also relies on
> switch_mm() issuing a full memory barrier: I don't think this holds.
>
> Removing the "deferred icache flush" logic in switch_mm() - in favour of
> a "plain" MB; FENCE.I - would meet both of these requirements.
What is the relationship between FENCE.I and instruction cache flush on
RISC-V ?
On other architectures, there is need for careful flushing of the
instruction cache for the address range that was modified, _and_ to
issue a core serializing instruction on all cores (this last part being
performed by membarrier SYNC_CORE) between the point where the old
instructions were executed and before the new instructions are executed.
Thanks,
Mathieu
>
> Other ideas?
>
> Andrea
--
Mathieu Desnoyers
EfficiOS Inc.
https://www.efficios.com
> What is the relationship between FENCE.I and instruction cache flush on
> RISC-V ?
The exact nature of this relationship is implementation-dependent. From
commentary included in the ISA portion referred to in the changelog:
A simple implementation can flush the local instruction cache and
the instruction pipeline when the FENCE.I is executed. A more
complex implementation might snoop the instruction (data) cache on
every data (instruction) cache miss, or use an inclusive unified
private L2 cache to invalidate lines from the primary instruction
cache when they are being written by a local store instruction. If
instruction and data caches are kept coherent in this way, or if
the memory system consists of only uncached RAMs, then just the
fetch pipeline needs to be flushed at a FENCE.I. [..]
Mmh, does this help?
Andrea
On 8/4/23 10:59, Andrea Parri wrote:
>> What is the relationship between FENCE.I and instruction cache flush on
>> RISC-V ?
>
> The exact nature of this relationship is implementation-dependent. From
> commentary included in the ISA portion referred to in the changelog:
>
> A simple implementation can flush the local instruction cache and
> the instruction pipeline when the FENCE.I is executed. A more
> complex implementation might snoop the instruction (data) cache on
> every data (instruction) cache miss, or use an inclusive unified
> private L2 cache to invalidate lines from the primary instruction
> cache when they are being written by a local store instruction. If
> instruction and data caches are kept coherent in this way, or if
> the memory system consists of only uncached RAMs, then just the
> fetch pipeline needs to be flushed at a FENCE.I. [..]
>
> Mmh, does this help?
Quoting
https://github.com/riscv/riscv-isa-manual/releases/download/Ratified-IMAFDQC/riscv-spec-20191213.pdf
Chapter 3 "“Zifencei” Instruction-Fetch Fence, Version 2.0"
"First, it has been recognized that on some systems, FENCE.I will be expensive to implement
and alternate mechanisms are being discussed in the memory model task group. In particular,
for designs that have an incoherent instruction cache and an incoherent data cache, or where
the instruction cache refill does not snoop a coherent data cache, both caches must be completely
flushed when a FENCE.I instruction is encountered. This problem is exacerbated when there are
multiple levels of I and D cache in front of a unified cache or outer memory system.
Second, the instruction is not powerful enough to make available at user level in a Unix-like
operating system environment. The FENCE.I only synchronizes the local hart, and the OS can
reschedule the user hart to a different physical hart after the FENCE.I. This would require the
OS to execute an additional FENCE.I as part of every context migration. For this reason, the
standard Linux ABI has removed FENCE.I from user-level and now requires a system call to
maintain instruction-fetch coherence, which allows the OS to minimize the number of FENCE.I
executions required on current systems and provides forward-compatibility with future improved
instruction-fetch coherence mechanisms.
Future approaches to instruction-fetch coherence under discussion include providing more
restricted versions of FENCE.I that only target a given address specified in rs1, and/or allowing
software to use an ABI that relies on machine-mode cache-maintenance operations."
I start to suspect that even the people working on the riscv memory model have noticed
that letting a single instruction such as FENCE.I take care of both cache coherency
*and* flush the instruction pipeline will be a performance bottleneck, because it
can only clear the whole instruction cache.
Other architectures are either cache-coherent, or have cache flushing which can be
performed on a range of addresses. This is kept apart from whatever instruction
flushes the instruction pipeline of the processor.
By keeping instruction cache flushing separate from instruction pipeline flush, we can
let membarrier (and context switches, including thread migration) only care about the
instruction pipeline part, and leave instruction cache flush to either a dedicated
system call, or to specialized instructions which are available from user-mode.
Considering that FENCE.I is forced to invalidate the whole i-cache, I don't think you
will get away with executing it from switch_mm without making performance go down the
drain on cache incoherent implementations.
In my opinion, what we would need from RISC-V for membarrier (and context switch) is a
lightweight version of FENCE.I which only flushes the instruction pipeline of the local
processor. This should ideally come with a way for architectures with incoherent caches
to flush the relevant address ranges of the i-cache which are modified by a JIT. This
i-cache flush would not be required to flush the instruction pipeline, as it is typical
to batch invalidation of various address ranges together and issue a single instruction
pipeline flush on each CPU at the end. The i-cache flush could either be done by new
instructions available from user-space (similar to aarch64), or through privileged
instructions available through system calls (similar to arm cacheflush).
Thanks,
Mathieu
>
> Andrea
--
Mathieu Desnoyers
EfficiOS Inc.
https://www.efficios.com
On 8/4/23 15:16, Andrea Parri wrote:
> On Fri, Aug 04, 2023 at 02:05:55PM -0400, Mathieu Desnoyers wrote:
>> On 8/4/23 10:59, Andrea Parri wrote:
>>>> What is the relationship between FENCE.I and instruction cache flush on
>>>> RISC-V ?
>>>
>>> The exact nature of this relationship is implementation-dependent. From
>>> commentary included in the ISA portion referred to in the changelog:
>>>
>>> A simple implementation can flush the local instruction cache and
>>> the instruction pipeline when the FENCE.I is executed. A more
>>> complex implementation might snoop the instruction (data) cache on
>>> every data (instruction) cache miss, or use an inclusive unified
>>> private L2 cache to invalidate lines from the primary instruction
>>> cache when they are being written by a local store instruction. If
>>> instruction and data caches are kept coherent in this way, or if
>>> the memory system consists of only uncached RAMs, then just the
>>> fetch pipeline needs to be flushed at a FENCE.I. [..]
>>>
>>> Mmh, does this help?
>>
>> Quoting
>>
>> https://github.com/riscv/riscv-isa-manual/releases/download/Ratified-IMAFDQC/riscv-spec-20191213.pdf
>>
>> Chapter 3 "“Zifencei” Instruction-Fetch Fence, Version 2.0"
>>
>> "First, it has been recognized that on some systems, FENCE.I will be expensive to implement
>> and alternate mechanisms are being discussed in the memory model task group. In particular,
>> for designs that have an incoherent instruction cache and an incoherent data cache, or where
>> the instruction cache refill does not snoop a coherent data cache, both caches must be completely
>> flushed when a FENCE.I instruction is encountered. This problem is exacerbated when there are
>> multiple levels of I and D cache in front of a unified cache or outer memory system.
>>
>> Second, the instruction is not powerful enough to make available at user level in a Unix-like
>> operating system environment. The FENCE.I only synchronizes the local hart, and the OS can
>> reschedule the user hart to a different physical hart after the FENCE.I. This would require the
>> OS to execute an additional FENCE.I as part of every context migration. For this reason, the
>> standard Linux ABI has removed FENCE.I from user-level and now requires a system call to
>> maintain instruction-fetch coherence, which allows the OS to minimize the number of FENCE.I
>> executions required on current systems and provides forward-compatibility with future improved
>> instruction-fetch coherence mechanisms.
>>
>> Future approaches to instruction-fetch coherence under discussion include providing more
>> restricted versions of FENCE.I that only target a given address specified in rs1, and/or allowing
>> software to use an ABI that relies on machine-mode cache-maintenance operations."
>>
>> I start to suspect that even the people working on the riscv memory model have noticed
>> that letting a single instruction such as FENCE.I take care of both cache coherency
>> *and* flush the instruction pipeline will be a performance bottleneck, because it
>> can only clear the whole instruction cache.
>>
>> Other architectures are either cache-coherent, or have cache flushing which can be
>> performed on a range of addresses. This is kept apart from whatever instruction
>> flushes the instruction pipeline of the processor.
>>
>> By keeping instruction cache flushing separate from instruction pipeline flush, we can
>> let membarrier (and context switches, including thread migration) only care about the
>> instruction pipeline part, and leave instruction cache flush to either a dedicated
>> system call, or to specialized instructions which are available from user-mode.
>>
>> Considering that FENCE.I is forced to invalidate the whole i-cache, I don't think you
>> will get away with executing it from switch_mm without making performance go down the
>> drain on cache incoherent implementations.
>>
>> In my opinion, what we would need from RISC-V for membarrier (and context switch) is a
>> lightweight version of FENCE.I which only flushes the instruction pipeline of the local
>> processor. This should ideally come with a way for architectures with incoherent caches
>> to flush the relevant address ranges of the i-cache which are modified by a JIT. This
>> i-cache flush would not be required to flush the instruction pipeline, as it is typical
>> to batch invalidation of various address ranges together and issue a single instruction
>> pipeline flush on each CPU at the end. The i-cache flush could either be done by new
>> instructions available from user-space (similar to aarch64), or through privileged
>> instructions available through system calls (similar to arm cacheflush).
>
> Thanks for the remarks, Mathieu. I think it will be very helpful to
> RISC-V architects (and memory model people) to have this context and
> reasoning written down.
One more noteworthy detail: if a system call similar to ARM cacheflush(2) is implemented for
RISC-V, perhaps an iovec ABI (similar to readv(2)/writev(2)) would be relevant to handle
batching of cache flushing when address ranges are not contiguous. Maybe with a new name
like "cacheflushv(2)", so eventually other architectures could implement it as well ?
Thanks,
Mathieu
--
Mathieu Desnoyers
EfficiOS Inc.
https://www.efficios.com
On Fri, Aug 04, 2023 at 02:05:55PM -0400, Mathieu Desnoyers wrote:
> On 8/4/23 10:59, Andrea Parri wrote:
> > > What is the relationship between FENCE.I and instruction cache flush on
> > > RISC-V ?
> >
> > The exact nature of this relationship is implementation-dependent. From
> > commentary included in the ISA portion referred to in the changelog:
> >
> > A simple implementation can flush the local instruction cache and
> > the instruction pipeline when the FENCE.I is executed. A more
> > complex implementation might snoop the instruction (data) cache on
> > every data (instruction) cache miss, or use an inclusive unified
> > private L2 cache to invalidate lines from the primary instruction
> > cache when they are being written by a local store instruction. If
> > instruction and data caches are kept coherent in this way, or if
> > the memory system consists of only uncached RAMs, then just the
> > fetch pipeline needs to be flushed at a FENCE.I. [..]
> >
> > Mmh, does this help?
>
> Quoting
>
> https://github.com/riscv/riscv-isa-manual/releases/download/Ratified-IMAFDQC/riscv-spec-20191213.pdf
>
> Chapter 3 "“Zifencei” Instruction-Fetch Fence, Version 2.0"
>
> "First, it has been recognized that on some systems, FENCE.I will be expensive to implement
> and alternate mechanisms are being discussed in the memory model task group. In particular,
> for designs that have an incoherent instruction cache and an incoherent data cache, or where
> the instruction cache refill does not snoop a coherent data cache, both caches must be completely
> flushed when a FENCE.I instruction is encountered. This problem is exacerbated when there are
> multiple levels of I and D cache in front of a unified cache or outer memory system.
>
> Second, the instruction is not powerful enough to make available at user level in a Unix-like
> operating system environment. The FENCE.I only synchronizes the local hart, and the OS can
> reschedule the user hart to a different physical hart after the FENCE.I. This would require the
> OS to execute an additional FENCE.I as part of every context migration. For this reason, the
> standard Linux ABI has removed FENCE.I from user-level and now requires a system call to
> maintain instruction-fetch coherence, which allows the OS to minimize the number of FENCE.I
> executions required on current systems and provides forward-compatibility with future improved
> instruction-fetch coherence mechanisms.
>
> Future approaches to instruction-fetch coherence under discussion include providing more
> restricted versions of FENCE.I that only target a given address specified in rs1, and/or allowing
> software to use an ABI that relies on machine-mode cache-maintenance operations."
>
> I start to suspect that even the people working on the riscv memory model have noticed
> that letting a single instruction such as FENCE.I take care of both cache coherency
> *and* flush the instruction pipeline will be a performance bottleneck, because it
> can only clear the whole instruction cache.
>
> Other architectures are either cache-coherent, or have cache flushing which can be
> performed on a range of addresses. This is kept apart from whatever instruction
> flushes the instruction pipeline of the processor.
>
> By keeping instruction cache flushing separate from instruction pipeline flush, we can
> let membarrier (and context switches, including thread migration) only care about the
> instruction pipeline part, and leave instruction cache flush to either a dedicated
> system call, or to specialized instructions which are available from user-mode.
>
> Considering that FENCE.I is forced to invalidate the whole i-cache, I don't think you
> will get away with executing it from switch_mm without making performance go down the
> drain on cache incoherent implementations.
>
> In my opinion, what we would need from RISC-V for membarrier (and context switch) is a
> lightweight version of FENCE.I which only flushes the instruction pipeline of the local
> processor. This should ideally come with a way for architectures with incoherent caches
> to flush the relevant address ranges of the i-cache which are modified by a JIT. This
> i-cache flush would not be required to flush the instruction pipeline, as it is typical
> to batch invalidation of various address ranges together and issue a single instruction
> pipeline flush on each CPU at the end. The i-cache flush could either be done by new
> instructions available from user-space (similar to aarch64), or through privileged
> instructions available through system calls (similar to arm cacheflush).
Thanks for the remarks, Mathieu. I think it will be very helpful to
RISC-V architects (and memory model people) to have this context and
reasoning written down.
Andrea
> One more noteworthy detail: if a system call similar to ARM cacheflush(2) is implemented for
> RISC-V, perhaps an iovec ABI (similar to readv(2)/writev(2)) would be relevant to handle
> batching of cache flushing when address ranges are not contiguous. Maybe with a new name
> like "cacheflushv(2)", so eventually other architectures could implement it as well ?
I believe that's a sensible idea. But the RISC-V maintainers can provide
a more reliable feedback.
Andrea
On Mon, 07 Aug 2023 06:19:18 PDT (-0700), [email protected] wrote:
>> One more noteworthy detail: if a system call similar to ARM cacheflush(2) is implemented for
>> RISC-V, perhaps an iovec ABI (similar to readv(2)/writev(2)) would be relevant to handle
>> batching of cache flushing when address ranges are not contiguous. Maybe with a new name
>> like "cacheflushv(2)", so eventually other architectures could implement it as well ?
>
> I believe that's a sensible idea. But the RISC-V maintainers can provide
> a more reliable feedback.
Sorry I missed this, I'm still a bit backlogged from COVID. A few of us
were having a meeting, just to try and summarize (many of these points
came up in the thread, so sorry for rehashing things):
We don't have a fence.i in the scheduling path, as fence.i is very slow
on systems that implement it by flushing the icache. Instead we have a
mechanism for deferring the fences (see flush_icache_deferred, though
I'm no longer sure that's correct which I'll mention below). As a
result userspace can't do a fence.i directly, but instead needs to make
a syscall/vdsocall so the kernel can do this bookkeeping. There's some
proposals for ISA extensions that replace fence.i, but they're still WIP
and there's a lot of fence.i-only hardware so we'll have to deal with
it.
When we did this we had a feeling this may be sub-optimal for systems
that have faster fence.i implementations (ie, coherent instruction
caches), but nobody's gotten around to doing that yet -- and maybe
there's no hardware that behaves this way. The rough plan was along the
lines of adding a prctl() where userspace can request the ability to
directly emit fence.i, which would then result in the kernel eagerly
emitting fence.i when scheduling. Some of the Java people have been
asking for this sort of feature.
From looking at the membarrier arch/scheduler hooks, I think we might
have a bug in our deferred icache flushing mechanism: specifically we
hook into switch_mm(), which this comment has me worried about
* When switching through a kernel thread, the loop in
* membarrier_{private,global}_expedited() may have observed that
* kernel thread and not issued an IPI. It is therefore possible to
* schedule between user->kernel->user threads without passing though
* switch_mm(). Membarrier requires a barrier after storing to
* rq->curr, before returning to userspace, so provide them here:
Even if there's not a bug in the RISC-V stuff, it seems that we've ended
up with pretty similar schemes here and we could remove some
arch-specific code by de-duplicating things -- IIRC there was no
membarrier when we did the original port, so I think we've just missed a
cleanup opportunity.
So I'd propose doing the following:
* Pick up a patch like this. Maybe exactly this, I'm going to give it
a proper review to make sure.
* Remove the RISC-V implementation of deferred icache flushes and instead
just call into membarrier. We might need to add some more bookkeeping
here, but from a quick look it seems membarrier is doing pretty much
the same thing.
* Implement that prctl that allows userspace to ask for permission to do
direct fence.i instructions -- sort of a different project, but if
we're going to be tearing into all this code we might as well do it
now.
Charlie is volunteering to do the work here, so hopefully we'll have
something moving forward.
>
> Andrea
On 2023-10-13 13:29, Palmer Dabbelt wrote:
> On Mon, 07 Aug 2023 06:19:18 PDT (-0700), [email protected] wrote:
>>> One more noteworthy detail: if a system call similar to ARM
>>> cacheflush(2) is implemented for
>>> RISC-V, perhaps an iovec ABI (similar to readv(2)/writev(2)) would be
>>> relevant to handle
>>> batching of cache flushing when address ranges are not contiguous.
>>> Maybe with a new name
>>> like "cacheflushv(2)", so eventually other architectures could
>>> implement it as well ?
>>
>> I believe that's a sensible idea. But the RISC-V maintainers can provide
>> a more reliable feedback.
>
> Sorry I missed this, I'm still a bit backlogged from COVID. A few of us
> were having a meeting, just to try and summarize (many of these points
> came up in the thread, so sorry for rehashing things):
>
> We don't have a fence.i in the scheduling path, as fence.i is very slow
> on systems that implement it by flushing the icache. Instead we have a
> mechanism for deferring the fences (see flush_icache_deferred, though
> I'm no longer sure that's correct which I'll mention below). As a
> result userspace can't do a fence.i directly, but instead needs to make
> a syscall/vdsocall so the kernel can do this bookkeeping. There's some
> proposals for ISA extensions that replace fence.i, but they're still WIP
> and there's a lot of fence.i-only hardware so we'll have to deal with it.
>
> When we did this we had a feeling this may be sub-optimal for systems
> that have faster fence.i implementations (ie, coherent instruction
> caches), but nobody's gotten around to doing that yet -- and maybe
> there's no hardware that behaves this way. The rough plan was along the
> lines of adding a prctl() where userspace can request the ability to
> directly emit fence.i, which would then result in the kernel eagerly
> emitting fence.i when scheduling. Some of the Java people have been
> asking for this sort of feature.
There is a membarrier(2) registration scheme to ensure that core serializing
instructions are issued in the relevant scenarios.
See MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE and
Documentation/features/sched/membarrier-sync-core/arch-support.txt
The core serializing instructions are typically issued on return to userspace
on various architectures, else we rely on switch_mm() emitting the fence between
update to rq->curr->mm and return to userspace. And on the rare case where
rq->curr->mm is changed without invoking switch_mm() (transition to a
kernel thread), then we've added the relevant code in the scheduler to add
a core serializing instruction for registered processes.
>
> From looking at the membarrier arch/scheduler hooks, I think we might
> have a bug in our deferred icache flushing mechanism: specifically we
> hook into switch_mm(), which this comment has me worried about
>
> * When switching through a kernel thread, the loop in
> * membarrier_{private,global}_expedited() may have observed that
> * kernel thread and not issued an IPI. It is therefore possible to
> * schedule between user->kernel->user threads without passing
> through
> * switch_mm(). Membarrier requires a barrier after storing to
> * rq->curr, before returning to userspace, so provide them here:
I guess you wonder if the on_each_cpu_mask ipis based on the mm_cpumask(mm)
gives any level of guarantee with respect to switch_mm() modifying this
mask. (in flush_icache_mm())
In membarrier, we decided against using the mm_cpumask for various reasons:
- AFAIR, on some architectures, the mm_cpumask is a superset of the CPUs actually
used (it's never cleared),
- the point where the mm_cpumask is updated with respect to memory barriers
in the scheduler code is not as convenient as it is for updates to
"rq->curr" by the scheduler. This matters for the other purposes of
membarrier(2) which is to issue memory barriers on all threads belonging
to a process.
and instead we iterate on each online cpus, and compare the "rq->curr->mm"
pointer to the current task. Then we made sure to document all the
relevant memory barriers and core serializing instruction expectations
around rq->curr->mm update by the scheduler.
But back to the RISC-V flush_icache_mm() scheme, because it does not rely
on "rq->curr" at all, then it all depends on whether the cpu is still in
the mm_cpumask of the mm when that cpu temporarily schedules a kernel thread.
AFAIR, scheduling a kernel thread does not trigger any call to switch_mm
(it's an optimization which leaves the mm in place while running the kernel
thread), so the on_each_cpu_mask using the mm_cpumask would be OK.
>
> Even if there's not a bug in the RISC-V stuff, it seems that we've ended
> up with pretty similar schemes here and we could remove some
> arch-specific code by de-duplicating things -- IIRC there was no
> membarrier when we did the original port, so I think we've just missed a
> cleanup opportunity.
Actually, membarrier(2) MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE appeared
in Linux 4.16, whereas the initial port of RISC-V appeared in Linux 4.15.
So this de-duplication has been missed by a narrow window :)
Yes, it would make sense to do this de-duplication.
>
> So I'd propose doing the following:
>
> * Pick up a patch like this. Maybe exactly this, I'm going to give it
> a proper review to make sure.
AFAIR this patch implements sync_core_before_usermode which gets used by
membarrier_mm_sync_core_before_usermode() to handle the uthread->kthread->uthread
case. It relies on switch_mm issuing a core serializing instruction as well.
Looking at RISC-V switch_mm(), I see that switch_mm() calls:
flush_icache_deferred(next, cpu);
which only issues a fence.i if a deferred icache flush was required. We're
missing the part that sets the icache_stale_mask cpumask bits when a
MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE is invoked.
> * Remove the RISC-V implementation of deferred icache flushes and instead
> just call into membarrier. We might need to add some more bookkeeping
> here, but from a quick look it seems membarrier is doing pretty much
> the same thing.
The only part where I think you may want to keep some level of deferred
icache flushing as you do now is as follows:
- when membarrier MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE is invoked,
call a new architecture hook which sets cpumask bits in the mm context
that tells the next switch_mm on each cpu to issue fence.i for that mm.
- keep something like flush_icache_deferred as you have now.
Otherwise, I fear the overhead of a very expensive fence.i would be too
much when processes register with MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE
and start doing fence.i on each and every switch_mm().
So you'd basically rely on membarrier to only issue IPIs to the CPUs which are
currently running threads belonging to the mm, and handle the switch_mm with
the sync_core_before_usermode() for uthread->kthread->uthread case, and implement
a deferred icache flush for the typical switch_mm() case.
> * Implement that prctl that allows userspace to ask for permission to do
> direct fence.i instructions -- sort of a different project, but if
> we're going to be tearing into all this code we might as well do it now.
But fence.i would only have effects on the hart it is being called from, right ?
What is the use-case for allowing user-space to issue this instruction ?
One more thing: membarrier(2) sync_core only issues things like "fence.i" on
the various cores in the system running threads belonging to the process, but
does not intend to take care of doing any kind of cache invalidation per se
(e.g. invalidating an address range worth of cache). On ARM, this is done by a
separate system call (e.g. cacheflush(2)), or can be done by instructions
available from userspace in some cases.
Do you expect to have a need for flushing only specific icache lines, or is
the intent to always flush the entire icache ?
>
> Charlie is volunteering to do the work here, so hopefully we'll have
> something moving forward.
That's great! I hope my feedback will help.
Thanks,
Mathieu
>
>>
>> Andrea
--
Mathieu Desnoyers
EfficiOS Inc.
https://www.efficios.com
> But fence.i would only have effects on the hart it is being called from, right ?
> What is the use-case for allowing user-space to issue this instruction ?
Right now openjdk uses flush_icache for every cmodx write. And it does
a lot of cmodx.
If the hardware does not require an IPI/icache flushing we still need
to serialize the reader.
(locally stopping out-of-order execution/speculation at least)
And in some cases the reader knows the code stream has been changed
and can emit fence.i itself instead.
So we want the option to emit fence.i directly into the code stream.
As fence.i is an unpriv instruction there is no issue with emitting it.
But we need the assurance that context switching to a new hart does
not eliminate the effects of the fence.i.
/Robbin
>
> One more thing: membarrier(2) sync_core only issues things like "fence.i" on
> the various cores in the system running threads belonging to the process, but
> does not intend to take care of doing any kind of cache invalidation per se
> (e.g. invalidating an address range worth of cache). On ARM, this is done by a
> separate system call (e.g. cacheflush(2)), or can be done by instructions
> available from userspace in some cases.
>
> Do you expect to have a need for flushing only specific icache lines, or is
> the intent to always flush the entire icache ?
>
> >
> > Charlie is volunteering to do the work here, so hopefully we'll have
> > something moving forward.
>
> That's great! I hope my feedback will help.
>
> Thanks,
>
> Mathieu
>
> >
> >>
> >> Andrea
>
> --
> Mathieu Desnoyers
> EfficiOS Inc.
> https://www.efficios.com
>
On Thu, Nov 09, 2023 at 08:24:58PM +0100, Andrea Parri wrote:
> Mathieu, all,
>
> Sorry for the delay,
>
> > AFAIR this patch implements sync_core_before_usermode which gets used by
> > membarrier_mm_sync_core_before_usermode() to handle the uthread->kthread->uthread
> > case. It relies on switch_mm issuing a core serializing instruction as well.
> >
> > Looking at RISC-V switch_mm(), I see that switch_mm() calls:
> >
> > flush_icache_deferred(next, cpu);
> >
> > which only issues a fence.i if a deferred icache flush was required. We're
> > missing the part that sets the icache_stale_mask cpumask bits when a
> > MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE is invoked.
>
> [...]
>
> > The only part where I think you may want to keep some level of deferred
> > icache flushing as you do now is as follows:
> >
> > - when membarrier MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE is invoked,
> > call a new architecture hook which sets cpumask bits in the mm context
> > that tells the next switch_mm on each cpu to issue fence.i for that mm.
> > - keep something like flush_icache_deferred as you have now.
> >
> > Otherwise, I fear the overhead of a very expensive fence.i would be too
> > much when processes registering with MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE
> > and start doing fence.i on each and every switch_mm().
> >
> > So you'd basically rely on membarrier to only issue IPIs to the CPUs which are
> > currently running threads belonging to the mm, and handle the switch_mm with
> > the sync_core_before_usermode() for uthread->kthread->uthread case, and implement
> > a deferred icache flush for the typical switch_mm() case.
>
> I've (tried to) put this together and obtained the two patches reported below.
> Please let me know if this aligns with your intentions and/or there's interest
> in a proper submission.
>
> Andrea
>
>
> From e7d07a6c04b2565fceedcd71c2175e7df7e11d96 Mon Sep 17 00:00:00 2001
> From: Andrea Parri <[email protected]>
> Date: Thu, 9 Nov 2023 11:03:00 +0100
> Subject: [PATCH 1/2] locking: Introduce prepare_sync_core_cmd()
>
> Introduce an architecture function that architectures can use to set
> up ("prepare") SYNC_CORE commands.
>
> The function will be used by RISC-V to update its "deferred icache-
> flush" data structures (icache_stale_mask).
>
> Architectures defining prepare_sync_core_cmd() static inline need to
> select ARCH_HAS_PREPARE_SYNC_CORE_CMD.
>
> Signed-off-by: Andrea Parri <[email protected]>
> Suggested-by: Mathieu Desnoyers <[email protected]>
> ---
> include/linux/sync_core.h | 16 +++++++++++++++-
> init/Kconfig | 3 +++
> kernel/sched/membarrier.c | 1 +
> 3 files changed, 19 insertions(+), 1 deletion(-)
>
> diff --git a/include/linux/sync_core.h b/include/linux/sync_core.h
> index 013da4b8b3272..67bb9794b8758 100644
> --- a/include/linux/sync_core.h
> +++ b/include/linux/sync_core.h
> @@ -17,5 +17,19 @@ static inline void sync_core_before_usermode(void)
> }
> #endif
>
> -#endif /* _LINUX_SYNC_CORE_H */
> +#ifdef CONFIG_ARCH_HAS_PREPARE_SYNC_CORE_CMD
> +#include <asm/sync_core.h>
> +#else
> +/*
> + * This is a dummy prepare_sync_core_cmd() implementation that can be used on
> + * all architectures which provide unconditional core serializing instructions
> + * in switch_mm().
> + * If your architecture doesn't provide such core serializing instructions in
> + * switch_mm(), you may need to write your own functions.
> + */
> +static inline void prepare_sync_core_cmd(struct mm_struct *mm)
> +{
> +}
> +#endif
>
> +#endif /* _LINUX_SYNC_CORE_H */
> diff --git a/init/Kconfig b/init/Kconfig
> index 6d35728b94b2b..61f5f982ca451 100644
> --- a/init/Kconfig
> +++ b/init/Kconfig
> @@ -1972,6 +1972,9 @@ source "kernel/Kconfig.locks"
> config ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
> bool
>
> +config ARCH_HAS_PREPARE_SYNC_CORE_CMD
> + bool
> +
> config ARCH_HAS_SYNC_CORE_BEFORE_USERMODE
> bool
>
> diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c
> index 2ad881d07752c..58f801e013988 100644
> --- a/kernel/sched/membarrier.c
> +++ b/kernel/sched/membarrier.c
> @@ -320,6 +320,7 @@ static int membarrier_private_expedited(int flags, int cpu_id)
> MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY))
> return -EPERM;
> ipi_func = ipi_sync_core;
> + prepare_sync_core_cmd(mm);
> } else if (flags == MEMBARRIER_FLAG_RSEQ) {
> if (!IS_ENABLED(CONFIG_RSEQ))
> return -EINVAL;
> --
> 2.34.1
>
>
> From 617896a1d58a5f8b0e5895dbc928a54e0461d959 Mon Sep 17 00:00:00 2001
> From: Andrea Parri <[email protected]>
> Date: Tue, 7 Nov 2023 21:08:06 +0100
> Subject: [PATCH 2/2] membarrier: riscv: Provide core serializing command
>
> RISC-V uses xRET instructions on return from interrupt and to go back
> to user-space; the xRET instruction is not core serializing.
>
> Use FENCE.I for providing core serialization as follows:
>
> - by calling sync_core_before_usermode() on return from interrupt (cf.
> ipi_sync_core()),
>
> - via switch_mm() and sync_core_before_usermode() (respectively, for
> uthread->uthread and kthread->uthread transitions) to go back to
> user-space.
>
> On RISC-V, the serialization in switch_mm() is activated by resetting
> the icache_stale_mask of the mm at prepare_sync_core_cmd().
>
> Signed-off-by: Andrea Parri <[email protected]>
> Suggested-by: Palmer Dabbelt <[email protected]>
> ---
> .../membarrier-sync-core/arch-support.txt | 2 +-
> arch/riscv/Kconfig | 3 +++
> arch/riscv/include/asm/sync_core.h | 23 +++++++++++++++++++
> 3 files changed, 27 insertions(+), 1 deletion(-)
> create mode 100644 arch/riscv/include/asm/sync_core.h
>
> diff --git a/Documentation/features/sched/membarrier-sync-core/arch-support.txt b/Documentation/features/sched/membarrier-sync-core/arch-support.txt
> index 23260ca449468..a17117d76e6d8 100644
> --- a/Documentation/features/sched/membarrier-sync-core/arch-support.txt
> +++ b/Documentation/features/sched/membarrier-sync-core/arch-support.txt
> @@ -44,7 +44,7 @@
> | openrisc: | TODO |
> | parisc: | TODO |
> | powerpc: | ok |
> - | riscv: | TODO |
> + | riscv: | ok |
> | s390: | ok |
> | sh: | TODO |
> | sparc: | TODO |
> diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
> index 9c48fecc67191..b70a0b9ea3ee7 100644
> --- a/arch/riscv/Kconfig
> +++ b/arch/riscv/Kconfig
> @@ -27,14 +27,17 @@ config RISCV
> select ARCH_HAS_GCOV_PROFILE_ALL
> select ARCH_HAS_GIGANTIC_PAGE
> select ARCH_HAS_KCOV
> + select ARCH_HAS_MEMBARRIER_SYNC_CORE
> select ARCH_HAS_MMIOWB
> select ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
> select ARCH_HAS_PMEM_API
> + select ARCH_HAS_PREPARE_SYNC_CORE_CMD
> select ARCH_HAS_PTE_SPECIAL
> select ARCH_HAS_SET_DIRECT_MAP if MMU
> select ARCH_HAS_SET_MEMORY if MMU
> select ARCH_HAS_STRICT_KERNEL_RWX if MMU && !XIP_KERNEL
> select ARCH_HAS_STRICT_MODULE_RWX if MMU && !XIP_KERNEL
> + select ARCH_HAS_SYNC_CORE_BEFORE_USERMODE
> select ARCH_HAS_SYSCALL_WRAPPER
> select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST
> select ARCH_HAS_UBSAN_SANITIZE_ALL
> diff --git a/arch/riscv/include/asm/sync_core.h b/arch/riscv/include/asm/sync_core.h
> new file mode 100644
> index 0000000000000..8be5e07d641ab
> --- /dev/null
> +++ b/arch/riscv/include/asm/sync_core.h
> @@ -0,0 +1,23 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef _ASM_RISCV_SYNC_CORE_H
> +#define _ASM_RISCV_SYNC_CORE_H
> +
> +/*
> + * RISC-V implements return to user-space through an xRET instruction,
> + * which is not core serializing.
> + */
> +static inline void sync_core_before_usermode(void)
> +{
> + asm volatile ("fence.i" ::: "memory");
> +}
> +
> +/*
> + * Ensure the next switch_mm() on every CPU issues a core serializing
> + * instruction for the given @mm.
> + */
> +static inline void prepare_sync_core_cmd(struct mm_struct *mm)
> +{
> + cpumask_setall(&mm->context.icache_stale_mask);
> +}
> +
> +#endif /* _ASM_RISCV_SYNC_CORE_H */
> --
> 2.34.1
>
This looks good to me, can you send out a non-RFC? I just sent out
patches to support userspace fence.i:
https://lore.kernel.org/linux-riscv/[email protected]/T/#t.
- Charlie
On 2023-11-22 20:07, Charlie Jenkins wrote:
> On Thu, Nov 09, 2023 at 08:24:58PM +0100, Andrea Parri wrote:
>> Mathieu, all,
>>
>> Sorry for the delay,
>>
>>> AFAIR this patch implements sync_core_before_usermode which gets used by
>>> membarrier_mm_sync_core_before_usermode() to handle the uthread->kthread->uthread
>>> case. It relies on switch_mm issuing a core serializing instruction as well.
>>>
>>> Looking at RISC-V switch_mm(), I see that switch_mm() calls:
>>>
>>> flush_icache_deferred(next, cpu);
>>>
>>> which only issues a fence.i if a deferred icache flush was required. We're
>>> missing the part that sets the icache_stale_mask cpumask bits when a
>>> MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE is invoked.
>>
>> [...]
>>
>>> The only part where I think you may want to keep some level of deferred
>>> icache flushing as you do now is as follows:
>>>
>>> - when membarrier MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE is invoked,
>>> call a new architecture hook which sets cpumask bits in the mm context
>>> that tells the next switch_mm on each cpu to issue fence.i for that mm.
>>> - keep something like flush_icache_deferred as you have now.
>>>
>>> Otherwise, I fear the overhead of a very expensive fence.i would be too
>>> much when processes registering with MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE
>>> and start doing fence.i on each and every switch_mm().
>>>
>>> So you'd basically rely on membarrier to only issue IPIs to the CPUs which are
>>> currently running threads belonging to the mm, and handle the switch_mm with
>>> the sync_core_before_usermode() for uthread->kthread->uthread case, and implement
>>> a deferred icache flush for the typical switch_mm() case.
>>
>> I've (tried to) put this together and obtained the two patches reported below.
>> Please let me know if this aligns with your intentions and/or there's interest
>> in a proper submission.
>
> This looks good to me, can you send out a non-RFC? I just sent out
> patches to support userspace fence.i:
>
https://lore.kernel.org/linux-riscv/[email protected]/T/#t.
>
> - Charlie
>
Hi Andrea,
Yes, please send those as non-RFC patches. They align well with my
intentions.
Thanks!
Mathieu
--
Mathieu Desnoyers
EfficiOS Inc.
https://www.efficios.com
On Thu, Nov 23, 2023 at 2:07 AM Charlie Jenkins <[email protected]> wrote:
>
> On Thu, Nov 09, 2023 at 08:24:58PM +0100, Andrea Parri wrote:
> > Mathieu, all,
> >
> > Sorry for the delay,
> >
> > > AFAIR this patch implements sync_core_before_usermode which gets used by
> > > membarrier_mm_sync_core_before_usermode() to handle the uthread->kthread->uthread
> > > case. It relies on switch_mm issuing a core serializing instruction as well.
> > >
> > > Looking at RISC-V switch_mm(), I see that switch_mm() calls:
> > >
> > > flush_icache_deferred(next, cpu);
> > >
> > > which only issues a fence.i if a deferred icache flush was required. We're
> > > missing the part that sets the icache_stale_mask cpumask bits when a
> > > MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE is invoked.
> >
> > [...]
> >
> > > The only part where I think you may want to keep some level of deferred
> > > icache flushing as you do now is as follows:
> > >
> > > - when membarrier MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE is invoked,
> > > call a new architecture hook which sets cpumask bits in the mm context
> > > that tells the next switch_mm on each cpu to issue fence.i for that mm.
> > > - keep something like flush_icache_deferred as you have now.
> > >
> > > Otherwise, I fear the overhead of a very expensive fence.i would be too
> > > much when processes registering with MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE
> > > and start doing fence.i on each and every switch_mm().
> > >
> > > So you'd basically rely on membarrier to only issue IPIs to the CPUs which are
> > > currently running threads belonging to the mm, and handle the switch_mm with
> > > the sync_core_before_usermode() for uthread->kthread->uthread case, and implement
> > > a deferred icache flush for the typical switch_mm() case.
> >
> > I've (tried to) put this together and obtained the two patches reported below.
> > Please let me know if this aligns with your intentions and/or there's interest
> > in a proper submission.
> >
> > Andrea
> >
> >
> > From e7d07a6c04b2565fceedcd71c2175e7df7e11d96 Mon Sep 17 00:00:00 2001
> > From: Andrea Parri <[email protected]>
> > Date: Thu, 9 Nov 2023 11:03:00 +0100
> > Subject: [PATCH 1/2] locking: Introduce prepare_sync_core_cmd()
> >
> > Introduce an architecture function that architectures can use to set
> > up ("prepare") SYNC_CORE commands.
> >
> > The function will be used by RISC-V to update its "deferred icache-
> > flush" data structures (icache_stale_mask).
> >
> > Architectures defining prepare_sync_core_cmd() static inline need to
> > select ARCH_HAS_PREPARE_SYNC_CORE_CMD.
> >
> > Signed-off-by: Andrea Parri <[email protected]>
> > Suggested-by: Mathieu Desnoyers <[email protected]>
> > ---
> > include/linux/sync_core.h | 16 +++++++++++++++-
> > init/Kconfig | 3 +++
> > kernel/sched/membarrier.c | 1 +
> > 3 files changed, 19 insertions(+), 1 deletion(-)
> >
> > diff --git a/include/linux/sync_core.h b/include/linux/sync_core.h
> > index 013da4b8b3272..67bb9794b8758 100644
> > --- a/include/linux/sync_core.h
> > +++ b/include/linux/sync_core.h
> > @@ -17,5 +17,19 @@ static inline void sync_core_before_usermode(void)
> > }
> > #endif
> >
> > -#endif /* _LINUX_SYNC_CORE_H */
> > +#ifdef CONFIG_ARCH_HAS_PREPARE_SYNC_CORE_CMD
> > +#include <asm/sync_core.h>
> > +#else
> > +/*
> > + * This is a dummy prepare_sync_core_cmd() implementation that can be used on
> > + * all architectures which provide unconditional core serializing instructions
> > + * in switch_mm().
> > + * If your architecture doesn't provide such core serializing instructions in
> > + * switch_mm(), you may need to write your own functions.
> > + */
> > +static inline void prepare_sync_core_cmd(struct mm_struct *mm)
> > +{
> > +}
> > +#endif
> >
> > +#endif /* _LINUX_SYNC_CORE_H */
> > diff --git a/init/Kconfig b/init/Kconfig
> > index 6d35728b94b2b..61f5f982ca451 100644
> > --- a/init/Kconfig
> > +++ b/init/Kconfig
> > @@ -1972,6 +1972,9 @@ source "kernel/Kconfig.locks"
> > config ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
> > bool
> >
> > +config ARCH_HAS_PREPARE_SYNC_CORE_CMD
> > + bool
> > +
> > config ARCH_HAS_SYNC_CORE_BEFORE_USERMODE
> > bool
> >
> > diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c
> > index 2ad881d07752c..58f801e013988 100644
> > --- a/kernel/sched/membarrier.c
> > +++ b/kernel/sched/membarrier.c
> > @@ -320,6 +320,7 @@ static int membarrier_private_expedited(int flags, int cpu_id)
> > MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY))
> > return -EPERM;
> > ipi_func = ipi_sync_core;
> > + prepare_sync_core_cmd(mm);
> > } else if (flags == MEMBARRIER_FLAG_RSEQ) {
> > if (!IS_ENABLED(CONFIG_RSEQ))
> > return -EINVAL;
> > --
> > 2.34.1
> >
> >
> > From 617896a1d58a5f8b0e5895dbc928a54e0461d959 Mon Sep 17 00:00:00 2001
> > From: Andrea Parri <[email protected]>
> > Date: Tue, 7 Nov 2023 21:08:06 +0100
> > Subject: [PATCH 2/2] membarrier: riscv: Provide core serializing command
> >
> > RISC-V uses xRET instructions on return from interrupt and to go back
> > to user-space; the xRET instruction is not core serializing.
> >
> > Use FENCE.I for providing core serialization as follows:
> >
> > - by calling sync_core_before_usermode() on return from interrupt (cf.
> > ipi_sync_core()),
> >
> > - via switch_mm() and sync_core_before_usermode() (respectively, for
> > uthread->uthread and kthread->uthread transitions) to go back to
> > user-space.
> >
> > On RISC-V, the serialization in switch_mm() is activated by resetting
> > the icache_stale_mask of the mm at prepare_sync_core_cmd().
> >
> > Signed-off-by: Andrea Parri <[email protected]>
> > Suggested-by: Palmer Dabbelt <[email protected]>
> > ---
> > .../membarrier-sync-core/arch-support.txt | 2 +-
> > arch/riscv/Kconfig | 3 +++
> > arch/riscv/include/asm/sync_core.h | 23 +++++++++++++++++++
> > 3 files changed, 27 insertions(+), 1 deletion(-)
> > create mode 100644 arch/riscv/include/asm/sync_core.h
> >
> > diff --git a/Documentation/features/sched/membarrier-sync-core/arch-support.txt b/Documentation/features/sched/membarrier-sync-core/arch-support.txt
> > index 23260ca449468..a17117d76e6d8 100644
> > --- a/Documentation/features/sched/membarrier-sync-core/arch-support.txt
> > +++ b/Documentation/features/sched/membarrier-sync-core/arch-support.txt
> > @@ -44,7 +44,7 @@
> > | openrisc: | TODO |
> > | parisc: | TODO |
> > | powerpc: | ok |
> > - | riscv: | TODO |
> > + | riscv: | ok |
> > | s390: | ok |
> > | sh: | TODO |
> > | sparc: | TODO |
> > diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
> > index 9c48fecc67191..b70a0b9ea3ee7 100644
> > --- a/arch/riscv/Kconfig
> > +++ b/arch/riscv/Kconfig
> > @@ -27,14 +27,17 @@ config RISCV
> > select ARCH_HAS_GCOV_PROFILE_ALL
> > select ARCH_HAS_GIGANTIC_PAGE
> > select ARCH_HAS_KCOV
> > + select ARCH_HAS_MEMBARRIER_SYNC_CORE
> > select ARCH_HAS_MMIOWB
> > select ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
> > select ARCH_HAS_PMEM_API
> > + select ARCH_HAS_PREPARE_SYNC_CORE_CMD
> > select ARCH_HAS_PTE_SPECIAL
> > select ARCH_HAS_SET_DIRECT_MAP if MMU
> > select ARCH_HAS_SET_MEMORY if MMU
> > select ARCH_HAS_STRICT_KERNEL_RWX if MMU && !XIP_KERNEL
> > select ARCH_HAS_STRICT_MODULE_RWX if MMU && !XIP_KERNEL
> > + select ARCH_HAS_SYNC_CORE_BEFORE_USERMODE
> > select ARCH_HAS_SYSCALL_WRAPPER
> > select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST
> > select ARCH_HAS_UBSAN_SANITIZE_ALL
> > diff --git a/arch/riscv/include/asm/sync_core.h b/arch/riscv/include/asm/sync_core.h
> > new file mode 100644
> > index 0000000000000..8be5e07d641ab
> > --- /dev/null
> > +++ b/arch/riscv/include/asm/sync_core.h
> > @@ -0,0 +1,23 @@
> > +/* SPDX-License-Identifier: GPL-2.0 */
> > +#ifndef _ASM_RISCV_SYNC_CORE_H
> > +#define _ASM_RISCV_SYNC_CORE_H
> > +
> > +/*
> > + * RISC-V implements return to user-space through an xRET instruction,
> > + * which is not core serializing.
> > + */
> > +static inline void sync_core_before_usermode(void)
> > +{
> > + asm volatile ("fence.i" ::: "memory");
> > +}
> > +
> > +/*
> > + * Ensure the next switch_mm() on every CPU issues a core serializing
> > + * instruction for the given @mm.
> > + */
> > +static inline void prepare_sync_core_cmd(struct mm_struct *mm)
> > +{
> > + cpumask_setall(&mm->context.icache_stale_mask);
> > +}
> > +
> > +#endif /* _ASM_RISCV_SYNC_CORE_H */
> > --
> > 2.34.1
> >
>
> This looks good to me, can you send out a non-RFC? I just sent out
> patches to support userspace fence.i:
> https://lore.kernel.org/linux-riscv/[email protected]/T/#t.
Thank you Charlie, exactly what we are looking for!
Perfect that you added selection for fence.i, so in the future we can
select Zjid/import.i instead.
/Robbin
>
> - Charlie
>
> > This looks good to me, can you send out a non-RFC? I just sent out
> > patches to support userspace fence.i:
> > https://lore.kernel.org/linux-riscv/[email protected]/T/#t.
> >
> > - Charlie
> >
>
> Hi Andrea,
>
> Yes, please send those as non-RFC patches. They align well with my
> intentions.
>
> Thanks!
I've just sent them (after some editing to address the 0day report):
https://lore.kernel.org/lkml/[email protected]/
Andrea