Currently, atomic ops with acquire or release ordering are implemented
as atomic ops with relaxed ordering followed by or preceded by an
acquire fence or a release fence.
Section 8.1 of the "The RISC-V Instruction Set Manual Volume I:
Unprivileged ISA", titled, "Specifying Ordering of Atomic Instructions"
says:
| To provide more efficient support for release consistency [5], each
| atomic instruction has two bits, aq and rl, used to specify additional
| memory ordering constraints as viewed by other RISC-V harts.
and
| If only the aq bit is set, the atomic memory operation is treated as
| an acquire access.
| If only the rl bit is set, the atomic memory operation is treated as a
| release access.
So, rather than using two instructions (relaxed atomic op + fence), use
a single atomic op instruction with acquire/release ordering.
Example program:
atomic_t cnt = ATOMIC_INIT(0);
atomic_fetch_add_acquire(1, &cnt);
atomic_fetch_add_release(1, &cnt);
Before:
amoadd.w a4,a5,(a4) // Atomic add with relaxed ordering
fence r,rw // Fence to force Acquire ordering
fence rw,w // Fence to force Release ordering
amoadd.w a4,a5,(a4) // Atomic add with relaxed ordering
After:
amoadd.w.aq a4,a5,(a4) // Atomic add with Acquire ordering
amoadd.w.rl a4,a5,(a4) // Atomic add with Release ordering
Signed-off-by: Puranjay Mohan <[email protected]>
---
arch/riscv/include/asm/atomic.h | 64 +++++++++++++++++++++++++++++++++
1 file changed, 64 insertions(+)
diff --git a/arch/riscv/include/asm/atomic.h b/arch/riscv/include/asm/atomic.h
index 5b96c2f61adb..024e83936910 100644
--- a/arch/riscv/include/asm/atomic.h
+++ b/arch/riscv/include/asm/atomic.h
@@ -98,6 +98,30 @@ c_type arch_atomic##prefix##_fetch_##op##_relaxed(c_type i, \
return ret; \
} \
static __always_inline \
+c_type arch_atomic##prefix##_fetch_##op##_acquire(c_type i, \
+ atomic##prefix##_t *v) \
+{ \
+ register c_type ret; \
+ __asm__ __volatile__ ( \
+ " amo" #asm_op "." #asm_type ".aq %1, %2, %0" \
+ : "+A" (v->counter), "=r" (ret) \
+ : "r" (I) \
+ : "memory"); \
+ return ret; \
+} \
+static __always_inline \
+c_type arch_atomic##prefix##_fetch_##op##_release(c_type i, \
+ atomic##prefix##_t *v) \
+{ \
+ register c_type ret; \
+ __asm__ __volatile__ ( \
+ " amo" #asm_op "." #asm_type ".rl %1, %2, %0" \
+ : "+A" (v->counter), "=r" (ret) \
+ : "r" (I) \
+ : "memory"); \
+ return ret; \
+} \
+static __always_inline \
c_type arch_atomic##prefix##_fetch_##op(c_type i, atomic##prefix##_t *v) \
{ \
register c_type ret; \
@@ -117,6 +141,18 @@ c_type arch_atomic##prefix##_##op##_return_relaxed(c_type i, \
return arch_atomic##prefix##_fetch_##op##_relaxed(i, v) c_op I; \
} \
static __always_inline \
+c_type arch_atomic##prefix##_##op##_return_acquire(c_type i, \
+ atomic##prefix##_t *v) \
+{ \
+ return arch_atomic##prefix##_fetch_##op##_acquire(i, v) c_op I; \
+} \
+static __always_inline \
+c_type arch_atomic##prefix##_##op##_return_release(c_type i, \
+ atomic##prefix##_t *v) \
+{ \
+ return arch_atomic##prefix##_fetch_##op##_release(i, v) c_op I; \
+} \
+static __always_inline \
c_type arch_atomic##prefix##_##op##_return(c_type i, atomic##prefix##_t *v) \
{ \
return arch_atomic##prefix##_fetch_##op(i, v) c_op I; \
@@ -138,23 +174,39 @@ ATOMIC_OPS(add, add, +, i)
ATOMIC_OPS(sub, add, +, -i)
#define arch_atomic_add_return_relaxed arch_atomic_add_return_relaxed
+#define arch_atomic_add_return_acquire arch_atomic_add_return_acquire
+#define arch_atomic_add_return_release arch_atomic_add_return_release
#define arch_atomic_sub_return_relaxed arch_atomic_sub_return_relaxed
+#define arch_atomic_sub_return_acquire arch_atomic_sub_return_acquire
+#define arch_atomic_sub_return_release arch_atomic_sub_return_release
#define arch_atomic_add_return arch_atomic_add_return
#define arch_atomic_sub_return arch_atomic_sub_return
#define arch_atomic_fetch_add_relaxed arch_atomic_fetch_add_relaxed
+#define arch_atomic_fetch_add_acquire arch_atomic_fetch_add_acquire
+#define arch_atomic_fetch_add_release arch_atomic_fetch_add_release
#define arch_atomic_fetch_sub_relaxed arch_atomic_fetch_sub_relaxed
+#define arch_atomic_fetch_sub_acquire arch_atomic_fetch_sub_acquire
+#define arch_atomic_fetch_sub_release arch_atomic_fetch_sub_release
#define arch_atomic_fetch_add arch_atomic_fetch_add
#define arch_atomic_fetch_sub arch_atomic_fetch_sub
#ifndef CONFIG_GENERIC_ATOMIC64
#define arch_atomic64_add_return_relaxed arch_atomic64_add_return_relaxed
+#define arch_atomic64_add_return_acquire arch_atomic64_add_return_acquire
+#define arch_atomic64_add_return_release arch_atomic64_add_return_release
#define arch_atomic64_sub_return_relaxed arch_atomic64_sub_return_relaxed
+#define arch_atomic64_sub_return_acquire arch_atomic64_sub_return_acquire
+#define arch_atomic64_sub_return_release arch_atomic64_sub_return_release
#define arch_atomic64_add_return arch_atomic64_add_return
#define arch_atomic64_sub_return arch_atomic64_sub_return
#define arch_atomic64_fetch_add_relaxed arch_atomic64_fetch_add_relaxed
+#define arch_atomic64_fetch_add_acquire arch_atomic64_fetch_add_acquire
+#define arch_atomic64_fetch_add_release arch_atomic64_fetch_add_release
#define arch_atomic64_fetch_sub_relaxed arch_atomic64_fetch_sub_relaxed
+#define arch_atomic64_fetch_sub_acquire arch_atomic64_fetch_sub_acquire
+#define arch_atomic64_fetch_sub_release arch_atomic64_fetch_sub_release
#define arch_atomic64_fetch_add arch_atomic64_fetch_add
#define arch_atomic64_fetch_sub arch_atomic64_fetch_sub
#endif
@@ -175,16 +227,28 @@ ATOMIC_OPS( or, or, i)
ATOMIC_OPS(xor, xor, i)
#define arch_atomic_fetch_and_relaxed arch_atomic_fetch_and_relaxed
+#define arch_atomic_fetch_and_acquire arch_atomic_fetch_and_acquire
+#define arch_atomic_fetch_and_release arch_atomic_fetch_and_release
#define arch_atomic_fetch_or_relaxed arch_atomic_fetch_or_relaxed
+#define arch_atomic_fetch_or_acquire arch_atomic_fetch_or_acquire
+#define arch_atomic_fetch_or_release arch_atomic_fetch_or_release
#define arch_atomic_fetch_xor_relaxed arch_atomic_fetch_xor_relaxed
+#define arch_atomic_fetch_xor_acquire arch_atomic_fetch_xor_acquire
+#define arch_atomic_fetch_xor_release arch_atomic_fetch_xor_release
#define arch_atomic_fetch_and arch_atomic_fetch_and
#define arch_atomic_fetch_or arch_atomic_fetch_or
#define arch_atomic_fetch_xor arch_atomic_fetch_xor
#ifndef CONFIG_GENERIC_ATOMIC64
#define arch_atomic64_fetch_and_relaxed arch_atomic64_fetch_and_relaxed
+#define arch_atomic64_fetch_and_acquire arch_atomic64_fetch_and_acquire
+#define arch_atomic64_fetch_and_release arch_atomic64_fetch_and_release
#define arch_atomic64_fetch_or_relaxed arch_atomic64_fetch_or_relaxed
+#define arch_atomic64_fetch_or_acquire arch_atomic64_fetch_or_acquire
+#define arch_atomic64_fetch_or_release arch_atomic64_fetch_or_release
#define arch_atomic64_fetch_xor_relaxed arch_atomic64_fetch_xor_relaxed
+#define arch_atomic64_fetch_xor_acquire arch_atomic64_fetch_xor_acquire
+#define arch_atomic64_fetch_xor_release arch_atomic64_fetch_xor_release
#define arch_atomic64_fetch_and arch_atomic64_fetch_and
#define arch_atomic64_fetch_or arch_atomic64_fetch_or
#define arch_atomic64_fetch_xor arch_atomic64_fetch_xor
--
2.40.1
Hi Puranjay,
On Sun, May 05, 2024 at 12:33:40PM +0000, Puranjay Mohan wrote:
> Currently, atomic ops with acquire or release ordering are implemented
> as atomic ops with relaxed ordering followed by or preceded by an
> acquire fence or a release fence.
>
> Section 8.1 of the "The RISC-V Instruction Set Manual Volume I:
> Unprivileged ISA", titled, "Specifying Ordering of Atomic Instructions"
> says:
>
> | To provide more efficient support for release consistency [5], each
> | atomic instruction has two bits, aq and rl, used to specify additional
> | memory ordering constraints as viewed by other RISC-V harts.
>
> and
>
> | If only the aq bit is set, the atomic memory operation is treated as
> | an acquire access.
> | If only the rl bit is set, the atomic memory operation is treated as a
> | release access.
>
> So, rather than using two instructions (relaxed atomic op + fence), use
> a single atomic op instruction with acquire/release ordering.
>
> Example program:
>
> atomic_t cnt = ATOMIC_INIT(0);
> atomic_fetch_add_acquire(1, &cnt);
> atomic_fetch_add_release(1, &cnt);
>
> Before:
>
> amoadd.w a4,a5,(a4) // Atomic add with relaxed ordering
> fence r,rw // Fence to force Acquire ordering
>
> fence rw,w // Fence to force Release ordering
> amoadd.w a4,a5,(a4) // Atomic add with relaxed ordering
>
> After:
>
> amoadd.w.aq a4,a5,(a4) // Atomic add with Acquire ordering
>
> amoadd.w.rl a4,a5,(a4) // Atomic add with Release ordering
>
> Signed-off-by: Puranjay Mohan <[email protected]>
Your changes are effectively partially reverting:
5ce6c1f3535fa ("riscv/atomic: Strengthen implementations with fences")
Can you please provide (and possibly include in the changelog of v2) a more
thoughtful explanation for the correctness of such a revert?
(Anticipating a somewhat non-trivial analysis...)
Have you tried your changes on some actual hardware? How did they perform?
Anything worth mentioning (besides the mere instruction count)?
Andrea
Andrea Parri <[email protected]> writes:
> Hi Puranjay,
>
> On Sun, May 05, 2024 at 12:33:40PM +0000, Puranjay Mohan wrote:
>> Currently, atomic ops with acquire or release ordering are implemented
>> as atomic ops with relaxed ordering followed by or preceded by an
>> acquire fence or a release fence.
>>
>> Section 8.1 of the "The RISC-V Instruction Set Manual Volume I:
>> Unprivileged ISA", titled, "Specifying Ordering of Atomic Instructions"
>> says:
>>
>> | To provide more efficient support for release consistency [5], each
>> | atomic instruction has two bits, aq and rl, used to specify additional
>> | memory ordering constraints as viewed by other RISC-V harts.
>>
>> and
>>
>> | If only the aq bit is set, the atomic memory operation is treated as
>> | an acquire access.
>> | If only the rl bit is set, the atomic memory operation is treated as a
>> | release access.
>>
>> So, rather than using two instructions (relaxed atomic op + fence), use
>> a single atomic op instruction with acquire/release ordering.
>>
>> Example program:
>>
>> atomic_t cnt = ATOMIC_INIT(0);
>> atomic_fetch_add_acquire(1, &cnt);
>> atomic_fetch_add_release(1, &cnt);
>>
>> Before:
>>
>> amoadd.w a4,a5,(a4) // Atomic add with relaxed ordering
>> fence r,rw // Fence to force Acquire ordering
>>
>> fence rw,w // Fence to force Release ordering
>> amoadd.w a4,a5,(a4) // Atomic add with relaxed ordering
>>
>> After:
>>
>> amoadd.w.aq a4,a5,(a4) // Atomic add with Acquire ordering
>>
>> amoadd.w.rl a4,a5,(a4) // Atomic add with Release ordering
>>
>> Signed-off-by: Puranjay Mohan <[email protected]>
>
> Your changes are effectively partially reverting:
>
> 5ce6c1f3535fa ("riscv/atomic: Strengthen implementations with fences")
>
> Can you please provide (and possibly include in the changelog of v2) a more
> thoughtful explanation for the correctness of such a revert?
>
> (Anticipating a somewhat non-trivial analysis...)
Hi Andrea,
I think Guo Ren sent a patch[1] like this earlier but it did not get
comments yet. I will reply on that thread[1] as well.
I saw the commit 5ce6c1f3535fa ("riscv/atomic: Strengthen
implementations with fences") and all the related discussions.
This is what I understand from the discussions:
RISCV's LR.aq/SC.rl were following RCpc ordering but the LKMM expected
RCsc ordering from lock() and unlock(). So you added fences to force RCsc
ordering in the locks/atomics.
An experiment with LKMM and RISCV MM:
The following litmus test should not reach (1:r0=1 /\ 1:r1=0) with LKMM:
C unlock-lock-read-ordering
{}
/* s initially owned by P1 */
P0(int *x, int *y)
{
WRITE_ONCE(*x, 1);
smp_wmb();
WRITE_ONCE(*y, 1);
}
P1(int *x, int *y, spinlock_t *s)
{
int r0;
int r1;
r0 = READ_ONCE(*y);
spin_unlock(s);
spin_lock(s);
r1 = READ_ONCE(*x);
}
exists (1:r0=1 /\ 1:r1=0)
Which is indeed true:
Test unlock-lock-read-ordering Allowed
States 3
1:r0=0; 1:r1=0;
1:r0=0; 1:r1=1;
1:r0=1; 1:r1=1;
No
Witnesses
Positive: 0 Negative: 3
Flag unmatched-unlock
Condition exists (1:r0=1 /\ 1:r1=0)
Observation unlock-lock-read-ordering Never 0 3
Time unlock-lock-read-ordering 0.01
Hash=ab0cfdcde54d1bb1faa731533980f424
And when I map this test to RISC-V:
RISCV RISCV-unlock-lock-read-ordering
{
0:x2=x;
0:x4=y;
1:x2=x;
1:x4=y;
1:x6=s;
}
P0 | P1 ;
ori x1,x0,1 | lw x1,0(x4) ;
sw x1,0(x2) | amoswap.w.rl x0,x0,(x6) ;
fence w,w | ori x5,x0,1 ;
ori x3,x0,1 | amoswap.w.aq x0,x5,(x6) ;
sw x3,0(x4) | lw x3,0(x2) ;
exists (1:x1=1 /\ 1:x3=0)
This also doesn't reach the condition:
Test RISCV-unlock-lock-read-ordering Allowed
States 3
1:x1=0; 1:x3=0;
1:x1=0; 1:x3=1;
1:x1=1; 1:x3=1;
No
Witnesses
Positive: 0 Negative: 3
Condition exists (1:x1=1 /\ 1:x3=0)
Observation RISCV-unlock-lock-read-ordering Never 0 3
Time RISCV-unlock-lock-read-ordering 0.01
Hash=d845d36e2a8480165903870d135dd81e
Your commit mentioned that the above test would reach the exists
condition for RISCV.
So, maybe the model has been modified to make .aq and .rl RCsc now?
This presentation[2] by Dan Lustig says on page 31:
| PPO RULES 5-7
| A release that precedes an acquire in program
| order also precedes it in global memory order
| • i.e., the RCsc variant of release consistency
If the above is true, the weak fences could be removed, and LR, SC, and AMOs
with aq, rl, and aqrl bits could be used for the kernel's atomics and locks.
[1] https://lore.kernel.org/all/[email protected]/
[2] https://riscv.org/wp-content/uploads/2018/05/14.25-15.00-RISCVMemoryModelTutorial.pdf
Thanks,
Puranjay
> I think Guo Ren sent a patch[1] like this earlier but it did not get
> comments yet. I will reply on that thread[1] as well.
TBF, those changes appeared in a later submission/series,
https://lore.kernel.org/lkml/[email protected]/
a submission that received similar feedback from the ATOMIC INFRASTRUCTURE
maintainers and myself: in short, "please explain _why_ you are doing what
you are doing".
> I saw the commit 5ce6c1f3535fa ("riscv/atomic: Strengthen
> implementations with fences") and all the related discussions.
>
> This is what I understand from the discussions:
>
> RISCV's LR.aq/SC.rl were following RCpc ordering but the LKMM expected
> RCsc ordering from lock() and unlock(). So you added fences to force RCsc
> ordering in the locks/atomics.
Appreciate the effort. Some corrections/clarifications:
When 5ce6c1f3535fa was developed, the LKMM expected "less-than-RCsc" ordering
from the lock operations. Some of those properties were illustrated by the
unlock-lock-read-ordering litmus test you reported (and included in
0123f4d76ca63 ("riscv/spinlock: Strengthen implementations with fences") ).
It's also worth mentioning that, when 5ce6c1f3535fa was discussed, the LKMM
expected similar read-read ordering properties to hold for ordinary acquire
and release operations, i.e. not necessarily a lock operation.
Later changes to the LKMM relaxed those properties for ordinary acquires and
releases, and added extra ordering for locks, cf.
6e89e831a9017 ("tools/memory-model: Add extra ordering for locks and remove it for ordinary release/acquire")
ddfe12944e848 ("tools/memory-model: Provide extra ordering for unlock+lock pair on the same CPU")
Roughly speaking, such changes made the LKMM's locks RCtso, and this matches
the current LKMM's approach. (Yes I know, there is code assuming/wishing RCsc
locks... long story, not strictly related to this discussion/thread: IAC, for
completeness, I'll say more about that in my comments below.)
My changes/the current implementations provide RCtso (not RCsc) ordering for
RISCV's locks and atomics; in fact, by their very design, this RCtso is pretty
easy to see/prove:
(1) every release op provides RW to W order (or stronger);
(2) every acquire op provides more than R to R order (typically R to RW
order, but in atomic_cond_load_acquire() & co. that R-to-W order is
limited to the "R" associated with the acquire op itself).
Put together, (1-2) give R-to-R, R-to-W and W-to-W order (aka RCtso) as claimed.
Notice that this argument holds for every lock operation and type (spinlock,
rwlock, mutex, rt_mutex, semaphore, rw_semaphore, etc.) and that it does _not_
require any audit of the locking code. More on this point below.
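For concreteness, a minimal sketch of the fence-based forms that (1) and (2)
refer to, modeled on the "Before" sequences in your patch rather than on the
exact kernel macros/fallbacks (the function names here are purely illustrative,
and the snippet assumes kernel context for atomic_t and the _relaxed op):

static __always_inline int sketch_fetch_add_acquire(int i, atomic_t *v)
{
	int ret = arch_atomic_fetch_add_relaxed(i, v);     /* amoadd.w         */
	__asm__ __volatile__("fence r, rw" ::: "memory");  /* R -> RW, cf. (2) */
	return ret;
}

static __always_inline int sketch_fetch_add_release(int i, atomic_t *v)
{
	__asm__ __volatile__("fence rw, w" ::: "memory");  /* RW -> W, cf. (1) */
	return arch_atomic_fetch_add_relaxed(i, v);        /* amoadd.w         */
}

The release fence orders every prior read and write before the AMO's store,
giving (1); the acquire fence orders the AMO's read before every later read
and write, giving (2).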
> An experiment with LKMM and RISCV MM:
>
> The following litmus test should not reach (1:r0=1 /\ 1:r1=0) with LKMM:
>
> C unlock-lock-read-ordering
>
> {}
> /* s initially owned by P1 */
>
> P0(int *x, int *y)
> {
> WRITE_ONCE(*x, 1);
> smp_wmb();
> WRITE_ONCE(*y, 1);
> }
>
> P1(int *x, int *y, spinlock_t *s)
> {
> int r0;
> int r1;
>
> r0 = READ_ONCE(*y);
> spin_unlock(s);
> spin_lock(s);
> r1 = READ_ONCE(*x);
> }
>
> exists (1:r0=1 /\ 1:r1=0)
>
> Which is indeed true:
>
> Test unlock-lock-read-ordering Allowed
> States 3
> 1:r0=0; 1:r1=0;
> 1:r0=0; 1:r1=1;
> 1:r0=1; 1:r1=1;
> No
> Witnesses
> Positive: 0 Negative: 3
> Flag unmatched-unlock
> Condition exists (1:r0=1 /\ 1:r1=0)
> Observation unlock-lock-read-ordering Never 0 3
> Time unlock-lock-read-ordering 0.01
> Hash=ab0cfdcde54d1bb1faa731533980f424
>
> And when I map this test to RISC-V:
>
> RISCV RISCV-unlock-lock-read-ordering
> {
> 0:x2=x;
> 0:x4=y;
>
> 1:x2=x;
> 1:x4=y;
> 1:x6=s;
> }
> P0 | P1 ;
> ori x1,x0,1 | lw x1,0(x4) ;
> sw x1,0(x2) | amoswap.w.rl x0,x0,(x6) ;
> fence w,w | ori x5,x0,1 ;
> ori x3,x0,1 | amoswap.w.aq x0,x5,(x6) ;
> sw x3,0(x4) | lw x3,0(x2) ;
> exists (1:x1=1 /\ 1:x3=0)
>
> This also doesn't reach the condition:
>
> Test RISCV-unlock-lock-read-ordering Allowed
> States 3
> 1:x1=0; 1:x3=0;
> 1:x1=0; 1:x3=1;
> 1:x1=1; 1:x3=1;
> No
> Witnesses
> Positive: 0 Negative: 3
> Condition exists (1:x1=1 /\ 1:x3=0)
> Observation RISCV-unlock-lock-read-ordering Never 0 3
> Time RISCV-unlock-lock-read-ordering 0.01
> Hash=d845d36e2a8480165903870d135dd81e
Which "mapping" did you use for this experiment/analysis? Looking at the
current spinlock code for RISCV (and including the atomic changes at stake)
P1 seems to be better described by something like:
fence rw,w // arch_spin_unlock --> smp_store_release
sw
lr.w // arch_spin_trylock --> arch_try_cmpxchg
bne
sc.w.rl
bnez
fence rw,rw
or
amoadd.w.aqrl // arch_spin_lock --> atomic_fetch_add
or
lw // arch_spin_lock --> atomic_cond_read_acquire ; smp_mb (??)
bne
fence r,r
fence rw,rw
Looking at the rwlock code (for which the same RCtso property is expected to
hold, even though that hasn't been formalized in the LKMM yet), I see (again,
including your atomic changes):
amoadd.w.rl // queued_read_unlock --> atomic_sub_return_release
amoadd.w.aq // queued_read_lock --> atomic_add_return_acquire
and
fence rw,w // queue_write_unlock --> smp_store_release
sw
lr.w // queue_write_lock --> atomic_try_cmpxchg_acquire
bne
sc.w
bnez
fence r,rw
I won't list the slowpath scenarios. Or even the mutex, semaphore, etc. I
believe you got the point...
> Your commit mentioned that the above test would reach the exists
> condition for RISCV.
>
> So, maybe the model has been modified to make .aq and .rl RCsc now?
Yes. .aq and .rl are RCsc. They were considered RCpc when 5ce6c1f3535fa and
0123f4d76ca63 were discussed (which happened _before_ the RISC-V memory
model was ratified), as clearly remarked in their commit messages.
The ATOMIC maintainers went as far as "bisecting" the RISC-V ISA spec in
https://lore.kernel.org/lkml/YrPei6q4rIAx6Ymf@boqun-archlinux/
but, as they say, it's hard to help people who don't want to be helped...
> This presentation[2] by Dan Lustig says on page 31:
>
> | PPO RULES 5-7
> | A release that precedes an acquire in program
> | order also precedes it in global memory order
> | • i.e., the RCsc variant of release consistency
>
> If the above is true, the weak fences could be removed, and LR, SC, and AMOs
> with aq, rl, and aqrl bits could be used for the kernel's atomics and locks.
The problem with this argument is that it relies on all lock ops to come with
an RCsc annotation, which is simply not true in the current/RISCV code as the
few snippets above also suggested.
BTW, arm64 uses a similar argument, except all its releases/acquires come with
RCsc annotations (which greatly simplifies the analysis). The argument could
be easily made to work in RISCV _provided_ its ISA were augmented with lw.aq
and sw.rl, but it's been ~6 years...
That said, maybe we're "lucky" and all the unlock+lock pairs will just work w/
your changes. I haven't really checked, and I probably won't as long as the
only motivation for such changes is "lower inst count in qemu".
In that regard, note that Section A.5, "Code porting and mapping guidelines",
of the RISCV ISA spec provides alternative mappings for our atomics, including
AMO mappings w/ .aq and .rl annotations. I'm sure those mappings were subject
to a fair amount of review and formal analysis (although I was not involved in
that work/review at the time): if inst count is so important to you, why not
simply follow those guidelines? (Notice that such a rewrite would require some
modification to non-AMO mappings, cf. smp_store_release() and LR/SC mappings.)
Andrea
Andrea Parri <[email protected]> writes:
>> I think Guo Ren sent a patch[1] like this earlier but it did not get
>> comments yet. I will reply on that thread[1] as well.
>
> TBF, those changes appeared in a later submission/series,
>
> https://lore.kernel.org/lkml/[email protected]/
>
> a submission that received similar feedback from the ATOMIC INFRASTRUCTURE
> maintainers and myself: in short, "please explain _why_ you are doing what
> you are doing".
>
>
>> I saw the commit 5ce6c1f3535fa ("riscv/atomic: Strengthen
>> implementations with fences") and all the related discussions.
>>
>> This is what I understand from the discussions:
>>
>> RISCV's LR.aq/SC.rl were following RCpc ordering but the LKMM expected
>> RCsc ordering from lock() and unlock(). So you added fences to force RCsc
>> ordering in the locks/atomics.
>
> Appreciate the effort. Some corrections/clarifications:
>
> When 5ce6c1f3535fa was developed, the LKMM expected "less-than-RCsc" ordering
> from the lock operations. Some of those properties were illustrated by the
> unlock-lock-read-ordering litmus test you reported (and included in
>
> 0123f4d76ca63 ("riscv/spinlock: Strengthen implementations with fences") ).
>
> It's also worth mentioning that, when 5ce6c1f3535fa was discussed, the LKMM
> expected similar read-read ordering properties to hold for ordinary acquire
> and release operations, i.e. not necessarily a lock operation.
>
> Later changes to the LKMM relaxed those properties for ordinary acquires and
> releases, and added extra ordering for locks, cf.
>
> 6e89e831a9017 ("tools/memory-model: Add extra ordering for locks and remove it for ordinary release/acquire")
> ddfe12944e848 ("tools/memory-model: Provide extra ordering for unlock+lock pair on the same CPU")
>
> Roughly speaking, such changes made the LKMM's locks RCtso, and this matches
> the current LKMM's approach. (Yes I know, there is code assuming/wishing RCsc
> locks... long story, not strictly related to this discussion/thread: IAC, for
> completeness, I'll say more about that in my comments below.)
>
> My changes/the current implementations provide RCtso (not RCsc) ordering for
> RISCV's locks and atomics; in fact, by their very design, this RCtso is pretty
> easy to see/prove:
>
> (1) every release op provides RW to W order (or stronger);
>
> (2) every acquire op provides more than R to R order (typically R to RW
> order, but in atomic_cond_load_acquire() & co. that R-to-W order is
> limited to the "R" associated with the acquire op itself).
>
> Put together, (1-2) give R-to-R, R-to-W and W-to-W order (aka RCtso) as claimed.
> Notice that this argument holds for every lock operation and type (spinlock,
> rwlock, mutex, rt_mutex, semaphore, rw_semaphore, etc.) and that it does _not_
> require any audit of the locking code. More on this point below.
>
>
>> An experiment with LKMM and RISCV MM:
>>
>> The following litmus test should not reach (1:r0=1 /\ 1:r1=0) with LKMM:
>>
>> C unlock-lock-read-ordering
>>
>> {}
>> /* s initially owned by P1 */
>>
>> P0(int *x, int *y)
>> {
>> WRITE_ONCE(*x, 1);
>> smp_wmb();
>> WRITE_ONCE(*y, 1);
>> }
>>
>> P1(int *x, int *y, spinlock_t *s)
>> {
>> int r0;
>> int r1;
>>
>> r0 = READ_ONCE(*y);
>> spin_unlock(s);
>> spin_lock(s);
>> r1 = READ_ONCE(*x);
>> }
>>
>> exists (1:r0=1 /\ 1:r1=0)
>>
>> Which is indeed true:
>>
>> Test unlock-lock-read-ordering Allowed
>> States 3
>> 1:r0=0; 1:r1=0;
>> 1:r0=0; 1:r1=1;
>> 1:r0=1; 1:r1=1;
>> No
>> Witnesses
>> Positive: 0 Negative: 3
>> Flag unmatched-unlock
>> Condition exists (1:r0=1 /\ 1:r1=0)
>> Observation unlock-lock-read-ordering Never 0 3
>> Time unlock-lock-read-ordering 0.01
>> Hash=ab0cfdcde54d1bb1faa731533980f424
>>
>> And when I map this test to RISC-V:
>>
>> RISCV RISCV-unlock-lock-read-ordering
>> {
>> 0:x2=x;
>> 0:x4=y;
>>
>> 1:x2=x;
>> 1:x4=y;
>> 1:x6=s;
>> }
>> P0 | P1 ;
>> ori x1,x0,1 | lw x1,0(x4) ;
>> sw x1,0(x2) | amoswap.w.rl x0,x0,(x6) ;
>> fence w,w | ori x5,x0,1 ;
>> ori x3,x0,1 | amoswap.w.aq x0,x5,(x6) ;
>> sw x3,0(x4) | lw x3,0(x2) ;
>> exists (1:x1=1 /\ 1:x3=0)
>>
>> This also doesn't reach the condition:
>>
>> Test RISCV-unlock-lock-read-ordering Allowed
>> States 3
>> 1:x1=0; 1:x3=0;
>> 1:x1=0; 1:x3=1;
>> 1:x1=1; 1:x3=1;
>> No
>> Witnesses
>> Positive: 0 Negative: 3
>> Condition exists (1:x1=1 /\ 1:x3=0)
>> Observation RISCV-unlock-lock-read-ordering Never 0 3
>> Time RISCV-unlock-lock-read-ordering 0.01
>> Hash=d845d36e2a8480165903870d135dd81e
>
> Which "mapping" did you use for this experiment/analysis? Looking at the
Actually, by mapping I meant:
R1
amoswap.w.rl
amoswap.w.aq
R2
will provide R1->R2 ordering like:
R1
spin_unlock()
spin_lock()
R2
That test is for read->read ordering enforced by unlock()->lock() and I
just wanted to say that the current RISC-V memory model provides that
with all (AMO/LR/SC) .rl -> .aq operations.
> current spinlock code for RISCV (and including the atomic changes at stake)
> P1 seems to be better described by something like:
>
> fence rw,w // arch_spin_unlock --> smp_store_release
> sw
>
> lr.w // arch_spin_trylock --> arch_try_cmpxchg
> bne
> sc.w.rl
> bnez
> fence rw,rw
>
> or
>
> amoadd.w.aqrl // arch_spin_lock --> atomic_fetch_add
>
> or
>
> lw // arch_spin_lock --> atomic_cond_read_acquire ; smp_mb (??)
> bne
> fence r,r
>
> fence rw,rw
>
> Looking at the rwlock code (for which the same RCtso property is expected to
> hold, even though that hasn't been formalized in the LKMM yet), I see (again,
> including your atomic changes):
>
> amoadd.w.rl // queued_read_unlock --> atomic_sub_return_release
> amoadd.w.aq // queued_read_lock --> atomic_add_return_acquire
>
> and
>
> fence rw,w // queue_write_unlock --> smp_store_release
> sw
> lr.w // queue_write_lock --> atomic_try_cmpxchg_acquire
> bne
> sc.w
> bnez
> fence r,rw
>
> I won't list the slowpath scenarios. Or even the mutex, semaphore, etc. I
> believe you got the point...
Thanks for these details; I will do more testing with this on herd7.
Something out of curiosity:
From my understanding of the current version of the RV memory model:
.aq provides .aq -> all ordering
.rl provides all -> .rl ordering
and because this is RCsc variant of release consistency
.rl -> .aq
which means
R/W
amoswap.w.rl
amoswap.w.aq
R/W
Should act as a full fence? R/W -> rl -> aq -> R/W
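If so, an SB-shaped test along these lines (an untested sketch, written in the
same format as the test above) should never reach its exists clause under the
RVWMO model:

RISCV RISCV-sb-rl-aq
{
0:x2=x;
0:x4=y;
0:x6=s;
1:x2=x;
1:x4=y;
}
 P0                       | P1            ;
 ori x1,x0,1              | ori x1,x0,1   ;
 sw x1,0(x2)              | sw x1,0(x4)   ;
 amoswap.w.rl x0,x0,(x6)  | fence rw,rw   ;
 ori x5,x0,1              | lw x3,0(x2)   ;
 amoswap.w.aq x0,x5,(x6)  |               ;
 lw x3,0(x4)              |               ;
exists (0:x3=0 /\ 1:x3=0)

(P1 uses a full fence so that only the .rl -> .aq ordering on P0 is what is
being tested.)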
>
>> Your commit mentioned that the above test would reach the exists
>> condition for RISCV.
>>
>> So, maybe the model has been modified to make .aq and .rl RCsc now?
>
> Yes. .aq and .rl are RCsc. They were considered RCpc when 5ce6c1f3535fa and
> 0123f4d76ca63 were discussed (which happened _before_ the RISC-V memory
> model was ratified), as clearly remarked in their commit messages.
>
> The ATOMIC maintainers went as far as "bisecting" the RISC-V ISA spec in
>
> https://lore.kernel.org/lkml/YrPei6q4rIAx6Ymf@boqun-archlinux/
>
> but, as they say, it's hard to help people who don't want to be helped...
>
>
>> This presentation[2] by Dan Lustig says on page 31:
>>
>> | PPO RULES 5-7
>> | A release that precedes an acquire in program
>> | order also precedes it in global memory order
>> | • i.e., the RCsc variant of release consistency
>>
>> If the above is true, the weak fences could be removed, and LR, SC, and AMOs
>> with aq, rl, and aqrl bits could be used for the kernel's atomics and locks.
>
> The problem with this argument is that it relies on all lock ops to come with
> an RCsc annotation, which is simply not true in the current/RISCV code as the
> few snippets above also suggested.
>
> BTW, arm64 uses a similar argument, except all its releases/acquires come with
> RCsc annotations (which greatly simplifies the analysis). The argument could
>> be easily made to work in RISCV _provided_ its ISA were augmented with lw.aq
> and sw.rl, but it's been ~6 years...
>
> That said, maybe we're "lucky" and all the unlock+lock pairs will just work w/
> your changes. I haven't really checked, and I probably won't as long as the
> only motivation for such changes is "lower inst count in qemu".
>
> In that regard, note that Section A.5, "Code porting and mapping guidelines",
> of the RISCV ISA spec provides alternative mappings for our atomics, including
> AMO mappings w/ .aq and .rl annotations. I'm sure those mappings were subject
> to a fair amount of review and formal analysis (although I was not involved in
> that work/review at the time): if inst count is so important to you, why not
> simply follow those guidelines? (Notice that such a rewrite would require some
> modification to non-AMO mappings, cf. smp_store_release() and LR/SC mappings.)
So, I will do the following now:
1. Do some benchmarking on real hardware and find out how much overhead
these weak fences add.
2. Study the LKMM and the RVWMO for the next few weeks/months or however
much time it takes me to confidently reason about things written in
these two models.
3. Study the locking / related code of RISC-V to see what could break if
we change all these operations in accordance with "Code Porting and
Mapping Guidelines" of RISCV ISA.
4. I will use the herd7 models of LKMM and RVWMO and see if everything
works as expected after these changes.
And if I am convinced after all this, I will send a patch to implement
"Code Porting and Mapping Guidelines" + provide performance numbers from
real hardware.
Thanks for the detailed explanations and especially regarding how the
LKMM evolved.
Puranjay
> From my understanding of the current version of the RV memory model:
>
> .aq provides .aq -> all ordering
> .rl provides all -> .rl ordering
> and because this is RCsc variant of release consistency
> .rl -> .aq
>
> which means
>
> R/W
> amoswap.w.rl
> amoswap.w.aq
> R/W
>
> Should act as a full fence? R/W -> rl -> aq -> R/W
Yes, hence the RCsc ("sc" for "sequentially consistent") qualification.
> So, I will do the following now:
>
> 1. Do some benchmarking on real hardware and find out how much overhead
> these weak fences add.
> 2. Study the LKMM and the RVWMO for the next few weeks/months or however
> much time it takes me to confidently reason about things written in
> these two models.
> 3. Study the locking / related code of RISC-V to see what could break if
> we change all these operations in accordance with "Code Porting and
> Mapping Guidelines" of RISCV ISA.
> 4. I will use the herd7 models of LKMM and RVWMO and see if everything
> works as expected after these changes.
>
>
> And if I am convinced after all this, I will send a patch to implement
> "Code Porting and Mapping Guidelines" + provide performance numbers from
> real hardware.
>
> Thanks for the detailed explanations and especially regarding how the
> LKMM evolved.
Sounds good! Thanks.
Andrea