2019-04-24 19:42:05

by Peter Zijlstra

[permalink] [raw]
Subject: [RFC][PATCH 2/5] mips/atomic: Fix loongson_llsc_mb() wreckage

The comment describing the loongson_llsc_mb() reorder case doesn't
make any sense what so ever. Instruction re-ordering is not an SMP
artifact, but rather a CPU local phenomenon. This means that _every_
LL/SC loop needs this barrier right in front to avoid the CPU from
leaking a memop inside it.

For the branch speculation case; if futex_atomic_cmpxchg_inatomic()
needs one at the bne branch target, then surely the normal
__cmpxch_asmg() implementation does too. We cannot rely on the
barriers from cmpxchg() because cmpxchg_local() is implemented with
the same macro, and branch prediction and speculation are, too, CPU
local.

Fixes: e02e07e3127d ("MIPS: Loongson: Introduce and use loongson_llsc_mb()")
Cc: Huacai Chen <[email protected]>
Cc: Huang Pei <[email protected]>
Cc: Paul Burton <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
---
arch/mips/include/asm/atomic.h | 24 ++++++++++++++++++++----
arch/mips/include/asm/barrier.h | 7 ++-----
arch/mips/include/asm/bitops.h | 5 +++++
arch/mips/include/asm/cmpxchg.h | 5 +++++
arch/mips/include/asm/local.h | 2 ++
arch/mips/kernel/syscall.c | 1 +
6 files changed, 35 insertions(+), 9 deletions(-)

--- a/arch/mips/include/asm/atomic.h
+++ b/arch/mips/include/asm/atomic.h
@@ -193,6 +193,7 @@ static __inline__ int atomic_sub_if_posi
if (kernel_uses_llsc) {
int temp;

+ loongson_llsc_mb();
__asm__ __volatile__(
" .set push \n"
" .set "MIPS_ISA_LEVEL" \n"
@@ -200,16 +201,23 @@ static __inline__ int atomic_sub_if_posi
" .set pop \n"
" subu %0, %1, %3 \n"
" move %1, %0 \n"
- " bltz %0, 1f \n"
+ " bltz %0, 2f \n"
" .set push \n"
" .set "MIPS_ISA_LEVEL" \n"
" sc %1, %2 \n"
"\t" __scbeqz " %1, 1b \n"
- "1: \n"
+ "2: \n"
" .set pop \n"
: "=&r" (result), "=&r" (temp),
"+" GCC_OFF_SMALL_ASM() (v->counter)
: "Ir" (i));
+
+ if (!IS_ENABLED(CONFIG_SMP))
+ loongson_llsc_mb();
+ /*
+ * Otherwise the loongson_llsc_mb() for the bltz target is
+ * implied by the smp_llsc_mb() below.
+ */
} else {
unsigned long flags;

@@ -395,20 +403,28 @@ static __inline__ long atomic64_sub_if_p
if (kernel_uses_llsc) {
long temp;

+ loongson_llsc_mb();
__asm__ __volatile__(
" .set push \n"
" .set "MIPS_ISA_LEVEL" \n"
"1: lld %1, %2 # atomic64_sub_if_positive\n"
" dsubu %0, %1, %3 \n"
" move %1, %0 \n"
- " bltz %0, 1f \n"
+ " bltz %0, 2f \n"
" scd %1, %2 \n"
"\t" __scbeqz " %1, 1b \n"
- "1: \n"
+ "2: \n"
" .set pop \n"
: "=&r" (result), "=&r" (temp),
"+" GCC_OFF_SMALL_ASM() (v->counter)
: "Ir" (i));
+
+ if (!IS_ENABLED(CONFIG_SMP))
+ loongson_llsc_mb();
+ /*
+ * Otherwise the loongson_llsc_mb() for the bltz target is
+ * implied by the smp_llsc_mb() below.
+ */
} else {
unsigned long flags;

--- a/arch/mips/include/asm/barrier.h
+++ b/arch/mips/include/asm/barrier.h
@@ -248,10 +248,7 @@
*
* In order to avoid this we need to place a memory barrier (ie. a sync
* instruction) prior to every ll instruction, in between it & any earlier
- * memory access instructions. Many of these cases are already covered by
- * smp_mb__before_llsc() but for the remaining cases, typically ones in
- * which multiple CPUs may operate on a memory location but ordering is not
- * usually guaranteed, we use loongson_llsc_mb() below.
+ * memory access instructions.
*
* This reordering case is fixed by 3A R2 CPUs, ie. 3A2000 models and later.
*
@@ -267,7 +264,7 @@
* This case affects all current Loongson 3 CPUs.
*/
#ifdef CONFIG_CPU_LOONGSON3_WORKAROUNDS /* Loongson-3's LLSC workaround */
-#define loongson_llsc_mb() __asm__ __volatile__(__WEAK_LLSC_MB : : :"memory")
+#define loongson_llsc_mb() __asm__ __volatile__("sync" : : :"memory")
#else
#define loongson_llsc_mb() do { } while (0)
#endif
--- a/arch/mips/include/asm/bitops.h
+++ b/arch/mips/include/asm/bitops.h
@@ -249,6 +249,7 @@ static inline int test_and_set_bit(unsig
unsigned long *m = ((unsigned long *) addr) + (nr >> SZLONG_LOG);
unsigned long temp;

+ loongson_llsc_mb();
do {
__asm__ __volatile__(
" .set push \n"
@@ -305,6 +306,7 @@ static inline int test_and_set_bit_lock(
unsigned long *m = ((unsigned long *) addr) + (nr >> SZLONG_LOG);
unsigned long temp;

+ loongson_llsc_mb();
do {
__asm__ __volatile__(
" .set push \n"
@@ -364,6 +366,7 @@ static inline int test_and_clear_bit(uns
unsigned long *m = ((unsigned long *) addr) + (nr >> SZLONG_LOG);
unsigned long temp;

+ loongson_llsc_mb();
do {
__asm__ __volatile__(
" " __LL "%0, %1 # test_and_clear_bit \n"
@@ -379,6 +382,7 @@ static inline int test_and_clear_bit(uns
unsigned long *m = ((unsigned long *) addr) + (nr >> SZLONG_LOG);
unsigned long temp;

+ loongson_llsc_mb();
do {
__asm__ __volatile__(
" .set push \n"
@@ -438,6 +442,7 @@ static inline int test_and_change_bit(un
unsigned long *m = ((unsigned long *) addr) + (nr >> SZLONG_LOG);
unsigned long temp;

+ loongson_llsc_mb();
do {
__asm__ __volatile__(
" .set push \n"
--- a/arch/mips/include/asm/cmpxchg.h
+++ b/arch/mips/include/asm/cmpxchg.h
@@ -46,6 +46,7 @@ extern unsigned long __xchg_called_with_
__typeof(*(m)) __ret; \
\
if (kernel_uses_llsc) { \
+ loongson_llsc_mb(); \
__asm__ __volatile__( \
" .set push \n" \
" .set noat \n" \
@@ -117,6 +118,7 @@ static inline unsigned long __xchg(volat
__typeof(*(m)) __ret; \
\
if (kernel_uses_llsc) { \
+ loongson_llsc_mb(); \
__asm__ __volatile__( \
" .set push \n" \
" .set noat \n" \
@@ -134,6 +136,7 @@ static inline unsigned long __xchg(volat
: "=&r" (__ret), "=" GCC_OFF_SMALL_ASM() (*m) \
: GCC_OFF_SMALL_ASM() (*m), "Jr" (old), "Jr" (new) \
: "memory"); \
+ loongson_llsc_mb(); \
} else { \
unsigned long __flags; \
\
@@ -229,6 +232,7 @@ static inline unsigned long __cmpxchg64(
*/
local_irq_save(flags);

+ loongson_llsc_mb();
asm volatile(
" .set push \n"
" .set " MIPS_ISA_ARCH_LEVEL " \n"
@@ -274,6 +278,7 @@ static inline unsigned long __cmpxchg64(
"r" (old),
"r" (new)
: "memory");
+ loongson_llsc_mb();

local_irq_restore(flags);
return ret;
--- a/arch/mips/include/asm/local.h
+++ b/arch/mips/include/asm/local.h
@@ -49,6 +49,7 @@ static __inline__ long local_add_return(
} else if (kernel_uses_llsc) {
unsigned long temp;

+ loongson_llsc_mb();
__asm__ __volatile__(
" .set push \n"
" .set "MIPS_ISA_ARCH_LEVEL" \n"
@@ -96,6 +97,7 @@ static __inline__ long local_sub_return(
} else if (kernel_uses_llsc) {
unsigned long temp;

+ loongson_llsc_mb();
__asm__ __volatile__(
" .set push \n"
" .set "MIPS_ISA_ARCH_LEVEL" \n"
--- a/arch/mips/kernel/syscall.c
+++ b/arch/mips/kernel/syscall.c
@@ -132,6 +132,7 @@ static inline int mips_atomic_set(unsign
[efault] "i" (-EFAULT)
: "memory");
} else if (cpu_has_llsc) {
+ loongson_llsc_mb();
__asm__ __volatile__ (
" .set push \n"
" .set "MIPS_ISA_ARCH_LEVEL" \n"



2019-04-24 13:56:31

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [RFC][PATCH 2/5] mips/atomic: Fix loongson_llsc_mb() wreckage

On Wed, Apr 24, 2019 at 02:36:58PM +0200, Peter Zijlstra wrote:
> The comment describing the loongson_llsc_mb() reorder case doesn't
> make any sense what so ever. Instruction re-ordering is not an SMP
> artifact, but rather a CPU local phenomenon. This means that _every_
> LL/SC loop needs this barrier right in front to avoid the CPU from
> leaking a memop inside it.
>
> For the branch speculation case; if futex_atomic_cmpxchg_inatomic()
> needs one at the bne branch target, then surely the normal
> __cmpxch_asmg() implementation does too. We cannot rely on the
> barriers from cmpxchg() because cmpxchg_local() is implemented with
> the same macro, and branch prediction and speculation are, too, CPU
> local.

Also; just doing them all makes for much simpler rules and less
mistakes.

2019-04-25 07:10:34

by Paul Burton

[permalink] [raw]
Subject: Re: [RFC][PATCH 2/5] mips/atomic: Fix loongson_llsc_mb() wreckage

Hi Peter,

On Wed, Apr 24, 2019 at 02:36:58PM +0200, Peter Zijlstra wrote:
> The comment describing the loongson_llsc_mb() reorder case doesn't
> make any sense what so ever. Instruction re-ordering is not an SMP
> artifact, but rather a CPU local phenomenon. This means that _every_
> LL/SC loop needs this barrier right in front to avoid the CPU from
> leaking a memop inside it.

Does it?

The Loongson bug being described here causes an sc to succeed
erroneously if certain loads or stores are executed between the ll &
associated sc, including speculatively. On a UP system there's no code
running on other cores to race with us & cause our sc to fail - ie. sc
should always succeed anyway, so if the bug hits & the sc succeeds
what's the big deal? It would have succeeded anyway. At least that's my
understanding based on discussions with Loongson engineers a while ago.

Having said that, if you have a strong preference for adding the barrier
in UP systems anyway then I don't really object. It's not like anyone's
likely to want to run a UP kernel on the affected systems, nevermind
care about a miniscule performance impact.

One possibility your change could benefit would be if someone ran Linux
on a subset of cores & some non-Linux code on other cores, in which case
there could be something to cause the sc to fail. I've no idea if that's
something these Loongson systems ever do though.

> For the branch speculation case; if futex_atomic_cmpxchg_inatomic()
> needs one at the bne branch target, then surely the normal
> __cmpxch_asmg() implementation does too. We cannot rely on the

s/cmpxch_asmg/cmpxchg_asm/

> barriers from cmpxchg() because cmpxchg_local() is implemented with
> the same macro, and branch prediction and speculation are, too, CPU
> local.

Similar story - cmpxchg_local() ought not have CPUs racing for
access to the memory in question. Having said that I don't know the
details of when Loongson clears LLBit (ie. causes an sc to fail), so if
it does that based upon access to memory at a larger granularity than
the 32b or 64b value being operated on then that could be a problem so
I'm pretty happy with adding these barriers.

Thanks,
Paul

2019-04-25 08:40:05

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [RFC][PATCH 2/5] mips/atomic: Fix loongson_llsc_mb() wreckage

On Wed, Apr 24, 2019 at 09:18:04PM +0000, Paul Burton wrote:
> Hi Peter,
>
> On Wed, Apr 24, 2019 at 02:36:58PM +0200, Peter Zijlstra wrote:
> > The comment describing the loongson_llsc_mb() reorder case doesn't
> > make any sense what so ever. Instruction re-ordering is not an SMP
> > artifact, but rather a CPU local phenomenon. This means that _every_
> > LL/SC loop needs this barrier right in front to avoid the CPU from
> > leaking a memop inside it.
>
> Does it?

It does, however..

> The Loongson bug being described here causes an sc to succeed
> erroneously if certain loads or stores are executed between the ll &
> associated sc, including speculatively. On a UP system there's no code
> running on other cores to race with us & cause our sc to fail - ie. sc
> should always succeed anyway, so if the bug hits & the sc succeeds
> what's the big deal? It would have succeeded anyway. At least that's my
> understanding based on discussions with Loongson engineers a while ago.

Ah! So that wasn't spelled out as such. This basically says that: Yes,
it also screws with SC on UP, however the failure case is harmless.

(Also the comment with loongson_llsc_mb() seems incomplete in that it
doesn't mention the SC can also erroneously fail; typically that isn't a
problem because we'll just get an extra loop around and succeed
eventually.)

That said; I'm not entirely sure. The reason we use LL/SC even for
CPU-local variables is because of interrupts and the like. Would not a
false positive be a problem if it _should_ fail because of an interrupt?

> Having said that, if you have a strong preference for adding the barrier
> in UP systems anyway then I don't really object. It's not like anyone's
> likely to want to run a UP kernel on the affected systems, nevermind
> care about a miniscule performance impact.

It mostly all didn't make sense to me; and having a consistent recipe
for LL/SC loops is easier on the person occasionally looking at all
this (me, mostly :-).

(also, you should probably have a look at
include/asm-generic/bitops/atomic.h)

> One possibility your change could benefit would be if someone ran Linux
> on a subset of cores & some non-Linux code on other cores, in which case
> there could be something to cause the sc to fail. I've no idea if that's
> something these Loongson systems ever do though.

Or a bunch of UP guests ?

> > For the branch speculation case; if futex_atomic_cmpxchg_inatomic()
> > needs one at the bne branch target, then surely the normal
> > __cmpxch_asmg() implementation does too. We cannot rely on the
>
> s/cmpxch_asmg/cmpxchg_asm/

Typing hard :-)

2019-04-25 08:42:52

by Peter Zijlstra

[permalink] [raw]
Subject: Re: Re: [RFC][PATCH 2/5] mips/atomic: Fix loongson_llsc_mb() wreckage


A: Because it messes up the order in which people normally read text.
Q: Why is top-posting such a bad thing?
A: Top-posting.
Q: What is the most annoying thing in e-mail?

On Thu, Apr 25, 2019 at 12:58:50PM +0800, [email protected] wrote:
> In my opinion. patch 2/3 is about Loongson's bug, and patch 4/5 is another theme.

Agreed; it's just that looking at the MIPS code to fix 4/5 made me trip
over this stuff.

> Let me explain the bug more specific:
>
> the bug ONLY matters in following situation:
>
> #. more than one cpu (assume cpu A and B) doing ll/sc on same shared
> var V
>
> #. speculative memory access from A cause A erroneously succeed sc
> operation, since the erroneously successful sc operation violate the
> coherence protocol. (here coherence protocol means the rules that CPU
> follow to implement ll/sc right)
>
> #. B succeed sc operation too, but this sc operation is right both
> logically and follow the coherence protocol, and makes A's sc wrong
> logically since only ONE sc operation can succeed.

(I know your coherence protocol is probably more complicated than MESI,
but bear with me)

So A speculatively gets V's line in Exclusive mode, speculates the Lock
flag is still there and completes the Store. This speculative store then
leaks out and violates MESI because there _should_ only be one Exclusive
owner of a line (B).

Something like that?

> If it is not LL/SC but other memory access from B on V, A's ll/sc can
> follow the atomic semantics even if A violate the coherence protocol
> in the same situation.

*shudder*...

C atomic-set

{
atomic_set(v, 1);
}

P1(atomic_t *v)
{
atomic_add_unless(v, 1, 0);
}

P2(atomic_t *v)
{
atomic_set(v, 0);
}

exists
(v=2)

So that one will still work? (that is, v=2 is forbidden)

> In one word, the bug only affect local cpu‘s ll/sc operation, and
> affect MP system.

Because it is a coherence issue, triggered by a reorder. OK.

> PS:
>
> If local_t is only ll/sc manipulated by current CPU, then no need fix it.

It _should_ be CPU local, but this was not at all clear from reading the
original changelog nor the comment with loongson_llsc_mb().

2019-04-25 09:11:50

by Peter Zijlstra

[permalink] [raw]
Subject: Re: Re: [RFC][PATCH 2/5] mips/atomic: Fix loongson_llsc_mb() wreckage

On Thu, Apr 25, 2019 at 09:33:48AM +0200, Peter Zijlstra wrote:

> > Let me explain the bug more specific:
> >
> > the bug ONLY matters in following situation:
> >
> > #. more than one cpu (assume cpu A and B) doing ll/sc on same shared
> > var V
> >
> > #. speculative memory access from A cause A erroneously succeed sc
> > operation, since the erroneously successful sc operation violate the
> > coherence protocol. (here coherence protocol means the rules that CPU
> > follow to implement ll/sc right)
> >
> > #. B succeed sc operation too, but this sc operation is right both
> > logically and follow the coherence protocol, and makes A's sc wrong
> > logically since only ONE sc operation can succeed.
>
> (I know your coherence protocol is probably more complicated than MESI,
> but bear with me)
>
> So A speculatively gets V's line in Exclusive mode, speculates the Lock
> flag is still there and completes the Store. This speculative store then
> leaks out and violates MESI because there _should_ only be one Exclusive
> owner of a line (B).
>
> Something like that?

So B gets E (from LL), does I on A, then SC succeeds and get M. A got
I, speculates E, speculates M and lets the M escape.

That gets us with 2 competing Ms (which is of course completely
insane), one wins, one loses (at random I presume).

And this violates atomic guarantees because one operation got lost.

> > If it is not LL/SC but other memory access from B on V, A's ll/sc can
> > follow the atomic semantics even if A violate the coherence protocol
> > in the same situation.
>
> *shudder*...
>
> C atomic-set
>
> {
> atomic_set(v, 1);
> }
>
> P1(atomic_t *v)
> {
> atomic_add_unless(v, 1, 0);
> }
>
> P2(atomic_t *v)
> {
> atomic_set(v, 0);
> }
>
> exists
> (v=2)
>
> So that one will still work? (that is, v=2 is forbidden)

But then in this case, P1 has E from LL, P2 does M from the STORE, which
should cause I on P1. P1 speculates E, speculates M and lets M escape.

We again have two competing Ms, one wins at random, and v==2 if P1
wins. This again violates the atomic guarantees and would invalidate
your claim of it only mattering for competing LL/SC.

Or am I missing something? (quite likely, I always get confused with
these things)

2019-04-25 09:16:01

by Peter Zijlstra

[permalink] [raw]
Subject: Re: Re: [RFC][PATCH 2/5] mips/atomic: Fix loongson_llsc_mb() wreckage

On Thu, Apr 25, 2019 at 09:33:48AM +0200, Peter Zijlstra wrote:
> > Let me explain the bug more specific:
> >
> > the bug ONLY matters in following situation:
> >
> > #. more than one cpu (assume cpu A and B) doing ll/sc on same shared
> > var V
> >
> > #. speculative memory access from A cause A erroneously succeed sc
> > operation, since the erroneously successful sc operation violate the
> > coherence protocol. (here coherence protocol means the rules that CPU
> > follow to implement ll/sc right)
> >
> > #. B succeed sc operation too, but this sc operation is right both
> > logically and follow the coherence protocol, and makes A's sc wrong
> > logically since only ONE sc operation can succeed.

> > In one word, the bug only affect local cpu‘s ll/sc operation, and
> > affect MP system.

> > PS:
> >
> > If local_t is only ll/sc manipulated by current CPU, then no need fix it.
>
> It _should_ be CPU local, but this was not at all clear from reading the
> original changelog nor the comment with loongson_llsc_mb().

However, if it is a coherence issue, the thing is at the cacheline
level, and there is nothing that says the line isn't shared, just the
one variable isn't.

Ideally there should be minimal false sharing, but....

2019-04-25 10:44:26

by Huang Pei

[permalink] [raw]
Subject: Re: Re: [RFC][PATCH 2/5] mips/atomic: Fix loongson_llsc_mb() wreckage

In my opinion. patch 2/3 is about Loongson's bug, and patch 4/5 is another theme.

Let me explain the bug more specific:

the bug ONLY matters in following situation:

#. more than one cpu (assume cpu A and B) doing ll/sc on same shared var V

#. speculative memory access from A cause A erroneously succeed sc operation, since the
erroneously successful sc operation violate the coherence protocol. (here coherence protocol means the rules that CPU follow to implement ll/sc right)

#. B succeed sc operation too, but this sc operation is right both logically and follow
the coherence protocol, and makes A's sc wrong logically since only ONE sc operation
can succeed.

If it is not LL/SC but other memory access from B on V, A's ll/sc can follow the atomic semantics even if A violate the coherence protocol in the same situation.

In one word, the bug only affect local cpu‘s ll/sc operation, and affect MP system.


PS:

If local_t is only ll/sc manipulated by current CPU, then no need fix it.






> -----原始邮件-----
> 发件人: "Paul Burton" <[email protected]>
> 发送时间: 2019-04-25 05:18:04 (星期四)
> 收件人: "Peter Zijlstra" <[email protected]>
> 抄送: "[email protected]" <[email protected]>, "[email protected]" <[email protected]>, "[email protected]" <[email protected]>, "[email protected]" <[email protected]>, "[email protected]" <[email protected]>, "[email protected]" <[email protected]>, "[email protected]" <[email protected]>, "[email protected]" <[email protected]>, "[email protected]" <[email protected]>, "[email protected]" <[email protected]>, "[email protected]" <[email protected]>, "[email protected]" <[email protected]>, "[email protected]" <[email protected]>, "Huacai Chen" <[email protected]>, "Huang Pei" <[email protected]>
> 主题: Re: [RFC][PATCH 2/5] mips/atomic: Fix loongson_llsc_mb() wreckage
>
> Hi Peter,
>
> On Wed, Apr 24, 2019 at 02:36:58PM +0200, Peter Zijlstra wrote:
> > The comment describing the loongson_llsc_mb() reorder case doesn't
> > make any sense what so ever. Instruction re-ordering is not an SMP
> > artifact, but rather a CPU local phenomenon. This means that _every_
> > LL/SC loop needs this barrier right in front to avoid the CPU from
> > leaking a memop inside it.
>
> Does it?
>
> The Loongson bug being described here causes an sc to succeed
> erroneously if certain loads or stores are executed between the ll &
> associated sc, including speculatively. On a UP system there's no code
> running on other cores to race with us & cause our sc to fail - ie. sc
> should always succeed anyway, so if the bug hits & the sc succeeds
> what's the big deal? It would have succeeded anyway. At least that's my
> understanding based on discussions with Loongson engineers a while ago.
>
> Having said that, if you have a strong preference for adding the barrier
> in UP systems anyway then I don't really object. It's not like anyone's
> likely to want to run a UP kernel on the affected systems, nevermind
> care about a miniscule performance impact.
>
> One possibility your change could benefit would be if someone ran Linux
> on a subset of cores & some non-Linux code on other cores, in which case
> there could be something to cause the sc to fail. I've no idea if that's
> something these Loongson systems ever do though.
>
> > For the branch speculation case; if futex_atomic_cmpxchg_inatomic()
> > needs one at the bne branch target, then surely the normal
> > __cmpxch_asmg() implementation does too. We cannot rely on the
>
> s/cmpxch_asmg/cmpxchg_asm/
>
> > barriers from cmpxchg() because cmpxchg_local() is implemented with
> > the same macro, and branch prediction and speculation are, too, CPU
> > local.
>
> Similar story - cmpxchg_local() ought not have have CPUs racing for
> access to the memory in question. Having said that I don't know the
> details of when Loongson clears LLBit (ie. causes an sc to fail), so if
> it does that on based upon access to memory at a larger granularity than
> the 32b or 64b value being operated on then that could be a problem so
> I'm pretty happy with adding these barriers.
>
> Thanks,
> Paul


北京市海淀区中关村环保科技示范园龙芯产业园2号楼 100095电话: +86 (10) 62546668传真: +86 (10) 62600826http://www.loongson.cn本邮件及其附件含有龙芯中科技术有限公司的商业秘密信息,仅限于发送给上面地址中列出的个人或群组。禁止任何其他人以任何形式使用(包括但不限于全部或部 分地泄露、复制或散发)本邮件及其附件中的信息。如果您错收本邮件,请您立即电话或邮件通知发件人并删除本邮件。

This email and its attachments contain confidential information from Loongson
Technology Corporation Limited, which is intended only for the person or entity
whose address is listed above. Any use of the information contained herein in
any way (including, but not limited to, total or partial disclosure,
reproduction or dissemination) by persons other than the intended recipient(s)
is prohibited. If you receive this email in error, please notify the sender by
phone or email immediately and delete it.

2019-04-25 13:48:19

by Huang Pei

[permalink] [raw]
Subject: Re: Re: Re: [RFC][PATCH 2/5] mips/atomic: Fix loongson_llsc_mb() wreckage




> -----原始邮件-----
> 发件人: "Peter Zijlstra" <[email protected]>
> 发送时间: 2019-04-25 17:09:07 (星期四)
> 收件人: [email protected]
> 抄送: "Paul Burton" <[email protected]>, "[email protected]" <[email protected]>, "[email protected]" <[email protected]>, "[email protected]" <[email protected]>, "[email protected]" <[email protected]>, "[email protected]" <[email protected]>, "[email protected]" <[email protected]>, "[email protected]" <[email protected]>, "[email protected]" <[email protected]>, "[email protected]" <[email protected]>, "[email protected]" <[email protected]>, "[email protected]" <[email protected]>, "[email protected]" <[email protected]>, "[email protected]" <[email protected]>, "Huacai Chen" <[email protected]>
> 主题: Re: Re: [RFC][PATCH 2/5] mips/atomic: Fix loongson_llsc_mb() wreckage
>
> On Thu, Apr 25, 2019 at 09:33:48AM +0200, Peter Zijlstra wrote:
>
> > > Let me explain the bug more specific:
> > >
> > > the bug ONLY matters in following situation:
> > >
> > > #. more than one cpu (assume cpu A and B) doing ll/sc on same shared
> > > var V
> > >
> > > #. speculative memory access from A cause A erroneously succeed sc
> > > operation, since the erroneously successful sc operation violate the
> > > coherence protocol. (here coherence protocol means the rules that CPU
> > > follow to implement ll/sc right)
> > >
> > > #. B succeed sc operation too, but this sc operation is right both
> > > logically and follow the coherence protocol, and makes A's sc wrong
> > > logically since only ONE sc operation can succeed.
> >
> > (I know your coherence protocol is probably more complicated than MESI,
> > but bear with me)
> >
> > So A speculatively gets V's line in Exclusive mode, speculates the Lock
> > flag is still there and completes the Store. This speculative store then
> > leaks out and violates MESI because there _should_ only be one Exclusive
> > owner of a line (B).
> >
> > Something like that?
>
> So B gets E (from LL), does I on A, then SC succeeds and get M. A got
> I, speculates E, speculates M and lets the M escape.
>
> That gets us with 2 competing Ms (which is of course completely
> insane), one wins one looses (at random I presume).
>
> And this violates atomic guarantees because one operation got lost.


Based on what I was told:

#. A get E from LL, previous speculative access from A
kick V out, so A should get I, but A still thought A get E ;

#. and B get E from LL,so B get sc done right; V get sc atomically by B;

#. A still thought it get E, so A get sc done, V get sc by A, but not atomically.



>
> > > If it is not LL/SC but other memory access from B on V, A's ll/sc can
> > > follow the atomic semantics even if A violate the coherence protocol
> > > in the same situation.
> >
> > *shudder*...
> >
> > C atomic-set
> >
> > {
> > atomic_set(v, 1);
> > }
> >
> > P1(atomic_t *v)
> > {
> > atomic_add_unless(v, 1, 0);
> > }
> >
> > P2(atomic_t *v)
> > {
> > atomic_set(v, 0);
> > }
> >
> > exists
> > (v=2)
> >
> > So that one will still work? (that is, v=2 is forbidden)
>
> But then in this case, P1 has E from LL, P2 does M from the STORE, which
> should cause I on P1. P1 speculates E, speculates M and lets M escape.
>
> We again have two competing Ms, one wins at random, and v==2 if P1
> wins. This again violates the atomic guarantees and would invalidate
> your claim of it only mattering for competing LL/SC.
>
> Or am I missing something? (quite likely, I always get confused with
> these things)


北京市海淀区中关村环保科技示范园龙芯产业园2号楼 100095电话: +86 (10) 62546668传真: +86 (10) 62600826http://www.loongson.cn本邮件及其附件含有龙芯中科技术有限公司的商业秘密信息,仅限于发送给上面地址中列出的个人或群组。禁止任何其他人以任何形式使用(包括但不限于全部或部 分地泄露、复制或散发)本邮件及其附件中的信息。如果您错收本邮件,请您立即电话或邮件通知发件人并删除本邮件。

This email and its attachments contain confidential information from Loongson
Technology Corporation Limited, which is intended only for the person or entity
whose address is listed above. Any use of the information contained herein in
any way (including, but not limited to, total or partial disclosure,
reproduction or dissemination) by persons other than the intended recipient(s)
is prohibited. If you receive this email in error, please notify the sender by
phone or email immediately and delete it.

2019-04-25 13:51:03

by Peter Zijlstra

[permalink] [raw]
Subject: Re: Re: Re: [RFC][PATCH 2/5] mips/atomic: Fix loongson_llsc_mb() wreckage

On Thu, Apr 25, 2019 at 07:32:59PM +0800, [email protected] wrote:

> > > If it is not LL/SC but other memory access from B on V, A's ll/sc can
> > > follow the atomic semantics even if A violate the coherence protocol
> > > in the same situation.
> >
> > *shudder*...
> >
> > C atomic-set
> >
> > {
> > atomic_set(v, 1);
> > }

This is the initial state.

>
> >
> > P1(atomic_t *v)
> > {
> > atomic_add_unless(v, 1, 0);
> > }
> >
> > P2(atomic_t *v)
> > {
> > atomic_set(v, 0);
> > }
> >
> > exists
> > (v=2)
> >
> > So that one will still work? (that is, v=2 is forbidden)
>
> you mean C,P1, P2 on 3 different CPU? I do not know much about LKMM, can explain the test case more explicit?

The 'C' is the language identifier, the 'atomic-set' is the litmus name.

The unnamed block give the initial conditions.

Pn blocks give code sequences for CPU n

The 'exists' clause is evaluated after all Pn blocks are done.

There's others in this thread that can point you to many papers and
resources on these litmus test thingies.

So basically the initial value of @v is set to 1.

Then CPU-1 does atomic_add_unless(v, 1, 0)
CPU-2 does atomic_set(v, 0)

If CPU1 goes first, it will see 1, which is not 0 and thus add 1 to 1
and obtains 2. Then CPU2 goes and writes 0, so the exist clause sees
v==0 and doesn't observe 2.

The other way around, CPU-2 goes first, writes a 0, then CPU-1 goes and
observes the 0, finds it matches 0 and doesn't add. Again, the exist
clause will find 0 doesn't match 2.

This all goes unstuck if interleaved like:


CPU-1 CPU-2

xor t0, t0
1: ll t0, v
bez t0, 2f
sw t0, v
add t0, t1
sc t0, v
beqz t0, 1b

(sorry if I got the MIPS asm wrong; it's not something I normally write)

And the store-word from CPU-2 doesn't make the SC from CPU-1 fail.

2019-04-25 14:10:25

by Huang Pei

[permalink] [raw]
Subject: Re: Re: Re: Re: [RFC][PATCH 2/5] mips/atomic: Fix loongson_llsc_mb() wreckage




> -----原始邮件-----
> 发件人: "Peter Zijlstra" <[email protected]>
> 发送时间: 2019-04-25 20:26:11 (星期四)
> 收件人: [email protected]
> 抄送: "Paul Burton" <[email protected]>, "[email protected]" <[email protected]>, "[email protected]" <[email protected]>, "[email protected]" <[email protected]>, "[email protected]" <[email protected]>, "[email protected]" <[email protected]>, "[email protected]" <[email protected]>, "[email protected]" <[email protected]>, "[email protected]" <[email protected]>, "[email protected]" <[email protected]>, "[email protected]" <[email protected]>, "[email protected]" <[email protected]>, "[email protected]" <[email protected]>, "[email protected]" <[email protected]>, "Huacai Chen" <[email protected]>
> 主题: Re: Re: Re: [RFC][PATCH 2/5] mips/atomic: Fix loongson_llsc_mb() wreckage
>
> On Thu, Apr 25, 2019 at 07:32:59PM +0800, [email protected] wrote:
>
> > > > If it is not LL/SC but other memory access from B on V, A's ll/sc can
> > > > follow the atomic semantics even if A violate the coherence protocol
> > > > in the same situation.
> > >
> > > *shudder*...
> > >
> > > C atomic-set
> > >
> > > {
> > > atomic_set(v, 1);
> > > }
>
> This is the initial state.
>
> >
> > >
> > > P1(atomic_t *v)
> > > {
> > > atomic_add_unless(v, 1, 0);
> > > }
> > >
> > > P2(atomic_t *v)
> > > {
> > > atomic_set(v, 0);
> > > }
> > >
> > > exists
> > > (v=2)
> > >
> > > So that one will still work? (that is, v=2 is forbidden)
> >
> > you mean C,P1, P2 on 3 different CPU? I do not know much about LKMM, can explain the test case more explicit?
>
> The 'C' is the language identifier, the 'atomic-set' is the litmus name.
>
> The unnamed block give the initial conditions.
>
> Pn blocks give code sequences for CPU n
>
> The 'exists' clause is evaluated after all Pn blocks are done.
>
> There's others in this thread that can point you to many papers and
> resources on these litmus test thingies.
>
> So basically the initial value of @v is set to 1.
>
> Then CPU-1 does atomic_add_unless(v, 1, 0)
> CPU-2 does atomic_set(v, 0)
>
> If CPU1 goes first, it will see 1, which is not 0 and thus add 1 to 1
> and obtains 2. Then CPU2 goes and writes 0, so the exist clause sees
> v==0 and doesn't observe 2.
>
> The other way around, CPU-2 goes first, writes a 0, then CPU-1 goes and
> observes the 0, finds it matches 0 and doesn't add. Again, the exist
> clause will find 0 doesn't match 2.
>
> This all goes unstuck if interleaved like:
>
>
> CPU-1 CPU-2
>
> xor t0, t0
> 1: ll t0, v
> bez t0, 2f
> sw t0, v
> add t0, t1
> sc t0, v
> beqz t0, 1b
>
> (sorry if I got the MIPS asm wrong; it's not something I normally write)
>
> And the store-word from CPU-2 doesn't make the SC from CPU-1 fail.
>

loongson's llsc bug DOES NOT fail this litmus (we will not get v=2);

only speculative memory access from CPU-1 can "blind" CPU-1 (here "blind" means doing ll/sc
wrong); this speculative memory access can be observed correctly by CPU-2. In this
case, the sw from CPU-2 can get I, which can be observed by CPU-1 and clear the llbit, then
the sc fails.



北京市海淀区中关村环保科技示范园龙芯产业园2号楼 100095电话: +86 (10) 62546668传真: +86 (10) 62600826http://www.loongson.cn本邮件及其附件含有龙芯中科技术有限公司的商业秘密信息,仅限于发送给上面地址中列出的个人或群组。禁止任何其他人以任何形式使用(包括但不限于全部或部 分地泄露、复制或散发)本邮件及其附件中的信息。如果您错收本邮件,请您立即电话或邮件通知发件人并删除本邮件。

This email and its attachments contain confidential information from Loongson
Technology Corporation Limited, which is intended only for the person or entity
whose address is listed above. Any use of the information contained herein in
any way (including, but not limited to, total or partial disclosure,
reproduction or dissemination) by persons other than the intended recipient(s)
is prohibited. If you receive this email in error, please notify the sender by
phone or email immediately and delete it.

2019-04-25 15:34:18

by Huang Pei

[permalink] [raw]
Subject: Re: Re: Re: [RFC][PATCH 2/5] mips/atomic: Fix loongson_llsc_mb() wreckage




> -----原始邮件-----
> 发件人: "Peter Zijlstra" <[email protected]>
> 发送时间: 2019-04-25 15:33:48 (星期四)
> 收件人: [email protected]
> 抄送: "Paul Burton" <[email protected]>, "[email protected]" <[email protected]>, "[email protected]" <[email protected]>, "[email protected]" <[email protected]>, "[email protected]" <[email protected]>, "[email protected]" <[email protected]>, "[email protected]" <[email protected]>, "[email protected]" <[email protected]>, "[email protected]" <[email protected]>, "[email protected]" <[email protected]>, "[email protected]" <[email protected]>, "[email protected]" <[email protected]>, "[email protected]" <[email protected]>, "[email protected]" <[email protected]>, "Huacai Chen" <[email protected]>
> 主题: Re: Re: [RFC][PATCH 2/5] mips/atomic: Fix loongson_llsc_mb() wreckage
>
>
> A: Because it messes up the order in which people normally read text.
> Q: Why is top-posting such a bad thing?
> A: Top-posting.
> Q: What is the most annoying thing in e-mail?
>
> On Thu, Apr 25, 2019 at 12:58:50PM +0800, [email protected] wrote:
> > In my opinion. patch 2/3 is about Loongson's bug, and patch 4/5 is another theme.
>
> Agreed; it's just that looking at the MIPS code to fix 4/5 made me trip
> over this stuff.
>
> > Let me explain the bug more specific:
> >
> > the bug ONLY matters in following situation:
> >
> > #. more than one cpu (assume cpu A and B) doing ll/sc on same shared
> > var V
> >
> > #. speculative memory access from A cause A erroneously succeed sc
> > operation, since the erroneously successful sc operation violate the
> > coherence protocol. (here coherence protocol means the rules that CPU
> > follow to implement ll/sc right)
> >
> > #. B succeed sc operation too, but this sc operation is right both
> > logically and follow the coherence protocol, and makes A's sc wrong
> > logically since only ONE sc operation can succeed.
>
> (I know your coherence protocol is probably more complicated than MESI,
> but bear with me)

our coherence protocol is simpler than MESI.
>
> So A speculatively gets V's line in Exclusive mode, speculates the Lock
> flag is still there and completes the Store. This speculative store then
> leaks out and violates MESI because there _should_ only be one Exclusive
> owner of a line (B).
>
> Something like that?

Yes

>
> > If it is not LL/SC but other memory access from B on V, A's ll/sc can
> > follow the atomic semantics even if A violate the coherence protocol
> > in the same situation.
>
> *shudder*...
>
> C atomic-set
>
> {
> atomic_set(v, 1);
> }


>
> P1(atomic_t *v)
> {
> atomic_add_unless(v, 1, 0);
> }
>
> P2(atomic_t *v)
> {
> atomic_set(v, 0);
> }
>
> exists
> (v=2)
>
> So that one will still work? (that is, v=2 is forbidden)

you mean C, P1, P2 on 3 different CPUs? I do not know much about LKMM; can you explain the test case more explicitly?

>
> > In one word, the bug only affect local cpu‘s ll/sc operation, and
> > affect MP system.
>
> Because it is a coherence issue, triggered by a reorder. OK.

Not exactly, it is an ll/sc issue, triggered by speculative memory access from the local cpu,
based on what I was told.
>
> > PS:
> >
> > If local_t is only ll/sc manipulated by current CPU, then no need fix it.
>
> It _should_ be CPU local, but this was not at all clear from reading the
> original changelog nor the comment with loongson_llsc_mb().


北京市海淀区中关村环保科技示范园龙芯产业园2号楼 100095电话: +86 (10) 62546668传真: +86 (10) 62600826http://www.loongson.cn本邮件及其附件含有龙芯中科技术有限公司的商业秘密信息,仅限于发送给上面地址中列出的个人或群组。禁止任何其他人以任何形式使用(包括但不限于全部或部 分地泄露、复制或散发)本邮件及其附件中的信息。如果您错收本邮件,请您立即电话或邮件通知发件人并删除本邮件。

This email and its attachments contain confidential information from Loongson
Technology Corporation Limited, which is intended only for the person or entity
whose address is listed above. Any use of the information contained herein in
any way (including, but not limited to, total or partial disclosure,
reproduction or dissemination) by persons other than the intended recipient(s)
is prohibited. If you receive this email in error, please notify the sender by
phone or email immediately and delete it.

2019-04-25 15:36:11

by Peter Zijlstra

[permalink] [raw]
Subject: Re: Re: Re: Re: [RFC][PATCH 2/5] mips/atomic: Fix loongson_llsc_mb() wreckage

On Thu, Apr 25, 2019 at 08:51:17PM +0800, [email protected] wrote:

> > So basically the initial value of @v is set to 1.
> >
> > Then CPU-1 does atomic_add_unless(v, 1, 0)
> > CPU-2 does atomic_set(v, 0)
> >
> > If CPU1 goes first, it will see 1, which is not 0 and thus add 1 to 1
> > and obtains 2. Then CPU2 goes and writes 0, so the exist clause sees
> > v==0 and doesn't observe 2.
> >
> > The other way around, CPU-2 goes first, writes a 0, then CPU-1 goes and
> > observes the 0, finds it matches 0 and doesn't add. Again, the exist
> > clause will find 0 doesn't match 2.
> >
> > This all goes unstuck if interleaved like:
> >
> >
> > CPU-1 CPU-2
> >
> > xor t0, t0
> > 1: ll t0, v
> > bez t0, 2f
> > sw t0, v
> > add t0, t1
> > sc t0, v
> > beqz t0, 1b
> >
> > (sorry if I got the MIPS asm wrong; it's not something I normally write)
> >
> > And the store-word from CPU-2 doesn't make the SC from CPU-1 fail.
> >
>
> loongson's llsc bug DOES NOT fail this litmus( we will not get V=2);
>
> only speculative memory access from CPU-1 can "blind" CPU-1(here blind means do ll/sc
> wrong), this speculative memory access can be observed corrently by CPU2. In this
> case, sw from CPU-2 can get I , which can be observed by CPU-1, and clear llbit,then
> failed sc.

I'm not following, suppose CPU-1 happens as a speculation (imagine
whatever code is required to make that happen before). CPU-2 sw will
cause I on CPU-1's ll but, as in the previous email, CPU-1 will continue
as if it still has E and complete the SC.

That is; I'm just not seeing why this case would be different from two
competing LL/SCs.

2019-04-25 16:39:50

by Linus Torvalds

[permalink] [raw]
Subject: Re: Re: [RFC][PATCH 2/5] mips/atomic: Fix loongson_llsc_mb() wreckage

On Wed, Apr 24, 2019 at 9:59 PM <[email protected]> wrote:
>
> In one word, the bug only affect local cpu‘s ll/sc operation, and affect MP system.
>
> If local_t is only ll/sc manipulated by current CPU, then no need fix it.

As to the whole MP vs UP issue:

Is this "guaranteed no problem on UP" true even in the presence of
DMA? I'm _hoping_ some MIPS chips are starting to be coherent with DMA
(but maybe DMA never participates in any coherency traffic that could
trigger the bug even if the DMA _were_ to be coherent?).

Also, as Peter mentioned, we do depend on ll/sc also reacting to
interrupts - again including on UP systems, of course. I assume that's
always handled correctly, and that an interrupt will set the right
state that the SC will not succeed?

Finally, it worries me a bit that the loongson_llsc_mb() things are
all at the C level, so who knows what compiler behavior you'll see
between the ll/sc inline asm, and the loongson_llsc_mb() generation.
Hopefully not a lot (and presumably mostly just stack reloads etc that
probably don't change any cache state), but I do wonder if the
loongson quirk should be *inside* the asm rather than separate.

Linus

2019-04-26 03:21:11

by Huang Pei

[permalink] [raw]
Subject: Re: Re: Re: Re: Re: [RFC][PATCH 2/5] mips/atomic: Fix loongson_llsc_mb() wreckage




> -----原始邮件-----
> 发件人: "Peter Zijlstra" <[email protected]>
> 发送时间: 2019-04-25 21:31:05 (星期四)
> 收件人: [email protected]
> 抄送: "Paul Burton" <[email protected]>, "[email protected]" <[email protected]>, "[email protected]" <[email protected]>, "[email protected]" <[email protected]>, "[email protected]" <[email protected]>, "[email protected]" <[email protected]>, "[email protected]" <[email protected]>, "[email protected]" <[email protected]>, "[email protected]" <[email protected]>, "[email protected]" <[email protected]>, "[email protected]" <[email protected]>, "[email protected]" <[email protected]>, "[email protected]" <[email protected]>, "[email protected]" <[email protected]>, "Huacai Chen" <[email protected]>
> 主题: Re: Re: Re: Re: [RFC][PATCH 2/5] mips/atomic: Fix loongson_llsc_mb() wreckage
>
> On Thu, Apr 25, 2019 at 08:51:17PM +0800, [email protected] wrote:
>
> > > So basically the initial value of @v is set to 1.
> > >
> > > Then CPU-1 does atomic_add_unless(v, 1, 0)
> > > CPU-2 does atomic_set(v, 0)
> > >
> > > If CPU1 goes first, it will see 1, which is not 0 and thus add 1 to 1
> > > and obtains 2. Then CPU2 goes and writes 0, so the exist clause sees
> > > v==0 and doesn't observe 2.
> > >
> > > The other way around, CPU-2 goes first, writes a 0, then CPU-1 goes and
> > > observes the 0, finds it matches 0 and doesn't add. Again, the exist
> > > clause will find 0 doesn't match 2.
> > >
> > > This all goes unstuck if interleaved like:
> > >
> > >
> > > CPU-1 CPU-2
> > >
> > > xor t0, t0
> > > 1: ll t0, v
> > > bez t0, 2f
> > > sw t0, v
> > > add t0, t1
> > > sc t0, v
> > > beqz t0, 1b
> > >
> > > (sorry if I got the MIPS asm wrong; it's not something I normally write)
> > >
> > > And the store-word from CPU-2 doesn't make the SC from CPU-1 fail.
> > >
> >
> > loongson's llsc bug DOES NOT fail this litmus( we will not get V=2);
> >
> > only speculative memory access from CPU-1 can "blind" CPU-1(here blind means do ll/sc
> > wrong), this speculative memory access can be observed corrently by CPU2. In this
> > case, sw from CPU-2 can get I , which can be observed by CPU-1, and clear llbit,then
> > failed sc.
>
> I'm not following, suppose CPU-1 happens as a speculation (imagine
> whatever code is required to make that happen before). CPU-2 sw will
> cause I on CPU-1's ll but, as in the previous email, CPU-1 will continue
> as if it still has E and complete the SC.
>
> That is; I'm just not seeing why this case would be different from two
> competing LL/SCs.
>

I get your point. I kept my eye on the sw from CPU-2, but forgot the speculative
mem access from CPU-1.

There is no difference between this one and the former case.

=========================================================================
V = 1

CPU-1 CPU-2

xor t0, t0
1: ll t0, V
beqz t0, 2f

/* if speculative mem
access kick cacheline of
V out, it can blind CPU-1
and make CPU-1 believe it
still hold E on V, and can
NOT see the sw from CPU-2
actually invalid V, which
should clear LLBit of CPU-1,
but not */
sw t0, V // just after sw, V = 0
addiu t0, t0, 1

sc t0, V
/* oops, sc write t0(2)
into V with LLBit */

/* get V=2 */
beqz t0, 1b
nop
2:
================================================================================

if the speculative mem access *does not* kick out the cache line of V, CPU-1 can see the sw
from CPU-2 and clear the LLBit, which causes the sc to fail and retry. That's OK.



北京市海淀区中关村环保科技示范园龙芯产业园2号楼 100095电话: +86 (10) 62546668传真: +86 (10) 62600826http://www.loongson.cn本邮件及其附件含有龙芯中科技术有限公司的商业秘密信息,仅限于发送给上面地址中列出的个人或群组。禁止任何其他人以任何形式使用(包括但不限于全部或部 分地泄露、复制或散发)本邮件及其附件中的信息。如果您错收本邮件,请您立即电话或邮件通知发件人并删除本邮件。

This email and its attachments contain confidential information from Loongson
Technology Corporation Limited, which is intended only for the person or entity
whose address is listed above. Any use of the information contained herein in
any way (including, but not limited to, total or partial disclosure,
reproduction or dissemination) by persons other than the intended recipient(s)
is prohibited. If you receive this email in error, please notify the sender by
phone or email immediately and delete it.

2019-05-14 15:48:46

by Peter Zijlstra

[permalink] [raw]
Subject: Re: Re: Re: Re: Re: [RFC][PATCH 2/5] mips/atomic: Fix loongson_llsc_mb() wreckage


(sorry for the delay, I got sidetracked elsewhere)

On Fri, Apr 26, 2019 at 10:57:20AM +0800, [email protected] wrote:
> > -----原始邮件-----
> > On Thu, Apr 25, 2019 at 08:51:17PM +0800, [email protected] wrote:
> >
> > > > So basically the initial value of @v is set to 1.
> > > >
> > > > Then CPU-1 does atomic_add_unless(v, 1, 0)
> > > > CPU-2 does atomic_set(v, 0)
> > > >
> > > > If CPU1 goes first, it will see 1, which is not 0 and thus add 1 to 1
> > > > and obtains 2. Then CPU2 goes and writes 0, so the exist clause sees
> > > > v==0 and doesn't observe 2.
> > > >
> > > > The other way around, CPU-2 goes first, writes a 0, then CPU-1 goes and
> > > > observes the 0, finds it matches 0 and doesn't add. Again, the exist
> > > > clause will find 0 doesn't match 2.
> > > >
> > > > This all goes unstuck if interleaved like:
> > > >
> > > >
> > > > CPU-1 CPU-2
> > > >
> > > > xor t0, t0
> > > > 1: ll t0, v
> > > > bez t0, 2f
> > > > sw t0, v
> > > > add t0, t1
> > > > sc t0, v
> > > > beqz t0, 1b
> > > >
> > > > (sorry if I got the MIPS asm wrong; it's not something I normally write)
> > > >
> > > > And the store-word from CPU-2 doesn't make the SC from CPU-1 fail.
> > > >
> > >
> > > loongson's llsc bug DOES NOT fail this litmus( we will not get V=2);
> > >
> > > only speculative memory access from CPU-1 can "blind" CPU-1(here blind means do ll/sc
> > > wrong), this speculative memory access can be observed corrently by CPU2. In this
> > > case, sw from CPU-2 can get I , which can be observed by CPU-1, and clear llbit,then
> > > failed sc.
> >
> > I'm not following, suppose CPU-1 happens as a speculation (imagine
> > whatever code is required to make that happen before). CPU-2 sw will
> > cause I on CPU-1's ll but, as in the previous email, CPU-1 will continue
> > as if it still has E and complete the SC.
> >
> > That is; I'm just not seeing why this case would be different from two
> > competing LL/SCs.
> >
>
> I get your point. I kept my eye on the sw from CPU-2, but forgot the speculative
> mem access from CPU-1.
>
> There is no difference bewteen this one and the former case.
>
> =========================================================================
> V = 1
>
> CPU-1 CPU-2
>
> xor t0, t0
> 1: ll t0, V
> beqz t0, 2f
>
> /* if speculative mem
> access kick cacheline of
> V out, it can blind CPU-1
> and make CPU-1 believe it
> still hold E on V, and can
> NOT see the sw from CPU-2
> actually invalid V, which
> should clear LLBit of CPU-1,
> but not */
> sw t0, V // just after sw, V = 0
> addiu t0, t0, 1
>
> sc t0, V
> /* oops, sc write t0(2)
> into V with LLBit */
>
> /* get V=2 */
> beqz t0, 1b
> nop
> 2:
> ================================================================================
>
> if speculative mem access *does not* kick out cache line of V, CPU-1 can see sw
> from CPU-2, and clear LLBit, which cause sc fail and retry, That's OK

OK; so do I understand it correctly that your CPU _can_ in fact fail
that test and result in 2? If so I think I'm (finally) understanding :-)

2019-05-14 16:00:14

by Peter Zijlstra

[permalink] [raw]
Subject: Re: Re: [RFC][PATCH 2/5] mips/atomic: Fix loongson_llsc_mb() wreckage


I think this thread got 'lost'

On Thu, Apr 25, 2019 at 11:12:58AM +0200, Peter Zijlstra wrote:
> On Thu, Apr 25, 2019 at 09:33:48AM +0200, Peter Zijlstra wrote:
> > > Let me explain the bug more specific:
> > >
> > > the bug ONLY matters in following situation:
> > >
> > > #. more than one cpu (assume cpu A and B) doing ll/sc on same shared
> > > var V
> > >
> > > #. speculative memory access from A cause A erroneously succeed sc
> > > operation, since the erroneously successful sc operation violate the
> > > coherence protocol. (here coherence protocol means the rules that CPU
> > > follow to implement ll/sc right)
> > >
> > > #. B succeed sc operation too, but this sc operation is right both
> > > logically and follow the coherence protocol, and makes A's sc wrong
> > > logically since only ONE sc operation can succeed.
>
> > > In one word, the bug only affect local cpu‘s ll/sc operation, and
> > > affect MP system.
>
> > > PS:
> > >
> > > If local_t is only ll/sc manipulated by current CPU, then no need fix it.
> >
> > It _should_ be CPU local, but this was not at all clear from reading the
> > original changelog nor the comment with loongson_llsc_mb().
>
> However, if it is a coherence issue, the thing is at the cacheline
> level, and there is nothing that says the line isn't shared, just the
> one variable isn't.
>
> Ideally there should be minimal false sharing, but....

So if two variables share a line, and one is local while the other is
shared atomic, can contention on the line, but not the variable, cause
issues for the local variable?

If not; why not? Because so far the issue is line granular due to the
coherence aspect.

2019-05-14 16:20:09

by Linus Torvalds

[permalink] [raw]
Subject: Re: Re: [RFC][PATCH 2/5] mips/atomic: Fix loongson_llsc_mb() wreckage

On Tue, May 14, 2019 at 8:58 AM Peter Zijlstra <[email protected]> wrote:
>
> So if two variables share a line, and one is local while the other is
> shared atomic, can contention on the line, but not the variable, cause
> issues for the local variable?
>
> If not; why not? Because so far the issue is line granular due to the
> coherence aspect.

If I understood the issue correctly, it's not that cache coherence
doesn't work, it's literally that the sc succeeds when it shouldn't.

In other words, it's not going to affect anything else, but it means
that "ll/sc" isn't actually truly atomic, because the cacheline could
have bounced around to another CPU in the meantime.

So we *think* we got an atomic update, but didn't, and the "ll/sc"
pair ends up incorrectly working as a regular "load -> store" pair,
because the "sc' incorrectly thought it still had exclusive access to
the line from the "ll".

The added memory barrier isn't because it's a memory barrier, it's
just keeping the subsequent speculative instructions from getting the
cacheline back and causing that "sc" confusion.

But note how from a cache coherency standpoint, it's not about the
cache coherency being wrong, it's literally just about the ll/sc not
giving the atomicity guarantees that the sequence is *supposed* to
give. So an "atomic_inc()" can basically (under just the wrong
circumstances) essentially turn into just a non-atomic "*p++".

NOTE! I have no actual inside knowledge of what is going on. The above
is purely my reading of this thread, and maybe I have mis-understood.

Linus

2019-05-14 16:58:59

by Peter Zijlstra

[permalink] [raw]
Subject: Re: Re: [RFC][PATCH 2/5] mips/atomic: Fix loongson_llsc_mb() wreckage

On Tue, May 14, 2019 at 09:10:34AM -0700, Linus Torvalds wrote:
> On Tue, May 14, 2019 at 8:58 AM Peter Zijlstra <[email protected]> wrote:
> >
> > So if two variables share a line, and one is local while the other is
> > shared atomic, can contention on the line, but not the variable, cause
> > issues for the local variable?
> >
> > If not; why not? Because so far the issue is line granular due to the
> > coherence aspect.
>
> If I understood the issue correctly, it's not that cache coherence
> doesn't work, it's literally that the sc succeeds when it shouldn't.
>
> In other words, it's not going to affect anything else, but it means
> that "ll/sc" isn't actually truly atomic, because the cacheline could
> have bounced around to another CPU in the meantime.
>
> So we *think* we got an atomic update, but didn't, and the "ll/sc"
> pair ends up incorrectly working as a regular "load -> store" pair,
> because the "sc' incorrectly thought it still had exclusive access to
> the line from the "ll".
>
> The added memory barrier isn't because it's a memory barrier, it's
> just keeping the subsequent speculative instructions from getting the
> cacheline back and causing that "sc" confusion.
>
> But note how from a cache coherency standpoint, it's not about the
> cache coherency being wrong, it's literally just about the ll/sc not
> giving the atomicity guarantees that the sequence is *supposed* to
> give. So an "atomic_inc()" can basically (under just the wrong
> circumstances) essentially turn into just a non-atomic "*p++".

Understood; the problem is that "*p++" is not good enough for local_t
either (on load-store architectures), since it needs to be "atomic" wrt
all other instructions on that CPU, most notably exceptions.

The issue has come up before in this thread; but I don't think it was
clearly answered before.

2019-05-14 17:09:44

by Linus Torvalds

[permalink] [raw]
Subject: Re: Re: [RFC][PATCH 2/5] mips/atomic: Fix loongson_llsc_mb() wreckage

On Tue, May 14, 2019 at 9:56 AM Peter Zijlstra <[email protected]> wrote:
>
> Understood; the problem is that "*p++" is not good enough for local_t
> either (on load-store architectures), since it needs to be "atomic" wrt
> all other instructions on that CPU, most notably exceptions.

Right. But I don't think that's the issue here, since if it was then
it would be a problem even on UP.

And while the CPU-local ones want atomicity, they *shouldn't* have the
issue of another CPU modifying them, so even if you were to lose
exclusive ownership of the cacheline (because some other CPU is
reading your per-cpu data for statistics of whatever), the final end
result should be fine.

End result: I suspect ll/sc still works for cpu-local stuff without
any extra loongson hacks.

But I agree that it would be good to verify.

Linus

2019-05-15 13:53:08

by Huang Pei

[permalink] [raw]
Subject: Re: Re: [RFC][PATCH 2/5] mips/atomic: Fix loongson_llsc_mb() wreckage


>On Tue, May 14, 2019 at 8:58 AM Peter Zijlstra <[email protected]> wrote:
>>
>> So if two variables share a line, and one is local while the other is
>> shared atomic, can contention on the line, but not the variable, cause
>> issues for the local variable?
>>
>> If not; why not? Because so far the issue is line granular due to the
>> coherence aspect.
>
>If I understood the issue correctly, it's not that cache coherence
>doesn't work, it's literally that the sc succeeds when it shouldn't.
>
>In other words, it's not going to affect anything else, but it means
>that "ll/sc" isn't actually truly atomic, because the cacheline could
>have bounced around to another CPU in the meantime.
>
>So we *think* we got an atomic update, but didn't, and the "ll/sc"
>pair ends up incorrectly working as a regular "load -> store" pair,
>because the "sc' incorrectly thought it still had exclusive access to
>the line from the "ll".
>
>The added memory barrier isn't because it's a memory barrier, it's
>just keeping the subsequent speculative instructions from getting the
>cacheline back and causing that "sc" confusion.
>
>But note how from a cache coherency standpoint, it's not about the
>cache coherency being wrong, it's literally just about the ll/sc not
>giving the atomicity guarantees that the sequence is *supposed* to
>give. So an "atomic_inc()" can basically (under just the wrong
>circumstances) essentially turn into just a non-atomic "*p++".
>
Agreed, that is exactly what I learned.

>NOTE! I have no actual inside knowledge of what is going on. The above
>is purely my reading of this thread, and maybe I have mis-understood.
>

you got it right.
>                  Linus