From: Alexander Sverdlin <[email protected]>
The switch to qspinlock brought a massive regression in spinlocks on
Octeon. Even after applying this series (and a patch in the
ARCH-independent code [1]) a tight contended (6 cores, 1 thread per core)
spinlock loop is still 50% slower than with the previous ticket-based
implementation.
This series implements some optimizations and has been tested on a 6-core
Octeon machine.
[1] Link: https://lkml.org/lkml/2021/1/27/1137
Alexander Sverdlin (6):
MIPS: Octeon: Implement __smp_store_release()
MIPS: Implement atomic_cmpxchg_relaxed()
MIPS: Octeon: qspinlock: Flush write buffer
MIPS: Octeon: qspinlock: Exclude mmiowb()
MIPS: Provide {atomic_}xchg_relaxed()
MIPS: cmpxchg: Use cmpxchg_local() for {cmp_}xchg_small()
arch/mips/include/asm/atomic.h | 5 +++++
arch/mips/include/asm/barrier.h | 9 +++++++++
arch/mips/include/asm/cmpxchg.h | 6 ++++++
arch/mips/include/asm/spinlock.h | 5 +++++
arch/mips/kernel/cmpxchg.c | 4 ++--
5 files changed, 27 insertions(+), 2 deletions(-)
--
2.10.2
From: Alexander Sverdlin <[email protected]>
Flushing the write buffer brings around 10% performance improvement on tight
uncontended spinlock loops on Octeon. Refer to commit 500c2e1fdbcc
("MIPS: Optimize spinlocks.").
Signed-off-by: Alexander Sverdlin <[email protected]>
---
arch/mips/include/asm/spinlock.h | 3 +++
1 file changed, 3 insertions(+)
diff --git a/arch/mips/include/asm/spinlock.h b/arch/mips/include/asm/spinlock.h
index 8a88eb2..0a707f3 100644
--- a/arch/mips/include/asm/spinlock.h
+++ b/arch/mips/include/asm/spinlock.h
@@ -24,6 +24,9 @@ static inline void queued_spin_unlock(struct qspinlock *lock)
/* This could be optimised with ARCH_HAS_MMIOWB */
mmiowb();
smp_store_release(&lock->locked, 0);
+#ifdef CONFIG_CPU_CAVIUM_OCTEON
+ nudge_writes();
+#endif
}
#include <asm/qspinlock.h>
--
2.10.2
From: Alexander Sverdlin <[email protected]>
This has the effect of removing one redundant SYNCW from
queued_spin_lock_slowpath() on Octeon.
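For context, the fully ordered xchg() in asm/cmpxchg.h wraps __xchg() in the
smp_mb__before_llsc()/smp_llsc_mb() pair (the former being a SYNCW on Octeon),
roughly as sketched below; the relaxed variant added here calls __xchg()
directly, so the generic qspinlock slowpath, which uses the relaxed variants
(e.g. in xchg_tail()), no longer picks up that barrier:

#define xchg(ptr, x)							\
({									\
	__typeof__(*(ptr)) __res;					\
									\
	smp_mb__before_llsc();		/* SYNCW on Octeon */		\
	__res = (__typeof__(*(ptr)))					\
		__xchg((ptr), (unsigned long)(x), sizeof(*(ptr)));	\
	smp_llsc_mb();							\
									\
	__res;								\
})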
Signed-off-by: Alexander Sverdlin <[email protected]>
---
arch/mips/include/asm/atomic.h | 2 ++
arch/mips/include/asm/cmpxchg.h | 4 ++++
2 files changed, 6 insertions(+)
diff --git a/arch/mips/include/asm/atomic.h b/arch/mips/include/asm/atomic.h
index a4e5116..3b0f54b 100644
--- a/arch/mips/include/asm/atomic.h
+++ b/arch/mips/include/asm/atomic.h
@@ -266,5 +266,7 @@ ATOMIC_SIP_OP(atomic64, s64, dsubu, lld, scd)
#define atomic_cmpxchg_relaxed(v, o, n) \
(cmpxchg_relaxed(&((v)->counter), (o), (n)))
+#define atomic_xchg_relaxed(v, new) \
+ (xchg_relaxed(&((v)->counter), (new)))
#endif /* _ASM_ATOMIC_H */
diff --git a/arch/mips/include/asm/cmpxchg.h b/arch/mips/include/asm/cmpxchg.h
index 620f01a..7830d81 100644
--- a/arch/mips/include/asm/cmpxchg.h
+++ b/arch/mips/include/asm/cmpxchg.h
@@ -110,6 +110,10 @@ unsigned long __xchg(volatile void *ptr, unsigned long x, int size)
__res; \
})
+#define xchg_relaxed(ptr, x) \
+ ((__typeof__(*(ptr))) \
+ __xchg((ptr), (unsigned long)(x), sizeof(*(ptr))))
+
#define __cmpxchg_asm(ld, st, m, old, new) \
({ \
__typeof(*(m)) __ret; \
--
2.10.2
From: Alexander Sverdlin <[email protected]>
On Octeon smp_mb() translates to SYNC while wmb+rmb translates to SYNCW
only. This brings around 10% performance improvement on tight uncontended
spinlock loops.
Refer to commit 500c2e1fdbcc ("MIPS: Optimize spinlocks.") and the link
below.
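For comparison, without an architecture override the generic fallback in
include/asm-generic/barrier.h orders the store with a full __smp_mb() (a SYNC
on Octeon), roughly:

#define __smp_store_release(p, v)					\
do {									\
	compiletime_assert_atomic_type(*p);				\
	__smp_mb();			/* SYNC on Octeon */		\
	WRITE_ONCE(*p, v);						\
} while (0)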
On 6-core Octeon machine:
sysbench --test=mutex --num-threads=64 --memory-scope=local run
w/o patch: 1.60s
with patch: 1.51s
Link: https://lore.kernel.org/lkml/[email protected]/
Signed-off-by: Alexander Sverdlin <[email protected]>
---
arch/mips/include/asm/barrier.h | 9 +++++++++
1 file changed, 9 insertions(+)
diff --git a/arch/mips/include/asm/barrier.h b/arch/mips/include/asm/barrier.h
index 49ff172..24c3f2c 100644
--- a/arch/mips/include/asm/barrier.h
+++ b/arch/mips/include/asm/barrier.h
@@ -113,6 +113,15 @@ static inline void wmb(void)
".set arch=octeon\n\t" \
"syncw\n\t" \
".set pop" : : : "memory")
+
+#define __smp_store_release(p, v) \
+do { \
+ compiletime_assert_atomic_type(*p); \
+ __smp_wmb(); \
+ __smp_rmb(); \
+ WRITE_ONCE(*p, v); \
+} while (0)
+
#else
#define smp_mb__before_llsc() smp_llsc_mb()
#define __smp_mb__before_llsc() smp_llsc_mb()
--
2.10.2
From: Alexander Sverdlin <[email protected]>
It makes no sense to emit the smp_mb__before_llsc()/smp_llsc_mb() barrier pair
again and again in the nested calls; leave only one pair in the outer function.
This removes one SYNCW from __xchg_small() and brings around 10% performance
improvement in a tight spinlock loop with 6 threads on a 6-core Octeon.
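For context, cmpxchg() is the fully ordered wrapper around cmpxchg_local()
(roughly as below), while cmpxchg_local() is the bare __cmpxchg() call without
the barriers. Since __xchg_small()/__cmpxchg_small() are themselves only reached
from the ordered xchg()/cmpxchg() wrappers, the barrier pair is already emitted
around the call and need not be repeated inside the loop:

#define cmpxchg(ptr, old, new)						\
({									\
	__typeof__(*(ptr)) __res;					\
									\
	smp_mb__before_llsc();		/* SYNCW on Octeon */		\
	__res = cmpxchg_local((ptr), (old), (new));			\
	smp_llsc_mb();							\
									\
	__res;								\
})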
Signed-off-by: Alexander Sverdlin <[email protected]>
---
arch/mips/kernel/cmpxchg.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/arch/mips/kernel/cmpxchg.c b/arch/mips/kernel/cmpxchg.c
index 89107de..122e85f 100644
--- a/arch/mips/kernel/cmpxchg.c
+++ b/arch/mips/kernel/cmpxchg.c
@@ -41,7 +41,7 @@ unsigned long __xchg_small(volatile void *ptr, unsigned long val, unsigned int s
do {
old32 = load32;
new32 = (load32 & ~mask) | (val << shift);
- load32 = cmpxchg(ptr32, old32, new32);
+ load32 = cmpxchg_local(ptr32, old32, new32);
} while (load32 != old32);
return (load32 & mask) >> shift;
@@ -97,7 +97,7 @@ unsigned long __cmpxchg_small(volatile void *ptr, unsigned long old,
*/
old32 = (load32 & ~mask) | (old << shift);
new32 = (load32 & ~mask) | (new << shift);
- load32 = cmpxchg(ptr32, old32, new32);
+ load32 = cmpxchg_local(ptr32, old32, new32);
if (load32 == old32)
return old;
}
--
2.10.2
From: Alexander Sverdlin <[email protected]>
On Octeon mmiowb() is SYNCW, which is already contained in
smp_store_release(). Removing the superfluous barrier brings around 10%
performance improvement on tight uncontended spinlock loops.
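With this patch and the earlier write-buffer-flush patch applied,
queued_spin_unlock() ends up as follows: on Octeon the unlock path is the SYNCW
contained in smp_store_release() plus the store, followed by the trailing SYNCW
of nudge_writes(), with no extra mmiowb():

static inline void queued_spin_unlock(struct qspinlock *lock)
{
#ifndef CONFIG_CPU_CAVIUM_OCTEON
	/* This could be optimised with ARCH_HAS_MMIOWB */
	mmiowb();
#endif
	smp_store_release(&lock->locked, 0);	/* SYNCW + store on Octeon */
#ifdef CONFIG_CPU_CAVIUM_OCTEON
	nudge_writes();				/* flush the write buffer */
#endif
}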
Signed-off-by: Alexander Sverdlin <[email protected]>
---
arch/mips/include/asm/spinlock.h | 2 ++
1 file changed, 2 insertions(+)
diff --git a/arch/mips/include/asm/spinlock.h b/arch/mips/include/asm/spinlock.h
index 0a707f3..fbe97b4 100644
--- a/arch/mips/include/asm/spinlock.h
+++ b/arch/mips/include/asm/spinlock.h
@@ -21,8 +21,10 @@
*/
static inline void queued_spin_unlock(struct qspinlock *lock)
{
+#ifndef CONFIG_CPU_CAVIUM_OCTEON
/* This could be optimised with ARCH_HAS_MMIOWB */
mmiowb();
+#endif
smp_store_release(&lock->locked, 0);
#ifdef CONFIG_CPU_CAVIUM_OCTEON
nudge_writes();
--
2.10.2