v3 is updated to use __pv_queued_spin_unlock, noticed by Waiman (thank you).
Thanks,
Nick
Nicholas Piggin (6):
powerpc/powernv: must include hvcall.h to get PAPR defines
powerpc/pseries: move some PAPR paravirt functions to their own file
powerpc: move spinlock implementation to simple_spinlock
powerpc/64s: implement queued spinlocks and rwlocks
powerpc/pseries: implement paravirt qspinlocks for SPLPAR
powerpc/qspinlock: optimised atomic_try_cmpxchg_lock that adds the
lock hint
arch/powerpc/Kconfig | 13 +
arch/powerpc/include/asm/Kbuild | 2 +
arch/powerpc/include/asm/atomic.h | 28 ++
arch/powerpc/include/asm/paravirt.h | 89 +++++
arch/powerpc/include/asm/qspinlock.h | 91 ++++++
arch/powerpc/include/asm/qspinlock_paravirt.h | 7 +
arch/powerpc/include/asm/simple_spinlock.h | 292 +++++++++++++++++
.../include/asm/simple_spinlock_types.h | 21 ++
arch/powerpc/include/asm/spinlock.h | 308 +-----------------
arch/powerpc/include/asm/spinlock_types.h | 17 +-
arch/powerpc/lib/Makefile | 3 +
arch/powerpc/lib/locks.c | 12 +-
arch/powerpc/platforms/powernv/pci-ioda-tce.c | 1 +
arch/powerpc/platforms/pseries/Kconfig | 5 +
arch/powerpc/platforms/pseries/setup.c | 6 +-
include/asm-generic/qspinlock.h | 4 +
16 files changed, 577 insertions(+), 322 deletions(-)
create mode 100644 arch/powerpc/include/asm/paravirt.h
create mode 100644 arch/powerpc/include/asm/qspinlock.h
create mode 100644 arch/powerpc/include/asm/qspinlock_paravirt.h
create mode 100644 arch/powerpc/include/asm/simple_spinlock.h
create mode 100644 arch/powerpc/include/asm/simple_spinlock_types.h
--
2.23.0
An include goes away in future patches which breaks compilation
without this.
Signed-off-by: Nicholas Piggin <[email protected]>
---
arch/powerpc/platforms/powernv/pci-ioda-tce.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/arch/powerpc/platforms/powernv/pci-ioda-tce.c b/arch/powerpc/platforms/powernv/pci-ioda-tce.c
index f923359d8afc..8eba6ece7808 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda-tce.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda-tce.c
@@ -15,6 +15,7 @@
#include <asm/iommu.h>
#include <asm/tce.h>
+#include <asm/hvcall.h> /* share error returns with PAPR */
#include "pci.h"
unsigned long pnv_ioda_parse_tce_sizes(struct pnv_phb *phb)
--
2.23.0
Signed-off-by: Nicholas Piggin <[email protected]>
---
arch/powerpc/include/asm/paravirt.h | 61 +++++++++++++++++++++++++++++
arch/powerpc/include/asm/spinlock.h | 24 +-----------
arch/powerpc/lib/locks.c | 12 +++---
3 files changed, 68 insertions(+), 29 deletions(-)
create mode 100644 arch/powerpc/include/asm/paravirt.h
diff --git a/arch/powerpc/include/asm/paravirt.h b/arch/powerpc/include/asm/paravirt.h
new file mode 100644
index 000000000000..7a8546660a63
--- /dev/null
+++ b/arch/powerpc/include/asm/paravirt.h
@@ -0,0 +1,61 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#ifndef __ASM_PARAVIRT_H
+#define __ASM_PARAVIRT_H
+#ifdef __KERNEL__
+
+#include <linux/jump_label.h>
+#include <asm/smp.h>
+#ifdef CONFIG_PPC64
+#include <asm/paca.h>
+#include <asm/hvcall.h>
+#endif
+
+#ifdef CONFIG_PPC_SPLPAR
+DECLARE_STATIC_KEY_FALSE(shared_processor);
+
+static inline bool is_shared_processor(void)
+{
+ return static_branch_unlikely(&shared_processor);
+}
+
+/* If bit 0 is set, the cpu has been preempted */
+static inline u32 yield_count_of(int cpu)
+{
+ __be32 yield_count = READ_ONCE(lppaca_of(cpu).yield_count);
+ return be32_to_cpu(yield_count);
+}
+
+static inline void yield_to_preempted(int cpu, u32 yield_count)
+{
+ plpar_hcall_norets(H_CONFER, get_hard_smp_processor_id(cpu), yield_count);
+}
+#else
+static inline bool is_shared_processor(void)
+{
+ return false;
+}
+
+static inline u32 yield_count_of(int cpu)
+{
+ return 0;
+}
+
+extern void ___bad_yield_to_preempted(void);
+static inline void yield_to_preempted(int cpu, u32 yield_count)
+{
+ ___bad_yield_to_preempted(); /* This would be a bug */
+}
+#endif
+
+#define vcpu_is_preempted vcpu_is_preempted
+static inline bool vcpu_is_preempted(int cpu)
+{
+ if (!is_shared_processor())
+ return false;
+ if (yield_count_of(cpu) & 1)
+ return true;
+ return false;
+}
+
+#endif /* __KERNEL__ */
+#endif /* __ASM_PARAVIRT_H */
diff --git a/arch/powerpc/include/asm/spinlock.h b/arch/powerpc/include/asm/spinlock.h
index 2d620896cdae..79be9bb10bbb 100644
--- a/arch/powerpc/include/asm/spinlock.h
+++ b/arch/powerpc/include/asm/spinlock.h
@@ -15,11 +15,10 @@
*
* (the type definitions are in asm/spinlock_types.h)
*/
-#include <linux/jump_label.h>
#include <linux/irqflags.h>
+#include <asm/paravirt.h>
#ifdef CONFIG_PPC64
#include <asm/paca.h>
-#include <asm/hvcall.h>
#endif
#include <asm/synch.h>
#include <asm/ppc-opcode.h>
@@ -35,18 +34,6 @@
#define LOCK_TOKEN 1
#endif
-#ifdef CONFIG_PPC_PSERIES
-DECLARE_STATIC_KEY_FALSE(shared_processor);
-
-#define vcpu_is_preempted vcpu_is_preempted
-static inline bool vcpu_is_preempted(int cpu)
-{
- if (!static_branch_unlikely(&shared_processor))
- return false;
- return !!(be32_to_cpu(lppaca_of(cpu).yield_count) & 1);
-}
-#endif
-
static __always_inline int arch_spin_value_unlocked(arch_spinlock_t lock)
{
return lock.slock == 0;
@@ -110,15 +97,6 @@ static inline void splpar_spin_yield(arch_spinlock_t *lock) {};
static inline void splpar_rw_yield(arch_rwlock_t *lock) {};
#endif
-static inline bool is_shared_processor(void)
-{
-#ifdef CONFIG_PPC_SPLPAR
- return static_branch_unlikely(&shared_processor);
-#else
- return false;
-#endif
-}
-
static inline void spin_yield(arch_spinlock_t *lock)
{
if (is_shared_processor())
diff --git a/arch/powerpc/lib/locks.c b/arch/powerpc/lib/locks.c
index 6440d5943c00..04165b7a163f 100644
--- a/arch/powerpc/lib/locks.c
+++ b/arch/powerpc/lib/locks.c
@@ -27,14 +27,14 @@ void splpar_spin_yield(arch_spinlock_t *lock)
return;
holder_cpu = lock_value & 0xffff;
BUG_ON(holder_cpu >= NR_CPUS);
- yield_count = be32_to_cpu(lppaca_of(holder_cpu).yield_count);
+
+ yield_count = yield_count_of(holder_cpu);
if ((yield_count & 1) == 0)
return; /* virtual cpu is currently running */
rmb();
if (lock->slock != lock_value)
return; /* something has changed */
- plpar_hcall_norets(H_CONFER,
- get_hard_smp_processor_id(holder_cpu), yield_count);
+ yield_to_preempted(holder_cpu, yield_count);
}
EXPORT_SYMBOL_GPL(splpar_spin_yield);
@@ -53,13 +53,13 @@ void splpar_rw_yield(arch_rwlock_t *rw)
return; /* no write lock at present */
holder_cpu = lock_value & 0xffff;
BUG_ON(holder_cpu >= NR_CPUS);
- yield_count = be32_to_cpu(lppaca_of(holder_cpu).yield_count);
+
+ yield_count = yield_count_of(holder_cpu);
if ((yield_count & 1) == 0)
return; /* virtual cpu is currently running */
rmb();
if (rw->lock != lock_value)
return; /* something has changed */
- plpar_hcall_norets(H_CONFER,
- get_hard_smp_processor_id(holder_cpu), yield_count);
+ yield_to_preempted(holder_cpu, yield_count);
}
#endif
--
2.23.0
To prepare for queued spinlocks. This is a simple rename except to update
preprocessor guard name and a file reference.
Signed-off-by: Nicholas Piggin <[email protected]>
---
arch/powerpc/include/asm/simple_spinlock.h | 292 ++++++++++++++++++
.../include/asm/simple_spinlock_types.h | 21 ++
arch/powerpc/include/asm/spinlock.h | 285 +----------------
arch/powerpc/include/asm/spinlock_types.h | 12 +-
4 files changed, 315 insertions(+), 295 deletions(-)
create mode 100644 arch/powerpc/include/asm/simple_spinlock.h
create mode 100644 arch/powerpc/include/asm/simple_spinlock_types.h
diff --git a/arch/powerpc/include/asm/simple_spinlock.h b/arch/powerpc/include/asm/simple_spinlock.h
new file mode 100644
index 000000000000..e048c041c4a9
--- /dev/null
+++ b/arch/powerpc/include/asm/simple_spinlock.h
@@ -0,0 +1,292 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#ifndef __ASM_SIMPLE_SPINLOCK_H
+#define __ASM_SIMPLE_SPINLOCK_H
+#ifdef __KERNEL__
+
+/*
+ * Simple spin lock operations.
+ *
+ * Copyright (C) 2001-2004 Paul Mackerras <[email protected]>, IBM
+ * Copyright (C) 2001 Anton Blanchard <[email protected]>, IBM
+ * Copyright (C) 2002 Dave Engebretsen <[email protected]>, IBM
+ * Rework to support virtual processors
+ *
+ * Type of int is used as a full 64b word is not necessary.
+ *
+ * (the type definitions are in asm/simple_spinlock_types.h)
+ */
+#include <linux/irqflags.h>
+#include <asm/paravirt.h>
+#ifdef CONFIG_PPC64
+#include <asm/paca.h>
+#endif
+#include <asm/synch.h>
+#include <asm/ppc-opcode.h>
+
+#ifdef CONFIG_PPC64
+/* use 0x800000yy when locked, where yy == CPU number */
+#ifdef __BIG_ENDIAN__
+#define LOCK_TOKEN (*(u32 *)(&get_paca()->lock_token))
+#else
+#define LOCK_TOKEN (*(u32 *)(&get_paca()->paca_index))
+#endif
+#else
+#define LOCK_TOKEN 1
+#endif
+
+static __always_inline int arch_spin_value_unlocked(arch_spinlock_t lock)
+{
+ return lock.slock == 0;
+}
+
+static inline int arch_spin_is_locked(arch_spinlock_t *lock)
+{
+ smp_mb();
+ return !arch_spin_value_unlocked(*lock);
+}
+
+/*
+ * This returns the old value in the lock, so we succeeded
+ * in getting the lock if the return value is 0.
+ */
+static inline unsigned long __arch_spin_trylock(arch_spinlock_t *lock)
+{
+ unsigned long tmp, token;
+
+ token = LOCK_TOKEN;
+ __asm__ __volatile__(
+"1: " PPC_LWARX(%0,0,%2,1) "\n\
+ cmpwi 0,%0,0\n\
+ bne- 2f\n\
+ stwcx. %1,0,%2\n\
+ bne- 1b\n"
+ PPC_ACQUIRE_BARRIER
+"2:"
+ : "=&r" (tmp)
+ : "r" (token), "r" (&lock->slock)
+ : "cr0", "memory");
+
+ return tmp;
+}
+
+static inline int arch_spin_trylock(arch_spinlock_t *lock)
+{
+ return __arch_spin_trylock(lock) == 0;
+}
+
+/*
+ * On a system with shared processors (that is, where a physical
+ * processor is multiplexed between several virtual processors),
+ * there is no point spinning on a lock if the holder of the lock
+ * isn't currently scheduled on a physical processor. Instead
+ * we detect this situation and ask the hypervisor to give the
+ * rest of our timeslice to the lock holder.
+ *
+ * So that we can tell which virtual processor is holding a lock,
+ * we put 0x80000000 | smp_processor_id() in the lock when it is
+ * held. Conveniently, we have a word in the paca that holds this
+ * value.
+ */
+
+#if defined(CONFIG_PPC_SPLPAR)
+/* We only yield to the hypervisor if we are in shared processor mode */
+void splpar_spin_yield(arch_spinlock_t *lock);
+void splpar_rw_yield(arch_rwlock_t *lock);
+#else /* SPLPAR */
+static inline void splpar_spin_yield(arch_spinlock_t *lock) {};
+static inline void splpar_rw_yield(arch_rwlock_t *lock) {};
+#endif
+
+static inline void spin_yield(arch_spinlock_t *lock)
+{
+ if (is_shared_processor())
+ splpar_spin_yield(lock);
+ else
+ barrier();
+}
+
+static inline void rw_yield(arch_rwlock_t *lock)
+{
+ if (is_shared_processor())
+ splpar_rw_yield(lock);
+ else
+ barrier();
+}
+
+static inline void arch_spin_lock(arch_spinlock_t *lock)
+{
+ while (1) {
+ if (likely(__arch_spin_trylock(lock) == 0))
+ break;
+ do {
+ HMT_low();
+ if (is_shared_processor())
+ splpar_spin_yield(lock);
+ } while (unlikely(lock->slock != 0));
+ HMT_medium();
+ }
+}
+
+static inline
+void arch_spin_lock_flags(arch_spinlock_t *lock, unsigned long flags)
+{
+ unsigned long flags_dis;
+
+ while (1) {
+ if (likely(__arch_spin_trylock(lock) == 0))
+ break;
+ local_save_flags(flags_dis);
+ local_irq_restore(flags);
+ do {
+ HMT_low();
+ if (is_shared_processor())
+ splpar_spin_yield(lock);
+ } while (unlikely(lock->slock != 0));
+ HMT_medium();
+ local_irq_restore(flags_dis);
+ }
+}
+#define arch_spin_lock_flags arch_spin_lock_flags
+
+static inline void arch_spin_unlock(arch_spinlock_t *lock)
+{
+ __asm__ __volatile__("# arch_spin_unlock\n\t"
+ PPC_RELEASE_BARRIER: : :"memory");
+ lock->slock = 0;
+}
+
+/*
+ * Read-write spinlocks, allowing multiple readers
+ * but only one writer.
+ *
+ * NOTE! it is quite common to have readers in interrupts
+ * but no interrupt writers. For those circumstances we
+ * can "mix" irq-safe locks - any writer needs to get a
+ * irq-safe write-lock, but readers can get non-irqsafe
+ * read-locks.
+ */
+
+#ifdef CONFIG_PPC64
+#define __DO_SIGN_EXTEND "extsw %0,%0\n"
+#define WRLOCK_TOKEN LOCK_TOKEN /* it's negative */
+#else
+#define __DO_SIGN_EXTEND
+#define WRLOCK_TOKEN (-1)
+#endif
+
+/*
+ * This returns the old value in the lock + 1,
+ * so we got a read lock if the return value is > 0.
+ */
+static inline long __arch_read_trylock(arch_rwlock_t *rw)
+{
+ long tmp;
+
+ __asm__ __volatile__(
+"1: " PPC_LWARX(%0,0,%1,1) "\n"
+ __DO_SIGN_EXTEND
+" addic. %0,%0,1\n\
+ ble- 2f\n"
+" stwcx. %0,0,%1\n\
+ bne- 1b\n"
+ PPC_ACQUIRE_BARRIER
+"2:" : "=&r" (tmp)
+ : "r" (&rw->lock)
+ : "cr0", "xer", "memory");
+
+ return tmp;
+}
+
+/*
+ * This returns the old value in the lock,
+ * so we got the write lock if the return value is 0.
+ */
+static inline long __arch_write_trylock(arch_rwlock_t *rw)
+{
+ long tmp, token;
+
+ token = WRLOCK_TOKEN;
+ __asm__ __volatile__(
+"1: " PPC_LWARX(%0,0,%2,1) "\n\
+ cmpwi 0,%0,0\n\
+ bne- 2f\n"
+" stwcx. %1,0,%2\n\
+ bne- 1b\n"
+ PPC_ACQUIRE_BARRIER
+"2:" : "=&r" (tmp)
+ : "r" (token), "r" (&rw->lock)
+ : "cr0", "memory");
+
+ return tmp;
+}
+
+static inline void arch_read_lock(arch_rwlock_t *rw)
+{
+ while (1) {
+ if (likely(__arch_read_trylock(rw) > 0))
+ break;
+ do {
+ HMT_low();
+ if (is_shared_processor())
+ splpar_rw_yield(rw);
+ } while (unlikely(rw->lock < 0));
+ HMT_medium();
+ }
+}
+
+static inline void arch_write_lock(arch_rwlock_t *rw)
+{
+ while (1) {
+ if (likely(__arch_write_trylock(rw) == 0))
+ break;
+ do {
+ HMT_low();
+ if (is_shared_processor())
+ splpar_rw_yield(rw);
+ } while (unlikely(rw->lock != 0));
+ HMT_medium();
+ }
+}
+
+static inline int arch_read_trylock(arch_rwlock_t *rw)
+{
+ return __arch_read_trylock(rw) > 0;
+}
+
+static inline int arch_write_trylock(arch_rwlock_t *rw)
+{
+ return __arch_write_trylock(rw) == 0;
+}
+
+static inline void arch_read_unlock(arch_rwlock_t *rw)
+{
+ long tmp;
+
+ __asm__ __volatile__(
+ "# read_unlock\n\t"
+ PPC_RELEASE_BARRIER
+"1: lwarx %0,0,%1\n\
+ addic %0,%0,-1\n"
+" stwcx. %0,0,%1\n\
+ bne- 1b"
+ : "=&r"(tmp)
+ : "r"(&rw->lock)
+ : "cr0", "xer", "memory");
+}
+
+static inline void arch_write_unlock(arch_rwlock_t *rw)
+{
+ __asm__ __volatile__("# write_unlock\n\t"
+ PPC_RELEASE_BARRIER: : :"memory");
+ rw->lock = 0;
+}
+
+#define arch_spin_relax(lock) spin_yield(lock)
+#define arch_read_relax(lock) rw_yield(lock)
+#define arch_write_relax(lock) rw_yield(lock)
+
+/* See include/linux/spinlock.h */
+#define smp_mb__after_spinlock() smp_mb()
+
+#endif /* __KERNEL__ */
+#endif /* __ASM_SIMPLE_SPINLOCK_H */
diff --git a/arch/powerpc/include/asm/simple_spinlock_types.h b/arch/powerpc/include/asm/simple_spinlock_types.h
new file mode 100644
index 000000000000..7c2b48ce62dc
--- /dev/null
+++ b/arch/powerpc/include/asm/simple_spinlock_types.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_POWERPC_SIMPLE_SPINLOCK_TYPES_H
+#define _ASM_POWERPC_SIMPLE_SPINLOCK_TYPES_H
+
+#ifndef __LINUX_SPINLOCK_TYPES_H
+# error "please don't include this file directly"
+#endif
+
+typedef struct {
+ volatile unsigned int slock;
+} arch_spinlock_t;
+
+#define __ARCH_SPIN_LOCK_UNLOCKED { 0 }
+
+typedef struct {
+ volatile signed int lock;
+} arch_rwlock_t;
+
+#define __ARCH_RW_LOCK_UNLOCKED { 0 }
+
+#endif
diff --git a/arch/powerpc/include/asm/spinlock.h b/arch/powerpc/include/asm/spinlock.h
index 79be9bb10bbb..21357fe05fe0 100644
--- a/arch/powerpc/include/asm/spinlock.h
+++ b/arch/powerpc/include/asm/spinlock.h
@@ -3,290 +3,7 @@
#define __ASM_SPINLOCK_H
#ifdef __KERNEL__
-/*
- * Simple spin lock operations.
- *
- * Copyright (C) 2001-2004 Paul Mackerras <[email protected]>, IBM
- * Copyright (C) 2001 Anton Blanchard <[email protected]>, IBM
- * Copyright (C) 2002 Dave Engebretsen <[email protected]>, IBM
- * Rework to support virtual processors
- *
- * Type of int is used as a full 64b word is not necessary.
- *
- * (the type definitions are in asm/spinlock_types.h)
- */
-#include <linux/irqflags.h>
-#include <asm/paravirt.h>
-#ifdef CONFIG_PPC64
-#include <asm/paca.h>
-#endif
-#include <asm/synch.h>
-#include <asm/ppc-opcode.h>
-
-#ifdef CONFIG_PPC64
-/* use 0x800000yy when locked, where yy == CPU number */
-#ifdef __BIG_ENDIAN__
-#define LOCK_TOKEN (*(u32 *)(&get_paca()->lock_token))
-#else
-#define LOCK_TOKEN (*(u32 *)(&get_paca()->paca_index))
-#endif
-#else
-#define LOCK_TOKEN 1
-#endif
-
-static __always_inline int arch_spin_value_unlocked(arch_spinlock_t lock)
-{
- return lock.slock == 0;
-}
-
-static inline int arch_spin_is_locked(arch_spinlock_t *lock)
-{
- smp_mb();
- return !arch_spin_value_unlocked(*lock);
-}
-
-/*
- * This returns the old value in the lock, so we succeeded
- * in getting the lock if the return value is 0.
- */
-static inline unsigned long __arch_spin_trylock(arch_spinlock_t *lock)
-{
- unsigned long tmp, token;
-
- token = LOCK_TOKEN;
- __asm__ __volatile__(
-"1: " PPC_LWARX(%0,0,%2,1) "\n\
- cmpwi 0,%0,0\n\
- bne- 2f\n\
- stwcx. %1,0,%2\n\
- bne- 1b\n"
- PPC_ACQUIRE_BARRIER
-"2:"
- : "=&r" (tmp)
- : "r" (token), "r" (&lock->slock)
- : "cr0", "memory");
-
- return tmp;
-}
-
-static inline int arch_spin_trylock(arch_spinlock_t *lock)
-{
- return __arch_spin_trylock(lock) == 0;
-}
-
-/*
- * On a system with shared processors (that is, where a physical
- * processor is multiplexed between several virtual processors),
- * there is no point spinning on a lock if the holder of the lock
- * isn't currently scheduled on a physical processor. Instead
- * we detect this situation and ask the hypervisor to give the
- * rest of our timeslice to the lock holder.
- *
- * So that we can tell which virtual processor is holding a lock,
- * we put 0x80000000 | smp_processor_id() in the lock when it is
- * held. Conveniently, we have a word in the paca that holds this
- * value.
- */
-
-#if defined(CONFIG_PPC_SPLPAR)
-/* We only yield to the hypervisor if we are in shared processor mode */
-void splpar_spin_yield(arch_spinlock_t *lock);
-void splpar_rw_yield(arch_rwlock_t *lock);
-#else /* SPLPAR */
-static inline void splpar_spin_yield(arch_spinlock_t *lock) {};
-static inline void splpar_rw_yield(arch_rwlock_t *lock) {};
-#endif
-
-static inline void spin_yield(arch_spinlock_t *lock)
-{
- if (is_shared_processor())
- splpar_spin_yield(lock);
- else
- barrier();
-}
-
-static inline void rw_yield(arch_rwlock_t *lock)
-{
- if (is_shared_processor())
- splpar_rw_yield(lock);
- else
- barrier();
-}
-
-static inline void arch_spin_lock(arch_spinlock_t *lock)
-{
- while (1) {
- if (likely(__arch_spin_trylock(lock) == 0))
- break;
- do {
- HMT_low();
- if (is_shared_processor())
- splpar_spin_yield(lock);
- } while (unlikely(lock->slock != 0));
- HMT_medium();
- }
-}
-
-static inline
-void arch_spin_lock_flags(arch_spinlock_t *lock, unsigned long flags)
-{
- unsigned long flags_dis;
-
- while (1) {
- if (likely(__arch_spin_trylock(lock) == 0))
- break;
- local_save_flags(flags_dis);
- local_irq_restore(flags);
- do {
- HMT_low();
- if (is_shared_processor())
- splpar_spin_yield(lock);
- } while (unlikely(lock->slock != 0));
- HMT_medium();
- local_irq_restore(flags_dis);
- }
-}
-#define arch_spin_lock_flags arch_spin_lock_flags
-
-static inline void arch_spin_unlock(arch_spinlock_t *lock)
-{
- __asm__ __volatile__("# arch_spin_unlock\n\t"
- PPC_RELEASE_BARRIER: : :"memory");
- lock->slock = 0;
-}
-
-/*
- * Read-write spinlocks, allowing multiple readers
- * but only one writer.
- *
- * NOTE! it is quite common to have readers in interrupts
- * but no interrupt writers. For those circumstances we
- * can "mix" irq-safe locks - any writer needs to get a
- * irq-safe write-lock, but readers can get non-irqsafe
- * read-locks.
- */
-
-#ifdef CONFIG_PPC64
-#define __DO_SIGN_EXTEND "extsw %0,%0\n"
-#define WRLOCK_TOKEN LOCK_TOKEN /* it's negative */
-#else
-#define __DO_SIGN_EXTEND
-#define WRLOCK_TOKEN (-1)
-#endif
-
-/*
- * This returns the old value in the lock + 1,
- * so we got a read lock if the return value is > 0.
- */
-static inline long __arch_read_trylock(arch_rwlock_t *rw)
-{
- long tmp;
-
- __asm__ __volatile__(
-"1: " PPC_LWARX(%0,0,%1,1) "\n"
- __DO_SIGN_EXTEND
-" addic. %0,%0,1\n\
- ble- 2f\n"
-" stwcx. %0,0,%1\n\
- bne- 1b\n"
- PPC_ACQUIRE_BARRIER
-"2:" : "=&r" (tmp)
- : "r" (&rw->lock)
- : "cr0", "xer", "memory");
-
- return tmp;
-}
-
-/*
- * This returns the old value in the lock,
- * so we got the write lock if the return value is 0.
- */
-static inline long __arch_write_trylock(arch_rwlock_t *rw)
-{
- long tmp, token;
-
- token = WRLOCK_TOKEN;
- __asm__ __volatile__(
-"1: " PPC_LWARX(%0,0,%2,1) "\n\
- cmpwi 0,%0,0\n\
- bne- 2f\n"
-" stwcx. %1,0,%2\n\
- bne- 1b\n"
- PPC_ACQUIRE_BARRIER
-"2:" : "=&r" (tmp)
- : "r" (token), "r" (&rw->lock)
- : "cr0", "memory");
-
- return tmp;
-}
-
-static inline void arch_read_lock(arch_rwlock_t *rw)
-{
- while (1) {
- if (likely(__arch_read_trylock(rw) > 0))
- break;
- do {
- HMT_low();
- if (is_shared_processor())
- splpar_rw_yield(rw);
- } while (unlikely(rw->lock < 0));
- HMT_medium();
- }
-}
-
-static inline void arch_write_lock(arch_rwlock_t *rw)
-{
- while (1) {
- if (likely(__arch_write_trylock(rw) == 0))
- break;
- do {
- HMT_low();
- if (is_shared_processor())
- splpar_rw_yield(rw);
- } while (unlikely(rw->lock != 0));
- HMT_medium();
- }
-}
-
-static inline int arch_read_trylock(arch_rwlock_t *rw)
-{
- return __arch_read_trylock(rw) > 0;
-}
-
-static inline int arch_write_trylock(arch_rwlock_t *rw)
-{
- return __arch_write_trylock(rw) == 0;
-}
-
-static inline void arch_read_unlock(arch_rwlock_t *rw)
-{
- long tmp;
-
- __asm__ __volatile__(
- "# read_unlock\n\t"
- PPC_RELEASE_BARRIER
-"1: lwarx %0,0,%1\n\
- addic %0,%0,-1\n"
-" stwcx. %0,0,%1\n\
- bne- 1b"
- : "=&r"(tmp)
- : "r"(&rw->lock)
- : "cr0", "xer", "memory");
-}
-
-static inline void arch_write_unlock(arch_rwlock_t *rw)
-{
- __asm__ __volatile__("# write_unlock\n\t"
- PPC_RELEASE_BARRIER: : :"memory");
- rw->lock = 0;
-}
-
-#define arch_spin_relax(lock) spin_yield(lock)
-#define arch_read_relax(lock) rw_yield(lock)
-#define arch_write_relax(lock) rw_yield(lock)
-
-/* See include/linux/spinlock.h */
-#define smp_mb__after_spinlock() smp_mb()
+#include <asm/simple_spinlock.h>
#endif /* __KERNEL__ */
#endif /* __ASM_SPINLOCK_H */
diff --git a/arch/powerpc/include/asm/spinlock_types.h b/arch/powerpc/include/asm/spinlock_types.h
index 87adaf13b7e8..3906f52dae65 100644
--- a/arch/powerpc/include/asm/spinlock_types.h
+++ b/arch/powerpc/include/asm/spinlock_types.h
@@ -6,16 +6,6 @@
# error "please don't include this file directly"
#endif
-typedef struct {
- volatile unsigned int slock;
-} arch_spinlock_t;
-
-#define __ARCH_SPIN_LOCK_UNLOCKED { 0 }
-
-typedef struct {
- volatile signed int lock;
-} arch_rwlock_t;
-
-#define __ARCH_RW_LOCK_UNLOCKED { 0 }
+#include <asm/simple_spinlock_types.h>
#endif
--
2.23.0
These have shown significantly improved performance and fairness when
spinlock contention is moderate to high on very large systems.
[ Numbers hopefully forthcoming after more testing, but initial
results look good ]
Thanks to the fast path, single threaded performance is not noticably
hurt.
Signed-off-by: Nicholas Piggin <[email protected]>
---
arch/powerpc/Kconfig | 13 ++++++++++++
arch/powerpc/include/asm/Kbuild | 2 ++
arch/powerpc/include/asm/qspinlock.h | 25 +++++++++++++++++++++++
arch/powerpc/include/asm/spinlock.h | 5 +++++
arch/powerpc/include/asm/spinlock_types.h | 5 +++++
arch/powerpc/lib/Makefile | 3 +++
include/asm-generic/qspinlock.h | 2 ++
7 files changed, 55 insertions(+)
create mode 100644 arch/powerpc/include/asm/qspinlock.h
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 24ac85c868db..17663ea57697 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -146,6 +146,8 @@ config PPC
select ARCH_SUPPORTS_ATOMIC_RMW
select ARCH_USE_BUILTIN_BSWAP
select ARCH_USE_CMPXCHG_LOCKREF if PPC64
+ select ARCH_USE_QUEUED_RWLOCKS if PPC_QUEUED_SPINLOCKS
+ select ARCH_USE_QUEUED_SPINLOCKS if PPC_QUEUED_SPINLOCKS
select ARCH_WANT_IPC_PARSE_VERSION
select ARCH_WEAK_RELEASE_ACQUIRE
select BINFMT_ELF
@@ -492,6 +494,17 @@ config HOTPLUG_CPU
Say N if you are unsure.
+config PPC_QUEUED_SPINLOCKS
+ bool "Queued spinlocks"
+ depends on SMP
+ default "y" if PPC_BOOK3S_64
+ help
+ Say Y here to use to use queued spinlocks which are more complex
+ but give better salability and fairness on large SMP and NUMA
+ systems.
+
+ If unsure, say "Y" if you have lots of cores, otherwise "N".
+
config ARCH_CPU_PROBE_RELEASE
def_bool y
depends on HOTPLUG_CPU
diff --git a/arch/powerpc/include/asm/Kbuild b/arch/powerpc/include/asm/Kbuild
index dadbcf3a0b1e..1dd8b6adff5e 100644
--- a/arch/powerpc/include/asm/Kbuild
+++ b/arch/powerpc/include/asm/Kbuild
@@ -6,5 +6,7 @@ generated-y += syscall_table_spu.h
generic-y += export.h
generic-y += local64.h
generic-y += mcs_spinlock.h
+generic-y += qrwlock.h
+generic-y += qspinlock.h
generic-y += vtime.h
generic-y += early_ioremap.h
diff --git a/arch/powerpc/include/asm/qspinlock.h b/arch/powerpc/include/asm/qspinlock.h
new file mode 100644
index 000000000000..c49e33e24edd
--- /dev/null
+++ b/arch/powerpc/include/asm/qspinlock.h
@@ -0,0 +1,25 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_POWERPC_QSPINLOCK_H
+#define _ASM_POWERPC_QSPINLOCK_H
+
+#include <asm-generic/qspinlock_types.h>
+
+#define _Q_PENDING_LOOPS (1 << 9) /* not tuned */
+
+#define smp_mb__after_spinlock() smp_mb()
+
+static __always_inline int queued_spin_is_locked(struct qspinlock *lock)
+{
+ /*
+ * This barrier was added to simple spinlocks by commit 51d7d5205d338,
+ * but it should now be possible to remove it, asm arm64 has done with
+ * commit c6f5d02b6a0f.
+ */
+ smp_mb();
+ return atomic_read(&lock->val);
+}
+#define queued_spin_is_locked queued_spin_is_locked
+
+#include <asm-generic/qspinlock.h>
+
+#endif /* _ASM_POWERPC_QSPINLOCK_H */
diff --git a/arch/powerpc/include/asm/spinlock.h b/arch/powerpc/include/asm/spinlock.h
index 21357fe05fe0..434615f1d761 100644
--- a/arch/powerpc/include/asm/spinlock.h
+++ b/arch/powerpc/include/asm/spinlock.h
@@ -3,7 +3,12 @@
#define __ASM_SPINLOCK_H
#ifdef __KERNEL__
+#ifdef CONFIG_PPC_QUEUED_SPINLOCKS
+#include <asm/qspinlock.h>
+#include <asm/qrwlock.h>
+#else
#include <asm/simple_spinlock.h>
+#endif
#endif /* __KERNEL__ */
#endif /* __ASM_SPINLOCK_H */
diff --git a/arch/powerpc/include/asm/spinlock_types.h b/arch/powerpc/include/asm/spinlock_types.h
index 3906f52dae65..c5d742f18021 100644
--- a/arch/powerpc/include/asm/spinlock_types.h
+++ b/arch/powerpc/include/asm/spinlock_types.h
@@ -6,6 +6,11 @@
# error "please don't include this file directly"
#endif
+#ifdef CONFIG_PPC_QUEUED_SPINLOCKS
+#include <asm-generic/qspinlock_types.h>
+#include <asm-generic/qrwlock_types.h>
+#else
#include <asm/simple_spinlock_types.h>
+#endif
#endif
diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile
index 5e994cda8e40..d66a645503eb 100644
--- a/arch/powerpc/lib/Makefile
+++ b/arch/powerpc/lib/Makefile
@@ -41,7 +41,10 @@ obj-$(CONFIG_PPC_BOOK3S_64) += copyuser_power7.o copypage_power7.o \
obj64-y += copypage_64.o copyuser_64.o mem_64.o hweight_64.o \
memcpy_64.o memcpy_mcsafe_64.o
+ifndef CONFIG_PPC_QUEUED_SPINLOCKS
obj64-$(CONFIG_SMP) += locks.o
+endif
+
obj64-$(CONFIG_ALTIVEC) += vmx-helper.o
obj64-$(CONFIG_KPROBES_SANITY_TEST) += test_emulate_step.o \
test_emulate_step_exec_instr.o
diff --git a/include/asm-generic/qspinlock.h b/include/asm-generic/qspinlock.h
index fde943d180e0..fb0a814d4395 100644
--- a/include/asm-generic/qspinlock.h
+++ b/include/asm-generic/qspinlock.h
@@ -12,6 +12,7 @@
#include <asm-generic/qspinlock_types.h>
+#ifndef queued_spin_is_locked
/**
* queued_spin_is_locked - is the spinlock locked?
* @lock: Pointer to queued spinlock structure
@@ -25,6 +26,7 @@ static __always_inline int queued_spin_is_locked(struct qspinlock *lock)
*/
return atomic_read(&lock->val);
}
+#endif
/**
* queued_spin_value_unlocked - is the spinlock structure unlocked?
--
2.23.0
This brings the behaviour of the uncontended fast path back to
roughly equivalent to simple spinlocks -- a single atomic op with
lock hint.
Signed-off-by: Nicholas Piggin <[email protected]>
---
arch/powerpc/include/asm/atomic.h | 28 ++++++++++++++++++++++++++++
arch/powerpc/include/asm/qspinlock.h | 2 +-
2 files changed, 29 insertions(+), 1 deletion(-)
diff --git a/arch/powerpc/include/asm/atomic.h b/arch/powerpc/include/asm/atomic.h
index 498785ffc25f..f6a3d145ffb7 100644
--- a/arch/powerpc/include/asm/atomic.h
+++ b/arch/powerpc/include/asm/atomic.h
@@ -193,6 +193,34 @@ static __inline__ int atomic_dec_return_relaxed(atomic_t *v)
#define atomic_xchg(v, new) (xchg(&((v)->counter), new))
#define atomic_xchg_relaxed(v, new) xchg_relaxed(&((v)->counter), (new))
+/*
+ * Don't want to override the generic atomic_try_cmpxchg_acquire, because
+ * we add a lock hint to the lwarx, which may not be wanted for the
+ * _acquire case (and is not used by the other _acquire variants so it
+ * would be a surprise).
+ */
+static __always_inline bool
+atomic_try_cmpxchg_lock(atomic_t *v, int *old, int new)
+{
+ int r, o = *old;
+
+ __asm__ __volatile__ (
+"1:\t" PPC_LWARX(%0,0,%2,1) " # atomic_try_cmpxchg_acquire \n"
+" cmpw 0,%0,%3 \n"
+" bne- 2f \n"
+" stwcx. %4,0,%2 \n"
+" bne- 1b \n"
+"\t" PPC_ACQUIRE_BARRIER " \n"
+"2: \n"
+ : "=&r" (r), "+m" (v->counter)
+ : "r" (&v->counter), "r" (o), "r" (new)
+ : "cr0", "memory");
+
+ if (unlikely(r != o))
+ *old = r;
+ return likely(r == o);
+}
+
/**
* atomic_fetch_add_unless - add unless the number is a given value
* @v: pointer of type atomic_t
diff --git a/arch/powerpc/include/asm/qspinlock.h b/arch/powerpc/include/asm/qspinlock.h
index f5066f00a08c..b752d34517b3 100644
--- a/arch/powerpc/include/asm/qspinlock.h
+++ b/arch/powerpc/include/asm/qspinlock.h
@@ -37,7 +37,7 @@ static __always_inline void queued_spin_lock(struct qspinlock *lock)
{
u32 val = 0;
- if (likely(atomic_try_cmpxchg_acquire(&lock->val, &val, _Q_LOCKED_VAL)))
+ if (likely(atomic_try_cmpxchg_lock(&lock->val, &val, _Q_LOCKED_VAL)))
return;
queued_spin_lock_slowpath(lock, val);
--
2.23.0
Signed-off-by: Nicholas Piggin <[email protected]>
---
arch/powerpc/include/asm/paravirt.h | 28 ++++++++
arch/powerpc/include/asm/qspinlock.h | 66 +++++++++++++++++++
arch/powerpc/include/asm/qspinlock_paravirt.h | 7 ++
arch/powerpc/platforms/pseries/Kconfig | 5 ++
arch/powerpc/platforms/pseries/setup.c | 6 +-
include/asm-generic/qspinlock.h | 2 +
6 files changed, 113 insertions(+), 1 deletion(-)
create mode 100644 arch/powerpc/include/asm/qspinlock_paravirt.h
diff --git a/arch/powerpc/include/asm/paravirt.h b/arch/powerpc/include/asm/paravirt.h
index 7a8546660a63..f2d51f929cf5 100644
--- a/arch/powerpc/include/asm/paravirt.h
+++ b/arch/powerpc/include/asm/paravirt.h
@@ -29,6 +29,16 @@ static inline void yield_to_preempted(int cpu, u32 yield_count)
{
plpar_hcall_norets(H_CONFER, get_hard_smp_processor_id(cpu), yield_count);
}
+
+static inline void prod_cpu(int cpu)
+{
+ plpar_hcall_norets(H_PROD, get_hard_smp_processor_id(cpu));
+}
+
+static inline void yield_to_any(void)
+{
+ plpar_hcall_norets(H_CONFER, -1, 0);
+}
#else
static inline bool is_shared_processor(void)
{
@@ -45,6 +55,19 @@ static inline void yield_to_preempted(int cpu, u32 yield_count)
{
___bad_yield_to_preempted(); /* This would be a bug */
}
+
+extern void ___bad_yield_to_any(void);
+static inline void yield_to_any(void)
+{
+ ___bad_yield_to_any(); /* This would be a bug */
+}
+
+extern void ___bad_prod_cpu(void);
+static inline void prod_cpu(int cpu)
+{
+ ___bad_prod_cpu(); /* This would be a bug */
+}
+
#endif
#define vcpu_is_preempted vcpu_is_preempted
@@ -57,5 +80,10 @@ static inline bool vcpu_is_preempted(int cpu)
return false;
}
+static inline bool pv_is_native_spin_unlock(void)
+{
+ return !is_shared_processor();
+}
+
#endif /* __KERNEL__ */
#endif /* __ASM_PARAVIRT_H */
diff --git a/arch/powerpc/include/asm/qspinlock.h b/arch/powerpc/include/asm/qspinlock.h
index c49e33e24edd..f5066f00a08c 100644
--- a/arch/powerpc/include/asm/qspinlock.h
+++ b/arch/powerpc/include/asm/qspinlock.h
@@ -3,9 +3,47 @@
#define _ASM_POWERPC_QSPINLOCK_H
#include <asm-generic/qspinlock_types.h>
+#include <asm/paravirt.h>
#define _Q_PENDING_LOOPS (1 << 9) /* not tuned */
+#ifdef CONFIG_PARAVIRT_SPINLOCKS
+extern void native_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val);
+extern void __pv_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val);
+extern void __pv_queued_spin_unlock(struct qspinlock *lock);
+
+static __always_inline void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
+{
+ if (!is_shared_processor())
+ native_queued_spin_lock_slowpath(lock, val);
+ else
+ __pv_queued_spin_lock_slowpath(lock, val);
+}
+
+#define queued_spin_unlock queued_spin_unlock
+static inline void queued_spin_unlock(struct qspinlock *lock)
+{
+ if (!is_shared_processor())
+ smp_store_release(&lock->locked, 0);
+ else
+ __pv_queued_spin_unlock(lock);
+}
+
+#else
+extern void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val);
+#endif
+
+static __always_inline void queued_spin_lock(struct qspinlock *lock)
+{
+ u32 val = 0;
+
+ if (likely(atomic_try_cmpxchg_acquire(&lock->val, &val, _Q_LOCKED_VAL)))
+ return;
+
+ queued_spin_lock_slowpath(lock, val);
+}
+#define queued_spin_lock queued_spin_lock
+
#define smp_mb__after_spinlock() smp_mb()
static __always_inline int queued_spin_is_locked(struct qspinlock *lock)
@@ -20,6 +58,34 @@ static __always_inline int queued_spin_is_locked(struct qspinlock *lock)
}
#define queued_spin_is_locked queued_spin_is_locked
+#ifdef CONFIG_PARAVIRT_SPINLOCKS
+#define SPIN_THRESHOLD (1<<15) /* not tuned */
+
+static __always_inline void pv_wait(u8 *ptr, u8 val)
+{
+ if (*ptr != val)
+ return;
+ yield_to_any();
+ /*
+ * We could pass in a CPU here if waiting in the queue and yield to
+ * the previous CPU in the queue.
+ */
+}
+
+static __always_inline void pv_kick(int cpu)
+{
+ prod_cpu(cpu);
+}
+
+extern void __pv_init_lock_hash(void);
+
+static inline void pv_spinlocks_init(void)
+{
+ __pv_init_lock_hash();
+}
+
+#endif
+
#include <asm-generic/qspinlock.h>
#endif /* _ASM_POWERPC_QSPINLOCK_H */
diff --git a/arch/powerpc/include/asm/qspinlock_paravirt.h b/arch/powerpc/include/asm/qspinlock_paravirt.h
new file mode 100644
index 000000000000..750d1b5e0202
--- /dev/null
+++ b/arch/powerpc/include/asm/qspinlock_paravirt.h
@@ -0,0 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#ifndef __ASM_QSPINLOCK_PARAVIRT_H
+#define __ASM_QSPINLOCK_PARAVIRT_H
+
+EXPORT_SYMBOL(__pv_queued_spin_unlock);
+
+#endif /* __ASM_QSPINLOCK_PARAVIRT_H */
diff --git a/arch/powerpc/platforms/pseries/Kconfig b/arch/powerpc/platforms/pseries/Kconfig
index 24c18362e5ea..756e727b383f 100644
--- a/arch/powerpc/platforms/pseries/Kconfig
+++ b/arch/powerpc/platforms/pseries/Kconfig
@@ -25,9 +25,14 @@ config PPC_PSERIES
select SWIOTLB
default y
+config PARAVIRT_SPINLOCKS
+ bool
+ default n
+
config PPC_SPLPAR
depends on PPC_PSERIES
bool "Support for shared-processor logical partitions"
+ select PARAVIRT_SPINLOCKS if PPC_QUEUED_SPINLOCKS
help
Enabling this option will make the kernel run more efficiently
on logically-partitioned pSeries systems which use shared
diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c
index 2db8469e475f..747a203d9453 100644
--- a/arch/powerpc/platforms/pseries/setup.c
+++ b/arch/powerpc/platforms/pseries/setup.c
@@ -771,8 +771,12 @@ static void __init pSeries_setup_arch(void)
if (firmware_has_feature(FW_FEATURE_LPAR)) {
vpa_init(boot_cpuid);
- if (lppaca_shared_proc(get_lppaca()))
+ if (lppaca_shared_proc(get_lppaca())) {
static_branch_enable(&shared_processor);
+#ifdef CONFIG_PARAVIRT_SPINLOCKS
+ pv_spinlocks_init();
+#endif
+ }
ppc_md.power_save = pseries_lpar_idle;
ppc_md.enable_pmcs = pseries_lpar_enable_pmcs;
diff --git a/include/asm-generic/qspinlock.h b/include/asm-generic/qspinlock.h
index fb0a814d4395..38ca14e79a86 100644
--- a/include/asm-generic/qspinlock.h
+++ b/include/asm-generic/qspinlock.h
@@ -69,6 +69,7 @@ static __always_inline int queued_spin_trylock(struct qspinlock *lock)
extern void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val);
+#ifndef queued_spin_lock
/**
* queued_spin_lock - acquire a queued spinlock
* @lock: Pointer to queued spinlock structure
@@ -82,6 +83,7 @@ static __always_inline void queued_spin_lock(struct qspinlock *lock)
queued_spin_lock_slowpath(lock, val);
}
+#endif
#ifndef queued_spin_unlock
/**
--
2.23.0
On 7/6/20 12:35 AM, Nicholas Piggin wrote:
> v3 is updated to use __pv_queued_spin_unlock, noticed by Waiman (thank you).
>
> Thanks,
> Nick
>
> Nicholas Piggin (6):
> powerpc/powernv: must include hvcall.h to get PAPR defines
> powerpc/pseries: move some PAPR paravirt functions to their own file
> powerpc: move spinlock implementation to simple_spinlock
> powerpc/64s: implement queued spinlocks and rwlocks
> powerpc/pseries: implement paravirt qspinlocks for SPLPAR
> powerpc/qspinlock: optimised atomic_try_cmpxchg_lock that adds the
> lock hint
>
> arch/powerpc/Kconfig | 13 +
> arch/powerpc/include/asm/Kbuild | 2 +
> arch/powerpc/include/asm/atomic.h | 28 ++
> arch/powerpc/include/asm/paravirt.h | 89 +++++
> arch/powerpc/include/asm/qspinlock.h | 91 ++++++
> arch/powerpc/include/asm/qspinlock_paravirt.h | 7 +
> arch/powerpc/include/asm/simple_spinlock.h | 292 +++++++++++++++++
> .../include/asm/simple_spinlock_types.h | 21 ++
> arch/powerpc/include/asm/spinlock.h | 308 +-----------------
> arch/powerpc/include/asm/spinlock_types.h | 17 +-
> arch/powerpc/lib/Makefile | 3 +
> arch/powerpc/lib/locks.c | 12 +-
> arch/powerpc/platforms/powernv/pci-ioda-tce.c | 1 +
> arch/powerpc/platforms/pseries/Kconfig | 5 +
> arch/powerpc/platforms/pseries/setup.c | 6 +-
> include/asm-generic/qspinlock.h | 4 +
> 16 files changed, 577 insertions(+), 322 deletions(-)
> create mode 100644 arch/powerpc/include/asm/paravirt.h
> create mode 100644 arch/powerpc/include/asm/qspinlock.h
> create mode 100644 arch/powerpc/include/asm/qspinlock_paravirt.h
> create mode 100644 arch/powerpc/include/asm/simple_spinlock.h
> create mode 100644 arch/powerpc/include/asm/simple_spinlock_types.h
>
This patch looks OK to me.
I had run some microbenchmark on powerpc system with or w/o the patch.
On a 2-socket 160-thread SMT4 POWER9 system (not virtualized):
5.8.0-rc4
=========
Running locktest with spinlock [runtime = 10s, load = 1]
Threads = 160, Min/Mean/Max = 77,665/90,153/106,895
Threads = 160, Total Rate = 1,441,759 op/s; Percpu Rate = 9,011 op/s
Running locktest with rwlock [runtime = 10s, r% = 50%, load = 1]
Threads = 160, Min/Mean/Max = 47,879/53,807/63,689
Threads = 160, Total Rate = 860,192 op/s; Percpu Rate = 5,376 op/s
Running locktest with spinlock [runtime = 10s, load = 1]
Threads = 80, Min/Mean/Max = 242,907/319,514/463,161
Threads = 80, Total Rate = 2,555 kop/s; Percpu Rate = 32 kop/s
Running locktest with rwlock [runtime = 10s, r% = 50%, load = 1]
Threads = 80, Min/Mean/Max = 146,161/187,474/259,270
Threads = 80, Total Rate = 1,498 kop/s; Percpu Rate = 19 kop/s
Running locktest with spinlock [runtime = 10s, load = 1]
Threads = 40, Min/Mean/Max = 646,639/1,000,817/1,455,205
Threads = 40, Total Rate = 4,001 kop/s; Percpu Rate = 100 kop/s
Running locktest with rwlock [runtime = 10s, r% = 50%, load = 1]
Threads = 40, Min/Mean/Max = 402,165/597,132/814,555
Threads = 40, Total Rate = 2,388 kop/s; Percpu Rate = 60 kop/s
5.8.0-rc4-qlock+
================
Running locktest with spinlock [runtime = 10s, load = 1]
Threads = 160, Min/Mean/Max = 123,835/124,580/124,587
Threads = 160, Total Rate = 1,992 kop/s; Percpu Rate = 12 kop/s
Running locktest with rwlock [runtime = 10s, r% = 50%, load = 1]
Threads = 160, Min/Mean/Max = 254,210/264,714/276,784
Threads = 160, Total Rate = 4,231 kop/s; Percpu Rate = 26 kop/s
Running locktest with spinlock [runtime = 10s, load = 1]
Threads = 80, Min/Mean/Max = 599,715/603,397/603,450
Threads = 80, Total Rate = 4,825 kop/s; Percpu Rate = 60 kop/s
Running locktest with rwlock [runtime = 10s, r% = 50%, load = 1]
Threads = 80, Min/Mean/Max = 492,687/525,224/567,456
Threads = 80, Total Rate = 4,199 kop/s; Percpu Rate = 52 kop/s
Running locktest with spinlock [runtime = 10s, load = 1]
Threads = 40, Min/Mean/Max = 1,325,623/1,325,628/1,325,636
Threads = 40, Total Rate = 5,299 kop/s; Percpu Rate = 132 kop/s
Running locktest with rwlock [runtime = 10s, r% = 50%, load = 1]
Threads = 40, Min/Mean/Max = 1,249,731/1,292,977/1,342,815
Threads = 40, Total Rate = 5,168 kop/s; Percpu Rate = 129 kop/s
On systems on large number of cpus, qspinlock lock is faster and more fair.
With some tuning, we may be able to squeeze out more performance.
Cheers,
Longman
Excerpts from Waiman Long's message of July 7, 2020 4:39 am:
> On 7/6/20 12:35 AM, Nicholas Piggin wrote:
>> v3 is updated to use __pv_queued_spin_unlock, noticed by Waiman (thank you).
>>
>> Thanks,
>> Nick
>>
>> Nicholas Piggin (6):
>> powerpc/powernv: must include hvcall.h to get PAPR defines
>> powerpc/pseries: move some PAPR paravirt functions to their own file
>> powerpc: move spinlock implementation to simple_spinlock
>> powerpc/64s: implement queued spinlocks and rwlocks
>> powerpc/pseries: implement paravirt qspinlocks for SPLPAR
>> powerpc/qspinlock: optimised atomic_try_cmpxchg_lock that adds the
>> lock hint
>>
>> arch/powerpc/Kconfig | 13 +
>> arch/powerpc/include/asm/Kbuild | 2 +
>> arch/powerpc/include/asm/atomic.h | 28 ++
>> arch/powerpc/include/asm/paravirt.h | 89 +++++
>> arch/powerpc/include/asm/qspinlock.h | 91 ++++++
>> arch/powerpc/include/asm/qspinlock_paravirt.h | 7 +
>> arch/powerpc/include/asm/simple_spinlock.h | 292 +++++++++++++++++
>> .../include/asm/simple_spinlock_types.h | 21 ++
>> arch/powerpc/include/asm/spinlock.h | 308 +-----------------
>> arch/powerpc/include/asm/spinlock_types.h | 17 +-
>> arch/powerpc/lib/Makefile | 3 +
>> arch/powerpc/lib/locks.c | 12 +-
>> arch/powerpc/platforms/powernv/pci-ioda-tce.c | 1 +
>> arch/powerpc/platforms/pseries/Kconfig | 5 +
>> arch/powerpc/platforms/pseries/setup.c | 6 +-
>> include/asm-generic/qspinlock.h | 4 +
>> 16 files changed, 577 insertions(+), 322 deletions(-)
>> create mode 100644 arch/powerpc/include/asm/paravirt.h
>> create mode 100644 arch/powerpc/include/asm/qspinlock.h
>> create mode 100644 arch/powerpc/include/asm/qspinlock_paravirt.h
>> create mode 100644 arch/powerpc/include/asm/simple_spinlock.h
>> create mode 100644 arch/powerpc/include/asm/simple_spinlock_types.h
>>
> This patch looks OK to me.
Thanks for reviewing and testing.
> I had run some microbenchmark on powerpc system with or w/o the patch.
>
> On a 2-socket 160-thread SMT4 POWER9 system (not virtualized):
>
> 5.8.0-rc4
> =========
>
> Running locktest with spinlock [runtime = 10s, load = 1]
> Threads = 160, Min/Mean/Max = 77,665/90,153/106,895
> Threads = 160, Total Rate = 1,441,759 op/s; Percpu Rate = 9,011 op/s
>
> Running locktest with rwlock [runtime = 10s, r% = 50%, load = 1]
> Threads = 160, Min/Mean/Max = 47,879/53,807/63,689
> Threads = 160, Total Rate = 860,192 op/s; Percpu Rate = 5,376 op/s
>
> Running locktest with spinlock [runtime = 10s, load = 1]
> Threads = 80, Min/Mean/Max = 242,907/319,514/463,161
> Threads = 80, Total Rate = 2,555 kop/s; Percpu Rate = 32 kop/s
>
> Running locktest with rwlock [runtime = 10s, r% = 50%, load = 1]
> Threads = 80, Min/Mean/Max = 146,161/187,474/259,270
> Threads = 80, Total Rate = 1,498 kop/s; Percpu Rate = 19 kop/s
>
> Running locktest with spinlock [runtime = 10s, load = 1]
> Threads = 40, Min/Mean/Max = 646,639/1,000,817/1,455,205
> Threads = 40, Total Rate = 4,001 kop/s; Percpu Rate = 100 kop/s
>
> Running locktest with rwlock [runtime = 10s, r% = 50%, load = 1]
> Threads = 40, Min/Mean/Max = 402,165/597,132/814,555
> Threads = 40, Total Rate = 2,388 kop/s; Percpu Rate = 60 kop/s
>
> 5.8.0-rc4-qlock+
> ================
>
> Running locktest with spinlock [runtime = 10s, load = 1]
> Threads = 160, Min/Mean/Max = 123,835/124,580/124,587
> Threads = 160, Total Rate = 1,992 kop/s; Percpu Rate = 12 kop/s
>
> Running locktest with rwlock [runtime = 10s, r% = 50%, load = 1]
> Threads = 160, Min/Mean/Max = 254,210/264,714/276,784
> Threads = 160, Total Rate = 4,231 kop/s; Percpu Rate = 26 kop/s
>
> Running locktest with spinlock [runtime = 10s, load = 1]
> Threads = 80, Min/Mean/Max = 599,715/603,397/603,450
> Threads = 80, Total Rate = 4,825 kop/s; Percpu Rate = 60 kop/s
>
> Running locktest with rwlock [runtime = 10s, r% = 50%, load = 1]
> Threads = 80, Min/Mean/Max = 492,687/525,224/567,456
> Threads = 80, Total Rate = 4,199 kop/s; Percpu Rate = 52 kop/s
>
> Running locktest with spinlock [runtime = 10s, load = 1]
> Threads = 40, Min/Mean/Max = 1,325,623/1,325,628/1,325,636
> Threads = 40, Total Rate = 5,299 kop/s; Percpu Rate = 132 kop/s
>
> Running locktest with rwlock [runtime = 10s, r% = 50%, load = 1]
> Threads = 40, Min/Mean/Max = 1,249,731/1,292,977/1,342,815
> Threads = 40, Total Rate = 5,168 kop/s; Percpu Rate = 129 kop/s
>
> On systems on large number of cpus, qspinlock lock is faster and more fair.
>
> With some tuning, we may be able to squeeze out more performance.
Yes, powerpc could certainly get more performance out of the slow
paths, and then there are a few parameters to tune.
We don't have a good alternate patching for function calls yet, but
that would be something to do for native vs pv.
And then there seem to be one or two tunable parameters we could
experiment with.
The paravirt locks may need a bit more tuning. Some simple testing
under KVM shows we might be a bit slower in some cases. Whether this
is fairness or something else I'm not sure. The current simple pv
spinlock code can do a directed yield to the lock holder CPU, whereas
the pv qspl here just does a general yield. I think we might actually
be able to change that to also support directed yield. Though I'm
not sure if this is actually the cause of the slowdown yet.
Thanks,
Nick
On 7/7/20 1:57 AM, Nicholas Piggin wrote:
> Yes, powerpc could certainly get more performance out of the slow
> paths, and then there are a few parameters to tune.
>
> We don't have a good alternate patching for function calls yet, but
> that would be something to do for native vs pv.
>
> And then there seem to be one or two tunable parameters we could
> experiment with.
>
> The paravirt locks may need a bit more tuning. Some simple testing
> under KVM shows we might be a bit slower in some cases. Whether this
> is fairness or something else I'm not sure. The current simple pv
> spinlock code can do a directed yield to the lock holder CPU, whereas
> the pv qspl here just does a general yield. I think we might actually
> be able to change that to also support directed yield. Though I'm
> not sure if this is actually the cause of the slowdown yet.
Regarding the paravirt lock, I have taken a further look into the
current PPC spinlock code. There is an equivalent of pv_wait() but no
pv_kick(). Maybe PPC doesn't really need that. Attached are two
additional qspinlock patches that adds a CONFIG_PARAVIRT_QSPINLOCKS_LITE
option to not require pv_kick(). There is also a fixup patch to be
applied after your patchset.
I don't have access to a PPC LPAR with shared processor at the moment,
so I can't test the performance of the paravirt code. Would you mind
adding my patches and do some performance test on your end to see if it
gives better result?
Thanks,
Longman
Excerpts from Waiman Long's message of July 8, 2020 1:33 pm:
> On 7/7/20 1:57 AM, Nicholas Piggin wrote:
>> Yes, powerpc could certainly get more performance out of the slow
>> paths, and then there are a few parameters to tune.
>>
>> We don't have a good alternate patching for function calls yet, but
>> that would be something to do for native vs pv.
>>
>> And then there seem to be one or two tunable parameters we could
>> experiment with.
>>
>> The paravirt locks may need a bit more tuning. Some simple testing
>> under KVM shows we might be a bit slower in some cases. Whether this
>> is fairness or something else I'm not sure. The current simple pv
>> spinlock code can do a directed yield to the lock holder CPU, whereas
>> the pv qspl here just does a general yield. I think we might actually
>> be able to change that to also support directed yield. Though I'm
>> not sure if this is actually the cause of the slowdown yet.
>
> Regarding the paravirt lock, I have taken a further look into the
> current PPC spinlock code. There is an equivalent of pv_wait() but no
> pv_kick(). Maybe PPC doesn't really need that.
So powerpc has two types of wait, either undirected "all processors" or
directed to a specific processor which has been preempted by the
hypervisor.
The simple spinlock code does a directed wait, because it knows the CPU
which is holding the lock. In this case, there is a sequence that is
used to ensure we don't wait if the condition has become true, and the
target CPU does not need to kick the waiter it will happen automatically
(see splpar_spin_yield). This is preferable because we only wait as
needed and don't require the kick operation.
The pv spinlock code I did uses the undirected wait, because we don't
know the CPU number which we are waiting on. This is undesirable because
it's higher overhead and the wait is not so accurate.
I think perhaps we could change things so we wait on the correct CPU
when queued, which might be good enough (we could also put the lock
owner CPU in the spinlock word, if we add another format).
> Attached are two
> additional qspinlock patches that adds a CONFIG_PARAVIRT_QSPINLOCKS_LITE
> option to not require pv_kick(). There is also a fixup patch to be
> applied after your patchset.
>
> I don't have access to a PPC LPAR with shared processor at the moment,
> so I can't test the performance of the paravirt code. Would you mind
> adding my patches and do some performance test on your end to see if it
> gives better result?
Great, I'll do some tests. Any suggestions for what to try?
Thanks,
Nick
On Tue, Jul 07, 2020 at 03:57:06PM +1000, Nicholas Piggin wrote:
> Yes, powerpc could certainly get more performance out of the slow
> paths, and then there are a few parameters to tune.
Can you clarify? The slow path is already in use on ARM64 which is weak,
so I doubt there's superfluous serialization present. And Will spend a
fair amount of time on making that thing guarantee forward progressm, so
there just isn't too much room to play.
> We don't have a good alternate patching for function calls yet, but
> that would be something to do for native vs pv.
Going by your jump_label implementation, support for static_call should
be fairly straight forward too, no?
https://lkml.kernel.org/r/[email protected]
On Tue, Jul 07, 2020 at 11:33:45PM -0400, Waiman Long wrote:
> From 5d7941a498935fb225b2c7a3108cbf590114c3db Mon Sep 17 00:00:00 2001
> From: Waiman Long <[email protected]>
> Date: Tue, 7 Jul 2020 22:29:16 -0400
> Subject: [PATCH 2/9] locking/pvqspinlock: Introduce
> CONFIG_PARAVIRT_QSPINLOCKS_LITE
>
> Add a new PARAVIRT_QSPINLOCKS_LITE config option that allows
> architectures to use the PV qspinlock code without the need to use or
> implement a pv_kick() function, thus eliminating the atomic unlock
> overhead. The non-atomic queued_spin_unlock() can be used instead.
> The pv_wait() function will still be needed, but it can be a dummy
> function.
>
> With that option set, the hybrid PV queued/unfair locking code should
> still be able to make it performant enough in a paravirtualized
How is this supposed to work? If there is no kick, you have no control
over who wakes up and fairness goes out the window entirely.
You don't even begin to explain...
On 7/8/20 1:10 AM, Nicholas Piggin wrote:
> Excerpts from Waiman Long's message of July 8, 2020 1:33 pm:
>> On 7/7/20 1:57 AM, Nicholas Piggin wrote:
>>> Yes, powerpc could certainly get more performance out of the slow
>>> paths, and then there are a few parameters to tune.
>>>
>>> We don't have a good alternate patching for function calls yet, but
>>> that would be something to do for native vs pv.
>>>
>>> And then there seem to be one or two tunable parameters we could
>>> experiment with.
>>>
>>> The paravirt locks may need a bit more tuning. Some simple testing
>>> under KVM shows we might be a bit slower in some cases. Whether this
>>> is fairness or something else I'm not sure. The current simple pv
>>> spinlock code can do a directed yield to the lock holder CPU, whereas
>>> the pv qspl here just does a general yield. I think we might actually
>>> be able to change that to also support directed yield. Though I'm
>>> not sure if this is actually the cause of the slowdown yet.
>> Regarding the paravirt lock, I have taken a further look into the
>> current PPC spinlock code. There is an equivalent of pv_wait() but no
>> pv_kick(). Maybe PPC doesn't really need that.
> So powerpc has two types of wait, either undirected "all processors" or
> directed to a specific processor which has been preempted by the
> hypervisor.
>
> The simple spinlock code does a directed wait, because it knows the CPU
> which is holding the lock. In this case, there is a sequence that is
> used to ensure we don't wait if the condition has become true, and the
> target CPU does not need to kick the waiter it will happen automatically
> (see splpar_spin_yield). This is preferable because we only wait as
> needed and don't require the kick operation.
Thanks for the explanation.
>
> The pv spinlock code I did uses the undirected wait, because we don't
> know the CPU number which we are waiting on. This is undesirable because
> it's higher overhead and the wait is not so accurate.
>
> I think perhaps we could change things so we wait on the correct CPU
> when queued, which might be good enough (we could also put the lock
> owner CPU in the spinlock word, if we add another format).
The LS byte of the lock word is used to indicate locking status. If we
have less than 255 cpus, we can put the (cpu_nr + 1) into the lock byte.
The special 0xff value can be used to indicate a cpu number >= 255 for
indirect yield. The required change to the qspinlock code will be
minimal, I think.
>> Attached are two
>> additional qspinlock patches that adds a CONFIG_PARAVIRT_QSPINLOCKS_LITE
>> option to not require pv_kick(). There is also a fixup patch to be
>> applied after your patchset.
>>
>> I don't have access to a PPC LPAR with shared processor at the moment,
>> so I can't test the performance of the paravirt code. Would you mind
>> adding my patches and do some performance test on your end to see if it
>> gives better result?
> Great, I'll do some tests. Any suggestions for what to try?
I will just like to see if it will produce some better performance
result compared with your current version.
Cheers,
Longman
On 7/8/20 4:32 AM, Peter Zijlstra wrote:
> On Tue, Jul 07, 2020 at 11:33:45PM -0400, Waiman Long wrote:
>> From 5d7941a498935fb225b2c7a3108cbf590114c3db Mon Sep 17 00:00:00 2001
>> From: Waiman Long <[email protected]>
>> Date: Tue, 7 Jul 2020 22:29:16 -0400
>> Subject: [PATCH 2/9] locking/pvqspinlock: Introduce
>> CONFIG_PARAVIRT_QSPINLOCKS_LITE
>>
>> Add a new PARAVIRT_QSPINLOCKS_LITE config option that allows
>> architectures to use the PV qspinlock code without the need to use or
>> implement a pv_kick() function, thus eliminating the atomic unlock
>> overhead. The non-atomic queued_spin_unlock() can be used instead.
>> The pv_wait() function will still be needed, but it can be a dummy
>> function.
>>
>> With that option set, the hybrid PV queued/unfair locking code should
>> still be able to make it performant enough in a paravirtualized
> How is this supposed to work? If there is no kick, you have no control
> over who wakes up and fairness goes out the window entirely.
>
> You don't even begin to explain...
>
I don't have a full understanding of how the PPC hypervisor work myself.
Apparently, a cpu kick may not be needed.
This is just a test patch to see if it yields better result. It is
subjected to further modifcation.
Cheers,
Longman
On 7/8/20 4:41 AM, Peter Zijlstra wrote:
> On Tue, Jul 07, 2020 at 03:57:06PM +1000, Nicholas Piggin wrote:
>> Yes, powerpc could certainly get more performance out of the slow
>> paths, and then there are a few parameters to tune.
> Can you clarify? The slow path is already in use on ARM64 which is weak,
> so I doubt there's superfluous serialization present. And Will spend a
> fair amount of time on making that thing guarantee forward progressm, so
> there just isn't too much room to play.
>
>> We don't have a good alternate patching for function calls yet, but
>> that would be something to do for native vs pv.
> Going by your jump_label implementation, support for static_call should
> be fairly straight forward too, no?
>
> https://lkml.kernel.org/r/[email protected]
>
Speaking of static_call, I am also looking forward to it. Do you have an
idea when that will be merged?
Cheers,
Longman
On 7/8/20 7:50 PM, Waiman Long wrote:
> On 7/8/20 1:10 AM, Nicholas Piggin wrote:
>> Excerpts from Waiman Long's message of July 8, 2020 1:33 pm:
>>> On 7/7/20 1:57 AM, Nicholas Piggin wrote:
>>>> Yes, powerpc could certainly get more performance out of the slow
>>>> paths, and then there are a few parameters to tune.
>>>>
>>>> We don't have a good alternate patching for function calls yet, but
>>>> that would be something to do for native vs pv.
>>>>
>>>> And then there seem to be one or two tunable parameters we could
>>>> experiment with.
>>>>
>>>> The paravirt locks may need a bit more tuning. Some simple testing
>>>> under KVM shows we might be a bit slower in some cases. Whether this
>>>> is fairness or something else I'm not sure. The current simple pv
>>>> spinlock code can do a directed yield to the lock holder CPU, whereas
>>>> the pv qspl here just does a general yield. I think we might actually
>>>> be able to change that to also support directed yield. Though I'm
>>>> not sure if this is actually the cause of the slowdown yet.
>>> Regarding the paravirt lock, I have taken a further look into the
>>> current PPC spinlock code. There is an equivalent of pv_wait() but no
>>> pv_kick(). Maybe PPC doesn't really need that.
>> So powerpc has two types of wait, either undirected "all processors" or
>> directed to a specific processor which has been preempted by the
>> hypervisor.
>>
>> The simple spinlock code does a directed wait, because it knows the CPU
>> which is holding the lock. In this case, there is a sequence that is
>> used to ensure we don't wait if the condition has become true, and the
>> target CPU does not need to kick the waiter it will happen automatically
>> (see splpar_spin_yield). This is preferable because we only wait as
>> needed and don't require the kick operation.
> Thanks for the explanation.
>>
>> The pv spinlock code I did uses the undirected wait, because we don't
>> know the CPU number which we are waiting on. This is undesirable because
>> it's higher overhead and the wait is not so accurate.
>>
>> I think perhaps we could change things so we wait on the correct CPU
>> when queued, which might be good enough (we could also put the lock
>> owner CPU in the spinlock word, if we add another format).
>
> The LS byte of the lock word is used to indicate locking status. If we
> have less than 255 cpus, we can put the (cpu_nr + 1) into the lock
> byte. The special 0xff value can be used to indicate a cpu number >=
> 255 for indirect yield. The required change to the qspinlock code will
> be minimal, I think.
BTW, we can also keep track of the previous cpu in the waiting queue.
Due to lock stealing, that may not be the cpu that is holding the lock.
Maybe we can use this, if available, in case the cpu number is >= 255.
Regards,
Longman
On Wed, Jul 08, 2020 at 07:54:34PM -0400, Waiman Long wrote:
> On 7/8/20 4:41 AM, Peter Zijlstra wrote:
> > On Tue, Jul 07, 2020 at 03:57:06PM +1000, Nicholas Piggin wrote:
> > > Yes, powerpc could certainly get more performance out of the slow
> > > paths, and then there are a few parameters to tune.
> > Can you clarify? The slow path is already in use on ARM64 which is weak,
> > so I doubt there's superfluous serialization present. And Will spend a
> > fair amount of time on making that thing guarantee forward progressm, so
> > there just isn't too much room to play.
> >
> > > We don't have a good alternate patching for function calls yet, but
> > > that would be something to do for native vs pv.
> > Going by your jump_label implementation, support for static_call should
> > be fairly straight forward too, no?
> >
> > https://lkml.kernel.org/r/[email protected]
> >
> Speaking of static_call, I am also looking forward to it. Do you have an
> idea when that will be merged?
0day had one crash on the last round, I think Steve send a fix for that
last night and I'll go look at it.
That said, the last posting got 0 feedback, so either everybody is
really happy with it, or not interested. So let us know in the thread,
with some review feedback.
Once I get through enough of the inbox to actually find the fix and test
it, I'll also update the thread, and maybe threaten to merge it if
everybody stays silent :-)
Nicholas Piggin <[email protected]> writes:
> An include goes away in future patches which breaks compilation
> without this.
>
> Signed-off-by: Nicholas Piggin <[email protected]>
> ---
> arch/powerpc/platforms/powernv/pci-ioda-tce.c | 1 +
> 1 file changed, 1 insertion(+)
>
> diff --git a/arch/powerpc/platforms/powernv/pci-ioda-tce.c b/arch/powerpc/platforms/powernv/pci-ioda-tce.c
> index f923359d8afc..8eba6ece7808 100644
> --- a/arch/powerpc/platforms/powernv/pci-ioda-tce.c
> +++ b/arch/powerpc/platforms/powernv/pci-ioda-tce.c
> @@ -15,6 +15,7 @@
>
> #include <asm/iommu.h>
> #include <asm/tce.h>
> +#include <asm/hvcall.h> /* share error returns with PAPR */
> #include "pci.h"
>
> unsigned long pnv_ioda_parse_tce_sizes(struct pnv_phb *phb)
> --
> 2.23.0
This isn't needed anymore AFAICS, since:
5f202c1a1d42 ("powerpc/powernv/ioda: Return correct error if TCE level allocation failed")
cheers
Nicholas Piggin <[email protected]> writes:
>
Little bit of changelog would be nice :D
> Signed-off-by: Nicholas Piggin <[email protected]>
> ---
> arch/powerpc/include/asm/paravirt.h | 61 +++++++++++++++++++++++++++++
> arch/powerpc/include/asm/spinlock.h | 24 +-----------
> arch/powerpc/lib/locks.c | 12 +++---
> 3 files changed, 68 insertions(+), 29 deletions(-)
> create mode 100644 arch/powerpc/include/asm/paravirt.h
>
> diff --git a/arch/powerpc/include/asm/paravirt.h b/arch/powerpc/include/asm/paravirt.h
> new file mode 100644
> index 000000000000..7a8546660a63
> --- /dev/null
> +++ b/arch/powerpc/include/asm/paravirt.h
> @@ -0,0 +1,61 @@
> +/* SPDX-License-Identifier: GPL-2.0-or-later */
> +#ifndef __ASM_PARAVIRT_H
> +#define __ASM_PARAVIRT_H
Should be _ASM_POWERPC_PARAVIRT_H
> +#ifdef __KERNEL__
We shouldn't need __KERNEL__ in here, it's not a uapi header.
cheers
Nicholas Piggin <[email protected]> writes:
> To prepare for queued spinlocks. This is a simple rename except to update
> preprocessor guard name and a file reference.
>
> Signed-off-by: Nicholas Piggin <[email protected]>
> ---
> arch/powerpc/include/asm/simple_spinlock.h | 292 ++++++++++++++++++
> .../include/asm/simple_spinlock_types.h | 21 ++
> arch/powerpc/include/asm/spinlock.h | 285 +----------------
> arch/powerpc/include/asm/spinlock_types.h | 12 +-
> 4 files changed, 315 insertions(+), 295 deletions(-)
> create mode 100644 arch/powerpc/include/asm/simple_spinlock.h
> create mode 100644 arch/powerpc/include/asm/simple_spinlock_types.h
>
> diff --git a/arch/powerpc/include/asm/simple_spinlock.h b/arch/powerpc/include/asm/simple_spinlock.h
> new file mode 100644
> index 000000000000..e048c041c4a9
> --- /dev/null
> +++ b/arch/powerpc/include/asm/simple_spinlock.h
> @@ -0,0 +1,292 @@
> +/* SPDX-License-Identifier: GPL-2.0-or-later */
> +#ifndef __ASM_SIMPLE_SPINLOCK_H
> +#define __ASM_SIMPLE_SPINLOCK_H
_ASM_POWERPC_SIMPLE_SPINLOCK_H
> +#ifdef __KERNEL__
Shouldn't be necessary.
> +/*
> + * Simple spin lock operations.
> + *
> + * Copyright (C) 2001-2004 Paul Mackerras <[email protected]>, IBM
> + * Copyright (C) 2001 Anton Blanchard <[email protected]>, IBM
> + * Copyright (C) 2002 Dave Engebretsen <[email protected]>, IBM
> + * Rework to support virtual processors
> + *
> + * Type of int is used as a full 64b word is not necessary.
> + *
> + * (the type definitions are in asm/simple_spinlock_types.h)
> + */
> +#include <linux/irqflags.h>
> +#include <asm/paravirt.h>
> +#ifdef CONFIG_PPC64
> +#include <asm/paca.h>
> +#endif
I don't think paca.h needs a CONFIG_PPC64 guard, it contains one. I know
you're just moving the code, but still nice to cleanup slightly along
the way.
cheers
Nicholas Piggin <[email protected]> writes:
> These have shown significantly improved performance and fairness when
> spinlock contention is moderate to high on very large systems.
>
> [ Numbers hopefully forthcoming after more testing, but initial
> results look good ]
Would be good to have something here, even if it's preliminary.
> Thanks to the fast path, single threaded performance is not noticably
> hurt.
>
> Signed-off-by: Nicholas Piggin <[email protected]>
> ---
> arch/powerpc/Kconfig | 13 ++++++++++++
> arch/powerpc/include/asm/Kbuild | 2 ++
> arch/powerpc/include/asm/qspinlock.h | 25 +++++++++++++++++++++++
> arch/powerpc/include/asm/spinlock.h | 5 +++++
> arch/powerpc/include/asm/spinlock_types.h | 5 +++++
> arch/powerpc/lib/Makefile | 3 +++
> include/asm-generic/qspinlock.h | 2 ++
Who's ack do we need for that part?
> diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
> index 24ac85c868db..17663ea57697 100644
> --- a/arch/powerpc/Kconfig
> +++ b/arch/powerpc/Kconfig
> @@ -492,6 +494,17 @@ config HOTPLUG_CPU
>
> Say N if you are unsure.
>
> +config PPC_QUEUED_SPINLOCKS
> + bool "Queued spinlocks"
> + depends on SMP
> + default "y" if PPC_BOOK3S_64
Not sure about default y? At least until we've got a better idea of the
perf impact on a range of small/big new/old systems.
> + help
> + Say Y here to use to use queued spinlocks which are more complex
> + but give better salability and fairness on large SMP and NUMA
> + systems.
> +
> + If unsure, say "Y" if you have lots of cores, otherwise "N".
Would be nice if we could give a range for "lots".
> diff --git a/arch/powerpc/include/asm/Kbuild b/arch/powerpc/include/asm/Kbuild
> index dadbcf3a0b1e..1dd8b6adff5e 100644
> --- a/arch/powerpc/include/asm/Kbuild
> +++ b/arch/powerpc/include/asm/Kbuild
> @@ -6,5 +6,7 @@ generated-y += syscall_table_spu.h
> generic-y += export.h
> generic-y += local64.h
> generic-y += mcs_spinlock.h
> +generic-y += qrwlock.h
> +generic-y += qspinlock.h
The 2nd line spits a warning about a redundant entry. I think you want
to just drop it.
cheers
On Thu, Jul 09, 2020 at 08:20:25PM +1000, Michael Ellerman wrote:
> Nicholas Piggin <[email protected]> writes:
> > These have shown significantly improved performance and fairness when
> > spinlock contention is moderate to high on very large systems.
> >
> > [ Numbers hopefully forthcoming after more testing, but initial
> > results look good ]
>
> Would be good to have something here, even if it's preliminary.
>
> > Thanks to the fast path, single threaded performance is not noticably
> > hurt.
> >
> > Signed-off-by: Nicholas Piggin <[email protected]>
> > ---
> > arch/powerpc/Kconfig | 13 ++++++++++++
> > arch/powerpc/include/asm/Kbuild | 2 ++
> > arch/powerpc/include/asm/qspinlock.h | 25 +++++++++++++++++++++++
> > arch/powerpc/include/asm/spinlock.h | 5 +++++
> > arch/powerpc/include/asm/spinlock_types.h | 5 +++++
> > arch/powerpc/lib/Makefile | 3 +++
>
> > include/asm-generic/qspinlock.h | 2 ++
>
> Who's ack do we need for that part?
Mine I suppose would do, as discussed earlier, it probably isn't
required anymore, but I understand the paranoia of not wanting to change
too many things at once :-)
Acked-by: Peter Zijlstra (Intel) <[email protected]>
Nicholas Piggin <[email protected]> writes:
> Signed-off-by: Nicholas Piggin <[email protected]>
> ---
> arch/powerpc/include/asm/paravirt.h | 28 ++++++++
> arch/powerpc/include/asm/qspinlock.h | 66 +++++++++++++++++++
> arch/powerpc/include/asm/qspinlock_paravirt.h | 7 ++
> arch/powerpc/platforms/pseries/Kconfig | 5 ++
> arch/powerpc/platforms/pseries/setup.c | 6 +-
> include/asm-generic/qspinlock.h | 2 +
Another ack?
> diff --git a/arch/powerpc/include/asm/paravirt.h b/arch/powerpc/include/asm/paravirt.h
> index 7a8546660a63..f2d51f929cf5 100644
> --- a/arch/powerpc/include/asm/paravirt.h
> +++ b/arch/powerpc/include/asm/paravirt.h
> @@ -45,6 +55,19 @@ static inline void yield_to_preempted(int cpu, u32 yield_count)
> {
> ___bad_yield_to_preempted(); /* This would be a bug */
> }
> +
> +extern void ___bad_yield_to_any(void);
> +static inline void yield_to_any(void)
> +{
> + ___bad_yield_to_any(); /* This would be a bug */
> +}
Why do we do that rather than just not defining yield_to_any() at all
and letting the build fail on that?
There's a condition somewhere that we know will false at compile time
and drop the call before linking?
> diff --git a/arch/powerpc/include/asm/qspinlock_paravirt.h b/arch/powerpc/include/asm/qspinlock_paravirt.h
> new file mode 100644
> index 000000000000..750d1b5e0202
> --- /dev/null
> +++ b/arch/powerpc/include/asm/qspinlock_paravirt.h
> @@ -0,0 +1,7 @@
> +/* SPDX-License-Identifier: GPL-2.0-or-later */
> +#ifndef __ASM_QSPINLOCK_PARAVIRT_H
> +#define __ASM_QSPINLOCK_PARAVIRT_H
_ASM_POWERPC_QSPINLOCK_PARAVIRT_H please.
> +
> +EXPORT_SYMBOL(__pv_queued_spin_unlock);
Why's that in a header? Should that (eventually) go with the generic implementation?
> diff --git a/arch/powerpc/platforms/pseries/Kconfig b/arch/powerpc/platforms/pseries/Kconfig
> index 24c18362e5ea..756e727b383f 100644
> --- a/arch/powerpc/platforms/pseries/Kconfig
> +++ b/arch/powerpc/platforms/pseries/Kconfig
> @@ -25,9 +25,14 @@ config PPC_PSERIES
> select SWIOTLB
> default y
>
> +config PARAVIRT_SPINLOCKS
> + bool
> + default n
default n is the default.
> diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c
> index 2db8469e475f..747a203d9453 100644
> --- a/arch/powerpc/platforms/pseries/setup.c
> +++ b/arch/powerpc/platforms/pseries/setup.c
> @@ -771,8 +771,12 @@ static void __init pSeries_setup_arch(void)
> if (firmware_has_feature(FW_FEATURE_LPAR)) {
> vpa_init(boot_cpuid);
>
> - if (lppaca_shared_proc(get_lppaca()))
> + if (lppaca_shared_proc(get_lppaca())) {
> static_branch_enable(&shared_processor);
> +#ifdef CONFIG_PARAVIRT_SPINLOCKS
> + pv_spinlocks_init();
> +#endif
> + }
We could avoid the ifdef with this I think?
diff --git a/arch/powerpc/include/asm/spinlock.h b/arch/powerpc/include/asm/spinlock.h
index 434615f1d761..6ec72282888d 100644
--- a/arch/powerpc/include/asm/spinlock.h
+++ b/arch/powerpc/include/asm/spinlock.h
@@ -10,5 +10,9 @@
#include <asm/simple_spinlock.h>
#endif
+#ifndef CONFIG_PARAVIRT_SPINLOCKS
+static inline void pv_spinlocks_init(void) { }
+#endif
+
#endif /* __KERNEL__ */
#endif /* __ASM_SPINLOCK_H */
cheers
On Thu, Jul 09, 2020 at 08:53:16PM +1000, Michael Ellerman wrote:
> Nicholas Piggin <[email protected]> writes:
>
> > Signed-off-by: Nicholas Piggin <[email protected]>
> > ---
> > arch/powerpc/include/asm/paravirt.h | 28 ++++++++
> > arch/powerpc/include/asm/qspinlock.h | 66 +++++++++++++++++++
> > arch/powerpc/include/asm/qspinlock_paravirt.h | 7 ++
> > arch/powerpc/platforms/pseries/Kconfig | 5 ++
> > arch/powerpc/platforms/pseries/setup.c | 6 +-
> > include/asm-generic/qspinlock.h | 2 +
>
> Another ack?
Acked-by: Peter Zijlstra (Intel) <[email protected]>
On 7/9/20 6:53 AM, Michael Ellerman wrote:
> Nicholas Piggin <[email protected]> writes:
>
>> Signed-off-by: Nicholas Piggin <[email protected]>
>> ---
>> arch/powerpc/include/asm/paravirt.h | 28 ++++++++
>> arch/powerpc/include/asm/qspinlock.h | 66 +++++++++++++++++++
>> arch/powerpc/include/asm/qspinlock_paravirt.h | 7 ++
>> arch/powerpc/platforms/pseries/Kconfig | 5 ++
>> arch/powerpc/platforms/pseries/setup.c | 6 +-
>> include/asm-generic/qspinlock.h | 2 +
> Another ack?
>
I am OK with adding the #ifdef around queued_spin_lock().
Acked-by: Waiman Long <[email protected]>
>> diff --git a/arch/powerpc/include/asm/paravirt.h b/arch/powerpc/include/asm/paravirt.h
>> index 7a8546660a63..f2d51f929cf5 100644
>> --- a/arch/powerpc/include/asm/paravirt.h
>> +++ b/arch/powerpc/include/asm/paravirt.h
>> @@ -45,6 +55,19 @@ static inline void yield_to_preempted(int cpu, u32 yield_count)
>> {
>> ___bad_yield_to_preempted(); /* This would be a bug */
>> }
>> +
>> +extern void ___bad_yield_to_any(void);
>> +static inline void yield_to_any(void)
>> +{
>> + ___bad_yield_to_any(); /* This would be a bug */
>> +}
> Why do we do that rather than just not defining yield_to_any() at all
> and letting the build fail on that?
>
> There's a condition somewhere that we know will false at compile time
> and drop the call before linking?
>
>> diff --git a/arch/powerpc/include/asm/qspinlock_paravirt.h b/arch/powerpc/include/asm/qspinlock_paravirt.h
>> new file mode 100644
>> index 000000000000..750d1b5e0202
>> --- /dev/null
>> +++ b/arch/powerpc/include/asm/qspinlock_paravirt.h
>> @@ -0,0 +1,7 @@
>> +/* SPDX-License-Identifier: GPL-2.0-or-later */
>> +#ifndef __ASM_QSPINLOCK_PARAVIRT_H
>> +#define __ASM_QSPINLOCK_PARAVIRT_H
> _ASM_POWERPC_QSPINLOCK_PARAVIRT_H please.
>
>> +
>> +EXPORT_SYMBOL(__pv_queued_spin_unlock);
> Why's that in a header? Should that (eventually) go with the generic implementation?
The PV qspinlock implementation is not that generic at the moment. Even
though native qspinlock is used by a number of archs, PV qspinlock is
only currently used in x86. This is certainly an area that needs
improvement.
>> diff --git a/arch/powerpc/platforms/pseries/Kconfig b/arch/powerpc/platforms/pseries/Kconfig
>> index 24c18362e5ea..756e727b383f 100644
>> --- a/arch/powerpc/platforms/pseries/Kconfig
>> +++ b/arch/powerpc/platforms/pseries/Kconfig
>> @@ -25,9 +25,14 @@ config PPC_PSERIES
>> select SWIOTLB
>> default y
>>
>> +config PARAVIRT_SPINLOCKS
>> + bool
>> + default n
> default n is the default.
>
>> diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c
>> index 2db8469e475f..747a203d9453 100644
>> --- a/arch/powerpc/platforms/pseries/setup.c
>> +++ b/arch/powerpc/platforms/pseries/setup.c
>> @@ -771,8 +771,12 @@ static void __init pSeries_setup_arch(void)
>> if (firmware_has_feature(FW_FEATURE_LPAR)) {
>> vpa_init(boot_cpuid);
>>
>> - if (lppaca_shared_proc(get_lppaca()))
>> + if (lppaca_shared_proc(get_lppaca())) {
>> static_branch_enable(&shared_processor);
>> +#ifdef CONFIG_PARAVIRT_SPINLOCKS
>> + pv_spinlocks_init();
>> +#endif
>> + }
> We could avoid the ifdef with this I think?
>
> diff --git a/arch/powerpc/include/asm/spinlock.h b/arch/powerpc/include/asm/spinlock.h
> index 434615f1d761..6ec72282888d 100644
> --- a/arch/powerpc/include/asm/spinlock.h
> +++ b/arch/powerpc/include/asm/spinlock.h
> @@ -10,5 +10,9 @@
> #include <asm/simple_spinlock.h>
> #endif
>
> +#ifndef CONFIG_PARAVIRT_SPINLOCKS
> +static inline void pv_spinlocks_init(void) { }
> +#endif
> +
> #endif /* __KERNEL__ */
> #endif /* __ASM_SPINLOCK_H */
>
>
> cheers
>
We don't really need to do a pv_spinlocks_init() if pv_kick() isn't
supported.
Cheers,
Longman
Excerpts from Peter Zijlstra's message of July 8, 2020 6:41 pm:
> On Tue, Jul 07, 2020 at 03:57:06PM +1000, Nicholas Piggin wrote:
>> Yes, powerpc could certainly get more performance out of the slow
>> paths, and then there are a few parameters to tune.
>
Sorry for the delay, got bogged down and distracted by other things :(
> Can you clarify? The slow path is already in use on ARM64 which is weak,
> so I doubt there's superfluous serialization present. And Will spend a
> fair amount of time on making that thing guarantee forward progressm, so
> there just isn't too much room to play.
Sure, the way the pending not-queued slowpath (which I guess is the
medium-path) is implemented is just poorly structured for LL/SC. It
has one more atomic than necessary (queued_fetch_set_pending_acquire),
and a lot of branches in suboptimal order.
Attached patch (completely untested just compiled and looked at asm
so far) is a way we can fix this on powerpc I think. It's actually
very little generic code change which is good, duplicated medium-path
logic unfortunately but that's no worse than something like x86
really.
>> We don't have a good alternate patching for function calls yet, but
>> that would be something to do for native vs pv.
>
> Going by your jump_label implementation, support for static_call should
> be fairly straight forward too, no?
>
> https://lkml.kernel.org/r/[email protected]
Nice, yeah it should be. I've wanted this for ages!
powerpc is kind of annoying to implement that with limited call range,
Hmm, not sure if we'd need a new linker feature to support it. We'd
provide call site patch space for indirect branches for those out of
range of direct call, so that should work fine. The trick would be
patching in the TOC lookup for the function... should be doable somehow.
Thanks,
Nick
---
diff --git a/arch/powerpc/include/asm/qspinlock.h b/arch/powerpc/include/asm/qspinlock.h
index b752d34517b3..26d8766a1106 100644
--- a/arch/powerpc/include/asm/qspinlock.h
+++ b/arch/powerpc/include/asm/qspinlock.h
@@ -31,16 +31,57 @@ static inline void queued_spin_unlock(struct qspinlock *lock)
#else
extern void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val);
+extern void queued_spin_lock_slowpath_queue(struct qspinlock *lock);
#endif
static __always_inline void queued_spin_lock(struct qspinlock *lock)
{
- u32 val = 0;
-
- if (likely(atomic_try_cmpxchg_lock(&lock->val, &val, _Q_LOCKED_VAL)))
+ atomic_t *a = &lock->val;
+ u32 val;
+
+again:
+ asm volatile(
+"1:\t" PPC_LWARX(%0,0,%1,1) " # queued_spin_lock \n"
+ : "=&r" (val)
+ : "r" (&a->counter)
+ : "memory");
+
+ if (likely(val == 0)) {
+ asm_volatile_goto(
+ " stwcx. %0,0,%1 \n"
+ " bne- %l[again] \n"
+ "\t" PPC_ACQUIRE_BARRIER " \n"
+ :
+ : "r"(_Q_LOCKED_VAL), "r" (&a->counter)
+ : "cr0", "memory"
+ : again );
return;
-
- queued_spin_lock_slowpath(lock, val);
+ }
+
+ if (likely(val == _Q_LOCKED_VAL)) {
+ asm_volatile_goto(
+ " stwcx. %0,0,%1 \n"
+ " bne- %l[again] \n"
+ :
+ : "r"(_Q_LOCKED_VAL | _Q_PENDING_VAL), "r" (&a->counter)
+ : "cr0", "memory"
+ : again );
+
+ atomic_cond_read_acquire(a, !(VAL & _Q_LOCKED_MASK));
+// clear_pending_set_locked(lock);
+ WRITE_ONCE(lock->locked_pending, _Q_LOCKED_VAL);
+// lockevent_inc(lock_pending);
+ return;
+ }
+
+ if (val == _Q_PENDING_VAL) {
+ int cnt = _Q_PENDING_LOOPS;
+ val = atomic_cond_read_relaxed(a,
+ (VAL != _Q_PENDING_VAL) || !cnt--);
+ if (!(val & ~_Q_LOCKED_MASK))
+ goto again;
+ }
+ queued_spin_lock_slowpath_queue(lock);
}
#define queued_spin_lock queued_spin_lock
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index b9515fcc9b29..ebcc6f5d99d5 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -287,10 +287,14 @@ static __always_inline u32 __pv_wait_head_or_lock(struct qspinlock *lock,
#ifdef CONFIG_PARAVIRT_SPINLOCKS
#define queued_spin_lock_slowpath native_queued_spin_lock_slowpath
+#define queued_spin_lock_slowpath_queue native_queued_spin_lock_slowpath_queue
#endif
#endif /* _GEN_PV_LOCK_SLOWPATH */
+void queued_spin_lock_slowpath_queue(struct qspinlock *lock);
+static void __queued_spin_lock_slowpath_queue(struct qspinlock *lock);
+
/**
* queued_spin_lock_slowpath - acquire the queued spinlock
* @lock: Pointer to queued spinlock structure
@@ -314,12 +318,6 @@ static __always_inline u32 __pv_wait_head_or_lock(struct qspinlock *lock,
*/
void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
{
- struct mcs_spinlock *prev, *next, *node;
- u32 old, tail;
- int idx;
-
- BUILD_BUG_ON(CONFIG_NR_CPUS >= (1U << _Q_TAIL_CPU_BITS));
-
if (pv_enabled())
goto pv_queue;
@@ -397,6 +395,26 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
queue:
lockevent_inc(lock_slowpath);
pv_queue:
+ __queued_spin_lock_slowpath_queue(lock);
+}
+EXPORT_SYMBOL(queued_spin_lock_slowpath);
+
+void queued_spin_lock_slowpath_queue(struct qspinlock *lock)
+{
+ lockevent_inc(lock_slowpath);
+ __queued_spin_lock_slowpath_queue(lock);
+}
+EXPORT_SYMBOL(queued_spin_lock_slowpath_queue);
+
+static void __queued_spin_lock_slowpath_queue(struct qspinlock *lock)
+{
+ struct mcs_spinlock *prev, *next, *node;
+ u32 old, tail;
+ u32 val;
+ int idx;
+
+ BUILD_BUG_ON(CONFIG_NR_CPUS >= (1U << _Q_TAIL_CPU_BITS));
+
node = this_cpu_ptr(&qnodes[0].mcs);
idx = node->count++;
tail = encode_tail(smp_processor_id(), idx);
@@ -559,7 +577,6 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
*/
__this_cpu_dec(qnodes[0].mcs.count);
}
-EXPORT_SYMBOL(queued_spin_lock_slowpath);
/*
* Generate the paravirt code for queued_spin_unlock_slowpath().
Excerpts from Peter Zijlstra's message of July 9, 2020 6:31 pm:
> On Wed, Jul 08, 2020 at 07:54:34PM -0400, Waiman Long wrote:
>> On 7/8/20 4:41 AM, Peter Zijlstra wrote:
>> > On Tue, Jul 07, 2020 at 03:57:06PM +1000, Nicholas Piggin wrote:
>> > > Yes, powerpc could certainly get more performance out of the slow
>> > > paths, and then there are a few parameters to tune.
>> > Can you clarify? The slow path is already in use on ARM64 which is weak,
>> > so I doubt there's superfluous serialization present. And Will spend a
>> > fair amount of time on making that thing guarantee forward progressm, so
>> > there just isn't too much room to play.
>> >
>> > > We don't have a good alternate patching for function calls yet, but
>> > > that would be something to do for native vs pv.
>> > Going by your jump_label implementation, support for static_call should
>> > be fairly straight forward too, no?
>> >
>> > https://lkml.kernel.org/r/[email protected]
>> >
>> Speaking of static_call, I am also looking forward to it. Do you have an
>> idea when that will be merged?
>
> 0day had one crash on the last round, I think Steve send a fix for that
> last night and I'll go look at it.
>
> That said, the last posting got 0 feedback, so either everybody is
> really happy with it, or not interested. So let us know in the thread,
> with some review feedback.
>
> Once I get through enough of the inbox to actually find the fix and test
> it, I'll also update the thread, and maybe threaten to merge it if
> everybody stays silent :-)
I'd like to use it in powerpc. We have code now for example that patches
a branch immediately at the top of memcpy which branches to a different
version of the function. pv queued spinlock selection obviously, and
there's a bunch of platform ops struct things that get filled in at boot
time, etc.
So +1 here if you can get them through. I'm not 100% sure we can do
it with existing toolchain and no ugly hacks, but there's no way to
structure things that can get around that AFAIKS. We'd eventually
use it though, I'd say.
Thanks,
Nick
On 7/21/20 7:08 AM, Nicholas Piggin wrote:
> diff --git a/arch/powerpc/include/asm/qspinlock.h b/arch/powerpc/include/asm/qspinlock.h
> index b752d34517b3..26d8766a1106 100644
> --- a/arch/powerpc/include/asm/qspinlock.h
> +++ b/arch/powerpc/include/asm/qspinlock.h
> @@ -31,16 +31,57 @@ static inline void queued_spin_unlock(struct qspinlock *lock)
>
> #else
> extern void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val);
> +extern void queued_spin_lock_slowpath_queue(struct qspinlock *lock);
> #endif
>
> static __always_inline void queued_spin_lock(struct qspinlock *lock)
> {
> - u32 val = 0;
> -
> - if (likely(atomic_try_cmpxchg_lock(&lock->val, &val, _Q_LOCKED_VAL)))
> + atomic_t *a = &lock->val;
> + u32 val;
> +
> +again:
> + asm volatile(
> +"1:\t" PPC_LWARX(%0,0,%1,1) " # queued_spin_lock \n"
> + : "=&r" (val)
> + : "r" (&a->counter)
> + : "memory");
> +
> + if (likely(val == 0)) {
> + asm_volatile_goto(
> + " stwcx. %0,0,%1 \n"
> + " bne- %l[again] \n"
> + "\t" PPC_ACQUIRE_BARRIER " \n"
> + :
> + : "r"(_Q_LOCKED_VAL), "r" (&a->counter)
> + : "cr0", "memory"
> + : again );
> return;
> -
> - queued_spin_lock_slowpath(lock, val);
> + }
> +
> + if (likely(val == _Q_LOCKED_VAL)) {
> + asm_volatile_goto(
> + " stwcx. %0,0,%1 \n"
> + " bne- %l[again] \n"
> + :
> + : "r"(_Q_LOCKED_VAL | _Q_PENDING_VAL), "r" (&a->counter)
> + : "cr0", "memory"
> + : again );
> +
> + atomic_cond_read_acquire(a, !(VAL & _Q_LOCKED_MASK));
> +// clear_pending_set_locked(lock);
> + WRITE_ONCE(lock->locked_pending, _Q_LOCKED_VAL);
> +// lockevent_inc(lock_pending);
> + return;
> + }
> +
> + if (val == _Q_PENDING_VAL) {
> + int cnt = _Q_PENDING_LOOPS;
> + val = atomic_cond_read_relaxed(a,
> + (VAL != _Q_PENDING_VAL) || !cnt--);
> + if (!(val & ~_Q_LOCKED_MASK))
> + goto again;
> + }
> + queued_spin_lock_slowpath_queue(lock);
> }
> #define queued_spin_lock queued_spin_lock
>
I am fine with the arch code override some part of the generic code.
> diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
> index b9515fcc9b29..ebcc6f5d99d5 100644
> --- a/kernel/locking/qspinlock.c
> +++ b/kernel/locking/qspinlock.c
> @@ -287,10 +287,14 @@ static __always_inline u32 __pv_wait_head_or_lock(struct qspinlock *lock,
>
> #ifdef CONFIG_PARAVIRT_SPINLOCKS
> #define queued_spin_lock_slowpath native_queued_spin_lock_slowpath
> +#define queued_spin_lock_slowpath_queue native_queued_spin_lock_slowpath_queue
> #endif
>
> #endif /* _GEN_PV_LOCK_SLOWPATH */
>
> +void queued_spin_lock_slowpath_queue(struct qspinlock *lock);
> +static void __queued_spin_lock_slowpath_queue(struct qspinlock *lock);
> +
> /**
> * queued_spin_lock_slowpath - acquire the queued spinlock
> * @lock: Pointer to queued spinlock structure
> @@ -314,12 +318,6 @@ static __always_inline u32 __pv_wait_head_or_lock(struct qspinlock *lock,
> */
> void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
> {
> - struct mcs_spinlock *prev, *next, *node;
> - u32 old, tail;
> - int idx;
> -
> - BUILD_BUG_ON(CONFIG_NR_CPUS >= (1U << _Q_TAIL_CPU_BITS));
> -
> if (pv_enabled())
> goto pv_queue;
>
> @@ -397,6 +395,26 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
> queue:
> lockevent_inc(lock_slowpath);
> pv_queue:
> + __queued_spin_lock_slowpath_queue(lock);
> +}
> +EXPORT_SYMBOL(queued_spin_lock_slowpath);
> +
> +void queued_spin_lock_slowpath_queue(struct qspinlock *lock)
> +{
> + lockevent_inc(lock_slowpath);
> + __queued_spin_lock_slowpath_queue(lock);
> +}
> +EXPORT_SYMBOL(queued_spin_lock_slowpath_queue);
> +
> +static void __queued_spin_lock_slowpath_queue(struct qspinlock *lock)
> +{
> + struct mcs_spinlock *prev, *next, *node;
> + u32 old, tail;
> + u32 val;
> + int idx;
> +
> + BUILD_BUG_ON(CONFIG_NR_CPUS >= (1U << _Q_TAIL_CPU_BITS));
> +
> node = this_cpu_ptr(&qnodes[0].mcs);
> idx = node->count++;
> tail = encode_tail(smp_processor_id(), idx);
> @@ -559,7 +577,6 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
> */
> __this_cpu_dec(qnodes[0].mcs.count);
> }
> -EXPORT_SYMBOL(queued_spin_lock_slowpath);
>
> /*
> * Generate the paravirt code for queued_spin_unlock_slowpath().
>
I would prefer to extract out the pending bit handling code out into a
separate helper function which can be overridden by the arch code
instead of breaking the slowpath into 2 pieces.
Cheers,
Longman
Excerpts from Waiman Long's message of July 22, 2020 12:36 am:
> On 7/21/20 7:08 AM, Nicholas Piggin wrote:
>> diff --git a/arch/powerpc/include/asm/qspinlock.h b/arch/powerpc/include/asm/qspinlock.h
>> index b752d34517b3..26d8766a1106 100644
>> --- a/arch/powerpc/include/asm/qspinlock.h
>> +++ b/arch/powerpc/include/asm/qspinlock.h
>> @@ -31,16 +31,57 @@ static inline void queued_spin_unlock(struct qspinlock *lock)
>>
>> #else
>> extern void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val);
>> +extern void queued_spin_lock_slowpath_queue(struct qspinlock *lock);
>> #endif
>>
>> static __always_inline void queued_spin_lock(struct qspinlock *lock)
>> {
>> - u32 val = 0;
>> -
>> - if (likely(atomic_try_cmpxchg_lock(&lock->val, &val, _Q_LOCKED_VAL)))
>> + atomic_t *a = &lock->val;
>> + u32 val;
>> +
>> +again:
>> + asm volatile(
>> +"1:\t" PPC_LWARX(%0,0,%1,1) " # queued_spin_lock \n"
>> + : "=&r" (val)
>> + : "r" (&a->counter)
>> + : "memory");
>> +
>> + if (likely(val == 0)) {
>> + asm_volatile_goto(
>> + " stwcx. %0,0,%1 \n"
>> + " bne- %l[again] \n"
>> + "\t" PPC_ACQUIRE_BARRIER " \n"
>> + :
>> + : "r"(_Q_LOCKED_VAL), "r" (&a->counter)
>> + : "cr0", "memory"
>> + : again );
>> return;
>> -
>> - queued_spin_lock_slowpath(lock, val);
>> + }
>> +
>> + if (likely(val == _Q_LOCKED_VAL)) {
>> + asm_volatile_goto(
>> + " stwcx. %0,0,%1 \n"
>> + " bne- %l[again] \n"
>> + :
>> + : "r"(_Q_LOCKED_VAL | _Q_PENDING_VAL), "r" (&a->counter)
>> + : "cr0", "memory"
>> + : again );
>> +
>> + atomic_cond_read_acquire(a, !(VAL & _Q_LOCKED_MASK));
>> +// clear_pending_set_locked(lock);
>> + WRITE_ONCE(lock->locked_pending, _Q_LOCKED_VAL);
>> +// lockevent_inc(lock_pending);
>> + return;
>> + }
>> +
>> + if (val == _Q_PENDING_VAL) {
>> + int cnt = _Q_PENDING_LOOPS;
>> + val = atomic_cond_read_relaxed(a,
>> + (VAL != _Q_PENDING_VAL) || !cnt--);
>> + if (!(val & ~_Q_LOCKED_MASK))
>> + goto again;
>> + }
>> + queued_spin_lock_slowpath_queue(lock);
>> }
>> #define queued_spin_lock queued_spin_lock
>>
>
> I am fine with the arch code override some part of the generic code.
Cool.
>> diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
>> index b9515fcc9b29..ebcc6f5d99d5 100644
>> --- a/kernel/locking/qspinlock.c
>> +++ b/kernel/locking/qspinlock.c
>> @@ -287,10 +287,14 @@ static __always_inline u32 __pv_wait_head_or_lock(struct qspinlock *lock,
>>
>> #ifdef CONFIG_PARAVIRT_SPINLOCKS
>> #define queued_spin_lock_slowpath native_queued_spin_lock_slowpath
>> +#define queued_spin_lock_slowpath_queue native_queued_spin_lock_slowpath_queue
>> #endif
>>
>> #endif /* _GEN_PV_LOCK_SLOWPATH */
>>
>> +void queued_spin_lock_slowpath_queue(struct qspinlock *lock);
>> +static void __queued_spin_lock_slowpath_queue(struct qspinlock *lock);
>> +
>> /**
>> * queued_spin_lock_slowpath - acquire the queued spinlock
>> * @lock: Pointer to queued spinlock structure
>> @@ -314,12 +318,6 @@ static __always_inline u32 __pv_wait_head_or_lock(struct qspinlock *lock,
>> */
>> void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
>> {
>> - struct mcs_spinlock *prev, *next, *node;
>> - u32 old, tail;
>> - int idx;
>> -
>> - BUILD_BUG_ON(CONFIG_NR_CPUS >= (1U << _Q_TAIL_CPU_BITS));
>> -
>> if (pv_enabled())
>> goto pv_queue;
>>
>> @@ -397,6 +395,26 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
>> queue:
>> lockevent_inc(lock_slowpath);
>> pv_queue:
>> + __queued_spin_lock_slowpath_queue(lock);
>> +}
>> +EXPORT_SYMBOL(queued_spin_lock_slowpath);
>> +
>> +void queued_spin_lock_slowpath_queue(struct qspinlock *lock)
>> +{
>> + lockevent_inc(lock_slowpath);
>> + __queued_spin_lock_slowpath_queue(lock);
>> +}
>> +EXPORT_SYMBOL(queued_spin_lock_slowpath_queue);
>> +
>> +static void __queued_spin_lock_slowpath_queue(struct qspinlock *lock)
>> +{
>> + struct mcs_spinlock *prev, *next, *node;
>> + u32 old, tail;
>> + u32 val;
>> + int idx;
>> +
>> + BUILD_BUG_ON(CONFIG_NR_CPUS >= (1U << _Q_TAIL_CPU_BITS));
>> +
>> node = this_cpu_ptr(&qnodes[0].mcs);
>> idx = node->count++;
>> tail = encode_tail(smp_processor_id(), idx);
>> @@ -559,7 +577,6 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
>> */
>> __this_cpu_dec(qnodes[0].mcs.count);
>> }
>> -EXPORT_SYMBOL(queued_spin_lock_slowpath);
>>
>> /*
>> * Generate the paravirt code for queued_spin_unlock_slowpath().
>>
> I would prefer to extract out the pending bit handling code out into a
> separate helper function which can be overridden by the arch code
> instead of breaking the slowpath into 2 pieces.
You mean have the arch provide a queued_spin_lock_slowpath_pending
function that the slow path calls?
I would actually prefer the pending handling can be made inline in
the queued_spin_lock function, especially with out-of-line locks it
makes sense to put it there.
We could ifdef out queued_spin_lock_slowpath_queue if it's not used,
then __queued_spin_lock_slowpath_queue would be inlined into the
caller so there would be no split?
Thanks,
Nick
On Thu, Jul 09, 2020 at 12:06:13PM -0400, Waiman Long wrote:
> We don't really need to do a pv_spinlocks_init() if pv_kick() isn't
> supported.
Waiman, if you cannot explain how not having kick is a sane thing, what
are you saying here?
Excerpts from Michael Ellerman's message of July 9, 2020 8:53 pm:
> Nicholas Piggin <[email protected]> writes:
>
>> Signed-off-by: Nicholas Piggin <[email protected]>
>> ---
>> arch/powerpc/include/asm/paravirt.h | 28 ++++++++
>> arch/powerpc/include/asm/qspinlock.h | 66 +++++++++++++++++++
>> arch/powerpc/include/asm/qspinlock_paravirt.h | 7 ++
>> arch/powerpc/platforms/pseries/Kconfig | 5 ++
>> arch/powerpc/platforms/pseries/setup.c | 6 +-
>> include/asm-generic/qspinlock.h | 2 +
>
> Another ack?
>
>> diff --git a/arch/powerpc/include/asm/paravirt.h b/arch/powerpc/include/asm/paravirt.h
>> index 7a8546660a63..f2d51f929cf5 100644
>> --- a/arch/powerpc/include/asm/paravirt.h
>> +++ b/arch/powerpc/include/asm/paravirt.h
>> @@ -45,6 +55,19 @@ static inline void yield_to_preempted(int cpu, u32 yield_count)
>> {
>> ___bad_yield_to_preempted(); /* This would be a bug */
>> }
>> +
>> +extern void ___bad_yield_to_any(void);
>> +static inline void yield_to_any(void)
>> +{
>> + ___bad_yield_to_any(); /* This would be a bug */
>> +}
>
> Why do we do that rather than just not defining yield_to_any() at all
> and letting the build fail on that?
>
> There's a condition somewhere that we know will false at compile time
> and drop the call before linking?
Mainly so you could use it in if (IS_ENABLED()) blocks, but would still
catch the (presumably buggy) case where something calls it without the
option set.
I think I had it arranged a different way that was using IS_ENABLED
earlier and changed it but might as well keep it this way.
>
>> diff --git a/arch/powerpc/include/asm/qspinlock_paravirt.h b/arch/powerpc/include/asm/qspinlock_paravirt.h
>> new file mode 100644
>> index 000000000000..750d1b5e0202
>> --- /dev/null
>> +++ b/arch/powerpc/include/asm/qspinlock_paravirt.h
>> @@ -0,0 +1,7 @@
>> +/* SPDX-License-Identifier: GPL-2.0-or-later */
>> +#ifndef __ASM_QSPINLOCK_PARAVIRT_H
>> +#define __ASM_QSPINLOCK_PARAVIRT_H
>
> _ASM_POWERPC_QSPINLOCK_PARAVIRT_H please.
>
>> +
>> +EXPORT_SYMBOL(__pv_queued_spin_unlock);
>
> Why's that in a header? Should that (eventually) go with the generic implementation?
Yeah the qspinlock_paravirt.h header is a bit weird and only gets
included into kernel/locking/qspinlock.c
>> diff --git a/arch/powerpc/platforms/pseries/Kconfig b/arch/powerpc/platforms/pseries/Kconfig
>> index 24c18362e5ea..756e727b383f 100644
>> --- a/arch/powerpc/platforms/pseries/Kconfig
>> +++ b/arch/powerpc/platforms/pseries/Kconfig
>> @@ -25,9 +25,14 @@ config PPC_PSERIES
>> select SWIOTLB
>> default y
>>
>> +config PARAVIRT_SPINLOCKS
>> + bool
>> + default n
>
> default n is the default.
>
>> diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c
>> index 2db8469e475f..747a203d9453 100644
>> --- a/arch/powerpc/platforms/pseries/setup.c
>> +++ b/arch/powerpc/platforms/pseries/setup.c
>> @@ -771,8 +771,12 @@ static void __init pSeries_setup_arch(void)
>> if (firmware_has_feature(FW_FEATURE_LPAR)) {
>> vpa_init(boot_cpuid);
>>
>> - if (lppaca_shared_proc(get_lppaca()))
>> + if (lppaca_shared_proc(get_lppaca())) {
>> static_branch_enable(&shared_processor);
>> +#ifdef CONFIG_PARAVIRT_SPINLOCKS
>> + pv_spinlocks_init();
>> +#endif
>> + }
>
> We could avoid the ifdef with this I think?
Yes I think so.
Thanks,
Nick
On 7/23/20 9:30 AM, Nicholas Piggin wrote:
>> I would prefer to extract out the pending bit handling code out into a
>> separate helper function which can be overridden by the arch code
>> instead of breaking the slowpath into 2 pieces.
> You mean have the arch provide a queued_spin_lock_slowpath_pending
> function that the slow path calls?
>
> I would actually prefer the pending handling can be made inline in
> the queued_spin_lock function, especially with out-of-line locks it
> makes sense to put it there.
>
> We could ifdef out queued_spin_lock_slowpath_queue if it's not used,
> then __queued_spin_lock_slowpath_queue would be inlined into the
> caller so there would be no split?
The pending code is an optimization for lightly contended locks. That is
why I think it is appropriate to extract it into a helper function and
mark it as such.
You can certainly put the code in the arch's spin_lock code, you just
has to override the generic pending code by a null function.
Cheers,
Longman
On Mon, Jul 06, 2020 at 02:35:38PM +1000, Nicholas Piggin wrote:
> These have shown significantly improved performance and fairness when
> spinlock contention is moderate to high on very large systems.
>
> [ Numbers hopefully forthcoming after more testing, but initial
> results look good ]
>
> Thanks to the fast path, single threaded performance is not noticably
> hurt.
>
> Signed-off-by: Nicholas Piggin <[email protected]>
> ---
> arch/powerpc/Kconfig | 13 ++++++++++++
> arch/powerpc/include/asm/Kbuild | 2 ++
> arch/powerpc/include/asm/qspinlock.h | 25 +++++++++++++++++++++++
> arch/powerpc/include/asm/spinlock.h | 5 +++++
> arch/powerpc/include/asm/spinlock_types.h | 5 +++++
> arch/powerpc/lib/Makefile | 3 +++
> include/asm-generic/qspinlock.h | 2 ++
> 7 files changed, 55 insertions(+)
> create mode 100644 arch/powerpc/include/asm/qspinlock.h
>
> diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
> index 24ac85c868db..17663ea57697 100644
> --- a/arch/powerpc/Kconfig
> +++ b/arch/powerpc/Kconfig
> @@ -146,6 +146,8 @@ config PPC
> select ARCH_SUPPORTS_ATOMIC_RMW
> select ARCH_USE_BUILTIN_BSWAP
> select ARCH_USE_CMPXCHG_LOCKREF if PPC64
> + select ARCH_USE_QUEUED_RWLOCKS if PPC_QUEUED_SPINLOCKS
> + select ARCH_USE_QUEUED_SPINLOCKS if PPC_QUEUED_SPINLOCKS
> select ARCH_WANT_IPC_PARSE_VERSION
> select ARCH_WEAK_RELEASE_ACQUIRE
> select BINFMT_ELF
> @@ -492,6 +494,17 @@ config HOTPLUG_CPU
>
> Say N if you are unsure.
>
> +config PPC_QUEUED_SPINLOCKS
> + bool "Queued spinlocks"
> + depends on SMP
> + default "y" if PPC_BOOK3S_64
> + help
> + Say Y here to use to use queued spinlocks which are more complex
> + but give better salability and fairness on large SMP and NUMA
^ +c?
Thanks
Michal
> + systems.
> +
> + If unsure, say "Y" if you have lots of cores, otherwise "N".
> +
> config ARCH_CPU_PROBE_RELEASE
> def_bool y
> depends on HOTPLUG_CPU
> diff --git a/arch/powerpc/include/asm/Kbuild b/arch/powerpc/include/asm/Kbuild
> index dadbcf3a0b1e..1dd8b6adff5e 100644
> --- a/arch/powerpc/include/asm/Kbuild
> +++ b/arch/powerpc/include/asm/Kbuild
> @@ -6,5 +6,7 @@ generated-y += syscall_table_spu.h
> generic-y += export.h
> generic-y += local64.h
> generic-y += mcs_spinlock.h
> +generic-y += qrwlock.h
> +generic-y += qspinlock.h
> generic-y += vtime.h
> generic-y += early_ioremap.h
> diff --git a/arch/powerpc/include/asm/qspinlock.h b/arch/powerpc/include/asm/qspinlock.h
> new file mode 100644
> index 000000000000..c49e33e24edd
> --- /dev/null
> +++ b/arch/powerpc/include/asm/qspinlock.h
> @@ -0,0 +1,25 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef _ASM_POWERPC_QSPINLOCK_H
> +#define _ASM_POWERPC_QSPINLOCK_H
> +
> +#include <asm-generic/qspinlock_types.h>
> +
> +#define _Q_PENDING_LOOPS (1 << 9) /* not tuned */
> +
> +#define smp_mb__after_spinlock() smp_mb()
> +
> +static __always_inline int queued_spin_is_locked(struct qspinlock *lock)
> +{
> + /*
> + * This barrier was added to simple spinlocks by commit 51d7d5205d338,
> + * but it should now be possible to remove it, asm arm64 has done with
> + * commit c6f5d02b6a0f.
> + */
> + smp_mb();
> + return atomic_read(&lock->val);
> +}
> +#define queued_spin_is_locked queued_spin_is_locked
> +
> +#include <asm-generic/qspinlock.h>
> +
> +#endif /* _ASM_POWERPC_QSPINLOCK_H */
> diff --git a/arch/powerpc/include/asm/spinlock.h b/arch/powerpc/include/asm/spinlock.h
> index 21357fe05fe0..434615f1d761 100644
> --- a/arch/powerpc/include/asm/spinlock.h
> +++ b/arch/powerpc/include/asm/spinlock.h
> @@ -3,7 +3,12 @@
> #define __ASM_SPINLOCK_H
> #ifdef __KERNEL__
>
> +#ifdef CONFIG_PPC_QUEUED_SPINLOCKS
> +#include <asm/qspinlock.h>
> +#include <asm/qrwlock.h>
> +#else
> #include <asm/simple_spinlock.h>
> +#endif
>
> #endif /* __KERNEL__ */
> #endif /* __ASM_SPINLOCK_H */
> diff --git a/arch/powerpc/include/asm/spinlock_types.h b/arch/powerpc/include/asm/spinlock_types.h
> index 3906f52dae65..c5d742f18021 100644
> --- a/arch/powerpc/include/asm/spinlock_types.h
> +++ b/arch/powerpc/include/asm/spinlock_types.h
> @@ -6,6 +6,11 @@
> # error "please don't include this file directly"
> #endif
>
> +#ifdef CONFIG_PPC_QUEUED_SPINLOCKS
> +#include <asm-generic/qspinlock_types.h>
> +#include <asm-generic/qrwlock_types.h>
> +#else
> #include <asm/simple_spinlock_types.h>
> +#endif
>
> #endif
> diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile
> index 5e994cda8e40..d66a645503eb 100644
> --- a/arch/powerpc/lib/Makefile
> +++ b/arch/powerpc/lib/Makefile
> @@ -41,7 +41,10 @@ obj-$(CONFIG_PPC_BOOK3S_64) += copyuser_power7.o copypage_power7.o \
> obj64-y += copypage_64.o copyuser_64.o mem_64.o hweight_64.o \
> memcpy_64.o memcpy_mcsafe_64.o
>
> +ifndef CONFIG_PPC_QUEUED_SPINLOCKS
> obj64-$(CONFIG_SMP) += locks.o
> +endif
> +
> obj64-$(CONFIG_ALTIVEC) += vmx-helper.o
> obj64-$(CONFIG_KPROBES_SANITY_TEST) += test_emulate_step.o \
> test_emulate_step_exec_instr.o
> diff --git a/include/asm-generic/qspinlock.h b/include/asm-generic/qspinlock.h
> index fde943d180e0..fb0a814d4395 100644
> --- a/include/asm-generic/qspinlock.h
> +++ b/include/asm-generic/qspinlock.h
> @@ -12,6 +12,7 @@
>
> #include <asm-generic/qspinlock_types.h>
>
> +#ifndef queued_spin_is_locked
> /**
> * queued_spin_is_locked - is the spinlock locked?
> * @lock: Pointer to queued spinlock structure
> @@ -25,6 +26,7 @@ static __always_inline int queued_spin_is_locked(struct qspinlock *lock)
> */
> return atomic_read(&lock->val);
> }
> +#endif
>
> /**
> * queued_spin_value_unlocked - is the spinlock structure unlocked?
> --
> 2.23.0
>
Excerpts from Waiman Long's message of July 24, 2020 12:29 am:
> On 7/23/20 9:30 AM, Nicholas Piggin wrote:
>>> I would prefer to extract out the pending bit handling code out into a
>>> separate helper function which can be overridden by the arch code
>>> instead of breaking the slowpath into 2 pieces.
>> You mean have the arch provide a queued_spin_lock_slowpath_pending
>> function that the slow path calls?
>>
>> I would actually prefer the pending handling can be made inline in
>> the queued_spin_lock function, especially with out-of-line locks it
>> makes sense to put it there.
>>
>> We could ifdef out queued_spin_lock_slowpath_queue if it's not used,
>> then __queued_spin_lock_slowpath_queue would be inlined into the
>> caller so there would be no split?
>
> The pending code is an optimization for lightly contended locks. That is
> why I think it is appropriate to extract it into a helper function and
> mark it as such.
>
> You can certainly put the code in the arch's spin_lock code, you just
> has to override the generic pending code by a null function.
I see what you mean. I guess that would work fine.
Thanks,
Nick
On 7/23/20 10:00 AM, Peter Zijlstra wrote:
> On Thu, Jul 09, 2020 at 12:06:13PM -0400, Waiman Long wrote:
>> We don't really need to do a pv_spinlocks_init() if pv_kick() isn't
>> supported.
> Waiman, if you cannot explain how not having kick is a sane thing, what
> are you saying here?
>
The current PPC paravirt spinlock code doesn't do any cpu kick. It does
an equivalence of pv_wait by yielding the cpu to the lock holder only.
The pv_spinlocks_init() is for setting up the hash table for doing
pv_kick. If we don't need to do pv_kick, we don't need the hash table.
I am not saying that pv_kick is not needed for the PPC environment. I
was just trying to adapt the pvqspinlock code to such an environment
first. Further investigation on how to implement some kind of pv_kick
will be something that we may want to do as a follow on.
BTW, do you have any comment on my v2 lock holder cpu info qspinlock
patch? I will have to update the patch to fix the reported 0-day test
problem, but I want to collect other feedback before sending out v3.
Cheers,
Longman
On Thu, Jul 23, 2020 at 02:32:36PM -0400, Waiman Long wrote:
> BTW, do you have any comment on my v2 lock holder cpu info qspinlock patch?
> I will have to update the patch to fix the reported 0-day test problem, but
> I want to collect other feedback before sending out v3.
I want to say I hate it all, it adds instructions to a path we spend an
aweful lot of time optimizing without really getting anything back for
it.
Will, how do you feel about it?
On 7/23/20 2:47 PM, [email protected] wrote:
> On Thu, Jul 23, 2020 at 02:32:36PM -0400, Waiman Long wrote:
>> BTW, do you have any comment on my v2 lock holder cpu info qspinlock patch?
>> I will have to update the patch to fix the reported 0-day test problem, but
>> I want to collect other feedback before sending out v3.
> I want to say I hate it all, it adds instructions to a path we spend an
> aweful lot of time optimizing without really getting anything back for
> it.
It does add some extra instruction that may slow it down slightly, but I
don't agree that it gives nothing back. The cpu lock holder information
can be useful in analyzing crash dumps and in some debugging situation.
I think it can be useful in RHEL for this readon. How about an x86
config option to allow distros to decide if they want to have it
enabled? I will make sure that it will have no performance degradation
if the option is not enabled.
Cheers,
Longman
On Thu, Jul 23, 2020 at 03:04:13PM -0400, Waiman Long wrote:
> On 7/23/20 2:47 PM, [email protected] wrote:
> > On Thu, Jul 23, 2020 at 02:32:36PM -0400, Waiman Long wrote:
> > > BTW, do you have any comment on my v2 lock holder cpu info qspinlock patch?
> > > I will have to update the patch to fix the reported 0-day test problem, but
> > > I want to collect other feedback before sending out v3.
> > I want to say I hate it all, it adds instructions to a path we spend an
> > aweful lot of time optimizing without really getting anything back for
> > it.
>
> It does add some extra instruction that may slow it down slightly, but I
> don't agree that it gives nothing back. The cpu lock holder information can
> be useful in analyzing crash dumps and in some debugging situation. I think
> it can be useful in RHEL for this readon. How about an x86 config option to
> allow distros to decide if they want to have it enabled? I will make sure
> that it will have no performance degradation if the option is not enabled.
Config knobs suck too; they create a maintenance burden (we get to make
sure all the permutations works/build/etc..) and effectively nobody uses
them, since world+dog uses what distros pick.
Anyway, instead of adding a second per-cpu variable, can you see how
horrible something like this is:
unsigned char adds(unsigned char var, unsigned char val)
{
unsigned short sat = 0xff, tmp = var;
asm ("addb %[val], %b[var];"
"cmovc %[sat], %[var];"
: [var] "+r" (tmp)
: [val] "ir" (val), [sat] "r" (sat)
);
return tmp;
}
Another thing to try is, instead of threading that lockval throughout
the thing, simply:
#define _Q_LOCKED_VAL this_cpu_read_stable(cpu_sat)
or combined with the above
#define _Q_LOCKED_VAL adds(this_cpu_read_stable(cpu_number), 2)
and see if the compiler really makes a mess of things.
On Thu, Jul 23, 2020 at 09:58:55PM +0200, [email protected] wrote:
> asm ("addb %[val], %b[var];"
> "cmovc %[sat], %[var];"
> : [var] "+r" (tmp)
> : [val] "ir" (val), [sat] "r" (sat)
> );
"var" (operand 0) needs an earlyclobber ("sat" is read after "var" is
written for the first time).
Segher
On 7/23/20 3:58 PM, [email protected] wrote:
> On Thu, Jul 23, 2020 at 03:04:13PM -0400, Waiman Long wrote:
>> On 7/23/20 2:47 PM, [email protected] wrote:
>>> On Thu, Jul 23, 2020 at 02:32:36PM -0400, Waiman Long wrote:
>>>> BTW, do you have any comment on my v2 lock holder cpu info qspinlock patch?
>>>> I will have to update the patch to fix the reported 0-day test problem, but
>>>> I want to collect other feedback before sending out v3.
>>> I want to say I hate it all, it adds instructions to a path we spend an
>>> aweful lot of time optimizing without really getting anything back for
>>> it.
>> It does add some extra instruction that may slow it down slightly, but I
>> don't agree that it gives nothing back. The cpu lock holder information can
>> be useful in analyzing crash dumps and in some debugging situation. I think
>> it can be useful in RHEL for this readon. How about an x86 config option to
>> allow distros to decide if they want to have it enabled? I will make sure
>> that it will have no performance degradation if the option is not enabled.
> Config knobs suck too; they create a maintenance burden (we get to make
> sure all the permutations works/build/etc..) and effectively nobody uses
> them, since world+dog uses what distros pick.
>
> Anyway, instead of adding a second per-cpu variable, can you see how
> horrible something like this is:
>
> unsigned char adds(unsigned char var, unsigned char val)
> {
> unsigned short sat = 0xff, tmp = var;
>
> asm ("addb %[val], %b[var];"
> "cmovc %[sat], %[var];"
> : [var] "+r" (tmp)
> : [val] "ir" (val), [sat] "r" (sat)
> );
>
> return tmp;
> }
>
> Another thing to try is, instead of threading that lockval throughout
> the thing, simply:
>
> #define _Q_LOCKED_VAL this_cpu_read_stable(cpu_sat)
>
> or combined with the above
>
> #define _Q_LOCKED_VAL adds(this_cpu_read_stable(cpu_number), 2)
>
> and see if the compiler really makes a mess of things.
>
Thanks for the suggestion. I will try that out.
Cheers,
Longman
On Thu, Jul 23, 2020 at 08:47:59PM +0200, [email protected] wrote:
> On Thu, Jul 23, 2020 at 02:32:36PM -0400, Waiman Long wrote:
> > BTW, do you have any comment on my v2 lock holder cpu info qspinlock patch?
> > I will have to update the patch to fix the reported 0-day test problem, but
> > I want to collect other feedback before sending out v3.
>
> I want to say I hate it all, it adds instructions to a path we spend an
> aweful lot of time optimizing without really getting anything back for
> it.
>
> Will, how do you feel about it?
I can see it potentially being useful for debugging, but I hate the
limitation to 256 CPUs. Even arm64 is hitting that now.
Also, you're talking ~1% gains here. I think our collective time would
be better spent off reviewing the CNA series and trying to make it more
deterministic.
Will
On 7/24/20 4:16 AM, Will Deacon wrote:
> On Thu, Jul 23, 2020 at 08:47:59PM +0200, [email protected] wrote:
>> On Thu, Jul 23, 2020 at 02:32:36PM -0400, Waiman Long wrote:
>>> BTW, do you have any comment on my v2 lock holder cpu info qspinlock patch?
>>> I will have to update the patch to fix the reported 0-day test problem, but
>>> I want to collect other feedback before sending out v3.
>> I want to say I hate it all, it adds instructions to a path we spend an
>> aweful lot of time optimizing without really getting anything back for
>> it.
>>
>> Will, how do you feel about it?
> I can see it potentially being useful for debugging, but I hate the
> limitation to 256 CPUs. Even arm64 is hitting that now.
After thinking more about that, I think we can use all the remaining
bits in the 16-bit locked_pending. Reserving 1 bit for locked and 1 bit
for pending, there are 14 bits left. So as long as NR_CPUS < 16k
(requirement for 16-bit locked_pending), we can put all possible cpu
numbers into the lock. We can also just use smp_processor_id() without
additional percpu data.
>
> Also, you're talking ~1% gains here. I think our collective time would
> be better spent off reviewing the CNA series and trying to make it more
> deterministic.
I thought you guys are not interested in CNA. I do want to get CNA
merged, if possible. Let review the current version again and see if
there are ways we can further improve it.
Cheers,
Longman
On 7/24/20 3:10 PM, Waiman Long wrote:
> On 7/24/20 4:16 AM, Will Deacon wrote:
>> On Thu, Jul 23, 2020 at 08:47:59PM +0200, [email protected] wrote:
>>> On Thu, Jul 23, 2020 at 02:32:36PM -0400, Waiman Long wrote:
>>>> BTW, do you have any comment on my v2 lock holder cpu info
>>>> qspinlock patch?
>>>> I will have to update the patch to fix the reported 0-day test
>>>> problem, but
>>>> I want to collect other feedback before sending out v3.
>>> I want to say I hate it all, it adds instructions to a path we spend an
>>> aweful lot of time optimizing without really getting anything back for
>>> it.
>>>
>>> Will, how do you feel about it?
>> I can see it potentially being useful for debugging, but I hate the
>> limitation to 256 CPUs. Even arm64 is hitting that now.
>
> After thinking more about that, I think we can use all the remaining
> bits in the 16-bit locked_pending. Reserving 1 bit for locked and 1
> bit for pending, there are 14 bits left. So as long as NR_CPUS < 16k
> (requirement for 16-bit locked_pending), we can put all possible cpu
> numbers into the lock. We can also just use smp_processor_id() without
> additional percpu data.
Sorry, that doesn't work. The extra bits in the pending byte won't get
cleared on unlock. That will have noticeable performance impact.
Clearing the pending byte on unlock will cause other performance
problem. So I guess we will have to limit the cpu number in the locked byte.
Regards,
Longman
On Fri, Jul 24, 2020 at 03:10:59PM -0400, Waiman Long wrote:
> On 7/24/20 4:16 AM, Will Deacon wrote:
> > On Thu, Jul 23, 2020 at 08:47:59PM +0200, [email protected] wrote:
> > > On Thu, Jul 23, 2020 at 02:32:36PM -0400, Waiman Long wrote:
> > > > BTW, do you have any comment on my v2 lock holder cpu info qspinlock patch?
> > > > I will have to update the patch to fix the reported 0-day test problem, but
> > > > I want to collect other feedback before sending out v3.
> > > I want to say I hate it all, it adds instructions to a path we spend an
> > > aweful lot of time optimizing without really getting anything back for
> > > it.
> > >
> > > Will, how do you feel about it?
> > I can see it potentially being useful for debugging, but I hate the
> > limitation to 256 CPUs. Even arm64 is hitting that now.
>
> After thinking more about that, I think we can use all the remaining bits in
> the 16-bit locked_pending. Reserving 1 bit for locked and 1 bit for pending,
> there are 14 bits left. So as long as NR_CPUS < 16k (requirement for 16-bit
> locked_pending), we can put all possible cpu numbers into the lock. We can
> also just use smp_processor_id() without additional percpu data.
That sounds horrific, wouldn't that destroy the whole point of using a
byte for pending?
> > Also, you're talking ~1% gains here. I think our collective time would
> > be better spent off reviewing the CNA series and trying to make it more
> > deterministic.
>
> I thought you guys are not interested in CNA. I do want to get CNA merged,
> if possible. Let review the current version again and see if there are ways
> we can further improve it.
It's not a lack of interrest. We were struggling with the fairness
issues and the complexity of the thing. I forgot the current state of
matters, but at one point UNLOCK was O(n) in waiters, which is, of
course, 'unfortunate'.
I'll have to look up whatever notes remain, but the basic idea of
keeping remote nodes on a secondary list is obviously breaking all sorts
of fairness. After that they pile on a bunch of hacks to fix the worst
of them, but it feels exactly like that, a bunch of hacks.
One of the things I suppose we ought to do is see if some of the ideas
of phase-fair locks can be applied to this.
That coupled with a chronic lack of time for anything :-(
On 7/25/20 1:26 PM, Peter Zijlstra wrote:
> On Fri, Jul 24, 2020 at 03:10:59PM -0400, Waiman Long wrote:
>> On 7/24/20 4:16 AM, Will Deacon wrote:
>>> On Thu, Jul 23, 2020 at 08:47:59PM +0200, [email protected] wrote:
>>>> On Thu, Jul 23, 2020 at 02:32:36PM -0400, Waiman Long wrote:
>>>>> BTW, do you have any comment on my v2 lock holder cpu info qspinlock patch?
>>>>> I will have to update the patch to fix the reported 0-day test problem, but
>>>>> I want to collect other feedback before sending out v3.
>>>> I want to say I hate it all, it adds instructions to a path we spend an
>>>> aweful lot of time optimizing without really getting anything back for
>>>> it.
>>>>
>>>> Will, how do you feel about it?
>>> I can see it potentially being useful for debugging, but I hate the
>>> limitation to 256 CPUs. Even arm64 is hitting that now.
>> After thinking more about that, I think we can use all the remaining bits in
>> the 16-bit locked_pending. Reserving 1 bit for locked and 1 bit for pending,
>> there are 14 bits left. So as long as NR_CPUS < 16k (requirement for 16-bit
>> locked_pending), we can put all possible cpu numbers into the lock. We can
>> also just use smp_processor_id() without additional percpu data.
> That sounds horrific, wouldn't that destroy the whole point of using a
> byte for pending?
You are right. I realized that later on and had sent a follow-up mail to
correct that.
>>> Also, you're talking ~1% gains here. I think our collective time would
>>> be better spent off reviewing the CNA series and trying to make it more
>>> deterministic.
>> I thought you guys are not interested in CNA. I do want to get CNA merged,
>> if possible. Let review the current version again and see if there are ways
>> we can further improve it.
> It's not a lack of interrest. We were struggling with the fairness
> issues and the complexity of the thing. I forgot the current state of
> matters, but at one point UNLOCK was O(n) in waiters, which is, of
> course, 'unfortunate'.
>
> I'll have to look up whatever notes remain, but the basic idea of
> keeping remote nodes on a secondary list is obviously breaking all sorts
> of fairness. After that they pile on a bunch of hacks to fix the worst
> of them, but it feels exactly like that, a bunch of hacks.
>
> One of the things I suppose we ought to do is see if some of the ideas
> of phase-fair locks can be applied to this.
That could be a possible solution to ensure better fairness.
>
> That coupled with a chronic lack of time for anything :-(
>
That is always true and I feel this way too:-)
Cheers,
Longman