2022-07-04 15:08:13

by Nicholas Piggin

Subject: [PATCH 00/13] locking/qspinlock: simplify code generation

Hi,

Been recently looking a bit closer at the queued spinlock code, and
found it's a little tricky to follow, especially the pv generation.
This series tries to improve the situation. It's not well tested
outside powerpc, but it's really the x86 pv code that is the other
major source of complexity and needs some review and testing.
Opinions?

Thanks,
Nick

Nicholas Piggin (13):
locking/qspinlock: remove pv_node abstraction
locking/qspinlock: inline mcs_spinlock functions into qspinlock
locking/qspinlock: split common mcs queueing code into its own
function
locking/qspinlock: move pv lock word helpers into qspinlock.c
locking/qspinlock: be less clever with the preprocessor
locking/qspinlock: merge qspinlock_paravirt.h into qspinlock.c
locking/qspinlock: remove arch qspinlock_paravirt.h includes
locking/qspinlock: stop renaming queued_spin_lock_slowpath to
native_queued_spin_lock_slowpath
locking/qspinlock: rename __pv_init_lock_hash to pv_spinlocks_init
locking/qspinlock: paravirt use simple trylock in case idx overflows
locking/qspinlock: Use queued_spin_trylock in
pv_hybrid_queued_unfair_trylock
locking/qspinlock: separate pv_wait_node from the non-paravirt path
locking/qspinlock: simplify pv_wait_head_or_lock calling scheme

arch/alpha/include/asm/Kbuild | 1 -
arch/arc/include/asm/Kbuild | 1 -
arch/arm/include/asm/mcs_spinlock.h | 24 -
arch/arm64/include/asm/Kbuild | 1 -
arch/hexagon/include/asm/Kbuild | 1 -
arch/ia64/include/asm/Kbuild | 1 -
arch/m68k/include/asm/Kbuild | 1 -
arch/microblaze/include/asm/Kbuild | 1 -
arch/mips/include/asm/Kbuild | 1 -
arch/nios2/include/asm/Kbuild | 1 -
arch/parisc/include/asm/Kbuild | 1 -
arch/powerpc/include/asm/Kbuild | 1 -
arch/powerpc/include/asm/qspinlock.h | 45 +-
arch/powerpc/include/asm/qspinlock_paravirt.h | 7 -
arch/powerpc/include/asm/spinlock.h | 2 +-
arch/s390/include/asm/Kbuild | 1 -
arch/sh/include/asm/Kbuild | 1 -
arch/sparc/include/asm/Kbuild | 1 -
arch/um/include/asm/Kbuild | 1 -
arch/x86/hyperv/hv_spinlock.c | 2 +-
arch/x86/include/asm/Kbuild | 1 -
arch/x86/include/asm/qspinlock.h | 19 +-
arch/x86/include/asm/qspinlock_paravirt.h | 72 --
arch/x86/kernel/kvm.c | 2 +-
arch/x86/kernel/paravirt-spinlocks.c | 71 ++
arch/x86/kernel/paravirt.c | 2 +-
arch/x86/xen/spinlock.c | 2 +-
arch/xtensa/include/asm/Kbuild | 1 -
include/asm-generic/mcs_spinlock.h | 13 -
include/asm-generic/qspinlock.h | 6 +
kernel/locking/mcs_spinlock.h | 121 ---
kernel/locking/qspinlock.c | 834 ++++++++++++++----
kernel/locking/qspinlock_paravirt.h | 562 ------------
33 files changed, 764 insertions(+), 1037 deletions(-)
delete mode 100644 arch/arm/include/asm/mcs_spinlock.h
delete mode 100644 arch/powerpc/include/asm/qspinlock_paravirt.h
delete mode 100644 arch/x86/include/asm/qspinlock_paravirt.h
delete mode 100644 include/asm-generic/mcs_spinlock.h
delete mode 100644 kernel/locking/mcs_spinlock.h
delete mode 100644 kernel/locking/qspinlock_paravirt.h

--
2.35.1


2022-07-04 15:09:14

by Nicholas Piggin

Subject: [PATCH 07/13] locking/qspinlock: remove arch qspinlock_paravirt.h includes

Signed-off-by: Nicholas Piggin <[email protected]>
---
arch/powerpc/include/asm/qspinlock_paravirt.h | 7 --
arch/x86/include/asm/qspinlock.h | 4 ++
arch/x86/include/asm/qspinlock_paravirt.h | 72 -------------------
arch/x86/kernel/paravirt-spinlocks.c | 71 ++++++++++++++++++
kernel/locking/qspinlock.c | 11 +--
5 files changed, 76 insertions(+), 89 deletions(-)
delete mode 100644 arch/powerpc/include/asm/qspinlock_paravirt.h
delete mode 100644 arch/x86/include/asm/qspinlock_paravirt.h

diff --git a/arch/powerpc/include/asm/qspinlock_paravirt.h b/arch/powerpc/include/asm/qspinlock_paravirt.h
deleted file mode 100644
index 6b60e7736a47..000000000000
--- a/arch/powerpc/include/asm/qspinlock_paravirt.h
+++ /dev/null
@@ -1,7 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-#ifndef _ASM_POWERPC_QSPINLOCK_PARAVIRT_H
-#define _ASM_POWERPC_QSPINLOCK_PARAVIRT_H
-
-EXPORT_SYMBOL(__pv_queued_spin_unlock);
-
-#endif /* _ASM_POWERPC_QSPINLOCK_PARAVIRT_H */
diff --git a/arch/x86/include/asm/qspinlock.h b/arch/x86/include/asm/qspinlock.h
index d87451df480b..7f914fe7bc30 100644
--- a/arch/x86/include/asm/qspinlock.h
+++ b/arch/x86/include/asm/qspinlock.h
@@ -34,6 +34,10 @@ extern void __pv_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val);
extern void __raw_callee_save___pv_queued_spin_unlock(struct qspinlock *lock);
extern bool nopvspin;

+#ifdef CONFIG_64BIT
+#define __pv_queued_spin_unlock __pv_queued_spin_unlock
+#endif
+
#define queued_spin_unlock queued_spin_unlock
/**
* queued_spin_unlock - release a queued spinlock
diff --git a/arch/x86/include/asm/qspinlock_paravirt.h b/arch/x86/include/asm/qspinlock_paravirt.h
deleted file mode 100644
index 892fd8c3a6f7..000000000000
--- a/arch/x86/include/asm/qspinlock_paravirt.h
+++ /dev/null
@@ -1,72 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef __ASM_QSPINLOCK_PARAVIRT_H
-#define __ASM_QSPINLOCK_PARAVIRT_H
-
-#include <asm/ibt.h>
-
-/*
- * For x86-64, PV_CALLEE_SAVE_REGS_THUNK() saves and restores 8 64-bit
- * registers. For i386, however, only 1 32-bit register needs to be saved
- * and restored. So an optimized version of __pv_queued_spin_unlock() is
- * hand-coded for 64-bit, but it isn't worthwhile to do it for 32-bit.
- */
-#ifdef CONFIG_64BIT
-
-PV_CALLEE_SAVE_REGS_THUNK(__pv_queued_spin_unlock_slowpath);
-#define __pv_queued_spin_unlock __pv_queued_spin_unlock
-#define PV_UNLOCK "__raw_callee_save___pv_queued_spin_unlock"
-#define PV_UNLOCK_SLOWPATH "__raw_callee_save___pv_queued_spin_unlock_slowpath"
-
-/*
- * Optimized assembly version of __raw_callee_save___pv_queued_spin_unlock
- * which combines the registers saving trunk and the body of the following
- * C code:
- *
- * void __pv_queued_spin_unlock(struct qspinlock *lock)
- * {
- * u8 lockval = cmpxchg(&lock->locked, _Q_LOCKED_VAL, 0);
- *
- * if (likely(lockval == _Q_LOCKED_VAL))
- * return;
- * pv_queued_spin_unlock_slowpath(lock, lockval);
- * }
- *
- * For x86-64,
- * rdi = lock (first argument)
- * rsi = lockval (second argument)
- * rdx = internal variable (set to 0)
- */
-asm (".pushsection .text;"
- ".globl " PV_UNLOCK ";"
- ".type " PV_UNLOCK ", @function;"
- ".align 4,0x90;"
- PV_UNLOCK ": "
- ASM_ENDBR
- FRAME_BEGIN
- "push %rdx;"
- "mov $0x1,%eax;"
- "xor %edx,%edx;"
- LOCK_PREFIX "cmpxchg %dl,(%rdi);"
- "cmp $0x1,%al;"
- "jne .slowpath;"
- "pop %rdx;"
- FRAME_END
- ASM_RET
- ".slowpath: "
- "push %rsi;"
- "movzbl %al,%esi;"
- "call " PV_UNLOCK_SLOWPATH ";"
- "pop %rsi;"
- "pop %rdx;"
- FRAME_END
- ASM_RET
- ".size " PV_UNLOCK ", .-" PV_UNLOCK ";"
- ".popsection");
-
-#else /* CONFIG_64BIT */
-
-extern void __pv_queued_spin_unlock(struct qspinlock *lock);
-PV_CALLEE_SAVE_REGS_THUNK(__pv_queued_spin_unlock);
-
-#endif /* CONFIG_64BIT */
-#endif
diff --git a/arch/x86/kernel/paravirt-spinlocks.c b/arch/x86/kernel/paravirt-spinlocks.c
index 9e1ea99ad9df..c6a107dfe20d 100644
--- a/arch/x86/kernel/paravirt-spinlocks.c
+++ b/arch/x86/kernel/paravirt-spinlocks.c
@@ -7,6 +7,7 @@
#include <linux/export.h>
#include <linux/jump_label.h>

+#include <asm/ibt.h>
#include <asm/paravirt.h>

__visible void __native_queued_spin_unlock(struct qspinlock *lock)
@@ -15,6 +16,76 @@ __visible void __native_queued_spin_unlock(struct qspinlock *lock)
}
PV_CALLEE_SAVE_REGS_THUNK(__native_queued_spin_unlock);

+#ifdef CONFIG_PARAVIRT_SPINLOCKS
+/*
+ * For x86-64, PV_CALLEE_SAVE_REGS_THUNK() saves and restores 8 64-bit
+ * registers. For i386, however, only 1 32-bit register needs to be saved
+ * and restored. So an optimized version of __pv_queued_spin_unlock() is
+ * hand-coded for 64-bit, but it isn't worthwhile to do it for 32-bit.
+ */
+#ifdef CONFIG_64BIT
+
+__visible void
+__pv_queued_spin_unlock_slowpath(struct qspinlock *lock, u8 locked);
+
+PV_CALLEE_SAVE_REGS_THUNK(__pv_queued_spin_unlock_slowpath);
+#define PV_UNLOCK "__raw_callee_save___pv_queued_spin_unlock"
+#define PV_UNLOCK_SLOWPATH "__raw_callee_save___pv_queued_spin_unlock_slowpath"
+
+/*
+ * Optimized assembly version of __raw_callee_save___pv_queued_spin_unlock
+ * which combines the registers saving trunk and the body of the following
+ * C code:
+ *
+ * void __pv_queued_spin_unlock(struct qspinlock *lock)
+ * {
+ * u8 lockval = cmpxchg(&lock->locked, _Q_LOCKED_VAL, 0);
+ *
+ * if (likely(lockval == _Q_LOCKED_VAL))
+ * return;
+ * pv_queued_spin_unlock_slowpath(lock, lockval);
+ * }
+ *
+ * For x86-64,
+ * rdi = lock (first argument)
+ * rsi = lockval (second argument)
+ * rdx = internal variable (set to 0)
+ */
+asm (".pushsection .text;"
+ ".globl " PV_UNLOCK ";"
+ ".type " PV_UNLOCK ", @function;"
+ ".align 4,0x90;"
+ PV_UNLOCK ": "
+ ASM_ENDBR
+ FRAME_BEGIN
+ "push %rdx;"
+ "mov $0x1,%eax;"
+ "xor %edx,%edx;"
+ LOCK_PREFIX "cmpxchg %dl,(%rdi);"
+ "cmp $0x1,%al;"
+ "jne .slowpath;"
+ "pop %rdx;"
+ FRAME_END
+ ASM_RET
+ ".slowpath: "
+ "push %rsi;"
+ "movzbl %al,%esi;"
+ "call " PV_UNLOCK_SLOWPATH ";"
+ "pop %rsi;"
+ "pop %rdx;"
+ FRAME_END
+ ASM_RET
+ ".size " PV_UNLOCK ", .-" PV_UNLOCK ";"
+ ".popsection");
+
+#else /* CONFIG_64BIT */
+
+extern void __pv_queued_spin_unlock(struct qspinlock *lock);
+PV_CALLEE_SAVE_REGS_THUNK(__pv_queued_spin_unlock);
+
+#endif /* CONFIG_64BIT */
+#endif /* CONFIG_PARAVIRT_SPINLOCKS */
+
bool pv_is_native_spin_unlock(void)
{
return pv_ops.lock.queued_spin_unlock.func ==
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index 9a235b0d98ca..4045b5683ecb 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -743,16 +743,6 @@ __pv_queued_spin_unlock_slowpath(struct qspinlock *lock, u8 locked)
pv_kick(node->cpu);
}

-/*
- * Include the architecture specific callee-save thunk of the
- * __pv_queued_spin_unlock(). This thunk is put together with
- * __pv_queued_spin_unlock() to make the callee-save thunk and the real unlock
- * function close to each other sharing consecutive instruction cachelines.
- * Alternatively, architecture specific version of __pv_queued_spin_unlock()
- * can be defined.
- */
-#include <asm/qspinlock_paravirt.h>
-
#ifndef __pv_queued_spin_unlock
__visible void __pv_queued_spin_unlock(struct qspinlock *lock)
{
@@ -769,6 +759,7 @@ __visible void __pv_queued_spin_unlock(struct qspinlock *lock)

__pv_queued_spin_unlock_slowpath(lock, locked);
}
+EXPORT_SYMBOL(__pv_queued_spin_unlock);
#endif

#else /* CONFIG_PARAVIRT_SPINLOCKS */
--
2.35.1

2022-07-04 15:10:02

by Nicholas Piggin

Subject: [PATCH 01/13] locking/qspinlock: remove pv_node abstraction

There isn't much point trying to separate struct qnode from struct pv_node
when struct qnode has to know about pv_node anyway.

Signed-off-by: Nicholas Piggin <[email protected]>
---
kernel/locking/qspinlock.c | 3 ++-
kernel/locking/qspinlock_paravirt.h | 34 ++++++++++++-----------------
2 files changed, 16 insertions(+), 21 deletions(-)

diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index 65a9a10caa6f..a0fc21d99199 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -82,7 +82,8 @@
struct qnode {
struct mcs_spinlock mcs;
#ifdef CONFIG_PARAVIRT_SPINLOCKS
- long reserved[2];
+ int cpu;
+ u8 state;
#endif
};

diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
index e84d21aa0722..b6a175155f36 100644
--- a/kernel/locking/qspinlock_paravirt.h
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -47,12 +47,6 @@ enum vcpu_state {
vcpu_hashed, /* = pv_hash'ed + vcpu_halted */
};

-struct pv_node {
- struct mcs_spinlock mcs;
- int cpu;
- u8 state;
-};
-
/*
* Hybrid PV queued/unfair lock
*
@@ -170,7 +164,7 @@ static __always_inline int trylock_clear_pending(struct qspinlock *lock)
*/
struct pv_hash_entry {
struct qspinlock *lock;
- struct pv_node *node;
+ struct qnode *node;
};

#define PV_HE_PER_LINE (SMP_CACHE_BYTES / sizeof(struct pv_hash_entry))
@@ -209,7 +203,7 @@ void __init __pv_init_lock_hash(void)
offset < (1 << pv_lock_hash_bits); \
offset++, he = &pv_lock_hash[(hash + offset) & ((1 << pv_lock_hash_bits) - 1)])

-static struct qspinlock **pv_hash(struct qspinlock *lock, struct pv_node *node)
+static struct qspinlock **pv_hash(struct qspinlock *lock, struct qnode *node)
{
unsigned long offset, hash = hash_ptr(lock, pv_lock_hash_bits);
struct pv_hash_entry *he;
@@ -236,11 +230,11 @@ static struct qspinlock **pv_hash(struct qspinlock *lock, struct pv_node *node)
BUG();
}

-static struct pv_node *pv_unhash(struct qspinlock *lock)
+static struct qnode *pv_unhash(struct qspinlock *lock)
{
unsigned long offset, hash = hash_ptr(lock, pv_lock_hash_bits);
struct pv_hash_entry *he;
- struct pv_node *node;
+ struct qnode *node;

for_each_hash_entry(he, offset, hash) {
if (READ_ONCE(he->lock) == lock) {
@@ -264,7 +258,7 @@ static struct pv_node *pv_unhash(struct qspinlock *lock)
* in a running state.
*/
static inline bool
-pv_wait_early(struct pv_node *prev, int loop)
+pv_wait_early(struct qnode *prev, int loop)
{
if ((loop & PV_PREV_CHECK_MASK) != 0)
return false;
@@ -277,9 +271,9 @@ pv_wait_early(struct pv_node *prev, int loop)
*/
static void pv_init_node(struct mcs_spinlock *node)
{
- struct pv_node *pn = (struct pv_node *)node;
+ struct qnode *pn = (struct qnode *)node;

- BUILD_BUG_ON(sizeof(struct pv_node) > sizeof(struct qnode));
+ BUILD_BUG_ON(sizeof(struct qnode) > sizeof(struct qnode));

pn->cpu = smp_processor_id();
pn->state = vcpu_running;
@@ -292,8 +286,8 @@ static void pv_init_node(struct mcs_spinlock *node)
*/
static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev)
{
- struct pv_node *pn = (struct pv_node *)node;
- struct pv_node *pp = (struct pv_node *)prev;
+ struct qnode *pn = (struct qnode *)node;
+ struct qnode *pp = (struct qnode *)prev;
int loop;
bool wait_early;

@@ -359,7 +353,7 @@ static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev)
*/
static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node)
{
- struct pv_node *pn = (struct pv_node *)node;
+ struct qnode *pn = (struct qnode *)node;

/*
* If the vCPU is indeed halted, advance its state to match that of
@@ -402,7 +396,7 @@ static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node)
static u32
pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node)
{
- struct pv_node *pn = (struct pv_node *)node;
+ struct qnode *pn = (struct qnode *)node;
struct qspinlock **lp = NULL;
int waitcnt = 0;
int loop;
@@ -492,7 +486,7 @@ pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node)
__visible void
__pv_queued_spin_unlock_slowpath(struct qspinlock *lock, u8 locked)
{
- struct pv_node *node;
+ struct qnode *node;

if (unlikely(locked != _Q_SLOW_VAL)) {
WARN(!debug_locks_silent,
@@ -517,14 +511,14 @@ __pv_queued_spin_unlock_slowpath(struct qspinlock *lock, u8 locked)
node = pv_unhash(lock);

/*
- * Now that we have a reference to the (likely) blocked pv_node,
+ * Now that we have a reference to the (likely) blocked qnode,
* release the lock.
*/
smp_store_release(&lock->locked, 0);

/*
* At this point the memory pointed at by lock can be freed/reused,
- * however we can still use the pv_node to kick the CPU.
+ * however we can still use the qnode to kick the CPU.
* The other vCPU may not really be halted, but kicking an active
* vCPU is harmless other than the additional latency in completing
* the unlock.
--
2.35.1
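
A standalone sketch, not taken from the kernel, of the coupling the
changelog refers to: before this patch, struct qnode reserved opaque
space that had to stay at least as large as struct pv_node, enforced
by a size check, so qnode already had to know about pv_node. The field
names mirror the patch; the surrounding scaffolding and the _demo
suffixes are invented for illustration.

#include <assert.h>
#include <stdint.h>

struct mcs_spinlock_demo {
	struct mcs_spinlock_demo *next;
	int locked;
	int count;
};

/* Old scheme: opaque padding sized to fit the paravirt fields. */
struct qnode_old {
	struct mcs_spinlock_demo mcs;
	long reserved[2];
};

struct pv_node_old {
	struct mcs_spinlock_demo mcs;
	int cpu;
	uint8_t state;
};

/* The old BUILD_BUG_ON() boiled down to this size relationship. */
static_assert(sizeof(struct pv_node_old) <= sizeof(struct qnode_old),
	      "pv_node must fit in the space qnode reserves for it");

/* New scheme: the paravirt fields live in struct qnode directly. */
struct qnode_new {
	struct mcs_spinlock_demo mcs;
#ifdef DEMO_PARAVIRT_SPINLOCKS
	int cpu;
	uint8_t state;
#endif
};

int main(void)
{
	return 0;
}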

2022-07-05 18:11:13

by Peter Zijlstra

Subject: Re: [PATCH 00/13] locking/qspinlock: simplify code generation

On Tue, Jul 05, 2022 at 12:38:07AM +1000, Nicholas Piggin wrote:
> Hi,
>
> Been recently looking a bit closer at the queued spinlock code, and
> found it's a little tricky to follow, especially the pv generation.
> This series tries to improve the situation. It's not well tested
> outside powerpc, but it's really the x86 pv code that is the other
> major source of complexity and needs some review and testing.
> Opinions?

perhaps something like so on top/instead? This would still allow
slotting in other implementations with relative ease and the compilers
should constant fold all this.

--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -609,7 +609,7 @@ static void pv_kick_node(struct qspinloc
*
* The current value of the lock will be returned for additional processing.
*/
-static void pv_wait_head_or_lock(struct qspinlock *lock, struct qnode *node)
+static u32 pv_wait_head_or_lock(struct qspinlock *lock, struct qnode *node)
{
struct qspinlock **lp = NULL;
int waitcnt = 0;
@@ -641,7 +641,7 @@ static void pv_wait_head_or_lock(struct
set_pending(lock);
for (loop = SPIN_THRESHOLD; loop; loop--) {
if (trylock_clear_pending(lock))
- return; /* got lock */
+ goto out; /* got lock */
cpu_relax();
}
clear_pending(lock);
@@ -669,7 +669,7 @@ static void pv_wait_head_or_lock(struct
*/
WRITE_ONCE(lock->locked, _Q_LOCKED_VAL);
WRITE_ONCE(*lp, NULL);
- return; /* got lock */
+ goto out; /* got lock */
}
}
WRITE_ONCE(node->state, vcpu_hashed);
@@ -683,12 +683,22 @@ static void pv_wait_head_or_lock(struct
*/
}

+out:
/*
* The cmpxchg() or xchg() call before coming here provides the
* acquire semantics for locking.
*/
+ return atomic_read(&lock->val);
}

+static const struct queue_ops pv_ops = {
+ .init_node = pv_init_node,
+ .trylock = pv_hybrid_queued_unfair_trylock,
+ .wait_node = pv_wait_node,
+ .wait_head_or_lock = pv_wait_head_or_lock,
+ .kick_node = pv_kick_node,
+};
+
/*
* PV versions of the unlock fastpath and slowpath functions to be used
* instead of queued_spin_unlock().
@@ -756,18 +766,18 @@ __visible void __pv_queued_spin_unlock(s
EXPORT_SYMBOL(__pv_queued_spin_unlock);
#endif

-#else /* CONFIG_PARAVIRT_SPINLOCKS */
-static __always_inline void pv_init_node(struct qnode *node) { }
-static __always_inline void pv_wait_node(struct qnode *node,
- struct qnode *prev) { }
-static __always_inline void pv_kick_node(struct qspinlock *lock,
- struct qnode *node) { }
-static __always_inline void pv_wait_head_or_lock(struct qspinlock *lock,
- struct qnode *node) { }
-static __always_inline bool pv_hybrid_queued_unfair_trylock(struct qspinlock *lock) { BUILD_BUG(); }
#endif /* CONFIG_PARAVIRT_SPINLOCKS */

-static inline void queued_spin_lock_mcs_queue(struct qspinlock *lock, bool paravirt)
+struct queue_ops {
+ void (*init_node)(struct qnode *node);
+ bool (*trylock)(struct qspinlock *lock);
+ void (*wait_node)(struct qnode *node, struct qnode *prev);
+ u32 (*wait_head_or_lock)(struct qspinlock *lock, struct qnode *node);
+ void (*kick_node)(struct qspinlock *lock, struct qnode *node);
+};
+
+static __always_inline
+void queued_spin_lock_mcs_queue(struct qspinlock *lock, const struct queue_ops *ops)
{
struct qnode *prev, *next, *node;
u32 val, old, tail;
@@ -813,16 +823,16 @@ static inline void queued_spin_lock_mcs_

node->locked = 0;
node->next = NULL;
- if (paravirt)
- pv_init_node(node);
+ if (ops && ops->init_node)
+ ops->init_node(node);

/*
* We touched a (possibly) cold cacheline in the per-cpu queue node;
* attempt the trylock once more in the hope someone let go while we
* weren't watching.
*/
- if (paravirt) {
- if (pv_hybrid_queued_unfair_trylock(lock))
+ if (ops && ops->trylock) {
+ if (ops->trylock(lock))
goto release;
} else {
if (queued_spin_trylock(lock))
@@ -857,8 +867,8 @@ static inline void queued_spin_lock_mcs_
WRITE_ONCE(prev->next, node);

/* Wait for mcs node lock to be released */
- if (paravirt)
- pv_wait_node(node, prev);
+ if (ops && ops->wait_node)
+ ops->wait_node(node, prev);
else
smp_cond_load_acquire(&node->locked, VAL);

@@ -893,12 +903,11 @@ static inline void queued_spin_lock_mcs_
* If PV isn't active, 0 will be returned instead.
*
*/
- if (paravirt) {
- pv_wait_head_or_lock(lock, node);
- val = atomic_read(&lock->val);
+ if (ops && ops->wait_head_or_lock) {
+ val = ops->wait_head_or_lock(lock, node);
} else {
val = atomic_cond_read_acquire(&lock->val,
- !(VAL & _Q_LOCKED_PENDING_MASK));
+ !(VAL & _Q_LOCKED_PENDING_MASK));
}

/*
@@ -1049,14 +1058,14 @@ void queued_spin_lock_slowpath(struct qs
*/
queue:
lockevent_inc(lock_slowpath);
- queued_spin_lock_mcs_queue(lock, false);
+ queued_spin_lock_mcs_queue(lock, NULL);
}
EXPORT_SYMBOL(queued_spin_lock_slowpath);

#ifdef CONFIG_PARAVIRT_SPINLOCKS
void __pv_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
{
- queued_spin_lock_mcs_queue(lock, true);
+ queued_spin_lock_mcs_queue(lock, &pv_ops);
}
EXPORT_SYMBOL(__pv_queued_spin_lock_slowpath);
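
A minimal standalone sketch of the constant-folding expectation above
(not part of the series; the demo names and scaffolding are invented,
and the claim assumes GCC or Clang at -O2): when an always_inline
function takes a pointer to a static const ops structure, the compiler
can see every slot at compile time, so the NULL checks and indirect
calls are expected to reduce to direct calls or drop out entirely,
much as the paravirt/native split folds when
queued_spin_lock_mcs_queue() is inlined with a constant argument.

/* Standalone demo; compile with e.g. gcc -O2 -o fold-demo fold-demo.c */
#include <stdbool.h>
#include <stdio.h>

struct lock { int val; };

struct lock_ops {
	void (*init_node)(void);
	bool (*trylock)(struct lock *lock);
};

static void pv_init_node_demo(void)
{
	puts("pv init");
}

static bool pv_trylock_demo(struct lock *lock)
{
	return lock->val == 0;
}

static const struct lock_ops pv_demo_ops = {
	.init_node	= pv_init_node_demo,
	.trylock	= pv_trylock_demo,
};

/* Mirrors the shape of queued_spin_lock_mcs_queue(lock, ops) above. */
static inline __attribute__((always_inline))
void slowpath_demo(struct lock *lock, const struct lock_ops *ops)
{
	if (ops && ops->init_node)
		ops->init_node();	/* expected to become a direct call */

	if (ops && ops->trylock) {
		if (ops->trylock(lock))
			return;
	} else {
		/* native flavour: with ops == NULL the branches compile out */
		lock->val = 1;
	}
}

int main(void)
{
	struct lock l = { 0 };

	slowpath_demo(&l, NULL);		/* native */
	slowpath_demo(&l, &pv_demo_ops);	/* paravirt */
	printf("val=%d\n", l.val);
	return 0;
}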

2022-07-06 23:27:44

by Boqun Feng

Subject: Re: [PATCH 01/13] locking/qspinlock: remove pv_node abstraction

On Tue, Jul 05, 2022 at 12:38:08AM +1000, Nicholas Piggin wrote:
> There isn't much point trying to separate struct qnode from struct pv_node
> when struct qnode has to know about pv_node anyway.
>
> Signed-off-by: Nicholas Piggin <[email protected]>
> ---
> kernel/locking/qspinlock.c | 3 ++-
> kernel/locking/qspinlock_paravirt.h | 34 ++++++++++++-----------------
> 2 files changed, 16 insertions(+), 21 deletions(-)
>
> diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
> index 65a9a10caa6f..a0fc21d99199 100644
> --- a/kernel/locking/qspinlock.c
> +++ b/kernel/locking/qspinlock.c
> @@ -82,7 +82,8 @@
> struct qnode {
> struct mcs_spinlock mcs;
> #ifdef CONFIG_PARAVIRT_SPINLOCKS
> - long reserved[2];
> + int cpu;
> + u8 state;
> #endif
> };
>
> diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
> index e84d21aa0722..b6a175155f36 100644
> --- a/kernel/locking/qspinlock_paravirt.h
> +++ b/kernel/locking/qspinlock_paravirt.h
> @@ -47,12 +47,6 @@ enum vcpu_state {
> vcpu_hashed, /* = pv_hash'ed + vcpu_halted */
> };
>
> -struct pv_node {
> - struct mcs_spinlock mcs;
> - int cpu;
> - u8 state;
> -};
> -
> /*
> * Hybrid PV queued/unfair lock
> *
> @@ -170,7 +164,7 @@ static __always_inline int trylock_clear_pending(struct qspinlock *lock)
> */
> struct pv_hash_entry {
> struct qspinlock *lock;
> - struct pv_node *node;
> + struct qnode *node;
> };
>
> #define PV_HE_PER_LINE (SMP_CACHE_BYTES / sizeof(struct pv_hash_entry))
> @@ -209,7 +203,7 @@ void __init __pv_init_lock_hash(void)
> offset < (1 << pv_lock_hash_bits); \
> offset++, he = &pv_lock_hash[(hash + offset) & ((1 << pv_lock_hash_bits) - 1)])
>
> -static struct qspinlock **pv_hash(struct qspinlock *lock, struct pv_node *node)
> +static struct qspinlock **pv_hash(struct qspinlock *lock, struct qnode *node)
> {
> unsigned long offset, hash = hash_ptr(lock, pv_lock_hash_bits);
> struct pv_hash_entry *he;
> @@ -236,11 +230,11 @@ static struct qspinlock **pv_hash(struct qspinlock *lock, struct pv_node *node)
> BUG();
> }
>
> -static struct pv_node *pv_unhash(struct qspinlock *lock)
> +static struct qnode *pv_unhash(struct qspinlock *lock)
> {
> unsigned long offset, hash = hash_ptr(lock, pv_lock_hash_bits);
> struct pv_hash_entry *he;
> - struct pv_node *node;
> + struct qnode *node;
>
> for_each_hash_entry(he, offset, hash) {
> if (READ_ONCE(he->lock) == lock) {
> @@ -264,7 +258,7 @@ static struct pv_node *pv_unhash(struct qspinlock *lock)
> * in a running state.
> */
> static inline bool
> -pv_wait_early(struct pv_node *prev, int loop)
> +pv_wait_early(struct qnode *prev, int loop)
> {
> if ((loop & PV_PREV_CHECK_MASK) != 0)
> return false;
> @@ -277,9 +271,9 @@ pv_wait_early(struct pv_node *prev, int loop)
> */
> static void pv_init_node(struct mcs_spinlock *node)
> {
> - struct pv_node *pn = (struct pv_node *)node;
> + struct qnode *pn = (struct qnode *)node;
>
> - BUILD_BUG_ON(sizeof(struct pv_node) > sizeof(struct qnode));
> + BUILD_BUG_ON(sizeof(struct qnode) > sizeof(struct qnode));

This line can actually be removed ;-)

The other parts look good to me.

Acked-by: Boqun Feng <[email protected]>

Regards,
Boqun

>
> pn->cpu = smp_processor_id();
> pn->state = vcpu_running;
> @@ -292,8 +286,8 @@ static void pv_init_node(struct mcs_spinlock *node)
> */
> static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev)
> {
> - struct pv_node *pn = (struct pv_node *)node;
> - struct pv_node *pp = (struct pv_node *)prev;
> + struct qnode *pn = (struct qnode *)node;
> + struct qnode *pp = (struct qnode *)prev;
> int loop;
> bool wait_early;
>
> @@ -359,7 +353,7 @@ static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev)
> */
> static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node)
> {
> - struct pv_node *pn = (struct pv_node *)node;
> + struct qnode *pn = (struct qnode *)node;
>
> /*
> * If the vCPU is indeed halted, advance its state to match that of
> @@ -402,7 +396,7 @@ static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node)
> static u32
> pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node)
> {
> - struct pv_node *pn = (struct pv_node *)node;
> + struct qnode *pn = (struct qnode *)node;
> struct qspinlock **lp = NULL;
> int waitcnt = 0;
> int loop;
> @@ -492,7 +486,7 @@ pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node)
> __visible void
> __pv_queued_spin_unlock_slowpath(struct qspinlock *lock, u8 locked)
> {
> - struct pv_node *node;
> + struct qnode *node;
>
> if (unlikely(locked != _Q_SLOW_VAL)) {
> WARN(!debug_locks_silent,
> @@ -517,14 +511,14 @@ __pv_queued_spin_unlock_slowpath(struct qspinlock *lock, u8 locked)
> node = pv_unhash(lock);
>
> /*
> - * Now that we have a reference to the (likely) blocked pv_node,
> + * Now that we have a reference to the (likely) blocked qnode,
> * release the lock.
> */
> smp_store_release(&lock->locked, 0);
>
> /*
> * At this point the memory pointed at by lock can be freed/reused,
> - * however we can still use the pv_node to kick the CPU.
> + * however we can still use the qnode to kick the CPU.
> * The other vCPU may not really be halted, but kicking an active
> * vCPU is harmless other than the additional latency in completing
> * the unlock.
> --
> 2.35.1
>

2022-07-12 01:15:22

by Nicholas Piggin

Subject: Re: [PATCH 00/13] locking/qspinlock: simplify code generation

Excerpts from Peter Zijlstra's message of July 6, 2022 3:59 am:
> On Tue, Jul 05, 2022 at 12:38:07AM +1000, Nicholas Piggin wrote:
>> Hi,
>>
>> Been recently looking a bit closer at the queued spinlock code, and
>> found it's a little tricky to follow, especially the pv generation.
>> This series tries to improve the situation. It's not well tested
>> outside powerpc, but it's really the x86 pv code that is the other
>> major source of complexity and needs some review and testing.
>> Opinions?
>
> perhaps something like so on top/instead? This would still allow
> slotting in other implementations with relative ease and the compilers
> should constant fold all this.

Yeah that could be a bit neater... I don't know. It all has to be
inlined and compiled together so it's a matter of taste in syntactic
sugar. Doing it with C is probably better than doing it with CPP,
all else being equal.

At the moment I'm not planning to replace the PV functions on powerpc
though. If/when it comes to that I'd say more changes would be needed.

Thanks,
Nick

>
> --- a/kernel/locking/qspinlock.c
> +++ b/kernel/locking/qspinlock.c
> @@ -609,7 +609,7 @@ static void pv_kick_node(struct qspinloc
> *
> * The current value of the lock will be returned for additional processing.
> */
> -static void pv_wait_head_or_lock(struct qspinlock *lock, struct qnode *node)
> +static u32 pv_wait_head_or_lock(struct qspinlock *lock, struct qnode *node)
> {
> struct qspinlock **lp = NULL;
> int waitcnt = 0;
> @@ -641,7 +641,7 @@ static void pv_wait_head_or_lock(struct
> set_pending(lock);
> for (loop = SPIN_THRESHOLD; loop; loop--) {
> if (trylock_clear_pending(lock))
> - return; /* got lock */
> + goto out; /* got lock */
> cpu_relax();
> }
> clear_pending(lock);
> @@ -669,7 +669,7 @@ static void pv_wait_head_or_lock(struct
> */
> WRITE_ONCE(lock->locked, _Q_LOCKED_VAL);
> WRITE_ONCE(*lp, NULL);
> - return; /* got lock */
> + goto out; /* got lock */
> }
> }
> WRITE_ONCE(node->state, vcpu_hashed);
> @@ -683,12 +683,22 @@ static void pv_wait_head_or_lock(struct
> */
> }
>
> +out:
> /*
> * The cmpxchg() or xchg() call before coming here provides the
> * acquire semantics for locking.
> */
> + return atomic_read(&lock->val);
> }
>
> +static const struct queue_ops pv_ops = {
> + .init_node = pv_init_node,
> + .trylock = pv_hybrid_queued_unfair_trylock,
> + .wait_node = pv_wait_node,
> + .wait_head_or_lock = pv_wait_head_or_lock,
> + .kick_node = pv_kick_node,
> +};
> +
> /*
> * PV versions of the unlock fastpath and slowpath functions to be used
> * instead of queued_spin_unlock().
> @@ -756,18 +766,18 @@ __visible void __pv_queued_spin_unlock(s
> EXPORT_SYMBOL(__pv_queued_spin_unlock);
> #endif
>
> -#else /* CONFIG_PARAVIRT_SPINLOCKS */
> -static __always_inline void pv_init_node(struct qnode *node) { }
> -static __always_inline void pv_wait_node(struct qnode *node,
> - struct qnode *prev) { }
> -static __always_inline void pv_kick_node(struct qspinlock *lock,
> - struct qnode *node) { }
> -static __always_inline void pv_wait_head_or_lock(struct qspinlock *lock,
> - struct qnode *node) { }
> -static __always_inline bool pv_hybrid_queued_unfair_trylock(struct qspinlock *lock) { BUILD_BUG(); }
> #endif /* CONFIG_PARAVIRT_SPINLOCKS */
>
> -static inline void queued_spin_lock_mcs_queue(struct qspinlock *lock, bool paravirt)
> +struct queue_ops {
> + void (*init_node)(struct qnode *node);
> + bool (*trylock)(struct qspinlock *lock);
> + void (*wait_node)(struct qnode *node, struct qnode *prev);
> + u32 (*wait_head_or_lock)(struct qspinlock *lock, struct qnode *node);
> + void (*kick_node)(struct qspinlock *lock, struct qnode *node);
> +};
> +
> +static __always_inline
> +void queued_spin_lock_mcs_queue(struct qspinlock *lock, const struct queue_ops *ops)
> {
> struct qnode *prev, *next, *node;
> u32 val, old, tail;
> @@ -813,16 +823,16 @@ static inline void queued_spin_lock_mcs_
>
> node->locked = 0;
> node->next = NULL;
> - if (paravirt)
> - pv_init_node(node);
> + if (ops && ops->init_node)
> + ops->init_node(node);
>
> /*
> * We touched a (possibly) cold cacheline in the per-cpu queue node;
> * attempt the trylock once more in the hope someone let go while we
> * weren't watching.
> */
> - if (paravirt) {
> - if (pv_hybrid_queued_unfair_trylock(lock))
> + if (ops && ops->trylock) {
> + if (ops->trylock(lock))
> goto release;
> } else {
> if (queued_spin_trylock(lock))
> @@ -857,8 +867,8 @@ static inline void queued_spin_lock_mcs_
> WRITE_ONCE(prev->next, node);
>
> /* Wait for mcs node lock to be released */
> - if (paravirt)
> - pv_wait_node(node, prev);
> + if (ops && ops->wait_node)
> + ops->wait_node(node, prev);
> else
> smp_cond_load_acquire(&node->locked, VAL);
>
> @@ -893,12 +903,11 @@ static inline void queued_spin_lock_mcs_
> * If PV isn't active, 0 will be returned instead.
> *
> */
> - if (paravirt) {
> - pv_wait_head_or_lock(lock, node);
> - val = atomic_read(&lock->val);
> + if (ops && ops->wait_head_or_lock) {
> + val = ops->wait_head_or_lock(lock, node);
> } else {
> val = atomic_cond_read_acquire(&lock->val,
> - !(VAL & _Q_LOCKED_PENDING_MASK));
> + !(VAL & _Q_LOCKED_PENDING_MASK));
> }
>
> /*
> @@ -1049,14 +1058,14 @@ void queued_spin_lock_slowpath(struct qs
> */
> queue:
> lockevent_inc(lock_slowpath);
> - queued_spin_lock_mcs_queue(lock, false);
> + queued_spin_lock_mcs_queue(lock, NULL);
> }
> EXPORT_SYMBOL(queued_spin_lock_slowpath);
>
> #ifdef CONFIG_PARAVIRT_SPINLOCKS
> void __pv_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
> {
> - queued_spin_lock_mcs_queue(lock, true);
> + queued_spin_lock_mcs_queue(lock, &pv_ops);
> }
> EXPORT_SYMBOL(__pv_queued_spin_lock_slowpath);
>
>