2022-07-04 15:06:28

by Nicholas Piggin

Subject: [PATCH 05/13] locking/qspinlock: be less clever with the preprocessor

Stop qspinlock.c including itself and avoid most of the function
renaming with the preprocessor.

This is mostly done by having the common slowpath code take a 'bool
paravirt' argument and adjusting code based on that.

Signed-off-by: Nicholas Piggin <[email protected]>
---
kernel/locking/qspinlock.c | 116 ++++++++++++----------------
kernel/locking/qspinlock_paravirt.h | 10 +--
2 files changed, 52 insertions(+), 74 deletions(-)

diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index 8f2173e22479..b96c58ca51de 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -11,8 +11,6 @@
* Peter Zijlstra <[email protected]>
*/

-#ifndef _GEN_PV_LOCK_SLOWPATH
-
#include <linux/smp.h>
#include <linux/bug.h>
#include <linux/cpumask.h>
@@ -285,35 +283,21 @@ static __always_inline void set_locked(struct qspinlock *lock)
WRITE_ONCE(lock->locked, _Q_LOCKED_VAL);
}

-
-/*
- * Generate the native code for queued_spin_unlock_slowpath(); provide NOPs for
- * all the PV callbacks.
- */
-
-static __always_inline void __pv_init_node(struct qnode *node) { }
-static __always_inline void __pv_wait_node(struct qnode *node,
- struct qnode *prev) { }
-static __always_inline void __pv_kick_node(struct qspinlock *lock,
- struct qnode *node) { }
-static __always_inline u32 __pv_wait_head_or_lock(struct qspinlock *lock,
- struct qnode *node)
- { return 0; }
-
-#define pv_enabled() false
-
-#define pv_init_node __pv_init_node
-#define pv_wait_node __pv_wait_node
-#define pv_kick_node __pv_kick_node
-#define pv_wait_head_or_lock __pv_wait_head_or_lock
-
#ifdef CONFIG_PARAVIRT_SPINLOCKS
-#define queued_spin_lock_slowpath native_queued_spin_lock_slowpath
-#endif
-
-#endif /* _GEN_PV_LOCK_SLOWPATH */
+#include "qspinlock_paravirt.h"
+#else /* CONFIG_PARAVIRT_SPINLOCKS */
+static __always_inline void pv_init_node(struct qnode *node) { }
+static __always_inline void pv_wait_node(struct qnode *node,
+ struct qnode *prev) { }
+static __always_inline void pv_kick_node(struct qspinlock *lock,
+ struct qnode *node) { }
+static __always_inline u32 pv_wait_head_or_lock(struct qspinlock *lock,
+ struct qnode *node)
+ { return 0; }
+static __always_inline bool pv_hybrid_queued_unfair_trylock(struct qspinlock *lock) { BUILD_BUG(); }
+#endif /* CONFIG_PARAVIRT_SPINLOCKS */

-static inline void queued_spin_lock_mcs_queue(struct qspinlock *lock)
+static inline void queued_spin_lock_mcs_queue(struct qspinlock *lock, bool paravirt)
{
struct qnode *prev, *next, *node;
u32 val, old, tail;
@@ -338,8 +322,13 @@ static inline void queued_spin_lock_mcs_queue(struct qspinlock *lock)
*/
if (unlikely(idx >= MAX_NODES)) {
lockevent_inc(lock_no_node);
- while (!queued_spin_trylock(lock))
- cpu_relax();
+ if (paravirt) {
+ while (!pv_hybrid_queued_unfair_trylock(lock))
+ cpu_relax();
+ } else {
+ while (!queued_spin_trylock(lock))
+ cpu_relax();
+ }
goto release;
}

@@ -359,15 +348,21 @@ static inline void queued_spin_lock_mcs_queue(struct qspinlock *lock)

node->locked = 0;
node->next = NULL;
- pv_init_node(node);
+ if (paravirt)
+ pv_init_node(node);

/*
* We touched a (possibly) cold cacheline in the per-cpu queue node;
* attempt the trylock once more in the hope someone let go while we
* weren't watching.
*/
- if (queued_spin_trylock(lock))
- goto release;
+ if (paravirt) {
+ if (pv_hybrid_queued_unfair_trylock(lock))
+ goto release;
+ } else {
+ if (queued_spin_trylock(lock))
+ goto release;
+ }

/*
* Ensure that the initialisation of @node is complete before we
@@ -396,7 +391,8 @@ static inline void queued_spin_lock_mcs_queue(struct qspinlock *lock)
/* Link @node into the waitqueue. */
WRITE_ONCE(prev->next, node);

- pv_wait_node(node, prev);
+ if (paravirt)
+ pv_wait_node(node, prev);
/* Wait for mcs node lock to be released */
smp_cond_load_acquire(&node->locked, VAL);

@@ -432,8 +428,10 @@ static inline void queued_spin_lock_mcs_queue(struct qspinlock *lock)
* If PV isn't active, 0 will be returned instead.
*
*/
- if ((val = pv_wait_head_or_lock(lock, node)))
- goto locked;
+ if (paravirt) {
+ if ((val = pv_wait_head_or_lock(lock, node)))
+ goto locked;
+ }

val = atomic_cond_read_acquire(&lock->val, !(VAL & _Q_LOCKED_PENDING_MASK));

@@ -478,7 +476,8 @@ static inline void queued_spin_lock_mcs_queue(struct qspinlock *lock)
next = smp_cond_load_relaxed(&node->next, (VAL));

smp_store_release(&next->locked, 1); /* unlock the mcs node lock */
- pv_kick_node(lock, next);
+ if (paravirt)
+ pv_kick_node(lock, next);

release:
trace_contention_end(lock, 0);
@@ -510,13 +509,12 @@ static inline void queued_spin_lock_mcs_queue(struct qspinlock *lock)
* contended : (*,x,y) +--> (*,0,0) ---> (*,0,1) -' :
* queue : ^--' :
*/
+#ifdef CONFIG_PARAVIRT_SPINLOCKS
+#define queued_spin_lock_slowpath native_queued_spin_lock_slowpath
+#endif
+
void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
{
- if (pv_enabled()) {
- queued_spin_lock_mcs_queue(lock);
- return;
- }
-
if (virt_spin_lock(lock))
return;

@@ -590,31 +588,17 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
*/
queue:
lockevent_inc(lock_slowpath);
- queued_spin_lock_mcs_queue(lock);
+ queued_spin_lock_mcs_queue(lock, false);
}
EXPORT_SYMBOL(queued_spin_lock_slowpath);

-/*
- * Generate the paravirt code for queued_spin_unlock_slowpath().
- */
-#if !defined(_GEN_PV_LOCK_SLOWPATH) && defined(CONFIG_PARAVIRT_SPINLOCKS)
-#define _GEN_PV_LOCK_SLOWPATH
-
-#undef pv_enabled
-#define pv_enabled() true
-
-#undef pv_init_node
-#undef pv_wait_node
-#undef pv_kick_node
-#undef pv_wait_head_or_lock
-
-#define queued_spin_lock_mcs_queue __pv_queued_spin_lock_mcs_queue
-
-#undef queued_spin_lock_slowpath
-#define queued_spin_lock_slowpath __pv_queued_spin_lock_slowpath
-
-#include "qspinlock_paravirt.h"
-#include "qspinlock.c"
+#ifdef CONFIG_PARAVIRT_SPINLOCKS
+#undef queued_spin_lock_slowpath
+void __pv_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
+{
+ queued_spin_lock_mcs_queue(lock, true);
+}
+EXPORT_SYMBOL(__pv_queued_spin_lock_slowpath);

bool nopvspin __initdata;
static __init int parse_nopvspin(char *arg)
@@ -623,4 +607,4 @@ static __init int parse_nopvspin(char *arg)
return 0;
}
early_param("nopvspin", parse_nopvspin);
-#endif
+#endif /* CONFIG_PARAVIRT_SPINLOCKS */
diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
index 97385861adc2..f1922e3a0f7d 100644
--- a/kernel/locking/qspinlock_paravirt.h
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -1,8 +1,4 @@
/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _GEN_PV_LOCK_SLOWPATH
-#error "do not include this file"
-#endif
-
#include <linux/hash.h>
#include <linux/memblock.h>
#include <linux/debug_locks.h>
@@ -50,9 +46,8 @@ enum vcpu_state {
/*
* Hybrid PV queued/unfair lock
*
- * By replacing the regular queued_spin_trylock() with the function below,
- * it will be called once when a lock waiter enter the PV slowpath before
- * being queued.
+ * This function is called once when a lock waiter enters the PV slowpath
+ * before being queued.
*
* The pending bit is set by the queue head vCPU of the MCS wait queue in
* pv_wait_head_or_lock() to signal that it is ready to spin on the lock.
@@ -71,7 +66,6 @@ enum vcpu_state {
* queued lock (no lock starvation) and an unfair lock (good performance
* on not heavily contended locks).
*/
-#define queued_spin_trylock(l) pv_hybrid_queued_unfair_trylock(l)
static inline bool pv_hybrid_queued_unfair_trylock(struct qspinlock *lock)
{
/*
--
2.35.1


2022-07-05 17:28:41

by Peter Zijlstra

Subject: Re: [PATCH 05/13] locking/qspinlock: be less clever with the preprocessor

On Tue, Jul 05, 2022 at 12:38:12AM +1000, Nicholas Piggin wrote:
> Stop qspinlock.c including itself and avoid most of the function
> renaming with the preprocessor.
>
> This is mostly done by having the common slowpath code take a 'bool
> paravirt' argument and adjusting code based on that.

What does code-gen do? Is it clever enough to constant fold the lot?

Should we be using __always_inline to ensure the compiler doesn't decide
against inlining because $raisin and then emit extra conditionals?
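
For reference, the folding being asked about boils down to the following
pattern (a minimal sketch with made-up names, not the qspinlock code): with
forced inlining and a compile-time-constant flag at every call site, the
compiler drops the dead branch and neither wrapper passes or tests
'paravirt' at run time.

#include <linux/compiler.h>
#include <linux/types.h>

/* Hypothetical stand-ins, not the real qspinlock functions */
static __always_inline int common_slowpath(int x, bool paravirt)
{
        if (paravirt)
                return x + 1;   /* stands in for the PV-only work */
        return x + 2;           /* stands in for the native-only work */
}

int native_slowpath(int x)
{
        return common_slowpath(x, false);       /* folds to 'return x + 2;' */
}

int pv_slowpath(int x)
{
        return common_slowpath(x, true);        /* folds to 'return x + 1;' */
}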

2022-07-05 20:09:27

by Waiman Long

Subject: Re: [PATCH 05/13] locking/qspinlock: be less clever with the preprocessor

On 7/4/22 10:38, Nicholas Piggin wrote:
> Stop qspinlock.c including itself and avoid most of the function
> renaming with the preprocessor.
>
> This is mostly done by having the common slowpath code take a 'bool
> paravirt' argument and adjusting code based on that.
>
> Signed-off-by: Nicholas Piggin <[email protected]>
> ---
> kernel/locking/qspinlock.c | 116 ++++++++++++----------------
> kernel/locking/qspinlock_paravirt.h | 10 +--
> 2 files changed, 52 insertions(+), 74 deletions(-)
>
> diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
> index 8f2173e22479..b96c58ca51de 100644
> --- a/kernel/locking/qspinlock.c
> +++ b/kernel/locking/qspinlock.c
> @@ -11,8 +11,6 @@
> * Peter Zijlstra <[email protected]>
> */
>
> -#ifndef _GEN_PV_LOCK_SLOWPATH
> -
> #include <linux/smp.h>
> #include <linux/bug.h>
> #include <linux/cpumask.h>
> @@ -285,35 +283,21 @@ static __always_inline void set_locked(struct qspinlock *lock)
> WRITE_ONCE(lock->locked, _Q_LOCKED_VAL);
> }
>
> -
> -/*
> - * Generate the native code for queued_spin_unlock_slowpath(); provide NOPs for
> - * all the PV callbacks.
> - */
> -
> -static __always_inline void __pv_init_node(struct qnode *node) { }
> -static __always_inline void __pv_wait_node(struct qnode *node,
> - struct qnode *prev) { }
> -static __always_inline void __pv_kick_node(struct qspinlock *lock,
> - struct qnode *node) { }
> -static __always_inline u32 __pv_wait_head_or_lock(struct qspinlock *lock,
> - struct qnode *node)
> - { return 0; }
> -
> -#define pv_enabled() false
> -
> -#define pv_init_node __pv_init_node
> -#define pv_wait_node __pv_wait_node
> -#define pv_kick_node __pv_kick_node
> -#define pv_wait_head_or_lock __pv_wait_head_or_lock
> -
> #ifdef CONFIG_PARAVIRT_SPINLOCKS
> -#define queued_spin_lock_slowpath native_queued_spin_lock_slowpath
> -#endif
> -
> -#endif /* _GEN_PV_LOCK_SLOWPATH */
> +#include "qspinlock_paravirt.h"
> +#else /* CONFIG_PARAVIRT_SPINLOCKS */
> +static __always_inline void pv_init_node(struct qnode *node) { }
> +static __always_inline void pv_wait_node(struct qnode *node,
> + struct qnode *prev) { }
> +static __always_inline void pv_kick_node(struct qspinlock *lock,
> + struct qnode *node) { }
> +static __always_inline u32 pv_wait_head_or_lock(struct qspinlock *lock,
> + struct qnode *node)
> + { return 0; }
> +static __always_inline bool pv_hybrid_queued_unfair_trylock(struct qspinlock *lock) { BUILD_BUG(); }
> +#endif /* CONFIG_PARAVIRT_SPINLOCKS */
>
> -static inline void queued_spin_lock_mcs_queue(struct qspinlock *lock)
> +static inline void queued_spin_lock_mcs_queue(struct qspinlock *lock, bool paravirt)

Using "const bool paravirt" may help the compiler generating better code
by eliminating dead one, if it is not doing that already.

> {
> struct qnode *prev, *next, *node;
> u32 val, old, tail;
> @@ -338,8 +322,13 @@ static inline void queued_spin_lock_mcs_queue(struct qspinlock *lock)
> */
> if (unlikely(idx >= MAX_NODES)) {
> lockevent_inc(lock_no_node);
> - while (!queued_spin_trylock(lock))
> - cpu_relax();
> + if (paravirt) {
> + while (!pv_hybrid_queued_unfair_trylock(lock))
> + cpu_relax();
> + } else {
> + while (!queued_spin_trylock(lock))
> + cpu_relax();
> + }

The code will look a bit better if you add the following helper function
and use it instead.

static inline bool queued_spin_trylock_common(struct qspinlock *lock,
                                              const bool paravirt)
{
        if (paravirt)
                return pv_hybrid_queued_unfair_trylock(lock);
        else
                return queued_spin_trylock(lock);
}
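
The two call sites in the slowpath would then collapse to something like
this (a sketch, reusing the helper suggested above):

        if (unlikely(idx >= MAX_NODES)) {
                lockevent_inc(lock_no_node);
                while (!queued_spin_trylock_common(lock, paravirt))
                        cpu_relax();
                goto release;
        }

        /* ... and later, after initializing the node: */
        if (queued_spin_trylock_common(lock, paravirt))
                goto release;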

Cheers,
Longman

2022-07-12 00:51:11

by Nicholas Piggin

Subject: Re: [PATCH 05/13] locking/qspinlock: be less clever with the preprocessor

Excerpts from Waiman Long's message of July 6, 2022 6:02 am:
> On 7/4/22 10:38, Nicholas Piggin wrote:
>> Stop qspinlock.c including itself and avoid most of the function
>> renaming with the preprocessor.
>>
>> This is mostly done by having the common slowpath code take a 'bool
>> paravirt' argument and adjusting code based on that.
>>
>> Signed-off-by: Nicholas Piggin <[email protected]>
>> ---
>> kernel/locking/qspinlock.c | 116 ++++++++++++----------------
>> kernel/locking/qspinlock_paravirt.h | 10 +--
>> 2 files changed, 52 insertions(+), 74 deletions(-)
>>
>> diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
>> index 8f2173e22479..b96c58ca51de 100644
>> --- a/kernel/locking/qspinlock.c
>> +++ b/kernel/locking/qspinlock.c
>> @@ -11,8 +11,6 @@
>> * Peter Zijlstra <[email protected]>
>> */
>>
>> -#ifndef _GEN_PV_LOCK_SLOWPATH
>> -
>> #include <linux/smp.h>
>> #include <linux/bug.h>
>> #include <linux/cpumask.h>
>> @@ -285,35 +283,21 @@ static __always_inline void set_locked(struct qspinlock *lock)
>> WRITE_ONCE(lock->locked, _Q_LOCKED_VAL);
>> }
>>
>> -
>> -/*
>> - * Generate the native code for queued_spin_unlock_slowpath(); provide NOPs for
>> - * all the PV callbacks.
>> - */
>> -
>> -static __always_inline void __pv_init_node(struct qnode *node) { }
>> -static __always_inline void __pv_wait_node(struct qnode *node,
>> - struct qnode *prev) { }
>> -static __always_inline void __pv_kick_node(struct qspinlock *lock,
>> - struct qnode *node) { }
>> -static __always_inline u32 __pv_wait_head_or_lock(struct qspinlock *lock,
>> - struct qnode *node)
>> - { return 0; }
>> -
>> -#define pv_enabled() false
>> -
>> -#define pv_init_node __pv_init_node
>> -#define pv_wait_node __pv_wait_node
>> -#define pv_kick_node __pv_kick_node
>> -#define pv_wait_head_or_lock __pv_wait_head_or_lock
>> -
>> #ifdef CONFIG_PARAVIRT_SPINLOCKS
>> -#define queued_spin_lock_slowpath native_queued_spin_lock_slowpath
>> -#endif
>> -
>> -#endif /* _GEN_PV_LOCK_SLOWPATH */
>> +#include "qspinlock_paravirt.h"
>> +#else /* CONFIG_PARAVIRT_SPINLOCKS */
>> +static __always_inline void pv_init_node(struct qnode *node) { }
>> +static __always_inline void pv_wait_node(struct qnode *node,
>> + struct qnode *prev) { }
>> +static __always_inline void pv_kick_node(struct qspinlock *lock,
>> + struct qnode *node) { }
>> +static __always_inline u32 pv_wait_head_or_lock(struct qspinlock *lock,
>> + struct qnode *node)
>> + { return 0; }
>> +static __always_inline bool pv_hybrid_queued_unfair_trylock(struct qspinlock *lock) { BUILD_BUG(); }
>> +#endif /* CONFIG_PARAVIRT_SPINLOCKS */
>>
>> -static inline void queued_spin_lock_mcs_queue(struct qspinlock *lock)
>> +static inline void queued_spin_lock_mcs_queue(struct qspinlock *lock, bool paravirt)
>
> Using "const bool paravirt" may help the compiler generating better code
> by eliminating dead one, if it is not doing that already.

I think adding always_inline per Peter's suggestion should be enough.
The compiler should be able to see it's constant. Actually I'm surprised
attribute pure is helpful within a compilation unit; I would think the
compiler should be able to deduce that too.

That said, no problem with massaging things to help the compiler DTRT.
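
The same folding is also what makes the BUILD_BUG() stub for
pv_hybrid_queued_unfair_trylock() in the !CONFIG_PARAVIRT_SPINLOCKS case
safe: if the dead branch were not eliminated at compile time, BUILD_BUG()
would fail the build. A minimal sketch of that idiom, with simplified
stand-in names rather than the real functions:

#include <linux/build_bug.h>
#include <linux/compiler.h>
#include <linux/types.h>

static __always_inline bool pv_trylock_stub(void)
{
        BUILD_BUG();    /* only reachable if the fold below fails */
        return false;
}

static __always_inline bool do_trylock(bool paravirt)
{
        if (paravirt)
                return pv_trylock_stub();       /* dead when paravirt == false */
        return true;
}

bool native_trylock(void)
{
        /*
         * 'false' is a compile-time constant here, so the PV branch and
         * the BUILD_BUG() inside it are eliminated entirely.
         */
        return do_trylock(false);
}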

>
>> {
>> struct qnode *prev, *next, *node;
>> u32 val, old, tail;
>> @@ -338,8 +322,13 @@ static inline void queued_spin_lock_mcs_queue(struct qspinlock *lock)
>> */
>> if (unlikely(idx >= MAX_NODES)) {
>> lockevent_inc(lock_no_node);
>> - while (!queued_spin_trylock(lock))
>> - cpu_relax();
>> + if (paravirt) {
>> + while (!pv_hybrid_queued_unfair_trylock(lock))
>> + cpu_relax();
>> + } else {
>> + while (!queued_spin_trylock(lock))
>> + cpu_relax();
>> + }
>
> The code will look a bit better if you add the following helper function
> and use it instead.
>
> static inline bool queued_spin_trylock_common(struct qspinlock *lock,
>                                               const bool paravirt)
> {
>         if (paravirt)
>                 return pv_hybrid_queued_unfair_trylock(lock);
>         else
>                 return queued_spin_trylock(lock);
> }

I did actually change that later on to always just use the basic
trylock. This patch was intended to be a more mechanical conversion that
didn't change too much.

Thanks,
Nick

2022-07-12 01:32:57

by Nicholas Piggin

Subject: Re: [PATCH 05/13] locking/qspinlock: be less clever with the preprocessor

Excerpts from Peter Zijlstra's message of July 6, 2022 3:08 am:
> On Tue, Jul 05, 2022 at 12:38:12AM +1000, Nicholas Piggin wrote:
>> Stop qspinlock.c including itself and avoid most of the function
>> renaming with the preprocessor.
>>
>> This is mostly done by having the common slowpath code take a 'bool
>> paravirt' argument and adjusting code based on that.
>
> What does code-gen do? Is it clever enough to constant fold the lot?
>
> Should we be using __always_inline to ensure the compiler doesn't decide
> against inlining because $raisin and then emit extra conditionals?
>

It seems to fold it. The code is just different enough to make it hard
to follow exactly what the asm differences are, but it doesn't seem to
pass 'paravirt' anywhere.

Yes, it certainly does need __always_inline. Only one path will ever
be used at runtime, so any icache sharing decision by the compiler
would be wrong (and we don't care about image size very much).
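
One way to double-check the fold without staring at the asm would be a
build-time assertion in the common function; a sketch (not part of the
patch, names are made up):

#include <linux/build_bug.h>
#include <linux/compiler.h>
#include <linux/types.h>

static int pv_only_count;       /* stand-in for the PV-only work */

static __always_inline void mcs_queue_sketch(bool paravirt)
{
        /*
         * Fails the build if 'paravirt' is ever not a compile-time
         * constant at an (inlined) call site, i.e. if it could end up
         * being passed and tested at run time.
         */
        BUILD_BUG_ON(!__builtin_constant_p(paravirt));

        if (paravirt)
                pv_only_count++;        /* stands in for the pv_*() calls */
}

void native_entry(void)
{
        mcs_queue_sketch(false);
}

void pv_entry(void)
{
        mcs_queue_sketch(true);
}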

Thanks,
Nick