LinuxLists.cc - [PATCH v2] locking/pvqspinlock: Add lock holder CPU argument to pv

2016-04-14 18:42:34

Subject: [PATCH v2] locking/pvqspinlock: Add lock holder CPU argument to pv_wait()

Pan Xinhui was asking for a lock holder cpu argument in pv_wait()
to help the porting of pvqspinlock to PPC. The new argument will can
potentially help hypervisor expediate the execution of the critical
section so that the lock holder vCPU can release the lock sooner.

This patch does just that by storing the previous node vCPU number.
In pv_wait_head_or_lock(), pv_wait() will be called with that vCPU
number as it is likely to be the lock holder.

In pv_wait_node(), the newly added pv_lookup_hash() function will
be called to look up the queue head and pass in the lock holder vCPU
number stored there.

This patch introduces negligible overhead to the current pvqspinlock
code. The extra lockcpu argument isn't currently used in x86
architecture.

Suggested-by: Pan Xinhui <[email protected]>
Signed-off-by: Waiman Long <[email protected]>
---
v1->v2:
- In pv_wait_node(), lookup the hash table for the queue head and pass
the lock holder cpu number to pv_wait().

arch/x86/include/asm/paravirt.h | 4 +-
arch/x86/include/asm/paravirt_types.h | 2 +-
arch/x86/kernel/kvm.c | 2 +-
arch/x86/xen/spinlock.c | 2 +-
kernel/locking/qspinlock.c | 5 ++-
kernel/locking/qspinlock_paravirt.h | 58 ++++++++++++++++++++++++++++++---
kernel/locking/qspinlock_stat.h | 8 ++--
7 files changed, 65 insertions(+), 16 deletions(-)

diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 601f1b8..b89eccf 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -676,9 +676,9 @@ static __always_inline void pv_queued_spin_unlock(struct qspinlock *lock)
PVOP_VCALLEE1(pv_lock_ops.queued_spin_unlock, lock);
}

-static __always_inline void pv_wait(u8 *ptr, u8 val)
+static __always_inline void pv_wait(u8 *ptr, u8 val, int lockcpu)
{
- PVOP_VCALL2(pv_lock_ops.wait, ptr, val);
+ PVOP_VCALL3(pv_lock_ops.wait, ptr, val, lockcpu);
}

static __always_inline void pv_kick(int cpu)
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index e8c2326..2fc26c1 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -312,7 +312,7 @@ struct pv_lock_ops {
void (*queued_spin_lock_slowpath)(struct qspinlock *lock, u32 val);
struct paravirt_callee_save queued_spin_unlock;

- void (*wait)(u8 *ptr, u8 val);
+ void (*wait)(u8 *ptr, u8 val, int lockcpu);
void (*kick)(int cpu);
#else /* !CONFIG_QUEUED_SPINLOCKS */
struct paravirt_callee_save lock_spinning;
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index dc1207e..47ab4e1 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -590,7 +590,7 @@ static void kvm_kick_cpu(int cpu)

#include <asm/qspinlock.h>

-static void kvm_wait(u8 *ptr, u8 val)
+static void kvm_wait(u8 *ptr, u8 val, int lockcpu)
{
unsigned long flags;

diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c
index 9e2ba5c..6f78c41 100644
--- a/arch/x86/xen/spinlock.c
+++ b/arch/x86/xen/spinlock.c
@@ -33,7 +33,7 @@ static void xen_qlock_kick(int cpu)
/*
* Halt the current CPU & release it back to the host
*/
-static void xen_qlock_wait(u8 *byte, u8 val)
+static void xen_qlock_wait(u8 *byte, u8 val, int lockcpu)
{
int irq = __this_cpu_read(lock_kicker_irq);

diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index ce2f75e..99f31e4 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -248,7 +248,8 @@ static __always_inline void set_locked(struct qspinlock *lock)
*/

static __always_inline void __pv_init_node(struct mcs_spinlock *node) { }
-static __always_inline void __pv_wait_node(struct mcs_spinlock *node,
+static __always_inline void __pv_wait_node(struct qspinlock *lock,
+ struct mcs_spinlock *node,
struct mcs_spinlock *prev) { }
static __always_inline void __pv_kick_node(struct qspinlock *lock,
struct mcs_spinlock *node) { }
@@ -407,7 +408,7 @@ queue:
prev = decode_tail(old);
WRITE_ONCE(prev->next, node);

- pv_wait_node(node, prev);
+ pv_wait_node(lock, node, prev);
arch_mcs_spin_lock_contended(&node->locked);

/*
diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
index 21ede57..895224e 100644
--- a/kernel/locking/qspinlock_paravirt.h
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -51,6 +51,7 @@ struct pv_node {
struct mcs_spinlock __res[3];

int cpu;
+ int prev_cpu; /* Previous node cpu */
u8 state;
};

@@ -156,8 +157,7 @@ static __always_inline int trylock_clear_pending(struct qspinlock *lock)
* 256 (64-bit) or 512 (32-bit) to fully utilize a 4k page.
*
* Since we should not be holding locks from NMI context (very rare indeed) the
- * max load factor is 0.75, which is around the point where open addressing
- * breaks down.
+ * max load factor is 0.75.
*
*/
struct pv_hash_entry {
@@ -251,6 +251,31 @@ static struct pv_node *pv_unhash(struct qspinlock *lock)
}

/*
+ * Look up the given lock in the hash table
+ * Return the pv_node if found, NULL otherwise
+ */
+static struct pv_node *pv_lookup_hash(struct qspinlock *lock)
+{
+ unsigned long offset, hash = hash_ptr(lock, pv_lock_hash_bits);
+ struct pv_hash_entry *he;
+
+ for_each_hash_entry(he, offset, hash) {
+ struct qspinlock *l = READ_ONCE(he->lock);
+
+ if (l == lock)
+ return READ_ONCE(he->node);
+ /*
+ * Presence of an empty slot signal the end of search. We
+ * may miss the entry, but that will limit the amount of
+ * time doing the search when the desired entry isn't there.
+ */
+ else if (!l)
+ break;
+ }
+ return NULL;
+}
+
+/*
* Return true if when it is time to check the previous node which is not
* in a running state.
*/
@@ -275,6 +300,7 @@ static void pv_init_node(struct mcs_spinlock *node)

pn->cpu = smp_processor_id();
pn->state = vcpu_running;
+ pn->prev_cpu = -1;
}

/*
@@ -282,7 +308,8 @@ static void pv_init_node(struct mcs_spinlock *node)
* pv_kick_node() is used to set _Q_SLOW_VAL and fill in hash table on its
* behalf.
*/
-static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev)
+static void pv_wait_node(struct qspinlock *lock, struct mcs_spinlock *node,
+ struct mcs_spinlock *prev)
{
struct pv_node *pn = (struct pv_node *)node;
struct pv_node *pp = (struct pv_node *)prev;
@@ -290,6 +317,8 @@ static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev)
int loop;
bool wait_early;

+ pn->prev_cpu = pp->cpu; /* Save previous node vCPU */
+
/* waitcnt processing will be compiled out if !QUEUED_LOCK_STAT */
for (;; waitcnt++) {
for (wait_early = false, loop = SPIN_THRESHOLD; loop; loop--) {
@@ -314,10 +343,21 @@ static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev)
smp_store_mb(pn->state, vcpu_halted);

if (!READ_ONCE(node->locked)) {
+ struct pv_node *hn;
+
qstat_inc(qstat_pv_wait_node, true);
qstat_inc(qstat_pv_wait_again, waitcnt);
qstat_inc(qstat_pv_wait_early, wait_early);
- pv_wait(&pn->state, vcpu_halted);
+
+ /*
+ * We try to locate the queue head pv_node by looking
+ * up the hash table. If it is not found, use the
+ * CPU in the previous node instead.
+ */
+ hn = pv_lookup_hash(lock);
+ if (!hn)
+ hn = pn;
+ pv_wait(&pn->state, vcpu_halted, hn->prev_cpu);
}

/*
@@ -453,7 +493,15 @@ pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node)
WRITE_ONCE(pn->state, vcpu_halted);
qstat_inc(qstat_pv_wait_head, true);
qstat_inc(qstat_pv_wait_again, waitcnt);
- pv_wait(&l->locked, _Q_SLOW_VAL);
+
+ /*
+ * Pass in the previous node vCPU nmber which is likely to be
+ * the lock holder vCPU. This additional information may help
+ * the hypervisor to give more resource to that vCPU so that
+ * it can release the lock faster. With lock stealing,
+ * however, that vCPU may not be the actual lock holder.
+ */
+ pv_wait(&l->locked, _Q_SLOW_VAL, pn->prev_cpu);

/*
* The unlocker should have freed the lock before kicking the
diff --git a/kernel/locking/qspinlock_stat.h b/kernel/locking/qspinlock_stat.h
index eb2a2c9..8728348 100644
--- a/kernel/locking/qspinlock_stat.h
+++ b/kernel/locking/qspinlock_stat.h
@@ -266,12 +266,12 @@ static inline void __pv_kick(int cpu)
/*
* Replacement function for pv_wait()
*/
-static inline void __pv_wait(u8 *ptr, u8 val)
+static inline void __pv_wait(u8 *ptr, u8 val, int lockcpu)
{
u64 *pkick_time = this_cpu_ptr(&pv_kick_time);

*pkick_time = 0;
- pv_wait(ptr, val);
+ pv_wait(ptr, val, lockcpu);
if (*pkick_time) {
this_cpu_add(qstats[qstat_pv_latency_wake],
sched_clock() - *pkick_time);
@@ -279,8 +279,8 @@ static inline void __pv_wait(u8 *ptr, u8 val)
}
}

-#define pv_kick(c) __pv_kick(c)
-#define pv_wait(p, v) __pv_wait(p, v)
+#define pv_kick(c) __pv_kick(c)
+#define pv_wait(p, v, c) __pv_wait(p, v, c)

#else /* CONFIG_QUEUED_LOCK_STAT */

--
1.7.1

2016-04-20 12:08:15

by Peter Zijlstra

[permalink] [raw]

Subject: Re: [PATCH v2] locking/pvqspinlock: Add lock holder CPU argument to pv_wait()

On Thu, Apr 14, 2016 at 02:41:58PM -0400, Waiman Long wrote:
> Pan Xinhui was asking for a lock holder cpu argument in pv_wait()
> to help the porting of pvqspinlock to PPC. The new argument will can
> potentially help hypervisor expediate the execution of the critical
> section so that the lock holder vCPU can release the lock sooner.
>
> This patch does just that by storing the previous node vCPU number.
> In pv_wait_head_or_lock(), pv_wait() will be called with that vCPU
> number as it is likely to be the lock holder.
>
> In pv_wait_node(), the newly added pv_lookup_hash() function will
> be called to look up the queue head and pass in the lock holder vCPU
> number stored there.
>
> This patch introduces negligible overhead to the current pvqspinlock
> code. The extra lockcpu argument isn't currently used in x86
> architecture.

This Changelog is completely useless; it does not explain how this
works at all.

> diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
> index ce2f75e..99f31e4 100644
> --- a/kernel/locking/qspinlock.c
> +++ b/kernel/locking/qspinlock.c
> @@ -248,7 +248,8 @@ static __always_inline void set_locked(struct qspinlock *lock)
> */
>
> static __always_inline void __pv_init_node(struct mcs_spinlock *node) { }
> -static __always_inline void __pv_wait_node(struct mcs_spinlock *node,
> +static __always_inline void __pv_wait_node(struct qspinlock *lock,
> + struct mcs_spinlock *node,
> struct mcs_spinlock *prev) { }
> static __always_inline void __pv_kick_node(struct qspinlock *lock,
> struct mcs_spinlock *node) { }
> @@ -407,7 +408,7 @@ queue:
> prev = decode_tail(old);
> WRITE_ONCE(prev->next, node);
>
> - pv_wait_node(node, prev);
> + pv_wait_node(lock, node, prev);
> arch_mcs_spin_lock_contended(&node->locked);
>
> /*
> diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
> index 21ede57..895224e 100644
> --- a/kernel/locking/qspinlock_paravirt.h
> +++ b/kernel/locking/qspinlock_paravirt.h
> @@ -51,6 +51,7 @@ struct pv_node {
> struct mcs_spinlock __res[3];
>
> int cpu;
> + int prev_cpu; /* Previous node cpu */

That is a horrible name; what is a 'node cpu'.

> u8 state;
> };
>
> @@ -156,8 +157,7 @@ static __always_inline int trylock_clear_pending(struct qspinlock *lock)
> * 256 (64-bit) or 512 (32-bit) to fully utilize a 4k page.
> *
> * Since we should not be holding locks from NMI context (very rare indeed) the
> - * max load factor is 0.75, which is around the point where open addressing
> - * breaks down.
> + * max load factor is 0.75.

Why? Isn't that true anymore?

> *
> */
> struct pv_hash_entry {
> @@ -251,6 +251,31 @@ static struct pv_node *pv_unhash(struct qspinlock *lock)
> }
>
> /*
> + * Look up the given lock in the hash table
> + * Return the pv_node if found, NULL otherwise
> + */
> +static struct pv_node *pv_lookup_hash(struct qspinlock *lock)
> +{
> + unsigned long offset, hash = hash_ptr(lock, pv_lock_hash_bits);
> + struct pv_hash_entry *he;
> +
> + for_each_hash_entry(he, offset, hash) {
> + struct qspinlock *l = READ_ONCE(he->lock);
> +
> + if (l == lock)

The other loop writes:

if (READ_ONCE(he->lock) == lock)

> + return READ_ONCE(he->node);
> + /*
> + * Presence of an empty slot signal the end of search. We
> + * may miss the entry, but that will limit the amount of
> + * time doing the search when the desired entry isn't there.
> + */
> + else if (!l)
> + break;

That 'else' is entirely pointless. Also, why isn't this: return NULL;

> + }
> + return NULL;

and this BUG() ?

> +}
> +
> +/*
> * Return true if when it is time to check the previous node which is not
> * in a running state.
> */
> @@ -275,6 +300,7 @@ static void pv_init_node(struct mcs_spinlock *node)
>
> pn->cpu = smp_processor_id();
> pn->state = vcpu_running;
> + pn->prev_cpu = -1;

This does not match the struct element order.

> }
>
> /*
> @@ -282,7 +308,8 @@ static void pv_init_node(struct mcs_spinlock *node)
> * pv_kick_node() is used to set _Q_SLOW_VAL and fill in hash table on its
> * behalf.
> */
> -static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev)
> +static void pv_wait_node(struct qspinlock *lock, struct mcs_spinlock *node,
> + struct mcs_spinlock *prev)
> {
> struct pv_node *pn = (struct pv_node *)node;
> struct pv_node *pp = (struct pv_node *)prev;
> @@ -290,6 +317,8 @@ static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev)
> int loop;
> bool wait_early;
>
> + pn->prev_cpu = pp->cpu; /* Save previous node vCPU */

again a useless comment.

> +
> /* waitcnt processing will be compiled out if !QUEUED_LOCK_STAT */
> for (;; waitcnt++) {
> for (wait_early = false, loop = SPIN_THRESHOLD; loop; loop--) {
> @@ -314,10 +343,21 @@ static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev)
> smp_store_mb(pn->state, vcpu_halted);
>
> if (!READ_ONCE(node->locked)) {
> + struct pv_node *hn;
> +
> qstat_inc(qstat_pv_wait_node, true);
> qstat_inc(qstat_pv_wait_again, waitcnt);
> qstat_inc(qstat_pv_wait_early, wait_early);
> - pv_wait(&pn->state, vcpu_halted);
> +
> + /*
> + * We try to locate the queue head pv_node by looking
> + * up the hash table. If it is not found, use the
> + * CPU in the previous node instead.
> + */
> + hn = pv_lookup_hash(lock);
> + if (!hn)
> + hn = pn;

This is potentially expensive... it does not explain why this lookup can
fail etc.. nor mentioned that lock stealing caveat.

> + pv_wait(&pn->state, vcpu_halted, hn->prev_cpu);
> }
>
> /*
> @@ -453,7 +493,15 @@ pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node)
> WRITE_ONCE(pn->state, vcpu_halted);
> qstat_inc(qstat_pv_wait_head, true);
> qstat_inc(qstat_pv_wait_again, waitcnt);
> - pv_wait(&l->locked, _Q_SLOW_VAL);
> +
> + /*
> + * Pass in the previous node vCPU nmber which is likely to be
> + * the lock holder vCPU. This additional information may help
> + * the hypervisor to give more resource to that vCPU so that
> + * it can release the lock faster. With lock stealing,
> + * however, that vCPU may not be the actual lock holder.
> + */
> + pv_wait(&l->locked, _Q_SLOW_VAL, pn->prev_cpu);

urgh..

With all the holes in, does it really still matter?

In any case, I would really only want to see this together with the
patches that make use of it, and then still have it have numbers with
and without this thing.

2016-04-20 14:16:32

by Pan Xinhui

[permalink] [raw]

Subject: Re: [PATCH v2] locking/pvqspinlock: Add lock holder CPU argument to pv_wait()

Hello Peter

On 2016年04月20日 20:08, Peter Zijlstra wrote:
> On Thu, Apr 14, 2016 at 02:41:58PM -0400, Waiman Long wrote:
>> Pan Xinhui was asking for a lock holder cpu argument in pv_wait()
>> to help the porting of pvqspinlock to PPC. The new argument will can
>> potentially help hypervisor expediate the execution of the critical
>> section so that the lock holder vCPU can release the lock sooner.
>>
>> This patch does just that by storing the previous node vCPU number.
>> In pv_wait_head_or_lock(), pv_wait() will be called with that vCPU
>> number as it is likely to be the lock holder.
>>
>> In pv_wait_node(), the newly added pv_lookup_hash() function will
>> be called to look up the queue head and pass in the lock holder vCPU
>> number stored there.
>>
>> This patch introduces negligible overhead to the current pvqspinlock
>> code. The extra lockcpu argument isn't currently used in x86
>> architecture.
>
> This Changelog is completely useless; it does not explain how this
> works at all.
>
>
>> diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
>> index ce2f75e..99f31e4 100644
>> --- a/kernel/locking/qspinlock.c
>> +++ b/kernel/locking/qspinlock.c
>> @@ -248,7 +248,8 @@ static __always_inline void set_locked(struct qspinlock *lock)
>> */
>>
>> static __always_inline void __pv_init_node(struct mcs_spinlock *node) { }
>> -static __always_inline void __pv_wait_node(struct mcs_spinlock *node,
>> +static __always_inline void __pv_wait_node(struct qspinlock *lock,
>> + struct mcs_spinlock *node,
>> struct mcs_spinlock *prev) { }
>> static __always_inline void __pv_kick_node(struct qspinlock *lock,
>> struct mcs_spinlock *node) { }
>> @@ -407,7 +408,7 @@ queue:
>> prev = decode_tail(old);
>> WRITE_ONCE(prev->next, node);
>>
>> - pv_wait_node(node, prev);
>> + pv_wait_node(lock, node, prev);
>> arch_mcs_spin_lock_contended(&node->locked);
>>
>> /*
>> diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
>> index 21ede57..895224e 100644
>> --- a/kernel/locking/qspinlock_paravirt.h
>> +++ b/kernel/locking/qspinlock_paravirt.h
>> @@ -51,6 +51,7 @@ struct pv_node {
>> struct mcs_spinlock __res[3];
>>
>> int cpu;
>> + int prev_cpu; /* Previous node cpu */
>
> That is a horrible name; what is a 'node cpu'.
>
>> u8 state;
>> };
>>
>> @@ -156,8 +157,7 @@ static __always_inline int trylock_clear_pending(struct qspinlock *lock)
>> * 256 (64-bit) or 512 (32-bit) to fully utilize a 4k page.
>> *
>> * Since we should not be holding locks from NMI context (very rare indeed) the
>> - * max load factor is 0.75, which is around the point where open addressing
>> - * breaks down.
>> + * max load factor is 0.75.
>
> Why? Isn't that true anymore?
>
>> *
>> */
>> struct pv_hash_entry {
>> @@ -251,6 +251,31 @@ static struct pv_node *pv_unhash(struct qspinlock *lock)
>> }
>>
>> /*
>> + * Look up the given lock in the hash table
>> + * Return the pv_node if found, NULL otherwise
>> + */
>> +static struct pv_node *pv_lookup_hash(struct qspinlock *lock)
>> +{
>> + unsigned long offset, hash = hash_ptr(lock, pv_lock_hash_bits);
>> + struct pv_hash_entry *he;
>> +
>> + for_each_hash_entry(he, offset, hash) {
>> + struct qspinlock *l = READ_ONCE(he->lock);
>> +
>> + if (l == lock)
>
> The other loop writes:
>
> if (READ_ONCE(he->lock) == lock)
>
Maybe because we check l is NULL or not later. So save one load.

>> + return READ_ONCE(he->node);
>> + /*
>> + * Presence of an empty slot signal the end of search. We
>> + * may miss the entry, but that will limit the amount of
>> + * time doing the search when the desired entry isn't there.
>> + */
>> + else if (!l)
>> + break;
>
> That 'else' is entirely pointless. Also, why isn't this: return NULL;
>
>> + }
>> + return NULL;
>
> and this BUG() ?
>
It's not a bug, the lock might not be stored in the hashtable. in unlock function, we will unhash the lock, then what will happen is:

cpu1 cpu2 cpu3
pv_kick_node pv_wait_head_or_lock pv_wait_node

WRITE_ONCE(pn->state, vcpu_running);
if (cmpxchg(&pn->state,
vcpu_halted, vcpu_hashed) != vcpu_halted)
return;
pv_hash_lookup //no lock in hashtable

So there is such case that we search the whole hashtable and the lock is not found. :(
Waiman assume that if l = null, the lock is not stored. however the lock might be there actually.
But to avoid the worst case I just mentioned above, it can quickly finish the lookup.
So I agree with Waiman.

> +}
>> +
>> +/*
>> * Return true if when it is time to check the previous node which is not
>> * in a running state.
>> */
>> @@ -275,6 +300,7 @@ static void pv_init_node(struct mcs_spinlock *node)
>>
>> pn->cpu = smp_processor_id();
>> pn->state = vcpu_running;
>> + pn->prev_cpu = -1;
>
> This does not match the struct element order.
>
>> }
>>
>> /*
>> @@ -282,7 +308,8 @@ static void pv_init_node(struct mcs_spinlock *node)
>> * pv_kick_node() is used to set _Q_SLOW_VAL and fill in hash table on its
>> * behalf.
>> */
>> -static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev)
>> +static void pv_wait_node(struct qspinlock *lock, struct mcs_spinlock *node,
>> + struct mcs_spinlock *prev)
>> {
>> struct pv_node *pn = (struct pv_node *)node;
>> struct pv_node *pp = (struct pv_node *)prev;
>> @@ -290,6 +317,8 @@ static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev)
>> int loop;
>> bool wait_early;
>>
>> + pn->prev_cpu = pp->cpu; /* Save previous node vCPU */
>
> again a useless comment.
>
>> +
>> /* waitcnt processing will be compiled out if !QUEUED_LOCK_STAT */
>> for (;; waitcnt++) {
>> for (wait_early = false, loop = SPIN_THRESHOLD; loop; loop--) {
>> @@ -314,10 +343,21 @@ static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev)
>> smp_store_mb(pn->state, vcpu_halted);
>>
>> if (!READ_ONCE(node->locked)) {
>> + struct pv_node *hn;
>> +
>> qstat_inc(qstat_pv_wait_node, true);
>> qstat_inc(qstat_pv_wait_again, waitcnt);
>> qstat_inc(qstat_pv_wait_early, wait_early);
>> - pv_wait(&pn->state, vcpu_halted);
>> +
>> + /*
>> + * We try to locate the queue head pv_node by looking
>> + * up the hash table. If it is not found, use the
>> + * CPU in the previous node instead.
>> + */
>> + hn = pv_lookup_hash(lock);
>> + if (!hn)
>> + hn = pn;
>
> This is potentially expensive... it does not explain why this lookup can
> fail etc.. nor mentioned that lock stealing caveat.
>
Yes, it's expensive. Normally, PPC phyp don't always need the correct holder. That means current vcpu can just give up its slice.
There is one lpar hvcall H_CONFER. I paste some spec below.

hcall (const uint32 H_CONFER, /*Confer the calling virtual processor’s cycles to the specified processor*/
int32proc, /*Target Processor number -- minus 1 is all partition processors */
uint32 dispatch); /* The dispatch number (ignored if proc=caller) */

So we really don't need the correct holder all the time. :)

>> + pv_wait(&pn->state, vcpu_halted, hn->prev_cpu);
>> }
>>
>> /*
>> @@ -453,7 +493,15 @@ pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node)
>> WRITE_ONCE(pn->state, vcpu_halted);
>> qstat_inc(qstat_pv_wait_head, true);
>> qstat_inc(qstat_pv_wait_again, waitcnt);
>> - pv_wait(&l->locked, _Q_SLOW_VAL);
>> +
>> + /*
>> + * Pass in the previous node vCPU nmber which is likely to be
>> + * the lock holder vCPU. This additional information may help
>> + * the hypervisor to give more resource to that vCPU so that
>> + * it can release the lock faster. With lock stealing,
>> + * however, that vCPU may not be the actual lock holder.
>> + */
>> + pv_wait(&l->locked, _Q_SLOW_VAL, pn->prev_cpu);
>
>
> urgh..
>
>
> With all the holes in, does it really still matter?
>
> In any case, I would really only want to see this together with the
> patches that make use of it, and then still have it have numbers with
> and without this thing.
>
>

I am preparing the patches. try to send them out during this week. :)

>

2016-04-20 14:18:27

by Peter Zijlstra

[permalink] [raw]

Subject: Re: [PATCH v2] locking/pvqspinlock: Add lock holder CPU argument to pv_wait()

On Wed, Apr 20, 2016 at 10:15:09PM +0800, Pan Xinhui wrote:
> >> +static struct pv_node *pv_lookup_hash(struct qspinlock *lock)
> >> +{
> >> + unsigned long offset, hash = hash_ptr(lock, pv_lock_hash_bits);
> >> + struct pv_hash_entry *he;
> >> +
> >> + for_each_hash_entry(he, offset, hash) {
> >> + struct qspinlock *l = READ_ONCE(he->lock);
> >> +
> >> + if (l == lock)
> >
> > The other loop writes:
> >
> > if (READ_ONCE(he->lock) == lock)
> >
> Maybe because we check l is NULL or not later. So save one load.

Ah duh, yes.

> >> + return READ_ONCE(he->node);
> >> + /*
> >> + * Presence of an empty slot signal the end of search. We
> >> + * may miss the entry, but that will limit the amount of
> >> + * time doing the search when the desired entry isn't there.
> >> + */
> >> + else if (!l)
> >> + break;
> >
> > That 'else' is entirely pointless. Also, why isn't this: return NULL;
> >
> >> + }
> >> + return NULL;
> >
> > and this BUG() ?
> >
> It's not a bug, the lock might not be stored in the hashtable. in unlock function, we will unhash the lock, then what will happen is:

It should be if the above becomes a return NULL, no?

If we can iterate the _entire_ hashtable, this lookup can be immensely
expensive and we should not be doing it inside of a wait-loop.

2016-04-20 14:19:53

by Peter Zijlstra

[permalink] [raw]

Subject: Re: [PATCH v2] locking/pvqspinlock: Add lock holder CPU argument to pv_wait()

On Wed, Apr 20, 2016 at 10:15:09PM +0800, Pan Xinhui wrote:
> So there is such case that we search the whole hashtable and the lock is not found. :(
> Waiman assume that if l = null, the lock is not stored. however the lock might be there actually.
> But to avoid the worst case I just mentioned above, it can quickly finish the lookup.

> >> +
> >> + /*
> >> + * We try to locate the queue head pv_node by looking
> >> + * up the hash table. If it is not found, use the
> >> + * CPU in the previous node instead.
> >> + */
> >> + hn = pv_lookup_hash(lock);
> >> + if (!hn)
> >> + hn = pn;
> >
> > This is potentially expensive... it does not explain why this lookup can
> > fail etc.. nor mentioned that lock stealing caveat.
> >
> Yes, it's expensive. Normally, PPC phyp don't always need the correct
> holder. That means current vcpu can just give up its slice. There is
> one lpar hvcall H_CONFER. I paste some spec below.

Ok, so if we can indeed scan the _entire_ hashtable, then we really
should not have that in common code. That's seriously expensive.

2016-04-20 14:38:07

by Pan Xinhui

[permalink] [raw]

Subject: Re: [PATCH v2] locking/pvqspinlock: Add lock holder CPU argument to pv_wait()

On 2016年04月20日 22:19, Peter Zijlstra wrote:
> On Wed, Apr 20, 2016 at 10:15:09PM +0800, Pan Xinhui wrote:
>> So there is such case that we search the whole hashtable and the lock is not found. :(
>> Waiman assume that if l = null, the lock is not stored. however the lock might be there actually.
>> But to avoid the worst case I just mentioned above, it can quickly finish the lookup.
>
>
>>>> +
>>>> + /*
>>>> + * We try to locate the queue head pv_node by looking
>>>> + * up the hash table. If it is not found, use the
>>>> + * CPU in the previous node instead.
>>>> + */
>>>> + hn = pv_lookup_hash(lock);
>>>> + if (!hn)
>>>> + hn = pn;
>>>
>>> This is potentially expensive... it does not explain why this lookup can
>>> fail etc.. nor mentioned that lock stealing caveat.
>>>
>> Yes, it's expensive. Normally, PPC phyp don't always need the correct
>> holder. That means current vcpu can just give up its slice. There is
>> one lpar hvcall H_CONFER. I paste some spec below.
>
> Ok, so if we can indeed scan the _entire_ hashtable, then we really
> should not have that in common code. That's seriously expensive.
>
Okay, I will try to add the holder lookup code in arch/...

But I just come up with one idea,
in __pv_queued_spin_unlock_slowpath()
we will kick the node->cpu, who will become the holder soon.
I think we can somehow record the node->cpu and use it in pv_wait_node :)

thanks
xinhui

2016-04-20 15:06:25

by Pan Xinhui

[permalink] [raw]

Subject: Re: [PATCH v2] locking/pvqspinlock: Add lock holder CPU argument to pv_wait()

On 2016年04月20日 22:18, Peter Zijlstra wrote:
> On Wed, Apr 20, 2016 at 10:15:09PM +0800, Pan Xinhui wrote:
>>>> +static struct pv_node *pv_lookup_hash(struct qspinlock *lock)
>>>> +{
>>>> + unsigned long offset, hash = hash_ptr(lock, pv_lock_hash_bits);
>>>> + struct pv_hash_entry *he;
>>>> +
>>>> + for_each_hash_entry(he, offset, hash) {
>>>> + struct qspinlock *l = READ_ONCE(he->lock);
>>>> +
>>>> + if (l == lock)
>>>
>>> The other loop writes:
>>>
>>> if (READ_ONCE(he->lock) == lock)
>>>
>> Maybe because we check l is NULL or not later. So save one load.
>
> Ah duh, yes.
>
>>>> + return READ_ONCE(he->node);
>>>> + /*
>>>> + * Presence of an empty slot signal the end of search. We
>>>> + * may miss the entry, but that will limit the amount of
>>>> + * time doing the search when the desired entry isn't there.
>>>> + */
>>>> + else if (!l)
>>>> + break;
>>>
>>> That 'else' is entirely pointless. Also, why isn't this: return NULL;
>>>
>>>> + }
>>>> + return NULL;
>>>
>>> and this BUG() ?
>>>
>> It's not a bug, the lock might not be stored in the hashtable. in unlock function, we will unhash the lock, then what will happen is:
>
> It should be if the above becomes a return NULL, no?
>
no, the lock might not be there, even if we search the whole hashtable.
Only pv_kick_node and pv_wait_head_or_lock will hash the lock. if both vcpu's state is vcpu_running, who will hash the lock on behalf of us?

Can pv_wait return without anyone kicking it? If yes, then this not a bug.

> If we can iterate the _entire_ hashtable, this lookup can be immensely
> expensive and we should not be doing it inside of a wait-loop.
>

2016-04-20 17:47:09

by Waiman Long

[permalink] [raw]

Subject: Re: [PATCH v2] locking/pvqspinlock: Add lock holder CPU argument to pv_wait()

On 04/20/2016 08:08 AM, Peter Zijlstra wrote:
> On Thu, Apr 14, 2016 at 02:41:58PM -0400, Waiman Long wrote:
>> Pan Xinhui was asking for a lock holder cpu argument in pv_wait()
>> to help the porting of pvqspinlock to PPC. The new argument will can
>> potentially help hypervisor expediate the execution of the critical
>> section so that the lock holder vCPU can release the lock sooner.
>>
>> This patch does just that by storing the previous node vCPU number.
>> In pv_wait_head_or_lock(), pv_wait() will be called with that vCPU
>> number as it is likely to be the lock holder.
>>
>> In pv_wait_node(), the newly added pv_lookup_hash() function will
>> be called to look up the queue head and pass in the lock holder vCPU
>> number stored there.
>>
>> This patch introduces negligible overhead to the current pvqspinlock
>> code. The extra lockcpu argument isn't currently used in x86
>> architecture.
> This Changelog is completely useless; it does not explain how this
> works at all.
>

I will rework the changelog to make it reflect what is being done by the
patch.

>> diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
>> index ce2f75e..99f31e4 100644
>> --- a/kernel/locking/qspinlock.c
>> +++ b/kernel/locking/qspinlock.c
>> @@ -248,7 +248,8 @@ static __always_inline void set_locked(struct qspinlock *lock)
>> */
>>
>> static __always_inline void __pv_init_node(struct mcs_spinlock *node) { }
>> -static __always_inline void __pv_wait_node(struct mcs_spinlock *node,
>> +static __always_inline void __pv_wait_node(struct qspinlock *lock,
>> + struct mcs_spinlock *node,
>> struct mcs_spinlock *prev) { }
>> static __always_inline void __pv_kick_node(struct qspinlock *lock,
>> struct mcs_spinlock *node) { }
>> @@ -407,7 +408,7 @@ queue:
>> prev = decode_tail(old);
>> WRITE_ONCE(prev->next, node);
>>
>> - pv_wait_node(node, prev);
>> + pv_wait_node(lock, node, prev);
>> arch_mcs_spin_lock_contended(&node->locked);
>>
>> /*
>> diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
>> index 21ede57..895224e 100644
>> --- a/kernel/locking/qspinlock_paravirt.h
>> +++ b/kernel/locking/qspinlock_paravirt.h
>> @@ -51,6 +51,7 @@ struct pv_node {
>> struct mcs_spinlock __res[3];
>>
>> int cpu;
>> + int prev_cpu; /* Previous node cpu */
> That is a horrible name; what is a 'node cpu'.

I will fix that.

>> u8 state;
>> };
>>
>> @@ -156,8 +157,7 @@ static __always_inline int trylock_clear_pending(struct qspinlock *lock)
>> * 256 (64-bit) or 512 (32-bit) to fully utilize a 4k page.
>> *
>> * Since we should not be holding locks from NMI context (very rare indeed) the
>> - * max load factor is 0.75, which is around the point where open addressing
>> - * breaks down.
>> + * max load factor is 0.75.
> Why? Isn't that true anymore?
>

I made a mistake here taking the comment out. Will restore that in the
next patch.

>> *
>> */
>> struct pv_hash_entry {
>> @@ -251,6 +251,31 @@ static struct pv_node *pv_unhash(struct qspinlock *lock)
>> }
>>
>> /*
>> + * Look up the given lock in the hash table
>> + * Return the pv_node if found, NULL otherwise
>> + */
>> +static struct pv_node *pv_lookup_hash(struct qspinlock *lock)
>> +{
>> + unsigned long offset, hash = hash_ptr(lock, pv_lock_hash_bits);
>> + struct pv_hash_entry *he;
>> +
>> + for_each_hash_entry(he, offset, hash) {
>> + struct qspinlock *l = READ_ONCE(he->lock);
>> +
>> + if (l == lock)
> The other loop writes:
>
> if (READ_ONCE(he->lock) == lock)
>
>> + return READ_ONCE(he->node);
>> + /*
>> + * Presence of an empty slot signal the end of search. We
>> + * may miss the entry, but that will limit the amount of
>> + * time doing the search when the desired entry isn't there.
>> + */
>> + else if (!l)
>> + break;
> That 'else' is entirely pointless. Also, why isn't this: return NULL;
>
>> + }
>> + return NULL;
> and this BUG() ?
>

Since the lookup is done in pv_wait_node(), it is entirely possible that
the desired entry isn't there in the hash. The lookup is on a best
effort basis and returning an entry isn't guaranteed. In that case, the
-1 argument will be passed to pv_wait(). We can't put the BUG() call
there as it will actually be called.

>> +}
>> +
>> +/*
>> * Return true if when it is time to check the previous node which is not
>> * in a running state.
>> */
>> @@ -275,6 +300,7 @@ static void pv_init_node(struct mcs_spinlock *node)
>>
>> pn->cpu = smp_processor_id();
>> pn->state = vcpu_running;
>> + pn->prev_cpu = -1;
> This does not match the struct element order.

OK, will rearrange the order.

>> }
>>
>> /*
>> @@ -282,7 +308,8 @@ static void pv_init_node(struct mcs_spinlock *node)
>> * pv_kick_node() is used to set _Q_SLOW_VAL and fill in hash table on its
>> * behalf.
>> */
>> -static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev)
>> +static void pv_wait_node(struct qspinlock *lock, struct mcs_spinlock *node,
>> + struct mcs_spinlock *prev)
>> {
>> struct pv_node *pn = (struct pv_node *)node;
>> struct pv_node *pp = (struct pv_node *)prev;
>> @@ -290,6 +317,8 @@ static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev)
>> int loop;
>> bool wait_early;
>>
>> + pn->prev_cpu = pp->cpu; /* Save previous node vCPU */
> again a useless comment.
>
>> +
>> /* waitcnt processing will be compiled out if !QUEUED_LOCK_STAT */
>> for (;; waitcnt++) {
>> for (wait_early = false, loop = SPIN_THRESHOLD; loop; loop--) {
>> @@ -314,10 +343,21 @@ static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev)
>> smp_store_mb(pn->state, vcpu_halted);
>>
>> if (!READ_ONCE(node->locked)) {
>> + struct pv_node *hn;
>> +
>> qstat_inc(qstat_pv_wait_node, true);
>> qstat_inc(qstat_pv_wait_again, waitcnt);
>> qstat_inc(qstat_pv_wait_early, wait_early);
>> - pv_wait(&pn->state, vcpu_halted);
>> +
>> + /*
>> + * We try to locate the queue head pv_node by looking
>> + * up the hash table. If it is not found, use the
>> + * CPU in the previous node instead.
>> + */
>> + hn = pv_lookup_hash(lock);
>> + if (!hn)
>> + hn = pn;
> This is potentially expensive... it does not explain why this lookup can
> fail etc.. nor mentioned that lock stealing caveat.

Yes, the lookup can be expensive. That is why I stop it when it hit a
null entry. In addition, the lookup will only happen if the vCPU is
going to suspend itself. The vCPU suspend and resume is really slow
anyway. So it won't add too much additional overhead in this regard. I
will expend the comment a bit to explain the situation.

>
>> + pv_wait(&pn->state, vcpu_halted, hn->prev_cpu);
>> }
>>
>> /*
>> @@ -453,7 +493,15 @@ pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node)
>> WRITE_ONCE(pn->state, vcpu_halted);
>> qstat_inc(qstat_pv_wait_head, true);
>> qstat_inc(qstat_pv_wait_again, waitcnt);
>> - pv_wait(&l->locked, _Q_SLOW_VAL);
>> +
>> + /*
>> + * Pass in the previous node vCPU nmber which is likely to be
>> + * the lock holder vCPU. This additional information may help
>> + * the hypervisor to give more resource to that vCPU so that
>> + * it can release the lock faster. With lock stealing,
>> + * however, that vCPU may not be the actual lock holder.
>> + */
>> + pv_wait(&l->locked, _Q_SLOW_VAL, pn->prev_cpu);
>
> urgh..
>
>
> With all the holes in, does it really still matter?
>
> In any case, I would really only want to see this together with the
> patches that make use of it, and then still have it have numbers with
> and without this thing.
>
>
I do intend for Xinhui to pick up my patch as part of his patchset to
enable qspinlock for PPC. I will send out an updated one to address the
feedback that you give.

Cheers,
Longman

2016-04-20 17:50:59

by Waiman Long

[permalink] [raw]

Subject: Re: [PATCH v2] locking/pvqspinlock: Add lock holder CPU argument to pv_wait()

On 04/20/2016 10:19 AM, Peter Zijlstra wrote:
> On Wed, Apr 20, 2016 at 10:15:09PM +0800, Pan Xinhui wrote:
>> So there is such case that we search the whole hashtable and the lock is not found. :(
>> Waiman assume that if l = null, the lock is not stored. however the lock might be there actually.
>> But to avoid the worst case I just mentioned above, it can quickly finish the lookup.
>
>>>> +
>>>> + /*
>>>> + * We try to locate the queue head pv_node by looking
>>>> + * up the hash table. If it is not found, use the
>>>> + * CPU in the previous node instead.
>>>> + */
>>>> + hn = pv_lookup_hash(lock);
>>>> + if (!hn)
>>>> + hn = pn;
>>> This is potentially expensive... it does not explain why this lookup can
>>> fail etc.. nor mentioned that lock stealing caveat.
>>>
>> Yes, it's expensive. Normally, PPC phyp don't always need the correct
>> holder. That means current vcpu can just give up its slice. There is
>> one lpar hvcall H_CONFER. I paste some spec below.
> Ok, so if we can indeed scan the _entire_ hashtable, then we really
> should not have that in common code. That's seriously expensive.

No, it should not scan the entire hashtable and there is no point in
doing so.

Cheers,
Longman

2016-04-20 17:58:50

by Waiman Long

[permalink] [raw]

Subject: Re: [PATCH v2] locking/pvqspinlock: Add lock holder CPU argument to pv_wait()

On 04/20/2016 10:36 AM, Pan Xinhui wrote:
>
> On 2016年04月20日 22:19, Peter Zijlstra wrote:
>> On Wed, Apr 20, 2016 at 10:15:09PM +0800, Pan Xinhui wrote:
>>> So there is such case that we search the whole hashtable and the lock is not found. :(
>>> Waiman assume that if l = null, the lock is not stored. however the lock might be there actually.
>>> But to avoid the worst case I just mentioned above, it can quickly finish the lookup.
>>
>>>>> +
>>>>> + /*
>>>>> + * We try to locate the queue head pv_node by looking
>>>>> + * up the hash table. If it is not found, use the
>>>>> + * CPU in the previous node instead.
>>>>> + */
>>>>> + hn = pv_lookup_hash(lock);
>>>>> + if (!hn)
>>>>> + hn = pn;
>>>> This is potentially expensive... it does not explain why this lookup can
>>>> fail etc.. nor mentioned that lock stealing caveat.
>>>>
>>> Yes, it's expensive. Normally, PPC phyp don't always need the correct
>>> holder. That means current vcpu can just give up its slice. There is
>>> one lpar hvcall H_CONFER. I paste some spec below.
>> Ok, so if we can indeed scan the _entire_ hashtable, then we really
>> should not have that in common code. That's seriously expensive.
>>
> Okay, I will try to add the holder lookup code in arch/...
>
> But I just come up with one idea,
> in __pv_queued_spin_unlock_slowpath()
> we will kick the node->cpu, who will become the holder soon.
> I think we can somehow record the node->cpu and use it in pv_wait_node :)
>
> thanks
> xinhui
>

That will make the code more complex. Unless you find it provide real
performance improvement, I won't suggest doing that for the time being.

Cheers,
Longman