2016-04-13 20:59:43

by Waiman Long

[permalink] [raw]
Subject: [PATCH] locking/pvqspinlock: Add lock holder CPU argument to pv_wait()

Pan Xinhui was asking for a lock holder cpu argument in pv_wait()
to help the porting of pvqspinlock to PPC. The new argument will can
potentially help hypervisor expediate the execution of the critical
section so that the lock holder vCPU can release the lock sooner.

This patch does just that by storing the previous node vCPU number.
In pv_wait_head_or_lock(), pv_wait() will be called with that vCPU
number as it is likely to be the lock holder. In pv_wait_node(),
-1 will be passed to pv_wait() instead to indicate that it doesn't
know what the current lock holder is.

This patch introduces negligible overhead to the current pvqspinlock
code. The extra lockcpu argument isn't currently used in x86
architecture.

Signed-off-by: Waiman Long <[email protected]>
---
arch/x86/include/asm/paravirt.h | 4 ++--
arch/x86/include/asm/paravirt_types.h | 2 +-
arch/x86/kernel/kvm.c | 2 +-
arch/x86/xen/spinlock.c | 2 +-
kernel/locking/qspinlock_paravirt.h | 19 +++++++++++++++----
kernel/locking/qspinlock_stat.h | 8 ++++----
6 files changed, 24 insertions(+), 13 deletions(-)

diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 601f1b8..b89eccf 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -676,9 +676,9 @@ static __always_inline void pv_queued_spin_unlock(struct qspinlock *lock)
PVOP_VCALLEE1(pv_lock_ops.queued_spin_unlock, lock);
}

-static __always_inline void pv_wait(u8 *ptr, u8 val)
+static __always_inline void pv_wait(u8 *ptr, u8 val, int lockcpu)
{
- PVOP_VCALL2(pv_lock_ops.wait, ptr, val);
+ PVOP_VCALL3(pv_lock_ops.wait, ptr, val, lockcpu);
}

static __always_inline void pv_kick(int cpu)
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index e8c2326..2fc26c1 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -312,7 +312,7 @@ struct pv_lock_ops {
void (*queued_spin_lock_slowpath)(struct qspinlock *lock, u32 val);
struct paravirt_callee_save queued_spin_unlock;

- void (*wait)(u8 *ptr, u8 val);
+ void (*wait)(u8 *ptr, u8 val, int lockcpu);
void (*kick)(int cpu);
#else /* !CONFIG_QUEUED_SPINLOCKS */
struct paravirt_callee_save lock_spinning;
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index dc1207e..47ab4e1 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -590,7 +590,7 @@ static void kvm_kick_cpu(int cpu)

#include <asm/qspinlock.h>

-static void kvm_wait(u8 *ptr, u8 val)
+static void kvm_wait(u8 *ptr, u8 val, int lockcpu)
{
unsigned long flags;

diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c
index 9e2ba5c..6f78c41 100644
--- a/arch/x86/xen/spinlock.c
+++ b/arch/x86/xen/spinlock.c
@@ -33,7 +33,7 @@ static void xen_qlock_kick(int cpu)
/*
* Halt the current CPU & release it back to the host
*/
-static void xen_qlock_wait(u8 *byte, u8 val)
+static void xen_qlock_wait(u8 *byte, u8 val, int lockcpu)
{
int irq = __this_cpu_read(lock_kicker_irq);

diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
index 21ede57..4bec98b 100644
--- a/kernel/locking/qspinlock_paravirt.h
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -51,6 +51,7 @@ struct pv_node {
struct mcs_spinlock __res[3];

int cpu;
+ int prev_cpu; /* Previous node cpu */
u8 state;
};

@@ -156,8 +157,7 @@ static __always_inline int trylock_clear_pending(struct qspinlock *lock)
* 256 (64-bit) or 512 (32-bit) to fully utilize a 4k page.
*
* Since we should not be holding locks from NMI context (very rare indeed) the
- * max load factor is 0.75, which is around the point where open addressing
- * breaks down.
+ * max load factor is 0.75.
*
*/
struct pv_hash_entry {
@@ -275,6 +275,7 @@ static void pv_init_node(struct mcs_spinlock *node)

pn->cpu = smp_processor_id();
pn->state = vcpu_running;
+ pn->prev_cpu = -1;
}

/*
@@ -290,6 +291,8 @@ static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev)
int loop;
bool wait_early;

+ pn->prev_cpu = pp->cpu; /* Save previous node vCPU */
+
/* waitcnt processing will be compiled out if !QUEUED_LOCK_STAT */
for (;; waitcnt++) {
for (wait_early = false, loop = SPIN_THRESHOLD; loop; loop--) {
@@ -317,7 +320,7 @@ static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev)
qstat_inc(qstat_pv_wait_node, true);
qstat_inc(qstat_pv_wait_again, waitcnt);
qstat_inc(qstat_pv_wait_early, wait_early);
- pv_wait(&pn->state, vcpu_halted);
+ pv_wait(&pn->state, vcpu_halted, -1);
}

/*
@@ -453,7 +456,15 @@ pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node)
WRITE_ONCE(pn->state, vcpu_halted);
qstat_inc(qstat_pv_wait_head, true);
qstat_inc(qstat_pv_wait_again, waitcnt);
- pv_wait(&l->locked, _Q_SLOW_VAL);
+
+ /*
+ * Pass in the previous node vCPU nmber which is likely to be
+ * the lock holder vCPU. This additional information may help
+ * the hypervisor to give more resource to that vCPU so that
+ * it can release the lock faster. With lock stealing,
+ * however, that vCPU may not be the actual lock holder.
+ */
+ pv_wait(&l->locked, _Q_SLOW_VAL, pn->prev_cpu);

/*
* The unlocker should have freed the lock before kicking the
diff --git a/kernel/locking/qspinlock_stat.h b/kernel/locking/qspinlock_stat.h
index eb2a2c9..8728348 100644
--- a/kernel/locking/qspinlock_stat.h
+++ b/kernel/locking/qspinlock_stat.h
@@ -266,12 +266,12 @@ static inline void __pv_kick(int cpu)
/*
* Replacement function for pv_wait()
*/
-static inline void __pv_wait(u8 *ptr, u8 val)
+static inline void __pv_wait(u8 *ptr, u8 val, int lockcpu)
{
u64 *pkick_time = this_cpu_ptr(&pv_kick_time);

*pkick_time = 0;
- pv_wait(ptr, val);
+ pv_wait(ptr, val, lockcpu);
if (*pkick_time) {
this_cpu_add(qstats[qstat_pv_latency_wake],
sched_clock() - *pkick_time);
@@ -279,8 +279,8 @@ static inline void __pv_wait(u8 *ptr, u8 val)
}
}

-#define pv_kick(c) __pv_kick(c)
-#define pv_wait(p, v) __pv_wait(p, v)
+#define pv_kick(c) __pv_kick(c)
+#define pv_wait(p, v, c) __pv_wait(p, v, c)

#else /* CONFIG_QUEUED_LOCK_STAT */

--
1.7.1


2016-04-14 00:21:58

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH] locking/pvqspinlock: Add lock holder CPU argument to pv_wait()

On Wed, Apr 13, 2016 at 04:59:20PM -0400, Waiman Long wrote:
> Pan Xinhui was asking for a lock holder cpu argument in pv_wait()
> to help the porting of pvqspinlock to PPC. The new argument will can
> potentially help hypervisor expediate the execution of the critical
> section so that the lock holder vCPU can release the lock sooner.
>
> This patch does just that by storing the previous node vCPU number.
> In pv_wait_head_or_lock(), pv_wait() will be called with that vCPU
> number as it is likely to be the lock holder. In pv_wait_node(),
> -1 will be passed to pv_wait() instead to indicate that it doesn't
> know what the current lock holder is.

Without knowing why he needs this, it is very hard to tell if this will
suffice.

Xinhui, what do you need the extra argument for?

2016-04-14 01:59:35

by Waiman Long

[permalink] [raw]
Subject: Re: [PATCH] locking/pvqspinlock: Add lock holder CPU argument to pv_wait()

On 04/13/2016 08:21 PM, Peter Zijlstra wrote:
> On Wed, Apr 13, 2016 at 04:59:20PM -0400, Waiman Long wrote:
>> Pan Xinhui was asking for a lock holder cpu argument in pv_wait()
>> to help the porting of pvqspinlock to PPC. The new argument will can
>> potentially help hypervisor expediate the execution of the critical
>> section so that the lock holder vCPU can release the lock sooner.
>>
>> This patch does just that by storing the previous node vCPU number.
>> In pv_wait_head_or_lock(), pv_wait() will be called with that vCPU
>> number as it is likely to be the lock holder. In pv_wait_node(),
>> -1 will be passed to pv_wait() instead to indicate that it doesn't
>> know what the current lock holder is.
> Without knowing why he needs this, it is very hard to tell if this will
> suffice.
>
> Xinhui, what do you need the extra argument for?

I have some other people asking me if pvqspinlock was able to pass in
information about which CPU was the lock holder before as he cited the
PPC code has this capability to somehow kick up the vCPU that has the
lock. That can be potentially useful in x86 too.

Cheers,
Longman

2016-04-14 08:41:23

by Pan Xinhui

[permalink] [raw]
Subject: Re: [PATCH] locking/pvqspinlock: Add lock holder CPU argument to pv_wait()

On 2016年04月14日 08:21, Peter Zijlstra wrote:
> On Wed, Apr 13, 2016 at 04:59:20PM -0400, Waiman Long wrote:
>> Pan Xinhui was asking for a lock holder cpu argument in pv_wait()
>> to help the porting of pvqspinlock to PPC. The new argument will can
>> potentially help hypervisor expediate the execution of the critical
>> section so that the lock holder vCPU can release the lock sooner.
>>
>> This patch does just that by storing the previous node vCPU number.
>> In pv_wait_head_or_lock(), pv_wait() will be called with that vCPU
>> number as it is likely to be the lock holder. In pv_wait_node(),
>> -1 will be passed to pv_wait() instead to indicate that it doesn't
>> know what the current lock holder is.
>
> Without knowing why he needs this, it is very hard to tell if this will
> suffice.
>
> Xinhui, what do you need the extra argument for?
>

We the could give current vcpu's slice to that vcpu. The lock holder has more slice to run, the lock might be unlocked earlier. :)

2016-04-14 09:37:29

by Pan Xinhui

[permalink] [raw]
Subject: Re: [PATCH] locking/pvqspinlock: Add lock holder CPU argument to pv_wait()



On 2016年04月14日 04:59, Waiman Long wrote:
> Pan Xinhui was asking for a lock holder cpu argument in pv_wait()
> to help the porting of pvqspinlock to PPC. The new argument will can
> potentially help hypervisor expediate the execution of the critical
> section so that the lock holder vCPU can release the lock sooner.
>
> This patch does just that by storing the previous node vCPU number.
> In pv_wait_head_or_lock(), pv_wait() will be called with that vCPU
> number as it is likely to be the lock holder. In pv_wait_node(),
> -1 will be passed to pv_wait() instead to indicate that it doesn't
> know what the current lock holder is.
>
> This patch introduces negligible overhead to the current pvqspinlock
> code. The extra lockcpu argument isn't currently used in x86
> architecture.
>
> Signed-off-by: Waiman Long <[email protected]>
> ---
> arch/x86/include/asm/paravirt.h | 4 ++--
> arch/x86/include/asm/paravirt_types.h | 2 +-
> arch/x86/kernel/kvm.c | 2 +-
> arch/x86/xen/spinlock.c | 2 +-
> kernel/locking/qspinlock_paravirt.h | 19 +++++++++++++++----
> kernel/locking/qspinlock_stat.h | 8 ++++----
> 6 files changed, 24 insertions(+), 13 deletions(-)
>
> diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
> index 601f1b8..b89eccf 100644
> --- a/arch/x86/include/asm/paravirt.h
> +++ b/arch/x86/include/asm/paravirt.h
> @@ -676,9 +676,9 @@ static __always_inline void pv_queued_spin_unlock(struct qspinlock *lock)
> PVOP_VCALLEE1(pv_lock_ops.queued_spin_unlock, lock);
> }
>
> -static __always_inline void pv_wait(u8 *ptr, u8 val)
> +static __always_inline void pv_wait(u8 *ptr, u8 val, int lockcpu)
> {
> - PVOP_VCALL2(pv_lock_ops.wait, ptr, val);
> + PVOP_VCALL3(pv_lock_ops.wait, ptr, val, lockcpu);
> }
>
> static __always_inline void pv_kick(int cpu)
> diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
> index e8c2326..2fc26c1 100644
> --- a/arch/x86/include/asm/paravirt_types.h
> +++ b/arch/x86/include/asm/paravirt_types.h
> @@ -312,7 +312,7 @@ struct pv_lock_ops {
> void (*queued_spin_lock_slowpath)(struct qspinlock *lock, u32 val);
> struct paravirt_callee_save queued_spin_unlock;
>
> - void (*wait)(u8 *ptr, u8 val);
> + void (*wait)(u8 *ptr, u8 val, int lockcpu);
> void (*kick)(int cpu);
> #else /* !CONFIG_QUEUED_SPINLOCKS */
> struct paravirt_callee_save lock_spinning;
> diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
> index dc1207e..47ab4e1 100644
> --- a/arch/x86/kernel/kvm.c
> +++ b/arch/x86/kernel/kvm.c
> @@ -590,7 +590,7 @@ static void kvm_kick_cpu(int cpu)
>
> #include <asm/qspinlock.h>
>
> -static void kvm_wait(u8 *ptr, u8 val)
> +static void kvm_wait(u8 *ptr, u8 val, int lockcpu)
> {
> unsigned long flags;
>
> diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c
> index 9e2ba5c..6f78c41 100644
> --- a/arch/x86/xen/spinlock.c
> +++ b/arch/x86/xen/spinlock.c
> @@ -33,7 +33,7 @@ static void xen_qlock_kick(int cpu)
> /*
> * Halt the current CPU & release it back to the host
> */
> -static void xen_qlock_wait(u8 *byte, u8 val)
> +static void xen_qlock_wait(u8 *byte, u8 val, int lockcpu)
> {
> int irq = __this_cpu_read(lock_kicker_irq);
>
> diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
> index 21ede57..4bec98b 100644
> --- a/kernel/locking/qspinlock_paravirt.h
> +++ b/kernel/locking/qspinlock_paravirt.h
> @@ -51,6 +51,7 @@ struct pv_node {
> struct mcs_spinlock __res[3];
>
> int cpu;
> + int prev_cpu; /* Previous node cpu */
> u8 state;
> };
>
> @@ -156,8 +157,7 @@ static __always_inline int trylock_clear_pending(struct qspinlock *lock)
> * 256 (64-bit) or 512 (32-bit) to fully utilize a 4k page.
> *
> * Since we should not be holding locks from NMI context (very rare indeed) the
> - * max load factor is 0.75, which is around the point where open addressing
> - * breaks down.
> + * max load factor is 0.75.
> *
> */
> struct pv_hash_entry {
> @@ -275,6 +275,7 @@ static void pv_init_node(struct mcs_spinlock *node)
>
> pn->cpu = smp_processor_id();
> pn->state = vcpu_running;
> + pn->prev_cpu = -1;
> }
>
> /*
> @@ -290,6 +291,8 @@ static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev)
> int loop;
> bool wait_early;
>
> + pn->prev_cpu = pp->cpu; /* Save previous node vCPU */
> +
> /* waitcnt processing will be compiled out if !QUEUED_LOCK_STAT */
> for (;; waitcnt++) {
> for (wait_early = false, loop = SPIN_THRESHOLD; loop; loop--) {
> @@ -317,7 +320,7 @@ static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev)
> qstat_inc(qstat_pv_wait_node, true);
> qstat_inc(qstat_pv_wait_again, waitcnt);
> qstat_inc(qstat_pv_wait_early, wait_early);
> - pv_wait(&pn->state, vcpu_halted);
> + pv_wait(&pn->state, vcpu_halted, -1);
If the contention is high, we might run here. And we indeed need the lock holder on such scenario.
how about that, we store the lock into pv_node, then search the lock in hashtable,
code might look like,
node = pv_hash_lookup(pn->lock);
pv_wait(...,node->holder);
thanks
xinhui
> }
>
> /*
> @@ -453,7 +456,15 @@ pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node)
> WRITE_ONCE(pn->state, vcpu_halted);
> qstat_inc(qstat_pv_wait_head, true);
> qstat_inc(qstat_pv_wait_again, waitcnt);
> - pv_wait(&l->locked, _Q_SLOW_VAL);
> +
> + /*
> + * Pass in the previous node vCPU nmber which is likely to be
> + * the lock holder vCPU. This additional information may help
> + * the hypervisor to give more resource to that vCPU so that
> + * it can release the lock faster. With lock stealing,
> + * however, that vCPU may not be the actual lock holder.
> + */
> + pv_wait(&l->locked, _Q_SLOW_VAL, pn->prev_cpu);
>
> /*
> * The unlocker should have freed the lock before kicking the
> diff --git a/kernel/locking/qspinlock_stat.h b/kernel/locking/qspinlock_stat.h
> index eb2a2c9..8728348 100644
> --- a/kernel/locking/qspinlock_stat.h
> +++ b/kernel/locking/qspinlock_stat.h
> @@ -266,12 +266,12 @@ static inline void __pv_kick(int cpu)
> /*
> * Replacement function for pv_wait()
> */
> -static inline void __pv_wait(u8 *ptr, u8 val)
> +static inline void __pv_wait(u8 *ptr, u8 val, int lockcpu)
> {
> u64 *pkick_time = this_cpu_ptr(&pv_kick_time);
>
> *pkick_time = 0;
> - pv_wait(ptr, val);
> + pv_wait(ptr, val, lockcpu);
> if (*pkick_time) {
> this_cpu_add(qstats[qstat_pv_latency_wake],
> sched_clock() - *pkick_time);
> @@ -279,8 +279,8 @@ static inline void __pv_wait(u8 *ptr, u8 val)
> }
> }
>
> -#define pv_kick(c) __pv_kick(c)
> -#define pv_wait(p, v) __pv_wait(p, v)
> +#define pv_kick(c) __pv_kick(c)
> +#define pv_wait(p, v, c) __pv_wait(p, v, c)
>
> #else /* CONFIG_QUEUED_LOCK_STAT */
>

2016-04-14 14:34:59

by Pan Xinhui

[permalink] [raw]
Subject: Re: [PATCH] locking/pvqspinlock: Add lock holder CPU argument to pv_wait()

hello, Waiman
I try your patch, thanks!

also I do some improvement.
below code diff has been tested, it works for me. :)

diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index ce2f75e..99f31e4 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -248,7 +248,8 @@ static __always_inline void set_locked(struct qspinlock *lock)
*/

static __always_inline void __pv_init_node(struct mcs_spinlock *node) { }
-static __always_inline void __pv_wait_node(struct mcs_spinlock *node,
+static __always_inline void __pv_wait_node(struct qspinlock *lock,
+ struct mcs_spinlock *node,
struct mcs_spinlock *prev) { }
static __always_inline void __pv_kick_node(struct qspinlock *lock,
struct mcs_spinlock *node) { }
@@ -407,7 +408,7 @@ queue:
prev = decode_tail(old);
WRITE_ONCE(prev->next, node);

- pv_wait_node(node, prev);
+ pv_wait_node(lock, node, prev);
arch_mcs_spin_lock_contended(&node->locked);

/*
diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
index 01a6d16..75ccfd3 100644
--- a/kernel/locking/qspinlock_paravirt.h
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -255,6 +257,19 @@ static struct pv_node *pv_unhash(struct qspinlock *lock)
BUG();
}

+static struct pv_node *pv_hash_lookup(struct qspinlock *lock)
+{
+ unsigned long offset, hash = hash_ptr(lock, pv_lock_hash_bits);
+ struct pv_hash_entry *he;
+
+ for_each_hash_entry(he, offset, hash) {
+ if (READ_ONCE(he->lock) == lock) {
+ return he->node;
+ }
+ }
+ return NULL;
+}
+
/*
* Return true if when it is time to check the previous node which is not
* in a running state.
@@ -287,14 +303,17 @@ static void pv_init_node(struct mcs_spinlock *node)
* pv_kick_node() is used to set _Q_SLOW_VAL and fill in hash table on its
* behalf.
*/
-static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev)
+static void pv_wait_node(struct qspinlock *lock, struct mcs_spinlock *node,
+ struct mcs_spinlock *prev)
{
struct pv_node *pn = (struct pv_node *)node;
struct pv_node *pp = (struct pv_node *)prev;
+ struct pv_node *ph;
int waitcnt = 0;
int loop;
bool wait_early;

+ pn->prev_cpu = pp->cpu;
/* waitcnt processing will be compiled out if !QUEUED_LOCK_STAT */
for (;; waitcnt++) {
for (wait_early = false, loop = SPIN_THRESHOLD; loop; loop--) {
@@ -322,7 +341,10 @@ static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev)
qstat_inc(qstat_pv_wait_node, true);
qstat_inc(qstat_pv_wait_again, waitcnt);
qstat_inc(qstat_pv_wait_early, wait_early);
- pv_wait(&pn->state, vcpu_halted);
+ ph = pv_hash_lookup(lock);
+ if (!ph)
+ ph = pp;
+ pv_wait(&pn->state, vcpu_halted, ph->prev_cpu);
}

/*

any comments are welcome. I put my patch here just for simplicity, or need I send it out in a new thread?

thanks
xinhui

On 2016年04月14日 17:36, Pan Xinhui wrote:
>
>
> On 2016年04月14日 04:59, Waiman Long wrote:
>> Pan Xinhui was asking for a lock holder cpu argument in pv_wait()
>> to help the porting of pvqspinlock to PPC. The new argument will can
>> potentially help hypervisor expediate the execution of the critical
>> section so that the lock holder vCPU can release the lock sooner.
>>
>> This patch does just that by storing the previous node vCPU number.
>> In pv_wait_head_or_lock(), pv_wait() will be called with that vCPU
>> number as it is likely to be the lock holder. In pv_wait_node(),
>> -1 will be passed to pv_wait() instead to indicate that it doesn't
>> know what the current lock holder is.
>>
>> This patch introduces negligible overhead to the current pvqspinlock
>> code. The extra lockcpu argument isn't currently used in x86
>> architecture.
>>
>> Signed-off-by: Waiman Long <[email protected]>
>> ---
>> arch/x86/include/asm/paravirt.h | 4 ++--
>> arch/x86/include/asm/paravirt_types.h | 2 +-
>> arch/x86/kernel/kvm.c | 2 +-
>> arch/x86/xen/spinlock.c | 2 +-
>> kernel/locking/qspinlock_paravirt.h | 19 +++++++++++++++----
>> kernel/locking/qspinlock_stat.h | 8 ++++----
>> 6 files changed, 24 insertions(+), 13 deletions(-)
>>
>> diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
>> index 601f1b8..b89eccf 100644
>> --- a/arch/x86/include/asm/paravirt.h
>> +++ b/arch/x86/include/asm/paravirt.h
>> @@ -676,9 +676,9 @@ static __always_inline void pv_queued_spin_unlock(struct qspinlock *lock)
>> PVOP_VCALLEE1(pv_lock_ops.queued_spin_unlock, lock);
>> }
>>
>> -static __always_inline void pv_wait(u8 *ptr, u8 val)
>> +static __always_inline void pv_wait(u8 *ptr, u8 val, int lockcpu)
>> {
>> - PVOP_VCALL2(pv_lock_ops.wait, ptr, val);
>> + PVOP_VCALL3(pv_lock_ops.wait, ptr, val, lockcpu);
>> }
>>
>> static __always_inline void pv_kick(int cpu)
>> diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
>> index e8c2326..2fc26c1 100644
>> --- a/arch/x86/include/asm/paravirt_types.h
>> +++ b/arch/x86/include/asm/paravirt_types.h
>> @@ -312,7 +312,7 @@ struct pv_lock_ops {
>> void (*queued_spin_lock_slowpath)(struct qspinlock *lock, u32 val);
>> struct paravirt_callee_save queued_spin_unlock;
>>
>> - void (*wait)(u8 *ptr, u8 val);
>> + void (*wait)(u8 *ptr, u8 val, int lockcpu);
>> void (*kick)(int cpu);
>> #else /* !CONFIG_QUEUED_SPINLOCKS */
>> struct paravirt_callee_save lock_spinning;
>> diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
>> index dc1207e..47ab4e1 100644
>> --- a/arch/x86/kernel/kvm.c
>> +++ b/arch/x86/kernel/kvm.c
>> @@ -590,7 +590,7 @@ static void kvm_kick_cpu(int cpu)
>>
>> #include <asm/qspinlock.h>
>>
>> -static void kvm_wait(u8 *ptr, u8 val)
>> +static void kvm_wait(u8 *ptr, u8 val, int lockcpu)
>> {
>> unsigned long flags;
>>
>> diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c
>> index 9e2ba5c..6f78c41 100644
>> --- a/arch/x86/xen/spinlock.c
>> +++ b/arch/x86/xen/spinlock.c
>> @@ -33,7 +33,7 @@ static void xen_qlock_kick(int cpu)
>> /*
>> * Halt the current CPU & release it back to the host
>> */
>> -static void xen_qlock_wait(u8 *byte, u8 val)
>> +static void xen_qlock_wait(u8 *byte, u8 val, int lockcpu)
>> {
>> int irq = __this_cpu_read(lock_kicker_irq);
>>
>> diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
>> index 21ede57..4bec98b 100644
>> --- a/kernel/locking/qspinlock_paravirt.h
>> +++ b/kernel/locking/qspinlock_paravirt.h
>> @@ -51,6 +51,7 @@ struct pv_node {
>> struct mcs_spinlock __res[3];
>>
>> int cpu;
>> + int prev_cpu; /* Previous node cpu */
>> u8 state;
>> };
>>
>> @@ -156,8 +157,7 @@ static __always_inline int trylock_clear_pending(struct qspinlock *lock)
>> * 256 (64-bit) or 512 (32-bit) to fully utilize a 4k page.
>> *
>> * Since we should not be holding locks from NMI context (very rare indeed) the
>> - * max load factor is 0.75, which is around the point where open addressing
>> - * breaks down.
>> + * max load factor is 0.75.
>> *
>> */
>> struct pv_hash_entry {
>> @@ -275,6 +275,7 @@ static void pv_init_node(struct mcs_spinlock *node)
>>
>> pn->cpu = smp_processor_id();
>> pn->state = vcpu_running;
>> + pn->prev_cpu = -1;
>> }
>>
>> /*
>> @@ -290,6 +291,8 @@ static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev)
>> int loop;
>> bool wait_early;
>>
>> + pn->prev_cpu = pp->cpu; /* Save previous node vCPU */
>> +
>> /* waitcnt processing will be compiled out if !QUEUED_LOCK_STAT */
>> for (;; waitcnt++) {
>> for (wait_early = false, loop = SPIN_THRESHOLD; loop; loop--) {
>> @@ -317,7 +320,7 @@ static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev)
>> qstat_inc(qstat_pv_wait_node, true);
>> qstat_inc(qstat_pv_wait_again, waitcnt);
>> qstat_inc(qstat_pv_wait_early, wait_early);
>> - pv_wait(&pn->state, vcpu_halted);
>> + pv_wait(&pn->state, vcpu_halted, -1);
> If the contention is high, we might run here. And we indeed need the lock holder on such scenario.
> how about that, we store the lock into pv_node, then search the lock in hashtable,
> code might look like,
> node = pv_hash_lookup(pn->lock);
> pv_wait(...,node->holder);
> thanks
> xinhui
>> }
>>
>> /*
>> @@ -453,7 +456,15 @@ pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node)
>> WRITE_ONCE(pn->state, vcpu_halted);
>> qstat_inc(qstat_pv_wait_head, true);
>> qstat_inc(qstat_pv_wait_again, waitcnt);
>> - pv_wait(&l->locked, _Q_SLOW_VAL);
>> +
>> + /*
>> + * Pass in the previous node vCPU nmber which is likely to be
>> + * the lock holder vCPU. This additional information may help
>> + * the hypervisor to give more resource to that vCPU so that
>> + * it can release the lock faster. With lock stealing,
>> + * however, that vCPU may not be the actual lock holder.
>> + */
>> + pv_wait(&l->locked, _Q_SLOW_VAL, pn->prev_cpu);
>>
>> /*
>> * The unlocker should have freed the lock before kicking the
>> diff --git a/kernel/locking/qspinlock_stat.h b/kernel/locking/qspinlock_stat.h
>> index eb2a2c9..8728348 100644
>> --- a/kernel/locking/qspinlock_stat.h
>> +++ b/kernel/locking/qspinlock_stat.h
>> @@ -266,12 +266,12 @@ static inline void __pv_kick(int cpu)
>> /*
>> * Replacement function for pv_wait()
>> */
>> -static inline void __pv_wait(u8 *ptr, u8 val)
>> +static inline void __pv_wait(u8 *ptr, u8 val, int lockcpu)
>> {
>> u64 *pkick_time = this_cpu_ptr(&pv_kick_time);
>>
>> *pkick_time = 0;
>> - pv_wait(ptr, val);
>> + pv_wait(ptr, val, lockcpu);
>> if (*pkick_time) {
>> this_cpu_add(qstats[qstat_pv_latency_wake],
>> sched_clock() - *pkick_time);
>> @@ -279,8 +279,8 @@ static inline void __pv_wait(u8 *ptr, u8 val)
>> }
>> }
>>
>> -#define pv_kick(c) __pv_kick(c)
>> -#define pv_wait(p, v) __pv_wait(p, v)
>> +#define pv_kick(c) __pv_kick(c)
>> +#define pv_wait(p, v, c) __pv_wait(p, v, c)
>>
>> #else /* CONFIG_QUEUED_LOCK_STAT */
>>
>

2016-04-14 18:47:36

by Waiman Long

[permalink] [raw]
Subject: Re: [PATCH] locking/pvqspinlock: Add lock holder CPU argument to pv_wait()

On 04/14/2016 10:34 AM, Pan Xinhui wrote:
> hello, Waiman
> I try your patch, thanks!
>
> also I do some improvement.
> below code diff has been tested, it works for me. :)
>
> diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
> index ce2f75e..99f31e4 100644
> --- a/kernel/locking/qspinlock.c
> +++ b/kernel/locking/qspinlock.c
> @@ -248,7 +248,8 @@ static __always_inline void set_locked(struct qspinlock *lock)
> */
>
> static __always_inline void __pv_init_node(struct mcs_spinlock *node) { }
> -static __always_inline void __pv_wait_node(struct mcs_spinlock *node,
> +static __always_inline void __pv_wait_node(struct qspinlock *lock,
> + struct mcs_spinlock *node,
> struct mcs_spinlock *prev) { }
> static __always_inline void __pv_kick_node(struct qspinlock *lock,
> struct mcs_spinlock *node) { }
> @@ -407,7 +408,7 @@ queue:
> prev = decode_tail(old);
> WRITE_ONCE(prev->next, node);
>
> - pv_wait_node(node, prev);
> + pv_wait_node(lock, node, prev);
> arch_mcs_spin_lock_contended(&node->locked);
>
> /*
> diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
> index 01a6d16..75ccfd3 100644
> --- a/kernel/locking/qspinlock_paravirt.h
> +++ b/kernel/locking/qspinlock_paravirt.h
> @@ -255,6 +257,19 @@ static struct pv_node *pv_unhash(struct qspinlock *lock)
> BUG();
> }
>
> +static struct pv_node *pv_hash_lookup(struct qspinlock *lock)
> +{
> + unsigned long offset, hash = hash_ptr(lock, pv_lock_hash_bits);
> + struct pv_hash_entry *he;
> +
> + for_each_hash_entry(he, offset, hash) {
> + if (READ_ONCE(he->lock) == lock) {
> + return he->node;
> + }
> + }
> + return NULL;
> +}
> +
> /*
> * Return true if when it is time to check the previous node which is not
> * in a running state.
> @@ -287,14 +303,17 @@ static void pv_init_node(struct mcs_spinlock *node)
> * pv_kick_node() is used to set _Q_SLOW_VAL and fill in hash table on its
> * behalf.
> */
> -static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev)
> +static void pv_wait_node(struct qspinlock *lock, struct mcs_spinlock *node,
> + struct mcs_spinlock *prev)
> {
> struct pv_node *pn = (struct pv_node *)node;
> struct pv_node *pp = (struct pv_node *)prev;
> + struct pv_node *ph;
> int waitcnt = 0;
> int loop;
> bool wait_early;
>
> + pn->prev_cpu = pp->cpu;
> /* waitcnt processing will be compiled out if !QUEUED_LOCK_STAT */
> for (;; waitcnt++) {
> for (wait_early = false, loop = SPIN_THRESHOLD; loop; loop--) {
> @@ -322,7 +341,10 @@ static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev)
> qstat_inc(qstat_pv_wait_node, true);
> qstat_inc(qstat_pv_wait_again, waitcnt);
> qstat_inc(qstat_pv_wait_early, wait_early);
> - pv_wait(&pn->state, vcpu_halted);
> + ph = pv_hash_lookup(lock);
> + if (!ph)
> + ph = pp;
> + pv_wait(&pn->state, vcpu_halted, ph->prev_cpu);
> }
>
> /*
>
> any comments are welcome. I put my patch here just for simplicity, or need I send it out in a new thread?
>
> thanks
> xinhui
>
>

I have sent out a v2 patch that incorporate your change with some minor
twists.

Cheers,
Longman