From: Waiman Long
To: Peter Zijlstra, Ingo Molnar, Thomas Gleixner, "H. Peter Anvin"
Cc: x86@kernel.org, linux-kernel@vger.kernel.org, Scott J Norton,
	Douglas Hatch, Davidlohr Bueso, Waiman Long
Subject: [PATCH v5 3/6] locking/pvqspinlock: Collect slowpath lock statistics
Date: Fri, 7 Aug 2015 23:17:58 -0400
Message-Id: <1439003881-17349-4-git-send-email-Waiman.Long@hp.com>
X-Mailer: git-send-email 1.7.1
In-Reply-To: <1439003881-17349-1-git-send-email-Waiman.Long@hp.com>
References: <1439003881-17349-1-git-send-email-Waiman.Long@hp.com>

This patch enables the accumulation of kicking and waiting related
PV qspinlock statistics when the new QUEUED_LOCK_STAT configuration
option is selected. It also enables the collection of kicking and
wakeup latencies which have a heavy dependency on the CPUs being used.

The measured latencies for different CPUs are:

	CPU		Wakeup		Kicking
	---		------		-------
	Haswell-EX	89.8us		 7.4us
	Westmere-EX	67.6us		 9.3us

The measured latencies varied a bit from run-to-run. The wakeup
latency is much higher than the kicking latency.

A sample of statistics counts after a kernel build (no CPU overcommit)
was:

hash_hops_count=576912
kick_latencies=5258025484
kick_unlock_count=576911
kick_wait_count=576903
pending_fail_count=10722
pending_lock_count=6123545
spurious_wakeup=92
wait_again_count=75
wait_head_count=60
wait_node_count=576936
wake_latencies=37061460652
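
Using the formulas documented in the code below (avg kick latency =
pv_kick_latencies/kick_unlock_count, avg wake latency =
pv_wake_latencies/kick_wait_count, avg hops/hash =
hash_hops_count/kick_unlock_count), the sample counts above work out
to roughly 9.1us average kick latency, 64.2us average wakeup latency
and about 1.0 hash hop per pv_hash() call.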
+ source "arch/x86/xen/Kconfig" config KVM_GUEST diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h index 94f9adf..5eb5dea 100644 --- a/kernel/locking/qspinlock_paravirt.h +++ b/kernel/locking/qspinlock_paravirt.h @@ -49,6 +49,151 @@ struct pv_node { }; /* + * PV qspinlock statistics + */ +enum pv_qlock_stat { + pvstat_wait_head, + pvstat_wait_node, + pvstat_wait_again, + pvstat_kick_wait, + pvstat_kick_unlock, + pvstat_pend_lock, + pvstat_pend_fail, + pvstat_spurious, + pvstat_hops, + pvstat_num /* Total number of statistics counts */ +}; + +#ifdef CONFIG_QUEUED_LOCK_STAT +/* + * Collect pvqspinlock statiatics + */ +#include +#include + +static const char * const stat_fsnames[pvstat_num] = { + [pvstat_wait_head] = "wait_head_count", + [pvstat_wait_node] = "wait_node_count", + [pvstat_wait_again] = "wait_again_count", + [pvstat_kick_wait] = "kick_wait_count", + [pvstat_kick_unlock] = "kick_unlock_count", + [pvstat_pend_lock] = "pending_lock_count", + [pvstat_pend_fail] = "pending_fail_count", + [pvstat_spurious] = "spurious_wakeup", + [pvstat_hops] = "hash_hops_count", +}; + +static atomic_t pvstats[pvstat_num]; + +/* + * pv_kick_latencies = sum of all pv_kick latencies in ns + * pv_wake_latencies = sum of all wakeup latencies in ns + * + * Avg kick latency = pv_kick_latencies/kick_unlock_count + * Avg wake latency = pv_wake_latencies/kick_wait_count + * Avg # of hops/hash = hash_hops_count/kick_unlock_count + */ +static atomic64_t pv_kick_latencies, pv_wake_latencies; +static DEFINE_PER_CPU(u64, pv_kick_time); + +/* + * Reset all the statistics counts if set + */ +static bool reset_cnts __read_mostly; + +/* + * Initialize debugfs for the PV qspinlock statistics + */ +static int __init pv_qspinlock_debugfs(void) +{ + struct dentry *d_pvqlock = debugfs_create_dir("pv-qspinlock", NULL); + int i; + + if (!d_pvqlock) + pr_warn("Could not create 'pv-qspinlock' debugfs directory\n"); + + for (i = 0; i < pvstat_num; i++) + debugfs_create_u32(stat_fsnames[i], 0444, d_pvqlock, + (u32 *)&pvstats[i]); + debugfs_create_u64("kick_latencies", 0444, d_pvqlock, + (u64 *)&pv_kick_latencies); + debugfs_create_u64("wake_latencies", 0444, d_pvqlock, + (u64 *)&pv_wake_latencies); + debugfs_create_bool("reset_cnts", 0644, d_pvqlock, (u32 *)&reset_cnts); + return 0; +} +fs_initcall(pv_qspinlock_debugfs); + +/* + * Reset all the counts + */ +static noinline void pvstat_reset(void) +{ + int i; + + for (i = 0; i < pvstat_num; i++) + atomic_set(&pvstats[i], 0); + atomic64_set(&pv_kick_latencies, 0); + atomic64_set(&pv_wake_latencies, 0); + reset_cnts = 0; +} + +/* + * Increment the PV qspinlock statistics counts + */ +static inline void pvstat_inc(enum pv_qlock_stat stat) +{ + atomic_inc(&pvstats[stat]); + if (unlikely(reset_cnts)) + pvstat_reset(); +} + +/* + * PV hash hop count + */ +static inline void pvstat_hop(int hopcnt) +{ + atomic_add(hopcnt, &pvstats[pvstat_hops]); +} + +/* + * Replacement function for pv_kick() + */ +static inline void __pv_kick(int cpu) +{ + u64 start = sched_clock(); + + *per_cpu_ptr(&pv_kick_time, cpu) = start; + pv_kick(cpu); + atomic64_add(sched_clock() - start, &pv_kick_latencies); +} + +/* + * Replacement function for pv_wait() + */ +static inline void __pv_wait(u8 *ptr, u8 val) +{ + u64 *pkick_time = this_cpu_ptr(&pv_kick_time); + + *pkick_time = 0; + pv_wait(ptr, val); + if (*pkick_time) { + atomic64_add(sched_clock() - *pkick_time, &pv_wake_latencies); + pvstat_inc(pvstat_kick_wait); + } +} + +#define pv_kick(c) __pv_kick(c) +#define pv_wait(p, 
+#define pv_wait(p, v)	__pv_wait(p, v)
+
+#else /* CONFIG_QUEUED_LOCK_STAT */
+
+static inline void pvstat_inc(enum pv_qlock_stat stat)	{ }
+static inline void pvstat_hop(int hopcnt)		{ }
+
+#endif /* CONFIG_QUEUED_LOCK_STAT */
+
+/*
  * Lock and MCS node addresses hash table for fast lookup
  *
  * Hashing is done on a per-cacheline basis to minimize the need to access
@@ -108,10 +253,13 @@ static struct qspinlock **pv_hash(struct qspinlock *lock, struct pv_node *node)
 {
 	unsigned long offset, hash = hash_ptr(lock, pv_lock_hash_bits);
 	struct pv_hash_entry *he;
+	int hopcnt = 0;
 
 	for_each_hash_entry(he, offset, hash) {
+		hopcnt++;
 		if (!cmpxchg(&he->lock, NULL, lock)) {
 			WRITE_ONCE(he->node, node);
+			pvstat_hop(hopcnt);
 			return &he->lock;
 		}
 	}
@@ -212,6 +360,7 @@ static int pv_pending_lock(struct qspinlock *lock, u32 val)
 		val = smp_load_acquire(&lock->val.counter);
 		if (!(val & _Q_LOCKED_MASK)) {
 			clear_pending_set_locked(lock);
+			pvstat_inc(pvstat_pend_lock);
 			goto gotlock;
 		}
 	}
@@ -221,11 +370,13 @@ static int pv_pending_lock(struct qspinlock *lock, u32 val)
 	 * pending bit set, so clear the pending bit and fall back to queuing.
 	 */
 	clear_pending(lock);
+	pvstat_inc(pvstat_pend_fail);
 
 queue:
 	return 0;
 
 gotlock:
+	pvstat_inc(pvstat_pend_lock);
 	return 1;
 }
 
@@ -237,9 +388,10 @@ gotlock:
 static void pv_wait_node(struct mcs_spinlock *node)
 {
 	struct pv_node *pn = (struct pv_node *)node;
+	int waitcnt = 0;
 	int loop;
 
-	for (;;) {
+	for (;; waitcnt++) {
 		for (loop = SPIN_THRESHOLD; loop; loop--) {
 			if (READ_ONCE(node->locked))
 				return;
@@ -257,15 +409,22 @@ static void pv_wait_node(struct mcs_spinlock *node)
 		 */
 		smp_store_mb(pn->state, vcpu_halted);
 
-		if (!READ_ONCE(node->locked))
+		if (!READ_ONCE(node->locked)) {
+			pvstat_inc(pvstat_wait_node);
+			if (waitcnt)
+				pvstat_inc(pvstat_wait_again);
 			pv_wait(&pn->state, vcpu_halted);
+		}
 
 		/*
-		 * If pv_kick_node() changed us to vcpu_hashed, retain that value
-		 * so that pv_wait_head() knows to not also try to hash this lock.
+		 * If pv_kick_node() changed us to vcpu_hashed, retain that
+		 * value so that pv_wait_head() knows to not also try to hash
+		 * this lock.
 		 */
 		cmpxchg(&pn->state, vcpu_halted, vcpu_running);
 
+		if (READ_ONCE(node->locked))
+			break;
 		/*
 		 * If the locked flag is still not set after wakeup, it is a
 		 * spurious wakeup and the vCPU should wait again. However,
@@ -273,6 +432,7 @@
 		 * So it is better to spin for a while in the hope that the
 		 * MCS lock will be released soon.
 		 */
+		pvstat_inc(pvstat_spurious);
 	}
 
 	/*
@@ -323,6 +483,7 @@ static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node)
 	struct pv_node *pn = (struct pv_node *)node;
 	struct __qspinlock *l = (void *)lock;
 	struct qspinlock **lp = NULL;
+	int waitcnt = 0;
 	int loop;
 
 	/*
@@ -332,7 +493,7 @@ static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node)
 	if (READ_ONCE(pn->state) == vcpu_hashed)
 		lp = (struct qspinlock **)1;
 
-	for (;;) {
+	for (;; waitcnt++) {
 		for (loop = SPIN_THRESHOLD; loop; loop--) {
 			if (!READ_ONCE(l->locked))
 				return;
@@ -366,14 +527,20 @@ static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node)
 				return;
 			}
 		}
+		pvstat_inc(pvstat_wait_head);
+		if (waitcnt)
+			pvstat_inc(pvstat_wait_again);
 		pv_wait(&l->locked, _Q_SLOW_VAL);
+		if (!READ_ONCE(l->locked))
+			return;
 
 		/*
 		 * The unlocker should have freed the lock before kicking the
 		 * CPU. So if the lock is still not free, it is a spurious
 		 * wakeup and so the vCPU should wait again after spinning for
 		 * a while.
		 */
+		pvstat_inc(pvstat_spurious);
 	}
 
 	/*
@@ -437,6 +604,7 @@ __visible void __pv_queued_spin_unlock(struct qspinlock *lock)
 	 * vCPU is harmless other than the additional latency in completing
 	 * the unlock.
 	 */
+	pvstat_inc(pvstat_kick_unlock);
 	pv_kick(node->cpu);
 }
 
 /*
-- 
1.7.1
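
For reference, the counters and latency sums exported above can be
turned back into average latencies from user space. A minimal reader
sketch (illustrative only, not part of the patch; it assumes debugfs
is mounted at /sys/kernel/debug and uses the file names created by
pv_qspinlock_debugfs()):

/* pvqstat.c - print average PV qspinlock kick/wakeup latencies. */
#include <stdio.h>

static unsigned long long read_stat(const char *name)
{
	char path[128];
	unsigned long long val = 0;
	FILE *f;

	/* Hypothetical mount point for debugfs. */
	snprintf(path, sizeof(path),
		 "/sys/kernel/debug/pv-qspinlock/%s", name);
	f = fopen(path, "r");
	if (!f)
		return 0;
	if (fscanf(f, "%llu", &val) != 1)
		val = 0;
	fclose(f);
	return val;
}

int main(void)
{
	unsigned long long kicks = read_stat("kick_unlock_count");
	unsigned long long waits = read_stat("kick_wait_count");
	unsigned long long klat  = read_stat("kick_latencies");
	unsigned long long wlat  = read_stat("wake_latencies");
	unsigned long long hops  = read_stat("hash_hops_count");

	if (kicks) {
		printf("avg kick latency : %llu ns\n", klat / kicks);
		printf("avg hops per hash: %llu.%02llu\n",
		       hops / kicks, (hops % kicks) * 100 / kicks);
	}
	if (waits)
		printf("avg wake latency : %llu ns\n", wlat / waits);
	return 0;
}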