2013-06-09 19:37:08

by Paul E. McKenney

Subject: [PATCH RFC ticketlock] Auto-queued ticketlock

Breaking up locks is better than implementing high-contention locks, but
if we must have high-contention locks, why not make them automatically
switch between light-weight ticket locks at low contention and queued
locks at high contention?

This commit therefore allows ticket locks to automatically switch between
pure ticketlock and queued-lock operation as needed. If too many CPUs
are spinning on a given ticket lock, a queue structure will be allocated
and the lock will switch to queued-lock operation. When the lock becomes
free, it will switch back into ticketlock operation. The low-order bit
of the head counter is used to indicate that the lock is in queued mode,
which forces an unconditional mismatch between the head and tail counters.
This approach means that the common-case code path under conditions of
low contention is very nearly that of a plain ticket lock.
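
For concreteness, here is a minimal user-space sketch of the head/tail
encoding described above. The types and helper names are illustrative
only; the arch_spinlock_t code in the patch below is authoritative.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct tickets {
        uint16_t head, tail;
};

static bool lock_is_queued(struct tickets t)
{
        return t.head & 0x1;     /* low-order bit of head marks queued mode */
}

static bool lock_is_free(struct tickets t)
{
        return t.head == t.tail; /* never true while the queued bit is set */
}

int main(void)
{
        struct tickets t = { .head = 4, .tail = 4 };  /* free, ticket mode */

        t.tail += 2;    /* acquisitions bump the ticket values by 2, not 1 */
        printf("held: queued=%d free=%d\n", lock_is_queued(t), lock_is_free(t));

        t.head |= 0x1;  /* switching to queued mode sets the low-order bit */
        printf("queued: free=%d (head can no longer match tail)\n",
               lock_is_free(t));
        return 0;
}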

A fixed number of queueing structures is statically allocated in an
array. The ticket-lock address is used to hash into an initial element,
but if that element is already in use, it moves to the next element. If
the entire array is already in use, continue to spin in ticket mode.
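
Purely for orientation, a stand-alone sketch of that hash-and-probe slot
assignment follows. It is single-threaded and uses made-up names; the
real code claims a slot with cmpxchg() on tkt_q_heads[i].ref, as shown
in tkt_q_start_contend() below.

#include <stdint.h>
#include <stdio.h>

#define NQUEUES 8                /* the patch sizes this from NR_CPUS */

struct qhead {
        uintptr_t ref;           /* 0 means "slot free" */
};
static struct qhead qheads[NQUEUES];

static int hash_lock(uintptr_t lockaddr)
{
        return (int)((lockaddr >> 8) % NQUEUES);  /* crude hash, as in the patch */
}

/* Return a slot index, or -1 if every slot is busy, in which case the
 * caller just keeps spinning in plain ticket mode. */
static int claim_slot(uintptr_t lockaddr)
{
        int start = hash_lock(lockaddr);
        int i = start;

        do {
                if (qheads[i].ref == 0) {         /* patch: cmpxchg() here */
                        qheads[i].ref = lockaddr;
                        return i;
                }
                i = (i + 1 < NQUEUES) ? i + 1 : 0; /* linear probe with wrap */
        } while (i != start);
        return -1;
}

int main(void)
{
        printf("lock at 0x1234500 -> slot %d\n", claim_slot(0x1234500));
        printf("lock at 0x1234600 -> slot %d\n", claim_slot(0x1234600));
        return 0;
}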

This has been only lightly tested in the kernel, though a userspace
implementation has survived substantial testing.

Signed-off-by: Paul E. McKenney <[email protected]>

diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h
index 33692ea..b4a91b0 100644
--- a/arch/x86/include/asm/spinlock.h
+++ b/arch/x86/include/asm/spinlock.h
@@ -34,6 +34,8 @@
# define UNLOCK_LOCK_PREFIX
#endif

+#ifndef CONFIG_TICKET_LOCK_QUEUED
+
/*
* Ticket locks are conceptually two parts, one indicating the current head of
* the queue, and the other indicating the current tail. The lock is acquired
@@ -62,6 +64,25 @@ static __always_inline void __ticket_spin_lock(arch_spinlock_t *lock)
barrier(); /* make sure nothing creeps before the lock is taken */
}

+#else /* #ifndef CONFIG_TICKET_LOCK_QUEUED */
+
+bool tkt_spin_pass(arch_spinlock_t *ap, struct __raw_tickets inc);
+
+static __always_inline void __ticket_spin_lock(arch_spinlock_t *lock)
+{
+ register struct __raw_tickets inc = { .tail = 2 };
+
+ inc = xadd(&lock->tickets, inc);
+ for (;;) {
+ if (inc.head == inc.tail || tkt_spin_pass(lock, inc))
+ break;
+ inc.head = ACCESS_ONCE(lock->tickets.head);
+ }
+ barrier(); /* smp_mb() on Power or ARM. */
+}
+
+#endif /* #else #ifndef CONFIG_TICKET_LOCK_QUEUED */
+
static __always_inline int __ticket_spin_trylock(arch_spinlock_t *lock)
{
arch_spinlock_t old, new;
@@ -70,17 +91,37 @@ static __always_inline int __ticket_spin_trylock(arch_spinlock_t *lock)
if (old.tickets.head != old.tickets.tail)
return 0;

+#ifndef CONFIG_TICKET_LOCK_QUEUED
new.head_tail = old.head_tail + (1 << TICKET_SHIFT);
+#else /* #ifndef CONFIG_TICKET_LOCK_QUEUED */
+ new.head_tail = old.head_tail + (2 << TICKET_SHIFT);
+#endif /* #else #ifndef CONFIG_TICKET_LOCK_QUEUED */

/* cmpxchg is a full barrier, so nothing can move before it */
return cmpxchg(&lock->head_tail, old.head_tail, new.head_tail) == old.head_tail;
}

+#ifndef CONFIG_TICKET_LOCK_QUEUED
+
static __always_inline void __ticket_spin_unlock(arch_spinlock_t *lock)
{
__add(&lock->tickets.head, 1, UNLOCK_LOCK_PREFIX);
}

+#else /* #ifndef CONFIG_TICKET_LOCK_QUEUED */
+
+extern void tkt_q_do_wake(arch_spinlock_t *asp);
+
+static __always_inline void __ticket_spin_unlock(arch_spinlock_t *lock)
+{
+ __ticket_t head = 2;
+
+ head = xadd(&lock->tickets.head, 2);
+ if (head & 0x1)
+ tkt_q_do_wake(lock);
+}
+#endif /* #else #ifndef CONFIG_TICKET_LOCK_QUEUED */
+
static inline int __ticket_spin_is_locked(arch_spinlock_t *lock)
{
struct __raw_tickets tmp = ACCESS_ONCE(lock->tickets);
diff --git a/arch/x86/include/asm/spinlock_types.h b/arch/x86/include/asm/spinlock_types.h
index ad0ad07..cdaefdd 100644
--- a/arch/x86/include/asm/spinlock_types.h
+++ b/arch/x86/include/asm/spinlock_types.h
@@ -7,12 +7,18 @@

#include <linux/types.h>

-#if (CONFIG_NR_CPUS < 256)
+#if (CONFIG_NR_CPUS < 128)
typedef u8 __ticket_t;
typedef u16 __ticketpair_t;
-#else
+#define TICKET_T_CMP_GE(a, b) (UCHAR_MAX / 2 >= (unsigned char)((a) - (b)))
+#elif (CONFIG_NR_CPUS < 32768)
typedef u16 __ticket_t;
typedef u32 __ticketpair_t;
+#define TICKET_T_CMP_GE(a, b) (USHRT_MAX / 2 >= (unsigned short)((a) - (b)))
+#else
+typedef u32 __ticket_t;
+typedef u64 __ticketpair_t;
+#define TICKET_T_CMP_GE(a, b) (UINT_MAX / 2 >= (unsigned int)((a) - (b)))
#endif

#define TICKET_SHIFT (sizeof(__ticket_t) * 8)
@@ -21,7 +27,11 @@ typedef struct arch_spinlock {
union {
__ticketpair_t head_tail;
struct __raw_tickets {
+#ifdef __BIG_ENDIAN__
+ __ticket_t tail, head;
+#else /* #ifdef __BIG_ENDIAN__ */
__ticket_t head, tail;
+#endif /* #else #ifdef __BIG_ENDIAN__ */
} tickets;
};
} arch_spinlock_t;
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index e9ef6d6..816a87c 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -15,6 +15,7 @@
#include <asm/byteorder.h>
#include <uapi/linux/kernel.h>

+#define UCHAR_MAX ((u8)(~0U))
#define USHRT_MAX ((u16)(~0U))
#define SHRT_MAX ((s16)(USHRT_MAX>>1))
#define SHRT_MIN ((s16)(-SHRT_MAX - 1))
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
index 44511d1..ad9c67c 100644
--- a/kernel/Kconfig.locks
+++ b/kernel/Kconfig.locks
@@ -223,3 +223,21 @@ endif
config MUTEX_SPIN_ON_OWNER
def_bool y
depends on SMP && !DEBUG_MUTEXES
+
+config TICKET_LOCK_QUEUED
+ bool "Dynamically switch between ticket and queued locking"
+ default n
+ ---help---
+ Enable dynamic switching between ticketlock and queued locking
+ on a per-lock basis. This option will slow down low-contention
+ acquisition and release very slightly (additional conditional
+ in release path), but will provide more efficient operation at
+ high levels of lock contention. High-contention operation will
+ not be quite as efficient as would be a pure queued lock, but
+ this dynamic approach consumes less memory than queued locks
+ and also runs faster at low levels of contention.
+
+ Say "Y" if you are running on a large system with a workload
+ that is likely to result in high levels of contention.
+
+ Say "N" if you are unsure.
diff --git a/kernel/Makefile b/kernel/Makefile
index 271fd31..70a91f7 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -51,6 +51,7 @@ endif
obj-$(CONFIG_SMP) += spinlock.o
obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
obj-$(CONFIG_PROVE_LOCKING) += spinlock.o
+obj-$(CONFIG_TICKET_LOCK_QUEUED) += tktqlock.o
obj-$(CONFIG_UID16) += uid16.o
obj-$(CONFIG_MODULES) += module.o
obj-$(CONFIG_MODULE_SIG) += module_signing.o modsign_pubkey.o modsign_certificate.o
diff --git a/kernel/tktqlock.c b/kernel/tktqlock.c
new file mode 100644
index 0000000..f01b760
--- /dev/null
+++ b/kernel/tktqlock.c
@@ -0,0 +1,333 @@
+/*
+ * Queued ticket spinlocks.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright IBM Corporation, 2013
+ *
+ * Authors: Paul E. McKenney <[email protected]>
+ */
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/spinlock.h>
+#include <linux/smp.h>
+#include <linux/percpu.h>
+
+struct tkt_q {
+ int cpu;
+ __ticket_t tail;
+ struct tkt_q *next;
+};
+
+struct tkt_q_head {
+ arch_spinlock_t *ref; /* Pointer to spinlock if in use. */
+ s32 head_tkt; /* Head ticket when started queuing. */
+ struct tkt_q *spin; /* Head of queue. */
+ struct tkt_q **spin_tail; /* Tail of queue. */
+};
+
+/*
+ * TKT_Q_SWITCH is twice the number of CPUs that must be spinning on a
+ * given ticket lock to motivate switching to spinning on a queue.
+ * The reason that it is twice the number is because the bottom bit of
+ * the ticket is reserved for the bit that indicates that a queue is
+ * associated with the lock.
+ */
+#define TKT_Q_SWITCH (16 * 2)
+
+/*
+ * TKT_Q_NQUEUES is the number of queues to maintain. Large systems
+ * might have multiple highly contended locks, so provide more queues for
+ * systems with larger numbers of CPUs.
+ */
+#define TKT_Q_NQUEUES (DIV_ROUND_UP(NR_CPUS + TKT_Q_SWITCH - 1, TKT_Q_SWITCH) * 2)
+
+/* The queues themselves. */
+struct tkt_q_head tkt_q_heads[TKT_Q_NQUEUES];
+
+/* Advance to the next queue slot, wrapping around to the beginning. */
+static int tkt_q_next_slot(int i)
+{
+ return (++i < TKT_Q_NQUEUES) ? i : 0;
+}
+
+/* Very crude hash from lock address to queue slot number. */
+static unsigned long tkt_q_hash(arch_spinlock_t *asp)
+{
+ return (((unsigned long)asp) >> 8) % TKT_Q_NQUEUES;
+}
+
+/*
+ * Return a pointer to the queue header associated with the specified lock,
+ * or return NULL if there is no queue for the lock or if the lock's queue
+ * is in transition.
+ */
+static struct tkt_q_head *tkt_q_find_head(arch_spinlock_t *asp)
+{
+ int i;
+ int start;
+
+ start = i = tkt_q_hash(asp);
+ do
+ if (tkt_q_heads[i].ref == asp)
+ return &tkt_q_heads[i];
+ while ((i = tkt_q_next_slot(i)) != start);
+ return NULL;
+}
+
+/*
+ * Try to stop queuing, reverting back to normal ticket-lock operation.
+ * We can only stop queuing when the queue is empty, which means that
+ * we need to correctly handle races where someone shows up in the queue
+ * just as we are trying to dispense with the queue. They win, we lose.
+ */
+static bool tkt_q_try_unqueue(arch_spinlock_t *asp, struct tkt_q_head *tqhp)
+{
+ arch_spinlock_t asold;
+ arch_spinlock_t asnew;
+
+ /* Pick up the ticket values. */
+ asold = ACCESS_ONCE(*asp);
+ if ((asold.tickets.head & ~0x1) == asold.tickets.tail) {
+
+ /* Attempt to mark the lock as not having a queue. */
+ asnew = asold;
+ asnew.tickets.head &= ~0x1;
+ if (cmpxchg(&asp->head_tail,
+ asold.head_tail,
+ asnew.head_tail) == asold.head_tail) {
+
+ /* Succeeded, mark the queue as unused. */
+ ACCESS_ONCE(tqhp->ref) = NULL;
+ return true;
+ }
+ }
+
+ /* Failed, tell the caller there is still a queue to pass off to. */
+ return false;
+}
+
+/*
+ * Hand the lock off to the first CPU on the queue.
+ */
+void tkt_q_do_wake(arch_spinlock_t *asp)
+{
+ struct tkt_q_head *tqhp;
+ struct tkt_q *tqp;
+
+ /* If the queue is still being set up, wait for it. */
+ while ((tqhp = tkt_q_find_head(asp)) == NULL)
+ cpu_relax();
+
+ for (;;) {
+
+ /* Find the first queue element. */
+ tqp = ACCESS_ONCE(tqhp->spin);
+ if (tqp != NULL)
+ break; /* Element exists, hand off lock. */
+ if (tkt_q_try_unqueue(asp, tqhp))
+ return; /* No element, successfully removed queue. */
+ cpu_relax();
+ }
+ if (ACCESS_ONCE(tqhp->head_tkt) != -1)
+ ACCESS_ONCE(tqhp->head_tkt) = -1;
+ smp_mb(); /* Order pointer fetch and assignment against handoff. */
+ ACCESS_ONCE(tqp->cpu) = -1;
+}
+
+/*
+ * Given a lock that already has a queue associated with it, spin on
+ * that queue. Return false if there was no queue (which means we do not
+ * hold the lock) and true otherwise (meaning we -do- hold the lock).
+ */
+bool tkt_q_do_spin(arch_spinlock_t *asp, struct __raw_tickets inc)
+{
+ struct tkt_q **oldtail;
+ struct tkt_q tq;
+ struct tkt_q_head *tqhp;
+
+ /*
+ * Ensure that accesses to queue header happen after sensing
+ * the lock's have-queue bit.
+ */
+ smp_mb(); /* See above block comment. */
+
+ /* If there no longer is a queue, leave. */
+ tqhp = tkt_q_find_head(asp);
+ if (tqhp == NULL)
+ return false;
+
+ /* Initialize our queue element. */
+ tq.cpu = raw_smp_processor_id();
+ tq.tail = inc.tail;
+ tq.next = NULL;
+
+ /* Check to see if we already hold the lock. */
+ if (ACCESS_ONCE(tqhp->head_tkt) == inc.tail) {
+ /* The last holder left before queue formed, we hold lock. */
+ tqhp->head_tkt = -1;
+ return true;
+ }
+
+ /* Add our element to the tail of the queue. */
+ oldtail = xchg(&tqhp->spin_tail, &tq.next);
+ ACCESS_ONCE(*oldtail) = &tq;
+
+ /* Spin until handoff. */
+ while (ACCESS_ONCE(tq.cpu) != -1)
+ cpu_relax();
+
+ /*
+ * Remove our element from the queue. If the queue is now empty,
+ * update carefully so that the next acquisition will queue itself
+ * at the head of the list.
+ */
+ if (tq.next == NULL) {
+
+ /* Mark the queue empty. */
+ tqhp->spin = NULL;
+
+ /* Try to point the tail back at the head. */
+ if (cmpxchg(&tqhp->spin_tail,
+ &tq.next,
+ &tqhp->spin) == &tq.next)
+ return true; /* Succeeded, queue is now empty. */
+
+ /* Failed, if needed, wait for the enqueue to complete. */
+ while (tq.next == NULL)
+ cpu_relax();
+
+ /* The following code will repair the head. */
+ }
+ smp_mb(); /* Force ordering between handoff and critical section. */
+
+ /* Advance list-head pointer. */
+ ACCESS_ONCE(tqhp->spin) = tq.next;
+ return true;
+}
+
+/*
+ * Given a lock that does not have a queue, attempt to associate the
+ * i-th queue with it, returning true if successful (meaning we hold
+ * the lock) or false otherwise (meaning we do -not- hold the lock).
+ * Note that the caller has already filled in ->ref with 0x1, so we
+ * own the queue.
+ */
+static bool
+tkt_q_init_contend(int i, arch_spinlock_t *asp, struct __raw_tickets inc)
+{
+ arch_spinlock_t asold;
+ arch_spinlock_t asnew;
+ struct tkt_q_head *tqhp;
+
+ /* Initialize the i-th queue header. */
+ tqhp = &tkt_q_heads[i];
+ tqhp->spin = NULL;
+ tqhp->spin_tail = &tqhp->spin;
+
+ /* Each pass through this loop attempts to mark the lock as queued. */
+ do {
+ asold.head_tail = ACCESS_ONCE(asp->head_tail);
+ asnew = asold;
+ if (asnew.tickets.head & 0x1) {
+
+ /* Someone beat us to it, back out. */
+ smp_mb();
+ ACCESS_ONCE(tqhp->ref) = NULL;
+
+ /* Spin on the queue element they set up. */
+ return tkt_q_do_spin(asp, inc);
+ }
+
+ /* The low-order bit in the head counter says "queued". */
+ asnew.tickets.head |= 0x1;
+ } while (cmpxchg(&asp->head_tail,
+ asold.head_tail,
+ asnew.head_tail) != asold.head_tail);
+
+ /* Point the queue at the lock and go spin on it. */
+ tqhp->head_tkt = asold.tickets.head;
+ smp_mb(); /* Ensure head_tkt is set prior to queuers seeing tqhp. */
+ ACCESS_ONCE(tqhp->ref) = asp;
+ return tkt_q_do_spin(asp, inc);
+}
+
+/*
+ * Start handling a period of high contention by finding a queue to associate
+ * with this lock. Returns true if successful (in which case we hold the
+ * lock) and false otherwise (in which case we do -not- hold the lock).
+ */
+bool tkt_q_start_contend(arch_spinlock_t *asp, struct __raw_tickets inc)
+{
+ int i;
+ int start;
+
+ /* Hash the lock address to find a starting point. */
+ start = i = tkt_q_hash(asp);
+
+ /*
+ * Each pass through the following loop attempts to associate
+ * the lock with the corresponding queue.
+ */
+ do {
+ /*
+ * Use 0x1 to mark the queue in use, but also avoiding
+ * any spinners trying to use it before we get it all
+ * initialized.
+ */
+ if (cmpxchg(&tkt_q_heads[i].ref,
+ NULL,
+ (arch_spinlock_t *)0x1) == NULL) {
+
+ /* Succeeded, now go initialize it. */
+ return tkt_q_init_contend(i, asp, inc);
+ }
+
+ /* If someone beat us to it, go spin on their queue. */
+ if (ACCESS_ONCE(asp->tickets.head) & 0x1)
+ return tkt_q_do_spin(asp, inc);
+ } while ((i = tkt_q_next_slot(i)) != start);
+
+ /* All the queues are in use, revert to spinning on the ticket lock. */
+ return false;
+}
+
+bool tkt_spin_pass(arch_spinlock_t *ap, struct __raw_tickets inc)
+{
+ if (unlikely(inc.head & 0x1)) {
+
+ /* This lock has a queue, so go spin on the queue. */
+ if (tkt_q_do_spin(ap, inc))
+ return true;
+
+ /* Get here if the queue is in transition: Retry next time. */
+
+ } else if (TICKET_T_CMP_GE(ACCESS_ONCE(ap->tickets.tail) - TKT_Q_SWITCH,
+ ACCESS_ONCE(ap->tickets.head))) {
+
+ /*
+ * This lock has lots of spinners, but no queue.
+ * Go create a queue to spin on.
+ */
+ if (tkt_q_start_contend(ap, inc))
+ return true;
+
+ /* Get here if the queue is in transition: Retry next time. */
+ }
+
+ /* Either no need for a queue or the queue is in transition. Spin. */
+ cpu_relax();
+ return false;
+}


2013-06-10 20:48:04

by Steven Rostedt

Subject: Re: [PATCH RFC ticketlock] Auto-queued ticketlock

On Sun, 2013-06-09 at 12:36 -0700, Paul E. McKenney wrote:

> --- a/arch/x86/include/asm/spinlock_types.h
> +++ b/arch/x86/include/asm/spinlock_types.h
> @@ -7,12 +7,18 @@
>
> #include <linux/types.h>
>
> -#if (CONFIG_NR_CPUS < 256)
> +#if (CONFIG_NR_CPUS < 128)
> typedef u8 __ticket_t;
> typedef u16 __ticketpair_t;
> -#else
> +#define TICKET_T_CMP_GE(a, b) (UCHAR_MAX / 2 >= (unsigned char)((a) - (b)))
> +#elif (CONFIG_NR_CPUS < 32768)
> typedef u16 __ticket_t;
> typedef u32 __ticketpair_t;
> +#define TICKET_T_CMP_GE(a, b) (USHRT_MAX / 2 >= (unsigned short)((a) - (b)))
> +#else
> +typedef u32 __ticket_t;
> +typedef u64 __ticketpair_t;
> +#define TICKET_T_CMP_GE(a, b) (UINT_MAX / 2 >= (unsigned int)((a) - (b)))
> #endif
>
> #define TICKET_SHIFT (sizeof(__ticket_t) * 8)
> @@ -21,7 +27,11 @@ typedef struct arch_spinlock {
> union {
> __ticketpair_t head_tail;
> struct __raw_tickets {
> +#ifdef __BIG_ENDIAN__

Is there such a thing as a BIG_ENDIAN x86 box? This is in
arch/x86/include/asm/spinlock_types.h

-- Steve

> + __ticket_t tail, head;
> +#else /* #ifdef __BIG_ENDIAN__ */
> __ticket_t head, tail;
> +#endif /* #else #ifdef __BIG_ENDIAN__ */
> } tickets;
> };
> } arch_spinlock_t;

2013-06-10 20:59:02

by Paul E. McKenney

Subject: Re: [PATCH RFC ticketlock] Auto-queued ticketlock

On Mon, Jun 10, 2013 at 04:47:58PM -0400, Steven Rostedt wrote:
> On Sun, 2013-06-09 at 12:36 -0700, Paul E. McKenney wrote:
>
> > --- a/arch/x86/include/asm/spinlock_types.h
> > +++ b/arch/x86/include/asm/spinlock_types.h
> > @@ -7,12 +7,18 @@
> >
> > #include <linux/types.h>
> >
> > -#if (CONFIG_NR_CPUS < 256)
> > +#if (CONFIG_NR_CPUS < 128)
> > typedef u8 __ticket_t;
> > typedef u16 __ticketpair_t;
> > -#else
> > +#define TICKET_T_CMP_GE(a, b) (UCHAR_MAX / 2 >= (unsigned char)((a) - (b)))
> > +#elif (CONFIG_NR_CPUS < 32768)
> > typedef u16 __ticket_t;
> > typedef u32 __ticketpair_t;
> > +#define TICKET_T_CMP_GE(a, b) (USHRT_MAX / 2 >= (unsigned short)((a) - (b)))
> > +#else
> > +typedef u32 __ticket_t;
> > +typedef u64 __ticketpair_t;
> > +#define TICKET_T_CMP_GE(a, b) (UINT_MAX / 2 >= (unsigned int)((a) - (b)))
> > #endif
> >
> > #define TICKET_SHIFT (sizeof(__ticket_t) * 8)
> > @@ -21,7 +27,11 @@ typedef struct arch_spinlock {
> > union {
> > __ticketpair_t head_tail;
> > struct __raw_tickets {
> > +#ifdef __BIG_ENDIAN__
>
> Is there such a thing as a BIG_ENDIAN x86 box? This is in
> arch/x86/include/asm/spinlock_types.h

Nope. Preparation work for moving this to common code.

Good point though, I should have skipped this #ifdef until I got everything
into common code.

Thanx, Paul

> -- Steve
>
> > + __ticket_t tail, head;
> > +#else /* #ifdef __BIG_ENDIAN__ */
> > __ticket_t head, tail;
> > +#endif /* #else #ifdef __BIG_ENDIAN__ */
> > } tickets;
> > };
> > } arch_spinlock_t;
>
>

2013-06-10 21:02:06

by Thomas Gleixner

Subject: Re: [PATCH RFC ticketlock] Auto-queued ticketlock

On Mon, 10 Jun 2013, Steven Rostedt wrote:
> On Sun, 2013-06-09 at 12:36 -0700, Paul E. McKenney wrote:
> > +#ifdef __BIG_ENDIAN__
>
> Is there such a thing as a BIG_ENDIAN x86 box? This is in
> arch/x86/include/asm/spinlock_types.h

That's just a habit for people who have been forced to deal with BE
CPUs.

The sad thing is that BE CPUs have been designed by folks who blindly
expanded BCD computing without understanding the hardware
implications.

Unfortunately they managed to inflict BE on the network protocols,
which in turn provided the only excuse for keeping the BE nonsense
alive.

Thanks,

tglx

2013-06-10 21:08:30

by Steven Rostedt

Subject: Re: [PATCH RFC ticketlock] Auto-queued ticketlock

On Sun, 2013-06-09 at 12:36 -0700, Paul E. McKenney wrote:
>
> +#else /* #ifndef CONFIG_TICKET_LOCK_QUEUED */
> +
> +bool tkt_spin_pass(arch_spinlock_t *ap, struct __raw_tickets inc);
> +
> +static __always_inline void __ticket_spin_lock(arch_spinlock_t *lock)
> +{
> + register struct __raw_tickets inc = { .tail = 2 };
> +
> + inc = xadd(&lock->tickets, inc);
> + for (;;) {
> + if (inc.head == inc.tail || tkt_spin_pass(lock, inc))
> + break;
> + inc.head = ACCESS_ONCE(lock->tickets.head);
> + }
> + barrier(); /* smp_mb() on Power or ARM. */
> +}
> +
> +#endif /* #else #ifndef CONFIG_TICKET_LOCK_QUEUED */
> +

To avoid the above code duplication, I would have this instead:

#ifdef CONFIG_TICKET_LOCK_QUEUED

bool tkt_spin_pass(arch_spinlock_t *ap, struct __raw_tickets inc);
#define __TKT_SPIN_INC 2

#else

static inline bool tkt_spin_pass(arch_spinlock_t *ap, struct __raw_tickets inc)
{
return false;
}

#define __TKT_SPIN_INC 1

#endif

static __always_inline void __ticket_spin_lock(arch_spinlock_t *lock)
{
register struct __raw_tickets inc = { .tail = __TKT_SPIN_INC };

inc = xadd(&lock->tickets, inc);

for (;;) {
if (inc.head == inc.tail || tkt_spin_pass(lock, inc))
break;
cpu_relax();
inc.head = ACCESS_ONCE(lock->tickets.head);
}
barrier(); /* make sure nothing creeps before the lock is taken */
}

-- Steve

2013-06-10 21:15:34

by Paul E. McKenney

Subject: Re: [PATCH RFC ticketlock] Auto-queued ticketlock

On Mon, Jun 10, 2013 at 11:01:50PM +0200, Thomas Gleixner wrote:
> On Mon, 10 Jun 2013, Steven Rostedt wrote:
> > On Sun, 2013-06-09 at 12:36 -0700, Paul E. McKenney wrote:
> > > +#ifdef __BIG_ENDIAN__
> >
> > Is there such a thing as a BIG_ENDIAN x86 box? This is in
> > arch/x86/include/asm/spinlock_types.h
>
> That's just a habit for people who have been forced to deal with BE
> CPUs.
>
> The sad thing is that BE CPUs have been designed by folks who blindly
> expanded BCD computing without understanding the hardware
> implications.
>
> Unfortunately they managed to inflict BE on the network protocols,
> which in turn provided the only excuse for keeping the BE nonsense
> alive.

Hey, we had to do -something-, after all, EBCDIC lost out to ASCII!!! ;-)

Thanx, Paul

2013-06-10 21:30:37

by Paul E. McKenney

Subject: Re: [PATCH RFC ticketlock] Auto-queued ticketlock

On Mon, Jun 10, 2013 at 05:08:25PM -0400, Steven Rostedt wrote:
> On Sun, 2013-06-09 at 12:36 -0700, Paul E. McKenney wrote:
> >
> > +#else /* #ifndef CONFIG_TICKET_LOCK_QUEUED */
> > +
> > +bool tkt_spin_pass(arch_spinlock_t *ap, struct __raw_tickets inc);
> > +
> > +static __always_inline void __ticket_spin_lock(arch_spinlock_t *lock)
> > +{
> > + register struct __raw_tickets inc = { .tail = 2 };
> > +
> > + inc = xadd(&lock->tickets, inc);
> > + for (;;) {
> > + if (inc.head == inc.tail || tkt_spin_pass(lock, inc))
> > + break;
> > + inc.head = ACCESS_ONCE(lock->tickets.head);
> > + }
> > + barrier(); /* smp_mb() on Power or ARM. */
> > +}
> > +
> > +#endif /* #else #ifndef CONFIG_TICKET_LOCK_QUEUED */
> > +
>
> To avoid the above code duplication, I would have this instead:

Nice! I have updated accordingly.

Thanx, Paul

> #ifdef CONFIG_TICKET_LOCK_QUEUED
>
> bool tkt_spin_pass(arch_spinlock_t *ap, struct __raw_tickets inc);
> #define __TKT_SPIN_INC 2
>
> #else
>
> static inline bool tkt_spin_pass(arch_spinlock_t *ap, struct __raw_tickets inc)
> {
> return false;
> }
>
> #define __TKT_SPIN_INC 1
>
> #endif
>
> static __always_inline void __ticket_spin_lock(arch_spinlock_t *lock)
> {
> register struct __raw_tickets inc = { .tail = __TKT_SPIN_INC };
>
> inc = xadd(&lock->tickets, inc);
>
> for (;;) {
> if (inc.head == inc.tail || tkt_spin_pass(lock, inc))
> break;
> cpu_relax();
> inc.head = ACCESS_ONCE(lock->tickets.head);
> }
> barrier(); /* make sure nothing creeps before the lock is taken */
> }
>
> -- Steve
>
>

2013-06-10 21:35:15

by Eric Dumazet

Subject: Re: [PATCH RFC ticketlock] Auto-queued ticketlock

On Sun, 2013-06-09 at 12:36 -0700, Paul E. McKenney wrote:
> Breaking up locks is better than implementing high-contention locks, but
> if we must have high-contention locks, why not make them automatically
> switch between light-weight ticket locks at low contention and queued
> locks at high contention?
>
> This commit therefore allows ticket locks to automatically switch between
> pure ticketlock and queued-lock operation as needed. If too many CPUs
> are spinning on a given ticket lock, a queue structure will be allocated
> and the lock will switch to queued-lock operation. When the lock becomes
> free, it will switch back into ticketlock operation. The low-order bit
> of the head counter is used to indicate that the lock is in queued mode,
> which forces an unconditional mismatch between the head and tail counters.
> This approach means that the common-case code path under conditions of
> low contention is very nearly that of a plain ticket lock.
>
> A fixed number of queueing structures is statically allocated in an
> array. The ticket-lock address is used to hash into an initial element,
> but if that element is already in use, it moves to the next element. If
> the entire array is already in use, continue to spin in ticket mode.
>
> This has been only lightly tested in the kernel, though a userspace
> implementation has survived substantial testing.
>
> Signed-off-by: Paul E. McKenney <[email protected]>
>

This looks a great idea ;)

> +
> +static __always_inline void __ticket_spin_unlock(arch_spinlock_t *lock)
> +{
> + __ticket_t head = 2;
> +
> + head = xadd(&lock->tickets.head, 2);

head = xadd(&lock->tickets.head, head);

> + if (head & 0x1)
> + tkt_q_do_wake(lock);
> +}
> +#endif /* #else #ifndef CONFIG_TICKET_LOCK_QUEUED */

> + */
> +void tkt_q_do_wake(arch_spinlock_t *asp)
> +{
> + struct tkt_q_head *tqhp;
> + struct tkt_q *tqp;
> +
> + /* If the queue is still being set up, wait for it. */
> + while ((tqhp = tkt_q_find_head(asp)) == NULL)
> + cpu_relax();
> +
> + for (;;) {
> +
> + /* Find the first queue element. */
> + tqp = ACCESS_ONCE(tqhp->spin);
> + if (tqp != NULL)
> + break; /* Element exists, hand off lock. */
> + if (tkt_q_try_unqueue(asp, tqhp))
> + return; /* No element, successfully removed queue. */
> + cpu_relax();
> + }
> + if (ACCESS_ONCE(tqhp->head_tkt) != -1)
> + ACCESS_ONCE(tqhp->head_tkt) = -1;
> + smp_mb(); /* Order pointer fetch and assignment against handoff. */
> + ACCESS_ONCE(tqp->cpu) = -1;
> +}

EXPORT_SYMBOL(tkt_q_do_wake) ?

Hmm, unfortunately I lack time this week to fully read the patch !


2013-06-10 21:54:40

by Paul E. McKenney

Subject: Re: [PATCH RFC ticketlock] Auto-queued ticketlock

On Mon, Jun 10, 2013 at 02:35:06PM -0700, Eric Dumazet wrote:
> On Sun, 2013-06-09 at 12:36 -0700, Paul E. McKenney wrote:
> > Breaking up locks is better than implementing high-contention locks, but
> > if we must have high-contention locks, why not make them automatically
> > switch between light-weight ticket locks at low contention and queued
> > locks at high contention?
> >
> > This commit therefore allows ticket locks to automatically switch between
> > pure ticketlock and queued-lock operation as needed. If too many CPUs
> > are spinning on a given ticket lock, a queue structure will be allocated
> > and the lock will switch to queued-lock operation. When the lock becomes
> > free, it will switch back into ticketlock operation. The low-order bit
> > of the head counter is used to indicate that the lock is in queued mode,
> > which forces an unconditional mismatch between the head and tail counters.
> > This approach means that the common-case code path under conditions of
> > low contention is very nearly that of a plain ticket lock.
> >
> > A fixed number of queueing structures is statically allocated in an
> > array. The ticket-lock address is used to hash into an initial element,
> > but if that element is already in use, it moves to the next element. If
> > the entire array is already in use, continue to spin in ticket mode.
> >
> > This has been only lightly tested in the kernel, though a userspace
> > implementation has survived substantial testing.
> >
> > Signed-off-by: Paul E. McKenney <[email protected]>
>
> This looks a great idea ;)

Glad you like it! Hopefully workloads like it as well. ;-)

> > +
> > +static __always_inline void __ticket_spin_unlock(arch_spinlock_t *lock)
> > +{
> > + __ticket_t head = 2;
> > +
> > + head = xadd(&lock->tickets.head, 2);
>
> head = xadd(&lock->tickets.head, head);

Yikes! Good catch, fixed.

> > + if (head & 0x1)
> > + tkt_q_do_wake(lock);
> > +}
> > +#endif /* #else #ifndef CONFIG_TICKET_LOCK_QUEUED */
>
> > + */
> > +void tkt_q_do_wake(arch_spinlock_t *asp)
> > +{
> > + struct tkt_q_head *tqhp;
> > + struct tkt_q *tqp;
> > +
> > + /* If the queue is still being set up, wait for it. */
> > + while ((tqhp = tkt_q_find_head(asp)) == NULL)
> > + cpu_relax();
> > +
> > + for (;;) {
> > +
> > + /* Find the first queue element. */
> > + tqp = ACCESS_ONCE(tqhp->spin);
> > + if (tqp != NULL)
> > + break; /* Element exists, hand off lock. */
> > + if (tkt_q_try_unqueue(asp, tqhp))
> > + return; /* No element, successfully removed queue. */
> > + cpu_relax();
> > + }
> > + if (ACCESS_ONCE(tqhp->head_tkt) != -1)
> > + ACCESS_ONCE(tqhp->head_tkt) = -1;
> > + smp_mb(); /* Order pointer fetch and assignment against handoff. */
> > + ACCESS_ONCE(tqp->cpu) = -1;
> > +}
>
> EXPORT_SYMBOL(tkt_q_do_wake) ?

Good point, just in case we want to use spinlocks in modules. ;-)
Same for tkt_spin_pass(), I guess.

> Hmm, unfortunately I lack time this week to fully read the patch !

I suspect that there is very little danger of this patch going in this
week, so you should have some additional time. ;-)

Thanx, Paul

2013-06-10 23:03:00

by Steven Rostedt

Subject: Re: [PATCH RFC ticketlock] Auto-queued ticketlock

On Sun, 2013-06-09 at 12:36 -0700, Paul E. McKenney wrote:

> +/*
> + * Return a pointer to the queue header associated with the specified lock,
> + * or return NULL if there is no queue for the lock or if the lock's queue
> + * is in transition.
> + */
> +static struct tkt_q_head *tkt_q_find_head(arch_spinlock_t *asp)

BTW, what does "asp" mean? arch_spinlock? If so, can we just call it
"lock" and be consistent with all the other spinlock calls in the
kernel. Because, I keep thinking this has something to do with Microsoft
dynamic web pages.

-- Steve

> +{
> + int i;
> + int start;
> +
> + start = i = tkt_q_hash(asp);
> + do
> + if (tkt_q_heads[i].ref == asp)
> + return &tkt_q_heads[i];
> + while ((i = tkt_q_next_slot(i)) != start);
> + return NULL;
> +}
> +

2013-06-11 00:23:08

by Paul E. McKenney

Subject: Re: [PATCH RFC ticketlock] Auto-queued ticketlock

On Mon, Jun 10, 2013 at 07:02:56PM -0400, Steven Rostedt wrote:
> On Sun, 2013-06-09 at 12:36 -0700, Paul E. McKenney wrote:
>
> > +/*
> > + * Return a pointer to the queue header associated with the specified lock,
> > + * or return NULL if there is no queue for the lock or if the lock's queue
> > + * is in transition.
> > + */
> > +static struct tkt_q_head *tkt_q_find_head(arch_spinlock_t *asp)
>
> BTW, what does "asp" mean? arch_spinlock?

"arch_spinlock pointer", but yes. Or I suppose a millenia-late warning
to Cleopatra.

> If so, can we just call it
> "lock" and be consistent with all the other spinlock calls in the
> kernel. Because, I keep thinking this has something to do with Microsoft
> dynamic web pages.

Fair enough!

Thanx, Paul

> -- Steve
>
> > +{
> > + int i;
> > + int start;
> > +
> > + start = i = tkt_q_hash(asp);
> > + do
> > + if (tkt_q_heads[i].ref == asp)
> > + return &tkt_q_heads[i];
> > + while ((i = tkt_q_next_slot(i)) != start);
> > + return NULL;
> > +}
> > +
>
>

2013-06-11 00:44:45

by Steven Rostedt

Subject: Re: [PATCH RFC ticketlock] Auto-queued ticketlock

On Sun, 2013-06-09 at 12:36 -0700, Paul E. McKenney wrote:

> +#else /* #ifndef CONFIG_TICKET_LOCK_QUEUED */
> +
> +bool tkt_spin_pass(arch_spinlock_t *ap, struct __raw_tickets inc);
> +
> +static __always_inline void __ticket_spin_lock(arch_spinlock_t *lock)
> +{
> + register struct __raw_tickets inc = { .tail = 2 };
> +
> + inc = xadd(&lock->tickets, inc);
> + for (;;) {
> + if (inc.head == inc.tail || tkt_spin_pass(lock, inc))
> + break;
> + inc.head = ACCESS_ONCE(lock->tickets.head);
> + }
> + barrier(); /* smp_mb() on Power or ARM. */
> +}
> +
> +#endif /* #else #ifndef CONFIG_TICKET_LOCK_QUEUED */
> +
> static __always_inline int __ticket_spin_trylock(arch_spinlock_t *lock)
> {
> arch_spinlock_t old, new;
> @@ -70,17 +91,37 @@ static __always_inline int __ticket_spin_trylock(arch_spinlock_t *lock)
> if (old.tickets.head != old.tickets.tail)
> return 0;
>
> +#ifndef CONFIG_TICKET_LOCK_QUEUED
> new.head_tail = old.head_tail + (1 << TICKET_SHIFT);
> +#else /* #ifndef CONFIG_TICKET_LOCK_QUEUED */
> + new.head_tail = old.head_tail + (2 << TICKET_SHIFT);
> +#endif /* #else #ifndef CONFIG_TICKET_LOCK_QUEUED */
>
> /* cmpxchg is a full barrier, so nothing can move before it */
> return cmpxchg(&lock->head_tail, old.head_tail, new.head_tail) == old.head_tail;
> }
>
> +#ifndef CONFIG_TICKET_LOCK_QUEUED
> +
> static __always_inline void __ticket_spin_unlock(arch_spinlock_t *lock)
> {
> __add(&lock->tickets.head, 1, UNLOCK_LOCK_PREFIX);
> }
>
> +#else /* #ifndef CONFIG_TICKET_LOCK_QUEUED */
> +
> +extern void tkt_q_do_wake(arch_spinlock_t *asp);
> +
> +static __always_inline void __ticket_spin_unlock(arch_spinlock_t *lock)
> +{
> + __ticket_t head = 2;
> +
> + head = xadd(&lock->tickets.head, 2);
> + if (head & 0x1)
> + tkt_q_do_wake(lock);
> +}
> +#endif /* #else #ifndef CONFIG_TICKET_LOCK_QUEUED */
> +
> static inline int __ticket_spin_is_locked(arch_spinlock_t *lock)
> {
> struct __raw_tickets tmp = ACCESS_ONCE(lock->tickets);
> diff --git a/arch/x86/include/asm/spinlock_types.h b/arch/x86/include/asm/spinlock_types.h
> index ad0ad07..cdaefdd 100644
> --- a/arch/x86/include/asm/spinlock_types.h
> +++ b/arch/x86/include/asm/spinlock_types.h
> @@ -7,12 +7,18 @@
>
> #include <linux/types.h>
>
> -#if (CONFIG_NR_CPUS < 256)
> +#if (CONFIG_NR_CPUS < 128)
> typedef u8 __ticket_t;
> typedef u16 __ticketpair_t;
> -#else
> +#define TICKET_T_CMP_GE(a, b) (UCHAR_MAX / 2 >= (unsigned char)((a) - (b)))
> +#elif (CONFIG_NR_CPUS < 32768)
> typedef u16 __ticket_t;
> typedef u32 __ticketpair_t;
> +#define TICKET_T_CMP_GE(a, b) (USHRT_MAX / 2 >= (unsigned short)((a) - (b)))
> +#else
> +typedef u32 __ticket_t;
> +typedef u64 __ticketpair_t;
> +#define TICKET_T_CMP_GE(a, b) (UINT_MAX / 2 >= (unsigned int)((a) - (b)))
> #endif
>
> #define TICKET_SHIFT (sizeof(__ticket_t) * 8)
> @@ -21,7 +27,11 @@ typedef struct arch_spinlock {
> union {
> __ticketpair_t head_tail;
> struct __raw_tickets {
> +#ifdef __BIG_ENDIAN__
> + __ticket_t tail, head;
> +#else /* #ifdef __BIG_ENDIAN__ */
> __ticket_t head, tail;
> +#endif /* #else #ifdef __BIG_ENDIAN__ */
> } tickets;
> };
> } arch_spinlock_t;
> diff --git a/include/linux/kernel.h b/include/linux/kernel.h
> index e9ef6d6..816a87c 100644
> --- a/include/linux/kernel.h
> +++ b/include/linux/kernel.h
> @@ -15,6 +15,7 @@
> #include <asm/byteorder.h>
> #include <uapi/linux/kernel.h>
>
> +#define UCHAR_MAX ((u8)(~0U))
> #define USHRT_MAX ((u16)(~0U))
> #define SHRT_MAX ((s16)(USHRT_MAX>>1))
> #define SHRT_MIN ((s16)(-SHRT_MAX - 1))
> diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
> index 44511d1..ad9c67c 100644
> --- a/kernel/Kconfig.locks
> +++ b/kernel/Kconfig.locks
> @@ -223,3 +223,21 @@ endif
> config MUTEX_SPIN_ON_OWNER
> def_bool y
> depends on SMP && !DEBUG_MUTEXES
> +
> +config TICKET_LOCK_QUEUED
> + bool "Dynamically switch between ticket and queued locking"
> + default n
> + ---help---
> + Enable dynamic switching between ticketlock and queued locking
> + on a per-lock basis. This option will slow down low-contention
> + acquisition and release very slightly (additional conditional
> + in release path), but will provide more efficient operation at
> + high levels of lock contention. High-contention operation will
> + not be quite as efficient as would be a pure queued lock, but
> + this dynamic approach consumes less memory than queued locks
> + and also runs faster at low levels of contention.
> +
> + Say "Y" if you are running on a large system with a workload
> + that is likely to result in high levels of contention.
> +
> + Say "N" if you are unsure.
> diff --git a/kernel/Makefile b/kernel/Makefile
> index 271fd31..70a91f7 100644
> --- a/kernel/Makefile
> +++ b/kernel/Makefile
> @@ -51,6 +51,7 @@ endif
> obj-$(CONFIG_SMP) += spinlock.o
> obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
> obj-$(CONFIG_PROVE_LOCKING) += spinlock.o
> +obj-$(CONFIG_TICKET_LOCK_QUEUED) += tktqlock.o
> obj-$(CONFIG_UID16) += uid16.o
> obj-$(CONFIG_MODULES) += module.o
> obj-$(CONFIG_MODULE_SIG) += module_signing.o modsign_pubkey.o modsign_certificate.o
> diff --git a/kernel/tktqlock.c b/kernel/tktqlock.c
> new file mode 100644
> index 0000000..f01b760
> --- /dev/null
> +++ b/kernel/tktqlock.c
> @@ -0,0 +1,333 @@
> +/*
> + * Queued ticket spinlocks.
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program; if not, write to the Free Software
> + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
> + *
> + * Copyright IBM Corporation, 2013
> + *
> + * Authors: Paul E. McKenney <[email protected]>
> + */
> +#include <linux/types.h>
> +#include <linux/kernel.h>
> +#include <linux/spinlock.h>
> +#include <linux/smp.h>
> +#include <linux/percpu.h>
> +
> +struct tkt_q {
> + int cpu;
> + __ticket_t tail;
> + struct tkt_q *next;
> +};
> +
> +struct tkt_q_head {
> + arch_spinlock_t *ref; /* Pointer to spinlock if in use. */
> + s32 head_tkt; /* Head ticket when started queuing. */
> + struct tkt_q *spin; /* Head of queue. */
> + struct tkt_q **spin_tail; /* Tail of queue. */
> +};
> +
> +/*
> + * TKT_Q_SWITCH is twice the number of CPUs that must be spinning on a
> + * given ticket lock to motivate switching to spinning on a queue.
> + * The reason that it is twice the number is because the bottom bit of
> + * the ticket is reserved for the bit that indicates that a queue is
> + * associated with the lock.
> + */
> +#define TKT_Q_SWITCH (16 * 2)
> +
> +/*
> + * TKT_Q_NQUEUES is the number of queues to maintain. Large systems
> + * might have multiple highly contended locks, so provide more queues for
> + * systems with larger numbers of CPUs.
> + */
> +#define TKT_Q_NQUEUES (DIV_ROUND_UP(NR_CPUS + TKT_Q_SWITCH - 1, TKT_Q_SWITCH) * 2)
> +
> +/* The queues themselves. */
> +struct tkt_q_head tkt_q_heads[TKT_Q_NQUEUES];
> +
> +/* Advance to the next queue slot, wrapping around to the beginning. */
> +static int tkt_q_next_slot(int i)
> +{
> + return (++i < TKT_Q_NQUEUES) ? i : 0;
> +}
> +
> +/* Very crude hash from lock address to queue slot number. */
> +static unsigned long tkt_q_hash(arch_spinlock_t *asp)
> +{
> + return (((unsigned long)asp) >> 8) % TKT_Q_NQUEUES;
> +}
> +
> +/*
> + * Return a pointer to the queue header associated with the specified lock,
> + * or return NULL if there is no queue for the lock or if the lock's queue
> + * is in transition.
> + */
> +static struct tkt_q_head *tkt_q_find_head(arch_spinlock_t *asp)
> +{
> + int i;
> + int start;
> +
> + start = i = tkt_q_hash(asp);
> + do
> + if (tkt_q_heads[i].ref == asp)
> + return &tkt_q_heads[i];
> + while ((i = tkt_q_next_slot(i)) != start);
> + return NULL;
> +}
> +
> +/*
> + * Try to stop queuing, reverting back to normal ticket-lock operation.
> + * We can only stop queuing when the queue is empty, which means that
> + * we need to correctly handle races where someone shows up in the queue
> + * just as we are trying to dispense with the queue. They win, we lose.
> + */
> +static bool tkt_q_try_unqueue(arch_spinlock_t *asp, struct tkt_q_head *tqhp)
> +{
> + arch_spinlock_t asold;
> + arch_spinlock_t asnew;
> +
> + /* Pick up the ticket values. */
> + asold = ACCESS_ONCE(*asp);
> + if ((asold.tickets.head & ~0x1) == asold.tickets.tail) {
> +
> + /* Attempt to mark the lock as not having a queue. */
> + asnew = asold;
> + asnew.tickets.head &= ~0x1;
> + if (cmpxchg(&asp->head_tail,
> + asold.head_tail,
> + asnew.head_tail) == asold.head_tail) {
> +
> + /* Succeeded, mark the queue as unused. */
> + ACCESS_ONCE(tqhp->ref) = NULL;
> + return true;
> + }
> + }
> +
> + /* Failed, tell the caller there is still a queue to pass off to. */
> + return false;
> +}
> +
> +/*
> + * Hand the lock off to the first CPU on the queue.
> + */
> +void tkt_q_do_wake(arch_spinlock_t *asp)
> +{
> + struct tkt_q_head *tqhp;
> + struct tkt_q *tqp;
> +
> + /* If the queue is still being set up, wait for it. */
> + while ((tqhp = tkt_q_find_head(asp)) == NULL)
> + cpu_relax();
> +
> + for (;;) {
> +
> + /* Find the first queue element. */
> + tqp = ACCESS_ONCE(tqhp->spin);
> + if (tqp != NULL)
> + break; /* Element exists, hand off lock. */
> + if (tkt_q_try_unqueue(asp, tqhp))
> + return; /* No element, successfully removed queue. */
> + cpu_relax();
> + }
> + if (ACCESS_ONCE(tqhp->head_tkt) != -1)
> + ACCESS_ONCE(tqhp->head_tkt) = -1;
> + smp_mb(); /* Order pointer fetch and assignment against handoff. */
> + ACCESS_ONCE(tqp->cpu) = -1;
> +}
> +
> +/*
> + * Given a lock that already has a queue associated with it, spin on
> + * that queue. Return false if there was no queue (which means we do not
> + * hold the lock) and true otherwise (meaning we -do- hold the lock).
> + */
> +bool tkt_q_do_spin(arch_spinlock_t *asp, struct __raw_tickets inc)
> +{
> + struct tkt_q **oldtail;
> + struct tkt_q tq;
> + struct tkt_q_head *tqhp;
> +
> + /*
> + * Ensure that accesses to queue header happen after sensing
> + * the lock's have-queue bit.
> + */
> + smp_mb(); /* See above block comment. */
> +
> + /* If there no longer is a queue, leave. */
> + tqhp = tkt_q_find_head(asp);
> + if (tqhp == NULL)
> + return false;
> +
> + /* Initialize our queue element. */
> + tq.cpu = raw_smp_processor_id();
> + tq.tail = inc.tail;
> + tq.next = NULL;
> +
> + /* Check to see if we already hold the lock. */
> + if (ACCESS_ONCE(tqhp->head_tkt) == inc.tail) {
> + /* The last holder left before queue formed, we hold lock. */
> + tqhp->head_tkt = -1;
> + return true;
> + }
> +
> + /* Add our element to the tail of the queue. */
> + oldtail = xchg(&tqhp->spin_tail, &tq.next);

Boy this is tricky code! I thought I found a race window here, but as I
went to write my email saying "Gotcha!" I found that it wasn't a race
after all. But as I went through the effort of writing this, I figured I
would send this out as documentation for others to see. Hmm, I wonder if
we can use this email to add more comments. Anyway, here's what I
thought was wrong ;-)


OK, I originally thought there was a race window here. Let's say that an
NMI hit right here, and it happens to be a big one, where lots of things
can happen on other CPUs right now.

The scenario is that there's just one item on the queue, which is
waiting for the lock to be released, and is spinning below in the:

while (ACCESS_ONCE(tq.cpu) != -1)
cpu_relax();

And then the lock is released, where in tkt_q_do_wake() the following is
called:

ACCESS_ONCE(tqp->cpu) = -1;

Now the old queued task is released. But its tq->next hasn't been set
yet, and is still NULL. It leaves by doing:

ACCESS_ONCE(tqhp->spin) = tq.next;
return true;

All before this task gets to set *oldtail to &tq. But, I then looked
below...


> + ACCESS_ONCE(*oldtail) = &tq;
> +
> + /* Spin until handoff. */
> + while (ACCESS_ONCE(tq.cpu) != -1)
> + cpu_relax();
> +
> + /*
> + * Remove our element from the queue. If the queue is now empty,
> + * update carefully so that the next acquisition will queue itself
> + * at the head of the list.
> + */
> + if (tq.next == NULL) {

This checks for that scenario. Because if the old task were to come out
spinning, the problem would only be if it was the last one on the list,
and its tq.next was NULL. But if that was the case, then we set spin to
NULL and do the next trick, where I thought I gotcha again...


> +
> + /* Mark the queue empty. */
> + tqhp->spin = NULL;
> +
> + /* Try to point the tail back at the head. */
> + if (cmpxchg(&tqhp->spin_tail,
> + &tq.next,
> + &tqhp->spin) == &tq.next)

Here, I was thinking, oh wait, what happens if this is called right
before the xchg() above? Then we would set spin_tail but not update the
old tq.next. But wait! Look at what we assign spin_tail to. It's the
address of spin, which would be what oldtail would point to above, and
then above would set spin to the new tq!
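
For anyone following from the sidelines, the shape of that enqueue/repair
pattern can be restated as a stand-alone C11-atomics sketch. The names
(node, waitq, enqueue, dequeue_last) are made up for illustration; this is
not the patch's code.

#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>

struct node {
        _Atomic(struct node *) next;
};

struct waitq {
        _Atomic(struct node *) head;            /* plays the role of tqhp->spin */
        _Atomic(_Atomic(struct node *) *) tail; /* tqhp->spin_tail, starts as &head */
};

static void enqueue(struct waitq *q, struct node *n)
{
        atomic_store(&n->next, NULL);
        /* Claim the tail slot first... */
        _Atomic(struct node *) *old = atomic_exchange(&q->tail, &n->next);
        /* ...then publish the link.  Between these two steps a dequeuer can
         * still see the predecessor's ->next as NULL, which is the window
         * the tq.next == NULL handling in the patch has to tolerate. */
        atomic_store(old, n);
}

/* Removing the last element: empty the head, then try to point the tail
 * back at &q->head; if the compare-and-swap fails, someone is mid-enqueue
 * and the remover must wait for their ->next store to appear. */
static bool dequeue_last(struct waitq *q, struct node *n)
{
        atomic_store(&q->head, NULL);
        _Atomic(struct node *) *expect = &n->next;
        return atomic_compare_exchange_strong(&q->tail, &expect, &q->head);
}

int main(void)
{
        struct waitq q = { .head = NULL };
        struct node n = { .next = NULL };

        atomic_init(&q.tail, &q.head);
        enqueue(&q, &n);
        return dequeue_last(&q, &n) ? 0 : 1;    /* succeeds: queue empty again */
}

When the compare-and-swap succeeds, the tail ends up pointing at the head
pointer itself, so the next enqueuer's publish store lands directly in the
head slot, which is the resolution worked out above.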

OK, I haven't found an issue here yet, but youss are beiing trickssy! We
don't like trickssy, and we must find precccciouss!!!


This code is starting to make me look like Gollum :-p

-- Steve

> + return true; /* Succeeded, queue is now empty. */
> +
> + /* Failed, if needed, wait for the enqueue to complete. */
> + while (tq.next == NULL)
> + cpu_relax();
> +
> + /* The following code will repair the head. */
> + }
> + smp_mb(); /* Force ordering between handoff and critical section. */
> +
> + /* Advance list-head pointer. */
> + ACCESS_ONCE(tqhp->spin) = tq.next;
> + return true;
> +}
> +
> +/*
> + * Given a lock that does not have a queue, attempt to associate the
> + * i-th queue with it, returning true if successful (meaning we hold
> + * the lock) or false otherwise (meaning we do -not- hold the lock).
> + * Note that the caller has already filled in ->ref with 0x1, so we
> + * own the queue.
> + */
> +static bool
> +tkt_q_init_contend(int i, arch_spinlock_t *asp, struct __raw_tickets inc)
> +{
> + arch_spinlock_t asold;
> + arch_spinlock_t asnew;
> + struct tkt_q_head *tqhp;
> +
> + /* Initialize the i-th queue header. */
> + tqhp = &tkt_q_heads[i];
> + tqhp->spin = NULL;
> + tqhp->spin_tail = &tqhp->spin;
> +
> + /* Each pass through this loop attempts to mark the lock as queued. */
> + do {
> + asold.head_tail = ACCESS_ONCE(asp->head_tail);
> + asnew = asold;
> + if (asnew.tickets.head & 0x1) {
> +
> + /* Someone beat us to it, back out. */
> + smp_mb();
> + ACCESS_ONCE(tqhp->ref) = NULL;
> +
> + /* Spin on the queue element they set up. */
> + return tkt_q_do_spin(asp, inc);
> + }
> +
> + /* The low-order bit in the head counter says "queued". */
> + asnew.tickets.head |= 0x1;
> + } while (cmpxchg(&asp->head_tail,
> + asold.head_tail,
> + asnew.head_tail) != asold.head_tail);
> +
> + /* Point the queue at the lock and go spin on it. */
> + tqhp->head_tkt = asold.tickets.head;
> + smp_mb(); /* Ensure head_tkt is set prior to queuers seeing tqhp. */
> + ACCESS_ONCE(tqhp->ref) = asp;
> + return tkt_q_do_spin(asp, inc);
> +}
> +
> +/*
> + * Start handling a period of high contention by finding a queue to associate
> + * with this lock. Returns true if successful (in which case we hold the
> + * lock) and false otherwise (in which case we do -not- hold the lock).
> + */
> +bool tkt_q_start_contend(arch_spinlock_t *asp, struct __raw_tickets inc)
> +{
> + int i;
> + int start;
> +
> + /* Hash the lock address to find a starting point. */
> + start = i = tkt_q_hash(asp);
> +
> + /*
> + * Each pass through the following loop attempts to associate
> + * the lock with the corresponding queue.
> + */
> + do {
> + /*
> + * Use 0x1 to mark the queue in use, but also avoiding
> + * any spinners trying to use it before we get it all
> + * initialized.
> + */
> + if (cmpxchg(&tkt_q_heads[i].ref,
> + NULL,
> + (arch_spinlock_t *)0x1) == NULL) {
> +
> + /* Succeeded, now go initialize it. */
> + return tkt_q_init_contend(i, asp, inc);
> + }
> +
> + /* If someone beat us to it, go spin on their queue. */
> + if (ACCESS_ONCE(asp->tickets.head) & 0x1)
> + return tkt_q_do_spin(asp, inc);
> + } while ((i = tkt_q_next_slot(i)) != start);
> +
> + /* All the queues are in use, revert to spinning on the ticket lock. */
> + return false;
> +}
> +
> +bool tkt_spin_pass(arch_spinlock_t *ap, struct __raw_tickets inc)
> +{
> + if (unlikely(inc.head & 0x1)) {
> +
> + /* This lock has a queue, so go spin on the queue. */
> + if (tkt_q_do_spin(ap, inc))
> + return true;
> +
> + /* Get here if the queue is in transition: Retry next time. */
> +
> + } else if (TICKET_T_CMP_GE(ACCESS_ONCE(ap->tickets.tail) - TKT_Q_SWITCH,
> + ACCESS_ONCE(ap->tickets.head))) {
> +
> + /*
> + * This lock has lots of spinners, but no queue.
> + * Go create a queue to spin on.
> + */
> + if (tkt_q_start_contend(ap, inc))
> + return true;
> +
> + /* Get here if the queue is in transition: Retry next time. */
> + }
> +
> + /* Either no need for a queue or the queue is in transition. Spin. */
> + cpu_relax();
> + return false;
> +}

2013-06-11 00:51:17

by Linus Torvalds

Subject: Re: [PATCH RFC ticketlock] Auto-queued ticketlock

On Mon, Jun 10, 2013 at 5:44 PM, Steven Rostedt <[email protected]> wrote:
>
> OK, I haven't found an issue here yet, but youss are beiing trickssy! We
> don't like trickssy, and we must find precccciouss!!!

.. and I personally have my usual reservations. I absolutely hate
papering over scalability issues, and historically whenever people
have ever thought that we want complex spinlocks, the problem has
always been that the locking sucks.

So reinforced by previous events, I really feel that code that needs
this kind of spinlock is broken and needs to be fixed, rather than
actually introduce tricky spinlocks.

So in order to merge something like this, I want (a) numbers for real
loads and (b) explanations for why the spinlock users cannot be fixed.

Because "we might hit loads" is just not good enough. I would counter
with "hiding problems causes more of them".

Linus

2013-06-11 01:04:14

by Steven Rostedt

Subject: Re: [PATCH RFC ticketlock] Auto-queued ticketlock

On Sun, 2013-06-09 at 12:36 -0700, Paul E. McKenney wrote:
> Breaking up locks is better than implementing high-contention locks, but
> if we must have high-contention locks, why not make them automatically
> switch between light-weight ticket locks at low contention and queued
> locks at high contention?
>
> This commit therefore allows ticket locks to automatically switch between
> pure ticketlock and queued-lock operation as needed. If too many CPUs
> are spinning on a given ticket lock, a queue structure will be allocated
> and the lock will switch to queued-lock operation. When the lock becomes
> free, it will switch back into ticketlock operation. The low-order bit
> of the head counter is used to indicate that the lock is in queued mode,
> which forces an unconditional mismatch between the head and tail counters.
> This approach means that the common-case code path under conditions of
> low contention is very nearly that of a plain ticket lock.
>
> A fixed number of queueing structures is statically allocated in an
> array. The ticket-lock address is used to hash into an initial element,
> but if that element is already in use, it moves to the next element. If
> the entire array is already in use, continue to spin in ticket mode.
>
> This has been only lightly tested in the kernel, though a userspace
> implementation has survived substantial testing.

I guess the point of this patch is to lower the cache ping pong effect
of spin locks. I believe Rik was doing something similar to this as
well.

Now, when we switch from ticket to queue, we basically blow away the old
FIFO and all the tasks do a thundering herd to get on the queue. Even if
the task was next to get the lock, it could end up being stuck at the
back of the queue again and have to wait. When I put my real-time hat
on, this bothers me. Even though the latency is still bounded, since a
task only gets put to the end of the queue once, it just doubled the
worst-case wait to grab a lock.

-- Steve



>
> Signed-off-by: Paul E. McKenney <[email protected]>
>
> diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h
> index 33692ea..b4a91b0 100644
> --- a/arch/x86/include/asm/spinlock.h
> +++ b/arch/x86/include/asm/spinlock.h
> @@ -34,6 +34,8 @@
> # define UNLOCK_LOCK_PREFIX
> #endif
>
> +#ifndef CONFIG_TICKET_LOCK_QUEUED
> +
> /*
> * Ticket locks are conceptually two parts, one indicating the current head of
> * the queue, and the other indicating the current tail. The lock is acquired
> @@ -62,6 +64,25 @@ static __always_inline void __ticket_spin_lock(arch_spinlock_t *lock)
> barrier(); /* make sure nothing creeps before the lock is taken */
> }
>
> +#else /* #ifndef CONFIG_TICKET_LOCK_QUEUED */
> +
> +bool tkt_spin_pass(arch_spinlock_t *ap, struct __raw_tickets inc);
> +
> +static __always_inline void __ticket_spin_lock(arch_spinlock_t *lock)
> +{
> + register struct __raw_tickets inc = { .tail = 2 };
> +
> + inc = xadd(&lock->tickets, inc);
> + for (;;) {
> + if (inc.head == inc.tail || tkt_spin_pass(lock, inc))
> + break;
> + inc.head = ACCESS_ONCE(lock->tickets.head);
> + }
> + barrier(); /* smp_mb() on Power or ARM. */
> +}
> +
> +#endif /* #else #ifndef CONFIG_TICKET_LOCK_QUEUED */
> +
> static __always_inline int __ticket_spin_trylock(arch_spinlock_t *lock)
> {
> arch_spinlock_t old, new;
> @@ -70,17 +91,37 @@ static __always_inline int __ticket_spin_trylock(arch_spinlock_t *lock)
> if (old.tickets.head != old.tickets.tail)
> return 0;
>
> +#ifndef CONFIG_TICKET_LOCK_QUEUED
> new.head_tail = old.head_tail + (1 << TICKET_SHIFT);
> +#else /* #ifndef CONFIG_TICKET_LOCK_QUEUED */
> + new.head_tail = old.head_tail + (2 << TICKET_SHIFT);
> +#endif /* #else #ifndef CONFIG_TICKET_LOCK_QUEUED */
>
> /* cmpxchg is a full barrier, so nothing can move before it */
> return cmpxchg(&lock->head_tail, old.head_tail, new.head_tail) == old.head_tail;
> }
>
> +#ifndef CONFIG_TICKET_LOCK_QUEUED
> +
> static __always_inline void __ticket_spin_unlock(arch_spinlock_t *lock)
> {
> __add(&lock->tickets.head, 1, UNLOCK_LOCK_PREFIX);
> }
>
> +#else /* #ifndef CONFIG_TICKET_LOCK_QUEUED */
> +
> +extern void tkt_q_do_wake(arch_spinlock_t *asp);
> +
> +static __always_inline void __ticket_spin_unlock(arch_spinlock_t *lock)
> +{
> + __ticket_t head = 2;
> +
> + head = xadd(&lock->tickets.head, 2);
> + if (head & 0x1)
> + tkt_q_do_wake(lock);
> +}
> +#endif /* #else #ifndef CONFIG_TICKET_LOCK_QUEUED */
> +
> static inline int __ticket_spin_is_locked(arch_spinlock_t *lock)
> {
> struct __raw_tickets tmp = ACCESS_ONCE(lock->tickets);
> diff --git a/arch/x86/include/asm/spinlock_types.h b/arch/x86/include/asm/spinlock_types.h
> index ad0ad07..cdaefdd 100644
> --- a/arch/x86/include/asm/spinlock_types.h
> +++ b/arch/x86/include/asm/spinlock_types.h
> @@ -7,12 +7,18 @@
>
> #include <linux/types.h>
>
> -#if (CONFIG_NR_CPUS < 256)
> +#if (CONFIG_NR_CPUS < 128)
> typedef u8 __ticket_t;
> typedef u16 __ticketpair_t;
> -#else
> +#define TICKET_T_CMP_GE(a, b) (UCHAR_MAX / 2 >= (unsigned char)((a) - (b)))
> +#elif (CONFIG_NR_CPUS < 32768)
> typedef u16 __ticket_t;
> typedef u32 __ticketpair_t;
> +#define TICKET_T_CMP_GE(a, b) (USHRT_MAX / 2 >= (unsigned short)((a) - (b)))
> +#else
> +typedef u32 __ticket_t;
> +typedef u64 __ticketpair_t;
> +#define TICKET_T_CMP_GE(a, b) (UINT_MAX / 2 >= (unsigned int)((a) - (b)))
> #endif
>
> #define TICKET_SHIFT (sizeof(__ticket_t) * 8)
> @@ -21,7 +27,11 @@ typedef struct arch_spinlock {
> union {
> __ticketpair_t head_tail;
> struct __raw_tickets {
> +#ifdef __BIG_ENDIAN__
> + __ticket_t tail, head;
> +#else /* #ifdef __BIG_ENDIAN__ */
> __ticket_t head, tail;
> +#endif /* #else #ifdef __BIG_ENDIAN__ */
> } tickets;
> };
> } arch_spinlock_t;
> diff --git a/include/linux/kernel.h b/include/linux/kernel.h
> index e9ef6d6..816a87c 100644
> --- a/include/linux/kernel.h
> +++ b/include/linux/kernel.h
> @@ -15,6 +15,7 @@
> #include <asm/byteorder.h>
> #include <uapi/linux/kernel.h>
>
> +#define UCHAR_MAX ((u8)(~0U))
> #define USHRT_MAX ((u16)(~0U))
> #define SHRT_MAX ((s16)(USHRT_MAX>>1))
> #define SHRT_MIN ((s16)(-SHRT_MAX - 1))
> diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
> index 44511d1..ad9c67c 100644
> --- a/kernel/Kconfig.locks
> +++ b/kernel/Kconfig.locks
> @@ -223,3 +223,21 @@ endif
> config MUTEX_SPIN_ON_OWNER
> def_bool y
> depends on SMP && !DEBUG_MUTEXES
> +
> +config TICKET_LOCK_QUEUED
> + bool "Dynamically switch between ticket and queued locking"
> + default n
> + ---help---
> + Enable dynamic switching between ticketlock and queued locking
> + on a per-lock basis. This option will slow down low-contention
> + acquisition and release very slightly (additional conditional
> + in release path), but will provide more efficient operation at
> + high levels of lock contention. High-contention operation will
> + not be quite as efficient as would be a pure queued lock, but
> + this dynamic approach consumes less memory than queued locks
> + and also runs faster at low levels of contention.
> +
> + Say "Y" if you are running on a large system with a workload
> + that is likely to result in high levels of contention.
> +
> + Say "N" if you are unsure.
> diff --git a/kernel/Makefile b/kernel/Makefile
> index 271fd31..70a91f7 100644
> --- a/kernel/Makefile
> +++ b/kernel/Makefile
> @@ -51,6 +51,7 @@ endif
> obj-$(CONFIG_SMP) += spinlock.o
> obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
> obj-$(CONFIG_PROVE_LOCKING) += spinlock.o
> +obj-$(CONFIG_TICKET_LOCK_QUEUED) += tktqlock.o
> obj-$(CONFIG_UID16) += uid16.o
> obj-$(CONFIG_MODULES) += module.o
> obj-$(CONFIG_MODULE_SIG) += module_signing.o modsign_pubkey.o modsign_certificate.o
> diff --git a/kernel/tktqlock.c b/kernel/tktqlock.c
> new file mode 100644
> index 0000000..f01b760
> --- /dev/null
> +++ b/kernel/tktqlock.c
> @@ -0,0 +1,333 @@
> +/*
> + * Queued ticket spinlocks.
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program; if not, write to the Free Software
> + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
> + *
> + * Copyright IBM Corporation, 2013
> + *
> + * Authors: Paul E. McKenney <[email protected]>
> + */
> +#include <linux/types.h>
> +#include <linux/kernel.h>
> +#include <linux/spinlock.h>
> +#include <linux/smp.h>
> +#include <linux/percpu.h>
> +
> +struct tkt_q {
> + int cpu;
> + __ticket_t tail;
> + struct tkt_q *next;
> +};
> +
> +struct tkt_q_head {
> + arch_spinlock_t *ref; /* Pointer to spinlock if in use. */
> + s32 head_tkt; /* Head ticket when started queuing. */
> + struct tkt_q *spin; /* Head of queue. */
> + struct tkt_q **spin_tail; /* Tail of queue. */
> +};
> +
> +/*
> + * TKT_Q_SWITCH is twice the number of CPUs that must be spinning on a
> + * given ticket lock to motivate switching to spinning on a queue.
> + * The reason that it is twice the number is because the bottom bit of
> + * the ticket is reserved for the bit that indicates that a queue is
> + * associated with the lock.
> + */
> +#define TKT_Q_SWITCH (16 * 2)
> +
> +/*
> + * TKT_Q_NQUEUES is the number of queues to maintain. Large systems
> + * might have multiple highly contended locks, so provide more queues for
> + * systems with larger numbers of CPUs.
> + */
> +#define TKT_Q_NQUEUES (DIV_ROUND_UP(NR_CPUS + TKT_Q_SWITCH - 1, TKT_Q_SWITCH) * 2)
> +
> +/* The queues themselves. */
> +struct tkt_q_head tkt_q_heads[TKT_Q_NQUEUES];
> +
> +/* Advance to the next queue slot, wrapping around to the beginning. */
> +static int tkt_q_next_slot(int i)
> +{
> + return (++i < TKT_Q_NQUEUES) ? i : 0;
> +}
> +
> +/* Very crude hash from lock address to queue slot number. */
> +static unsigned long tkt_q_hash(arch_spinlock_t *asp)
> +{
> + return (((unsigned long)asp) >> 8) % TKT_Q_NQUEUES;
> +}
> +
> +/*
> + * Return a pointer to the queue header associated with the specified lock,
> + * or return NULL if there is no queue for the lock or if the lock's queue
> + * is in transition.
> + */
> +static struct tkt_q_head *tkt_q_find_head(arch_spinlock_t *asp)
> +{
> + int i;
> + int start;
> +
> + start = i = tkt_q_hash(asp);
> + do
> + if (tkt_q_heads[i].ref == asp)
> + return &tkt_q_heads[i];
> + while ((i = tkt_q_next_slot(i)) != start);
> + return NULL;
> +}
> +
> +/*
> + * Try to stop queuing, reverting back to normal ticket-lock operation.
> + * We can only stop queuing when the queue is empty, which means that
> + * we need to correctly handle races where someone shows up in the queue
> + * just as we are trying to dispense with the queue. They win, we lose.
> + */
> +static bool tkt_q_try_unqueue(arch_spinlock_t *asp, struct tkt_q_head *tqhp)
> +{
> + arch_spinlock_t asold;
> + arch_spinlock_t asnew;
> +
> + /* Pick up the ticket values. */
> + asold = ACCESS_ONCE(*asp);
> + if ((asold.tickets.head & ~0x1) == asold.tickets.tail) {
> +
> + /* Attempt to mark the lock as not having a queue. */
> + asnew = asold;
> + asnew.tickets.head &= ~0x1;
> + if (cmpxchg(&asp->head_tail,
> + asold.head_tail,
> + asnew.head_tail) == asold.head_tail) {
> +
> + /* Succeeded, mark the queue as unused. */
> + ACCESS_ONCE(tqhp->ref) = NULL;
> + return true;
> + }
> + }
> +
> + /* Failed, tell the caller there is still a queue to pass off to. */
> + return false;
> +}
> +
> +/*
> + * Hand the lock off to the first CPU on the queue.
> + */
> +void tkt_q_do_wake(arch_spinlock_t *asp)
> +{
> + struct tkt_q_head *tqhp;
> + struct tkt_q *tqp;
> +
> + /* If the queue is still being set up, wait for it. */
> + while ((tqhp = tkt_q_find_head(asp)) == NULL)
> + cpu_relax();
> +
> + for (;;) {
> +
> + /* Find the first queue element. */
> + tqp = ACCESS_ONCE(tqhp->spin);
> + if (tqp != NULL)
> + break; /* Element exists, hand off lock. */
> + if (tkt_q_try_unqueue(asp, tqhp))
> + return; /* No element, successfully removed queue. */
> + cpu_relax();
> + }
> + if (ACCESS_ONCE(tqhp->head_tkt) != -1)
> + ACCESS_ONCE(tqhp->head_tkt) = -1;
> + smp_mb(); /* Order pointer fetch and assignment against handoff. */
> + ACCESS_ONCE(tqp->cpu) = -1;
> +}
> +
> +/*
> + * Given a lock that already has a queue associated with it, spin on
> + * that queue. Return false if there was no queue (which means we do not
> + * hold the lock) and true otherwise (meaning we -do- hold the lock).
> + */
> +bool tkt_q_do_spin(arch_spinlock_t *asp, struct __raw_tickets inc)
> +{
> + struct tkt_q **oldtail;
> + struct tkt_q tq;
> + struct tkt_q_head *tqhp;
> +
> + /*
> + * Ensure that accesses to queue header happen after sensing
> + * the lock's have-queue bit.
> + */
> + smp_mb(); /* See above block comment. */
> +
> + /* If there no longer is a queue, leave. */
> + tqhp = tkt_q_find_head(asp);
> + if (tqhp == NULL)
> + return false;
> +
> + /* Initialize our queue element. */
> + tq.cpu = raw_smp_processor_id();
> + tq.tail = inc.tail;
> + tq.next = NULL;
> +
> + /* Check to see if we already hold the lock. */
> + if (ACCESS_ONCE(tqhp->head_tkt) == inc.tail) {
> + /* The last holder left before queue formed, we hold lock. */
> + tqhp->head_tkt = -1;
> + return true;
> + }
> +
> + /* Add our element to the tail of the queue. */
> + oldtail = xchg(&tqhp->spin_tail, &tq.next);
> + ACCESS_ONCE(*oldtail) = &tq;
> +
> + /* Spin until handoff. */
> + while (ACCESS_ONCE(tq.cpu) != -1)
> + cpu_relax();
> +
> + /*
> + * Remove our element from the queue. If the queue is now empty,
> + * update carefully so that the next acquisition will queue itself
> + * at the head of the list.
> + */
> + if (tq.next == NULL) {
> +
> + /* Mark the queue empty. */
> + tqhp->spin = NULL;
> +
> + /* Try to point the tail back at the head. */
> + if (cmpxchg(&tqhp->spin_tail,
> + &tq.next,
> + &tqhp->spin) == &tq.next)
> + return true; /* Succeeded, queue is now empty. */
> +
> + /* Failed, if needed, wait for the enqueue to complete. */
> + while (tq.next == NULL)
> + cpu_relax();
> +
> + /* The following code will repair the head. */
> + }
> + smp_mb(); /* Force ordering between handoff and critical section. */
> +
> + /* Advance list-head pointer. */
> + ACCESS_ONCE(tqhp->spin) = tq.next;
> + return true;
> +}
> +
> +/*
> + * Given a lock that does not have a queue, attempt to associate the
> + * i-th queue with it, returning true if successful (meaning we hold
> + * the lock) or false otherwise (meaning we do -not- hold the lock).
> + * Note that the caller has already filled in ->ref with 0x1, so we
> + * own the queue.
> + */
> +static bool
> +tkt_q_init_contend(int i, arch_spinlock_t *asp, struct __raw_tickets inc)
> +{
> + arch_spinlock_t asold;
> + arch_spinlock_t asnew;
> + struct tkt_q_head *tqhp;
> +
> + /* Initialize the i-th queue header. */
> + tqhp = &tkt_q_heads[i];
> + tqhp->spin = NULL;
> + tqhp->spin_tail = &tqhp->spin;
> +
> + /* Each pass through this loop attempts to mark the lock as queued. */
> + do {
> + asold.head_tail = ACCESS_ONCE(asp->head_tail);
> + asnew = asold;
> + if (asnew.tickets.head & 0x1) {
> +
> + /* Someone beat us to it, back out. */
> + smp_mb();
> + ACCESS_ONCE(tqhp->ref) = NULL;
> +
> + /* Spin on the queue element they set up. */
> + return tkt_q_do_spin(asp, inc);
> + }
> +
> + /* The low-order bit in the head counter says "queued". */
> + asnew.tickets.head |= 0x1;
> + } while (cmpxchg(&asp->head_tail,
> + asold.head_tail,
> + asnew.head_tail) != asold.head_tail);
> +
> + /* Point the queue at the lock and go spin on it. */
> + tqhp->head_tkt = asold.tickets.head;
> + smp_mb(); /* Ensure head_tkt is set prior to queuers seeing tqhp. */
> + ACCESS_ONCE(tqhp->ref) = asp;
> + return tkt_q_do_spin(asp, inc);
> +}
> +
> +/*
> + * Start handling a period of high contention by finding a queue to associate
> + * with this lock. Returns true if successful (in which case we hold the
> + * lock) and false otherwise (in which case we do -not- hold the lock).
> + */
> +bool tkt_q_start_contend(arch_spinlock_t *asp, struct __raw_tickets inc)
> +{
> + int i;
> + int start;
> +
> + /* Hash the lock address to find a starting point. */
> + start = i = tkt_q_hash(asp);
> +
> + /*
> + * Each pass through the following loop attempts to associate
> + * the lock with the corresponding queue.
> + */
> + do {
> + /*
> + * Use 0x1 to mark the queue in use, but also avoiding
> + * any spinners trying to use it before we get it all
> + * initialized.
> + */
> + if (cmpxchg(&tkt_q_heads[i].ref,
> + NULL,
> + (arch_spinlock_t *)0x1) == NULL) {
> +
> + /* Succeeded, now go initialize it. */
> + return tkt_q_init_contend(i, asp, inc);
> + }
> +
> + /* If someone beat us to it, go spin on their queue. */
> + if (ACCESS_ONCE(asp->tickets.head) & 0x1)
> + return tkt_q_do_spin(asp, inc);
> + } while ((i = tkt_q_next_slot(i)) != start);
> +
> + /* All the queues are in use, revert to spinning on the ticket lock. */
> + return false;
> +}
> +
> +bool tkt_spin_pass(arch_spinlock_t *ap, struct __raw_tickets inc)
> +{
> + if (unlikely(inc.head & 0x1)) {
> +
> + /* This lock has a queue, so go spin on the queue. */
> + if (tkt_q_do_spin(ap, inc))
> + return true;
> +
> + /* Get here if the queue is in transition: Retry next time. */
> +
> + } else if (TICKET_T_CMP_GE(ACCESS_ONCE(ap->tickets.tail) - TKT_Q_SWITCH,
> + ACCESS_ONCE(ap->tickets.head))) {
> +
> + /*
> + * This lock has lots of spinners, but no queue.
> + * Go create a queue to spin on.
> + */
> + if (tkt_q_start_contend(ap, inc))
> + return true;
> +
> + /* Get here if the queue is in transition: Retry next time. */
> + }
> +
> + /* Either no need for a queue or the queue is in transition. Spin. */
> + cpu_relax();
> + return false;
> +}
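
A side note on the encoding used above, for readers skimming the patch: lock and
unlock now move the tickets by 2, so bit 0 of the head counter is free to mean
"this lock has a queue", and TICKET_T_CMP_GE() is a wraparound-safe "is the gap
at least this big" test. The following is a minimal userspace sketch of that
arithmetic only -- it is not part of the patch, and the 8-bit ticket type and
the starting value 250 are picked purely to show the wraparound case:

#include <assert.h>
#include <stdio.h>

typedef unsigned char ticket_t;		/* mirrors the CONFIG_NR_CPUS < 128 case */

#define TKT_Q_SWITCH	(16 * 2)	/* queue once 16 CPUs (32 ticket units) wait */
#define TICKET_T_CMP_GE(a, b) (255 / 2 >= (unsigned char)((a) - (b)))

int main(void)
{
	ticket_t head = 250;		/* near the 8-bit wrap point on purpose */
	ticket_t tail = 250;
	int i;

	/* Twenty CPUs "arrive": each lock attempt adds 2 to the tail. */
	for (i = 0; i < 20; i++)
		tail = (ticket_t)(tail + 2);

	/*
	 * The gap test still says "heavily contended" even though the
	 * counter wrapped: (tail - TKT_Q_SWITCH) is modularly >= head.
	 */
	assert(TICKET_T_CMP_GE((ticket_t)(tail - TKT_Q_SWITCH), head));

	/*
	 * Switching to queued mode sets bit 0 of head; head can then never
	 * equal an always-even tail, so pure ticket handoff stalls.
	 */
	head |= 0x1;
	assert(head != tail);

	/* Unlock still adds 2, so the queued bit survives the release and
	 * the releaser knows to call the wakeup path. */
	head = (ticket_t)(head + 2);
	printf("still queued after unlock: %s\n", (head & 0x1) ? "yes" : "no");
	return 0;
}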

2013-06-11 08:00:20

by Lai Jiangshan

[permalink] [raw]
Subject: Re: [PATCH RFC ticketlock] Auto-queued ticketlock

On 06/11/2013 08:51 AM, Linus Torvalds wrote:
> On Mon, Jun 10, 2013 at 5:44 PM, Steven Rostedt <[email protected]> wrote:
>>
>> OK, I haven't found a issue here yet, but youss are beiing trickssy! We
>> don't like trickssy, and we must find precccciouss!!!
>
> .. and I personally have my usual reservations. I absolutely hate
> papering over scalability issues, and historically whenever people
> have ever thought that we want complex spinlocks, the problem has
> always been that the locking sucks.
>
> So reinforced by previous events, I really feel that code that needs
> this kind of spinlock is broken and needs to be fixed, rather than
> actually introduce tricky spinlocks.
>
> So in order to merge something like this, I want (a) numbers for real
> loads and (b) explanations for why the spinlock users cannot be fixed.
>
> Because "we might hit loads" is just not good enough. I would counter
> with "hiding problems causes more of them".
>

Hi, all

Off-topic: although I have been in this community for several years,
I am not exactly clear on this problem.

1) In the general case, which lock is the most contended in the kernel, and what does it protect?
2) In which special cases, which lock is the most contended in the kernel, and what does it protect?
3) In the general case, which list is the hottest?
4) In which special cases, which list is the hottest?

thanks,
Lai

2013-06-11 09:52:49

by Paul E. McKenney

[permalink] [raw]
Subject: Re: [PATCH RFC ticketlock] Auto-queued ticketlock

On Mon, Jun 10, 2013 at 09:04:09PM -0400, Steven Rostedt wrote:
> On Sun, 2013-06-09 at 12:36 -0700, Paul E. McKenney wrote:
> > Breaking up locks is better than implementing high-contention locks, but
> > if we must have high-contention locks, why not make them automatically
> > switch between light-weight ticket locks at low contention and queued
> > locks at high contention?
> >
> > This commit therefore allows ticket locks to automatically switch between
> > pure ticketlock and queued-lock operation as needed. If too many CPUs
> > are spinning on a given ticket lock, a queue structure will be allocated
> > and the lock will switch to queued-lock operation. When the lock becomes
> > free, it will switch back into ticketlock operation. The low-order bit
> > of the head counter is used to indicate that the lock is in queued mode,
> > which forces an unconditional mismatch between the head and tail counters.
> > This approach means that the common-case code path under conditions of
> > low contention is very nearly that of a plain ticket lock.
> >
> > A fixed number of queueing structures is statically allocated in an
> > array. The ticket-lock address is used to hash into an initial element,
> > but if that element is already in use, it moves to the next element. If
> > the entire array is already in use, continue to spin in ticket mode.
> >
> > This has been only lightly tested in the kernel, though a userspace
> > implementation has survived substantial testing.
>
> I guess the point of this patch is to lower the cache ping pong effect
> of spin locks. I believe Rik was doing something similar to this as
> well.

Yep, as was Michel Lespinasse, IIRC.

> Now, when we switch from ticket to queue, we basically blow away the old
> FIFO and all the tasks do a thundering herd to get on the queue. Even if
> the task was next to get the lock, it could end up being stuck at the
> back of the queue again and have to wait. When I put my real-time hat
> on, this bothers me. Even though it's still a bounded latency, as it only
> gets put to the end of the queue once, it just doubled the worst-case
> wait to grab the lock.

Almost.

The size of the switch-to-queue thundering herd is limited by the
ticket gap that initiates the switch. So what you are saying is that
in an RT kernel, you might want to tune down the ticket gap. Which
reminds me -- I do need to make this tuning more explicit.
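
For concreteness, one shape that tuning could take -- purely a sketch against the
patch above, not something the patch contains; the tkt_q_switch variable, the
helper name, and the module_param() hookup are all illustrative assumptions -- is
to turn the compile-time TKT_Q_SWITCH constant into a runtime value that an -rt
configuration could lower:

#include <linux/moduleparam.h>

/*
 * Hypothetical runtime-tunable switch threshold.  Must stay even,
 * because bit 0 of the head counter is reserved for the "queued" flag.
 */
static int tkt_q_switch = 16 * 2;
module_param(tkt_q_switch, int, 0644);

/* Would replace the TKT_Q_SWITCH comparison in tkt_spin_pass(). */
static bool tkt_q_too_contended(arch_spinlock_t *ap)
{
	__ticket_t gap = (__ticket_t)ACCESS_ONCE(tkt_q_switch);

	return TICKET_T_CMP_GE(ACCESS_ONCE(ap->tickets.tail) - gap,
			       ACCESS_ONCE(ap->tickets.head));
}

Since kernel/tktqlock.c would be built in, the knob should show up as a
tktqlock.tkt_q_switch= boot parameter, which an RT setup could drop well below
16*2 to shrink the switch-to-queue herd.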

Thanx, Paul

> -- Steve
>

2013-06-11 09:57:08

by Paul E. McKenney

[permalink] [raw]
Subject: Re: [PATCH RFC ticketlock] Auto-queued ticketlock

On Mon, Jun 10, 2013 at 08:44:40PM -0400, Steven Rostedt wrote:
> On Sun, 2013-06-09 at 12:36 -0700, Paul E. McKenney wrote:
>
> > +#else /* #ifndef CONFIG_TICKET_LOCK_QUEUED */
> > +
> > +bool tkt_spin_pass(arch_spinlock_t *ap, struct __raw_tickets inc);
> > +
> > +static __always_inline void __ticket_spin_lock(arch_spinlock_t *lock)
> > +{
> > + register struct __raw_tickets inc = { .tail = 2 };
> > +
> > + inc = xadd(&lock->tickets, inc);
> > + for (;;) {
> > + if (inc.head == inc.tail || tkt_spin_pass(lock, inc))
> > + break;
> > + inc.head = ACCESS_ONCE(lock->tickets.head);
> > + }
> > + barrier(); /* smp_mb() on Power or ARM. */
> > +}
> > +
> > +#endif /* #else #ifndef CONFIG_TICKET_LOCK_QUEUED */
> > +
> > static __always_inline int __ticket_spin_trylock(arch_spinlock_t *lock)
> > {
> > arch_spinlock_t old, new;
> > @@ -70,17 +91,37 @@ static __always_inline int __ticket_spin_trylock(arch_spinlock_t *lock)
> > if (old.tickets.head != old.tickets.tail)
> > return 0;
> >
> > +#ifndef CONFIG_TICKET_LOCK_QUEUED
> > new.head_tail = old.head_tail + (1 << TICKET_SHIFT);
> > +#else /* #ifndef CONFIG_TICKET_LOCK_QUEUED */
> > + new.head_tail = old.head_tail + (2 << TICKET_SHIFT);
> > +#endif /* #else #ifndef CONFIG_TICKET_LOCK_QUEUED */
> >
> > /* cmpxchg is a full barrier, so nothing can move before it */
> > return cmpxchg(&lock->head_tail, old.head_tail, new.head_tail) == old.head_tail;
> > }
> >
> > +#ifndef CONFIG_TICKET_LOCK_QUEUED
> > +
> > static __always_inline void __ticket_spin_unlock(arch_spinlock_t *lock)
> > {
> > __add(&lock->tickets.head, 1, UNLOCK_LOCK_PREFIX);
> > }
> >
> > +#else /* #ifndef CONFIG_TICKET_LOCK_QUEUED */
> > +
> > +extern void tkt_q_do_wake(arch_spinlock_t *asp);
> > +
> > +static __always_inline void __ticket_spin_unlock(arch_spinlock_t *lock)
> > +{
> > + __ticket_t head = 2;
> > +
> > + head = xadd(&lock->tickets.head, 2);
> > + if (head & 0x1)
> > + tkt_q_do_wake(lock);
> > +}
> > +#endif /* #else #ifndef CONFIG_TICKET_LOCK_QUEUED */
> > +
> > static inline int __ticket_spin_is_locked(arch_spinlock_t *lock)
> > {
> > struct __raw_tickets tmp = ACCESS_ONCE(lock->tickets);
> > diff --git a/arch/x86/include/asm/spinlock_types.h b/arch/x86/include/asm/spinlock_types.h
> > index ad0ad07..cdaefdd 100644
> > --- a/arch/x86/include/asm/spinlock_types.h
> > +++ b/arch/x86/include/asm/spinlock_types.h
> > @@ -7,12 +7,18 @@
> >
> > #include <linux/types.h>
> >
> > -#if (CONFIG_NR_CPUS < 256)
> > +#if (CONFIG_NR_CPUS < 128)
> > typedef u8 __ticket_t;
> > typedef u16 __ticketpair_t;
> > -#else
> > +#define TICKET_T_CMP_GE(a, b) (UCHAR_MAX / 2 >= (unsigned char)((a) - (b)))
> > +#elif (CONFIG_NR_CPUS < 32768)
> > typedef u16 __ticket_t;
> > typedef u32 __ticketpair_t;
> > +#define TICKET_T_CMP_GE(a, b) (USHRT_MAX / 2 >= (unsigned short)((a) - (b)))
> > +#else
> > +typedef u32 __ticket_t;
> > +typedef u64 __ticketpair_t;
> > +#define TICKET_T_CMP_GE(a, b) (UINT_MAX / 2 >= (unsigned int)((a) - (b)))
> > #endif
> >
> > #define TICKET_SHIFT (sizeof(__ticket_t) * 8)
> > @@ -21,7 +27,11 @@ typedef struct arch_spinlock {
> > union {
> > __ticketpair_t head_tail;
> > struct __raw_tickets {
> > +#ifdef __BIG_ENDIAN__
> > + __ticket_t tail, head;
> > +#else /* #ifdef __BIG_ENDIAN__ */
> > __ticket_t head, tail;
> > +#endif /* #else #ifdef __BIG_ENDIAN__ */
> > } tickets;
> > };
> > } arch_spinlock_t;
> > diff --git a/include/linux/kernel.h b/include/linux/kernel.h
> > index e9ef6d6..816a87c 100644
> > --- a/include/linux/kernel.h
> > +++ b/include/linux/kernel.h
> > @@ -15,6 +15,7 @@
> > #include <asm/byteorder.h>
> > #include <uapi/linux/kernel.h>
> >
> > +#define UCHAR_MAX ((u8)(~0U))
> > #define USHRT_MAX ((u16)(~0U))
> > #define SHRT_MAX ((s16)(USHRT_MAX>>1))
> > #define SHRT_MIN ((s16)(-SHRT_MAX - 1))
> > diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
> > index 44511d1..ad9c67c 100644
> > --- a/kernel/Kconfig.locks
> > +++ b/kernel/Kconfig.locks
> > @@ -223,3 +223,21 @@ endif
> > config MUTEX_SPIN_ON_OWNER
> > def_bool y
> > depends on SMP && !DEBUG_MUTEXES
> > +
> > +config TICKET_LOCK_QUEUED
> > + bool "Dynamically switch between ticket and queued locking"
> > + default n
> > + ---help---
> > + Enable dynamic switching between ticketlock and queued locking
> > + on a per-lock basis. This option will slow down low-contention
> > + acquisition and release very slightly (additional conditional
> > + in release path), but will provide more efficient operation at
> > + high levels of lock contention. High-contention operation will
> > + not be quite as efficient as would be a pure queued lock, but
> > + this dynamic approach consumes less memory than queued locks
> > + and also runs faster at low levels of contention.
> > +
> > + Say "Y" if you are running on a large system with a workload
> > + that is likely to result in high levels of contention.
> > +
> > + Say "N" if you are unsure.
> > diff --git a/kernel/Makefile b/kernel/Makefile
> > index 271fd31..70a91f7 100644
> > --- a/kernel/Makefile
> > +++ b/kernel/Makefile
> > @@ -51,6 +51,7 @@ endif
> > obj-$(CONFIG_SMP) += spinlock.o
> > obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
> > obj-$(CONFIG_PROVE_LOCKING) += spinlock.o
> > +obj-$(CONFIG_TICKET_LOCK_QUEUED) += tktqlock.o
> > obj-$(CONFIG_UID16) += uid16.o
> > obj-$(CONFIG_MODULES) += module.o
> > obj-$(CONFIG_MODULE_SIG) += module_signing.o modsign_pubkey.o modsign_certificate.o
> > diff --git a/kernel/tktqlock.c b/kernel/tktqlock.c
> > new file mode 100644
> > index 0000000..f01b760
> > --- /dev/null
> > +++ b/kernel/tktqlock.c
> > @@ -0,0 +1,333 @@
> > +/*
> > + * Queued ticket spinlocks.
> > + *
> > + * This program is free software; you can redistribute it and/or modify
> > + * it under the terms of the GNU General Public License as published by
> > + * the Free Software Foundation; either version 2 of the License, or
> > + * (at your option) any later version.
> > + *
> > + * This program is distributed in the hope that it will be useful,
> > + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
> > + * GNU General Public License for more details.
> > + *
> > + * You should have received a copy of the GNU General Public License
> > + * along with this program; if not, write to the Free Software
> > + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
> > + *
> > + * Copyright IBM Corporation, 2013
> > + *
> > + * Authors: Paul E. McKenney <[email protected]>
> > + */
> > +#include <linux/types.h>
> > +#include <linux/kernel.h>
> > +#include <linux/spinlock.h>
> > +#include <linux/smp.h>
> > +#include <linux/percpu.h>
> > +
> > +struct tkt_q {
> > + int cpu;
> > + __ticket_t tail;
> > + struct tkt_q *next;
> > +};
> > +
> > +struct tkt_q_head {
> > + arch_spinlock_t *ref; /* Pointer to spinlock if in use. */
> > + s32 head_tkt; /* Head ticket when started queuing. */
> > + struct tkt_q *spin; /* Head of queue. */
> > + struct tkt_q **spin_tail; /* Tail of queue. */
> > +};
> > +
> > +/*
> > + * TKT_Q_SWITCH is twice the number of CPUs that must be spinning on a
> > + * given ticket lock to motivate switching to spinning on a queue.
> > + * The reason that it is twice the number is because the bottom bit of
> > + * the ticket is reserved for the bit that indicates that a queue is
> > + * associated with the lock.
> > + */
> > +#define TKT_Q_SWITCH (16 * 2)
> > +
> > +/*
> > + * TKT_Q_NQUEUES is the number of queues to maintain. Large systems
> > + * might have multiple highly contended locks, so provide more queues for
> > + * systems with larger numbers of CPUs.
> > + */
> > +#define TKT_Q_NQUEUES (DIV_ROUND_UP(NR_CPUS + TKT_Q_SWITCH - 1, TKT_Q_SWITCH) * 2)
> > +
> > +/* The queues themselves. */
> > +struct tkt_q_head tkt_q_heads[TKT_Q_NQUEUES];
> > +
> > +/* Advance to the next queue slot, wrapping around to the beginning. */
> > +static int tkt_q_next_slot(int i)
> > +{
> > + return (++i < TKT_Q_NQUEUES) ? i : 0;
> > +}
> > +
> > +/* Very crude hash from lock address to queue slot number. */
> > +static unsigned long tkt_q_hash(arch_spinlock_t *asp)
> > +{
> > + return (((unsigned long)asp) >> 8) % TKT_Q_NQUEUES;
> > +}
> > +
> > +/*
> > + * Return a pointer to the queue header associated with the specified lock,
> > + * or return NULL if there is no queue for the lock or if the lock's queue
> > + * is in transition.
> > + */
> > +static struct tkt_q_head *tkt_q_find_head(arch_spinlock_t *asp)
> > +{
> > + int i;
> > + int start;
> > +
> > + start = i = tkt_q_hash(asp);
> > + do
> > + if (tkt_q_heads[i].ref == asp)
> > + return &tkt_q_heads[i];
> > + while ((i = tkt_q_next_slot(i)) != start);
> > + return NULL;
> > +}
> > +
> > +/*
> > + * Try to stop queuing, reverting back to normal ticket-lock operation.
> > + * We can only stop queuing when the queue is empty, which means that
> > + * we need to correctly handle races where someone shows up in the queue
> > + * just as we are trying to dispense with the queue. They win, we lose.
> > + */
> > +static bool tkt_q_try_unqueue(arch_spinlock_t *asp, struct tkt_q_head *tqhp)
> > +{
> > + arch_spinlock_t asold;
> > + arch_spinlock_t asnew;
> > +
> > + /* Pick up the ticket values. */
> > + asold = ACCESS_ONCE(*asp);
> > + if ((asold.tickets.head & ~0x1) == asold.tickets.tail) {
> > +
> > + /* Attempt to mark the lock as not having a queue. */
> > + asnew = asold;
> > + asnew.tickets.head &= ~0x1;
> > + if (cmpxchg(&asp->head_tail,
> > + asold.head_tail,
> > + asnew.head_tail) == asold.head_tail) {
> > +
> > + /* Succeeded, mark the queue as unused. */
> > + ACCESS_ONCE(tqhp->ref) = NULL;
> > + return true;
> > + }
> > + }
> > +
> > + /* Failed, tell the caller there is still a queue to pass off to. */
> > + return false;
> > +}
> > +
> > +/*
> > + * Hand the lock off to the first CPU on the queue.
> > + */
> > +void tkt_q_do_wake(arch_spinlock_t *asp)
> > +{
> > + struct tkt_q_head *tqhp;
> > + struct tkt_q *tqp;
> > +
> > + /* If the queue is still being set up, wait for it. */
> > + while ((tqhp = tkt_q_find_head(asp)) == NULL)
> > + cpu_relax();
> > +
> > + for (;;) {
> > +
> > + /* Find the first queue element. */
> > + tqp = ACCESS_ONCE(tqhp->spin);
> > + if (tqp != NULL)
> > + break; /* Element exists, hand off lock. */
> > + if (tkt_q_try_unqueue(asp, tqhp))
> > + return; /* No element, successfully removed queue. */
> > + cpu_relax();
> > + }
> > + if (ACCESS_ONCE(tqhp->head_tkt) != -1)
> > + ACCESS_ONCE(tqhp->head_tkt) = -1;
> > + smp_mb(); /* Order pointer fetch and assignment against handoff. */
> > + ACCESS_ONCE(tqp->cpu) = -1;
> > +}
> > +
> > +/*
> > + * Given a lock that already has a queue associated with it, spin on
> > + * that queue. Return false if there was no queue (which means we do not
> > + * hold the lock) and true otherwise (meaning we -do- hold the lock).
> > + */
> > +bool tkt_q_do_spin(arch_spinlock_t *asp, struct __raw_tickets inc)
> > +{
> > + struct tkt_q **oldtail;
> > + struct tkt_q tq;
> > + struct tkt_q_head *tqhp;
> > +
> > + /*
> > + * Ensure that accesses to queue header happen after sensing
> > + * the lock's have-queue bit.
> > + */
> > + smp_mb(); /* See above block comment. */
> > +
> > + /* If there no longer is a queue, leave. */
> > + tqhp = tkt_q_find_head(asp);
> > + if (tqhp == NULL)
> > + return false;
> > +
> > + /* Initialize our queue element. */
> > + tq.cpu = raw_smp_processor_id();
> > + tq.tail = inc.tail;
> > + tq.next = NULL;
> > +
> > + /* Check to see if we already hold the lock. */
> > + if (ACCESS_ONCE(tqhp->head_tkt) == inc.tail) {
> > + /* The last holder left before queue formed, we hold lock. */
> > + tqhp->head_tkt = -1;
> > + return true;
> > + }
> > +
> > + /* Add our element to the tail of the queue. */
> > + oldtail = xchg(&tqhp->spin_tail, &tq.next);
>
> Boy this is tricky code! I thought I found a race window here, but as I
> went to write my email saying "Gotcha!" I found that it wasn't a race
> after all. But as I went through the effort of writing this, I figured I
> would send this out as documentation for others to see. Hmm, I wonder if
> we can use this email to add more comments. Anyway, here's what I
> thought was wrong ;-)

If you didn't know any better, you might even think that I had done
something like this before. ;-)

> OK, I originally thought there was a race window here. Let's say that an
> NMI hit right here, and it happens to be a big one, where lots of things
> can happen on other CPUs right now.
>
> The scenario is that there's just one item on the queue, which is
> waiting for the lock to be released, and is spinning below in the:
>
> while (ACCESS_ONCE(tq.cpu) != -1)
> cpu_relax();
>
> And then the lock is released, where in tkt_q_do_wake() the following is
> called:
>
> ACCESS_ONCE(tqp->cpu) = -1;
>
> Now the old queued task is released. But its tq->next hasn't been set
> yet, and is still NULL. It leaves by doing:
>
> ACCESS_ONCE(tqhp->spin) = tq.next;
> return true;
>
> All before this task gets to set *oldtail to &tq. But, I then looked
> below...
>
>
> > + ACCESS_ONCE(*oldtail) = &tq;
> > +
> > + /* Spin until handoff. */
> > + while (ACCESS_ONCE(tq.cpu) != -1)
> > + cpu_relax();
> > +
> > + /*
> > + * Remove our element from the queue. If the queue is now empty,
> > + * update carefully so that the next acquisition will queue itself
> > + * at the head of the list.
> > + */
> > + if (tq.next == NULL) {
>
> This checks for that scenario.

Yep!

> Because if the old task were to come out
> spinning, the problem would only be if it was the last one on the list,
> and its tq.next was NULL. But if that was the case, then we set spin to
> NULL and do the next trick, where I thought I gotcha again...
>
>
> > +
> > + /* Mark the queue empty. */
> > + tqhp->spin = NULL;
> > +
> > + /* Try to point the tail back at the head. */
> > + if (cmpxchg(&tqhp->spin_tail,
> > + &tq.next,
> > + &tqhp->spin) == &tq.next)
>
> Here, I was thinking, oh wait, what happens if this is called right
> before the xchg() above. Then we would set spin_tail but not update the
> old tq.next. But wait! look at what we assign spin_tail to. It's the
> address of spin, which would be what oldtail would point to above, and
> then above would set spin to the new tq!

Yep again!

> OK, I haven't found a issue here yet, but youss are beiing trickssy! We
> don't like trickssy, and we must find precccciouss!!!
>
>
> This code is starting to make me look like Gollum :-p

Hmmm... The time and effort to do this might almost have been worthwhile
just to accomplish that! ;-)

But yes, this would need better comments, design documentation, or
maybe both.
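
As a starting point, the enqueue/handoff fragment of tkt_q_do_spin() is
reproduced below with expanded comments; the executable code is unchanged from
the patch, and the comment wording is only this thread's reading of it, not text
from the posted patch:

	/*
	 * Publish our element.  After the xchg(), any later arrival appends
	 * behind us; *oldtail is either &tqhp->spin (the queue was empty) or
	 * the ->next field of the previous last element.  A handoff cannot
	 * reach us until the second store below makes us reachable.
	 */
	oldtail = xchg(&tqhp->spin_tail, &tq.next);
	ACCESS_ONCE(*oldtail) = &tq;

	/* tkt_q_do_wake() hands off the lock by storing -1 to tq.cpu. */
	while (ACCESS_ONCE(tq.cpu) != -1)
		cpu_relax();

	/*
	 * Dequeue ourselves.  If we look like the last element (tq.next is
	 * still NULL), try to swing ->spin_tail back to &tqhp->spin.  A
	 * concurrent enqueuer that has already won the xchg() above defeats
	 * this cmpxchg(); in that case its upcoming *oldtail store is what
	 * sets tq.next, so wait for it and then repair the head pointer the
	 * normal way.  This is why the window between an enqueuer's xchg()
	 * and its *oldtail store is harmless.
	 */
	if (tq.next == NULL) {

		/* Mark the queue empty. */
		tqhp->spin = NULL;

		/* Try to point the tail back at the head. */
		if (cmpxchg(&tqhp->spin_tail,
			    &tq.next,
			    &tqhp->spin) == &tq.next)
			return true; /* Succeeded, queue is now empty. */

		/* Failed: wait for the racing enqueue to finish. */
		while (tq.next == NULL)
			cpu_relax();
	}
	smp_mb(); /* Force ordering between handoff and critical section. */

	/* Advance the list-head pointer past our element. */
	ACCESS_ONCE(tqhp->spin) = tq.next;
	return true;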

Thanx, Paul

> -- Steve
>

2013-06-11 10:07:03

by Paul E. McKenney

[permalink] [raw]
Subject: Re: [PATCH RFC ticketlock] Auto-queued ticketlock

On Mon, Jun 10, 2013 at 05:51:14PM -0700, Linus Torvalds wrote:
> On Mon, Jun 10, 2013 at 5:44 PM, Steven Rostedt <[email protected]> wrote:
> >
> > OK, I haven't found a issue here yet, but youss are beiing trickssy! We
> > don't like trickssy, and we must find precccciouss!!!

Heh! You should see what it looks like if you make slightly different
design decisions. For example, just you try switching back from queued
to ticket mode while there are still CPUs spinning on the lock! ;-)

> .. and I personally have my usual reservations. I absolutely hate
> papering over scalability issues, and historically whenever people
> have ever thought that we want complex spinlocks, the problem has
> always been that the locking sucks.
>
> So reinforced by previous events, I really feel that code that needs
> this kind of spinlock is broken and needs to be fixed, rather than
> actually introduce tricky spinlocks.

If the only effect of this patch submission is to give people a bit more
motivation to solve the underlying lock-contention problems, I am happy.

> So in order to merge something like this, I want (a) numbers for real
> loads and (b) explanations for why the spinlock users cannot be fixed.
>
> Because "we might hit loads" is just not good enough. I would counter
> with "hiding problems causes more of them".

Agreed. As I said in the first paragraph of the commit log:

... if we must have high-contention locks, why not make them
automatically switch between light-weight ticket locks at low
contention and queued locks at high contention?

The reason that I created this patch was that I was seeing people
arguing for locks optimized for high contention, and the ones that I
saw required the developer to predict which locks would encounter high
levels of contention. Changes in workloads would of course invalidate
those predictions.

But again, if the only effect of this patch submission is to give people
a bit more motivation to solve the underlying lock-contention problems,
I am happy.

Thanx, Paul

2013-06-11 10:14:12

by Paul E. McKenney

[permalink] [raw]
Subject: Re: [PATCH RFC ticketlock] Auto-queued ticketlock

On Tue, Jun 11, 2013 at 03:53:17PM +0800, Lai Jiangshan wrote:
> On 06/11/2013 08:51 AM, Linus Torvalds wrote:
> > On Mon, Jun 10, 2013 at 5:44 PM, Steven Rostedt <[email protected]> wrote:
> >>
> >> OK, I haven't found a issue here yet, but youss are beiing trickssy! We
> >> don't like trickssy, and we must find precccciouss!!!
> >
> > .. and I personally have my usual reservations. I absolutely hate
> > papering over scalability issues, and historically whenever people
> > have ever thought that we want complex spinlocks, the problem has
> > always been that the locking sucks.
> >
> > So reinforced by previous events, I really feel that code that needs
> > this kind of spinlock is broken and needs to be fixed, rather than
> > actually introduce tricky spinlocks.
> >
> > So in order to merge something like this, I want (a) numbers for real
> > loads and (b) explanations for why the spinlock users cannot be fixed.
> >
> > Because "we might hit loads" is just not good enough. I would counter
> > with "hiding problems causes more of them".
> >
>
> Hi, all
>
> Off-topic: although I have been in this community for several years,
> I am not exactly clear on this problem.
>
> 1) In the general case, which lock is the most contended in the kernel, and what does it protect?
> 2) In which special cases, which lock is the most contended in the kernel, and what does it protect?
> 3) In the general case, which list is the hottest?
> 4) In which special cases, which list is the hottest?

Others would know better than I, but mmap_sem has been called out as a
prime offender for some workloads. There is of course some debate as
to whether the fault lies with mmap_sem or with the workloads. There have
been some efforts to solve this one on LKML, plus some in academia have
worked on this as well:

http://people.csail.mit.edu/nickolai/papers/clements-bonsai.pdf
http://pdos.csail.mit.edu/papers/radixvm:eurosys13.pdf

And IIRC this was the subject of a session at a recent minisummit.

There are a few locks within the RCU implementation that have popped
up from time to time on very large systems, but I have dealt with those
and have plans for each should it become a problem. The plans probably
won't survive first contact with a real workload, but having thought
things through is very helpful.

Thanx, Paul

2013-06-11 14:48:19

by Lai Jiangshan

[permalink] [raw]
Subject: Re: [PATCH RFC ticketlock] Auto-queued ticketlock

On Mon, Jun 10, 2013 at 3:36 AM, Paul E. McKenney
<[email protected]> wrote:
> Breaking up locks is better than implementing high-contention locks, but
> if we must have high-contention locks, why not make them automatically
> switch between light-weight ticket locks at low contention and queued
> locks at high contention?
>
> This commit therefore allows ticket locks to automatically switch between
> pure ticketlock and queued-lock operation as needed. If too many CPUs
> are spinning on a given ticket lock, a queue structure will be allocated
> and the lock will switch to queued-lock operation. When the lock becomes
> free, it will switch back into ticketlock operation. The low-order bit
> of the head counter is used to indicate that the lock is in queued mode,
> which forces an unconditional mismatch between the head and tail counters.
> This approach means that the common-case code path under conditions of
> low contention is very nearly that of a plain ticket lock.
>
> A fixed number of queueing structures is statically allocated in an
> array. The ticket-lock address is used to hash into an initial element,
> but if that element is already in use, it moves to the next element. If
> the entire array is already in use, continue to spin in ticket mode.
>
> This has been only lightly tested in the kernel, though a userspace
> implementation has survived substantial testing.
>
> Signed-off-by: Paul E. McKenney <[email protected]>
>
> diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h
> index 33692ea..b4a91b0 100644
> --- a/arch/x86/include/asm/spinlock.h
> +++ b/arch/x86/include/asm/spinlock.h
> @@ -34,6 +34,8 @@
> # define UNLOCK_LOCK_PREFIX
> #endif
>
> +#ifndef CONFIG_TICKET_LOCK_QUEUED
> +
> /*
> * Ticket locks are conceptually two parts, one indicating the current head of
> * the queue, and the other indicating the current tail. The lock is acquired
> @@ -62,6 +64,25 @@ static __always_inline void __ticket_spin_lock(arch_spinlock_t *lock)
> barrier(); /* make sure nothing creeps before the lock is taken */
> }
>
> +#else /* #ifndef CONFIG_TICKET_LOCK_QUEUED */
> +
> +bool tkt_spin_pass(arch_spinlock_t *ap, struct __raw_tickets inc);
> +
> +static __always_inline void __ticket_spin_lock(arch_spinlock_t *lock)
> +{
> + register struct __raw_tickets inc = { .tail = 2 };
> +
> + inc = xadd(&lock->tickets, inc);
> + for (;;) {
> + if (inc.head == inc.tail || tkt_spin_pass(lock, inc))
> + break;
> + inc.head = ACCESS_ONCE(lock->tickets.head);
> + }
> + barrier(); /* smp_mb() on Power or ARM. */
> +}
> +
> +#endif /* #else #ifndef CONFIG_TICKET_LOCK_QUEUED */
> +
> static __always_inline int __ticket_spin_trylock(arch_spinlock_t *lock)
> {
> arch_spinlock_t old, new;
> @@ -70,17 +91,37 @@ static __always_inline int __ticket_spin_trylock(arch_spinlock_t *lock)
> if (old.tickets.head != old.tickets.tail)
> return 0;
>
> +#ifndef CONFIG_TICKET_LOCK_QUEUED
> new.head_tail = old.head_tail + (1 << TICKET_SHIFT);
> +#else /* #ifndef CONFIG_TICKET_LOCK_QUEUED */
> + new.head_tail = old.head_tail + (2 << TICKET_SHIFT);
> +#endif /* #else #ifndef CONFIG_TICKET_LOCK_QUEUED */
>
> /* cmpxchg is a full barrier, so nothing can move before it */
> return cmpxchg(&lock->head_tail, old.head_tail, new.head_tail) == old.head_tail;
> }
>
> +#ifndef CONFIG_TICKET_LOCK_QUEUED
> +
> static __always_inline void __ticket_spin_unlock(arch_spinlock_t *lock)
> {
> __add(&lock->tickets.head, 1, UNLOCK_LOCK_PREFIX);
> }
>
> +#else /* #ifndef CONFIG_TICKET_LOCK_QUEUED */
> +
> +extern void tkt_q_do_wake(arch_spinlock_t *asp);
> +
> +static __always_inline void __ticket_spin_unlock(arch_spinlock_t *lock)
> +{
> + __ticket_t head = 2;
> +
> + head = xadd(&lock->tickets.head, 2);
> + if (head & 0x1)
> + tkt_q_do_wake(lock);
> +}
> +#endif /* #else #ifndef CONFIG_TICKET_LOCK_QUEUED */
> +
> static inline int __ticket_spin_is_locked(arch_spinlock_t *lock)
> {
> struct __raw_tickets tmp = ACCESS_ONCE(lock->tickets);
> diff --git a/arch/x86/include/asm/spinlock_types.h b/arch/x86/include/asm/spinlock_types.h
> index ad0ad07..cdaefdd 100644
> --- a/arch/x86/include/asm/spinlock_types.h
> +++ b/arch/x86/include/asm/spinlock_types.h
> @@ -7,12 +7,18 @@
>
> #include <linux/types.h>
>
> -#if (CONFIG_NR_CPUS < 256)
> +#if (CONFIG_NR_CPUS < 128)
> typedef u8 __ticket_t;
> typedef u16 __ticketpair_t;
> -#else
> +#define TICKET_T_CMP_GE(a, b) (UCHAR_MAX / 2 >= (unsigned char)((a) - (b)))
> +#elif (CONFIG_NR_CPUS < 32768)
> typedef u16 __ticket_t;
> typedef u32 __ticketpair_t;
> +#define TICKET_T_CMP_GE(a, b) (USHRT_MAX / 2 >= (unsigned short)((a) - (b)))
> +#else
> +typedef u32 __ticket_t;
> +typedef u64 __ticketpair_t;
> +#define TICKET_T_CMP_GE(a, b) (UINT_MAX / 2 >= (unsigned int)((a) - (b)))
> #endif
>
> #define TICKET_SHIFT (sizeof(__ticket_t) * 8)
> @@ -21,7 +27,11 @@ typedef struct arch_spinlock {
> union {
> __ticketpair_t head_tail;
> struct __raw_tickets {
> +#ifdef __BIG_ENDIAN__
> + __ticket_t tail, head;
> +#else /* #ifdef __BIG_ENDIAN__ */
> __ticket_t head, tail;
> +#endif /* #else #ifdef __BIG_ENDIAN__ */
> } tickets;
> };
> } arch_spinlock_t;
> diff --git a/include/linux/kernel.h b/include/linux/kernel.h
> index e9ef6d6..816a87c 100644
> --- a/include/linux/kernel.h
> +++ b/include/linux/kernel.h
> @@ -15,6 +15,7 @@
> #include <asm/byteorder.h>
> #include <uapi/linux/kernel.h>
>
> +#define UCHAR_MAX ((u8)(~0U))
> #define USHRT_MAX ((u16)(~0U))
> #define SHRT_MAX ((s16)(USHRT_MAX>>1))
> #define SHRT_MIN ((s16)(-SHRT_MAX - 1))
> diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
> index 44511d1..ad9c67c 100644
> --- a/kernel/Kconfig.locks
> +++ b/kernel/Kconfig.locks
> @@ -223,3 +223,21 @@ endif
> config MUTEX_SPIN_ON_OWNER
> def_bool y
> depends on SMP && !DEBUG_MUTEXES
> +
> +config TICKET_LOCK_QUEUED
> + bool "Dynamically switch between ticket and queued locking"
> + default n
> + ---help---
> + Enable dynamic switching between ticketlock and queued locking
> + on a per-lock basis. This option will slow down low-contention
> + acquisition and release very slightly (additional conditional
> + in release path), but will provide more efficient operation at
> + high levels of lock contention. High-contention operation will
> + not be quite as efficient as would be a pure queued lock, but
> + this dynamic approach consumes less memory than queud locks
> + and also runs faster at low levels of contention.
> +
> + Say "Y" if you are running on a large system with a workload
> + that is likely to result in high levels of contention.
> +
> + Say "N" if you are unsure.
> diff --git a/kernel/Makefile b/kernel/Makefile
> index 271fd31..70a91f7 100644
> --- a/kernel/Makefile
> +++ b/kernel/Makefile
> @@ -51,6 +51,7 @@ endif
> obj-$(CONFIG_SMP) += spinlock.o
> obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
> obj-$(CONFIG_PROVE_LOCKING) += spinlock.o
> +obj-$(CONFIG_TICKET_LOCK_QUEUED) += tktqlock.o
> obj-$(CONFIG_UID16) += uid16.o
> obj-$(CONFIG_MODULES) += module.o
> obj-$(CONFIG_MODULE_SIG) += module_signing.o modsign_pubkey.o modsign_certificate.o
> diff --git a/kernel/tktqlock.c b/kernel/tktqlock.c
> new file mode 100644
> index 0000000..f01b760
> --- /dev/null
> +++ b/kernel/tktqlock.c
> @@ -0,0 +1,333 @@
> +/*
> + * Queued ticket spinlocks.
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program; if not, write to the Free Software
> + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
> + *
> + * Copyright IBM Corporation, 2013
> + *
> + * Authors: Paul E. McKenney <[email protected]>
> + */
> +#include <linux/types.h>
> +#include <linux/kernel.h>
> +#include <linux/spinlock.h>
> +#include <linux/smp.h>
> +#include <linux/percpu.h>
> +
> +struct tkt_q {
> + int cpu;
> + __ticket_t tail;
> + struct tkt_q *next;
> +};
> +
> +struct tkt_q_head {
> + arch_spinlock_t *ref; /* Pointer to spinlock if in use. */
> + s32 head_tkt; /* Head ticket when started queuing. */
> + struct tkt_q *spin; /* Head of queue. */
> + struct tkt_q **spin_tail; /* Tail of queue. */
> +};
> +
> +/*
> + * TKT_Q_SWITCH is twice the number of CPUs that must be spinning on a
> + * given ticket lock to motivate switching to spinning on a queue.
> + * The reason that it is twice the number is because the bottom bit of
> + * the ticket is reserved for the bit that indicates that a queue is
> + * associated with the lock.
> + */
> +#define TKT_Q_SWITCH (16 * 2)
> +
> +/*
> + * TKT_Q_NQUEUES is the number of queues to maintain. Large systems
> + * might have multiple highly contended locks, so provide more queues for
> + * systems with larger numbers of CPUs.
> + */
> +#define TKT_Q_NQUEUES (DIV_ROUND_UP(NR_CPUS + TKT_Q_SWITCH - 1, TKT_Q_SWITCH) * 2)
> +
> +/* The queues themselves. */
> +struct tkt_q_head tkt_q_heads[TKT_Q_NQUEUES];
> +
> +/* Advance to the next queue slot, wrapping around to the beginning. */
> +static int tkt_q_next_slot(int i)
> +{
> + return (++i < TKT_Q_NQUEUES) ? i : 0;
> +}
> +
> +/* Very crude hash from lock address to queue slot number. */
> +static unsigned long tkt_q_hash(arch_spinlock_t *asp)
> +{
> + return (((unsigned long)asp) >> 8) % TKT_Q_NQUEUES;
> +}
> +
> +/*
> + * Return a pointer to the queue header associated with the specified lock,
> + * or return NULL if there is no queue for the lock or if the lock's queue
> + * is in transition.
> + */
> +static struct tkt_q_head *tkt_q_find_head(arch_spinlock_t *asp)
> +{
> + int i;
> + int start;
> +
> + start = i = tkt_q_hash(asp);
> + do
> + if (tkt_q_heads[i].ref == asp)
> + return &tkt_q_heads[i];
> + while ((i = tkt_q_next_slot(i)) != start);
> + return NULL;
> +}
> +
> +/*
> + * Try to stop queuing, reverting back to normal ticket-lock operation.
> + * We can only stop queuing when the queue is empty, which means that
> + * we need to correctly handle races where someone shows up in the queue
> + * just as we are trying to dispense with the queue. They win, we lose.
> + */
> +static bool tkt_q_try_unqueue(arch_spinlock_t *asp, struct tkt_q_head *tqhp)
> +{
> + arch_spinlock_t asold;
> + arch_spinlock_t asnew;
> +
> + /* Pick up the ticket values. */
> + asold = ACCESS_ONCE(*asp);
> + if ((asold.tickets.head & ~0x1) == asold.tickets.tail) {
> +
> + /* Attempt to mark the lock as not having a queue. */
> + asnew = asold;
> + asnew.tickets.head &= ~0x1;
> + if (cmpxchg(&asp->head_tail,
> + asold.head_tail,
> + asnew.head_tail) == asold.head_tail) {
> +
> + /* Succeeded, mark the queue as unused. */
> + ACCESS_ONCE(tqhp->ref) = NULL;
> + return true;
> + }
> + }
> +
> + /* Failed, tell the caller there is still a queue to pass off to. */
> + return false;
> +}
> +
> +/*
> + * Hand the lock off to the first CPU on the queue.
> + */
> +void tkt_q_do_wake(arch_spinlock_t *asp)
> +{
> + struct tkt_q_head *tqhp;
> + struct tkt_q *tqp;
> +
> + /* If the queue is still being set up, wait for it. */
> + while ((tqhp = tkt_q_find_head(asp)) == NULL)
> + cpu_relax();
> +
> + for (;;) {
> +
> + /* Find the first queue element. */
> + tqp = ACCESS_ONCE(tqhp->spin);
> + if (tqp != NULL)
> + break; /* Element exists, hand off lock. */
> + if (tkt_q_try_unqueue(asp, tqhp))
> + return; /* No element, successfully removed queue. */
> + cpu_relax();
> + }
> + if (ACCESS_ONCE(tqhp->head_tkt) != -1)
> + ACCESS_ONCE(tqhp->head_tkt) = -1;
> + smp_mb(); /* Order pointer fetch and assignment against handoff. */
> + ACCESS_ONCE(tqp->cpu) = -1;
> +}
> +
> +/*
> + * Given a lock that already has a queue associated with it, spin on
> + * that queue. Return false if there was no queue (which means we do not
> + * hold the lock) and true otherwise (meaning we -do- hold the lock).
> + */
> +bool tkt_q_do_spin(arch_spinlock_t *asp, struct __raw_tickets inc)
> +{
> + struct tkt_q **oldtail;
> + struct tkt_q tq;
> + struct tkt_q_head *tqhp;
> +
> + /*
> + * Ensure that accesses to queue header happen after sensing
> + * the lock's have-queue bit.
> + */
> + smp_mb(); /* See above block comment. */
> +
> + /* If there no longer is a queue, leave. */
> + tqhp = tkt_q_find_head(asp);
> + if (tqhp == NULL)
> + return false;
> +
> + /* Initialize our queue element. */
> + tq.cpu = raw_smp_processor_id();
> + tq.tail = inc.tail;
> + tq.next = NULL;
> +
> + /* Check to see if we already hold the lock. */
> + if (ACCESS_ONCE(tqhp->head_tkt) == inc.tail) {
> + /* The last holder left before queue formed, we hold lock. */
> + tqhp->head_tkt = -1;
> + return true;
> + }
> +
> + /* Add our element to the tail of the queue. */
> + oldtail = xchg(&tqhp->spin_tail, &tq.next);
> + ACCESS_ONCE(*oldtail) = &tq;
> +
> + /* Spin until handoff. */
> + while (ACCESS_ONCE(tq.cpu) != -1)
> + cpu_relax();
> +
> + /*
> + * Remove our element from the queue. If the queue is now empty,
> + * update carefully so that the next acquisition will queue itself
> + * at the head of the list.
> + */
> + if (tq.next == NULL) {
> +
> + /* Mark the queue empty. */
> + tqhp->spin = NULL;
> +
> + /* Try to point the tail back at the head. */
> + if (cmpxchg(&tqhp->spin_tail,
> + &tq.next,
> + &tqhp->spin) == &tq.next)
> + return true; /* Succeeded, queue is now empty. */
> +
> + /* Failed, if needed, wait for the enqueue to complete. */
> + while (tq.next == NULL)
> + cpu_relax();
> +
> + /* The following code will repair the head. */
> + }
> + smp_mb(); /* Force ordering between handoff and critical section. */
> +
> + /* Advance list-head pointer. */
> + ACCESS_ONCE(tqhp->spin) = tq.next;
> + return true;
> +}
> +
> +/*
> + * Given a lock that does not have a queue, attempt to associate the
> + * i-th queue with it, returning true if successful (meaning we hold
> + * the lock) or false otherwise (meaning we do -not- hold the lock).
> + * Note that the caller has already filled in ->ref with 0x1, so we
> + * own the queue.
> + */
> +static bool
> +tkt_q_init_contend(int i, arch_spinlock_t *asp, struct __raw_tickets inc)
> +{
> + arch_spinlock_t asold;
> + arch_spinlock_t asnew;
> + struct tkt_q_head *tqhp;
> +
> + /* Initialize the i-th queue header. */
> + tqhp = &tkt_q_heads[i];
> + tqhp->spin = NULL;
> + tqhp->spin_tail = &tqhp->spin;
> +
> + /* Each pass through this loop attempts to mark the lock as queued. */
> + do {
> + asold.head_tail = ACCESS_ONCE(asp->head_tail);
> + asnew = asold;
> + if (asnew.tickets.head & 0x1) {
> +
> + /* Someone beat us to it, back out. */
> + smp_mb();
> + ACCESS_ONCE(tqhp->ref) = NULL;
> +
> + /* Spin on the queue element they set up. */
> + return tkt_q_do_spin(asp, inc);
> + }
> +
> + /* The low-order bit in the head counter says "queued". */
> + asnew.tickets.head |= 0x1;
> + } while (cmpxchg(&asp->head_tail,
> + asold.head_tail,
> + asnew.head_tail) != asold.head_tail);
> +
> + /* Point the queue at the lock and go spin on it. */
> + tqhp->head_tkt = asold.tickets.head;
> + smp_mb(); /* Ensure head_tkt is set prior to queuers seeing tqhp. */
> + ACCESS_ONCE(tqhp->ref) = asp;
> + return tkt_q_do_spin(asp, inc);
> +}

Just a small revision.

I just moved "tqhp->head_tkt = asold.tickets.head;" into the loop, so
that we can use "asp->tickets.head & 0x1" to indicate that the queued
spinlock is prepared, instead of relying on "tqhp->ref == asp".

See the appended diff.
(And I guess that, after this change, you could make only the CPUs for which
"inc.tail - tqhp->head_tkt > TKT_Q_SWITCH"
holds do the queued spin, to remove the thundering herd.)

diff --git a/kernel/tktqlock.c b/kernel/tktqlock.c
index f01b760..4ea409b 100644
--- a/kernel/tktqlock.c
+++ b/kernel/tktqlock.c
@@ -27,7 +27,6 @@

struct tkt_q {
int cpu;
- __ticket_t tail;
struct tkt_q *next;
};

@@ -127,9 +126,8 @@ void tkt_q_do_wake(arch_spinlock_t *asp)
struct tkt_q_head *tqhp;
struct tkt_q *tqp;

- /* If the queue is still being set up, wait for it. */
- while ((tqhp = tkt_q_find_head(asp)) == NULL)
- cpu_relax();
+ tqhp = tkt_q_find_head(asp);
+ BUG_ON(!tqhp);

for (;;) {

@@ -141,8 +139,6 @@ void tkt_q_do_wake(arch_spinlock_t *asp)
return; /* No element, successfully removed queue. */
cpu_relax();
}
- if (ACCESS_ONCE(tqhp->head_tkt) != -1)
- ACCESS_ONCE(tqhp->head_tkt) = -1;
smp_mb(); /* Order pointer fetch and assignment against handoff. */
ACCESS_ONCE(tqp->cpu) = -1;
}
@@ -164,20 +160,16 @@ bool tkt_q_do_spin(arch_spinlock_t *asp, struct __raw_tickets inc)
*/
smp_mb(); /* See above block comment. */

- /* If there no longer is a queue, leave. */
tqhp = tkt_q_find_head(asp);
- if (tqhp == NULL)
- return false;
+ BUG_ON(!tqhp);

/* Initialize our queue element. */
tq.cpu = raw_smp_processor_id();
- tq.tail = inc.tail;
tq.next = NULL;

/* Check to see if we already hold the lock. */
if (ACCESS_ONCE(tqhp->head_tkt) == inc.tail) {
/* The last holder left before queue formed, we hold lock. */
- tqhp->head_tkt = -1;
return true;
}

@@ -251,16 +243,14 @@ tkt_q_init_contend(int i, arch_spinlock_t *asp, struct __raw_tickets inc)
return tkt_q_do_spin(asp, inc);
}

+ /* Point the queue at the lock and go spin on it. */
+ tqhp->head_tkt = asold.tickets.head;
/* The low-order bit in the head counter says "queued". */
asnew.tickets.head |= 0x1;
} while (cmpxchg(&asp->head_tail,
asold.head_tail,
asnew.head_tail) != asold.head_tail);

- /* Point the queue at the lock and go spin on it. */
- tqhp->head_tkt = asold.tickets.head;
- smp_mb(); /* Ensure head_tkt is set prior to queuers seeing tqhp. */
- ACCESS_ONCE(tqhp->ref) = asp;
return tkt_q_do_spin(asp, inc);
}

@@ -282,14 +272,9 @@ bool tkt_q_start_contend(arch_spinlock_t *asp, struct __raw_tickets inc)
* the lock with the corresponding queue.
*/
do {
- /*
- * Use 0x1 to mark the queue in use, but also avoiding
- * any spinners trying to use it before we get it all
- * initialized.
- */
if (cmpxchg(&tkt_q_heads[i].ref,
NULL,
- (arch_spinlock_t *)0x1) == NULL) {
+ asp) == NULL) {

/* Succeeded, now go initialize it. */
return tkt_q_init_contend(i, asp, inc);

2013-06-11 15:10:32

by Lai Jiangshan

[permalink] [raw]
Subject: Re: [PATCH RFC ticketlock] Auto-queued ticketlock

On Tue, Jun 11, 2013 at 10:48 PM, Lai Jiangshan <[email protected]> wrote:
> On Mon, Jun 10, 2013 at 3:36 AM, Paul E. McKenney
> <[email protected]> wrote:
>> Breaking up locks is better than implementing high-contention locks, but
>> if we must have high-contention locks, why not make them automatically
>> switch between light-weight ticket locks at low contention and queued
>> locks at high contention?
>>
>> This commit therefore allows ticket locks to automatically switch between
>> pure ticketlock and queued-lock operation as needed. If too many CPUs
>> are spinning on a given ticket lock, a queue structure will be allocated
>> and the lock will switch to queued-lock operation. When the lock becomes
>> free, it will switch back into ticketlock operation. The low-order bit
>> of the head counter is used to indicate that the lock is in queued mode,
>> which forces an unconditional mismatch between the head and tail counters.
>> This approach means that the common-case code path under conditions of
>> low contention is very nearly that of a plain ticket lock.
>>
>> A fixed number of queueing structures is statically allocated in an
>> array. The ticket-lock address is used to hash into an initial element,
>> but if that element is already in use, it moves to the next element. If
>> the entire array is already in use, continue to spin in ticket mode.
>>
>> This has been only lightly tested in the kernel, though a userspace
>> implementation has survived substantial testing.
>>
>> Signed-off-by: Paul E. McKenney <[email protected]>
>>
>> diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h
>> index 33692ea..b4a91b0 100644
>> --- a/arch/x86/include/asm/spinlock.h
>> +++ b/arch/x86/include/asm/spinlock.h
>> @@ -34,6 +34,8 @@
>> # define UNLOCK_LOCK_PREFIX
>> #endif
>>
>> +#ifndef CONFIG_TICKET_LOCK_QUEUED
>> +
>> /*
>> * Ticket locks are conceptually two parts, one indicating the current head of
>> * the queue, and the other indicating the current tail. The lock is acquired
>> @@ -62,6 +64,25 @@ static __always_inline void __ticket_spin_lock(arch_spinlock_t *lock)
>> barrier(); /* make sure nothing creeps before the lock is taken */
>> }
>>
>> +#else /* #ifndef CONFIG_TICKET_LOCK_QUEUED */
>> +
>> +bool tkt_spin_pass(arch_spinlock_t *ap, struct __raw_tickets inc);
>> +
>> +static __always_inline void __ticket_spin_lock(arch_spinlock_t *lock)
>> +{
>> + register struct __raw_tickets inc = { .tail = 2 };
>> +
>> + inc = xadd(&lock->tickets, inc);
>> + for (;;) {
>> + if (inc.head == inc.tail || tkt_spin_pass(lock, inc))
>> + break;
>> + inc.head = ACCESS_ONCE(lock->tickets.head);
>> + }
>> + barrier(); /* smp_mb() on Power or ARM. */
>> +}
>> +
>> +#endif /* #else #ifndef CONFIG_TICKET_LOCK_QUEUED */
>> +
>> static __always_inline int __ticket_spin_trylock(arch_spinlock_t *lock)
>> {
>> arch_spinlock_t old, new;
>> @@ -70,17 +91,37 @@ static __always_inline int __ticket_spin_trylock(arch_spinlock_t *lock)
>> if (old.tickets.head != old.tickets.tail)
>> return 0;
>>
>> +#ifndef CONFIG_TICKET_LOCK_QUEUED
>> new.head_tail = old.head_tail + (1 << TICKET_SHIFT);
>> +#else /* #ifndef CONFIG_TICKET_LOCK_QUEUED */
>> + new.head_tail = old.head_tail + (2 << TICKET_SHIFT);
>> +#endif /* #else #ifndef CONFIG_TICKET_LOCK_QUEUED */
>>
>> /* cmpxchg is a full barrier, so nothing can move before it */
>> return cmpxchg(&lock->head_tail, old.head_tail, new.head_tail) == old.head_tail;
>> }
>>
>> +#ifndef CONFIG_TICKET_LOCK_QUEUED
>> +
>> static __always_inline void __ticket_spin_unlock(arch_spinlock_t *lock)
>> {
>> __add(&lock->tickets.head, 1, UNLOCK_LOCK_PREFIX);
>> }
>>
>> +#else /* #ifndef CONFIG_TICKET_LOCK_QUEUED */
>> +
>> +extern void tkt_q_do_wake(arch_spinlock_t *asp);
>> +
>> +static __always_inline void __ticket_spin_unlock(arch_spinlock_t *lock)
>> +{
>> + __ticket_t head = 2;
>> +
>> + head = xadd(&lock->tickets.head, 2);
>> + if (head & 0x1)
>> + tkt_q_do_wake(lock);
>> +}
>> +#endif /* #else #ifndef CONFIG_TICKET_LOCK_QUEUED */
>> +
>> static inline int __ticket_spin_is_locked(arch_spinlock_t *lock)
>> {
>> struct __raw_tickets tmp = ACCESS_ONCE(lock->tickets);
>> diff --git a/arch/x86/include/asm/spinlock_types.h b/arch/x86/include/asm/spinlock_types.h
>> index ad0ad07..cdaefdd 100644
>> --- a/arch/x86/include/asm/spinlock_types.h
>> +++ b/arch/x86/include/asm/spinlock_types.h
>> @@ -7,12 +7,18 @@
>>
>> #include <linux/types.h>
>>
>> -#if (CONFIG_NR_CPUS < 256)
>> +#if (CONFIG_NR_CPUS < 128)
>> typedef u8 __ticket_t;
>> typedef u16 __ticketpair_t;
>> -#else
>> +#define TICKET_T_CMP_GE(a, b) (UCHAR_MAX / 2 >= (unsigned char)((a) - (b)))
>> +#elif (CONFIG_NR_CPUS < 32768)
>> typedef u16 __ticket_t;
>> typedef u32 __ticketpair_t;
>> +#define TICKET_T_CMP_GE(a, b) (USHRT_MAX / 2 >= (unsigned short)((a) - (b)))
>> +#else
>> +typedef u32 __ticket_t;
>> +typedef u64 __ticketpair_t;
>> +#define TICKET_T_CMP_GE(a, b) (UINT_MAX / 2 >= (unsigned int)((a) - (b)))
>> #endif
>>
>> #define TICKET_SHIFT (sizeof(__ticket_t) * 8)
>> @@ -21,7 +27,11 @@ typedef struct arch_spinlock {
>> union {
>> __ticketpair_t head_tail;
>> struct __raw_tickets {
>> +#ifdef __BIG_ENDIAN__
>> + __ticket_t tail, head;
>> +#else /* #ifdef __BIG_ENDIAN__ */
>> __ticket_t head, tail;
>> +#endif /* #else #ifdef __BIG_ENDIAN__ */
>> } tickets;
>> };
>> } arch_spinlock_t;
>> diff --git a/include/linux/kernel.h b/include/linux/kernel.h
>> index e9ef6d6..816a87c 100644
>> --- a/include/linux/kernel.h
>> +++ b/include/linux/kernel.h
>> @@ -15,6 +15,7 @@
>> #include <asm/byteorder.h>
>> #include <uapi/linux/kernel.h>
>>
>> +#define UCHAR_MAX ((u8)(~0U))
>> #define USHRT_MAX ((u16)(~0U))
>> #define SHRT_MAX ((s16)(USHRT_MAX>>1))
>> #define SHRT_MIN ((s16)(-SHRT_MAX - 1))
>> diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
>> index 44511d1..ad9c67c 100644
>> --- a/kernel/Kconfig.locks
>> +++ b/kernel/Kconfig.locks
>> @@ -223,3 +223,21 @@ endif
>> config MUTEX_SPIN_ON_OWNER
>> def_bool y
>> depends on SMP && !DEBUG_MUTEXES
>> +
>> +config TICKET_LOCK_QUEUED
>> + bool "Dynamically switch between ticket and queued locking"
>> + default n
>> + ---help---
>> + Enable dynamic switching between ticketlock and queued locking
>> + on a per-lock basis. This option will slow down low-contention
>> + acquisition and release very slightly (additional conditional
>> + in release path), but will provide more efficient operation at
>> + high levels of lock contention. High-contention operation will
>> + not be quite as efficient as would be a pure queued lock, but
>> + this dynamic approach consumes less memory than queud locks
>> + and also runs faster at low levels of contention.
>> +
>> + Say "Y" if you are running on a large system with a workload
>> + that is likely to result in high levels of contention.
>> +
>> + Say "N" if you are unsure.
>> diff --git a/kernel/Makefile b/kernel/Makefile
>> index 271fd31..70a91f7 100644
>> --- a/kernel/Makefile
>> +++ b/kernel/Makefile
>> @@ -51,6 +51,7 @@ endif
>> obj-$(CONFIG_SMP) += spinlock.o
>> obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
>> obj-$(CONFIG_PROVE_LOCKING) += spinlock.o
>> +obj-$(CONFIG_TICKET_LOCK_QUEUED) += tktqlock.o
>> obj-$(CONFIG_UID16) += uid16.o
>> obj-$(CONFIG_MODULES) += module.o
>> obj-$(CONFIG_MODULE_SIG) += module_signing.o modsign_pubkey.o modsign_certificate.o
>> diff --git a/kernel/tktqlock.c b/kernel/tktqlock.c
>> new file mode 100644
>> index 0000000..f01b760
>> --- /dev/null
>> +++ b/kernel/tktqlock.c
>> @@ -0,0 +1,333 @@
>> +/*
>> + * Queued ticket spinlocks.
>> + *
>> + * This program is free software; you can redistribute it and/or modify
>> + * it under the terms of the GNU General Public License as published by
>> + * the Free Software Foundation; either version 2 of the License, or
>> + * (at your option) any later version.
>> + *
>> + * This program is distributed in the hope that it will be useful,
>> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
>> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
>> + * GNU General Public License for more details.
>> + *
>> + * You should have received a copy of the GNU General Public License
>> + * along with this program; if not, write to the Free Software
>> + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
>> + *
>> + * Copyright IBM Corporation, 2013
>> + *
>> + * Authors: Paul E. McKenney <[email protected]>
>> + */
>> +#include <linux/types.h>
>> +#include <linux/kernel.h>
>> +#include <linux/spinlock.h>
>> +#include <linux/smp.h>
>> +#include <linux/percpu.h>
>> +
>> +struct tkt_q {
>> + int cpu;
>> + __ticket_t tail;
>> + struct tkt_q *next;
>> +};
>> +
>> +struct tkt_q_head {
>> + arch_spinlock_t *ref; /* Pointer to spinlock if in use. */
>> + s32 head_tkt; /* Head ticket when started queuing. */
>> + struct tkt_q *spin; /* Head of queue. */
>> + struct tkt_q **spin_tail; /* Tail of queue. */
>> +};
>> +
>> +/*
>> + * TKT_Q_SWITCH is twice the number of CPUs that must be spinning on a
>> + * given ticket lock to motivate switching to spinning on a queue.
>> + * The reason that it is twice the number is because the bottom bit of
>> + * the ticket is reserved for the bit that indicates that a queue is
>> + * associated with the lock.
>> + */
>> +#define TKT_Q_SWITCH (16 * 2)
>> +
>> +/*
>> + * TKT_Q_NQUEUES is the number of queues to maintain. Large systems
>> + * might have multiple highly contended locks, so provide more queues for
>> + * systems with larger numbers of CPUs.
>> + */
>> +#define TKT_Q_NQUEUES (DIV_ROUND_UP(NR_CPUS + TKT_Q_SWITCH - 1, TKT_Q_SWITCH) * 2)
>> +
>> +/* The queues themselves. */
>> +struct tkt_q_head tkt_q_heads[TKT_Q_NQUEUES];
>> +
>> +/* Advance to the next queue slot, wrapping around to the beginning. */
>> +static int tkt_q_next_slot(int i)
>> +{
>> + return (++i < TKT_Q_NQUEUES) ? i : 0;
>> +}
>> +
>> +/* Very crude hash from lock address to queue slot number. */
>> +static unsigned long tkt_q_hash(arch_spinlock_t *asp)
>> +{
>> + return (((unsigned long)asp) >> 8) % TKT_Q_NQUEUES;
>> +}
>> +
>> +/*
>> + * Return a pointer to the queue header associated with the specified lock,
>> + * or return NULL if there is no queue for the lock or if the lock's queue
>> + * is in transition.
>> + */
>> +static struct tkt_q_head *tkt_q_find_head(arch_spinlock_t *asp)
>> +{
>> + int i;
>> + int start;
>> +
>> + start = i = tkt_q_hash(asp);
>> + do
>> + if (tkt_q_heads[i].ref == asp)
>> + return &tkt_q_heads[i];
>> + while ((i = tkt_q_next_slot(i)) != start);
>> + return NULL;
>> +}
>> +
>> +/*
>> + * Try to stop queuing, reverting back to normal ticket-lock operation.
>> + * We can only stop queuing when the queue is empty, which means that
>> + * we need to correctly handle races where someone shows up in the queue
>> + * just as we are trying to dispense with the queue. They win, we lose.
>> + */
>> +static bool tkt_q_try_unqueue(arch_spinlock_t *asp, struct tkt_q_head *tqhp)
>> +{
>> + arch_spinlock_t asold;
>> + arch_spinlock_t asnew;
>> +
>> + /* Pick up the ticket values. */
>> + asold = ACCESS_ONCE(*asp);
>> + if ((asold.tickets.head & ~0x1) == asold.tickets.tail) {
>> +
>> + /* Attempt to mark the lock as not having a queue. */
>> + asnew = asold;
>> + asnew.tickets.head &= ~0x1;
>> + if (cmpxchg(&asp->head_tail,
>> + asold.head_tail,
>> + asnew.head_tail) == asold.head_tail) {
>> +
>> + /* Succeeded, mark the queue as unused. */
>> + ACCESS_ONCE(tqhp->ref) = NULL;
>> + return true;
>> + }
>> + }
>> +
>> + /* Failed, tell the caller there is still a queue to pass off to. */
>> + return false;
>> +}
>> +
>> +/*
>> + * Hand the lock off to the first CPU on the queue.
>> + */
>> +void tkt_q_do_wake(arch_spinlock_t *asp)
>> +{
>> + struct tkt_q_head *tqhp;
>> + struct tkt_q *tqp;
>> +
>> + /* If the queue is still being set up, wait for it. */
>> + while ((tqhp = tkt_q_find_head(asp)) == NULL)
>> + cpu_relax();
>> +
>> + for (;;) {
>> +
>> + /* Find the first queue element. */
>> + tqp = ACCESS_ONCE(tqhp->spin);
>> + if (tqp != NULL)
>> + break; /* Element exists, hand off lock. */
>> + if (tkt_q_try_unqueue(asp, tqhp))
>> + return; /* No element, successfully removed queue. */
>> + cpu_relax();
>> + }
>> + if (ACCESS_ONCE(tqhp->head_tkt) != -1)
>> + ACCESS_ONCE(tqhp->head_tkt) = -1;
>> + smp_mb(); /* Order pointer fetch and assignment against handoff. */
>> + ACCESS_ONCE(tqp->cpu) = -1;
>> +}
>> +
>> +/*
>> + * Given a lock that already has a queue associated with it, spin on
>> + * that queue. Return false if there was no queue (which means we do not
>> + * hold the lock) and true otherwise (meaning we -do- hold the lock).
>> + */
>> +bool tkt_q_do_spin(arch_spinlock_t *asp, struct __raw_tickets inc)
>> +{
>> + struct tkt_q **oldtail;
>> + struct tkt_q tq;
>> + struct tkt_q_head *tqhp;
>> +
>> + /*
>> + * Ensure that accesses to queue header happen after sensing
>> + * the lock's have-queue bit.
>> + */
>> + smp_mb(); /* See above block comment. */
>> +
>> + /* If there no longer is a queue, leave. */
>> + tqhp = tkt_q_find_head(asp);
>> + if (tqhp == NULL)
>> + return false;
>> +
>> + /* Initialize our queue element. */
>> + tq.cpu = raw_smp_processor_id();
>> + tq.tail = inc.tail;
>> + tq.next = NULL;
>> +
>> + /* Check to see if we already hold the lock. */
>> + if (ACCESS_ONCE(tqhp->head_tkt) == inc.tail) {
>> + /* The last holder left before queue formed, we hold lock. */
>> + tqhp->head_tkt = -1;
>> + return true;
>> + }
>> +
>> + /* Add our element to the tail of the queue. */
>> + oldtail = xchg(&tqhp->spin_tail, &tq.next);
>> + ACCESS_ONCE(*oldtail) = &tq;
>> +
>> + /* Spin until handoff. */
>> + while (ACCESS_ONCE(tq.cpu) != -1)
>> + cpu_relax();
>> +
>> + /*
>> + * Remove our element from the queue. If the queue is now empty,
>> + * update carefully so that the next acquisition will queue itself
>> + * at the head of the list.
>> + */
>> + if (tq.next == NULL) {
>> +
>> + /* Mark the queue empty. */
>> + tqhp->spin = NULL;
>> +
>> + /* Try to point the tail back at the head. */
>> + if (cmpxchg(&tqhp->spin_tail,
>> + &tq.next,
>> + &tqhp->spin) == &tq.next)
>> + return true; /* Succeeded, queue is now empty. */
>> +
>> + /* Failed, if needed, wait for the enqueue to complete. */
>> + while (tq.next == NULL)
>> + cpu_relax();
>> +
>> + /* The following code will repair the head. */
>> + }
>> + smp_mb(); /* Force ordering between handoff and critical section. */
>> +
>> + /* Advance list-head pointer. */
>> + ACCESS_ONCE(tqhp->spin) = tq.next;
>> + return true;
>> +}
>> +
>> +/*
>> + * Given a lock that does not have a queue, attempt to associate the
>> + * i-th queue with it, returning true if successful (meaning we hold
>> + * the lock) or false otherwise (meaning we do -not- hold the lock).
>> + * Note that the caller has already filled in ->ref with 0x1, so we
>> + * own the queue.
>> + */
>> +static bool
>> +tkt_q_init_contend(int i, arch_spinlock_t *asp, struct __raw_tickets inc)
>> +{
>> + arch_spinlock_t asold;
>> + arch_spinlock_t asnew;
>> + struct tkt_q_head *tqhp;
>> +
>> + /* Initialize the i-th queue header. */
>> + tqhp = &tkt_q_heads[i];
>> + tqhp->spin = NULL;
>> + tqhp->spin_tail = &tqhp->spin;
>> +
>> + /* Each pass through this loop attempts to mark the lock as queued. */
>> + do {
>> + asold.head_tail = ACCESS_ONCE(asp->head_tail);
>> + asnew = asold;
>> + if (asnew.tickets.head & 0x1) {
>> +
>> + /* Someone beat us to it, back out. */
>> + smp_mb();
>> + ACCESS_ONCE(tqhp->ref) = NULL;
>> +
>> + /* Spin on the queue element they set up. */
>> + return tkt_q_do_spin(asp, inc);
>> + }
>> +
>> + /* The low-order bit in the head counter says "queued". */
>> + asnew.tickets.head |= 0x1;
>> + } while (cmpxchg(&asp->head_tail,
>> + asold.head_tail,
>> + asnew.head_tail) != asold.head_tail);
>> +
>> + /* Point the queue at the lock and go spin on it. */
>> + tqhp->head_tkt = asold.tickets.head;
>> + smp_mb(); /* Ensure head_tkt is set prior to queuers seeing tqhp. */
>> + ACCESS_ONCE(tqhp->ref) = asp;
>> + return tkt_q_do_spin(asp, inc);
>> +}
>
> Just a small revision.

Sorry, it is wrong: tkt_q_find_head() will return the wrong head.
Could we use only tkt_q_heads[tkt_q_hash(asp)] instead of finding a free one?
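
A minimal sketch of that single-slot variant (hypothetical, not part of
the patch; it only works if the contend path claims exactly the hashed
slot and waits for it, rather than probing for a free one):

static struct tkt_q_head *tkt_q_find_head(arch_spinlock_t *asp)
{
	struct tkt_q_head *tqhp = &tkt_q_heads[tkt_q_hash(asp)];

	/* No probing: either the hashed slot references this lock or
	 * there is no queue (or it is in transition). */
	return (ACCESS_ONCE(tqhp->ref) == asp) ? tqhp : NULL;
}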

>
> I just moved "tqhp->head_tkt = asold.tickets.head;" into the loop, so
> that we can use "asp->tickets.head & 0x1" to indicate that the queued
> spinlock is prepared, instead of relying on "tqhp->ref == asp".
>
> See the appended diff.
> (And I guess that, after this change, you could make only the CPUs for which
> "inc.tail - tqhp->head_tkt > TKT_Q_SWITCH"
> holds do the queued spin, to remove the thundering herd.)
>
> diff --git a/kernel/tktqlock.c b/kernel/tktqlock.c
> index f01b760..4ea409b 100644
> --- a/kernel/tktqlock.c
> +++ b/kernel/tktqlock.c
> @@ -27,7 +27,6 @@
>
> struct tkt_q {
> int cpu;
> - __ticket_t tail;
> struct tkt_q *next;
> };
>
> @@ -127,9 +126,8 @@ void tkt_q_do_wake(arch_spinlock_t *asp)
> struct tkt_q_head *tqhp;
> struct tkt_q *tqp;
>
> - /* If the queue is still being set up, wait for it. */
> - while ((tqhp = tkt_q_find_head(asp)) == NULL)
> - cpu_relax();
> + tqhp = tkt_q_find_head(asp);
> + BUG_ON(!tqhp);
>
> for (;;) {
>
> @@ -141,8 +139,6 @@ void tkt_q_do_wake(arch_spinlock_t *asp)
> return; /* No element, successfully removed queue. */
> cpu_relax();
> }
> - if (ACCESS_ONCE(tqhp->head_tkt) != -1)
> - ACCESS_ONCE(tqhp->head_tkt) = -1;
> smp_mb(); /* Order pointer fetch and assignment against handoff. */
> ACCESS_ONCE(tqp->cpu) = -1;
> }
> @@ -164,20 +160,16 @@ bool tkt_q_do_spin(arch_spinlock_t *asp, struct __raw_tickets inc)
> */
> smp_mb(); /* See above block comment. */
>
> - /* If there no longer is a queue, leave. */
> tqhp = tkt_q_find_head(asp);
> - if (tqhp == NULL)
> - return false;
> + BUG_ON(!tqhp);
>
> /* Initialize our queue element. */
> tq.cpu = raw_smp_processor_id();
> - tq.tail = inc.tail;
> tq.next = NULL;
>
> /* Check to see if we already hold the lock. */
> if (ACCESS_ONCE(tqhp->head_tkt) == inc.tail) {
> /* The last holder left before queue formed, we hold lock. */
> - tqhp->head_tkt = -1;
> return true;
> }
>
> @@ -251,16 +243,14 @@ tkt_q_init_contend(int i, arch_spinlock_t *asp, struct __raw_tickets inc)
> return tkt_q_do_spin(asp, inc);
> }
>
> + /* Point the queue at the lock and go spin on it. */
> + tqhp->head_tkt = asold.tickets.head;
> /* The low-order bit in the head counter says "queued". */
> asnew.tickets.head |= 0x1;
> } while (cmpxchg(&asp->head_tail,
> asold.head_tail,
> asnew.head_tail) != asold.head_tail);
>
> - /* Point the queue at the lock and go spin on it. */
> - tqhp->head_tkt = asold.tickets.head;
> - smp_mb(); /* Ensure head_tkt is set prior to queuers seeing tqhp. */
> - ACCESS_ONCE(tqhp->ref) = asp;
> return tkt_q_do_spin(asp, inc);
> }
>
> @@ -282,14 +272,9 @@ bool tkt_q_start_contend(arch_spinlock_t *asp, struct __raw_tickets inc)
> * the lock with the corresponding queue.
> */
> do {
> - /*
> - * Use 0x1 to mark the queue in use, but also avoiding
> - * any spinners trying to use it before we get it all
> - * initialized.
> - */
> if (cmpxchg(&tkt_q_heads[i].ref,
> NULL,
> - (arch_spinlock_t *)0x1) == NULL) {
> + asp) == NULL) {
>
> /* Succeeded, now go initialize it. */
> return tkt_q_init_contend(i, asp, inc);

if (tkt_q_heads[i].ref == asp)

2013-06-11 15:15:38

by Paul E. McKenney

[permalink] [raw]
Subject: Re: [PATCH RFC ticketlock] Auto-queued ticketlock

On Tue, Jun 11, 2013 at 02:56:55AM -0700, Paul E. McKenney wrote:
> On Mon, Jun 10, 2013 at 08:44:40PM -0400, Steven Rostedt wrote:
> > On Sun, 2013-06-09 at 12:36 -0700, Paul E. McKenney wrote:

[ . . . ]

> > > +bool tkt_q_do_spin(arch_spinlock_t *asp, struct __raw_tickets inc)
> > > +{
> > > + struct tkt_q **oldtail;
> > > + struct tkt_q tq;
> > > + struct tkt_q_head *tqhp;
> > > +
> > > + /*
> > > + * Ensure that accesses to queue header happen after sensing
> > > + * the lock's have-queue bit.
> > > + */
> > > + smp_mb(); /* See above block comment. */
> > > +
> > > + /* If there no longer is a queue, leave. */
> > > + tqhp = tkt_q_find_head(asp);
> > > + if (tqhp == NULL)
> > > + return false;
> > > +
> > > + /* Initialize our queue element. */
> > > + tq.cpu = raw_smp_processor_id();
> > > + tq.tail = inc.tail;
> > > + tq.next = NULL;
> > > +
> > > + /* Check to see if we already hold the lock. */
> > > + if (ACCESS_ONCE(tqhp->head_tkt) == inc.tail) {
> > > + /* The last holder left before queue formed, we hold lock. */
> > > + tqhp->head_tkt = -1;
> > > + return true;
> > > + }
> > > +
> > > + /* Add our element to the tail of the queue. */
> > > + oldtail = xchg(&tqhp->spin_tail, &tq.next);
> >
> > Boy this is tricky code! I thought I found a race window here, but as I
> > went to write my email saying "Gotcha!" I found that it wasn't a race
> > after all. But as I went though the effort of writing this, I figured I
> > would send this out as documentation for others to see. Hmm, I wonder if
> > we can use this email to add more comments. Anyway, here's what I
> > thought was wrong ;-)
>
> If you didn't know any better, you might even think that I had done
> something like this before. ;-)
>
> > OK, I originally thought there was a race window here. Let's say that an
> > NMI hit right here, and it happens to be a big one, where lots of things
> > can happen on other CPUs right now.
> >
> > The scenario is that there's just one item on the queue, which is
> > waiting for the lock to be released, and is spinning below in the:
> >
> > while (ACCESS_ONCE(tq.cpu) != -1)
> > cpu_relax();
> >
> > And then the lock is released, where in tkt_q_do_wake() the following is
> > called:
> >
> > ACCESS_ONCE(tqp->cpu) = -1;
> >
> > Now the old queued task is released. But its tq->next hasn't been set
> > yet, and is still NULL. It leaves by doing:
> >
> > ACCESS_ONCE(tqhp->spin) = tq.next;
> > return true;
> >
> > All before this task gets to set *oldtail to &tq. But, I then looked
> > below...
> >
> >
> > > + ACCESS_ONCE(*oldtail) = &tq;
> > > +
> > > + /* Spin until handoff. */
> > > + while (ACCESS_ONCE(tq.cpu) != -1)
> > > + cpu_relax();
> > > +
> > > + /*
> > > + * Remove our element from the queue. If the queue is now empty,
> > > + * update carefully so that the next acquisition will queue itself
> > > + * at the head of the list.
> > > + */
> > > + if (tq.next == NULL) {
> >
> > This checks for that scenario.
>
> Yep!
>
> > As if the old task were to come out
> > spinning, the problem would only be if it was the last one on the list,
> > and its tq.next was NULL. But if that was the case, then we set spin to
> > NULL and do the next trick, where I thought I gotcha again...
> >
> >
> > > +
> > > + /* Mark the queue empty. */
> > > + tqhp->spin = NULL;
> > > +
> > > + /* Try to point the tail back at the head. */
> > > + if (cmpxchg(&tqhp->spin_tail,
> > > + &tq.next,
> > > + &tqhp->spin) == &tq.next)
> >
> > Here, I was thinking, oh wait, what happens if this is called right
> > before the xchg() above. Then we would set spin_tail but not update the
> > old tq.next. But wait! look at what we assign spin_tail to. It's the
> > address of spin, which would be what oldtail would point to above, and
> > then above would set spin to the new tq!
>
> Yep again!
>
> > OK, I haven't found a issue here yet, but youss are beiing trickssy! We
> > don't like trickssy, and we must find precccciouss!!!
> >
> >
> > This code is starting to make me look like Gollum :-p
>
> Hmmm... The time and effort to do this might almost have been worthwhile
> just to accomplish that! ;-)
>
> But yes, this would need better comments, design documentation, or
> maybe both.

And for whatever it might be worth, here is an attempted upgrade for
comments.

First, I upgrade the comment for the xchg() that does the enqueue:

/*
* Add our element to the tail of the queue. Note that if the
* queue is empty, the ->spin_tail pointer will reference
* the queue's head pointer, namely ->spin.
*/
oldtail = xchg(&tqhp->spin_tail, &tq.next);
ACCESS_ONCE(*oldtail) = &tq;

Next, I upgrade the comment for the dequeue operation:

/*
* Remove our element from the queue. If the queue is now empty,
* update carefully so that the next acquisition will enqueue itself
* at the head of the list. Of course, the next enqueue operation
* might be happening concurrently, and this code needs to handle all
* of the possible combinations, keeping in mind that the enqueue
* operation happens in two stages: (1) update the tail pointer and
* (2) update the predecessor's ->next pointer. With this in mind,
* the following code needs to deal with three scenarios:
*
* 1. tq is the last entry. In this case, we use cmpxchg to
* point the list tail back to the list head (->spin). If
* the cmpxchg fails, that indicates that we are instead
* in scenario 2 below. If the cmpxchg succeeds, the next
* enqueue operation's tail-pointer exchange will enqueue
* the next element at the queue head, because the ->spin_tail
* pointer now references the queue head.
*
* 2. tq is the last entry, and the next entry has updated the
* tail pointer but has not yet updated tq.next. In this
* case, tq.next is NULL, the cmpxchg will fail, and the
* code will wait for the enqueue to complete before completing
* removal of tq from the list.
*
* 3. tq is not the last pointer. In this case, tq.next is non-NULL,
* so the following code simply removes tq from the list.
*/
if (tq.next == NULL) {

/* Mark the queue empty. */
tqhp->spin = NULL;

/* Try to point the tail back at the head. */
if (cmpxchg(&tqhp->spin_tail,
&tq.next,
&tqhp->spin) == &tq.next)
return true; /* Succeeded, queue is now empty. */

/* Failed, if needed, wait for the enqueue to complete. */
while (tq.next == NULL)
cpu_relax();

/* The following code will repair the head. */
}
smp_mb(); /* Force ordering between handoff and critical section. */

/*
* Advance list-head pointer. This same task will be the next to
* access this when releasing the lock, so no need for a memory
* barrier after the following assignment.
*/
ACCESS_ONCE(tqhp->spin) = tq.next;
return true;
}

Thanx, Paul

2013-06-11 15:22:50

by Steven Rostedt

[permalink] [raw]
Subject: Re: [PATCH RFC ticketlock] Auto-queued ticketlock

On Tue, 2013-06-11 at 03:14 -0700, Paul E. McKenney wrote:
>
> > Off-topic: although I have been in this community for several years,
> > I am not exactly clear on this problem.
> >
> > 1) In the general case, which lock is the most contended in the kernel? What does it protect?
> > 2) In which special cases, which lock is the most contended in the kernel? What does it protect?
> > 3) In the general case, which list is the hottest list?
> > 4) In which special cases, which list is the hottest list?
>
> Others would know better than I, but mmap_sem has been called out as a

If the contention is with mmap_sem, then I doubt this is going to help
much, as that's a sleeping rw semaphore. Now, rw semaphores are
implemented with raw spinlocks, but I doubt that would be the main point
of contention, compared to the sleeping part.

-- Steve

> prime offender for some workloads. There is of course some debate as
> to whether the fault lies mmap_sem or with the workloads. There have
> been some efforts to solve this one on LKML, plus some in academia have
> worked on this as well:
>

2013-06-11 15:57:37

by Waiman Long

[permalink] [raw]
Subject: Re: [PATCH RFC ticketlock] Auto-queued ticketlock

On 06/09/2013 03:36 PM, Paul E. McKenney wrote:
> Breaking up locks is better than implementing high-contention locks, but
> if we must have high-contention locks, why not make them automatically
> switch between light-weight ticket locks at low contention and queued
> locks at high contention?
>
> This commit therefore allows ticket locks to automatically switch between
> pure ticketlock and queued-lock operation as needed. If too many CPUs
> are spinning on a given ticket lock, a queue structure will be allocated
> and the lock will switch to queued-lock operation. When the lock becomes
> free, it will switch back into ticketlock operation. The low-order bit
> of the head counter is used to indicate that the lock is in queued mode,
> which forces an unconditional mismatch between the head and tail counters.
> This approach means that the common-case code path under conditions of
> low contention is very nearly that of a plain ticket lock.
>
> A fixed number of queueing structures is statically allocated in an
> array. The ticket-lock address is used to hash into an initial element,
> but if that element is already in use, it moves to the next element. If
> the entire array is already in use, continue to spin in ticket mode.
>
> This has been only lightly tested in the kernel, though a userspace
> implementation has survived substantial testing.
>
> Signed-off-by: Paul E. McKenney<[email protected]>

This is an interesting patch and I think it is useful for workloads that
run on systems with a large number of CPUs.

> diff --git a/arch/x86/include/asm/spinlock_types.h b/arch/x86/include/asm/spinlock_types.h
> index ad0ad07..cdaefdd 100644
> --- a/arch/x86/include/asm/spinlock_types.h
> +++ b/arch/x86/include/asm/spinlock_types.h
> @@ -7,12 +7,18 @@
>
> #include <linux/types.h>
>
> -#if (CONFIG_NR_CPUS < 256)
> +#if (CONFIG_NR_CPUS < 128)
> typedef u8 __ticket_t;
> typedef u16 __ticketpair_t;
> -#else
> +#define TICKET_T_CMP_GE(a, b) (UCHAR_MAX / 2 >= (unsigned char)((a) - (b)))
> +#elif (CONFIG_NR_CPUS < 32768)
> typedef u16 __ticket_t;
> typedef u32 __ticketpair_t;
> +#define TICKET_T_CMP_GE(a, b) (USHRT_MAX / 2 >= (unsigned short)((a) - (b)))
> +#else
> +typedef u32 __ticket_t;
> +typedef u64 __ticketpair_t;
> +#define TICKET_T_CMP_GE(a, b) (UINT_MAX / 2 >= (unsigned int)((a) - (b)))
> #endif

It is theoretically possible that a large number of CPUs (say 64 or
more with CONFIG_NR_CPUS < 128) can acquire a ticket from the lock
before the check for TICKET_T_CMP_GE() in tkt_spin_pass(). So the check
will fail even when there is a large number of CPUs contending for the
lock. The chance of this happening is, of course, extremely rare. This
is not an error, as the lock still works as it should without your
change.
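
To make the wraparound concrete, here is a small stand-alone user-space
sketch (not kernel code) of the u8 case, using the macro as defined in
the patch; with tails incremented by 2, 64 waiters are enough to push
the unsigned difference past UCHAR_MAX / 2:

#include <stdio.h>
#include <limits.h>

#define TICKET_T_CMP_GE(a, b) (UCHAR_MAX / 2 >= (unsigned char)((a) - (b)))

int main(void)
{
	unsigned char head = 0;

	/* 63 waiters: the difference is 126 <= 127, so the comparison behaves. */
	printf("%d\n", TICKET_T_CMP_GE(head + 2 * 63, head));	/* prints 1 */

	/* 64 waiters: the difference is 128 > 127, so the comparison flips. */
	printf("%d\n", TICKET_T_CMP_GE(head + 2 * 64, head));	/* prints 0 */
	return 0;
}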

>
> +/*
> + * TKT_Q_SWITCH is twice the number of CPUs that must be spinning on a
> + * given ticket lock to motivate switching to spinning on a queue.
> + * The reason that it is twice the number is because the bottom bit of
> + * the ticket is reserved for the bit that indicates that a queue is
> + * associated with the lock.
> + */
> +#define TKT_Q_SWITCH (16 * 2)
> +
> +/*
> + * TKT_Q_NQUEUES is the number of queues to maintain. Large systems
> + * might have multiple highly contended locks, so provide more queues for
> + * systems with larger numbers of CPUs.
> + */
> +#define TKT_Q_NQUEUES (DIV_ROUND_UP(NR_CPUS + TKT_Q_SWITCH - 1, TKT_Q_SWITCH) * 2)
> +
> +/* The queues themselves. */
> +struct tkt_q_head tkt_q_heads[TKT_Q_NQUEUES];

I am a bit concerned about the size of the head queue table itself. RHEL6,
for example, defines CONFIG_NR_CPUS to be 4096, which means a table
size of 256. Maybe it is better to dynamically allocate the table at
init time depending on the actual number of CPUs in the system.
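
A hypothetical sketch of that boot-time sizing (not part of the patch;
the names mirror it, but the init hook and sizing are assumptions, and
tkt_q_hash()/tkt_q_next_slot() would have to use the runtime size, with
tkt_spin_pass() falling back to pure ticket mode until the table exists):

/* Needs <linux/slab.h> for kcalloc(). */
static struct tkt_q_head *tkt_q_heads;
static int tkt_q_nqueues __read_mostly;

static int __init tkt_q_init(void)
{
	/* Scale by the CPUs actually possible, not by NR_CPUS. */
	tkt_q_nqueues = DIV_ROUND_UP(num_possible_cpus() + TKT_Q_SWITCH - 1,
				     TKT_Q_SWITCH) * 2;
	tkt_q_heads = kcalloc(tkt_q_nqueues, sizeof(*tkt_q_heads), GFP_KERNEL);
	BUG_ON(!tkt_q_heads);
	return 0;
}
early_initcall(tkt_q_init);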

> +/*
> + * Return a pointer to the queue header associated with the specified lock,
> + * or return NULL if there is no queue for the lock or if the lock's queue
> + * is in transition.
> + */
> +static struct tkt_q_head *tkt_q_find_head(arch_spinlock_t *asp)
> +{
> + int i;
> + int start;
> +
> + start = i = tkt_q_hash(asp);
> + do
> + if (tkt_q_heads[i].ref == asp)
> + return &tkt_q_heads[i];
> + while ((i = tkt_q_next_slot(i)) != start);
> + return NULL;
> +}

With a table size of 256, you may have to scan the whole table to find
the right head queue. This can be a significant overhead. I would suggest
setting a limit on how many entries it scans before it aborts, rather
than checking the whole table.
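
A sketch of such a bounded probe (the limit is an assumed tunable, not
part of the patch; note that this stays correct only if the path that
claims a slot for a lock is bounded by the same limit, so that a queue
can never live outside the probed window):

#define TKT_Q_MAX_PROBES 8

static struct tkt_q_head *tkt_q_find_head(arch_spinlock_t *asp)
{
	int i = tkt_q_hash(asp);
	int n;

	for (n = 0; n < TKT_Q_MAX_PROBES; n++) {
		if (tkt_q_heads[i].ref == asp)
			return &tkt_q_heads[i];
		i = tkt_q_next_slot(i);
	}
	return NULL;	/* Not found within the probed window. */
}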

> +/*
> + * Hand the lock off to the first CPU on the queue.
> + */
> +void tkt_q_do_wake(arch_spinlock_t *asp)
> +{
> + struct tkt_q_head *tqhp;
> + struct tkt_q *tqp;
> +
> + /* If the queue is still being set up, wait for it. */
> + while ((tqhp = tkt_q_find_head(asp)) == NULL)
> + cpu_relax();
> +
> + for (;;) {
> +
> + /* Find the first queue element. */
> + tqp = ACCESS_ONCE(tqhp->spin);
> + if (tqp != NULL)
> + break; /* Element exists, hand off lock. */
> + if (tkt_q_try_unqueue(asp, tqhp))
> + return; /* No element, successfully removed queue. */
> + cpu_relax();
> + }
> + if (ACCESS_ONCE(tqhp->head_tkt) != -1)
> + ACCESS_ONCE(tqhp->head_tkt) = -1;

In case NR_CPUS is 32768 or higher, the ticket will be of type u32 while
tqhp->head_tkt is s32, so -1 will be a valid ticket number. You may have
to conditionally define head_tkt as s64 when the ticket is u32.
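
A minimal sketch of that widening (the typedef name is hypothetical; the
struct layout otherwise follows the patch):

#if (CONFIG_NR_CPUS < 32768)
typedef s32 __tkt_head_tkt_t;	/* __ticket_t is u8 or u16: s32 is wide enough. */
#else
typedef s64 __tkt_head_tkt_t;	/* __ticket_t is u32: widen so -1 can never be a ticket. */
#endif

struct tkt_q_head {
	arch_spinlock_t *ref;		/* Pointer to spinlock if in use. */
	__tkt_head_tkt_t head_tkt;	/* Head ticket when started queuing, or -1. */
	struct tkt_q *spin;		/* Head of queue. */
	struct tkt_q **spin_tail;	/* Tail of queue. */
};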

Do you have any data on how much this patch can actually improve
performance on certain workloads? This will help the discussion here.

Regards,
Longman

2013-06-11 16:20:37

by Steven Rostedt

[permalink] [raw]
Subject: Re: [PATCH RFC ticketlock] Auto-queued ticketlock

On Tue, 2013-06-11 at 11:57 -0400, Waiman Long wrote:

> This is an interesting patch and I think it is useful for workloads that
> run on systems with a large number of CPUs.

I would say it is definitely a fun academic patch; whether it is
something for a production environment remains to be seen.

>
> > diff --git a/arch/x86/include/asm/spinlock_types.h b/arch/x86/include/asm/spinlock_types.h
> > index ad0ad07..cdaefdd 100644
> > --- a/arch/x86/include/asm/spinlock_types.h
> > +++ b/arch/x86/include/asm/spinlock_types.h
> > @@ -7,12 +7,18 @@
> >
> > #include <linux/types.h>
> >
> > -#if (CONFIG_NR_CPUS < 256)
> > +#if (CONFIG_NR_CPUS < 128)
> > typedef u8 __ticket_t;
> > typedef u16 __ticketpair_t;
> > -#else
> > +#define TICKET_T_CMP_GE(a, b) (UCHAR_MAX / 2 >= (unsigned char)((a) - (b)))
> > +#elif (CONFIG_NR_CPUS < 32768)
> > typedef u16 __ticket_t;
> > typedef u32 __ticketpair_t;
> > +#define TICKET_T_CMP_GE(a, b) (USHRT_MAX / 2 >= (unsigned short)((a) - (b)))
> > +#else
> > +typedef u32 __ticket_t;
> > +typedef u64 __ticketpair_t;
> > +#define TICKET_T_CMP_GE(a, b) (UINT_MAX / 2 >= (unsigned int)((a) - (b)))
> > #endif
>
> It is theoretically possible that a large number of CPUs (say 64 or
> more with CONFIG_NR_CPUS < 128) can acquire a ticket from the lock
> before the check for TICKET_T_CMP_GE() in tkt_spin_pass(). So the check
> will fail even when there is a large number of CPUs contending for the
> lock. The chance of this happening is, of course, extremely rare. This
> is not an error, as the lock still works as it should without your
> change.

Can you explain this more? How can you acquire the ticket and update at
the same time? If a queue has been set up, then you can't acquire the
ticket, as the head has a 1 appended to it.


> >
> > +/*
> > + * TKT_Q_SWITCH is twice the number of CPUs that must be spinning on a
> > + * given ticket lock to motivate switching to spinning on a queue.
> > + * The reason that it is twice the number is because the bottom bit of
> > + * the ticket is reserved for the bit that indicates that a queue is
> > + * associated with the lock.
> > + */
> > +#define TKT_Q_SWITCH (16 * 2)
> > +
> > +/*
> > + * TKT_Q_NQUEUES is the number of queues to maintain. Large systems
> > + * might have multiple highly contended locks, so provide more queues for
> > + * systems with larger numbers of CPUs.
> > + */
> > +#define TKT_Q_NQUEUES (DIV_ROUND_UP(NR_CPUS + TKT_Q_SWITCH - 1, TKT_Q_SWITCH) * 2)
> > +
> > +/* The queues themselves. */
> > +struct tkt_q_head tkt_q_heads[TKT_Q_NQUEUES];
>
> I am a bit concerned about the size of the head queue table itself. RHEL6,
> for example, defines CONFIG_NR_CPUS to be 4096, which means a table
> size of 256. Maybe it is better to dynamically allocate the table at
> init time depending on the actual number of CPUs in the system.

Yeah, it can be allocated dynamically at boot.

>
> > +/*
> > + * Return a pointer to the queue header associated with the specified lock,
> > + * or return NULL if there is no queue for the lock or if the lock's queue
> > + * is in transition.
> > + */
> > +static struct tkt_q_head *tkt_q_find_head(arch_spinlock_t *asp)
> > +{
> > + int i;
> > + int start;
> > +
> > + start = i = tkt_q_hash(asp);
> > + do
> > + if (tkt_q_heads[i].ref == asp)
> > + return &tkt_q_heads[i];
> > + while ((i = tkt_q_next_slot(i)) != start);
> > + return NULL;
> > +}
>
> With a table size of 256, you may have to scan the whole table to find
> the right head queue. This can be a significant overhead. I would suggest
> setting a limit on how many entries it scans before it aborts, rather
> than checking the whole table.

We could add a limit, but in practice I'm not sure it would be an issue.
I thought the same thing when I first saw this, but hitting most of the
list would require a large number of collisions in the hash algorithm,
which could probably be fixed with a better hash.
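
For example, a sketch of a stronger hash using the kernel's generic
pointer hash from <linux/hash.h> (an assumption here, not something the
patch uses):

#include <linux/hash.h>

/* Mix the whole lock address instead of just dropping the low 8 bits. */
static unsigned long tkt_q_hash(arch_spinlock_t *asp)
{
	return hash_ptr(asp, 32) % TKT_Q_NQUEUES;
}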

>
> > +/*
> > + * Hand the lock off to the first CPU on the queue.
> > + */
> > +void tkt_q_do_wake(arch_spinlock_t *asp)
> > +{
> > + struct tkt_q_head *tqhp;
> > + struct tkt_q *tqp;
> > +
> > + /* If the queue is still being set up, wait for it. */
> > + while ((tqhp = tkt_q_find_head(asp)) == NULL)
> > + cpu_relax();
> > +
> > + for (;;) {
> > +
> > + /* Find the first queue element. */
> > + tqp = ACCESS_ONCE(tqhp->spin);
> > + if (tqp != NULL)
> > + break; /* Element exists, hand off lock. */
> > + if (tkt_q_try_unqueue(asp, tqhp))
> > + return; /* No element, successfully removed queue. */
> > + cpu_relax();
> > + }
> > + if (ACCESS_ONCE(tqhp->head_tkt) != -1)
> > + ACCESS_ONCE(tqhp->head_tkt) = -1;
>
> In case NR_CPUS is 32768 or higher, the ticket will be of type u32 while
> tqhp->head_tkt is s32, so -1 will be a valid ticket number. You may have
> to conditionally define head_tkt as s64 when the ticket is u32.

Good point.

>
> Do you have any data on how much this patch can actually improve
> performance on certain workloads? This will help the discussion here.

Yeah, that's come up already in the thread. Linus wants to see hard
numbers *and* an explanation of why the contended locks can't be fixed,
before he even considers merging this type of change.

-- Steve

2013-06-11 16:22:30

by Paul E. McKenney

[permalink] [raw]
Subject: Re: [PATCH RFC ticketlock] Auto-queued ticketlock

On Tue, Jun 11, 2013 at 10:48:17PM +0800, Lai Jiangshan wrote:
> On Mon, Jun 10, 2013 at 3:36 AM, Paul E. McKenney
> <[email protected]> wrote:
> > Breaking up locks is better than implementing high-contention locks, but
> > if we must have high-contention locks, why not make them automatically
> > switch between light-weight ticket locks at low contention and queued
> > locks at high contention?
> >
> > This commit therefore allows ticket locks to automatically switch between
> > pure ticketlock and queued-lock operation as needed. If too many CPUs
> > are spinning on a given ticket lock, a queue structure will be allocated
> > and the lock will switch to queued-lock operation. When the lock becomes
> > free, it will switch back into ticketlock operation. The low-order bit
> > of the head counter is used to indicate that the lock is in queued mode,
> > which forces an unconditional mismatch between the head and tail counters.
> > This approach means that the common-case code path under conditions of
> > low contention is very nearly that of a plain ticket lock.
> >
> > A fixed number of queueing structures is statically allocated in an
> > array. The ticket-lock address is used to hash into an initial element,
> > but if that element is already in use, it moves to the next element. If
> > the entire array is already in use, continue to spin in ticket mode.
> >
> > This has been only lightly tested in the kernel, though a userspace
> > implementation has survived substantial testing.
> >
> > Signed-off-by: Paul E. McKenney <[email protected]>
> >
> > diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h
> > index 33692ea..b4a91b0 100644
> > --- a/arch/x86/include/asm/spinlock.h
> > +++ b/arch/x86/include/asm/spinlock.h
> > @@ -34,6 +34,8 @@
> > # define UNLOCK_LOCK_PREFIX
> > #endif
> >
> > +#ifndef CONFIG_TICKET_LOCK_QUEUED
> > +
> > /*
> > * Ticket locks are conceptually two parts, one indicating the current head of
> > * the queue, and the other indicating the current tail. The lock is acquired
> > @@ -62,6 +64,25 @@ static __always_inline void __ticket_spin_lock(arch_spinlock_t *lock)
> > barrier(); /* make sure nothing creeps before the lock is taken */
> > }
> >
> > +#else /* #ifndef CONFIG_TICKET_LOCK_QUEUED */
> > +
> > +bool tkt_spin_pass(arch_spinlock_t *ap, struct __raw_tickets inc);
> > +
> > +static __always_inline void __ticket_spin_lock(arch_spinlock_t *lock)
> > +{
> > + register struct __raw_tickets inc = { .tail = 2 };
> > +
> > + inc = xadd(&lock->tickets, inc);
> > + for (;;) {
> > + if (inc.head == inc.tail || tkt_spin_pass(lock, inc))
> > + break;
> > + inc.head = ACCESS_ONCE(lock->tickets.head);
> > + }
> > + barrier(); /* smp_mb() on Power or ARM. */
> > +}
> > +
> > +#endif /* #else #ifndef CONFIG_TICKET_LOCK_QUEUED */
> > +
> > static __always_inline int __ticket_spin_trylock(arch_spinlock_t *lock)
> > {
> > arch_spinlock_t old, new;
> > @@ -70,17 +91,37 @@ static __always_inline int __ticket_spin_trylock(arch_spinlock_t *lock)
> > if (old.tickets.head != old.tickets.tail)
> > return 0;
> >
> > +#ifndef CONFIG_TICKET_LOCK_QUEUED
> > new.head_tail = old.head_tail + (1 << TICKET_SHIFT);
> > +#else /* #ifndef CONFIG_TICKET_LOCK_QUEUED */
> > + new.head_tail = old.head_tail + (2 << TICKET_SHIFT);
> > +#endif /* #else #ifndef CONFIG_TICKET_LOCK_QUEUED */
> >
> > /* cmpxchg is a full barrier, so nothing can move before it */
> > return cmpxchg(&lock->head_tail, old.head_tail, new.head_tail) == old.head_tail;
> > }
> >
> > +#ifndef CONFIG_TICKET_LOCK_QUEUED
> > +
> > static __always_inline void __ticket_spin_unlock(arch_spinlock_t *lock)
> > {
> > __add(&lock->tickets.head, 1, UNLOCK_LOCK_PREFIX);
> > }
> >
> > +#else /* #ifndef CONFIG_TICKET_LOCK_QUEUED */
> > +
> > +extern void tkt_q_do_wake(arch_spinlock_t *asp);
> > +
> > +static __always_inline void __ticket_spin_unlock(arch_spinlock_t *lock)
> > +{
> > + __ticket_t head = 2;
> > +
> > + head = xadd(&lock->tickets.head, 2);
> > + if (head & 0x1)
> > + tkt_q_do_wake(lock);
> > +}
> > +#endif /* #else #ifndef CONFIG_TICKET_LOCK_QUEUED */
> > +
> > static inline int __ticket_spin_is_locked(arch_spinlock_t *lock)
> > {
> > struct __raw_tickets tmp = ACCESS_ONCE(lock->tickets);
> > diff --git a/arch/x86/include/asm/spinlock_types.h b/arch/x86/include/asm/spinlock_types.h
> > index ad0ad07..cdaefdd 100644
> > --- a/arch/x86/include/asm/spinlock_types.h
> > +++ b/arch/x86/include/asm/spinlock_types.h
> > @@ -7,12 +7,18 @@
> >
> > #include <linux/types.h>
> >
> > -#if (CONFIG_NR_CPUS < 256)
> > +#if (CONFIG_NR_CPUS < 128)
> > typedef u8 __ticket_t;
> > typedef u16 __ticketpair_t;
> > -#else
> > +#define TICKET_T_CMP_GE(a, b) (UCHAR_MAX / 2 >= (unsigned char)((a) - (b)))
> > +#elif (CONFIG_NR_CPUS < 32768)
> > typedef u16 __ticket_t;
> > typedef u32 __ticketpair_t;
> > +#define TICKET_T_CMP_GE(a, b) (USHRT_MAX / 2 >= (unsigned short)((a) - (b)))
> > +#else
> > +typedef u32 __ticket_t;
> > +typedef u64 __ticketpair_t;
> > +#define TICKET_T_CMP_GE(a, b) (UINT_MAX / 2 >= (unsigned int)((a) - (b)))
> > #endif
> >
> > #define TICKET_SHIFT (sizeof(__ticket_t) * 8)
> > @@ -21,7 +27,11 @@ typedef struct arch_spinlock {
> > union {
> > __ticketpair_t head_tail;
> > struct __raw_tickets {
> > +#ifdef __BIG_ENDIAN__
> > + __ticket_t tail, head;
> > +#else /* #ifdef __BIG_ENDIAN__ */
> > __ticket_t head, tail;
> > +#endif /* #else #ifdef __BIG_ENDIAN__ */
> > } tickets;
> > };
> > } arch_spinlock_t;
> > diff --git a/include/linux/kernel.h b/include/linux/kernel.h
> > index e9ef6d6..816a87c 100644
> > --- a/include/linux/kernel.h
> > +++ b/include/linux/kernel.h
> > @@ -15,6 +15,7 @@
> > #include <asm/byteorder.h>
> > #include <uapi/linux/kernel.h>
> >
> > +#define UCHAR_MAX ((u8)(~0U))
> > #define USHRT_MAX ((u16)(~0U))
> > #define SHRT_MAX ((s16)(USHRT_MAX>>1))
> > #define SHRT_MIN ((s16)(-SHRT_MAX - 1))
> > diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
> > index 44511d1..ad9c67c 100644
> > --- a/kernel/Kconfig.locks
> > +++ b/kernel/Kconfig.locks
> > @@ -223,3 +223,21 @@ endif
> > config MUTEX_SPIN_ON_OWNER
> > def_bool y
> > depends on SMP && !DEBUG_MUTEXES
> > +
> > +config TICKET_LOCK_QUEUED
> > + bool "Dynamically switch between ticket and queued locking"
> > + default n
> > + ---help---
> > + Enable dynamic switching between ticketlock and queued locking
> > + on a per-lock basis. This option will slow down low-contention
> > + acquisition and release very slightly (additional conditional
> > + in release path), but will provide more efficient operation at
> > + high levels of lock contention. High-contention operation will
> > + not be quite as efficient as would be a pure queued lock, but
> > + this dynamic approach consumes less memory than queued locks
> > + and also runs faster at low levels of contention.
> > +
> > + Say "Y" if you are running on a large system with a workload
> > + that is likely to result in high levels of contention.
> > +
> > + Say "N" if you are unsure.
> > diff --git a/kernel/Makefile b/kernel/Makefile
> > index 271fd31..70a91f7 100644
> > --- a/kernel/Makefile
> > +++ b/kernel/Makefile
> > @@ -51,6 +51,7 @@ endif
> > obj-$(CONFIG_SMP) += spinlock.o
> > obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
> > obj-$(CONFIG_PROVE_LOCKING) += spinlock.o
> > +obj-$(CONFIG_TICKET_LOCK_QUEUED) += tktqlock.o
> > obj-$(CONFIG_UID16) += uid16.o
> > obj-$(CONFIG_MODULES) += module.o
> > obj-$(CONFIG_MODULE_SIG) += module_signing.o modsign_pubkey.o modsign_certificate.o
> > diff --git a/kernel/tktqlock.c b/kernel/tktqlock.c
> > new file mode 100644
> > index 0000000..f01b760
> > --- /dev/null
> > +++ b/kernel/tktqlock.c
> > @@ -0,0 +1,333 @@
> > +/*
> > + * Queued ticket spinlocks.
> > + *
> > + * This program is free software; you can redistribute it and/or modify
> > + * it under the terms of the GNU General Public License as published by
> > + * the Free Software Foundation; either version 2 of the License, or
> > + * (at your option) any later version.
> > + *
> > + * This program is distributed in the hope that it will be useful,
> > + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
> > + * GNU General Public License for more details.
> > + *
> > + * You should have received a copy of the GNU General Public License
> > + * along with this program; if not, write to the Free Software
> > + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
> > + *
> > + * Copyright IBM Corporation, 2013
> > + *
> > + * Authors: Paul E. McKenney <[email protected]>
> > + */
> > +#include <linux/types.h>
> > +#include <linux/kernel.h>
> > +#include <linux/spinlock.h>
> > +#include <linux/smp.h>
> > +#include <linux/percpu.h>
> > +
> > +struct tkt_q {
> > + int cpu;
> > + __ticket_t tail;
> > + struct tkt_q *next;
> > +};
> > +
> > +struct tkt_q_head {
> > + arch_spinlock_t *ref; /* Pointer to spinlock if in use. */
> > + s32 head_tkt; /* Head ticket when started queuing. */
> > + struct tkt_q *spin; /* Head of queue. */
> > + struct tkt_q **spin_tail; /* Tail of queue. */
> > +};
> > +
> > +/*
> > + * TKT_Q_SWITCH is twice the number of CPUs that must be spinning on a
> > + * given ticket lock to motivate switching to spinning on a queue.
> > + * The reason that it is twice the number is because the bottom bit of
> > + * the ticket is reserved for the bit that indicates that a queue is
> > + * associated with the lock.
> > + */
> > +#define TKT_Q_SWITCH (16 * 2)
> > +
> > +/*
> > + * TKT_Q_NQUEUES is the number of queues to maintain. Large systems
> > + * might have multiple highly contended locks, so provide more queues for
> > + * systems with larger numbers of CPUs.
> > + */
> > +#define TKT_Q_NQUEUES (DIV_ROUND_UP(NR_CPUS + TKT_Q_SWITCH - 1, TKT_Q_SWITCH) * 2)
> > +
> > +/* The queues themselves. */
> > +struct tkt_q_head tkt_q_heads[TKT_Q_NQUEUES];
> > +
> > +/* Advance to the next queue slot, wrapping around to the beginning. */
> > +static int tkt_q_next_slot(int i)
> > +{
> > + return (++i < TKT_Q_NQUEUES) ? i : 0;
> > +}
> > +
> > +/* Very crude hash from lock address to queue slot number. */
> > +static unsigned long tkt_q_hash(arch_spinlock_t *asp)
> > +{
> > + return (((unsigned long)asp) >> 8) % TKT_Q_NQUEUES;
> > +}
> > +
> > +/*
> > + * Return a pointer to the queue header associated with the specified lock,
> > + * or return NULL if there is no queue for the lock or if the lock's queue
> > + * is in transition.
> > + */
> > +static struct tkt_q_head *tkt_q_find_head(arch_spinlock_t *asp)
> > +{
> > + int i;
> > + int start;
> > +
> > + start = i = tkt_q_hash(asp);
> > + do
> > + if (tkt_q_heads[i].ref == asp)
> > + return &tkt_q_heads[i];
> > + while ((i = tkt_q_next_slot(i)) != start);
> > + return NULL;
> > +}
> > +
> > +/*
> > + * Try to stop queuing, reverting back to normal ticket-lock operation.
> > + * We can only stop queuing when the queue is empty, which means that
> > + * we need to correctly handle races where someone shows up in the queue
> > + * just as we are trying to dispense with the queue. They win, we lose.
> > + */
> > +static bool tkt_q_try_unqueue(arch_spinlock_t *asp, struct tkt_q_head *tqhp)
> > +{
> > + arch_spinlock_t asold;
> > + arch_spinlock_t asnew;
> > +
> > + /* Pick up the ticket values. */
> > + asold = ACCESS_ONCE(*asp);
> > + if ((asold.tickets.head & ~0x1) == asold.tickets.tail) {
> > +
> > + /* Attempt to mark the lock as not having a queue. */
> > + asnew = asold;
> > + asnew.tickets.head &= ~0x1;
> > + if (cmpxchg(&asp->head_tail,
> > + asold.head_tail,
> > + asnew.head_tail) == asold.head_tail) {
> > +
> > + /* Succeeded, mark the queue as unused. */
> > + ACCESS_ONCE(tqhp->ref) = NULL;
> > + return true;
> > + }
> > + }
> > +
> > + /* Failed, tell the caller there is still a queue to pass off to. */
> > + return false;
> > +}
> > +
> > +/*
> > + * Hand the lock off to the first CPU on the queue.
> > + */
> > +void tkt_q_do_wake(arch_spinlock_t *asp)
> > +{
> > + struct tkt_q_head *tqhp;
> > + struct tkt_q *tqp;
> > +
> > + /* If the queue is still being set up, wait for it. */
> > + while ((tqhp = tkt_q_find_head(asp)) == NULL)
> > + cpu_relax();
> > +
> > + for (;;) {
> > +
> > + /* Find the first queue element. */
> > + tqp = ACCESS_ONCE(tqhp->spin);
> > + if (tqp != NULL)
> > + break; /* Element exists, hand off lock. */
> > + if (tkt_q_try_unqueue(asp, tqhp))
> > + return; /* No element, successfully removed queue. */
> > + cpu_relax();
> > + }
> > + if (ACCESS_ONCE(tqhp->head_tkt) != -1)
> > + ACCESS_ONCE(tqhp->head_tkt) = -1;
> > + smp_mb(); /* Order pointer fetch and assignment against handoff. */
> > + ACCESS_ONCE(tqp->cpu) = -1;
> > +}
> > +
> > +/*
> > + * Given a lock that already has a queue associated with it, spin on
> > + * that queue. Return false if there was no queue (which means we do not
> > + * hold the lock) and true otherwise (meaning we -do- hold the lock).
> > + */
> > +bool tkt_q_do_spin(arch_spinlock_t *asp, struct __raw_tickets inc)
> > +{
> > + struct tkt_q **oldtail;
> > + struct tkt_q tq;
> > + struct tkt_q_head *tqhp;
> > +
> > + /*
> > + * Ensure that accesses to queue header happen after sensing
> > + * the lock's have-queue bit.
> > + */
> > + smp_mb(); /* See above block comment. */
> > +
> > + /* If there no longer is a queue, leave. */
> > + tqhp = tkt_q_find_head(asp);
> > + if (tqhp == NULL)
> > + return false;
> > +
> > + /* Initialize our queue element. */
> > + tq.cpu = raw_smp_processor_id();
> > + tq.tail = inc.tail;
> > + tq.next = NULL;
> > +
> > + /* Check to see if we already hold the lock. */
> > + if (ACCESS_ONCE(tqhp->head_tkt) == inc.tail) {
> > + /* The last holder left before queue formed, we hold lock. */
> > + tqhp->head_tkt = -1;
> > + return true;
> > + }
> > +
> > + /* Add our element to the tail of the queue. */
> > + oldtail = xchg(&tqhp->spin_tail, &tq.next);
> > + ACCESS_ONCE(*oldtail) = &tq;
> > +
> > + /* Spin until handoff. */
> > + while (ACCESS_ONCE(tq.cpu) != -1)
> > + cpu_relax();
> > +
> > + /*
> > + * Remove our element from the queue. If the queue is now empty,
> > + * update carefully so that the next acquisition will queue itself
> > + * at the head of the list.
> > + */
> > + if (tq.next == NULL) {
> > +
> > + /* Mark the queue empty. */
> > + tqhp->spin = NULL;
> > +
> > + /* Try to point the tail back at the head. */
> > + if (cmpxchg(&tqhp->spin_tail,
> > + &tq.next,
> > + &tqhp->spin) == &tq.next)
> > + return true; /* Succeeded, queue is now empty. */
> > +
> > + /* Failed, if needed, wait for the enqueue to complete. */
> > + while (tq.next == NULL)
> > + cpu_relax();
> > +
> > + /* The following code will repair the head. */
> > + }
> > + smp_mb(); /* Force ordering between handoff and critical section. */
> > +
> > + /* Advance list-head pointer. */
> > + ACCESS_ONCE(tqhp->spin) = tq.next;
> > + return true;
> > +}
> > +
> > +/*
> > + * Given a lock that does not have a queue, attempt to associate the
> > + * i-th queue with it, returning true if successful (meaning we hold
> > + * the lock) or false otherwise (meaning we do -not- hold the lock).
> > + * Note that the caller has already filled in ->ref with 0x1, so we
> > + * own the queue.
> > + */
> > +static bool
> > +tkt_q_init_contend(int i, arch_spinlock_t *asp, struct __raw_tickets inc)
> > +{
> > + arch_spinlock_t asold;
> > + arch_spinlock_t asnew;
> > + struct tkt_q_head *tqhp;
> > +
> > + /* Initialize the i-th queue header. */
> > + tqhp = &tkt_q_heads[i];
> > + tqhp->spin = NULL;
> > + tqhp->spin_tail = &tqhp->spin;
> > +
> > + /* Each pass through this loop attempts to mark the lock as queued. */
> > + do {
> > + asold.head_tail = ACCESS_ONCE(asp->head_tail);
> > + asnew = asold;
> > + if (asnew.tickets.head & 0x1) {
> > +
> > + /* Someone beat us to it, back out. */
> > + smp_mb();
> > + ACCESS_ONCE(tqhp->ref) = NULL;
> > +
> > + /* Spin on the queue element they set up. */
> > + return tkt_q_do_spin(asp, inc);
> > + }
> > +
> > + /* The low-order bit in the head counter says "queued". */
> > + asnew.tickets.head |= 0x1;
> > + } while (cmpxchg(&asp->head_tail,
> > + asold.head_tail,
> > + asnew.head_tail) != asold.head_tail);
> > +
> > + /* Point the queue at the lock and go spin on it. */
> > + tqhp->head_tkt = asold.tickets.head;
> > + smp_mb(); /* Ensure head_tkt is set prior to queuers seeing tqhp. */
> > + ACCESS_ONCE(tqhp->ref) = asp;
> > + return tkt_q_do_spin(asp, inc);
> > +}
>
> Just small revise.
>
> I just moved " tqhp->head_tkt = asold.tickets.head;" into the loop, so

Excellent point, this allows removing the memory barrier because we can rely
on the cmpxchg()'s barriers instead. I have made this change. Now to see
if it still works... And it does, at least for light testing.

> we can use "asp->tickets.head & 0x1" to
> indicate that the queued spinlock is prepared, instead of "tqhp->ref == asp".

Hmmm...

Suppose the following sequence of events happens:

o Several of the CPUs notices that there are too many CPUs spinning,
so they each allocate different queue elements.

o They all set their tqhp->ref to asp, so that there are (say)
three consecutive queue entries that are labeled as belonging
to this lock.

o Suppose that the CPU corresponding to the last of these three
entries succeeds in setting the low-order bit in the ticket
lock. The other two CPUs will then set their tqhp->ref to NULL,
but before that can happen...

o One of the other CPUs that is spinning notices the low-order
bit, and therefore searches for a queue element with the
right value of tqhp->ref -- and finds the first one, enqueuing
itself.

o The CPU corresponding to this same entry now sets tqhp->ref to
NULL, so that the spinning CPU is now spinning forever on a queue
entry that is no longer marked as in use.

How does your change prevent this from happening? It looks to me like it
can in fact happen. My patch avoids this by refusing to mark the queue
element with asp until after the bit has been set.

Also, when you remove the setting of ->head_tkt to -1, what prevents
spurious lock grants after the counters wrap during a long time spent
in queued mode?

Your elimination of the tkt_q structure's ->tail field is OK for production,
but this field is very helpful for debugging.

> See the appended diff.
> (And I guess that after this change you can force only the CPUs for which
> "inc.tail - tqhp->head_tkt > TKT_Q_SWITCH"
> holds to do queued spinning, removing the thundering herd.)

I could see taking that sort of approach, but why not instead simply
choose a relatively small value for TKT_Q_SWITCH? The unfairness is
very sharply bounded in that case, so are you sure that removing the
thundering herd is really worth it?

Thanx, Paul

> diff --git a/kernel/tktqlock.c b/kernel/tktqlock.c
> index f01b760..4ea409b 100644
> --- a/kernel/tktqlock.c
> +++ b/kernel/tktqlock.c
> @@ -27,7 +27,6 @@
>
> struct tkt_q {
> int cpu;
> - __ticket_t tail;
> struct tkt_q *next;
> };
>
> @@ -127,9 +126,8 @@ void tkt_q_do_wake(arch_spinlock_t *asp)
> struct tkt_q_head *tqhp;
> struct tkt_q *tqp;
>
> - /* If the queue is still being set up, wait for it. */
> - while ((tqhp = tkt_q_find_head(asp)) == NULL)
> - cpu_relax();
> + tqhp = tkt_q_find_head(asp);
> + BUG_ON(!tqhp);
>
> for (;;) {
>
> @@ -141,8 +139,6 @@ void tkt_q_do_wake(arch_spinlock_t *asp)
> return; /* No element, successfully removed queue. */
> cpu_relax();
> }
> - if (ACCESS_ONCE(tqhp->head_tkt) != -1)
> - ACCESS_ONCE(tqhp->head_tkt) = -1;
> smp_mb(); /* Order pointer fetch and assignment against handoff. */
> ACCESS_ONCE(tqp->cpu) = -1;
> }
> @@ -164,20 +160,16 @@ bool tkt_q_do_spin(arch_spinlock_t *asp, struct
> __raw_tickets inc)
> */
> smp_mb(); /* See above block comment. */
>
> - /* If there no longer is a queue, leave. */
> tqhp = tkt_q_find_head(asp);
> - if (tqhp == NULL)
> - return false;
> + BUG_ON(!tqhp);
>
> /* Initialize our queue element. */
> tq.cpu = raw_smp_processor_id();
> - tq.tail = inc.tail;
> tq.next = NULL;
>
> /* Check to see if we already hold the lock. */
> if (ACCESS_ONCE(tqhp->head_tkt) == inc.tail) {
> /* The last holder left before queue formed, we hold lock. */
> - tqhp->head_tkt = -1;
> return true;
> }
>
> @@ -251,16 +243,14 @@ tkt_q_init_contend(int i, arch_spinlock_t *asp,
> struct __raw_tickets inc)
> return tkt_q_do_spin(asp, inc);
> }
>
> + /* Point the queue at the lock and go spin on it. */
> + tqhp->head_tkt = asold.tickets.head;
> /* The low-order bit in the head counter says "queued". */
> asnew.tickets.head |= 0x1;
> } while (cmpxchg(&asp->head_tail,
> asold.head_tail,
> asnew.head_tail) != asold.head_tail);
>
> - /* Point the queue at the lock and go spin on it. */
> - tqhp->head_tkt = asold.tickets.head;
> - smp_mb(); /* Ensure head_tkt is set prior to queuers seeing tqhp. */
> - ACCESS_ONCE(tqhp->ref) = asp;
> return tkt_q_do_spin(asp, inc);
> }
>
> @@ -282,14 +272,9 @@ bool tkt_q_start_contend(arch_spinlock_t *asp,
> struct __raw_tickets inc)
> * the lock with the corresponding queue.
> */
> do {
> - /*
> - * Use 0x1 to mark the queue in use, but also avoiding
> - * any spinners trying to use it before we get it all
> - * initialized.
> - */
> if (cmpxchg(&tkt_q_heads[i].ref,
> NULL,
> - (arch_spinlock_t *)0x1) == NULL) {
> + asp) == NULL) {
>
> /* Succeeded, now go initialize it. */
> return tkt_q_init_contend(i, asp, inc);
>

2013-06-11 16:43:46

by Paul E. McKenney

[permalink] [raw]
Subject: Re: [PATCH RFC ticketlock] Auto-queued ticketlock

On Tue, Jun 11, 2013 at 11:57:14AM -0400, Waiman Long wrote:
> On 06/09/2013 03:36 PM, Paul E. McKenney wrote:
> >Breaking up locks is better than implementing high-contention locks, but
> >if we must have high-contention locks, why not make them automatically
> >switch between light-weight ticket locks at low contention and queued
> >locks at high contention?
> >
> >This commit therefore allows ticket locks to automatically switch between
> >pure ticketlock and queued-lock operation as needed. If too many CPUs
> >are spinning on a given ticket lock, a queue structure will be allocated
> >and the lock will switch to queued-lock operation. When the lock becomes
> >free, it will switch back into ticketlock operation. The low-order bit
> >of the head counter is used to indicate that the lock is in queued mode,
> >which forces an unconditional mismatch between the head and tail counters.
> >This approach means that the common-case code path under conditions of
> >low contention is very nearly that of a plain ticket lock.
> >
> >A fixed number of queueing structures is statically allocated in an
> >array. The ticket-lock address is used to hash into an initial element,
> >but if that element is already in use, it moves to the next element. If
> >the entire array is already in use, continue to spin in ticket mode.
> >
> >This has been only lightly tested in the kernel, though a userspace
> >implementation has survived substantial testing.
> >
> >Signed-off-by: Paul E. McKenney<[email protected]>
>
> This is an interesting patch and I think it is useful for workloads
> that run on systems with a large number of CPUs.
>
> >diff --git a/arch/x86/include/asm/spinlock_types.h b/arch/x86/include/asm/spinlock_types.h
> >index ad0ad07..cdaefdd 100644
> >--- a/arch/x86/include/asm/spinlock_types.h
> >+++ b/arch/x86/include/asm/spinlock_types.h
> >@@ -7,12 +7,18 @@
> >
> > #include<linux/types.h>
> >
> >-#if (CONFIG_NR_CPUS< 256)
> >+#if (CONFIG_NR_CPUS< 128)
> > typedef u8 __ticket_t;
> > typedef u16 __ticketpair_t;
> >-#else
> >+#define TICKET_T_CMP_GE(a, b) (UCHAR_MAX / 2>= (unsigned char)((a) - (b)))
> >+#elif (CONFIG_NR_CPUS< 32768)
> > typedef u16 __ticket_t;
> > typedef u32 __ticketpair_t;
> >+#define TICKET_T_CMP_GE(a, b) (USHRT_MAX / 2>= (unsigned short)((a) - (b)))
> >+#else
> >+typedef u32 __ticket_t;
> >+typedef u64 __ticketpair_t;
> >+#define TICKET_T_CMP_GE(a, b) (UINT_MAX / 2>= (unsigned int)((a) - (b)))
> > #endif
>
> It is theoretically possible that a large number of CPUs (say 64 or
> more with CONFIG_NR_CPUS < 128) can acquire a ticket from the lock
> before the check for TICKET_T_CMP_GE() in tkt_spin_pass(). So the
> check will fail even when there is a large number of CPUs contending
> for the lock. The chance of this happening is, of course, extremely
> rare. This is not an error as the lock is still working as it should
> be without your change.

Good point, I need to change the limits from 128 and 32768 to 64 and 16384
in order to guarantee that the comparison will work correctly. Done.
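
The macro's half-range limit is easy to see in isolation. Below is a
userspace illustration only -- tkt_spin_pass() itself is not shown here, and
the limits were revisited again later in this thread:

    #include <stdio.h>
    #include <limits.h>

    /* Same shape as the u8 case in the patch. */
    #define TICKET_T_CMP_GE(a, b) \
            (UCHAR_MAX / 2 >= (unsigned char)((a) - (b)))

    int main(void)
    {
            unsigned char head = 0;

            /* 63 waiters, each adding 2 to ->tail: difference 126 stays
             * within the half range, so the comparison is still correct. */
            printf("%d\n", TICKET_T_CMP_GE(head + 2 * 63, head));  /* 1 */

            /* 64 waiters: difference 128 wraps past the half range and
             * the comparison gives the wrong answer. */
            printf("%d\n", TICKET_T_CMP_GE(head + 2 * 64, head));  /* 0 */
            return 0;
    }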

> >+/*
> >+ * TKT_Q_SWITCH is twice the number of CPUs that must be spinning on a
> >+ * given ticket lock to motivate switching to spinning on a queue.
> >+ * The reason that it is twice the number is because the bottom bit of
> >+ * the ticket is reserved for the bit that indicates that a queue is
> >+ * associated with the lock.
> >+ */
> >+#define TKT_Q_SWITCH (16 * 2)
> >+
> >+/*
> >+ * TKT_Q_NQUEUES is the number of queues to maintain. Large systems
> >+ * might have multiple highly contended locks, so provide more queues for
> >+ * systems with larger numbers of CPUs.
> >+ */
> >+#define TKT_Q_NQUEUES (DIV_ROUND_UP(NR_CPUS + TKT_Q_SWITCH - 1, TKT_Q_SWITCH) * 2)
> >+
> >+/* The queues themselves. */
> >+struct tkt_q_head tkt_q_heads[TKT_Q_NQUEUES];
>
> I am a bit concerned about the size of the head queue table itself.
> RHEL6, for example, had defined CONFIG_NR_CPUS to be 4096, which means
> a table size of 256. Maybe it is better to dynamically allocate the
> table at init time depending on the actual number of CPUs in the
> system.

But if your kernel is built for 4096 CPUs, the 32*256=8192 bytes of memory
is way down in the noise. Systems that care about that small an amount
of memory probably have a small enough number of CPUs that they can just
turn off queueing at build time using CONFIG_TICKET_LOCK_QUEUED=n, right?

> >+/*
> >+ * Return a pointer to the queue header associated with the specified lock,
> >+ * or return NULL if there is no queue for the lock or if the lock's queue
> >+ * is in transition.
> >+ */
> >+static struct tkt_q_head *tkt_q_find_head(arch_spinlock_t *asp)
> >+{
> >+ int i;
> >+ int start;
> >+
> >+ start = i = tkt_q_hash(asp);
> >+ do
> >+ if (tkt_q_heads[i].ref == asp)
> >+ return&tkt_q_heads[i];
> >+ while ((i = tkt_q_next_slot(i)) != start);
> >+ return NULL;
> >+}
>
> With a table size of 256, you have to scan the whole table to
> find the right head queue. This can be a significant overhead. I
> would suggest setting a limit on how many entries it scans before
> it aborts rather than checking the whole table.

But it will scan 256 entries only if there are 256 other locks in queued
mode, which is -very- unlikely, even given 4096 CPUs. That said, if you
show me that this results in a real latency problem on a real system,
I would be happy to provide a way to limit the search.

> >+/*
> >+ * Hand the lock off to the first CPU on the queue.
> >+ */
> >+void tkt_q_do_wake(arch_spinlock_t *asp)
> >+{
> >+ struct tkt_q_head *tqhp;
> >+ struct tkt_q *tqp;
> >+
> >+ /* If the queue is still being set up, wait for it. */
> >+ while ((tqhp = tkt_q_find_head(asp)) == NULL)
> >+ cpu_relax();
> >+
> >+ for (;;) {
> >+
> >+ /* Find the first queue element. */
> >+ tqp = ACCESS_ONCE(tqhp->spin);
> >+ if (tqp != NULL)
> >+ break; /* Element exists, hand off lock. */
> >+ if (tkt_q_try_unqueue(asp, tqhp))
> >+ return; /* No element, successfully removed queue. */
> >+ cpu_relax();
> >+ }
> >+ if (ACCESS_ONCE(tqhp->head_tkt) != -1)
> >+ ACCESS_ONCE(tqhp->head_tkt) = -1;
>
> In case NR_CPUS is 32768 or higher, the ticket will be of type u32
> and tqhp->head_tkt is s32. So -1 will be a valid ticket number. You
> may have to conditionally define head_tkt to be s64 when the ticket
> is u32.

Good catch! For the moment, I just made head_tkt unconditionally s64.
I bet that the extra comparison work has no system-visible effect. ;-)

> Do you have any data on how much this patch can actually improve
> performance on certain workloads? This will help the discussion
> here.

I could post some microbenchmark numbers if that would help.

Thanx, Paul

2013-06-11 16:44:46

by Paul E. McKenney

[permalink] [raw]
Subject: Re: [PATCH RFC ticketlock] Auto-queued ticketlock

On Tue, Jun 11, 2013 at 12:20:32PM -0400, Steven Rostedt wrote:
> On Tue, 2013-06-11 at 11:57 -0400, Waiman Long wrote:
>
> > This is an interesting patch and I think it is useful for workloads that
> > run on systems with a large number of CPUs.
>
> I would say it is definitely a fun academic patch; whether it is
> something for a production environment remains to be seen.

At the moment, it should be considered an intellectual exercise. Might be
useful at some point, but I would personally rather that the offending
lock be broken up to reduce its contention.

> > > diff --git a/arch/x86/include/asm/spinlock_types.h b/arch/x86/include/asm/spinlock_types.h
> > > index ad0ad07..cdaefdd 100644
> > > --- a/arch/x86/include/asm/spinlock_types.h
> > > +++ b/arch/x86/include/asm/spinlock_types.h
> > > @@ -7,12 +7,18 @@
> > >
> > > #include<linux/types.h>
> > >
> > > -#if (CONFIG_NR_CPUS< 256)
> > > +#if (CONFIG_NR_CPUS< 128)
> > > typedef u8 __ticket_t;
> > > typedef u16 __ticketpair_t;
> > > -#else
> > > +#define TICKET_T_CMP_GE(a, b) (UCHAR_MAX / 2>= (unsigned char)((a) - (b)))
> > > +#elif (CONFIG_NR_CPUS< 32768)
> > > typedef u16 __ticket_t;
> > > typedef u32 __ticketpair_t;
> > > +#define TICKET_T_CMP_GE(a, b) (USHRT_MAX / 2>= (unsigned short)((a) - (b)))
> > > +#else
> > > +typedef u32 __ticket_t;
> > > +typedef u64 __ticketpair_t;
> > > +#define TICKET_T_CMP_GE(a, b) (UINT_MAX / 2>= (unsigned int)((a) - (b)))
> > > #endif
> >
> > It is theoretically possible that a large number of CPUs (say 64 or
> > more with CONFIG_NR_CPUS < 128) can acquire a ticket from the lock
> > before the check for TICKET_T_CMP_GE() in tkt_spin_pass(). So the check
> > will fail even when there is a large number of CPUs contending for the
> > lock. The chance of this happening is, of course, extremely rare. This
> > is not an error as the lock is still working as it should be without
> > your change.
>
> Can you explain this more? How can you acquire the ticket and update at
> the same time? If a queue has been set up, then you can't acquire the
> lock directly, as the head has its low-order bit set.

Suppose that CONFIG_NR_CPUS=127, and suppose that 65 CPUs atomically
increment ->tail before ...

Ah, good point. If TKT_Q_SWITCH is less than 64, then at least one CPU
will see the need to switch to queued mode, and will do so regardless of
what the other CPUs think. The key point is that each CPU will get its
ticket from the xadd(), and these will be issued in order. I therefore
backed out my change of the limits.

> > > +/*
> > > + * TKT_Q_SWITCH is twice the number of CPUs that must be spinning on a
> > > + * given ticket lock to motivate switching to spinning on a queue.
> > > + * The reason that it is twice the number is because the bottom bit of
> > > + * the ticket is reserved for the bit that indicates that a queue is
> > > + * associated with the lock.
> > > + */
> > > +#define TKT_Q_SWITCH (16 * 2)
> > > +
> > > +/*
> > > + * TKT_Q_NQUEUES is the number of queues to maintain. Large systems
> > > + * might have multiple highly contended locks, so provide more queues for
> > > + * systems with larger numbers of CPUs.
> > > + */
> > > +#define TKT_Q_NQUEUES (DIV_ROUND_UP(NR_CPUS + TKT_Q_SWITCH - 1, TKT_Q_SWITCH) * 2)
> > > +
> > > +/* The queues themselves. */
> > > +struct tkt_q_head tkt_q_heads[TKT_Q_NQUEUES];
> >
> > I am a bit concerned about the size of the head queue table itself. RHEL6,
> > for example, had defined CONFIG_NR_CPUS to be 4096, which means a table
> > size of 256. Maybe it is better to dynamically allocate the table at
> > init time depending on the actual number of CPUs in the system.
>
> Yeah, it can be allocated dynamically at boot.

But let's first demonstrate the need. Keep in mind that an early-boot
deadlock would exercise this code. Yes, it is just a check for NULL,
but on the other hand I didn't get the impression that you thought that
this code was too simple. ;-)

> > > +/*
> > > + * Return a pointer to the queue header associated with the specified lock,
> > > + * or return NULL if there is no queue for the lock or if the lock's queue
> > > + * is in transition.
> > > + */
> > > +static struct tkt_q_head *tkt_q_find_head(arch_spinlock_t *asp)
> > > +{
> > > + int i;
> > > + int start;
> > > +
> > > + start = i = tkt_q_hash(asp);
> > > + do
> > > + if (tkt_q_heads[i].ref == asp)
> > > + return&tkt_q_heads[i];
> > > + while ((i = tkt_q_next_slot(i)) != start);
> > > + return NULL;
> > > +}
> >
> > With a table size of 256, you have to scan the whole table to find
> > the right head queue. This can be a significant overhead. I would suggest
> > setting a limit on how many entries it scans before it aborts rather
> > than checking the whole table.
>
> We could add a limit, but in practice I'm not sure it would make any
> difference. I thought the same thing when I first saw this, but hitting most
> of the list would require a large number of collisions in the hash algorithm,
> which could probably be fixed with a better hash.
>
> >
> > > +/*
> > > + * Hand the lock off to the first CPU on the queue.
> > > + */
> > > +void tkt_q_do_wake(arch_spinlock_t *asp)
> > > +{
> > > + struct tkt_q_head *tqhp;
> > > + struct tkt_q *tqp;
> > > +
> > > + /* If the queue is still being set up, wait for it. */
> > > + while ((tqhp = tkt_q_find_head(asp)) == NULL)
> > > + cpu_relax();
> > > +
> > > + for (;;) {
> > > +
> > > + /* Find the first queue element. */
> > > + tqp = ACCESS_ONCE(tqhp->spin);
> > > + if (tqp != NULL)
> > > + break; /* Element exists, hand off lock. */
> > > + if (tkt_q_try_unqueue(asp, tqhp))
> > > + return; /* No element, successfully removed queue. */
> > > + cpu_relax();
> > > + }
> > > + if (ACCESS_ONCE(tqhp->head_tkt) != -1)
> > > + ACCESS_ONCE(tqhp->head_tkt) = -1;
> >
> > In case NR_CPUS is 32768 or higher, the ticket will be of type u32 and
> > tqhp->head_tkt is s32. So -1 will be a valid ticket number. You may have
> > to conditionally define head_tkt to be s64 when the ticket is u32.
>
> Good point.
>
> >
> > Do you have any data on how much this patch can actually improve
> > performance on certain workloads? This will help the discussion here.
>
> Yeah, that's come up already in the thread. Linus wants to see hard
> numbers *and* an explanation of why the contended locks can't be fixed,
> before he even considers merging this type of change.

A point that I am definitely -not- arguing with. ;-)

Thanx, Paul

2013-06-11 16:45:28

by Paul E. McKenney

[permalink] [raw]
Subject: Re: [PATCH RFC ticketlock] Auto-queued ticketlock

On Tue, Jun 11, 2013 at 11:22:45AM -0400, Steven Rostedt wrote:
> On Tue, 2013-06-11 at 03:14 -0700, Paul E. McKenney wrote:
> >
> > > Off-topic, although I am in this community for several years,
> > > I am not exactly clear with this problem.
> > >
> > > 1) In general case, which lock is the most competitive in the kernel? what it protects for?
> > > 2) In which special case, which lock is the most competitive in the kernel? what it protects for?
> > > 3) In general case, which list is the most hot list?
> > > 4) In which special case, which list is the most hot list?
> >
> > Others would know better than I, but mmap_sem has been called out as a
>
> If the contention is with mmap_sem, then I doubt this is going to help
> much, as that's a sleeping rw semaphore. Now, rw semaphores are
> implemented with raw spinlocks, but I doubt that would be the main point
> of contention, compared to the sleeping part.

If I remember correctly, someone actually hit this earlier this year,
which prompted use of a special-purpose queued lock to guard the
semaphore data. I don't recall whether it was mmap_sem or not, so
cannot say whether it was a straight mutex or an rw semaphore.

Thanx, Paul

> -- Steve
>
> > prime offender for some workloads. There is of course some debate as
> > to whether the fault lies with mmap_sem or with the workloads. There have
> > been some efforts to solve this one on LKML, plus some in academia have
> > worked on this as well:
> >
>
>

2013-06-11 16:49:30

by Paul E. McKenney

[permalink] [raw]
Subject: Re: [PATCH RFC ticketlock] Auto-queued ticketlock

On Tue, Jun 11, 2013 at 11:10:30PM +0800, Lai Jiangshan wrote:
> On Tue, Jun 11, 2013 at 10:48 PM, Lai Jiangshan <[email protected]> wrote:
> > On Mon, Jun 10, 2013 at 3:36 AM, Paul E. McKenney
> > <[email protected]> wrote:
> >> Breaking up locks is better than implementing high-contention locks, but
> >> if we must have high-contention locks, why not make them automatically
> >> switch between light-weight ticket locks at low contention and queued
> >> locks at high contention?
> >>
> >> This commit therefore allows ticket locks to automatically switch between
> >> pure ticketlock and queued-lock operation as needed. If too many CPUs
> >> are spinning on a given ticket lock, a queue structure will be allocated
> >> and the lock will switch to queued-lock operation. When the lock becomes
> >> free, it will switch back into ticketlock operation. The low-order bit
> >> of the head counter is used to indicate that the lock is in queued mode,
> >> which forces an unconditional mismatch between the head and tail counters.
> >> This approach means that the common-case code path under conditions of
> >> low contention is very nearly that of a plain ticket lock.
> >>
> >> A fixed number of queueing structures is statically allocated in an
> >> array. The ticket-lock address is used to hash into an initial element,
> >> but if that element is already in use, it moves to the next element. If
> >> the entire array is already in use, continue to spin in ticket mode.
> >>
> >> This has been only lightly tested in the kernel, though a userspace
> >> implementation has survived substantial testing.
> >>
> >> Signed-off-by: Paul E. McKenney <[email protected]>
> >>
> >> diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h
> >> index 33692ea..b4a91b0 100644
> >> --- a/arch/x86/include/asm/spinlock.h
> >> +++ b/arch/x86/include/asm/spinlock.h
> >> @@ -34,6 +34,8 @@
> >> # define UNLOCK_LOCK_PREFIX
> >> #endif
> >>
> >> +#ifndef CONFIG_TICKET_LOCK_QUEUED
> >> +
> >> /*
> >> * Ticket locks are conceptually two parts, one indicating the current head of
> >> * the queue, and the other indicating the current tail. The lock is acquired
> >> @@ -62,6 +64,25 @@ static __always_inline void __ticket_spin_lock(arch_spinlock_t *lock)
> >> barrier(); /* make sure nothing creeps before the lock is taken */
> >> }
> >>
> >> +#else /* #ifndef CONFIG_TICKET_LOCK_QUEUED */
> >> +
> >> +bool tkt_spin_pass(arch_spinlock_t *ap, struct __raw_tickets inc);
> >> +
> >> +static __always_inline void __ticket_spin_lock(arch_spinlock_t *lock)
> >> +{
> >> + register struct __raw_tickets inc = { .tail = 2 };
> >> +
> >> + inc = xadd(&lock->tickets, inc);
> >> + for (;;) {
> >> + if (inc.head == inc.tail || tkt_spin_pass(lock, inc))
> >> + break;
> >> + inc.head = ACCESS_ONCE(lock->tickets.head);
> >> + }
> >> + barrier(); /* smp_mb() on Power or ARM. */
> >> +}
> >> +
> >> +#endif /* #else #ifndef CONFIG_TICKET_LOCK_QUEUED */
> >> +
> >> static __always_inline int __ticket_spin_trylock(arch_spinlock_t *lock)
> >> {
> >> arch_spinlock_t old, new;
> >> @@ -70,17 +91,37 @@ static __always_inline int __ticket_spin_trylock(arch_spinlock_t *lock)
> >> if (old.tickets.head != old.tickets.tail)
> >> return 0;
> >>
> >> +#ifndef CONFIG_TICKET_LOCK_QUEUED
> >> new.head_tail = old.head_tail + (1 << TICKET_SHIFT);
> >> +#else /* #ifndef CONFIG_TICKET_LOCK_QUEUED */
> >> + new.head_tail = old.head_tail + (2 << TICKET_SHIFT);
> >> +#endif /* #else #ifndef CONFIG_TICKET_LOCK_QUEUED */
> >>
> >> /* cmpxchg is a full barrier, so nothing can move before it */
> >> return cmpxchg(&lock->head_tail, old.head_tail, new.head_tail) == old.head_tail;
> >> }
> >>
> >> +#ifndef CONFIG_TICKET_LOCK_QUEUED
> >> +
> >> static __always_inline void __ticket_spin_unlock(arch_spinlock_t *lock)
> >> {
> >> __add(&lock->tickets.head, 1, UNLOCK_LOCK_PREFIX);
> >> }
> >>
> >> +#else /* #ifndef CONFIG_TICKET_LOCK_QUEUED */
> >> +
> >> +extern void tkt_q_do_wake(arch_spinlock_t *asp);
> >> +
> >> +static __always_inline void __ticket_spin_unlock(arch_spinlock_t *lock)
> >> +{
> >> + __ticket_t head = 2;
> >> +
> >> + head = xadd(&lock->tickets.head, 2);
> >> + if (head & 0x1)
> >> + tkt_q_do_wake(lock);
> >> +}
> >> +#endif /* #else #ifndef CONFIG_TICKET_LOCK_QUEUED */
> >> +
> >> static inline int __ticket_spin_is_locked(arch_spinlock_t *lock)
> >> {
> >> struct __raw_tickets tmp = ACCESS_ONCE(lock->tickets);
> >> diff --git a/arch/x86/include/asm/spinlock_types.h b/arch/x86/include/asm/spinlock_types.h
> >> index ad0ad07..cdaefdd 100644
> >> --- a/arch/x86/include/asm/spinlock_types.h
> >> +++ b/arch/x86/include/asm/spinlock_types.h
> >> @@ -7,12 +7,18 @@
> >>
> >> #include <linux/types.h>
> >>
> >> -#if (CONFIG_NR_CPUS < 256)
> >> +#if (CONFIG_NR_CPUS < 128)
> >> typedef u8 __ticket_t;
> >> typedef u16 __ticketpair_t;
> >> -#else
> >> +#define TICKET_T_CMP_GE(a, b) (UCHAR_MAX / 2 >= (unsigned char)((a) - (b)))
> >> +#elif (CONFIG_NR_CPUS < 32768)
> >> typedef u16 __ticket_t;
> >> typedef u32 __ticketpair_t;
> >> +#define TICKET_T_CMP_GE(a, b) (USHRT_MAX / 2 >= (unsigned short)((a) - (b)))
> >> +#else
> >> +typedef u32 __ticket_t;
> >> +typedef u64 __ticketpair_t;
> >> +#define TICKET_T_CMP_GE(a, b) (UINT_MAX / 2 >= (unsigned int)((a) - (b)))
> >> #endif
> >>
> >> #define TICKET_SHIFT (sizeof(__ticket_t) * 8)
> >> @@ -21,7 +27,11 @@ typedef struct arch_spinlock {
> >> union {
> >> __ticketpair_t head_tail;
> >> struct __raw_tickets {
> >> +#ifdef __BIG_ENDIAN__
> >> + __ticket_t tail, head;
> >> +#else /* #ifdef __BIG_ENDIAN__ */
> >> __ticket_t head, tail;
> >> +#endif /* #else #ifdef __BIG_ENDIAN__ */
> >> } tickets;
> >> };
> >> } arch_spinlock_t;
> >> diff --git a/include/linux/kernel.h b/include/linux/kernel.h
> >> index e9ef6d6..816a87c 100644
> >> --- a/include/linux/kernel.h
> >> +++ b/include/linux/kernel.h
> >> @@ -15,6 +15,7 @@
> >> #include <asm/byteorder.h>
> >> #include <uapi/linux/kernel.h>
> >>
> >> +#define UCHAR_MAX ((u8)(~0U))
> >> #define USHRT_MAX ((u16)(~0U))
> >> #define SHRT_MAX ((s16)(USHRT_MAX>>1))
> >> #define SHRT_MIN ((s16)(-SHRT_MAX - 1))
> >> diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
> >> index 44511d1..ad9c67c 100644
> >> --- a/kernel/Kconfig.locks
> >> +++ b/kernel/Kconfig.locks
> >> @@ -223,3 +223,21 @@ endif
> >> config MUTEX_SPIN_ON_OWNER
> >> def_bool y
> >> depends on SMP && !DEBUG_MUTEXES
> >> +
> >> +config TICKET_LOCK_QUEUED
> >> + bool "Dynamically switch between ticket and queued locking"
> >> + default n
> >> + ---help---
> >> + Enable dynamic switching between ticketlock and queued locking
> >> + on a per-lock basis. This option will slow down low-contention
> >> + acquisition and release very slightly (additional conditional
> >> + in release path), but will provide more efficient operation at
> >> + high levels of lock contention. High-contention operation will
> >> + not be quite as efficient as would be a pure queued lock, but
> >> + this dynamic approach consumes less memory than queued locks
> >> + and also runs faster at low levels of contention.
> >> +
> >> + Say "Y" if you are running on a large system with a workload
> >> + that is likely to result in high levels of contention.
> >> +
> >> + Say "N" if you are unsure.
> >> diff --git a/kernel/Makefile b/kernel/Makefile
> >> index 271fd31..70a91f7 100644
> >> --- a/kernel/Makefile
> >> +++ b/kernel/Makefile
> >> @@ -51,6 +51,7 @@ endif
> >> obj-$(CONFIG_SMP) += spinlock.o
> >> obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
> >> obj-$(CONFIG_PROVE_LOCKING) += spinlock.o
> >> +obj-$(CONFIG_TICKET_LOCK_QUEUED) += tktqlock.o
> >> obj-$(CONFIG_UID16) += uid16.o
> >> obj-$(CONFIG_MODULES) += module.o
> >> obj-$(CONFIG_MODULE_SIG) += module_signing.o modsign_pubkey.o modsign_certificate.o
> >> diff --git a/kernel/tktqlock.c b/kernel/tktqlock.c
> >> new file mode 100644
> >> index 0000000..f01b760
> >> --- /dev/null
> >> +++ b/kernel/tktqlock.c
> >> @@ -0,0 +1,333 @@
> >> +/*
> >> + * Queued ticket spinlocks.
> >> + *
> >> + * This program is free software; you can redistribute it and/or modify
> >> + * it under the terms of the GNU General Public License as published by
> >> + * the Free Software Foundation; either version 2 of the License, or
> >> + * (at your option) any later version.
> >> + *
> >> + * This program is distributed in the hope that it will be useful,
> >> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> >> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
> >> + * GNU General Public License for more details.
> >> + *
> >> + * You should have received a copy of the GNU General Public License
> >> + * along with this program; if not, write to the Free Software
> >> + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
> >> + *
> >> + * Copyright IBM Corporation, 2013
> >> + *
> >> + * Authors: Paul E. McKenney <[email protected]>
> >> + */
> >> +#include <linux/types.h>
> >> +#include <linux/kernel.h>
> >> +#include <linux/spinlock.h>
> >> +#include <linux/smp.h>
> >> +#include <linux/percpu.h>
> >> +
> >> +struct tkt_q {
> >> + int cpu;
> >> + __ticket_t tail;
> >> + struct tkt_q *next;
> >> +};
> >> +
> >> +struct tkt_q_head {
> >> + arch_spinlock_t *ref; /* Pointer to spinlock if in use. */
> >> + s32 head_tkt; /* Head ticket when started queuing. */
> >> + struct tkt_q *spin; /* Head of queue. */
> >> + struct tkt_q **spin_tail; /* Tail of queue. */
> >> +};
> >> +
> >> +/*
> >> + * TKT_Q_SWITCH is twice the number of CPUs that must be spinning on a
> >> + * given ticket lock to motivate switching to spinning on a queue.
> >> + * The reason that it is twice the number is because the bottom bit of
> >> + * the ticket is reserved for the bit that indicates that a queue is
> >> + * associated with the lock.
> >> + */
> >> +#define TKT_Q_SWITCH (16 * 2)
> >> +
> >> +/*
> >> + * TKT_Q_NQUEUES is the number of queues to maintain. Large systems
> >> + * might have multiple highly contended locks, so provide more queues for
> >> + * systems with larger numbers of CPUs.
> >> + */
> >> +#define TKT_Q_NQUEUES (DIV_ROUND_UP(NR_CPUS + TKT_Q_SWITCH - 1, TKT_Q_SWITCH) * 2)
> >> +
> >> +/* The queues themselves. */
> >> +struct tkt_q_head tkt_q_heads[TKT_Q_NQUEUES];
> >> +
> >> +/* Advance to the next queue slot, wrapping around to the beginning. */
> >> +static int tkt_q_next_slot(int i)
> >> +{
> >> + return (++i < TKT_Q_NQUEUES) ? i : 0;
> >> +}
> >> +
> >> +/* Very crude hash from lock address to queue slot number. */
> >> +static unsigned long tkt_q_hash(arch_spinlock_t *asp)
> >> +{
> >> + return (((unsigned long)asp) >> 8) % TKT_Q_NQUEUES;
> >> +}
> >> +
> >> +/*
> >> + * Return a pointer to the queue header associated with the specified lock,
> >> + * or return NULL if there is no queue for the lock or if the lock's queue
> >> + * is in transition.
> >> + */
> >> +static struct tkt_q_head *tkt_q_find_head(arch_spinlock_t *asp)
> >> +{
> >> + int i;
> >> + int start;
> >> +
> >> + start = i = tkt_q_hash(asp);
> >> + do
> >> + if (tkt_q_heads[i].ref == asp)
> >> + return &tkt_q_heads[i];
> >> + while ((i = tkt_q_next_slot(i)) != start);
> >> + return NULL;
> >> +}
> >> +
> >> +/*
> >> + * Try to stop queuing, reverting back to normal ticket-lock operation.
> >> + * We can only stop queuing when the queue is empty, which means that
> >> + * we need to correctly handle races where someone shows up in the queue
> >> + * just as we are trying to dispense with the queue. They win, we lose.
> >> + */
> >> +static bool tkt_q_try_unqueue(arch_spinlock_t *asp, struct tkt_q_head *tqhp)
> >> +{
> >> + arch_spinlock_t asold;
> >> + arch_spinlock_t asnew;
> >> +
> >> + /* Pick up the ticket values. */
> >> + asold = ACCESS_ONCE(*asp);
> >> + if ((asold.tickets.head & ~0x1) == asold.tickets.tail) {
> >> +
> >> + /* Attempt to mark the lock as not having a queue. */
> >> + asnew = asold;
> >> + asnew.tickets.head &= ~0x1;
> >> + if (cmpxchg(&asp->head_tail,
> >> + asold.head_tail,
> >> + asnew.head_tail) == asold.head_tail) {
> >> +
> >> + /* Succeeded, mark the queue as unused. */
> >> + ACCESS_ONCE(tqhp->ref) = NULL;
> >> + return true;
> >> + }
> >> + }
> >> +
> >> + /* Failed, tell the caller there is still a queue to pass off to. */
> >> + return false;
> >> +}
> >> +
> >> +/*
> >> + * Hand the lock off to the first CPU on the queue.
> >> + */
> >> +void tkt_q_do_wake(arch_spinlock_t *asp)
> >> +{
> >> + struct tkt_q_head *tqhp;
> >> + struct tkt_q *tqp;
> >> +
> >> + /* If the queue is still being set up, wait for it. */
> >> + while ((tqhp = tkt_q_find_head(asp)) == NULL)
> >> + cpu_relax();
> >> +
> >> + for (;;) {
> >> +
> >> + /* Find the first queue element. */
> >> + tqp = ACCESS_ONCE(tqhp->spin);
> >> + if (tqp != NULL)
> >> + break; /* Element exists, hand off lock. */
> >> + if (tkt_q_try_unqueue(asp, tqhp))
> >> + return; /* No element, successfully removed queue. */
> >> + cpu_relax();
> >> + }
> >> + if (ACCESS_ONCE(tqhp->head_tkt) != -1)
> >> + ACCESS_ONCE(tqhp->head_tkt) = -1;
> >> + smp_mb(); /* Order pointer fetch and assignment against handoff. */
> >> + ACCESS_ONCE(tqp->cpu) = -1;
> >> +}
> >> +
> >> +/*
> >> + * Given a lock that already has a queue associated with it, spin on
> >> + * that queue. Return false if there was no queue (which means we do not
> >> + * hold the lock) and true otherwise (meaning we -do- hold the lock).
> >> + */
> >> +bool tkt_q_do_spin(arch_spinlock_t *asp, struct __raw_tickets inc)
> >> +{
> >> + struct tkt_q **oldtail;
> >> + struct tkt_q tq;
> >> + struct tkt_q_head *tqhp;
> >> +
> >> + /*
> >> + * Ensure that accesses to queue header happen after sensing
> >> + * the lock's have-queue bit.
> >> + */
> >> + smp_mb(); /* See above block comment. */
> >> +
> >> + /* If there no longer is a queue, leave. */
> >> + tqhp = tkt_q_find_head(asp);
> >> + if (tqhp == NULL)
> >> + return false;
> >> +
> >> + /* Initialize our queue element. */
> >> + tq.cpu = raw_smp_processor_id();
> >> + tq.tail = inc.tail;
> >> + tq.next = NULL;
> >> +
> >> + /* Check to see if we already hold the lock. */
> >> + if (ACCESS_ONCE(tqhp->head_tkt) == inc.tail) {
> >> + /* The last holder left before queue formed, we hold lock. */
> >> + tqhp->head_tkt = -1;
> >> + return true;
> >> + }
> >> +
> >> + /* Add our element to the tail of the queue. */
> >> + oldtail = xchg(&tqhp->spin_tail, &tq.next);
> >> + ACCESS_ONCE(*oldtail) = &tq;
> >> +
> >> + /* Spin until handoff. */
> >> + while (ACCESS_ONCE(tq.cpu) != -1)
> >> + cpu_relax();
> >> +
> >> + /*
> >> + * Remove our element from the queue. If the queue is now empty,
> >> + * update carefully so that the next acquisition will queue itself
> >> + * at the head of the list.
> >> + */
> >> + if (tq.next == NULL) {
> >> +
> >> + /* Mark the queue empty. */
> >> + tqhp->spin = NULL;
> >> +
> >> + /* Try to point the tail back at the head. */
> >> + if (cmpxchg(&tqhp->spin_tail,
> >> + &tq.next,
> >> + &tqhp->spin) == &tq.next)
> >> + return true; /* Succeeded, queue is now empty. */
> >> +
> >> + /* Failed, if needed, wait for the enqueue to complete. */
> >> + while (tq.next == NULL)
> >> + cpu_relax();
> >> +
> >> + /* The following code will repair the head. */
> >> + }
> >> + smp_mb(); /* Force ordering between handoff and critical section. */
> >> +
> >> + /* Advance list-head pointer. */
> >> + ACCESS_ONCE(tqhp->spin) = tq.next;
> >> + return true;
> >> +}
> >> +
> >> +/*
> >> + * Given a lock that does not have a queue, attempt to associate the
> >> + * i-th queue with it, returning true if successful (meaning we hold
> >> + * the lock) or false otherwise (meaning we do -not- hold the lock).
> >> + * Note that the caller has already filled in ->ref with 0x1, so we
> >> + * own the queue.
> >> + */
> >> +static bool
> >> +tkt_q_init_contend(int i, arch_spinlock_t *asp, struct __raw_tickets inc)
> >> +{
> >> + arch_spinlock_t asold;
> >> + arch_spinlock_t asnew;
> >> + struct tkt_q_head *tqhp;
> >> +
> >> + /* Initialize the i-th queue header. */
> >> + tqhp = &tkt_q_heads[i];
> >> + tqhp->spin = NULL;
> >> + tqhp->spin_tail = &tqhp->spin;
> >> +
> >> + /* Each pass through this loop attempts to mark the lock as queued. */
> >> + do {
> >> + asold.head_tail = ACCESS_ONCE(asp->head_tail);
> >> + asnew = asold;
> >> + if (asnew.tickets.head & 0x1) {
> >> +
> >> + /* Someone beat us to it, back out. */
> >> + smp_mb();
> >> + ACCESS_ONCE(tqhp->ref) = NULL;
> >> +
> >> + /* Spin on the queue element they set up. */
> >> + return tkt_q_do_spin(asp, inc);
> >> + }
> >> +
> >> + /* The low-order bit in the head counter says "queued". */
> >> + asnew.tickets.head |= 0x1;
> >> + } while (cmpxchg(&asp->head_tail,
> >> + asold.head_tail,
> >> + asnew.head_tail) != asold.head_tail);
> >> +
> >> + /* Point the queue at the lock and go spin on it. */
> >> + tqhp->head_tkt = asold.tickets.head;
> >> + smp_mb(); /* Ensure head_tkt is set prior to queuers seeing tqhp. */
> >> + ACCESS_ONCE(tqhp->ref) = asp;
> >> + return tkt_q_do_spin(asp, inc);
> >> +}
> >
> > Just small revise.
>
> Sorry, it is wrong: tkt_q_find_head() will return the wrong result.
> Could we use only tkt_q_heads[tkt_q_hash(asp)] instead of finding a free one?

Glad you agree. ;-)

I did consider doing that, but was too worried about hash collisions.
The hash function could be improved, but that would make it more
expensive, which is not a good thing for code on the critical lock-handoff
path.
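
For instance, something like the kernel's hash_ptr() would mix the address
bits more thoroughly than the shift-and-modulo in the patch, at the cost of
a few extra instructions on that path (a sketch only, not something that has
been measured here):

    #include <linux/hash.h>

    static unsigned long tkt_q_hash(arch_spinlock_t *asp)
    {
            return hash_ptr(asp, 32) % TKT_Q_NQUEUES;
    }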

Another approach is to permanently associate queues with each lock,
but that increases the size of the lock -- something that has raised
concerns in the past. But if adding 32 bytes to each ticketlock were OK,
that would simplify things quite a bit.
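
Roughly -- and this is only a sketch of the trade-off, not something that
has been prototyped here -- that alternative would embed the queue head in
the lock itself, so no hashing or searching is needed at all:

    typedef struct arch_spinlock {
            union {
                    __ticketpair_t head_tail;
                    struct __raw_tickets {
                            __ticket_t head, tail;
                    } tickets;
            };
    #ifdef CONFIG_TICKET_LOCK_QUEUED
            s64 head_tkt;             /* Head ticket when queuing started. */
            struct tkt_q *spin;       /* Head of queue. */
            struct tkt_q **spin_tail; /* Tail of queue. */
    #endif
    } arch_spinlock_t;

Those three extra fields are where the roughly 32 bytes per lock would come
from, once alignment is taken into account.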

Thanx, Paul

> > I just moved " tqhp->head_tkt = asold.tickets.head;" into the loop, so
> > we can use "asp->tickets.head & 0x1" to
> > indicate that the queued spinlock is prepared, instead of "tqhp->ref == asp".
> >
> > See the appended diff.
> > (And I guess that after this change you can force only the CPUs for which
> > "inc.tail - tqhp->head_tkt > TKT_Q_SWITCH"
> > holds to do queued spinning, removing the thundering herd.)
> >
> > diff --git a/kernel/tktqlock.c b/kernel/tktqlock.c
> > index f01b760..4ea409b 100644
> > --- a/kernel/tktqlock.c
> > +++ b/kernel/tktqlock.c
> > @@ -27,7 +27,6 @@
> >
> > struct tkt_q {
> > int cpu;
> > - __ticket_t tail;
> > struct tkt_q *next;
> > };
> >
> > @@ -127,9 +126,8 @@ void tkt_q_do_wake(arch_spinlock_t *asp)
> > struct tkt_q_head *tqhp;
> > struct tkt_q *tqp;
> >
> > - /* If the queue is still being set up, wait for it. */
> > - while ((tqhp = tkt_q_find_head(asp)) == NULL)
> > - cpu_relax();
> > + tqhp = tkt_q_find_head(asp);
> > + BUG_ON(!tqhp);
> >
> > for (;;) {
> >
> > @@ -141,8 +139,6 @@ void tkt_q_do_wake(arch_spinlock_t *asp)
> > return; /* No element, successfully removed queue. */
> > cpu_relax();
> > }
> > - if (ACCESS_ONCE(tqhp->head_tkt) != -1)
> > - ACCESS_ONCE(tqhp->head_tkt) = -1;
> > smp_mb(); /* Order pointer fetch and assignment against handoff. */
> > ACCESS_ONCE(tqp->cpu) = -1;
> > }
> > @@ -164,20 +160,16 @@ bool tkt_q_do_spin(arch_spinlock_t *asp, struct
> > __raw_tickets inc)
> > */
> > smp_mb(); /* See above block comment. */
> >
> > - /* If there no longer is a queue, leave. */
> > tqhp = tkt_q_find_head(asp);
> > - if (tqhp == NULL)
> > - return false;
> > + BUG_ON(!tqhp);
> >
> > /* Initialize our queue element. */
> > tq.cpu = raw_smp_processor_id();
> > - tq.tail = inc.tail;
> > tq.next = NULL;
> >
> > /* Check to see if we already hold the lock. */
> > if (ACCESS_ONCE(tqhp->head_tkt) == inc.tail) {
> > /* The last holder left before queue formed, we hold lock. */
> > - tqhp->head_tkt = -1;
> > return true;
> > }
> >
> > @@ -251,16 +243,14 @@ tkt_q_init_contend(int i, arch_spinlock_t *asp,
> > struct __raw_tickets inc)
> > return tkt_q_do_spin(asp, inc);
> > }
> >
> > + /* Point the queue at the lock and go spin on it. */
> > + tqhp->head_tkt = asold.tickets.head;
> > /* The low-order bit in the head counter says "queued". */
> > asnew.tickets.head |= 0x1;
> > } while (cmpxchg(&asp->head_tail,
> > asold.head_tail,
> > asnew.head_tail) != asold.head_tail);
> >
> > - /* Point the queue at the lock and go spin on it. */
> > - tqhp->head_tkt = asold.tickets.head;
> > - smp_mb(); /* Ensure head_tkt is set prior to queuers seeing tqhp. */
> > - ACCESS_ONCE(tqhp->ref) = asp;
> > return tkt_q_do_spin(asp, inc);
> > }
> >
> > @@ -282,14 +272,9 @@ bool tkt_q_start_contend(arch_spinlock_t *asp,
> > struct __raw_tickets inc)
> > * the lock with the corresponding queue.
> > */
> > do {
> > - /*
> > - * Use 0x1 to mark the queue in use, but also avoiding
> > - * any spinners trying to use it before we get it all
> > - * initialized.
> > - */
> > if (cmpxchg(&tkt_q_heads[i].ref,
> > NULL,
> > - (arch_spinlock_t *)0x1) == NULL) {
> > + asp) == NULL) {
> >
> > /* Succeeded, now go initialize it. */
> > return tkt_q_init_contend(i, asp, inc);
>
> if (tkt_q_heads[i].ref == asp)
>

2013-06-11 17:01:59

by Steven Rostedt

[permalink] [raw]
Subject: Re: [PATCH RFC ticketlock] Auto-queued ticketlock

On Tue, 2013-06-11 at 09:36 -0700, Paul E. McKenney wrote:

> > I am a bit concern about the size of the head queue table itself.
> > RHEL6, for example, had defined CONFIG_NR_CPUS to be 4096 which mean
> > a table size of 256. Maybe it is better to dynamically allocate the
> > table at init time depending on the actual number of CPUs in the
> > system.
>
> But if your kernel is built for 4096 CPUs, the 32*256=8192 bytes of memory
> is way down in the noise. Systems that care about that small an amount
> of memory probably have a small enough number of CPUs that they can just
> turn off queueing at build time using CONFIG_TICKET_LOCK_QUEUED=n, right?

If this turns out to work for large machines, that means that distros
will enable it, and distros tend to bump up NR_CPUS, which is defined at
compile time and is set regardless of whether you are running with 2 CPUs
or 1000 CPUs.

For now it's fine to use NR_CPUS, but I always try to avoid it. Working
in the ARM and POWER environment, you are used to lots of kernels compiled
specifically for the target. But in the x86 world, it is basically one
kernel for all environments, where NR_CPUS does make a big difference.
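
For what it's worth, a rough sketch of sizing the table from the number
of possible CPUs rather than NR_CPUS might look like the following; the
names and the init hook are illustrative only, not from the posted patch:

static struct tkt_q_head *tkt_q_heads;	/* Allocated at boot, not static. */
static int tkt_q_nqueues;

static int __init tkt_q_alloc_heads(void)
{
	/* Runs before secondary CPUs come up, hence before any contention. */
	tkt_q_nqueues = DIV_ROUND_UP(num_possible_cpus(),
				     CONFIG_TICKET_LOCK_QUEUED_SWITCH) * 2;
	tkt_q_heads = kcalloc(tkt_q_nqueues, sizeof(*tkt_q_heads), GFP_KERNEL);
	BUG_ON(!tkt_q_heads);
	return 0;
}
early_initcall(tkt_q_alloc_heads);

Any contention that somehow happened before the initcall ran would then
have to fall back to plain ticket spinning while tkt_q_heads is still
NULL, which ties into the early-boot discussion later in this thread.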

-- Steve

2013-06-11 17:13:05

by Paul E. McKenney

[permalink] [raw]
Subject: [PATCH RFC ticketlock] v2 Auto-queued ticketlock

Breaking up locks is better than implementing high-contention locks, but
if we must have high-contention locks, why not make them automatically
switch between light-weight ticket locks at low contention and queued
locks at high contention? After all, this would remove the need for
the developer to predict which locks will be highly contended.

This commit allows ticket locks to automatically switch between pure
ticketlock and queued-lock operation as needed. If too many CPUs are
spinning on a given ticket lock, a queue structure will be allocated
and the lock will switch to queued-lock operation. When the lock becomes
free, it will switch back into ticketlock operation. The low-order bit
of the head counter is used to indicate that the lock is in queued mode,
which forces an unconditional mismatch between the head and tail counters.
This approach means that the common-case code path under conditions of
low contention is very nearly that of a plain ticket lock.

A fixed number of queueing structures is statically allocated in an
array. The ticket-lock address is used to hash into an initial element,
but if that element is already in use, it moves to the next element. If
the entire array is already in use, continue to spin in ticket mode.

Signed-off-by: Paul E. McKenney <[email protected]>
[ paulmck: Eliminate duplicate code and update comments (Steven Rostedt). ]
[ paulmck: Address Eric Dumazet review feedback. ]
[ paulmck: Use Lai Jiangshan idea to eliminate smp_mb(). ]
[ paulmck: Expand ->head_tkt from s32 to s64 (Waiman Long). ]

diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h
index 33692ea..03d184e 100644
--- a/arch/x86/include/asm/spinlock.h
+++ b/arch/x86/include/asm/spinlock.h
@@ -34,6 +34,21 @@
# define UNLOCK_LOCK_PREFIX
#endif

+#ifdef CONFIG_TICKET_LOCK_QUEUED
+
+#define __TKT_SPIN_INC 2
+bool tkt_spin_pass(arch_spinlock_t *ap, struct __raw_tickets inc);
+
+#else /* #ifdef CONFIG_TICKET_LOCK_QUEUED */
+
+#define __TKT_SPIN_INC 1
+static inline bool tkt_spin_pass(arch_spinlock_t *ap, struct __raw_tickets inc)
+{
+ return false;
+}
+
+#endif /* #else #ifdef CONFIG_TICKET_LOCK_QUEUED */
+
/*
* Ticket locks are conceptually two parts, one indicating the current head of
* the queue, and the other indicating the current tail. The lock is acquired
@@ -49,17 +64,15 @@
*/
static __always_inline void __ticket_spin_lock(arch_spinlock_t *lock)
{
- register struct __raw_tickets inc = { .tail = 1 };
+ register struct __raw_tickets inc = { .tail = __TKT_SPIN_INC };

inc = xadd(&lock->tickets, inc);
-
for (;;) {
- if (inc.head == inc.tail)
+ if (inc.head == inc.tail || tkt_spin_pass(lock, inc))
break;
- cpu_relax();
inc.head = ACCESS_ONCE(lock->tickets.head);
}
- barrier(); /* make sure nothing creeps before the lock is taken */
+ barrier(); /* Make sure nothing creeps in before the lock is taken. */
}

static __always_inline int __ticket_spin_trylock(arch_spinlock_t *lock)
@@ -70,17 +83,37 @@ static __always_inline int __ticket_spin_trylock(arch_spinlock_t *lock)
if (old.tickets.head != old.tickets.tail)
return 0;

+#ifndef CONFIG_TICKET_LOCK_QUEUED
new.head_tail = old.head_tail + (1 << TICKET_SHIFT);
+#else /* #ifndef CONFIG_TICKET_LOCK_QUEUED */
+ new.head_tail = old.head_tail + (2 << TICKET_SHIFT);
+#endif /* #else #ifndef CONFIG_TICKET_LOCK_QUEUED */

/* cmpxchg is a full barrier, so nothing can move before it */
return cmpxchg(&lock->head_tail, old.head_tail, new.head_tail) == old.head_tail;
}

+#ifndef CONFIG_TICKET_LOCK_QUEUED
+
static __always_inline void __ticket_spin_unlock(arch_spinlock_t *lock)
{
__add(&lock->tickets.head, 1, UNLOCK_LOCK_PREFIX);
}

+#else /* #ifndef CONFIG_TICKET_LOCK_QUEUED */
+
+extern void tkt_q_do_wake(arch_spinlock_t *lock);
+
+static __always_inline void __ticket_spin_unlock(arch_spinlock_t *lock)
+{
+ __ticket_t head = 2;
+
+ head = xadd(&lock->tickets.head, head);
+ if (head & 0x1)
+ tkt_q_do_wake(lock);
+}
+#endif /* #else #ifndef CONFIG_TICKET_LOCK_QUEUED */
+
static inline int __ticket_spin_is_locked(arch_spinlock_t *lock)
{
struct __raw_tickets tmp = ACCESS_ONCE(lock->tickets);
diff --git a/arch/x86/include/asm/spinlock_types.h b/arch/x86/include/asm/spinlock_types.h
index ad0ad07..cdaefdd 100644
--- a/arch/x86/include/asm/spinlock_types.h
+++ b/arch/x86/include/asm/spinlock_types.h
@@ -7,12 +7,18 @@

#include <linux/types.h>

-#if (CONFIG_NR_CPUS < 256)
+#if (CONFIG_NR_CPUS < 128)
typedef u8 __ticket_t;
typedef u16 __ticketpair_t;
-#else
+#define TICKET_T_CMP_GE(a, b) (UCHAR_MAX / 2 >= (unsigned char)((a) - (b)))
+#elif (CONFIG_NR_CPUS < 32768)
typedef u16 __ticket_t;
typedef u32 __ticketpair_t;
+#define TICKET_T_CMP_GE(a, b) (USHRT_MAX / 2 >= (unsigned short)((a) - (b)))
+#else
+typedef u32 __ticket_t;
+typedef u64 __ticketpair_t;
+#define TICKET_T_CMP_GE(a, b) (UINT_MAX / 2 >= (unsigned int)((a) - (b)))
#endif

#define TICKET_SHIFT (sizeof(__ticket_t) * 8)
@@ -21,7 +27,11 @@ typedef struct arch_spinlock {
union {
__ticketpair_t head_tail;
struct __raw_tickets {
+#ifdef __BIG_ENDIAN__
+ __ticket_t tail, head;
+#else /* #ifdef __BIG_ENDIAN__ */
__ticket_t head, tail;
+#endif /* #else #ifdef __BIG_ENDIAN__ */
} tickets;
};
} arch_spinlock_t;
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index e9ef6d6..816a87c 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -15,6 +15,7 @@
#include <asm/byteorder.h>
#include <uapi/linux/kernel.h>

+#define UCHAR_MAX ((u8)(~0U))
#define USHRT_MAX ((u16)(~0U))
#define SHRT_MAX ((s16)(USHRT_MAX>>1))
#define SHRT_MIN ((s16)(-SHRT_MAX - 1))
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
index 44511d1..900c0f0 100644
--- a/kernel/Kconfig.locks
+++ b/kernel/Kconfig.locks
@@ -223,3 +223,38 @@ endif
config MUTEX_SPIN_ON_OWNER
def_bool y
depends on SMP && !DEBUG_MUTEXES
+
+config TICKET_LOCK_QUEUED
+ bool "Dynamically switch between ticket and queued locking"
+ depends on SMP
+ default n
+ ---help---
+ Enable dynamic switching between ticketlock and queued locking
+ on a per-lock basis. This option will slow down low-contention
+ acquisition and release very slightly (additional conditional
+ in release path), but will provide more efficient operation at
+ high levels of lock contention. High-contention operation will
+ not be quite as efficient as would be a pure queued lock, but
+ this dynamic approach consumes less memory than queued locks
+ and also runs faster at low levels of contention.
+
+ Say "Y" if you are running on a large system with a workload
+ that is likely to result in high levels of contention.
+
+ Say "N" if you are unsure.
+
+config TICKET_LOCK_QUEUED_SWITCH
+ int "When to switch from ticket to queued locking"
+ depends on TICKET_LOCK_QUEUED
+ default 8
+ range 3 32
+ ---help---
+ Specify how many tasks should be spinning on the lock before
+ switching to queued mode. Systems with low-latency memory/cache
+ interconnects will prefer larger numbers, while extreme low-latency
+ and real-time workloads will prefer a smaller number. Of course,
+ extreme real-time workloads would be even happier if contention
+ on the locks were reduced to the point that there was never any
+ need for queued locking in the first place.
+
+ Take the default if you are unsure.
diff --git a/kernel/Makefile b/kernel/Makefile
index 271fd31..70a91f7 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -51,6 +51,7 @@ endif
obj-$(CONFIG_SMP) += spinlock.o
obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
obj-$(CONFIG_PROVE_LOCKING) += spinlock.o
+obj-$(CONFIG_TICKET_LOCK_QUEUED) += tktqlock.o
obj-$(CONFIG_UID16) += uid16.o
obj-$(CONFIG_MODULES) += module.o
obj-$(CONFIG_MODULE_SIG) += module_signing.o modsign_pubkey.o modsign_certificate.o
diff --git a/kernel/tktqlock.c b/kernel/tktqlock.c
new file mode 100644
index 0000000..182bdcb
--- /dev/null
+++ b/kernel/tktqlock.c
@@ -0,0 +1,369 @@
+/*
+ * Queued ticket spinlocks.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright IBM Corporation, 2013
+ *
+ * Authors: Paul E. McKenney <[email protected]>
+ */
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/spinlock.h>
+#include <linux/smp.h>
+#include <linux/percpu.h>
+
+struct tkt_q {
+ int cpu;
+ __ticket_t tail;
+ struct tkt_q *next;
+};
+
+struct tkt_q_head {
+ arch_spinlock_t *ref; /* Pointer to spinlock if in use. */
+ s64 head_tkt; /* Head ticket when started queuing. */
+ struct tkt_q *spin; /* Head of queue. */
+ struct tkt_q **spin_tail; /* Tail of queue. */
+};
+
+/*
+ * TKT_Q_SWITCH is twice the number of CPUs that must be spinning on a
+ * given ticket lock to motivate switching to spinning on a queue.
+ * The reason that it is twice the number is because the bottom bit of
+ * the ticket is reserved for the bit that indicates that a queue is
+ * associated with the lock.
+ */
+#define TKT_Q_SWITCH (CONFIG_TICKET_LOCK_QUEUED_SWITCH * 2)
+
+/*
+ * TKT_Q_NQUEUES is the number of queues to maintain. Large systems
+ * might have multiple highly contended locks, so provide more queues for
+ * systems with larger numbers of CPUs.
+ */
+#define TKT_Q_NQUEUES (DIV_ROUND_UP(NR_CPUS + TKT_Q_SWITCH - 1, TKT_Q_SWITCH) * 2)
+
+/* The queues themselves. */
+struct tkt_q_head tkt_q_heads[TKT_Q_NQUEUES];
+
+/* Advance to the next queue slot, wrapping around to the beginning. */
+static int tkt_q_next_slot(int i)
+{
+ return (++i < TKT_Q_NQUEUES) ? i : 0;
+}
+
+/* Very crude hash from lock address to queue slot number. */
+static unsigned long tkt_q_hash(arch_spinlock_t *lock)
+{
+ return (((unsigned long)lock) >> 8) % TKT_Q_NQUEUES;
+}
+
+/*
+ * Return a pointer to the queue header associated with the specified lock,
+ * or return NULL if there is no queue for the lock or if the lock's queue
+ * is in transition.
+ */
+static struct tkt_q_head *tkt_q_find_head(arch_spinlock_t *lock)
+{
+ int i;
+ int start;
+
+ start = i = tkt_q_hash(lock);
+ do
+ if (tkt_q_heads[i].ref == lock)
+ return &tkt_q_heads[i];
+ while ((i = tkt_q_next_slot(i)) != start);
+ return NULL;
+}
+
+/*
+ * Try to stop queuing, reverting back to normal ticket-lock operation.
+ * We can only stop queuing when the queue is empty, which means that
+ * we need to correctly handle races where someone shows up in the queue
+ * just as we are trying to dispense with the queue. They win, we lose.
+ */
+static bool tkt_q_try_unqueue(arch_spinlock_t *lock, struct tkt_q_head *tqhp)
+{
+ arch_spinlock_t asold;
+ arch_spinlock_t asnew;
+
+ /* Pick up the ticket values. */
+ asold = ACCESS_ONCE(*lock);
+ if ((asold.tickets.head & ~0x1) == asold.tickets.tail) {
+
+ /* Attempt to mark the lock as not having a queue. */
+ asnew = asold;
+ asnew.tickets.head &= ~0x1;
+ if (cmpxchg(&lock->head_tail,
+ asold.head_tail,
+ asnew.head_tail) == asold.head_tail) {
+
+ /* Succeeded, mark the queue as unused. */
+ ACCESS_ONCE(tqhp->ref) = NULL;
+ return true;
+ }
+ }
+
+ /* Failed, tell the caller there is still a queue to pass off to. */
+ return false;
+}
+
+/*
+ * Hand the lock off to the first CPU on the queue.
+ */
+void tkt_q_do_wake(arch_spinlock_t *lock)
+{
+ struct tkt_q_head *tqhp;
+ struct tkt_q *tqp;
+
+ /* If the queue is still being set up, wait for it. */
+ while ((tqhp = tkt_q_find_head(lock)) == NULL)
+ cpu_relax();
+
+ for (;;) {
+
+ /* Find the first queue element. */
+ tqp = ACCESS_ONCE(tqhp->spin);
+ if (tqp != NULL)
+ break; /* Element exists, hand off lock. */
+ if (tkt_q_try_unqueue(lock, tqhp))
+ return; /* No element, successfully removed queue. */
+ cpu_relax();
+ }
+ if (ACCESS_ONCE(tqhp->head_tkt) != -1)
+ ACCESS_ONCE(tqhp->head_tkt) = -1;
+ smp_mb(); /* Order pointer fetch and assignment against handoff. */
+ ACCESS_ONCE(tqp->cpu) = -1;
+}
+EXPORT_SYMBOL(tkt_q_do_wake);
+
+/*
+ * Given a lock that already has a queue associated with it, spin on
+ * that queue. Return false if there was no queue (which means we do not
+ * hold the lock) and true otherwise (meaning we -do- hold the lock).
+ */
+bool tkt_q_do_spin(arch_spinlock_t *lock, struct __raw_tickets inc)
+{
+ struct tkt_q **oldtail;
+ struct tkt_q tq;
+ struct tkt_q_head *tqhp;
+
+ /*
+ * Ensure that accesses to queue header happen after sensing
+ * the lock's have-queue bit.
+ */
+ smp_mb(); /* See above block comment. */
+
+ /* If there no longer is a queue, leave. */
+ tqhp = tkt_q_find_head(lock);
+ if (tqhp == NULL)
+ return false;
+
+ /* Initialize our queue element. */
+ tq.cpu = raw_smp_processor_id();
+ tq.tail = inc.tail;
+ tq.next = NULL;
+
+ /* Check to see if we already hold the lock. */
+ if (ACCESS_ONCE(tqhp->head_tkt) == inc.tail) {
+ /* The last holder left before queue formed, we hold lock. */
+ tqhp->head_tkt = -1;
+ return true;
+ }
+
+ /*
+ * Add our element to the tail of the queue. Note that if the
+ * queue is empty, the ->spin_tail pointer will reference
+ * the queue's head pointer, namely ->spin.
+ */
+ oldtail = xchg(&tqhp->spin_tail, &tq.next);
+ ACCESS_ONCE(*oldtail) = &tq;
+
+ /* Spin until handoff. */
+ while (ACCESS_ONCE(tq.cpu) != -1)
+ cpu_relax();
+
+ /*
+ * Remove our element from the queue. If the queue is now empty,
+ * update carefully so that the next acquisition will enqueue itself
+ * at the head of the list. Of course, the next enqueue operation
+ * might be happening concurrently, and this code needs to handle all
+ * of the possible combinations, keeping in mind that the enqueue
+ * operation happens in two stages: (1) update the tail pointer and
+ * (2) update the predecessor's ->next pointer. With this in mind,
+ * the following code needs to deal with three scenarios:
+ *
+ * 1. tq is the last entry. In this case, we use cmpxchg to
+ * point the list tail back to the list head (->spin). If
+ * the cmpxchg fails, that indicates that we are instead
+ * in scenario 2 below. If the cmpxchg succeeds, the next
+ * enqueue operation's tail-pointer exchange will enqueue
+ * the next element at the queue head, because the ->spin_tail
+ * pointer now references the queue head.
+ *
+ * 2. tq is the last entry, and the next entry has updated the
+ * tail pointer but has not yet updated tq.next. In this
+ * case, tq.next is NULL, the cmpxchg will fail, and the
+ * code will wait for the enqueue to complete before completing
+ * removal of tq from the list.
+ *
+ * 3. tq is not the last pointer. In this case, tq.next is non-NULL,
+ * so the following code simply removes tq from the list.
+ */
+ if (tq.next == NULL) {
+
+ /* Mark the queue empty. */
+ tqhp->spin = NULL;
+
+ /* Try to point the tail back at the head. */
+ if (cmpxchg(&tqhp->spin_tail,
+ &tq.next,
+ &tqhp->spin) == &tq.next)
+ return true; /* Succeeded, queue is now empty. */
+
+ /* Failed, if needed, wait for the enqueue to complete. */
+ while (tq.next == NULL)
+ cpu_relax();
+
+ /* The following code will repair the head. */
+ }
+ smp_mb(); /* Force ordering between handoff and critical section. */
+
+ /*
+ * Advance list-head pointer. This same task will be the next to
+ * access this when releasing the lock, so no need for a memory
+ * barrier after the following assignment.
+ */
+ ACCESS_ONCE(tqhp->spin) = tq.next;
+ return true;
+}
+
+/*
+ * Given a lock that does not have a queue, attempt to associate the
+ * i-th queue with it, returning true if successful (meaning we hold
+ * the lock) or false otherwise (meaning we do -not- hold the lock).
+ * Note that the caller has already filled in ->ref with 0x1, so we
+ * own the queue.
+ */
+static bool
+tkt_q_init_contend(int i, arch_spinlock_t *lock, struct __raw_tickets inc)
+{
+ arch_spinlock_t asold;
+ arch_spinlock_t asnew;
+ struct tkt_q_head *tqhp;
+
+ /* Initialize the i-th queue header. */
+ tqhp = &tkt_q_heads[i];
+ tqhp->spin = NULL;
+ tqhp->spin_tail = &tqhp->spin;
+
+ /* Each pass through this loop attempts to mark the lock as queued. */
+ do {
+ asold.head_tail = ACCESS_ONCE(lock->head_tail);
+ asnew = asold;
+ if (asnew.tickets.head & 0x1) {
+
+ /* Someone beat us to it, back out. */
+ smp_mb();
+ ACCESS_ONCE(tqhp->ref) = NULL;
+
+ /* Spin on the queue element they set up. */
+ return tkt_q_do_spin(lock, inc);
+ }
+
+ /*
+ * Record the head counter in case one of the spinning
+ * CPUs already holds the lock but doesn't realize it yet.
+ */
+ tqhp->head_tkt = asold.tickets.head;
+
+ /* The low-order bit in the head counter says "queued". */
+ asnew.tickets.head |= 0x1;
+ } while (cmpxchg(&lock->head_tail,
+ asold.head_tail,
+ asnew.head_tail) != asold.head_tail);
+
+ /* Point the queue at the lock and go spin on it. */
+ ACCESS_ONCE(tqhp->ref) = lock;
+ return tkt_q_do_spin(lock, inc);
+}
+
+/*
+ * Start handling a period of high contention by finding a queue to associate
+ * with this lock. Returns true if successful (in which case we hold the
+ * lock) and false otherwise (in which case we do -not- hold the lock).
+ */
+bool tkt_q_start_contend(arch_spinlock_t *lock, struct __raw_tickets inc)
+{
+ int i;
+ int start;
+
+ /* Hash the lock address to find a starting point. */
+ start = i = tkt_q_hash(lock);
+
+ /*
+ * Each pass through the following loop attempts to associate
+ * the lock with the corresponding queue.
+ */
+ do {
+ /*
+ * Use 0x1 to mark the queue in use, but also avoiding
+ * any spinners trying to use it before we get it all
+ * initialized.
+ */
+ if (cmpxchg(&tkt_q_heads[i].ref,
+ NULL,
+ (arch_spinlock_t *)0x1) == NULL) {
+
+ /* Succeeded, now go initialize it. */
+ return tkt_q_init_contend(i, lock, inc);
+ }
+
+ /* If someone beat us to it, go spin on their queue. */
+ if (ACCESS_ONCE(lock->tickets.head) & 0x1)
+ return tkt_q_do_spin(lock, inc);
+ } while ((i = tkt_q_next_slot(i)) != start);
+
+ /* All the queues are in use, revert to spinning on the ticket lock. */
+ return false;
+}
+
+bool tkt_spin_pass(arch_spinlock_t *ap, struct __raw_tickets inc)
+{
+ if (unlikely(inc.head & 0x1)) {
+
+ /* This lock has a queue, so go spin on the queue. */
+ if (tkt_q_do_spin(ap, inc))
+ return true;
+
+ /* Get here if the queue is in transition: Retry next time. */
+
+ } else if (TICKET_T_CMP_GE(ACCESS_ONCE(ap->tickets.tail) - TKT_Q_SWITCH,
+ ACCESS_ONCE(ap->tickets.head))) {
+
+ /*
+ * This lock has lots of spinners, but no queue.
+ * Go create a queue to spin on.
+ */
+ if (tkt_q_start_contend(ap, inc))
+ return true;
+
+ /* Get here if the queue is in transition: Retry next time. */
+ }
+
+ /* Either no need for a queue or the queue is in transition. Spin. */
+ cpu_relax();
+ return false;
+}
+EXPORT_SYMBOL(tkt_spin_pass);

2013-06-11 17:13:56

by Steven Rostedt

[permalink] [raw]
Subject: Re: [PATCH RFC ticketlock] Auto-queued ticketlock

On Tue, 2013-06-11 at 09:43 -0700, Paul E. McKenney wrote:

> > > I am a bit concern about the size of the head queue table itself. RHEL6,
> > > for example, had defined CONFIG_NR_CPUS to be 4096 which mean a table
> > > size of 256. Maybe it is better to dynamically allocate the table at
> > > init time depending on the actual number of CPUs in the system.
> >
> > Yeah, it can be allocated dynamically at boot.
>
> But let's first demonstrate the need. Keep in mind that an early-boot
> deadlock would exercise this code.

I think an early-boot deadlock has more problems than this :-)

Now if we allocate this before other CPUs are enabled, there's no need
to worry about accessing it before they are used. They can only be used
on contention, and there would be no contention when we are only running
on one CPU.


> Yes, it is just a check for NULL,
> but on the other hand I didn't get the impression that you thought that
> this code was too simple. ;-)

I wouldn't change the code that uses it. It should never be hit, and if
it is triggered by an early boot deadlock, then I think this would
actually be a plus. An early boot deadlock would cause the system to
hang with no feedback whatsoever, causing the developer hours of
crying for mommy and pulling out their hair because the system just
stops doing anything except to show the developer a blinking cursor that
blinks "haha, haha, haha".

But if an early boot deadlock were to cause this code to be triggered
and do a NULL pointer dereference, then the system crashes. It would
most likely produce a backtrace that will give a lot more information to
the developer to see what is happening here. Sure, it may confuse them
at first, but then they can say: "why is this code triggering before we
have other CPUS? Oh, I have a deadlock here" and go fix the code in a
matter of minutes instead of hours.

Note, I don't even see this triggering with an early boot deadlock. The
only way that can happen is if the task tries to take a spinlock it
already owns, or an interrupt goes off and grabs a spinlock that the
task currently has but didn't disable interrupts. The ticket counter
would be just 2, which is far below the threshold that triggers the
queuing.

-- Steve

2013-06-11 17:17:54

by Linus Torvalds

[permalink] [raw]
Subject: Re: [PATCH RFC ticketlock] Auto-queued ticketlock

On Tue, Jun 11, 2013 at 9:48 AM, Paul E. McKenney
<[email protected]> wrote:
>
> Another approach is to permanently associate queues with each lock,
> but that increases the size of the lock -- something that has raised
> concerns in the past. But if adding 32 bytes to each ticketlock was OK,
> this simplifies things quite a bit.

Yeah, no. The spinlocks need to be small. We have them in
size-conscious data structures like "struct dentry" and "struct page",
and they really must not be bigger than an "int" in the non-debug
case.

In fact, I've occasionally thought about combining a spinlock with a
refcounter if that could make things fit in 32 bits on smaller
machines, because we also have ops like "atomic_dec_and_lock()" that
could possibly be optimized if they fit in one word. That is probably
not worth it, but spinlocks do need to remain small.

Linus

2013-06-11 17:20:12

by Paul E. McKenney

[permalink] [raw]
Subject: Re: [PATCH RFC ticketlock] Auto-queued ticketlock

On Tue, Jun 11, 2013 at 01:01:55PM -0400, Steven Rostedt wrote:
> On Tue, 2013-06-11 at 09:36 -0700, Paul E. McKenney wrote:
>
> > > I am a bit concern about the size of the head queue table itself.
> > > RHEL6, for example, had defined CONFIG_NR_CPUS to be 4096 which mean
> > > a table size of 256. Maybe it is better to dynamically allocate the
> > > table at init time depending on the actual number of CPUs in the
> > > system.
> >
> > But if your kernel is built for 4096 CPUs, the 32*256=8192 bytes of memory
> > is way down in the noise. Systems that care about that small an amount
> > of memory probably have a small enough number of CPUs that they can just
> > turn off queueing at build time using CONFIG_TICKET_LOCK_QUEUED=n, right?
>
> If this turns out to work for large machines, that means that distros
> will enable it, and distros tend to up the NR_CPUS, which is defined at
> compile time and is set regardless of if you are running with 2 CPUs or
> a 1000 CPUs.
>
> For now it's fine to use NR_CPUS, but I always try to avoid it. Working
> in the ARM and POWER environment you are use to lots of kernels compiled
> specifically for the target. But in the x86 world, it is basically one
> kernel for all environments, where NR_CPUS does make a big difference.

Fair point. Something to worry about should this ever be in danger of
actually going upstream. ;-)

Thanx, Paul

2013-06-11 17:33:10

by Paul E. McKenney

[permalink] [raw]
Subject: Re: [PATCH RFC ticketlock] Auto-queued ticketlock

On Tue, Jun 11, 2013 at 10:17:52AM -0700, Linus Torvalds wrote:
> On Tue, Jun 11, 2013 at 9:48 AM, Paul E. McKenney
> <[email protected]> wrote:
> >
> > Another approach is to permanently associate queues with each lock,
> > but that increases the size of the lock -- something that has raised
> > concerns in the past. But if adding 32 bytes to each ticketlock was OK,
> > this simplifies things quite a bit.
>
> Yeah, no. The spinlocks need to be small. We have them in
> size-conscious data structures like "struct dentry" and "struct page",
> and they really must not be bigger than an "int" in the non-debug
> case.
>
> In fact, I've occasionally thought about combining a spinlock with a
> refcounter if that could make things fit in 32 bits on smaller
> machines, because we also have ops like "atomic_dec_and_lock()" that
> could possibly be optimized if they fit in one word. That is probably
> not worth it, but spinlocks do need to remain small.

I was afraid of that. On the other hand, I guess that this means that
I sent out the correct patch of the two that I prepared. ;-)

Thanx, Paul

2013-06-11 17:35:55

by Linus Torvalds

[permalink] [raw]
Subject: Re: [PATCH RFC ticketlock] v2 Auto-queued ticketlock

Hmm. Something just struck me when reading this patch..

Our memory ordering semantics in our *current* locks are very very
subtle. We have just a "barrier()" between the

inc.head = ACCESS_ONCE(lock->tickets.head);
if (inc.head == inc.tail)
break; /* success */

and the inside of the locked region.

I think it's safe because of the new memory ordering semantics (loads
are in-order, and stores only move *down*), but there's not even a
comment about it.

So let's at least comment the current locks before making them even
more complex and subtle..

Linus

2013-06-11 17:36:09

by Waiman Long

[permalink] [raw]
Subject: Re: [PATCH RFC ticketlock] Auto-queued ticketlock

On 06/11/2013 12:20 PM, Steven Rostedt wrote:
>>> diff --git a/arch/x86/include/asm/spinlock_types.h b/arch/x86/include/asm/spinlock_types.h
>>> index ad0ad07..cdaefdd 100644
>>> --- a/arch/x86/include/asm/spinlock_types.h
>>> +++ b/arch/x86/include/asm/spinlock_types.h
>>> @@ -7,12 +7,18 @@
>>>
>>> #include <linux/types.h>
>>>
>>> -#if (CONFIG_NR_CPUS < 256)
>>> +#if (CONFIG_NR_CPUS < 128)
>>> typedef u8 __ticket_t;
>>> typedef u16 __ticketpair_t;
>>> -#else
>>> +#define TICKET_T_CMP_GE(a, b) (UCHAR_MAX / 2 >= (unsigned char)((a) - (b)))
>>> +#elif (CONFIG_NR_CPUS < 32768)
>>> typedef u16 __ticket_t;
>>> typedef u32 __ticketpair_t;
>>> +#define TICKET_T_CMP_GE(a, b) (USHRT_MAX / 2 >= (unsigned short)((a) - (b)))
>>> +#else
>>> +typedef u32 __ticket_t;
>>> +typedef u64 __ticketpair_t;
>>> +#define TICKET_T_CMP_GE(a, b) (UINT_MAX / 2 >= (unsigned int)((a) - (b)))
>>> #endif
>> It is theoretically possible that a large number of CPUs (say 64 or
>> more with CONFIG_NR_CPUS < 128) can acquire a ticket from the lock
>> before the check for TICKET_T_CMP_GE() in tkt_spin_pass(). So the check
>> will fail even when there is a large number of CPUs contending for the
>> lock. The chance of this happening is, of course, extremely rare. This
>> is not an error, as the lock still works as it should without
>> your change.
> Can you explain this more. How can you acquire the ticket and update at
> the same time? If a queue has been set, then you can't acquire the
> ticket as the head has a 1 appended to it.

I am sorry if I confused you. What I meant is queuing up at the tail of
the ticket lock by incrementing the tail number, not actually acquiring the lock.
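
To make the wraparound concrete, here is a quick userspace check of the
arithmetic (my numbers, assuming u8 tickets, a tail increment of 2 per
waiter, and the default TKT_Q_SWITCH of 16):

#include <stdio.h>

int main(void)
{
	unsigned char head = 0, tail;
	int waiters;

	/* TICKET_T_CMP_GE() with u8 tickets: 127 >= (u8)((tail - 16) - head). */
	for (waiters = 70; waiters <= 73; waiters++) {
		tail = (unsigned char)(2 * waiters);
		printf("%d waiters: switch to queued mode? %s\n", waiters,
		       127 >= (unsigned char)((tail - 16) - head) ? "yes" : "no");
	}
	return 0;	/* The answer flips from "yes" to "no" at 72 waiters. */
}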

>>
>>> +/*
>>> + * Return a pointer to the queue header associated with the specified lock,
>>> + * or return NULL if there is no queue for the lock or if the lock's queue
>>> + * is in transition.
>>> + */
>>> +static struct tkt_q_head *tkt_q_find_head(arch_spinlock_t *asp)
>>> +{
>>> + int i;
>>> + int start;
>>> +
>>> + start = i = tkt_q_hash(asp);
>>> + do
>>> + if (tkt_q_heads[i].ref == asp)
>>> + return &tkt_q_heads[i];
>>> + while ((i = tkt_q_next_slot(i)) != start);
>>> + return NULL;
>>> +}
>> With a table size of 256, you may have to scan the whole table to find
>> the right head queue. This can be a significant overhead. I would suggest
>> setting a limit on how many entries it scans before it aborts, rather
>> than checking the whole table.
> We could add a limit, but in practice I'm not sure that would be an
> issue. I thought the same thing when I first saw this, but to hit most
> of the list would require a large collision in the hash algorithm,
> which could probably be fixed with a better hash.

The current code will scan the whole table until either it gets a match
or the whole-table scan is completed. I first thought that hitting a NULL
entry could stop the search, but that is not true. It is entirely possible
that an entry was in use when a queue was created but became empty
immediately afterward. So we have to scan the whole table to be sure,
unless we impose a limit on how many entries we scan.
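
A bounded scan along those lines might look like the sketch below;
TKT_Q_SCAN_LIMIT is an invented knob, and tkt_q_start_contend() would
have to honor the same bound so that a release can never miss a queue
that really exists:

#define TKT_Q_SCAN_LIMIT 16		/* Invented tuning knob. */

static struct tkt_q_head *tkt_q_find_head_bounded(arch_spinlock_t *asp)
{
	int i = tkt_q_hash(asp);
	int n;

	for (n = 0; n < TKT_Q_SCAN_LIMIT && n < TKT_Q_NQUEUES; n++) {
		if (tkt_q_heads[i].ref == asp)
			return &tkt_q_heads[i];
		i = tkt_q_next_slot(i);
	}
	return NULL;	/* Caller treats this as "no queue". */
}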

Regards,
Longman

2013-06-11 17:36:30

by Steven Rostedt

[permalink] [raw]
Subject: Re: [PATCH RFC ticketlock] v2 Auto-queued ticketlock

On Tue, 2013-06-11 at 10:02 -0700, Paul E. McKenney wrote:

> +#ifdef CONFIG_TICKET_LOCK_QUEUED
> +
> +#define __TKT_SPIN_INC 2
> +bool tkt_spin_pass(arch_spinlock_t *ap, struct __raw_tickets inc);
> +
> +#else /* #ifdef CONFIG_TICKET_LOCK_QUEUED */
> +
> +#define __TKT_SPIN_INC 1
> +static inline bool tkt_spin_pass(arch_spinlock_t *ap, struct __raw_tickets inc)
> +{
> + return false;
> +}
> +
> +#endif /* #else #ifdef CONFIG_TICKET_LOCK_QUEUED */
> +
> /*
> * Ticket locks are conceptually two parts, one indicating the current head of
> * the queue, and the other indicating the current tail. The lock is acquired
> @@ -49,17 +64,15 @@
> */
> static __always_inline void __ticket_spin_lock(arch_spinlock_t *lock)
> {
> - register struct __raw_tickets inc = { .tail = 1 };
> + register struct __raw_tickets inc = { .tail = __TKT_SPIN_INC };
>
> inc = xadd(&lock->tickets, inc);
> -
> for (;;) {
> - if (inc.head == inc.tail)
> + if (inc.head == inc.tail || tkt_spin_pass(lock, inc))
> break;
> - cpu_relax();

Overheating the CPU are we ;-)

Keeping the cpu_relax() doesn't hurt, even when TICKET_LOCK_QUEUED is
enabled, as the only latency to worry about is when tkt_spin_pass()
returns true, where it breaks out of the loop anyway.

But if you really don't want the double call to cpu_relax(), we can
probably remove the cpu_relax() from tkt_spin_pass() and keep this one,
or, in the tkt_spin_pass() above where TICKET_LOCK_QUEUED is not set, we
can do:

static inline bool tkt_spin_pass(arch_spinlock_t *ap, struct __raw_tickets inc)
{
	cpu_relax();
	return false;
}

Honestly, I would say remove it from tkt_spin_pass() when returning
false.

-- Steve


> inc.head = ACCESS_ONCE(lock->tickets.head);
> }
> - barrier(); /* make sure nothing creeps before the lock is taken */
> + barrier(); /* Make sure nothing creeps in before the lock is taken. */
> }
>

2013-06-11 17:44:47

by Paul E. McKenney

[permalink] [raw]
Subject: Re: [PATCH RFC ticketlock] Auto-queued ticketlock

On Tue, Jun 11, 2013 at 01:13:53PM -0400, Steven Rostedt wrote:
> On Tue, 2013-06-11 at 09:43 -0700, Paul E. McKenney wrote:
>
> > > > I am a bit concern about the size of the head queue table itself. RHEL6,
> > > > for example, had defined CONFIG_NR_CPUS to be 4096 which mean a table
> > > > size of 256. Maybe it is better to dynamically allocate the table at
> > > > init time depending on the actual number of CPUs in the system.
> > >
> > > Yeah, it can be allocated dynamically at boot.
> >
> > But let's first demonstrate the need. Keep in mind that an early-boot
> > deadlock would exercise this code.
>
> I think an early-boot deadlock has more problems than this :-)
>
> Now if we allocate this before other CPUs are enabled, there's no need
> to worry about accessing it before they are used. They can only be used
> on contention, and there would be no contention when we are only running
> on one CPU.
>
> > Yes, it is just a check for NULL,
> > but on the other hand I didn't get the impression that you thought that
> > this code was too simple. ;-)
>
> I wouldn't change the code that uses it. It should never be hit, and if
> it is triggered by an early boot deadlock, then I think this would
> actually be a plus. An early boot deadlock would cause the system to
> hang with no feedback whats so ever, causing the developer hours of
> crying for mommy and pulling out their hair because the system just
> stops doing anything except to show the developer a blinking cursor that
> blinks "haha, haha, haha".
>
> But if an early boot deadlock were to cause this code to be triggered
> and do a NULL pointer dereference, then the system crashes. It would
> most likely produce a backtrace that will give a lot more information to
> the developer to see what is happening here. Sure, it may confuse them
> at first, but then they can say: "why is this code triggering before we
> have other CPUS? Oh, I have a deadlock here" and go fix the code in a
> matter of minutes instead of hours.
>
> Note, I don't even see this triggering with an early boot deadlock. The
> only way that can happen is if the task tries to take a spinlock it
> already owns, or an interrupt goes off and grabs a spinlock that the
> task currently has but didn't disable interrupts. The ticket counter
> would be just 2, which is far below the threshold that triggers the
> queuing.

Fair point. I suppose that the hapless kernel hacker could set
the threshold really low for that case.

But we are talking about only 8192 bytes of memory here. Is that really
enough to worry about on current systems?

Thanx, Paul

2013-06-11 17:50:12

by Paul E. McKenney

[permalink] [raw]
Subject: Re: [PATCH RFC ticketlock] v2 Auto-queued ticketlock

On Tue, Jun 11, 2013 at 10:35:53AM -0700, Linus Torvalds wrote:
> Hmm. Something just struck me when reading this patch..
>
> Our memory ordering semantics in our *current* locks are very very
> subtle. We have just a "barrier()" between the
>
> inc.head = ACCESS_ONCE(lock->tickets.head);
> if (inc.head == inc.tail)
> break; /* success */
>
> and the inside of the locked region.
>
> I think it's safe because of the new memory ordering semantics (loads
> are in-order, and stores only move *down*), but there's not even a
> comment about it.
>
> So let's at least comment the current locks before making them even
> more complex and subtle..

Would it make sense to have something like an smp_tso() that was a
compiler barrier for TSO systems (x86, s390, sparc, etc.) but that
emitted the needed memory-barrier instruction for weakly ordered systems?

Seems to me to be easy to do, and helps describe the intent better.
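
Something along these lines, say (the TSO config symbol below is made up,
just to show the shape of it):

/* Sketch: compiler barrier on TSO architectures, real barrier elsewhere. */
#ifdef CONFIG_ARCH_HAS_TSO	/* hypothetical: x86, s390, sparc, ... */
#define smp_tso()	barrier()
#else
#define smp_tso()	smp_mb()
#endif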

Thanx, Paul

2013-06-11 17:53:16

by Davidlohr Bueso

[permalink] [raw]
Subject: Re: [PATCH RFC ticketlock] Auto-queued ticketlock

On Mon, 2013-06-10 at 17:51 -0700, Linus Torvalds wrote:
> On Mon, Jun 10, 2013 at 5:44 PM, Steven Rostedt <[email protected]> wrote:
> >
> > OK, I haven't found a issue here yet, but youss are beiing trickssy! We
> > don't like trickssy, and we must find precccciouss!!!
>
> .. and I personally have my usual reservations. I absolutely hate
> papering over scalability issues, and historically whenever people
> have ever thought that we want complex spinlocks, the problem has
> always been that the locking sucks.
>
> So reinforced by previous events, I really feel that code that needs
> this kind of spinlock is broken and needs to be fixed, rather than
> actually introduce tricky spinlocks.
>
> So in order to merge something like this, I want (a) numbers for real
> loads and (b) explanations for why the spinlock users cannot be fixed.

I hate to be the bearer of bad news but I got some pretty bad aim7
performance numbers with this patch on an 8-socket (80 core) 256 GB
memory DL980 box against a vanilla 3.10-rc4 kernel:

* shared workload:
10-100 users is in the noise area.
100-2000 users: -13% throughput.

* high_systime workload:
10-700 users is in the noise area.
700-2000 users: -55% throughput.

* disk:
10-100 users -57% throughput.
100-1000 users: -25% throughput
1000-2000 users: +8% throughput (this patch only benefits when we have a
lot of concurrency).

* custom:
10-100 users: -33% throughput.
100-2000 users: -46% throughput.

* alltests:
10-1000 users is in the noise area.
1000-2000 users: -10% throughput.

One notable exception is the short workload where we actually see
positive numbers:
10-100 users: +40% throughput.
100-2000 users: +69% throughput.

Thanks,
Davidlohr

2013-06-11 17:54:10

by Paul E. McKenney

[permalink] [raw]
Subject: Re: [PATCH RFC ticketlock] v2 Auto-queued ticketlock

On Tue, Jun 11, 2013 at 01:36:27PM -0400, Steven Rostedt wrote:
> On Tue, 2013-06-11 at 10:02 -0700, Paul E. McKenney wrote:
>
> > +#ifdef CONFIG_TICKET_LOCK_QUEUED
> > +
> > +#define __TKT_SPIN_INC 2
> > +bool tkt_spin_pass(arch_spinlock_t *ap, struct __raw_tickets inc);
> > +
> > +#else /* #ifdef CONFIG_TICKET_LOCK_QUEUED */
> > +
> > +#define __TKT_SPIN_INC 1
> > +static inline bool tkt_spin_pass(arch_spinlock_t *ap, struct __raw_tickets inc)
> > +{
> > + return false;
> > +}
> > +
> > +#endif /* #else #ifdef CONFIG_TICKET_LOCK_QUEUED */
> > +
> > /*
> > * Ticket locks are conceptually two parts, one indicating the current head of
> > * the queue, and the other indicating the current tail. The lock is acquired
> > @@ -49,17 +64,15 @@
> > */
> > static __always_inline void __ticket_spin_lock(arch_spinlock_t *lock)
> > {
> > - register struct __raw_tickets inc = { .tail = 1 };
> > + register struct __raw_tickets inc = { .tail = __TKT_SPIN_INC };
> >
> > inc = xadd(&lock->tickets, inc);
> > -
> > for (;;) {
> > - if (inc.head == inc.tail)
> > + if (inc.head == inc.tail || tkt_spin_pass(lock, inc))
> > break;
> > - cpu_relax();
>
> Overheating the CPU are we ;-)
>
> Keeping the cpu_relax() doesn't hurt, even when TICKET_LOCK_QUEUE is
> enabled. As the only latency to worry about is when tkt_spin_pass()
> returns true, where it breaks out of the loop anyway.
>
> But if you really don't want the double call to cpu_relax(), we can
> probably remove the cpu_relax from tkt_spin_pass() and keep this one, or
> in the above tkt_spin_pass() where TICK_LOCK_QUEUED is not set, we can
> do:
>
> static inline bool tkt_spin_pass(arch_spinlock_t *ap, struct
> __raw_tickets inc)
> {
> cpu_relax();
> return false;
> }
>
> Honesty, I would say remove it from tkt_spin_pass() when returning
> false.

Sold! I moved the cpu_relax() from tkt_spin_pass()'s false return to
the spin loop in __ticket_spin_lock(). Misguided attempt on my part to
minimize __ticket_spin_lock()'s size.

Thanx, Paul

> -- Steve
>
>
> > inc.head = ACCESS_ONCE(lock->tickets.head);
> > }
> > - barrier(); /* make sure nothing creeps before the lock is taken */
> > + barrier(); /* Make sure nothing creeps in before the lock is taken. */
> > }
> >
>
>

2013-06-11 18:05:52

by Paul E. McKenney

[permalink] [raw]
Subject: Re: [PATCH RFC ticketlock] Auto-queued ticketlock

On Tue, Jun 11, 2013 at 10:53:06AM -0700, Davidlohr Bueso wrote:
> On Mon, 2013-06-10 at 17:51 -0700, Linus Torvalds wrote:
> > On Mon, Jun 10, 2013 at 5:44 PM, Steven Rostedt <[email protected]> wrote:
> > >
> > > OK, I haven't found a issue here yet, but youss are beiing trickssy! We
> > > don't like trickssy, and we must find precccciouss!!!
> >
> > .. and I personally have my usual reservations. I absolutely hate
> > papering over scalability issues, and historically whenever people
> > have ever thought that we want complex spinlocks, the problem has
> > always been that the locking sucks.
> >
> > So reinforced by previous events, I really feel that code that needs
> > this kind of spinlock is broken and needs to be fixed, rather than
> > actually introduce tricky spinlocks.
> >
> > So in order to merge something like this, I want (a) numbers for real
> > loads and (b) explanations for why the spinlock users cannot be fixed.
>
> I hate to be the bearer of bad news but I got some pretty bad aim7
> performance numbers with this patch on an 8-socket (80 core) 256 Gb
> memory DL980 box against a vanilla 3.10-rc4 kernel:

Looks pretty ugly, sorry that it doesn't help in many of your situations.

Any info on what bottlenecks you are encountering?

Thanx, Paul

> * shared workload:
> 10-100 users is in the noise area.
> 100-2000 users: -13% throughput.
>
> * high_systime workload:
> 10-700 users is in the noise area.
> 700-2000 users: -55% throughput.
>
> * disk:
> 10-100 users -57% throughput.
> 100-1000 users: -25% throughput
> 1000-2000 users: +8% throughput (this patch only benefits when we have a
> lot of concurrency).
>
> * custom:
> 10-100 users: -33% throughput.
> 100-2000 users: -46% throughput.
>
> * alltests:
> 10-1000 users is in the noise area.
> 1000-2000 users: -10% throughput.
>
> One notable exception is the short workload where we actually see
> positive numbers:
> 10-100 users: +40% throughput.
> 100-2000 users: +69% throughput.
>
> Thanks,
> Davidlohr
>

2013-06-11 18:10:36

by Steven Rostedt

[permalink] [raw]
Subject: Re: [PATCH RFC ticketlock] Auto-queued ticketlock

On Tue, 2013-06-11 at 10:53 -0700, Davidlohr Bueso wrote:

> I hate to be the bearer of bad news but I got some pretty bad aim7
> performance numbers with this patch on an 8-socket (80 core) 256 Gb
> memory DL980 box against a vanilla 3.10-rc4 kernel:

This doesn't surprise me, as the spin lock now contains a function call
on any contention. Not to mention the added i$ pressure on the embedded
spinlock code having to set up a function call.

Even if the queues are not used, it adds a slight overhead to all
spinlocks, due to the code size increase as well as a function call on
all contention, which will also have an impact on i$ and branch
prediction.


>
> * shared workload:
> 10-100 users is in the noise area.
> 100-2000 users: -13% throughput.
>
> * high_systime workload:
> 10-700 users is in the noise area.
> 700-2000 users: -55% throughput.
>
> * disk:
> 10-100 users -57% throughput.
> 100-1000 users: -25% throughput
> 1000-2000 users: +8% throughput (this patch only benefits when we have a

Perhaps this actually started using the queues?

> lot of concurrency).
>
> * custom:
> 10-100 users: -33% throughput.
> 100-2000 users: -46% throughput.
>
> * alltests:
> 10-1000 users is in the noise area.
> 1000-2000 users: -10% throughput.
>
> One notable exception is the short workload where we actually see
> positive numbers:
> 10-100 users: +40% throughput.
> 100-2000 users: +69% throughput.

Perhaps short workloads have a cold cache, and the impact on the cache is
not as drastic?

It would be interesting to see what perf reports on these runs.

-- Steve

2013-06-11 18:14:26

by Davidlohr Bueso

[permalink] [raw]
Subject: Re: [PATCH RFC ticketlock] Auto-queued ticketlock

On Tue, 2013-06-11 at 14:10 -0400, Steven Rostedt wrote:
> On Tue, 2013-06-11 at 10:53 -0700, Davidlohr Bueso wrote:
>
> > I hate to be the bearer of bad news but I got some pretty bad aim7
> > performance numbers with this patch on an 8-socket (80 core) 256 Gb
> > memory DL980 box against a vanilla 3.10-rc4 kernel:
>
> This doesn't surprise me as the spin lock now contains a function call
> on any contention. Not to mention the added i$ pressure on the embedded
> spinlock code having to setup a function call.
>
> Even if the queues are not used, it adds a slight overhead to all
> spinlocks, due to the code size increase as well as a function call on
> all contention, which will also have an impact on i$ and branch
> prediction.

Agreed.

> >
> > * shared workload:
> > 10-100 users is in the noise area.
> > 100-2000 users: -13% throughput.
> >
> > * high_systime workload:
> > 10-700 users is in the noise area.
> > 700-2000 users: -55% throughput.
> >
> > * disk:
> > 10-100 users -57% throughput.
> > 100-1000 users: -25% throughput
> > 1000-2000 users: +8% throughput (this patch only benefits when we have a
>
> Perhaps this actually started using the queues?
>
> > lot of concurrency).
> >
> > * custom:
> > 10-100 users: -33% throughput.
> > 100-2000 users: -46% throughput.
> >
> > * alltests:
> > 10-1000 users is in the noise area.
> > 1000-2000 users: -10% throughput.
> >
> > One notable exception is the short workload where we actually see
> > positive numbers:
> > 10-100 users: +40% throughput.
> > 100-2000 users: +69% throughput.
>
> Perhaps short work loads have a cold cache, and the impact on cache is
> not as drastic?
>
> It would be interesting to see what perf reports on these runs.

I didn't actually collect perf traces in this run but I will rerun it
with that in mind. I'm also running some OLTP and data mining Oracle
based benchmarks where I'm already collecting perf reports.

Will post when I have everything.

Thanks,
Davidlohr

2013-06-11 18:42:15

by Waiman Long

[permalink] [raw]
Subject: Re: [PATCH RFC ticketlock] Auto-queued ticketlock

On 06/11/2013 12:36 PM, Paul E. McKenney wrote:
>
>> I am a bit concern about the size of the head queue table itself.
>> RHEL6, for example, had defined CONFIG_NR_CPUS to be 4096 which mean
>> a table size of 256. Maybe it is better to dynamically allocate the
>> table at init time depending on the actual number of CPUs in the
>> system.
> But if your kernel is built for 4096 CPUs, the 32*256=8192 bytes of memory
> is way down in the noise. Systems that care about that small an amount
> of memory probably have a small enough number of CPUs that they can just
> turn off queueing at build time using CONFIG_TICKET_LOCK_QUEUED=n, right?

My concern is more about the latency on the table scan than the actual
memory that was used.

>
>>> +/*
>>> + * Return a pointer to the queue header associated with the specified lock,
>>> + * or return NULL if there is no queue for the lock or if the lock's queue
>>> + * is in transition.
>>> + */
>>> +static struct tkt_q_head *tkt_q_find_head(arch_spinlock_t *asp)
>>> +{
>>> + int i;
>>> + int start;
>>> +
>>> + start = i = tkt_q_hash(asp);
>>> + do
>>> + if (tkt_q_heads[i].ref == asp)
> >>> + return &tkt_q_heads[i];
>>> + while ((i = tkt_q_next_slot(i)) != start);
>>> + return NULL;
>>> +}
>> With a table size of 256 and you have to scan the whole table to
>> find the right head queue. This can be a significant overhead. I
>> will suggest setting a limiting of how many entries it scans before
>> it aborts rather than checking the whole table.
> But it will scan 256 entries only if there are 256 other locks in queued
> mode, which is -very- unlikely, even given 4096 CPUs. That said, if you
> show me that this results in a real latency problem on a real system,
> I would be happy to provide a way to limit the search.

Looking at the code more carefully, the chance of actually scanning 256
entries is very small. However, I now have some concerns about the way you
set up the initial queue.

+/*
+ * Start handling a period of high contention by finding a queue to associate
+ * with this lock. Returns true if successful (in which case we hold the
+ * lock) and false otherwise (in which case we do -not- hold the lock).
+ */
+bool tkt_q_start_contend(arch_spinlock_t *asp, struct __raw_tickets inc)
+{
+ int i;
+ int start;
+
+ /* Hash the lock address to find a starting point. */
+ start = i = tkt_q_hash(asp);
+
+ /*
+ * Each pass through the following loop attempts to associate
+ * the lock with the corresponding queue.
+ */
+ do {
+ /*
+ * Use 0x1 to mark the queue in use, but also avoiding
+ * any spinners trying to use it before we get it all
+ * initialized.
+ */
+ if (cmpxchg(&tkt_q_heads[i].ref,
+ NULL,
+ (arch_spinlock_t *)0x1) == NULL) {
+
+ /* Succeeded, now go initialize it. */
+ return tkt_q_init_contend(i, asp, inc);
+ }
+
+ /* If someone beat us to it, go spin on their queue. */
> + if (ACCESS_ONCE(asp->tickets.head) & 0x1)
+ return tkt_q_do_spin(asp, inc);
+ } while ((i = tkt_q_next_slot(i)) != start);
+
+ /* All the queues are in use, revert to spinning on the ticket lock. */
+ return false;
+}
+

An unconditional cmpxchg() can be a source of high contention by itself.
Considering that 16 threads may be doing cmpxchg() more or less
simultaneously on the same cache line, it can cause a lot of contention.
It would be better to check whether tkt_q_heads[i] is NULL first before
doing the cmpxchg().
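
For example, something like this untested variant of the loop above:

	do {
		/* Skip visibly busy slots before attempting the atomic claim. */
		if (ACCESS_ONCE(tkt_q_heads[i].ref) == NULL &&
		    cmpxchg(&tkt_q_heads[i].ref,
			    NULL,
			    (arch_spinlock_t *)0x1) == NULL) {

			/* Succeeded, now go initialize it. */
			return tkt_q_init_contend(i, asp, inc);
		}

		/* If someone beat us to it, go spin on their queue. */
		if (ACCESS_ONCE(asp->tickets.head) & 0x1)
			return tkt_q_do_spin(asp, inc);
	} while ((i = tkt_q_next_slot(i)) != start);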

Another point is that the 16 threads may be setting up the queues in
consecutive slots in the head table. This is both a source of contention
and a waste of effort. One possible solution is to add one more field
(set to cpuid + 1, for example) to indicate that the setup is being
done, with asp set to the target lock address immediately. We would need
to use cmpxchg128() on 64-bit machines, though. Another solution is to
have only the thread whose ticket number is a fixed distance from
head (e.g. 16*2) do the queue setup, while the rest wait until the
setup is done before spinning on the queue.

As my colleague Davidlohr reported, there are more regressions than
performance improvements in the AIM7 benchmark. I believe that queue
setup contention is likely a source of the performance regression.

Regards,
Longman

2013-06-11 18:47:07

by Paul E. McKenney

[permalink] [raw]
Subject: Re: [PATCH RFC ticketlock] Auto-queued ticketlock

On Tue, Jun 11, 2013 at 02:10:31PM -0400, Steven Rostedt wrote:
> On Tue, 2013-06-11 at 10:53 -0700, Davidlohr Bueso wrote:
>
> > I hate to be the bearer of bad news but I got some pretty bad aim7
> > performance numbers with this patch on an 8-socket (80 core) 256 Gb
> > memory DL980 box against a vanilla 3.10-rc4 kernel:
>
> This doesn't surprise me as the spin lock now contains a function call
> on any contention. Not to mention the added i$ pressure on the embedded
> spinlock code having to setup a function call.
>
> Even if the queues are not used, it adds a slight overhead to all
> spinlocks, due to the code size increase as well as a function call on
> all contention, which will also have an impact on i$ and branch
> prediction.

Was this system hyperthreaded? If so, it might be suffering from the
misplaced cpu_relax(), which would mean that hardware threads spinning
on the lock would fail to inform the CPU that it was not doing anything
useful.

Thanx, Paul

> > * shared workload:
> > 10-100 users is in the noise area.
> > 100-2000 users: -13% throughput.
> >
> > * high_systime workload:
> > 10-700 users is in the noise area.
> > 700-2000 users: -55% throughput.
> >
> > * disk:
> > 10-100 users -57% throughput.
> > 100-1000 users: -25% throughput
> > 1000-2000 users: +8% throughput (this patch only benefits when we have a
>
> Perhaps this actually started using the queues?
>
> > lot of concurrency).
> >
> > * custom:
> > 10-100 users: -33% throughput.
> > 100-2000 users: -46% throughput.
> >
> > * alltests:
> > 10-1000 users is in the noise area.
> > 1000-2000 users: -10% throughput.
> >
> > One notable exception is the short workload where we actually see
> > positive numbers:
> > 10-100 users: +40% throughput.
> > 100-2000 users: +69% throughput.
>
> Perhaps short work loads have a cold cache, and the impact on cache is
> not as drastic?
>
> It would be interesting to see what perf reports on these runs.
>
> -- Steve
>
>

2013-06-11 18:54:47

by Davidlohr Bueso

[permalink] [raw]
Subject: Re: [PATCH RFC ticketlock] Auto-queued ticketlock

On Tue, 2013-06-11 at 14:41 -0400, Waiman Long wrote:
> On 06/11/2013 12:36 PM, Paul E. McKenney wrote:
> >
> >> I am a bit concern about the size of the head queue table itself.
> >> RHEL6, for example, had defined CONFIG_NR_CPUS to be 4096 which mean
> >> a table size of 256. Maybe it is better to dynamically allocate the
> >> table at init time depending on the actual number of CPUs in the
> >> system.
> > But if your kernel is built for 4096 CPUs, the 32*256=8192 bytes of memory
> > is way down in the noise. Systems that care about that small an amount
> > of memory probably have a small enough number of CPUs that they can just
> > turn off queueing at build time using CONFIG_TICKET_LOCK_QUEUED=n, right?
>
> My concern is more about the latency on the table scan than the actual
> memory that was used.
>
> >
> >>> +/*
> >>> + * Return a pointer to the queue header associated with the specified lock,
> >>> + * or return NULL if there is no queue for the lock or if the lock's queue
> >>> + * is in transition.
> >>> + */
> >>> +static struct tkt_q_head *tkt_q_find_head(arch_spinlock_t *asp)
> >>> +{
> >>> + int i;
> >>> + int start;
> >>> +
> >>> + start = i = tkt_q_hash(asp);
> >>> + do
> >>> + if (tkt_q_heads[i].ref == asp)
> >>> + return &tkt_q_heads[i];
> >>> + while ((i = tkt_q_next_slot(i)) != start);
> >>> + return NULL;
> >>> +}
> >> With a table size of 256 and you have to scan the whole table to
> >> find the right head queue. This can be a significant overhead. I
> >> will suggest setting a limiting of how many entries it scans before
> >> it aborts rather than checking the whole table.
> > But it will scan 256 entries only if there are 256 other locks in queued
> > mode, which is -very- unlikely, even given 4096 CPUs. That said, if you
> > show me that this results in a real latency problem on a real system,
> > I would be happy to provide a way to limit the search.
>
> Looking at the code more carefully, the chance of actually scanning 256
> entries is very small. However, I now have some concern on the way you
> set up the initial queue.
>
> +/*
> + * Start handling a period of high contention by finding a queue to associate
> + * with this lock. Returns true if successful (in which case we hold the
> + * lock) and false otherwise (in which case we do -not- hold the lock).
> + */
> +bool tkt_q_start_contend(arch_spinlock_t *asp, struct __raw_tickets inc)
> +{
> + int i;
> + int start;
> +
> + /* Hash the lock address to find a starting point. */
> + start = i = tkt_q_hash(asp);
> +
> + /*
> + * Each pass through the following loop attempts to associate
> + * the lock with the corresponding queue.
> + */
> + do {
> + /*
> + * Use 0x1 to mark the queue in use, but also avoiding
> + * any spinners trying to use it before we get it all
> + * initialized.
> + */
> + if (cmpxchg(&tkt_q_heads[i].ref,
> + NULL,
> + (arch_spinlock_t *)0x1) == NULL) {
> +
> + /* Succeeded, now go initialize it. */
> + return tkt_q_init_contend(i, asp, inc);
> + }
> +
> + /* If someone beat us to it, go spin on their queue. */
> + if (ACCESS_ONCE(asp->tickets.head) & 0x1)
> + return tkt_q_do_spin(asp, inc);
> + } while ((i = tkt_q_next_slot(i)) != start);
> +
> + /* All the queues are in use, revert to spinning on the ticket lock. */
> + return false;
> +}
> +
>
> Unconditional cmpxchg() can be a source of high contention by itself.
> Considering that 16 threads may be doing cmpxchg() more or less
> simultaneously on the same cache line, it can cause a lot of contention.
> It will be better if you check to see if tkt_q_heads[i] is NULL first
> before doing cmpxchg.

Good point, we already noticed good benefits in mutexes and rwsems when
using test and cmpxchg techniques.

Thanks,
davidlohr

2013-06-11 19:49:51

by Paul E. McKenney

[permalink] [raw]
Subject: Re: [PATCH RFC ticketlock] Auto-queued ticketlock

On Tue, Jun 11, 2013 at 02:41:59PM -0400, Waiman Long wrote:
> On 06/11/2013 12:36 PM, Paul E. McKenney wrote:
> >
> >>I am a bit concerned about the size of the head queue table itself.
> >>RHEL6, for example, defines CONFIG_NR_CPUS to be 4096, which means
> >>a table size of 256. Maybe it is better to dynamically allocate the
> >>table at init time depending on the actual number of CPUs in the
> >>system.
> >But if your kernel is built for 4096 CPUs, the 32*256=8192 bytes of memory
> >is way down in the noise. Systems that care about that small an amount
> >of memory probably have a small enough number of CPUs that they can just
> >turn off queueing at build time using CONFIG_TICKET_LOCK_QUEUED=n, right?
>
> My concern is more about the latency on the table scan than the
> actual memory that was used.
>
> >>>+/*
> >>>+ * Return a pointer to the queue header associated with the specified lock,
> >>>+ * or return NULL if there is no queue for the lock or if the lock's queue
> >>>+ * is in transition.
> >>>+ */
> >>>+static struct tkt_q_head *tkt_q_find_head(arch_spinlock_t *asp)
> >>>+{
> >>>+ int i;
> >>>+ int start;
> >>>+
> >>>+ start = i = tkt_q_hash(asp);
> >>>+ do
> >>>+ if (tkt_q_heads[i].ref == asp)
> >>>+ return &tkt_q_heads[i];
> >>>+ while ((i = tkt_q_next_slot(i)) != start);
> >>>+ return NULL;
> >>>+}
> >>With a table size of 256, you may have to scan the whole table to
> >>find the right head queue. This can be a significant overhead. I
> >>suggest setting a limit on how many entries it scans before it
> >>aborts rather than checking the whole table.
> >But it will scan 256 entries only if there are 256 other locks in queued
> >mode, which is -very- unlikely, even given 4096 CPUs. That said, if you
> >show me that this results in a real latency problem on a real system,
> >I would be happy to provide a way to limit the search.
>
> Looking at the code more carefully, the chance of actually scanning
> 256 entries is very small. However, I now have some concerns about
> the way you set up the initial queue.
>
> +/*
> + * Start handling a period of high contention by finding a queue to associate
> + * with this lock. Returns true if successful (in which case we hold the
> + * lock) and false otherwise (in which case we do -not- hold the lock).
> + */
> +bool tkt_q_start_contend(arch_spinlock_t *asp, struct __raw_tickets inc)
> +{
> + int i;
> + int start;
> +
> + /* Hash the lock address to find a starting point. */
> + start = i = tkt_q_hash(asp);
> +
> + /*
> + * Each pass through the following loop attempts to associate
> + * the lock with the corresponding queue.
> + */
> + do {
> + /*
> + * Use 0x1 to mark the queue in use, but also avoiding
> + * any spinners trying to use it before we get it all
> + * initialized.
> + */
> + if (cmpxchg(&tkt_q_heads[i].ref,
> + NULL,
> + (arch_spinlock_t *)0x1) == NULL) {
> +
> + /* Succeeded, now go initialize it. */
> + return tkt_q_init_contend(i, asp, inc);
> + }
> +
> + /* If someone beat us to it, go spin on their queue. */
> + if (ACCESS_ONCE(asp->tickets.head) & 0x1)
> + return tkt_q_do_spin(asp, inc);
> + } while ((i = tkt_q_next_slot(i)) != start);
> +
> + /* All the queues are in use, revert to spinning on the ticket lock. */
> + return false;
> +}
> +
>
> Unconditional cmpxchg() can be a source of high contention by
> itself. Considering that 16 threads may be doing cmpxchg() more or
> less simultaneously on the same cache line, it can cause a lot of
> contention. It would be better to check whether tkt_q_heads[i]
> is NULL first before doing the cmpxchg.
>
> Another point is that the 16 threads may be setting up queues in
> consecutive slots in the head table. This is both a source of
> contention and a waste of effort. One possible solution is to add
> one more field (set to cpuid + 1, for example) to indicate that the
> setup is in progress, with asp set to the target lock address
> immediately. We will need to use cmpxchg128() for 64-bit machines,
> though. Another solution is to have only the thread whose ticket
> number is a fixed distance from the head (e.g. 16*2) do the queue
> setup while the rest wait until the setup is done before spinning
> on the queue.
>
> As my colleague Davidlohr reported, there are more regressions than
> performance improvements in the AIM7 benchmark. I believe that
> queue-setup contention is a likely source of the performance regression.

Please see below for a v3 patch that:

1. Fixes cpu_relax().

2. Tests before doing cmpxchg().

3. Reduces the number of CPUs attempting to set up the queue,
in the common case, to a single CPU. (Multiple CPUs can
still be trying to set up the queue given unfortunate
sequences of concurrent ticket-lock handoffs.)

Please let me know how it goes!

Thanx, Paul

------------------------------------------------------------------------

ticketlock: Add queued-ticketlock capability

Breaking up locks is better than implementing high-contention locks, but
if we must have high-contention locks, why not make them automatically
switch between light-weight ticket locks at low contention and queued
locks at high contention? After all, this would remove the need for
the developer to predict which locks will be highly contended.

This commit allows ticket locks to automatically switch between pure
ticketlock and queued-lock operation as needed. If too many CPUs are
spinning on a given ticket lock, a queue structure will be allocated
and the lock will switch to queued-lock operation. When the lock becomes
free, it will switch back into ticketlock operation. The low-order bit
of the head counter is used to indicate that the lock is in queued mode,
which forces an unconditional mismatch between the head and tail counters.
This approach means that the common-case code path under conditions of
low contention is very nearly that of a plain ticket lock.
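
As a toy illustration of that encoding (u16 values chosen arbitrarily; this
is not code from the patch): tickets advance in steps of two, so a free lock
still sees head == tail, a queued lock can never see a match, and the
unlock-time add of 2 preserves the queued bit:

#include <stdio.h>

typedef unsigned short __ticket_t;

int main(void)
{
	__ticket_t head = 8, tail = 8;	/* free lock: head == tail */

	tail += 2;			/* one CPU takes a ticket and acquires */
	printf("held:    head=%u tail=%u\n", head, tail);

	head |= 0x1;			/* a queue becomes associated */
	printf("queued:  head=%u tail=%u (never equal while queued)\n",
	       head, tail);

	head += 2;			/* unlock: the add of 2 keeps bit 0 set */
	printf("handoff: head=%u queued bit=%u\n", head, head & 0x1);
	return 0;
}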

A fixed number of queueing structures is statically allocated in an
array. The ticket-lock address is used to hash into an initial element,
but if that element is already in use, it moves to the next element. If
the entire array is already in use, continue to spin in ticket mode.

Signed-off-by: Paul E. McKenney <[email protected]>
[ paulmck: Eliminate duplicate code and update comments (Steven Rostedt). ]
[ paulmck: Address Eric Dumazet review feedback. ]
[ paulmck: Use Lai Jiangshan idea to eliminate smp_mb(). ]
[ paulmck: Expand ->head_tkt from s32 to s64 (Waiman Long). ]
[ paulmck: Move cpu_relax() to main spin loop (Steven Rostedt). ]
[ paulmck: Reduce queue-switch contention (Waiman Long). ]

diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h
index 33692ea..509c51a 100644
--- a/arch/x86/include/asm/spinlock.h
+++ b/arch/x86/include/asm/spinlock.h
@@ -34,6 +34,21 @@
# define UNLOCK_LOCK_PREFIX
#endif

+#ifdef CONFIG_TICKET_LOCK_QUEUED
+
+#define __TKT_SPIN_INC 2
+bool tkt_spin_pass(arch_spinlock_t *ap, struct __raw_tickets inc);
+
+#else /* #ifdef CONFIG_TICKET_LOCK_QUEUED */
+
+#define __TKT_SPIN_INC 1
+static inline bool tkt_spin_pass(arch_spinlock_t *ap, struct __raw_tickets inc)
+{
+ return false;
+}
+
+#endif /* #else #ifdef CONFIG_TICKET_LOCK_QUEUED */
+
/*
* Ticket locks are conceptually two parts, one indicating the current head of
* the queue, and the other indicating the current tail. The lock is acquired
@@ -49,17 +64,16 @@
*/
static __always_inline void __ticket_spin_lock(arch_spinlock_t *lock)
{
- register struct __raw_tickets inc = { .tail = 1 };
+ register struct __raw_tickets inc = { .tail = __TKT_SPIN_INC };

inc = xadd(&lock->tickets, inc);
-
for (;;) {
- if (inc.head == inc.tail)
+ if (inc.head == inc.tail || tkt_spin_pass(lock, inc))
break;
cpu_relax();
inc.head = ACCESS_ONCE(lock->tickets.head);
}
- barrier(); /* make sure nothing creeps before the lock is taken */
+ barrier(); /* Make sure nothing creeps in before the lock is taken. */
}

static __always_inline int __ticket_spin_trylock(arch_spinlock_t *lock)
@@ -70,17 +84,37 @@ static __always_inline int __ticket_spin_trylock(arch_spinlock_t *lock)
if (old.tickets.head != old.tickets.tail)
return 0;

+#ifndef CONFIG_TICKET_LOCK_QUEUED
new.head_tail = old.head_tail + (1 << TICKET_SHIFT);
+#else /* #ifndef CONFIG_TICKET_LOCK_QUEUED */
+ new.head_tail = old.head_tail + (2 << TICKET_SHIFT);
+#endif /* #else #ifndef CONFIG_TICKET_LOCK_QUEUED */

/* cmpxchg is a full barrier, so nothing can move before it */
return cmpxchg(&lock->head_tail, old.head_tail, new.head_tail) == old.head_tail;
}

+#ifndef CONFIG_TICKET_LOCK_QUEUED
+
static __always_inline void __ticket_spin_unlock(arch_spinlock_t *lock)
{
__add(&lock->tickets.head, 1, UNLOCK_LOCK_PREFIX);
}

+#else /* #ifndef CONFIG_TICKET_LOCK_QUEUED */
+
+extern void tkt_q_do_wake(arch_spinlock_t *lock);
+
+static __always_inline void __ticket_spin_unlock(arch_spinlock_t *lock)
+{
+ __ticket_t head = 2;
+
+ head = xadd(&lock->tickets.head, head);
+ if (head & 0x1)
+ tkt_q_do_wake(lock);
+}
+#endif /* #else #ifndef CONFIG_TICKET_LOCK_QUEUED */
+
static inline int __ticket_spin_is_locked(arch_spinlock_t *lock)
{
struct __raw_tickets tmp = ACCESS_ONCE(lock->tickets);
diff --git a/arch/x86/include/asm/spinlock_types.h b/arch/x86/include/asm/spinlock_types.h
index ad0ad07..cdaefdd 100644
--- a/arch/x86/include/asm/spinlock_types.h
+++ b/arch/x86/include/asm/spinlock_types.h
@@ -7,12 +7,18 @@

#include <linux/types.h>

-#if (CONFIG_NR_CPUS < 256)
+#if (CONFIG_NR_CPUS < 128)
typedef u8 __ticket_t;
typedef u16 __ticketpair_t;
-#else
+#define TICKET_T_CMP_GE(a, b) (UCHAR_MAX / 2 >= (unsigned char)((a) - (b)))
+#elif (CONFIG_NR_CPUS < 32768)
typedef u16 __ticket_t;
typedef u32 __ticketpair_t;
+#define TICKET_T_CMP_GE(a, b) (USHRT_MAX / 2 >= (unsigned short)((a) - (b)))
+#else
+typedef u32 __ticket_t;
+typedef u64 __ticketpair_t;
+#define TICKET_T_CMP_GE(a, b) (UINT_MAX / 2 >= (unsigned int)((a) - (b)))
#endif

#define TICKET_SHIFT (sizeof(__ticket_t) * 8)
@@ -21,7 +27,11 @@ typedef struct arch_spinlock {
union {
__ticketpair_t head_tail;
struct __raw_tickets {
+#ifdef __BIG_ENDIAN__
+ __ticket_t tail, head;
+#else /* #ifdef __BIG_ENDIAN__ */
__ticket_t head, tail;
+#endif /* #else #ifdef __BIG_ENDIAN__ */
} tickets;
};
} arch_spinlock_t;
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index e9ef6d6..816a87c 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -15,6 +15,7 @@
#include <asm/byteorder.h>
#include <uapi/linux/kernel.h>

+#define UCHAR_MAX ((u8)(~0U))
#define USHRT_MAX ((u16)(~0U))
#define SHRT_MAX ((s16)(USHRT_MAX>>1))
#define SHRT_MIN ((s16)(-SHRT_MAX - 1))
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
index 44511d1..900c0f0 100644
--- a/kernel/Kconfig.locks
+++ b/kernel/Kconfig.locks
@@ -223,3 +223,38 @@ endif
config MUTEX_SPIN_ON_OWNER
def_bool y
depends on SMP && !DEBUG_MUTEXES
+
+config TICKET_LOCK_QUEUED
+ bool "Dynamically switch between ticket and queued locking"
+ depends on SMP
+ default n
+ ---help---
+ Enable dynamic switching between ticketlock and queued locking
+ on a per-lock basis. This option will slow down low-contention
+ acquisition and release very slightly (additional conditional
+ in release path), but will provide more efficient operation at
+ high levels of lock contention. High-contention operation will
+ not be quite as efficient as would be a pure queued lock, but
+ this dynamic approach consumes less memory than queued locks
+ and also runs faster at low levels of contention.
+
+ Say "Y" if you are running on a large system with a workload
+ that is likely to result in high levels of contention.
+
+ Say "N" if you are unsure.
+
+config TICKET_LOCK_QUEUED_SWITCH
+ int "When to switch from ticket to queued locking"
+ depends on TICKET_LOCK_QUEUED
+ default 8
+ range 3 32
+ ---help---
+ Specify how many tasks should be spinning on the lock before
+ switching to queued mode. Systems with low-latency memory/cache
+ interconnects will prefer larger numbers, while extreme low-latency
+ and real-time workloads will prefer a smaller number. Of course,
+ extreme real-time workloads would be even happier if contention
+ on the locks were reduced to the point that there was never any
+ need for queued locking in the first place.
+
+ Take the default if you are unsure.
diff --git a/kernel/Makefile b/kernel/Makefile
index 271fd31..70a91f7 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -51,6 +51,7 @@ endif
obj-$(CONFIG_SMP) += spinlock.o
obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
obj-$(CONFIG_PROVE_LOCKING) += spinlock.o
+obj-$(CONFIG_TICKET_LOCK_QUEUED) += tktqlock.o
obj-$(CONFIG_UID16) += uid16.o
obj-$(CONFIG_MODULES) += module.o
obj-$(CONFIG_MODULE_SIG) += module_signing.o modsign_pubkey.o modsign_certificate.o
diff --git a/kernel/tktqlock.c b/kernel/tktqlock.c
new file mode 100644
index 0000000..9f03af0
--- /dev/null
+++ b/kernel/tktqlock.c
@@ -0,0 +1,369 @@
+/*
+ * Queued ticket spinlocks.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright IBM Corporation, 2013
+ *
+ * Authors: Paul E. McKenney <[email protected]>
+ */
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/spinlock.h>
+#include <linux/smp.h>
+#include <linux/percpu.h>
+
+struct tkt_q {
+ int cpu;
+ __ticket_t tail;
+ struct tkt_q *next;
+};
+
+struct tkt_q_head {
+ arch_spinlock_t *ref; /* Pointer to spinlock if in use. */
+ s64 head_tkt; /* Head ticket when started queuing. */
+ struct tkt_q *spin; /* Head of queue. */
+ struct tkt_q **spin_tail; /* Tail of queue. */
+};
+
+/*
+ * TKT_Q_SWITCH is twice the number of CPUs that must be spinning on a
+ * given ticket lock to motivate switching to spinning on a queue.
+ * The reason that it is twice the number is because the bottom bit of
+ * the ticket is reserved for the bit that indicates that a queue is
+ * associated with the lock.
+ */
+#define TKT_Q_SWITCH (CONFIG_TICKET_LOCK_QUEUED_SWITCH * 2)
+
+/*
+ * TKT_Q_NQUEUES is the number of queues to maintain. Large systems
+ * might have multiple highly contended locks, so provide more queues for
+ * systems with larger numbers of CPUs.
+ */
+#define TKT_Q_NQUEUES (DIV_ROUND_UP(NR_CPUS + TKT_Q_SWITCH - 1, TKT_Q_SWITCH) * 2)
+
+/* The queues themselves. */
+struct tkt_q_head tkt_q_heads[TKT_Q_NQUEUES];
+
+/* Advance to the next queue slot, wrapping around to the beginning. */
+static int tkt_q_next_slot(int i)
+{
+ return (++i < TKT_Q_NQUEUES) ? i : 0;
+}
+
+/* Very crude hash from lock address to queue slot number. */
+static unsigned long tkt_q_hash(arch_spinlock_t *lock)
+{
+ return (((unsigned long)lock) >> 8) % TKT_Q_NQUEUES;
+}
+
+/*
+ * Return a pointer to the queue header associated with the specified lock,
+ * or return NULL if there is no queue for the lock or if the lock's queue
+ * is in transition.
+ */
+static struct tkt_q_head *tkt_q_find_head(arch_spinlock_t *lock)
+{
+ int i;
+ int start;
+
+ start = i = tkt_q_hash(lock);
+ do
+ if (tkt_q_heads[i].ref == lock)
+ return &tkt_q_heads[i];
+ while ((i = tkt_q_next_slot(i)) != start);
+ return NULL;
+}
+
+/*
+ * Try to stop queuing, reverting back to normal ticket-lock operation.
+ * We can only stop queuing when the queue is empty, which means that
+ * we need to correctly handle races where someone shows up in the queue
+ * just as we are trying to dispense with the queue. They win, we lose.
+ */
+static bool tkt_q_try_unqueue(arch_spinlock_t *lock, struct tkt_q_head *tqhp)
+{
+ arch_spinlock_t asold;
+ arch_spinlock_t asnew;
+
+ /* Pick up the ticket values. */
+ asold = ACCESS_ONCE(*lock);
+ if ((asold.tickets.head & ~0x1) == asold.tickets.tail) {
+
+ /* Attempt to mark the lock as not having a queue. */
+ asnew = asold;
+ asnew.tickets.head &= ~0x1;
+ if (cmpxchg(&lock->head_tail,
+ asold.head_tail,
+ asnew.head_tail) == asold.head_tail) {
+
+ /* Succeeded, mark the queue as unused. */
+ ACCESS_ONCE(tqhp->ref) = NULL;
+ return true;
+ }
+ }
+
+ /* Failed, tell the caller there is still a queue to pass off to. */
+ return false;
+}
+
+/*
+ * Hand the lock off to the first CPU on the queue.
+ */
+void tkt_q_do_wake(arch_spinlock_t *lock)
+{
+ struct tkt_q_head *tqhp;
+ struct tkt_q *tqp;
+
+ /* If the queue is still being set up, wait for it. */
+ while ((tqhp = tkt_q_find_head(lock)) == NULL)
+ cpu_relax();
+
+ for (;;) {
+
+ /* Find the first queue element. */
+ tqp = ACCESS_ONCE(tqhp->spin);
+ if (tqp != NULL)
+ break; /* Element exists, hand off lock. */
+ if (tkt_q_try_unqueue(lock, tqhp))
+ return; /* No element, successfully removed queue. */
+ cpu_relax();
+ }
+ if (ACCESS_ONCE(tqhp->head_tkt) != -1)
+ ACCESS_ONCE(tqhp->head_tkt) = -1;
+ smp_mb(); /* Order pointer fetch and assignment against handoff. */
+ ACCESS_ONCE(tqp->cpu) = -1;
+}
+EXPORT_SYMBOL(tkt_q_do_wake);
+
+/*
+ * Given a lock that already has a queue associated with it, spin on
+ * that queue. Return false if there was no queue (which means we do not
+ * hold the lock) and true otherwise (meaning we -do- hold the lock).
+ */
+bool tkt_q_do_spin(arch_spinlock_t *lock, struct __raw_tickets inc)
+{
+ struct tkt_q **oldtail;
+ struct tkt_q tq;
+ struct tkt_q_head *tqhp;
+
+ /*
+ * Ensure that accesses to queue header happen after sensing
+ * the lock's have-queue bit.
+ */
+ smp_mb(); /* See above block comment. */
+
+ /* If there no longer is a queue, leave. */
+ tqhp = tkt_q_find_head(lock);
+ if (tqhp == NULL)
+ return false;
+
+ /* Initialize our queue element. */
+ tq.cpu = raw_smp_processor_id();
+ tq.tail = inc.tail;
+ tq.next = NULL;
+
+ /* Check to see if we already hold the lock. */
+ if (ACCESS_ONCE(tqhp->head_tkt) == inc.tail) {
+ /* The last holder left before queue formed, we hold lock. */
+ tqhp->head_tkt = -1;
+ return true;
+ }
+
+ /*
+ * Add our element to the tail of the queue. Note that if the
+ * queue is empty, the ->spin_tail pointer will reference
+ * the queue's head pointer, namely ->spin.
+ */
+ oldtail = xchg(&tqhp->spin_tail, &tq.next);
+ ACCESS_ONCE(*oldtail) = &tq;
+
+ /* Spin until handoff. */
+ while (ACCESS_ONCE(tq.cpu) != -1)
+ cpu_relax();
+
+ /*
+ * Remove our element from the queue. If the queue is now empty,
+ * update carefully so that the next acquisition will enqueue itself
+ * at the head of the list. Of course, the next enqueue operation
+ * might be happening concurrently, and this code needs to handle all
+ * of the possible combinations, keeping in mind that the enqueue
+ * operation happens in two stages: (1) update the tail pointer and
+ * (2) update the predecessor's ->next pointer. With this in mind,
+ * the following code needs to deal with three scenarios:
+ *
+ * 1. tq is the last entry. In this case, we use cmpxchg to
+ * point the list tail back to the list head (->spin). If
+ * the cmpxchg fails, that indicates that we are instead
+ * in scenario 2 below. If the cmpxchg succeeds, the next
+ * enqueue operation's tail-pointer exchange will enqueue
+ * the next element at the queue head, because the ->spin_tail
+ * pointer now references the queue head.
+ *
+ * 2. tq is the last entry, and the next entry has updated the
+ * tail pointer but has not yet updated tq.next. In this
+ * case, tq.next is NULL, the cmpxchg will fail, and the
+ * code will wait for the enqueue to complete before completing
+ * removal of tq from the list.
+ *
+ * 3. tq is not the last entry. In this case, tq.next is non-NULL,
+ * so the following code simply removes tq from the list.
+ */
+ if (tq.next == NULL) {
+
+ /* Mark the queue empty. */
+ tqhp->spin = NULL;
+
+ /* Try to point the tail back at the head. */
+ if (cmpxchg(&tqhp->spin_tail,
+ &tq.next,
+ &tqhp->spin) == &tq.next)
+ return true; /* Succeeded, queue is now empty. */
+
+ /* Failed, if needed, wait for the enqueue to complete. */
+ while (tq.next == NULL)
+ cpu_relax();
+
+ /* The following code will repair the head. */
+ }
+ smp_mb(); /* Force ordering between handoff and critical section. */
+
+ /*
+ * Advance list-head pointer. This same task will be the next to
+ * access this when releasing the lock, so no need for a memory
+ * barrier after the following assignment.
+ */
+ ACCESS_ONCE(tqhp->spin) = tq.next;
+ return true;
+}
+
+/*
+ * Given a lock that does not have a queue, attempt to associate the
+ * i-th queue with it, returning true if successful (meaning we hold
+ * the lock) or false otherwise (meaning we do -not- hold the lock).
+ * Note that the caller has already filled in ->ref with 0x1, so we
+ * own the queue.
+ */
+static bool
+tkt_q_init_contend(int i, arch_spinlock_t *lock, struct __raw_tickets inc)
+{
+ arch_spinlock_t asold;
+ arch_spinlock_t asnew;
+ struct tkt_q_head *tqhp;
+
+ /* Initialize the i-th queue header. */
+ tqhp = &tkt_q_heads[i];
+ tqhp->spin = NULL;
+ tqhp->spin_tail = &tqhp->spin;
+
+ /* Each pass through this loop attempts to mark the lock as queued. */
+ do {
+ asold.head_tail = ACCESS_ONCE(lock->head_tail);
+ asnew = asold;
+ if (asnew.tickets.head & 0x1) {
+
+ /* Someone beat us to it, back out. */
+ smp_mb();
+ ACCESS_ONCE(tqhp->ref) = NULL;
+
+ /* Spin on the queue element they set up. */
+ return tkt_q_do_spin(lock, inc);
+ }
+
+ /*
+ * Record the head counter in case one of the spinning
+ * CPUs already holds the lock but doesn't realize it yet.
+ */
+ tqhp->head_tkt = asold.tickets.head;
+
+ /* The low-order bit in the head counter says "queued". */
+ asnew.tickets.head |= 0x1;
+ } while (cmpxchg(&lock->head_tail,
+ asold.head_tail,
+ asnew.head_tail) != asold.head_tail);
+
+ /* Point the queue at the lock and go spin on it. */
+ ACCESS_ONCE(tqhp->ref) = lock;
+ return tkt_q_do_spin(lock, inc);
+}
+
+/*
+ * Start handling a period of high contention by finding a queue to associate
+ * with this lock. Returns true if successful (in which case we hold the
+ * lock) and false otherwise (in which case we do -not- hold the lock).
+ */
+bool tkt_q_start_contend(arch_spinlock_t *lock, struct __raw_tickets inc)
+{
+ int i;
+ int start;
+
+ /* Hash the lock address to find a starting point. */
+ start = i = tkt_q_hash(lock);
+
+ /*
+ * Each pass through the following loop attempts to associate
+ * the lock with the corresponding queue.
+ */
+ do {
+ /*
+ * Use 0x1 to mark the queue as in use, which also keeps
+ * any spinners from trying to use it before we get it
+ * fully initialized.
+ */
+ if (tkt_q_heads[i].ref)
+ continue;
+ if (cmpxchg(&tkt_q_heads[i].ref,
+ NULL,
+ (arch_spinlock_t *)0x1) == NULL) {
+
+ /* Succeeded, now go initialize it. */
+ return tkt_q_init_contend(i, lock, inc);
+ }
+
+ /* If someone beat us to it, go spin on their queue. */
+ if (ACCESS_ONCE(lock->tickets.head) & 0x1)
+ return tkt_q_do_spin(lock, inc);
+ } while ((i = tkt_q_next_slot(i)) != start);
+
+ /* All the queues are in use, revert to spinning on the ticket lock. */
+ return false;
+}
+
+bool tkt_spin_pass(arch_spinlock_t *ap, struct __raw_tickets inc)
+{
+ if (unlikely(inc.head & 0x1)) {
+
+ /* This lock has a queue, so go spin on the queue. */
+ if (tkt_q_do_spin(ap, inc))
+ return true;
+
+ /* Get here if the queue is in transition: Retry next time. */
+
+ } else if (inc.tail - TKT_Q_SWITCH == inc.head) {
+
+ /*
+ * This lock has lots of spinners, but no queue.
+ * Go create a queue to spin on.
+ */
+ if (tkt_q_start_contend(ap, inc))
+ return true;
+
+ /* Get here if the queue is in transition: Retry next time. */
+ }
+
+ /* Either no need for a queue or the queue is in transition. Spin. */
+ return false;
+}
+EXPORT_SYMBOL(tkt_spin_pass);

2013-06-11 20:10:02

by Steven Rostedt

[permalink] [raw]
Subject: Re: [PATCH RFC ticketlock] Auto-queued ticketlock

On Tue, 2013-06-11 at 12:49 -0700, Paul E. McKenney wrote:

> +bool tkt_spin_pass(arch_spinlock_t *ap, struct __raw_tickets inc)
> +{
> + if (unlikely(inc.head & 0x1)) {
> +
> + /* This lock has a queue, so go spin on the queue. */
> + if (tkt_q_do_spin(ap, inc))
> + return true;
> +
> + /* Get here if the queue is in transition: Retry next time. */
> +

This looks better, but please add a comment, something along the lines of:

/*
* Only the TKT_Q_SWITCH waiter will set up the queue, to prevent
* a thundering herd of setups from occurring. It is still possible for
* more than one task to perform a setup: if the lock is released
* after this check, a waiter coming in may also match this test. But
* that's covered by the cmpxchg() setup in tkt_q_start_contend().
*/


> + } else if (inc.tail - TKT_Q_SWITCH == inc.head) {

Also shouldn't this be:

} else if ((__ticket_t)(inc.tail - TKT_Q_SWITCH) == inc.head) {

As TKT_Q_SWITCH doesn't have a type, I'm not sure how C will evaluate
this. I always screw type conversions up, and just add in the type casts
to be safe.

You could also give TKT_Q_SWITCH a type too.
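
To make the hazard concrete, here is a tiny stand-alone example (the u8
ticket type and the specific values are made up, with TKT_Q_SWITCH left
untyped as it is in the current patch):

#include <stdio.h>

typedef unsigned char __ticket_t;	/* the CONFIG_NR_CPUS < 128 case */
#define TKT_Q_SWITCH 16			/* untyped int, as in the patch */

int main(void)
{
	__ticket_t head = 244, tail = 4;	/* tail has wrapped past zero */

	/* Both operands promote to int: 4 - 16 is -12, never equal to 244. */
	printf("untyped:   %d\n", tail - TKT_Q_SWITCH == head);

	/* Truncating back to __ticket_t restores the modular comparison. */
	printf("with cast: %d\n", (__ticket_t)(tail - TKT_Q_SWITCH) == head);
	return 0;
}

So without the cast (or a typed TKT_Q_SWITCH), the designated waiter can
fail to notice that it is exactly TKT_Q_SWITCH past the head once the
counters wrap.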

-- Steve

> +
> + /*
> + * This lock has lots of spinners, but no queue.
> + * Go create a queue to spin on.
> + */
> + if (tkt_q_start_contend(ap, inc))
> + return true;
> +
> + /* Get here if the queue is in transition: Retry next time. */
> + }
> +
> + /* Either no need for a queue or the queue is in transition. Spin. */
> + return false;
> +}
> +EXPORT_SYMBOL(tkt_spin_pass);

2013-06-11 20:25:20

by Jason Low

[permalink] [raw]
Subject: Re: [PATCH RFC ticketlock] Auto-queued ticketlock

On Tue, Jun 11, 2013 at 12:49 PM, Paul E. McKenney
<[email protected]> wrote:
> On Tue, Jun 11, 2013 at 02:41:59PM -0400, Waiman Long wrote:
>> On 06/11/2013 12:36 PM, Paul E. McKenney wrote:
>> >
>> >>I am a bit concern about the size of the head queue table itself.
>> >>RHEL6, for example, had defined CONFIG_NR_CPUS to be 4096 which mean
>> >>a table size of 256. Maybe it is better to dynamically allocate the
>> >>table at init time depending on the actual number of CPUs in the
>> >>system.
>> >But if your kernel is built for 4096 CPUs, the 32*256=8192 bytes of memory
>> >is way down in the noise. Systems that care about that small an amount
>> >of memory probably have a small enough number of CPUs that they can just
>> >turn off queueing at build time using CONFIG_TICKET_LOCK_QUEUED=n, right?
>>
>> My concern is more about the latency on the table scan than the
>> actual memory that was used.
>>
>> >>>+/*
>> >>>+ * Return a pointer to the queue header associated with the specified lock,
>> >>>+ * or return NULL if there is no queue for the lock or if the lock's queue
>> >>>+ * is in transition.
>> >>>+ */
>> >>>+static struct tkt_q_head *tkt_q_find_head(arch_spinlock_t *asp)
>> >>>+{
>> >>>+ int i;
>> >>>+ int start;
>> >>>+
>> >>>+ start = i = tkt_q_hash(asp);
>> >>>+ do
>> >>>+ if (tkt_q_heads[i].ref == asp)
>> >>>+ return &tkt_q_heads[i];
>> >>>+ while ((i = tkt_q_next_slot(i)) != start);
>> >>>+ return NULL;
>> >>>+}
>> >>With a table size of 256 and you have to scan the whole table to
>> >>find the right head queue. This can be a significant overhead. I
>> >>will suggest setting a limiting of how many entries it scans before
>> >>it aborts rather than checking the whole table.
>> >But it will scan 256 entries only if there are 256 other locks in queued
>> >mode, which is -very- unlikely, even given 4096 CPUs. That said, if you
>> >show me that this results in a real latency problem on a real system,
>> >I would be happy to provide a way to limit the search.
>>
>> Looking at the code more carefully, the chance of actually scanning
>> 256 entries is very small. However, I now have some concern on the
>> way you set up the initial queue.
>>
>> +/*
>> + * Start handling a period of high contention by finding a queue to associate
>> + * with this lock. Returns true if successful (in which case we hold the
>> + * lock) and false otherwise (in which case we do -not- hold the lock).
>> + */
>> +bool tkt_q_start_contend(arch_spinlock_t *asp, struct __raw_tickets inc)
>> +{
>> + int i;
>> + int start;
>> +
>> + /* Hash the lock address to find a starting point. */
>> + start = i = tkt_q_hash(asp);
>> +
>> + /*
>> + * Each pass through the following loop attempts to associate
>> + * the lock with the corresponding queue.
>> + */
>> + do {
>> + /*
>> + * Use 0x1 to mark the queue in use, but also avoiding
>> + * any spinners trying to use it before we get it all
>> + * initialized.
>> + */
>> + if (cmpxchg(&tkt_q_heads[i].ref,
>> + NULL,
>> + (arch_spinlock_t *)0x1) == NULL) {
>> +
>> + /* Succeeded, now go initialize it. */
>> + return tkt_q_init_contend(i, asp, inc);
>> + }
>> +
>> + /* If someone beat us to it, go spin on their queue. */
>> + if (ACCESS_ONCE(asp->tickets.head) & 0x1)
>> + return tkt_q_do_spin(asp, inc);
>> + } while ((i = tkt_q_next_slot(i)) != start);
>> +
>> + /* All the queues are in use, revert to spinning on the ticket lock. */
>> + return false;
>> +}
>> +
>>
>> Unconditional cmpxchg() can be a source of high contention by
>> itself. Considering that 16 threads may be doing cmpxchg() more or
>> less simultaneously on the same cache line, it can cause a lot of
>> contention. It will be better if you check to see if tkt_q_heads[i]
>> is NULL first before doing cmpxchg.
>>
>> Another point is that the 16 threads maybe setting up the queues in
>> consecutive slots in the head table. This is both a source of
>> contention and a waste of effort. One possible solution is to add
>> one more field (set to cpuid + 1, for example) to indicate that that
>> setup is being done with asp set to the target lock address
>> immediately. We will need to use cmpxchg128() for 64-bit machine,
>> though. Another solution is to have only that thread with ticket
>> number that is a fixed distance from head (e.g. 16*2) to do the
>> queue setup while the rest wait until the setup is done before
>> spinning on the queue.
>>
>> As my colleague Davidlohr had reported there are more regressions
>> than performance improvement in the AIM7 benchmark. I believe that
>> queue setup contention is likely a source of performance regression.
>
> Please see below for a v3 patch that:
>
> 1. Fixes cpu_relax().
>
> 2. Tests before doing cmpxchg().
>
> 3. Reduces the number of CPUs attempting to set up the queue,
> in the common case, to a single CPU. (Multiple CPUs can
> still be trying to set up the queue given unfortunate
> sequences of concurrent ticket-lock handoffs.)
>
> Please let me know how it goes!
>
> Thanx, Paul
>
> ------------------------------------------------------------------------
>
> ticketlock: Add queued-ticketlock capability
>
> Breaking up locks is better than implementing high-contention locks, but
> if we must have high-contention locks, why not make them automatically
> switch between light-weight ticket locks at low contention and queued
> locks at high contention? After all, this would remove the need for
> the developer to predict which locks will be highly contended.
>
> This commit allows ticket locks to automatically switch between pure
> ticketlock and queued-lock operation as needed. If too many CPUs are
> spinning on a given ticket lock, a queue structure will be allocated
> and the lock will switch to queued-lock operation. When the lock becomes
> free, it will switch back into ticketlock operation. The low-order bit
> of the head counter is used to indicate that the lock is in queued mode,
> which forces an unconditional mismatch between the head and tail counters.
> This approach means that the common-case code path under conditions of
> low contention is very nearly that of a plain ticket lock.
>
> A fixed number of queueing structures is statically allocated in an
> array. The ticket-lock address is used to hash into an initial element,
> but if that element is already in use, it moves to the next element. If
> the entire array is already in use, continue to spin in ticket mode.
>
> Signed-off-by: Paul E. McKenney <[email protected]>
> [ paulmck: Eliminate duplicate code and update comments (Steven Rostedt). ]
> [ paulmck: Address Eric Dumazet review feedback. ]
> [ paulmck: Use Lai Jiangshan idea to eliminate smp_mb(). ]
> [ paulmck: Expand ->head_tkt from s32 to s64 (Waiman Long). ]
> [ paulmck: Move cpu_relax() to main spin loop (Steven Rostedt). ]
> [ paulmck: Reduce queue-switch contention (Waiman Long). ]
>
> diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h
> index 33692ea..509c51a 100644
> --- a/arch/x86/include/asm/spinlock.h
> +++ b/arch/x86/include/asm/spinlock.h
> @@ -34,6 +34,21 @@
> # define UNLOCK_LOCK_PREFIX
> #endif
>
> +#ifdef CONFIG_TICKET_LOCK_QUEUED
> +
> +#define __TKT_SPIN_INC 2
> +bool tkt_spin_pass(arch_spinlock_t *ap, struct __raw_tickets inc);
> +
> +#else /* #ifdef CONFIG_TICKET_LOCK_QUEUED */
> +
> +#define __TKT_SPIN_INC 1
> +static inline bool tkt_spin_pass(arch_spinlock_t *ap, struct __raw_tickets inc)
> +{
> + return false;
> +}
> +
> +#endif /* #else #ifdef CONFIG_TICKET_LOCK_QUEUED */
> +
> /*
> * Ticket locks are conceptually two parts, one indicating the current head of
> * the queue, and the other indicating the current tail. The lock is acquired
> @@ -49,17 +64,16 @@
> */
> static __always_inline void __ticket_spin_lock(arch_spinlock_t *lock)
> {
> - register struct __raw_tickets inc = { .tail = 1 };
> + register struct __raw_tickets inc = { .tail = __TKT_SPIN_INC };
>
> inc = xadd(&lock->tickets, inc);
> -
> for (;;) {
> - if (inc.head == inc.tail)
> + if (inc.head == inc.tail || tkt_spin_pass(lock, inc))
> break;
> cpu_relax();
> inc.head = ACCESS_ONCE(lock->tickets.head);
> }
> - barrier(); /* make sure nothing creeps before the lock is taken */
> + barrier(); /* Make sure nothing creeps in before the lock is taken. */
> }
>
> static __always_inline int __ticket_spin_trylock(arch_spinlock_t *lock)
> @@ -70,17 +84,37 @@ static __always_inline int __ticket_spin_trylock(arch_spinlock_t *lock)
> if (old.tickets.head != old.tickets.tail)
> return 0;
>
> +#ifndef CONFIG_TICKET_LOCK_QUEUED
> new.head_tail = old.head_tail + (1 << TICKET_SHIFT);
> +#else /* #ifndef CONFIG_TICKET_LOCK_QUEUED */
> + new.head_tail = old.head_tail + (2 << TICKET_SHIFT);
> +#endif /* #else #ifndef CONFIG_TICKET_LOCK_QUEUED */
>
> /* cmpxchg is a full barrier, so nothing can move before it */
> return cmpxchg(&lock->head_tail, old.head_tail, new.head_tail) == old.head_tail;
> }
>
> +#ifndef CONFIG_TICKET_LOCK_QUEUED
> +
> static __always_inline void __ticket_spin_unlock(arch_spinlock_t *lock)
> {
> __add(&lock->tickets.head, 1, UNLOCK_LOCK_PREFIX);
> }
>
> +#else /* #ifndef CONFIG_TICKET_LOCK_QUEUED */
> +
> +extern void tkt_q_do_wake(arch_spinlock_t *lock);
> +
> +static __always_inline void __ticket_spin_unlock(arch_spinlock_t *lock)
> +{
> + __ticket_t head = 2;
> +
> + head = xadd(&lock->tickets.head, head);
> + if (head & 0x1)
> + tkt_q_do_wake(lock);
> +}
> +#endif /* #else #ifndef CONFIG_TICKET_LOCK_QUEUED */
> +
> static inline int __ticket_spin_is_locked(arch_spinlock_t *lock)
> {
> struct __raw_tickets tmp = ACCESS_ONCE(lock->tickets);
> diff --git a/arch/x86/include/asm/spinlock_types.h b/arch/x86/include/asm/spinlock_types.h
> index ad0ad07..cdaefdd 100644
> --- a/arch/x86/include/asm/spinlock_types.h
> +++ b/arch/x86/include/asm/spinlock_types.h
> @@ -7,12 +7,18 @@
>
> #include <linux/types.h>
>
> -#if (CONFIG_NR_CPUS < 256)
> +#if (CONFIG_NR_CPUS < 128)
> typedef u8 __ticket_t;
> typedef u16 __ticketpair_t;
> -#else
> +#define TICKET_T_CMP_GE(a, b) (UCHAR_MAX / 2 >= (unsigned char)((a) - (b)))
> +#elif (CONFIG_NR_CPUS < 32768)
> typedef u16 __ticket_t;
> typedef u32 __ticketpair_t;
> +#define TICKET_T_CMP_GE(a, b) (USHRT_MAX / 2 >= (unsigned short)((a) - (b)))
> +#else
> +typedef u32 __ticket_t;
> +typedef u64 __ticketpair_t;
> +#define TICKET_T_CMP_GE(a, b) (UINT_MAX / 2 >= (unsigned int)((a) - (b)))
> #endif
>
> #define TICKET_SHIFT (sizeof(__ticket_t) * 8)
> @@ -21,7 +27,11 @@ typedef struct arch_spinlock {
> union {
> __ticketpair_t head_tail;
> struct __raw_tickets {
> +#ifdef __BIG_ENDIAN__
> + __ticket_t tail, head;
> +#else /* #ifdef __BIG_ENDIAN__ */
> __ticket_t head, tail;
> +#endif /* #else #ifdef __BIG_ENDIAN__ */
> } tickets;
> };
> } arch_spinlock_t;
> diff --git a/include/linux/kernel.h b/include/linux/kernel.h
> index e9ef6d6..816a87c 100644
> --- a/include/linux/kernel.h
> +++ b/include/linux/kernel.h
> @@ -15,6 +15,7 @@
> #include <asm/byteorder.h>
> #include <uapi/linux/kernel.h>
>
> +#define UCHAR_MAX ((u8)(~0U))
> #define USHRT_MAX ((u16)(~0U))
> #define SHRT_MAX ((s16)(USHRT_MAX>>1))
> #define SHRT_MIN ((s16)(-SHRT_MAX - 1))
> diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
> index 44511d1..900c0f0 100644
> --- a/kernel/Kconfig.locks
> +++ b/kernel/Kconfig.locks
> @@ -223,3 +223,38 @@ endif
> config MUTEX_SPIN_ON_OWNER
> def_bool y
> depends on SMP && !DEBUG_MUTEXES
> +
> +config TICKET_LOCK_QUEUED
> + bool "Dynamically switch between ticket and queued locking"
> + depends on SMP
> + default n
> + ---help---
> + Enable dynamic switching between ticketlock and queued locking
> + on a per-lock basis. This option will slow down low-contention
> + acquisition and release very slightly (additional conditional
> + in release path), but will provide more efficient operation at
> + high levels of lock contention. High-contention operation will
> + not be quite as efficient as would be a pure queued lock, but
> + this dynamic approach consumes less memory than queued locks
> + and also runs faster at low levels of contention.
> +
> + Say "Y" if you are running on a large system with a workload
> + that is likely to result in high levels of contention.
> +
> + Say "N" if you are unsure.
> +
> +config TICKET_LOCK_QUEUED_SWITCH
> + int "When to switch from ticket to queued locking"
> + depends on TICKET_LOCK_QUEUED
> + default 8
> + range 3 32
> + ---help---
> + Specify how many tasks should be spinning on the lock before
> + switching to queued mode. Systems with low-latency memory/cache
> + interconnects will prefer larger numbers, while extreme low-latency
> + and real-time workloads will prefer a smaller number. Of course,
> + extreme real-time workloads would be even happier if contention
> + on the locks were reduced to the point that there was never any
> + need for queued locking in the first place.
> +
> + Take the default if you are unsure.
> diff --git a/kernel/Makefile b/kernel/Makefile
> index 271fd31..70a91f7 100644
> --- a/kernel/Makefile
> +++ b/kernel/Makefile
> @@ -51,6 +51,7 @@ endif
> obj-$(CONFIG_SMP) += spinlock.o
> obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
> obj-$(CONFIG_PROVE_LOCKING) += spinlock.o
> +obj-$(CONFIG_TICKET_LOCK_QUEUED) += tktqlock.o
> obj-$(CONFIG_UID16) += uid16.o
> obj-$(CONFIG_MODULES) += module.o
> obj-$(CONFIG_MODULE_SIG) += module_signing.o modsign_pubkey.o modsign_certificate.o
> diff --git a/kernel/tktqlock.c b/kernel/tktqlock.c
> new file mode 100644
> index 0000000..9f03af0
> --- /dev/null
> +++ b/kernel/tktqlock.c
> @@ -0,0 +1,369 @@
> +/*
> + * Queued ticket spinlocks.
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program; if not, write to the Free Software
> + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
> + *
> + * Copyright IBM Corporation, 2013
> + *
> + * Authors: Paul E. McKenney <[email protected]>
> + */
> +#include <linux/types.h>
> +#include <linux/kernel.h>
> +#include <linux/spinlock.h>
> +#include <linux/smp.h>
> +#include <linux/percpu.h>
> +
> +struct tkt_q {
> + int cpu;
> + __ticket_t tail;
> + struct tkt_q *next;
> +};
> +
> +struct tkt_q_head {
> + arch_spinlock_t *ref; /* Pointer to spinlock if in use. */
> + s64 head_tkt; /* Head ticket when started queuing. */
> + struct tkt_q *spin; /* Head of queue. */
> + struct tkt_q **spin_tail; /* Tail of queue. */
> +};
> +
> +/*
> + * TKT_Q_SWITCH is twice the number of CPUs that must be spinning on a
> + * given ticket lock to motivate switching to spinning on a queue.
> + * The reason that it is twice the number is because the bottom bit of
> + * the ticket is reserved for the bit that indicates that a queue is
> + * associated with the lock.
> + */
> +#define TKT_Q_SWITCH (CONFIG_TICKET_LOCK_QUEUED_SWITCH * 2)
> +
> +/*
> + * TKT_Q_NQUEUES is the number of queues to maintain. Large systems
> + * might have multiple highly contended locks, so provide more queues for
> + * systems with larger numbers of CPUs.
> + */
> +#define TKT_Q_NQUEUES (DIV_ROUND_UP(NR_CPUS + TKT_Q_SWITCH - 1, TKT_Q_SWITCH) * 2)
> +
> +/* The queues themselves. */
> +struct tkt_q_head tkt_q_heads[TKT_Q_NQUEUES];
> +
> +/* Advance to the next queue slot, wrapping around to the beginning. */
> +static int tkt_q_next_slot(int i)
> +{
> + return (++i < TKT_Q_NQUEUES) ? i : 0;
> +}
> +
> +/* Very crude hash from lock address to queue slot number. */
> +static unsigned long tkt_q_hash(arch_spinlock_t *lock)
> +{
> + return (((unsigned long)lock) >> 8) % TKT_Q_NQUEUES;
> +}
> +
> +/*
> + * Return a pointer to the queue header associated with the specified lock,
> + * or return NULL if there is no queue for the lock or if the lock's queue
> + * is in transition.
> + */
> +static struct tkt_q_head *tkt_q_find_head(arch_spinlock_t *lock)
> +{
> + int i;
> + int start;
> +
> + start = i = tkt_q_hash(lock);
> + do
> + if (tkt_q_heads[i].ref == lock)
> + return &tkt_q_heads[i];
> + while ((i = tkt_q_next_slot(i)) != start);
> + return NULL;
> +}
> +
> +/*
> + * Try to stop queuing, reverting back to normal ticket-lock operation.
> + * We can only stop queuing when the queue is empty, which means that
> + * we need to correctly handle races where someone shows up in the queue
> + * just as we are trying to dispense with the queue. They win, we lose.
> + */
> +static bool tkt_q_try_unqueue(arch_spinlock_t *lock, struct tkt_q_head *tqhp)
> +{
> + arch_spinlock_t asold;
> + arch_spinlock_t asnew;
> +
> + /* Pick up the ticket values. */
> + asold = ACCESS_ONCE(*lock);
> + if ((asold.tickets.head & ~0x1) == asold.tickets.tail) {
> +
> + /* Attempt to mark the lock as not having a queue. */
> + asnew = asold;
> + asnew.tickets.head &= ~0x1;
> + if (cmpxchg(&lock->head_tail,
> + asold.head_tail,
> + asnew.head_tail) == asold.head_tail) {
> +
> + /* Succeeded, mark the queue as unused. */
> + ACCESS_ONCE(tqhp->ref) = NULL;
> + return true;
> + }
> + }
> +
> + /* Failed, tell the caller there is still a queue to pass off to. */
> + return false;
> +}
> +
> +/*
> + * Hand the lock off to the first CPU on the queue.
> + */
> +void tkt_q_do_wake(arch_spinlock_t *lock)
> +{
> + struct tkt_q_head *tqhp;
> + struct tkt_q *tqp;
> +
> + /* If the queue is still being set up, wait for it. */
> + while ((tqhp = tkt_q_find_head(lock)) == NULL)
> + cpu_relax();
> +
> + for (;;) {
> +
> + /* Find the first queue element. */
> + tqp = ACCESS_ONCE(tqhp->spin);
> + if (tqp != NULL)
> + break; /* Element exists, hand off lock. */
> + if (tkt_q_try_unqueue(lock, tqhp))
> + return; /* No element, successfully removed queue. */
> + cpu_relax();
> + }
> + if (ACCESS_ONCE(tqhp->head_tkt) != -1)
> + ACCESS_ONCE(tqhp->head_tkt) = -1;
> + smp_mb(); /* Order pointer fetch and assignment against handoff. */
> + ACCESS_ONCE(tqp->cpu) = -1;
> +}
> +EXPORT_SYMBOL(tkt_q_do_wake);
> +
> +/*
> + * Given a lock that already has a queue associated with it, spin on
> + * that queue. Return false if there was no queue (which means we do not
> + * hold the lock) and true otherwise (meaning we -do- hold the lock).
> + */
> +bool tkt_q_do_spin(arch_spinlock_t *lock, struct __raw_tickets inc)
> +{
> + struct tkt_q **oldtail;
> + struct tkt_q tq;
> + struct tkt_q_head *tqhp;
> +
> + /*
> + * Ensure that accesses to queue header happen after sensing
> + * the lock's have-queue bit.
> + */
> + smp_mb(); /* See above block comment. */
> +
> + /* If there no longer is a queue, leave. */
> + tqhp = tkt_q_find_head(lock);
> + if (tqhp == NULL)
> + return false;
> +
> + /* Initialize our queue element. */
> + tq.cpu = raw_smp_processor_id();
> + tq.tail = inc.tail;
> + tq.next = NULL;
> +
> + /* Check to see if we already hold the lock. */
> + if (ACCESS_ONCE(tqhp->head_tkt) == inc.tail) {
> + /* The last holder left before queue formed, we hold lock. */
> + tqhp->head_tkt = -1;
> + return true;
> + }
> +
> + /*
> + * Add our element to the tail of the queue. Note that if the
> + * queue is empty, the ->spin_tail pointer will reference
> + * the queue's head pointer, namely ->spin.
> + */
> + oldtail = xchg(&tqhp->spin_tail, &tq.next);
> + ACCESS_ONCE(*oldtail) = &tq;
> +
> + /* Spin until handoff. */
> + while (ACCESS_ONCE(tq.cpu) != -1)
> + cpu_relax();
> +
> + /*
> + * Remove our element from the queue. If the queue is now empty,
> + * update carefully so that the next acquisition will enqueue itself
> + * at the head of the list. Of course, the next enqueue operation
> + * might be happening concurrently, and this code needs to handle all
> + * of the possible combinations, keeping in mind that the enqueue
> + * operation happens in two stages: (1) update the tail pointer and
> + * (2) update the predecessor's ->next pointer. With this in mind,
> + * the following code needs to deal with three scenarios:
> + *
> + * 1. tq is the last entry. In this case, we use cmpxchg to
> + * point the list tail back to the list head (->spin). If
> + * the cmpxchg fails, that indicates that we are instead
> + * in scenario 2 below. If the cmpxchg succeeds, the next
> + * enqueue operation's tail-pointer exchange will enqueue
> + * the next element at the queue head, because the ->spin_tail
> + * pointer now references the queue head.
> + *
> + * 2. tq is the last entry, and the next entry has updated the
> + * tail pointer but has not yet updated tq.next. In this
> + * case, tq.next is NULL, the cmpxchg will fail, and the
> + * code will wait for the enqueue to complete before completing
> + * removal of tq from the list.
> + *
> + * 3. tq is not the last pointer. In this case, tq.next is non-NULL,
> + * so the following code simply removes tq from the list.
> + */
> + if (tq.next == NULL) {
> +
> + /* Mark the queue empty. */
> + tqhp->spin = NULL;
> +
> + /* Try to point the tail back at the head. */
> + if (cmpxchg(&tqhp->spin_tail,
> + &tq.next,
> + &tqhp->spin) == &tq.next)
> + return true; /* Succeeded, queue is now empty. */
> +
> + /* Failed, if needed, wait for the enqueue to complete. */
> + while (tq.next == NULL)
> + cpu_relax();
> +
> + /* The following code will repair the head. */
> + }
> + smp_mb(); /* Force ordering between handoff and critical section. */
> +
> + /*
> + * Advance list-head pointer. This same task will be the next to
> + * access this when releasing the lock, so no need for a memory
> + * barrier after the following assignment.
> + */
> + ACCESS_ONCE(tqhp->spin) = tq.next;
> + return true;
> +}
> +
> +/*
> + * Given a lock that does not have a queue, attempt to associate the
> + * i-th queue with it, returning true if successful (meaning we hold
> + * the lock) or false otherwise (meaning we do -not- hold the lock).
> + * Note that the caller has already filled in ->ref with 0x1, so we
> + * own the queue.
> + */
> +static bool
> +tkt_q_init_contend(int i, arch_spinlock_t *lock, struct __raw_tickets inc)
> +{
> + arch_spinlock_t asold;
> + arch_spinlock_t asnew;
> + struct tkt_q_head *tqhp;
> +
> + /* Initialize the i-th queue header. */
> + tqhp = &tkt_q_heads[i];
> + tqhp->spin = NULL;
> + tqhp->spin_tail = &tqhp->spin;
> +
> + /* Each pass through this loop attempts to mark the lock as queued. */
> + do {
> + asold.head_tail = ACCESS_ONCE(lock->head_tail);
> + asnew = asold;
> + if (asnew.tickets.head & 0x1) {
> +
> + /* Someone beat us to it, back out. */
> + smp_mb();
> + ACCESS_ONCE(tqhp->ref) = NULL;
> +
> + /* Spin on the queue element they set up. */
> + return tkt_q_do_spin(lock, inc);
> + }
> +
> + /*
> + * Record the head counter in case one of the spinning
> + * CPUs already holds the lock but doesn't realize it yet.
> + */
> + tqhp->head_tkt = asold.tickets.head;
> +
> + /* The low-order bit in the head counter says "queued". */
> + asnew.tickets.head |= 0x1;
> + } while (cmpxchg(&lock->head_tail,
> + asold.head_tail,
> + asnew.head_tail) != asold.head_tail);
> +
> + /* Point the queue at the lock and go spin on it. */
> + ACCESS_ONCE(tqhp->ref) = lock;
> + return tkt_q_do_spin(lock, inc);
> +}
> +
> +/*
> + * Start handling a period of high contention by finding a queue to associate
> + * with this lock. Returns true if successful (in which case we hold the
> + * lock) and false otherwise (in which case we do -not- hold the lock).
> + */
> +bool tkt_q_start_contend(arch_spinlock_t *lock, struct __raw_tickets inc)
> +{
> + int i;
> + int start;
> +
> + /* Hash the lock address to find a starting point. */
> + start = i = tkt_q_hash(lock);
> +
> + /*
> + * Each pass through the following loop attempts to associate
> + * the lock with the corresponding queue.
> + */
> + do {
> + /*
> + * Use 0x1 to mark the queue in use, but also avoiding
> + * any spinners trying to use it before we get it all
> + * initialized.
> + */
> + if (tkt_q_heads[i].ref)
> + continue;
> + if (cmpxchg(&tkt_q_heads[i].ref,
> + NULL,
> + (arch_spinlock_t *)0x1) == NULL) {

Hi Paul,

Would it be better to do the check like this before the cmpxchg in
order to keep the logic the same?

if (!tkt_q_heads[i].ref &&
cmpxchg(&tkt_q_heads[i].ref,
NULL,
(arch_spinlock_t *)0x1) == NULL)

Thanks,
Jason

2013-06-11 20:33:18

by Paul E. McKenney

[permalink] [raw]
Subject: Re: [PATCH RFC ticketlock] Auto-queued ticketlock

On Tue, Jun 11, 2013 at 04:09:56PM -0400, Steven Rostedt wrote:
> On Tue, 2013-06-11 at 12:49 -0700, Paul E. McKenney wrote:
>
> > +bool tkt_spin_pass(arch_spinlock_t *ap, struct __raw_tickets inc)
> > +{
> > + if (unlikely(inc.head & 0x1)) {
> > +
> > + /* This lock has a queue, so go spin on the queue. */
> > + if (tkt_q_do_spin(ap, inc))
> > + return true;
> > +
> > + /* Get here if the queue is in transition: Retry next time. */
> > +
>
> This looks better, but please add a comment, something to the likes of:
>
> /*
> * Only the TKT_Q_SWITCH waiter will set up the queue to prevent
> * a thundering herd of setups to occur. It is still possible for
> * more than one task to perform a setup if the lock is released
> * after this check, a waiter coming in may also match this test. But
> * that's covered by the cmpxchg() setup in tkt_q_start_contend.
> */
>
> > + } else if (inc.tail - TKT_Q_SWITCH == inc.head) {
>
> Also shouldn't this be:
>
> } else if ((__ticket_t)(inc.tail - TKT_Q_SWITCH) == inc.head) {

Good points on the comment; here is what I currently have:

} else if (inc.tail - TKT_Q_SWITCH == inc.head) {

/*
* This lock has lots of spinners, but no queue. Go create
* a queue to spin on.
*
* In the common case, only the single task that
* sees the head and tail tickets being different by
* exactly TKT_Q_SWITCH will come here to set up the queue,
* which prevents a "thundering herd" of queue setups.
* Although it is still possible for an unfortunate series
* of lock handoffs and newly arrived tasks to result
* in more than one task performing a queue setup, this
* is unlikely. Of course, this situation must still be
* handled correctly, which is the job of the cmpxchg()
* in tkt_q_start_contend().
*/
if (tkt_q_start_contend(ap, inc))
return true;

Does that help?

> As TKT_Q_SWITCH doesn't have a type, I'm not sure how C will evaluate
> this. I always screw type conversions up, and just add in the type casts
> to be safe.
>
> You could also give TKT_Q_SWITCH a type too.

This is an excellent point as well -- things might well get confused.
My solution was to take your last suggestion and give TKT_Q_SWITCH the
same type as inc.tail and inc.head, and also apply type-safety paranoia
to TKT_Q_NQUEUES:

/*
* TKT_Q_SWITCH is twice the number of CPUs that must be spinning on a
* given ticket lock to motivate switching to spinning on a queue.
* The reason that it is twice the number is because the bottom bit of
* the ticket is reserved for the bit that indicates that a queue is
* associated with the lock.
*/
#define TKT_Q_SWITCH ((__ticket_t)(CONFIG_TICKET_LOCK_QUEUED_SWITCH * 2))

/*
* TKT_Q_NQUEUES is the number of queues to maintain. Large systems
* might have multiple highly contended locks, so provide more queues for
* systems with larger numbers of CPUs.
*/
#define TKT_Q_NQUEUES (2 * DIV_ROUND_UP(NR_CPUS + ((int)TKT_Q_SWITCH) - 1, \
(int)TKT_Q_SWITCH))

Does that look OK? (The limits on the value of TKT_Q_SWITCH should avoid
signed integer overflow.)
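
For example, with NR_CPUS=4096 and the default CONFIG_TICKET_LOCK_QUEUED_SWITCH
of 8 (numbers chosen only for illustration), these work out to:

	TKT_Q_SWITCH  = (__ticket_t)(8 * 2) = 16
	TKT_Q_NQUEUES = 2 * DIV_ROUND_UP(4096 + 16 - 1, 16)
	              = 2 * 257 = 514

or roughly 16KB of tkt_q_head structures at 32 bytes apiece.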

Thanx, Paul

> -- Steve
>
> > +
> > + /*
> > + * This lock has lots of spinners, but no queue.
> > + * Go create a queue to spin on.
> > + */
> > + if (tkt_q_start_contend(ap, inc))
> > + return true;
> > +
> > + /* Get here if the queue is in transition: Retry next time. */
> > + }
> > +
> > + /* Either no need for a queue or the queue is in transition. Spin. */
> > + return false;
> > +}
> > +EXPORT_SYMBOL(tkt_spin_pass);
>
>

2013-06-11 20:41:12

by Paul E. McKenney

[permalink] [raw]
Subject: Re: [PATCH RFC ticketlock] Auto-queued ticketlock

On Tue, Jun 11, 2013 at 01:25:15PM -0700, Jason Low wrote:
> On Tue, Jun 11, 2013 at 12:49 PM, Paul E. McKenney
> <[email protected]> wrote:
> > On Tue, Jun 11, 2013 at 02:41:59PM -0400, Waiman Long wrote:
> >> On 06/11/2013 12:36 PM, Paul E. McKenney wrote:
> >> >
> >> >>I am a bit concern about the size of the head queue table itself.
> >> >>RHEL6, for example, had defined CONFIG_NR_CPUS to be 4096 which mean
> >> >>a table size of 256. Maybe it is better to dynamically allocate the
> >> >>table at init time depending on the actual number of CPUs in the
> >> >>system.
> >> >But if your kernel is built for 4096 CPUs, the 32*256=8192 bytes of memory
> >> >is way down in the noise. Systems that care about that small an amount
> >> >of memory probably have a small enough number of CPUs that they can just
> >> >turn off queueing at build time using CONFIG_TICKET_LOCK_QUEUED=n, right?
> >>
> >> My concern is more about the latency on the table scan than the
> >> actual memory that was used.
> >>
> >> >>>+/*
> >> >>>+ * Return a pointer to the queue header associated with the specified lock,
> >> >>>+ * or return NULL if there is no queue for the lock or if the lock's queue
> >> >>>+ * is in transition.
> >> >>>+ */
> >> >>>+static struct tkt_q_head *tkt_q_find_head(arch_spinlock_t *asp)
> >> >>>+{
> >> >>>+ int i;
> >> >>>+ int start;
> >> >>>+
> >> >>>+ start = i = tkt_q_hash(asp);
> >> >>>+ do
> >> >>>+ if (tkt_q_heads[i].ref == asp)
> >> >>>+ return &tkt_q_heads[i];
> >> >>>+ while ((i = tkt_q_next_slot(i)) != start);
> >> >>>+ return NULL;
> >> >>>+}
> >> >>With a table size of 256 and you have to scan the whole table to
> >> >>find the right head queue. This can be a significant overhead. I
> >> >>will suggest setting a limiting of how many entries it scans before
> >> >>it aborts rather than checking the whole table.
> >> >But it will scan 256 entries only if there are 256 other locks in queued
> >> >mode, which is -very- unlikely, even given 4096 CPUs. That said, if you
> >> >show me that this results in a real latency problem on a real system,
> >> >I would be happy to provide a way to limit the search.
> >>
> >> Looking at the code more carefully, the chance of actually scanning
> >> 256 entries is very small. However, I now have some concern on the
> >> way you set up the initial queue.
> >>
> >> +/*
> >> + * Start handling a period of high contention by finding a queue to associate
> >> + * with this lock. Returns true if successful (in which case we hold the
> >> + * lock) and false otherwise (in which case we do -not- hold the lock).
> >> + */
> >> +bool tkt_q_start_contend(arch_spinlock_t *asp, struct __raw_tickets inc)
> >> +{
> >> + int i;
> >> + int start;
> >> +
> >> + /* Hash the lock address to find a starting point. */
> >> + start = i = tkt_q_hash(asp);
> >> +
> >> + /*
> >> + * Each pass through the following loop attempts to associate
> >> + * the lock with the corresponding queue.
> >> + */
> >> + do {
> >> + /*
> >> + * Use 0x1 to mark the queue in use, but also avoiding
> >> + * any spinners trying to use it before we get it all
> >> + * initialized.
> >> + */
> >> + if (cmpxchg(&tkt_q_heads[i].ref,
> >> + NULL,
> >> + (arch_spinlock_t *)0x1) == NULL) {
> >> +
> >> + /* Succeeded, now go initialize it. */
> >> + return tkt_q_init_contend(i, asp, inc);
> >> + }
> >> +
> >> + /* If someone beat us to it, go spin on their queue. */
> >> + if (ACCESS_ONCE(asp->tickets.head) & 0x1)
> >> + return tkt_q_do_spin(asp, inc);
> >> + } while ((i = tkt_q_next_slot(i)) != start);
> >> +
> >> + /* All the queues are in use, revert to spinning on the ticket lock. */
> >> + return false;
> >> +}
> >> +
> >>
> >> Unconditional cmpxchg() can be a source of high contention by
> >> itself. Considering that 16 threads may be doing cmpxchg() more or
> >> less simultaneously on the same cache line, it can cause a lot of
> >> contention. It will be better if you check to see if tkt_q_heads[i]
> >> is NULL first before doing cmpxchg.
> >>
> >> Another point is that the 16 threads may be setting up the queues in
> >> consecutive slots in the head table. This is both a source of
> >> contention and a waste of effort. One possible solution is to add
> >> one more field (set to cpuid + 1, for example) to indicate that that
> >> setup is being done with asp set to the target lock address
> >> immediately. We will need to use cmpxchg128() for 64-bit machines,
> >> though. Another solution is to have only the thread whose ticket
> >> number is a fixed distance from the head (e.g. 16*2) do the
> >> queue setup, while the rest wait until the setup is done before
> >> spinning on the queue.
> >>
> >> As my colleague Davidlohr reported, there are more regressions
> >> than performance improvements in the AIM7 benchmark. I believe that
> >> queue setup contention is likely a source of performance regression.
> >
> > Please see below for a v3 patch that:
> >
> > 1. Fixes cpu_relax().
> >
> > 2. Tests before doing cmpxchg().
> >
> > 3. Reduces the number of CPUs attempting to set up the queue,
> > in the common case, to a single CPU. (Multiple CPUs can
> > still be trying to set up the queue given unfortunate
> > sequences of concurrent ticket-lock handoffs.)
> >
> > Please let me know how it goes!
> >
> > Thanx, Paul
> >
> > ------------------------------------------------------------------------
> >
> > ticketlock: Add queued-ticketlock capability
> >
> > Breaking up locks is better than implementing high-contention locks, but
> > if we must have high-contention locks, why not make them automatically
> > switch between light-weight ticket locks at low contention and queued
> > locks at high contention? After all, this would remove the need for
> > the developer to predict which locks will be highly contended.
> >
> > This commit allows ticket locks to automatically switch between pure
> > ticketlock and queued-lock operation as needed. If too many CPUs are
> > spinning on a given ticket lock, a queue structure will be allocated
> > and the lock will switch to queued-lock operation. When the lock becomes
> > free, it will switch back into ticketlock operation. The low-order bit
> > of the head counter is used to indicate that the lock is in queued mode,
> > which forces an unconditional mismatch between the head and tail counters.
> > This approach means that the common-case code path under conditions of
> > low contention is very nearly that of a plain ticket lock.
> >
> > A fixed number of queueing structures is statically allocated in an
> > array. The ticket-lock address is used to hash into an initial element,
> > but if that element is already in use, it moves to the next element. If
> > the entire array is already in use, continue to spin in ticket mode.
> >
> > Signed-off-by: Paul E. McKenney <[email protected]>
> > [ paulmck: Eliminate duplicate code and update comments (Steven Rostedt). ]
> > [ paulmck: Address Eric Dumazet review feedback. ]
> > [ paulmck: Use Lai Jiangshan idea to eliminate smp_mb(). ]
> > [ paulmck: Expand ->head_tkt from s32 to s64 (Waiman Long). ]
> > [ paulmck: Move cpu_relax() to main spin loop (Steven Rostedt). ]
> > [ paulmck: Reduce queue-switch contention (Waiman Long). ]
> >
> > diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h
> > index 33692ea..509c51a 100644
> > --- a/arch/x86/include/asm/spinlock.h
> > +++ b/arch/x86/include/asm/spinlock.h
> > @@ -34,6 +34,21 @@
> > # define UNLOCK_LOCK_PREFIX
> > #endif
> >
> > +#ifdef CONFIG_TICKET_LOCK_QUEUED
> > +
> > +#define __TKT_SPIN_INC 2
> > +bool tkt_spin_pass(arch_spinlock_t *ap, struct __raw_tickets inc);
> > +
> > +#else /* #ifdef CONFIG_TICKET_LOCK_QUEUED */
> > +
> > +#define __TKT_SPIN_INC 1
> > +static inline bool tkt_spin_pass(arch_spinlock_t *ap, struct __raw_tickets inc)
> > +{
> > + return false;
> > +}
> > +
> > +#endif /* #else #ifdef CONFIG_TICKET_LOCK_QUEUED */
> > +
> > /*
> > * Ticket locks are conceptually two parts, one indicating the current head of
> > * the queue, and the other indicating the current tail. The lock is acquired
> > @@ -49,17 +64,16 @@
> > */
> > static __always_inline void __ticket_spin_lock(arch_spinlock_t *lock)
> > {
> > - register struct __raw_tickets inc = { .tail = 1 };
> > + register struct __raw_tickets inc = { .tail = __TKT_SPIN_INC };
> >
> > inc = xadd(&lock->tickets, inc);
> > -
> > for (;;) {
> > - if (inc.head == inc.tail)
> > + if (inc.head == inc.tail || tkt_spin_pass(lock, inc))
> > break;
> > cpu_relax();
> > inc.head = ACCESS_ONCE(lock->tickets.head);
> > }
> > - barrier(); /* make sure nothing creeps before the lock is taken */
> > + barrier(); /* Make sure nothing creeps in before the lock is taken. */
> > }
> >
> > static __always_inline int __ticket_spin_trylock(arch_spinlock_t *lock)
> > @@ -70,17 +84,37 @@ static __always_inline int __ticket_spin_trylock(arch_spinlock_t *lock)
> > if (old.tickets.head != old.tickets.tail)
> > return 0;
> >
> > +#ifndef CONFIG_TICKET_LOCK_QUEUED
> > new.head_tail = old.head_tail + (1 << TICKET_SHIFT);
> > +#else /* #ifndef CONFIG_TICKET_LOCK_QUEUED */
> > + new.head_tail = old.head_tail + (2 << TICKET_SHIFT);
> > +#endif /* #else #ifndef CONFIG_TICKET_LOCK_QUEUED */
> >
> > /* cmpxchg is a full barrier, so nothing can move before it */
> > return cmpxchg(&lock->head_tail, old.head_tail, new.head_tail) == old.head_tail;
> > }
> >
> > +#ifndef CONFIG_TICKET_LOCK_QUEUED
> > +
> > static __always_inline void __ticket_spin_unlock(arch_spinlock_t *lock)
> > {
> > __add(&lock->tickets.head, 1, UNLOCK_LOCK_PREFIX);
> > }
> >
> > +#else /* #ifndef CONFIG_TICKET_LOCK_QUEUED */
> > +
> > +extern void tkt_q_do_wake(arch_spinlock_t *lock);
> > +
> > +static __always_inline void __ticket_spin_unlock(arch_spinlock_t *lock)
> > +{
> > + __ticket_t head = 2;
> > +
> > + head = xadd(&lock->tickets.head, head);
> > + if (head & 0x1)
> > + tkt_q_do_wake(lock);
> > +}
> > +#endif /* #else #ifndef CONFIG_TICKET_LOCK_QUEUED */
> > +
> > static inline int __ticket_spin_is_locked(arch_spinlock_t *lock)
> > {
> > struct __raw_tickets tmp = ACCESS_ONCE(lock->tickets);
> > diff --git a/arch/x86/include/asm/spinlock_types.h b/arch/x86/include/asm/spinlock_types.h
> > index ad0ad07..cdaefdd 100644
> > --- a/arch/x86/include/asm/spinlock_types.h
> > +++ b/arch/x86/include/asm/spinlock_types.h
> > @@ -7,12 +7,18 @@
> >
> > #include <linux/types.h>
> >
> > -#if (CONFIG_NR_CPUS < 256)
> > +#if (CONFIG_NR_CPUS < 128)
> > typedef u8 __ticket_t;
> > typedef u16 __ticketpair_t;
> > -#else
> > +#define TICKET_T_CMP_GE(a, b) (UCHAR_MAX / 2 >= (unsigned char)((a) - (b)))
> > +#elif (CONFIG_NR_CPUS < 32768)
> > typedef u16 __ticket_t;
> > typedef u32 __ticketpair_t;
> > +#define TICKET_T_CMP_GE(a, b) (USHRT_MAX / 2 >= (unsigned short)((a) - (b)))
> > +#else
> > +typedef u32 __ticket_t;
> > +typedef u64 __ticketpair_t;
> > +#define TICKET_T_CMP_GE(a, b) (UINT_MAX / 2 >= (unsigned int)((a) - (b)))
> > #endif
> >
> > #define TICKET_SHIFT (sizeof(__ticket_t) * 8)
> > @@ -21,7 +27,11 @@ typedef struct arch_spinlock {
> > union {
> > __ticketpair_t head_tail;
> > struct __raw_tickets {
> > +#ifdef __BIG_ENDIAN__
> > + __ticket_t tail, head;
> > +#else /* #ifdef __BIG_ENDIAN__ */
> > __ticket_t head, tail;
> > +#endif /* #else #ifdef __BIG_ENDIAN__ */
> > } tickets;
> > };
> > } arch_spinlock_t;
> > diff --git a/include/linux/kernel.h b/include/linux/kernel.h
> > index e9ef6d6..816a87c 100644
> > --- a/include/linux/kernel.h
> > +++ b/include/linux/kernel.h
> > @@ -15,6 +15,7 @@
> > #include <asm/byteorder.h>
> > #include <uapi/linux/kernel.h>
> >
> > +#define UCHAR_MAX ((u8)(~0U))
> > #define USHRT_MAX ((u16)(~0U))
> > #define SHRT_MAX ((s16)(USHRT_MAX>>1))
> > #define SHRT_MIN ((s16)(-SHRT_MAX - 1))
> > diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
> > index 44511d1..900c0f0 100644
> > --- a/kernel/Kconfig.locks
> > +++ b/kernel/Kconfig.locks
> > @@ -223,3 +223,38 @@ endif
> > config MUTEX_SPIN_ON_OWNER
> > def_bool y
> > depends on SMP && !DEBUG_MUTEXES
> > +
> > +config TICKET_LOCK_QUEUED
> > + bool "Dynamically switch between ticket and queued locking"
> > + depends on SMP
> > + default n
> > + ---help---
> > + Enable dynamic switching between ticketlock and queued locking
> > + on a per-lock basis. This option will slow down low-contention
> > + acquisition and release very slightly (additional conditional
> > + in release path), but will provide more efficient operation at
> > + high levels of lock contention. High-contention operation will
> > + not be quite as efficient as would be a pure queued lock, but
> > + this dynamic approach consumes less memory than queued locks
> > + and also runs faster at low levels of contention.
> > +
> > + Say "Y" if you are running on a large system with a workload
> > + that is likely to result in high levels of contention.
> > +
> > + Say "N" if you are unsure.
> > +
> > +config TICKET_LOCK_QUEUED_SWITCH
> > + int "When to switch from ticket to queued locking"
> > + depends on TICKET_LOCK_QUEUED
> > + default 8
> > + range 3 32
> > + ---help---
> > + Specify how many tasks should be spinning on the lock before
> > + switching to queued mode. Systems with low-latency memory/cache
> > + interconnects will prefer larger numbers, while extreme low-latency
> > + and real-time workloads will prefer a smaller number. Of course,
> > + extreme real-time workloads would be even happier if contention
> > + on the locks were reduced to the point that there was never any
> > + need for queued locking in the first place.
> > +
> > + Take the default if you are unsure.
> > diff --git a/kernel/Makefile b/kernel/Makefile
> > index 271fd31..70a91f7 100644
> > --- a/kernel/Makefile
> > +++ b/kernel/Makefile
> > @@ -51,6 +51,7 @@ endif
> > obj-$(CONFIG_SMP) += spinlock.o
> > obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
> > obj-$(CONFIG_PROVE_LOCKING) += spinlock.o
> > +obj-$(CONFIG_TICKET_LOCK_QUEUED) += tktqlock.o
> > obj-$(CONFIG_UID16) += uid16.o
> > obj-$(CONFIG_MODULES) += module.o
> > obj-$(CONFIG_MODULE_SIG) += module_signing.o modsign_pubkey.o modsign_certificate.o
> > diff --git a/kernel/tktqlock.c b/kernel/tktqlock.c
> > new file mode 100644
> > index 0000000..9f03af0
> > --- /dev/null
> > +++ b/kernel/tktqlock.c
> > @@ -0,0 +1,369 @@
> > +/*
> > + * Queued ticket spinlocks.
> > + *
> > + * This program is free software; you can redistribute it and/or modify
> > + * it under the terms of the GNU General Public License as published by
> > + * the Free Software Foundation; either version 2 of the License, or
> > + * (at your option) any later version.
> > + *
> > + * This program is distributed in the hope that it will be useful,
> > + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
> > + * GNU General Public License for more details.
> > + *
> > + * You should have received a copy of the GNU General Public License
> > + * along with this program; if not, write to the Free Software
> > + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
> > + *
> > + * Copyright IBM Corporation, 2013
> > + *
> > + * Authors: Paul E. McKenney <[email protected]>
> > + */
> > +#include <linux/types.h>
> > +#include <linux/kernel.h>
> > +#include <linux/spinlock.h>
> > +#include <linux/smp.h>
> > +#include <linux/percpu.h>
> > +
> > +struct tkt_q {
> > + int cpu;
> > + __ticket_t tail;
> > + struct tkt_q *next;
> > +};
> > +
> > +struct tkt_q_head {
> > + arch_spinlock_t *ref; /* Pointer to spinlock if in use. */
> > + s64 head_tkt; /* Head ticket when started queuing. */
> > + struct tkt_q *spin; /* Head of queue. */
> > + struct tkt_q **spin_tail; /* Tail of queue. */
> > +};
> > +
> > +/*
> > + * TKT_Q_SWITCH is twice the number of CPUs that must be spinning on a
> > + * given ticket lock to motivate switching to spinning on a queue.
> > + * The reason that it is twice the number is because the bottom bit of
> > + * the ticket is reserved for the bit that indicates that a queue is
> > + * associated with the lock.
> > + */
> > +#define TKT_Q_SWITCH (CONFIG_TICKET_LOCK_QUEUED_SWITCH * 2)
> > +
> > +/*
> > + * TKT_Q_NQUEUES is the number of queues to maintain. Large systems
> > + * might have multiple highly contended locks, so provide more queues for
> > + * systems with larger numbers of CPUs.
> > + */
> > +#define TKT_Q_NQUEUES (DIV_ROUND_UP(NR_CPUS + TKT_Q_SWITCH - 1, TKT_Q_SWITCH) * 2)
> > +
> > +/* The queues themselves. */
> > +struct tkt_q_head tkt_q_heads[TKT_Q_NQUEUES];
> > +
> > +/* Advance to the next queue slot, wrapping around to the beginning. */
> > +static int tkt_q_next_slot(int i)
> > +{
> > + return (++i < TKT_Q_NQUEUES) ? i : 0;
> > +}
> > +
> > +/* Very crude hash from lock address to queue slot number. */
> > +static unsigned long tkt_q_hash(arch_spinlock_t *lock)
> > +{
> > + return (((unsigned long)lock) >> 8) % TKT_Q_NQUEUES;
> > +}
> > +
> > +/*
> > + * Return a pointer to the queue header associated with the specified lock,
> > + * or return NULL if there is no queue for the lock or if the lock's queue
> > + * is in transition.
> > + */
> > +static struct tkt_q_head *tkt_q_find_head(arch_spinlock_t *lock)
> > +{
> > + int i;
> > + int start;
> > +
> > + start = i = tkt_q_hash(lock);
> > + do
> > + if (tkt_q_heads[i].ref == lock)
> > + return &tkt_q_heads[i];
> > + while ((i = tkt_q_next_slot(i)) != start);
> > + return NULL;
> > +}
> > +
> > +/*
> > + * Try to stop queuing, reverting back to normal ticket-lock operation.
> > + * We can only stop queuing when the queue is empty, which means that
> > + * we need to correctly handle races where someone shows up in the queue
> > + * just as we are trying to dispense with the queue. They win, we lose.
> > + */
> > +static bool tkt_q_try_unqueue(arch_spinlock_t *lock, struct tkt_q_head *tqhp)
> > +{
> > + arch_spinlock_t asold;
> > + arch_spinlock_t asnew;
> > +
> > + /* Pick up the ticket values. */
> > + asold = ACCESS_ONCE(*lock);
> > + if ((asold.tickets.head & ~0x1) == asold.tickets.tail) {
> > +
> > + /* Attempt to mark the lock as not having a queue. */
> > + asnew = asold;
> > + asnew.tickets.head &= ~0x1;
> > + if (cmpxchg(&lock->head_tail,
> > + asold.head_tail,
> > + asnew.head_tail) == asold.head_tail) {
> > +
> > + /* Succeeded, mark the queue as unused. */
> > + ACCESS_ONCE(tqhp->ref) = NULL;
> > + return true;
> > + }
> > + }
> > +
> > + /* Failed, tell the caller there is still a queue to pass off to. */
> > + return false;
> > +}
> > +
> > +/*
> > + * Hand the lock off to the first CPU on the queue.
> > + */
> > +void tkt_q_do_wake(arch_spinlock_t *lock)
> > +{
> > + struct tkt_q_head *tqhp;
> > + struct tkt_q *tqp;
> > +
> > + /* If the queue is still being set up, wait for it. */
> > + while ((tqhp = tkt_q_find_head(lock)) == NULL)
> > + cpu_relax();
> > +
> > + for (;;) {
> > +
> > + /* Find the first queue element. */
> > + tqp = ACCESS_ONCE(tqhp->spin);
> > + if (tqp != NULL)
> > + break; /* Element exists, hand off lock. */
> > + if (tkt_q_try_unqueue(lock, tqhp))
> > + return; /* No element, successfully removed queue. */
> > + cpu_relax();
> > + }
> > + if (ACCESS_ONCE(tqhp->head_tkt) != -1)
> > + ACCESS_ONCE(tqhp->head_tkt) = -1;
> > + smp_mb(); /* Order pointer fetch and assignment against handoff. */
> > + ACCESS_ONCE(tqp->cpu) = -1;
> > +}
> > +EXPORT_SYMBOL(tkt_q_do_wake);
> > +
> > +/*
> > + * Given a lock that already has a queue associated with it, spin on
> > + * that queue. Return false if there was no queue (which means we do not
> > + * hold the lock) and true otherwise (meaning we -do- hold the lock).
> > + */
> > +bool tkt_q_do_spin(arch_spinlock_t *lock, struct __raw_tickets inc)
> > +{
> > + struct tkt_q **oldtail;
> > + struct tkt_q tq;
> > + struct tkt_q_head *tqhp;
> > +
> > + /*
> > + * Ensure that accesses to queue header happen after sensing
> > + * the lock's have-queue bit.
> > + */
> > + smp_mb(); /* See above block comment. */
> > +
> > + /* If there no longer is a queue, leave. */
> > + tqhp = tkt_q_find_head(lock);
> > + if (tqhp == NULL)
> > + return false;
> > +
> > + /* Initialize our queue element. */
> > + tq.cpu = raw_smp_processor_id();
> > + tq.tail = inc.tail;
> > + tq.next = NULL;
> > +
> > + /* Check to see if we already hold the lock. */
> > + if (ACCESS_ONCE(tqhp->head_tkt) == inc.tail) {
> > + /* The last holder left before queue formed, we hold lock. */
> > + tqhp->head_tkt = -1;
> > + return true;
> > + }
> > +
> > + /*
> > + * Add our element to the tail of the queue. Note that if the
> > + * queue is empty, the ->spin_tail pointer will reference
> > + * the queue's head pointer, namely ->spin.
> > + */
> > + oldtail = xchg(&tqhp->spin_tail, &tq.next);
> > + ACCESS_ONCE(*oldtail) = &tq;
> > +
> > + /* Spin until handoff. */
> > + while (ACCESS_ONCE(tq.cpu) != -1)
> > + cpu_relax();
> > +
> > + /*
> > + * Remove our element from the queue. If the queue is now empty,
> > + * update carefully so that the next acquisition will enqueue itself
> > + * at the head of the list. Of course, the next enqueue operation
> > + * might be happening concurrently, and this code needs to handle all
> > + * of the possible combinations, keeping in mind that the enqueue
> > + * operation happens in two stages: (1) update the tail pointer and
> > + * (2) update the predecessor's ->next pointer. With this in mind,
> > + * the following code needs to deal with three scenarios:
> > + *
> > + * 1. tq is the last entry. In this case, we use cmpxchg to
> > + * point the list tail back to the list head (->spin). If
> > + * the cmpxchg fails, that indicates that we are instead
> > + * in scenario 2 below. If the cmpxchg succeeds, the next
> > + * enqueue operation's tail-pointer exchange will enqueue
> > + * the next element at the queue head, because the ->spin_tail
> > + * pointer now references the queue head.
> > + *
> > + * 2. tq is the last entry, and the next entry has updated the
> > + * tail pointer but has not yet updated tq.next. In this
> > + * case, tq.next is NULL, the cmpxchg will fail, and the
> > + * code will wait for the enqueue to complete before completing
> > + * removal of tq from the list.
> > + *
> > + * 3. tq is not the last pointer. In this case, tq.next is non-NULL,
> > + * so the following code simply removes tq from the list.
> > + */
> > + if (tq.next == NULL) {
> > +
> > + /* Mark the queue empty. */
> > + tqhp->spin = NULL;
> > +
> > + /* Try to point the tail back at the head. */
> > + if (cmpxchg(&tqhp->spin_tail,
> > + &tq.next,
> > + &tqhp->spin) == &tq.next)
> > + return true; /* Succeeded, queue is now empty. */
> > +
> > + /* Failed, if needed, wait for the enqueue to complete. */
> > + while (tq.next == NULL)
> > + cpu_relax();
> > +
> > + /* The following code will repair the head. */
> > + }
> > + smp_mb(); /* Force ordering between handoff and critical section. */
> > +
> > + /*
> > + * Advance list-head pointer. This same task will be the next to
> > + * access this when releasing the lock, so no need for a memory
> > + * barrier after the following assignment.
> > + */
> > + ACCESS_ONCE(tqhp->spin) = tq.next;
> > + return true;
> > +}
> > +
> > +/*
> > + * Given a lock that does not have a queue, attempt to associate the
> > + * i-th queue with it, returning true if successful (meaning we hold
> > + * the lock) or false otherwise (meaning we do -not- hold the lock).
> > + * Note that the caller has already filled in ->ref with 0x1, so we
> > + * own the queue.
> > + */
> > +static bool
> > +tkt_q_init_contend(int i, arch_spinlock_t *lock, struct __raw_tickets inc)
> > +{
> > + arch_spinlock_t asold;
> > + arch_spinlock_t asnew;
> > + struct tkt_q_head *tqhp;
> > +
> > + /* Initialize the i-th queue header. */
> > + tqhp = &tkt_q_heads[i];
> > + tqhp->spin = NULL;
> > + tqhp->spin_tail = &tqhp->spin;
> > +
> > + /* Each pass through this loop attempts to mark the lock as queued. */
> > + do {
> > + asold.head_tail = ACCESS_ONCE(lock->head_tail);
> > + asnew = asold;
> > + if (asnew.tickets.head & 0x1) {
> > +
> > + /* Someone beat us to it, back out. */
> > + smp_mb();
> > + ACCESS_ONCE(tqhp->ref) = NULL;
> > +
> > + /* Spin on the queue element they set up. */
> > + return tkt_q_do_spin(lock, inc);
> > + }
> > +
> > + /*
> > + * Record the head counter in case one of the spinning
> > + * CPUs already holds the lock but doesn't realize it yet.
> > + */
> > + tqhp->head_tkt = asold.tickets.head;
> > +
> > + /* The low-order bit in the head counter says "queued". */
> > + asnew.tickets.head |= 0x1;
> > + } while (cmpxchg(&lock->head_tail,
> > + asold.head_tail,
> > + asnew.head_tail) != asold.head_tail);
> > +
> > + /* Point the queue at the lock and go spin on it. */
> > + ACCESS_ONCE(tqhp->ref) = lock;
> > + return tkt_q_do_spin(lock, inc);
> > +}
> > +
> > +/*
> > + * Start handling a period of high contention by finding a queue to associate
> > + * with this lock. Returns true if successful (in which case we hold the
> > + * lock) and false otherwise (in which case we do -not- hold the lock).
> > + */
> > +bool tkt_q_start_contend(arch_spinlock_t *lock, struct __raw_tickets inc)
> > +{
> > + int i;
> > + int start;
> > +
> > + /* Hash the lock address to find a starting point. */
> > + start = i = tkt_q_hash(lock);
> > +
> > + /*
> > + * Each pass through the following loop attempts to associate
> > + * the lock with the corresponding queue.
> > + */
> > + do {
> > + /*
> > + * Use 0x1 to mark the queue in use, but also avoiding
> > + * any spinners trying to use it before we get it all
> > + * initialized.
> > + */
> > + if (tkt_q_heads[i].ref)
> > + continue;
> > + if (cmpxchg(&tkt_q_heads[i].ref,
> > + NULL,
> > + (arch_spinlock_t *)0x1) == NULL) {
>
> Hi Paul,
>
> Would it be better to do the check like this before the cmpxchg in
> order to keep the logic the same?
>
> if (!tkt_q_heads[i].ref &&
> cmpxchg(&tkt_q_heads[i].ref,
> NULL,
> (arch_spinlock_t *)0x1) == NULL)

Good point -- my approach was skipping the check for someone else having
set the queue up. Fixed!
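
For readers following along, here is a minimal user-space sketch of the
check-before-cmpxchg idea, written with C11 atomics rather than the kernel's
cmpxchg(); the slot/try_claim names are illustrative and are not the patch's API:

	#include <stdatomic.h>
	#include <stddef.h>

	static _Atomic(void *) slot;		/* stands in for tkt_q_heads[i].ref */

	/*
	 * Claim the slot, but skip the compare-and-swap when a plain read
	 * already shows it in use: a failed CAS still pulls the cache line
	 * in exclusive state, so the cheap read avoids needless bouncing.
	 */
	static int try_claim(void *marker)
	{
		void *expected = NULL;

		if (atomic_load_explicit(&slot, memory_order_relaxed) != NULL)
			return 0;

		return atomic_compare_exchange_strong(&slot, &expected, marker);
	}

	int main(void)
	{
		static int q;			/* stands in for a queue header */

		return try_claim(&q) ? 0 : 1;	/* the first claim succeeds */
	}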

Thanx, Paul

2013-06-11 20:53:48

by Steven Rostedt

[permalink] [raw]
Subject: Re: [PATCH RFC ticketlock] Auto-queued ticketlock

On Tue, 2013-06-11 at 13:32 -0700, Paul E. McKenney wrote:

> /*
> * This lock has lots of spinners, but no queue. Go create
> * a queue to spin on.
> *
> * In the common case, only the single task that
> * sees the head and tail tickets being different by
> * exactly TKT_Q_SWITCH will come here to set up the queue,
> * which prevents a "thundering herd" of queue setups.
> * Although it is still possible for an unfortunate series
> * of lock handoffs and newly arrived tasks to result
> * in more than one task performing a queue setup, this
> * is unlikely. Of course, this situation must still be
> * handled correctly, which is the job of the cmpxchg()
> * in tkt_q_start_contend().
> */
> if (tkt_q_start_contend(ap, inc))
> return true;
>
> Does that help?

Yes, very good.
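
A tiny stand-alone sketch of the selection rule described in that comment,
using assumed ticket values and the default threshold (TKT_Q_SWITCH = 8 * 2
= 16), shows that in the common case exactly one waiter satisfies the test:

	#include <stdio.h>

	#define TKT_Q_SWITCH	16	/* 2 * CONFIG_TICKET_LOCK_QUEUED_SWITCH, default 8 */

	int main(void)
	{
		unsigned short head = 100;	/* ticket currently holding the lock (assumed) */
		unsigned short tail;

		/* Tickets advance by 2, so step through the waiters behind the holder. */
		for (tail = head + 2; tail <= head + 40; tail += 2)
			if (tail - TKT_Q_SWITCH == head)
				printf("ticket %u attempts the queue setup\n", tail);

		return 0;	/* prints only ticket 116, the eighth waiter behind the holder */
	}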

>
> > As TKT_Q_SWITCH doesn't have a type, I'm not sure how C will evaluate
> > this. I always screw type conversions up, and just add in the type casts
> > to be safe.
> >
> > You could also give TKT_Q_SWITCH a type too.
>
> This is an excellent point as well -- things might well get confused.
> My solution was to take your last suggestion and give TKT_Q_SWITCH the
> same type as inc.tail and inc.head, and also apply type-safety paranoia
> to TKT_Q_NQUEUES:
>
> /*
> * TKT_Q_SWITCH is twice the number of CPUs that must be spinning on a
> * given ticket lock to motivate switching to spinning on a queue.
> * The reason that it is twice the number is because the bottom bit of
> * the ticket is reserved for the bit that indicates that a queue is
> * associated with the lock.
> */
> #define TKT_Q_SWITCH ((__ticket_t)(CONFIG_TICKET_LOCK_QUEUED_SWITCH * 2))
>
> /*
> * TKT_Q_NQUEUES is the number of queues to maintain. Large systems
> * might have multiple highly contended locks, so provide more queues for
> * systems with larger numbers of CPUs.
> */
> #define TKT_Q_NQUEUES (2 * DIV_ROUND_UP(NR_CPUS + ((int)TKT_Q_SWITCH) - 1, \
> (int)TKT_Q_SWITCH))
>
> Does that look OK? (The limits on the value of TKT_Q_SWITCH should avoid
> signed integer overflow.)
>

Looks fine.

-- Steve

2013-06-11 20:56:54

by Steven Rostedt

[permalink] [raw]
Subject: Re: [PATCH RFC ticketlock] Auto-queued ticketlock

On Tue, 2013-06-11 at 12:49 -0700, Paul E. McKenney wrote:

> +config TICKET_LOCK_QUEUED
> + bool "Dynamically switch between ticket and queued locking"
> + depends on SMP
> + default n
> + ---help---
> + Enable dynamic switching between ticketlock and queued locking
> + on a per-lock basis. This option will slow down low-contention
> + acquisition and release very slightly (additional conditional
> + in release path), but will provide more efficient operation at
> + high levels of lock contention. High-contention operation will
> + not be quite as efficient as would be a pure queued lock, but
> + this dynamic approach consumes less memory than queued locks
> + and also runs faster at low levels of contention.
> +
> + Say "Y" if you are running on a large system with a workload
> + that is likely to result in high levels of contention.
> +
> + Say "N" if you are unsure.
> +
> +config TICKET_LOCK_QUEUED_SWITCH
> + int "When to switch from ticket to queued locking"
> + depends on TICKET_LOCK_QUEUED
> + default 8
> + range 3 32
> + ---help---
> + Specify how many tasks should be spinning on the lock before
> + switching to queued mode. Systems with low-latency memory/cache
> + interconnects will prefer larger numbers, while extreme low-latency
> + and real-time workloads will prefer a smaller number. Of course,
> + extreme real-time workloads would be even happier if contention
> + on the locks were reduced to the point that there was never any
> + need for queued locking in the first place.

Are you sure real-time wants low numbers? I would think that real-time
would want this off. This is just a way to help prevent cache ping
ponging, but it adds to non-deterministic behavior. As I mentioned
before, even though you fixed the thundering herd on setup, once the
queue is set, we will get a thundering herd of tasks trying to
queue themselves, and the task that was spinning the longest could very well
become the one at the end of the FIFO.

-- Steve



> +
> + Take the default if you are unsure.
> diff --git a/kernel/Makefile b/kernel/Makefile

2013-06-11 21:09:24

by Paul E. McKenney

[permalink] [raw]
Subject: Re: [PATCH RFC ticketlock] Auto-queued ticketlock

On Tue, Jun 11, 2013 at 04:56:50PM -0400, Steven Rostedt wrote:
> On Tue, 2013-06-11 at 12:49 -0700, Paul E. McKenney wrote:
>
> > +config TICKET_LOCK_QUEUED
> > + bool "Dynamically switch between ticket and queued locking"
> > + depends on SMP
> > + default n
> > + ---help---
> > + Enable dynamic switching between ticketlock and queued locking
> > + on a per-lock basis. This option will slow down low-contention
> > + acquisition and release very slightly (additional conditional
> > + in release path), but will provide more efficient operation at
> > + high levels of lock contention. High-contention operation will
> > + not be quite as efficient as would be a pure queued lock, but
> > + this dynamic approach consumes less memory than queued locks
> > + and also runs faster at low levels of contention.
> > +
> > + Say "Y" if you are running on a large system with a workload
> > + that is likely to result in high levels of contention.
> > +
> > + Say "N" if you are unsure.
> > +
> > +config TICKET_LOCK_QUEUED_SWITCH
> > + int "When to switch from ticket to queued locking"
> > + depends on TICKET_LOCK_QUEUED
> > + default 8
> > + range 3 32
> > + ---help---
> > + Specify how many tasks should be spinning on the lock before
> > + switching to queued mode. Systems with low-latency memory/cache
> > + interconnects will prefer larger numbers, while extreme low-latency
> > + and real-time workloads will prefer a smaller number. Of course,
> > + extreme real-time workloads would be even happier if contention
> > + on the locks were reduced to the point that there was never any
> > + need for queued locking in the first place.
>
> Are you sure real-time wants low numbers? I would think that real-time
> would want this off. This is just a way to help prevent cache ping
> ponging, but it adds to non-deterministic behavior. As I mentioned
> before, even though you fixed the thundering herd on setup, once the
> queue is set, we will get a thundering herd of tasks trying to
> queue themselves, and the task that was spinning the longest could very well
> become the one at the end of the FIFO.

Me? I think that real-time just wants contention to remain low, so that
this sort of thing isn't needed in the first place. And now that you
mention it, I suppose that is one of the few things that real-time and
real-fast workloads have in common.

But if you had some mixed workload on a large system that was mostly
real-fast, but had a real-time component, and if the real-fast portion
needed TICKET_LOCK_QUEUED=y, then I would guess that the real-time
portion would want a relatively low number for TICKET_LOCK_QUEUED_SWITCH.

Thanx, Paul

> -- Steve
>
>
>
> > +
> > + Take the default if you are unsure.
> > diff --git a/kernel/Makefile b/kernel/Makefile
>
>

2013-06-12 01:19:41

by Lai Jiangshan

[permalink] [raw]
Subject: Re: [PATCH RFC ticketlock] Auto-queued ticketlock

On Wed, Jun 12, 2013 at 3:49 AM, Paul E. McKenney
<[email protected]> wrote:
> On Tue, Jun 11, 2013 at 02:41:59PM -0400, Waiman Long wrote:
>> On 06/11/2013 12:36 PM, Paul E. McKenney wrote:
>> >
>> >>I am a bit concern about the size of the head queue table itself.
>> >>RHEL6, for example, had defined CONFIG_NR_CPUS to be 4096 which means
>> >>a table size of 256. Maybe it is better to dynamically allocate the
>> >>table at init time depending on the actual number of CPUs in the
>> >>system.
>> >But if your kernel is built for 4096 CPUs, the 32*256=8192 bytes of memory
>> >is way down in the noise. Systems that care about that small an amount
>> >of memory probably have a small enough number of CPUs that they can just
>> >turn off queueing at build time using CONFIG_TICKET_LOCK_QUEUED=n, right?
>>
>> My concern is more about the latency on the table scan than the
>> actual memory that was used.
>>
>> >>>+/*
>> >>>+ * Return a pointer to the queue header associated with the specified lock,
>> >>>+ * or return NULL if there is no queue for the lock or if the lock's queue
>> >>>+ * is in transition.
>> >>>+ */
>> >>>+static struct tkt_q_head *tkt_q_find_head(arch_spinlock_t *asp)
>> >>>+{
>> >>>+ int i;
>> >>>+ int start;
>> >>>+
>> >>>+ start = i = tkt_q_hash(asp);
>> >>>+ do
>> >>>+ if (tkt_q_heads[i].ref == asp)
>> >>>+ return &tkt_q_heads[i];
>> >>>+ while ((i = tkt_q_next_slot(i)) != start);
>> >>>+ return NULL;
>> >>>+}
>> >>With a table size of 256 and you have to scan the whole table to
>> >>find the right head queue. This can be a significant overhead. I
>> >>will suggest setting a limit on how many entries it scans before
>> >>it aborts rather than checking the whole table.
>> >But it will scan 256 entries only if there are 256 other locks in queued
>> >mode, which is -very- unlikely, even given 4096 CPUs. That said, if you
>> >show me that this results in a real latency problem on a real system,
>> >I would be happy to provide a way to limit the search.
>>
>> Looking at the code more carefully, the chance of actually scanning
>> 256 entries is very small. However, I now have some concern on the
>> way you set up the initial queue.
>>
>> +/*
>> + * Start handling a period of high contention by finding a queue to associate
>> + * with this lock. Returns true if successful (in which case we hold the
>> + * lock) and false otherwise (in which case we do -not- hold the lock).
>> + */
>> +bool tkt_q_start_contend(arch_spinlock_t *asp, struct __raw_tickets inc)
>> +{
>> + int i;
>> + int start;
>> +
>> + /* Hash the lock address to find a starting point. */
>> + start = i = tkt_q_hash(asp);
>> +
>> + /*
>> + * Each pass through the following loop attempts to associate
>> + * the lock with the corresponding queue.
>> + */
>> + do {
>> + /*
>> + * Use 0x1 to mark the queue in use, but also avoiding
>> + * any spinners trying to use it before we get it all
>> + * initialized.
>> + */
>> + if (cmpxchg(&tkt_q_heads[i].ref,
>> + NULL,
>> + (arch_spinlock_t *)0x1) == NULL) {
>> +
>> + /* Succeeded, now go initialize it. */
>> + return tkt_q_init_contend(i, asp, inc);
>> + }
>> +
>> + /* If someone beat us to it, go spin on their queue. */
>> + if (ACCESS_ONCE(asp->tickets.head) & 0x1)
>> + return tkt_q_do_spin(asp, inc);
>> + } while ((i = tkt_q_next_slot(i)) != start);
>> +
>> + /* All the queues are in use, revert to spinning on the ticket lock. */
>> + return false;
>> +}
>> +
>>
>> Unconditional cmpxchg() can be a source of high contention by
>> itself. Considering that 16 threads may be doing cmpxchg() more or
>> less simultaneously on the same cache line, it can cause a lot of
>> contention. It will be better if you check to see if tkt_q_heads[i]
>> is NULL first before doing cmpxchg.
>>
>> Another point is that the 16 threads may be setting up the queues in
>> consecutive slots in the head table. This is both a source of
>> contention and a waste of effort. One possible solution is to add
>> one more field (set to cpuid + 1, for example) to indicate that that
>> setup is being done with asp set to the target lock address
>> immediately. We will need to use cmpxchg128() for 64-bit machines,
>> though. Another solution is to have only the thread whose ticket
>> number is a fixed distance from the head (e.g. 16*2) do the
>> queue setup, while the rest wait until the setup is done before
>> spinning on the queue.
>>
>> As my colleague Davidlohr reported, there are more regressions
>> than performance improvements in the AIM7 benchmark. I believe that
>> queue setup contention is likely a source of performance regression.
>
> Please see below for a v3 patch that:
>
> 1. Fixes cpu_relax().
>
> 2. Tests before doing cmpxchg().
>
> 3. Reduces the number of CPUs attempting to set up the queue,
> in the common case, to a single CPU. (Multiple CPUs can
> still be trying to set up the queue given unfortunate
> sequences of concurrent ticket-lock handoffs.)
>
> Please let me know how it goes!
>
> Thanx, Paul
>
> ------------------------------------------------------------------------
>
> ticketlock: Add queued-ticketlock capability
>
> Breaking up locks is better than implementing high-contention locks, but
> if we must have high-contention locks, why not make them automatically
> switch between light-weight ticket locks at low contention and queued
> locks at high contention? After all, this would remove the need for
> the developer to predict which locks will be highly contended.
>
> This commit allows ticket locks to automatically switch between pure
> ticketlock and queued-lock operation as needed. If too many CPUs are
> spinning on a given ticket lock, a queue structure will be allocated
> and the lock will switch to queued-lock operation. When the lock becomes
> free, it will switch back into ticketlock operation. The low-order bit
> of the head counter is used to indicate that the lock is in queued mode,
> which forces an unconditional mismatch between the head and tail counters.
> This approach means that the common-case code path under conditions of
> low contention is very nearly that of a plain ticket lock.
>
> A fixed number of queueing structures is statically allocated in an
> array. The ticket-lock address is used to hash into an initial element,
> but if that element is already in use, it moves to the next element. If
> the entire array is already in use, continue to spin in ticket mode.
>
> Signed-off-by: Paul E. McKenney <[email protected]>
> [ paulmck: Eliminate duplicate code and update comments (Steven Rostedt). ]
> [ paulmck: Address Eric Dumazet review feedback. ]
> [ paulmck: Use Lai Jiangshan idea to eliminate smp_mb(). ]
> [ paulmck: Expand ->head_tkt from s32 to s64 (Waiman Long). ]
> [ paulmck: Move cpu_relax() to main spin loop (Steven Rostedt). ]
> [ paulmck: Reduce queue-switch contention (Waiman Long). ]
>
> diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h
> index 33692ea..509c51a 100644
> --- a/arch/x86/include/asm/spinlock.h
> +++ b/arch/x86/include/asm/spinlock.h
> @@ -34,6 +34,21 @@
> # define UNLOCK_LOCK_PREFIX
> #endif
>
> +#ifdef CONFIG_TICKET_LOCK_QUEUED
> +
> +#define __TKT_SPIN_INC 2
> +bool tkt_spin_pass(arch_spinlock_t *ap, struct __raw_tickets inc);
> +
> +#else /* #ifdef CONFIG_TICKET_LOCK_QUEUED */
> +
> +#define __TKT_SPIN_INC 1
> +static inline bool tkt_spin_pass(arch_spinlock_t *ap, struct __raw_tickets inc)
> +{
> + return false;
> +}
> +
> +#endif /* #else #ifdef CONFIG_TICKET_LOCK_QUEUED */
> +
> /*
> * Ticket locks are conceptually two parts, one indicating the current head of
> * the queue, and the other indicating the current tail. The lock is acquired
> @@ -49,17 +64,16 @@
> */
> static __always_inline void __ticket_spin_lock(arch_spinlock_t *lock)
> {
> - register struct __raw_tickets inc = { .tail = 1 };
> + register struct __raw_tickets inc = { .tail = __TKT_SPIN_INC };
>
> inc = xadd(&lock->tickets, inc);
> -
> for (;;) {
> - if (inc.head == inc.tail)
> + if (inc.head == inc.tail || tkt_spin_pass(lock, inc))
> break;
> cpu_relax();
> inc.head = ACCESS_ONCE(lock->tickets.head);
> }
> - barrier(); /* make sure nothing creeps before the lock is taken */
> + barrier(); /* Make sure nothing creeps in before the lock is taken. */
> }
>
> static __always_inline int __ticket_spin_trylock(arch_spinlock_t *lock)
> @@ -70,17 +84,37 @@ static __always_inline int __ticket_spin_trylock(arch_spinlock_t *lock)
> if (old.tickets.head != old.tickets.tail)
> return 0;
>
> +#ifndef CONFIG_TICKET_LOCK_QUEUED
> new.head_tail = old.head_tail + (1 << TICKET_SHIFT);
> +#else /* #ifndef CONFIG_TICKET_LOCK_QUEUED */
> + new.head_tail = old.head_tail + (2 << TICKET_SHIFT);
> +#endif /* #else #ifndef CONFIG_TICKET_LOCK_QUEUED */
>
> /* cmpxchg is a full barrier, so nothing can move before it */
> return cmpxchg(&lock->head_tail, old.head_tail, new.head_tail) == old.head_tail;
> }
>
> +#ifndef CONFIG_TICKET_LOCK_QUEUED
> +
> static __always_inline void __ticket_spin_unlock(arch_spinlock_t *lock)
> {
> __add(&lock->tickets.head, 1, UNLOCK_LOCK_PREFIX);
> }
>
> +#else /* #ifndef CONFIG_TICKET_LOCK_QUEUED */
> +
> +extern void tkt_q_do_wake(arch_spinlock_t *lock);
> +
> +static __always_inline void __ticket_spin_unlock(arch_spinlock_t *lock)
> +{
> + __ticket_t head = 2;
> +
> + head = xadd(&lock->tickets.head, head);
> + if (head & 0x1)
> + tkt_q_do_wake(lock);
> +}
> +#endif /* #else #ifndef CONFIG_TICKET_LOCK_QUEUED */
> +
> static inline int __ticket_spin_is_locked(arch_spinlock_t *lock)
> {
> struct __raw_tickets tmp = ACCESS_ONCE(lock->tickets);
> diff --git a/arch/x86/include/asm/spinlock_types.h b/arch/x86/include/asm/spinlock_types.h
> index ad0ad07..cdaefdd 100644
> --- a/arch/x86/include/asm/spinlock_types.h
> +++ b/arch/x86/include/asm/spinlock_types.h
> @@ -7,12 +7,18 @@
>
> #include <linux/types.h>
>
> -#if (CONFIG_NR_CPUS < 256)
> +#if (CONFIG_NR_CPUS < 128)
> typedef u8 __ticket_t;
> typedef u16 __ticketpair_t;
> -#else
> +#define TICKET_T_CMP_GE(a, b) (UCHAR_MAX / 2 >= (unsigned char)((a) - (b)))
> +#elif (CONFIG_NR_CPUS < 32768)
> typedef u16 __ticket_t;
> typedef u32 __ticketpair_t;
> +#define TICKET_T_CMP_GE(a, b) (USHRT_MAX / 2 >= (unsigned short)((a) - (b)))
> +#else
> +typedef u32 __ticket_t;
> +typedef u64 __ticketpair_t;
> +#define TICKET_T_CMP_GE(a, b) (UINT_MAX / 2 >= (unsigned int)((a) - (b)))
> #endif
>
> #define TICKET_SHIFT (sizeof(__ticket_t) * 8)
> @@ -21,7 +27,11 @@ typedef struct arch_spinlock {
> union {
> __ticketpair_t head_tail;
> struct __raw_tickets {
> +#ifdef __BIG_ENDIAN__
> + __ticket_t tail, head;
> +#else /* #ifdef __BIG_ENDIAN__ */
> __ticket_t head, tail;
> +#endif /* #else #ifdef __BIG_ENDIAN__ */
> } tickets;
> };
> } arch_spinlock_t;
> diff --git a/include/linux/kernel.h b/include/linux/kernel.h
> index e9ef6d6..816a87c 100644
> --- a/include/linux/kernel.h
> +++ b/include/linux/kernel.h
> @@ -15,6 +15,7 @@
> #include <asm/byteorder.h>
> #include <uapi/linux/kernel.h>
>
> +#define UCHAR_MAX ((u8)(~0U))
> #define USHRT_MAX ((u16)(~0U))
> #define SHRT_MAX ((s16)(USHRT_MAX>>1))
> #define SHRT_MIN ((s16)(-SHRT_MAX - 1))
> diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
> index 44511d1..900c0f0 100644
> --- a/kernel/Kconfig.locks
> +++ b/kernel/Kconfig.locks
> @@ -223,3 +223,38 @@ endif
> config MUTEX_SPIN_ON_OWNER
> def_bool y
> depends on SMP && !DEBUG_MUTEXES
> +
> +config TICKET_LOCK_QUEUED
> + bool "Dynamically switch between ticket and queued locking"
> + depends on SMP
> + default n
> + ---help---
> + Enable dynamic switching between ticketlock and queued locking
> + on a per-lock basis. This option will slow down low-contention
> + acquisition and release very slightly (additional conditional
> + in release path), but will provide more efficient operation at
> + high levels of lock contention. High-contention operation will
> + not be quite as efficient as would be a pure queued lock, but
> + this dynamic approach consumes less memory than queued locks
> + and also runs faster at low levels of contention.
> +
> + Say "Y" if you are running on a large system with a workload
> + that is likely to result in high levels of contention.
> +
> + Say "N" if you are unsure.
> +
> +config TICKET_LOCK_QUEUED_SWITCH
> + int "When to switch from ticket to queued locking"
> + depends on TICKET_LOCK_QUEUED
> + default 8
> + range 3 32
> + ---help---
> + Specify how many tasks should be spinning on the lock before
> + switching to queued mode. Systems with low-latency memory/cache
> + interconnects will prefer larger numbers, while extreme low-latency
> + and real-time workloads will prefer a smaller number. Of course,
> + extreme real-time workloads would be even happier if contention
> + on the locks were reduced to the point that there was never any
> + need for queued locking in the first place.
> +
> + Take the default if you are unsure.
> diff --git a/kernel/Makefile b/kernel/Makefile
> index 271fd31..70a91f7 100644
> --- a/kernel/Makefile
> +++ b/kernel/Makefile
> @@ -51,6 +51,7 @@ endif
> obj-$(CONFIG_SMP) += spinlock.o
> obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
> obj-$(CONFIG_PROVE_LOCKING) += spinlock.o
> +obj-$(CONFIG_TICKET_LOCK_QUEUED) += tktqlock.o
> obj-$(CONFIG_UID16) += uid16.o
> obj-$(CONFIG_MODULES) += module.o
> obj-$(CONFIG_MODULE_SIG) += module_signing.o modsign_pubkey.o modsign_certificate.o
> diff --git a/kernel/tktqlock.c b/kernel/tktqlock.c
> new file mode 100644
> index 0000000..9f03af0
> --- /dev/null
> +++ b/kernel/tktqlock.c
> @@ -0,0 +1,369 @@
> +/*
> + * Queued ticket spinlocks.
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program; if not, write to the Free Software
> + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
> + *
> + * Copyright IBM Corporation, 2013
> + *
> + * Authors: Paul E. McKenney <[email protected]>
> + */
> +#include <linux/types.h>
> +#include <linux/kernel.h>
> +#include <linux/spinlock.h>
> +#include <linux/smp.h>
> +#include <linux/percpu.h>
> +
> +struct tkt_q {
> + int cpu;
> + __ticket_t tail;
> + struct tkt_q *next;
> +};
> +
> +struct tkt_q_head {
> + arch_spinlock_t *ref; /* Pointer to spinlock if in use. */
> + s64 head_tkt; /* Head ticket when started queuing. */
> + struct tkt_q *spin; /* Head of queue. */
> + struct tkt_q **spin_tail; /* Tail of queue. */
> +};
> +
> +/*
> + * TKT_Q_SWITCH is twice the number of CPUs that must be spinning on a
> + * given ticket lock to motivate switching to spinning on a queue.
> + * The reason that it is twice the number is because the bottom bit of
> + * the ticket is reserved for the bit that indicates that a queue is
> + * associated with the lock.
> + */
> +#define TKT_Q_SWITCH (CONFIG_TICKET_LOCK_QUEUED_SWITCH * 2)
> +
> +/*
> + * TKT_Q_NQUEUES is the number of queues to maintain. Large systems
> + * might have multiple highly contended locks, so provide more queues for
> + * systems with larger numbers of CPUs.
> + */
> +#define TKT_Q_NQUEUES (DIV_ROUND_UP(NR_CPUS + TKT_Q_SWITCH - 1, TKT_Q_SWITCH) * 2)
> +
> +/* The queues themselves. */
> +struct tkt_q_head tkt_q_heads[TKT_Q_NQUEUES];
> +
> +/* Advance to the next queue slot, wrapping around to the beginning. */
> +static int tkt_q_next_slot(int i)
> +{
> + return (++i < TKT_Q_NQUEUES) ? i : 0;
> +}
> +
> +/* Very crude hash from lock address to queue slot number. */
> +static unsigned long tkt_q_hash(arch_spinlock_t *lock)
> +{
> + return (((unsigned long)lock) >> 8) % TKT_Q_NQUEUES;
> +}
> +
> +/*
> + * Return a pointer to the queue header associated with the specified lock,
> + * or return NULL if there is no queue for the lock or if the lock's queue
> + * is in transition.
> + */
> +static struct tkt_q_head *tkt_q_find_head(arch_spinlock_t *lock)
> +{
> + int i;
> + int start;
> +
> + start = i = tkt_q_hash(lock);
> + do
> + if (tkt_q_heads[i].ref == lock)
> + return &tkt_q_heads[i];
> + while ((i = tkt_q_next_slot(i)) != start);
> + return NULL;
> +}
> +
> +/*
> + * Try to stop queuing, reverting back to normal ticket-lock operation.
> + * We can only stop queuing when the queue is empty, which means that
> + * we need to correctly handle races where someone shows up in the queue
> + * just as we are trying to dispense with the queue. They win, we lose.
> + */
> +static bool tkt_q_try_unqueue(arch_spinlock_t *lock, struct tkt_q_head *tqhp)
> +{
> + arch_spinlock_t asold;
> + arch_spinlock_t asnew;
> +
> + /* Pick up the ticket values. */
> + asold = ACCESS_ONCE(*lock);
> + if ((asold.tickets.head & ~0x1) == asold.tickets.tail) {
> +
> + /* Attempt to mark the lock as not having a queue. */
> + asnew = asold;
> + asnew.tickets.head &= ~0x1;
> + if (cmpxchg(&lock->head_tail,
> + asold.head_tail,
> + asnew.head_tail) == asold.head_tail) {
> +
> + /* Succeeded, mark the queue as unused. */
> + ACCESS_ONCE(tqhp->ref) = NULL;
> + return true;
> + }
> + }
> +
> + /* Failed, tell the caller there is still a queue to pass off to. */
> + return false;
> +}
> +
> +/*
> + * Hand the lock off to the first CPU on the queue.
> + */
> +void tkt_q_do_wake(arch_spinlock_t *lock)
> +{
> + struct tkt_q_head *tqhp;
> + struct tkt_q *tqp;
> +
> + /* If the queue is still being set up, wait for it. */
> + while ((tqhp = tkt_q_find_head(lock)) == NULL)
> + cpu_relax();
> +
> + for (;;) {
> +
> + /* Find the first queue element. */
> + tqp = ACCESS_ONCE(tqhp->spin);
> + if (tqp != NULL)
> + break; /* Element exists, hand off lock. */
> + if (tkt_q_try_unqueue(lock, tqhp))
> + return; /* No element, successfully removed queue. */
> + cpu_relax();
> + }
> + if (ACCESS_ONCE(tqhp->head_tkt) != -1)
> + ACCESS_ONCE(tqhp->head_tkt) = -1;
> + smp_mb(); /* Order pointer fetch and assignment against handoff. */
> + ACCESS_ONCE(tqp->cpu) = -1;
> +}
> +EXPORT_SYMBOL(tkt_q_do_wake);
> +
> +/*
> + * Given a lock that already has a queue associated with it, spin on
> + * that queue. Return false if there was no queue (which means we do not
> + * hold the lock) and true otherwise (meaning we -do- hold the lock).
> + */
> +bool tkt_q_do_spin(arch_spinlock_t *lock, struct __raw_tickets inc)
> +{
> + struct tkt_q **oldtail;
> + struct tkt_q tq;
> + struct tkt_q_head *tqhp;
> +
> + /*
> + * Ensure that accesses to queue header happen after sensing
> + * the lock's have-queue bit.
> + */
> + smp_mb(); /* See above block comment. */
> +
> + /* If there no longer is a queue, leave. */
> + tqhp = tkt_q_find_head(lock);
> + if (tqhp == NULL)
> + return false;
> +
> + /* Initialize our queue element. */
> + tq.cpu = raw_smp_processor_id();
> + tq.tail = inc.tail;
> + tq.next = NULL;

I guess an mb() is needed here, between the read of tqhp->ref and the
read of tqhp->head_tkt.
You could move the above mb() to here.

> +
> + /* Check to see if we already hold the lock. */
> + if (ACCESS_ONCE(tqhp->head_tkt) == inc.tail) {
> + /* The last holder left before queue formed, we hold lock. */
> + tqhp->head_tkt = -1;
> + return true;
> + }
> +
> + /*
> + * Add our element to the tail of the queue. Note that if the
> + * queue is empty, the ->spin_tail pointer will reference
> + * the queue's head pointer, namely ->spin.
> + */
> + oldtail = xchg(&tqhp->spin_tail, &tq.next);
> + ACCESS_ONCE(*oldtail) = &tq;
> +
> + /* Spin until handoff. */
> + while (ACCESS_ONCE(tq.cpu) != -1)
> + cpu_relax();
> +
> + /*
> + * Remove our element from the queue. If the queue is now empty,
> + * update carefully so that the next acquisition will enqueue itself
> + * at the head of the list. Of course, the next enqueue operation
> + * might be happening concurrently, and this code needs to handle all
> + * of the possible combinations, keeping in mind that the enqueue
> + * operation happens in two stages: (1) update the tail pointer and
> + * (2) update the predecessor's ->next pointer. With this in mind,
> + * the following code needs to deal with three scenarios:
> + *
> + * 1. tq is the last entry. In this case, we use cmpxchg to
> + * point the list tail back to the list head (->spin). If
> + * the cmpxchg fails, that indicates that we are instead
> + * in scenario 2 below. If the cmpxchg succeeds, the next
> + * enqueue operation's tail-pointer exchange will enqueue
> + * the next element at the queue head, because the ->spin_tail
> + * pointer now references the queue head.
> + *
> + * 2. tq is the last entry, and the next entry has updated the
> + * tail pointer but has not yet updated tq.next. In this
> + * case, tq.next is NULL, the cmpxchg will fail, and the
> + * code will wait for the enqueue to complete before completing
> + * removal of tq from the list.
> + *
> + * 3. tq is not the last pointer. In this case, tq.next is non-NULL,
> + * so the following code simply removes tq from the list.
> + */
> + if (tq.next == NULL) {
> +
> + /* Mark the queue empty. */
> + tqhp->spin = NULL;
> +
> + /* Try to point the tail back at the head. */
> + if (cmpxchg(&tqhp->spin_tail,
> + &tq.next,
> + &tqhp->spin) == &tq.next)
> + return true; /* Succeeded, queue is now empty. */
> +
> + /* Failed, if needed, wait for the enqueue to complete. */
> + while (tq.next == NULL)
> + cpu_relax();
> +
> + /* The following code will repair the head. */
> + }
> + smp_mb(); /* Force ordering between handoff and critical section. */
> +
> + /*
> + * Advance list-head pointer. This same task will be the next to
> + * access this when releasing the lock, so no need for a memory
> + * barrier after the following assignment.
> + */
> + ACCESS_ONCE(tqhp->spin) = tq.next;
> + return true;
> +}
> +
> +/*
> + * Given a lock that does not have a queue, attempt to associate the
> + * i-th queue with it, returning true if successful (meaning we hold
> + * the lock) or false otherwise (meaning we do -not- hold the lock).
> + * Note that the caller has already filled in ->ref with 0x1, so we
> + * own the queue.
> + */
> +static bool
> +tkt_q_init_contend(int i, arch_spinlock_t *lock, struct __raw_tickets inc)
> +{
> + arch_spinlock_t asold;
> + arch_spinlock_t asnew;
> + struct tkt_q_head *tqhp;
> +
> + /* Initialize the i-th queue header. */
> + tqhp = &tkt_q_heads[i];
> + tqhp->spin = NULL;
> + tqhp->spin_tail = &tqhp->spin;
> +
> + /* Each pass through this loop attempts to mark the lock as queued. */
> + do {
> + asold.head_tail = ACCESS_ONCE(lock->head_tail);
> + asnew = asold;
> + if (asnew.tickets.head & 0x1) {
> +
> + /* Someone beat us to it, back out. */
> + smp_mb();
> + ACCESS_ONCE(tqhp->ref) = NULL;
> +
> + /* Spin on the queue element they set up. */
> + return tkt_q_do_spin(lock, inc);
> + }
> +
> + /*
> + * Record the head counter in case one of the spinning
> + * CPUs already holds the lock but doesn't realize it yet.
> + */
> + tqhp->head_tkt = asold.tickets.head;
> +
> + /* The low-order bit in the head counter says "queued". */
> + asnew.tickets.head |= 0x1;
> + } while (cmpxchg(&lock->head_tail,
> + asold.head_tail,
> + asnew.head_tail) != asold.head_tail);
> +
> + /* Point the queue at the lock and go spin on it. */
> + ACCESS_ONCE(tqhp->ref) = lock;
> + return tkt_q_do_spin(lock, inc);
> +}
> +
> +/*
> + * Start handling a period of high contention by finding a queue to associate
> + * with this lock. Returns true if successful (in which case we hold the
> + * lock) and false otherwise (in which case we do -not- hold the lock).
> + */
> +bool tkt_q_start_contend(arch_spinlock_t *lock, struct __raw_tickets inc)
> +{
> + int i;
> + int start;
> +
> + /* Hash the lock address to find a starting point. */
> + start = i = tkt_q_hash(lock);
> +
> + /*
> + * Each pass through the following loop attempts to associate
> + * the lock with the corresponding queue.
> + */
> + do {
> + /*
> + * Use 0x1 to mark the queue in use, but also avoiding
> + * any spinners trying to use it before we get it all
> + * initialized.
> + */
> + if (tkt_q_heads[i].ref)
> + continue;
> + if (cmpxchg(&tkt_q_heads[i].ref,
> + NULL,
> + (arch_spinlock_t *)0x1) == NULL) {
> +
> + /* Succeeded, now go initialize it. */
> + return tkt_q_init_contend(i, lock, inc);
> + }
> +
> + /* If someone beat us to it, go spin on their queue. */
> + if (ACCESS_ONCE(lock->tickets.head) & 0x1)
> + return tkt_q_do_spin(lock, inc);
> + } while ((i = tkt_q_next_slot(i)) != start);
> +
> + /* All the queues are in use, revert to spinning on the ticket lock. */
> + return false;
> +}
> +
> +bool tkt_spin_pass(arch_spinlock_t *ap, struct __raw_tickets inc)
> +{
> + if (unlikely(inc.head & 0x1)) {
> +
> + /* This lock has a queue, so go spin on the queue. */
> + if (tkt_q_do_spin(ap, inc))
> + return true;
> +
> + /* Get here if the queue is in transition: Retry next time. */
> +
> + } else if (inc.tail - TKT_Q_SWITCH == inc.head) {
> +
> + /*
> + * This lock has lots of spinners, but no queue.
> + * Go create a queue to spin on.
> + */
> + if (tkt_q_start_contend(ap, inc))
> + return true;
> +
> + /* Get here if the queue is in transition: Retry next time. */
> + }
> +
> + /* Either no need for a queue or the queue is in transition. Spin. */
> + return false;
> +}
> +EXPORT_SYMBOL(tkt_spin_pass);
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/

2013-06-12 01:58:13

by Steven Rostedt

[permalink] [raw]
Subject: Re: [PATCH RFC ticketlock] Auto-queued ticketlock

On Wed, 2013-06-12 at 09:19 +0800, Lai Jiangshan wrote:

> > +
> > +/*
> > + * Hand the lock off to the first CPU on the queue.
> > + */
> > +void tkt_q_do_wake(arch_spinlock_t *lock)
> > +{
> > + struct tkt_q_head *tqhp;
> > + struct tkt_q *tqp;
> > +
> > + /* If the queue is still being set up, wait for it. */
> > + while ((tqhp = tkt_q_find_head(lock)) == NULL)
> > + cpu_relax();
> > +
> > + for (;;) {
> > +
> > + /* Find the first queue element. */
> > + tqp = ACCESS_ONCE(tqhp->spin);
> > + if (tqp != NULL)
> > + break; /* Element exists, hand off lock. */
> > + if (tkt_q_try_unqueue(lock, tqhp))
> > + return; /* No element, successfully removed queue. */
> > + cpu_relax();
> > + }
> > + if (ACCESS_ONCE(tqhp->head_tkt) != -1)
> > + ACCESS_ONCE(tqhp->head_tkt) = -1;
> > + smp_mb(); /* Order pointer fetch and assignment against handoff. */
> > + ACCESS_ONCE(tqp->cpu) = -1;
> > +}
> > +EXPORT_SYMBOL(tkt_q_do_wake);
> > +
> > +/*
> > + * Given a lock that already has a queue associated with it, spin on
> > + * that queue. Return false if there was no queue (which means we do not
> > + * hold the lock) and true otherwise (meaning we -do- hold the lock).
> > + */
> > +bool tkt_q_do_spin(arch_spinlock_t *lock, struct __raw_tickets inc)
> > +{
> > + struct tkt_q **oldtail;
> > + struct tkt_q tq;
> > + struct tkt_q_head *tqhp;
> > +
> > + /*
> > + * Ensure that accesses to queue header happen after sensing
> > + * the lock's have-queue bit.
> > + */
> > + smp_mb(); /* See above block comment. */
> > +
> > + /* If there no longer is a queue, leave. */
> > + tqhp = tkt_q_find_head(lock);
> > + if (tqhp == NULL)
> > + return false;
> > +
> > + /* Initialize our queue element. */
> > + tq.cpu = raw_smp_processor_id();
> > + tq.tail = inc.tail;
> > + tq.next = NULL;
>
> I guess a mb() is needed here for between read tqhp->ref and read
> tqhp->head_tkt.
> you can move the above mb() to here.

Do we?

The only way to get into here is if you either set up the queue
yourself, or you saw the LSB set in head.

If you were the one to set it up yourself, then there's nothing to worry
about because you are also the one that set head_tkt.

If you didn't set up the queue, then someone else set the LSB in head,
which is done with a cmpxchg() which is also a full mb. This would make
head_tkt visible as well because it's set before cmpxchg is called.

Thus, to come into this function you must have seen head & 1 set, and
the smp_mb() above will also make head_tkt visible.

The only thing I can see now is that it might not find tqhp because ref
may not be set yet. If that's the case, then it will fall out back to
the main loop. But if it finds ref, then I don't see how it can't see
head_tkt up to date as well.

Maybe I'm missing something.
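
For readers following along, here is a minimal litmus-style sketch of the
ordering being argued above. This is an editorial illustration that reuses
the patch's types and primitives in kernel context; the function names
(writer/reader) are made up and it is not part of the patch.

/* Assumes kernel context: the patch's spinlock types, cmpxchg(), smp_mb(). */

/* CPU 0: queue setup, as in tkt_q_init_contend(). */
static void writer(struct tkt_q_head *tqhp, arch_spinlock_t *lock,
		   arch_spinlock_t asold, arch_spinlock_t asnew)
{
	tqhp->head_tkt = asold.tickets.head;		 /* A: record head ticket. */
	asnew.tickets.head |= 0x1;			 /* Set the "queued" LSB.  */
	(void)cmpxchg(&lock->head_tail,			 /* B: full barrier, so A  */
		      asold.head_tail, asnew.head_tail); /* is visible before LSB. */
}

/* CPU 1: a spinner entering tkt_q_do_spin(). */
static s64 reader(struct tkt_q_head *tqhp, arch_spinlock_t *lock)
{
	if (!(ACCESS_ONCE(lock->tickets.head) & 0x1))	/* C: saw the LSB set.    */
		return -1;
	smp_mb();	/* D: the barrier at the top of tkt_q_do_spin(), pairing with B. */
	return ACCESS_ONCE(tqhp->head_tkt);		/* E: therefore sees A.   */
}

The claim is that C observing the bit set by B, combined with the B/D
pairing, guarantees that E sees the store at A.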

-- Steve


>
> > +
> > + /* Check to see if we already hold the lock. */
> > + if (ACCESS_ONCE(tqhp->head_tkt) == inc.tail) {
> > + /* The last holder left before queue formed, we hold lock. */
> > + tqhp->head_tkt = -1;
> > + return true;
> > + }
> > +
> > + /*
> > + * Add our element to the tail of the queue. Note that if the
> > + * queue is empty, the ->spin_tail pointer will reference
> > + * the queue's head pointer, namely ->spin.
> > + */
> > + oldtail = xchg(&tqhp->spin_tail, &tq.next);
> > + ACCESS_ONCE(*oldtail) = &tq;
> > +
> > + /* Spin until handoff. */
> > + while (ACCESS_ONCE(tq.cpu) != -1)
> > + cpu_relax();
> > +
> > + /*
> > + * Remove our element from the queue. If the queue is now empty,
> > + * update carefully so that the next acquisition will enqueue itself
> > + * at the head of the list. Of course, the next enqueue operation
> > + * might be happening concurrently, and this code needs to handle all
> > + * of the possible combinations, keeping in mind that the enqueue
> > + * operation happens in two stages: (1) update the tail pointer and
> > + * (2) update the predecessor's ->next pointer. With this in mind,
> > + * the following code needs to deal with three scenarios:
> > + *
> > + * 1. tq is the last entry. In this case, we use cmpxchg to
> > + * point the list tail back to the list head (->spin). If
> > + * the cmpxchg fails, that indicates that we are instead
> > + * in scenario 2 below. If the cmpxchg succeeds, the next
> > + * enqueue operation's tail-pointer exchange will enqueue
> > + * the next element at the queue head, because the ->spin_tail
> > + * pointer now references the queue head.
> > + *
> > + * 2. tq is the last entry, and the next entry has updated the
> > + * tail pointer but has not yet updated tq.next. In this
> > + * case, tq.next is NULL, the cmpxchg will fail, and the
> > + * code will wait for the enqueue to complete before completing
> > + * removal of tq from the list.
> > + *
> > + * 3. tq is not the last pointer. In this case, tq.next is non-NULL,
> > + * so the following code simply removes tq from the list.
> > + */
> > + if (tq.next == NULL) {
> > +
> > + /* Mark the queue empty. */
> > + tqhp->spin = NULL;
> > +
> > + /* Try to point the tail back at the head. */
> > + if (cmpxchg(&tqhp->spin_tail,
> > + &tq.next,
> > + &tqhp->spin) == &tq.next)
> > + return true; /* Succeeded, queue is now empty. */
> > +
> > + /* Failed, if needed, wait for the enqueue to complete. */
> > + while (tq.next == NULL)
> > + cpu_relax();
> > +
> > + /* The following code will repair the head. */
> > + }
> > + smp_mb(); /* Force ordering between handoff and critical section. */
> > +
> > + /*
> > + * Advance list-head pointer. This same task will be the next to
> > + * access this when releasing the lock, so no need for a memory
> > + * barrier after the following assignment.
> > + */
> > + ACCESS_ONCE(tqhp->spin) = tq.next;
> > + return true;
> > +}
> > +
> > +/*
> > + * Given a lock that does not have a queue, attempt to associate the
> > + * i-th queue with it, returning true if successful (meaning we hold
> > + * the lock) or false otherwise (meaning we do -not- hold the lock).
> > + * Note that the caller has already filled in ->ref with 0x1, so we
> > + * own the queue.
> > + */
> > +static bool
> > +tkt_q_init_contend(int i, arch_spinlock_t *lock, struct __raw_tickets inc)
> > +{
> > + arch_spinlock_t asold;
> > + arch_spinlock_t asnew;
> > + struct tkt_q_head *tqhp;
> > +
> > + /* Initialize the i-th queue header. */
> > + tqhp = &tkt_q_heads[i];
> > + tqhp->spin = NULL;
> > + tqhp->spin_tail = &tqhp->spin;
> > +
> > + /* Each pass through this loop attempts to mark the lock as queued. */
> > + do {
> > + asold.head_tail = ACCESS_ONCE(lock->head_tail);
> > + asnew = asold;
> > + if (asnew.tickets.head & 0x1) {
> > +
> > + /* Someone beat us to it, back out. */
> > + smp_mb();
> > + ACCESS_ONCE(tqhp->ref) = NULL;
> > +
> > + /* Spin on the queue element they set up. */
> > + return tkt_q_do_spin(lock, inc);
> > + }
> > +
> > + /*
> > + * Record the head counter in case one of the spinning
> > + * CPUs already holds the lock but doesn't realize it yet.
> > + */
> > + tqhp->head_tkt = asold.tickets.head;
> > +
> > + /* The low-order bit in the head counter says "queued". */
> > + asnew.tickets.head |= 0x1;
> > + } while (cmpxchg(&lock->head_tail,
> > + asold.head_tail,
> > + asnew.head_tail) != asold.head_tail);
> > +
> > + /* Point the queue at the lock and go spin on it. */
> > + ACCESS_ONCE(tqhp->ref) = lock;
> > + return tkt_q_do_spin(lock, inc);
> > +}
> > +
> > +/*
> > + * Start handling a period of high contention by finding a queue to associate
> > + * with this lock. Returns true if successful (in which case we hold the
> > + * lock) and false otherwise (in which case we do -not- hold the lock).
> > + */
> > +bool tkt_q_start_contend(arch_spinlock_t *lock, struct __raw_tickets inc)
> > +{
> > + int i;
> > + int start;
> > +
> > + /* Hash the lock address to find a starting point. */
> > + start = i = tkt_q_hash(lock);
> > +
> > + /*
> > + * Each pass through the following loop attempts to associate
> > + * the lock with the corresponding queue.
> > + */
> > + do {
> > + /*
> > + * Use 0x1 to mark the queue in use, but also avoiding
> > + * any spinners trying to use it before we get it all
> > + * initialized.
> > + */
> > + if (tkt_q_heads[i].ref)
> > + continue;
> > + if (cmpxchg(&tkt_q_heads[i].ref,
> > + NULL,
> > + (arch_spinlock_t *)0x1) == NULL) {
> > +
> > + /* Succeeded, now go initialize it. */
> > + return tkt_q_init_contend(i, lock, inc);
> > + }
> > +
> > + /* If someone beat us to it, go spin on their queue. */
> > + if (ACCESS_ONCE(lock->tickets.head) & 0x1)
> > + return tkt_q_do_spin(lock, inc);
> > + } while ((i = tkt_q_next_slot(i)) != start);
> > +
> > + /* All the queues are in use, revert to spinning on the ticket lock. */
> > + return false;
> > +}
> > +
> > +bool tkt_spin_pass(arch_spinlock_t *ap, struct __raw_tickets inc)
> > +{
> > + if (unlikely(inc.head & 0x1)) {
> > +
> > + /* This lock has a queue, so go spin on the queue. */
> > + if (tkt_q_do_spin(ap, inc))
> > + return true;
> > +
> > + /* Get here if the queue is in transition: Retry next time. */
> > +
> > + } else if (inc.tail - TKT_Q_SWITCH == inc.head) {
> > +
> > + /*
> > + * This lock has lots of spinners, but no queue.
> > + * Go create a queue to spin on.
> > + */
> > + if (tkt_q_start_contend(ap, inc))
> > + return true;
> > +
> > + /* Get here if the queue is in transition: Retry next time. */
> > + }
> > +
> > + /* Either no need for a queue or the queue is in transition. Spin. */
> > + return false;
> > +}
> > +EXPORT_SYMBOL(tkt_spin_pass);
> >

2013-06-12 10:13:42

by Paul E. McKenney

[permalink] [raw]
Subject: Re: [PATCH RFC ticketlock] Auto-queued ticketlock

On Tue, Jun 11, 2013 at 09:58:08PM -0400, Steven Rostedt wrote:
> On Wed, 2013-06-12 at 09:19 +0800, Lai Jiangshan wrote:
>
> > > +
> > > +/*
> > > + * Hand the lock off to the first CPU on the queue.
> > > + */
> > > +void tkt_q_do_wake(arch_spinlock_t *lock)
> > > +{
> > > + struct tkt_q_head *tqhp;
> > > + struct tkt_q *tqp;
> > > +
> > > + /* If the queue is still being set up, wait for it. */
> > > + while ((tqhp = tkt_q_find_head(lock)) == NULL)
> > > + cpu_relax();
> > > +
> > > + for (;;) {
> > > +
> > > + /* Find the first queue element. */
> > > + tqp = ACCESS_ONCE(tqhp->spin);
> > > + if (tqp != NULL)
> > > + break; /* Element exists, hand off lock. */
> > > + if (tkt_q_try_unqueue(lock, tqhp))
> > > + return; /* No element, successfully removed queue. */
> > > + cpu_relax();
> > > + }
> > > + if (ACCESS_ONCE(tqhp->head_tkt) != -1)
> > > + ACCESS_ONCE(tqhp->head_tkt) = -1;
> > > + smp_mb(); /* Order pointer fetch and assignment against handoff. */
> > > + ACCESS_ONCE(tqp->cpu) = -1;
> > > +}
> > > +EXPORT_SYMBOL(tkt_q_do_wake);
> > > +
> > > +/*
> > > + * Given a lock that already has a queue associated with it, spin on
> > > + * that queue. Return false if there was no queue (which means we do not
> > > + * hold the lock) and true otherwise (meaning we -do- hold the lock).
> > > + */
> > > +bool tkt_q_do_spin(arch_spinlock_t *lock, struct __raw_tickets inc)
> > > +{
> > > + struct tkt_q **oldtail;
> > > + struct tkt_q tq;
> > > + struct tkt_q_head *tqhp;
> > > +
> > > + /*
> > > + * Ensure that accesses to queue header happen after sensing
> > > + * the lock's have-queue bit.
> > > + */
> > > + smp_mb(); /* See above block comment. */
> > > +
> > > + /* If there no longer is a queue, leave. */
> > > + tqhp = tkt_q_find_head(lock);
> > > + if (tqhp == NULL)
> > > + return false;
> > > +
> > > + /* Initialize our queue element. */
> > > + tq.cpu = raw_smp_processor_id();
> > > + tq.tail = inc.tail;
> > > + tq.next = NULL;
> >
> > I guess a mb() is needed here for between read tqhp->ref and read
> > tqhp->head_tkt.
> > you can move the above mb() to here.
>
> Do we?
>
> The only way to get into here is if you either set up the queue
> yourself, or you saw the LSB set in head.
>
> If you were the one to set it up yourself, then there's nothing to worry
> about because you are also the one that set head_tkt.
>
> If you didn't set up the queue, then someone else set the LSB in head,
> which is done with a cmpxchg() which is also a full mb. This would make
> head_tkt visible as well because it's set before cmpxchg is called.
>
> Thus, to come into this function you must have seen head & 1 set, and
> the smp_mb() above will also make head_tkt visible.
>
> The only thing I can see now is that it might not find tqhp because ref
> may not be set yet. If that's the case, then it will fall out back to
> the main loop. But if it finds ref, then I don't see how it can't see
> head_tkt up to date as well.
>
> Maybe I'm missing something.

Hmmm... I need to look at this more carefully. Lai might well be right
because if we are relying on the cmpxchg() for ordering, there needs
to be a memory barrier on the read side to pair with the cmpxchg().
You are of course quite correct in the case where the CPU reading the
->head_tkt is the one that set it up.

Something to think about at the gym, I guess. ;-)
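
To make the pairing question concrete, here is a small editorial sketch of
the general pattern under discussion; the names are illustrative and not
from the patch. A write-side cmpxchg() orders the publisher's stores, but a
reader can only rely on that if it has a barrier of its own between reading
the flag and reading the data.

static int payload;
static unsigned long flag;

static void publisher(void)
{
	payload = 42;				/* A: store the payload.              */
	(void)cmpxchg(&flag, 0UL, 1UL);		/* B: full barrier on the write side. */
}

static int unpaired_reader(void)
{
	if (!ACCESS_ONCE(flag))			/* C: flag observed as set...         */
		return -1;
	/* No barrier here: on a weakly ordered CPU, D may still miss A. */
	return ACCESS_ONCE(payload);		/* D */
}

static int paired_reader(void)
{
	if (!ACCESS_ONCE(flag))
		return -1;
	smp_mb();				/* Pairs with B...                    */
	return ACCESS_ONCE(payload);		/* ...so this read must see A.        */
}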

Thanx, Paul

> -- Steve
>
>
> >
> > > +
> > > + /* Check to see if we already hold the lock. */
> > > + if (ACCESS_ONCE(tqhp->head_tkt) == inc.tail) {
> > > + /* The last holder left before queue formed, we hold lock. */
> > > + tqhp->head_tkt = -1;
> > > + return true;
> > > + }
> > > +
> > > + /*
> > > + * Add our element to the tail of the queue. Note that if the
> > > + * queue is empty, the ->spin_tail pointer will reference
> > > + * the queue's head pointer, namely ->spin.
> > > + */
> > > + oldtail = xchg(&tqhp->spin_tail, &tq.next);
> > > + ACCESS_ONCE(*oldtail) = &tq;
> > > +
> > > + /* Spin until handoff. */
> > > + while (ACCESS_ONCE(tq.cpu) != -1)
> > > + cpu_relax();
> > > +
> > > + /*
> > > + * Remove our element from the queue. If the queue is now empty,
> > > + * update carefully so that the next acquisition will enqueue itself
> > > + * at the head of the list. Of course, the next enqueue operation
> > > + * might be happening concurrently, and this code needs to handle all
> > > + * of the possible combinations, keeping in mind that the enqueue
> > > + * operation happens in two stages: (1) update the tail pointer and
> > > + * (2) update the predecessor's ->next pointer. With this in mind,
> > > + * the following code needs to deal with three scenarios:
> > > + *
> > > + * 1. tq is the last entry. In this case, we use cmpxchg to
> > > + * point the list tail back to the list head (->spin). If
> > > + * the cmpxchg fails, that indicates that we are instead
> > > + * in scenario 2 below. If the cmpxchg succeeds, the next
> > > + * enqueue operation's tail-pointer exchange will enqueue
> > > + * the next element at the queue head, because the ->spin_tail
> > > + * pointer now references the queue head.
> > > + *
> > > + * 2. tq is the last entry, and the next entry has updated the
> > > + * tail pointer but has not yet updated tq.next. In this
> > > + * case, tq.next is NULL, the cmpxchg will fail, and the
> > > + * code will wait for the enqueue to complete before completing
> > > + * removal of tq from the list.
> > > + *
> > > + * 3. tq is not the last pointer. In this case, tq.next is non-NULL,
> > > + * so the following code simply removes tq from the list.
> > > + */
> > > + if (tq.next == NULL) {
> > > +
> > > + /* Mark the queue empty. */
> > > + tqhp->spin = NULL;
> > > +
> > > + /* Try to point the tail back at the head. */
> > > + if (cmpxchg(&tqhp->spin_tail,
> > > + &tq.next,
> > > + &tqhp->spin) == &tq.next)
> > > + return true; /* Succeeded, queue is now empty. */
> > > +
> > > + /* Failed, if needed, wait for the enqueue to complete. */
> > > + while (tq.next == NULL)
> > > + cpu_relax();
> > > +
> > > + /* The following code will repair the head. */
> > > + }
> > > + smp_mb(); /* Force ordering between handoff and critical section. */
> > > +
> > > + /*
> > > + * Advance list-head pointer. This same task will be the next to
> > > + * access this when releasing the lock, so no need for a memory
> > > + * barrier after the following assignment.
> > > + */
> > > + ACCESS_ONCE(tqhp->spin) = tq.next;
> > > + return true;
> > > +}
> > > +
> > > +/*
> > > + * Given a lock that does not have a queue, attempt to associate the
> > > + * i-th queue with it, returning true if successful (meaning we hold
> > > + * the lock) or false otherwise (meaning we do -not- hold the lock).
> > > + * Note that the caller has already filled in ->ref with 0x1, so we
> > > + * own the queue.
> > > + */
> > > +static bool
> > > +tkt_q_init_contend(int i, arch_spinlock_t *lock, struct __raw_tickets inc)
> > > +{
> > > + arch_spinlock_t asold;
> > > + arch_spinlock_t asnew;
> > > + struct tkt_q_head *tqhp;
> > > +
> > > + /* Initialize the i-th queue header. */
> > > + tqhp = &tkt_q_heads[i];
> > > + tqhp->spin = NULL;
> > > + tqhp->spin_tail = &tqhp->spin;
> > > +
> > > + /* Each pass through this loop attempts to mark the lock as queued. */
> > > + do {
> > > + asold.head_tail = ACCESS_ONCE(lock->head_tail);
> > > + asnew = asold;
> > > + if (asnew.tickets.head & 0x1) {
> > > +
> > > + /* Someone beat us to it, back out. */
> > > + smp_mb();
> > > + ACCESS_ONCE(tqhp->ref) = NULL;
> > > +
> > > + /* Spin on the queue element they set up. */
> > > + return tkt_q_do_spin(lock, inc);
> > > + }
> > > +
> > > + /*
> > > + * Record the head counter in case one of the spinning
> > > + * CPUs already holds the lock but doesn't realize it yet.
> > > + */
> > > + tqhp->head_tkt = asold.tickets.head;
> > > +
> > > + /* The low-order bit in the head counter says "queued". */
> > > + asnew.tickets.head |= 0x1;
> > > + } while (cmpxchg(&lock->head_tail,
> > > + asold.head_tail,
> > > + asnew.head_tail) != asold.head_tail);
> > > +
> > > + /* Point the queue at the lock and go spin on it. */
> > > + ACCESS_ONCE(tqhp->ref) = lock;
> > > + return tkt_q_do_spin(lock, inc);
> > > +}
> > > +
> > > +/*
> > > + * Start handling a period of high contention by finding a queue to associate
> > > + * with this lock. Returns true if successful (in which case we hold the
> > > + * lock) and false otherwise (in which case we do -not- hold the lock).
> > > + */
> > > +bool tkt_q_start_contend(arch_spinlock_t *lock, struct __raw_tickets inc)
> > > +{
> > > + int i;
> > > + int start;
> > > +
> > > + /* Hash the lock address to find a starting point. */
> > > + start = i = tkt_q_hash(lock);
> > > +
> > > + /*
> > > + * Each pass through the following loop attempts to associate
> > > + * the lock with the corresponding queue.
> > > + */
> > > + do {
> > > + /*
> > > + * Use 0x1 to mark the queue in use, but also avoiding
> > > + * any spinners trying to use it before we get it all
> > > + * initialized.
> > > + */
> > > + if (tkt_q_heads[i].ref)
> > > + continue;
> > > + if (cmpxchg(&tkt_q_heads[i].ref,
> > > + NULL,
> > > + (arch_spinlock_t *)0x1) == NULL) {
> > > +
> > > + /* Succeeded, now go initialize it. */
> > > + return tkt_q_init_contend(i, lock, inc);
> > > + }
> > > +
> > > + /* If someone beat us to it, go spin on their queue. */
> > > + if (ACCESS_ONCE(lock->tickets.head) & 0x1)
> > > + return tkt_q_do_spin(lock, inc);
> > > + } while ((i = tkt_q_next_slot(i)) != start);
> > > +
> > > + /* All the queues are in use, revert to spinning on the ticket lock. */
> > > + return false;
> > > +}
> > > +
> > > +bool tkt_spin_pass(arch_spinlock_t *ap, struct __raw_tickets inc)
> > > +{
> > > + if (unlikely(inc.head & 0x1)) {
> > > +
> > > + /* This lock has a queue, so go spin on the queue. */
> > > + if (tkt_q_do_spin(ap, inc))
> > > + return true;
> > > +
> > > + /* Get here if the queue is in transition: Retry next time. */
> > > +
> > > + } else if (inc.tail - TKT_Q_SWITCH == inc.head) {
> > > +
> > > + /*
> > > + * This lock has lots of spinners, but no queue.
> > > + * Go create a queue to spin on.
> > > + */
> > > + if (tkt_q_start_contend(ap, inc))
> > > + return true;
> > > +
> > > + /* Get here if the queue is in transition: Retry next time. */
> > > + }
> > > +
> > > + /* Either no need for a queue or the queue is in transition. Spin. */
> > > + return false;
> > > +}
> > > +EXPORT_SYMBOL(tkt_spin_pass);
> > >
>
>

2013-06-12 11:06:55

by Lai Jiangshan

[permalink] [raw]
Subject: Re: [PATCH RFC ticketlock] Auto-queued ticketlock

On Wed, Jun 12, 2013 at 9:58 AM, Steven Rostedt <[email protected]> wrote:
> On Wed, 2013-06-12 at 09:19 +0800, Lai Jiangshan wrote:
>
>> > +
>> > +/*
>> > + * Hand the lock off to the first CPU on the queue.
>> > + */
>> > +void tkt_q_do_wake(arch_spinlock_t *lock)
>> > +{
>> > + struct tkt_q_head *tqhp;
>> > + struct tkt_q *tqp;
>> > +
>> > + /* If the queue is still being set up, wait for it. */
>> > + while ((tqhp = tkt_q_find_head(lock)) == NULL)
>> > + cpu_relax();
>> > +
>> > + for (;;) {
>> > +
>> > + /* Find the first queue element. */
>> > + tqp = ACCESS_ONCE(tqhp->spin);
>> > + if (tqp != NULL)
>> > + break; /* Element exists, hand off lock. */
>> > + if (tkt_q_try_unqueue(lock, tqhp))
>> > + return; /* No element, successfully removed queue. */
>> > + cpu_relax();
>> > + }
>> > + if (ACCESS_ONCE(tqhp->head_tkt) != -1)
>> > + ACCESS_ONCE(tqhp->head_tkt) = -1;
>> > + smp_mb(); /* Order pointer fetch and assignment against handoff. */
>> > + ACCESS_ONCE(tqp->cpu) = -1;
>> > +}
>> > +EXPORT_SYMBOL(tkt_q_do_wake);
>> > +
>> > +/*
>> > + * Given a lock that already has a queue associated with it, spin on
>> > + * that queue. Return false if there was no queue (which means we do not
>> > + * hold the lock) and true otherwise (meaning we -do- hold the lock).
>> > + */
>> > +bool tkt_q_do_spin(arch_spinlock_t *lock, struct __raw_tickets inc)
>> > +{
>> > + struct tkt_q **oldtail;
>> > + struct tkt_q tq;
>> > + struct tkt_q_head *tqhp;
>> > +
>> > + /*
>> > + * Ensure that accesses to queue header happen after sensing
>> > + * the lock's have-queue bit.
>> > + */
>> > + smp_mb(); /* See above block comment. */
>> > +
>> > + /* If there no longer is a queue, leave. */
>> > + tqhp = tkt_q_find_head(lock);
>> > + if (tqhp == NULL)
>> > + return false;
>> > +
>> > + /* Initialize our queue element. */
>> > + tq.cpu = raw_smp_processor_id();
>> > + tq.tail = inc.tail;
>> > + tq.next = NULL;
>>
>> I guess a mb() is needed here for between read tqhp->ref and read
>> tqhp->head_tkt.
>> you can move the above mb() to here.
>
> Do we?
>
> The only way to get into here is if you either set up the queue
> yourself, or you saw the LSB set in head.
>
> If you were the one to set it up yourself, then there's nothing to worry
> about because you are also the one that set head_tkt.
>
> If you didn't set up the queue, then someone else set the LSB in head,
> which is done with a cmpxchg() which is also a full mb. This would make
> head_tkt visible as well because it's set before cmpxchg is called.
>
> Thus, to come into this function you must have seen head & 1 set, and
> the smp_mb() above will also make head_tkt visible.
>
> The only thing I can see now is that it might not find tqhp because ref
> may not be set yet. If that's the case, then it will fall out back to
> the main loop. But if it finds ref, then I don't see how it can't see
> head_tkt up to date as well.
>
> Maybe I'm missing something.

No, you are right.

Last night, while lying in bed, I was thinking about V1, and this
morning, without thinking it through more deeply, I wrongly assumed
that V2 had the same problem.

V2 has no such problem. Sorry for the noise.

Thanks,
Lai

>
> -- Steve
>
>
>>
>> > +
>> > + /* Check to see if we already hold the lock. */
>> > + if (ACCESS_ONCE(tqhp->head_tkt) == inc.tail) {
>> > + /* The last holder left before queue formed, we hold lock. */
>> > + tqhp->head_tkt = -1;
>> > + return true;
>> > + }
>> > +
>> > + /*
>> > + * Add our element to the tail of the queue. Note that if the
>> > + * queue is empty, the ->spin_tail pointer will reference
>> > + * the queue's head pointer, namely ->spin.
>> > + */
>> > + oldtail = xchg(&tqhp->spin_tail, &tq.next);
>> > + ACCESS_ONCE(*oldtail) = &tq;
>> > +
>> > + /* Spin until handoff. */
>> > + while (ACCESS_ONCE(tq.cpu) != -1)
>> > + cpu_relax();
>> > +
>> > + /*
>> > + * Remove our element from the queue. If the queue is now empty,
>> > + * update carefully so that the next acquisition will enqueue itself
>> > + * at the head of the list. Of course, the next enqueue operation
>> > + * might be happening concurrently, and this code needs to handle all
>> > + * of the possible combinations, keeping in mind that the enqueue
>> > + * operation happens in two stages: (1) update the tail pointer and
>> > + * (2) update the predecessor's ->next pointer. With this in mind,
>> > + * the following code needs to deal with three scenarios:
>> > + *
>> > + * 1. tq is the last entry. In this case, we use cmpxchg to
>> > + * point the list tail back to the list head (->spin). If
>> > + * the cmpxchg fails, that indicates that we are instead
>> > + * in scenario 2 below. If the cmpxchg succeeds, the next
>> > + * enqueue operation's tail-pointer exchange will enqueue
>> > + * the next element at the queue head, because the ->spin_tail
>> > + * pointer now references the queue head.
>> > + *
>> > + * 2. tq is the last entry, and the next entry has updated the
>> > + * tail pointer but has not yet updated tq.next. In this
>> > + * case, tq.next is NULL, the cmpxchg will fail, and the
>> > + * code will wait for the enqueue to complete before completing
>> > + * removal of tq from the list.
>> > + *
>> > + * 3. tq is not the last pointer. In this case, tq.next is non-NULL,
>> > + * so the following code simply removes tq from the list.
>> > + */
>> > + if (tq.next == NULL) {
>> > +
>> > + /* Mark the queue empty. */
>> > + tqhp->spin = NULL;
>> > +
>> > + /* Try to point the tail back at the head. */
>> > + if (cmpxchg(&tqhp->spin_tail,
>> > + &tq.next,
>> > + &tqhp->spin) == &tq.next)
>> > + return true; /* Succeeded, queue is now empty. */
>> > +
>> > + /* Failed, if needed, wait for the enqueue to complete. */
>> > + while (tq.next == NULL)
>> > + cpu_relax();
>> > +
>> > + /* The following code will repair the head. */
>> > + }
>> > + smp_mb(); /* Force ordering between handoff and critical section. */
>> > +
>> > + /*
>> > + * Advance list-head pointer. This same task will be the next to
>> > + * access this when releasing the lock, so no need for a memory
>> > + * barrier after the following assignment.
>> > + */
>> > + ACCESS_ONCE(tqhp->spin) = tq.next;
>> > + return true;
>> > +}
>> > +
>> > +/*
>> > + * Given a lock that does not have a queue, attempt to associate the
>> > + * i-th queue with it, returning true if successful (meaning we hold
>> > + * the lock) or false otherwise (meaning we do -not- hold the lock).
>> > + * Note that the caller has already filled in ->ref with 0x1, so we
>> > + * own the queue.
>> > + */
>> > +static bool
>> > +tkt_q_init_contend(int i, arch_spinlock_t *lock, struct __raw_tickets inc)
>> > +{
>> > + arch_spinlock_t asold;
>> > + arch_spinlock_t asnew;
>> > + struct tkt_q_head *tqhp;
>> > +
>> > + /* Initialize the i-th queue header. */
>> > + tqhp = &tkt_q_heads[i];
>> > + tqhp->spin = NULL;
>> > + tqhp->spin_tail = &tqhp->spin;
>> > +
>> > + /* Each pass through this loop attempts to mark the lock as queued. */
>> > + do {
>> > + asold.head_tail = ACCESS_ONCE(lock->head_tail);
>> > + asnew = asold;
>> > + if (asnew.tickets.head & 0x1) {
>> > +
>> > + /* Someone beat us to it, back out. */
>> > + smp_mb();
>> > + ACCESS_ONCE(tqhp->ref) = NULL;
>> > +
>> > + /* Spin on the queue element they set up. */
>> > + return tkt_q_do_spin(lock, inc);
>> > + }
>> > +
>> > + /*
>> > + * Record the head counter in case one of the spinning
>> > + * CPUs already holds the lock but doesn't realize it yet.
>> > + */
>> > + tqhp->head_tkt = asold.tickets.head;
>> > +
>> > + /* The low-order bit in the head counter says "queued". */
>> > + asnew.tickets.head |= 0x1;
>> > + } while (cmpxchg(&lock->head_tail,
>> > + asold.head_tail,
>> > + asnew.head_tail) != asold.head_tail);
>> > +
>> > + /* Point the queue at the lock and go spin on it. */
>> > + ACCESS_ONCE(tqhp->ref) = lock;
>> > + return tkt_q_do_spin(lock, inc);
>> > +}
>> > +
>> > +/*
>> > + * Start handling a period of high contention by finding a queue to associate
>> > + * with this lock. Returns true if successful (in which case we hold the
>> > + * lock) and false otherwise (in which case we do -not- hold the lock).
>> > + */
>> > +bool tkt_q_start_contend(arch_spinlock_t *lock, struct __raw_tickets inc)
>> > +{
>> > + int i;
>> > + int start;
>> > +
>> > + /* Hash the lock address to find a starting point. */
>> > + start = i = tkt_q_hash(lock);
>> > +
>> > + /*
>> > + * Each pass through the following loop attempts to associate
>> > + * the lock with the corresponding queue.
>> > + */
>> > + do {
>> > + /*
>> > + * Use 0x1 to mark the queue in use, but also avoiding
>> > + * any spinners trying to use it before we get it all
>> > + * initialized.
>> > + */
>> > + if (tkt_q_heads[i].ref)
>> > + continue;
>> > + if (cmpxchg(&tkt_q_heads[i].ref,
>> > + NULL,
>> > + (arch_spinlock_t *)0x1) == NULL) {
>> > +
>> > + /* Succeeded, now go initialize it. */
>> > + return tkt_q_init_contend(i, lock, inc);
>> > + }
>> > +
>> > + /* If someone beat us to it, go spin on their queue. */
>> > + if (ACCESS_ONCE(lock->tickets.head) & 0x1)
>> > + return tkt_q_do_spin(lock, inc);
>> > + } while ((i = tkt_q_next_slot(i)) != start);
>> > +
>> > + /* All the queues are in use, revert to spinning on the ticket lock. */
>> > + return false;
>> > +}
>> > +
>> > +bool tkt_spin_pass(arch_spinlock_t *ap, struct __raw_tickets inc)
>> > +{
>> > + if (unlikely(inc.head & 0x1)) {
>> > +
>> > + /* This lock has a queue, so go spin on the queue. */
>> > + if (tkt_q_do_spin(ap, inc))
>> > + return true;
>> > +
>> > + /* Get here if the queue is in transition: Retry next time. */
>> > +
>> > + } else if (inc.tail - TKT_Q_SWITCH == inc.head) {
>> > +
>> > + /*
>> > + * This lock has lots of spinners, but no queue.
>> > + * Go create a queue to spin on.
>> > + */
>> > + if (tkt_q_start_contend(ap, inc))
>> > + return true;
>> > +
>> > + /* Get here if the queue is in transition: Retry next time. */
>> > + }
>> > +
>> > + /* Either no need for a queue or the queue is in transition. Spin. */
>> > + return false;
>> > +}
>> > +EXPORT_SYMBOL(tkt_spin_pass);
>> >
>
>

2013-06-12 14:15:56

by Lai Jiangshan

[permalink] [raw]
Subject: Re: [PATCH RFC ticketlock] Auto-queued ticketlock

Hi, Paul

I have some question about smp_mb().(searching smp_mb() can find all
my question)

Thanks,
Lai

On Wed, Jun 12, 2013 at 3:49 AM, Paul E. McKenney
<[email protected]> wrote:
> On Tue, Jun 11, 2013 at 02:41:59PM -0400, Waiman Long wrote:
>> On 06/11/2013 12:36 PM, Paul E. McKenney wrote:
>> >
>> >>I am a bit concern about the size of the head queue table itself.
>> >>RHEL6, for example, had defined CONFIG_NR_CPUS to be 4096 which mean
>> >>a table size of 256. Maybe it is better to dynamically allocate the
>> >>table at init time depending on the actual number of CPUs in the
>> >>system.
>> >But if your kernel is built for 4096 CPUs, the 32*256=8192 bytes of memory
>> >is way down in the noise. Systems that care about that small an amount
>> >of memory probably have a small enough number of CPUs that they can just
>> >turn off queueing at build time using CONFIG_TICKET_LOCK_QUEUED=n, right?
>>
>> My concern is more about the latency on the table scan than the
>> actual memory that was used.
>>
>> >>>+/*
>> >>>+ * Return a pointer to the queue header associated with the specified lock,
>> >>>+ * or return NULL if there is no queue for the lock or if the lock's queue
>> >>>+ * is in transition.
>> >>>+ */
>> >>>+static struct tkt_q_head *tkt_q_find_head(arch_spinlock_t *asp)
>> >>>+{
>> >>>+ int i;
>> >>>+ int start;
>> >>>+
>> >>>+ start = i = tkt_q_hash(asp);
>> >>>+ do
>> >>>+ if (tkt_q_heads[i].ref == asp)
>> >>>+ return &tkt_q_heads[i];
>> >>>+ while ((i = tkt_q_next_slot(i)) != start);
>> >>>+ return NULL;
>> >>>+}
>> >>With a table size of 256 and you have to scan the whole table to
>> >>find the right head queue. This can be a significant overhead. I
>> >>will suggest setting a limiting of how many entries it scans before
>> >>it aborts rather than checking the whole table.
>> >But it will scan 256 entries only if there are 256 other locks in queued
>> >mode, which is -very- unlikely, even given 4096 CPUs. That said, if you
>> >show me that this results in a real latency problem on a real system,
>> >I would be happy to provide a way to limit the search.
>>
>> Looking at the code more carefully, the chance of actually scanning
>> 256 entries is very small. However, I now have some concern on the
>> way you set up the initial queue.
>>
>> +/*
>> + * Start handling a period of high contention by finding a queue to associate
>> + * with this lock. Returns true if successful (in which case we hold the
>> + * lock) and false otherwise (in which case we do -not- hold the lock).
>> + */
>> +bool tkt_q_start_contend(arch_spinlock_t *asp, struct __raw_tickets inc)
>> +{
>> + int i;
>> + int start;
>> +
>> + /* Hash the lock address to find a starting point. */
>> + start = i = tkt_q_hash(asp);
>> +
>> + /*
>> + * Each pass through the following loop attempts to associate
>> + * the lock with the corresponding queue.
>> + */
>> + do {
>> + /*
>> + * Use 0x1 to mark the queue in use, but also avoiding
>> + * any spinners trying to use it before we get it all
>> + * initialized.
>> + */
>> + if (cmpxchg(&tkt_q_heads[i].ref,
>> + NULL,
>> + (arch_spinlock_t *)0x1) == NULL) {
>> +
>> + /* Succeeded, now go initialize it. */
>> + return tkt_q_init_contend(i, asp, inc);
>> + }
>> +
>> + /* If someone beat us to it, go spin on their queue. */
>> + if (ACCESS_ONCE(asp->tickets.head) & 0x1)
>> + return tkt_q_do_spin(asp, inc);
>> + } while ((i = tkt_q_next_slot(i)) != start);
>> +
>> + /* All the queues are in use, revert to spinning on the ticket lock. */
>> + return false;
>> +}
>> +
>>
>> Unconditional cmpxchg() can be a source of high contention by
>> itself. Considering that 16 threads may be doing cmpxchg() more or
>> less simultaneously on the same cache line, it can cause a lot of
>> contention. It will be better if you check to see if tkt_q_heads[i]
>> is NULL first before doing cmpxchg.
>>
>> Another point is that the 16 threads may be setting up the queues in
>> consecutive slots in the head table. This is both a source of
>> contention and a waste of effort. One possible solution is to add
>> one more field (set to cpuid + 1, for example) to indicate that that
>> setup is being done with asp set to the target lock address
>> immediately. We will need to use cmpxchg128() for 64-bit machine,
>> though. Another solution is to have only that thread with ticket
>> number that is a fixed distance from head (e.g. 16*2) to do the
>> queue setup while the rest wait until the setup is done before
>> spinning on the queue.
>>
>> As my colleague Davidlohr had reported there are more regressions
>> than performance improvement in the AIM7 benchmark. I believe that
>> queue setup contention is likely a source of performance regression.
>
> Please see below for a v3 patch that:
>
> 1. Fixes cpu_relax().
>
> 2. Tests before doing cmpxchg().
>
> 3. Reduces the number of CPUs attempting to set up the queue,
> in the common case, to a single CPU. (Multiple CPUs can
> still be trying to set up the queue given unfortunate
> sequences of concurrent ticket-lock handoffs.)
>
> Please let me know how it goes!
>
> Thanx, Paul
>
> ------------------------------------------------------------------------
>
> ticketlock: Add queued-ticketlock capability
>
> Breaking up locks is better than implementing high-contention locks, but
> if we must have high-contention locks, why not make them automatically
> switch between light-weight ticket locks at low contention and queued
> locks at high contention? After all, this would remove the need for
> the developer to predict which locks will be highly contended.
>
> This commit allows ticket locks to automatically switch between pure
> ticketlock and queued-lock operation as needed. If too many CPUs are
> spinning on a given ticket lock, a queue structure will be allocated
> and the lock will switch to queued-lock operation. When the lock becomes
> free, it will switch back into ticketlock operation. The low-order bit
> of the head counter is used to indicate that the lock is in queued mode,
> which forces an unconditional mismatch between the head and tail counters.
> This approach means that the common-case code path under conditions of
> low contention is very nearly that of a plain ticket lock.
>
> A fixed number of queueing structures is statically allocated in an
> array. The ticket-lock address is used to hash into an initial element,
> but if that element is already in use, it moves to the next element. If
> the entire array is already in use, continue to spin in ticket mode.
>
> Signed-off-by: Paul E. McKenney <[email protected]>
> [ paulmck: Eliminate duplicate code and update comments (Steven Rostedt). ]
> [ paulmck: Address Eric Dumazet review feedback. ]
> [ paulmck: Use Lai Jiangshan idea to eliminate smp_mb(). ]
> [ paulmck: Expand ->head_tkt from s32 to s64 (Waiman Long). ]
> [ paulmck: Move cpu_relax() to main spin loop (Steven Rostedt). ]
> [ paulmck: Reduce queue-switch contention (Waiman Long). ]
>
> diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h
> index 33692ea..509c51a 100644
> --- a/arch/x86/include/asm/spinlock.h
> +++ b/arch/x86/include/asm/spinlock.h
> @@ -34,6 +34,21 @@
> # define UNLOCK_LOCK_PREFIX
> #endif
>
> +#ifdef CONFIG_TICKET_LOCK_QUEUED
> +
> +#define __TKT_SPIN_INC 2
> +bool tkt_spin_pass(arch_spinlock_t *ap, struct __raw_tickets inc);
> +
> +#else /* #ifdef CONFIG_TICKET_LOCK_QUEUED */
> +
> +#define __TKT_SPIN_INC 1
> +static inline bool tkt_spin_pass(arch_spinlock_t *ap, struct __raw_tickets inc)
> +{
> + return false;
> +}
> +
> +#endif /* #else #ifdef CONFIG_TICKET_LOCK_QUEUED */
> +
> /*
> * Ticket locks are conceptually two parts, one indicating the current head of
> * the queue, and the other indicating the current tail. The lock is acquired
> @@ -49,17 +64,16 @@
> */
> static __always_inline void __ticket_spin_lock(arch_spinlock_t *lock)
> {
> - register struct __raw_tickets inc = { .tail = 1 };
> + register struct __raw_tickets inc = { .tail = __TKT_SPIN_INC };
>
> inc = xadd(&lock->tickets, inc);
> -
> for (;;) {
> - if (inc.head == inc.tail)
> + if (inc.head == inc.tail || tkt_spin_pass(lock, inc))
> break;
> cpu_relax();
> inc.head = ACCESS_ONCE(lock->tickets.head);
> }
> - barrier(); /* make sure nothing creeps before the lock is taken */
> + barrier(); /* Make sure nothing creeps in before the lock is taken. */
> }
>
> static __always_inline int __ticket_spin_trylock(arch_spinlock_t *lock)
> @@ -70,17 +84,37 @@ static __always_inline int __ticket_spin_trylock(arch_spinlock_t *lock)
> if (old.tickets.head != old.tickets.tail)
> return 0;
>
> +#ifndef CONFIG_TICKET_LOCK_QUEUED
> new.head_tail = old.head_tail + (1 << TICKET_SHIFT);
> +#else /* #ifndef CONFIG_TICKET_LOCK_QUEUED */
> + new.head_tail = old.head_tail + (2 << TICKET_SHIFT);
> +#endif /* #else #ifndef CONFIG_TICKET_LOCK_QUEUED */
>
> /* cmpxchg is a full barrier, so nothing can move before it */
> return cmpxchg(&lock->head_tail, old.head_tail, new.head_tail) == old.head_tail;
> }
>
> +#ifndef CONFIG_TICKET_LOCK_QUEUED
> +
> static __always_inline void __ticket_spin_unlock(arch_spinlock_t *lock)
> {
> __add(&lock->tickets.head, 1, UNLOCK_LOCK_PREFIX);
> }
>
> +#else /* #ifndef CONFIG_TICKET_LOCK_QUEUED */
> +
> +extern void tkt_q_do_wake(arch_spinlock_t *lock);
> +
> +static __always_inline void __ticket_spin_unlock(arch_spinlock_t *lock)
> +{
> + __ticket_t head = 2;
> +
> + head = xadd(&lock->tickets.head, head);
> + if (head & 0x1)
> + tkt_q_do_wake(lock);
> +}
> +#endif /* #else #ifndef CONFIG_TICKET_LOCK_QUEUED */
> +
> static inline int __ticket_spin_is_locked(arch_spinlock_t *lock)
> {
> struct __raw_tickets tmp = ACCESS_ONCE(lock->tickets);
> diff --git a/arch/x86/include/asm/spinlock_types.h b/arch/x86/include/asm/spinlock_types.h
> index ad0ad07..cdaefdd 100644
> --- a/arch/x86/include/asm/spinlock_types.h
> +++ b/arch/x86/include/asm/spinlock_types.h
> @@ -7,12 +7,18 @@
>
> #include <linux/types.h>
>
> -#if (CONFIG_NR_CPUS < 256)
> +#if (CONFIG_NR_CPUS < 128)
> typedef u8 __ticket_t;
> typedef u16 __ticketpair_t;
> -#else
> +#define TICKET_T_CMP_GE(a, b) (UCHAR_MAX / 2 >= (unsigned char)((a) - (b)))
> +#elif (CONFIG_NR_CPUS < 32768)
> typedef u16 __ticket_t;
> typedef u32 __ticketpair_t;
> +#define TICKET_T_CMP_GE(a, b) (USHRT_MAX / 2 >= (unsigned short)((a) - (b)))
> +#else
> +typedef u32 __ticket_t;
> +typedef u64 __ticketpair_t;
> +#define TICKET_T_CMP_GE(a, b) (UINT_MAX / 2 >= (unsigned int)((a) - (b)))
> #endif
>
> #define TICKET_SHIFT (sizeof(__ticket_t) * 8)
> @@ -21,7 +27,11 @@ typedef struct arch_spinlock {
> union {
> __ticketpair_t head_tail;
> struct __raw_tickets {
> +#ifdef __BIG_ENDIAN__
> + __ticket_t tail, head;
> +#else /* #ifdef __BIG_ENDIAN__ */
> __ticket_t head, tail;
> +#endif /* #else #ifdef __BIG_ENDIAN__ */
> } tickets;
> };
> } arch_spinlock_t;
> diff --git a/include/linux/kernel.h b/include/linux/kernel.h
> index e9ef6d6..816a87c 100644
> --- a/include/linux/kernel.h
> +++ b/include/linux/kernel.h
> @@ -15,6 +15,7 @@
> #include <asm/byteorder.h>
> #include <uapi/linux/kernel.h>
>
> +#define UCHAR_MAX ((u8)(~0U))
> #define USHRT_MAX ((u16)(~0U))
> #define SHRT_MAX ((s16)(USHRT_MAX>>1))
> #define SHRT_MIN ((s16)(-SHRT_MAX - 1))
> diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
> index 44511d1..900c0f0 100644
> --- a/kernel/Kconfig.locks
> +++ b/kernel/Kconfig.locks
> @@ -223,3 +223,38 @@ endif
> config MUTEX_SPIN_ON_OWNER
> def_bool y
> depends on SMP && !DEBUG_MUTEXES
> +
> +config TICKET_LOCK_QUEUED
> + bool "Dynamically switch between ticket and queued locking"
> + depends on SMP
> + default n
> + ---help---
> + Enable dynamic switching between ticketlock and queued locking
> + on a per-lock basis. This option will slow down low-contention
> + acquisition and release very slightly (additional conditional
> + in release path), but will provide more efficient operation at
> + high levels of lock contention. High-contention operation will
> + not be quite as efficient as would be a pure queued lock, but
> + this dynamic approach consumes less memory than queued locks
> + and also runs faster at low levels of contention.
> +
> + Say "Y" if you are running on a large system with a workload
> + that is likely to result in high levels of contention.
> +
> + Say "N" if you are unsure.
> +
> +config TICKET_LOCK_QUEUED_SWITCH
> + int "When to switch from ticket to queued locking"
> + depends on TICKET_LOCK_QUEUED
> + default 8
> + range 3 32
> + ---help---
> + Specify how many tasks should be spinning on the lock before
> + switching to queued mode. Systems with low-latency memory/cache
> + interconnects will prefer larger numbers, while extreme low-latency
> + and real-time workloads will prefer a smaller number. Of course,
> + extreme real-time workloads would be even happier if contention
> + on the locks were reduced to the point that there was never any
> + need for queued locking in the first place.
> +
> + Take the default if you are unsure.
> diff --git a/kernel/Makefile b/kernel/Makefile
> index 271fd31..70a91f7 100644
> --- a/kernel/Makefile
> +++ b/kernel/Makefile
> @@ -51,6 +51,7 @@ endif
> obj-$(CONFIG_SMP) += spinlock.o
> obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
> obj-$(CONFIG_PROVE_LOCKING) += spinlock.o
> +obj-$(CONFIG_TICKET_LOCK_QUEUED) += tktqlock.o
> obj-$(CONFIG_UID16) += uid16.o
> obj-$(CONFIG_MODULES) += module.o
> obj-$(CONFIG_MODULE_SIG) += module_signing.o modsign_pubkey.o modsign_certificate.o
> diff --git a/kernel/tktqlock.c b/kernel/tktqlock.c
> new file mode 100644
> index 0000000..9f03af0
> --- /dev/null
> +++ b/kernel/tktqlock.c
> @@ -0,0 +1,369 @@
> +/*
> + * Queued ticket spinlocks.
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program; if not, write to the Free Software
> + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
> + *
> + * Copyright IBM Corporation, 2013
> + *
> + * Authors: Paul E. McKenney <[email protected]>
> + */
> +#include <linux/types.h>
> +#include <linux/kernel.h>
> +#include <linux/spinlock.h>
> +#include <linux/smp.h>
> +#include <linux/percpu.h>
> +
> +struct tkt_q {
> + int cpu;
> + __ticket_t tail;
> + struct tkt_q *next;
> +};
> +
> +struct tkt_q_head {
> + arch_spinlock_t *ref; /* Pointer to spinlock if in use. */
> + s64 head_tkt; /* Head ticket when started queuing. */
> + struct tkt_q *spin; /* Head of queue. */
> + struct tkt_q **spin_tail; /* Tail of queue. */
> +};
> +
> +/*
> + * TKT_Q_SWITCH is twice the number of CPUs that must be spinning on a
> + * given ticket lock to motivate switching to spinning on a queue.
> + * The reason that it is twice the number is because the bottom bit of
> + * the ticket is reserved for the bit that indicates that a queue is
> + * associated with the lock.
> + */
> +#define TKT_Q_SWITCH (CONFIG_TICKET_LOCK_QUEUED_SWITCH * 2)
> +
> +/*
> + * TKT_Q_NQUEUES is the number of queues to maintain. Large systems
> + * might have multiple highly contended locks, so provide more queues for
> + * systems with larger numbers of CPUs.
> + */
> +#define TKT_Q_NQUEUES (DIV_ROUND_UP(NR_CPUS + TKT_Q_SWITCH - 1, TKT_Q_SWITCH) * 2)
> +
> +/* The queues themselves. */
> +struct tkt_q_head tkt_q_heads[TKT_Q_NQUEUES];
> +
> +/* Advance to the next queue slot, wrapping around to the beginning. */
> +static int tkt_q_next_slot(int i)
> +{
> + return (++i < TKT_Q_NQUEUES) ? i : 0;
> +}
> +
> +/* Very crude hash from lock address to queue slot number. */
> +static unsigned long tkt_q_hash(arch_spinlock_t *lock)
> +{
> + return (((unsigned long)lock) >> 8) % TKT_Q_NQUEUES;
> +}
> +
> +/*
> + * Return a pointer to the queue header associated with the specified lock,
> + * or return NULL if there is no queue for the lock or if the lock's queue
> + * is in transition.
> + */
> +static struct tkt_q_head *tkt_q_find_head(arch_spinlock_t *lock)
> +{
> + int i;
> + int start;
> +
> + start = i = tkt_q_hash(lock);
> + do
> + if (tkt_q_heads[i].ref == lock)
> + return &tkt_q_heads[i];
> + while ((i = tkt_q_next_slot(i)) != start);
> + return NULL;
> +}
> +
> +/*
> + * Try to stop queuing, reverting back to normal ticket-lock operation.
> + * We can only stop queuing when the queue is empty, which means that
> + * we need to correctly handle races where someone shows up in the queue
> + * just as we are trying to dispense with the queue. They win, we lose.
> + */
> +static bool tkt_q_try_unqueue(arch_spinlock_t *lock, struct tkt_q_head *tqhp)
> +{
> + arch_spinlock_t asold;
> + arch_spinlock_t asnew;
> +
> + /* Pick up the ticket values. */
> + asold = ACCESS_ONCE(*lock);
> + if ((asold.tickets.head & ~0x1) == asold.tickets.tail) {
> +
> + /* Attempt to mark the lock as not having a queue. */
> + asnew = asold;
> + asnew.tickets.head &= ~0x1;
> + if (cmpxchg(&lock->head_tail,
> + asold.head_tail,
> + asnew.head_tail) == asold.head_tail) {
> +
> + /* Succeeded, mark the queue as unused. */
> + ACCESS_ONCE(tqhp->ref) = NULL;
> + return true;
> + }
> + }
> +
> + /* Failed, tell the caller there is still a queue to pass off to. */
> + return false;
> +}
> +
> +/*
> + * Hand the lock off to the first CPU on the queue.
> + */
> +void tkt_q_do_wake(arch_spinlock_t *lock)
> +{
> + struct tkt_q_head *tqhp;
> + struct tkt_q *tqp;
> +
> + /* If the queue is still being set up, wait for it. */
> + while ((tqhp = tkt_q_find_head(lock)) == NULL)
> + cpu_relax();
> +
> + for (;;) {
> +
> + /* Find the first queue element. */
> + tqp = ACCESS_ONCE(tqhp->spin);
> + if (tqp != NULL)
> + break; /* Element exists, hand off lock. */
> + if (tkt_q_try_unqueue(lock, tqhp))
> + return; /* No element, successfully removed queue. */
> + cpu_relax();
> + }
> + if (ACCESS_ONCE(tqhp->head_tkt) != -1)
> + ACCESS_ONCE(tqhp->head_tkt) = -1;
> + smp_mb(); /* Order pointer fetch and assignment against handoff. */

I am just trying to work out on which architectures, and in which cases,
it would be wrong to use barrier() here instead.

Is barrier() enough if I want to reduce the overhead on x86?
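
(For concreteness, the kind of change being asked about might look like the
hypothetical helper below. This is purely an illustration of the question,
not a claim that the relaxation is safe against tkt_q_do_wake()'s ordering
requirements; tkt_handoff_mb() does not exist in the patch.)

#ifdef CONFIG_X86
#define tkt_handoff_mb()	barrier()	/* Would lean on x86's TSO store ordering. */
#else
#define tkt_handoff_mb()	smp_mb()	/* Full barrier on weaker architectures.   */
#endif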

> + ACCESS_ONCE(tqp->cpu) = -1;
> +}
> +EXPORT_SYMBOL(tkt_q_do_wake);
> +
> +/*
> + * Given a lock that already has a queue associated with it, spin on
> + * that queue. Return false if there was no queue (which means we do not
> + * hold the lock) and true otherwise (meaning we -do- hold the lock).
> + */
> +bool tkt_q_do_spin(arch_spinlock_t *lock, struct __raw_tickets inc)
> +{
> + struct tkt_q **oldtail;
> + struct tkt_q tq;
> + struct tkt_q_head *tqhp;
> +
> + /*
> + * Ensure that accesses to queue header happen after sensing
> + * the lock's have-queue bit.
> + */
> + smp_mb(); /* See above block comment. */


Is barrier() enough if I want to reduce the overhead on x86?

> +
> + /* If there no longer is a queue, leave. */
> + tqhp = tkt_q_find_head(lock);
> + if (tqhp == NULL)
> + return false;
> +
> + /* Initialize our queue element. */
> + tq.cpu = raw_smp_processor_id();
> + tq.tail = inc.tail;
> + tq.next = NULL;
> +
> + /* Check to see if we already hold the lock. */
> + if (ACCESS_ONCE(tqhp->head_tkt) == inc.tail) {
> + /* The last holder left before queue formed, we hold lock. */
> + tqhp->head_tkt = -1;
> + return true;
> + }
> +
> + /*
> + * Add our element to the tail of the queue. Note that if the
> + * queue is empty, the ->spin_tail pointer will reference
> + * the queue's head pointer, namely ->spin.
> + */
> + oldtail = xchg(&tqhp->spin_tail, &tq.next);
> + ACCESS_ONCE(*oldtail) = &tq;
> +
> + /* Spin until handoff. */
> + while (ACCESS_ONCE(tq.cpu) != -1)
> + cpu_relax();
> +
> + /*
> + * Remove our element from the queue. If the queue is now empty,
> + * update carefully so that the next acquisition will enqueue itself
> + * at the head of the list. Of course, the next enqueue operation
> + * might be happening concurrently, and this code needs to handle all
> + * of the possible combinations, keeping in mind that the enqueue
> + * operation happens in two stages: (1) update the tail pointer and
> + * (2) update the predecessor's ->next pointer. With this in mind,
> + * the following code needs to deal with three scenarios:
> + *
> + * 1. tq is the last entry. In this case, we use cmpxchg to
> + * point the list tail back to the list head (->spin). If
> + * the cmpxchg fails, that indicates that we are instead
> + * in scenario 2 below. If the cmpxchg succeeds, the next
> + * enqueue operation's tail-pointer exchange will enqueue
> + * the next element at the queue head, because the ->spin_tail
> + * pointer now references the queue head.
> + *
> + * 2. tq is the last entry, and the next entry has updated the
> + * tail pointer but has not yet updated tq.next. In this
> + * case, tq.next is NULL, the cmpxchg will fail, and the
> + * code will wait for the enqueue to complete before completing
> + * removal of tq from the list.
> + *
> + * 3. tq is not the last pointer. In this case, tq.next is non-NULL,
> + * so the following code simply removes tq from the list.
> + */
> + if (tq.next == NULL) {
> +
> + /* Mark the queue empty. */
> + tqhp->spin = NULL;
> +
> + /* Try to point the tail back at the head. */
> + if (cmpxchg(&tqhp->spin_tail,
> + &tq.next,
> + &tqhp->spin) == &tq.next)
> + return true; /* Succeeded, queue is now empty. */
> +
> + /* Failed, if needed, wait for the enqueue to complete. */
> + while (tq.next == NULL)
> + cpu_relax();
> +
> + /* The following code will repair the head. */
> + }
> + smp_mb(); /* Force ordering between handoff and critical section. */


Is barrier() enough if I want to reduce the overhead on x86?

> +
> + /*
> + * Advance list-head pointer. This same task will be the next to
> + * access this when releasing the lock, so no need for a memory
> + * barrier after the following assignment.
> + */
> + ACCESS_ONCE(tqhp->spin) = tq.next;
> + return true;
> +}
> +
> +/*
> + * Given a lock that does not have a queue, attempt to associate the
> + * i-th queue with it, returning true if successful (meaning we hold
> + * the lock) or false otherwise (meaning we do -not- hold the lock).
> + * Note that the caller has already filled in ->ref with 0x1, so we
> + * own the queue.
> + */
> +static bool
> +tkt_q_init_contend(int i, arch_spinlock_t *lock, struct __raw_tickets inc)
> +{
> + arch_spinlock_t asold;
> + arch_spinlock_t asnew;
> + struct tkt_q_head *tqhp;
> +
> + /* Initialize the i-th queue header. */
> + tqhp = &tkt_q_heads[i];
> + tqhp->spin = NULL;
> + tqhp->spin_tail = &tqhp->spin;
> +
> + /* Each pass through this loop attempts to mark the lock as queued. */
> + do {
> + asold.head_tail = ACCESS_ONCE(lock->head_tail);
> + asnew = asold;
> + if (asnew.tickets.head & 0x1) {
> +
> + /* Someone beat us to it, back out. */
> + smp_mb();

Is the intention here to keep the above modifications of tqhp from
corrupting the next owner of tqhp?
Is smp_wmb() enough?

> + ACCESS_ONCE(tqhp->ref) = NULL;
> +
> + /* Spin on the queue element they set up. */
> + return tkt_q_do_spin(lock, inc);
> + }
> +
> + /*
> + * Record the head counter in case one of the spinning
> + * CPUs already holds the lock but doesn't realize it yet.
> + */
> + tqhp->head_tkt = asold.tickets.head;
> +
> + /* The low-order bit in the head counter says "queued". */
> + asnew.tickets.head |= 0x1;
> + } while (cmpxchg(&lock->head_tail,
> + asold.head_tail,
> + asnew.head_tail) != asold.head_tail);
> +
> + /* Point the queue at the lock and go spin on it. */
> + ACCESS_ONCE(tqhp->ref) = lock;
> + return tkt_q_do_spin(lock, inc);
> +}
> +
> +/*
> + * Start handling a period of high contention by finding a queue to associate
> + * with this lock. Returns true if successful (in which case we hold the
> + * lock) and false otherwise (in which case we do -not- hold the lock).
> + */
> +bool tkt_q_start_contend(arch_spinlock_t *lock, struct __raw_tickets inc)
> +{
> + int i;
> + int start;
> +
> + /* Hash the lock address to find a starting point. */
> + start = i = tkt_q_hash(lock);
> +
> + /*
> + * Each pass through the following loop attempts to associate
> + * the lock with the corresponding queue.
> + */
> + do {
> + /*
> + * Use 0x1 to mark the queue in use, but also avoiding
> + * any spinners trying to use it before we get it all
> + * initialized.
> + */
> + if (tkt_q_heads[i].ref)
> + continue;
> + if (cmpxchg(&tkt_q_heads[i].ref,
> + NULL,
> + (arch_spinlock_t *)0x1) == NULL) {
> +
> + /* Succeeded, now go initialize it. */
> + return tkt_q_init_contend(i, lock, inc);
> + }
> +
> + /* If someone beat us to it, go spin on their queue. */
> + if (ACCESS_ONCE(lock->tickets.head) & 0x1)
> + return tkt_q_do_spin(lock, inc);
> + } while ((i = tkt_q_next_slot(i)) != start);
> +
> + /* All the queues are in use, revert to spinning on the ticket lock. */
> + return false;
> +}
> +
> +bool tkt_spin_pass(arch_spinlock_t *ap, struct __raw_tickets inc)
> +{
> + if (unlikely(inc.head & 0x1)) {
> +
> + /* This lock has a queue, so go spin on the queue. */
> + if (tkt_q_do_spin(ap, inc))
> + return true;
> +
> + /* Get here if the queue is in transition: Retry next time. */
> +
> + } else if (inc.tail - TKT_Q_SWITCH == inc.head) {
> +
> + /*
> + * This lock has lots of spinners, but no queue.
> + * Go create a queue to spin on.
> + */
> + if (tkt_q_start_contend(ap, inc))
> + return true;
> +
> + /* Get here if the queue is in transition: Retry next time. */
> + }
> +
> + /* Either no need for a queue or the queue is in transition. Spin. */
> + return false;
> +}
> +EXPORT_SYMBOL(tkt_spin_pass);
>

2013-06-12 14:21:16

by Paul E. McKenney

[permalink] [raw]
Subject: Re: [PATCH RFC ticketlock] Auto-queued ticketlock

On Wed, Jun 12, 2013 at 07:06:53PM +0800, Lai Jiangshan wrote:
> On Wed, Jun 12, 2013 at 9:58 AM, Steven Rostedt <[email protected]> wrote:
> > On Wed, 2013-06-12 at 09:19 +0800, Lai Jiangshan wrote:
> >
> >> > +
> >> > +/*
> >> > + * Hand the lock off to the first CPU on the queue.
> >> > + */
> >> > +void tkt_q_do_wake(arch_spinlock_t *lock)
> >> > +{
> >> > + struct tkt_q_head *tqhp;
> >> > + struct tkt_q *tqp;
> >> > +
> >> > + /* If the queue is still being set up, wait for it. */
> >> > + while ((tqhp = tkt_q_find_head(lock)) == NULL)
> >> > + cpu_relax();
> >> > +
> >> > + for (;;) {
> >> > +
> >> > + /* Find the first queue element. */
> >> > + tqp = ACCESS_ONCE(tqhp->spin);
> >> > + if (tqp != NULL)
> >> > + break; /* Element exists, hand off lock. */
> >> > + if (tkt_q_try_unqueue(lock, tqhp))
> >> > + return; /* No element, successfully removed queue. */
> >> > + cpu_relax();
> >> > + }
> >> > + if (ACCESS_ONCE(tqhp->head_tkt) != -1)
> >> > + ACCESS_ONCE(tqhp->head_tkt) = -1;
> >> > + smp_mb(); /* Order pointer fetch and assignment against handoff. */
> >> > + ACCESS_ONCE(tqp->cpu) = -1;
> >> > +}
> >> > +EXPORT_SYMBOL(tkt_q_do_wake);
> >> > +
> >> > +/*
> >> > + * Given a lock that already has a queue associated with it, spin on
> >> > + * that queue. Return false if there was no queue (which means we do not
> >> > + * hold the lock) and true otherwise (meaning we -do- hold the lock).
> >> > + */
> >> > +bool tkt_q_do_spin(arch_spinlock_t *lock, struct __raw_tickets inc)
> >> > +{
> >> > + struct tkt_q **oldtail;
> >> > + struct tkt_q tq;
> >> > + struct tkt_q_head *tqhp;
> >> > +
> >> > + /*
> >> > + * Ensure that accesses to queue header happen after sensing
> >> > + * the lock's have-queue bit.
> >> > + */
> >> > + smp_mb(); /* See above block comment. */
> >> > +
> >> > + /* If there no longer is a queue, leave. */
> >> > + tqhp = tkt_q_find_head(lock);
> >> > + if (tqhp == NULL)
> >> > + return false;
> >> > +
> >> > + /* Initialize our queue element. */
> >> > + tq.cpu = raw_smp_processor_id();
> >> > + tq.tail = inc.tail;
> >> > + tq.next = NULL;
> >>
> >> I guess an mb() is needed here, between the read of tqhp->ref and the
> >> read of tqhp->head_tkt.
> >> You could move the above mb() to here.
> >
> > Do we?
> >
> > The only way to get into here is if you either set up the queue
> > yourself, or you saw the LSB set in head.
> >
> > If you were the one to set it up yourself, then there's nothing to worry
> > about because you are also the one that set head_tkt.
> >
> > If you didn't set up the queue, then someone else set the LSB in head,
> > which is done with a cmpxchg() which is also a full mb. This would make
> > head_tkt visible as well because it's set before cmpxchg is called.
> >
> > Thus, to come into this function you must have seen head & 1 set, and
> > the smp_mb() above will also make head_tkt visible.

Agreed, after looking again.
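
To make that ordering argument concrete, here is a rough sketch (not part
of the patch) of the two code paths involved, using the v2 names:

	/* CPU A -- tkt_q_init_contend(): */
	tqhp->head_tkt = asold.tickets.head;
	asnew.tickets.head |= 0x1;
	cmpxchg(&lock->head_tail, asold.head_tail, asnew.head_tail);
		/* Full barrier: ->head_tkt is visible before the queued bit. */

	/* CPU B -- arriving spinner: */
	inc.head = ACCESS_ONCE(lock->tickets.head);	/* Sees the queued bit. */
	smp_mb();		/* At the top of tkt_q_do_spin(). */
	tqhp = tkt_q_find_head(lock);
	... ACCESS_ONCE(tqhp->head_tkt) ...	/* Cannot be stale. */

Because CPU A's cmpxchg() orders the ->head_tkt store before the queued
bit becomes visible, and CPU B's smp_mb() orders its read of the queued
bit before its accesses to the queue header, CPU B is guaranteed to see
CPU A's value of ->head_tkt.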

> > The only thing I can see now is that it might not find tqhp because ref
> > may not be set yet. If that's the case, then it will fall out back to
> > the main loop. But if it finds ref, then I don't see how it can't see
> > head_tkt up to date as well.
> >
> > Maybe I'm missing something.
>
> No, you are right.
>
> When I was lying in bed last night I was thinking about v1, and this
> morning I wrongly assumed, without thinking it through, that v2 had the
> same problem.
>
> V2 has no such problem. Sorry for the noise.

Not a problem -- you did cause me to spot a missing ACCESS_ONCE() in
tkt_q_find_head(), which I have now added. I also added a comment
to tkt_q_do_wake() noting that the caller's xadd() provides the needed
memory ordering.
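
For reference, here is how tkt_q_find_head() looks with that ACCESS_ONCE()
added (this is the form that appears in the v3 patch below):

	static struct tkt_q_head *tkt_q_find_head(arch_spinlock_t *lock)
	{
		int i;
		int start;

		start = i = tkt_q_hash(lock);
		do
			if (ACCESS_ONCE(tkt_q_heads[i].ref) == lock)
				return &tkt_q_heads[i];
		while ((i = tkt_q_next_slot(i)) != start);
		return NULL;
	}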

Thank you both for looking this over!

Thanx, Paul

> Thanks,
> Lai
>
> >
> > -- Steve
> >
> >
> >>
> >> > +
> >> > + /* Check to see if we already hold the lock. */
> >> > + if (ACCESS_ONCE(tqhp->head_tkt) == inc.tail) {
> >> > + /* The last holder left before queue formed, we hold lock. */
> >> > + tqhp->head_tkt = -1;
> >> > + return true;
> >> > + }
> >> > +
> >> > + /*
> >> > + * Add our element to the tail of the queue. Note that if the
> >> > + * queue is empty, the ->spin_tail pointer will reference
> >> > + * the queue's head pointer, namely ->spin.
> >> > + */
> >> > + oldtail = xchg(&tqhp->spin_tail, &tq.next);
> >> > + ACCESS_ONCE(*oldtail) = &tq;
> >> > +
> >> > + /* Spin until handoff. */
> >> > + while (ACCESS_ONCE(tq.cpu) != -1)
> >> > + cpu_relax();
> >> > +
> >> > + /*
> >> > + * Remove our element from the queue. If the queue is now empty,
> >> > + * update carefully so that the next acquisition will enqueue itself
> >> > + * at the head of the list. Of course, the next enqueue operation
> >> > + * might be happening concurrently, and this code needs to handle all
> >> > + * of the possible combinations, keeping in mind that the enqueue
> >> > + * operation happens in two stages: (1) update the tail pointer and
> >> > + * (2) update the predecessor's ->next pointer. With this in mind,
> >> > + * the following code needs to deal with three scenarios:
> >> > + *
> >> > + * 1. tq is the last entry. In this case, we use cmpxchg to
> >> > + * point the list tail back to the list head (->spin). If
> >> > + * the cmpxchg fails, that indicates that we are instead
> >> > + * in scenario 2 below. If the cmpxchg succeeds, the next
> >> > + * enqueue operation's tail-pointer exchange will enqueue
> >> > + * the next element at the queue head, because the ->spin_tail
> >> > + * pointer now references the queue head.
> >> > + *
> >> > + * 2. tq is the last entry, and the next entry has updated the
> >> > + * tail pointer but has not yet updated tq.next. In this
> >> > + * case, tq.next is NULL, the cmpxchg will fail, and the
> >> > + * code will wait for the enqueue to complete before completing
> >> > + * removal of tq from the list.
> >> > + *
> >> > + * 3. tq is not the last pointer. In this case, tq.next is non-NULL,
> >> > + * so the following code simply removes tq from the list.
> >> > + */
> >> > + if (tq.next == NULL) {
> >> > +
> >> > + /* Mark the queue empty. */
> >> > + tqhp->spin = NULL;
> >> > +
> >> > + /* Try to point the tail back at the head. */
> >> > + if (cmpxchg(&tqhp->spin_tail,
> >> > + &tq.next,
> >> > + &tqhp->spin) == &tq.next)
> >> > + return true; /* Succeeded, queue is now empty. */
> >> > +
> >> > + /* Failed, if needed, wait for the enqueue to complete. */
> >> > + while (tq.next == NULL)
> >> > + cpu_relax();
> >> > +
> >> > + /* The following code will repair the head. */
> >> > + }
> >> > + smp_mb(); /* Force ordering between handoff and critical section. */
> >> > +
> >> > + /*
> >> > + * Advance list-head pointer. This same task will be the next to
> >> > + * access this when releasing the lock, so no need for a memory
> >> > + * barrier after the following assignment.
> >> > + */
> >> > + ACCESS_ONCE(tqhp->spin) = tq.next;
> >> > + return true;
> >> > +}
> >> > +
> >> > +/*
> >> > + * Given a lock that does not have a queue, attempt to associate the
> >> > + * i-th queue with it, returning true if successful (meaning we hold
> >> > + * the lock) or false otherwise (meaning we do -not- hold the lock).
> >> > + * Note that the caller has already filled in ->ref with 0x1, so we
> >> > + * own the queue.
> >> > + */
> >> > +static bool
> >> > +tkt_q_init_contend(int i, arch_spinlock_t *lock, struct __raw_tickets inc)
> >> > +{
> >> > + arch_spinlock_t asold;
> >> > + arch_spinlock_t asnew;
> >> > + struct tkt_q_head *tqhp;
> >> > +
> >> > + /* Initialize the i-th queue header. */
> >> > + tqhp = &tkt_q_heads[i];
> >> > + tqhp->spin = NULL;
> >> > + tqhp->spin_tail = &tqhp->spin;
> >> > +
> >> > + /* Each pass through this loop attempts to mark the lock as queued. */
> >> > + do {
> >> > + asold.head_tail = ACCESS_ONCE(lock->head_tail);
> >> > + asnew = asold;
> >> > + if (asnew.tickets.head & 0x1) {
> >> > +
> >> > + /* Someone beat us to it, back out. */
> >> > + smp_mb();
> >> > + ACCESS_ONCE(tqhp->ref) = NULL;
> >> > +
> >> > + /* Spin on the queue element they set up. */
> >> > + return tkt_q_do_spin(lock, inc);
> >> > + }
> >> > +
> >> > + /*
> >> > + * Record the head counter in case one of the spinning
> >> > + * CPUs already holds the lock but doesn't realize it yet.
> >> > + */
> >> > + tqhp->head_tkt = asold.tickets.head;
> >> > +
> >> > + /* The low-order bit in the head counter says "queued". */
> >> > + asnew.tickets.head |= 0x1;
> >> > + } while (cmpxchg(&lock->head_tail,
> >> > + asold.head_tail,
> >> > + asnew.head_tail) != asold.head_tail);
> >> > +
> >> > + /* Point the queue at the lock and go spin on it. */
> >> > + ACCESS_ONCE(tqhp->ref) = lock;
> >> > + return tkt_q_do_spin(lock, inc);
> >> > +}
> >> > +
> >> > +/*
> >> > + * Start handling a period of high contention by finding a queue to associate
> >> > + * with this lock. Returns true if successful (in which case we hold the
> >> > + * lock) and false otherwise (in which case we do -not- hold the lock).
> >> > + */
> >> > +bool tkt_q_start_contend(arch_spinlock_t *lock, struct __raw_tickets inc)
> >> > +{
> >> > + int i;
> >> > + int start;
> >> > +
> >> > + /* Hash the lock address to find a starting point. */
> >> > + start = i = tkt_q_hash(lock);
> >> > +
> >> > + /*
> >> > + * Each pass through the following loop attempts to associate
> >> > + * the lock with the corresponding queue.
> >> > + */
> >> > + do {
> >> > + /*
> >> > + * Use 0x1 to mark the queue in use, but also avoiding
> >> > + * any spinners trying to use it before we get it all
> >> > + * initialized.
> >> > + */
> >> > + if (tkt_q_heads[i].ref)
> >> > + continue;
> >> > + if (cmpxchg(&tkt_q_heads[i].ref,
> >> > + NULL,
> >> > + (arch_spinlock_t *)0x1) == NULL) {
> >> > +
> >> > + /* Succeeded, now go initialize it. */
> >> > + return tkt_q_init_contend(i, lock, inc);
> >> > + }
> >> > +
> >> > + /* If someone beat us to it, go spin on their queue. */
> >> > + if (ACCESS_ONCE(lock->tickets.head) & 0x1)
> >> > + return tkt_q_do_spin(lock, inc);
> >> > + } while ((i = tkt_q_next_slot(i)) != start);
> >> > +
> >> > + /* All the queues are in use, revert to spinning on the ticket lock. */
> >> > + return false;
> >> > +}
> >> > +
> >> > +bool tkt_spin_pass(arch_spinlock_t *ap, struct __raw_tickets inc)
> >> > +{
> >> > + if (unlikely(inc.head & 0x1)) {
> >> > +
> >> > + /* This lock has a queue, so go spin on the queue. */
> >> > + if (tkt_q_do_spin(ap, inc))
> >> > + return true;
> >> > +
> >> > + /* Get here if the queue is in transition: Retry next time. */
> >> > +
> >> > + } else if (inc.tail - TKT_Q_SWITCH == inc.head) {
> >> > +
> >> > + /*
> >> > + * This lock has lots of spinners, but no queue.
> >> > + * Go create a queue to spin on.
> >> > + */
> >> > + if (tkt_q_start_contend(ap, inc))
> >> > + return true;
> >> > +
> >> > + /* Get here if the queue is in transition: Retry next time. */
> >> > + }
> >> > +
> >> > + /* Either no need for a queue or the queue is in transition. Spin. */
> >> > + return false;
> >> > +}
> >> > +EXPORT_SYMBOL(tkt_spin_pass);
> >> >

2013-06-12 14:49:18

by Paul E. McKenney

[permalink] [raw]
Subject: Re: [PATCH RFC ticketlock] Auto-queued ticketlock

On Wed, Jun 12, 2013 at 10:15:49PM +0800, Lai Jiangshan wrote:
> Hi, Paul
>
> I have some questions about smp_mb(). (Searching for smp_mb() will find
> all of my questions.)
>
> Thanks,
> Lai
>
> On Wed, Jun 12, 2013 at 3:49 AM, Paul E. McKenney
> <[email protected]> wrote:
> > On Tue, Jun 11, 2013 at 02:41:59PM -0400, Waiman Long wrote:
> >> On 06/11/2013 12:36 PM, Paul E. McKenney wrote:
> >> >
> >> >>I am a bit concerned about the size of the head queue table itself.
> >> >>RHEL6, for example, defined CONFIG_NR_CPUS to be 4096, which means
> >> >>a table size of 256. Maybe it is better to dynamically allocate the
> >> >>table at init time depending on the actual number of CPUs in the
> >> >>system.
> >> >But if your kernel is built for 4096 CPUs, the 32*256=8192 bytes of memory
> >> >is way down in the noise. Systems that care about that small an amount
> >> >of memory probably have a small enough number of CPUs that they can just
> >> >turn off queueing at build time using CONFIG_TICKET_LOCK_QUEUED=n, right?
> >>
> >> My concern is more about the latency on the table scan than the
> >> actual memory that was used.
> >>
> >> >>>+/*
> >> >>>+ * Return a pointer to the queue header associated with the specified lock,
> >> >>>+ * or return NULL if there is no queue for the lock or if the lock's queue
> >> >>>+ * is in transition.
> >> >>>+ */
> >> >>>+static struct tkt_q_head *tkt_q_find_head(arch_spinlock_t *asp)
> >> >>>+{
> >> >>>+ int i;
> >> >>>+ int start;
> >> >>>+
> >> >>>+ start = i = tkt_q_hash(asp);
> >> >>>+ do
> >> >>>+ if (tkt_q_heads[i].ref == asp)
> >> >>>+ return&tkt_q_heads[i];
> >> >>>+ while ((i = tkt_q_next_slot(i)) != start);
> >> >>>+ return NULL;
> >> >>>+}
> >> >>With a table size of 256, you may have to scan the whole table to
> >> >>find the right head queue. This can be a significant overhead. I
> >> >>would suggest setting a limit on how many entries it scans before
> >> >>it aborts rather than checking the whole table.
> >> >But it will scan 256 entries only if there are 256 other locks in queued
> >> >mode, which is -very- unlikely, even given 4096 CPUs. That said, if you
> >> >show me that this results in a real latency problem on a real system,
> >> >I would be happy to provide a way to limit the search.
> >>
> >> Looking at the code more carefully, the chance of actually scanning
> >> 256 entries is very small. However, I now have some concern about the
> >> way you set up the initial queue.
> >>
> >> +/*
> >> + * Start handling a period of high contention by finding a queue to associate
> >> + * with this lock. Returns true if successful (in which case we hold the
> >> + * lock) and false otherwise (in which case we do -not- hold the lock).
> >> + */
> >> +bool tkt_q_start_contend(arch_spinlock_t *asp, struct __raw_tickets inc)
> >> +{
> >> + int i;
> >> + int start;
> >> +
> >> + /* Hash the lock address to find a starting point. */
> >> + start = i = tkt_q_hash(asp);
> >> +
> >> + /*
> >> + * Each pass through the following loop attempts to associate
> >> + * the lock with the corresponding queue.
> >> + */
> >> + do {
> >> + /*
> >> + * Use 0x1 to mark the queue in use, but also avoiding
> >> + * any spinners trying to use it before we get it all
> >> + * initialized.
> >> + */
> >> + if (cmpxchg(&tkt_q_heads[i].ref,
> >> + NULL,
> >> + (arch_spinlock_t *)0x1) == NULL) {
> >> +
> >> + /* Succeeded, now go initialize it. */
> >> + return tkt_q_init_contend(i, asp, inc);
> >> + }
> >> +
> >> + /* If someone beat us to it, go spin on their queue. */
> >> + if (ACCESS_ONCE(asp->tickets.head)& 0x1)
> >> + return tkt_q_do_spin(asp, inc);
> >> + } while ((i = tkt_q_next_slot(i)) != start);
> >> +
> >> + /* All the queues are in use, revert to spinning on the ticket lock. */
> >> + return false;
> >> +}
> >> +
> >>
> >> An unconditional cmpxchg() can be a source of high contention by
> >> itself. Considering that 16 threads may be doing cmpxchg() more or
> >> less simultaneously on the same cache line, it can cause a lot of
> >> contention. It would be better to check whether tkt_q_heads[i]
> >> is NULL before doing the cmpxchg().
> >>
> >> Another point is that the 16 threads may be setting up the queues in
> >> consecutive slots in the head table. This is both a source of
> >> contention and a waste of effort. One possible solution is to add
> >> one more field (set to cpuid + 1, for example) to indicate that
> >> setup is in progress, with asp set to the target lock address
> >> immediately. We would need to use cmpxchg128() for 64-bit machines,
> >> though. Another solution is to have only the thread whose ticket
> >> number is a fixed distance from head (e.g., 16*2) do the queue
> >> setup, while the rest wait until the setup is done before
> >> spinning on the queue.
> >>
> >> As my colleague Davidlohr reported, there are more regressions
> >> than performance improvements in the AIM7 benchmark. I believe that
> >> queue-setup contention is likely a source of the performance regression.
> >
> > Please see below for a v3 patch that:
> >
> > 1. Fixes cpu_relax().
> >
> > 2. Tests before doing cmpxchg().
> >
> > 3. Reduces the number of CPUs attempting to set up the queue,
> > in the common case, to a single CPU. (Multiple CPUs can
> > still be trying to set up the queue given unfortunate
> > sequences of concurrent ticket-lock handoffs.)
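
For concreteness, item 2 above shows up in v3 as a cheap load in
tkt_q_start_contend() that filters out slots already in use before
attempting the cache-line-bouncing cmpxchg(). A sketch of the resulting
code, as it appears in the v3 patch below:

	/* Use 0x1 to mark the queue in use while it is being initialized. */
	if (!tkt_q_heads[i].ref &&
	    cmpxchg(&tkt_q_heads[i].ref,
		    NULL,
		    (arch_spinlock_t *)0x1) == NULL) {

		/* Succeeded, now go initialize it. */
		return tkt_q_init_contend(i, lock, inc);
	}

This addresses Waiman's point that an unconditional cmpxchg() on a shared
cache line is itself a source of contention.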
> >
> > Please let me know how it goes!
> >
> > Thanx, Paul
> >
> > ------------------------------------------------------------------------
> >
> > ticketlock: Add queued-ticketlock capability
> >
> > Breaking up locks is better than implementing high-contention locks, but
> > if we must have high-contention locks, why not make them automatically
> > switch between light-weight ticket locks at low contention and queued
> > locks at high contention? After all, this would remove the need for
> > the developer to predict which locks will be highly contended.
> >
> > This commit allows ticket locks to automatically switch between pure
> > ticketlock and queued-lock operation as needed. If too many CPUs are
> > spinning on a given ticket lock, a queue structure will be allocated
> > and the lock will switch to queued-lock operation. When the lock becomes
> > free, it will switch back into ticketlock operation. The low-order bit
> > of the head counter is used to indicate that the lock is in queued mode,
> > which forces an unconditional mismatch between the head and tail counters.
> > This approach means that the common-case code path under conditions of
> > low contention is very nearly that of a plain ticket lock.
> >
> > A fixed number of queueing structures is statically allocated in an
> > array. The ticket-lock address is used to hash into an initial element,
> > but if that element is already in use, it moves to the next element. If
> > the entire array is already in use, continue to spin in ticket mode.
> >
> > Signed-off-by: Paul E. McKenney <[email protected]>
> > [ paulmck: Eliminate duplicate code and update comments (Steven Rostedt). ]
> > [ paulmck: Address Eric Dumazet review feedback. ]
> > [ paulmck: Use Lai Jiangshan idea to eliminate smp_mb(). ]
> > [ paulmck: Expand ->head_tkt from s32 to s64 (Waiman Long). ]
> > [ paulmck: Move cpu_relax() to main spin loop (Steven Rostedt). ]
> > [ paulmck: Reduce queue-switch contention (Waiman Long). ]
> >
> > diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h
> > index 33692ea..509c51a 100644
> > --- a/arch/x86/include/asm/spinlock.h
> > +++ b/arch/x86/include/asm/spinlock.h
> > @@ -34,6 +34,21 @@
> > # define UNLOCK_LOCK_PREFIX
> > #endif
> >
> > +#ifdef CONFIG_TICKET_LOCK_QUEUED
> > +
> > +#define __TKT_SPIN_INC 2
> > +bool tkt_spin_pass(arch_spinlock_t *ap, struct __raw_tickets inc);
> > +
> > +#else /* #ifdef CONFIG_TICKET_LOCK_QUEUED */
> > +
> > +#define __TKT_SPIN_INC 1
> > +static inline bool tkt_spin_pass(arch_spinlock_t *ap, struct __raw_tickets inc)
> > +{
> > + return false;
> > +}
> > +
> > +#endif /* #else #ifdef CONFIG_TICKET_LOCK_QUEUED */
> > +
> > /*
> > * Ticket locks are conceptually two parts, one indicating the current head of
> > * the queue, and the other indicating the current tail. The lock is acquired
> > @@ -49,17 +64,16 @@
> > */
> > static __always_inline void __ticket_spin_lock(arch_spinlock_t *lock)
> > {
> > - register struct __raw_tickets inc = { .tail = 1 };
> > + register struct __raw_tickets inc = { .tail = __TKT_SPIN_INC };
> >
> > inc = xadd(&lock->tickets, inc);
> > -
> > for (;;) {
> > - if (inc.head == inc.tail)
> > + if (inc.head == inc.tail || tkt_spin_pass(lock, inc))
> > break;
> > cpu_relax();
> > inc.head = ACCESS_ONCE(lock->tickets.head);
> > }
> > - barrier(); /* make sure nothing creeps before the lock is taken */
> > + barrier(); /* Make sure nothing creeps in before the lock is taken. */
> > }
> >
> > static __always_inline int __ticket_spin_trylock(arch_spinlock_t *lock)
> > @@ -70,17 +84,37 @@ static __always_inline int __ticket_spin_trylock(arch_spinlock_t *lock)
> > if (old.tickets.head != old.tickets.tail)
> > return 0;
> >
> > +#ifndef CONFIG_TICKET_LOCK_QUEUED
> > new.head_tail = old.head_tail + (1 << TICKET_SHIFT);
> > +#else /* #ifndef CONFIG_TICKET_LOCK_QUEUED */
> > + new.head_tail = old.head_tail + (2 << TICKET_SHIFT);
> > +#endif /* #else #ifndef CONFIG_TICKET_LOCK_QUEUED */
> >
> > /* cmpxchg is a full barrier, so nothing can move before it */
> > return cmpxchg(&lock->head_tail, old.head_tail, new.head_tail) == old.head_tail;
> > }
> >
> > +#ifndef CONFIG_TICKET_LOCK_QUEUED
> > +
> > static __always_inline void __ticket_spin_unlock(arch_spinlock_t *lock)
> > {
> > __add(&lock->tickets.head, 1, UNLOCK_LOCK_PREFIX);
> > }
> >
> > +#else /* #ifndef CONFIG_TICKET_LOCK_QUEUED */
> > +
> > +extern void tkt_q_do_wake(arch_spinlock_t *lock);
> > +
> > +static __always_inline void __ticket_spin_unlock(arch_spinlock_t *lock)
> > +{
> > + __ticket_t head = 2;
> > +
> > + head = xadd(&lock->tickets.head, head);
> > + if (head & 0x1)
> > + tkt_q_do_wake(lock);
> > +}
> > +#endif /* #else #ifndef CONFIG_TICKET_LOCK_QUEUED */
> > +
> > static inline int __ticket_spin_is_locked(arch_spinlock_t *lock)
> > {
> > struct __raw_tickets tmp = ACCESS_ONCE(lock->tickets);
> > diff --git a/arch/x86/include/asm/spinlock_types.h b/arch/x86/include/asm/spinlock_types.h
> > index ad0ad07..cdaefdd 100644
> > --- a/arch/x86/include/asm/spinlock_types.h
> > +++ b/arch/x86/include/asm/spinlock_types.h
> > @@ -7,12 +7,18 @@
> >
> > #include <linux/types.h>
> >
> > -#if (CONFIG_NR_CPUS < 256)
> > +#if (CONFIG_NR_CPUS < 128)
> > typedef u8 __ticket_t;
> > typedef u16 __ticketpair_t;
> > -#else
> > +#define TICKET_T_CMP_GE(a, b) (UCHAR_MAX / 2 >= (unsigned char)((a) - (b)))
> > +#elif (CONFIG_NR_CPUS < 32768)
> > typedef u16 __ticket_t;
> > typedef u32 __ticketpair_t;
> > +#define TICKET_T_CMP_GE(a, b) (USHRT_MAX / 2 >= (unsigned short)((a) - (b)))
> > +#else
> > +typedef u32 __ticket_t;
> > +typedef u64 __ticketpair_t;
> > +#define TICKET_T_CMP_GE(a, b) (UINT_MAX / 2 >= (unsigned int)((a) - (b)))
> > #endif
> >
> > #define TICKET_SHIFT (sizeof(__ticket_t) * 8)
> > @@ -21,7 +27,11 @@ typedef struct arch_spinlock {
> > union {
> > __ticketpair_t head_tail;
> > struct __raw_tickets {
> > +#ifdef __BIG_ENDIAN__
> > + __ticket_t tail, head;
> > +#else /* #ifdef __BIG_ENDIAN__ */
> > __ticket_t head, tail;
> > +#endif /* #else #ifdef __BIG_ENDIAN__ */
> > } tickets;
> > };
> > } arch_spinlock_t;
> > diff --git a/include/linux/kernel.h b/include/linux/kernel.h
> > index e9ef6d6..816a87c 100644
> > --- a/include/linux/kernel.h
> > +++ b/include/linux/kernel.h
> > @@ -15,6 +15,7 @@
> > #include <asm/byteorder.h>
> > #include <uapi/linux/kernel.h>
> >
> > +#define UCHAR_MAX ((u8)(~0U))
> > #define USHRT_MAX ((u16)(~0U))
> > #define SHRT_MAX ((s16)(USHRT_MAX>>1))
> > #define SHRT_MIN ((s16)(-SHRT_MAX - 1))
> > diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
> > index 44511d1..900c0f0 100644
> > --- a/kernel/Kconfig.locks
> > +++ b/kernel/Kconfig.locks
> > @@ -223,3 +223,38 @@ endif
> > config MUTEX_SPIN_ON_OWNER
> > def_bool y
> > depends on SMP && !DEBUG_MUTEXES
> > +
> > +config TICKET_LOCK_QUEUED
> > + bool "Dynamically switch between ticket and queued locking"
> > + depends on SMP
> > + default n
> > + ---help---
> > + Enable dynamic switching between ticketlock and queued locking
> > + on a per-lock basis. This option will slow down low-contention
> > + acquisition and release very slightly (additional conditional
> > + in release path), but will provide more efficient operation at
> > + high levels of lock contention. High-contention operation will
> > + not be quite as efficient as would be a pure queued lock, but
> > + this dynamic approach consumes less memory than queued locks
> > + and also runs faster at low levels of contention.
> > +
> > + Say "Y" if you are running on a large system with a workload
> > + that is likely to result in high levels of contention.
> > +
> > + Say "N" if you are unsure.
> > +
> > +config TICKET_LOCK_QUEUED_SWITCH
> > + int "When to switch from ticket to queued locking"
> > + depends on TICKET_LOCK_QUEUED
> > + default 8
> > + range 3 32
> > + ---help---
> > + Specify how many tasks should be spinning on the lock before
> > + switching to queued mode. Systems with low-latency memory/cache
> > + interconnects will prefer larger numbers, while extreme low-latency
> > + and real-time workloads will prefer a smaller number. Of course,
> > + extreme real-time workloads would be even happier if contention
> > + on the locks were reduced to the point that there was never any
> > + need for queued locking in the first place.
> > +
> > + Take the default if you are unsure.
> > diff --git a/kernel/Makefile b/kernel/Makefile
> > index 271fd31..70a91f7 100644
> > --- a/kernel/Makefile
> > +++ b/kernel/Makefile
> > @@ -51,6 +51,7 @@ endif
> > obj-$(CONFIG_SMP) += spinlock.o
> > obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
> > obj-$(CONFIG_PROVE_LOCKING) += spinlock.o
> > +obj-$(CONFIG_TICKET_LOCK_QUEUED) += tktqlock.o
> > obj-$(CONFIG_UID16) += uid16.o
> > obj-$(CONFIG_MODULES) += module.o
> > obj-$(CONFIG_MODULE_SIG) += module_signing.o modsign_pubkey.o modsign_certificate.o
> > diff --git a/kernel/tktqlock.c b/kernel/tktqlock.c
> > new file mode 100644
> > index 0000000..9f03af0
> > --- /dev/null
> > +++ b/kernel/tktqlock.c
> > @@ -0,0 +1,369 @@
> > +/*
> > + * Queued ticket spinlocks.
> > + *
> > + * This program is free software; you can redistribute it and/or modify
> > + * it under the terms of the GNU General Public License as published by
> > + * the Free Software Foundation; either version 2 of the License, or
> > + * (at your option) any later version.
> > + *
> > + * This program is distributed in the hope that it will be useful,
> > + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
> > + * GNU General Public License for more details.
> > + *
> > + * You should have received a copy of the GNU General Public License
> > + * along with this program; if not, write to the Free Software
> > + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
> > + *
> > + * Copyright IBM Corporation, 2013
> > + *
> > + * Authors: Paul E. McKenney <[email protected]>
> > + */
> > +#include <linux/types.h>
> > +#include <linux/kernel.h>
> > +#include <linux/spinlock.h>
> > +#include <linux/smp.h>
> > +#include <linux/percpu.h>
> > +
> > +struct tkt_q {
> > + int cpu;
> > + __ticket_t tail;
> > + struct tkt_q *next;
> > +};
> > +
> > +struct tkt_q_head {
> > + arch_spinlock_t *ref; /* Pointer to spinlock if in use. */
> > + s64 head_tkt; /* Head ticket when started queuing. */
> > + struct tkt_q *spin; /* Head of queue. */
> > + struct tkt_q **spin_tail; /* Tail of queue. */
> > +};
> > +
> > +/*
> > + * TKT_Q_SWITCH is twice the number of CPUs that must be spinning on a
> > + * given ticket lock to motivate switching to spinning on a queue.
> > + * The reason that it is twice the number is because the bottom bit of
> > + * the ticket is reserved for the bit that indicates that a queue is
> > + * associated with the lock.
> > + */
> > +#define TKT_Q_SWITCH (CONFIG_TICKET_LOCK_QUEUED_SWITCH * 2)
> > +
> > +/*
> > + * TKT_Q_NQUEUES is the number of queues to maintain. Large systems
> > + * might have multiple highly contended locks, so provide more queues for
> > + * systems with larger numbers of CPUs.
> > + */
> > +#define TKT_Q_NQUEUES (DIV_ROUND_UP(NR_CPUS + TKT_Q_SWITCH - 1, TKT_Q_SWITCH) * 2)
> > +
> > +/* The queues themselves. */
> > +struct tkt_q_head tkt_q_heads[TKT_Q_NQUEUES];
> > +
> > +/* Advance to the next queue slot, wrapping around to the beginning. */
> > +static int tkt_q_next_slot(int i)
> > +{
> > + return (++i < TKT_Q_NQUEUES) ? i : 0;
> > +}
> > +
> > +/* Very crude hash from lock address to queue slot number. */
> > +static unsigned long tkt_q_hash(arch_spinlock_t *lock)
> > +{
> > + return (((unsigned long)lock) >> 8) % TKT_Q_NQUEUES;
> > +}
> > +
> > +/*
> > + * Return a pointer to the queue header associated with the specified lock,
> > + * or return NULL if there is no queue for the lock or if the lock's queue
> > + * is in transition.
> > + */
> > +static struct tkt_q_head *tkt_q_find_head(arch_spinlock_t *lock)
> > +{
> > + int i;
> > + int start;
> > +
> > + start = i = tkt_q_hash(lock);
> > + do
> > + if (tkt_q_heads[i].ref == lock)
> > + return &tkt_q_heads[i];
> > + while ((i = tkt_q_next_slot(i)) != start);
> > + return NULL;
> > +}
> > +
> > +/*
> > + * Try to stop queuing, reverting back to normal ticket-lock operation.
> > + * We can only stop queuing when the queue is empty, which means that
> > + * we need to correctly handle races where someone shows up in the queue
> > + * just as we are trying to dispense with the queue. They win, we lose.
> > + */
> > +static bool tkt_q_try_unqueue(arch_spinlock_t *lock, struct tkt_q_head *tqhp)
> > +{
> > + arch_spinlock_t asold;
> > + arch_spinlock_t asnew;
> > +
> > + /* Pick up the ticket values. */
> > + asold = ACCESS_ONCE(*lock);
> > + if ((asold.tickets.head & ~0x1) == asold.tickets.tail) {
> > +
> > + /* Attempt to mark the lock as not having a queue. */
> > + asnew = asold;
> > + asnew.tickets.head &= ~0x1;
> > + if (cmpxchg(&lock->head_tail,
> > + asold.head_tail,
> > + asnew.head_tail) == asold.head_tail) {
> > +
> > + /* Succeeded, mark the queue as unused. */
> > + ACCESS_ONCE(tqhp->ref) = NULL;
> > + return true;
> > + }
> > + }
> > +
> > + /* Failed, tell the caller there is still a queue to pass off to. */
> > + return false;
> > +}
> > +
> > +/*
> > + * Hand the lock off to the first CPU on the queue.
> > + */
> > +void tkt_q_do_wake(arch_spinlock_t *lock)
> > +{
> > + struct tkt_q_head *tqhp;
> > + struct tkt_q *tqp;
> > +
> > + /* If the queue is still being set up, wait for it. */
> > + while ((tqhp = tkt_q_find_head(lock)) == NULL)
> > + cpu_relax();
> > +
> > + for (;;) {
> > +
> > + /* Find the first queue element. */
> > + tqp = ACCESS_ONCE(tqhp->spin);
> > + if (tqp != NULL)
> > + break; /* Element exists, hand off lock. */
> > + if (tkt_q_try_unqueue(lock, tqhp))
> > + return; /* No element, successfully removed queue. */
> > + cpu_relax();
> > + }
> > + if (ACCESS_ONCE(tqhp->head_tkt) != -1)
> > + ACCESS_ONCE(tqhp->head_tkt) = -1;
> > + smp_mb(); /* Order pointer fetch and assignment against handoff. */
>
> I am just trying to find out in which architectures and cases it would
> be wrong if I used barrier() instead.
>
> Is barrier() enough if I want to reduce the overhead for X86?

It would be for x86, but my intent is for this to be common code. Hence my
question to Linus about his thoughts on an smp_mb_tso() that provided
non-transitive ordering of everything but prior stores against later
loads. If people are OK with this, I would implement it and use it here.
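
For illustration only, a hypothetical smp_mb_tso() along those lines might
look like the sketch below. No such primitive exists today, and the real
thing would presumably be defined per architecture rather than with an
#ifdef in common code:

	/* Hypothetical sketch -- not an existing kernel primitive. */
	#ifdef CONFIG_X86
	/* x86 is TSO: only prior stores can be reordered with later loads. */
	#define smp_mb_tso()	barrier()
	#else
	/* Weakly ordered architectures: fall back to a full barrier. */
	#define smp_mb_tso()	smp_mb()
	#endif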

> > + ACCESS_ONCE(tqp->cpu) = -1;
> > +}
> > +EXPORT_SYMBOL(tkt_q_do_wake);
> > +
> > +/*
> > + * Given a lock that already has a queue associated with it, spin on
> > + * that queue. Return false if there was no queue (which means we do not
> > + * hold the lock) and true otherwise (meaning we -do- hold the lock).
> > + */
> > +bool tkt_q_do_spin(arch_spinlock_t *lock, struct __raw_tickets inc)
> > +{
> > + struct tkt_q **oldtail;
> > + struct tkt_q tq;
> > + struct tkt_q_head *tqhp;
> > +
> > + /*
> > + * Ensure that accesses to queue header happen after sensing
> > + * the lock's have-queue bit.
> > + */
> > + smp_mb(); /* See above block comment. */
>
> Is barrier() enough if I want to reduce the overhead for X86?

Yep, but same desire for common code and thoughts of smp_mb_tso().

> > +
> > + /* If there no longer is a queue, leave. */
> > + tqhp = tkt_q_find_head(lock);
> > + if (tqhp == NULL)
> > + return false;
> > +
> > + /* Initialize our queue element. */
> > + tq.cpu = raw_smp_processor_id();
> > + tq.tail = inc.tail;
> > + tq.next = NULL;
> > +
> > + /* Check to see if we already hold the lock. */
> > + if (ACCESS_ONCE(tqhp->head_tkt) == inc.tail) {
> > + /* The last holder left before queue formed, we hold lock. */
> > + tqhp->head_tkt = -1;
> > + return true;
> > + }
> > +
> > + /*
> > + * Add our element to the tail of the queue. Note that if the
> > + * queue is empty, the ->spin_tail pointer will reference
> > + * the queue's head pointer, namely ->spin.
> > + */
> > + oldtail = xchg(&tqhp->spin_tail, &tq.next);
> > + ACCESS_ONCE(*oldtail) = &tq;
> > +
> > + /* Spin until handoff. */
> > + while (ACCESS_ONCE(tq.cpu) != -1)
> > + cpu_relax();
> > +
> > + /*
> > + * Remove our element from the queue. If the queue is now empty,
> > + * update carefully so that the next acquisition will enqueue itself
> > + * at the head of the list. Of course, the next enqueue operation
> > + * might be happening concurrently, and this code needs to handle all
> > + * of the possible combinations, keeping in mind that the enqueue
> > + * operation happens in two stages: (1) update the tail pointer and
> > + * (2) update the predecessor's ->next pointer. With this in mind,
> > + * the following code needs to deal with three scenarios:
> > + *
> > + * 1. tq is the last entry. In this case, we use cmpxchg to
> > + * point the list tail back to the list head (->spin). If
> > + * the cmpxchg fails, that indicates that we are instead
> > + * in scenario 2 below. If the cmpxchg succeeds, the next
> > + * enqueue operation's tail-pointer exchange will enqueue
> > + * the next element at the queue head, because the ->spin_tail
> > + * pointer now references the queue head.
> > + *
> > + * 2. tq is the last entry, and the next entry has updated the
> > + * tail pointer but has not yet updated tq.next. In this
> > + * case, tq.next is NULL, the cmpxchg will fail, and the
> > + * code will wait for the enqueue to complete before completing
> > + * removal of tq from the list.
> > + *
> > + * 3. tq is not the last pointer. In this case, tq.next is non-NULL,
> > + * so the following code simply removes tq from the list.
> > + */
> > + if (tq.next == NULL) {
> > +
> > + /* Mark the queue empty. */
> > + tqhp->spin = NULL;
> > +
> > + /* Try to point the tail back at the head. */
> > + if (cmpxchg(&tqhp->spin_tail,
> > + &tq.next,
> > + &tqhp->spin) == &tq.next)
> > + return true; /* Succeeded, queue is now empty. */
> > +
> > + /* Failed, if needed, wait for the enqueue to complete. */
> > + while (tq.next == NULL)
> > + cpu_relax();
> > +
> > + /* The following code will repair the head. */
> > + }
> > + smp_mb(); /* Force ordering between handoff and critical section. */
>
> Is barrier() enough if I want to reduce the overhead for X86?

Yep, same desire for common code and smp_mb_tso().

> > +
> > + /*
> > + * Advance list-head pointer. This same task will be the next to
> > + * access this when releasing the lock, so no need for a memory
> > + * barrier after the following assignment.
> > + */
> > + ACCESS_ONCE(tqhp->spin) = tq.next;
> > + return true;
> > +}
> > +
> > +/*
> > + * Given a lock that does not have a queue, attempt to associate the
> > + * i-th queue with it, returning true if successful (meaning we hold
> > + * the lock) or false otherwise (meaning we do -not- hold the lock).
> > + * Note that the caller has already filled in ->ref with 0x1, so we
> > + * own the queue.
> > + */
> > +static bool
> > +tkt_q_init_contend(int i, arch_spinlock_t *lock, struct __raw_tickets inc)
> > +{
> > + arch_spinlock_t asold;
> > + arch_spinlock_t asnew;
> > + struct tkt_q_head *tqhp;
> > +
> > + /* Initialize the i-th queue header. */
> > + tqhp = &tkt_q_heads[i];
> > + tqhp->spin = NULL;
> > + tqhp->spin_tail = &tqhp->spin;
> > +
> > + /* Each pass through this loop attempts to mark the lock as queued. */
> > + do {
> > + asold.head_tail = ACCESS_ONCE(lock->head_tail);
> > + asnew = asold;
> > + if (asnew.tickets.head & 0x1) {
> > +
> > + /* Someone beat us to it, back out. */
> > + smp_mb();
>
> Is this intention not to corrupt the next owner of tqhp by the above
> modification of tqhp?

Yep!

> Is smp_wmb() enough?

Indeed it should be sufficient.
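
For reference, the back-out path in the v3 patch below therefore uses
smp_wmb(), along these lines:

	if (asnew.tickets.head & 0x1) {

		/* Someone beat us to it, back out. */
		smp_wmb(); /* Ensure init before NULLing. */
		ACCESS_ONCE(tqhp->ref) = NULL;

		/* Spin on the queue element they set up. */
		return tkt_q_do_spin(lock, inc);
	}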

> > + ACCESS_ONCE(tqhp->ref) = NULL;
> > +
> > + /* Spin on the queue element they set up. */
> > + return tkt_q_do_spin(lock, inc);
> > + }
> > +
> > + /*
> > + * Record the head counter in case one of the spinning
> > + * CPUs already holds the lock but doesn't realize it yet.
> > + */
> > + tqhp->head_tkt = asold.tickets.head;
> > +
> > + /* The low-order bit in the head counter says "queued". */
> > + asnew.tickets.head |= 0x1;
> > + } while (cmpxchg(&lock->head_tail,
> > + asold.head_tail,
> > + asnew.head_tail) != asold.head_tail);
> > +
> > + /* Point the queue at the lock and go spin on it. */
> > + ACCESS_ONCE(tqhp->ref) = lock;
> > + return tkt_q_do_spin(lock, inc);
> > +}
> > +
> > +/*
> > + * Start handling a period of high contention by finding a queue to associate
> > + * with this lock. Returns true if successful (in which case we hold the
> > + * lock) and false otherwise (in which case we do -not- hold the lock).
> > + */
> > +bool tkt_q_start_contend(arch_spinlock_t *lock, struct __raw_tickets inc)
> > +{
> > + int i;
> > + int start;
> > +
> > + /* Hash the lock address to find a starting point. */
> > + start = i = tkt_q_hash(lock);
> > +
> > + /*
> > + * Each pass through the following loop attempts to associate
> > + * the lock with the corresponding queue.
> > + */
> > + do {
> > + /*
> > + * Use 0x1 to mark the queue in use, but also avoiding
> > + * any spinners trying to use it before we get it all
> > + * initialized.
> > + */
> > + if (tkt_q_heads[i].ref)
> > + continue;
> > + if (cmpxchg(&tkt_q_heads[i].ref,
> > + NULL,
> > + (arch_spinlock_t *)0x1) == NULL) {
> > +
> > + /* Succeeded, now go initialize it. */
> > + return tkt_q_init_contend(i, lock, inc);
> > + }
> > +
> > + /* If someone beat us to it, go spin on their queue. */
> > + if (ACCESS_ONCE(lock->tickets.head) & 0x1)
> > + return tkt_q_do_spin(lock, inc);
> > + } while ((i = tkt_q_next_slot(i)) != start);
> > +
> > + /* All the queues are in use, revert to spinning on the ticket lock. */
> > + return false;
> > +}
> > +
> > +bool tkt_spin_pass(arch_spinlock_t *ap, struct __raw_tickets inc)
> > +{
> > + if (unlikely(inc.head & 0x1)) {
> > +
> > + /* This lock has a queue, so go spin on the queue. */
> > + if (tkt_q_do_spin(ap, inc))
> > + return true;
> > +
> > + /* Get here if the queue is in transition: Retry next time. */
> > +
> > + } else if (inc.tail - TKT_Q_SWITCH == inc.head) {
> > +
> > + /*
> > + * This lock has lots of spinners, but no queue.
> > + * Go create a queue to spin on.
> > + */
> > + if (tkt_q_start_contend(ap, inc))
> > + return true;
> > +
> > + /* Get here if the queue is in transition: Retry next time. */
> > + }
> > +
> > + /* Either no need for a queue or the queue is in transition. Spin. */
> > + return false;
> > +}
> > +EXPORT_SYMBOL(tkt_spin_pass);
> >
>

2013-06-12 15:40:22

by Paul E. McKenney

[permalink] [raw]
Subject: [PATCH RFC ticketlock] v3 Auto-queued ticketlock

Breaking up locks is better than implementing high-contention locks, but
if we must have high-contention locks, why not make them automatically
switch between light-weight ticket locks at low contention and queued
locks at high contention? After all, this would remove the need for
the developer to predict which locks will be highly contended.

This commit allows ticket locks to automatically switch between pure
ticketlock and queued-lock operation as needed. If too many CPUs are
spinning on a given ticket lock, a queue structure will be allocated
and the lock will switch to queued-lock operation. When the lock becomes
free, it will switch back into ticketlock operation. The low-order bit
of the head counter is used to indicate that the lock is in queued mode,
which forces an unconditional mismatch between the head and tail counters.
This approach means that the common-case code path under conditions of
low contention is very nearly that of a plain ticket lock.

A fixed number of queueing structures is statically allocated in an
array. The ticket-lock address is used to hash into an initial element,
but if that element is already in use, it moves to the next element. If
the entire array is already in use, continue to spin in ticket mode.

Signed-off-by: Paul E. McKenney <[email protected]>
[ paulmck: Eliminate duplicate code and update comments (Steven Rostedt). ]
[ paulmck: Address Eric Dumazet review feedback. ]
[ paulmck: Use Lai Jiangshan idea to eliminate smp_mb(). ]
[ paulmck: Expand ->head_tkt from s32 to s64 (Waiman Long). ]
[ paulmck: Move cpu_relax() to main spin loop (Steven Rostedt). ]
[ paulmck: Reduce queue-switch contention (Waiman Long). ]
[ paulmck: __TKT_SPIN_INC for __ticket_spin_trylock() (Steffen Persvold). ]
[ paulmck: Type safety fixes (Steven Rostedt). ]
[ paulmck: Pre-check cmpxchg() value (Waiman Long). ]
[ paulmck: smp_mb() downgrade to smp_wmb() (Lai Jiangshan). ]

diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h
index 33692ea..5aa0177 100644
--- a/arch/x86/include/asm/spinlock.h
+++ b/arch/x86/include/asm/spinlock.h
@@ -34,6 +34,21 @@
# define UNLOCK_LOCK_PREFIX
#endif

+#ifdef CONFIG_TICKET_LOCK_QUEUED
+
+#define __TKT_SPIN_INC 2
+bool tkt_spin_pass(arch_spinlock_t *ap, struct __raw_tickets inc);
+
+#else /* #ifdef CONFIG_TICKET_LOCK_QUEUED */
+
+#define __TKT_SPIN_INC 1
+static inline bool tkt_spin_pass(arch_spinlock_t *ap, struct __raw_tickets inc)
+{
+ return false;
+}
+
+#endif /* #else #ifdef CONFIG_TICKET_LOCK_QUEUED */
+
/*
* Ticket locks are conceptually two parts, one indicating the current head of
* the queue, and the other indicating the current tail. The lock is acquired
@@ -49,17 +64,16 @@
*/
static __always_inline void __ticket_spin_lock(arch_spinlock_t *lock)
{
- register struct __raw_tickets inc = { .tail = 1 };
+ register struct __raw_tickets inc = { .tail = __TKT_SPIN_INC };

inc = xadd(&lock->tickets, inc);
-
for (;;) {
- if (inc.head == inc.tail)
+ if (inc.head == inc.tail || tkt_spin_pass(lock, inc))
break;
cpu_relax();
inc.head = ACCESS_ONCE(lock->tickets.head);
}
- barrier(); /* make sure nothing creeps before the lock is taken */
+ barrier(); /* Make sure nothing creeps in before the lock is taken. */
}

static __always_inline int __ticket_spin_trylock(arch_spinlock_t *lock)
@@ -70,17 +84,33 @@ static __always_inline int __ticket_spin_trylock(arch_spinlock_t *lock)
if (old.tickets.head != old.tickets.tail)
return 0;

- new.head_tail = old.head_tail + (1 << TICKET_SHIFT);
+ new.head_tail = old.head_tail + (__TKT_SPIN_INC << TICKET_SHIFT);

/* cmpxchg is a full barrier, so nothing can move before it */
return cmpxchg(&lock->head_tail, old.head_tail, new.head_tail) == old.head_tail;
}

+#ifndef CONFIG_TICKET_LOCK_QUEUED
+
static __always_inline void __ticket_spin_unlock(arch_spinlock_t *lock)
{
__add(&lock->tickets.head, 1, UNLOCK_LOCK_PREFIX);
}

+#else /* #ifndef CONFIG_TICKET_LOCK_QUEUED */
+
+extern void tkt_q_do_wake(arch_spinlock_t *lock);
+
+static __always_inline void __ticket_spin_unlock(arch_spinlock_t *lock)
+{
+ __ticket_t head = 2;
+
+ head = xadd(&lock->tickets.head, head);
+ if (head & 0x1)
+ tkt_q_do_wake(lock);
+}
+#endif /* #else #ifndef CONFIG_TICKET_LOCK_QUEUED */
+
static inline int __ticket_spin_is_locked(arch_spinlock_t *lock)
{
struct __raw_tickets tmp = ACCESS_ONCE(lock->tickets);
diff --git a/arch/x86/include/asm/spinlock_types.h b/arch/x86/include/asm/spinlock_types.h
index ad0ad07..cdaefdd 100644
--- a/arch/x86/include/asm/spinlock_types.h
+++ b/arch/x86/include/asm/spinlock_types.h
@@ -7,12 +7,18 @@

#include <linux/types.h>

-#if (CONFIG_NR_CPUS < 256)
+#if (CONFIG_NR_CPUS < 128)
typedef u8 __ticket_t;
typedef u16 __ticketpair_t;
-#else
+#define TICKET_T_CMP_GE(a, b) (UCHAR_MAX / 2 >= (unsigned char)((a) - (b)))
+#elif (CONFIG_NR_CPUS < 32768)
typedef u16 __ticket_t;
typedef u32 __ticketpair_t;
+#define TICKET_T_CMP_GE(a, b) (USHRT_MAX / 2 >= (unsigned short)((a) - (b)))
+#else
+typedef u32 __ticket_t;
+typedef u64 __ticketpair_t;
+#define TICKET_T_CMP_GE(a, b) (UINT_MAX / 2 >= (unsigned int)((a) - (b)))
#endif

#define TICKET_SHIFT (sizeof(__ticket_t) * 8)
@@ -21,7 +27,11 @@ typedef struct arch_spinlock {
union {
__ticketpair_t head_tail;
struct __raw_tickets {
+#ifdef __BIG_ENDIAN__
+ __ticket_t tail, head;
+#else /* #ifdef __BIG_ENDIAN__ */
__ticket_t head, tail;
+#endif /* #else #ifdef __BIG_ENDIAN__ */
} tickets;
};
} arch_spinlock_t;
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index e9ef6d6..816a87c 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -15,6 +15,7 @@
#include <asm/byteorder.h>
#include <uapi/linux/kernel.h>

+#define UCHAR_MAX ((u8)(~0U))
#define USHRT_MAX ((u16)(~0U))
#define SHRT_MAX ((s16)(USHRT_MAX>>1))
#define SHRT_MIN ((s16)(-SHRT_MAX - 1))
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
index 44511d1..900c0f0 100644
--- a/kernel/Kconfig.locks
+++ b/kernel/Kconfig.locks
@@ -223,3 +223,38 @@ endif
config MUTEX_SPIN_ON_OWNER
def_bool y
depends on SMP && !DEBUG_MUTEXES
+
+config TICKET_LOCK_QUEUED
+ bool "Dynamically switch between ticket and queued locking"
+ depends on SMP
+ default n
+ ---help---
+ Enable dynamic switching between ticketlock and queued locking
+ on a per-lock basis. This option will slow down low-contention
+ acquisition and release very slightly (additional conditional
+ in release path), but will provide more efficient operation at
+ high levels of lock contention. High-contention operation will
+ not be quite as efficient as would be a pure queued lock, but
+ this dynamic approach consumes less memory than queued locks
+ and also runs faster at low levels of contention.
+
+ Say "Y" if you are running on a large system with a workload
+ that is likely to result in high levels of contention.
+
+ Say "N" if you are unsure.
+
+config TICKET_LOCK_QUEUED_SWITCH
+ int "When to switch from ticket to queued locking"
+ depends on TICKET_LOCK_QUEUED
+ default 8
+ range 3 32
+ ---help---
+ Specify how many tasks should be spinning on the lock before
+ switching to queued mode. Systems with low-latency memory/cache
+ interconnects will prefer larger numbers, while extreme low-latency
+ and real-time workloads will prefer a smaller number. Of course,
+ extreme real-time workloads would be even happier if contention
+ on the locks were reduced to the point that there was never any
+ need for queued locking in the first place.
+
+ Take the default if you are unsure.
diff --git a/kernel/Makefile b/kernel/Makefile
index 271fd31..70a91f7 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -51,6 +51,7 @@ endif
obj-$(CONFIG_SMP) += spinlock.o
obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
obj-$(CONFIG_PROVE_LOCKING) += spinlock.o
+obj-$(CONFIG_TICKET_LOCK_QUEUED) += tktqlock.o
obj-$(CONFIG_UID16) += uid16.o
obj-$(CONFIG_MODULES) += module.o
obj-$(CONFIG_MODULE_SIG) += module_signing.o modsign_pubkey.o modsign_certificate.o
diff --git a/kernel/tktqlock.c b/kernel/tktqlock.c
new file mode 100644
index 0000000..912817c
--- /dev/null
+++ b/kernel/tktqlock.c
@@ -0,0 +1,383 @@
+/*
+ * Queued ticket spinlocks.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright IBM Corporation, 2013
+ *
+ * Authors: Paul E. McKenney <[email protected]>
+ */
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/spinlock.h>
+#include <linux/smp.h>
+#include <linux/percpu.h>
+
+struct tkt_q {
+ int cpu;
+ __ticket_t tail;
+ struct tkt_q *next;
+};
+
+struct tkt_q_head {
+ arch_spinlock_t *ref; /* Pointer to spinlock if in use. */
+ s64 head_tkt; /* Head ticket when started queuing. */
+ struct tkt_q *spin; /* Head of queue. */
+ struct tkt_q **spin_tail; /* Tail of queue. */
+};
+
+/*
+ * TKT_Q_SWITCH is twice the number of CPUs that must be spinning on a
+ * given ticket lock to motivate switching to spinning on a queue.
+ * The reason that it is twice the number is because the bottom bit of
+ * the ticket is reserved for the bit that indicates that a queue is
+ * associated with the lock.
+ */
+#define TKT_Q_SWITCH ((__ticket_t)(CONFIG_TICKET_LOCK_QUEUED_SWITCH * 2))
+
+/*
+ * TKT_Q_NQUEUES is the number of queues to maintain. Large systems
+ * might have multiple highly contended locks, so provide more queues for
+ * systems with larger numbers of CPUs.
+ */
+#define TKT_Q_NQUEUES (2 * DIV_ROUND_UP(NR_CPUS + ((int)TKT_Q_SWITCH) - 1, \
+ (int)TKT_Q_SWITCH))
+
+/* The queues themselves. */
+struct tkt_q_head tkt_q_heads[TKT_Q_NQUEUES];
+
+/* Advance to the next queue slot, wrapping around to the beginning. */
+static int tkt_q_next_slot(int i)
+{
+ return (++i < TKT_Q_NQUEUES) ? i : 0;
+}
+
+/* Very crude hash from lock address to queue slot number. */
+static unsigned long tkt_q_hash(arch_spinlock_t *lock)
+{
+ return (((unsigned long)lock) >> 8) % TKT_Q_NQUEUES;
+}
+
+/*
+ * Return a pointer to the queue header associated with the specified lock,
+ * or return NULL if there is no queue for the lock or if the lock's queue
+ * is in transition.
+ */
+static struct tkt_q_head *tkt_q_find_head(arch_spinlock_t *lock)
+{
+ int i;
+ int start;
+
+ start = i = tkt_q_hash(lock);
+ do
+ if (ACCESS_ONCE(tkt_q_heads[i].ref) == lock)
+ return &tkt_q_heads[i];
+ while ((i = tkt_q_next_slot(i)) != start);
+ return NULL;
+}
+
+/*
+ * Try to stop queuing, reverting back to normal ticket-lock operation.
+ * We can only stop queuing when the queue is empty, which means that
+ * we need to correctly handle races where someone shows up in the queue
+ * just as we are trying to dispense with the queue. They win, we lose.
+ */
+static bool tkt_q_try_unqueue(arch_spinlock_t *lock, struct tkt_q_head *tqhp)
+{
+ arch_spinlock_t asold;
+ arch_spinlock_t asnew;
+
+ /* Pick up the ticket values. */
+ asold = ACCESS_ONCE(*lock);
+ if ((asold.tickets.head & ~0x1) == asold.tickets.tail) {
+
+ /* Attempt to mark the lock as not having a queue. */
+ asnew = asold;
+ asnew.tickets.head &= ~0x1;
+ if (cmpxchg(&lock->head_tail,
+ asold.head_tail,
+ asnew.head_tail) == asold.head_tail) {
+
+ /* Succeeded, mark the queue as unused. */
+ ACCESS_ONCE(tqhp->ref) = NULL;
+ return true;
+ }
+ }
+
+ /* Failed, tell the caller there is still a queue to pass off to. */
+ return false;
+}
+
+/*
+ * Hand the lock off to the first CPU on the queue.
+ */
+void tkt_q_do_wake(arch_spinlock_t *lock)
+{
+ struct tkt_q_head *tqhp;
+ struct tkt_q *tqp;
+
+ /*
+ * If the queue is still being set up, wait for it. Note that
+ * the caller's xadd() provides the needed memory ordering.
+ */
+ while ((tqhp = tkt_q_find_head(lock)) == NULL)
+ cpu_relax();
+
+ for (;;) {
+
+ /* Find the first queue element. */
+ tqp = ACCESS_ONCE(tqhp->spin);
+ if (tqp != NULL)
+ break; /* Element exists, hand off lock. */
+ if (tkt_q_try_unqueue(lock, tqhp))
+ return; /* No element, successfully removed queue. */
+ cpu_relax();
+ }
+ if (ACCESS_ONCE(tqhp->head_tkt) != -1)
+ ACCESS_ONCE(tqhp->head_tkt) = -1;
+ smp_mb(); /* Order pointer fetch and assignment against handoff. */
+ ACCESS_ONCE(tqp->cpu) = -1;
+}
+EXPORT_SYMBOL(tkt_q_do_wake);
+
+/*
+ * Given a lock that already has a queue associated with it, spin on
+ * that queue. Return false if there was no queue (which means we do not
+ * hold the lock) and true otherwise (meaning we -do- hold the lock).
+ */
+bool tkt_q_do_spin(arch_spinlock_t *lock, struct __raw_tickets inc)
+{
+ struct tkt_q **oldtail;
+ struct tkt_q tq;
+ struct tkt_q_head *tqhp;
+
+ /*
+ * Ensure that accesses to queue header happen after sensing
+ * the lock's have-queue bit.
+ */
+ smp_mb(); /* See above block comment. */
+
+ /* If there no longer is a queue, leave. */
+ tqhp = tkt_q_find_head(lock);
+ if (tqhp == NULL)
+ return false;
+
+ /* Initialize our queue element. */
+ tq.cpu = raw_smp_processor_id();
+ tq.tail = inc.tail;
+ tq.next = NULL;
+
+ /* Check to see if we already hold the lock. */
+ if (ACCESS_ONCE(tqhp->head_tkt) == inc.tail) {
+ /* The last holder left before queue formed, we hold lock. */
+ tqhp->head_tkt = -1;
+ return true;
+ }
+
+ /*
+ * Add our element to the tail of the queue. Note that if the
+ * queue is empty, the ->spin_tail pointer will reference
+ * the queue's head pointer, namely ->spin.
+ */
+ oldtail = xchg(&tqhp->spin_tail, &tq.next);
+ ACCESS_ONCE(*oldtail) = &tq;
+
+ /* Spin until handoff. */
+ while (ACCESS_ONCE(tq.cpu) != -1)
+ cpu_relax();
+
+ /*
+ * Remove our element from the queue. If the queue is now empty,
+ * update carefully so that the next acquisition will enqueue itself
+ * at the head of the list. Of course, the next enqueue operation
+ * might be happening concurrently, and this code needs to handle all
+ * of the possible combinations, keeping in mind that the enqueue
+ * operation happens in two stages: (1) update the tail pointer and
+ * (2) update the predecessor's ->next pointer. With this in mind,
+ * the following code needs to deal with three scenarios:
+ *
+ * 1. tq is the last entry. In this case, we use cmpxchg to
+ * point the list tail back to the list head (->spin). If
+ * the cmpxchg fails, that indicates that we are instead
+ * in scenario 2 below. If the cmpxchg succeeds, the next
+ * enqueue operation's tail-pointer exchange will enqueue
+ * the next element at the queue head, because the ->spin_tail
+ * pointer now references the queue head.
+ *
+ * 2. tq is the last entry, and the next entry has updated the
+ * tail pointer but has not yet updated tq.next. In this
+ * case, tq.next is NULL, the cmpxchg will fail, and the
+ * code will wait for the enqueue to complete before completing
+ * removal of tq from the list.
+ *
+ * 3. tq is not the last pointer. In this case, tq.next is non-NULL,
+ * so the following code simply removes tq from the list.
+ */
+ if (tq.next == NULL) {
+
+ /* Mark the queue empty. */
+ tqhp->spin = NULL;
+
+ /* Try to point the tail back at the head. */
+ if (cmpxchg(&tqhp->spin_tail,
+ &tq.next,
+ &tqhp->spin) == &tq.next)
+ return true; /* Succeeded, queue is now empty. */
+
+ /* Failed, if needed, wait for the enqueue to complete. */
+ while (tq.next == NULL)
+ cpu_relax();
+
+ /* The following code will repair the head. */
+ }
+ smp_mb(); /* Force ordering between handoff and critical section. */
+
+ /*
+ * Advance list-head pointer. This same task will be the next to
+ * access this when releasing the lock, so no need for a memory
+ * barrier after the following assignment.
+ */
+ ACCESS_ONCE(tqhp->spin) = tq.next;
+ return true;
+}
+
+/*
+ * Given a lock that does not have a queue, attempt to associate the
+ * i-th queue with it, returning true if successful (meaning we hold
+ * the lock) or false otherwise (meaning we do -not- hold the lock).
+ * Note that the caller has already filled in ->ref with 0x1, so we
+ * own the queue.
+ */
+static bool
+tkt_q_init_contend(int i, arch_spinlock_t *lock, struct __raw_tickets inc)
+{
+ arch_spinlock_t asold;
+ arch_spinlock_t asnew;
+ struct tkt_q_head *tqhp;
+
+ /* Initialize the i-th queue header. */
+ tqhp = &tkt_q_heads[i];
+ tqhp->spin = NULL;
+ tqhp->spin_tail = &tqhp->spin;
+
+ /* Each pass through this loop attempts to mark the lock as queued. */
+ do {
+ asold.head_tail = ACCESS_ONCE(lock->head_tail);
+ asnew = asold;
+ if (asnew.tickets.head & 0x1) {
+
+ /* Someone beat us to it, back out. */
+ smp_wmb(); /* Ensure init before NULLing. */
+ ACCESS_ONCE(tqhp->ref) = NULL;
+
+ /* Spin on the queue element they set up. */
+ return tkt_q_do_spin(lock, inc);
+ }
+
+ /*
+ * Record the head counter in case one of the spinning
+ * CPUs already holds the lock but doesn't realize it yet.
+ */
+ tqhp->head_tkt = asold.tickets.head;
+
+ /* The low-order bit in the head counter says "queued". */
+ asnew.tickets.head |= 0x1;
+ } while (cmpxchg(&lock->head_tail,
+ asold.head_tail,
+ asnew.head_tail) != asold.head_tail);
+
+ /* Point the queue at the lock and go spin on it. */
+ ACCESS_ONCE(tqhp->ref) = lock;
+ return tkt_q_do_spin(lock, inc);
+}
+
+/*
+ * Start handling a period of high contention by finding a queue to associate
+ * with this lock. Returns true if successful (in which case we hold the
+ * lock) and false otherwise (in which case we do -not- hold the lock).
+ */
+bool tkt_q_start_contend(arch_spinlock_t *lock, struct __raw_tickets inc)
+{
+ int i;
+ int start;
+
+ /* Hash the lock address to find a starting point. */
+ start = i = tkt_q_hash(lock);
+
+ /*
+ * Each pass through the following loop attempts to associate
+ * the lock with the corresponding queue.
+ */
+ do {
+ /*
+ * Use 0x1 to mark the queue in use, which also prevents any
+ * spinners from trying to use it before we get it fully
+ * initialized.
+ */
+ if (!tkt_q_heads[i].ref &&
+ cmpxchg(&tkt_q_heads[i].ref,
+ NULL,
+ (arch_spinlock_t *)0x1) == NULL) {
+
+ /* Succeeded, now go initialize it. */
+ return tkt_q_init_contend(i, lock, inc);
+ }
+
+ /* If someone beat us to it, go spin on their queue. */
+ if (ACCESS_ONCE(lock->tickets.head) & 0x1)
+ return tkt_q_do_spin(lock, inc);
+ } while ((i = tkt_q_next_slot(i)) != start);
+
+ /* All the queues are in use, revert to spinning on the ticket lock. */
+ return false;
+}
+
+bool tkt_spin_pass(arch_spinlock_t *ap, struct __raw_tickets inc)
+{
+ if (unlikely(inc.head & 0x1)) {
+
+ /* This lock has a queue, so go spin on the queue. */
+ if (tkt_q_do_spin(ap, inc))
+ return true;
+
+ /* Get here if the queue is in transition: Retry next time. */
+
+ } else if (inc.tail - TKT_Q_SWITCH == inc.head) {
+
+ /*
+ * This lock has lots of spinners, but no queue. Go create
+ * a queue to spin on.
+ *
+ * In the common case, only the single task that
+ * sees the head and tail tickets being different by
+ * exactly TKT_Q_SWITCH will come here to set up the queue,
+ * which prevents a "thundering herd" of queue setups.
+ * Although it is still possible for an unfortunate series
+ * of lock handoffs and newly arrived tasks to result
+ * in more than one task performing a queue setup, this
+ * is unlikely. Of course, this situation must still be
+ * handled correctly, which is the job of the cmpxchg()
+ * in tkt_q_start_contend().
+ */
+ if (tkt_q_start_contend(ap, inc))
+ return true;
+
+ /* Get here if the queue is in transition: Retry next time. */
+ }
+
+ /* Either no need for a queue or the queue is in transition. Spin. */
+ return false;
+}
+EXPORT_SYMBOL(tkt_spin_pass);
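
For anyone who wants to poke at the queueing trick outside the kernel, here
is a minimal user-space sketch of the xchg/cmpxchg queue that tkt_q_do_spin()
implements above. It is purely illustrative: the struct and function names
are invented, GCC __atomic builtins stand in for the kernel primitives, and
the single-threaded main() simply walks through an enqueue followed by the
three-scenario dequeue repair described in the comment.

#include <stdio.h>

struct node {
	int cpu;		/* -1 would mean "lock handed to us" */
	struct node *next;
};

struct queue {
	struct node *head;	/* plays the role of ->spin */
	struct node **tail;	/* plays the role of ->spin_tail */
};

static void queue_init(struct queue *q)
{
	q->head = NULL;
	q->tail = &q->head;
}

/* Enqueue: swap the tail pointer, then link the predecessor to us. */
static void enqueue(struct queue *q, struct node *n)
{
	struct node **oldtail;

	n->next = NULL;
	oldtail = __atomic_exchange_n(&q->tail, &n->next, __ATOMIC_ACQ_REL);
	__atomic_store_n(oldtail, n, __ATOMIC_RELEASE);
}

/*
 * Dequeue the head node once it has been handed the lock.  If it looks
 * like the last node, try to point the tail back at the head, mirroring
 * the three scenarios in tkt_q_do_spin().
 */
static void dequeue_head(struct queue *q, struct node *n)
{
	if (__atomic_load_n(&n->next, __ATOMIC_ACQUIRE) == NULL) {
		struct node **old = &n->next;

		q->head = NULL;
		if (__atomic_compare_exchange_n(&q->tail, &old, &q->head, 0,
						__ATOMIC_ACQ_REL,
						__ATOMIC_ACQUIRE))
			return;		/* scenario 1: queue now empty */
		/* Scenario 2: wait for the concurrent enqueue to finish. */
		while (__atomic_load_n(&n->next, __ATOMIC_ACQUIRE) == NULL)
			;
	}
	q->head = n->next;		/* scenario 3 (or repaired 2) */
}

int main(void)
{
	struct queue q;
	struct node a = { .cpu = 0 }, b = { .cpu = 1 };

	queue_init(&q);
	enqueue(&q, &a);
	enqueue(&q, &b);
	dequeue_head(&q, &a);		/* "a" got the lock and leaves */
	printf("new head cpu: %d\n", q.head->cpu);	/* prints 1 */
	dequeue_head(&q, &b);
	printf("queue empty: %s\n", q.head == NULL ? "yes" : "no");
	return 0;
}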

2013-06-12 16:13:50

by Lai Jiangshan

[permalink] [raw]
Subject: Re: [PATCH RFC ticketlock] v3 Auto-queued ticketlock

On Wed, Jun 12, 2013 at 11:40 PM, Paul E. McKenney
<[email protected]> wrote:
> Breaking up locks is better than implementing high-contention locks, but
> if we must have high-contention locks, why not make them automatically
> switch between light-weight ticket locks at low contention and queued
> locks at high contention? After all, this would remove the need for
> the developer to predict which locks will be highly contended.
>
> This commit allows ticket locks to automatically switch between pure
> ticketlock and queued-lock operation as needed. If too many CPUs are
> spinning on a given ticket lock, a queue structure will be allocated
> and the lock will switch to queued-lock operation. When the lock becomes
> free, it will switch back into ticketlock operation. The low-order bit
> of the head counter is used to indicate that the lock is in queued mode,
> which forces an unconditional mismatch between the head and tail counters.
> This approach means that the common-case code path under conditions of
> low contention is very nearly that of a plain ticket lock.
>
> A fixed number of queueing structures is statically allocated in an
> array. The ticket-lock address is used to hash into an initial element,
> but if that element is already in use, it moves to the next element. If
> the entire array is already in use, continue to spin in ticket mode.
>
> Signed-off-by: Paul E. McKenney <[email protected]>
> [ paulmck: Eliminate duplicate code and update comments (Steven Rostedt). ]
> [ paulmck: Address Eric Dumazet review feedback. ]
> [ paulmck: Use Lai Jiangshan idea to eliminate smp_mb(). ]
> [ paulmck: Expand ->head_tkt from s32 to s64 (Waiman Long). ]
> [ paulmck: Move cpu_relax() to main spin loop (Steven Rostedt). ]
> [ paulmck: Reduce queue-switch contention (Waiman Long). ]
> [ paulmck: __TKT_SPIN_INC for __ticket_spin_trylock() (Steffen Persvold). ]
> [ paulmck: Type safety fixes (Steven Rostedt). ]
> [ paulmck: Pre-check cmpxchg() value (Waiman Long). ]
> [ paulmck: smp_mb() downgrade to smp_wmb() (Lai Jiangshan). ]
>
> diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h
> index 33692ea..5aa0177 100644
> --- a/arch/x86/include/asm/spinlock.h
> +++ b/arch/x86/include/asm/spinlock.h
> @@ -34,6 +34,21 @@
> # define UNLOCK_LOCK_PREFIX
> #endif
>
> +#ifdef CONFIG_TICKET_LOCK_QUEUED
> +
> +#define __TKT_SPIN_INC 2
> +bool tkt_spin_pass(arch_spinlock_t *ap, struct __raw_tickets inc);
> +
> +#else /* #ifdef CONFIG_TICKET_LOCK_QUEUED */
> +
> +#define __TKT_SPIN_INC 1
> +static inline bool tkt_spin_pass(arch_spinlock_t *ap, struct __raw_tickets inc)
> +{
> + return false;
> +}
> +
> +#endif /* #else #ifdef CONFIG_TICKET_LOCK_QUEUED */
> +
> /*
> * Ticket locks are conceptually two parts, one indicating the current head of
> * the queue, and the other indicating the current tail. The lock is acquired
> @@ -49,17 +64,16 @@
> */
> static __always_inline void __ticket_spin_lock(arch_spinlock_t *lock)
> {
> - register struct __raw_tickets inc = { .tail = 1 };
> + register struct __raw_tickets inc = { .tail = __TKT_SPIN_INC };
>
> inc = xadd(&lock->tickets, inc);
> -
> for (;;) {
> - if (inc.head == inc.tail)
> + if (inc.head == inc.tail || tkt_spin_pass(lock, inc))
> break;
> cpu_relax();
> inc.head = ACCESS_ONCE(lock->tickets.head);
> }
> - barrier(); /* make sure nothing creeps before the lock is taken */
> + barrier(); /* Make sure nothing creeps in before the lock is taken. */
> }
>
> static __always_inline int __ticket_spin_trylock(arch_spinlock_t *lock)
> @@ -70,17 +84,33 @@ static __always_inline int __ticket_spin_trylock(arch_spinlock_t *lock)
> if (old.tickets.head != old.tickets.tail)
> return 0;
>
> - new.head_tail = old.head_tail + (1 << TICKET_SHIFT);
> + new.head_tail = old.head_tail + (__TKT_SPIN_INC << TICKET_SHIFT);
>
> /* cmpxchg is a full barrier, so nothing can move before it */
> return cmpxchg(&lock->head_tail, old.head_tail, new.head_tail) == old.head_tail;
> }
>
> +#ifndef CONFIG_TICKET_LOCK_QUEUED
> +
> static __always_inline void __ticket_spin_unlock(arch_spinlock_t *lock)
> {
> __add(&lock->tickets.head, 1, UNLOCK_LOCK_PREFIX);
> }
>
> +#else /* #ifndef CONFIG_TICKET_LOCK_QUEUED */
> +
> +extern void tkt_q_do_wake(arch_spinlock_t *lock);
> +
> +static __always_inline void __ticket_spin_unlock(arch_spinlock_t *lock)
> +{
> + __ticket_t head = 2;
> +
> + head = xadd(&lock->tickets.head, head);
> + if (head & 0x1)
> + tkt_q_do_wake(lock);
> +}
> +#endif /* #else #ifndef CONFIG_TICKET_LOCK_QUEUED */
> +
> static inline int __ticket_spin_is_locked(arch_spinlock_t *lock)
> {
> struct __raw_tickets tmp = ACCESS_ONCE(lock->tickets);
> diff --git a/arch/x86/include/asm/spinlock_types.h b/arch/x86/include/asm/spinlock_types.h
> index ad0ad07..cdaefdd 100644
> --- a/arch/x86/include/asm/spinlock_types.h
> +++ b/arch/x86/include/asm/spinlock_types.h
> @@ -7,12 +7,18 @@
>
> #include <linux/types.h>
>
> -#if (CONFIG_NR_CPUS < 256)
> +#if (CONFIG_NR_CPUS < 128)
> typedef u8 __ticket_t;
> typedef u16 __ticketpair_t;
> -#else
> +#define TICKET_T_CMP_GE(a, b) (UCHAR_MAX / 2 >= (unsigned char)((a) - (b)))
> +#elif (CONFIG_NR_CPUS < 32768)
> typedef u16 __ticket_t;
> typedef u32 __ticketpair_t;
> +#define TICKET_T_CMP_GE(a, b) (USHRT_MAX / 2 >= (unsigned short)((a) - (b)))
> +#else
> +typedef u32 __ticket_t;
> +typedef u64 __ticketpair_t;
> +#define TICKET_T_CMP_GE(a, b) (UINT_MAX / 2 >= (unsigned int)((a) - (b)))
> #endif
>
> #define TICKET_SHIFT (sizeof(__ticket_t) * 8)
> @@ -21,7 +27,11 @@ typedef struct arch_spinlock {
> union {
> __ticketpair_t head_tail;
> struct __raw_tickets {
> +#ifdef __BIG_ENDIAN__
> + __ticket_t tail, head;
> +#else /* #ifdef __BIG_ENDIAN__ */
> __ticket_t head, tail;
> +#endif /* #else #ifdef __BIG_ENDIAN__ */
> } tickets;
> };
> } arch_spinlock_t;
> diff --git a/include/linux/kernel.h b/include/linux/kernel.h
> index e9ef6d6..816a87c 100644
> --- a/include/linux/kernel.h
> +++ b/include/linux/kernel.h
> @@ -15,6 +15,7 @@
> #include <asm/byteorder.h>
> #include <uapi/linux/kernel.h>
>
> +#define UCHAR_MAX ((u8)(~0U))
> #define USHRT_MAX ((u16)(~0U))
> #define SHRT_MAX ((s16)(USHRT_MAX>>1))
> #define SHRT_MIN ((s16)(-SHRT_MAX - 1))
> diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
> index 44511d1..900c0f0 100644
> --- a/kernel/Kconfig.locks
> +++ b/kernel/Kconfig.locks
> @@ -223,3 +223,38 @@ endif
> config MUTEX_SPIN_ON_OWNER
> def_bool y
> depends on SMP && !DEBUG_MUTEXES
> +
> +config TICKET_LOCK_QUEUED
> + bool "Dynamically switch between ticket and queued locking"
> + depends on SMP
> + default n
> + ---help---
> + Enable dynamic switching between ticketlock and queued locking
> + on a per-lock basis. This option will slow down low-contention
> + acquisition and release very slightly (additional conditional
> + in release path), but will provide more efficient operation at
> + high levels of lock contention. High-contention operation will
> + not be quite as efficient as would be a pure queued lock, but
> + this dynamic approach consumes less memory than queued locks
> + and also runs faster at low levels of contention.
> +
> + Say "Y" if you are running on a large system with a workload
> + that is likely to result in high levels of contention.
> +
> + Say "N" if you are unsure.
> +
> +config TICKET_LOCK_QUEUED_SWITCH
> + int "When to switch from ticket to queued locking"
> + depends on TICKET_LOCK_QUEUED
> + default 8
> + range 3 32
> + ---help---
> + Specify how many tasks should be spinning on the lock before
> + switching to queued mode. Systems with low-latency memory/cache
> + interconnects will prefer larger numbers, while extreme low-latency
> + and real-time workloads will prefer a smaller number. Of course,
> + extreme real-time workloads would be even happier if contention
> + on the locks were reduced to the point that there was never any
> + need for queued locking in the first place.
> +
> + Take the default if you are unsure.
> diff --git a/kernel/Makefile b/kernel/Makefile
> index 271fd31..70a91f7 100644
> --- a/kernel/Makefile
> +++ b/kernel/Makefile
> @@ -51,6 +51,7 @@ endif
> obj-$(CONFIG_SMP) += spinlock.o
> obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
> obj-$(CONFIG_PROVE_LOCKING) += spinlock.o
> +obj-$(CONFIG_TICKET_LOCK_QUEUED) += tktqlock.o
> obj-$(CONFIG_UID16) += uid16.o
> obj-$(CONFIG_MODULES) += module.o
> obj-$(CONFIG_MODULE_SIG) += module_signing.o modsign_pubkey.o modsign_certificate.o
> diff --git a/kernel/tktqlock.c b/kernel/tktqlock.c
> new file mode 100644
> index 0000000..912817c
> --- /dev/null
> +++ b/kernel/tktqlock.c
> @@ -0,0 +1,383 @@
> +/*
> + * Queued ticket spinlocks.
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program; if not, write to the Free Software
> + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
> + *
> + * Copyright IBM Corporation, 2013
> + *
> + * Authors: Paul E. McKenney <[email protected]>
> + */
> +#include <linux/types.h>
> +#include <linux/kernel.h>
> +#include <linux/spinlock.h>
> +#include <linux/smp.h>
> +#include <linux/percpu.h>
> +
> +struct tkt_q {
> + int cpu;
> + __ticket_t tail;
> + struct tkt_q *next;
> +};
> +
> +struct tkt_q_head {
> + arch_spinlock_t *ref; /* Pointer to spinlock if in use. */
> + s64 head_tkt; /* Head ticket when started queuing. */
> + struct tkt_q *spin; /* Head of queue. */
> + struct tkt_q **spin_tail; /* Tail of queue. */
> +};
> +
> +/*
> + * TKT_Q_SWITCH is twice the number of CPUs that must be spinning on a
> + * given ticket lock to motivate switching to spinning on a queue.
> + * The reason that it is twice the number is because the bottom bit of
> + * the ticket is reserved for the bit that indicates that a queue is
> + * associated with the lock.
> + */
> +#define TKT_Q_SWITCH ((__ticket_t)(CONFIG_TICKET_LOCK_QUEUED_SWITCH * 2))
> +
> +/*
> + * TKT_Q_NQUEUES is the number of queues to maintain. Large systems
> + * might have multiple highly contended locks, so provide more queues for
> + * systems with larger numbers of CPUs.
> + */
> +#define TKT_Q_NQUEUES (2 * DIV_ROUND_UP(NR_CPUS + ((int)TKT_Q_SWITCH) - 1, \
> + (int)TKT_Q_SWITCH))
> +
> +/* The queues themselves. */
> +struct tkt_q_head tkt_q_heads[TKT_Q_NQUEUES];
> +
> +/* Advance to the next queue slot, wrapping around to the beginning. */
> +static int tkt_q_next_slot(int i)
> +{
> + return (++i < TKT_Q_NQUEUES) ? i : 0;
> +}
> +
> +/* Very crude hash from lock address to queue slot number. */
> +static unsigned long tkt_q_hash(arch_spinlock_t *lock)
> +{
> + return (((unsigned long)lock) >> 8) % TKT_Q_NQUEUES;
> +}
> +
> +/*
> + * Return a pointer to the queue header associated with the specified lock,
> + * or return NULL if there is no queue for the lock or if the lock's queue
> + * is in transition.
> + */
> +static struct tkt_q_head *tkt_q_find_head(arch_spinlock_t *lock)
> +{
> + int i;
> + int start;
> +
> + start = i = tkt_q_hash(lock);
> + do
> + if (ACCESS_ONCE(tkt_q_heads[i].ref) == lock)
> + return &tkt_q_heads[i];
> + while ((i = tkt_q_next_slot(i)) != start);
> + return NULL;
> +}
> +
> +/*
> + * Try to stop queuing, reverting back to normal ticket-lock operation.
> + * We can only stop queuing when the queue is empty, which means that
> + * we need to correctly handle races where someone shows up in the queue
> + * just as we are trying to dispense with the queue. They win, we lose.
> + */
> +static bool tkt_q_try_unqueue(arch_spinlock_t *lock, struct tkt_q_head *tqhp)
> +{
> + arch_spinlock_t asold;
> + arch_spinlock_t asnew;
> +
> + /* Pick up the ticket values. */
> + asold = ACCESS_ONCE(*lock);
> + if ((asold.tickets.head & ~0x1) == asold.tickets.tail) {
> +
> + /* Attempt to mark the lock as not having a queue. */
> + asnew = asold;
> + asnew.tickets.head &= ~0x1;
> + if (cmpxchg(&lock->head_tail,
> + asold.head_tail,
> + asnew.head_tail) == asold.head_tail) {
> +
> + /* Succeeded, mark the queue as unused. */
> + ACCESS_ONCE(tqhp->ref) = NULL;
> + return true;
> + }
> + }
> +
> + /* Failed, tell the caller there is still a queue to pass off to. */
> + return false;
> +}
> +
> +/*
> + * Hand the lock off to the first CPU on the queue.
> + */
> +void tkt_q_do_wake(arch_spinlock_t *lock)
> +{
> + struct tkt_q_head *tqhp;
> + struct tkt_q *tqp;
> +
> + /*
> + * If the queue is still being set up, wait for it. Note that
> + * the caller's xadd() provides the needed memory ordering.
> + */
> + while ((tqhp = tkt_q_find_head(lock)) == NULL)
> + cpu_relax();
> +
> + for (;;) {
> +
> + /* Find the first queue element. */
> + tqp = ACCESS_ONCE(tqhp->spin);
> + if (tqp != NULL)
> + break; /* Element exists, hand off lock. */
> + if (tkt_q_try_unqueue(lock, tqhp))
> + return; /* No element, successfully removed queue. */
> + cpu_relax();
> + }
> + if (ACCESS_ONCE(tqhp->head_tkt) != -1)
> + ACCESS_ONCE(tqhp->head_tkt) = -1;
> + smp_mb(); /* Order pointer fetch and assignment against handoff. */
> + ACCESS_ONCE(tqp->cpu) = -1;
> +}
> +EXPORT_SYMBOL(tkt_q_do_wake);
> +
> +/*
> + * Given a lock that already has a queue associated with it, spin on
> + * that queue. Return false if there was no queue (which means we do not
> + * hold the lock) and true otherwise (meaning we -do- hold the lock).
> + */
> +bool tkt_q_do_spin(arch_spinlock_t *lock, struct __raw_tickets inc)
> +{
> + struct tkt_q **oldtail;
> + struct tkt_q tq;
> + struct tkt_q_head *tqhp;
> +
> + /*
> + * Ensure that accesses to queue header happen after sensing
> + * the lock's have-queue bit.
> + */
> + smp_mb(); /* See above block comment. */
> +
> + /* If there no longer is a queue, leave. */
> + tqhp = tkt_q_find_head(lock);
> + if (tqhp == NULL)
> + return false;
> +
> + /* Initialize our queue element. */
> + tq.cpu = raw_smp_processor_id();
> + tq.tail = inc.tail;
> + tq.next = NULL;
> +
> + /* Check to see if we already hold the lock. */
> + if (ACCESS_ONCE(tqhp->head_tkt) == inc.tail) {
> + /* The last holder left before queue formed, we hold lock. */
> + tqhp->head_tkt = -1;
> + return true;
> + }
> +
> + /*
> + * Add our element to the tail of the queue. Note that if the
> + * queue is empty, the ->spin_tail pointer will reference
> + * the queue's head pointer, namely ->spin.
> + */
> + oldtail = xchg(&tqhp->spin_tail, &tq.next);
> + ACCESS_ONCE(*oldtail) = &tq;
> +
> + /* Spin until handoff. */
> + while (ACCESS_ONCE(tq.cpu) != -1)
> + cpu_relax();
> +
> + /*
> + * Remove our element from the queue. If the queue is now empty,
> + * update carefully so that the next acquisition will enqueue itself
> + * at the head of the list. Of course, the next enqueue operation
> + * might be happening concurrently, and this code needs to handle all
> + * of the possible combinations, keeping in mind that the enqueue
> + * operation happens in two stages: (1) update the tail pointer and
> + * (2) update the predecessor's ->next pointer. With this in mind,
> + * the following code needs to deal with three scenarios:
> + *
> + * 1. tq is the last entry. In this case, we use cmpxchg to
> + * point the list tail back to the list head (->spin). If
> + * the cmpxchg fails, that indicates that we are instead
> + * in scenario 2 below. If the cmpxchg succeeds, the next
> + * enqueue operation's tail-pointer exchange will enqueue
> + * the next element at the queue head, because the ->spin_tail
> + * pointer now references the queue head.
> + *
> + * 2. tq is the last entry, and the next entry has updated the
> + * tail pointer but has not yet updated tq.next. In this
> + * case, tq.next is NULL, the cmpxchg will fail, and the
> + * code will wait for the enqueue to complete before completing
> + * removal of tq from the list.
> + *
> + * 3. tq is not the last pointer. In this case, tq.next is non-NULL,
> + * so the following code simply removes tq from the list.
> + */
> + if (tq.next == NULL) {
> +
> + /* Mark the queue empty. */
> + tqhp->spin = NULL;
> +
> + /* Try to point the tail back at the head. */
> + if (cmpxchg(&tqhp->spin_tail,
> + &tq.next,
> + &tqhp->spin) == &tq.next)
> + return true; /* Succeeded, queue is now empty. */
> +
> + /* Failed, if needed, wait for the enqueue to complete. */
> + while (tq.next == NULL)
> + cpu_relax();
> +
> + /* The following code will repair the head. */
> + }
> + smp_mb(); /* Force ordering between handoff and critical section. */
> +
> + /*
> + * Advance list-head pointer. This same task will be the next to
> + * access this when releasing the lock, so no need for a memory
> + * barrier after the following assignment.
> + */
> + ACCESS_ONCE(tqhp->spin) = tq.next;
> + return true;
> +}
> +
> +/*
> + * Given a lock that does not have a queue, attempt to associate the
> + * i-th queue with it, returning true if successful (meaning we hold
> + * the lock) or false otherwise (meaning we do -not- hold the lock).
> + * Note that the caller has already filled in ->ref with 0x1, so we
> + * own the queue.
> + */
> +static bool
> +tkt_q_init_contend(int i, arch_spinlock_t *lock, struct __raw_tickets inc)
> +{
> + arch_spinlock_t asold;
> + arch_spinlock_t asnew;
> + struct tkt_q_head *tqhp;
> +
> + /* Initialize the i-th queue header. */
> + tqhp = &tkt_q_heads[i];
> + tqhp->spin = NULL;
> + tqhp->spin_tail = &tqhp->spin;
> +
> + /* Each pass through this loop attempts to mark the lock as queued. */
> + do {
> + asold.head_tail = ACCESS_ONCE(lock->head_tail);
> + asnew = asold;
> + if (asnew.tickets.head & 0x1) {
> +
> + /* Someone beat us to it, back out. */
> + smp_wmb(); /* Ensure init before NULLing. */
> + ACCESS_ONCE(tqhp->ref) = NULL;
> +
> + /* Spin on the queue element they set up. */
> + return tkt_q_do_spin(lock, inc);
> + }
> +
> + /*
> + * Record the head counter in case one of the spinning
> + * CPUs already holds the lock but doesn't realize it yet.
> + */
> + tqhp->head_tkt = asold.tickets.head;
> +
> + /* The low-order bit in the head counter says "queued". */
> + asnew.tickets.head |= 0x1;

If asold.tickets.head == inc.tail, we will quickly succeed in the next steps,
so we don't need to cancel & return directly just to avoid unneeded redo from
another CPU. But what about the same case in tkt_q_start_contend() ... ?

> + } while (cmpxchg(&lock->head_tail,
> + asold.head_tail,
> + asnew.head_tail) != asold.head_tail);
> +
> + /* Point the queue at the lock and go spin on it. */
> + ACCESS_ONCE(tqhp->ref) = lock;
> + return tkt_q_do_spin(lock, inc);
> +}
> +
> +/*
> + * Start handling a period of high contention by finding a queue to associate
> + * with this lock. Returns true if successful (in which case we hold the
> + * lock) and false otherwise (in which case we do -not- hold the lock).
> + */
> +bool tkt_q_start_contend(arch_spinlock_t *lock, struct __raw_tickets inc)
> +{
> + int i;
> + int start;
> +
> + /* Hash the lock address to find a starting point. */
> + start = i = tkt_q_hash(lock);
> +
> + /*
> + * Each pass through the following loop attempts to associate
> + * the lock with the corresponding queue.
> + */
> + do {
> + /*
> + * Use 0x1 to mark the queue in use, but also avoiding
> + * any spinners trying to use it before we get it all
> + * initialized.
> + */
> + if (!tkt_q_heads[i].ref &&
> + cmpxchg(&tkt_q_heads[i].ref,
> + NULL,
> + (arch_spinlock_t *)0x1) == NULL) {
> +
> + /* Succeeded, now go initialize it. */
> + return tkt_q_init_contend(i, lock, inc);
> + }
> +
> + /* If someone beat us to it, go spin on their queue. */
> + if (ACCESS_ONCE(lock->tickets.head) & 0x1)
> + return tkt_q_do_spin(lock, inc);

if (ACCESS_ONCE(lock->tickets.head) == inc.tail)
return true;

> + } while ((i = tkt_q_next_slot(i)) != start);
> +
> + /* All the queues are in use, revert to spinning on the ticket lock. */
> + return false;
> +}
> +
> +bool tkt_spin_pass(arch_spinlock_t *ap, struct __raw_tickets inc)
> +{
> + if (unlikely(inc.head & 0x1)) {
> +
> + /* This lock has a queue, so go spin on the queue. */
> + if (tkt_q_do_spin(ap, inc))
> + return true;
> +
> + /* Get here if the queue is in transition: Retry next time. */
> +
> + } else if (inc.tail - TKT_Q_SWITCH == inc.head) {
> +
> + /*
> + * This lock has lots of spinners, but no queue. Go create
> + * a queue to spin on.
> + *
> + * In the common case, only the single task that
> + * sees the head and tail tickets being different by
> + * exactly TKT_Q_SWITCH will come here to set up the queue,
> + * which prevents a "thundering herd" of queue setups.
> + * Although it is still possible for an unfortunate series
> + * of lock handoffs and newly arrived tasks to result
> + * in more than one task performing a queue setup, this
> + * is unlikely. Of course, this situation must still be
> + * handled correctly, which is the job of the cmpxchg()
> + * in tkt_q_start_contend().
> + */
> + if (tkt_q_start_contend(ap, inc))
> + return true;
> +
> + /* Get here if the queue is in transition: Retry next time. */
> + }
> +
> + /* Either no need for a queue or the queue is in transition. Spin. */
> + return false;
> +}
> +EXPORT_SYMBOL(tkt_spin_pass);
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/

2013-06-12 16:59:28

by Paul E. McKenney

[permalink] [raw]
Subject: Re: [PATCH RFC ticketlock] v3 Auto-queued ticketlock

On Thu, Jun 13, 2013 at 12:13:47AM +0800, Lai Jiangshan wrote:
> On Wed, Jun 12, 2013 at 11:40 PM, Paul E. McKenney
> <[email protected]> wrote:
> > Breaking up locks is better than implementing high-contention locks, but
> > if we must have high-contention locks, why not make them automatically
> > switch between light-weight ticket locks at low contention and queued
> > locks at high contention? After all, this would remove the need for
> > the developer to predict which locks will be highly contended.
> >
> > This commit allows ticket locks to automatically switch between pure
> > ticketlock and queued-lock operation as needed. If too many CPUs are
> > spinning on a given ticket lock, a queue structure will be allocated
> > and the lock will switch to queued-lock operation. When the lock becomes
> > free, it will switch back into ticketlock operation. The low-order bit
> > of the head counter is used to indicate that the lock is in queued mode,
> > which forces an unconditional mismatch between the head and tail counters.
> > This approach means that the common-case code path under conditions of
> > low contention is very nearly that of a plain ticket lock.
> >
> > A fixed number of queueing structures is statically allocated in an
> > array. The ticket-lock address is used to hash into an initial element,
> > but if that element is already in use, it moves to the next element. If
> > the entire array is already in use, continue to spin in ticket mode.
> >
> > Signed-off-by: Paul E. McKenney <[email protected]>
> > [ paulmck: Eliminate duplicate code and update comments (Steven Rostedt). ]
> > [ paulmck: Address Eric Dumazet review feedback. ]
> > [ paulmck: Use Lai Jiangshan idea to eliminate smp_mb(). ]
> > [ paulmck: Expand ->head_tkt from s32 to s64 (Waiman Long). ]
> > [ paulmck: Move cpu_relax() to main spin loop (Steven Rostedt). ]
> > [ paulmck: Reduce queue-switch contention (Waiman Long). ]
> > [ paulmck: __TKT_SPIN_INC for __ticket_spin_trylock() (Steffen Persvold). ]
> > [ paulmck: Type safety fixes (Steven Rostedt). ]
> > [ paulmck: Pre-check cmpxchg() value (Waiman Long). ]
> > [ paulmck: smp_mb() downgrade to smp_wmb() (Lai Jiangshan). ]
> >
> > diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h
> > index 33692ea..5aa0177 100644
> > --- a/arch/x86/include/asm/spinlock.h
> > +++ b/arch/x86/include/asm/spinlock.h
> > @@ -34,6 +34,21 @@
> > # define UNLOCK_LOCK_PREFIX
> > #endif
> >
> > +#ifdef CONFIG_TICKET_LOCK_QUEUED
> > +
> > +#define __TKT_SPIN_INC 2
> > +bool tkt_spin_pass(arch_spinlock_t *ap, struct __raw_tickets inc);
> > +
> > +#else /* #ifdef CONFIG_TICKET_LOCK_QUEUED */
> > +
> > +#define __TKT_SPIN_INC 1
> > +static inline bool tkt_spin_pass(arch_spinlock_t *ap, struct __raw_tickets inc)
> > +{
> > + return false;
> > +}
> > +
> > +#endif /* #else #ifdef CONFIG_TICKET_LOCK_QUEUED */
> > +
> > /*
> > * Ticket locks are conceptually two parts, one indicating the current head of
> > * the queue, and the other indicating the current tail. The lock is acquired
> > @@ -49,17 +64,16 @@
> > */
> > static __always_inline void __ticket_spin_lock(arch_spinlock_t *lock)
> > {
> > - register struct __raw_tickets inc = { .tail = 1 };
> > + register struct __raw_tickets inc = { .tail = __TKT_SPIN_INC };
> >
> > inc = xadd(&lock->tickets, inc);
> > -
> > for (;;) {
> > - if (inc.head == inc.tail)
> > + if (inc.head == inc.tail || tkt_spin_pass(lock, inc))
> > break;
> > cpu_relax();
> > inc.head = ACCESS_ONCE(lock->tickets.head);
> > }
> > - barrier(); /* make sure nothing creeps before the lock is taken */
> > + barrier(); /* Make sure nothing creeps in before the lock is taken. */
> > }
> >
> > static __always_inline int __ticket_spin_trylock(arch_spinlock_t *lock)
> > @@ -70,17 +84,33 @@ static __always_inline int __ticket_spin_trylock(arch_spinlock_t *lock)
> > if (old.tickets.head != old.tickets.tail)
> > return 0;
> >
> > - new.head_tail = old.head_tail + (1 << TICKET_SHIFT);
> > + new.head_tail = old.head_tail + (__TKT_SPIN_INC << TICKET_SHIFT);
> >
> > /* cmpxchg is a full barrier, so nothing can move before it */
> > return cmpxchg(&lock->head_tail, old.head_tail, new.head_tail) == old.head_tail;
> > }
> >
> > +#ifndef CONFIG_TICKET_LOCK_QUEUED
> > +
> > static __always_inline void __ticket_spin_unlock(arch_spinlock_t *lock)
> > {
> > __add(&lock->tickets.head, 1, UNLOCK_LOCK_PREFIX);
> > }
> >
> > +#else /* #ifndef CONFIG_TICKET_LOCK_QUEUED */
> > +
> > +extern void tkt_q_do_wake(arch_spinlock_t *lock);
> > +
> > +static __always_inline void __ticket_spin_unlock(arch_spinlock_t *lock)
> > +{
> > + __ticket_t head = 2;
> > +
> > + head = xadd(&lock->tickets.head, head);
> > + if (head & 0x1)
> > + tkt_q_do_wake(lock);
> > +}
> > +#endif /* #else #ifndef CONFIG_TICKET_LOCK_QUEUED */
> > +
> > static inline int __ticket_spin_is_locked(arch_spinlock_t *lock)
> > {
> > struct __raw_tickets tmp = ACCESS_ONCE(lock->tickets);
> > diff --git a/arch/x86/include/asm/spinlock_types.h b/arch/x86/include/asm/spinlock_types.h
> > index ad0ad07..cdaefdd 100644
> > --- a/arch/x86/include/asm/spinlock_types.h
> > +++ b/arch/x86/include/asm/spinlock_types.h
> > @@ -7,12 +7,18 @@
> >
> > #include <linux/types.h>
> >
> > -#if (CONFIG_NR_CPUS < 256)
> > +#if (CONFIG_NR_CPUS < 128)
> > typedef u8 __ticket_t;
> > typedef u16 __ticketpair_t;
> > -#else
> > +#define TICKET_T_CMP_GE(a, b) (UCHAR_MAX / 2 >= (unsigned char)((a) - (b)))
> > +#elif (CONFIG_NR_CPUS < 32768)
> > typedef u16 __ticket_t;
> > typedef u32 __ticketpair_t;
> > +#define TICKET_T_CMP_GE(a, b) (USHRT_MAX / 2 >= (unsigned short)((a) - (b)))
> > +#else
> > +typedef u32 __ticket_t;
> > +typedef u64 __ticketpair_t;
> > +#define TICKET_T_CMP_GE(a, b) (UINT_MAX / 2 >= (unsigned int)((a) - (b)))
> > #endif
> >
> > #define TICKET_SHIFT (sizeof(__ticket_t) * 8)
> > @@ -21,7 +27,11 @@ typedef struct arch_spinlock {
> > union {
> > __ticketpair_t head_tail;
> > struct __raw_tickets {
> > +#ifdef __BIG_ENDIAN__
> > + __ticket_t tail, head;
> > +#else /* #ifdef __BIG_ENDIAN__ */
> > __ticket_t head, tail;
> > +#endif /* #else #ifdef __BIG_ENDIAN__ */
> > } tickets;
> > };
> > } arch_spinlock_t;
> > diff --git a/include/linux/kernel.h b/include/linux/kernel.h
> > index e9ef6d6..816a87c 100644
> > --- a/include/linux/kernel.h
> > +++ b/include/linux/kernel.h
> > @@ -15,6 +15,7 @@
> > #include <asm/byteorder.h>
> > #include <uapi/linux/kernel.h>
> >
> > +#define UCHAR_MAX ((u8)(~0U))
> > #define USHRT_MAX ((u16)(~0U))
> > #define SHRT_MAX ((s16)(USHRT_MAX>>1))
> > #define SHRT_MIN ((s16)(-SHRT_MAX - 1))
> > diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
> > index 44511d1..900c0f0 100644
> > --- a/kernel/Kconfig.locks
> > +++ b/kernel/Kconfig.locks
> > @@ -223,3 +223,38 @@ endif
> > config MUTEX_SPIN_ON_OWNER
> > def_bool y
> > depends on SMP && !DEBUG_MUTEXES
> > +
> > +config TICKET_LOCK_QUEUED
> > + bool "Dynamically switch between ticket and queued locking"
> > + depends on SMP
> > + default n
> > + ---help---
> > + Enable dynamic switching between ticketlock and queued locking
> > + on a per-lock basis. This option will slow down low-contention
> > + acquisition and release very slightly (additional conditional
> > + in release path), but will provide more efficient operation at
> > + high levels of lock contention. High-contention operation will
> > + not be quite as efficient as would be a pure queued lock, but
> > + this dynamic approach consumes less memory than queued locks
> > + and also runs faster at low levels of contention.
> > +
> > + Say "Y" if you are running on a large system with a workload
> > + that is likely to result in high levels of contention.
> > +
> > + Say "N" if you are unsure.
> > +
> > +config TICKET_LOCK_QUEUED_SWITCH
> > + int "When to switch from ticket to queued locking"
> > + depends on TICKET_LOCK_QUEUED
> > + default 8
> > + range 3 32
> > + ---help---
> > + Specify how many tasks should be spinning on the lock before
> > + switching to queued mode. Systems with low-latency memory/cache
> > + interconnects will prefer larger numbers, while extreme low-latency
> > + and real-time workloads will prefer a smaller number. Of course,
> > + extreme real-time workloads would be even happier if contention
> > + on the locks were reduced to the point that there was never any
> > + need for queued locking in the first place.
> > +
> > + Take the default if you are unsure.
> > diff --git a/kernel/Makefile b/kernel/Makefile
> > index 271fd31..70a91f7 100644
> > --- a/kernel/Makefile
> > +++ b/kernel/Makefile
> > @@ -51,6 +51,7 @@ endif
> > obj-$(CONFIG_SMP) += spinlock.o
> > obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
> > obj-$(CONFIG_PROVE_LOCKING) += spinlock.o
> > +obj-$(CONFIG_TICKET_LOCK_QUEUED) += tktqlock.o
> > obj-$(CONFIG_UID16) += uid16.o
> > obj-$(CONFIG_MODULES) += module.o
> > obj-$(CONFIG_MODULE_SIG) += module_signing.o modsign_pubkey.o modsign_certificate.o
> > diff --git a/kernel/tktqlock.c b/kernel/tktqlock.c
> > new file mode 100644
> > index 0000000..912817c
> > --- /dev/null
> > +++ b/kernel/tktqlock.c
> > @@ -0,0 +1,383 @@
> > +/*
> > + * Queued ticket spinlocks.
> > + *
> > + * This program is free software; you can redistribute it and/or modify
> > + * it under the terms of the GNU General Public License as published by
> > + * the Free Software Foundation; either version 2 of the License, or
> > + * (at your option) any later version.
> > + *
> > + * This program is distributed in the hope that it will be useful,
> > + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
> > + * GNU General Public License for more details.
> > + *
> > + * You should have received a copy of the GNU General Public License
> > + * along with this program; if not, write to the Free Software
> > + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
> > + *
> > + * Copyright IBM Corporation, 2013
> > + *
> > + * Authors: Paul E. McKenney <[email protected]>
> > + */
> > +#include <linux/types.h>
> > +#include <linux/kernel.h>
> > +#include <linux/spinlock.h>
> > +#include <linux/smp.h>
> > +#include <linux/percpu.h>
> > +
> > +struct tkt_q {
> > + int cpu;
> > + __ticket_t tail;
> > + struct tkt_q *next;
> > +};
> > +
> > +struct tkt_q_head {
> > + arch_spinlock_t *ref; /* Pointer to spinlock if in use. */
> > + s64 head_tkt; /* Head ticket when started queuing. */
> > + struct tkt_q *spin; /* Head of queue. */
> > + struct tkt_q **spin_tail; /* Tail of queue. */
> > +};
> > +
> > +/*
> > + * TKT_Q_SWITCH is twice the number of CPUs that must be spinning on a
> > + * given ticket lock to motivate switching to spinning on a queue.
> > + * The reason that it is twice the number is because the bottom bit of
> > + * the ticket is reserved for the bit that indicates that a queue is
> > + * associated with the lock.
> > + */
> > +#define TKT_Q_SWITCH ((__ticket_t)(CONFIG_TICKET_LOCK_QUEUED_SWITCH * 2))
> > +
> > +/*
> > + * TKT_Q_NQUEUES is the number of queues to maintain. Large systems
> > + * might have multiple highly contended locks, so provide more queues for
> > + * systems with larger numbers of CPUs.
> > + */
> > +#define TKT_Q_NQUEUES (2 * DIV_ROUND_UP(NR_CPUS + ((int)TKT_Q_SWITCH) - 1, \
> > + (int)TKT_Q_SWITCH))
> > +
> > +/* The queues themselves. */
> > +struct tkt_q_head tkt_q_heads[TKT_Q_NQUEUES];
> > +
> > +/* Advance to the next queue slot, wrapping around to the beginning. */
> > +static int tkt_q_next_slot(int i)
> > +{
> > + return (++i < TKT_Q_NQUEUES) ? i : 0;
> > +}
> > +
> > +/* Very crude hash from lock address to queue slot number. */
> > +static unsigned long tkt_q_hash(arch_spinlock_t *lock)
> > +{
> > + return (((unsigned long)lock) >> 8) % TKT_Q_NQUEUES;
> > +}
> > +
> > +/*
> > + * Return a pointer to the queue header associated with the specified lock,
> > + * or return NULL if there is no queue for the lock or if the lock's queue
> > + * is in transition.
> > + */
> > +static struct tkt_q_head *tkt_q_find_head(arch_spinlock_t *lock)
> > +{
> > + int i;
> > + int start;
> > +
> > + start = i = tkt_q_hash(lock);
> > + do
> > + if (ACCESS_ONCE(tkt_q_heads[i].ref) == lock)
> > + return &tkt_q_heads[i];
> > + while ((i = tkt_q_next_slot(i)) != start);
> > + return NULL;
> > +}
> > +
> > +/*
> > + * Try to stop queuing, reverting back to normal ticket-lock operation.
> > + * We can only stop queuing when the queue is empty, which means that
> > + * we need to correctly handle races where someone shows up in the queue
> > + * just as we are trying to dispense with the queue. They win, we lose.
> > + */
> > +static bool tkt_q_try_unqueue(arch_spinlock_t *lock, struct tkt_q_head *tqhp)
> > +{
> > + arch_spinlock_t asold;
> > + arch_spinlock_t asnew;
> > +
> > + /* Pick up the ticket values. */
> > + asold = ACCESS_ONCE(*lock);
> > + if ((asold.tickets.head & ~0x1) == asold.tickets.tail) {
> > +
> > + /* Attempt to mark the lock as not having a queue. */
> > + asnew = asold;
> > + asnew.tickets.head &= ~0x1;
> > + if (cmpxchg(&lock->head_tail,
> > + asold.head_tail,
> > + asnew.head_tail) == asold.head_tail) {
> > +
> > + /* Succeeded, mark the queue as unused. */
> > + ACCESS_ONCE(tqhp->ref) = NULL;
> > + return true;
> > + }
> > + }
> > +
> > + /* Failed, tell the caller there is still a queue to pass off to. */
> > + return false;
> > +}
> > +
> > +/*
> > + * Hand the lock off to the first CPU on the queue.
> > + */
> > +void tkt_q_do_wake(arch_spinlock_t *lock)
> > +{
> > + struct tkt_q_head *tqhp;
> > + struct tkt_q *tqp;
> > +
> > + /*
> > + * If the queue is still being set up, wait for it. Note that
> > + * the caller's xadd() provides the needed memory ordering.
> > + */
> > + while ((tqhp = tkt_q_find_head(lock)) == NULL)
> > + cpu_relax();
> > +
> > + for (;;) {
> > +
> > + /* Find the first queue element. */
> > + tqp = ACCESS_ONCE(tqhp->spin);
> > + if (tqp != NULL)
> > + break; /* Element exists, hand off lock. */
> > + if (tkt_q_try_unqueue(lock, tqhp))
> > + return; /* No element, successfully removed queue. */
> > + cpu_relax();
> > + }
> > + if (ACCESS_ONCE(tqhp->head_tkt) != -1)
> > + ACCESS_ONCE(tqhp->head_tkt) = -1;
> > + smp_mb(); /* Order pointer fetch and assignment against handoff. */
> > + ACCESS_ONCE(tqp->cpu) = -1;
> > +}
> > +EXPORT_SYMBOL(tkt_q_do_wake);
> > +
> > +/*
> > + * Given a lock that already has a queue associated with it, spin on
> > + * that queue. Return false if there was no queue (which means we do not
> > + * hold the lock) and true otherwise (meaning we -do- hold the lock).
> > + */
> > +bool tkt_q_do_spin(arch_spinlock_t *lock, struct __raw_tickets inc)
> > +{
> > + struct tkt_q **oldtail;
> > + struct tkt_q tq;
> > + struct tkt_q_head *tqhp;
> > +
> > + /*
> > + * Ensure that accesses to queue header happen after sensing
> > + * the lock's have-queue bit.
> > + */
> > + smp_mb(); /* See above block comment. */
> > +
> > + /* If there no longer is a queue, leave. */
> > + tqhp = tkt_q_find_head(lock);
> > + if (tqhp == NULL)
> > + return false;
> > +
> > + /* Initialize our queue element. */
> > + tq.cpu = raw_smp_processor_id();
> > + tq.tail = inc.tail;
> > + tq.next = NULL;
> > +
> > + /* Check to see if we already hold the lock. */
> > + if (ACCESS_ONCE(tqhp->head_tkt) == inc.tail) {
> > + /* The last holder left before queue formed, we hold lock. */
> > + tqhp->head_tkt = -1;
> > + return true;
> > + }
> > +
> > + /*
> > + * Add our element to the tail of the queue. Note that if the
> > + * queue is empty, the ->spin_tail pointer will reference
> > + * the queue's head pointer, namely ->spin.
> > + */
> > + oldtail = xchg(&tqhp->spin_tail, &tq.next);
> > + ACCESS_ONCE(*oldtail) = &tq;
> > +
> > + /* Spin until handoff. */
> > + while (ACCESS_ONCE(tq.cpu) != -1)
> > + cpu_relax();
> > +
> > + /*
> > + * Remove our element from the queue. If the queue is now empty,
> > + * update carefully so that the next acquisition will enqueue itself
> > + * at the head of the list. Of course, the next enqueue operation
> > + * might be happening concurrently, and this code needs to handle all
> > + * of the possible combinations, keeping in mind that the enqueue
> > + * operation happens in two stages: (1) update the tail pointer and
> > + * (2) update the predecessor's ->next pointer. With this in mind,
> > + * the following code needs to deal with three scenarios:
> > + *
> > + * 1. tq is the last entry. In this case, we use cmpxchg to
> > + * point the list tail back to the list head (->spin). If
> > + * the cmpxchg fails, that indicates that we are instead
> > + * in scenario 2 below. If the cmpxchg succeeds, the next
> > + * enqueue operation's tail-pointer exchange will enqueue
> > + * the next element at the queue head, because the ->spin_tail
> > + * pointer now references the queue head.
> > + *
> > + * 2. tq is the last entry, and the next entry has updated the
> > + * tail pointer but has not yet updated tq.next. In this
> > + * case, tq.next is NULL, the cmpxchg will fail, and the
> > + * code will wait for the enqueue to complete before completing
> > + * removal of tq from the list.
> > + *
> > + * 3. tq is not the last pointer. In this case, tq.next is non-NULL,
> > + * so the following code simply removes tq from the list.
> > + */
> > + if (tq.next == NULL) {
> > +
> > + /* Mark the queue empty. */
> > + tqhp->spin = NULL;
> > +
> > + /* Try to point the tail back at the head. */
> > + if (cmpxchg(&tqhp->spin_tail,
> > + &tq.next,
> > + &tqhp->spin) == &tq.next)
> > + return true; /* Succeeded, queue is now empty. */
> > +
> > + /* Failed, if needed, wait for the enqueue to complete. */
> > + while (tq.next == NULL)
> > + cpu_relax();
> > +
> > + /* The following code will repair the head. */
> > + }
> > + smp_mb(); /* Force ordering between handoff and critical section. */
> > +
> > + /*
> > + * Advance list-head pointer. This same task will be the next to
> > + * access this when releasing the lock, so no need for a memory
> > + * barrier after the following assignment.
> > + */
> > + ACCESS_ONCE(tqhp->spin) = tq.next;
> > + return true;
> > +}
> > +
> > +/*
> > + * Given a lock that does not have a queue, attempt to associate the
> > + * i-th queue with it, returning true if successful (meaning we hold
> > + * the lock) or false otherwise (meaning we do -not- hold the lock).
> > + * Note that the caller has already filled in ->ref with 0x1, so we
> > + * own the queue.
> > + */
> > +static bool
> > +tkt_q_init_contend(int i, arch_spinlock_t *lock, struct __raw_tickets inc)
> > +{
> > + arch_spinlock_t asold;
> > + arch_spinlock_t asnew;
> > + struct tkt_q_head *tqhp;
> > +
> > + /* Initialize the i-th queue header. */
> > + tqhp = &tkt_q_heads[i];
> > + tqhp->spin = NULL;
> > + tqhp->spin_tail = &tqhp->spin;
> > +
> > + /* Each pass through this loop attempts to mark the lock as queued. */
> > + do {
> > + asold.head_tail = ACCESS_ONCE(lock->head_tail);
> > + asnew = asold;
> > + if (asnew.tickets.head & 0x1) {
> > +
> > + /* Someone beat us to it, back out. */
> > + smp_wmb(); /* Ensure init before NULLing. */
> > + ACCESS_ONCE(tqhp->ref) = NULL;
> > +
> > + /* Spin on the queue element they set up. */
> > + return tkt_q_do_spin(lock, inc);
> > + }
> > +
> > + /*
> > + * Record the head counter in case one of the spinning
> > + * CPUs already holds the lock but doesn't realize it yet.
> > + */
> > + tqhp->head_tkt = asold.tickets.head;
> > +
> > + /* The low-order bit in the head counter says "queued". */
> > + asnew.tickets.head |= 0x1;
>
> If asold.tickets.head == inc.tail, we will quickly succeed in the next steps,
> so we don't need to cancel & return directly just to avoid unneeded redo from
> another CPU. But what about the same case in tkt_q_start_contend() ... ?

I am not completely sure that I understand what you are asking.

The straightforward answer is that ->head_tkt handles this case, and it
is checked before we start spinning on our queue element.

Another possible answer is that the current version tends to avoid
having multiple CPUs attempting to concurrently switch a given lock to
queued mode.

So what was the real question? ;-)
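
As a purely illustrative aside, the ->head_tkt hint being discussed can be
modeled in a few lines of user-space C. The names below are invented and this
is not the kernel code, just the shape of the check:

#include <stdio.h>
#include <stdbool.h>

/*
 * When the lock switches to queued mode, the then-current head ticket is
 * recorded.  A spinner whose own ticket equals that value already owns the
 * lock and must not queue itself.
 */
struct qhead_sketch {
	long head_tkt;			/* -1 once the hint is consumed */
};

static bool already_holds_lock(struct qhead_sketch *q, unsigned int my_ticket)
{
	if (q->head_tkt == (long)my_ticket) {
		q->head_tkt = -1;	/* consume the hint */
		return true;
	}
	return false;
}

int main(void)
{
	struct qhead_sketch q = { .head_tkt = 6 };

	printf("%d\n", already_holds_lock(&q, 4));	/* 0: must queue */
	printf("%d\n", already_holds_lock(&q, 6));	/* 1: already owns the lock */
	return 0;
}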

> > + } while (cmpxchg(&lock->head_tail,
> > + asold.head_tail,
> > + asnew.head_tail) != asold.head_tail);
> > +
> > + /* Point the queue at the lock and go spin on it. */
> > + ACCESS_ONCE(tqhp->ref) = lock;
> > + return tkt_q_do_spin(lock, inc);
> > +}
> > +
> > +/*
> > + * Start handling a period of high contention by finding a queue to associate
> > + * with this lock. Returns true if successful (in which case we hold the
> > + * lock) and false otherwise (in which case we do -not- hold the lock).
> > + */
> > +bool tkt_q_start_contend(arch_spinlock_t *lock, struct __raw_tickets inc)
> > +{
> > + int i;
> > + int start;
> > +
> > + /* Hash the lock address to find a starting point. */
> > + start = i = tkt_q_hash(lock);
> > +
> > + /*
> > + * Each pass through the following loop attempts to associate
> > + * the lock with the corresponding queue.
> > + */
> > + do {
> > + /*
> > + * Use 0x1 to mark the queue in use, but also avoiding
> > + * any spinners trying to use it before we get it all
> > + * initialized.
> > + */
> > + if (!tkt_q_heads[i].ref &&
> > + cmpxchg(&tkt_q_heads[i].ref,
> > + NULL,
> > + (arch_spinlock_t *)0x1) == NULL) {
> > +
> > + /* Succeeded, now go initialize it. */
> > + return tkt_q_init_contend(i, lock, inc);
> > + }
> > +
> > + /* If someone beat us to it, go spin on their queue. */
> > + if (ACCESS_ONCE(lock->tickets.head) & 0x1)
> > + return tkt_q_do_spin(lock, inc);
>
> if (ACCESS_ONCE(lock->tickets.head) == inc.tail)
> return true;

Is this really a good change? The downside is that we fail to put the
lock into queued mode when it needs it. Sooner or later, someone else
should do so, but we will be suffering from memory contention in the
meantime. It is not clear to me that this is a win.

> > + } while ((i = tkt_q_next_slot(i)) != start);
> > +
> > + /* All the queues are in use, revert to spinning on the ticket lock. */
> > + return false;
> > +}
> > +
> > +bool tkt_spin_pass(arch_spinlock_t *ap, struct __raw_tickets inc)
> > +{
> > + if (unlikely(inc.head & 0x1)) {
> > +
> > + /* This lock has a queue, so go spin on the queue. */
> > + if (tkt_q_do_spin(ap, inc))
> > + return true;
> > +
> > + /* Get here if the queue is in transition: Retry next time. */
> > +
> > + } else if (inc.tail - TKT_Q_SWITCH == inc.head) {
> > +
> > + /*
> > + * This lock has lots of spinners, but no queue. Go create
> > + * a queue to spin on.
> > + *
> > + * In the common case, only the single task that
> > + * sees the head and tail tickets being different by
> > + * exactly TKT_Q_SWITCH will come here to set up the queue,
> > + * which prevents a "thundering herd" of queue setups.
> > + * Although it is still possible for an unfortunate series
> > + * of lock handoffs and newly arrived tasks to result
> > + * in more than one task performing a queue setup, this
> > + * is unlikely. Of course, this situation must still be
> > + * handled correctly, which is the job of the cmpxchg()
> > + * in tkt_q_start_contend().
> > + */
> > + if (tkt_q_start_contend(ap, inc))
> > + return true;
> > +
> > + /* Get here if the queue is in transition: Retry next time. */
> > + }
> > +
> > + /* Either no need for a queue or the queue is in transition. Spin. */
> > + return false;
> > +}
> > +EXPORT_SYMBOL(tkt_spin_pass);
> >
> > --
> > To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> > the body of a message to [email protected]
> > More majordomo info at http://vger.kernel.org/majordomo-info.html
> > Please read the FAQ at http://www.tux.org/lkml/
>
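
As a side note for readers, the switch decision in tkt_spin_pass() quoted
above can be modeled compactly in user space. Everything below is
illustrative only; the macro and function names are invented, and the
threshold value simply mirrors the patch's default of 8 waiters times 2:

#include <stdio.h>
#include <stdbool.h>

#define TKT_Q_SWITCH_SKETCH 16	/* 8 waiters * 2, low bit reserved for "queued" */

/*
 * Only the waiter whose ticket trails the head by exactly the switch
 * threshold tries to set up a queue, so at most one setup attempt is
 * made per generation of waiters.
 */
static bool should_start_queue(unsigned int head, unsigned int tail)
{
	if (head & 0x1)			/* a queue already exists */
		return false;
	return (unsigned int)(tail - TKT_Q_SWITCH_SKETCH) == head;
}

int main(void)
{
	printf("%d\n", should_start_queue(4, 4 + TKT_Q_SWITCH_SKETCH));	/* 1 */
	printf("%d\n", should_start_queue(4, 4 + TKT_Q_SWITCH_SKETCH + 2));	/* 0 */
	return 0;
}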

2013-06-12 17:50:13

by Davidlohr Bueso

[permalink] [raw]
Subject: Re: [PATCH RFC ticketlock] Auto-queued ticketlock

On Tue, 2013-06-11 at 14:10 -0400, Steven Rostedt wrote:
> Perhaps short work loads have a cold cache, and the impact on cache is
> not as drastic?
>
> It would be interesting to see what perf reports on these runs.

After running the aim7 workloads on Paul's v3 patch (same 80 core, 8
socket box - HT off) the results are quite similar to the v1. One
difference is that the five_sec workload benefited with +15% throughput
after 500 users.

Taking a further look at each workload:

* five_sec: spends a large amount of time in the newish mcs style lock
at the spin on owner for the inode->i_mutex:

24.13% 315655 reaim [kernel.kallsyms] [k] mspin_lock
|
--- mspin_lock
|
|--99.76%-- __mutex_lock_killable_slowpath
| mutex_lock_killable
| vfs_readdir
| SyS_getdents
| system_call_fastpath
| __getdents64

With this patch:
23.56% 310531 reaim [kernel.kallsyms] [k] mspin_lock
|
--- mspin_lock
|
|--99.78%-- __mutex_lock_killable_slowpath
| mutex_lock_killable
| vfs_readdir
| SyS_getdents
| system_call
| __getdents64

* custom: Got a -33% throughput regression with this patch with 10-100
users and -46% with 100 users and up. It spends most of its kernel-space
time trying to take the inode->i_mutex and the ext4 ->s_orphan_lock
(note that all runs are performed on ramdisks with ext4):

3.12% 137131 reaim [kernel.kallsyms] [k] mspin_lock
|
--- mspin_lock
|
|--82.98%-- __mutex_lock_killable_slowpath
| mutex_lock_killable
| vfs_readdir
| SyS_getdents
| system_call_fastpath
| __getdents64
|
|--16.97%-- __mutex_lock_slowpath
| mutex_lock
| |
| |--47.65%-- ext4_orphan_del
| |--45.01%-- ext4_orphan_add

With this patch:
2.14% 109982 reaim [kernel.kallsyms] [k] mspin_lock
|
--- mspin_lock
|
|--68.67%-- __mutex_lock_killable_slowpath
| mutex_lock_killable
| vfs_readdir
| SyS_getdents
| system_call
| __getdents64
|
|--31.24%-- __mutex_lock_slowpath
| mutex_lock
| |
| |--40.36%-- ext4_orphan_del


* short: is the big winner for this patch, +69% throughput improvement
with 100-2000 users. This makes a lot of sense since the workload spends
a ridiculous amount of time trying to acquire the d_lock:

84.86% 1569902 reaim [kernel.kallsyms] [k] _raw_spin_lock
|
--- _raw_spin_lock
|
|--49.96%-- dget_parent
| __fsnotify_parent
|--49.71%-- dput
| |
| |--99.98%-- __fsnotify_parent

With this patch:
70.65% 467422 reaim [kernel.kallsyms] [k] tkt_q_do_spin
|
--- tkt_q_do_spin
|
|--100.00%-- tkt_spin_pass
| |
| |--100.00%-- _raw_spin_lock
| | |
| | |--50.07%-- dget_parent
| | | __fsnotify_parent
| | |--49.93%-- dput
| | | __fsnotify_parent


* disk: This patch does better as concurrency increases: -57% with
10-100 users, -25% with 100-1000 users, and +8% with over 1000 users.
Spends a good amount of time dealing with the wait_queue lock. The perf
traces are with 80 users, where we see the worst numbers:

22.34% 20400 reaim [kernel.kallsyms] [k] _raw_spin_lock_irqsave
|
--- _raw_spin_lock_irqsave
|
|--50.28%-- __wake_up
| |
| |--99.10%-- __wake_up_bit
| | wake_up_bit
| | unlock_buffer
|
|--33.73%-- prepare_to_wait_exclusive
| __wait_on_bit_lock
| out_of_line_wait_on_bit_lock
| __lock_buffer
| do_get_write_access
| jbd2_journal_get_write_access
| __ext4_journal_get_write_access
|--14.76%-- finish_wait
| |
| |--98.93%-- __wait_on_bit_lock
| | out_of_line_wait_on_bit_lock
| | __lock_buffer
| | do_get_write_access
| | jbd2_journal_get_write_access
| | __ext4_journal_get_write_access


With this patch, the time spent in the mentioned spinlocks is
considerably reduced:
8.09% 6237 reaim [kernel.kallsyms] [k] __read_lock_failed
|
--- __read_lock_failed
_raw_read_lock
|
|--99.08%-- start_this_handle
| jbd2__journal_start
| __ext4_journal_start_sb

1.48% 1032 reaim [kernel.kallsyms] [k] _raw_spin_lock_irqsave
|
--- _raw_spin_lock_irqsave
|
|--50.77%-- prepare_to_wait
| |
| |--72.61%-- jbd2_log_wait_commit
| | jbd2_complete_transaction
|--21.76%-- prepare_to_wait_exclusive
| __wait_on_bit_lock
| out_of_line_wait_on_bit_lock
| __lock_buffer
| do_get_write_access
| jbd2_journal_get_write_access
|--11.46%-- __wake_up
| |
| |--44.21%-- ftrace_define_fields_jbd2_run_stats
| | __ext4_journal_stop
|--10.39%-- finish_wait
| |
| |--53.18%-- __wait_on_bit_lock
| | out_of_line_wait_on_bit_lock
| | __lock_buffer
| | do_get_write_access
| | jbd2_journal_get_write_access
| | __ext4_journal_get_write_access


Thanks,
Davidlohr

2013-06-12 18:15:24

by Linus Torvalds

[permalink] [raw]
Subject: Re: [PATCH RFC ticketlock] Auto-queued ticketlock

On Wed, Jun 12, 2013 at 10:50 AM, Davidlohr Bueso
<[email protected]> wrote:
>
> * short: is the big winner for this patch, +69% throughput improvement
> with 100-2000 users. This makes a lot of sense since the workload spends
> a ridiculous amount of time trying to acquire the d_lock:
>
> 84.86% 1569902 reaim [kernel.kallsyms] [k] _raw_spin_lock
> |
> --- _raw_spin_lock
> |
> |--49.96%-- dget_parent
> | __fsnotify_parent
> |--49.71%-- dput

Ugh. Do you have any idea what the heck that thing actually does?

Normally, we shouldn't see lots of dget contention, since the dcache
these days does everything but the last path component locklessly.

But there are a few exceptions, like symlinks (they act as a "last component"
in the middle). And obviously, if some crazy threaded program opens
the *same* file concurrently over and over again, then that "last
component" will hammer on the dentry lock of that particular path. But
that "open the same file concurrently" seems totally unrealistic -
although maybe that's what AIM does...

Anybody know the AIM subtests?

Also, we *may* actually be able to optimize this by making
dentry->d_count atomic, which will allow us to often do dget_parent
and put() without taking the dcache lock at all. That's what it used
to be, but the RCU patches actually made it be protected by the
d_lock. It made sense at the time, as a step in the sequence, and many
of the dentry d_count accesses are under the lock, but now that the
remaining hot-paths are dget_parent and dput and many of the dentry
d_count increments are gone from the hot-paths, we might want to
re-visit that decision. It could go either way.
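
A rough user-space sketch of the kind of change being floated here, namely an
atomic reference count so that the common get/put paths avoid the lock
entirely (the names are simplified stand-ins, not the actual dcache code):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Hypothetical, heavily simplified dentry with an atomic refcount. */
struct dentry_sketch {
	atomic_int d_count;
};

/* Take a reference without touching any lock. */
static void dget_sketch(struct dentry_sketch *d)
{
	atomic_fetch_add_explicit(&d->d_count, 1, memory_order_relaxed);
}

/*
 * Drop a reference; only the caller that drops the last one would need
 * to fall back to the locked slow path for teardown.
 */
static bool dput_sketch(struct dentry_sketch *d)
{
	return atomic_fetch_sub_explicit(&d->d_count, 1,
					 memory_order_acq_rel) == 1;
}

int main(void)
{
	struct dentry_sketch d = { .d_count = 1 };

	dget_sketch(&d);
	printf("last ref? %d\n", dput_sketch(&d));	/* 0: not the last */
	printf("last ref? %d\n", dput_sketch(&d));	/* 1: last, slow path */
	return 0;
}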

Al, comments?

Linus

2013-06-12 18:18:48

by Steven Rostedt

[permalink] [raw]
Subject: Re: [PATCH RFC ticketlock] Auto-queued ticketlock

On Wed, 2013-06-12 at 10:50 -0700, Davidlohr Bueso wrote:
> On Tue, 2013-06-11 at 14:10 -0400, Steven Rostedt wrote:
> > Perhaps short work loads have a cold cache, and the impact on cache is
> > not as drastic?
> >
> > It would be interesting to see what perf reports on these runs.
>
> After running the aim7 workloads on Paul's v3 patch (same 80 core, 8
> socket box - HT off) the results are quite similar to the v1. One
> difference is that the five_sec workload benefited with +15% throughput
> after 500 users.

Thanks,

>
> Taking a further look at each workload:
>
> * five_sec: spends a large amount of time in the newish mcs style lock
> at the spin on owner for the inode->i_mutex:
>
> 24.13% 315655 reaim [kernel.kallsyms] [k] mspin_lock
> |
> --- mspin_lock
> |
> |--99.76%-- __mutex_lock_killable_slowpath
> | mutex_lock_killable
> | vfs_readdir
> | SyS_getdents
> | system_call_fastpath
> | __getdents64
>
> With this patch:
> 23.56% 310531 reaim [kernel.kallsyms] [k] mspin_lock
> |
> --- mspin_lock
> |
> |--99.78%-- __mutex_lock_killable_slowpath
> | mutex_lock_killable
> | vfs_readdir
> | SyS_getdents
> | system_call
> | __getdents64

Note, the mspin_lock is not interesting, as it's not affected by this
patch.

>
> * custom: Got a -33% throughput regression with this patch with 10-100
> users and -46% with 100 users and up. It spends most of its kernel-space
> time trying to take the inode->i_mutex and the ext4 ->s_orphan_lock
> (note that all runs are performed on ramdisks with ext4):
>
> 3.12% 137131 reaim [kernel.kallsyms] [k] mspin_lock
> |
> --- mspin_lock
> |
> |--82.98%-- __mutex_lock_killable_slowpath
> | mutex_lock_killable
> | vfs_readdir
> | SyS_getdents
> | system_call_fastpath
> | __getdents64
> |
> |--16.97%-- __mutex_lock_slowpath
> | mutex_lock
> | |
> | |--47.65%-- ext4_orphan_del
> | |--45.01%-- ext4_orphan_add
>
> With this patch:
> 2.14% 109982 reaim [kernel.kallsyms] [k] mspin_lock

Less time in the mspin_lock as it's probably now in the real spin lock
somewhere.

> |
> --- mspin_lock
> |
> |--68.67%-- __mutex_lock_killable_slowpath
> | mutex_lock_killable
> | vfs_readdir
> | SyS_getdents
> | system_call
> | __getdents64
> |
> |--31.24%-- __mutex_lock_slowpath
> | mutex_lock
> | |
> | |--40.36%-- ext4_orphan_del
>
>
> * short: is the big winner for this patch, +69% throughput improvement
> with 100-2000 users. This makes a lot of sense since the workload spends
> a ridiculous amount of time trying to acquire the d_lock:
>
> 84.86% 1569902 reaim [kernel.kallsyms] [k] _raw_spin_lock
> |
> --- _raw_spin_lock
> |
> |--49.96%-- dget_parent
> | __fsnotify_parent
> |--49.71%-- dput
> | |
> | |--99.98%-- __fsnotify_parent
>
> With this patch:
> 70.65% 467422 reaim [kernel.kallsyms] [k] tkt_q_do_spin
> |
> --- tkt_q_do_spin
> |
> |--100.00%-- tkt_spin_pass
> | |
> | |--100.00%-- _raw_spin_lock
> | | |
> | | |--50.07%-- dget_parent
> | | | __fsnotify_parent
> | | |--49.93%-- dput
> | | | __fsnotify_parent

This looks to be where the patch helps. The run without the patch is
hammering away at the d_lock's cacheline, which I'm sure is shared with
other items in the dentry, such as the d_count. With the patch,
the spin is on a separate cacheline and doesn't affect the owner so
much.

>
>
> * disk: This patch benefits as more concurrency is added. Got -57% with
> 10-100 users, -25% with 100-1000 users and +8% with over 1000 users.
> Spends a good amount of time dealing with the wait_queue lock. The perf
> traces are with 80 users, where we see the worst numbers:
>
> 22.34% 20400 reaim [kernel.kallsyms] [k] _raw_spin_lock_irqsave
> |
> --- _raw_spin_lock_irqsave
> |
> |--50.28%-- __wake_up
> | |
> | |--99.10%-- __wake_up_bit
> | | wake_up_bit
> | | unlock_buffer
> |
> |--33.73%-- prepare_to_wait_exclusive
> | __wait_on_bit_lock
> | out_of_line_wait_on_bit_lock
> | __lock_buffer
> | do_get_write_access
> | jbd2_journal_get_write_access
> | __ext4_journal_get_write_access
> |--14.76%-- finish_wait
> | |
> | |--98.93%-- __wait_on_bit_lock
> | | out_of_line_wait_on_bit_lock
> | | __lock_buffer
> | | do_get_write_access
> | | jbd2_journal_get_write_access
> | | __ext4_journal_get_write_access
>
>
> With this patch the time spent in the mentioned spinlocks is
> considerably reduced:
> 8.09% 6237 reaim [kernel.kallsyms] [k] __read_lock_failed
> |
> --- __read_lock_failed
> _raw_read_lock
> |
> |--99.08%-- start_this_handle
> | jbd2__journal_start
> | __ext4_journal_start_sb
>
> 1.48% 1032 reaim [kernel.kallsyms] [k] _raw_spin_lock_irqsave
> |
> --- _raw_spin_lock_irqsave
> |
> |--50.77%-- prepare_to_wait
> | |
> | |--72.61%-- jbd2_log_wait_commit
> | | jbd2_complete_transaction
> |--21.76%-- prepare_to_wait_exclusive
> | __wait_on_bit_lock
> | out_of_line_wait_on_bit_lock
> | __lock_buffer
> | do_get_write_access
> | jbd2_journal_get_write_access
> |--11.46%-- __wake_up
> | |
> | |--44.21%-- ftrace_define_fields_jbd2_run_stats
> | | __ext4_journal_stop
> |--10.39%-- finish_wait
> | |
> | |--53.18%-- __wait_on_bit_lock
> | | out_of_line_wait_on_bit_lock
> | | __lock_buffer
> | | do_get_write_access
> | | jbd2_journal_get_write_access
> | | __ext4_journal_get_write_access

Interesting that this trace doesn't show it going into patch code at
all. I wonder if adding the slight overhead to the spin lock itself
shifts things enough to get a benefit by avoiding contention?

-- Steve

2013-06-12 20:03:27

by Davidlohr Bueso

[permalink] [raw]
Subject: Re: [PATCH RFC ticketlock] Auto-queued ticketlock

On Wed, 2013-06-12 at 11:15 -0700, Linus Torvalds wrote:
> On Wed, Jun 12, 2013 at 10:50 AM, Davidlohr Bueso
> <[email protected]> wrote:
> >
> > * short: is the big winner for this patch, +69% throughput improvement
> > with 100-2000 users. This makes a lot of sense since the workload spends
> > a ridiculous amount of time trying to acquire the d_lock:
> >
> > 84.86% 1569902 reaim [kernel.kallsyms] [k] _raw_spin_lock
> > |
> > --- _raw_spin_lock
> > |
> > |--49.96%-- dget_parent
> > | __fsnotify_parent
> > |--49.71%-- dput
>
> Ugh. Do you have any idea what the heck that thing actually does?

Waiman's dcache patchset was actually an attempt to address these exact
issues: http://lkml.org/lkml/2013/5/22/716

According to him:

"the short workload calls security functions like getpwnam(),
getpwuid(), getgrgid() a couple of times. These functions open
the /etc/passwd or /etc/group files, read their content and close the
files.
It is the intensive open/read/close sequence from multiple threads that
is causing 80%+ contention in the d_lock on a system with large number
of cores. The MIT's MOSBench paper also outlined dentry reference
counting as a scalability roadblock for its Apache and Exim tests."

>
> Normally, we shouldn't see lots of dget contention, since the dcache
> these days does everything but the last path component locklessly.
>
> But there's a few exceptions, like symlinks (act as "last component"
> in the middle). And obviously, if some crazy threaded program opens
> the *same* file concurrently over and over again, then that "last
> component" will hammer on the dentry lock of that particular path. But
> that "open the same file concurrently" seems totally unrealistic -
> although maybe that's what AIM does..
>
> Anybody know the AIM subtests?
>
> Also, we *may* actually be able to optimize this by making
> dentry->d_count atomic, which will allow us to often do dget_parent
> and put() without taking the dcache lock at all. That's what it used
> to be, but the RCU patches actually made it be protected by the
> d_lock. It made sense at the time, as a step in the sequence, and many
> of the dentry d_count accesses are under the lock, but now that the
> remaining hot-paths are dget_parent and dput and many of the dentry
> d_count increments are gone from the hot-paths, we might want to
> re-visit that decision. It could go either way.

I did a quick attempt at this (patch attached). For the short workload,
we now have:

76.90% 928688 reaim [kernel.kallsyms] [k] _raw_spin_lock
|
--- _raw_spin_lock
|
|--99.69%-- dget_parent
| __fsnotify_parent
| |
| |--20.23%-- fsnotify_access
| | vfs_read
| |--20.13%-- __fput
| | ____fput
| | task_work_run
| |--20.07%-- security_file_permission
| | rw_verify_area
| | vfs_read
| |--19.97%-- do_sys_open
| | SyS_open
| --19.60%-- security_file_open
| do_dentry_open

Still 76%!!! Throughput wise we do have a very nice boost when compared
to the vanilla kernel:

10-100 users: +47%
100-1000 users: +76%
1000-2000 users: +76%

Thanks,
Davidlohr


Attachments:
atomic_dcount.patch (15.02 kB)

2013-06-12 20:26:28

by Linus Torvalds

[permalink] [raw]
Subject: Re: [PATCH RFC ticketlock] Auto-queued ticketlock

On Wed, Jun 12, 2013 at 1:03 PM, Davidlohr Bueso <[email protected]> wrote:
>
> According to him:
>
> "the short workload calls security functions like getpwnam(),
> getpwuid(), getgrgid() a couple of times. These functions open
> the /etc/passwd or /etc/group files, read their content and close the
> files.

Ahh, ok. So yeah, it's multiple threads all hitting the same file.

I guess that /etc/passwd case is historically interesting, but I'm not
sure we really want to care too deeply..

> I did a quick attempt at this (patch attached).

Yeah, that's wrong, although it probably approximates the dget() case
(but incorrectly).

One of the points behind using an atomic d_count is that then dput() should do

if (!atomic_dec_and_lock(&dentry->d_count, &dentry->d_lock))
return;

at the very top of the function. It can avoid taking the lock entirely
if the count doesn't go down to zero, which would be a common case if
you have lots of users opening the same file. While still protecting
d_count from ever going to zero while the lock is held.
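
A rough userspace model of that fast path, purely for illustration: the
struct, the pthread mutex standing in for d_lock, and every helper name
below are made up, and dec_and_lock() only approximates what the kernel's
atomic_dec_and_lock() does.

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct obj {
	atomic_int refcount;		/* stands in for d_count */
	pthread_mutex_t lock;		/* stands in for d_lock */
};

/* Decrement refcount; return true with the lock held iff it hit zero. */
static bool dec_and_lock(atomic_int *cnt, pthread_mutex_t *lock)
{
	int old = atomic_load(cnt);

	/* Fast path: another reference remains, so no lock is needed. */
	while (old > 1)
		if (atomic_compare_exchange_weak(cnt, &old, old - 1))
			return false;

	/* Slow path: the count may reach zero, so take the lock first. */
	pthread_mutex_lock(lock);
	if (atomic_fetch_sub(cnt, 1) == 1)
		return true;		/* dropped to zero, lock still held */
	pthread_mutex_unlock(lock);
	return false;
}

static void put_obj(struct obj *o)
{
	if (!dec_and_lock(&o->refcount, &o->lock))
		return;			/* common case: no teardown needed */
	printf("last reference dropped, tearing down\n");
	pthread_mutex_unlock(&o->lock);
}

int main(void)
{
	struct obj o = { .refcount = 2, .lock = PTHREAD_MUTEX_INITIALIZER };

	put_obj(&o);			/* 2 -> 1: lockless fast path */
	put_obj(&o);			/* 1 -> 0: slow path, lock taken */
	return 0;
}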

Your

+ if (atomic_read(&dentry->d_count) > 1) {
+ atomic_dec(&dentry->d_count);
+ return;
+ }
+ spin_lock(&dentry->d_lock);

pattern is fundamentally racy, but it's what "atomic_dec_and_lock()"
should do race-free.

For similar reasons, I think you need to still maintain the d_lock in
d_prune_aliases etc. That's a slow-path, so the fact that we add an
atomic sequence there doesn't much matter.

However, one optimization missing from your patch is obvious in the
profile. "dget_parent()" also needs to be optimized - you still have
that as 99% of the spin-lock case. I think we could do something like

rcu_read_lock();
parent = ACCESS_ONCE(dentry->d_parent);
if (atomic_inc_not_zero(&parent->d_count))
return parent;
.. get d_lock and do it the slow way ...
rcu_read_unlock();

to locklessly get the parent pointer. We know "parent" isn't going
away (dentries are rcu-free'd and we hold the rcu read lock), and I
think that we can optimistically take *any* parent dentry that
happened to be valid at one point. As long as the refcount didn't go
down to zero. Al?

With dput and dget_parent() both being lockless for the common case,
you might get rid of the d_lock contention entirely for that load. I
dunno. And I should really think more about that dget_parent() thing a
bit more, but I cannot imagine how it could not be right (because even
with the current d_lock model, the lock is gotten *within*
dget_parent(), so the caller can never know if it gets a new or an old
parent, so there is no higher-level serialization going on - and we
might as well return *either* the new or the old as such).

I really want Al to double-check me if we decide to try going down
this hole. But the above two fixes to your patch should at least
approximate the d_lock changes, even if I'd have to look more closely
at the other details of your patch..

Linus

2013-06-12 20:37:17

by Linus Torvalds

[permalink] [raw]
Subject: Re: [PATCH RFC ticketlock] Auto-queued ticketlock

On Wed, Jun 12, 2013 at 1:03 PM, Davidlohr Bueso <[email protected]> wrote:
>
> Waiman's dcache patchset was actually an attempt to address these exact
> issues: http://lkml.org/lkml/2013/5/22/716

Ok, looking at that patch-set, I think it has the same race with not
atomically getting the d_lock spinlock and d_count going down to zero
in dput(). And Waiman clearly didn't know about
"atomic_inc_not_zero()" or "atomic_dec_and_lock()" that are designed
for exactly the "increment if already nonzero" and "decrement without
taking the lock if we're not going down to zero" cases.

As outlined, I'm also not at all sure that the whole seqrw-lock thing
that Waiman did is really necessary - I think the optimistic
dget_parent() might be sufficient.

Linus

2013-06-12 20:40:39

by Davidlohr Bueso

[permalink] [raw]
Subject: Re: [PATCH RFC ticketlock] Auto-queued ticketlock

On Wed, 2013-06-12 at 13:26 -0700, Linus Torvalds wrote:
> On Wed, Jun 12, 2013 at 1:03 PM, Davidlohr Bueso <[email protected]> wrote:
> >
> > According to him:
> >
> > "the short workload calls security functions like getpwnam(),
> > getpwuid(), getgrgid() a couple of times. These functions open
> > the /etc/passwd or /etc/group files, read their content and close the
> > files.
>
> Ahh, ok. So yeah, it's multiple threads all hitting the same file.
>
> I guess that /etc/passwd case is historically interesting, but I'm not
> sure we really want to care too deeply..
>
> > I did a quick attempt at this (patch attached).
>
> Yeah, that's wrong, although it probably approximates the dget() case
> (but incorrectly).

Indeed, it was only a proof of concept.

> One of the points behind using an atomic d_count is that then dput() should do
>
> if (!atomic_dec_and_lock(&dentry->d_count, &dentry->d_lock))
> return;

noted.

> at the very top of the function. It can avoid taking the lock entirely
> if the count doesn't go down to zero, which would be a common case if
> you have lots of users opening the same file. While still protecting
> d_count from ever going to zero while the lock is held.
>
> Your
>
> + if (atomic_read(&dentry->d_count) > 1) {
> + atomic_dec(&dentry->d_count);
> + return;
> + }
> + spin_lock(&dentry->d_lock);
>
> pattern is fundamentally racy, but it's what "atomic_dec_and_lock()"
> should do race-free.
> For similar reasons, I think you need to still maintain the d_lock in
> d_prune_aliases etc. That's a slow-path, so the fact that we add an
> atomic sequence there doesn't much matter.
>
> However, one optimization missing from your patch is obvious in the
> profile. "dget_parent()" also needs to be optimized - you still have
> that as 99% of the spin-lock case. I think we could do something like
>
> rcu_read_lock();
> parent = ACCESS_ONCE(dentry->d_parent);
> if (atomic_inc_not_zero(&parent->d_count))
> return parent;
> .. get d_lock and do it the slow way ...
> rcu_read_unlock();
>
> to locklessly get the parent pointer. We know "parent" isn't going
> away (dentries are rcu-free'd and we hold the rcu read lock), and I
> think that we can optimistically take *any* parent dentry that
> happened to be valid at one point. As long as the refcount didn't go
> down to zero. Al?
>
> With dput and dget_parent() both being lockless for the common case,
> you might get rid of the d_lock contention entirely for that load. I
> dunno. And I should really think more about that dget_parent() thing a
> bit more, but I cannot imagine how it could not be right (because even
> with the current d_lock model, the lock is gotten *within*
> dget_parent(), so the caller can never know if it gets a new or an old
> parent, so there is no higher-level serialization going on - and we
> might as well return *either* the new or the old as such).
>
> I really want Al to double-check me if we decide to try going down
> this hole. But the above two fixes to your patch should at least
> approximate the d_lock changes, even if I'd have to look more closely
> at the other details of your patch..

Ok, I'll try to rerun and send a more conscientious patch. Thanks for the
tips.

Davidlohr

2013-06-12 21:07:10

by Raymond Jennings

[permalink] [raw]
Subject: Re: [PATCH RFC ticketlock] Auto-queued ticketlock

On Wed, 2013-06-12 at 13:26 -0700, Linus Torvalds wrote:
> On Wed, Jun 12, 2013 at 1:03 PM, Davidlohr Bueso <[email protected]> wrote:
> >
> > According to him:
> >
> > "the short workload calls security functions like getpwnam(),
> > getpwuid(), getgrgid() a couple of times. These functions open
> > the /etc/passwd or /etc/group files, read their content and close the
> > files.
>
> Ahh, ok. So yeah, it's multiple threads all hitting the same file

If that's the case and it's a bunch of reads, shouldn't they act
concurrently anyway?

I mean it's not like dentries are being changed or added or removed in
this case.

> I guess that /etc/passwd case is historically interesting, but I'm not
> sure we really want to care too deeply..
>
> > I did a quick attempt at this (patch attached).
>
> Yeah, that's wrong, although it probably approximates the dget() case
> (but incorrectly).
>
> One of the points behind using an atomic d_count is that then dput() should do
>
> if (!atomic_dec_and_lock(&dentry->d_count, &dentry->d_lock))
> return;
>
> at the very top of the function. It can avoid taking the lock entirely
> if the count doesn't go down to zero, which would be a common case if
> you have lots of users opening the same file. While still protecting
> d_count from ever going to zero while the lock is held.
>
> Your
>
> + if (atomic_read(&dentry->d_count) > 1) {
> + atomic_dec(&dentry->d_count);
> + return;
> + }
> + spin_lock(&dentry->d_lock);
>
> pattern is fundamentally racy, but it's what "atomic_dec_and_lock()"
> should do race-free.
>
> For similar reasons, I think you need to still maintain the d_lock in
> d_prune_aliases etc. That's a slow-path, so the fact that we add an
> atomic sequence there doesn't much matter.
>
> However, one optimization missing from your patch is obvious in the
> profile. "dget_parent()" also needs to be optimized - you still have
> that as 99% of the spin-lock case. I think we could do something like
>
> rcu_read_lock();
> parent = ACCESS_ONCE(dentry->d_parent);
> if (atomic_inc_not_zero(&parent->d_count))
> return parent;
> .. get d_lock and do it the slow way ...
> rcu_read_unlock();
>
> to locklessly get the parent pointer. We know "parent" isn't going
> away (dentries are rcu-free'd and we hold the rcu read lock), and I
> think that we can optimistically take *any* parent dentry that
> happened to be valid at one point. As long as the refcount didn't go
> down to zero. Al?
>
> With dput and dget_parent() both being lockless for the common case,
> you might get rid of the d_lock contention entirely for that load. I
> dunno. And I should really think more about that dget_parent() thing a
> bit more, but I cannot imagine how it could not be right (because even
> with the current d_lock model, the lock is gotten *within*
> dget_parent(), so the caller can never know if it gets a new or an old
> parent, so there is no higher-level serialization going on - and we
> might as well return *either* the new or the old as such).
>
> I really want Al to double-check me if we decide to try going down
> this hole. But the above two fixes to your patch should at least
> approximate the d_lock changes, even if I'd have to look more closely
> at the other details of your patch..
>
> Linus

2013-06-12 23:32:46

by Al Viro

[permalink] [raw]
Subject: Re: [PATCH RFC ticketlock] Auto-queued ticketlock

On Wed, Jun 12, 2013 at 01:26:25PM -0700, Linus Torvalds wrote:

> For similar reasons, I think you need to still maintain the d_lock in
> d_prune_aliases etc. That's a slow-path, so the fact that we add an
> atomic sequence there doesn't much matter.
>
> However, one optimization missing from your patch is obvious in the
> profile. "dget_parent()" also needs to be optimized - you still have
> that as 99% of the spin-lock case. I think we could do something like
>
> rcu_read_lock();
> parent = ACCESS_ONCE(dentry->d_parent);
> if (atomic_inc_not_zero(&parent->d_count))
> return parent;
> .. get d_lock and do it the slow way ...
> rcu_read_unlock();
>
> to locklessly get the parent pointer. We know "parent" isn't going
> away (dentries are rcu-free'd and we hold the rcu read lock), and I
> think that we can optimistically take *any* parent dentry that
> happened to be valid at one point. As long as the refcount didn't go
> down to zero. Al?

What will you do with __d_rcu_to_refcount()? Any such scheme has to
hold d_lock from zero->non-zero d_count changes, or atomic_dec_and_lock
in dput() won't help at all. As it is, both complete_walk() and unlazy_walk()
are grabbing ->d_lock on the dentry we'd reached, so they can call that
sucker. And that'll give you ->d_lock contention when a bunch of threads
are hitting the same file; I don't see how atomics would avoid that
one...

2013-06-13 00:01:22

by Linus Torvalds

[permalink] [raw]
Subject: Re: [PATCH RFC ticketlock] Auto-queued ticketlock

On Wed, Jun 12, 2013 at 4:32 PM, Al Viro <[email protected]> wrote:
> On Wed, Jun 12, 2013 at 01:26:25PM -0700, Linus Torvalds wrote:
>>
>> However, one optimization missing from your patch is obvious in the
>> profile. "dget_parent()" also needs to be optimized - you still have
>> that as 99% of the spin-lock case. I think we could do something like
>>
>> rcu_read_lock();
>> parent = ACCESS_ONCE(dentry->d_parent);
>> if (atomic_inc_not_zero(&parent->d_count))
>> return parent;
>> .. get d_lock and do it the slow way ...
>> rcu_read_unlock();
>>
>> to locklessly get the parent pointer. We know "parent" isn't going
>> away (dentries are rcu-free'd and we hold the rcu read lock), and I
>> think that we can optimistically take *any* parent dentry that
>> happened to be valid at one point. As long as the refcount didn't go
>> down to zero. Al?
>
> What will you do with __d_rcu_to_refcount()? Any such scheme has to
> hold d_lock from zero->non-zero d_count changes, or atomic_dec_and_lock
> in dput() won't help at all.

I'd actually suggest we do *not* remove any existing d_lock usage
outside of the particular special cases we want to optimize, which at
least from Davidlohr's profile is just dput() (which has shown up a
lot before) and dget_parent() (which I'm not sure why it happens so
much on his load, but it really seems trivially safe to optimistically
do under just the RCU lock).

> As it is, both complete_walk() and unlazy_walk()
> are grabbing ->d_lock on the dentry we'd reached, so they can call that
> sucker. And that'll give you ->d_lock contention when a bunch of threads
> are hitting the same file; I don't see how atomics would avoid that
> one...

I'd love to get rid of complete_walk() using the dcache lock too, but
if we really can't get rid of it, I won't cry.

That said, I do wonder if we could do something like
"atomic_inc_not_zero()" on the d_count, and only if it is zero (which
won't be horribly unusual, since for leaf dentries that nobody else is
using) we'd do the whole locking sequence.

But my first reaction is to not even bother until it shows up on some
profile. Of course, maybe it immediately does.

There's a real downside to making d_count an "atomic_t", and there
might be loads where it actually bites us. But even in the absence of
contention, atomics tend to be sufficiently faster than spinlocks that
even if we end up adding two or even three atomics for each d_lock
lock we get rid of, we should be ok even for single-thread. On the
contention case, we'll obviously win almost regardless of how many
atomics we add.

Of course, that assumes we get rid of any locking at all for the
normal case. With dput() having to take the lock when the refcount
goes to zero, and most single-threaded file opens using the final path
component with a dentry that isn't otherwise used, doing an atomic
d_count might hurt the single-thread case without getting any spinlock
advantage at all. dget_parent() would be the only case that helps even
there, and that should normally only happen for "..", I think.

So who knows.. Are we willing to take a hit on the single-thread case
(*if* that is the case) if it helps scalability a lot? If it was a
common scalability issue, sure. But just for AIM7 looking up
/etc/passwd? Maybe that's not a good idea.

Of course, for the mostly non-contended case, it's quite possible that
the extra atomic isn't even noticeable. As long as the dentry is dirty
in the cache (which it would be, normally), the atomic cost is just in
the 10-20 cycle range.

End result: I think it would be interesting to try this all out, and
it could be a noticeable win under some cases, but it *definitely*
needs a lot of timing and testing to see which ways it goes..

Linus

2013-06-13 00:21:16

by Al Viro

[permalink] [raw]
Subject: Re: [PATCH RFC ticketlock] Auto-queued ticketlock

On Wed, Jun 12, 2013 at 05:01:19PM -0700, Linus Torvalds wrote:
> I'd actually suggest we do *not* remove any existing d_lock usage
> outside of the particular special cases we want to optimize, which at
> least from Davidlohr's profile is just dput() (which has shown up a
> lot before) and dget_parent() (which I'm not sure why it happens so
> much on his load, but it really seems trivially safe to optimistically
> do under just the RCU lock).

Actually, dget_parent() change might be broken; the thing is, the assumptions
are more subtle than "zero -> non-zero only happens under ->d_lock". It's
actually "new references are grabbed by somebody who's either already holding
one on the same dentry _or_ holding ->d_lock". That's what d_invalidate()
check for ->d_count needs for correctness - caller holds one reference, so
comparing ->d_count with 2 under ->d_lock means checking that there's no other
holders _and_ there won't be any new ones appearing.

Consider the following situation:
X is dentry of a/b
Y is dentry of a/b/c
Z is dentry of d/e

A holds a reference to Y and enters dget_parent(Y)
B holds a reference to X and enters d_invalidate(X)
A picks the value of Y->d_parent (== X)
C moves Y to Z
B grabs ->d_lock on X
B checks X->d_count; it's 1, we deduce that no other references exist or
are going to appear
A does atomic_inc_not_zero(&X->d_count). And since it's not zero (it's 1,
actually), we've just grabbed an extra reference on X that was not going
to appear according to B...

> That said, I do wonder if we could do something like
> "atomic_inc_not_zero()" on the d_count, and only if it is zero (which
> won't be horribly unusual, since for leaf dentries that nobody else is
> using) we'd do the whole locking sequence.

Same correctness issue as above, I'm afraid...

> End result: I think it would be interesting to try this all out, and
> it could be a noticeable win under some cases, but it *definitely*
> needs a lot of timing and testing to see which ways it goes..

*nod*

What's more, we need the underlying assumptions documented very clearly for
any such change; it's _not_ as simple as "protect transitions from zero to
non-zero and we are done" ;-/

2013-06-13 00:38:17

by Linus Torvalds

[permalink] [raw]
Subject: Re: [PATCH RFC ticketlock] Auto-queued ticketlock

On Wed, Jun 12, 2013 at 5:20 PM, Al Viro <[email protected]> wrote:
>
> Actually, dget_parent() change might be broken; the thing is, the assumptions
> are more subtle than "zero -> non-zero only happens under ->d_lock". It's
> actually "new references are grabbed by somebody who's either already holding
> one on the same dentry _or_ holding ->d_lock". That's what d_invalidate()
> check for ->d_count needs for correctness - caller holds one reference, so
> comparing ->d_count with 2 under ->d_lock means checking that there's no other
> holders _and_ there won't be any new ones appearing.

For the particular case of dget_parent() maybe dget_parent() should
just double-check the original dentry->d_parent pointer after getting
the refcount on it (and if the parent has changed, drop the refcount
again and go to the locked version). That might be a good idea anyway,
and should fix the possible race (which would be with another cpu
having to first rename the child to some other parent, and then
d_invalidate() the original parent)
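
A rough userspace sketch of that double-check, with the RCU lifetime
guarantee assumed away (nothing here is ever freed) and every name
(node, get_if_live, get_parent) invented for illustration:

#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>

struct node {
	_Atomic(struct node *) parent;	/* stands in for d_parent */
	atomic_int refcount;		/* stands in for d_count */
};

/* Take a reference only if the node already has at least one holder. */
static bool get_if_live(struct node *n)
{
	int c = atomic_load(&n->refcount);

	while (c > 0)
		if (atomic_compare_exchange_weak(&n->refcount, &c, c + 1))
			return true;
	return false;
}

static void put_ref(struct node *n)
{
	atomic_fetch_sub(&n->refcount, 1);	/* teardown omitted in this model */
}

/* Pin the observed parent, then confirm the child still points at it. */
static struct node *get_parent(struct node *child)
{
	for (;;) {
		struct node *p = atomic_load(&child->parent);

		if (p && get_if_live(p)) {
			if (atomic_load(&child->parent) == p)
				return p;	/* still the parent */
			put_ref(p);		/* raced with a rename, retry */
		}
		/* A real implementation would fall back to a locked slow path. */
	}
}

int main(void)
{
	struct node root = { .parent = NULL, .refcount = 1 };
	struct node child = { .parent = &root, .refcount = 1 };

	return get_parent(&child) == &root ? 0 : 1;
}

Note that the retry path has to drop the reference it just took, which is
exactly the wrinkle raised further down the thread.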

That said, the case we'd really want to fix isn't dget_parent(), but
just the normal RCU lookup finishing touches (the __d_rcu_to_refcount()
case you already mentioned). *If* we could do that without ever
taking the d_lock on the target, that would be lovely. But it would
seem to have the exact same issue. Although maybe the
dentry_rcuwalk_barrier() thing ends up solving it (ie if we had a
lookup at a bad time, we know it will fail the sequence count test, so
we're ok).

Subtle, subtle.

Linus

2013-06-13 00:49:57

by Al Viro

[permalink] [raw]
Subject: Re: [PATCH RFC ticketlock] Auto-queued ticketlock

On Wed, Jun 12, 2013 at 05:38:13PM -0700, Linus Torvalds wrote:
> On Wed, Jun 12, 2013 at 5:20 PM, Al Viro <[email protected]> wrote:
> >
> > Actually, dget_parent() change might be broken; the thing is, the assumptions
> > are more subtle than "zero -> non-zero only happens under ->d_lock". It's
> > actually "new references are grabbed by somebody who's either already holding
> > one on the same dentry _or_ holding ->d_lock". That's what d_invalidate()
> > check for ->d_count needs for correctness - caller holds one reference, so
> > comparing ->d_count with 2 under ->d_lock means checking that there's no other
> > holders _and_ there won't be any new ones appearing.
>
> For the particular case of dget_parent() maybe dget_parent() should
> just double-check the original dentry->d_parent pointer after getting
> the refcount on it (and if the parent has changed, drop the refcount
> again and go to the locked version). That might be a good idea anyway,
> and should fix the possible race (which would be with another cpu
> having to first rename the child to some other parent, and then
> d_invalidate() the original parent)

Yes, but... Then we'd need to dput() that sucker if we decide we shouldn't
have grabbed that reference, after all, which would make dget_parent()
potentially blocking.

> That said, the case we'd really want to fix isn't dget_parent(), but
> just the normal RCU lookup finishing touches (the __d_rcu_to_refcount()
> case you already mentioned). *If* we could do that without ever
> taking the d_lock on the target, that would be lovely. But it would
> seem to have the exact same issue. Although maybe the
> dentry_rcuwalk_barrier() thing ends up solving it (ie if we had a
> lookup at a bad time, we know it will fail the sequence count test, so
> we're ok).

Maybe, but that would require dentry_rcuwalk_barrier() between any such
check and corresponding grabbing of ->d_lock done for it, so it's not
just d_invalidate().

> Subtle, subtle.

Yes ;-/ The current variant is using ->d_lock as a brute-force mechanism
for avoiding all that fun, and I'm not sure that getting rid of it would
buy us enough to make it worth the trouble. I'm absolutely sure that if
we go for that, we _MUST_ document the entire scheme as explicitly as
possible, or we'll end up with a shitload of recurring bugs in that
area. Preferably with the formal proof of correctness spelled out somewhere...

2013-06-13 00:59:54

by Linus Torvalds

[permalink] [raw]
Subject: Re: [PATCH RFC ticketlock] Auto-queued ticketlock

On Wed, Jun 12, 2013 at 5:49 PM, Al Viro <[email protected]> wrote:
> On Wed, Jun 12, 2013 at 05:38:13PM -0700, Linus Torvalds wrote:
>>
>> For the particular case of dget_parent() maybe dget_parent() should
>> just double-check the original dentry->d_parent pointer after getting
>> the refcount on it (and if the parent has changed, drop the refcount
>> again and go to the locked version). That might be a good idea anyway,
>> and should fix the possible race (which would be with another cpu
>> having to first rename the child to some other parent, and the
>> d_invalidate() the original parent)
>
> Yes, but... Then we'd need to dput() that sucker if we decide we shouldn't
> have grabbed that reference, after all, which would make dget_parent()
> potentially blocking.

Ho humm.. interesting. I was talking about wanting to mix atomics and
spinlocks earlier in this thread due to space constraints, and it
strikes me that that would actually help this case a lot. Having the
dentry count mix d_lock and the count in one word would allow for
atomic ops like "increment if not locked", and we'd avoid this whole
race entirely..

Something like "low bit of count is the lock bit" would end up being
lovely for this case. Of course, that's not how our spinlocks work ..
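
A rough userspace sketch of that layout, purely for illustration: bit 0 as
the lock bit and the count in the remaining bits. Kernel spinlocks are not
actually laid out this way, and LOCK_BIT/get_if_unlocked are invented names.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define LOCK_BIT 0x1u		/* bit 0: lock; bits 1..31: reference count */

/* Bump the count, but only while the word is unlocked. */
static bool get_if_unlocked(atomic_uint *word)
{
	unsigned int old = atomic_load(word);

	while (!(old & LOCK_BIT))
		if (atomic_compare_exchange_weak(word, &old, old + 2))
			return true;	/* +2 is +1 reference above the lock bit */
	return false;			/* locked: fall back to the slow path */
}

int main(void)
{
	atomic_uint word = 2;		/* one reference, unlocked */

	printf("fast get while unlocked: %d\n", get_if_unlocked(&word));	/* 1 */
	atomic_fetch_or(&word, LOCK_BIT);	/* someone else takes the lock */
	printf("fast get while locked:   %d\n", get_if_unlocked(&word));	/* 0 */
	return 0;
}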

Linus

2013-06-13 02:52:55

by Lai Jiangshan

[permalink] [raw]
Subject: Re: [PATCH RFC ticketlock] v3 Auto-queued ticketlock

On 06/12/2013 11:40 PM, Paul E. McKenney wrote:
> Breaking up locks is better than implementing high-contention locks, but
> if we must have high-contention locks, why not make them automatically
> switch between light-weight ticket locks at low contention and queued
> locks at high contention? After all, this would remove the need for
> the developer to predict which locks will be highly contended.
>
> This commit allows ticket locks to automatically switch between pure
> ticketlock and queued-lock operation as needed. If too many CPUs are
> spinning on a given ticket lock, a queue structure will be allocated
> and the lock will switch to queued-lock operation. When the lock becomes
> free, it will switch back into ticketlock operation. The low-order bit
> of the head counter is used to indicate that the lock is in queued mode,
> which forces an unconditional mismatch between the head and tail counters.
> This approach means that the common-case code path under conditions of
> low contention is very nearly that of a plain ticket lock.
>
> A fixed number of queueing structures is statically allocated in an
> array. The ticket-lock address is used to hash into an initial element,
> but if that element is already in use, it moves to the next element. If
> the entire array is already in use, continue to spin in ticket mode.
>
> Signed-off-by: Paul E. McKenney <[email protected]>
> [ paulmck: Eliminate duplicate code and update comments (Steven Rostedt). ]
> [ paulmck: Address Eric Dumazet review feedback. ]
> [ paulmck: Use Lai Jiangshan idea to eliminate smp_mb(). ]
> [ paulmck: Expand ->head_tkt from s32 to s64 (Waiman Long). ]
> [ paulmck: Move cpu_relax() to main spin loop (Steven Rostedt). ]
> [ paulmck: Reduce queue-switch contention (Waiman Long). ]
> [ paulmck: __TKT_SPIN_INC for __ticket_spin_trylock() (Steffen Persvold). ]
> [ paulmck: Type safety fixes (Steven Rostedt). ]
> [ paulmck: Pre-check cmpxchg() value (Waiman Long). ]
> [ paulmck: smp_mb() downgrade to smp_wmb() (Lai Jiangshan). ]


Hi, Paul,

I simplified the code and removed the search by encoding the index of struct tkt_q_head
into lock->tickets.head.

A) lock->tickets.head(when queued-lock):
---------------------------------
index of struct tkt_q_head |0|1|
---------------------------------

The bit0 = 1 for queued, and the bit1 = 0 is reserved for __ticket_spin_unlock(),
thus __ticket_spin_unlock() will not change the higher bits of lock->tickets.head.

B) tqhp->head holds the real value of lock->tickets.head.
If its low-order bit is 1, it still records the head ticket from when queueing started.
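
A small standalone illustration of that encoding; the constants and helper
names below are made up here rather than taken from the patch:

#include <assert.h>
#include <stdio.h>

#define TKT_QUEUED	0x1u	/* bit 0: lock is in queued mode */
#define TKT_INDEX_SHIFT	2u	/* bit 1 is left free for unlock's "+= 2" */

static unsigned int encode_head(unsigned int index)
{
	return (index << TKT_INDEX_SHIFT) | TKT_QUEUED;
}

static unsigned int decode_index(unsigned int head)
{
	return head >> TKT_INDEX_SHIFT;
}

int main(void)
{
	unsigned int head = encode_head(5);	/* lock switched to queue slot 5 */

	assert(decode_index(head) == 5);
	head += 2;				/* what __ticket_spin_unlock() adds */
	assert(decode_index(head) == 5);	/* index bits are untouched */
	printf("index still %u after the unlock increment\n", decode_index(head));
	return 0;
}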

Thanks,
Lai

kernel/tktqlock.c | 51 +++++++++++++--------------------------------------
1 files changed, 13 insertions(+), 38 deletions(-)

diff --git a/kernel/tktqlock.c b/kernel/tktqlock.c
index 912817c..1329d0f 100644
--- a/kernel/tktqlock.c
+++ b/kernel/tktqlock.c
@@ -33,7 +33,7 @@ struct tkt_q {

struct tkt_q_head {
arch_spinlock_t *ref; /* Pointer to spinlock if in use. */
- s64 head_tkt; /* Head ticket when started queuing. */
+ __ticket_t head; /* Real head when queued. */
struct tkt_q *spin; /* Head of queue. */
struct tkt_q **spin_tail; /* Tail of queue. */
};
@@ -77,15 +77,8 @@ static unsigned long tkt_q_hash(arch_spinlock_t *lock)
*/
static struct tkt_q_head *tkt_q_find_head(arch_spinlock_t *lock)
{
- int i;
- int start;
-
- start = i = tkt_q_hash(lock);
- do
- if (ACCESS_ONCE(tkt_q_heads[i].ref) == lock)
- return &tkt_q_heads[i];
- while ((i = tkt_q_next_slot(i)) != start);
- return NULL;
+ BUILD_BUG_ON(TKT_Q_NQUEUES > (1 << (TICKET_SHIFT - 2)));
+ return &tkt_q_heads[ACCESS_ONCE(lock->tickets.head) >> 2];
}

/*
@@ -101,11 +94,11 @@ static bool tkt_q_try_unqueue(arch_spinlock_t *lock, struct tkt_q_head *tqhp)

/* Pick up the ticket values. */
asold = ACCESS_ONCE(*lock);
- if ((asold.tickets.head & ~0x1) == asold.tickets.tail) {
+ if (tqhp->head == asold.tickets.tail) {

/* Attempt to mark the lock as not having a queue. */
asnew = asold;
- asnew.tickets.head &= ~0x1;
+ asnew.tickets.head = tqhp->head;
if (cmpxchg(&lock->head_tail,
asold.head_tail,
asnew.head_tail) == asold.head_tail) {
@@ -128,12 +121,9 @@ void tkt_q_do_wake(arch_spinlock_t *lock)
struct tkt_q_head *tqhp;
struct tkt_q *tqp;

- /*
- * If the queue is still being set up, wait for it. Note that
- * the caller's xadd() provides the needed memory ordering.
- */
- while ((tqhp = tkt_q_find_head(lock)) == NULL)
- cpu_relax();
+ tqhp = tkt_q_find_head(lock);
+ ACCESS_ONCE(lock->tickets.head) -= __TKT_SPIN_INC;
+ ACCESS_ONCE(tqhp->head) = (tqhp->head & ~0x1) + __TKT_SPIN_INC;

for (;;) {

@@ -145,9 +135,7 @@ void tkt_q_do_wake(arch_spinlock_t *lock)
return; /* No element, successfully removed queue. */
cpu_relax();
}
- if (ACCESS_ONCE(tqhp->head_tkt) != -1)
- ACCESS_ONCE(tqhp->head_tkt) = -1;
- smp_mb(); /* Order pointer fetch and assignment against handoff. */
+ smp_mb(); /* Order modification, pointer fetch and assignment against handoff. */
ACCESS_ONCE(tqp->cpu) = -1;
}
EXPORT_SYMBOL(tkt_q_do_wake);
@@ -169,10 +157,7 @@ bool tkt_q_do_spin(arch_spinlock_t *lock, struct __raw_tickets inc)
*/
smp_mb(); /* See above block comment. */

- /* If there no longer is a queue, leave. */
tqhp = tkt_q_find_head(lock);
- if (tqhp == NULL)
- return false;

/* Initialize our queue element. */
tq.cpu = raw_smp_processor_id();
@@ -180,9 +165,8 @@ bool tkt_q_do_spin(arch_spinlock_t *lock, struct __raw_tickets inc)
tq.next = NULL;

/* Check to see if we already hold the lock. */
- if (ACCESS_ONCE(tqhp->head_tkt) == inc.tail) {
+ if (ACCESS_ONCE(tqhp->head) == (inc.tail | 0x1)) {
/* The last holder left before queue formed, we hold lock. */
- tqhp->head_tkt = -1;
return true;
}

@@ -290,16 +274,14 @@ tkt_q_init_contend(int i, arch_spinlock_t *lock, struct __raw_tickets inc)
* Record the head counter in case one of the spinning
* CPUs already holds the lock but doesn't realize it yet.
*/
- tqhp->head_tkt = asold.tickets.head;
+ tqhp->head = asold.tickets.head | 0x1;

/* The low-order bit in the head counter says "queued". */
- asnew.tickets.head |= 0x1;
+ asnew.tickets.head = (i << 2) + 0x1;
} while (cmpxchg(&lock->head_tail,
asold.head_tail,
asnew.head_tail) != asold.head_tail);

- /* Point the queue at the lock and go spin on it. */
- ACCESS_ONCE(tqhp->ref) = lock;
return tkt_q_do_spin(lock, inc);
}

@@ -321,15 +303,8 @@ bool tkt_q_start_contend(arch_spinlock_t *lock, struct __raw_tickets inc)
* the lock with the corresponding queue.
*/
do {
- /*
- * Use 0x1 to mark the queue in use, but also avoiding
- * any spinners trying to use it before we get it all
- * initialized.
- */
if (!tkt_q_heads[i].ref &&
- cmpxchg(&tkt_q_heads[i].ref,
- NULL,
- (arch_spinlock_t *)0x1) == NULL) {
+ cmpxchg(&tkt_q_heads[i].ref, NULL, lock) == NULL) {

/* Succeeded, now go initialize it. */
return tkt_q_init_contend(i, lock, inc);

2013-06-13 15:23:01

by Paul E. McKenney

[permalink] [raw]
Subject: Re: [PATCH RFC ticketlock] v3 Auto-queued ticketlock

On Thu, Jun 13, 2013 at 10:55:41AM +0800, Lai Jiangshan wrote:
> On 06/12/2013 11:40 PM, Paul E. McKenney wrote:
> > Breaking up locks is better than implementing high-contention locks, but
> > if we must have high-contention locks, why not make them automatically
> > switch between light-weight ticket locks at low contention and queued
> > locks at high contention? After all, this would remove the need for
> > the developer to predict which locks will be highly contended.
> >
> > This commit allows ticket locks to automatically switch between pure
> > ticketlock and queued-lock operation as needed. If too many CPUs are
> > spinning on a given ticket lock, a queue structure will be allocated
> > and the lock will switch to queued-lock operation. When the lock becomes
> > free, it will switch back into ticketlock operation. The low-order bit
> > of the head counter is used to indicate that the lock is in queued mode,
> > which forces an unconditional mismatch between the head and tail counters.
> > This approach means that the common-case code path under conditions of
> > low contention is very nearly that of a plain ticket lock.
> >
> > A fixed number of queueing structures is statically allocated in an
> > array. The ticket-lock address is used to hash into an initial element,
> > but if that element is already in use, it moves to the next element. If
> > the entire array is already in use, continue to spin in ticket mode.
> >
> > Signed-off-by: Paul E. McKenney <[email protected]>
> > [ paulmck: Eliminate duplicate code and update comments (Steven Rostedt). ]
> > [ paulmck: Address Eric Dumazet review feedback. ]
> > [ paulmck: Use Lai Jiangshan idea to eliminate smp_mb(). ]
> > [ paulmck: Expand ->head_tkt from s32 to s64 (Waiman Long). ]
> > [ paulmck: Move cpu_relax() to main spin loop (Steven Rostedt). ]
> > [ paulmck: Reduce queue-switch contention (Waiman Long). ]
> > [ paulmck: __TKT_SPIN_INC for __ticket_spin_trylock() (Steffen Persvold). ]
> > [ paulmck: Type safety fixes (Steven Rostedt). ]
> > [ paulmck: Pre-check cmpxchg() value (Waiman Long). ]
> > [ paulmck: smp_mb() downgrade to smp_wmb() (Lai Jiangshan). ]
>
>
> Hi, Paul,
>
> I simplify the code and remove the search by encoding the index of struct tkt_q_head
> into lock->tickets.head.
>
> A) lock->tickets.head(when queued-lock):
> ---------------------------------
> index of struct tkt_q_head |0|1|
> ---------------------------------

Interesting approach! It might reduce queued-mode overhead a bit in
some cases, though I bet that in the common case the first queue element
examined is the right one. More on this below.

> The bit0 = 1 for queued, and the bit1 = 0 is reserved for __ticket_spin_unlock(),
> thus __ticket_spin_unlock() will not change the higher bits of lock->tickets.head.
>
> B) tqhp->head is for the real value of lock->tickets.head.
> if the last bit of tqhp->head is 1, it means it is the head ticket when started queuing.

But don't you also need the xadd() in __ticket_spin_unlock() to become
a cmpxchg() for this to work? Or is your patch missing your changes to
arch/x86/include/asm/spinlock.h? Either way, this is likely to increase
the no-contention overhead, which might be counterproductive. Wouldn't
hurt to get measurements, though.

Given the results that Davidlohr posted, I believe that the following
optimizations would also provide some improvement:

1. Move the call to tkt_spin_pass() from __ticket_spin_lock()
to a separate linker section in order to reduce the icache
penalty exacted by the spinloop. This is likely to be causing
some of the performance reductions in the cases where ticket
locks are not highly contended.

2. Limit the number of elements searched for in the array of
queues. However, this would help only if a number of ticket
locks were in queued mode at the same time.

3. Dynamically allocate the queue array at boot. This might
also reduce cache pressure, again, at least in cases where
there are a number of ticket locks in queued mode at the
same time.

Frederic just reminded me that I owe him some energy-efficiency improvements
for adaptive ticks, so I won't get to these very quickly. Please feel free
to take these on -- the patch clearly does well under high contention, so
reducing the no-contention penalty could really help.
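
For point 1 above, a rough sketch in plain GCC C of pushing a contended
slow path out of the hot instruction stream; the attributes here are
generic compiler ones rather than the kernel's own section annotations,
and all names are invented:

#include <stdatomic.h>

static atomic_flag lock = ATOMIC_FLAG_INIT;

/* Cold and out of line: GCC can place this in a cold text section
 * (.text.unlikely), keeping the fast path compact in the icache. */
__attribute__((cold, noinline))
static void lock_slowpath(void)
{
	while (atomic_flag_test_and_set_explicit(&lock, memory_order_acquire))
		;			/* spin */
}

static inline void lock_fast(void)
{
	if (__builtin_expect(
	    atomic_flag_test_and_set_explicit(&lock, memory_order_acquire), 0))
		lock_slowpath();
}

int main(void)
{
	lock_fast();			/* uncontended: stays on the fast path */
	atomic_flag_clear_explicit(&lock, memory_order_release);
	return 0;
}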

Thanx, Paul

> Thanks,
> Lai
>
> kernel/tktqlock.c | 51 +++++++++++++--------------------------------------
> 1 files changed, 13 insertions(+), 38 deletions(-)
>
> diff --git a/kernel/tktqlock.c b/kernel/tktqlock.c
> index 912817c..1329d0f 100644
> --- a/kernel/tktqlock.c
> +++ b/kernel/tktqlock.c
> @@ -33,7 +33,7 @@ struct tkt_q {
>
> struct tkt_q_head {
> arch_spinlock_t *ref; /* Pointer to spinlock if in use. */
> - s64 head_tkt; /* Head ticket when started queuing. */
> + __ticket_t head; /* Real head when queued. */
> struct tkt_q *spin; /* Head of queue. */
> struct tkt_q **spin_tail; /* Tail of queue. */
> };
> @@ -77,15 +77,8 @@ static unsigned long tkt_q_hash(arch_spinlock_t *lock)
> */
> static struct tkt_q_head *tkt_q_find_head(arch_spinlock_t *lock)
> {
> - int i;
> - int start;
> -
> - start = i = tkt_q_hash(lock);
> - do
> - if (ACCESS_ONCE(tkt_q_heads[i].ref) == lock)
> - return &tkt_q_heads[i];
> - while ((i = tkt_q_next_slot(i)) != start);
> - return NULL;
> + BUILD_BUG_ON(TKT_Q_NQUEUES > (1 << (TICKET_SHIFT - 2)));
> + return &tkt_q_heads[ACCESS_ONCE(lock->tickets.head) >> 2];
> }
>
> /*
> @@ -101,11 +94,11 @@ static bool tkt_q_try_unqueue(arch_spinlock_t *lock, struct tkt_q_head *tqhp)
>
> /* Pick up the ticket values. */
> asold = ACCESS_ONCE(*lock);
> - if ((asold.tickets.head & ~0x1) == asold.tickets.tail) {
> + if (tqhp->head == asold.tickets.tail) {
>
> /* Attempt to mark the lock as not having a queue. */
> asnew = asold;
> - asnew.tickets.head &= ~0x1;
> + asnew.tickets.head = tqhp->head;
> if (cmpxchg(&lock->head_tail,
> asold.head_tail,
> asnew.head_tail) == asold.head_tail) {
> @@ -128,12 +121,9 @@ void tkt_q_do_wake(arch_spinlock_t *lock)
> struct tkt_q_head *tqhp;
> struct tkt_q *tqp;
>
> - /*
> - * If the queue is still being set up, wait for it. Note that
> - * the caller's xadd() provides the needed memory ordering.
> - */
> - while ((tqhp = tkt_q_find_head(lock)) == NULL)
> - cpu_relax();
> + tqhp = tkt_q_find_head(lock);
> + ACCESS_ONCE(lock->tickets.head) -= __TKT_SPIN_INC;
> + ACCESS_ONCE(tqhp->head) = (tqhp->head & ~0x1) + __TKT_SPIN_INC;
>
> for (;;) {
>
> @@ -145,9 +135,7 @@ void tkt_q_do_wake(arch_spinlock_t *lock)
> return; /* No element, successfully removed queue. */
> cpu_relax();
> }
> - if (ACCESS_ONCE(tqhp->head_tkt) != -1)
> - ACCESS_ONCE(tqhp->head_tkt) = -1;
> - smp_mb(); /* Order pointer fetch and assignment against handoff. */
> + smp_mb(); /* Order modification, pointer fetch and assignment against handoff. */
> ACCESS_ONCE(tqp->cpu) = -1;
> }
> EXPORT_SYMBOL(tkt_q_do_wake);
> @@ -169,10 +157,7 @@ bool tkt_q_do_spin(arch_spinlock_t *lock, struct __raw_tickets inc)
> */
> smp_mb(); /* See above block comment. */
>
> - /* If there no longer is a queue, leave. */
> tqhp = tkt_q_find_head(lock);
> - if (tqhp == NULL)
> - return false;
>
> /* Initialize our queue element. */
> tq.cpu = raw_smp_processor_id();
> @@ -180,9 +165,8 @@ bool tkt_q_do_spin(arch_spinlock_t *lock, struct __raw_tickets inc)
> tq.next = NULL;
>
> /* Check to see if we already hold the lock. */
> - if (ACCESS_ONCE(tqhp->head_tkt) == inc.tail) {
> + if (ACCESS_ONCE(tqhp->head) == (inc.tail | 0x1)) {
> /* The last holder left before queue formed, we hold lock. */
> - tqhp->head_tkt = -1;
> return true;
> }
>
> @@ -290,16 +274,14 @@ tkt_q_init_contend(int i, arch_spinlock_t *lock, struct __raw_tickets inc)
> * Record the head counter in case one of the spinning
> * CPUs already holds the lock but doesn't realize it yet.
> */
> - tqhp->head_tkt = asold.tickets.head;
> + tqhp->head = asold.tickets.head | 0x1;
>
> /* The low-order bit in the head counter says "queued". */
> - asnew.tickets.head |= 0x1;
> + asnew.tickets.head = (i << 2) + 0x1;
> } while (cmpxchg(&lock->head_tail,
> asold.head_tail,
> asnew.head_tail) != asold.head_tail);
>
> - /* Point the queue at the lock and go spin on it. */
> - ACCESS_ONCE(tqhp->ref) = lock;
> return tkt_q_do_spin(lock, inc);
> }
>
> @@ -321,15 +303,8 @@ bool tkt_q_start_contend(arch_spinlock_t *lock, struct __raw_tickets inc)
> * the lock with the corresponding queue.
> */
> do {
> - /*
> - * Use 0x1 to mark the queue in use, but also avoiding
> - * any spinners trying to use it before we get it all
> - * initialized.
> - */
> if (!tkt_q_heads[i].ref &&
> - cmpxchg(&tkt_q_heads[i].ref,
> - NULL,
> - (arch_spinlock_t *)0x1) == NULL) {
> + cmpxchg(&tkt_q_heads[i].ref, NULL, lock) == NULL) {
>
> /* Succeeded, now go initialize it. */
> return tkt_q_init_contend(i, lock, inc);
>

2013-06-13 23:26:00

by Lai Jiangshan

[permalink] [raw]
Subject: Re: [PATCH RFC ticketlock] v3 Auto-queued ticketlock

On Thu, Jun 13, 2013 at 11:22 PM, Paul E. McKenney
<[email protected]> wrote:
> On Thu, Jun 13, 2013 at 10:55:41AM +0800, Lai Jiangshan wrote:
>> On 06/12/2013 11:40 PM, Paul E. McKenney wrote:
>> > Breaking up locks is better than implementing high-contention locks, but
>> > if we must have high-contention locks, why not make them automatically
>> > switch between light-weight ticket locks at low contention and queued
>> > locks at high contention? After all, this would remove the need for
>> > the developer to predict which locks will be highly contended.
>> >
>> > This commit allows ticket locks to automatically switch between pure
>> > ticketlock and queued-lock operation as needed. If too many CPUs are
>> > spinning on a given ticket lock, a queue structure will be allocated
>> > and the lock will switch to queued-lock operation. When the lock becomes
>> > free, it will switch back into ticketlock operation. The low-order bit
>> > of the head counter is used to indicate that the lock is in queued mode,
>> > which forces an unconditional mismatch between the head and tail counters.
>> > This approach means that the common-case code path under conditions of
>> > low contention is very nearly that of a plain ticket lock.
>> >
>> > A fixed number of queueing structures is statically allocated in an
>> > array. The ticket-lock address is used to hash into an initial element,
>> > but if that element is already in use, it moves to the next element. If
>> > the entire array is already in use, continue to spin in ticket mode.
>> >
>> > Signed-off-by: Paul E. McKenney <[email protected]>
>> > [ paulmck: Eliminate duplicate code and update comments (Steven Rostedt). ]
>> > [ paulmck: Address Eric Dumazet review feedback. ]
>> > [ paulmck: Use Lai Jiangshan idea to eliminate smp_mb(). ]
>> > [ paulmck: Expand ->head_tkt from s32 to s64 (Waiman Long). ]
>> > [ paulmck: Move cpu_relax() to main spin loop (Steven Rostedt). ]
>> > [ paulmck: Reduce queue-switch contention (Waiman Long). ]
>> > [ paulmck: __TKT_SPIN_INC for __ticket_spin_trylock() (Steffen Persvold). ]
>> > [ paulmck: Type safety fixes (Steven Rostedt). ]
>> > [ paulmck: Pre-check cmpxchg() value (Waiman Long). ]
>> > [ paulmck: smp_mb() downgrade to smp_wmb() (Lai Jiangshan). ]
>>
>>
>> Hi, Paul,
>>
>> I simplify the code and remove the search by encoding the index of struct tkt_q_head
>> into lock->tickets.head.
>>
>> A) lock->tickets.head(when queued-lock):
>> ---------------------------------
>> index of struct tkt_q_head |0|1|
>> ---------------------------------
>
> Interesting approach! It might reduce queued-mode overhead a bit in
> some cases, though I bet that in the common case the first queue element
> examined is the right one. More on this below.
>
>> The bit0 = 1 for queued, and the bit1 = 0 is reserved for __ticket_spin_unlock(),
>> thus __ticket_spin_unlock() will not change the higher bits of lock->tickets.head.
>>
>> B) tqhp->head is for the real value of lock->tickets.head.
>> if the last bit of tqhp->head is 1, it means it is the head ticket when started queuing.
>
> But don't you also need the xadd() in __ticket_spin_unlock() to become
> a cmpxchg() for this to work? Or is your patch missing your changes to
> arch/x86/include/asm/spinlock.h? Either way, this is likely to increase
> the no-contention overhead, which might be counterproductive. Wouldn't
> hurt to get measurements, though.

No, there is no need to change __ticket_spin_unlock() in my scheme.
Bit1 in tickets.head is reserved for __ticket_spin_unlock():
__ticket_spin_unlock() only changes bit1 and will not change
the higher bits. tkt_q_do_wake() will restore tickets.head.

This approach avoids a cmpxchg in __ticket_spin_unlock().

>
> Given the results that Davidlohr posted, I believe that the following
> optimizations would also provide some improvement:
>
> 1. Move the call to tkt_spin_pass() from __ticket_spin_lock()
> to a separate linker section in order to reduce the icache
> penalty exacted by the spinloop. This is likely to be causing
> some of the performance reductions in the cases where ticket
> locks are not highly contended.
>
> 2. Limit the number of elements searched for in the array of
> queues. However, this would help only if a number of ticket
> locks were in queued mode at the same time.
>
> 3. Dynamically allocate the queue array at boot. This might
> also reduce cache pressure, again, at least in cases where
> there are a number of ticket locks in queued mode at the
> same time.
>
> Frederic just reminded me that I owe him some energy-efficiency improvements
> for adaptive ticks, so I won't get to these very quickly. Please feel free
> to take these on -- the patch clearly does well under high contention, so
> reducing the no-contention penalty could really help.
>
> Thanx, Paul
>
>> Thanks,
>> Lai
>>
>> kernel/tktqlock.c | 51 +++++++++++++--------------------------------------
>> 1 files changed, 13 insertions(+), 38 deletions(-)
>>
>> diff --git a/kernel/tktqlock.c b/kernel/tktqlock.c
>> index 912817c..1329d0f 100644
>> --- a/kernel/tktqlock.c
>> +++ b/kernel/tktqlock.c
>> @@ -33,7 +33,7 @@ struct tkt_q {
>>
>> struct tkt_q_head {
>> arch_spinlock_t *ref; /* Pointer to spinlock if in use. */
>> - s64 head_tkt; /* Head ticket when started queuing. */
>> + __ticket_t head; /* Real head when queued. */
>> struct tkt_q *spin; /* Head of queue. */
>> struct tkt_q **spin_tail; /* Tail of queue. */
>> };
>> @@ -77,15 +77,8 @@ static unsigned long tkt_q_hash(arch_spinlock_t *lock)
>> */
>> static struct tkt_q_head *tkt_q_find_head(arch_spinlock_t *lock)
>> {
>> - int i;
>> - int start;
>> -
>> - start = i = tkt_q_hash(lock);
>> - do
>> - if (ACCESS_ONCE(tkt_q_heads[i].ref) == lock)
>> - return &tkt_q_heads[i];
>> - while ((i = tkt_q_next_slot(i)) != start);
>> - return NULL;
>> + BUILD_BUG_ON(TKT_Q_NQUEUES > (1 << (TICKET_SHIFT - 2)));
>> + return &tkt_q_heads[ACCESS_ONCE(lock->tickets.head) >> 2];
>> }
>>
>> /*
>> @@ -101,11 +94,11 @@ static bool tkt_q_try_unqueue(arch_spinlock_t *lock, struct tkt_q_head *tqhp)
>>
>> /* Pick up the ticket values. */
>> asold = ACCESS_ONCE(*lock);
>> - if ((asold.tickets.head & ~0x1) == asold.tickets.tail) {
>> + if (tqhp->head == asold.tickets.tail) {
>>
>> /* Attempt to mark the lock as not having a queue. */
>> asnew = asold;
>> - asnew.tickets.head &= ~0x1;
>> + asnew.tickets.head = tqhp->head;
>> if (cmpxchg(&lock->head_tail,
>> asold.head_tail,
>> asnew.head_tail) == asold.head_tail) {
>> @@ -128,12 +121,9 @@ void tkt_q_do_wake(arch_spinlock_t *lock)
>> struct tkt_q_head *tqhp;
>> struct tkt_q *tqp;
>>
>> - /*
>> - * If the queue is still being set up, wait for it. Note that
>> - * the caller's xadd() provides the needed memory ordering.
>> - */
>> - while ((tqhp = tkt_q_find_head(lock)) == NULL)
>> - cpu_relax();
>> + tqhp = tkt_q_find_head(lock);
>> + ACCESS_ONCE(lock->tickets.head) -= __TKT_SPIN_INC;
>> + ACCESS_ONCE(tqhp->head) = (tqhp->head & ~0x1) + __TKT_SPIN_INC;
>>
>> for (;;) {
>>
>> @@ -145,9 +135,7 @@ void tkt_q_do_wake(arch_spinlock_t *lock)
>> return; /* No element, successfully removed queue. */
>> cpu_relax();
>> }
>> - if (ACCESS_ONCE(tqhp->head_tkt) != -1)
>> - ACCESS_ONCE(tqhp->head_tkt) = -1;
>> - smp_mb(); /* Order pointer fetch and assignment against handoff. */
>> + smp_mb(); /* Order modification, pointer fetch and assignment against handoff. */
>> ACCESS_ONCE(tqp->cpu) = -1;
>> }
>> EXPORT_SYMBOL(tkt_q_do_wake);
>> @@ -169,10 +157,7 @@ bool tkt_q_do_spin(arch_spinlock_t *lock, struct __raw_tickets inc)
>> */
>> smp_mb(); /* See above block comment. */
>>
>> - /* If there no longer is a queue, leave. */
>> tqhp = tkt_q_find_head(lock);
>> - if (tqhp == NULL)
>> - return false;
>>
>> /* Initialize our queue element. */
>> tq.cpu = raw_smp_processor_id();
>> @@ -180,9 +165,8 @@ bool tkt_q_do_spin(arch_spinlock_t *lock, struct __raw_tickets inc)
>> tq.next = NULL;
>>
>> /* Check to see if we already hold the lock. */
>> - if (ACCESS_ONCE(tqhp->head_tkt) == inc.tail) {
>> + if (ACCESS_ONCE(tqhp->head) == (inc.tail | 0x1)) {
>> /* The last holder left before queue formed, we hold lock. */
>> - tqhp->head_tkt = -1;
>> return true;
>> }
>>
>> @@ -290,16 +274,14 @@ tkt_q_init_contend(int i, arch_spinlock_t *lock, struct __raw_tickets inc)
>> * Record the head counter in case one of the spinning
>> * CPUs already holds the lock but doesn't realize it yet.
>> */
>> - tqhp->head_tkt = asold.tickets.head;
>> + tqhp->head = asold.tickets.head | 0x1;
>>
>> /* The low-order bit in the head counter says "queued". */
>> - asnew.tickets.head |= 0x1;
>> + asnew.tickets.head = (i << 2) + 0x1;
>> } while (cmpxchg(&lock->head_tail,
>> asold.head_tail,
>> asnew.head_tail) != asold.head_tail);
>>
>> - /* Point the queue at the lock and go spin on it. */
>> - ACCESS_ONCE(tqhp->ref) = lock;
>> return tkt_q_do_spin(lock, inc);
>> }
>>
>> @@ -321,15 +303,8 @@ bool tkt_q_start_contend(arch_spinlock_t *lock, struct __raw_tickets inc)
>> * the lock with the corresponding queue.
>> */
>> do {
>> - /*
>> - * Use 0x1 to mark the queue in use, but also avoiding
>> - * any spinners trying to use it before we get it all
>> - * initialized.
>> - */
>> if (!tkt_q_heads[i].ref &&
>> - cmpxchg(&tkt_q_heads[i].ref,
>> - NULL,
>> - (arch_spinlock_t *)0x1) == NULL) {
>> + cmpxchg(&tkt_q_heads[i].ref, NULL, lock) == NULL) {
>>
>> /* Succeeded, now go initialize it. */
>> return tkt_q_init_contend(i, lock, inc);
>>
>

2013-06-13 23:58:19

by Paul E. McKenney

[permalink] [raw]
Subject: Re: [PATCH RFC ticketlock] v3 Auto-queued ticketlock

On Fri, Jun 14, 2013 at 07:25:57AM +0800, Lai Jiangshan wrote:
> On Thu, Jun 13, 2013 at 11:22 PM, Paul E. McKenney
> <[email protected]> wrote:
> > On Thu, Jun 13, 2013 at 10:55:41AM +0800, Lai Jiangshan wrote:
> >> On 06/12/2013 11:40 PM, Paul E. McKenney wrote:
> >> > Breaking up locks is better than implementing high-contention locks, but
> >> > if we must have high-contention locks, why not make them automatically
> >> > switch between light-weight ticket locks at low contention and queued
> >> > locks at high contention? After all, this would remove the need for
> >> > the developer to predict which locks will be highly contended.
> >> >
> >> > This commit allows ticket locks to automatically switch between pure
> >> > ticketlock and queued-lock operation as needed. If too many CPUs are
> >> > spinning on a given ticket lock, a queue structure will be allocated
> >> > and the lock will switch to queued-lock operation. When the lock becomes
> >> > free, it will switch back into ticketlock operation. The low-order bit
> >> > of the head counter is used to indicate that the lock is in queued mode,
> >> > which forces an unconditional mismatch between the head and tail counters.
> >> > This approach means that the common-case code path under conditions of
> >> > low contention is very nearly that of a plain ticket lock.
> >> >
> >> > A fixed number of queueing structures is statically allocated in an
> >> > array. The ticket-lock address is used to hash into an initial element,
> >> > but if that element is already in use, it moves to the next element. If
> >> > the entire array is already in use, continue to spin in ticket mode.
> >> >
> >> > Signed-off-by: Paul E. McKenney <[email protected]>
> >> > [ paulmck: Eliminate duplicate code and update comments (Steven Rostedt). ]
> >> > [ paulmck: Address Eric Dumazet review feedback. ]
> >> > [ paulmck: Use Lai Jiangshan idea to eliminate smp_mb(). ]
> >> > [ paulmck: Expand ->head_tkt from s32 to s64 (Waiman Long). ]
> >> > [ paulmck: Move cpu_relax() to main spin loop (Steven Rostedt). ]
> >> > [ paulmck: Reduce queue-switch contention (Waiman Long). ]
> >> > [ paulmck: __TKT_SPIN_INC for __ticket_spin_trylock() (Steffen Persvold). ]
> >> > [ paulmck: Type safety fixes (Steven Rostedt). ]
> >> > [ paulmck: Pre-check cmpxchg() value (Waiman Long). ]
> >> > [ paulmck: smp_mb() downgrade to smp_wmb() (Lai Jiangshan). ]
> >>
> >>
> >> Hi, Paul,
> >>
> >> I simplify the code and remove the search by encoding the index of struct tkt_q_head
> >> into lock->tickets.head.
> >>
> >> A) lock->tickets.head(when queued-lock):
> >> ---------------------------------
> >> index of struct tkt_q_head |0|1|
> >> ---------------------------------
> >
> > Interesting approach! It might reduce queued-mode overhead a bit in
> > some cases, though I bet that in the common case the first queue element
> > examined is the right one. More on this below.
> >
> >> The bit0 = 1 for queued, and the bit1 = 0 is reserved for __ticket_spin_unlock(),
> >> thus __ticket_spin_unlock() will not change the higher bits of lock->tickets.head.
> >>
> >> B) tqhp->head is for the real value of lock->tickets.head.
> >> if the last bit of tqhp->head is 1, it means it is the head ticket when started queuing.
> >
> > But don't you also need the xadd() in __ticket_spin_unlock() to become
> > a cmpxchg() for this to work? Or is your patch missing your changes to
> > arch/x86/include/asm/spinlock.h? Either way, this is likely to increase
> > the no-contention overhead, which might be counterproductive. Wouldn't
> > hurt to get measurements, though.
>
> No, don't need to change __ticket_spin_unlock() in my idea.
> bit1 in the tickets.head is reserved for __ticket_spin_unlock(),
> __ticket_spin_unlock() only changes the bit1, it will not change
> the higher bits. tkt_q_do_wake() will restore the tickets.head.
>
> This approach avoids cmpxchg in __ticket_spin_unlock().

Ah, I did miss that. But doesn't the adjustment in __ticket_spin_lock()
need to be atomic in order to handle concurrent invocations of
__ticket_spin_lock()?

Either way, it would be good to see the performance effects of this.

Thanx, Paul
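
For reference, the head-word layout being discussed can be modeled in a few
lines of standalone C. This is only an illustrative sketch inferred from the
quoted diff ((i << 2) + 0x1 for queued mode, tkt_q_heads[head >> 2] to find
the queue, and an unlock increment of 2); the macro and helper names below
are invented for the illustration and do not appear in the patch.

/*
 * Illustrative model of the proposed head-word encoding:
 *   bit 0      -- "queued" flag
 *   bit 1      -- scratch area consumed by __ticket_spin_unlock()'s xadd(+2)
 *   bits 2..n  -- index into the tkt_q_heads[] array
 */
#include <stdio.h>

#define TKT_QUEUED_BIT	0x1u	/* invented name: bit 0, queued mode */
#define TKT_UNLOCK_INC	0x2u	/* invented name: bit 1, unlock's +2 */

static unsigned int tkt_encode_queued_head(unsigned int index)
{
	return (index << 2) | TKT_QUEUED_BIT;	/* matches (i << 2) + 0x1 */
}

static unsigned int tkt_decode_queue_index(unsigned int head)
{
	return head >> 2;	/* matches tkt_q_heads[head >> 2] */
}

int main(void)
{
	unsigned int head = tkt_encode_queued_head(5);

	printf("queued head for index 5: 0x%x\n", head);
	head += TKT_UNLOCK_INC;		/* what unlock's xadd does */
	printf("after unlock:            0x%x (index still %u)\n",
	       head, tkt_decode_queue_index(head));
	head -= TKT_UNLOCK_INC;		/* what tkt_q_do_wake() undoes */
	printf("after tkt_q_do_wake():   0x%x\n", head);
	return 0;
}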

> > Given the results that Davidlohr posted, I believe that the following
> > optimizations would also provide some improvement:
> >
> > 1. Move the call to tkt_spin_pass() from __ticket_spin_lock()
> > to a separate linker section in order to reduce the icache
> > penalty exacted by the spinloop. This is likely to be causing
> > some of the performance reductions in the cases where ticket
> > locks are not highly contended.
> >
> > 2. Limit the number of elements searched for in the array of
> > queues. However, this would help only if a number of ticket
> > locks were in queued mode at the same time.
> >
> > 3. Dynamically allocate the queue array at boot. This might
> > also reduce cache pressure, again, at least in cases where
> > there are a number of ticket locks in queued mode at the
> > same time.
> >
> > Frederic just reminded me that I owe him some energy-efficiency improvements
> > for adaptive ticks, so I won't get to these very quickly. Please feel free
> > to take these on -- the patch clearly does well under high contention, so
> > reducing the no-contention penalty could really help.
> >
> > Thanx, Paul
> >
> >> Thanks,
> >> Lai
> >>
> >> kernel/tktqlock.c | 51 +++++++++++++--------------------------------------
> >> 1 files changed, 13 insertions(+), 38 deletions(-)
> >>
> >> diff --git a/kernel/tktqlock.c b/kernel/tktqlock.c
> >> index 912817c..1329d0f 100644
> >> --- a/kernel/tktqlock.c
> >> +++ b/kernel/tktqlock.c
> >> @@ -33,7 +33,7 @@ struct tkt_q {
> >>
> >> struct tkt_q_head {
> >> arch_spinlock_t *ref; /* Pointer to spinlock if in use. */
> >> - s64 head_tkt; /* Head ticket when started queuing. */
> >> + __ticket_t head; /* Real head when queued. */
> >> struct tkt_q *spin; /* Head of queue. */
> >> struct tkt_q **spin_tail; /* Tail of queue. */
> >> };
> >> @@ -77,15 +77,8 @@ static unsigned long tkt_q_hash(arch_spinlock_t *lock)
> >> */
> >> static struct tkt_q_head *tkt_q_find_head(arch_spinlock_t *lock)
> >> {
> >> - int i;
> >> - int start;
> >> -
> >> - start = i = tkt_q_hash(lock);
> >> - do
> >> - if (ACCESS_ONCE(tkt_q_heads[i].ref) == lock)
> >> - return &tkt_q_heads[i];
> >> - while ((i = tkt_q_next_slot(i)) != start);
> >> - return NULL;
> >> + BUILD_BUG_ON(TKT_Q_NQUEUES > (1 << (TICKET_SHIFT - 2)));
> >> + return &tkt_q_heads[ACCESS_ONCE(lock->tickets.head) >> 2];
> >> }
> >>
> >> /*
> >> @@ -101,11 +94,11 @@ static bool tkt_q_try_unqueue(arch_spinlock_t *lock, struct tkt_q_head *tqhp)
> >>
> >> /* Pick up the ticket values. */
> >> asold = ACCESS_ONCE(*lock);
> >> - if ((asold.tickets.head & ~0x1) == asold.tickets.tail) {
> >> + if (tqhp->head == asold.tickets.tail) {
> >>
> >> /* Attempt to mark the lock as not having a queue. */
> >> asnew = asold;
> >> - asnew.tickets.head &= ~0x1;
> >> + asnew.tickets.head = tqhp->head;
> >> if (cmpxchg(&lock->head_tail,
> >> asold.head_tail,
> >> asnew.head_tail) == asold.head_tail) {
> >> @@ -128,12 +121,9 @@ void tkt_q_do_wake(arch_spinlock_t *lock)
> >> struct tkt_q_head *tqhp;
> >> struct tkt_q *tqp;
> >>
> >> - /*
> >> - * If the queue is still being set up, wait for it. Note that
> >> - * the caller's xadd() provides the needed memory ordering.
> >> - */
> >> - while ((tqhp = tkt_q_find_head(lock)) == NULL)
> >> - cpu_relax();
> >> + tqhp = tkt_q_find_head(lock);
> >> + ACCESS_ONCE(lock->tickets.head) -= __TKT_SPIN_INC;
> >> + ACCESS_ONCE(tqhp->head) = (tqhp->head & ~0x1) + __TKT_SPIN_INC;
> >>
> >> for (;;) {
> >>
> >> @@ -145,9 +135,7 @@ void tkt_q_do_wake(arch_spinlock_t *lock)
> >> return; /* No element, successfully removed queue. */
> >> cpu_relax();
> >> }
> >> - if (ACCESS_ONCE(tqhp->head_tkt) != -1)
> >> - ACCESS_ONCE(tqhp->head_tkt) = -1;
> >> - smp_mb(); /* Order pointer fetch and assignment against handoff. */
> >> + smp_mb(); /* Order modification, pointer fetch and assignment against handoff. */
> >> ACCESS_ONCE(tqp->cpu) = -1;
> >> }
> >> EXPORT_SYMBOL(tkt_q_do_wake);
> >> @@ -169,10 +157,7 @@ bool tkt_q_do_spin(arch_spinlock_t *lock, struct __raw_tickets inc)
> >> */
> >> smp_mb(); /* See above block comment. */
> >>
> >> - /* If there no longer is a queue, leave. */
> >> tqhp = tkt_q_find_head(lock);
> >> - if (tqhp == NULL)
> >> - return false;
> >>
> >> /* Initialize our queue element. */
> >> tq.cpu = raw_smp_processor_id();
> >> @@ -180,9 +165,8 @@ bool tkt_q_do_spin(arch_spinlock_t *lock, struct __raw_tickets inc)
> >> tq.next = NULL;
> >>
> >> /* Check to see if we already hold the lock. */
> >> - if (ACCESS_ONCE(tqhp->head_tkt) == inc.tail) {
> >> + if (ACCESS_ONCE(tqhp->head) == (inc.tail | 0x1)) {
> >> /* The last holder left before queue formed, we hold lock. */
> >> - tqhp->head_tkt = -1;
> >> return true;
> >> }
> >>
> >> @@ -290,16 +274,14 @@ tkt_q_init_contend(int i, arch_spinlock_t *lock, struct __raw_tickets inc)
> >> * Record the head counter in case one of the spinning
> >> * CPUs already holds the lock but doesn't realize it yet.
> >> */
> >> - tqhp->head_tkt = asold.tickets.head;
> >> + tqhp->head = asold.tickets.head | 0x1;
> >>
> >> /* The low-order bit in the head counter says "queued". */
> >> - asnew.tickets.head |= 0x1;
> >> + asnew.tickets.head = (i << 2) + 0x1;
> >> } while (cmpxchg(&lock->head_tail,
> >> asold.head_tail,
> >> asnew.head_tail) != asold.head_tail);
> >>
> >> - /* Point the queue at the lock and go spin on it. */
> >> - ACCESS_ONCE(tqhp->ref) = lock;
> >> return tkt_q_do_spin(lock, inc);
> >> }
> >>
> >> @@ -321,15 +303,8 @@ bool tkt_q_start_contend(arch_spinlock_t *lock, struct __raw_tickets inc)
> >> * the lock with the corresponding queue.
> >> */
> >> do {
> >> - /*
> >> - * Use 0x1 to mark the queue in use, but also avoiding
> >> - * any spinners trying to use it before we get it all
> >> - * initialized.
> >> - */
> >> if (!tkt_q_heads[i].ref &&
> >> - cmpxchg(&tkt_q_heads[i].ref,
> >> - NULL,
> >> - (arch_spinlock_t *)0x1) == NULL) {
> >> + cmpxchg(&tkt_q_heads[i].ref, NULL, lock) == NULL) {
> >>
> >> /* Succeeded, now go initialize it. */
> >> return tkt_q_init_contend(i, lock, inc);
> >>
> >
>

2013-06-14 01:28:39

by Lai Jiangshan

[permalink] [raw]
Subject: Re: [PATCH RFC ticketlock] v3 Auto-queued ticketlock

On 06/14/2013 07:57 AM, Paul E. McKenney wrote:
> On Fri, Jun 14, 2013 at 07:25:57AM +0800, Lai Jiangshan wrote:
>> On Thu, Jun 13, 2013 at 11:22 PM, Paul E. McKenney
>> <[email protected]> wrote:
>>> On Thu, Jun 13, 2013 at 10:55:41AM +0800, Lai Jiangshan wrote:
>>>> On 06/12/2013 11:40 PM, Paul E. McKenney wrote:
>>>>> Breaking up locks is better than implementing high-contention locks, but
>>>>> if we must have high-contention locks, why not make them automatically
>>>>> switch between light-weight ticket locks at low contention and queued
>>>>> locks at high contention? After all, this would remove the need for
>>>>> the developer to predict which locks will be highly contended.
>>>>>
>>>>> This commit allows ticket locks to automatically switch between pure
>>>>> ticketlock and queued-lock operation as needed. If too many CPUs are
>>>>> spinning on a given ticket lock, a queue structure will be allocated
>>>>> and the lock will switch to queued-lock operation. When the lock becomes
>>>>> free, it will switch back into ticketlock operation. The low-order bit
>>>>> of the head counter is used to indicate that the lock is in queued mode,
>>>>> which forces an unconditional mismatch between the head and tail counters.
>>>>> This approach means that the common-case code path under conditions of
>>>>> low contention is very nearly that of a plain ticket lock.
>>>>>
>>>>> A fixed number of queueing structures is statically allocated in an
>>>>> array. The ticket-lock address is used to hash into an initial element,
>>>>> but if that element is already in use, it moves to the next element. If
>>>>> the entire array is already in use, continue to spin in ticket mode.
>>>>>
>>>>> Signed-off-by: Paul E. McKenney <[email protected]>
>>>>> [ paulmck: Eliminate duplicate code and update comments (Steven Rostedt). ]
>>>>> [ paulmck: Address Eric Dumazet review feedback. ]
>>>>> [ paulmck: Use Lai Jiangshan idea to eliminate smp_mb(). ]
>>>>> [ paulmck: Expand ->head_tkt from s32 to s64 (Waiman Long). ]
>>>>> [ paulmck: Move cpu_relax() to main spin loop (Steven Rostedt). ]
>>>>> [ paulmck: Reduce queue-switch contention (Waiman Long). ]
>>>>> [ paulmck: __TKT_SPIN_INC for __ticket_spin_trylock() (Steffen Persvold). ]
>>>>> [ paulmck: Type safety fixes (Steven Rostedt). ]
>>>>> [ paulmck: Pre-check cmpxchg() value (Waiman Long). ]
>>>>> [ paulmck: smp_mb() downgrade to smp_wmb() (Lai Jiangshan). ]
>>>>
>>>>
>>>> Hi, Paul,
>>>>
>>>> I simplify the code and remove the search by encoding the index of struct tkt_q_head
>>>> into lock->tickets.head.
>>>>
>>>> A) lock->tickets.head(when queued-lock):
>>>> ---------------------------------
>>>> index of struct tkt_q_head |0|1|
>>>> ---------------------------------
>>>
>>> Interesting approach! It might reduce queued-mode overhead a bit in
>>> some cases, though I bet that in the common case the first queue element
>>> examined is the right one. More on this below.
>>>
>>>> The bit0 = 1 for queued, and the bit1 = 0 is reserved for __ticket_spin_unlock(),
>>>> thus __ticket_spin_unlock() will not change the higher bits of lock->tickets.head.
>>>>
>>>> B) tqhp->head is for the real value of lock->tickets.head.
>>>> if the last bit of tqhp->head is 1, it means it is the head ticket when started queuing.
>>>
>>> But don't you also need the xadd() in __ticket_spin_unlock() to become
>>> a cmpxchg() for this to work? Or is your patch missing your changes to
>>> arch/x86/include/asm/spinlock.h? Either way, this is likely to increase
>>> the no-contention overhead, which might be counterproductive. Wouldn't
>>> hurt to get measurements, though.
>>
>> No, don't need to change __ticket_spin_unlock() in my idea.
>> bit1 in the tickets.head is reserved for __ticket_spin_unlock(),
>> __ticket_spin_unlock() only changes the bit1, it will not change
>> the higher bits. tkt_q_do_wake() will restore the tickets.head.
>>
>> This approach avoids cmpxchg in __ticket_spin_unlock().
>
> Ah, I did miss that. But doesn't the adjustment in __ticket_spin_lock()
> need to be atomic in order to handle concurrent invocations of
> __ticket_spin_lock()?

I don't understand: are we discussing __ticket_spin_unlock() only?
Or does my suggestion hurt __ticket_spin_lock()?

>
> Either way, it would be good to see the performance effects of this.
>
> Thanx, Paul

2013-06-14 07:09:48

by Lai Jiangshan

[permalink] [raw]
Subject: Re: [PATCH RFC ticketlock] v3 Auto-queued ticketlock

On 06/14/2013 07:57 AM, Paul E. McKenney wrote:
> On Fri, Jun 14, 2013 at 07:25:57AM +0800, Lai Jiangshan wrote:
>> On Thu, Jun 13, 2013 at 11:22 PM, Paul E. McKenney
>> <[email protected]> wrote:
>>> On Thu, Jun 13, 2013 at 10:55:41AM +0800, Lai Jiangshan wrote:
>>>> On 06/12/2013 11:40 PM, Paul E. McKenney wrote:
>>>>> Breaking up locks is better than implementing high-contention locks, but
>>>>> if we must have high-contention locks, why not make them automatically
>>>>> switch between light-weight ticket locks at low contention and queued
>>>>> locks at high contention? After all, this would remove the need for
>>>>> the developer to predict which locks will be highly contended.
>>>>>
>>>>> This commit allows ticket locks to automatically switch between pure
>>>>> ticketlock and queued-lock operation as needed. If too many CPUs are
>>>>> spinning on a given ticket lock, a queue structure will be allocated
>>>>> and the lock will switch to queued-lock operation. When the lock becomes
>>>>> free, it will switch back into ticketlock operation. The low-order bit
>>>>> of the head counter is used to indicate that the lock is in queued mode,
>>>>> which forces an unconditional mismatch between the head and tail counters.
>>>>> This approach means that the common-case code path under conditions of
>>>>> low contention is very nearly that of a plain ticket lock.
>>>>>
>>>>> A fixed number of queueing structures is statically allocated in an
>>>>> array. The ticket-lock address is used to hash into an initial element,
>>>>> but if that element is already in use, it moves to the next element. If
>>>>> the entire array is already in use, continue to spin in ticket mode.
>>>>>
>>>>> Signed-off-by: Paul E. McKenney <[email protected]>
>>>>> [ paulmck: Eliminate duplicate code and update comments (Steven Rostedt). ]
>>>>> [ paulmck: Address Eric Dumazet review feedback. ]
>>>>> [ paulmck: Use Lai Jiangshan idea to eliminate smp_mb(). ]
>>>>> [ paulmck: Expand ->head_tkt from s32 to s64 (Waiman Long). ]
>>>>> [ paulmck: Move cpu_relax() to main spin loop (Steven Rostedt). ]
>>>>> [ paulmck: Reduce queue-switch contention (Waiman Long). ]
>>>>> [ paulmck: __TKT_SPIN_INC for __ticket_spin_trylock() (Steffen Persvold). ]
>>>>> [ paulmck: Type safety fixes (Steven Rostedt). ]
>>>>> [ paulmck: Pre-check cmpxchg() value (Waiman Long). ]
>>>>> [ paulmck: smp_mb() downgrade to smp_wmb() (Lai Jiangshan). ]
>>>>
>>>>
>>>> Hi, Paul,
>>>>
>>>> I simplify the code and remove the search by encoding the index of struct tkt_q_head
>>>> into lock->tickets.head.
>>>>
>>>> A) lock->tickets.head(when queued-lock):
>>>> ---------------------------------
>>>> index of struct tkt_q_head |0|1|
>>>> ---------------------------------
>>>
>>> Interesting approach! It might reduce queued-mode overhead a bit in
>>> some cases, though I bet that in the common case the first queue element
>>> examined is the right one. More on this below.
>>>
>>>> The bit0 = 1 for queued, and the bit1 = 0 is reserved for __ticket_spin_unlock(),
>>>> thus __ticket_spin_unlock() will not change the higher bits of lock->tickets.head.
>>>>
>>>> B) tqhp->head is for the real value of lock->tickets.head.
>>>> if the last bit of tqhp->head is 1, it means it is the head ticket when started queuing.
>>>
>>> But don't you also need the xadd() in __ticket_spin_unlock() to become
>>> a cmpxchg() for this to work? Or is your patch missing your changes to
>>> arch/x86/include/asm/spinlock.h? Either way, this is likely to increase
>>> the no-contention overhead, which might be counterproductive. Wouldn't
>>> hurt to get measurements, though.
>>
>> No, don't need to change __ticket_spin_unlock() in my idea.
>> bit1 in the tickets.head is reserved for __ticket_spin_unlock(),
>> __ticket_spin_unlock() only changes the bit1, it will not change
>> the higher bits. tkt_q_do_wake() will restore the tickets.head.
>>
>> This approach avoids cmpxchg in __ticket_spin_unlock().
>
> Ah, I did miss that. But doesn't the adjustment in __ticket_spin_lock()
> need to be atomic in order to handle concurrent invocations of
> __ticket_spin_lock()?
>
> Either way, it would be good to see the performance effects of this.
>
> Thanx, Paul
>
>>> Given the results that Davidlohr posted, I believe that the following
>>> optimizations would also provide some improvement:
>>>
>>> 1. Move the call to tkt_spin_pass() from __ticket_spin_lock()
>>> to a separate linker section in order to reduce the icache
>>> penalty exacted by the spinloop. This is likely to be causing
>>> some of the performance reductions in the cases where ticket
>>> locks are not highly contended.
>>>
>>> 2. Limit the number of elements searched for in the array of
>>> queues. However, this would help only if a number of ticket
>>> locks were in queued mode at the same time.
>>>
>>> 3. Dynamically allocate the queue array at boot. This might
>>> also reduce cache pressure, again, at least in cases where
>>> there are a number of ticket locks in queued mode at the
>>> same time.
>>>
>>> Frederic just reminded me that I owe him some energy-efficiency improvements
>>> for adaptive ticks, so I won't get to these very quickly. Please feel free
>>> to take these on -- the patch clearly does well under high contention, so
>>> reducing the no-contention penalty could really help.
>>>
>>> Thanx, Paul
>>>
>>>> Thanks,
>>>> Lai
>>>>
>>>> kernel/tktqlock.c | 51 +++++++++++++--------------------------------------
>>>> 1 files changed, 13 insertions(+), 38 deletions(-)
>>>>
>>>> diff --git a/kernel/tktqlock.c b/kernel/tktqlock.c
>>>> index 912817c..1329d0f 100644
>>>> --- a/kernel/tktqlock.c
>>>> +++ b/kernel/tktqlock.c
>>>> @@ -33,7 +33,7 @@ struct tkt_q {
>>>>
>>>> struct tkt_q_head {
>>>> arch_spinlock_t *ref; /* Pointer to spinlock if in use. */
>>>> - s64 head_tkt; /* Head ticket when started queuing. */
>>>> + __ticket_t head; /* Real head when queued. */
>>>> struct tkt_q *spin; /* Head of queue. */
>>>> struct tkt_q **spin_tail; /* Tail of queue. */
>>>> };
>>>> @@ -77,15 +77,8 @@ static unsigned long tkt_q_hash(arch_spinlock_t *lock)
>>>> */
>>>> static struct tkt_q_head *tkt_q_find_head(arch_spinlock_t *lock)
>>>> {
>>>> - int i;
>>>> - int start;
>>>> -
>>>> - start = i = tkt_q_hash(lock);
>>>> - do
>>>> - if (ACCESS_ONCE(tkt_q_heads[i].ref) == lock)
>>>> - return &tkt_q_heads[i];
>>>> - while ((i = tkt_q_next_slot(i)) != start);
>>>> - return NULL;
>>>> + BUILD_BUG_ON(TKT_Q_NQUEUES > (1 << (TICKET_SHIFT - 2)));
>>>> + return &tkt_q_heads[ACCESS_ONCE(lock->tickets.head) >> 2];
>>>> }
>>>>
>>>> /*
>>>> @@ -101,11 +94,11 @@ static bool tkt_q_try_unqueue(arch_spinlock_t *lock, struct tkt_q_head *tqhp)
>>>>
>>>> /* Pick up the ticket values. */
>>>> asold = ACCESS_ONCE(*lock);
>>>> - if ((asold.tickets.head & ~0x1) == asold.tickets.tail) {
>>>> + if (tqhp->head == asold.tickets.tail) {
>>>>
>>>> /* Attempt to mark the lock as not having a queue. */
>>>> asnew = asold;
>>>> - asnew.tickets.head &= ~0x1;
>>>> + asnew.tickets.head = tqhp->head;
>>>> if (cmpxchg(&lock->head_tail,
>>>> asold.head_tail,
>>>> asnew.head_tail) == asold.head_tail) {
>>>> @@ -128,12 +121,9 @@ void tkt_q_do_wake(arch_spinlock_t *lock)
>>>> struct tkt_q_head *tqhp;
>>>> struct tkt_q *tqp;
>>>>
>>>> - /*
>>>> - * If the queue is still being set up, wait for it. Note that
>>>> - * the caller's xadd() provides the needed memory ordering.
>>>> - */
>>>> - while ((tqhp = tkt_q_find_head(lock)) == NULL)
>>>> - cpu_relax();
>>>> + tqhp = tkt_q_find_head(lock);
>>>> + ACCESS_ONCE(lock->tickets.head) -= __TKT_SPIN_INC;
>>>> + ACCESS_ONCE(tqhp->head) = (tqhp->head & ~0x1) + __TKT_SPIN_INC;
>>>>
>>>> for (;;) {
>>>>
>>>> @@ -145,9 +135,7 @@ void tkt_q_do_wake(arch_spinlock_t *lock)
>>>> return; /* No element, successfully removed queue. */
>>>> cpu_relax();
>>>> }
>>>> - if (ACCESS_ONCE(tqhp->head_tkt) != -1)
>>>> - ACCESS_ONCE(tqhp->head_tkt) = -1;
>>>> - smp_mb(); /* Order pointer fetch and assignment against handoff. */
>>>> + smp_mb(); /* Order modification, pointer fetch and assignment against handoff. */
>>>> ACCESS_ONCE(tqp->cpu) = -1;
>>>> }
>>>> EXPORT_SYMBOL(tkt_q_do_wake);
>>>> @@ -169,10 +157,7 @@ bool tkt_q_do_spin(arch_spinlock_t *lock, struct __raw_tickets inc)
>>>> */
>>>> smp_mb(); /* See above block comment. */
>>>>
>>>> - /* If there no longer is a queue, leave. */
>>>> tqhp = tkt_q_find_head(lock);
>>>> - if (tqhp == NULL)
>>>> - return false;
>>>>
>>>> /* Initialize our queue element. */
>>>> tq.cpu = raw_smp_processor_id();
>>>> @@ -180,9 +165,8 @@ bool tkt_q_do_spin(arch_spinlock_t *lock, struct __raw_tickets inc)
>>>> tq.next = NULL;
>>>>
>>>> /* Check to see if we already hold the lock. */
>>>> - if (ACCESS_ONCE(tqhp->head_tkt) == inc.tail) {
>>>> + if (ACCESS_ONCE(tqhp->head) == (inc.tail | 0x1)) {
>>>> /* The last holder left before queue formed, we hold lock. */
>>>> - tqhp->head_tkt = -1;
>>>> return true;
>>>> }
>>>>
>>>> @@ -290,16 +274,14 @@ tkt_q_init_contend(int i, arch_spinlock_t *lock, struct __raw_tickets inc)
>>>> * Record the head counter in case one of the spinning
>>>> * CPUs already holds the lock but doesn't realize it yet.
>>>> */
>>>> - tqhp->head_tkt = asold.tickets.head;
>>>> + tqhp->head = asold.tickets.head | 0x1;
>>>>
>>>> /* The low-order bit in the head counter says "queued". */
>>>> - asnew.tickets.head |= 0x1;
>>>> + asnew.tickets.head = (i << 2) + 0x1;
>>>> } while (cmpxchg(&lock->head_tail,
>>>> asold.head_tail,
>>>> asnew.head_tail) != asold.head_tail);
>>>>
>>>> - /* Point the queue at the lock and go spin on it. */
>>>> - ACCESS_ONCE(tqhp->ref) = lock;
>>>> return tkt_q_do_spin(lock, inc);
>>>> }
>>>>
>>>> @@ -321,15 +303,8 @@ bool tkt_q_start_contend(arch_spinlock_t *lock, struct __raw_tickets inc)
>>>> * the lock with the corresponding queue.
>>>> */
>>>> do {
>>>> - /*
>>>> - * Use 0x1 to mark the queue in use, but also avoiding
>>>> - * any spinners trying to use it before we get it all
>>>> - * initialized.
>>>> - */
>>>> if (!tkt_q_heads[i].ref &&
>>>> - cmpxchg(&tkt_q_heads[i].ref,
>>>> - NULL,
>>>> - (arch_spinlock_t *)0x1) == NULL) {
>>>> + cmpxchg(&tkt_q_heads[i].ref, NULL, lock) == NULL) {
>>>>
>>>> /* Succeeded, now go initialize it. */
>>>> return tkt_q_init_contend(i, lock, inc);
>>>>
>>>
>>
>

Hi, Paul.

Another possible improvement. Again, this is untested.

This improvement removes the slow path from unlock() by:
1) Instead of forcing all competitors to spin on their queued nodes,
this improvement selects one and only one competitor and forces it to keep spinning on the lock.
2) The selected competitor is the leader of the queue.
3) The queue is used for passing the leadership rather than handing off the lock itself.

It is implemented on top of my previous improvement.
Would you merge them all as one patch to get more review, if you agree with my improvement?


Thanks,
Lai

PS:

After this, we can shrink the size of struct tkt_q_head.
Is this size important?

struct tkt_q_head {
__ticket_t head; /* Real head when queued. */
struct tkt_q **spin_tail; /* Tail of queue. */
};

And "__ticket_t head;" can be also removed.


diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h
index 5aa0177..01c3bdd 100644
--- a/arch/x86/include/asm/spinlock.h
+++ b/arch/x86/include/asm/spinlock.h
@@ -90,27 +90,11 @@ static __always_inline int __ticket_spin_trylock(arch_spinlock_t *lock)
return cmpxchg(&lock->head_tail, old.head_tail, new.head_tail) == old.head_tail;
}

-#ifndef CONFIG_TICKET_LOCK_QUEUED
-
static __always_inline void __ticket_spin_unlock(arch_spinlock_t *lock)
{
__add(&lock->tickets.head, 1, UNLOCK_LOCK_PREFIX);
}

-#else /* #ifndef CONFIG_TICKET_LOCK_QUEUED */
-
-extern void tkt_q_do_wake(arch_spinlock_t *lock);
-
-static __always_inline void __ticket_spin_unlock(arch_spinlock_t *lock)
-{
- __ticket_t head = 2;
-
- head = xadd(&lock->tickets.head, head);
- if (head & 0x1)
- tkt_q_do_wake(lock);
-}
-#endif /* #else #ifndef CONFIG_TICKET_LOCK_QUEUED */
-
static inline int __ticket_spin_is_locked(arch_spinlock_t *lock)
{
struct __raw_tickets tmp = ACCESS_ONCE(lock->tickets);
diff --git a/kernel/tktqlock.c b/kernel/tktqlock.c
index 1329d0f..b658fae 100644
--- a/kernel/tktqlock.c
+++ b/kernel/tktqlock.c
@@ -94,7 +94,7 @@ static bool tkt_q_try_unqueue(arch_spinlock_t *lock, struct tkt_q_head *tqhp)

/* Pick up the ticket values. */
asold = ACCESS_ONCE(*lock);
- if (tqhp->head == asold.tickets.tail) {
+ if (tqhp->head + __TKT_SPIN_INC == asold.tickets.tail) {

/* Attempt to mark the lock as not having a queue. */
asnew = asold;
@@ -114,33 +114,6 @@ static bool tkt_q_try_unqueue(arch_spinlock_t *lock, struct tkt_q_head *tqhp)
}

/*
- * Hand the lock off to the first CPU on the queue.
- */
-void tkt_q_do_wake(arch_spinlock_t *lock)
-{
- struct tkt_q_head *tqhp;
- struct tkt_q *tqp;
-
- tqhp = tkt_q_find_head(lock);
- ACCESS_ONCE(lock->tickets.head) -= __TKT_SPIN_INC;
- ACCESS_ONCE(tqhp->head) = (tqhp->head & ~0x1) + __TKT_SPIN_INC;
-
- for (;;) {
-
- /* Find the first queue element. */
- tqp = ACCESS_ONCE(tqhp->spin);
- if (tqp != NULL)
- break; /* Element exists, hand off lock. */
- if (tkt_q_try_unqueue(lock, tqhp))
- return; /* No element, successfully removed queue. */
- cpu_relax();
- }
- smp_mb(); /* Order modification, pointer fetch and assignment against handoff. */
- ACCESS_ONCE(tqp->cpu) = -1;
-}
-EXPORT_SYMBOL(tkt_q_do_wake);
-
-/*
* Given a lock that already has a queue associated with it, spin on
* that queue. Return false if there was no queue (which means we do not
* hold the lock) and true otherwise (meaning we -do- hold the lock).
@@ -150,6 +123,7 @@ bool tkt_q_do_spin(arch_spinlock_t *lock, struct __raw_tickets inc)
struct tkt_q **oldtail;
struct tkt_q tq;
struct tkt_q_head *tqhp;
+ int index;

/*
* Ensure that accesses to queue header happen after sensing
@@ -157,6 +131,7 @@ bool tkt_q_do_spin(arch_spinlock_t *lock, struct __raw_tickets inc)
*/
smp_mb(); /* See above block comment. */

+ index = ACCESS_ONCE(lock->tickets.head) >> 2;
tqhp = tkt_q_find_head(lock);

/* Initialize our queue element. */
@@ -178,10 +153,29 @@ bool tkt_q_do_spin(arch_spinlock_t *lock, struct __raw_tickets inc)
oldtail = xchg(&tqhp->spin_tail, &tq.next);
ACCESS_ONCE(*oldtail) = &tq;

- /* Spin until handoff. */
- while (ACCESS_ONCE(tq.cpu) != -1)
+ if (oldtail != &tqhp->spin) {
+ /* Spin until get the queue leadership */
+ while (ACCESS_ONCE(tq.cpu) != -1)
+ cpu_relax();
+ smp_mb(); /* Force ordering between get leadership and access lock->tickets.head */
+ }
+
+ /*
+ * Spin until hold the lock. if the next smp_mb() doesn't help,
+ * it should be implemented arch-depended
+ */
+ inc.head = index * __TKT_SPIN_INC * 2 + 1;
+ while (ACCESS_ONCE(lock->tickets.head) != inc.head + __TKT_SPIN_INC)
cpu_relax();

+ smp_mb(); /* Force ordering between (prev C.S. & lock->tickets.head)
+ and (current C.S. & tqhp->head & hand off) */
+
+ /* store queued-lock tickets head */
+ ACCESS_ONCE(lock->tickets.head) = inc.head;
+ /* update real tickets head */
+ ACCESS_ONCE(tqhp->head) = (tqhp->head & ~0x1) + __TKT_SPIN_INC;
+
/*
* Remove our element from the queue. If the queue is now empty,
* update carefully so that the next acquisition will enqueue itself
@@ -217,8 +211,10 @@ bool tkt_q_do_spin(arch_spinlock_t *lock, struct __raw_tickets inc)
/* Try to point the tail back at the head. */
if (cmpxchg(&tqhp->spin_tail,
&tq.next,
- &tqhp->spin) == &tq.next)
+ &tqhp->spin) == &tq.next) {
+ tkt_q_try_unqueue(lock, tqhp);
return true; /* Succeeded, queue is now empty. */
+ }

/* Failed, if needed, wait for the enqueue to complete. */
while (tq.next == NULL)
@@ -226,14 +222,13 @@ bool tkt_q_do_spin(arch_spinlock_t *lock, struct __raw_tickets inc)

/* The following code will repair the head. */
}
- smp_mb(); /* Force ordering between handoff and critical section. */

/*
- * Advance list-head pointer. This same task will be the next to
- * access this when releasing the lock, so no need for a memory
- * barrier after the following assignment.
+ * Advance list-head pointer. tqhp->spin is useless, it can be removed.
*/
ACCESS_ONCE(tqhp->spin) = tq.next;
+ ACCESS_ONCE(tq.next->cpu) = -1; /* hand off queue leadership */
+
return true;
}

@@ -277,7 +272,7 @@ tkt_q_init_contend(int i, arch_spinlock_t *lock, struct __raw_tickets inc)
tqhp->head = asold.tickets.head | 0x1;

/* The low-order bit in the head counter says "queued". */
- asnew.tickets.head = (i << 2) + 0x1;
+ asnew.tickets.head = i * __TKT_SPIN_INC * 2 + 0x1;
} while (cmpxchg(&lock->head_tail,
asold.head_tail,
asnew.head_tail) != asold.head_tail);

2013-06-14 15:00:32

by Waiman Long

[permalink] [raw]
Subject: Re: [PATCH RFC ticketlock] Auto-queued ticketlock

On 06/12/2013 08:59 PM, Linus Torvalds wrote:
> On Wed, Jun 12, 2013 at 5:49 PM, Al Viro<[email protected]> wrote:
>> On Wed, Jun 12, 2013 at 05:38:13PM -0700, Linus Torvalds wrote:
>>> For the particular case of dget_parent() maybe dget_parent() should
>>> just double-check the original dentry->d_parent pointer after getting
>>> the refcount on it (and if the parent has changed, drop the refcount
>>> again and go to the locked version). That might be a good idea anyway,
>>> and should fix the possible race (which would be with another cpu
>>> having to first rename the child to some other parent, and the
>>> d_invalidate() the original parent)
>> Yes, but... Then we'd need to dput() that sucker if we decide we shouldn't
>> have grabbed that reference, after all, which would make dget_parent()
>> potentially blocking.
> Ho humm.. interesting. I was talking about wanting to mix atomics and
> spinlocks earlier in this thread due to space constraints, and it
> strikes me that that would actually help this case a lot. Having the
> dentry count mix d_lock and the count in one word would allow for
> atomic ops like "increment if not locked", and we'd avoid this whole
> race entirely..
>
> Something like "low bit of count is the lock bit" would end up being
> lovely for this case. Of course, that's not how our spinlocks work ..
>
> Linus

I have created another patch to do exactly the "increment if not locked"
operation as suggested. It did help a lot. See the patch below for more
information. Any additional comment will be appreciated.

Regards,
Longman

-------------------------------------------------------------------------------------------------------------------
The current code takes the dentry's d_lock lock whenever the d_count
reference count is being updated. In reality, nothing big really
happens until d_count goes to 0 in dput(). So it is not necessary
to take the lock if the reference count won't go to 0. On the other
hand, there are cases where d_count should not be updated or was not
expected to be updated while d_lock was taken by other functions.

To try to locklessly update the d_count while d_lock is not taken
by others, the 32-bit d_count and 32-bit d_lock (when no debugging
code is turned on) can be combined into a single 64-bit word to be
updated atomically whenever the following conditions happen:

1. The lock is not taken, i.e. spin_can_lock() returns true.
2. For increment, the original d_count must be > 0, or
3. for decrement, the original d_count must be > 1.

To maximize the chance of doing lockless update, the new code calls
spin_unlock_wait() before trying to do the update.

The new code also attempts the lockless atomic update twice before
falling back to the old code path of taking the lock before doing
the update. This is because there would still be a fair amount of
contention with only a single attempt.
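
As a standalone illustration of this update rule (the actual dentry patch is
further below), a simplified userspace analogue might look like the sketch
that follows. The union layout and all of the names here are invented for the
sketch, and the "lock" half is modeled as a plain 0/1 flag rather than a real
ticket spinlock.

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

/* Invented layout: a 32-bit count and a 32-bit "lock" flag sharing a u64. */
union cnt_lock {
	struct {
		uint32_t count;		/* reference count */
		uint32_t lock;		/* 0 == free, nonzero == held */
	};
	uint64_t raw;
};

/* Bump the count only if it is nonzero and the lock looks free. */
static int get_unless_zero_or_locked(_Atomic uint64_t *word)
{
	union cnt_lock old, new;

	old.raw = atomic_load(word);
	if (old.count == 0 || old.lock != 0)
		return 0;			/* caller falls back to locking */
	new = old;
	new.count++;
	return atomic_compare_exchange_strong(word, &old.raw, new.raw);
}

int main(void)
{
	union cnt_lock init;
	_Atomic uint64_t word;

	init.count = 1;				/* one existing reference */
	init.lock = 0;				/* "lock" not held */
	word = init.raw;

	printf("lockless get: %s\n",
	       get_unless_zero_or_locked(&word) ? "ok" : "fall back to lock");
	return 0;
}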

The d_count/d_lock pair is located at byte offsets 72 and 88 for
32-bit and 64-bit systems respectively. In both cases, the pair is
8-byte aligned, and combining it into a single 8-byte word does not
introduce a hole that increases the size of the dentry structure.

This patch has a particularly big impact on the short workload of the
AIM7 benchmark with a ramdisk filesystem. The table below shows the
improvement in JPM (jobs per minute) throughput due to this patch
on an 8-socket, 80-core x86-64 system with a 3.10-rc4 kernel in
1/2/4/8-node configurations, using numactl to restrict the execution
of the workload to certain nodes.

+-----------------+----------------+-----------------+----------+
| Configuration | Mean JPM | Mean JPM | % Change |
| | Rate w/o patch | Rate with patch | |
+-----------------+---------------------------------------------+
| | User Range 10 - 100 |
+-----------------+---------------------------------------------+
| 8 nodes, HT off | 1596798 | 4748981 | +197.4% |
| 4 nodes, HT off | 1653817 | 4845590 | +193.0% |
| 2 nodes, HT off | 3802258 | 3832017 | +0.8% |
| 1 node , HT off | 2403297 | 2386401 | -0.7% |
+-----------------+---------------------------------------------+
| | User Range 200 - 1000 |
+-----------------+---------------------------------------------+
| 8 nodes, HT off | 1070992 | 6060457 | +465.9% |
| 4 nodes, HT off | 1367668 | 6799978 | +397.2% |
| 2 nodes, HT off | 4554370 | 4609893 | +1.2% |
| 1 node , HT off | 2534826 | 2526519 | -0.3% |
+-----------------+---------------------------------------------+
| | User Range 1100 - 2000 |
+-----------------+---------------------------------------------+
| 8 nodes, HT off | 1061322 | 6435537 | +506.4% |
| 4 nodes, HT off | 1365111 | 6589983 | +382.7% |
| 2 nodes, HT off | 4583947 | 4648464 | +1.4% |
| 1 node , HT off | 2563721 | 2566229 | +0.1% |
+-----------------+----------------+-----------------+----------+

It can be seen that with 40 CPUs (4 nodes) or more, this patch can
significantly improve the short workload performance. With only 1 or
2 nodes, the performance is similar with or without the patch. The
short workload also scales pretty well up to 4 nodes with this patch.

A perf call-graph report of the short workload at 1500 users
without the patch on the same 8-node machine indicates that about
78% of the workload's total time was spent in the _raw_spin_lock()
function, almost all of which can be attributed to the following two
kernel functions:
1. dget_parent (49.91%)
2. dput (49.89%)

The relevant perf report lines are:
+ 78.37% reaim [kernel.kallsyms] [k] _raw_spin_lock
+ 0.09% reaim [kernel.kallsyms] [k] dput
+ 0.05% reaim [kernel.kallsyms] [k] _raw_spin_lock_irq
+ 0.00% reaim [kernel.kallsyms] [k] dget_parent

With this patch installed, the new perf report lines are:
+ 13.28% reaim [kernel.kallsyms] [k] _raw_spin_lock_irqsave
+ 7.33% reaim [kernel.kallsyms] [k] _raw_spin_lock
+ 2.93% reaim [kernel.kallsyms] [k] dget_parent
+ 1.32% reaim [kernel.kallsyms] [k] dput

- 7.33% reaim [kernel.kallsyms] [k] _raw_spin_lock
- _raw_spin_lock
+ 41.96% d_path
+ 41.68% sys_getcwd
+ 2.67% prepend_path
+ 1.66% complete_walk
+ 0.86% unlazy_walk
+ 0.74% sem_lock
+ 0.72% do_anonymous_page
+ 0.69% dget_parent
+ 0.67% dput
+ 0.55% process_backlog
+ 0.52% enqueue_to_backlog

dput() used up only 0.67% of the _raw_spin_lock time, while
dget_parent() used only 0.69%. The time spent in dput() and dget_parent()
did increase because of busy-waiting for unlock as well as the overhead
of the cmpxchg operations.

The impact of this patch on other AIM7 workloads was much more
modest. The table below shows the mean % change due to this patch on
the same 8-socket system with a 3.10-rc4 kernel.

+--------------+---------------+----------------+-----------------+
| Workload | mean % change | mean % change | mean % change |
| | 10-100 users | 200-1000 users | 1100-2000 users |
+--------------+---------------+----------------+-----------------+
| alltests | 0.0% | -0.3% | -0.3% |
| five_sec | -4.6% | +6.5% | +3.1% |
| fserver | -1.2% | -4.0% | -3.4% |
| high_systime | -0.1% | +1.7% | +7.2% |
| new_fserver | -2.8% | -3.3% | -2.1% |
| shared | -0.6% | -0.2% | +0.2% |
+--------------+---------------+----------------+-----------------+

There are slight drops in performance for the fserver and new_fserver
workloads, but slight increases in the high_systime and five_sec
workloads.

Signed-off-by: Waiman Long <[email protected]>
---
fs/dcache.c | 14 ++++++-
include/linux/dcache.h | 102 +++++++++++++++++++++++++++++++++++++++++++++++-
2 files changed, 112 insertions(+), 4 deletions(-)

diff --git a/fs/dcache.c b/fs/dcache.c
index f09b908..2190c34 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -515,6 +515,8 @@ void dput(struct dentry *dentry)
repeat:
if (dentry->d_count == 1)
might_sleep();
+ if (__dput_unless_lt2_or_locked(dentry))
+ return;
spin_lock(&dentry->d_lock);
BUG_ON(!dentry->d_count);
if (dentry->d_count > 1) {
@@ -611,6 +613,8 @@ static inline void __dget_dlock(struct dentry *dentry)

static inline void __dget(struct dentry *dentry)
{
+ if (__dget_unless_zero_or_locked(dentry))
+ return;
spin_lock(&dentry->d_lock);
__dget_dlock(dentry);
spin_unlock(&dentry->d_lock);
@@ -620,17 +624,23 @@ struct dentry *dget_parent(struct dentry *dentry)
{
struct dentry *ret;

+ rcu_read_lock();
+ ret = rcu_dereference(dentry->d_parent);
+ if (__dget_unless_zero_or_locked(ret)) {
+ rcu_read_unlock();
+ return ret;
+ }
repeat:
/*
* Don't need rcu_dereference because we re-check it was correct under
* the lock.
*/
- rcu_read_lock();
- ret = dentry->d_parent;
+ ret = ACCESS_ONCE(dentry->d_parent);
spin_lock(&ret->d_lock);
if (unlikely(ret != dentry->d_parent)) {
spin_unlock(&ret->d_lock);
rcu_read_unlock();
+ rcu_read_lock();
goto repeat;
}
rcu_read_unlock();
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index 1a6bb81..99ab699 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -112,8 +112,13 @@ struct dentry {
unsigned char d_iname[DNAME_INLINE_LEN]; /* small names */

/* Ref lookup also touches following */
- unsigned int d_count; /* protected by d_lock */
- spinlock_t d_lock; /* per dentry lock */
+ union {
+ struct {
+ unsigned int d_count; /* protected by d_lock */
+ spinlock_t d_lock; /* per dentry lock */
+ };
+ u64 d_cnt_lock; /* combined count & lock */
+ };
const struct dentry_operations *d_op;
struct super_block *d_sb; /* The root of the dentry tree */
unsigned long d_time; /* used by d_revalidate */
@@ -132,6 +137,19 @@ struct dentry {
};

/*
+ * The compiler does not allow named union in struct dentry without adding
+ * a named field. The union definition is repeated below to allow functions
+ * to reference it.
+ */
+union _d_cnt_lock {
+ struct {
+ unsigned int d_count; /* protected by d_lock */
+ spinlock_t d_lock; /* per dentry lock */
+ };
+ u64 d_cnt_lock; /* combined count & lock */
+};
+
+/*
* dentry->d_lock spinlock nesting subclasses:
*
* 0: normal
@@ -325,6 +343,84 @@ static inline int __d_rcu_to_refcount(struct dentry *dentry, unsigned seq)
return ret;
}

+/**
+ * __dget_unless_zero_or_locked - atomically inc d_count if != 0 and not locked
+ * @dentry: dentry to work on
+ * Return: 0 on failure, else 1
+ *
+ * __dget_if_notzero_and_locked tries to atomically increment d_count without
+ * taking a lock as long as the count is not 0 and d_lock is not taken.
+ */
+static inline int __dget_unless_zero_or_locked(struct dentry *dentry)
+{
+ if (sizeof(union _d_cnt_lock) == sizeof(dentry->d_cnt_lock)) {
+ union _d_cnt_lock old, new;
+
+ spin_unlock_wait(&dentry->d_lock);
+ old.d_cnt_lock = ACCESS_ONCE(dentry->d_cnt_lock);
+ if ((old.d_count > 0) && spin_can_lock(&old.d_lock)) {
+ new.d_cnt_lock = old.d_cnt_lock;
+ new.d_count++;
+ if (cmpxchg(&dentry->d_cnt_lock, old.d_cnt_lock,
+ new.d_cnt_lock) == old.d_cnt_lock)
+ return 1;
+ cpu_relax();
+ /*
+ * Try one more time
+ */
+ old.d_cnt_lock = ACCESS_ONCE(dentry->d_cnt_lock);
+ if ((old.d_count > 0) && spin_can_lock(&old.d_lock)) {
+ new.d_cnt_lock = old.d_cnt_lock;
+ new.d_count++;
+ if (cmpxchg(&dentry->d_cnt_lock, old.d_cnt_lock,
+ new.d_cnt_lock) == old.d_cnt_lock)
+ return 1;
+ cpu_relax();
+ }
+ }
+ }
+ return 0;
+}
+
+/**
+ * __dput_unless_lt2_or_locked - atomically dec d_count if >= 1 and not locked
+ * @dentry: dentry to work on
+ * Return: 0 on failure, else 1
+ *
+ * __dput_unless_leone_or_locked tries to atomically decrement d_count without
+ * taking a lock as long as the count is bigger than 1 and d_lock is not taken.
+ */
+static inline int __dput_unless_lt2_or_locked(struct dentry *dentry)
+{
+ if (sizeof(union _d_cnt_lock) == sizeof(dentry->d_cnt_lock)) {
+ union _d_cnt_lock old, new;
+
+ spin_unlock_wait(&dentry->d_lock);
+ old.d_cnt_lock = ACCESS_ONCE(dentry->d_cnt_lock);
+ if ((old.d_count > 1) && spin_can_lock(&old.d_lock)) {
+ new.d_cnt_lock = old.d_cnt_lock;
+ new.d_count--;
+ if (cmpxchg(&dentry->d_cnt_lock, old.d_cnt_lock,
+ new.d_cnt_lock) == old.d_cnt_lock)
+ return 1;
+ cpu_relax();
+ /*
+ * Try one more time
+ */
+ old.d_cnt_lock = ACCESS_ONCE(dentry->d_cnt_lock);
+ if ((old.d_count > 1) && spin_can_lock(&old.d_lock)) {
+ new.d_cnt_lock = old.d_cnt_lock;
+ new.d_count--;
+ if (cmpxchg(&dentry->d_cnt_lock, old.d_cnt_lock,
+ new.d_cnt_lock) == old.d_cnt_lock)
+ return 1;
+ cpu_relax();
+ }
+ }
+ }
+ return 0;
+}
+
/* validate "insecure" dentry pointer */
extern int d_validate(struct dentry *, struct dentry *);

@@ -359,6 +455,8 @@ static inline struct dentry *dget_dlock(struct dentry *dentry)
static inline struct dentry *dget(struct dentry *dentry)
{
if (dentry) {
+ if (__dget_unless_zero_or_locked(dentry))
+ return dentry;
spin_lock(&dentry->d_lock);
dget_dlock(dentry);
spin_unlock(&dentry->d_lock);
--
1.7.1

2013-06-14 15:37:09

by Linus Torvalds

[permalink] [raw]
Subject: Re: [PATCH RFC ticketlock] Auto-queued ticketlock

On Fri, Jun 14, 2013 at 8:00 AM, Waiman Long <[email protected]> wrote:
> On 06/12/2013 08:59 PM, Linus Torvalds wrote:
>>
>> Ho humm.. interesting. I was talking about wanting to mix atomics and
>> spinlocks earlier in this thread due to space constraints, and it
>> strikes me that that would actually help this case a lot. Having the
>> dentry count mix d_lock and the count in one word would allow for
>> atomic ops like "increment if not locked", and we'd avoid this whole
>> race entirely..
>>
>> Something like "low bit of count is the lock bit" would end up being
>> lovely for this case. Of course, that's not how our spinlocks work ..
>>
>> Linus
>
>
> I have created another patch to do exactly the "increment if not locked"
> operation as suggested. It did help a lot. See the patch below for more
> information. Any additional comment will be appreciated.

Hmm. This is interesting and proves the concept, and the numbers look
very promising.

The patch is not mergeable, though, since it clearly depends on the
spinlock/d_count pair fitting in a u64, which is normally true but not
the case with debugging locks etc.; we'd need to generalize and fix the
whole concept of "refcount+lock".

Generalizing it might be a good idea anyway, since there are other
cases of "atomic_dec_and_lock()" etc behaviours where we might want to
have these kinds of extended lock+count shenanigans.

I also do wonder if we could perhaps fit both in 32-bits, and just not
use the "real" spinlocks at all, but use a bitlock in the low (or
high) bit of the refcount. We do that in some other places - we'd
potentially lose lockdep etc, and we'd lose some of the other good
parts of spinlocks (fairness yadda yadda), but *if* we can reduce
contention enough that it works out, maybe it would be worth it.
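
As a rough sketch of that 32-bit variant (invented names, userspace C11
atomics, and none of the fairness or lockdep properties of real spinlocks):
keep the lock bit in bit 0 and the count in the remaining bits, so that one
reference is +2 and the fast path only succeeds while the bit is clear.

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define CL_LOCK_BIT	0x1u	/* low bit doubles as the bitlock */
#define CL_COUNT_ONE	0x2u	/* one reference, kept in the upper bits */

/* Bump the refcount iff the bitlock is clear; 1 on success, 0 on failure. */
static int get_unless_locked(_Atomic uint32_t *w)
{
	uint32_t old = atomic_load(w);

	if (old & CL_LOCK_BIT)
		return 0;		/* locked: take the slow path instead */
	return atomic_compare_exchange_strong(w, &old, old + CL_COUNT_ONE);
}

int main(void)
{
	_Atomic uint32_t w = CL_COUNT_ONE;	/* one reference, unlocked */

	printf("fast get: %d, count now %u\n",
	       get_unless_locked(&w), (unsigned int)(atomic_load(&w) >> 1));
	return 0;
}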

So this doesn't look like 3.11 material, but the numbers certainly
make it look very promising, so with some more work on it ...

Linus

2013-06-14 18:17:26

by Waiman Long

[permalink] [raw]
Subject: Re: [PATCH RFC ticketlock] Auto-queued ticketlock

On 06/14/2013 11:37 AM, Linus Torvalds wrote:
> On Fri, Jun 14, 2013 at 8:00 AM, Waiman Long<[email protected]> wrote:
>> On 06/12/2013 08:59 PM, Linus Torvalds wrote:
>>> Ho humm.. interesting. I was talking about wanting to mix atomics and
>>> spinlocks earlier in this thread due to space constraints, and it
>>> strikes me that that would actually help this case a lot. Having the
>>> dentry count mix d_lock and the count in one word would allow for
>>> atomic ops like "increment if not locked", and we'd avoid this whole
>>> race entirely..
>>>
>>> Something like "low bit of count is the lock bit" would end up being
>>> lovely for this case. Of course, that's not how our spinlocks work ..
>>>
>>> Linus
>>
>> I have created another patch to do exactly the "increment if not locked"
>> operation as suggested. It did help a lot. See the patch below for more
>> information. Any additional comment will be appreciated.
> Hmm. This is interesting and proves the concept, and the numbers look
> very promising.
>
> The patch is not mergable, though, since it clearly depends on the
> spinlock/d_count fitting in a u64, which is normally true, but not the
> case of debugging locks etc, we'd need to generalize and fix the whole
> concept of "refcount+lock".

With some minor changes, the current patch can be modified to support
debugging locks on 32-bit systems. For 64-bit systems, we can apply a
similar concept to debugging locks with cmpxchg_double. However, an
architecture that does not have cmpxchg_double support will be out
of luck, and we probably couldn't support the same feature in debugging
mode; it would have to fall back to taking the lock.

I was thinking about generalizing the fix, but one issue that I was
aware of was that the d_lock member of dentry had more than 300
references throughout the filesystem code. A general fix will require
d_lock to be accessed in a different way. So it will be a pretty massive
patch touching quite a lot of files even though the changes will be
pretty straightforward in most cases.

> Generalizing it might be a good idea anyway, since there are other
> cases of "atomic_dec_and_lock()" etc behaviours where we might want to
> have these kinds of extended lock+count shenanigans.

The patch can certainly be generalized. I will see what I can do in a
week or two.

> I also do wonder if we could perhaps fit both in 32-bits, and just not
> use the "real" spinlocks at all, but use a bitlock in the low (or
> high) bit of the refcount. We do that in some other places - we'd
> potentially lose lockdep etc, and we'd lose some of the other good
> parts of spinlocks (fairness yadda yadda), but *if* we can reduce
> contention enough that it works out, maybe it would be worth it.

As the dentry is such an important data structure for the filesystem
layer, losing the fairness attribute and the ability to debug may be too
high a price to pay. For other niche cases, such a combined data type
can certainly be used.

> So this doesn't look like 3.11 material, but the numbers certainly
> make it look very promising, so with some more work on it ...
>
> Linus

When is the deadline for a stable patch to be considered for merging
into 3.11?

Regards,
Longman

2013-06-14 23:47:24

by Paul E. McKenney

[permalink] [raw]
Subject: Re: [PATCH RFC ticketlock] v3 Auto-queued ticketlock

On Fri, Jun 14, 2013 at 03:12:43PM +0800, Lai Jiangshan wrote:
> On 06/14/2013 07:57 AM, Paul E. McKenney wrote:
> > On Fri, Jun 14, 2013 at 07:25:57AM +0800, Lai Jiangshan wrote:
> >> On Thu, Jun 13, 2013 at 11:22 PM, Paul E. McKenney
> >> <[email protected]> wrote:
> >>> On Thu, Jun 13, 2013 at 10:55:41AM +0800, Lai Jiangshan wrote:
> >>>> On 06/12/2013 11:40 PM, Paul E. McKenney wrote:
> >>>>> Breaking up locks is better than implementing high-contention locks, but
> >>>>> if we must have high-contention locks, why not make them automatically
> >>>>> switch between light-weight ticket locks at low contention and queued
> >>>>> locks at high contention? After all, this would remove the need for
> >>>>> the developer to predict which locks will be highly contended.
> >>>>>
> >>>>> This commit allows ticket locks to automatically switch between pure
> >>>>> ticketlock and queued-lock operation as needed. If too many CPUs are
> >>>>> spinning on a given ticket lock, a queue structure will be allocated
> >>>>> and the lock will switch to queued-lock operation. When the lock becomes
> >>>>> free, it will switch back into ticketlock operation. The low-order bit
> >>>>> of the head counter is used to indicate that the lock is in queued mode,
> >>>>> which forces an unconditional mismatch between the head and tail counters.
> >>>>> This approach means that the common-case code path under conditions of
> >>>>> low contention is very nearly that of a plain ticket lock.
> >>>>>
> >>>>> A fixed number of queueing structures is statically allocated in an
> >>>>> array. The ticket-lock address is used to hash into an initial element,
> >>>>> but if that element is already in use, it moves to the next element. If
> >>>>> the entire array is already in use, continue to spin in ticket mode.
> >>>>>
> >>>>> Signed-off-by: Paul E. McKenney <[email protected]>
> >>>>> [ paulmck: Eliminate duplicate code and update comments (Steven Rostedt). ]
> >>>>> [ paulmck: Address Eric Dumazet review feedback. ]
> >>>>> [ paulmck: Use Lai Jiangshan idea to eliminate smp_mb(). ]
> >>>>> [ paulmck: Expand ->head_tkt from s32 to s64 (Waiman Long). ]
> >>>>> [ paulmck: Move cpu_relax() to main spin loop (Steven Rostedt). ]
> >>>>> [ paulmck: Reduce queue-switch contention (Waiman Long). ]
> >>>>> [ paulmck: __TKT_SPIN_INC for __ticket_spin_trylock() (Steffen Persvold). ]
> >>>>> [ paulmck: Type safety fixes (Steven Rostedt). ]
> >>>>> [ paulmck: Pre-check cmpxchg() value (Waiman Long). ]
> >>>>> [ paulmck: smp_mb() downgrade to smp_wmb() (Lai Jiangshan). ]
> >>>>
> >>>>
> >>>> Hi, Paul,
> >>>>
> >>>> I simplify the code and remove the search by encoding the index of struct tkt_q_head
> >>>> into lock->tickets.head.
> >>>>
> >>>> A) lock->tickets.head(when queued-lock):
> >>>> ---------------------------------
> >>>> index of struct tkt_q_head |0|1|
> >>>> ---------------------------------
> >>>
> >>> Interesting approach! It might reduce queued-mode overhead a bit in
> >>> some cases, though I bet that in the common case the first queue element
> >>> examined is the right one. More on this below.
> >>>
> >>>> The bit0 = 1 for queued, and the bit1 = 0 is reserved for __ticket_spin_unlock(),
> >>>> thus __ticket_spin_unlock() will not change the higher bits of lock->tickets.head.
> >>>>
> >>>> B) tqhp->head is for the real value of lock->tickets.head.
> >>>> if the last bit of tqhp->head is 1, it means it is the head ticket when started queuing.
> >>>
> >>> But don't you also need the xadd() in __ticket_spin_unlock() to become
> >>> a cmpxchg() for this to work? Or is your patch missing your changes to
> >>> arch/x86/include/asm/spinlock.h? Either way, this is likely to increase
> >>> the no-contention overhead, which might be counterproductive. Wouldn't
> >>> hurt to get measurements, though.
> >>
> >> No, don't need to change __ticket_spin_unlock() in my idea.
> >> bit1 in the tickets.head is reserved for __ticket_spin_unlock(),
> >> __ticket_spin_unlock() only changes the bit1, it will not change
> >> the higher bits. tkt_q_do_wake() will restore the tickets.head.
> >>
> >> This approach avoids cmpxchg in __ticket_spin_unlock().
> >
> > Ah, I did miss that. But doesn't the adjustment in __ticket_spin_lock()
> > need to be atomic in order to handle concurrent invocations of
> > __ticket_spin_lock()?
> >
> > Either way, it would be good to see the performance effects of this.
> >
> > Thanx, Paul
> >
> >>> Given the results that Davidlohr posted, I believe that the following
> >>> optimizations would also provide some improvement:
> >>>
> >>> 1. Move the call to tkt_spin_pass() from __ticket_spin_lock()
> >>> to a separate linker section in order to reduce the icache
> >>> penalty exacted by the spinloop. This is likely to be causing
> >>> some of the performance reductions in the cases where ticket
> >>> locks are not highly contended.
> >>>
> >>> 2. Limit the number of elements searched for in the array of
> >>> queues. However, this would help only if a number of ticket
> >>> locks were in queued mode at the same time.
> >>>
> >>> 3. Dynamically allocate the queue array at boot. This might
> >>> also reduce cache pressure, again, at least in cases where
> >>> there are a number of ticket locks in queued mode at the
> >>> same time.
> >>>
> >>> Frederic just reminded me that I owe him some energy-efficiency improvements
> >>> for adaptive ticks, so I won't get to these very quickly. Please feel free
> >>> to take these on -- the patch clearly does well under high contention, so
> >>> reducing the no-contention penalty could really help.
> >>>
> >>> Thanx, Paul
> >>>
> >>>> Thanks,
> >>>> Lai
> >>>>
> >>>> kernel/tktqlock.c | 51 +++++++++++++--------------------------------------
> >>>> 1 files changed, 13 insertions(+), 38 deletions(-)
> >>>>
> >>>> diff --git a/kernel/tktqlock.c b/kernel/tktqlock.c
> >>>> index 912817c..1329d0f 100644
> >>>> --- a/kernel/tktqlock.c
> >>>> +++ b/kernel/tktqlock.c
> >>>> @@ -33,7 +33,7 @@ struct tkt_q {
> >>>>
> >>>> struct tkt_q_head {
> >>>> arch_spinlock_t *ref; /* Pointer to spinlock if in use. */
> >>>> - s64 head_tkt; /* Head ticket when started queuing. */
> >>>> + __ticket_t head; /* Real head when queued. */
> >>>> struct tkt_q *spin; /* Head of queue. */
> >>>> struct tkt_q **spin_tail; /* Tail of queue. */
> >>>> };
> >>>> @@ -77,15 +77,8 @@ static unsigned long tkt_q_hash(arch_spinlock_t *lock)
> >>>> */
> >>>> static struct tkt_q_head *tkt_q_find_head(arch_spinlock_t *lock)
> >>>> {
> >>>> - int i;
> >>>> - int start;
> >>>> -
> >>>> - start = i = tkt_q_hash(lock);
> >>>> - do
> >>>> - if (ACCESS_ONCE(tkt_q_heads[i].ref) == lock)
> >>>> - return &tkt_q_heads[i];
> >>>> - while ((i = tkt_q_next_slot(i)) != start);
> >>>> - return NULL;
> >>>> + BUILD_BUG_ON(TKT_Q_NQUEUES > (1 << (TICKET_SHIFT - 2)));
> >>>> + return &tkt_q_heads[ACCESS_ONCE(lock->tickets.head) >> 2];
> >>>> }
> >>>>
> >>>> /*
> >>>> @@ -101,11 +94,11 @@ static bool tkt_q_try_unqueue(arch_spinlock_t *lock, struct tkt_q_head *tqhp)
> >>>>
> >>>> /* Pick up the ticket values. */
> >>>> asold = ACCESS_ONCE(*lock);
> >>>> - if ((asold.tickets.head & ~0x1) == asold.tickets.tail) {
> >>>> + if (tqhp->head == asold.tickets.tail) {
> >>>>
> >>>> /* Attempt to mark the lock as not having a queue. */
> >>>> asnew = asold;
> >>>> - asnew.tickets.head &= ~0x1;
> >>>> + asnew.tickets.head = tqhp->head;
> >>>> if (cmpxchg(&lock->head_tail,
> >>>> asold.head_tail,
> >>>> asnew.head_tail) == asold.head_tail) {
> >>>> @@ -128,12 +121,9 @@ void tkt_q_do_wake(arch_spinlock_t *lock)
> >>>> struct tkt_q_head *tqhp;
> >>>> struct tkt_q *tqp;
> >>>>
> >>>> - /*
> >>>> - * If the queue is still being set up, wait for it. Note that
> >>>> - * the caller's xadd() provides the needed memory ordering.
> >>>> - */
> >>>> - while ((tqhp = tkt_q_find_head(lock)) == NULL)
> >>>> - cpu_relax();
> >>>> + tqhp = tkt_q_find_head(lock);
> >>>> + ACCESS_ONCE(lock->tickets.head) -= __TKT_SPIN_INC;
> >>>> + ACCESS_ONCE(tqhp->head) = (tqhp->head & ~0x1) + __TKT_SPIN_INC;
> >>>>
> >>>> for (;;) {
> >>>>
> >>>> @@ -145,9 +135,7 @@ void tkt_q_do_wake(arch_spinlock_t *lock)
> >>>> return; /* No element, successfully removed queue. */
> >>>> cpu_relax();
> >>>> }
> >>>> - if (ACCESS_ONCE(tqhp->head_tkt) != -1)
> >>>> - ACCESS_ONCE(tqhp->head_tkt) = -1;
> >>>> - smp_mb(); /* Order pointer fetch and assignment against handoff. */
> >>>> + smp_mb(); /* Order modification, pointer fetch and assignment against handoff. */
> >>>> ACCESS_ONCE(tqp->cpu) = -1;
> >>>> }
> >>>> EXPORT_SYMBOL(tkt_q_do_wake);
> >>>> @@ -169,10 +157,7 @@ bool tkt_q_do_spin(arch_spinlock_t *lock, struct __raw_tickets inc)
> >>>> */
> >>>> smp_mb(); /* See above block comment. */
> >>>>
> >>>> - /* If there no longer is a queue, leave. */
> >>>> tqhp = tkt_q_find_head(lock);
> >>>> - if (tqhp == NULL)
> >>>> - return false;
> >>>>
> >>>> /* Initialize our queue element. */
> >>>> tq.cpu = raw_smp_processor_id();
> >>>> @@ -180,9 +165,8 @@ bool tkt_q_do_spin(arch_spinlock_t *lock, struct __raw_tickets inc)
> >>>> tq.next = NULL;
> >>>>
> >>>> /* Check to see if we already hold the lock. */
> >>>> - if (ACCESS_ONCE(tqhp->head_tkt) == inc.tail) {
> >>>> + if (ACCESS_ONCE(tqhp->head) == (inc.tail | 0x1)) {
> >>>> /* The last holder left before queue formed, we hold lock. */
> >>>> - tqhp->head_tkt = -1;
> >>>> return true;
> >>>> }
> >>>>
> >>>> @@ -290,16 +274,14 @@ tkt_q_init_contend(int i, arch_spinlock_t *lock, struct __raw_tickets inc)
> >>>> * Record the head counter in case one of the spinning
> >>>> * CPUs already holds the lock but doesn't realize it yet.
> >>>> */
> >>>> - tqhp->head_tkt = asold.tickets.head;
> >>>> + tqhp->head = asold.tickets.head | 0x1;
> >>>>
> >>>> /* The low-order bit in the head counter says "queued". */
> >>>> - asnew.tickets.head |= 0x1;
> >>>> + asnew.tickets.head = (i << 2) + 0x1;
> >>>> } while (cmpxchg(&lock->head_tail,
> >>>> asold.head_tail,
> >>>> asnew.head_tail) != asold.head_tail);
> >>>>
> >>>> - /* Point the queue at the lock and go spin on it. */
> >>>> - ACCESS_ONCE(tqhp->ref) = lock;
> >>>> return tkt_q_do_spin(lock, inc);
> >>>> }
> >>>>
> >>>> @@ -321,15 +303,8 @@ bool tkt_q_start_contend(arch_spinlock_t *lock, struct __raw_tickets inc)
> >>>> * the lock with the corresponding queue.
> >>>> */
> >>>> do {
> >>>> - /*
> >>>> - * Use 0x1 to mark the queue in use, but also avoiding
> >>>> - * any spinners trying to use it before we get it all
> >>>> - * initialized.
> >>>> - */
> >>>> if (!tkt_q_heads[i].ref &&
> >>>> - cmpxchg(&tkt_q_heads[i].ref,
> >>>> - NULL,
> >>>> - (arch_spinlock_t *)0x1) == NULL) {
> >>>> + cmpxchg(&tkt_q_heads[i].ref, NULL, lock) == NULL) {
> >>>>
> >>>> /* Succeeded, now go initialize it. */
> >>>> return tkt_q_init_contend(i, lock, inc);
> >>>>
> >>>
>
> Hi, Paul.
>
> Another possible improvement. Again, this is untested.
>
> This improvement removes the slow path from unlock() by:
> 1) Instead of forcing every competitor to spin on its own queued node,
>    it selects one and only one competitor and keeps it spinning on the lock itself.
> 2) The selected competitor is the leader of the queue.
> 3) The queue is used to pass the leadership rather than to hand off the lock.
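>
> In tkt_q_do_spin(), the flow then becomes roughly the following
> (a condensed, untested sketch of the patch below; memory barriers
> omitted for brevity):
>
> 	/* Non-leaders wait for the leadership, not for the lock itself. */
> 	if (oldtail != &tqhp->spin)
> 		while (ACCESS_ONCE(tq.cpu) != -1)
> 			cpu_relax();
>
> 	/* The leader spins on the ticket head for the plain unlock increment. */
> 	while (ACCESS_ONCE(lock->tickets.head) != inc.head + __TKT_SPIN_INC)
> 		cpu_relax();
>
> 	/* We now hold the lock: restore the queued-mode head encoding. */
> 	ACCESS_ONCE(lock->tickets.head) = inc.head;
> 	ACCESS_ONCE(tqhp->head) = (tqhp->head & ~0x1) + __TKT_SPIN_INC;
>
> 	/* On the way out, pass the queue leadership to the next element. */
> 	ACCESS_ONCE(tq.next->cpu) = -1;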
>
> Implemented on top of my previous improvement.
> If you agree with this improvement, would you merge them all as one patch to get more reviews?

This one does have the advantage of reducing the non-queued overhead by
allowing the original __ticket_spin_unlock() to be used as-is, which is
good. After all, the biggest weakness in my earlier versions was not the
handoff, but rather the performance regressions when in non-queued mode.
Of course, the performance regressions would need to be negligible for
this to have any chance of inclusion. My guess is that this patch gets
things half-way there, but that the __ticket_spin_lock() path needs
some help (for example, moving the queued-lock handling out of line)
to get something that has a chance.
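
For example (a completely untested sketch; the function name and its
noinline/__cold attributes are illustrative only):

	/* Contended case lives out of line, keeping the inline fast path small. */
	extern void __ticket_spin_lock_slow(arch_spinlock_t *lock,
					    struct __raw_tickets inc); /* noinline, __cold */

	static __always_inline void __ticket_spin_lock(arch_spinlock_t *lock)
	{
		register struct __raw_tickets inc = { .tail = 2 };

		inc = xadd(&lock->tickets, inc);
		if (likely(inc.head == inc.tail)) {
			barrier();	/* uncontended: plain ticket-lock cost */
			return;
		}
		__ticket_spin_lock_slow(lock, inc);	/* spin loop + tkt_spin_pass() */
	}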

If you test them both for correct function, I will post them as separate
commits on a qlock branch in my -rcu tree. I do have a user-mode test
harness that might help, though of course in-kernel testing is needed.

> Thanks,
> Lai
>
> PS:
>
> After this, we can shrink the size of struct tkt_q_head.
> Is this size important?

Not hugely, but every bit helps.

> struct tkt_q_head {
> __ticket_t head; /* Real head when queued. */
> struct tkt_q **spin_tail; /* Tail of queue. */
> };
>
> And "__ticket_t head;" can be also removed.
>
>
> diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h
> index 5aa0177..01c3bdd 100644
> --- a/arch/x86/include/asm/spinlock.h
> +++ b/arch/x86/include/asm/spinlock.h
> @@ -90,27 +90,11 @@ static __always_inline int __ticket_spin_trylock(arch_spinlock_t *lock)
> return cmpxchg(&lock->head_tail, old.head_tail, new.head_tail) == old.head_tail;
> }
>
> -#ifndef CONFIG_TICKET_LOCK_QUEUED
> -
> static __always_inline void __ticket_spin_unlock(arch_spinlock_t *lock)
> {
> __add(&lock->tickets.head, 1, UNLOCK_LOCK_PREFIX);
> }
>
> -#else /* #ifndef CONFIG_TICKET_LOCK_QUEUED */
> -
> -extern void tkt_q_do_wake(arch_spinlock_t *lock);
> -
> -static __always_inline void __ticket_spin_unlock(arch_spinlock_t *lock)
> -{
> - __ticket_t head = 2;
> -
> - head = xadd(&lock->tickets.head, head);
> - if (head & 0x1)
> - tkt_q_do_wake(lock);
> -}
> -#endif /* #else #ifndef CONFIG_TICKET_LOCK_QUEUED */
> -
> static inline int __ticket_spin_is_locked(arch_spinlock_t *lock)
> {
> struct __raw_tickets tmp = ACCESS_ONCE(lock->tickets);
> diff --git a/kernel/tktqlock.c b/kernel/tktqlock.c
> index 1329d0f..b658fae 100644
> --- a/kernel/tktqlock.c
> +++ b/kernel/tktqlock.c
> @@ -94,7 +94,7 @@ static bool tkt_q_try_unqueue(arch_spinlock_t *lock, struct tkt_q_head *tqhp)
>
> /* Pick up the ticket values. */
> asold = ACCESS_ONCE(*lock);
> - if (tqhp->head == asold.tickets.tail) {
> + if (tqhp->head + __TKT_SPIN_INC == asold.tickets.tail) {
>
> /* Attempt to mark the lock as not having a queue. */
> asnew = asold;
> @@ -114,33 +114,6 @@ static bool tkt_q_try_unqueue(arch_spinlock_t *lock, struct tkt_q_head *tqhp)
> }
>
> /*
> - * Hand the lock off to the first CPU on the queue.
> - */
> -void tkt_q_do_wake(arch_spinlock_t *lock)
> -{
> - struct tkt_q_head *tqhp;
> - struct tkt_q *tqp;
> -
> - tqhp = tkt_q_find_head(lock);
> - ACCESS_ONCE(lock->tickets.head) -= __TKT_SPIN_INC;
> - ACCESS_ONCE(tqhp->head) = (tqhp->head & ~0x1) + __TKT_SPIN_INC;
> -
> - for (;;) {
> -
> - /* Find the first queue element. */
> - tqp = ACCESS_ONCE(tqhp->spin);
> - if (tqp != NULL)
> - break; /* Element exists, hand off lock. */
> - if (tkt_q_try_unqueue(lock, tqhp))
> - return; /* No element, successfully removed queue. */
> - cpu_relax();
> - }
> - smp_mb(); /* Order modification, pointer fetch and assignment against handoff. */
> - ACCESS_ONCE(tqp->cpu) = -1;
> -}
> -EXPORT_SYMBOL(tkt_q_do_wake);
> -
> -/*
> * Given a lock that already has a queue associated with it, spin on
> * that queue. Return false if there was no queue (which means we do not
> * hold the lock) and true otherwise (meaning we -do- hold the lock).
> @@ -150,6 +123,7 @@ bool tkt_q_do_spin(arch_spinlock_t *lock, struct __raw_tickets inc)
> struct tkt_q **oldtail;
> struct tkt_q tq;
> struct tkt_q_head *tqhp;
> + int index;
>
> /*
> * Ensure that accesses to queue header happen after sensing
> @@ -157,6 +131,7 @@ bool tkt_q_do_spin(arch_spinlock_t *lock, struct __raw_tickets inc)
> */
> smp_mb(); /* See above block comment. */
>
> + index = ACCESS_ONCE(lock->tickets.head) >> 2;
> tqhp = tkt_q_find_head(lock);
>
> /* Initialize our queue element. */
> @@ -178,10 +153,29 @@ bool tkt_q_do_spin(arch_spinlock_t *lock, struct __raw_tickets inc)
> oldtail = xchg(&tqhp->spin_tail, &tq.next);
> ACCESS_ONCE(*oldtail) = &tq;
>
> - /* Spin until handoff. */
> - while (ACCESS_ONCE(tq.cpu) != -1)
> + if (oldtail != &tqhp->spin) {
> + /* Spin until get the queue leadership */
> + while (ACCESS_ONCE(tq.cpu) != -1)
> + cpu_relax();
> + smp_mb(); /* Force ordering between get leadership and access lock->tickets.head */
> + }
> +
> + /*
> + * Spin until we hold the lock. If the smp_mb() below does not
> + * suffice, this should be implemented in an arch-dependent way.
> + */
> + inc.head = index * __TKT_SPIN_INC * 2 + 1;
> + while (ACCESS_ONCE(lock->tickets.head) != inc.head + __TKT_SPIN_INC)
> cpu_relax();

So this detects the atomic increment and then undoes it below, correct?

> + smp_mb(); /* Force ordering between (prev C.S. & lock->tickets.head)
> + and (current C.S. & tqhp->head & hand off) */
> +
> + /* store queued-lock tickets head */
> + ACCESS_ONCE(lock->tickets.head) = inc.head;
> + /* update real tickets head */
> + ACCESS_ONCE(tqhp->head) = (tqhp->head & ~0x1) + __TKT_SPIN_INC;
> +
> /*
> * Remove our element from the queue. If the queue is now empty,
> * update carefully so that the next acquisition will enqueue itself
> @@ -217,8 +211,10 @@ bool tkt_q_do_spin(arch_spinlock_t *lock, struct __raw_tickets inc)
> /* Try to point the tail back at the head. */
> if (cmpxchg(&tqhp->spin_tail,
> &tq.next,
> - &tqhp->spin) == &tq.next)
> + &tqhp->spin) == &tq.next) {
> + tkt_q_try_unqueue(lock, tqhp);
> return true; /* Succeeded, queue is now empty. */
> + }
>
> /* Failed, if needed, wait for the enqueue to complete. */
> while (tq.next == NULL)
> @@ -226,14 +222,13 @@ bool tkt_q_do_spin(arch_spinlock_t *lock, struct __raw_tickets inc)
>
> /* The following code will repair the head. */
> }
> - smp_mb(); /* Force ordering between handoff and critical section. */
>
> /*
> - * Advance list-head pointer. This same task will be the next to
> - * access this when releasing the lock, so no need for a memory
> - * barrier after the following assignment.
> + * Advance list-head pointer. tqhp->spin is useless, it can be removed.
> */
> ACCESS_ONCE(tqhp->spin) = tq.next;
> + ACCESS_ONCE(tq.next->cpu) = -1; /* hand off queue leadership */
> +
> return true;
> }
>
> @@ -277,7 +272,7 @@ tkt_q_init_contend(int i, arch_spinlock_t *lock, struct __raw_tickets inc)
> tqhp->head = asold.tickets.head | 0x1;
>
> /* The low-order bit in the head counter says "queued". */
> - asnew.tickets.head = (i << 2) + 0x1;
> + asnew.tickets.head = i * __TKT_SPIN_INC * 2 + 0x1;
> } while (cmpxchg(&lock->head_tail,
> asold.head_tail,
> asnew.head_tail) != asold.head_tail);
>

2013-06-14 23:49:55

by Paul E. McKenney

[permalink] [raw]
Subject: Re: [PATCH RFC ticketlock] v3 Auto-queued ticketlock

On Fri, Jun 14, 2013 at 09:28:16AM +0800, Lai Jiangshan wrote:
> On 06/14/2013 07:57 AM, Paul E. McKenney wrote:
> > On Fri, Jun 14, 2013 at 07:25:57AM +0800, Lai Jiangshan wrote:
> >> On Thu, Jun 13, 2013 at 11:22 PM, Paul E. McKenney
> >> <[email protected]> wrote:
> >>> On Thu, Jun 13, 2013 at 10:55:41AM +0800, Lai Jiangshan wrote:
> >>>> On 06/12/2013 11:40 PM, Paul E. McKenney wrote:
> >>>>> Breaking up locks is better than implementing high-contention locks, but
> >>>>> if we must have high-contention locks, why not make them automatically
> >>>>> switch between light-weight ticket locks at low contention and queued
> >>>>> locks at high contention? After all, this would remove the need for
> >>>>> the developer to predict which locks will be highly contended.
> >>>>>
> >>>>> This commit allows ticket locks to automatically switch between pure
> >>>>> ticketlock and queued-lock operation as needed. If too many CPUs are
> >>>>> spinning on a given ticket lock, a queue structure will be allocated
> >>>>> and the lock will switch to queued-lock operation. When the lock becomes
> >>>>> free, it will switch back into ticketlock operation. The low-order bit
> >>>>> of the head counter is used to indicate that the lock is in queued mode,
> >>>>> which forces an unconditional mismatch between the head and tail counters.
> >>>>> This approach means that the common-case code path under conditions of
> >>>>> low contention is very nearly that of a plain ticket lock.
> >>>>>
> >>>>> A fixed number of queueing structures is statically allocated in an
> >>>>> array. The ticket-lock address is used to hash into an initial element,
> >>>>> but if that element is already in use, it moves to the next element. If
> >>>>> the entire array is already in use, continue to spin in ticket mode.
> >>>>>
> >>>>> Signed-off-by: Paul E. McKenney <[email protected]>
> >>>>> [ paulmck: Eliminate duplicate code and update comments (Steven Rostedt). ]
> >>>>> [ paulmck: Address Eric Dumazet review feedback. ]
> >>>>> [ paulmck: Use Lai Jiangshan idea to eliminate smp_mb(). ]
> >>>>> [ paulmck: Expand ->head_tkt from s32 to s64 (Waiman Long). ]
> >>>>> [ paulmck: Move cpu_relax() to main spin loop (Steven Rostedt). ]
> >>>>> [ paulmck: Reduce queue-switch contention (Waiman Long). ]
> >>>>> [ paulmck: __TKT_SPIN_INC for __ticket_spin_trylock() (Steffen Persvold). ]
> >>>>> [ paulmck: Type safety fixes (Steven Rostedt). ]
> >>>>> [ paulmck: Pre-check cmpxchg() value (Waiman Long). ]
> >>>>> [ paulmck: smp_mb() downgrade to smp_wmb() (Lai Jiangshan). ]
> >>>>
> >>>>
> >>>> Hi, Paul,
> >>>>
> >>>> I simplify the code and remove the search by encoding the index of struct tkt_q_head
> >>>> into lock->tickets.head.
> >>>>
> >>>> A) lock->tickets.head(when queued-lock):
> >>>> ---------------------------------
> >>>> index of struct tkt_q_head |0|1|
> >>>> ---------------------------------
> >>>
> >>> Interesting approach! It might reduce queued-mode overhead a bit in
> >>> some cases, though I bet that in the common case the first queue element
> >>> examined is the right one. More on this below.
> >>>
> >>>> Bit0 = 1 means queued, and bit1 = 0 is reserved for __ticket_spin_unlock(),
> >>>> so __ticket_spin_unlock() will not change the higher bits of lock->tickets.head.
> >>>>
> >>>> B) tqhp->head holds the real value of lock->tickets.head.
> >>>> If the low-order bit of tqhp->head is 1, it records the head ticket from when queueing started.
> >>>
> >>> But don't you also need the xadd() in __ticket_spin_unlock() to become
> >>> a cmpxchg() for this to work? Or is your patch missing your changes to
> >>> arch/x86/include/asm/spinlock.h? Either way, this is likely to increase
> >>> the no-contention overhead, which might be counterproductive. Wouldn't
> >>> hurt to get measurements, though.
> >>
> >> No, don't need to change __ticket_spin_unlock() in my idea.
> >> bit1 in the tickets.head is reserved for __ticket_spin_unlock(),
> >> __ticket_spin_unlock() only changes the bit1, it will not change
> >> the higher bits. tkt_q_do_wake() will restore the tickets.head.
> >>
> >> This approach avoids cmpxchg in __ticket_spin_unlock().
> >
> > Ah, I did miss that. But doesn't the adjustment in __ticket_spin_lock()
> > need to be atomic in order to handle concurrent invocations of
> > __ticket_spin_lock()?
>
> I don't understand; are we discussing only __ticket_spin_unlock() here?
> Or does my suggestion hurt __ticket_spin_lock()?

On many architectures, it is harmless. But my concern is that
__ticket_spin_lock() is atomically incrementing the full value
(both head and tail), but in such a way as to never change the
value of head. So in theory, a concurrent non-atomic store to
head should be OK, but it does make me quite nervous.

At the very least, it needs a comment saying why it is safe.
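
Maybe something like the following, wording only, and assuming that I
understand the intent correctly:

	/*
	 * Plain (non-atomic) store to ->head.  This races only with the
	 * locked xadd() in __ticket_spin_lock(), which atomically updates
	 * the combined head_tail word but never changes the value of
	 * ->head, and with loads of ->head, so the stored value cannot
	 * be lost or torn.
	 */
	ACCESS_ONCE(lock->tickets.head) = inc.head;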

Thanx, Paul

> > Either way, it would be good to see the performance effects of this.
> >
> > Thanx, Paul
>

2013-06-15 01:28:52

by Benjamin Herrenschmidt

[permalink] [raw]
Subject: Re: [PATCH RFC ticketlock] Auto-queued ticketlock

On Fri, 2013-06-14 at 14:17 -0400, Waiman Long wrote:
>
> With some minor changes, the current patch can be modified to support
> the debugging lock on 32-bit systems. For 64-bit systems, we can apply a
> similar concept to the debugging lock with cmpxchg_double. However, an
> architecture that does not have cmpxchg_double support will be out
> of luck, and we probably couldn't support the same feature in debugging
> mode. It would have to fall back to taking the lock.

That means only x86_64 and s390 would benefit from it ... I'm sure we can do better :-)

Cheers,
Ben.

2013-06-15 03:37:01

by Waiman Long

[permalink] [raw]
Subject: Re: [PATCH RFC ticketlock] Auto-queued ticketlock

On 06/14/2013 09:26 PM, Benjamin Herrenschmidt wrote:
> On Fri, 2013-06-14 at 14:17 -0400, Waiman Long wrote:
>> With some minor changes, the current patch can be modified to support
>> the debugging lock on 32-bit systems. For 64-bit systems, we can apply a
>> similar concept to the debugging lock with cmpxchg_double. However, an
>> architecture that does not have cmpxchg_double support will be out
>> of luck, and we probably couldn't support the same feature in debugging
>> mode. It would have to fall back to taking the lock.
> That means only x86_64 and s390 would benefit from it ... I'm sure we can do better :-)
>
> Cheers,
> Ben.

On second thought, using cmpxchg_double may not be such a good idea
after all, as it requires 16-byte alignment, at least on x86-64.
Another possible alternative is to integrate the reference count
directly into the spinlock_t data structure, immediately after
arch_spinlock_t, for this special case. If CONFIG_GENERIC_LOCKBREAK is
not defined, there is a 4-byte hole that can be used. Otherwise,
the spinlock_t structure will grow by 8 bytes. I suppose
that others won't be too upset about an 8-byte size increase when
spinlock debugging is turned on.
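
To illustrate where the count could go (a sketch only -- the field name
is made up, and the exact layout in spinlock_types.h may differ):

	typedef struct raw_spinlock {
		arch_spinlock_t raw_lock;
	#ifdef CONFIG_DEBUG_SPINLOCK
		unsigned int tkt_q_ref;		/* hypothetical reference count; fills
						 * the 4-byte hole when
						 * CONFIG_GENERIC_LOCKBREAK is off */
	#endif
	#ifdef CONFIG_GENERIC_LOCKBREAK
		unsigned int break_lock;
	#endif
	#ifdef CONFIG_DEBUG_SPINLOCK
		unsigned int magic, owner_cpu;
		void *owner;
	#endif
	#ifdef CONFIG_DEBUG_LOCK_ALLOC
		struct lockdep_map dep_map;
	#endif
	} raw_spinlock_t;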

Regards,
Longman

2013-07-01 09:19:36

by Raghavendra KT

[permalink] [raw]
Subject: Re: [PATCH RFC ticketlock] v3 Auto-queued ticketlock

On Sun, Jun 23, 2013 at 11:23 PM, Raghavendra KT
<[email protected]> wrote:
>
>
> On Wed, Jun 12, 2013 at 9:10 PM, Paul E. McKenney
> <[email protected]> wrote:
>>
>> Breaking up locks is better than implementing high-contention locks, but
>> if we must have high-contention locks, why not make them automatically
>> switch between light-weight ticket locks at low contention and queued
>> locks at high contention? After all, this would remove the need for
>> the developer to predict which locks will be highly contended.
>>
>> This commit allows ticket locks to automatically switch between pure
>> ticketlock and queued-lock operation as needed. If too many CPUs are
>> spinning on a given ticket lock, a queue structure will be allocated
>> and the lock will switch to queued-lock operation. When the lock becomes
>> free, it will switch back into ticketlock operation. The low-order bit
>> of the head counter is used to indicate that the lock is in queued mode,
>> which forces an unconditional mismatch between the head and tail counters.
>> This approach means that the common-case code path under conditions of
>> low contention is very nearly that of a plain ticket lock.
>>
>> A fixed number of queueing structures is statically allocated in an
>> array. The ticket-lock address is used to hash into an initial element,
>> but if that element is already in use, it moves to the next element. If
>> the entire array is already in use, continue to spin in ticket mode.
>>
>> Signed-off-by: Paul E. McKenney <[email protected]>
>> [ paulmck: Eliminate duplicate code and update comments (Steven Rostedt).
>> ]
>> [ paulmck: Address Eric Dumazet review feedback. ]
>> [ paulmck: Use Lai Jiangshan idea to eliminate smp_mb(). ]
>> [ paulmck: Expand ->head_tkt from s32 to s64 (Waiman Long). ]
>> [ paulmck: Move cpu_relax() to main spin loop (Steven Rostedt). ]
>> [ paulmck: Reduce queue-switch contention (Waiman Long). ]
>> [ paulmck: __TKT_SPIN_INC for __ticket_spin_trylock() (Steffen Persvold).
>> ]
>> [ paulmck: Type safety fixes (Steven Rostedt). ]
>> [ paulmck: Pre-check cmpxchg() value (Waiman Long). ]
>> [ paulmck: smp_mb() downgrade to smp_wmb() (Lai Jiangshan). ]
>>
> [...]
>
> I did test this on 32 core machine with 32 vcpu guests.
>
> This version gave me around 20% improvement for sysbench and 36% improvement
> for ebizzy in the 1x case, though other overcommitted results showed
> degradation. I have not tested Lai Jiangshan's patches on top of this yet.
> Will report any findings.

Sorry for the late report.

With Lai's patch I see a few percent improvement in ebizzy 1x and a
reduction in the degradation in dbench 1x.

But the overcommit degradation still seems to persist. Seeing this, I
feel it is more the queued-mode overhead itself that hurts large guests.

+----+-------------+-----------+-------------+-----------+---------------+
             ebizzy (rec/sec, higher is better)
+----+-------------+-----------+-------------+-----------+---------------+
        base          stdev       patched       stdev      %improvement
+----+-------------+-----------+-------------+-----------+---------------+
 1x     5574.9000     237.4997     7851.9000    148.6737      40.84378
 2x     2741.5000     561.3090     1620.9000    410.8299     -40.87543
 3x     2146.2500     216.7718     1751.8333     96.5023     -18.37702
+----+-------------+-----------+-------------+-----------+---------------+

+----+-------------+-----------+-------------+-----------+---------------+
             dbench (throughput, higher is better)
+----+-------------+-----------+-------------+-----------+---------------+
        base          stdev       patched       stdev      %improvement
+----+-------------+-----------+-------------+-----------+---------------+
 1x    14111.5600     754.4525    13826.5700   1458.0744      -2.01955
 2x     2481.6270      71.2665     1549.3740    245.3777     -37.56620
 3x     1510.2483      31.8634     1116.0158     26.4882     -26.10382
+----+-------------+-----------+-------------+-----------+---------------+

2013-07-02 05:57:16

by Paul E. McKenney

[permalink] [raw]
Subject: Re: [PATCH RFC ticketlock] v3 Auto-queued ticketlock

On Mon, Jul 01, 2013 at 02:49:34PM +0530, Raghavendra KT wrote:
> On Sun, Jun 23, 2013 at 11:23 PM, Raghavendra KT
> <[email protected]> wrote:
> >
> >
> > On Wed, Jun 12, 2013 at 9:10 PM, Paul E. McKenney
> > <[email protected]> wrote:
> >>
> >> Breaking up locks is better than implementing high-contention locks, but
> >> if we must have high-contention locks, why not make them automatically
> >> switch between light-weight ticket locks at low contention and queued
> >> locks at high contention? After all, this would remove the need for
> >> the developer to predict which locks will be highly contended.
> >>
> >> This commit allows ticket locks to automatically switch between pure
> >> ticketlock and queued-lock operation as needed. If too many CPUs are
> >> spinning on a given ticket lock, a queue structure will be allocated
> >> and the lock will switch to queued-lock operation. When the lock becomes
> >> free, it will switch back into ticketlock operation. The low-order bit
> >> of the head counter is used to indicate that the lock is in queued mode,
> >> which forces an unconditional mismatch between the head and tail counters.
> >> This approach means that the common-case code path under conditions of
> >> low contention is very nearly that of a plain ticket lock.
> >>
> >> A fixed number of queueing structures is statically allocated in an
> >> array. The ticket-lock address is used to hash into an initial element,
> >> but if that element is already in use, it moves to the next element. If
> >> the entire array is already in use, continue to spin in ticket mode.
> >>
> >> Signed-off-by: Paul E. McKenney <[email protected]>
> >> [ paulmck: Eliminate duplicate code and update comments (Steven Rostedt).
> >> ]
> >> [ paulmck: Address Eric Dumazet review feedback. ]
> >> [ paulmck: Use Lai Jiangshan idea to eliminate smp_mb(). ]
> >> [ paulmck: Expand ->head_tkt from s32 to s64 (Waiman Long). ]
> >> [ paulmck: Move cpu_relax() to main spin loop (Steven Rostedt). ]
> >> [ paulmck: Reduce queue-switch contention (Waiman Long). ]
> >> [ paulmck: __TKT_SPIN_INC for __ticket_spin_trylock() (Steffen Persvold).
> >> ]
> >> [ paulmck: Type safety fixes (Steven Rostedt). ]
> >> [ paulmck: Pre-check cmpxchg() value (Waiman Long). ]
> >> [ paulmck: smp_mb() downgrade to smp_wmb() (Lai Jiangshan). ]
> >>
> > [...]
> >
> > I did test this on 32 core machine with 32 vcpu guests.
> >
> > This version gave me around 20% improvement for sysbench and 36% improvement
> > for ebizzy in the 1x case, though other overcommitted results showed
> > degradation. I have not tested Lai Jiangshan's patches on top of this yet.
> > Will report any findings.
>
> Sorry for the late report.

Not a problem, thank you for running these numbers!

> With Lai's patch I see a few percent improvement in ebizzy 1x and a
> reduction in the degradation in dbench 1x.

OK, good! But my guess is that even after pushing the lock-acquisition
slowpath out of line, we still would not reach parity for the less-good
results. It still seems like I should add Lai Jiangshan's patches
and post them somewhere in case they are helpful in some other context.

Thanx, Paul

> But the overcommit degradation still seems to persist. Seeing this, I
> feel it is more the queued-mode overhead itself that hurts large guests.
>
> +----+-------------+-----------+-------------+-----------+---------------+
>              ebizzy (rec/sec, higher is better)
> +----+-------------+-----------+-------------+-----------+---------------+
>         base          stdev       patched       stdev      %improvement
> +----+-------------+-----------+-------------+-----------+---------------+
>  1x     5574.9000     237.4997     7851.9000    148.6737      40.84378
>  2x     2741.5000     561.3090     1620.9000    410.8299     -40.87543
>  3x     2146.2500     216.7718     1751.8333     96.5023     -18.37702
> +----+-------------+-----------+-------------+-----------+---------------+
>
> +----+-------------+-----------+-------------+-----------+---------------+
>              dbench (throughput, higher is better)
> +----+-------------+-----------+-------------+-----------+---------------+
>         base          stdev       patched       stdev      %improvement
> +----+-------------+-----------+-------------+-----------+---------------+
>  1x    14111.5600     754.4525    13826.5700   1458.0744      -2.01955
>  2x     2481.6270      71.2665     1549.3740    245.3777     -37.56620
>  3x     1510.2483      31.8634     1116.0158     26.4882     -26.10382
> +----+-------------+-----------+-------------+-----------+---------------+
>