2023-05-09 17:00:36

by Kent Overstreet

Subject: [PATCH 04/32] locking: SIX locks (shared/intent/exclusive)

From: Kent Overstreet <[email protected]>

New lock for bcachefs, like read/write locks but with a third state,
intent.

Intent locks conflict with each other, but not with read locks; taking a
write lock requires first holding an intent lock.
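
For illustration only ("foo" being any object embedding a struct six_lock;
a NULL should_sleep_fn simply blocks until the lock is granted), a typical
update nests the locks like so:

    six_lock_intent(&foo->lock, NULL, NULL); /* other intent/write holders excluded */
    /* readers may still take six_lock_read() at this point */
    six_lock_write(&foo->lock, NULL, NULL);  /* needs the intent lock; excludes readers */
    /* ... modify ... */
    six_unlock_write(&foo->lock);
    six_unlock_intent(&foo->lock);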

Signed-off-by: Kent Overstreet <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: Will Deacon <[email protected]>
Cc: Waiman Long <[email protected]>
Cc: Boqun Feng <[email protected]>
---
include/linux/six.h | 210 +++++++++++
kernel/Kconfig.locks | 3 +
kernel/locking/Makefile | 1 +
kernel/locking/six.c | 779 ++++++++++++++++++++++++++++++++++++++++
4 files changed, 993 insertions(+)
create mode 100644 include/linux/six.h
create mode 100644 kernel/locking/six.c

diff --git a/include/linux/six.h b/include/linux/six.h
new file mode 100644
index 0000000000..41ddf63b74
--- /dev/null
+++ b/include/linux/six.h
@@ -0,0 +1,210 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef _LINUX_SIX_H
+#define _LINUX_SIX_H
+
+/*
+ * Shared/intent/exclusive locks: sleepable read/write locks, much like rw
+ * semaphores, except with a third intermediate state, intent. Basic operations
+ * are:
+ *
+ * six_lock_read(&foo->lock);
+ * six_unlock_read(&foo->lock);
+ *
+ * six_lock_intent(&foo->lock);
+ * six_unlock_intent(&foo->lock);
+ *
+ * six_lock_write(&foo->lock);
+ * six_unlock_write(&foo->lock);
+ *
+ * Intent locks block other intent locks, but do not block read locks, and you
+ * must have an intent lock held before taking a write lock, like so:
+ *
+ * six_lock_intent(&foo->lock);
+ * six_lock_write(&foo->lock);
+ * six_unlock_write(&foo->lock);
+ * six_unlock_intent(&foo->lock);
+ *
+ * Other operations:
+ *
+ * six_trylock_read()
+ * six_trylock_intent()
+ * six_trylock_write()
+ *
+ * six_lock_downgrade(): convert from intent to read
+ * six_lock_tryupgrade(): attempt to convert from read to intent
+ *
+ * Locks also embed a sequence number, which is incremented when the lock is
+ * locked or unlocked for write. The current sequence number can be grabbed
+ * while a lock is held from lock->state.seq; then, if you drop the lock you can
+ * use six_relock_(read|intent_write)(lock, seq) to attempt to retake the lock
+ * iff it hasn't been locked for write in the meantime.
+ *
+ * There are also operations that take the lock type as a parameter, where the
+ * type is one of SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write:
+ *
+ * six_lock_type(lock, type)
+ * six_unlock_type(lock, type)
+ * six_relock(lock, type, seq)
+ * six_trylock_type(lock, type)
+ * six_trylock_convert(lock, from, to)
+ *
+ * A lock may be held multiple types by the same thread (for read or intent,
+ * not write). However, the six locks code does _not_ implement the actual
+ * recursive checks itself though - rather, if your code (e.g. btree iterator
+ * code) knows that the current thread already has a lock held, and for the
+ * correct type, six_lock_increment() may be used to bump up the counter for
+ * that type - the only effect is that one more call to unlock will be required
+ * before the lock is unlocked.
+ */
+
+#include <linux/lockdep.h>
+#include <linux/osq_lock.h>
+#include <linux/sched.h>
+#include <linux/types.h>
+
+#define SIX_LOCK_SEPARATE_LOCKFNS
+
+union six_lock_state {
+ struct {
+ atomic64_t counter;
+ };
+
+ struct {
+ u64 v;
+ };
+
+ struct {
+ /* for waitlist_bitnr() */
+ unsigned long l;
+ };
+
+ struct {
+ unsigned read_lock:27;
+ unsigned write_locking:1;
+ unsigned intent_lock:1;
+ unsigned waiters:3;
+ /*
+ * seq works much like in seqlocks: it's incremented every time
+ * we lock and unlock for write.
+ *
+ * If it's odd write lock is held, even unlocked.
+ *
+ * Thus readers can unlock, and then lock again later iff it
+ * hasn't been modified in the meantime.
+ */
+ u32 seq;
+ };
+};
+
+enum six_lock_type {
+ SIX_LOCK_read,
+ SIX_LOCK_intent,
+ SIX_LOCK_write,
+};
+
+struct six_lock {
+ union six_lock_state state;
+ unsigned intent_lock_recurse;
+ struct task_struct *owner;
+ struct optimistic_spin_queue osq;
+ unsigned __percpu *readers;
+
+ raw_spinlock_t wait_lock;
+ struct list_head wait_list[2];
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ struct lockdep_map dep_map;
+#endif
+};
+
+typedef int (*six_lock_should_sleep_fn)(struct six_lock *lock, void *);
+
+static __always_inline void __six_lock_init(struct six_lock *lock,
+ const char *name,
+ struct lock_class_key *key)
+{
+ atomic64_set(&lock->state.counter, 0);
+ raw_spin_lock_init(&lock->wait_lock);
+ INIT_LIST_HEAD(&lock->wait_list[SIX_LOCK_read]);
+ INIT_LIST_HEAD(&lock->wait_list[SIX_LOCK_intent]);
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ debug_check_no_locks_freed((void *) lock, sizeof(*lock));
+ lockdep_init_map(&lock->dep_map, name, key, 0);
+#endif
+}
+
+#define six_lock_init(lock) \
+do { \
+ static struct lock_class_key __key; \
+ \
+ __six_lock_init((lock), #lock, &__key); \
+} while (0)
+
+#define __SIX_VAL(field, _v) (((union six_lock_state) { .field = _v }).v)
+
+#define __SIX_LOCK(type) \
+bool six_trylock_##type(struct six_lock *); \
+bool six_relock_##type(struct six_lock *, u32); \
+int six_lock_##type(struct six_lock *, six_lock_should_sleep_fn, void *);\
+void six_unlock_##type(struct six_lock *);
+
+__SIX_LOCK(read)
+__SIX_LOCK(intent)
+__SIX_LOCK(write)
+#undef __SIX_LOCK
+
+#define SIX_LOCK_DISPATCH(type, fn, ...) \
+ switch (type) { \
+ case SIX_LOCK_read: \
+ return fn##_read(__VA_ARGS__); \
+ case SIX_LOCK_intent: \
+ return fn##_intent(__VA_ARGS__); \
+ case SIX_LOCK_write: \
+ return fn##_write(__VA_ARGS__); \
+ default: \
+ BUG(); \
+ }
+
+static inline bool six_trylock_type(struct six_lock *lock, enum six_lock_type type)
+{
+ SIX_LOCK_DISPATCH(type, six_trylock, lock);
+}
+
+static inline bool six_relock_type(struct six_lock *lock, enum six_lock_type type,
+ unsigned seq)
+{
+ SIX_LOCK_DISPATCH(type, six_relock, lock, seq);
+}
+
+static inline int six_lock_type(struct six_lock *lock, enum six_lock_type type,
+ six_lock_should_sleep_fn should_sleep_fn, void *p)
+{
+ SIX_LOCK_DISPATCH(type, six_lock, lock, should_sleep_fn, p);
+}
+
+static inline void six_unlock_type(struct six_lock *lock, enum six_lock_type type)
+{
+ SIX_LOCK_DISPATCH(type, six_unlock, lock);
+}
+
+void six_lock_downgrade(struct six_lock *);
+bool six_lock_tryupgrade(struct six_lock *);
+bool six_trylock_convert(struct six_lock *, enum six_lock_type,
+ enum six_lock_type);
+
+void six_lock_increment(struct six_lock *, enum six_lock_type);
+
+void six_lock_wakeup_all(struct six_lock *);
+
+void six_lock_pcpu_free_rcu(struct six_lock *);
+void six_lock_pcpu_free(struct six_lock *);
+void six_lock_pcpu_alloc(struct six_lock *);
+
+struct six_lock_count {
+ unsigned read;
+ unsigned intent;
+};
+
+struct six_lock_count six_lock_counts(struct six_lock *);
+
+#endif /* _LINUX_SIX_H */
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
index 4198f0273e..b2abd9a5d9 100644
--- a/kernel/Kconfig.locks
+++ b/kernel/Kconfig.locks
@@ -259,3 +259,6 @@ config ARCH_HAS_MMIOWB
config MMIOWB
def_bool y if ARCH_HAS_MMIOWB
depends on SMP
+
+config SIXLOCKS
+ bool
diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile
index 0db4093d17..a095dbbf01 100644
--- a/kernel/locking/Makefile
+++ b/kernel/locking/Makefile
@@ -32,3 +32,4 @@ obj-$(CONFIG_QUEUED_RWLOCKS) += qrwlock.o
obj-$(CONFIG_LOCK_TORTURE_TEST) += locktorture.o
obj-$(CONFIG_WW_MUTEX_SELFTEST) += test-ww_mutex.o
obj-$(CONFIG_LOCK_EVENT_COUNTS) += lock_events.o
+obj-$(CONFIG_SIXLOCKS) += six.o
diff --git a/kernel/locking/six.c b/kernel/locking/six.c
new file mode 100644
index 0000000000..5b2d92c6e9
--- /dev/null
+++ b/kernel/locking/six.c
@@ -0,0 +1,779 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/export.h>
+#include <linux/log2.h>
+#include <linux/percpu.h>
+#include <linux/preempt.h>
+#include <linux/rcupdate.h>
+#include <linux/sched.h>
+#include <linux/sched/rt.h>
+#include <linux/six.h>
+#include <linux/slab.h>
+
+#ifdef DEBUG
+#define EBUG_ON(cond) BUG_ON(cond)
+#else
+#define EBUG_ON(cond) do {} while (0)
+#endif
+
+#define six_acquire(l, t) lock_acquire(l, 0, t, 0, 0, NULL, _RET_IP_)
+#define six_release(l) lock_release(l, _RET_IP_)
+
+struct six_lock_vals {
+ /* Value we add to the lock in order to take the lock: */
+ u64 lock_val;
+
+ /* If the lock has this value (used as a mask), taking the lock fails: */
+ u64 lock_fail;
+
+ /* Value we add to the lock in order to release the lock: */
+ u64 unlock_val;
+
+ /* Mask that indicates lock is held for this type: */
+ u64 held_mask;
+
+ /* Waitlist we wakeup when releasing the lock: */
+ enum six_lock_type unlock_wakeup;
+};
+
+#define __SIX_LOCK_HELD_read __SIX_VAL(read_lock, ~0)
+#define __SIX_LOCK_HELD_intent __SIX_VAL(intent_lock, ~0)
+#define __SIX_LOCK_HELD_write __SIX_VAL(seq, 1)
+
+#define LOCK_VALS { \
+ [SIX_LOCK_read] = { \
+ .lock_val = __SIX_VAL(read_lock, 1), \
+ .lock_fail = __SIX_LOCK_HELD_write + __SIX_VAL(write_locking, 1),\
+ .unlock_val = -__SIX_VAL(read_lock, 1), \
+ .held_mask = __SIX_LOCK_HELD_read, \
+ .unlock_wakeup = SIX_LOCK_write, \
+ }, \
+ [SIX_LOCK_intent] = { \
+ .lock_val = __SIX_VAL(intent_lock, 1), \
+ .lock_fail = __SIX_LOCK_HELD_intent, \
+ .unlock_val = -__SIX_VAL(intent_lock, 1), \
+ .held_mask = __SIX_LOCK_HELD_intent, \
+ .unlock_wakeup = SIX_LOCK_intent, \
+ }, \
+ [SIX_LOCK_write] = { \
+ .lock_val = __SIX_VAL(seq, 1), \
+ .lock_fail = __SIX_LOCK_HELD_read, \
+ .unlock_val = __SIX_VAL(seq, 1), \
+ .held_mask = __SIX_LOCK_HELD_write, \
+ .unlock_wakeup = SIX_LOCK_read, \
+ }, \
+}
+
+static inline void six_set_owner(struct six_lock *lock, enum six_lock_type type,
+ union six_lock_state old)
+{
+ if (type != SIX_LOCK_intent)
+ return;
+
+ if (!old.intent_lock) {
+ EBUG_ON(lock->owner);
+ lock->owner = current;
+ } else {
+ EBUG_ON(lock->owner != current);
+ }
+}
+
+static inline unsigned pcpu_read_count(struct six_lock *lock)
+{
+ unsigned read_count = 0;
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ read_count += *per_cpu_ptr(lock->readers, cpu);
+ return read_count;
+}
+
+struct six_lock_waiter {
+ struct list_head list;
+ struct task_struct *task;
+};
+
+/* This is probably up there with the more evil things I've done */
+#define waitlist_bitnr(id) ilog2((((union six_lock_state) { .waiters = 1 << (id) }).l))
+
+static inline void six_lock_wakeup(struct six_lock *lock,
+ union six_lock_state state,
+ unsigned waitlist_id)
+{
+ if (waitlist_id == SIX_LOCK_write) {
+ if (state.write_locking && !state.read_lock) {
+ struct task_struct *p = READ_ONCE(lock->owner);
+ if (p)
+ wake_up_process(p);
+ }
+ } else {
+ struct list_head *wait_list = &lock->wait_list[waitlist_id];
+ struct six_lock_waiter *w, *next;
+
+ if (!(state.waiters & (1 << waitlist_id)))
+ return;
+
+ clear_bit(waitlist_bitnr(waitlist_id),
+ (unsigned long *) &lock->state.v);
+
+ raw_spin_lock(&lock->wait_lock);
+
+ list_for_each_entry_safe(w, next, wait_list, list) {
+ list_del_init(&w->list);
+
+ if (wake_up_process(w->task) &&
+ waitlist_id != SIX_LOCK_read) {
+ if (!list_empty(wait_list))
+ set_bit(waitlist_bitnr(waitlist_id),
+ (unsigned long *) &lock->state.v);
+ break;
+ }
+ }
+
+ raw_spin_unlock(&lock->wait_lock);
+ }
+}
+
+static __always_inline bool do_six_trylock_type(struct six_lock *lock,
+ enum six_lock_type type,
+ bool try)
+{
+ const struct six_lock_vals l[] = LOCK_VALS;
+ union six_lock_state old, new;
+ bool ret;
+ u64 v;
+
+ EBUG_ON(type == SIX_LOCK_write && lock->owner != current);
+ EBUG_ON(type == SIX_LOCK_write && (lock->state.seq & 1));
+
+ EBUG_ON(type == SIX_LOCK_write && (try != !(lock->state.write_locking)));
+
+ /*
+ * Percpu reader mode:
+ *
+ * The basic idea behind this algorithm is that you can implement a lock
+ * between two threads without any atomics, just memory barriers:
+ *
+ * For two threads you'll need two variables, one variable for "thread a
+ * has the lock" and another for "thread b has the lock".
+ *
+ * To take the lock, a thread sets its variable indicating that it holds
+ * the lock, then issues a full memory barrier, then reads from the
+ * other thread's variable to check if the other thread thinks it has
+ * the lock. If we raced, we backoff and retry/sleep.
+ */
+
+ if (type == SIX_LOCK_read && lock->readers) {
+retry:
+ preempt_disable();
+ this_cpu_inc(*lock->readers); /* signal that we own lock */
+
+ smp_mb();
+
+ old.v = READ_ONCE(lock->state.v);
+ ret = !(old.v & l[type].lock_fail);
+
+ this_cpu_sub(*lock->readers, !ret);
+ preempt_enable();
+
+ /*
+ * If we failed because a writer was trying to take the
+ * lock, issue a wakeup because we might have caused a
+ * spurious trylock failure:
+ */
+ if (old.write_locking) {
+ struct task_struct *p = READ_ONCE(lock->owner);
+
+ if (p)
+ wake_up_process(p);
+ }
+
+ /*
+ * If we failed from the lock path and the waiting bit wasn't
+ * set, set it:
+ */
+ if (!try && !ret) {
+ v = old.v;
+
+ do {
+ new.v = old.v = v;
+
+ if (!(old.v & l[type].lock_fail))
+ goto retry;
+
+ if (new.waiters & (1 << type))
+ break;
+
+ new.waiters |= 1 << type;
+ } while ((v = atomic64_cmpxchg(&lock->state.counter,
+ old.v, new.v)) != old.v);
+ }
+ } else if (type == SIX_LOCK_write && lock->readers) {
+ if (try) {
+ atomic64_add(__SIX_VAL(write_locking, 1),
+ &lock->state.counter);
+ smp_mb__after_atomic();
+ }
+
+ ret = !pcpu_read_count(lock);
+
+ /*
+ * On success, we increment lock->seq; also we clear
+ * write_locking unless we failed from the lock path:
+ */
+ v = 0;
+ if (ret)
+ v += __SIX_VAL(seq, 1);
+ if (ret || try)
+ v -= __SIX_VAL(write_locking, 1);
+
+ if (try && !ret) {
+ old.v = atomic64_add_return(v, &lock->state.counter);
+ six_lock_wakeup(lock, old, SIX_LOCK_read);
+ } else {
+ atomic64_add(v, &lock->state.counter);
+ }
+ } else {
+ v = READ_ONCE(lock->state.v);
+ do {
+ new.v = old.v = v;
+
+ if (!(old.v & l[type].lock_fail)) {
+ new.v += l[type].lock_val;
+
+ if (type == SIX_LOCK_write)
+ new.write_locking = 0;
+ } else if (!try && type != SIX_LOCK_write &&
+ !(new.waiters & (1 << type)))
+ new.waiters |= 1 << type;
+ else
+ break; /* waiting bit already set */
+ } while ((v = atomic64_cmpxchg_acquire(&lock->state.counter,
+ old.v, new.v)) != old.v);
+
+ ret = !(old.v & l[type].lock_fail);
+
+ EBUG_ON(ret && !(lock->state.v & l[type].held_mask));
+ }
+
+ if (ret)
+ six_set_owner(lock, type, old);
+
+ EBUG_ON(type == SIX_LOCK_write && (try || ret) && (lock->state.write_locking));
+
+ return ret;
+}
+
+__always_inline __flatten
+static bool __six_trylock_type(struct six_lock *lock, enum six_lock_type type)
+{
+ if (!do_six_trylock_type(lock, type, true))
+ return false;
+
+ if (type != SIX_LOCK_write)
+ six_acquire(&lock->dep_map, 1);
+ return true;
+}
+
+__always_inline __flatten
+static bool __six_relock_type(struct six_lock *lock, enum six_lock_type type,
+ unsigned seq)
+{
+ const struct six_lock_vals l[] = LOCK_VALS;
+ union six_lock_state old;
+ u64 v;
+
+ EBUG_ON(type == SIX_LOCK_write);
+
+ if (type == SIX_LOCK_read &&
+ lock->readers) {
+ bool ret;
+
+ preempt_disable();
+ this_cpu_inc(*lock->readers);
+
+ smp_mb();
+
+ old.v = READ_ONCE(lock->state.v);
+ ret = !(old.v & l[type].lock_fail) && old.seq == seq;
+
+ this_cpu_sub(*lock->readers, !ret);
+ preempt_enable();
+
+ /*
+ * Similar to the lock path, we may have caused a spurious write
+ * lock fail and need to issue a wakeup:
+ */
+ if (old.write_locking) {
+ struct task_struct *p = READ_ONCE(lock->owner);
+
+ if (p)
+ wake_up_process(p);
+ }
+
+ if (ret)
+ six_acquire(&lock->dep_map, 1);
+
+ return ret;
+ }
+
+ v = READ_ONCE(lock->state.v);
+ do {
+ old.v = v;
+
+ if (old.seq != seq || old.v & l[type].lock_fail)
+ return false;
+ } while ((v = atomic64_cmpxchg_acquire(&lock->state.counter,
+ old.v,
+ old.v + l[type].lock_val)) != old.v);
+
+ six_set_owner(lock, type, old);
+ if (type != SIX_LOCK_write)
+ six_acquire(&lock->dep_map, 1);
+ return true;
+}
+
+#ifdef CONFIG_LOCK_SPIN_ON_OWNER
+
+static inline int six_can_spin_on_owner(struct six_lock *lock)
+{
+ struct task_struct *owner;
+ int retval = 1;
+
+ if (need_resched())
+ return 0;
+
+ rcu_read_lock();
+ owner = READ_ONCE(lock->owner);
+ if (owner)
+ retval = owner->on_cpu;
+ rcu_read_unlock();
+ /*
+ * if lock->owner is not set, the mutex owner may have just acquired
+ * it and not set the owner yet or the mutex has been released.
+ */
+ return retval;
+}
+
+static inline bool six_spin_on_owner(struct six_lock *lock,
+ struct task_struct *owner)
+{
+ bool ret = true;
+
+ rcu_read_lock();
+ while (lock->owner == owner) {
+ /*
+ * Ensure we emit the owner->on_cpu, dereference _after_
+ * checking lock->owner still matches owner. If that fails,
+ * owner might point to freed memory. If it still matches,
+ * the rcu_read_lock() ensures the memory stays valid.
+ */
+ barrier();
+
+ if (!owner->on_cpu || need_resched()) {
+ ret = false;
+ break;
+ }
+
+ cpu_relax();
+ }
+ rcu_read_unlock();
+
+ return ret;
+}
+
+static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type type)
+{
+ struct task_struct *task = current;
+
+ if (type == SIX_LOCK_write)
+ return false;
+
+ preempt_disable();
+ if (!six_can_spin_on_owner(lock))
+ goto fail;
+
+ if (!osq_lock(&lock->osq))
+ goto fail;
+
+ while (1) {
+ struct task_struct *owner;
+
+ /*
+ * If there's an owner, wait for it to either
+ * release the lock or go to sleep.
+ */
+ owner = READ_ONCE(lock->owner);
+ if (owner && !six_spin_on_owner(lock, owner))
+ break;
+
+ if (do_six_trylock_type(lock, type, false)) {
+ osq_unlock(&lock->osq);
+ preempt_enable();
+ return true;
+ }
+
+ /*
+ * When there's no owner, we might have preempted between the
+ * owner acquiring the lock and setting the owner field. If
+ * we're an RT task that will live-lock because we won't let
+ * the owner complete.
+ */
+ if (!owner && (need_resched() || rt_task(task)))
+ break;
+
+ /*
+ * The cpu_relax() call is a compiler barrier which forces
+ * everything in this loop to be re-loaded. We don't need
+ * memory barriers as we'll eventually observe the right
+ * values at the cost of a few extra spins.
+ */
+ cpu_relax();
+ }
+
+ osq_unlock(&lock->osq);
+fail:
+ preempt_enable();
+
+ /*
+ * If we fell out of the spin path because of need_resched(),
+ * reschedule now, before we try-lock again. This avoids getting
+ * scheduled out right after we obtained the lock.
+ */
+ if (need_resched())
+ schedule();
+
+ return false;
+}
+
+#else /* CONFIG_LOCK_SPIN_ON_OWNER */
+
+static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type type)
+{
+ return false;
+}
+
+#endif
+
+noinline
+static int __six_lock_type_slowpath(struct six_lock *lock, enum six_lock_type type,
+ six_lock_should_sleep_fn should_sleep_fn, void *p)
+{
+ union six_lock_state old;
+ struct six_lock_waiter wait;
+ int ret = 0;
+
+ if (type == SIX_LOCK_write) {
+ EBUG_ON(lock->state.write_locking);
+ atomic64_add(__SIX_VAL(write_locking, 1), &lock->state.counter);
+ smp_mb__after_atomic();
+ }
+
+ ret = should_sleep_fn ? should_sleep_fn(lock, p) : 0;
+ if (ret)
+ goto out_before_sleep;
+
+ if (six_optimistic_spin(lock, type))
+ goto out_before_sleep;
+
+ lock_contended(&lock->dep_map, _RET_IP_);
+
+ INIT_LIST_HEAD(&wait.list);
+ wait.task = current;
+
+ while (1) {
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ if (type == SIX_LOCK_write)
+ EBUG_ON(lock->owner != current);
+ else if (list_empty_careful(&wait.list)) {
+ raw_spin_lock(&lock->wait_lock);
+ list_add_tail(&wait.list, &lock->wait_list[type]);
+ raw_spin_unlock(&lock->wait_lock);
+ }
+
+ if (do_six_trylock_type(lock, type, false))
+ break;
+
+ ret = should_sleep_fn ? should_sleep_fn(lock, p) : 0;
+ if (ret)
+ break;
+
+ schedule();
+ }
+
+ __set_current_state(TASK_RUNNING);
+
+ if (!list_empty_careful(&wait.list)) {
+ raw_spin_lock(&lock->wait_lock);
+ list_del_init(&wait.list);
+ raw_spin_unlock(&lock->wait_lock);
+ }
+out_before_sleep:
+ if (ret && type == SIX_LOCK_write) {
+ old.v = atomic64_sub_return(__SIX_VAL(write_locking, 1),
+ &lock->state.counter);
+ six_lock_wakeup(lock, old, SIX_LOCK_read);
+ }
+
+ return ret;
+}
+
+__always_inline
+static int __six_lock_type(struct six_lock *lock, enum six_lock_type type,
+ six_lock_should_sleep_fn should_sleep_fn, void *p)
+{
+ int ret;
+
+ if (type != SIX_LOCK_write)
+ six_acquire(&lock->dep_map, 0);
+
+ ret = do_six_trylock_type(lock, type, true) ? 0
+ : __six_lock_type_slowpath(lock, type, should_sleep_fn, p);
+
+ if (ret && type != SIX_LOCK_write)
+ six_release(&lock->dep_map);
+ if (!ret)
+ lock_acquired(&lock->dep_map, _RET_IP_);
+
+ return ret;
+}
+
+__always_inline __flatten
+static void __six_unlock_type(struct six_lock *lock, enum six_lock_type type)
+{
+ const struct six_lock_vals l[] = LOCK_VALS;
+ union six_lock_state state;
+
+ EBUG_ON(type == SIX_LOCK_write &&
+ !(lock->state.v & __SIX_LOCK_HELD_intent));
+
+ if (type != SIX_LOCK_write)
+ six_release(&lock->dep_map);
+
+ if (type == SIX_LOCK_intent) {
+ EBUG_ON(lock->owner != current);
+
+ if (lock->intent_lock_recurse) {
+ --lock->intent_lock_recurse;
+ return;
+ }
+
+ lock->owner = NULL;
+ }
+
+ if (type == SIX_LOCK_read &&
+ lock->readers) {
+ smp_mb(); /* unlock barrier */
+ this_cpu_dec(*lock->readers);
+ smp_mb(); /* between unlocking and checking for waiters */
+ state.v = READ_ONCE(lock->state.v);
+ } else {
+ EBUG_ON(!(lock->state.v & l[type].held_mask));
+ state.v = atomic64_add_return_release(l[type].unlock_val,
+ &lock->state.counter);
+ }
+
+ six_lock_wakeup(lock, state, l[type].unlock_wakeup);
+}
+
+#define __SIX_LOCK(type) \
+bool six_trylock_##type(struct six_lock *lock) \
+{ \
+ return __six_trylock_type(lock, SIX_LOCK_##type); \
+} \
+EXPORT_SYMBOL_GPL(six_trylock_##type); \
+ \
+bool six_relock_##type(struct six_lock *lock, u32 seq) \
+{ \
+ return __six_relock_type(lock, SIX_LOCK_##type, seq); \
+} \
+EXPORT_SYMBOL_GPL(six_relock_##type); \
+ \
+int six_lock_##type(struct six_lock *lock, \
+ six_lock_should_sleep_fn should_sleep_fn, void *p) \
+{ \
+ return __six_lock_type(lock, SIX_LOCK_##type, should_sleep_fn, p);\
+} \
+EXPORT_SYMBOL_GPL(six_lock_##type); \
+ \
+void six_unlock_##type(struct six_lock *lock) \
+{ \
+ __six_unlock_type(lock, SIX_LOCK_##type); \
+} \
+EXPORT_SYMBOL_GPL(six_unlock_##type);
+
+__SIX_LOCK(read)
+__SIX_LOCK(intent)
+__SIX_LOCK(write)
+
+#undef __SIX_LOCK
+
+/* Convert from intent to read: */
+void six_lock_downgrade(struct six_lock *lock)
+{
+ six_lock_increment(lock, SIX_LOCK_read);
+ six_unlock_intent(lock);
+}
+EXPORT_SYMBOL_GPL(six_lock_downgrade);
+
+bool six_lock_tryupgrade(struct six_lock *lock)
+{
+ union six_lock_state old, new;
+ u64 v = READ_ONCE(lock->state.v);
+
+ do {
+ new.v = old.v = v;
+
+ if (new.intent_lock)
+ return false;
+
+ if (!lock->readers) {
+ EBUG_ON(!new.read_lock);
+ new.read_lock--;
+ }
+
+ new.intent_lock = 1;
+ } while ((v = atomic64_cmpxchg_acquire(&lock->state.counter,
+ old.v, new.v)) != old.v);
+
+ if (lock->readers)
+ this_cpu_dec(*lock->readers);
+
+ six_set_owner(lock, SIX_LOCK_intent, old);
+
+ return true;
+}
+EXPORT_SYMBOL_GPL(six_lock_tryupgrade);
+
+bool six_trylock_convert(struct six_lock *lock,
+ enum six_lock_type from,
+ enum six_lock_type to)
+{
+ EBUG_ON(to == SIX_LOCK_write || from == SIX_LOCK_write);
+
+ if (to == from)
+ return true;
+
+ if (to == SIX_LOCK_read) {
+ six_lock_downgrade(lock);
+ return true;
+ } else {
+ return six_lock_tryupgrade(lock);
+ }
+}
+EXPORT_SYMBOL_GPL(six_trylock_convert);
+
+/*
+ * Increment read/intent lock count, assuming we already have it read or intent
+ * locked:
+ */
+void six_lock_increment(struct six_lock *lock, enum six_lock_type type)
+{
+ const struct six_lock_vals l[] = LOCK_VALS;
+
+ six_acquire(&lock->dep_map, 0);
+
+ /* XXX: assert already locked, and that we don't overflow: */
+
+ switch (type) {
+ case SIX_LOCK_read:
+ if (lock->readers) {
+ this_cpu_inc(*lock->readers);
+ } else {
+ EBUG_ON(!lock->state.read_lock &&
+ !lock->state.intent_lock);
+ atomic64_add(l[type].lock_val, &lock->state.counter);
+ }
+ break;
+ case SIX_LOCK_intent:
+ EBUG_ON(!lock->state.intent_lock);
+ lock->intent_lock_recurse++;
+ break;
+ case SIX_LOCK_write:
+ BUG();
+ break;
+ }
+}
+EXPORT_SYMBOL_GPL(six_lock_increment);
+
+void six_lock_wakeup_all(struct six_lock *lock)
+{
+ struct six_lock_waiter *w;
+
+ raw_spin_lock(&lock->wait_lock);
+
+ list_for_each_entry(w, &lock->wait_list[0], list)
+ wake_up_process(w->task);
+ list_for_each_entry(w, &lock->wait_list[1], list)
+ wake_up_process(w->task);
+
+ raw_spin_unlock(&lock->wait_lock);
+}
+EXPORT_SYMBOL_GPL(six_lock_wakeup_all);
+
+struct free_pcpu_rcu {
+ struct rcu_head rcu;
+ void __percpu *p;
+};
+
+static void free_pcpu_rcu_fn(struct rcu_head *_rcu)
+{
+ struct free_pcpu_rcu *rcu =
+ container_of(_rcu, struct free_pcpu_rcu, rcu);
+
+ free_percpu(rcu->p);
+ kfree(rcu);
+}
+
+void six_lock_pcpu_free_rcu(struct six_lock *lock)
+{
+ struct free_pcpu_rcu *rcu = kzalloc(sizeof(*rcu), GFP_KERNEL);
+
+ if (!rcu)
+ return;
+
+ rcu->p = lock->readers;
+ lock->readers = NULL;
+
+ call_rcu(&rcu->rcu, free_pcpu_rcu_fn);
+}
+EXPORT_SYMBOL_GPL(six_lock_pcpu_free_rcu);
+
+void six_lock_pcpu_free(struct six_lock *lock)
+{
+ BUG_ON(lock->readers && pcpu_read_count(lock));
+ BUG_ON(lock->state.read_lock);
+
+ free_percpu(lock->readers);
+ lock->readers = NULL;
+}
+EXPORT_SYMBOL_GPL(six_lock_pcpu_free);
+
+void six_lock_pcpu_alloc(struct six_lock *lock)
+{
+#ifdef __KERNEL__
+ if (!lock->readers)
+ lock->readers = alloc_percpu(unsigned);
+#endif
+}
+EXPORT_SYMBOL_GPL(six_lock_pcpu_alloc);
+
+/*
+ * Returns lock held counts, for both read and intent
+ */
+struct six_lock_count six_lock_counts(struct six_lock *lock)
+{
+ struct six_lock_count ret = { 0, lock->state.intent_lock };
+
+ if (!lock->readers)
+ ret.read += lock->state.read_lock;
+ else {
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ ret.read += *per_cpu_ptr(lock->readers, cpu);
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(six_lock_counts);
--
2.40.1


2023-05-11 12:34:19

by Jan Engelhardt

Subject: Re: [PATCH 04/32] locking: SIX locks (shared/intent/exclusive)


On Tuesday 2023-05-09 18:56, Kent Overstreet wrote:
>--- /dev/null
>+++ b/include/linux/six.h
>@@ -0,0 +1,210 @@
>+ * There are also operations that take the lock type as a parameter, where the
>+ * type is one of SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write:
>+ *
>+ * six_lock_type(lock, type)
>+ * six_unlock_type(lock, type)
>+ * six_relock(lock, type, seq)
>+ * six_trylock_type(lock, type)
>+ * six_trylock_convert(lock, from, to)
>+ *
>+ * A lock may be held multiple types by the same thread (for read or intent,

"multiple times"

>+// SPDX-License-Identifier: GPL-2.0

The currently SPDX list only knows "GPL-2.0-only" or "GPL-2.0-or-later",
please edit.

2023-05-12 21:07:50

by Kent Overstreet

Subject: Re: [PATCH 04/32] locking: SIX locks (shared/intent/exclusive)

On Thu, May 11, 2023 at 02:14:08PM +0200, Jan Engelhardt wrote:
>
> On Tuesday 2023-05-09 18:56, Kent Overstreet wrote:
> >--- /dev/null
> >+++ b/include/linux/six.h
> >@@ -0,0 +1,210 @@
> >+ * There are also operations that take the lock type as a parameter, where the
> >+ * type is one of SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write:
> >+ *
> >+ * six_lock_type(lock, type)
> >+ * six_unlock_type(lock, type)
> >+ * six_relock(lock, type, seq)
> >+ * six_trylock_type(lock, type)
> >+ * six_trylock_convert(lock, from, to)
> >+ *
> >+ * A lock may be held multiple types by the same thread (for read or intent,
>
> "multiple times"

Thanks, fixed

> >+// SPDX-License-Identifier: GPL-2.0
>
> The currently SPDX list only knows "GPL-2.0-only" or "GPL-2.0-or-later",
> please edit.

Where is that list?

2023-05-12 23:15:24

by Jan Engelhardt

Subject: Re: [PATCH 04/32] locking: SIX locks (shared/intent/exclusive)


On Friday 2023-05-12 22:58, Kent Overstreet wrote:
>On Thu, May 11, 2023 at 02:14:08PM +0200, Jan Engelhardt wrote:
>> >+// SPDX-License-Identifier: GPL-2.0
>>
>> The currently SPDX list only knows "GPL-2.0-only" or "GPL-2.0-or-later",
>> please edit.
>
>Where is that list?

I just went to spdx.org and then chose "License List" from the
horizontal top bar menu.

2023-05-12 23:46:24

by Kent Overstreet

Subject: Re: [PATCH 04/32] locking: SIX locks (shared/intent/exclusive)

On Sat, May 13, 2023 at 12:39:34AM +0200, Jan Engelhardt wrote:
>
> On Friday 2023-05-12 22:58, Kent Overstreet wrote:
> >On Thu, May 11, 2023 at 02:14:08PM +0200, Jan Engelhardt wrote:
> >> >+// SPDX-License-Identifier: GPL-2.0
> >>
> >> The currently SPDX list only knows "GPL-2.0-only" or "GPL-2.0-or-later",
> >> please edit.
> >
> >Where is that list?
>
> I just went to spdx.org and then chose "License List" from the
> horizontal top bar menu.

Do we have anything more official? Quick grep through the source tree
says I'm following accepted usage.

2023-05-12 23:51:01

by Randy Dunlap

Subject: Re: [PATCH 04/32] locking: SIX locks (shared/intent/exclusive)



On 5/12/23 16:26, Kent Overstreet wrote:
> On Sat, May 13, 2023 at 12:39:34AM +0200, Jan Engelhardt wrote:
>>
>> On Friday 2023-05-12 22:58, Kent Overstreet wrote:
>>> On Thu, May 11, 2023 at 02:14:08PM +0200, Jan Engelhardt wrote:
>>>>> +// SPDX-License-Identifier: GPL-2.0
>>>>
>>>> The currently SPDX list only knows "GPL-2.0-only" or "GPL-2.0-or-later",
>>>> please edit.
>>>
>>> Where is that list?
>>
>> I just went to spdx.org and then chose "License List" from the
>> horizontal top bar menu.
>
> Do we have anything more official? Quick grep through the source tree
> says I'm following accepted usage.

Documentation/process/license-rules.rst points to spdx.org for
further info.

or LICENSES/preferred/GPL-2.0 contains this:
Valid-License-Identifier: GPL-2.0
Valid-License-Identifier: GPL-2.0-only
Valid-License-Identifier: GPL-2.0+
Valid-License-Identifier: GPL-2.0-or-later
SPDX-URL: https://spdx.org/licenses/GPL-2.0.html


--
~Randy

2023-05-13 00:36:55

by Kent Overstreet

Subject: Re: [PATCH 04/32] locking: SIX locks (shared/intent/exclusive)

On Fri, May 12, 2023 at 04:49:04PM -0700, Randy Dunlap wrote:
>
>
> On 5/12/23 16:26, Kent Overstreet wrote:
> > On Sat, May 13, 2023 at 12:39:34AM +0200, Jan Engelhardt wrote:
> >>
> >> On Friday 2023-05-12 22:58, Kent Overstreet wrote:
> >>> On Thu, May 11, 2023 at 02:14:08PM +0200, Jan Engelhardt wrote:
> >>>>> +// SPDX-License-Identifier: GPL-2.0
> >>>>
> >>>> The currently SPDX list only knows "GPL-2.0-only" or "GPL-2.0-or-later",
> >>>> please edit.
> >>>
> >>> Where is that list?
> >>
> >> I just went to spdx.org and then chose "License List" from the
> >> horizontal top bar menu.
> >
> > Do we have anything more official? Quick grep through the source tree
> > says I'm following accepted usage.
>
> Documentation/process/license-rules.rst points to spdx.org for
> further info.
>
> or LICENSES/preferred/GPL-2.0 contains this:
> Valid-License-Identifier: GPL-2.0
> Valid-License-Identifier: GPL-2.0-only
> Valid-License-Identifier: GPL-2.0+
> Valid-License-Identifier: GPL-2.0-or-later
> SPDX-URL: https://spdx.org/licenses/GPL-2.0.html

Thanks, I'll leave it at GPL-2.0 then.

2023-05-13 01:07:08

by Kent Overstreet

Subject: Re: [PATCH 04/32] locking: SIX locks (shared/intent/exclusive)

On Fri, May 12, 2023 at 05:45:18PM -0700, Eric Biggers wrote:
> On Fri, May 12, 2023 at 08:17:23PM -0400, Kent Overstreet wrote:
> > On Fri, May 12, 2023 at 04:49:04PM -0700, Randy Dunlap wrote:
> > >
> > >
> > > On 5/12/23 16:26, Kent Overstreet wrote:
> > > > On Sat, May 13, 2023 at 12:39:34AM +0200, Jan Engelhardt wrote:
> > > >>
> > > >> On Friday 2023-05-12 22:58, Kent Overstreet wrote:
> > > >>> On Thu, May 11, 2023 at 02:14:08PM +0200, Jan Engelhardt wrote:
> > > >>>>> +// SPDX-License-Identifier: GPL-2.0
> > > >>>>
> > > >>>> The currently SPDX list only knows "GPL-2.0-only" or "GPL-2.0-or-later",
> > > >>>> please edit.
> > > >>>
> > > >>> Where is that list?
> > > >>
> > > >> I just went to spdx.org and then chose "License List" from the
> > > >> horizontal top bar menu.
> > > >
> > > > Do we have anything more official? Quick grep through the source tree
> > > > says I'm following accepted usage.
> > >
> > > Documentation/process/license-rules.rst points to spdx.org for
> > > further info.
> > >
> > > or LICENSES/preferred/GPL-2.0 contains this:
> > > Valid-License-Identifier: GPL-2.0
> > > Valid-License-Identifier: GPL-2.0-only
> > > Valid-License-Identifier: GPL-2.0+
> > > Valid-License-Identifier: GPL-2.0-or-later
> > > SPDX-URL: https://spdx.org/licenses/GPL-2.0.html
> >
> > Thanks, I'll leave it at GPL-2.0 then.
>
> https://spdx.org/licenses/GPL-2.0.html says that GPL-2.0 is deprecated. Its
> replacement is https://spdx.org/licenses/GPL-2.0-only.html. Yes, they mean the
> same thing, but the new names were introduced to be clearer than the old ones.

Perhaps updating LICENSES/preferred/GPL-2.0 is in order.

2023-05-13 02:11:16

by Eric Biggers

Subject: Re: [PATCH 04/32] locking: SIX locks (shared/intent/exclusive)

On Fri, May 12, 2023 at 08:17:23PM -0400, Kent Overstreet wrote:
> On Fri, May 12, 2023 at 04:49:04PM -0700, Randy Dunlap wrote:
> >
> >
> > On 5/12/23 16:26, Kent Overstreet wrote:
> > > On Sat, May 13, 2023 at 12:39:34AM +0200, Jan Engelhardt wrote:
> > >>
> > >> On Friday 2023-05-12 22:58, Kent Overstreet wrote:
> > >>> On Thu, May 11, 2023 at 02:14:08PM +0200, Jan Engelhardt wrote:
> > >>>>> +// SPDX-License-Identifier: GPL-2.0
> > >>>>
> > >>>> The currently SPDX list only knows "GPL-2.0-only" or "GPL-2.0-or-later",
> > >>>> please edit.
> > >>>
> > >>> Where is that list?
> > >>
> > >> I just went to spdx.org and then chose "License List" from the
> > >> horizontal top bar menu.
> > >
> > > Do we have anything more official? Quick grep through the source tree
> > > says I'm following accepted usage.
> >
> > Documentation/process/license-rules.rst points to spdx.org for
> > further info.
> >
> > or LICENSES/preferred/GPL-2.0 contains this:
> > Valid-License-Identifier: GPL-2.0
> > Valid-License-Identifier: GPL-2.0-only
> > Valid-License-Identifier: GPL-2.0+
> > Valid-License-Identifier: GPL-2.0-or-later
> > SPDX-URL: https://spdx.org/licenses/GPL-2.0.html
>
> Thanks, I'll leave it at GPL-2.0 then.

https://spdx.org/licenses/GPL-2.0.html says that GPL-2.0 is deprecated. Its
replacement is https://spdx.org/licenses/GPL-2.0-only.html. Yes, they mean the
same thing, but the new names were introduced to be clearer than the old ones.

- Eric

2023-05-14 12:32:38

by Jeffrey Layton

Subject: Re: [PATCH 04/32] locking: SIX locks (shared/intent/exclusive)

On Tue, 2023-05-09 at 12:56 -0400, Kent Overstreet wrote:
> From: Kent Overstreet <[email protected]>
>
> New lock for bcachefs, like read/write locks but with a third state,
> intent.
>
> Intent locks conflict with each other, but not with read locks; taking a
> write lock requires first holding an intent lock.
>
> Signed-off-by: Kent Overstreet <[email protected]>
> Cc: Peter Zijlstra <[email protected]>
> Cc: Ingo Molnar <[email protected]>
> Cc: Will Deacon <[email protected]>
> Cc: Waiman Long <[email protected]>
> Cc: Boqun Feng <[email protected]>
> ---
> include/linux/six.h | 210 +++++++++++
> kernel/Kconfig.locks | 3 +
> kernel/locking/Makefile | 1 +
> kernel/locking/six.c | 779 ++++++++++++++++++++++++++++++++++++++++
> 4 files changed, 993 insertions(+)
> create mode 100644 include/linux/six.h
> create mode 100644 kernel/locking/six.c
>
> diff --git a/include/linux/six.h b/include/linux/six.h
> new file mode 100644
> index 0000000000..41ddf63b74
> --- /dev/null
> +++ b/include/linux/six.h
> @@ -0,0 +1,210 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +
> +#ifndef _LINUX_SIX_H
> +#define _LINUX_SIX_H
> +
> +/*
> + * Shared/intent/exclusive locks: sleepable read/write locks, much like rw
> + * semaphores, except with a third intermediate state, intent. Basic operations
> + * are:
> + *
> + * six_lock_read(&foo->lock);
> + * six_unlock_read(&foo->lock);
> + *
> + * six_lock_intent(&foo->lock);
> + * six_unlock_intent(&foo->lock);
> + *
> + * six_lock_write(&foo->lock);
> + * six_unlock_write(&foo->lock);
> + *
> + * Intent locks block other intent locks, but do not block read locks, and you
> + * must have an intent lock held before taking a write lock, like so:
> + *
> + * six_lock_intent(&foo->lock);
> + * six_lock_write(&foo->lock);
> + * six_unlock_write(&foo->lock);
> + * six_unlock_intent(&foo->lock);
> + *

So the idea is to create a fundamentally unfair rwsem? One that always
prefers readers over writers?

> + * Other operations:
> + *
> + * six_trylock_read()
> + * six_trylock_intent()
> + * six_trylock_write()
> + *
> + * six_lock_downgrade(): convert from intent to read
> + * six_lock_tryupgrade(): attempt to convert from read to intent
> + *
> + * Locks also embed a sequence number, which is incremented when the lock is
> + * locked or unlocked for write. The current sequence number can be grabbed
> + * while a lock is held from lock->state.seq; then, if you drop the lock you can
> + * use six_relock_(read|intent_write)(lock, seq) to attempt to retake the lock
> + * iff it hasn't been locked for write in the meantime.
> + *

^^^
This is a cool idea.

> + * There are also operations that take the lock type as a parameter, where the
> + * type is one of SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write:
> + *
> + * six_lock_type(lock, type)
> + * six_unlock_type(lock, type)
> + * six_relock(lock, type, seq)
> + * six_trylock_type(lock, type)
> + * six_trylock_convert(lock, from, to)
> + *
> + * A lock may be held multiple types by the same thread (for read or intent,
> + * not write). However, the six locks code does _not_ implement the actual
> + * recursive checks itself though - rather, if your code (e.g. btree iterator
> + * code) knows that the current thread already has a lock held, and for the
> + * correct type, six_lock_increment() may be used to bump up the counter for
> + * that type - the only effect is that one more call to unlock will be required
> + * before the lock is unlocked.

These semantics are a bit confusing. Once you hold a read or intent lock,
you can take it as many times as you like. What happens if I take it in
one context and release it in another? Say, across a workqueue job for
instance?

Are intent locks "converted" to write locks, or do they stack? For
instance, suppose I take the intent lock 3 times and then take a write
lock. How many times do I have to call unlock to fully release it (3 or
4)? If I release it just once, do I still hold the write lock or am I
back to "intent" state?


> + */
> +
> +#include <linux/lockdep.h>
> +#include <linux/osq_lock.h>
> +#include <linux/sched.h>
> +#include <linux/types.h>
> +
> +#define SIX_LOCK_SEPARATE_LOCKFNS
>
>

Some basic info about the underlying design would be nice here. What
info is tracked in the union below? When are different members being
used? How does the code decide which way to cast this thing? etc.


> +
> +union six_lock_state {
> + struct {
> + atomic64_t counter;
> + };
> +
> + struct {
> + u64 v;
> + };
> +
> + struct {
> + /* for waitlist_bitnr() */
> + unsigned long l;
> + };
> +
> + struct {
> + unsigned read_lock:27;
> + unsigned write_locking:1;
> + unsigned intent_lock:1;
> + unsigned waiters:3;


Eww...bitfields. That seems a bit scary in a union. There is no
guarantee that the underlying arch will even pack that into a single
word, AIUI. It may be safer to do this with masking and shifting
instead.
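
E.g., roughly (untested, names made up, and assuming the current
low-to-high bit order of the fields):

    #define SIX_STATE_READ_MASK     0x07ffffffULL   /* bits 0..26 */
    #define SIX_STATE_WRITE_LOCKING (1ULL << 27)
    #define SIX_STATE_INTENT        (1ULL << 28)
    #define SIX_STATE_WAITERS_MASK  (7ULL << 29)
    #define SIX_STATE_SEQ_SHIFT     32

    /* read count / seq extracted from the u64 instead of via bitfields: */
    #define six_state_read_count(v) ((unsigned) ((v) & SIX_STATE_READ_MASK))
    #define six_state_seq(v)        ((u32) ((v) >> SIX_STATE_SEQ_SHIFT))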

> + /*
> + * seq works much like in seqlocks: it's incremented every time
> + * we lock and unlock for write.
> + *
> + * If it's odd write lock is held, even unlocked.
> + *
> + * Thus readers can unlock, and then lock again later iff it
> + * hasn't been modified in the meantime.
> + */
> + u32 seq;
> + };
> +};
> +
> +enum six_lock_type {
> + SIX_LOCK_read,
> + SIX_LOCK_intent,
> + SIX_LOCK_write,
> +};
> +
> +struct six_lock {
> + union six_lock_state state;
> + unsigned intent_lock_recurse;
> + struct task_struct *owner;
> + struct optimistic_spin_queue osq;
> + unsigned __percpu *readers;
> +
> + raw_spinlock_t wait_lock;
> + struct list_head wait_list[2];
> +#ifdef CONFIG_DEBUG_LOCK_ALLOC
> + struct lockdep_map dep_map;
> +#endif
> +};
> +
> +typedef int (*six_lock_should_sleep_fn)(struct six_lock *lock, void *);
> +
> +static __always_inline void __six_lock_init(struct six_lock *lock,
> + const char *name,
> + struct lock_class_key *key)
> +{
> + atomic64_set(&lock->state.counter, 0);
> + raw_spin_lock_init(&lock->wait_lock);
> + INIT_LIST_HEAD(&lock->wait_list[SIX_LOCK_read]);
> + INIT_LIST_HEAD(&lock->wait_list[SIX_LOCK_intent]);
> +#ifdef CONFIG_DEBUG_LOCK_ALLOC
> + debug_check_no_locks_freed((void *) lock, sizeof(*lock));
> + lockdep_init_map(&lock->dep_map, name, key, 0);
> +#endif
> +}
> +
> +#define six_lock_init(lock) \
> +do { \
> + static struct lock_class_key __key; \
> + \
> + __six_lock_init((lock), #lock, &__key); \
> +} while (0)
> +
> +#define __SIX_VAL(field, _v) (((union six_lock_state) { .field = _v }).v)
> +
> +#define __SIX_LOCK(type) \
> +bool six_trylock_##type(struct six_lock *); \
> +bool six_relock_##type(struct six_lock *, u32); \
> +int six_lock_##type(struct six_lock *, six_lock_should_sleep_fn, void *);\
> +void six_unlock_##type(struct six_lock *);
> +
> +__SIX_LOCK(read)
> +__SIX_LOCK(intent)
> +__SIX_LOCK(write)
> +#undef __SIX_LOCK
> +
> +#define SIX_LOCK_DISPATCH(type, fn, ...) \
> + switch (type) { \
> + case SIX_LOCK_read: \
> + return fn##_read(__VA_ARGS__); \
> + case SIX_LOCK_intent: \
> + return fn##_intent(__VA_ARGS__); \
> + case SIX_LOCK_write: \
> + return fn##_write(__VA_ARGS__); \
> + default: \
> + BUG(); \
> + }
> +
> +static inline bool six_trylock_type(struct six_lock *lock, enum six_lock_type type)
> +{
> + SIX_LOCK_DISPATCH(type, six_trylock, lock);
> +}
> +
> +static inline bool six_relock_type(struct six_lock *lock, enum six_lock_type type,
> + unsigned seq)
> +{
> + SIX_LOCK_DISPATCH(type, six_relock, lock, seq);
> +}
> +
> +static inline int six_lock_type(struct six_lock *lock, enum six_lock_type type,
> + six_lock_should_sleep_fn should_sleep_fn, void *p)
> +{
> + SIX_LOCK_DISPATCH(type, six_lock, lock, should_sleep_fn, p);
> +}
> +
> +static inline void six_unlock_type(struct six_lock *lock, enum six_lock_type type)
> +{
> + SIX_LOCK_DISPATCH(type, six_unlock, lock);
> +}
> +
> +void six_lock_downgrade(struct six_lock *);
> +bool six_lock_tryupgrade(struct six_lock *);
> +bool six_trylock_convert(struct six_lock *, enum six_lock_type,
> + enum six_lock_type);
> +
> +void six_lock_increment(struct six_lock *, enum six_lock_type);
> +
> +void six_lock_wakeup_all(struct six_lock *);
> +
> +void six_lock_pcpu_free_rcu(struct six_lock *);
> +void six_lock_pcpu_free(struct six_lock *);
> +void six_lock_pcpu_alloc(struct six_lock *);
> +
> +struct six_lock_count {
> + unsigned read;
> + unsigned intent;
> +};
> +
> +struct six_lock_count six_lock_counts(struct six_lock *);
> +
> +#endif /* _LINUX_SIX_H */
> diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
> index 4198f0273e..b2abd9a5d9 100644
> --- a/kernel/Kconfig.locks
> +++ b/kernel/Kconfig.locks
> @@ -259,3 +259,6 @@ config ARCH_HAS_MMIOWB
> config MMIOWB
> def_bool y if ARCH_HAS_MMIOWB
> depends on SMP
> +
> +config SIXLOCKS
> + bool
> diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile
> index 0db4093d17..a095dbbf01 100644
> --- a/kernel/locking/Makefile
> +++ b/kernel/locking/Makefile
> @@ -32,3 +32,4 @@ obj-$(CONFIG_QUEUED_RWLOCKS) += qrwlock.o
> obj-$(CONFIG_LOCK_TORTURE_TEST) += locktorture.o
> obj-$(CONFIG_WW_MUTEX_SELFTEST) += test-ww_mutex.o
> obj-$(CONFIG_LOCK_EVENT_COUNTS) += lock_events.o
> +obj-$(CONFIG_SIXLOCKS) += six.o
> diff --git a/kernel/locking/six.c b/kernel/locking/six.c
> new file mode 100644
> index 0000000000..5b2d92c6e9
> --- /dev/null
> +++ b/kernel/locking/six.c
> @@ -0,0 +1,779 @@
> +// SPDX-License-Identifier: GPL-2.0
> +
> +#include <linux/export.h>
> +#include <linux/log2.h>
> +#include <linux/percpu.h>
> +#include <linux/preempt.h>
> +#include <linux/rcupdate.h>
> +#include <linux/sched.h>
> +#include <linux/sched/rt.h>
> +#include <linux/six.h>
> +#include <linux/slab.h>
> +
> +#ifdef DEBUG
> +#define EBUG_ON(cond) BUG_ON(cond)
> +#else
> +#define EBUG_ON(cond) do {} while (0)
> +#endif
> +
> +#define six_acquire(l, t) lock_acquire(l, 0, t, 0, 0, NULL, _RET_IP_)
> +#define six_release(l) lock_release(l, _RET_IP_)
> +
> +struct six_lock_vals {
> + /* Value we add to the lock in order to take the lock: */
> + u64 lock_val;
> +
> + /* If the lock has this value (used as a mask), taking the lock fails: */
> + u64 lock_fail;
> +
> + /* Value we add to the lock in order to release the lock: */
> + u64 unlock_val;
> +
> + /* Mask that indicates lock is held for this type: */
> + u64 held_mask;
> +
> + /* Waitlist we wakeup when releasing the lock: */
> + enum six_lock_type unlock_wakeup;
> +};
> +
> +#define __SIX_LOCK_HELD_read __SIX_VAL(read_lock, ~0)
> +#define __SIX_LOCK_HELD_intent __SIX_VAL(intent_lock, ~0)
> +#define __SIX_LOCK_HELD_write __SIX_VAL(seq, 1)
> +
> +#define LOCK_VALS { \
> + [SIX_LOCK_read] = { \
> + .lock_val = __SIX_VAL(read_lock, 1), \
> + .lock_fail = __SIX_LOCK_HELD_write + __SIX_VAL(write_locking, 1),\
> + .unlock_val = -__SIX_VAL(read_lock, 1), \
> + .held_mask = __SIX_LOCK_HELD_read, \
> + .unlock_wakeup = SIX_LOCK_write, \
> + }, \
> + [SIX_LOCK_intent] = { \
> + .lock_val = __SIX_VAL(intent_lock, 1), \
> + .lock_fail = __SIX_LOCK_HELD_intent, \
> + .unlock_val = -__SIX_VAL(intent_lock, 1), \
> + .held_mask = __SIX_LOCK_HELD_intent, \
> + .unlock_wakeup = SIX_LOCK_intent, \
> + }, \
> + [SIX_LOCK_write] = { \
> + .lock_val = __SIX_VAL(seq, 1), \
> + .lock_fail = __SIX_LOCK_HELD_read, \
> + .unlock_val = __SIX_VAL(seq, 1), \
> + .held_mask = __SIX_LOCK_HELD_write, \
> + .unlock_wakeup = SIX_LOCK_read, \
> + }, \
> +}
> +
> +static inline void six_set_owner(struct six_lock *lock, enum six_lock_type type,
> + union six_lock_state old)
> +{
> + if (type != SIX_LOCK_intent)
> + return;
> +
> + if (!old.intent_lock) {
> + EBUG_ON(lock->owner);
> + lock->owner = current;
> + } else {
> + EBUG_ON(lock->owner != current);
> + }
> +}
> +
> +static inline unsigned pcpu_read_count(struct six_lock *lock)
> +{
> + unsigned read_count = 0;
> + int cpu;
> +
> + for_each_possible_cpu(cpu)
> + read_count += *per_cpu_ptr(lock->readers, cpu);
> + return read_count;
> +}
> +
> +struct six_lock_waiter {
> + struct list_head list;
> + struct task_struct *task;
> +};
> +
> +/* This is probably up there with the more evil things I've done */
> +#define waitlist_bitnr(id) ilog2((((union six_lock_state) { .waiters = 1 << (id) }).l))
> +
> +static inline void six_lock_wakeup(struct six_lock *lock,
> + union six_lock_state state,
> + unsigned waitlist_id)
> +{
> + if (waitlist_id == SIX_LOCK_write) {
> + if (state.write_locking && !state.read_lock) {
> + struct task_struct *p = READ_ONCE(lock->owner);
> + if (p)
> + wake_up_process(p);
> + }
> + } else {
> + struct list_head *wait_list = &lock->wait_list[waitlist_id];
> + struct six_lock_waiter *w, *next;
> +
> + if (!(state.waiters & (1 << waitlist_id)))
> + return;
> +
> + clear_bit(waitlist_bitnr(waitlist_id),
> + (unsigned long *) &lock->state.v);
> +
> + raw_spin_lock(&lock->wait_lock);
> +
> + list_for_each_entry_safe(w, next, wait_list, list) {
> + list_del_init(&w->list);
> +
> + if (wake_up_process(w->task) &&
> + waitlist_id != SIX_LOCK_read) {
> + if (!list_empty(wait_list))
> + set_bit(waitlist_bitnr(waitlist_id),
> + (unsigned long *) &lock->state.v);
> + break;
> + }
> + }
> +
> + raw_spin_unlock(&lock->wait_lock);
> + }
> +}
> +
> +static __always_inline bool do_six_trylock_type(struct six_lock *lock,
> + enum six_lock_type type,
> + bool try)
> +{
> + const struct six_lock_vals l[] = LOCK_VALS;
> + union six_lock_state old, new;
> + bool ret;
> + u64 v;
> +
> + EBUG_ON(type == SIX_LOCK_write && lock->owner != current);
> + EBUG_ON(type == SIX_LOCK_write && (lock->state.seq & 1));
> +
> + EBUG_ON(type == SIX_LOCK_write && (try != !(lock->state.write_locking)));
> +
> + /*
> + * Percpu reader mode:
> + *
> + * The basic idea behind this algorithm is that you can implement a lock
> + * between two threads without any atomics, just memory barriers:
> + *
> + * For two threads you'll need two variables, one variable for "thread a
> + * has the lock" and another for "thread b has the lock".
> + *
> + * To take the lock, a thread sets its variable indicating that it holds
> + * the lock, then issues a full memory barrier, then reads from the
> + * other thread's variable to check if the other thread thinks it has
> + * the lock. If we raced, we backoff and retry/sleep.
> + */
> +
> + if (type == SIX_LOCK_read && lock->readers) {
> +retry:
> + preempt_disable();
> + this_cpu_inc(*lock->readers); /* signal that we own lock */
> +
> + smp_mb();
> +
> + old.v = READ_ONCE(lock->state.v);
> + ret = !(old.v & l[type].lock_fail);
> +
> + this_cpu_sub(*lock->readers, !ret);
> + preempt_enable();
> +
> + /*
> + * If we failed because a writer was trying to take the
> + * lock, issue a wakeup because we might have caused a
> + * spurious trylock failure:
> + */
> + if (old.write_locking) {
> + struct task_struct *p = READ_ONCE(lock->owner);
> +
> + if (p)
> + wake_up_process(p);
> + }
> +
> + /*
> + * If we failed from the lock path and the waiting bit wasn't
> + * set, set it:
> + */
> + if (!try && !ret) {
> + v = old.v;
> +
> + do {
> + new.v = old.v = v;
> +
> + if (!(old.v & l[type].lock_fail))
> + goto retry;
> +
> + if (new.waiters & (1 << type))
> + break;
> +
> + new.waiters |= 1 << type;
> + } while ((v = atomic64_cmpxchg(&lock->state.counter,
> + old.v, new.v)) != old.v);
> + }
> + } else if (type == SIX_LOCK_write && lock->readers) {
> + if (try) {
> + atomic64_add(__SIX_VAL(write_locking, 1),
> + &lock->state.counter);
> + smp_mb__after_atomic();
> + }
> +
> + ret = !pcpu_read_count(lock);
> +
> + /*
> + * On success, we increment lock->seq; also we clear
> + * write_locking unless we failed from the lock path:
> + */
> + v = 0;
> + if (ret)
> + v += __SIX_VAL(seq, 1);
> + if (ret || try)
> + v -= __SIX_VAL(write_locking, 1);
> +
> + if (try && !ret) {
> + old.v = atomic64_add_return(v, &lock->state.counter);
> + six_lock_wakeup(lock, old, SIX_LOCK_read);
> + } else {
> + atomic64_add(v, &lock->state.counter);
> + }
> + } else {
> + v = READ_ONCE(lock->state.v);
> + do {
> + new.v = old.v = v;
> +
> + if (!(old.v & l[type].lock_fail)) {
> + new.v += l[type].lock_val;
> +
> + if (type == SIX_LOCK_write)
> + new.write_locking = 0;
> + } else if (!try && type != SIX_LOCK_write &&
> + !(new.waiters & (1 << type)))
> + new.waiters |= 1 << type;
> + else
> + break; /* waiting bit already set */
> + } while ((v = atomic64_cmpxchg_acquire(&lock->state.counter,
> + old.v, new.v)) != old.v);
> +
> + ret = !(old.v & l[type].lock_fail);
> +
> + EBUG_ON(ret && !(lock->state.v & l[type].held_mask));
> + }
> +
> + if (ret)
> + six_set_owner(lock, type, old);
> +
> + EBUG_ON(type == SIX_LOCK_write && (try || ret) && (lock->state.write_locking));
> +
> + return ret;
> +}
> +

^^^
I'd really like to see some more comments in the code above. It's pretty
complex.

> +__always_inline __flatten
> +static bool __six_trylock_type(struct six_lock *lock, enum six_lock_type type)
> +{
> + if (!do_six_trylock_type(lock, type, true))
> + return false;
> +
> + if (type != SIX_LOCK_write)
> + six_acquire(&lock->dep_map, 1);
> + return true;
> +}
> +
> +__always_inline __flatten
> +static bool __six_relock_type(struct six_lock *lock, enum six_lock_type type,
> + unsigned seq)
> +{
> + const struct six_lock_vals l[] = LOCK_VALS;
> + union six_lock_state old;
> + u64 v;
> +
> + EBUG_ON(type == SIX_LOCK_write);
> +
> + if (type == SIX_LOCK_read &&
> + lock->readers) {
> + bool ret;
> +
> + preempt_disable();
> + this_cpu_inc(*lock->readers);
> +
> + smp_mb();
> +
> + old.v = READ_ONCE(lock->state.v);
> + ret = !(old.v & l[type].lock_fail) && old.seq == seq;
> +
> + this_cpu_sub(*lock->readers, !ret);
> + preempt_enable();
> +
> + /*
> + * Similar to the lock path, we may have caused a spurious write
> + * lock fail and need to issue a wakeup:
> + */
> + if (old.write_locking) {
> + struct task_struct *p = READ_ONCE(lock->owner);
> +
> + if (p)
> + wake_up_process(p);
> + }
> +
> + if (ret)
> + six_acquire(&lock->dep_map, 1);
> +
> + return ret;
> + }
> +
> + v = READ_ONCE(lock->state.v);
> + do {
> + old.v = v;
> +
> + if (old.seq != seq || old.v & l[type].lock_fail)
> + return false;
> + } while ((v = atomic64_cmpxchg_acquire(&lock->state.counter,
> + old.v,
> + old.v + l[type].lock_val)) != old.v);
> +
> + six_set_owner(lock, type, old);
> + if (type != SIX_LOCK_write)
> + six_acquire(&lock->dep_map, 1);
> + return true;
> +}
> +
> +#ifdef CONFIG_LOCK_SPIN_ON_OWNER
> +
> +static inline int six_can_spin_on_owner(struct six_lock *lock)
> +{
> + struct task_struct *owner;
> + int retval = 1;
> +
> + if (need_resched())
> + return 0;
> +
> + rcu_read_lock();
> + owner = READ_ONCE(lock->owner);
> + if (owner)
> + retval = owner->on_cpu;
> + rcu_read_unlock();
> + /*
> + * if lock->owner is not set, the mutex owner may have just acquired
> + * it and not set the owner yet or the mutex has been released.
> + */
> + return retval;
> +}
> +
> +static inline bool six_spin_on_owner(struct six_lock *lock,
> + struct task_struct *owner)
> +{
> + bool ret = true;
> +
> + rcu_read_lock();
> + while (lock->owner == owner) {
> + /*
> + * Ensure we emit the owner->on_cpu, dereference _after_
> + * checking lock->owner still matches owner. If that fails,
> + * owner might point to freed memory. If it still matches,
> + * the rcu_read_lock() ensures the memory stays valid.
> + */
> + barrier();
> +
> + if (!owner->on_cpu || need_resched()) {
> + ret = false;
> + break;
> + }
> +
> + cpu_relax();
> + }
> + rcu_read_unlock();
> +
> + return ret;
> +}
> +
> +static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type type)
> +{
> + struct task_struct *task = current;
> +
> + if (type == SIX_LOCK_write)
> + return false;
> +
> + preempt_disable();
> + if (!six_can_spin_on_owner(lock))
> + goto fail;
> +
> + if (!osq_lock(&lock->osq))
> + goto fail;
> +
> + while (1) {
> + struct task_struct *owner;
> +
> + /*
> + * If there's an owner, wait for it to either
> + * release the lock or go to sleep.
> + */
> + owner = READ_ONCE(lock->owner);
> + if (owner && !six_spin_on_owner(lock, owner))
> + break;
> +
> + if (do_six_trylock_type(lock, type, false)) {
> + osq_unlock(&lock->osq);
> + preempt_enable();
> + return true;
> + }
> +
> + /*
> + * When there's no owner, we might have preempted between the
> + * owner acquiring the lock and setting the owner field. If
> + * we're an RT task that will live-lock because we won't let
> + * the owner complete.
> + */
> + if (!owner && (need_resched() || rt_task(task)))
> + break;
> +
> + /*
> + * The cpu_relax() call is a compiler barrier which forces
> + * everything in this loop to be re-loaded. We don't need
> + * memory barriers as we'll eventually observe the right
> + * values at the cost of a few extra spins.
> + */
> + cpu_relax();
> + }
> +
> + osq_unlock(&lock->osq);
> +fail:
> + preempt_enable();
> +
> + /*
> + * If we fell out of the spin path because of need_resched(),
> + * reschedule now, before we try-lock again. This avoids getting
> + * scheduled out right after we obtained the lock.
> + */
> + if (need_resched())
> + schedule();
> +
> + return false;
> +}
> +
> +#else /* CONFIG_LOCK_SPIN_ON_OWNER */
> +
> +static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type type)
> +{
> + return false;
> +}
> +
> +#endif
> +
> +noinline
> +static int __six_lock_type_slowpath(struct six_lock *lock, enum six_lock_type type,
> + six_lock_should_sleep_fn should_sleep_fn, void *p)
> +{
> + union six_lock_state old;
> + struct six_lock_waiter wait;
> + int ret = 0;
> +
> + if (type == SIX_LOCK_write) {
> + EBUG_ON(lock->state.write_locking);
> + atomic64_add(__SIX_VAL(write_locking, 1), &lock->state.counter);
> + smp_mb__after_atomic();
> + }
> +
> + ret = should_sleep_fn ? should_sleep_fn(lock, p) : 0;
> + if (ret)
> + goto out_before_sleep;
> +
> + if (six_optimistic_spin(lock, type))
> + goto out_before_sleep;
> +
> + lock_contended(&lock->dep_map, _RET_IP_);
> +
> + INIT_LIST_HEAD(&wait.list);
> + wait.task = current;
> +
> + while (1) {
> + set_current_state(TASK_UNINTERRUPTIBLE);
> + if (type == SIX_LOCK_write)
> + EBUG_ON(lock->owner != current);
> + else if (list_empty_careful(&wait.list)) {
> + raw_spin_lock(&lock->wait_lock);
> + list_add_tail(&wait.list, &lock->wait_list[type]);
> + raw_spin_unlock(&lock->wait_lock);
> + }
> +
> + if (do_six_trylock_type(lock, type, false))
> + break;
> +
> + ret = should_sleep_fn ? should_sleep_fn(lock, p) : 0;
> + if (ret)
> + break;
> +
> + schedule();
> + }
> +
> + __set_current_state(TASK_RUNNING);
> +
> + if (!list_empty_careful(&wait.list)) {
> + raw_spin_lock(&lock->wait_lock);
> + list_del_init(&wait.list);
> + raw_spin_unlock(&lock->wait_lock);
> + }
> +out_before_sleep:
> + if (ret && type == SIX_LOCK_write) {
> + old.v = atomic64_sub_return(__SIX_VAL(write_locking, 1),
> + &lock->state.counter);
> + six_lock_wakeup(lock, old, SIX_LOCK_read);
> + }
> +
> + return ret;
> +}
> +
> +__always_inline
> +static int __six_lock_type(struct six_lock *lock, enum six_lock_type type,
> + six_lock_should_sleep_fn should_sleep_fn, void *p)
> +{
> + int ret;
> +
> + if (type != SIX_LOCK_write)
> + six_acquire(&lock->dep_map, 0);
> +
> + ret = do_six_trylock_type(lock, type, true) ? 0
> + : __six_lock_type_slowpath(lock, type, should_sleep_fn, p);
> +
> + if (ret && type != SIX_LOCK_write)
> + six_release(&lock->dep_map);
> + if (!ret)
> + lock_acquired(&lock->dep_map, _RET_IP_);
> +
> + return ret;
> +}
> +
> +__always_inline __flatten
> +static void __six_unlock_type(struct six_lock *lock, enum six_lock_type type)
> +{
> + const struct six_lock_vals l[] = LOCK_VALS;
> + union six_lock_state state;
> +
> + EBUG_ON(type == SIX_LOCK_write &&
> + !(lock->state.v & __SIX_LOCK_HELD_intent));
> +
> + if (type != SIX_LOCK_write)
> + six_release(&lock->dep_map);
> +
> + if (type == SIX_LOCK_intent) {
> + EBUG_ON(lock->owner != current);
> +
> + if (lock->intent_lock_recurse) {
> + --lock->intent_lock_recurse;
> + return;
> + }
> +
> + lock->owner = NULL;
> + }
> +
> + if (type == SIX_LOCK_read &&
> + lock->readers) {
> + smp_mb(); /* unlock barrier */
> + this_cpu_dec(*lock->readers);
> + smp_mb(); /* between unlocking and checking for waiters */
> + state.v = READ_ONCE(lock->state.v);
> + } else {
> + EBUG_ON(!(lock->state.v & l[type].held_mask));
> + state.v = atomic64_add_return_release(l[type].unlock_val,
> + &lock->state.counter);
> + }
> +
> + six_lock_wakeup(lock, state, l[type].unlock_wakeup);
> +}
> +
> +#define __SIX_LOCK(type) \
> +bool six_trylock_##type(struct six_lock *lock) \
> +{ \
> + return __six_trylock_type(lock, SIX_LOCK_##type); \
> +} \
> +EXPORT_SYMBOL_GPL(six_trylock_##type); \
> + \
> +bool six_relock_##type(struct six_lock *lock, u32 seq) \
> +{ \
> + return __six_relock_type(lock, SIX_LOCK_##type, seq); \
> +} \
> +EXPORT_SYMBOL_GPL(six_relock_##type); \
> + \
> +int six_lock_##type(struct six_lock *lock, \
> + six_lock_should_sleep_fn should_sleep_fn, void *p) \
> +{ \
> + return __six_lock_type(lock, SIX_LOCK_##type, should_sleep_fn, p);\
> +} \
> +EXPORT_SYMBOL_GPL(six_lock_##type); \
> + \
> +void six_unlock_##type(struct six_lock *lock) \
> +{ \
> + __six_unlock_type(lock, SIX_LOCK_##type); \
> +} \
> +EXPORT_SYMBOL_GPL(six_unlock_##type);
> +
> +__SIX_LOCK(read)
> +__SIX_LOCK(intent)
> +__SIX_LOCK(write)
> +
> +#undef __SIX_LOCK
> +
> +/* Convert from intent to read: */
> +void six_lock_downgrade(struct six_lock *lock)
> +{
> + six_lock_increment(lock, SIX_LOCK_read);
> + six_unlock_intent(lock);
> +}
> +EXPORT_SYMBOL_GPL(six_lock_downgrade);
> +
> +bool six_lock_tryupgrade(struct six_lock *lock)
> +{
> + union six_lock_state old, new;
> + u64 v = READ_ONCE(lock->state.v);
> +
> + do {
> + new.v = old.v = v;
> +
> + if (new.intent_lock)
> + return false;
> +
> + if (!lock->readers) {
> + EBUG_ON(!new.read_lock);
> + new.read_lock--;
> + }
> +
> + new.intent_lock = 1;
> + } while ((v = atomic64_cmpxchg_acquire(&lock->state.counter,
> + old.v, new.v)) != old.v);
> +
> + if (lock->readers)
> + this_cpu_dec(*lock->readers);
> +
> + six_set_owner(lock, SIX_LOCK_intent, old);
> +
> + return true;
> +}
> +EXPORT_SYMBOL_GPL(six_lock_tryupgrade);
> +
> +bool six_trylock_convert(struct six_lock *lock,
> + enum six_lock_type from,
> + enum six_lock_type to)
> +{
> + EBUG_ON(to == SIX_LOCK_write || from == SIX_LOCK_write);
> +
> + if (to == from)
> + return true;
> +
> + if (to == SIX_LOCK_read) {
> + six_lock_downgrade(lock);
> + return true;
> + } else {
> + return six_lock_tryupgrade(lock);
> + }
> +}
> +EXPORT_SYMBOL_GPL(six_trylock_convert);
> +
> +/*
> + * Increment read/intent lock count, assuming we already have it read or intent
> + * locked:
> + */
> +void six_lock_increment(struct six_lock *lock, enum six_lock_type type)
> +{
> + const struct six_lock_vals l[] = LOCK_VALS;
> +
> + six_acquire(&lock->dep_map, 0);
> +
> + /* XXX: assert already locked, and that we don't overflow: */
> +
> + switch (type) {
> + case SIX_LOCK_read:
> + if (lock->readers) {
> + this_cpu_inc(*lock->readers);
> + } else {
> + EBUG_ON(!lock->state.read_lock &&
> + !lock->state.intent_lock);
> + atomic64_add(l[type].lock_val, &lock->state.counter);
> + }
> + break;
> + case SIX_LOCK_intent:
> + EBUG_ON(!lock->state.intent_lock);
> + lock->intent_lock_recurse++;
> + break;
> + case SIX_LOCK_write:
> + BUG();
> + break;
> + }
> +}
> +EXPORT_SYMBOL_GPL(six_lock_increment);
> +
> +void six_lock_wakeup_all(struct six_lock *lock)
> +{
> + struct six_lock_waiter *w;
> +
> + raw_spin_lock(&lock->wait_lock);
> +
> + list_for_each_entry(w, &lock->wait_list[0], list)
> + wake_up_process(w->task);
> + list_for_each_entry(w, &lock->wait_list[1], list)
> + wake_up_process(w->task);
> +
> + raw_spin_unlock(&lock->wait_lock);
> +}
> +EXPORT_SYMBOL_GPL(six_lock_wakeup_all);
> +
> +struct free_pcpu_rcu {
> + struct rcu_head rcu;
> + void __percpu *p;
> +};
> +
> +static void free_pcpu_rcu_fn(struct rcu_head *_rcu)
> +{
> + struct free_pcpu_rcu *rcu =
> + container_of(_rcu, struct free_pcpu_rcu, rcu);
> +
> + free_percpu(rcu->p);
> + kfree(rcu);
> +}
> +
> +void six_lock_pcpu_free_rcu(struct six_lock *lock)
> +{
> + struct free_pcpu_rcu *rcu = kzalloc(sizeof(*rcu), GFP_KERNEL);
> +
> + if (!rcu)
> + return;
> +
> + rcu->p = lock->readers;
> + lock->readers = NULL;
> +
> + call_rcu(&rcu->rcu, free_pcpu_rcu_fn);
> +}
> +EXPORT_SYMBOL_GPL(six_lock_pcpu_free_rcu);
> +
> +void six_lock_pcpu_free(struct six_lock *lock)
> +{
> + BUG_ON(lock->readers && pcpu_read_count(lock));
> + BUG_ON(lock->state.read_lock);
> +
> + free_percpu(lock->readers);
> + lock->readers = NULL;
> +}
> +EXPORT_SYMBOL_GPL(six_lock_pcpu_free);
> +
> +void six_lock_pcpu_alloc(struct six_lock *lock)
> +{
> +#ifdef __KERNEL__
> + if (!lock->readers)
> + lock->readers = alloc_percpu(unsigned);
> +#endif
> +}
> +EXPORT_SYMBOL_GPL(six_lock_pcpu_alloc);
> +
> +/*
> + * Returns lock held counts, for both read and intent
> + */
> +struct six_lock_count six_lock_counts(struct six_lock *lock)
> +{
> + struct six_lock_count ret = { 0, lock->state.intent_lock };
> +
> + if (!lock->readers)
> + ret.read += lock->state.read_lock;
> + else {
> + int cpu;
> +
> + for_each_possible_cpu(cpu)
> + ret.read += *per_cpu_ptr(lock->readers, cpu);
> + }
> +
> + return ret;
> +}
> +EXPORT_SYMBOL_GPL(six_lock_counts);

--
Jeff Layton <[email protected]>

2023-05-15 02:55:23

by Kent Overstreet

[permalink] [raw]
Subject: Re: [PATCH 04/32] locking: SIX locks (shared/intent/exclusive)

On Sun, May 14, 2023 at 08:15:20AM -0400, Jeff Layton wrote:
> So the idea is to create a fundamentally unfair rwsem? One that always
> prefers readers over writers?

No, not sure where you're getting that from. It's unfair, but writers are
preferred over readers :)

>
> > + * Other operations:
> > + *
> > + * six_trylock_read()
> > + * six_trylock_intent()
> > + * six_trylock_write()
> > + *
> > + * six_lock_downgrade(): convert from intent to read
> > + * six_lock_tryupgrade(): attempt to convert from read to intent
> > + *
> > + * Locks also embed a sequence number, which is incremented when the lock is
> > + * locked or unlocked for write. The current sequence number can be grabbed
> > + * while a lock is held from lock->state.seq; then, if you drop the lock you can
> > + * use six_relock_(read|intent|write)(lock, seq) to attempt to retake the lock
> > + * iff it hasn't been locked for write in the meantime.
> > + *
>
> ^^^
> This is a cool idea.

It's used heavily in bcachefs so we can drop locks if we might be
blocking - and then relock and continue, at the cost of a transaction
restart if the relock fails. It's a huge win for tail latency.
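
Roughly, the usage pattern looks like this - just a sketch to show the
seq/relock dance; struct thing, frob_thing() and do_blocking_work() are
made up, not actual bcachefs code, and should_sleep_fn is passed as NULL:

    int frob_thing(struct thing *t)
    {
        u32 seq;
        int ret;

        ret = six_lock_read(&t->lock, NULL, NULL);
        if (ret)
            return ret;

        seq = t->lock.state.seq;
        six_unlock_read(&t->lock);

        /* possibly blocking work, with no locks held: */
        ret = do_blocking_work(t);
        if (ret)
            return ret;

        if (!six_relock_read(&t->lock, seq))
            return -EINTR;  /* taken for write meanwhile; restart */

        /* lock reacquired for read, nothing was modified under it */
        six_unlock_read(&t->lock);
        return 0;
    }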

> > + * type is one of SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write:
> > + *
> > + * six_lock_type(lock, type)
> > + * six_unlock_type(lock, type)
> > + * six_relock(lock, type, seq)
> > + * six_trylock_type(lock, type)
> > + * six_trylock_convert(lock, from, to)
> > + *
> > + * A lock may be held multiple times by the same thread (for read or intent,
> > + * not write). However, the six locks code does _not_ implement the actual
> > + * recursive checks itself - rather, if your code (e.g. btree iterator
> > + * code) knows that the current thread already has a lock held, and for the
> > + * correct type, six_lock_increment() may be used to bump up the counter for
> > + * that type - the only effect is that one more call to unlock will be required
> > + * before the lock is unlocked.
>
> These semantics are a bit confusing. Once you hold a read or intent lock,
> you can take it as many times as you like. What happens if I take it in
> one context and release it in another? Say, across a workqueue job for
> instance?

Not allowed because of lockdep, same as with other locks.

> Are intent locks "converted" to write locks, or do they stack? For
> instance, suppose I take the intent lock 3 times and then take a write
> lock. How many times do I have to call unlock to fully release it (3 or
> 4)? If I release it just once, do I still hold the write lock or am I
> back to "intent" state?

They stack. You'd call unlock_write() once and unlock_intent() three
times.
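
I.e., since re-taking a lock the current thread already holds goes
through six_lock_increment(), that sequence would look something like
this (a sketch; foo is just an illustrative struct with an embedded
six_lock):

    six_lock_intent(&foo->lock, NULL, NULL);
    six_lock_increment(&foo->lock, SIX_LOCK_intent);
    six_lock_increment(&foo->lock, SIX_LOCK_intent); /* intent held 3x */

    six_lock_write(&foo->lock, NULL, NULL);

    six_unlock_write(&foo->lock);   /* back to intent, still held 3x */
    six_unlock_intent(&foo->lock);
    six_unlock_intent(&foo->lock);
    six_unlock_intent(&foo->lock);  /* fully released */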

> Some basic info about the underlying design would be nice here. What
> info is tracked in the union below? When are different members being
> used? How does the code decide which way to cast this thing? etc.

The field names seem pretty descriptive to me.

counter and v are just there for the READ_ONCE()/atomic64_cmpxchg() ops.

> Eww... bitfields. That seems a bit scary in a union. There is no
> guarantee that the underlying arch will even pack that into a single
> word, AIUI. It may be safer to do this with masking and shifting
> instead.

It wouldn't hurt to add a BUILD_BUG_ON() for the size, but I don't find
anything "scary" about unions and bitfields :)

And it makes the code more descriptive and readable than masking and
shifting.
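
Something like this would do it - a sketch, it could live in six.c's
init path or as a static assertion in the header:

    /* all members of the union must pack into a single 64 bit word: */
    BUILD_BUG_ON(sizeof(union six_lock_state) != sizeof(u64));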

> > +static __always_inline bool do_six_trylock_type(struct six_lock *lock,
> > + enum six_lock_type type,
> > + bool try)
> > +{
> > + const struct six_lock_vals l[] = LOCK_VALS;
> > + union six_lock_state old, new;
> > + bool ret;
> > + u64 v;
> > +
> > + EBUG_ON(type == SIX_LOCK_write && lock->owner != current);
> > + EBUG_ON(type == SIX_LOCK_write && (lock->state.seq & 1));
> > +
> > + EBUG_ON(type == SIX_LOCK_write && (try != !(lock->state.write_locking)));
> > +
> > + /*
> > + * Percpu reader mode:
> > + *
> > + * The basic idea behind this algorithm is that you can implement a lock
> > + * between two threads without any atomics, just memory barriers:
> > + *
> > + * For two threads you'll need two variables, one variable for "thread a
> > + * has the lock" and another for "thread b has the lock".
> > + *
> > + * To take the lock, a thread sets its variable indicating that it holds
> > + * the lock, then issues a full memory barrier, then reads from the
> > + * other thread's variable to check if the other thread thinks it has
> > + * the lock. If we raced, we backoff and retry/sleep.
> > + */
> > +
> > + if (type == SIX_LOCK_read && lock->readers) {
> > +retry:
> > + preempt_disable();
> > + this_cpu_inc(*lock->readers); /* signal that we own lock */
> > +
> > + smp_mb();
> > +
> > + old.v = READ_ONCE(lock->state.v);
> > + ret = !(old.v & l[type].lock_fail);
> > +
> > + this_cpu_sub(*lock->readers, !ret);
> > + preempt_enable();
> > +
> > + /*
> > + * If we failed because a writer was trying to take the
> > + * lock, issue a wakeup because we might have caused a
> > + * spurious trylock failure:
> > + */
> > + if (old.write_locking) {
> > + struct task_struct *p = READ_ONCE(lock->owner);
> > +
> > + if (p)
> > + wake_up_process(p);
> > + }
> > +
> > + /*
> > + * If we failed from the lock path and the waiting bit wasn't
> > + * set, set it:
> > + */
> > + if (!try && !ret) {
> > + v = old.v;
> > +
> > + do {
> > + new.v = old.v = v;
> > +
> > + if (!(old.v & l[type].lock_fail))
> > + goto retry;
> > +
> > + if (new.waiters & (1 << type))
> > + break;
> > +
> > + new.waiters |= 1 << type;
> > + } while ((v = atomic64_cmpxchg(&lock->state.counter,
> > + old.v, new.v)) != old.v);
> > + }
> > + } else if (type == SIX_LOCK_write && lock->readers) {
> > + if (try) {
> > + atomic64_add(__SIX_VAL(write_locking, 1),
> > + &lock->state.counter);
> > + smp_mb__after_atomic();
> > + }
> > +
> > + ret = !pcpu_read_count(lock);
> > +
> > + /*
> > + * On success, we increment lock->seq; also we clear
> > + * write_locking unless we failed from the lock path:
> > + */
> > + v = 0;
> > + if (ret)
> > + v += __SIX_VAL(seq, 1);
> > + if (ret || try)
> > + v -= __SIX_VAL(write_locking, 1);
> > +
> > + if (try && !ret) {
> > + old.v = atomic64_add_return(v, &lock->state.counter);
> > + six_lock_wakeup(lock, old, SIX_LOCK_read);
> > + } else {
> > + atomic64_add(v, &lock->state.counter);
> > + }
> > + } else {
> > + v = READ_ONCE(lock->state.v);
> > + do {
> > + new.v = old.v = v;
> > +
> > + if (!(old.v & l[type].lock_fail)) {
> > + new.v += l[type].lock_val;
> > +
> > + if (type == SIX_LOCK_write)
> > + new.write_locking = 0;
> > + } else if (!try && type != SIX_LOCK_write &&
> > + !(new.waiters & (1 << type)))
> > + new.waiters |= 1 << type;
> > + else
> > + break; /* waiting bit already set */
> > + } while ((v = atomic64_cmpxchg_acquire(&lock->state.counter,
> > + old.v, new.v)) != old.v);
> > +
> > + ret = !(old.v & l[type].lock_fail);
> > +
> > + EBUG_ON(ret && !(lock->state.v & l[type].held_mask));
> > + }
> > +
> > + if (ret)
> > + six_set_owner(lock, type, old);
> > +
> > + EBUG_ON(type == SIX_LOCK_write && (try || ret) && (lock->state.write_locking));
> > +
> > + return ret;
> > +}
> > +
>
> ^^^
> I'd really like to see some more comments in the code above. It's pretty
> complex.

It's already got more comments than is typical for kernel locking code :)

But if there are specific things you'd like to see clarified, please do
point them out.
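
For instance, the percpu reader path above is just the two-variable
handshake described in the comment; stripped way down, the idea is
(a sketch of the algorithm only, not the actual six lock code):

    /* one "I hold the lock" variable per side: */
    static bool handshake_trylock(int *mine, int *theirs)
    {
        WRITE_ONCE(*mine, 1);       /* claim the lock */
        smp_mb();                   /* order my store vs. my load */
        if (READ_ONCE(*theirs)) {
            WRITE_ONCE(*mine, 0);   /* raced: back off and retry/sleep */
            return false;
        }
        return true;
    }

In six locks the readers' side of that handshake is the percpu counter
(lock->readers) and the writer's side is the write_locking bit in the
state word - which is why the read fast path only needs a percpu
increment and smp_mb(), no atomics on the shared cacheline.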