bit-spinlocks are ideal for locking individual chains of a
hashtable as they do not take extra memory, and they share
a cache line with the chain head which needs to be loaded (and often
written) anyway.
bit-spinlocks are not fair, but this is only a problem if contention
is likely. Resizing hashtables are designed to avoid contention on
individual chains - when chains have more than a very few objects, the
table is resized.
This is a resend of patches which have been previously posted. The
only comment raised before concerned the need for lockdep support and
so the last patch was written and posted.
The only change is that another occurrence of 'locks_mul' needed to
be removed.
Thanks,
NeilBrown
---
NeilBrown (5):
rhashtable: use cmpxchg() in nested_table_alloc()
rhashtable: don't hold lock on first table throughout insertion.
rhashtable: allow rht_bucket_var to return NULL.
rhashtable: use bit_spin_locks to protect hash bucket.
rhashtable: add lockdep tracking to bucket bit-spin-locks.
include/linux/rhashtable-types.h | 2
include/linux/rhashtable.h | 216 +++++++++++++++++++++++++-------------
ipc/util.c | 1
lib/rhashtable.c | 188 ++++++++++++++++-----------------
net/bridge/br_fdb.c | 1
net/bridge/br_vlan.c | 1
net/bridge/br_vlan_tunnel.c | 1
net/ipv4/ipmr.c | 1
net/ipv6/ip6mr.c | 1
net/netfilter/nf_tables_api.c | 1
10 files changed, 233 insertions(+), 180 deletions(-)
--
Signature
nested_table_alloc() relies on the fact that there is
at most one spinlock allocated for every slot in the top
level nested table, so it is not possible for two threads
to try to allocate the same table at the same time.
A future patch will change the locking and invalidate
this assumption. So change the code to protect against
a race using cmpxchg() - if it loses, it frees the table
that it allocated.
Signed-off-by: NeilBrown <[email protected]>
---
lib/rhashtable.c | 8 +++++---
1 file changed, 5 insertions(+), 3 deletions(-)
diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index 9ddb7134285e..c6eaeaff16d1 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -132,9 +132,11 @@ static union nested_table *nested_table_alloc(struct rhashtable *ht,
INIT_RHT_NULLS_HEAD(ntbl[i].bucket);
}
- rcu_assign_pointer(*prev, ntbl);
-
- return ntbl;
+ if (cmpxchg(prev, NULL, ntbl) == NULL)
+ return ntbl;
+ /* Raced with another thread. */
+ kfree(ntbl);
+ return rcu_dereference(*prev);
}
static struct bucket_table *nested_bucket_table_alloc(struct rhashtable *ht,
Rather than returning a pointer to a static nulls marker, rht_bucket_var()
now returns NULL if the bucket doesn't exist.
This will make the next patch, which stores a bitlock in the
bucket pointer, somewhat cleaner.
Signed-off-by: NeilBrown <[email protected]>
---
include/linux/rhashtable.h | 11 +++++++++--
lib/rhashtable.c | 29 ++++++++++++++++++++---------
2 files changed, 29 insertions(+), 11 deletions(-)
diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h
index 26ed3b299028..a4ff6ae524a0 100644
--- a/include/linux/rhashtable.h
+++ b/include/linux/rhashtable.h
@@ -284,6 +284,8 @@ void rhashtable_destroy(struct rhashtable *ht);
struct rhash_head __rcu **rht_bucket_nested(const struct bucket_table *tbl,
unsigned int hash);
+struct rhash_head __rcu **__rht_bucket_nested(const struct bucket_table *tbl,
+ unsigned int hash);
struct rhash_head __rcu **rht_bucket_nested_insert(struct rhashtable *ht,
struct bucket_table *tbl,
unsigned int hash);
@@ -313,7 +315,7 @@ static inline struct rhash_head __rcu *const *rht_bucket(
static inline struct rhash_head __rcu **rht_bucket_var(
struct bucket_table *tbl, unsigned int hash)
{
- return unlikely(tbl->nest) ? rht_bucket_nested(tbl, hash) :
+ return unlikely(tbl->nest) ? __rht_bucket_nested(tbl, hash) :
&tbl->buckets[hash];
}
@@ -916,6 +918,8 @@ static inline int __rhashtable_remove_fast_one(
spin_lock_bh(lock);
pprev = rht_bucket_var(tbl, hash);
+ if (!pprev)
+ goto out;
rht_for_each_continue(he, *pprev, tbl, hash) {
struct rhlist_head *list;
@@ -960,6 +964,7 @@ static inline int __rhashtable_remove_fast_one(
break;
}
+out:
spin_unlock_bh(lock);
if (err > 0) {
@@ -1068,6 +1073,8 @@ static inline int __rhashtable_replace_fast(
spin_lock_bh(lock);
pprev = rht_bucket_var(tbl, hash);
+ if (!pprev)
+ goto out;
rht_for_each_continue(he, *pprev, tbl, hash) {
if (he != obj_old) {
pprev = &he->next;
@@ -1079,7 +1086,7 @@ static inline int __rhashtable_replace_fast(
err = 0;
break;
}
-
+out:
spin_unlock_bh(lock);
return err;
diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index 27a5ffa993d4..7a68c1f0b6d0 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -240,8 +240,10 @@ static int rhashtable_rehash_one(struct rhashtable *ht, unsigned int old_hash)
goto out;
err = -ENOENT;
+ if (!pprev)
+ goto out;
- rht_for_each(entry, old_tbl, old_hash) {
+ rht_for_each_continue(entry, *pprev, old_tbl, old_hash) {
err = 0;
next = rht_dereference_bucket(entry->next, old_tbl, old_hash);
@@ -498,6 +500,8 @@ static void *rhashtable_lookup_one(struct rhashtable *ht,
elasticity = RHT_ELASTICITY;
pprev = rht_bucket_var(tbl, hash);
+ if (!pprev)
+ return ERR_PTR(-ENOENT);
rht_for_each_continue(head, *pprev, tbl, hash) {
struct rhlist_head *list;
struct rhlist_head *plist;
@@ -1160,11 +1164,10 @@ void rhashtable_destroy(struct rhashtable *ht)
}
EXPORT_SYMBOL_GPL(rhashtable_destroy);
-struct rhash_head __rcu **rht_bucket_nested(const struct bucket_table *tbl,
- unsigned int hash)
+struct rhash_head __rcu **__rht_bucket_nested(const struct bucket_table *tbl,
+ unsigned int hash)
{
const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *));
- static struct rhash_head __rcu *rhnull;
unsigned int index = hash & ((1 << tbl->nest) - 1);
unsigned int size = tbl->size >> tbl->nest;
unsigned int subhash = hash;
@@ -1182,15 +1185,23 @@ struct rhash_head __rcu **rht_bucket_nested(const struct bucket_table *tbl,
subhash >>= shift;
}
- if (!ntbl) {
- if (!rhnull)
- INIT_RHT_NULLS_HEAD(rhnull, NULL, 0);
- return &rhnull;
- }
+ if (!ntbl)
+ return NULL;
return &ntbl[subhash].bucket;
}
+EXPORT_SYMBOL_GPL(__rht_bucket_nested);
+
+struct rhash_head __rcu **rht_bucket_nested(const struct bucket_table *tbl,
+ unsigned int hash)
+{
+ static struct rhash_head __rcu *rhnull;
+
+ if (!rhnull)
+ INIT_RHT_NULLS_HEAD(rhnull);
+ return __rht_bucket_nested(tbl, hash) ?: &rhnull;
+}
EXPORT_SYMBOL_GPL(rht_bucket_nested);
struct rhash_head __rcu **rht_bucket_nested_insert(struct rhashtable *ht,
This patch changes rhashtables to use a bit_spin_lock (BIT(1)) in
the bucket pointer to lock the hash chain for that bucket.
The benefits of a bit spin_lock are:
- no need to allocate a separate array of locks.
- no need to have a configuration option to guide the
choice of the size of this array
- locking cost is often a single test-and-set in a cache line
that will have to be loaded anyway. When inserting at, or removing
from, the head of the chain, the unlock is free - writing the new
address in the bucket head implicitly clears the lock bit.
- even when locking costs 2 updates (lock and unlock), they are
in a cacheline that needs to be read anyway.
The cost of using a bit spin_lock is a little bit of code complexity,
which I think is quite manageable.
Bit spin_locks are sometimes inappropriate because they are not fair -
if multiple CPUs repeatedly contend on the same lock, one CPU can
easily be starved. This is not a credible situation with rhashtable.
Multiple CPUs may want to repeatedly add or remove objects, but they
will typically do so at different buckets, so they will attempt to
acquire different locks.
As we have more bit-locks than we previously had spinlocks (by at
least a factor of two) we can expect slightly less contention to
go with the slightly better cache behavior and reduced memory
consumption.
Signed-off-by: NeilBrown <[email protected]>
---
include/linux/rhashtable-types.h | 2
include/linux/rhashtable.h | 190 +++++++++++++++++++++++++-------------
ipc/util.c | 1
lib/rhashtable.c | 118 ++++++++++++------------
net/bridge/br_fdb.c | 1
net/bridge/br_vlan.c | 1
net/bridge/br_vlan_tunnel.c | 1
net/ipv4/ipmr.c | 1
net/ipv6/ip6mr.c | 1
net/netfilter/nf_tables_api.c | 1
10 files changed, 184 insertions(+), 133 deletions(-)
diff --git a/include/linux/rhashtable-types.h b/include/linux/rhashtable-types.h
index bc3e84547ba7..39e5e1fb9b65 100644
--- a/include/linux/rhashtable-types.h
+++ b/include/linux/rhashtable-types.h
@@ -48,7 +48,6 @@ typedef int (*rht_obj_cmpfn_t)(struct rhashtable_compare_arg *arg,
* @head_offset: Offset of rhash_head in struct to be hashed
* @max_size: Maximum size while expanding
* @min_size: Minimum size while shrinking
- * @locks_mul: Number of bucket locks to allocate per cpu (default: 32)
* @automatic_shrinking: Enable automatic shrinking of tables
* @hashfn: Hash function (default: jhash2 if !(key_len % 4), or jhash)
* @obj_hashfn: Function to hash object
@@ -62,7 +61,6 @@ struct rhashtable_params {
unsigned int max_size;
u16 min_size;
bool automatic_shrinking;
- u8 locks_mul;
rht_hashfn_t hashfn;
rht_obj_hashfn_t obj_hashfn;
rht_obj_cmpfn_t obj_cmpfn;
diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h
index a4ff6ae524a0..b683dc336be1 100644
--- a/include/linux/rhashtable.h
+++ b/include/linux/rhashtable.h
@@ -24,6 +24,7 @@
#include <linux/list_nulls.h>
#include <linux/workqueue.h>
#include <linux/rculist.h>
+#include <linux/bit_spinlock.h>
#include <linux/rhashtable-types.h>
/*
@@ -52,8 +53,6 @@
* @nest: Number of bits of first-level nested table.
* @rehash: Current bucket being rehashed
* @hash_rnd: Random seed to fold into hash
- * @locks_mask: Mask to apply before accessing locks[]
- * @locks: Array of spinlocks protecting individual buckets
* @walkers: List of active walkers
* @rcu: RCU structure for freeing the table
* @future_tbl: Table under construction during rehashing
@@ -64,8 +63,6 @@ struct bucket_table {
unsigned int size;
unsigned int nest;
u32 hash_rnd;
- unsigned int locks_mask;
- spinlock_t *locks;
struct list_head walkers;
struct rcu_head rcu;
@@ -74,6 +71,61 @@ struct bucket_table {
struct rhash_head __rcu *buckets[] ____cacheline_aligned_in_smp;
};
+/*
+ * We lock a bucket by setting BIT(1) in the pointer - this is always
+ * zero in real pointers and in the nulls marker.
+ * bit_spin_locks do not handle contention well, but the whole point
+ * of the hashtable design is to achieve minimum per-bucket contention.
+ * A nested hash table might not have a bucket pointer. In that case
+ * we cannot get a lock. For remove and replace the bucket cannot be
+ * interesting and doesn't need locking.
+ * For insert we allocate the bucket if this is the last bucket_table,
+ * and then take the lock.
+ * Sometimes we unlock a bucket by writing a new pointer there. In that
+ * case we don't need to unlock, but we do need to reset state such as
+ * local_bh. For that we have rht_unlocked(). This doesn't include
+ * the memory barrier that bit_spin_unlock() provides, but rcu_assign_pointer()
+ * will have provided that.
+ */
+
+static inline void rht_lock(struct rhash_head **bucket)
+{
+ local_bh_disable();
+ bit_spin_lock(1, (unsigned long *)bucket);
+}
+
+static inline void rht_unlock(struct rhash_head **bucket)
+{
+ bit_spin_unlock(1, (unsigned long *)bucket);
+ local_bh_enable();
+}
+
+static inline void rht_unlocked(void)
+{
+ preempt_enable();
+ __release(bitlock);
+ local_bh_enable();
+}
+
+/*
+ * If 'p' is a bucket head and might be locked, rht_ptr returns
+ * the address without the lock bit.
+ */
+static inline struct rhash_head __rcu *rht_ptr(const struct rhash_head *p)
+{
+ return (void *)(((unsigned long)p) & ~2UL);
+}
+
+static inline struct rhash_head __rcu *rht_ptr_locked(const struct rhash_head *p)
+{
+ return (void *)(((unsigned long)p) | 2UL);
+}
+
+static inline bool rht_is_locked(const struct rhash_head *p)
+{
+ return rht_ptr_locked(p) == p;
+}
+
#define RHT_NULLS_MARKER(ptr) \
((void *)NULLS_MARKER(((unsigned long) (ptr)) >> 1))
#define INIT_RHT_NULLS_HEAD(ptr) \
@@ -197,25 +249,6 @@ static inline bool rht_grow_above_max(const struct rhashtable *ht,
return atomic_read(&ht->nelems) >= ht->max_elems;
}
-/* The bucket lock is selected based on the hash and protects mutations
- * on a group of hash buckets.
- *
- * A maximum of tbl->size/2 bucket locks is allocated. This ensures that
- * a single lock always covers both buckets which may both contains
- * entries which link to the same bucket of the old table during resizing.
- * This allows to simplify the locking as locking the bucket in both
- * tables during resize always guarantee protection.
- *
- * IMPORTANT: When holding the bucket lock of both the old and new table
- * during expansions and shrinking, the old bucket lock must always be
- * acquired first.
- */
-static inline spinlock_t *rht_bucket_lock(const struct bucket_table *tbl,
- unsigned int hash)
-{
- return &tbl->locks[hash & tbl->locks_mask];
-}
-
#ifdef CONFIG_PROVE_LOCKING
int lockdep_rht_mutex_is_held(struct rhashtable *ht);
int lockdep_rht_bucket_is_held(const struct bucket_table *tbl, u32 hash);
@@ -345,7 +378,7 @@ static inline struct rhash_head __rcu **rht_bucket_insert(
* @hash: the hash value / bucket index
*/
#define rht_for_each(pos, tbl, hash) \
- rht_for_each_continue(pos, *rht_bucket(tbl, hash), tbl, hash)
+ rht_for_each_continue(pos, rht_ptr(*rht_bucket(tbl, hash)), tbl, hash)
/**
* rht_for_each_entry_continue - continue iterating over hash chain
@@ -370,7 +403,7 @@ static inline struct rhash_head __rcu **rht_bucket_insert(
* @member: name of the &struct rhash_head within the hashable struct.
*/
#define rht_for_each_entry(tpos, pos, tbl, hash, member) \
- rht_for_each_entry_continue(tpos, pos, *rht_bucket(tbl, hash), \
+ rht_for_each_entry_continue(tpos, pos, rht_ptr(*rht_bucket(tbl, hash)), \
tbl, hash, member)
/**
@@ -386,7 +419,8 @@ static inline struct rhash_head __rcu **rht_bucket_insert(
* remove the loop cursor from the list.
*/
#define rht_for_each_entry_safe(tpos, pos, next, tbl, hash, member) \
- for (pos = rht_dereference_bucket(*rht_bucket(tbl, hash), tbl, hash), \
+ for (pos = rht_dereference_bucket(rht_ptr(*rht_bucket(tbl, hash)), \
+ tbl, hash), \
next = !rht_is_a_nulls(pos) ? \
rht_dereference_bucket(pos->next, tbl, hash) : NULL; \
(!rht_is_a_nulls(pos)) && rht_entry(tpos, pos, member); \
@@ -407,7 +441,7 @@ static inline struct rhash_head __rcu **rht_bucket_insert(
*/
#define rht_for_each_rcu_continue(pos, head, tbl, hash) \
for (({barrier(); }), \
- pos = rht_dereference_bucket_rcu(head, tbl, hash); \
+ pos = rht_ptr(rht_dereference_bucket_rcu(head, tbl, hash)); \
!rht_is_a_nulls(pos); \
pos = rcu_dereference_raw(pos->next))
@@ -422,7 +456,11 @@ static inline struct rhash_head __rcu **rht_bucket_insert(
* traversal is guarded by rcu_read_lock().
*/
#define rht_for_each_rcu(pos, tbl, hash) \
- rht_for_each_rcu_continue(pos, *rht_bucket(tbl, hash), tbl, hash)
+ for (({barrier(); }), \
+ pos = rht_ptr(rht_dereference_bucket_rcu(*rht_bucket(tbl, hash), \
+ tbl, hash)); \
+ !rht_is_a_nulls(pos); \
+ pos = rcu_dereference_raw(pos->next))
/**
* rht_for_each_entry_rcu_continue - continue iterating over rcu hash chain
@@ -456,7 +494,8 @@ static inline struct rhash_head __rcu **rht_bucket_insert(
* traversal is guarded by rcu_read_lock().
*/
#define rht_for_each_entry_rcu(tpos, pos, tbl, hash, member) \
- rht_for_each_entry_rcu_continue(tpos, pos, *rht_bucket(tbl, hash), \
+ rht_for_each_entry_rcu_continue(tpos, pos, \
+ rht_ptr(*rht_bucket(tbl, hash)), \
tbl, hash, member)
/**
@@ -620,9 +659,9 @@ static inline void *__rhashtable_insert_fast(
};
struct rhash_head __rcu **headp;
struct rhash_head __rcu **pprev;
+ struct rhash_head __rcu **lock;
struct bucket_table *tbl;
struct rhash_head *head;
- spinlock_t *lock;
unsigned int hash;
int elasticity;
void *data;
@@ -631,24 +670,23 @@ static inline void *__rhashtable_insert_fast(
tbl = rht_dereference_rcu(ht->tbl, ht);
hash = rht_head_hashfn(ht, tbl, obj, params);
- lock = rht_bucket_lock(tbl, hash);
- spin_lock_bh(lock);
+ elasticity = RHT_ELASTICITY;
+ headp = rht_bucket_insert(ht, tbl, hash);
+ data = ERR_PTR(-ENOMEM);
+ if (!headp)
+ goto out;
+ lock = pprev = headp;
+ rht_lock(lock);
if (unlikely(rcu_access_pointer(tbl->future_tbl))) {
slow_path:
- spin_unlock_bh(lock);
+ rht_unlock(lock);
rcu_read_unlock();
return rhashtable_insert_slow(ht, key, obj);
}
- elasticity = RHT_ELASTICITY;
- headp = rht_bucket_insert(ht, tbl, hash);
- pprev = headp;
- data = ERR_PTR(-ENOMEM);
- if (!pprev)
- goto out;
- rht_for_each_continue(head, *headp, tbl, hash) {
+ rht_for_each_continue(head, rht_ptr(*headp), tbl, hash) {
struct rhlist_head *plist;
struct rhlist_head *list;
@@ -679,6 +717,8 @@ static inline void *__rhashtable_insert_fast(
head = rht_dereference_bucket(head->next, tbl, hash);
RCU_INIT_POINTER(list->rhead.next, head);
rcu_assign_pointer(*pprev, obj);
+ /* This is where we inserted */
+ headp = pprev;
goto good;
}
@@ -695,7 +735,7 @@ static inline void *__rhashtable_insert_fast(
head = rht_dereference_bucket(*headp, tbl, hash);
- RCU_INIT_POINTER(obj->next, head);
+ RCU_INIT_POINTER(obj->next, rht_ptr(head));
if (rhlist) {
struct rhlist_head *list;
@@ -712,8 +752,15 @@ static inline void *__rhashtable_insert_fast(
good:
data = NULL;
+ if (headp == lock) {
+ /* Assigning to *headp unlocked the chain, so we
+ * don't need to do it again.
+ */
+ rht_unlocked();
+ } else {
out:
- spin_unlock_bh(lock);
+ rht_unlock(lock);
+ }
rcu_read_unlock();
return data;
@@ -725,9 +772,9 @@ static inline void *__rhashtable_insert_fast(
* @obj: pointer to hash head inside object
* @params: hash table parameters
*
- * Will take a per bucket spinlock to protect against mutual mutations
+ * Will take the per bucket bitlock to protect against mutual mutations
* on the same bucket. Multiple insertions may occur in parallel unless
- * they map to the same bucket lock.
+ * they map to the same bucket.
*
* It is safe to call this function from atomic context.
*
@@ -754,9 +801,9 @@ static inline int rhashtable_insert_fast(
* @list: pointer to hash list head inside object
* @params: hash table parameters
*
- * Will take a per bucket spinlock to protect against mutual mutations
+ * Will take the per bucket bitlock to protect against mutual mutations
* on the same bucket. Multiple insertions may occur in parallel unless
- * they map to the same bucket lock.
+ * they map to the same bucket.
*
* It is safe to call this function from atomic context.
*
@@ -777,9 +824,9 @@ static inline int rhltable_insert_key(
* @list: pointer to hash list head inside object
* @params: hash table parameters
*
- * Will take a per bucket spinlock to protect against mutual mutations
+ * Will take the per bucket bitlock to protect against mutual mutations
* on the same bucket. Multiple insertions may occur in parallel unless
- * they map to the same bucket lock.
+ * they map to the same bucket.
*
* It is safe to call this function from atomic context.
*
@@ -907,20 +954,19 @@ static inline int __rhashtable_remove_fast_one(
bool rhlist)
{
struct rhash_head __rcu **pprev;
+ struct rhash_head __rcu **lock;
struct rhash_head *he;
- spinlock_t * lock;
unsigned int hash;
int err = -ENOENT;
hash = rht_head_hashfn(ht, tbl, obj, params);
- lock = rht_bucket_lock(tbl, hash);
-
- spin_lock_bh(lock);
-
pprev = rht_bucket_var(tbl, hash);
if (!pprev)
- goto out;
- rht_for_each_continue(he, *pprev, tbl, hash) {
+ return -ENOENT;
+ lock = pprev;
+ rht_lock(lock);
+
+ rht_for_each_continue(he, rht_ptr(*pprev), tbl, hash) {
struct rhlist_head *list;
list = container_of(he, struct rhlist_head, rhead);
@@ -961,12 +1007,16 @@ static inline int __rhashtable_remove_fast_one(
}
rcu_assign_pointer(*pprev, obj);
+ if (lock == pprev) {
+ /* That rcu_assign_pointer() unlocked the chain */
+ rht_unlocked();
+ goto unlocked;
+ }
break;
}
-out:
- spin_unlock_bh(lock);
-
+ rht_unlock(lock);
+unlocked:
if (err > 0) {
atomic_dec(&ht->nelems);
if (unlikely(ht->p.automatic_shrinking &&
@@ -1056,8 +1106,8 @@ static inline int __rhashtable_replace_fast(
const struct rhashtable_params params)
{
struct rhash_head __rcu **pprev;
+ struct rhash_head __rcu **lock;
struct rhash_head *he;
- spinlock_t *lock;
unsigned int hash;
int err = -ENOENT;
@@ -1068,14 +1118,14 @@ static inline int __rhashtable_replace_fast(
if (hash != rht_head_hashfn(ht, tbl, obj_new, params))
return -EINVAL;
- lock = rht_bucket_lock(tbl, hash);
-
- spin_lock_bh(lock);
-
pprev = rht_bucket_var(tbl, hash);
if (!pprev)
- goto out;
- rht_for_each_continue(he, *pprev, tbl, hash) {
+ return -ENOENT;
+
+ lock = pprev;
+ rht_lock(lock);
+
+ rht_for_each_continue(he, rht_ptr(*pprev), tbl, hash) {
if (he != obj_old) {
pprev = &he->next;
continue;
@@ -1084,11 +1134,17 @@ static inline int __rhashtable_replace_fast(
rcu_assign_pointer(obj_new->next, obj_old->next);
rcu_assign_pointer(*pprev, obj_new);
err = 0;
+ if (pprev == lock) {
+ /* We just unlocked the chain by assigning to *pprev */
+ rht_unlocked();
+ goto unlocked;
+ }
break;
}
-out:
- spin_unlock_bh(lock);
+ rht_unlock(lock);
+
+unlocked:
return err;
}
diff --git a/ipc/util.c b/ipc/util.c
index fdffff41f65b..cc78eb76df8b 100644
--- a/ipc/util.c
+++ b/ipc/util.c
@@ -105,7 +105,6 @@ static const struct rhashtable_params ipc_kht_params = {
.head_offset = offsetof(struct kern_ipc_perm, khtnode),
.key_offset = offsetof(struct kern_ipc_perm, key),
.key_len = FIELD_SIZEOF(struct kern_ipc_perm, key),
- .locks_mul = 1,
.automatic_shrinking = true,
};
diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index 7a68c1f0b6d0..9b0ca9e1f6b5 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -32,7 +32,6 @@
#define HASH_DEFAULT_SIZE 64UL
#define HASH_MIN_SIZE 4U
-#define BUCKET_LOCKS_PER_CPU 32UL
union nested_table {
union nested_table __rcu *table;
@@ -57,9 +56,11 @@ EXPORT_SYMBOL_GPL(lockdep_rht_mutex_is_held);
int lockdep_rht_bucket_is_held(const struct bucket_table *tbl, u32 hash)
{
- spinlock_t *lock = rht_bucket_lock(tbl, hash);
-
- return (debug_locks) ? lockdep_is_held(lock) : 1;
+ if (!debug_locks)
+ return 1;
+ if (unlikely(tbl->nest))
+ return 1;
+ return bit_spin_is_locked(1, (unsigned long *)&tbl->buckets[hash]);
}
EXPORT_SYMBOL_GPL(lockdep_rht_bucket_is_held);
#else
@@ -105,7 +106,6 @@ static void bucket_table_free(const struct bucket_table *tbl)
if (tbl->nest)
nested_bucket_table_free(tbl);
- free_bucket_spinlocks(tbl->locks);
kvfree(tbl);
}
@@ -172,7 +172,7 @@ static struct bucket_table *bucket_table_alloc(struct rhashtable *ht,
gfp_t gfp)
{
struct bucket_table *tbl = NULL;
- size_t size, max_locks;
+ size_t size;
int i;
size = sizeof(*tbl) + nbuckets * sizeof(tbl->buckets[0]);
@@ -192,16 +192,6 @@ static struct bucket_table *bucket_table_alloc(struct rhashtable *ht,
tbl->size = size;
- max_locks = size >> 1;
- if (tbl->nest)
- max_locks = min_t(size_t, max_locks, 1U << tbl->nest);
-
- if (alloc_bucket_spinlocks(&tbl->locks, &tbl->locks_mask, max_locks,
- ht->p.locks_mul, gfp) < 0) {
- bucket_table_free(tbl);
- return NULL;
- }
-
INIT_LIST_HEAD(&tbl->walkers);
tbl->hash_rnd = get_random_u32();
@@ -225,25 +215,24 @@ static struct bucket_table *rhashtable_last_table(struct rhashtable *ht,
return new_tbl;
}
-static int rhashtable_rehash_one(struct rhashtable *ht, unsigned int old_hash)
+static int rhashtable_rehash_one(struct rhashtable *ht,
+ struct rhash_head __rcu **pprev,
+ unsigned int old_hash)
{
struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht);
struct bucket_table *new_tbl = rhashtable_last_table(ht, old_tbl);
- struct rhash_head __rcu **pprev = rht_bucket_var(old_tbl, old_hash);
struct rhash_head __rcu **inspos;
+ struct rhash_head __rcu **lock;
int err = -EAGAIN;
struct rhash_head *head, *next, *entry;
- spinlock_t *new_bucket_lock;
unsigned int new_hash;
if (new_tbl->nest)
goto out;
err = -ENOENT;
- if (!pprev)
- goto out;
- rht_for_each_continue(entry, *pprev, old_tbl, old_hash) {
+ rht_for_each_continue(entry, rht_ptr(*pprev), old_tbl, old_hash) {
err = 0;
next = rht_dereference_bucket(entry->next, old_tbl, old_hash);
@@ -258,11 +247,11 @@ static int rhashtable_rehash_one(struct rhashtable *ht, unsigned int old_hash)
new_hash = head_hashfn(ht, new_tbl, entry);
- new_bucket_lock = rht_bucket_lock(new_tbl, new_hash);
-
- spin_lock_nested(new_bucket_lock, SINGLE_DEPTH_NESTING);
inspos = &new_tbl->buckets[new_hash];
- head = rht_dereference_bucket(*inspos, new_tbl, new_hash);
+ lock = inspos;
+ rht_lock(lock);
+
+ head = rht_ptr(rht_dereference_bucket(*inspos, new_tbl, new_hash));
while (!rht_is_a_nulls(head) && head < entry) {
inspos = &head->next;
head = rht_dereference_bucket(*inspos, new_tbl, new_hash);
@@ -270,7 +259,14 @@ static int rhashtable_rehash_one(struct rhashtable *ht, unsigned int old_hash)
RCU_INIT_POINTER(entry->next, head);
rcu_assign_pointer(*inspos, entry);
- spin_unlock(new_bucket_lock);
+ if (inspos != lock)
+ rht_unlock(lock);
+ else
+ rht_unlocked();
+
+ /* Need to preserve the bit lock. */
+ if (rht_is_locked(*pprev))
+ next = rht_ptr_locked(next);
rcu_assign_pointer(*pprev, next);
@@ -282,19 +278,19 @@ static int rhashtable_rehash_chain(struct rhashtable *ht,
unsigned int old_hash)
{
struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht);
- spinlock_t *old_bucket_lock;
+ struct rhash_head __rcu **pprev = rht_bucket_var(old_tbl, old_hash);
int err;
- old_bucket_lock = rht_bucket_lock(old_tbl, old_hash);
+ if (!pprev)
+ return 0;
+ rht_lock(pprev);
- spin_lock_bh(old_bucket_lock);
- while (!(err = rhashtable_rehash_one(ht, old_hash)))
+ while (!(err = rhashtable_rehash_one(ht, pprev, old_hash)))
;
if (err == -ENOENT)
err = 0;
-
- spin_unlock_bh(old_bucket_lock);
+ rht_unlock(pprev);
return err;
}
@@ -487,6 +483,7 @@ static int rhashtable_insert_rehash(struct rhashtable *ht,
}
static void *rhashtable_lookup_one(struct rhashtable *ht,
+ struct rhash_head __rcu **pprev,
struct bucket_table *tbl, unsigned int hash,
const void *key, struct rhash_head *obj)
{
@@ -494,15 +491,12 @@ static void *rhashtable_lookup_one(struct rhashtable *ht,
.ht = ht,
.key = key,
};
- struct rhash_head __rcu **pprev;
+ struct rhash_head **lock = pprev;
struct rhash_head *head;
int elasticity;
elasticity = RHT_ELASTICITY;
- pprev = rht_bucket_var(tbl, hash);
- if (!pprev)
- return ERR_PTR(-ENOENT);
- rht_for_each_continue(head, *pprev, tbl, hash) {
+ rht_for_each_continue(head, rht_ptr(*pprev), tbl, hash) {
struct rhlist_head *list;
struct rhlist_head *plist;
@@ -524,6 +518,9 @@ static void *rhashtable_lookup_one(struct rhashtable *ht,
RCU_INIT_POINTER(list->next, plist);
head = rht_dereference_bucket(head->next, tbl, hash);
RCU_INIT_POINTER(list->rhead.next, head);
+ if (pprev == lock)
+ /* Need to preserve the bit lock */
+ obj = rht_ptr_locked(obj);
rcu_assign_pointer(*pprev, obj);
return NULL;
@@ -536,12 +533,13 @@ static void *rhashtable_lookup_one(struct rhashtable *ht,
}
static struct bucket_table *rhashtable_insert_one(struct rhashtable *ht,
+ struct rhash_head __rcu **pprev,
struct bucket_table *tbl,
unsigned int hash,
struct rhash_head *obj,
void *data)
{
- struct rhash_head __rcu **pprev;
+ struct rhash_head **lock = pprev;
struct bucket_table *new_tbl;
struct rhash_head *head;
@@ -564,11 +562,7 @@ static struct bucket_table *rhashtable_insert_one(struct rhashtable *ht,
if (unlikely(rht_grow_above_100(ht, tbl)))
return ERR_PTR(-EAGAIN);
- pprev = rht_bucket_insert(ht, tbl, hash);
- if (!pprev)
- return ERR_PTR(-ENOMEM);
-
- head = rht_dereference_bucket(*pprev, tbl, hash);
+ head = rht_ptr(rht_dereference_bucket(*pprev, tbl, hash));
while (!ht->rhlist && !rht_is_a_nulls(head) && head < obj) {
pprev = &head->next;
head = rht_dereference_bucket(*pprev, tbl, hash);
@@ -582,6 +576,9 @@ static struct bucket_table *rhashtable_insert_one(struct rhashtable *ht,
RCU_INIT_POINTER(list->next, NULL);
}
+ if (pprev == lock)
+ /* Need to preserve the bit lock */
+ obj = (void *)(2UL | (unsigned long)obj);
rcu_assign_pointer(*pprev, obj);
atomic_inc(&ht->nelems);
@@ -596,6 +593,7 @@ static void *rhashtable_try_insert(struct rhashtable *ht, const void *key,
{
struct bucket_table *new_tbl;
struct bucket_table *tbl;
+ struct rhash_head __rcu **pprev;
unsigned int hash;
void *data;
@@ -604,14 +602,25 @@ static void *rhashtable_try_insert(struct rhashtable *ht, const void *key,
do {
tbl = new_tbl;
hash = rht_head_hashfn(ht, tbl, obj, ht->p);
- spin_lock(rht_bucket_lock(tbl, hash));
-
- data = rhashtable_lookup_one(ht, tbl, hash, key, obj);
- new_tbl = rhashtable_insert_one(ht, tbl, hash, obj, data);
- if (PTR_ERR(new_tbl) != -EEXIST)
- data = ERR_CAST(new_tbl);
-
- spin_unlock(rht_bucket_lock(tbl, hash));
+ if (rcu_access_pointer(tbl->future_tbl))
+ /* Failure is OK */
+ pprev = rht_bucket_var(tbl, hash);
+ else
+ pprev = rht_bucket_insert(ht, tbl, hash);
+ if (pprev == NULL) {
+ new_tbl = rht_dereference_rcu(tbl->future_tbl, ht);
+ data = ERR_PTR(-EAGAIN);
+ } else {
+ rht_lock(pprev);
+ data = rhashtable_lookup_one(ht, pprev, tbl,
+ hash, key, obj);
+ new_tbl = rhashtable_insert_one(ht, pprev, tbl,
+ hash, obj, data);
+ if (PTR_ERR(new_tbl) != -EEXIST)
+ data = ERR_CAST(new_tbl);
+
+ rht_unlock(pprev);
+ }
} while (!IS_ERR_OR_NULL(new_tbl));
if (PTR_ERR(data) == -EAGAIN)
@@ -1044,11 +1053,6 @@ int rhashtable_init(struct rhashtable *ht,
if (params->nelem_hint)
size = rounded_hashtable_size(&ht->p);
- if (params->locks_mul)
- ht->p.locks_mul = roundup_pow_of_two(params->locks_mul);
- else
- ht->p.locks_mul = BUCKET_LOCKS_PER_CPU;
-
ht->key_len = ht->p.key_len;
if (!params->hashfn) {
ht->p.hashfn = jhash;
diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c
index 502f66349530..3ad6c080ebdd 100644
--- a/net/bridge/br_fdb.c
+++ b/net/bridge/br_fdb.c
@@ -33,7 +33,6 @@ static const struct rhashtable_params br_fdb_rht_params = {
.key_offset = offsetof(struct net_bridge_fdb_entry, key),
.key_len = sizeof(struct net_bridge_fdb_key),
.automatic_shrinking = true,
- .locks_mul = 1,
};
static struct kmem_cache *br_fdb_cache __read_mostly;
diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c
index 7df269092103..b3d940fbfc11 100644
--- a/net/bridge/br_vlan.c
+++ b/net/bridge/br_vlan.c
@@ -21,7 +21,6 @@ static const struct rhashtable_params br_vlan_rht_params = {
.key_offset = offsetof(struct net_bridge_vlan, vid),
.key_len = sizeof(u16),
.nelem_hint = 3,
- .locks_mul = 1,
.max_size = VLAN_N_VID,
.obj_cmpfn = br_vlan_cmp,
.automatic_shrinking = true,
diff --git a/net/bridge/br_vlan_tunnel.c b/net/bridge/br_vlan_tunnel.c
index 6d2c4eed2dc8..758151863669 100644
--- a/net/bridge/br_vlan_tunnel.c
+++ b/net/bridge/br_vlan_tunnel.c
@@ -34,7 +34,6 @@ static const struct rhashtable_params br_vlan_tunnel_rht_params = {
.key_offset = offsetof(struct net_bridge_vlan, tinfo.tunnel_id),
.key_len = sizeof(__be64),
.nelem_hint = 3,
- .locks_mul = 1,
.obj_cmpfn = br_vlan_tunid_cmp,
.automatic_shrinking = true,
};
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 82f914122f1b..d847d3e4df1f 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -372,7 +372,6 @@ static const struct rhashtable_params ipmr_rht_params = {
.key_offset = offsetof(struct mfc_cache, cmparg),
.key_len = sizeof(struct mfc_cache_cmp_arg),
.nelem_hint = 3,
- .locks_mul = 1,
.obj_cmpfn = ipmr_hash_cmp,
.automatic_shrinking = true,
};
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index d0b7e0249c13..f2ab8a29f53e 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -346,7 +346,6 @@ static const struct rhashtable_params ip6mr_rht_params = {
.key_offset = offsetof(struct mfc6_cache, cmparg),
.key_len = sizeof(struct mfc6_cache_cmp_arg),
.nelem_hint = 3,
- .locks_mul = 1,
.obj_cmpfn = ip6mr_hash_cmp,
.automatic_shrinking = true,
};
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 3f211e1025c1..c4e62382f6b0 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -45,7 +45,6 @@ static const struct rhashtable_params nft_chain_ht_params = {
.hashfn = nft_chain_hash,
.obj_hashfn = nft_chain_hash_obj,
.obj_cmpfn = nft_chain_hash_cmp,
- .locks_mul = 1,
.automatic_shrinking = true,
};
rhashtable_try_insert() currently holds a lock on the bucket in
the first table, while also locking buckets in subsequent tables.
This is unnecessary and looks like a hold-over from some earlier
version of the implementation.
As insert and remove always lock a bucket in each table in turn, and
as insert only inserts in the final table, there cannot be any races
that are not covered by simply locking a bucket in each table in turn.
When an insert call reaches that last table it can be sure that there
is no matching entry in any other table as it has searched them all, and
insertion never happens anywhere but in the last table. The fact that
code tests for the existence of future_tbl while holding a lock on
the relevant bucket ensures that two threads inserting the same key
will make compatible decisions about which is the "last" table.
This simplifies the code and allows the ->rehash field to be
discarded.
We still need a way to ensure that a dead bucket_table is never
re-linked by rhashtable_walk_stop(). This can be achieved by
calling call_rcu() inside the locked region, and checking
->rcu.func in rhashtable_walk_stop(). If it is not NULL, then
the bucket table is empty and dead.
Signed-off-by: NeilBrown <[email protected]>
---
include/linux/rhashtable.h | 13 -------------
lib/rhashtable.c | 44 +++++++++++---------------------------------
2 files changed, 11 insertions(+), 46 deletions(-)
diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h
index 96ebc2690027..26ed3b299028 100644
--- a/include/linux/rhashtable.h
+++ b/include/linux/rhashtable.h
@@ -63,7 +63,6 @@
struct bucket_table {
unsigned int size;
unsigned int nest;
- unsigned int rehash;
u32 hash_rnd;
unsigned int locks_mask;
spinlock_t *locks;
@@ -802,12 +801,6 @@ static inline int rhltable_insert(
* @obj: pointer to hash head inside object
* @params: hash table parameters
*
- * Locks down the bucket chain in both the old and new table if a resize
- * is in progress to ensure that writers can't remove from the old table
- * and can't insert to the new table during the atomic operation of search
- * and insertion. Searches for duplicates in both the old and new table if
- * a resize is in progress.
- *
* This lookup function may only be used for fixed key hash table (key_len
* parameter set). It will BUG() if used inappropriately.
*
@@ -863,12 +856,6 @@ static inline void *rhashtable_lookup_get_insert_fast(
* @obj: pointer to hash head inside object
* @params: hash table parameters
*
- * Locks down the bucket chain in both the old and new table if a resize
- * is in progress to ensure that writers can't remove from the old table
- * and can't insert to the new table during the atomic operation of search
- * and insertion. Searches for duplicates in both the old and new table if
- * a resize is in progress.
- *
* Lookups may occur in parallel with hashtable mutations and resizing.
*
* Will trigger an automatic deferred table resizing if residency in the
diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index c6eaeaff16d1..27a5ffa993d4 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -289,10 +289,9 @@ static int rhashtable_rehash_chain(struct rhashtable *ht,
while (!(err = rhashtable_rehash_one(ht, old_hash)))
;
- if (err == -ENOENT) {
- old_tbl->rehash++;
+ if (err == -ENOENT)
err = 0;
- }
+
spin_unlock_bh(old_bucket_lock);
return err;
@@ -339,13 +338,16 @@ static int rhashtable_rehash_table(struct rhashtable *ht)
spin_lock(&ht->lock);
list_for_each_entry(walker, &old_tbl->walkers, list)
walker->tbl = NULL;
- spin_unlock(&ht->lock);
/* Wait for readers. All new readers will see the new
* table, and thus no references to the old table will
* remain.
+ * We do this inside the locked region so that
+ * rhashtable_walk_stop() can check ->rcu.func and know
+ * not to re-link the table.
*/
call_rcu(&old_tbl->rcu, bucket_table_free_rcu);
+ spin_unlock(&ht->lock);
return rht_dereference(new_tbl->future_tbl, ht) ? -EAGAIN : 0;
}
@@ -591,36 +593,14 @@ static void *rhashtable_try_insert(struct rhashtable *ht, const void *key,
struct bucket_table *new_tbl;
struct bucket_table *tbl;
unsigned int hash;
- spinlock_t *lock;
void *data;
- tbl = rcu_dereference(ht->tbl);
-
- /* All insertions must grab the oldest table containing
- * the hashed bucket that is yet to be rehashed.
- */
- for (;;) {
- hash = rht_head_hashfn(ht, tbl, obj, ht->p);
- lock = rht_bucket_lock(tbl, hash);
- spin_lock_bh(lock);
-
- if (tbl->rehash <= hash)
- break;
-
- spin_unlock_bh(lock);
- tbl = rht_dereference_rcu(tbl->future_tbl, ht);
- }
+ new_tbl = rcu_dereference(ht->tbl);
- data = rhashtable_lookup_one(ht, tbl, hash, key, obj);
- new_tbl = rhashtable_insert_one(ht, tbl, hash, obj, data);
- if (PTR_ERR(new_tbl) != -EEXIST)
- data = ERR_CAST(new_tbl);
-
- while (!IS_ERR_OR_NULL(new_tbl)) {
+ do {
tbl = new_tbl;
hash = rht_head_hashfn(ht, tbl, obj, ht->p);
- spin_lock_nested(rht_bucket_lock(tbl, hash),
- SINGLE_DEPTH_NESTING);
+ spin_lock(rht_bucket_lock(tbl, hash));
data = rhashtable_lookup_one(ht, tbl, hash, key, obj);
new_tbl = rhashtable_insert_one(ht, tbl, hash, obj, data);
@@ -628,9 +608,7 @@ static void *rhashtable_try_insert(struct rhashtable *ht, const void *key,
data = ERR_CAST(new_tbl);
spin_unlock(rht_bucket_lock(tbl, hash));
- }
-
- spin_unlock_bh(lock);
+ } while (!IS_ERR_OR_NULL(new_tbl));
if (PTR_ERR(data) == -EAGAIN)
data = ERR_PTR(rhashtable_insert_rehash(ht, tbl) ?:
@@ -964,7 +942,7 @@ void rhashtable_walk_stop(struct rhashtable_iter *iter)
ht = iter->ht;
spin_lock(&ht->lock);
- if (tbl->rehash < tbl->size)
+ if (tbl->rcu.func == NULL)
list_add(&iter->walker.list, &tbl->walkers);
else
iter->walker.tbl = NULL;
Native bit_spin_locks are not tracked by lockdep.
The bit_spin_locks used for rhashtable buckets are local
to the rhashtable implementation, so there is little opportunity
for the sort of misuse that lockdep might detect.
However locks are held while a hash function or compare
function is called, and if one of these took a lock,
a misbehaviour is possible.
As it is quite easy to add lockdep support this unlikely
possibility seems to be enough justification.
So create a lockdep class for bucket bit_spin_lock and attach
it through a lockdep_map in each bucket_table.
With the 'nested' annotation in rhashtable_rehash_one(), lockdep
correctly reports a possible problem as this lock is taken
while another bucket lock (in another table) is held. This
confirms that the added support works.
With the correct nested annotation in place, lockdep reports
no problems.
Signed-off-by: NeilBrown <[email protected]>
---
include/linux/rhashtable.h | 40 +++++++++++++++++++++++++++-------------
lib/rhashtable.c | 17 ++++++++++-------
2 files changed, 37 insertions(+), 20 deletions(-)
diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h
index b683dc336be1..7568b94f6c87 100644
--- a/include/linux/rhashtable.h
+++ b/include/linux/rhashtable.h
@@ -68,6 +68,8 @@ struct bucket_table {
struct bucket_table __rcu *future_tbl;
+ struct lockdep_map dep_map;
+
struct rhash_head __rcu *buckets[] ____cacheline_aligned_in_smp;
};
@@ -88,20 +90,32 @@ struct bucket_table {
* will have provided that.
*/
-static inline void rht_lock(struct rhash_head **bucket)
+static inline void rht_lock(struct bucket_table *tbl, struct rhash_head **bucket)
+{
+ local_bh_disable();
+ bit_spin_lock(1, (unsigned long *)bucket);
+ lock_map_acquire(&tbl->dep_map);
+}
+
+static inline void rht_lock_nested(struct bucket_table *tbl,
+ struct rhash_head **bucket,
+ unsigned int subclass)
{
local_bh_disable();
bit_spin_lock(1, (unsigned long *)bucket);
+ lock_acquire_exclusive(&tbl->dep_map, subclass, 0, NULL, _THIS_IP_);
}
-static inline void rht_unlock(struct rhash_head **bucket)
+static inline void rht_unlock(struct bucket_table *tbl, struct rhash_head **bucket)
{
+ lock_map_release(&tbl->dep_map);
bit_spin_unlock(1, (unsigned long *)bucket);
local_bh_enable();
}
-static inline void rht_unlocked(void)
+static inline void rht_unlocked(struct bucket_table *tbl)
{
+ lock_map_release(&tbl->dep_map);
preempt_enable();
__release(bitlock);
local_bh_enable();
@@ -676,11 +690,11 @@ static inline void *__rhashtable_insert_fast(
if (!headp)
goto out;
lock = pprev = headp;
- rht_lock(lock);
+ rht_lock(tbl, lock);
if (unlikely(rcu_access_pointer(tbl->future_tbl))) {
slow_path:
- rht_unlock(lock);
+ rht_unlock(tbl, lock);
rcu_read_unlock();
return rhashtable_insert_slow(ht, key, obj);
}
@@ -756,10 +770,10 @@ static inline void *__rhashtable_insert_fast(
/* Assigning to *headp unlocked the chain, so we
* don't need to do it again.
*/
- rht_unlocked();
+ rht_unlocked(tbl);
} else {
out:
- rht_unlock(lock);
+ rht_unlock(tbl, lock);
}
rcu_read_unlock();
@@ -964,7 +978,7 @@ static inline int __rhashtable_remove_fast_one(
if (!pprev)
return -ENOENT;
lock = pprev;
- rht_lock(lock);
+ rht_lock(tbl, lock);
rht_for_each_continue(he, rht_ptr(*pprev), tbl, hash) {
struct rhlist_head *list;
@@ -1009,13 +1023,13 @@ static inline int __rhashtable_remove_fast_one(
rcu_assign_pointer(*pprev, obj);
if (lock == pprev) {
/* That rcu_assign_pointer() unlocked the chain */
- rht_unlocked();
+ rht_unlocked(tbl);
goto unlocked;
}
break;
}
- rht_unlock(lock);
+ rht_unlock(tbl, lock);
unlocked:
if (err > 0) {
atomic_dec(&ht->nelems);
@@ -1123,7 +1137,7 @@ static inline int __rhashtable_replace_fast(
return -ENOENT;
lock = pprev;
- rht_lock(lock);
+ rht_lock(tbl, lock);
rht_for_each_continue(he, rht_ptr(*pprev), tbl, hash) {
if (he != obj_old) {
@@ -1136,13 +1150,13 @@ static inline int __rhashtable_replace_fast(
err = 0;
if (pprev == lock) {
/* We just unlocked the chain by assigning to *pprev */
- rht_unlocked();
+ rht_unlocked(tbl);
goto unlocked;
}
break;
}
- rht_unlock(lock);
+ rht_unlock(tbl, lock);
unlocked:
return err;
diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index 9b0ca9e1f6b5..068e6dfcb249 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -174,6 +174,7 @@ static struct bucket_table *bucket_table_alloc(struct rhashtable *ht,
struct bucket_table *tbl = NULL;
size_t size;
int i;
+ static struct lock_class_key __key;
size = sizeof(*tbl) + nbuckets * sizeof(tbl->buckets[0]);
if (gfp != GFP_KERNEL)
@@ -190,6 +191,8 @@ static struct bucket_table *bucket_table_alloc(struct rhashtable *ht,
if (tbl == NULL)
return NULL;
+ lockdep_init_map(&tbl->dep_map, "rhashtable_bucket", &__key, 0);
+
tbl->size = size;
INIT_LIST_HEAD(&tbl->walkers);
@@ -249,7 +252,7 @@ static int rhashtable_rehash_one(struct rhashtable *ht,
inspos = &new_tbl->buckets[new_hash];
lock = inspos;
- rht_lock(lock);
+ rht_lock_nested(new_tbl, lock, SINGLE_DEPTH_NESTING);
head = rht_ptr(rht_dereference_bucket(*inspos, new_tbl, new_hash));
while (!rht_is_a_nulls(head) && head < entry) {
@@ -260,9 +263,9 @@ static int rhashtable_rehash_one(struct rhashtable *ht,
rcu_assign_pointer(*inspos, entry);
if (inspos != lock)
- rht_unlock(lock);
+ rht_unlock(new_tbl, lock);
else
- rht_unlocked();
+ rht_unlocked(new_tbl);
/* Need to preserved the bit lock. */
if (rht_is_locked(*pprev))
@@ -283,14 +286,14 @@ static int rhashtable_rehash_chain(struct rhashtable *ht,
if (!pprev)
return 0;
- rht_lock(pprev);
+ rht_lock(old_tbl, pprev);
while (!(err = rhashtable_rehash_one(ht, pprev, old_hash)))
;
if (err == -ENOENT)
err = 0;
- rht_unlock(pprev);
+ rht_unlock(old_tbl, pprev);
return err;
}
@@ -611,7 +614,7 @@ static void *rhashtable_try_insert(struct rhashtable *ht, const void *key,
new_tbl = rht_dereference_rcu(tbl->future_tbl, ht);
data = ERR_PTR(-EAGAIN);
} else {
- rht_lock(pprev);
+ rht_lock(tbl, pprev);
data = rhashtable_lookup_one(ht, pprev, tbl,
hash, key, obj);
new_tbl = rhashtable_insert_one(ht, pprev, tbl,
@@ -619,7 +622,7 @@ static void *rhashtable_try_insert(struct rhashtable *ht, const void *key,
if (PTR_ERR(new_tbl) != -EEXIST)
data = ERR_CAST(new_tbl);
- rht_unlock(pprev);
+ rht_unlock(tbl, pprev);
}
} while (!IS_ERR_OR_NULL(new_tbl));
On Fri, Jul 06, 2018 at 05:22:30PM +1000, NeilBrown wrote:
> rhashtable_try_insert() currently hold a lock on the bucket in
> the first table, while also locking buckets in subsequent tables.
> This is unnecessary and looks like a hold-over from some earlier
> version of the implementation.
>
> As insert and remove always lock a bucket in each table in turn, and
> as insert only inserts in the final table, there cannot be any races
> that are not covered by simply locking a bucket in each table in turn.
>
> When an insert call reaches that last table it can be sure that there
> is no match entry in any other table as it has searched them all, and
> insertion never happens anywhere but in the last table. The fact that
> code tests for the existence of future_tbl while holding a lock on
> the relevant bucket ensures that two threads inserting the same key
> will make compatible decisions about which is the "last" table.
>
> This simplifies the code and allows the ->rehash field to be
> discarded.
>
> We still need a way to ensure that a dead bucket_table is never
> re-linked by rhashtable_walk_stop(). This can be achieved by
> calling call_rcu() inside the locked region, and checking
> ->rcu.func in rhashtable_walk_stop(). If it is not NULL, then
> the bucket table is empty and dead.
>
> Signed-off-by: NeilBrown <[email protected]>
...
> @@ -339,13 +338,16 @@ static int rhashtable_rehash_table(struct rhashtable *ht)
> spin_lock(&ht->lock);
> list_for_each_entry(walker, &old_tbl->walkers, list)
> walker->tbl = NULL;
> - spin_unlock(&ht->lock);
>
> /* Wait for readers. All new readers will see the new
> * table, and thus no references to the old table will
> * remain.
> + * We do this inside the locked region so that
> + * rhashtable_walk_stop() can check ->rcu.func and know
> + * not to re-link the table.
> */
> call_rcu(&old_tbl->rcu, bucket_table_free_rcu);
> + spin_unlock(&ht->lock);
>
> return rht_dereference(new_tbl->future_tbl, ht) ? -EAGAIN : 0;
> }
...
> @@ -964,7 +942,7 @@ void rhashtable_walk_stop(struct rhashtable_iter *iter)
> ht = iter->ht;
>
> spin_lock(&ht->lock);
> - if (tbl->rehash < tbl->size)
> + if (tbl->rcu.func == NULL)
> list_add(&iter->walker.list, &tbl->walkers);
> else
> iter->walker.tbl = NULL;
This appears to be relying on implementation details within RCU.
Paul, are you OK with rhashtable doing this trick?
Thanks,
--
Email: Herbert Xu <[email protected]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
On Fri, Jul 20, 2018 at 03:54:09PM +0800, Herbert Xu wrote:
> On Fri, Jul 06, 2018 at 05:22:30PM +1000, NeilBrown wrote:
> > rhashtable_try_insert() currently hold a lock on the bucket in
> > the first table, while also locking buckets in subsequent tables.
> > This is unnecessary and looks like a hold-over from some earlier
> > version of the implementation.
> >
> > As insert and remove always lock a bucket in each table in turn, and
> > as insert only inserts in the final table, there cannot be any races
> > that are not covered by simply locking a bucket in each table in turn.
> >
> > When an insert call reaches that last table it can be sure that there
> > is no match entry in any other table as it has searched them all, and
> > insertion never happens anywhere but in the last table. The fact that
> > code tests for the existence of future_tbl while holding a lock on
> > the relevant bucket ensures that two threads inserting the same key
> > will make compatible decisions about which is the "last" table.
> >
> > This simplifies the code and allows the ->rehash field to be
> > discarded.
> >
> > We still need a way to ensure that a dead bucket_table is never
> > re-linked by rhashtable_walk_stop(). This can be achieved by
> > calling call_rcu() inside the locked region, and checking
> > ->rcu.func in rhashtable_walk_stop(). If it is not NULL, then
> > the bucket table is empty and dead.
> >
> > Signed-off-by: NeilBrown <[email protected]>
>
> ...
>
> > @@ -339,13 +338,16 @@ static int rhashtable_rehash_table(struct rhashtable *ht)
> > spin_lock(&ht->lock);
> > list_for_each_entry(walker, &old_tbl->walkers, list)
> > walker->tbl = NULL;
> > - spin_unlock(&ht->lock);
> >
> > /* Wait for readers. All new readers will see the new
> > * table, and thus no references to the old table will
> > * remain.
> > + * We do this inside the locked region so that
> > + * rhashtable_walk_stop() can check ->rcu.func and know
> > + * not to re-link the table.
> > */
> > call_rcu(&old_tbl->rcu, bucket_table_free_rcu);
> > + spin_unlock(&ht->lock);
> >
> > return rht_dereference(new_tbl->future_tbl, ht) ? -EAGAIN : 0;
> > }
>
> ...
>
> > @@ -964,7 +942,7 @@ void rhashtable_walk_stop(struct rhashtable_iter *iter)
> > ht = iter->ht;
> >
> > spin_lock(&ht->lock);
> > - if (tbl->rehash < tbl->size)
> > + if (tbl->rcu.func == NULL)
> > list_add(&iter->walker.list, &tbl->walkers);
> > else
> > iter->walker.tbl = NULL;
>
> This appears to be relying on implementation details within RCU.
> Paul, are you OK with rhashtable doing this trick?
The notion of accessing objects that are already on RCU's callback lists
makes me -very- nervous because this sort of thing is not easy to
get right. After all, if you are accessing something that is already
on one of RCU's callback lists, RCU might invoke the callback at any
time (thus freeing it in this case), and because it is already on RCU's
callback lists, rcu_read_lock() is going to be of no help whatsoever.
In addition, RCU does no ordering on its store to ->func, but the ht->lock
compensates in this case. But suppose rhashtable_walk_stop() sees the
pointer as non-NULL. What prevents RCU from freeing the bucket table out
from under rhashtable_walk_stop()? In v4.17, bucket_table_free_rcu()
just does some calls to various deallocators, which does not provide
the necessary synchronization.
Does the rhashtable_iter structure use some trick to make this safe?
Or has synchronization been added to bucket_table_free_rcu()? Or is
some other trick in use?
Thanx, Paul
On Fri, Jul 20 2018, Paul E. McKenney wrote:
> On Fri, Jul 20, 2018 at 03:54:09PM +0800, Herbert Xu wrote:
>> On Fri, Jul 06, 2018 at 05:22:30PM +1000, NeilBrown wrote:
>> > rhashtable_try_insert() currently hold a lock on the bucket in
>> > the first table, while also locking buckets in subsequent tables.
>> > This is unnecessary and looks like a hold-over from some earlier
>> > version of the implementation.
>> >
>> > As insert and remove always lock a bucket in each table in turn, and
>> > as insert only inserts in the final table, there cannot be any races
>> > that are not covered by simply locking a bucket in each table in turn.
>> >
>> > When an insert call reaches that last table it can be sure that there
>> > is no match entry in any other table as it has searched them all, and
>> > insertion never happens anywhere but in the last table. The fact that
>> > code tests for the existence of future_tbl while holding a lock on
>> > the relevant bucket ensures that two threads inserting the same key
>> > will make compatible decisions about which is the "last" table.
>> >
>> > This simplifies the code and allows the ->rehash field to be
>> > discarded.
>> >
>> > We still need a way to ensure that a dead bucket_table is never
>> > re-linked by rhashtable_walk_stop(). This can be achieved by
>> > calling call_rcu() inside the locked region, and checking
>> > ->rcu.func in rhashtable_walk_stop(). If it is not NULL, then
>> > the bucket table is empty and dead.
>> >
>> > Signed-off-by: NeilBrown <[email protected]>
>>
>> ...
>>
>> > @@ -339,13 +338,16 @@ static int rhashtable_rehash_table(struct rhashtable *ht)
>> > spin_lock(&ht->lock);
>> > list_for_each_entry(walker, &old_tbl->walkers, list)
>> > walker->tbl = NULL;
>> > - spin_unlock(&ht->lock);
>> >
>> > /* Wait for readers. All new readers will see the new
>> > * table, and thus no references to the old table will
>> > * remain.
>> > + * We do this inside the locked region so that
>> > + * rhashtable_walk_stop() can check ->rcu.func and know
>> > + * not to re-link the table.
>> > */
>> > call_rcu(&old_tbl->rcu, bucket_table_free_rcu);
>> > + spin_unlock(&ht->lock);
>> >
>> > return rht_dereference(new_tbl->future_tbl, ht) ? -EAGAIN : 0;
>> > }
>>
>> ...
>>
>> > @@ -964,7 +942,7 @@ void rhashtable_walk_stop(struct rhashtable_iter *iter)
>> > ht = iter->ht;
>> >
>> > spin_lock(&ht->lock);
>> > - if (tbl->rehash < tbl->size)
>> > + if (tbl->rcu.func == NULL)
>> > list_add(&iter->walker.list, &tbl->walkers);
>> > else
>> > iter->walker.tbl = NULL;
>>
>> This appears to be relying on implementation details within RCU.
>> Paul, are you OK with rhashtable doing this trick?
>
> The notion of accessing objects that are already on RCU's callback lists
> makes me -very- nervous because this sort of thing is not easy to
> get right. After all, if you are accessing something that is already
> on one of RCU's callback lists, RCU might invoke the callback it at any
> time (thus freeing it in this case), and because it is already on RCU's
> callback lists, rcu_read_lock() is going to be of no help whatsoever.
I don't follow that last line. If some other thread has already called
rcu_read_lock() when call_rcu() is called, then that other thread's
rcu_read_lock() will certainly help to ensure that the object doesn't
get freed. This code assumes that it also ensures that rcu.func will
not be changed before the other thread calls rcu_read_unlock() and
allows the grace period to end.
(There is nothing explicitly about rcu lists here, just rcu.func).
>
> In addition, RCU does no ordering on its store to ->func, but the ht->lock
> compensates in this case. But suppose rhashtable_walk_stop() sees the
> pointer as non-NULL. What prevents RCU from freeing the bucket table out
> from under rhashtable_walk_stop()? In v4.17, bucket_table_free_rcu()
> just does some calls to various deallocators, which does not provide
> the necessary synchronization.
>
> Does the rhashtable_iter structure use some trick to make this safe?
> Or has synchronization been added to bucket_table_free_rcu()? Or is
> some other trick in use?
>
> Thanx, Paul
When rhashtable_rehash_table() has copied all objects out of a
bucket_table, it must then disconnect any paused walkers and free the
table. (a 'paused' walker has called rhashtable_walk_stop() and dropped
the rcu read lock).
It sets walk->tbl=NULL (thus implicitly removing from the list) and
calls call_rcu(...,bucket_table_free_rcu) under a spinlock.
When rhashtable_walk_stop() is called, it needs to know whether it is
safe to attach the walker to the bucket_table().
It takes the same spin lock as above while still holding the
rcu_read_lock that it took some time ago.
If it gets the spinlock before rhashtable_rehash_table() gets it, then
rcu.func will be NULL (tables are allocated with kzalloc) and the walker
is attached to the table. If it gets the spinlock after
rhashtable_rehash_table() gets it, then rcu.func will not be NULL and
the walker will not be attached to the table.
The only interesting question is whether RCU might ever set rcu.func to
NULL (or change it at all) *after* call_rcu() has been called, and
*before* the current grace period ends.
If you don't want to guarantee that it doesn't, I can add an extra flag
field to the table to say "this table must not be attached walkers", but
I currently think that should be unnecessary.
Thanks,
NeilBrown
On Sat, Jul 21, 2018 at 12:25:41PM +1000, NeilBrown wrote:
> On Fri, Jul 20 2018, Paul E. McKenney wrote:
>
> > On Fri, Jul 20, 2018 at 03:54:09PM +0800, Herbert Xu wrote:
> >> On Fri, Jul 06, 2018 at 05:22:30PM +1000, NeilBrown wrote:
> >> > rhashtable_try_insert() currently hold a lock on the bucket in
> >> > the first table, while also locking buckets in subsequent tables.
> >> > This is unnecessary and looks like a hold-over from some earlier
> >> > version of the implementation.
> >> >
> >> > As insert and remove always lock a bucket in each table in turn, and
> >> > as insert only inserts in the final table, there cannot be any races
> >> > that are not covered by simply locking a bucket in each table in turn.
> >> >
> >> > When an insert call reaches that last table it can be sure that there
> >> > is no match entry in any other table as it has searched them all, and
> >> > insertion never happens anywhere but in the last table. The fact that
> >> > code tests for the existence of future_tbl while holding a lock on
> >> > the relevant bucket ensures that two threads inserting the same key
> >> > will make compatible decisions about which is the "last" table.
> >> >
> >> > This simplifies the code and allows the ->rehash field to be
> >> > discarded.
> >> >
> >> > We still need a way to ensure that a dead bucket_table is never
> >> > re-linked by rhashtable_walk_stop(). This can be achieved by
> >> > calling call_rcu() inside the locked region, and checking
> >> > ->rcu.func in rhashtable_walk_stop(). If it is not NULL, then
> >> > the bucket table is empty and dead.
> >> >
> >> > Signed-off-by: NeilBrown <[email protected]>
> >>
> >> ...
> >>
> >> > @@ -339,13 +338,16 @@ static int rhashtable_rehash_table(struct rhashtable *ht)
> >> > spin_lock(&ht->lock);
> >> > list_for_each_entry(walker, &old_tbl->walkers, list)
> >> > walker->tbl = NULL;
> >> > - spin_unlock(&ht->lock);
> >> >
> >> > /* Wait for readers. All new readers will see the new
> >> > * table, and thus no references to the old table will
> >> > * remain.
> >> > + * We do this inside the locked region so that
> >> > + * rhashtable_walk_stop() can check ->rcu.func and know
> >> > + * not to re-link the table.
> >> > */
> >> > call_rcu(&old_tbl->rcu, bucket_table_free_rcu);
> >> > + spin_unlock(&ht->lock);
> >> >
> >> > return rht_dereference(new_tbl->future_tbl, ht) ? -EAGAIN : 0;
> >> > }
> >>
> >> ...
> >>
> >> > @@ -964,7 +942,7 @@ void rhashtable_walk_stop(struct rhashtable_iter *iter)
> >> > ht = iter->ht;
> >> >
> >> > spin_lock(&ht->lock);
> >> > - if (tbl->rehash < tbl->size)
> >> > + if (tbl->rcu.func == NULL)
> >> > list_add(&iter->walker.list, &tbl->walkers);
> >> > else
> >> > iter->walker.tbl = NULL;
> >>
> >> This appears to be relying on implementation details within RCU.
> >> Paul, are you OK with rhashtable doing this trick?
> >
> > The notion of accessing objects that are already on RCU's callback lists
> > makes me -very- nervous because this sort of thing is not easy to
> > get right. After all, if you are accessing something that is already
> > on one of RCU's callback lists, RCU might invoke the callback it at any
> > time (thus freeing it in this case), and because it is already on RCU's
> > callback lists, rcu_read_lock() is going to be of no help whatsoever.
>
> I don't follow that last line. If some other thread has already called
> rcu_read_lock() when call_rcu() is called, then that other threads
> rcu_read_lock() will certainly help to ensure that the object doesn't
> get freed. This code assumes that it also ensures that rcu.func will
> not be changed before the other thread calls rcu_read_unlock() and
> allows the grace period to end.
> (There is nothing explicitly about rcu lists here, just rcu.func).
>
> >
> > In addition, RCU does no ordering on its store to ->func, but the ht->lock
> > compensates in this case. But suppose rhashtable_walk_stop() sees the
> > pointer as non-NULL. What prevents RCU from freeing the bucket table out
> > from under rhashtable_walk_stop()? In v4.17, bucket_table_free_rcu()
> > just does some calls to various deallocators, which does not provide
> > the necessary synchronization.
> >
> > Does the rhashtable_iter structure use some trick to make this safe?
> > Or has synchronization been added to bucket_table_free_rcu()? Or is
> > some other trick in use?
> >
> > Thanx, Paul
>
> When rhashtable_rehash_table() has copied all objects out of a
> bucket_table, it must then disconnect any paused walkers and free the
> table. (a 'paused' walker has called rhashtable_walk_stop() and dropped
> the rcu read lock).
> It sets walk->tbl=NULL (thus implicitly removing from the list) and
> calls call_rcu(...,bucket_table_free_rcu) under a spinlock.
>
> When rhashtable_walk_stop() is called, it needs to know whether it is
> safe to attach the walker to the bucket_table().
> It takes the same spin lock as above while still holding the
> rcu_read_lock that it took some time ago.
> If it gets the spinlock before rhashtable_rehash_table() gets it, then
> rcu.func will be NULL (tables are allocated with kzalloc) and the walker
> is attached to the table. If it gets the spinlock after
> rhashtable_rehash_table() gets it, then rcu.func will not be NULL and
> the walker will not be attached to the table.
>
> The only interesting question is whether RCU might ever set rcu.func to
> NULL (or change it at all) *after* call_rcu() has been called, and
> *before* the current grace period ends.
> If you don't want to guarantee that it doesn't, I can add an extra flag
> field to the table to say "this table must not be attached walkers", but
> I currently think that should be unnecessary.
One issue is that the ->func pointer can legitimately be NULL while on
RCU's callback lists. This happens when someone invokes kfree_rcu()
with the rcu_head structure at the beginning of the enclosing structure.
I could add an offset to avoid this, or perhaps the kmalloc() folks
could be persuaded to accept Rao Shoaib's patch moving kfree_rcu() handling to
the slab allocators, so that RCU only ever sees function pointers in
the ->func field.
Either way, this should be hidden behind an API to allow adjustments
to be made if needed. Maybe something like is_after_call_rcu()?
This would (for example) allow debug-object checks to be used to catch
check-after-free bugs.
Would something of that sort work for you?
Thanx, Paul
On Sun, Jul 22 2018, Paul E. McKenney wrote:
>
> One issue is that the ->func pointer can legitimately be NULL while on
> RCU's callback lists. This happens when someone invokes kfree_rcu()
> with the rcu_head structure at the beginning of the enclosing structure.
> I could add an offset to avoid this, or perhaps the kmalloc() folks
> could be persuaded Rao Shoaib's patch moving kfree_rcu() handling to
> the slab allocators, so that RCU only ever sees function pointers in
> the ->func field.
>
> Either way, this should be hidden behind an API to allow adjustments
> to be made if needed. Maybe something like is_after_call_rcu()?
> This would (for example) allow debug-object checks to be used to catch
> check-after-free bugs.
>
> Would something of that sort work for you?
Yes, if you could provide an is_after_call_rcu() API, that would
perfectly suit my use-case.
Thanks,
NeilBrown
On Mon, Jul 23, 2018 at 09:13:43AM +1000, NeilBrown wrote:
> On Sun, Jul 22 2018, Paul E. McKenney wrote:
> >
> > One issue is that the ->func pointer can legitimately be NULL while on
> > RCU's callback lists. This happens when someone invokes kfree_rcu()
> > with the rcu_head structure at the beginning of the enclosing structure.
> > I could add an offset to avoid this, or perhaps the kmalloc() folks
> > could be persuaded Rao Shoaib's patch moving kfree_rcu() handling to
> > the slab allocators, so that RCU only ever sees function pointers in
> > the ->func field.
> >
> > Either way, this should be hidden behind an API to allow adjustments
> > to be made if needed. Maybe something like is_after_call_rcu()?
> > This would (for example) allow debug-object checks to be used to catch
> > check-after-free bugs.
> >
> > Would something of that sort work for you?
>
> Yes, if you could provide an is_after_call_rcu() API, that would
> perfectly suit my use-case.
After beating my head against the object-debug code a bit, I have to ask
if it would be OK for you if the is_after_call_rcu() API also takes the
function that was passed to RCU.
Thanx, Paul
On Mon, Jul 23 2018, Paul E. McKenney wrote:
> On Mon, Jul 23, 2018 at 09:13:43AM +1000, NeilBrown wrote:
>> On Sun, Jul 22 2018, Paul E. McKenney wrote:
>> >
>> > One issue is that the ->func pointer can legitimately be NULL while on
>> > RCU's callback lists. This happens when someone invokes kfree_rcu()
>> > with the rcu_head structure at the beginning of the enclosing structure.
>> > I could add an offset to avoid this, or perhaps the kmalloc() folks
>> > could be persuaded to accept Rao Shoaib's patch moving kfree_rcu() handling to
>> > the slab allocators, so that RCU only ever sees function pointers in
>> > the ->func field.
>> >
>> > Either way, this should be hidden behind an API to allow adjustments
>> > to be made if needed. Maybe something like is_after_call_rcu()?
>> > This would (for example) allow debug-object checks to be used to catch
>> > check-after-free bugs.
>> >
>> > Would something of that sort work for you?
>>
>> Yes, if you could provide an is_after_call_rcu() API, that would
>> perfectly suit my use-case.
>
> After beating my head against the object-debug code a bit, I have to ask
> if it would be OK for you if the is_after_call_rcu() API also takes the
> function that was passed to RCU.
Sure. It feels a bit clumsy, but I can see it could be easier to make
robust.
So yes: I'm fine with passing the same function and rcu_head to both
call_rcu() and is_after_call_rcu(). Actually, when I say it like that,
it seems less clumsy :-)
Thanks,
NeilBrown
On Tue, Jul 24, 2018 at 07:52:03AM +1000, NeilBrown wrote:
> On Mon, Jul 23 2018, Paul E. McKenney wrote:
>
> > On Mon, Jul 23, 2018 at 09:13:43AM +1000, NeilBrown wrote:
> >> On Sun, Jul 22 2018, Paul E. McKenney wrote:
> >> >
> >> > One issue is that the ->func pointer can legitimately be NULL while on
> >> > RCU's callback lists. This happens when someone invokes kfree_rcu()
> >> > with the rcu_head structure at the beginning of the enclosing structure.
> >> > I could add an offset to avoid this, or perhaps the kmalloc() folks
> >> > could be persuaded to accept Rao Shoaib's patch moving kfree_rcu() handling to
> >> > the slab allocators, so that RCU only ever sees function pointers in
> >> > the ->func field.
> >> >
> >> > Either way, this should be hidden behind an API to allow adjustments
> >> > to be made if needed. Maybe something like is_after_call_rcu()?
> >> > This would (for example) allow debug-object checks to be used to catch
> >> > check-after-free bugs.
> >> >
> >> > Would something of that sort work for you?
> >>
> >> Yes, if you could provide an is_after_call_rcu() API, that would
> >> perfectly suit my use-case.
> >
> > After beating my head against the object-debug code a bit, I have to ask
> > if it would be OK for you if the is_after_call_rcu() API also takes the
> > function that was passed to RCU.
>
> Sure. It feels a bit clumsy, but I can see it could be easier to make
> robust.
> So yes: I'm fine with passing the same function and rcu_head to both
> call_rcu() and is_after_call_rcu(). Actually, when I say it like that,
> it seems less clumsy :-)
How about like this? (It needs refinements, like lockdep, but should
get the gist.)
Thanx, Paul
------------------------------------------------------------------------
commit 5aa0ebf4799b8bddbbd0124db1c008526e99fc7c
Author: Paul E. McKenney <[email protected]>
Date: Tue Jul 24 15:28:09 2018 -0700
rcu: Provide functions for determining if call_rcu() has been invoked
This commit adds is_after_call_rcu() and is_after_call_rcu_init()
functions to help RCU users detect when another CPU has passed
the specified rcu_head structure and function to call_rcu().
The is_after_call_rcu_init() should be invoked before making the
structure visible to RCU readers, and then the is_after_call_rcu() may
be invoked from within an RCU read-side critical section on an rcu_head
structure that was obtained during a traversal of the data structure
in question. The is_after_call_rcu() function will return true if the
rcu_head structure has already been passed (with the specified function)
to call_rcu(), otherwise it will return false.
If is_after_call_rcu_init() has not been invoked on the rcu_head
structure or if the rcu_head (AKA callback) has already been invoked,
then is_after_call_rcu() will do WARN_ON_ONCE().
Reported-by: NeilBrown <[email protected]>
Signed-off-by: Paul E. McKenney <[email protected]>
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index e4f821165d0b..82e5a91539b5 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -857,6 +857,45 @@ static inline notrace void rcu_read_unlock_sched_notrace(void)
#endif /* #else #ifdef CONFIG_ARCH_WEAK_RELEASE_ACQUIRE */
+/* Has the specified rcu_head structure been handed to call_rcu()? */
+
+/*
+ * is_after_call_rcu_init - Initialize rcu_head for is_after_call_rcu()
+ * @rhp: The rcu_head structure to initialize.
+ *
+ * If you intend to invoke is_after_call_rcu() to test whether a given
+ * rcu_head structure has already been passed to call_rcu(), then you must
+ * also invoke this is_after_call_rcu_init() function on it just after
+ * allocating that structure. Calls to this function must not race with
+ * calls to call_rcu(), is_after_call_rcu(), or callback invocation.
+ */
+static inline void is_after_call_rcu_init(struct rcu_head *rhp)
+{
+ rhp->func = (rcu_callback_t)~0L;
+}
+
+/*
+ * is_after_call_rcu - Has this rcu_head been passed to call_rcu()?
+ * @rhp: The rcu_head structure to test.
+ * @func: The function passed to call_rcu() along with @rhp.
+ *
+ * Returns @true if the @rhp has been passed to call_rcu() with @func, and
+ * @false otherwise. Emits a warning in any other case, including the
+ * case where @rhp has already been invoked after a grace period.
+ * Calls to this function must not race with callback invocation. One
+ * way to avoid such races is to enclose the call to is_after_call_rcu()
+ * in an RCU read-side critical section that includes a read-side fetch
+ * of the pointer to the structure containing @rhp.
+ */
+static inline bool is_after_call_rcu(struct rcu_head *rhp, rcu_callback_t f)
+{
+ if (READ_ONCE(rhp->func) == f)
+ return true;
+ WARN_ON_ONCE(READ_ONCE(rhp->func) != (rcu_callback_t)~0L);
+ return false;
+}
+
+
/* Transitional pre-consolidation compatibility definitions. */
static inline void synchronize_rcu_bh(void)
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index 5dec94509a7e..4c56c1d98fb3 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -224,6 +224,7 @@ void kfree(const void *);
*/
static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head)
{
+ rcu_callback_t f;
unsigned long offset = (unsigned long)head->func;
rcu_lock_acquire(&rcu_callback_map);
@@ -234,7 +235,9 @@ static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head)
return true;
} else {
RCU_TRACE(trace_rcu_invoke_callback(rn, head);)
- head->func(head);
+ f = head->func;
+ WRITE_ONCE(head->func, (rcu_callback_t)0L);
+ f(head);
rcu_lock_release(&rcu_callback_map);
return false;
}
On Tue, Jul 24 2018, Paul E. McKenney wrote:
> On Tue, Jul 24, 2018 at 07:52:03AM +1000, NeilBrown wrote:
>> On Mon, Jul 23 2018, Paul E. McKenney wrote:
>>
>> > On Mon, Jul 23, 2018 at 09:13:43AM +1000, NeilBrown wrote:
>> >> On Sun, Jul 22 2018, Paul E. McKenney wrote:
>> >> >
>> >> > One issue is that the ->func pointer can legitimately be NULL while on
>> >> > RCU's callback lists. This happens when someone invokes kfree_rcu()
>> >> > with the rcu_head structure at the beginning of the enclosing structure.
>> >> > I could add an offset to avoid this, or perhaps the kmalloc() folks
>> >> > could be persuaded to accept Rao Shoaib's patch moving kfree_rcu() handling to
>> >> > the slab allocators, so that RCU only ever sees function pointers in
>> >> > the ->func field.
>> >> >
>> >> > Either way, this should be hidden behind an API to allow adjustments
>> >> > to be made if needed. Maybe something like is_after_call_rcu()?
>> >> > This would (for example) allow debug-object checks to be used to catch
>> >> > check-after-free bugs.
>> >> >
>> >> > Would something of that sort work for you?
>> >>
>> >> Yes, if you could provide an is_after_call_rcu() API, that would
>> >> perfectly suit my use-case.
>> >
>> > After beating my head against the object-debug code a bit, I have to ask
>> > if it would be OK for you if the is_after_call_rcu() API also takes the
>> > function that was passed to RCU.
>>
>> Sure. It feels a bit clumsy, but I can see it could be easier to make
>> robust.
>> So yes: I'm fine with passing the same function and rcu_head to both
>> call_rcu() and is_after_call_rcu(). Actually, when I say it like that,
>> it seems less clumsy :-)
>
> How about like this? (It needs refinements, like lockdep, but should
> get the gist.)
>
Looks good ... except ... naming is hard.
is_after_call_rcu_init() asserts where in the lifecycle we are,
is_after_call_rcu() tests where in the lifecycle we are.
The names are similar but the purpose is quite different.
Maybe s/is_after_call_rcu_init/call_rcu_init/ ??
Thanks,
NeilBrown
> Thanx, Paul
>
> ------------------------------------------------------------------------
>
> commit 5aa0ebf4799b8bddbbd0124db1c008526e99fc7c
> Author: Paul E. McKenney <[email protected]>
> Date: Tue Jul 24 15:28:09 2018 -0700
>
> rcu: Provide functions for determining if call_rcu() has been invoked
>
> This commit adds is_after_call_rcu() and is_after_call_rcu_init()
> functions to help RCU users detect when another CPU has passed
> the specified rcu_head structure and function to call_rcu().
> The is_after_call_rcu_init() should be invoked before making the
> structure visible to RCU readers, and then the is_after_call_rcu() may
> be invoked from within an RCU read-side critical section on an rcu_head
> structure that was obtained during a traversal of the data structure
> in question. The is_after_call_rcu() function will return true if the
> rcu_head structure has already been passed (with the specified function)
> to call_rcu(), otherwise it will return false.
>
> If is_after_call_rcu_init() has not been invoked on the rcu_head
> structure or if the rcu_head (AKA callback) has already been invoked,
> then is_after_call_rcu() will do WARN_ON_ONCE().
>
> Reported-by: NeilBrown <[email protected]>
> Signed-off-by: Paul E. McKenney <[email protected]>
>
> diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
> index e4f821165d0b..82e5a91539b5 100644
> --- a/include/linux/rcupdate.h
> +++ b/include/linux/rcupdate.h
> @@ -857,6 +857,45 @@ static inline notrace void rcu_read_unlock_sched_notrace(void)
> #endif /* #else #ifdef CONFIG_ARCH_WEAK_RELEASE_ACQUIRE */
>
>
> +/* Has the specified rcu_head structure been handed to call_rcu()? */
> +
> +/*
> + * is_after_call_rcu_init - Initialize rcu_head for is_after_call_rcu()
> + * @rhp: The rcu_head structure to initialize.
> + *
> + * If you intend to invoke is_after_call_rcu() to test whether a given
> + * rcu_head structure has already been passed to call_rcu(), then you must
> + * also invoke this is_after_call_rcu_init() function on it just after
> + * allocating that structure. Calls to this function must not race with
> + * calls to call_rcu(), is_after_call_rcu(), or callback invocation.
> + */
> +static inline void is_after_call_rcu_init(struct rcu_head *rhp)
> +{
> + rhp->func = (rcu_callback_t)~0L;
> +}
> +
> +/*
> + * is_after_call_rcu - Has this rcu_head been passed to call_rcu()?
> + * @rhp: The rcu_head structure to test.
> + * @func: The function passed to call_rcu() along with @rhp.
> + *
> + * Returns @true if the @rhp has been passed to call_rcu() with @func, and
> + * @false otherwise. Emits a warning in any other case, including the
> + * case where @rhp has already been invoked after a grace period.
> + * Calls to this function must not race with callback invocation. One
> + * way to avoid such races is to enclose the call to is_after_call_rcu()
> + * in an RCU read-side critical section that includes a read-side fetch
> + * of the pointer to the structure containing @rhp.
> + */
> +static inline bool is_after_call_rcu(struct rcu_head *rhp, rcu_callback_t f)
> +{
> + if (READ_ONCE(rhp->func) == f)
> + return true;
> + WARN_ON_ONCE(READ_ONCE(rhp->func) != (rcu_callback_t)~0L);
> + return false;
> +}
> +
> +
> /* Transitional pre-consolidation compatibility definitions. */
>
> static inline void synchronize_rcu_bh(void)
> diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
> index 5dec94509a7e..4c56c1d98fb3 100644
> --- a/kernel/rcu/rcu.h
> +++ b/kernel/rcu/rcu.h
> @@ -224,6 +224,7 @@ void kfree(const void *);
> */
> static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head)
> {
> + rcu_callback_t f;
> unsigned long offset = (unsigned long)head->func;
>
> rcu_lock_acquire(&rcu_callback_map);
> @@ -234,7 +235,9 @@ static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head)
> return true;
> } else {
> RCU_TRACE(trace_rcu_invoke_callback(rn, head);)
> - head->func(head);
> + f = head->func;
> + WRITE_ONCE(head->func, (rcu_callback_t)0L);
> + f(head);
> rcu_lock_release(&rcu_callback_map);
> return false;
> }
On Wed, Jul 25, 2018 at 02:53:36PM +1000, NeilBrown wrote:
> On Tue, Jul 24 2018, Paul E. McKenney wrote:
>
> > On Tue, Jul 24, 2018 at 07:52:03AM +1000, NeilBrown wrote:
> >> On Mon, Jul 23 2018, Paul E. McKenney wrote:
> >>
> >> > On Mon, Jul 23, 2018 at 09:13:43AM +1000, NeilBrown wrote:
> >> >> On Sun, Jul 22 2018, Paul E. McKenney wrote:
> >> >> >
> >> >> > One issue is that the ->func pointer can legitimately be NULL while on
> >> >> > RCU's callback lists. This happens when someone invokes kfree_rcu()
> >> >> > with the rcu_head structure at the beginning of the enclosing structure.
> >> >> > I could add an offset to avoid this, or perhaps the kmalloc() folks
> >> >> > could be persuaded to accept Rao Shoaib's patch moving kfree_rcu() handling to
> >> >> > the slab allocators, so that RCU only ever sees function pointers in
> >> >> > the ->func field.
> >> >> >
> >> >> > Either way, this should be hidden behind an API to allow adjustments
> >> >> > to be made if needed. Maybe something like is_after_call_rcu()?
> >> >> > This would (for example) allow debug-object checks to be used to catch
> >> >> > check-after-free bugs.
> >> >> >
> >> >> > Would something of that sort work for you?
> >> >>
> >> >> Yes, if you could provide an is_after_call_rcu() API, that would
> >> >> perfectly suit my use-case.
> >> >
> >> > After beating my head against the object-debug code a bit, I have to ask
> >> > if it would be OK for you if the is_after_call_rcu() API also takes the
> >> > function that was passed to RCU.
> >>
> >> Sure. It feels a bit clumsy, but I can see it could be easier to make
> >> robust.
> >> So yes: I'm fine with passing the same function and rcu_head to both
> >> call_rcu() and is_after_call_rcu(). Actually, when I say it like that,
> >> it seems less clumsy :-)
> >
> > How about like this? (It needs refinements, like lockdep, but should
> > get the gist.)
> >
>
> Looks good ... except ... naming is hard.
>
> is_after_call_rcu_init() asserts where in the lifecycle we are,
> is_after_call_rcu() tests where in the lifecycle we are.
>
> The names are similar but the purpose is quite different.
> Maybe s/is_after_call_rcu_init/call_rcu_init/ ??
How about rcu_head_init() and rcu_head_after_call_rcu()?
Thanx, Paul
> Thanks,
> NeilBrown
>
>
> > Thanx, Paul
> >
> > ------------------------------------------------------------------------
> >
> > commit 5aa0ebf4799b8bddbbd0124db1c008526e99fc7c
> > Author: Paul E. McKenney <[email protected]>
> > Date: Tue Jul 24 15:28:09 2018 -0700
> >
> > rcu: Provide functions for determining if call_rcu() has been invoked
> >
> > This commit adds is_after_call_rcu() and is_after_call_rcu_init()
> > functions to help RCU users detect when another CPU has passed
> > the specified rcu_head structure and function to call_rcu().
> > The is_after_call_rcu_init() should be invoked before making the
> > structure visible to RCU readers, and then the is_after_call_rcu() may
> > be invoked from within an RCU read-side critical section on an rcu_head
> > structure that was obtained during a traversal of the data structure
> > in question. The is_after_call_rcu() function will return true if the
> > rcu_head structure has already been passed (with the specified function)
> > to call_rcu(), otherwise it will return false.
> >
> > If is_after_call_rcu_init() has not been invoked on the rcu_head
> > structure or if the rcu_head (AKA callback) has already been invoked,
> > then is_after_call_rcu() will do WARN_ON_ONCE().
> >
> > Reported-by: NeilBrown <[email protected]>
> > Signed-off-by: Paul E. McKenney <[email protected]>
> >
> > diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
> > index e4f821165d0b..82e5a91539b5 100644
> > --- a/include/linux/rcupdate.h
> > +++ b/include/linux/rcupdate.h
> > @@ -857,6 +857,45 @@ static inline notrace void rcu_read_unlock_sched_notrace(void)
> > #endif /* #else #ifdef CONFIG_ARCH_WEAK_RELEASE_ACQUIRE */
> >
> >
> > +/* Has the specified rcu_head structure been handed to call_rcu()? */
> > +
> > +/*
> > + * is_after_call_rcu_init - Initialize rcu_head for is_after_call_rcu()
> > + * @rhp: The rcu_head structure to initialize.
> > + *
> > + * If you intend to invoke is_after_call_rcu() to test whether a given
> > + * rcu_head structure has already been passed to call_rcu(), then you must
> > + * also invoke this is_after_call_rcu_init() function on it just after
> > + * allocating that structure. Calls to this function must not race with
> > + * calls to call_rcu(), is_after_call_rcu(), or callback invocation.
> > + */
> > +static inline void is_after_call_rcu_init(struct rcu_head *rhp)
> > +{
> > + rhp->func = (rcu_callback_t)~0L;
> > +}
> > +
> > +/*
> > + * is_after_call_rcu - Has this rcu_head been passed to call_rcu()?
> > + * @rhp: The rcu_head structure to test.
> > + * @func: The function passed to call_rcu() along with @rhp.
> > + *
> > + * Returns @true if the @rhp has been passed to call_rcu() with @func, and
> > + * @false otherwise. Emits a warning in any other case, including the
> > + * case where @rhp has already been invoked after a grace period.
> > + * Calls to this function must not race with callback invocation. One
> > + * way to avoid such races is to enclose the call to is_after_call_rcu()
> > + * in an RCU read-side critical section that includes a read-side fetch
> > + * of the pointer to the structure containing @rhp.
> > + */
> > +static inline bool is_after_call_rcu(struct rcu_head *rhp, rcu_callback_t f)
> > +{
> > + if (READ_ONCE(rhp->func) == f)
> > + return true;
> > + WARN_ON_ONCE(READ_ONCE(rhp->func) != (rcu_callback_t)~0L);
> > + return false;
> > +}
> > +
> > +
> > /* Transitional pre-consolidation compatibility definitions. */
> >
> > static inline void synchronize_rcu_bh(void)
> > diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
> > index 5dec94509a7e..4c56c1d98fb3 100644
> > --- a/kernel/rcu/rcu.h
> > +++ b/kernel/rcu/rcu.h
> > @@ -224,6 +224,7 @@ void kfree(const void *);
> > */
> > static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head)
> > {
> > + rcu_callback_t f;
> > unsigned long offset = (unsigned long)head->func;
> >
> > rcu_lock_acquire(&rcu_callback_map);
> > @@ -234,7 +235,9 @@ static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head)
> > return true;
> > } else {
> > RCU_TRACE(trace_rcu_invoke_callback(rn, head);)
> > - head->func(head);
> > + f = head->func;
> > + WRITE_ONCE(head->func, (rcu_callback_t)0L);
> > + f(head);
> > rcu_lock_release(&rcu_callback_map);
> > return false;
> > }
On Wed, Jul 25 2018, Paul E. McKenney wrote:
>>
>> Looks good ... except ... naming is hard.
>>
>> is_after_call_rcu_init() asserts where in the lifecycle we are,
>> is_after_call_rcu() tests where in the lifecycle we are.
>>
>> The names are similar but the purpose is quite different.
>> Maybe s/is_after_call_rcu_init/call_rcu_init/ ??
>
> How about rcu_head_init() and rcu_head_after_call_rcu()?
Yes, I like those.
Thanks,
NeilBrown
On Fri, Jul 27, 2018 at 11:04:37AM +1000, NeilBrown wrote:
> On Wed, Jul 25 2018, Paul E. McKenney wrote:
> >>
> >> Looks good ... except ... naming is hard.
> >>
> >> is_after_call_rcu_init() asserts where in the lifecycle we are,
> >> is_after_call_rcu() tests where in the lifecycle we are.
> >>
> >> The names are similar but the purpose is quite different.
> >> Maybe s/is_after_call_rcu_init/call_rcu_init/ ??
> >
> > How about rcu_head_init() and rcu_head_after_call_rcu()?
Very well, I will pull this change in on my next rebase.
Thanx, Paul
On Thu, Jul 26, 2018 at 08:18:15PM -0700, Paul E. McKenney wrote:
> On Fri, Jul 27, 2018 at 11:04:37AM +1000, NeilBrown wrote:
> > On Wed, Jul 25 2018, Paul E. McKenney wrote:
> > >>
> > >> Looks good ... except ... naming is hard.
> > >>
> > >> is_after_call_rcu_init() asserts where in the lifecycle we are,
> > >> is_after_call_rcu() tests where in the lifecycle we are.
> > >>
> > >> The names are similar but the purpose is quite different.
> > >> Maybe s/is_after_call_rcu_init/call_rcu_init/ ??
> > >
> > > How about rcu_head_init() and rcu_head_after_call_rcu()?
>
> Very well, I will pull this change in on my next rebase.
Like this?
Thanx, Paul
------------------------------------------------------------------------
commit 45dfdcaee39bf6dd1785253c78e4805122ee8d93
Author: Paul E. McKenney <[email protected]>
Date: Tue Jul 24 15:28:09 2018 -0700
rcu: Provide functions for determining if call_rcu() has been invoked
This commit adds rcu_head_init() and rcu_head_after_call_rcu()
functions to help RCU users detect when another CPU has passed
the specified rcu_head structure and function to call_rcu().
The rcu_head_init() should be invoked before making the structure
visible to RCU readers, and then the rcu_head_after_call_rcu()
may be invoked from within an RCU read-side critical section on an
rcu_head structure that was obtained during a traversal of the data
structure in question. The rcu_head_after_call_rcu() function will
return true if the rcu_head structure has already been passed (with
the specified function) to call_rcu(), otherwise it will return false.
If rcu_head_init() has not been invoked on the rcu_head structure
or if the rcu_head (AKA callback) has already been invoked, then
rcu_head_after_call_rcu() will do WARN_ON_ONCE().
Reported-by: NeilBrown <[email protected]> Signed-off-by: Paul
E. McKenney <[email protected]> [ paulmck: Apply neilb
feedback on naming. ]
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index
e4f821165d0b..4db8bcacc51a 100644 --- a/include/linux/rcupdate.h +++
b/include/linux/rcupdate.h @@ -857,6 +857,46 @@ static inline notrace
void rcu_read_unlock_sched_notrace(void)
#endif /* #else #ifdef CONFIG_ARCH_WEAK_RELEASE_ACQUIRE */
+/* Has the specified rcu_head structure been handed to call_rcu()? */ +
+/* + * rcu_head_init - Initialize rcu_head for rcu_head_after_call_rcu()
+ * @rhp: The rcu_head structure to initialize. + * + * If you intend
to invoke rcu_head_after_call_rcu() to test whether a + * given rcu_head
structure has already been passed to call_rcu(), then + * you must also
invoke this rcu_head_init() function on it just after + * allocating
that structure. Calls to this function must not race with + * calls
to call_rcu(), rcu_head_after_call_rcu(), or callback invocation.
+ */ +static inline void rcu_head_init(struct rcu_head *rhp) +{ +
rhp->func = (rcu_callback_t)~0L; +} + +/* + * rcu_head_after_call_rcu -
Has this rcu_head been passed to call_rcu()? + * @rhp: The rcu_head
structure to test. + * @func: The function passed to call_rcu()
along with @rhp. + * + * Returns @true if the @rhp has been passed to
call_rcu() with @func, + * and @false otherwise. Emits a warning in any
other case, including + * the case where @rhp has already been invoked
after a grace period. + * Calls to this function must not race with
callback invocation. One way + * to avoid such races is to enclose the
call to rcu_head_after_call_rcu() + * in an RCU read-side critical section
that includes a read-side fetch + * of the pointer to the structure
containing @rhp. + */ +static inline bool +rcu_head_after_call_rcu(struct
rcu_head *rhp, rcu_callback_t f) +{ + if (READ_ONCE(rhp->func) == f)
+ return true; + WARN_ON_ONCE(READ_ONCE(rhp->func) !=
(rcu_callback_t)~0L); + return false; +} + +
/* Transitional pre-consolidation compatibility definitions. */
static inline void synchronize_rcu_bh(void)
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index
5dec94509a7e..4c56c1d98fb3 100644 --- a/kernel/rcu/rcu.h +++
b/kernel/rcu/rcu.h @@ -224,6 +224,7 @@ void kfree(const void *);
*/
static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head) {
+ rcu_callback_t f;
unsigned long offset = (unsigned long)head->func;
rcu_lock_acquire(&rcu_callback_map);
@@ -234,7 +235,9 @@ static inline bool __rcu_reclaim(const char *rn,
struct rcu_head *head)
return true;
} else {
RCU_TRACE(trace_rcu_invoke_callback(rn, head);)
- head->func(head); + f = head->func; +
WRITE_ONCE(head->func, (rcu_callback_t)0L); + f(head);
rcu_lock_release(&rcu_callback_map); return false;
}
On Fri, Jul 27 2018, Paul E. McKenney wrote:
> On Thu, Jul 26, 2018 at 08:18:15PM -0700, Paul E. McKenney wrote:
>> On Fri, Jul 27, 2018 at 11:04:37AM +1000, NeilBrown wrote:
>> > On Wed, Jul 25 2018, Paul E. McKenney wrote:
>> > >>
>> > >> Looks good ... except ... naming is hard.
>> > >>
>> > >> is_after_call_rcu_init() asserts where in the lifecycle we are,
>> > >> is_after_call_rcu() tests where in the lifecycle we are.
>> > >>
>> > >> The names are similar but the purpose is quite different.
>> > >> Maybe s/is_after_call_rcu_init/call_rcu_init/ ??
>> > >
>> > > How about rcu_head_init() and rcu_head_after_call_rcu()?
>>
>> Very well, I will pull this change in on my next rebase.
>
> Like this?
Hard to say - unwinding white-space damage in my head is too challenging
when newlines have been deleted :-(
Thanks,
NeilBrown
>
> Thanx, Paul
>
> ------------------------------------------------------------------------
>
> commit 45dfdcaee39bf6dd1785253c78e4805122ee8d93
> Author: Paul E. McKenney <[email protected]>
> Date: Tue Jul 24 15:28:09 2018 -0700
>
> rcu: Provide functions for determining if call_rcu() has been invoked
>
> This commit adds rcu_head_init() and rcu_head_after_call_rcu()
> functions to help RCU users detect when another CPU has passed
> the specified rcu_head structure and function to call_rcu().
> The rcu_head_init() should be invoked before making the structure
> visible to RCU readers, and then the rcu_head_after_call_rcu()
> may be invoked from within an RCU read-side critical section on an
> rcu_head structure that was obtained during a traversal of the data
> structure in question. The rcu_head_after_call_rcu() function will
> return true if the rcu_head structure has already been passed (with
> the specified function) to call_rcu(), otherwise it will return false.
>
> If rcu_head_init() has not been invoked on the rcu_head structure
> or if the rcu_head (AKA callback) has already been invoked, then
> rcu_head_after_call_rcu() will do WARN_ON_ONCE().
>
> Reported-by: NeilBrown <[email protected]> Signed-off-by: Paul
> E. McKenney <[email protected]> [ paulmck: Apply neilb
> feedback on naming. ]
>
> diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index
> e4f821165d0b..4db8bcacc51a 100644 --- a/include/linux/rcupdate.h +++
> b/include/linux/rcupdate.h @@ -857,6 +857,46 @@ static inline notrace
> void rcu_read_unlock_sched_notrace(void)
> #endif /* #else #ifdef CONFIG_ARCH_WEAK_RELEASE_ACQUIRE */
>
>
> +/* Has the specified rcu_head structure been handed to call_rcu()? */ +
> +/* + * rcu_head_init - Initialize rcu_head for rcu_head_after_call_rcu()
> + * @rhp: The rcu_head structure to initialize. + * + * If you intend
> to invoke rcu_head_after_call_rcu() to test whether a + * given rcu_head
> structure has already been passed to call_rcu(), then + * you must also
> invoke this rcu_head_init() function on it just after + * allocating
> that structure. Calls to this function must not race with + * calls
> to call_rcu(), rcu_head_after_call_rcu(), or callback invocation.
> + */ +static inline void rcu_head_init(struct rcu_head *rhp) +{ +
> rhp->func = (rcu_callback_t)~0L; +} + +/* + * rcu_head_after_call_rcu -
> Has this rcu_head been passed to call_rcu()? + * @rhp: The rcu_head
> structure to test. + * @func: The function passed to call_rcu()
> along with @rhp. + * + * Returns @true if the @rhp has been passed to
> call_rcu() with @func, + * and @false otherwise. Emits a warning in any
> other case, including + * the case where @rhp has already been invoked
> after a grace period. + * Calls to this function must not race with
> callback invocation. One way + * to avoid such races is to enclose the
> call to rcu_head_after_call_rcu() + * in an RCU read-side critical section
> that includes a read-side fetch + * of the pointer to the structure
> containing @rhp. + */ +static inline bool +rcu_head_after_call_rcu(struct
> rcu_head *rhp, rcu_callback_t f) +{ + if (READ_ONCE(rhp->func) == f)
> + return true; + WARN_ON_ONCE(READ_ONCE(rhp->func) !=
> (rcu_callback_t)~0L); + return false; +} + +
> /* Transitional pre-consolidation compatibility definitions. */
>
> static inline void synchronize_rcu_bh(void)
> diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index
> 5dec94509a7e..4c56c1d98fb3 100644 --- a/kernel/rcu/rcu.h +++
> b/kernel/rcu/rcu.h @@ -224,6 +224,7 @@ void kfree(const void *);
> */
> static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head) {
> + rcu_callback_t f;
> unsigned long offset = (unsigned long)head->func;
>
> rcu_lock_acquire(&rcu_callback_map);
> @@ -234,7 +235,9 @@ static inline bool __rcu_reclaim(const char *rn,
> struct rcu_head *head)
> return true;
> } else {
> RCU_TRACE(trace_rcu_invoke_callback(rn, head);)
> - head->func(head); + f = head->func; +
> WRITE_ONCE(head->func, (rcu_callback_t)0L); + f(head);
> rcu_lock_release(&rcu_callback_map); return false;
> }
On Tue, Jul 31, 2018 at 10:45:45AM +1000, NeilBrown wrote:
> On Fri, Jul 27 2018, Paul E. McKenney wrote:
>
> > On Thu, Jul 26, 2018 at 08:18:15PM -0700, Paul E. McKenney wrote:
> >> On Fri, Jul 27, 2018 at 11:04:37AM +1000, NeilBrown wrote:
> >> > On Wed, Jul 25 2018, Paul E. McKenney wrote:
> >> > >>
> >> > >> Looks good ... except ... naming is hard.
> >> > >>
> >> > >> is_after_call_rcu_init() asserts where in the lifecycle we are,
> >> > >> is_after_call_rcu() tests where in the lifecycle we are.
> >> > >>
> >> > >> The names are similar but the purpose is quite different.
> >> > >> Maybe s/is_after_call_rcu_init/call_rcu_init/ ??
> >> > >
> >> > > How about rcu_head_init() and rcu_head_after_call_rcu()?
> >>
> >> Very well, I will pull this change in on my next rebase.
> >
> > Like this?
>
> Hard to say - unwinding white-space damage in my head is too challenging
> when newlines have been deleted :-(
What??? Don't you like block-structured code?
All kidding aside, how about the following more conventionally formatted
version?
Thanx, Paul
------------------------------------------------------------------------
commit e3408141ed7d702995b2fdc94703af88aadd226b
Author: Paul E. McKenney <[email protected]>
Date: Tue Jul 24 15:28:09 2018 -0700
rcu: Provide functions for determining if call_rcu() has been invoked
This commit adds rcu_head_init() and rcu_head_after_call_rcu() functions
to help RCU users detect when another CPU has passed the specified
rcu_head structure and function to call_rcu(). The rcu_head_init()
should be invoked before making the structure visible to RCU readers,
and then the rcu_head_after_call_rcu() may be invoked from within
an RCU read-side critical section on an rcu_head structure that
was obtained during a traversal of the data structure in question.
The rcu_head_after_call_rcu() function will return true if the rcu_head
structure has already been passed (with the specified function) to
call_rcu(), otherwise it will return false.
If rcu_head_init() has not been invoked on the rcu_head structure
or if the rcu_head (AKA callback) has already been invoked, then
rcu_head_after_call_rcu() will do WARN_ON_ONCE().
Reported-by: NeilBrown <[email protected]>
Signed-off-by: Paul E. McKenney <[email protected]>
[ paulmck: Apply neilb naming feedback. ]
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index e4f821165d0b..4db8bcacc51a 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -857,6 +857,46 @@ static inline notrace void rcu_read_unlock_sched_notrace(void)
#endif /* #else #ifdef CONFIG_ARCH_WEAK_RELEASE_ACQUIRE */
+/* Has the specified rcu_head structure been handed to call_rcu()? */
+
+/*
+ * rcu_head_init - Initialize rcu_head for rcu_head_after_call_rcu()
+ * @rhp: The rcu_head structure to initialize.
+ *
+ * If you intend to invoke rcu_head_after_call_rcu() to test whether a
+ * given rcu_head structure has already been passed to call_rcu(), then
+ * you must also invoke this rcu_head_init() function on it just after
+ * allocating that structure. Calls to this function must not race with
+ * calls to call_rcu(), rcu_head_after_call_rcu(), or callback invocation.
+ */
+static inline void rcu_head_init(struct rcu_head *rhp)
+{
+ rhp->func = (rcu_callback_t)~0L;
+}
+
+/*
+ * rcu_head_after_call_rcu - Has this rcu_head been passed to call_rcu()?
+ * @rhp: The rcu_head structure to test.
+ * @func: The function passed to call_rcu() along with @rhp.
+ *
+ * Returns @true if the @rhp has been passed to call_rcu() with @func,
+ * and @false otherwise. Emits a warning in any other case, including
+ * the case where @rhp has already been invoked after a grace period.
+ * Calls to this function must not race with callback invocation. One way
+ * to avoid such races is to enclose the call to rcu_head_after_call_rcu()
+ * in an RCU read-side critical section that includes a read-side fetch
+ * of the pointer to the structure containing @rhp.
+ */
+static inline bool
+rcu_head_after_call_rcu(struct rcu_head *rhp, rcu_callback_t f)
+{
+ if (READ_ONCE(rhp->func) == f)
+ return true;
+ WARN_ON_ONCE(READ_ONCE(rhp->func) != (rcu_callback_t)~0L);
+ return false;
+}
+
+
/* Transitional pre-consolidation compatibility definitions. */
static inline void synchronize_rcu_bh(void)
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index 5dec94509a7e..4c56c1d98fb3 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -224,6 +224,7 @@ void kfree(const void *);
*/
static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head)
{
+ rcu_callback_t f;
unsigned long offset = (unsigned long)head->func;
rcu_lock_acquire(&rcu_callback_map);
@@ -234,7 +235,9 @@ static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head)
return true;
} else {
RCU_TRACE(trace_rcu_invoke_callback(rn, head);)
- head->func(head);
+ f = head->func;
+ WRITE_ONCE(head->func, (rcu_callback_t)0L);
+ f(head);
rcu_lock_release(&rcu_callback_map);
return false;
}
On Mon, Jul 30 2018, Paul E. McKenney wrote:
> On Tue, Jul 31, 2018 at 10:45:45AM +1000, NeilBrown wrote:
>> On Fri, Jul 27 2018, Paul E. McKenney wrote:
>>
>> > On Thu, Jul 26, 2018 at 08:18:15PM -0700, Paul E. McKenney wrote:
>> >> On Fri, Jul 27, 2018 at 11:04:37AM +1000, NeilBrown wrote:
>> >> > On Wed, Jul 25 2018, Paul E. McKenney wrote:
>> >> > >>
>> >> > >> Looks good ... except ... naming is hard.
>> >> > >>
>> >> > >> is_after_call_rcu_init() asserts where in the lifecycle we are,
>> >> > >> is_after_call_rcu() tests where in the lifecycle we are.
>> >> > >>
>> >> > >> The names are similar but the purpose is quite different.
>> >> > >> Maybe s/is_after_call_rcu_init/call_rcu_init/ ??
>> >> > >
>> >> > > How about rcu_head_init() and rcu_head_after_call_rcu()?
>> >>
>> >> Very well, I will pull this change in on my next rebase.
>> >
>> > Like this?
>>
>> Hard to say - unwinding white-space damage in my head is too challenging
>> when newlines have been deleted :-(
>
> What??? Don't you like block-structured code?
>
> All kidding aside, how about the following more conventionally formatted
> version?
Wow - it's like I just got new glasses!
Yes - nice and clear and now flaws to be found. Thanks a lot.
NeilBrown
>
> Thanx, Paul
>
> ------------------------------------------------------------------------
>
> commit e3408141ed7d702995b2fdc94703af88aadd226b
> Author: Paul E. McKenney <[email protected]>
> Date: Tue Jul 24 15:28:09 2018 -0700
>
> rcu: Provide functions for determining if call_rcu() has been invoked
>
> This commit adds rcu_head_init() and rcu_head_after_call_rcu() functions
> to help RCU users detect when another CPU has passed the specified
> rcu_head structure and function to call_rcu(). The rcu_head_init()
> should be invoked before making the structure visible to RCU readers,
> and then the rcu_head_after_call_rcu() may be invoked from within
> an RCU read-side critical section on an rcu_head structure that
> was obtained during a traversal of the data structure in question.
> The rcu_head_after_call_rcu() function will return true if the rcu_head
> structure has already been passed (with the specified function) to
> call_rcu(), otherwise it will return false.
>
> If rcu_head_init() has not been invoked on the rcu_head structure
> or if the rcu_head (AKA callback) has already been invoked, then
> rcu_head_after_call_rcu() will do WARN_ON_ONCE().
>
> Reported-by: NeilBrown <[email protected]>
> Signed-off-by: Paul E. McKenney <[email protected]>
> [ paulmck: Apply neilb naming feedback. ]
>
> diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
> index e4f821165d0b..4db8bcacc51a 100644
> --- a/include/linux/rcupdate.h
> +++ b/include/linux/rcupdate.h
> @@ -857,6 +857,46 @@ static inline notrace void rcu_read_unlock_sched_notrace(void)
> #endif /* #else #ifdef CONFIG_ARCH_WEAK_RELEASE_ACQUIRE */
>
>
> +/* Has the specified rcu_head structure been handed to call_rcu()? */
> +
> +/*
> + * rcu_head_init - Initialize rcu_head for rcu_head_after_call_rcu()
> + * @rhp: The rcu_head structure to initialize.
> + *
> + * If you intend to invoke rcu_head_after_call_rcu() to test whether a
> + * given rcu_head structure has already been passed to call_rcu(), then
> + * you must also invoke this rcu_head_init() function on it just after
> + * allocating that structure. Calls to this function must not race with
> + * calls to call_rcu(), rcu_head_after_call_rcu(), or callback invocation.
> + */
> +static inline void rcu_head_init(struct rcu_head *rhp)
> +{
> + rhp->func = (rcu_callback_t)~0L;
> +}
> +
> +/*
> + * rcu_head_after_call_rcu - Has this rcu_head been passed to call_rcu()?
> + * @rhp: The rcu_head structure to test.
> + * @func: The function passed to call_rcu() along with @rhp.
> + *
> + * Returns @true if the @rhp has been passed to call_rcu() with @func,
> + * and @false otherwise. Emits a warning in any other case, including
> + * the case where @rhp has already been invoked after a grace period.
> + * Calls to this function must not race with callback invocation. One way
> + * to avoid such races is to enclose the call to rcu_head_after_call_rcu()
> + * in an RCU read-side critical section that includes a read-side fetch
> + * of the pointer to the structure containing @rhp.
> + */
> +static inline bool
> +rcu_head_after_call_rcu(struct rcu_head *rhp, rcu_callback_t f)
> +{
> + if (READ_ONCE(rhp->func) == f)
> + return true;
> + WARN_ON_ONCE(READ_ONCE(rhp->func) != (rcu_callback_t)~0L);
> + return false;
> +}
> +
> +
> /* Transitional pre-consolidation compatibility definitions. */
>
> static inline void synchronize_rcu_bh(void)
> diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
> index 5dec94509a7e..4c56c1d98fb3 100644
> --- a/kernel/rcu/rcu.h
> +++ b/kernel/rcu/rcu.h
> @@ -224,6 +224,7 @@ void kfree(const void *);
> */
> static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head)
> {
> + rcu_callback_t f;
> unsigned long offset = (unsigned long)head->func;
>
> rcu_lock_acquire(&rcu_callback_map);
> @@ -234,7 +235,9 @@ static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head)
> return true;
> } else {
> RCU_TRACE(trace_rcu_invoke_callback(rn, head);)
> - head->func(head);
> + f = head->func;
> + WRITE_ONCE(head->func, (rcu_callback_t)0L);
> + f(head);
> rcu_lock_release(&rcu_callback_map);
> return false;
> }
On Tue, Jul 31, 2018 at 03:04:48PM +1000, NeilBrown wrote:
> On Mon, Jul 30 2018, Paul E. McKenney wrote:
>
> > On Tue, Jul 31, 2018 at 10:45:45AM +1000, NeilBrown wrote:
> >> On Fri, Jul 27 2018, Paul E. McKenney wrote:
> >>
> >> > On Thu, Jul 26, 2018 at 08:18:15PM -0700, Paul E. McKenney wrote:
> >> >> On Fri, Jul 27, 2018 at 11:04:37AM +1000, NeilBrown wrote:
> >> >> > On Wed, Jul 25 2018, Paul E. McKenney wrote:
> >> >> > >>
> >> >> > >> Looks good ... except ... naming is hard.
> >> >> > >>
> >> >> > >> is_after_call_rcu_init() asserts where in the lifecycle we are,
> >> >> > >> is_after_call_rcu() tests where in the lifecycle we are.
> >> >> > >>
> >> >> > >> The names are similar but the purpose is quite different.
> >> >> > >> Maybe s/is_after_call_rcu_init/call_rcu_init/ ??
> >> >> > >
> >> >> > > How about rcu_head_init() and rcu_head_after_call_rcu()?
> >> >>
> >> >> Very well, I will pull this change in on my next rebase.
> >> >
> >> > Like this?
> >>
> >> Hard to say - unwinding white-space damage in my head is too challenging
> >> when newlines have been deleted :-(
> >
> > What??? Don't you like block-structured code?
> >
> > All kidding aside, how about the following more conventionally formatted
> > version?
>
> Wow - it's like I just got new glasses!
> Yes - nice an clear and now flaws to be found. Thanks a lot.
Now that flaws are to be found, please feel free to report them. ;-)
Thanx, Paul
> NeilBrown
>
> >
> > Thanx, Paul
> >
> > ------------------------------------------------------------------------
> >
> > commit e3408141ed7d702995b2fdc94703af88aadd226b
> > Author: Paul E. McKenney <[email protected]>
> > Date: Tue Jul 24 15:28:09 2018 -0700
> >
> > rcu: Provide functions for determining if call_rcu() has been invoked
> >
> > This commit adds rcu_head_init() and rcu_head_after_call_rcu() functions
> > to help RCU users detect when another CPU has passed the specified
> > rcu_head structure and function to call_rcu(). The rcu_head_init()
> > should be invoked before making the structure visible to RCU readers,
> > and then the rcu_head_after_call_rcu() may be invoked from within
> > an RCU read-side critical section on an rcu_head structure that
> > was obtained during a traversal of the data structure in question.
> > The rcu_head_after_call_rcu() function will return true if the rcu_head
> > structure has already been passed (with the specified function) to
> > call_rcu(), otherwise it will return false.
> >
> > If rcu_head_init() has not been invoked on the rcu_head structure
> > or if the rcu_head (AKA callback) has already been invoked, then
> > rcu_head_after_call_rcu() will do WARN_ON_ONCE().
> >
> > Reported-by: NeilBrown <[email protected]>
> > Signed-off-by: Paul E. McKenney <[email protected]>
> > [ paulmck: Apply neilb naming feedback. ]
> >
> > diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
> > index e4f821165d0b..4db8bcacc51a 100644
> > --- a/include/linux/rcupdate.h
> > +++ b/include/linux/rcupdate.h
> > @@ -857,6 +857,46 @@ static inline notrace void rcu_read_unlock_sched_notrace(void)
> > #endif /* #else #ifdef CONFIG_ARCH_WEAK_RELEASE_ACQUIRE */
> >
> >
> > +/* Has the specified rcu_head structure been handed to call_rcu()? */
> > +
> > +/*
> > + * rcu_head_init - Initialize rcu_head for rcu_head_after_call_rcu()
> > + * @rhp: The rcu_head structure to initialize.
> > + *
> > + * If you intend to invoke rcu_head_after_call_rcu() to test whether a
> > + * given rcu_head structure has already been passed to call_rcu(), then
> > + * you must also invoke this rcu_head_init() function on it just after
> > + * allocating that structure. Calls to this function must not race with
> > + * calls to call_rcu(), rcu_head_after_call_rcu(), or callback invocation.
> > + */
> > +static inline void rcu_head_init(struct rcu_head *rhp)
> > +{
> > + rhp->func = (rcu_callback_t)~0L;
> > +}
> > +
> > +/*
> > + * rcu_head_after_call_rcu - Has this rcu_head been passed to call_rcu()?
> > + * @rhp: The rcu_head structure to test.
> > + * @func: The function passed to call_rcu() along with @rhp.
> > + *
> > + * Returns @true if the @rhp has been passed to call_rcu() with @func,
> > + * and @false otherwise. Emits a warning in any other case, including
> > + * the case where @rhp has already been invoked after a grace period.
> > + * Calls to this function must not race with callback invocation. One way
> > + * to avoid such races is to enclose the call to rcu_head_after_call_rcu()
> > + * in an RCU read-side critical section that includes a read-side fetch
> > + * of the pointer to the structure containing @rhp.
> > + */
> > +static inline bool
> > +rcu_head_after_call_rcu(struct rcu_head *rhp, rcu_callback_t f)
> > +{
> > + if (READ_ONCE(rhp->func) == f)
> > + return true;
> > + WARN_ON_ONCE(READ_ONCE(rhp->func) != (rcu_callback_t)~0L);
> > + return false;
> > +}
> > +
> > +
> > /* Transitional pre-consolidation compatibility definitions. */
> >
> > static inline void synchronize_rcu_bh(void)
> > diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
> > index 5dec94509a7e..4c56c1d98fb3 100644
> > --- a/kernel/rcu/rcu.h
> > +++ b/kernel/rcu/rcu.h
> > @@ -224,6 +224,7 @@ void kfree(const void *);
> > */
> > static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head)
> > {
> > + rcu_callback_t f;
> > unsigned long offset = (unsigned long)head->func;
> >
> > rcu_lock_acquire(&rcu_callback_map);
> > @@ -234,7 +235,9 @@ static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head)
> > return true;
> > } else {
> > RCU_TRACE(trace_rcu_invoke_callback(rn, head);)
> > - head->func(head);
> > + f = head->func;
> > + WRITE_ONCE(head->func, (rcu_callback_t)0L);
> > + f(head);
> > rcu_lock_release(&rcu_callback_map);
> > return false;
> > }
On Tue, Jul 31, 2018 at 07:44:29AM -0700, Paul E. McKenney wrote:
> On Tue, Jul 31, 2018 at 03:04:48PM +1000, NeilBrown wrote:
> > On Mon, Jul 30 2018, Paul E. McKenney wrote:
> >
> > > On Tue, Jul 31, 2018 at 10:45:45AM +1000, NeilBrown wrote:
> > >> On Fri, Jul 27 2018, Paul E. McKenney wrote:
> > >>
> > >> > On Thu, Jul 26, 2018 at 08:18:15PM -0700, Paul E. McKenney wrote:
> > >> >> On Fri, Jul 27, 2018 at 11:04:37AM +1000, NeilBrown wrote:
> > >> >> > On Wed, Jul 25 2018, Paul E. McKenney wrote:
> > >> >> > >>
> > >> >> > >> Looks good ... except ... naming is hard.
> > >> >> > >>
> > >> >> > >> is_after_call_rcu_init() asserts where in the lifecycle we are,
> > >> >> > >> is_after_call_rcu() tests where in the lifecycle we are.
> > >> >> > >>
> > >> >> > >> The names are similar but the purpose is quite different.
> > >> >> > >> Maybe s/is_after_call_rcu_init/call_rcu_init/ ??
> > >> >> > >
> > >> >> > > How about rcu_head_init() and rcu_head_after_call_rcu()?
> > >> >>
> > >> >> Very well, I will pull this change in on my next rebase.
> > >> >
> > >> > Like this?
> > >>
> > >> Hard to say - unwinding white-space damage in my head is too challenging
> > >> when newlines have been deleted :-(
> > >
> > > What??? Don't you like block-structured code?
> > >
> > > All kidding aside, how about the following more conventionally formatted
> > > version?
> >
> > Wow - it's like I just got new glasses!
> > Yes - nice an clear and now flaws to be found. Thanks a lot.
>
> Now that flaws are to be found, please feel free to report them. ;-)
Hello, Neil,
Any plans to use these functions? There are still no upstream uses.
On the other hand, if they proved unuseful, I will remove them. If I
don't hear otherwise from you, I will pull them in v5.2.
Thanx, Paul
> > NeilBrown
> >
> > >
> > > Thanx, Paul
> > >
> > > ------------------------------------------------------------------------
> > >
> > > commit e3408141ed7d702995b2fdc94703af88aadd226b
> > > Author: Paul E. McKenney <[email protected]>
> > > Date: Tue Jul 24 15:28:09 2018 -0700
> > >
> > > rcu: Provide functions for determining if call_rcu() has been invoked
> > >
> > > This commit adds rcu_head_init() and rcu_head_after_call_rcu() functions
> > > to help RCU users detect when another CPU has passed the specified
> > > rcu_head structure and function to call_rcu(). The rcu_head_init()
> > > should be invoked before making the structure visible to RCU readers,
> > > and then the rcu_head_after_call_rcu() may be invoked from within
> > > an RCU read-side critical section on an rcu_head structure that
> > > was obtained during a traversal of the data structure in question.
> > > The rcu_head_after_call_rcu() function will return true if the rcu_head
> > > structure has already been passed (with the specified function) to
> > > call_rcu(), otherwise it will return false.
> > >
> > > If rcu_head_init() has not been invoked on the rcu_head structure
> > > or if the rcu_head (AKA callback) has already been invoked, then
> > > rcu_head_after_call_rcu() will do WARN_ON_ONCE().
> > >
> > > Reported-by: NeilBrown <[email protected]>
> > > Signed-off-by: Paul E. McKenney <[email protected]>
> > > [ paulmck: Apply neilb naming feedback. ]
> > >
> > > diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
> > > index e4f821165d0b..4db8bcacc51a 100644
> > > --- a/include/linux/rcupdate.h
> > > +++ b/include/linux/rcupdate.h
> > > @@ -857,6 +857,46 @@ static inline notrace void rcu_read_unlock_sched_notrace(void)
> > > #endif /* #else #ifdef CONFIG_ARCH_WEAK_RELEASE_ACQUIRE */
> > >
> > >
> > > +/* Has the specified rcu_head structure been handed to call_rcu()? */
> > > +
> > > +/*
> > > + * rcu_head_init - Initialize rcu_head for rcu_head_after_call_rcu()
> > > + * @rhp: The rcu_head structure to initialize.
> > > + *
> > > + * If you intend to invoke rcu_head_after_call_rcu() to test whether a
> > > + * given rcu_head structure has already been passed to call_rcu(), then
> > > + * you must also invoke this rcu_head_init() function on it just after
> > > + * allocating that structure. Calls to this function must not race with
> > > + * calls to call_rcu(), rcu_head_after_call_rcu(), or callback invocation.
> > > + */
> > > +static inline void rcu_head_init(struct rcu_head *rhp)
> > > +{
> > > + rhp->func = (rcu_callback_t)~0L;
> > > +}
> > > +
> > > +/*
> > > + * rcu_head_after_call_rcu - Has this rcu_head been passed to call_rcu()?
> > > + * @rhp: The rcu_head structure to test.
> > > + * @func: The function passed to call_rcu() along with @rhp.
> > > + *
> > > + * Returns @true if the @rhp has been passed to call_rcu() with @func,
> > > + * and @false otherwise. Emits a warning in any other case, including
> > > + * the case where @rhp has already been invoked after a grace period.
> > > + * Calls to this function must not race with callback invocation. One way
> > > + * to avoid such races is to enclose the call to rcu_head_after_call_rcu()
> > > + * in an RCU read-side critical section that includes a read-side fetch
> > > + * of the pointer to the structure containing @rhp.
> > > + */
> > > +static inline bool
> > > +rcu_head_after_call_rcu(struct rcu_head *rhp, rcu_callback_t f)
> > > +{
> > > + if (READ_ONCE(rhp->func) == f)
> > > + return true;
> > > + WARN_ON_ONCE(READ_ONCE(rhp->func) != (rcu_callback_t)~0L);
> > > + return false;
> > > +}
> > > +
> > > +
> > > /* Transitional pre-consolidation compatibility definitions. */
> > >
> > > static inline void synchronize_rcu_bh(void)
> > > diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
> > > index 5dec94509a7e..4c56c1d98fb3 100644
> > > --- a/kernel/rcu/rcu.h
> > > +++ b/kernel/rcu/rcu.h
> > > @@ -224,6 +224,7 @@ void kfree(const void *);
> > > */
> > > static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head)
> > > {
> > > + rcu_callback_t f;
> > > unsigned long offset = (unsigned long)head->func;
> > >
> > > rcu_lock_acquire(&rcu_callback_map);
> > > @@ -234,7 +235,9 @@ static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head)
> > > return true;
> > > } else {
> > > RCU_TRACE(trace_rcu_invoke_callback(rn, head);)
> > > - head->func(head);
> > > + f = head->func;
> > > + WRITE_ONCE(head->func, (rcu_callback_t)0L);
> > > + f(head);
> > > rcu_lock_release(&rcu_callback_map);
> > > return false;
> > > }
>
>
On Mon, Mar 11 2019, Paul E. McKenney wrote:
> On Tue, Jul 31, 2018 at 07:44:29AM -0700, Paul E. McKenney wrote:
>> On Tue, Jul 31, 2018 at 03:04:48PM +1000, NeilBrown wrote:
>> > On Mon, Jul 30 2018, Paul E. McKenney wrote:
>> >
>> > > On Tue, Jul 31, 2018 at 10:45:45AM +1000, NeilBrown wrote:
>> > >> On Fri, Jul 27 2018, Paul E. McKenney wrote:
>> > >>
>> > >> > On Thu, Jul 26, 2018 at 08:18:15PM -0700, Paul E. McKenney wrote:
>> > >> >> On Fri, Jul 27, 2018 at 11:04:37AM +1000, NeilBrown wrote:
>> > >> >> > On Wed, Jul 25 2018, Paul E. McKenney wrote:
>> > >> >> > >>
>> > >> >> > >> Looks good ... except ... naming is hard.
>> > >> >> > >>
>> > >> >> > >> is_after_call_rcu_init() asserts where in the lifecycle we are,
>> > >> >> > >> is_after_call_rcu() tests where in the lifecycle we are.
>> > >> >> > >>
>> > >> >> > >> The names are similar but the purpose is quite different.
>> > >> >> > >> Maybe s/is_after_call_rcu_init/call_rcu_init/ ??
>> > >> >> > >
>> > >> >> > > How about rcu_head_init() and rcu_head_after_call_rcu()?
>> > >> >>
>> > >> >> Very well, I will pull this change in on my next rebase.
>> > >> >
>> > >> > Like this?
>> > >>
>> > >> Hard to say - unwinding white-space damage in my head is too challenging
>> > >> when newlines have been deleted :-(
>> > >
>> > > What??? Don't you like block-structured code?
>> > >
>> > > All kidding aside, how about the following more conventionally formatted
>> > > version?
>> >
>> > Wow - it's like I just got new glasses!
>> > Yes - nice an clear and now flaws to be found. Thanks a lot.
>>
>> Now that flaws are to be found, please feel free to report them. ;-)
>
> Hello, Neil,
>
> Any plans to use these functions? There are still no upstream uses.
> On the other hand, if they proved unuseful, I will remove them. If I
> don't hear otherwise from you, I will pull them in v5.2.
Hi Paul,
yes, I do still have plans for them. I've got quite a few things I
want to add to rhashtables including this, but got stalled late last
year and I haven't managed to get back to it.
Thanks for your prompting - I'll make an effort to post some patches
soon, particularly the one that makes use of this new functionality.
Thanks,
NeilBrown
>
> Thanx, Paul
>
>> > NeilBrown
>> >
>> > >
>> > > Thanx, Paul
>> > >
>> > > ------------------------------------------------------------------------
>> > >
>> > > commit e3408141ed7d702995b2fdc94703af88aadd226b
>> > > Author: Paul E. McKenney <[email protected]>
>> > > Date: Tue Jul 24 15:28:09 2018 -0700
>> > >
>> > > rcu: Provide functions for determining if call_rcu() has been invoked
>> > >
>> > > This commit adds rcu_head_init() and rcu_head_after_call_rcu() functions
>> > > to help RCU users detect when another CPU has passed the specified
>> > > rcu_head structure and function to call_rcu(). The rcu_head_init()
>> > > should be invoked before making the structure visible to RCU readers,
>> > > and then the rcu_head_after_call_rcu() may be invoked from within
>> > > an RCU read-side critical section on an rcu_head structure that
>> > > was obtained during a traversal of the data structure in question.
>> > > The rcu_head_after_call_rcu() function will return true if the rcu_head
>> > > structure has already been passed (with the specified function) to
>> > > call_rcu(), otherwise it will return false.
>> > >
>> > > If rcu_head_init() has not been invoked on the rcu_head structure
>> > > or if the rcu_head (AKA callback) has already been invoked, then
>> > > rcu_head_after_call_rcu() will do WARN_ON_ONCE().
>> > >
>> > > Reported-by: NeilBrown <[email protected]>
>> > > Signed-off-by: Paul E. McKenney <[email protected]>
>> > > [ paulmck: Apply neilb naming feedback. ]
>> > >
>> > > diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
>> > > index e4f821165d0b..4db8bcacc51a 100644
>> > > --- a/include/linux/rcupdate.h
>> > > +++ b/include/linux/rcupdate.h
>> > > @@ -857,6 +857,46 @@ static inline notrace void rcu_read_unlock_sched_notrace(void)
>> > > #endif /* #else #ifdef CONFIG_ARCH_WEAK_RELEASE_ACQUIRE */
>> > >
>> > >
>> > > +/* Has the specified rcu_head structure been handed to call_rcu()? */
>> > > +
>> > > +/*
>> > > + * rcu_head_init - Initialize rcu_head for rcu_head_after_call_rcu()
>> > > + * @rhp: The rcu_head structure to initialize.
>> > > + *
>> > > + * If you intend to invoke rcu_head_after_call_rcu() to test whether a
>> > > + * given rcu_head structure has already been passed to call_rcu(), then
>> > > + * you must also invoke this rcu_head_init() function on it just after
>> > > + * allocating that structure. Calls to this function must not race with
>> > > + * calls to call_rcu(), rcu_head_after_call_rcu(), or callback invocation.
>> > > + */
>> > > +static inline void rcu_head_init(struct rcu_head *rhp)
>> > > +{
>> > > + rhp->func = (rcu_callback_t)~0L;
>> > > +}
>> > > +
>> > > +/*
>> > > + * rcu_head_after_call_rcu - Has this rcu_head been passed to call_rcu()?
>> > > + * @rhp: The rcu_head structure to test.
>> > > + * @func: The function passed to call_rcu() along with @rhp.
>> > > + *
>> > > + * Returns @true if the @rhp has been passed to call_rcu() with @func,
>> > > + * and @false otherwise. Emits a warning in any other case, including
>> > > + * the case where @rhp has already been invoked after a grace period.
>> > > + * Calls to this function must not race with callback invocation. One way
>> > > + * to avoid such races is to enclose the call to rcu_head_after_call_rcu()
>> > > + * in an RCU read-side critical section that includes a read-side fetch
>> > > + * of the pointer to the structure containing @rhp.
>> > > + */
>> > > +static inline bool
>> > > +rcu_head_after_call_rcu(struct rcu_head *rhp, rcu_callback_t f)
>> > > +{
>> > > + if (READ_ONCE(rhp->func) == f)
>> > > + return true;
>> > > + WARN_ON_ONCE(READ_ONCE(rhp->func) != (rcu_callback_t)~0L);
>> > > + return false;
>> > > +}
>> > > +
>> > > +
>> > > /* Transitional pre-consolidation compatibility definitions. */
>> > >
>> > > static inline void synchronize_rcu_bh(void)
>> > > diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
>> > > index 5dec94509a7e..4c56c1d98fb3 100644
>> > > --- a/kernel/rcu/rcu.h
>> > > +++ b/kernel/rcu/rcu.h
>> > > @@ -224,6 +224,7 @@ void kfree(const void *);
>> > > */
>> > > static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head)
>> > > {
>> > > + rcu_callback_t f;
>> > > unsigned long offset = (unsigned long)head->func;
>> > >
>> > > rcu_lock_acquire(&rcu_callback_map);
>> > > @@ -234,7 +235,9 @@ static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head)
>> > > return true;
>> > > } else {
>> > > RCU_TRACE(trace_rcu_invoke_callback(rn, head);)
>> > > - head->func(head);
>> > > + f = head->func;
>> > > + WRITE_ONCE(head->func, (rcu_callback_t)0L);
>> > > + f(head);
>> > > rcu_lock_release(&rcu_callback_map);
>> > > return false;
>> > > }
>>
>>
On Tue, Mar 12, 2019 at 08:50:05AM +1100, NeilBrown wrote:
> On Mon, Mar 11 2019, Paul E. McKenney wrote:
>
> > On Tue, Jul 31, 2018 at 07:44:29AM -0700, Paul E. McKenney wrote:
> >> On Tue, Jul 31, 2018 at 03:04:48PM +1000, NeilBrown wrote:
> >> > On Mon, Jul 30 2018, Paul E. McKenney wrote:
> >> >
> >> > > On Tue, Jul 31, 2018 at 10:45:45AM +1000, NeilBrown wrote:
> >> > >> On Fri, Jul 27 2018, Paul E. McKenney wrote:
> >> > >>
> >> > >> > On Thu, Jul 26, 2018 at 08:18:15PM -0700, Paul E. McKenney wrote:
> >> > >> >> On Fri, Jul 27, 2018 at 11:04:37AM +1000, NeilBrown wrote:
> >> > >> >> > On Wed, Jul 25 2018, Paul E. McKenney wrote:
> >> > >> >> > >>
> >> > >> >> > >> Looks good ... except ... naming is hard.
> >> > >> >> > >>
> >> > >> >> > >> is_after_call_rcu_init() asserts where in the lifecycle we are,
> >> > >> >> > >> is_after_call_rcu() tests where in the lifecycle we are.
> >> > >> >> > >>
> >> > >> >> > >> The names are similar but the purpose is quite different.
> >> > >> >> > >> Maybe s/is_after_call_rcu_init/call_rcu_init/ ??
> >> > >> >> > >
> >> > >> >> > > How about rcu_head_init() and rcu_head_after_call_rcu()?
> >> > >> >>
> >> > >> >> Very well, I will pull this change in on my next rebase.
> >> > >> >
> >> > >> > Like this?
> >> > >>
> >> > >> Hard to say - unwinding white-space damage in my head is too challenging
> >> > >> when newlines have been deleted :-(
> >> > >
> >> > > What??? Don't you like block-structured code?
> >> > >
> >> > > All kidding aside, how about the following more conventionally formatted
> >> > > version?
> >> >
> >> > Wow - it's like I just got new glasses!
> >> > Yes - nice an clear and now flaws to be found. Thanks a lot.
> >>
> >> Now that flaws are to be found, please feel free to report them. ;-)
> >
> > Hello, Neil,
> >
> > Any plans to use these functions? There are still no upstream uses.
> > On the other hand, if they proved unuseful, I will remove them. If I
> > don't hear otherwise from you, I will pull them in v5.2.
>
> Hi Paul,
> yes, I do still have plans for them. I've got quite a few things I
> want to add to rhashtables including this, but got stalled late last
> year and I haven't managed to get back to it.
> Thanks for your prompting - I'll make an effort to post some patches
> soon, particularly the one that makes use of this new functionality.
OK, I won't remove it. Not just yet, anyway. ;-)
Thanx, Paul
> Thanks,
> NeilBrown
>
>
> >
> > Thanx, Paul
> >
> >> > NeilBrown
> >> >
> >> > >
> >> > > Thanx, Paul
> >> > >
> >> > > ------------------------------------------------------------------------
> >> > >
> >> > > commit e3408141ed7d702995b2fdc94703af88aadd226b
> >> > > Author: Paul E. McKenney <[email protected]>
> >> > > Date: Tue Jul 24 15:28:09 2018 -0700
> >> > >
> >> > > rcu: Provide functions for determining if call_rcu() has been invoked
> >> > >
> >> > > This commit adds rcu_head_init() and rcu_head_after_call_rcu() functions
> >> > > to help RCU users detect when another CPU has passed the specified
> >> > > rcu_head structure and function to call_rcu(). The rcu_head_init()
> >> > > should be invoked before making the structure visible to RCU readers,
> >> > > and then the rcu_head_after_call_rcu() may be invoked from within
> >> > > an RCU read-side critical section on an rcu_head structure that
> >> > > was obtained during a traversal of the data structure in question.
> >> > > The rcu_head_after_call_rcu() function will return true if the rcu_head
> >> > > structure has already been passed (with the specified function) to
> >> > > call_rcu(), otherwise it will return false.
> >> > >
> >> > > If rcu_head_init() has not been invoked on the rcu_head structure
> >> > > or if the rcu_head (AKA callback) has already been invoked, then
> >> > > rcu_head_after_call_rcu() will do WARN_ON_ONCE().
> >> > >
> >> > > Reported-by: NeilBrown <[email protected]>
> >> > > Signed-off-by: Paul E. McKenney <[email protected]>
> >> > > [ paulmck: Apply neilb naming feedback. ]
> >> > >
> >> > > diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
> >> > > index e4f821165d0b..4db8bcacc51a 100644
> >> > > --- a/include/linux/rcupdate.h
> >> > > +++ b/include/linux/rcupdate.h
> >> > > @@ -857,6 +857,46 @@ static inline notrace void rcu_read_unlock_sched_notrace(void)
> >> > > #endif /* #else #ifdef CONFIG_ARCH_WEAK_RELEASE_ACQUIRE */
> >> > >
> >> > >
> >> > > +/* Has the specified rcu_head structure been handed to call_rcu()? */
> >> > > +
> >> > > +/*
> >> > > + * rcu_head_init - Initialize rcu_head for rcu_head_after_call_rcu()
> >> > > + * @rhp: The rcu_head structure to initialize.
> >> > > + *
> >> > > + * If you intend to invoke rcu_head_after_call_rcu() to test whether a
> >> > > + * given rcu_head structure has already been passed to call_rcu(), then
> >> > > + * you must also invoke this rcu_head_init() function on it just after
> >> > > + * allocating that structure. Calls to this function must not race with
> >> > > + * calls to call_rcu(), rcu_head_after_call_rcu(), or callback invocation.
> >> > > + */
> >> > > +static inline void rcu_head_init(struct rcu_head *rhp)
> >> > > +{
> >> > > + rhp->func = (rcu_callback_t)~0L;
> >> > > +}
> >> > > +
> >> > > +/*
> >> > > + * rcu_head_after_call_rcu - Has this rcu_head been passed to call_rcu()?
> >> > > + * @rhp: The rcu_head structure to test.
> >> > > + * @func: The function passed to call_rcu() along with @rhp.
> >> > > + *
> >> > > + * Returns @true if the @rhp has been passed to call_rcu() with @func,
> >> > > + * and @false otherwise. Emits a warning in any other case, including
> >> > > + * the case where @rhp has already been invoked after a grace period.
> >> > > + * Calls to this function must not race with callback invocation. One way
> >> > > + * to avoid such races is to enclose the call to rcu_head_after_call_rcu()
> >> > > + * in an RCU read-side critical section that includes a read-side fetch
> >> > > + * of the pointer to the structure containing @rhp.
> >> > > + */
> >> > > +static inline bool
> >> > > +rcu_head_after_call_rcu(struct rcu_head *rhp, rcu_callback_t f)
> >> > > +{
> >> > > + if (READ_ONCE(rhp->func) == f)
> >> > > + return true;
> >> > > + WARN_ON_ONCE(READ_ONCE(rhp->func) != (rcu_callback_t)~0L);
> >> > > + return false;
> >> > > +}
> >> > > +
> >> > > +
> >> > > /* Transitional pre-consolidation compatibility definitions. */
> >> > >
> >> > > static inline void synchronize_rcu_bh(void)
> >> > > diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
> >> > > index 5dec94509a7e..4c56c1d98fb3 100644
> >> > > --- a/kernel/rcu/rcu.h
> >> > > +++ b/kernel/rcu/rcu.h
> >> > > @@ -224,6 +224,7 @@ void kfree(const void *);
> >> > > */
> >> > > static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head)
> >> > > {
> >> > > + rcu_callback_t f;
> >> > > unsigned long offset = (unsigned long)head->func;
> >> > >
> >> > > rcu_lock_acquire(&rcu_callback_map);
> >> > > @@ -234,7 +235,9 @@ static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head)
> >> > > return true;
> >> > > } else {
> >> > > RCU_TRACE(trace_rcu_invoke_callback(rn, head);)
> >> > > - head->func(head);
> >> > > + f = head->func;
> >> > > + WRITE_ONCE(head->func, (rcu_callback_t)0L);
> >> > > + f(head);
> >> > > rcu_lock_release(&rcu_callback_map);
> >> > > return false;
> >> > > }
> >>
> >>