This series contains some bugfixes, mostly minor, though one
is worthy of a stable backport I think - tagged with Fixes and Cc: stable.
Then there are improvements to walking, which have been discussed
to some degree already.
Finally a code simplification which I think is correct...
Thanks,
NeilBrown
---
NeilBrown (8):
rhashtable: silence RCU warning in rhashtable_test.
rhashtable: remove nulls_base and related code.
rhashtable: use cmpxchg() to protect ->future_tbl.
rhashtable: fix race in nested_table_alloc()
rhashtable: remove rhashtable_walk_peek()
rhashtable: further improve stability of rhashtable_walk
rhashtable: add rhashtable_walk_prev()
rhashtable: don't hold lock on first table throughout insertion.
include/linux/rhashtable.h | 61 +++----------
lib/rhashtable.c | 202 +++++++++++++++++++++-----------------------
lib/test_rhashtable.c | 8 +-
3 files changed, 113 insertions(+), 158 deletions(-)
--
Signature
This "feature" is unused, undocumented, and untested and so
doesn't really belong. If a use for the nulls marker
is found, all this code would need to be reviewed to
ensure it works as required. It would be just as easy to
add the code if/when it is needed instead.
This patch actually fixes a bug too. The table resizing allows a
table to grow to 2^31 buckets, but the hash is truncated to 27 bits -
any growth beyond 2^27 is wasteful and ineffective.
This patch results in NULLS_MARKER(0) being used for all chains,
and leaves rht_is_a_nulls() as the way to test for it.
Signed-off-by: NeilBrown <[email protected]>
---
include/linux/rhashtable.h | 35 +++--------------------------------
lib/rhashtable.c | 8 --------
lib/test_rhashtable.c | 5 +----
3 files changed, 4 insertions(+), 44 deletions(-)
diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h
index 4e1f535c2034..8822924dd05a 100644
--- a/include/linux/rhashtable.h
+++ b/include/linux/rhashtable.h
@@ -29,25 +29,8 @@
/*
* The end of the chain is marked with a special nulls marks which has
- * the following format:
- *
- * +-------+-----------------------------------------------------+-+
- * | Base | Hash |1|
- * +-------+-----------------------------------------------------+-+
- *
- * Base (4 bits) : Reserved to distinguish between multiple tables.
- * Specified via &struct rhashtable_params.nulls_base.
- * Hash (27 bits): Full hash (unmasked) of first element added to bucket
- * 1 (1 bit) : Nulls marker (always set)
- *
- * The remaining bits of the next pointer remain unused for now.
+ * the least significant bit set.
*/
-#define RHT_BASE_BITS 4
-#define RHT_HASH_BITS 27
-#define RHT_BASE_SHIFT RHT_HASH_BITS
-
-/* Base bits plus 1 bit for nulls marker */
-#define RHT_HASH_RESERVED_SPACE (RHT_BASE_BITS + 1)
/* Maximum chain length before rehash
*
@@ -129,7 +112,6 @@ struct rhashtable;
* @min_size: Minimum size while shrinking
* @locks_mul: Number of bucket locks to allocate per cpu (default: 32)
* @automatic_shrinking: Enable automatic shrinking of tables
- * @nulls_base: Base value to generate nulls marker
* @hashfn: Hash function (default: jhash2 if !(key_len % 4), or jhash)
* @obj_hashfn: Function to hash object
* @obj_cmpfn: Function to compare key with object
@@ -143,7 +125,6 @@ struct rhashtable_params {
u16 min_size;
bool automatic_shrinking;
u8 locks_mul;
- u32 nulls_base;
rht_hashfn_t hashfn;
rht_obj_hashfn_t obj_hashfn;
rht_obj_cmpfn_t obj_cmpfn;
@@ -210,24 +191,14 @@ struct rhashtable_iter {
bool end_of_table;
};
-static inline unsigned long rht_marker(const struct rhashtable *ht, u32 hash)
-{
- return NULLS_MARKER(ht->p.nulls_base + hash);
-}
-
#define INIT_RHT_NULLS_HEAD(ptr, ht, hash) \
- ((ptr) = (typeof(ptr)) rht_marker(ht, hash))
+ ((ptr) = (typeof(ptr)) NULLS_MARKER(0))
static inline bool rht_is_a_nulls(const struct rhash_head *ptr)
{
return ((unsigned long) ptr & 1);
}
-static inline unsigned long rht_get_nulls_value(const struct rhash_head *ptr)
-{
- return ((unsigned long) ptr) >> 1;
-}
-
static inline void *rht_obj(const struct rhashtable *ht,
const struct rhash_head *he)
{
@@ -237,7 +208,7 @@ static inline void *rht_obj(const struct rhashtable *ht,
static inline unsigned int rht_bucket_index(const struct bucket_table *tbl,
unsigned int hash)
{
- return (hash >> RHT_HASH_RESERVED_SPACE) & (tbl->size - 1);
+ return hash & (tbl->size - 1);
}
static inline unsigned int rht_key_get_hash(struct rhashtable *ht,
diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index 9427b5766134..4a3f94e8e8a6 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -994,7 +994,6 @@ static u32 rhashtable_jhash2(const void *key, u32 length, u32 seed)
* .key_offset = offsetof(struct test_obj, key),
* .key_len = sizeof(int),
* .hashfn = jhash,
- * .nulls_base = (1U << RHT_BASE_SHIFT),
* };
*
* Configuration Example 2: Variable length keys
@@ -1028,9 +1027,6 @@ int rhashtable_init(struct rhashtable *ht,
(params->obj_hashfn && !params->obj_cmpfn))
return -EINVAL;
- if (params->nulls_base && params->nulls_base < (1U << RHT_BASE_SHIFT))
- return -EINVAL;
-
memset(ht, 0, sizeof(*ht));
mutex_init(&ht->mutex);
spin_lock_init(&ht->lock);
@@ -1095,10 +1091,6 @@ int rhltable_init(struct rhltable *hlt, const struct rhashtable_params *params)
{
int err;
- /* No rhlist NULLs marking for now. */
- if (params->nulls_base)
- return -EINVAL;
-
err = rhashtable_init(&hlt->ht, params);
hlt->ht.rhlist = true;
return err;
diff --git a/lib/test_rhashtable.c b/lib/test_rhashtable.c
index bf92b7aa2a49..b428a9c7522a 100644
--- a/lib/test_rhashtable.c
+++ b/lib/test_rhashtable.c
@@ -83,7 +83,7 @@ static u32 my_hashfn(const void *data, u32 len, u32 seed)
{
const struct test_obj_rhl *obj = data;
- return (obj->value.id % 10) << RHT_HASH_RESERVED_SPACE;
+ return (obj->value.id % 10);
}
static int my_cmpfn(struct rhashtable_compare_arg *arg, const void *obj)
@@ -99,7 +99,6 @@ static struct rhashtable_params test_rht_params = {
.key_offset = offsetof(struct test_obj, value),
.key_len = sizeof(struct test_obj_val),
.hashfn = jhash,
- .nulls_base = (3U << RHT_BASE_SHIFT),
};
static struct rhashtable_params test_rht_params_dup = {
@@ -294,8 +293,6 @@ static int __init test_rhltable(unsigned int entries)
if (!obj_in_table)
goto out_free;
- /* nulls_base not supported in rhlist interface */
- test_rht_params.nulls_base = 0;
err = rhltable_init(&rhlt, &test_rht_params);
if (WARN_ON(err))
goto out_free;
If two threads run nested_table_alloc() at the same time
they could both allocate a new table.
Best case is that one of them will never be freed, leaking memory.
Worst case is that entries get stored there before it leaks,
and they are lost from the table.
So use cmpxchg to detect the race and free the unused table.
Fixes: da20420f83ea ("rhashtable: Add nested tables")
Cc: [email protected] # 4.11+
Signed-off-by: NeilBrown <[email protected]>
---
lib/rhashtable.c | 10 +++++++---
1 file changed, 7 insertions(+), 3 deletions(-)
diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index b73afe1dec7e..114e6090228a 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -119,6 +119,7 @@ static union nested_table *nested_table_alloc(struct rhashtable *ht,
unsigned int nhash)
{
union nested_table *ntbl;
+ union nested_table *tmp;
int i;
ntbl = rcu_dereference(*prev);
@@ -133,9 +134,12 @@ static union nested_table *nested_table_alloc(struct rhashtable *ht,
(i << shifted) | nhash);
}
- rcu_assign_pointer(*prev, ntbl);
-
- return ntbl;
+ rcu_assign_pointer(tmp, ntbl);
+ if (cmpxchg(prev, NULL, tmp) == NULL)
+ return tmp;
+ /* Raced with another thread. */
+ kfree(ntbl);
+ return rcu_dereference(*prev);
}
static struct bucket_table *nested_bucket_table_alloc(struct rhashtable *ht,
rhashtable_walk_prev() returns the object returned by
the previous rhashtable_walk_next(), providing it is still in the
table (or was during this grace period).
This works even if rhashtable_walk_stop() and rhashtable_walk_start()
have been called since the last rhashtable_walk_next().
If there have been no calls to rhashtable_walk_next(), or if the
object is gone from the table, then NULL is returned.
This can usefully be used in a seq_file ->start() function.
If the pos is the same as was returned by the last ->next() call,
then rhashtable_walk_prev() can be used to re-establish the
current location in the table. If it returns NULL, then
rhashtable_walk_next() should be used.
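As a rough sketch of that usage (the struct and function names below are
only illustrative, not part of this patch):

	struct my_iter {
		struct rhashtable_iter rhash_iter;
		loff_t last_pos;	/* pos returned by the last ->next() */
	};

	static void *my_seq_start(struct seq_file *seq, loff_t *pos)
	{
		struct my_iter *it = seq->private;

		rhashtable_walk_start(&it->rhash_iter);

		if (*pos && *pos == it->last_pos) {
			/* Try to resume at the object that the previous
			 * ->next() call returned, if it is still in the
			 * table.
			 */
			void *obj = rhashtable_walk_prev(&it->rhash_iter);

			if (obj)
				return obj;
		}
		/* Fresh walk, or the old object is gone: step forward. */
		return rhashtable_walk_next(&it->rhash_iter);
	}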
Signed-off-by: NeilBrown <[email protected]>
---
include/linux/rhashtable.h | 1 +
lib/rhashtable.c | 31 +++++++++++++++++++++++++++++++
2 files changed, 32 insertions(+)
diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h
index 20684a451cb0..82d061ff96d6 100644
--- a/include/linux/rhashtable.h
+++ b/include/linux/rhashtable.h
@@ -367,6 +367,7 @@ static inline void rhashtable_walk_start(struct rhashtable_iter *iter)
}
void *rhashtable_walk_next(struct rhashtable_iter *iter);
+void *rhashtable_walk_prev(struct rhashtable_iter *iter);
void rhashtable_walk_stop(struct rhashtable_iter *iter) __releases(RCU);
void rhashtable_free_and_destroy(struct rhashtable *ht,
diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index 038c4156b66a..d0267e37e7e1 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -921,6 +921,37 @@ void *rhashtable_walk_next(struct rhashtable_iter *iter)
}
EXPORT_SYMBOL_GPL(rhashtable_walk_next);
+/**
+ * rhashtable_walk_prev - Return the previously returned object, if available
+ * @iter: Hash table iterator
+ *
+ * If rhashtable_walk_next() has previously been called and the object
+ * it returned is still in the hash table, that object is returned again,
+ * otherwise %NULL is returned.
+ *
+ * If the most recent rhashtable_walk_next() call was since the most recent
+ * rhashtable_walk_start() call, then the returned object may not, strictly
+ * speaking, still be in the table. It will still be safe to dereference.
+ *
+ * Note that the iterator is not changed and in particular it does not
+ * step backwards.
+ */
+void *rhashtable_walk_prev(struct rhashtable_iter *iter)
+{
+ struct rhashtable *ht = iter->ht;
+ struct rhash_head *p = iter->p;
+
+ if (!p)
+ return NULL;
+ if (!iter->p_is_unsafe || ht->rhlist)
+ return p;
+ rht_for_each_rcu(p, iter->walker.tbl, iter->slot)
+ if (p == iter->p)
+ return p;
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(rhashtable_walk_prev);
+
/**
* rhashtable_walk_stop - Finish a hash table walk
* @iter: Hash table iterator
rhashtable_try_insert() currently holds a lock on the bucket in
the first table, while also locking buckets in subsequent tables.
This is unnecessary and looks like a hold-over from some earlier
version of the implementation.
As insert and remove always lock a bucket in each table in turn, and
as insert only inserts in the final table, there cannot be any races
that are not covered by simply locking a bucket in each table in turn.
When an insert call reaches that last table it can be sure that there
is no matching entry in any other table as it has searched them all, and
insertion never happens anywhere but in the last table. The fact that
code tests for the existence of future_tbl while holding a lock on
the relevant bucket ensures that two threads inserting the same key
will make compatible decisions about which is the "last" table.
This simplifies the code and allows the ->rehash field to be
discarded.
We still need a way to ensure that a dead bucket_table is never
re-linked by rhashtable_walk_stop(). This can be achieved by
setting the ->size to 1. This still allows lookup code to work (and
correctly not find anything) but can never happen on an active bucket
table (as the minimum size is 4).
Signed-off-by: NeilBrown <[email protected]>
---
include/linux/rhashtable.h | 13 -------------
lib/rhashtable.c | 42 ++++++++++--------------------------------
2 files changed, 10 insertions(+), 45 deletions(-)
diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h
index 82d061ff96d6..0529925af41d 100644
--- a/include/linux/rhashtable.h
+++ b/include/linux/rhashtable.h
@@ -73,7 +73,6 @@ struct rhlist_head {
struct bucket_table {
unsigned int size;
unsigned int nest;
- unsigned int rehash;
u32 hash_rnd;
unsigned int locks_mask;
spinlock_t *locks;
@@ -885,12 +884,6 @@ static inline int rhltable_insert(
* @obj: pointer to hash head inside object
* @params: hash table parameters
*
- * Locks down the bucket chain in both the old and new table if a resize
- * is in progress to ensure that writers can't remove from the old table
- * and can't insert to the new table during the atomic operation of search
- * and insertion. Searches for duplicates in both the old and new table if
- * a resize is in progress.
- *
* This lookup function may only be used for fixed key hash table (key_len
* parameter set). It will BUG() if used inappropriately.
*
@@ -946,12 +939,6 @@ static inline void *rhashtable_lookup_get_insert_fast(
* @obj: pointer to hash head inside object
* @params: hash table parameters
*
- * Locks down the bucket chain in both the old and new table if a resize
- * is in progress to ensure that writers can't remove from the old table
- * and can't insert to the new table during the atomic operation of search
- * and insertion. Searches for duplicates in both the old and new table if
- * a resize is in progress.
- *
* Lookups may occur in parallel with hashtable mutations and resizing.
*
* Will trigger an automatic deferred table resizing if residency in the
diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index d0267e37e7e1..4f7a7423a675 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -293,10 +293,9 @@ static int rhashtable_rehash_chain(struct rhashtable *ht,
while (!(err = rhashtable_rehash_one(ht, old_hash)))
;
- if (err == -ENOENT) {
- old_tbl->rehash++;
+ if (err == -ENOENT)
err = 0;
- }
+
spin_unlock_bh(old_bucket_lock);
return err;
@@ -345,6 +344,9 @@ static int rhashtable_rehash_table(struct rhashtable *ht)
spin_lock(&ht->lock);
list_for_each_entry(walker, &old_tbl->walkers, list)
walker->tbl = NULL;
+
+ /* Ensure rhashtable_walk_stop() doesn't relink this table */
+ old_tbl->size = 1;
spin_unlock(&ht->lock);
/* Wait for readers. All new readers will see the new
@@ -597,36 +599,14 @@ static void *rhashtable_try_insert(struct rhashtable *ht, const void *key,
struct bucket_table *new_tbl;
struct bucket_table *tbl;
unsigned int hash;
- spinlock_t *lock;
void *data;
- tbl = rcu_dereference(ht->tbl);
-
- /* All insertions must grab the oldest table containing
- * the hashed bucket that is yet to be rehashed.
- */
- for (;;) {
- hash = rht_head_hashfn(ht, tbl, obj, ht->p);
- lock = rht_bucket_lock(tbl, hash);
- spin_lock_bh(lock);
-
- if (tbl->rehash <= hash)
- break;
-
- spin_unlock_bh(lock);
- tbl = rcu_dereference(tbl->future_tbl);
- }
-
- data = rhashtable_lookup_one(ht, tbl, hash, key, obj);
- new_tbl = rhashtable_insert_one(ht, tbl, hash, obj, data);
- if (PTR_ERR(new_tbl) != -EEXIST)
- data = ERR_CAST(new_tbl);
+ new_tbl = rcu_dereference(ht->tbl);
- while (!IS_ERR_OR_NULL(new_tbl)) {
+ do {
tbl = new_tbl;
hash = rht_head_hashfn(ht, tbl, obj, ht->p);
- spin_lock_nested(rht_bucket_lock(tbl, hash),
- SINGLE_DEPTH_NESTING);
+ spin_lock(rht_bucket_lock(tbl, hash));
data = rhashtable_lookup_one(ht, tbl, hash, key, obj);
new_tbl = rhashtable_insert_one(ht, tbl, hash, obj, data);
@@ -634,9 +614,7 @@ static void *rhashtable_try_insert(struct rhashtable *ht, const void *key,
data = ERR_CAST(new_tbl);
spin_unlock(rht_bucket_lock(tbl, hash));
- }
-
- spin_unlock_bh(lock);
+ } while (!IS_ERR_OR_NULL(new_tbl));
if (PTR_ERR(data) == -EAGAIN)
data = ERR_PTR(rhashtable_insert_rehash(ht, tbl) ?:
@@ -971,7 +949,7 @@ void rhashtable_walk_stop(struct rhashtable_iter *iter)
ht = iter->ht;
spin_lock(&ht->lock);
- if (tbl->rehash < tbl->size)
+ if (tbl->size > 1)
list_add(&iter->walker.list, &tbl->walkers);
else
iter->walker.tbl = NULL;
print_ht in rhashtable_test calls rht_dereference() with neither
RCU protection nor the mutex held. This triggers an RCU warning.
So take the mutex to silence the warning.
Signed-off-by: NeilBrown <[email protected]>
---
lib/test_rhashtable.c | 3 +++
1 file changed, 3 insertions(+)
diff --git a/lib/test_rhashtable.c b/lib/test_rhashtable.c
index f4000c137dbe..bf92b7aa2a49 100644
--- a/lib/test_rhashtable.c
+++ b/lib/test_rhashtable.c
@@ -499,6 +499,8 @@ static unsigned int __init print_ht(struct rhltable *rhlt)
unsigned int i, cnt = 0;
ht = &rhlt->ht;
+ /* Take the mutex to avoid RCU warning */
+ mutex_lock(&ht->mutex);
tbl = rht_dereference(ht->tbl, ht);
for (i = 0; i < tbl->size; i++) {
struct rhash_head *pos, *next;
@@ -532,6 +534,7 @@ static unsigned int __init print_ht(struct rhltable *rhlt)
}
}
printk(KERN_ERR "\n---- ht: ----%s\n-------------\n", buff);
+ mutex_unlock(&ht->mutex);
return cnt;
}
Rather than borrowing one of the bucket locks to
protect ->future_tbl updates, use cmpxchg().
This gives more freedom to change how bucket locking
is implemented.
Signed-off-by: NeilBrown <[email protected]>
---
lib/rhashtable.c | 17 ++++++-----------
1 file changed, 6 insertions(+), 11 deletions(-)
diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index 4a3f94e8e8a6..b73afe1dec7e 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -298,21 +298,16 @@ static int rhashtable_rehash_attach(struct rhashtable *ht,
struct bucket_table *old_tbl,
struct bucket_table *new_tbl)
{
- /* Protect future_tbl using the first bucket lock. */
- spin_lock_bh(old_tbl->locks);
-
- /* Did somebody beat us to it? */
- if (rcu_access_pointer(old_tbl->future_tbl)) {
- spin_unlock_bh(old_tbl->locks);
- return -EEXIST;
- }
-
/* Make insertions go into the new, empty table right away. Deletions
* and lookups will be attempted in both tables until we synchronize.
+ * The use of 'tmp' is simply to ensure we get the required memory
+ * barriers before the cmpxchg().
*/
- rcu_assign_pointer(old_tbl->future_tbl, new_tbl);
+ struct bucket_table *tmp;
- spin_unlock_bh(old_tbl->locks);
+ rcu_assign_pointer(tmp, new_tbl);
+ if (cmpxchg(&old_tbl->future_tbl, NULL, tmp) != NULL)
+ return -EEXIST;
return 0;
}
If the sequence:
obj = rhashtable_walk_next(iter);
rhashtable_walk_stop(iter);
rhashtable_remove_fast(ht, &obj->head, params);
rhashtable_walk_start(iter);
races with another thread inserting or removing
an object on the same hash chain, a subsequent
rhashtable_walk_next() is not guaranteed to get the "next"
object. It is possible that an object could be
repeated, or missed.
This can be made more reliable by keeping the objects in a hash chain
sorted by memory address. A subsequent rhashtable_walk_next()
call can reliably find the correct position in the list, and thus
find the 'next' object.
It is not possible (certainly not so easy) to achieve this with an
rhltable as keeping the hash chain in order is not so easy. When the
first object with a given key is removed, it is replaced in the chain
with the next object with the same key, and the address of that
object may not be correctly ordered.
No current user of rhltable_walk_enter() calls
rhashtable_walk_start() more than once, so no current code
could benefit from a more reliable walk of rhltables.
This patch only attempts to improve walks for rhashtables.
- a new object is always inserted after the last object with a
smaller address, or at the start
- when rhashtable_walk_start() is called, it records that 'p' is not
'safe', meaning that it cannot be dereferenced. The revalidation
that was previously done here is moved to rhashtable_walk_next()
- when rhashtable_walk_next() is called while p is not NULL and not
safe, it walks the chain looking for the first object with an
address greater than p and returns that. If there is none, it moves
to the next hash chain.
Signed-off-by: NeilBrown <[email protected]>
---
include/linux/rhashtable.h | 11 +++++-
lib/rhashtable.c | 82 ++++++++++++++++++++++++++++----------------
2 files changed, 62 insertions(+), 31 deletions(-)
diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h
index 5091abf975a1..20684a451cb0 100644
--- a/include/linux/rhashtable.h
+++ b/include/linux/rhashtable.h
@@ -188,6 +188,7 @@ struct rhashtable_iter {
struct rhashtable_walker walker;
unsigned int slot;
unsigned int skip;
+ bool p_is_unsafe;
bool end_of_table;
};
@@ -737,7 +738,12 @@ static inline void *__rhashtable_insert_fast(
(params.obj_cmpfn ?
params.obj_cmpfn(&arg, rht_obj(ht, head)) :
rhashtable_compare(&arg, rht_obj(ht, head)))) {
- pprev = &head->next;
+ if (rhlist) {
+ pprev = &head->next;
+ } else {
+ if (head < obj)
+ pprev = &head->next;
+ }
continue;
}
@@ -1233,7 +1239,8 @@ static inline int rhashtable_walk_init(struct rhashtable *ht,
* Note that if you restart a walk after rhashtable_walk_stop you
* may see the same object twice. Also, you may miss objects if
* there are removals in between rhashtable_walk_stop and the next
- * call to rhashtable_walk_start.
+ * call to rhashtable_walk_start. Note that this is different to
+ * rhashtable_walk_enter() which misses objects.
*
* For a completely stable walk you should construct your own data
* structure outside the hash table.
diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index 83f5d1ebf452..038c4156b66a 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -234,6 +234,7 @@ static int rhashtable_rehash_one(struct rhashtable *ht, unsigned int old_hash)
struct bucket_table *new_tbl = rhashtable_last_table(ht,
rht_dereference_rcu(old_tbl->future_tbl, ht));
struct rhash_head __rcu **pprev = rht_bucket_var(old_tbl, old_hash);
+ struct rhash_head __rcu **inspos;
int err = -EAGAIN;
struct rhash_head *head, *next, *entry;
spinlock_t *new_bucket_lock;
@@ -262,12 +263,15 @@ static int rhashtable_rehash_one(struct rhashtable *ht, unsigned int old_hash)
new_bucket_lock = rht_bucket_lock(new_tbl, new_hash);
spin_lock_nested(new_bucket_lock, SINGLE_DEPTH_NESTING);
- head = rht_dereference_bucket(new_tbl->buckets[new_hash],
- new_tbl, new_hash);
-
+ inspos = &new_tbl->buckets[new_hash];
+ head = rht_dereference_bucket(*inspos, new_tbl, new_hash);
+ while (!rht_is_a_nulls(head) && head < entry) {
+ inspos = &head->next;
+ head = rht_dereference_bucket(*inspos, new_tbl, new_hash);
+ }
RCU_INIT_POINTER(entry->next, head);
- rcu_assign_pointer(new_tbl->buckets[new_hash], entry);
+ rcu_assign_pointer(*inspos, entry);
spin_unlock(new_bucket_lock);
rcu_assign_pointer(*pprev, next);
@@ -565,6 +569,10 @@ static struct bucket_table *rhashtable_insert_one(struct rhashtable *ht,
return ERR_PTR(-ENOMEM);
head = rht_dereference_bucket(*pprev, tbl, hash);
+ while (!rht_is_a_nulls(head) && head < obj) {
+ pprev = &head->next;
+ head = rht_dereference_bucket(*pprev, tbl, hash);
+ }
RCU_INIT_POINTER(obj->next, head);
if (ht->rhlist) {
@@ -659,10 +667,10 @@ EXPORT_SYMBOL_GPL(rhashtable_insert_slow);
*
* This function prepares a hash table walk.
*
- * Note that if you restart a walk after rhashtable_walk_stop you
- * may see the same object twice. Also, you may miss objects if
- * there are removals in between rhashtable_walk_stop and the next
- * call to rhashtable_walk_start.
+ * A walk is guaranteed to return every object that was in
+ * the table before this call, and is still in the table when
+ * rhashtable_walk_next() returns NULL. Duplicates can be
+ * seen, but only if there is a rehash event during the walk.
*
* For a completely stable walk you should construct your own data
* structure outside the hash table.
@@ -746,19 +754,10 @@ int rhashtable_walk_start_check(struct rhashtable_iter *iter)
if (iter->p && !rhlist) {
/*
- * We need to validate that 'p' is still in the table, and
- * if so, update 'skip'
+ * 'p' will be revalidated when rhashtable_walk_next()
+ * is called.
*/
- struct rhash_head *p;
- int skip = 0;
- rht_for_each_rcu(p, iter->walker.tbl, iter->slot) {
- skip++;
- if (p == iter->p) {
- iter->skip = skip;
- goto found;
- }
- }
- iter->p = NULL;
+ iter->p_is_unsafe = true;
} else if (iter->p && rhlist) {
/* Need to validate that 'list' is still in the table, and
* if so, update 'skip' and 'p'.
@@ -875,15 +874,39 @@ void *rhashtable_walk_next(struct rhashtable_iter *iter)
bool rhlist = ht->rhlist;
if (p) {
- if (!rhlist || !(list = rcu_dereference(list->next))) {
- p = rcu_dereference(p->next);
- list = container_of(p, struct rhlist_head, rhead);
- }
- if (!rht_is_a_nulls(p)) {
- iter->skip++;
- iter->p = p;
- iter->list = list;
- return rht_obj(ht, rhlist ? &list->rhead : p);
+ if (!rhlist && iter->p_is_unsafe) {
+ /*
+ * First time next() was called after start().
+ * Need to find location of 'p' in the list.
+ */
+ struct rhash_head *p;
+
+ iter->skip = 0;
+ rht_for_each_rcu(p, iter->walker.tbl, iter->slot) {
+ iter->skip++;
+ if (p <= iter->p)
+ continue;
+
+ /* p is the next object after iter->p */
+ iter->p = p;
+ iter->p_is_unsafe = false;
+ return rht_obj(ht, p);
+ }
+ /* There is no "next" object in the list, move
+ * to next hash chain.
+ */
+ } else {
+ if (!rhlist || !(list = rcu_dereference(list->next))) {
+ p = rcu_dereference(p->next);
+ list = container_of(p, struct rhlist_head,
+ rhead);
+ }
+ if (!rht_is_a_nulls(p)) {
+ iter->skip++;
+ iter->p = p;
+ iter->list = list;
+ return rht_obj(ht, rhlist ? &list->rhead : p);
+ }
}
/* At the end of this slot, switch to next one and then find
@@ -893,6 +916,7 @@ void *rhashtable_walk_next(struct rhashtable_iter *iter)
iter->slot++;
}
+ iter->p_is_unsafe = false;
return __rhashtable_walk_find_next(iter);
}
EXPORT_SYMBOL_GPL(rhashtable_walk_next);
This function has a somewhat confused behavior that is not properly
described by the documentation.
Sometimes it returns the previous object, sometimes it returns the
next one.
Sometimes it changes the iterator, sometimes it doesn't.
This function is not currently used and is not worth keeping, so
remove it.
Signed-off-by: NeilBrown <[email protected]>
---
include/linux/rhashtable.h | 1 -
lib/rhashtable.c | 34 ----------------------------------
2 files changed, 35 deletions(-)
diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h
index 8822924dd05a..5091abf975a1 100644
--- a/include/linux/rhashtable.h
+++ b/include/linux/rhashtable.h
@@ -366,7 +366,6 @@ static inline void rhashtable_walk_start(struct rhashtable_iter *iter)
}
void *rhashtable_walk_next(struct rhashtable_iter *iter);
-void *rhashtable_walk_peek(struct rhashtable_iter *iter);
void rhashtable_walk_stop(struct rhashtable_iter *iter) __releases(RCU);
void rhashtable_free_and_destroy(struct rhashtable *ht,
diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index 114e6090228a..83f5d1ebf452 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -897,40 +897,6 @@ void *rhashtable_walk_next(struct rhashtable_iter *iter)
}
EXPORT_SYMBOL_GPL(rhashtable_walk_next);
-/**
- * rhashtable_walk_peek - Return the next object but don't advance the iterator
- * @iter: Hash table iterator
- *
- * Returns the next object or NULL when the end of the table is reached.
- *
- * Returns -EAGAIN if resize event occurred. Note that the iterator
- * will rewind back to the beginning and you may continue to use it.
- */
-void *rhashtable_walk_peek(struct rhashtable_iter *iter)
-{
- struct rhlist_head *list = iter->list;
- struct rhashtable *ht = iter->ht;
- struct rhash_head *p = iter->p;
-
- if (p)
- return rht_obj(ht, ht->rhlist ? &list->rhead : p);
-
- /* No object found in current iter, find next one in the table. */
-
- if (iter->skip) {
- /* A nonzero skip value points to the next entry in the table
- * beyond that last one that was found. Decrement skip so
- * we find the current value. __rhashtable_walk_find_next
- * will restore the original value of skip assuming that
- * the table hasn't changed.
- */
- iter->skip--;
- }
-
- return __rhashtable_walk_find_next(iter);
-}
-EXPORT_SYMBOL_GPL(rhashtable_walk_peek);
-
/**
* rhashtable_walk_stop - Finish a hash table walk
* @iter: Hash table iterator
From: NeilBrown <[email protected]>
Date: Fri, 04 May 2018 13:54:14 +1000
> This series contains some bugfixes, mostly minor though one
> is worthy of a stable backport I think - tagged with Fixes and Cc: stable.
>
> Then there are improvements to walking, which have been discussed
> to some degree already.
> Finally a code simplification which I think is correct...
Please do not mix bug fixes and features or improvements.
Please target the serious bug fixes for 'net' and then you can
target the features and improvements for 'net-next'.
This is especially important if you want a change queued up for
-stable, as the change must first go into 'net' for that to happen.
Thank you.
On Fri, May 04, 2018 at 01:54:14PM +1000, NeilBrown wrote:
> print_ht in rhashtable_test calls rht_dereference() with neither
> RCU protection or the mutex. This triggers an RCU warning.
> So take the mutex to silence the warning.
>
> Signed-off-by: NeilBrown <[email protected]>
I don't think the mutex is actually needed but since we don't
have rht_dereference_raw and this is just test code:
Acked-by: Herbert Xu <[email protected]>
--
Email: Herbert Xu <[email protected]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
On Fri, May 04, 2018 at 01:54:14PM +1000, NeilBrown wrote:
> This "feature" is unused, undocumented, and untested and so
> doesn't really belong. If a use for the nulls marker
> is found, all this code would need to be reviewed to
> ensure it works as required. It would be just as easy to
> just add the code if/when it is needed instead.
>
> This patch actually fixes a bug too. The table resizing allows a
> table to grow to 2^31 buckets, but the hash is truncated to 27 bits -
> any growth beyond 2^27 is wasteful an ineffective.
>
> This patch result in NULLS_MARKER(0) being used for all chains,
> and leave the use of rht_is_a_null() to test for it.
>
> Signed-off-by: NeilBrown <[email protected]>
I disagree. This is a fundamental requirement for the use of
rhashtable in certain networking systems such as TCP/UDP. So
we know that there will be a use for this.
As to the bug fix, please separate it out of the patch and resubmit.
Thanks,
--
Email: Herbert Xu <[email protected]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
On Fri, May 04, 2018 at 01:54:14PM +1000, NeilBrown wrote:
> Rather than borrowing one of the bucket locks to
> protect ->future_tbl updates, use cmpxchg().
> This gives more freedom to change how bucket locking
> is implemented.
>
> Signed-off-by: NeilBrown <[email protected]>
This looks nice.
> - spin_unlock_bh(old_tbl->locks);
> + rcu_assign_pointer(tmp, new_tbl);
Do we need this barrier since cmpxchg is supposed to provide memory
barrier semantics?
> + if (cmpxchg(&old_tbl->future_tbl, NULL, tmp) != NULL)
> + return -EEXIST;
Thanks,
--
Email: Herbert Xu <[email protected]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
On Fri, May 04, 2018 at 01:54:14PM +1000, NeilBrown wrote:
> If two threads run nested_table_alloc() at the same time
> they could both allocate a new table.
> Best case is that one of them will never be freed, leaking memory.
> Worst case is hat entry get stored there before it leaks,
> and the are lost from the table.
>
> So use cmpxchg to detect the race and free the unused table.
>
> Fixes: da20420f83ea ("rhashtable: Add nested tables")
> Cc: [email protected] # 4.11+
> Signed-off-by: NeilBrown <[email protected]>
What about the spinlock that's meant to be held around this
operation?
Cheers,
--
Email: Herbert Xu <[email protected]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
On Fri, May 04, 2018 at 01:54:14PM +1000, NeilBrown wrote:
> This function has a somewhat confused behavior that is not properly
> described by the documentation.
> Sometimes is returns the previous object, sometimes it returns the
> next one.
> Sometimes it changes the iterator, sometimes it doesn't.
>
> This function is not currently used and is not worth keeping, so
> remove it.
>
> Signed-off-by: NeilBrown <[email protected]>
I don't have a problem with this. But it would be good to hear
from Tom Herbert who added this.
--
Email: Herbert Xu <[email protected]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
On Fri, May 04, 2018 at 01:54:14PM +1000, NeilBrown wrote:
> rhashtable_try_insert() currently hold a lock on the bucket in
> the first table, while also locking buckets in subsequent tables.
> This is unnecessary and looks like a hold-over from some earlier
> version of the implementation.
>
> As insert and remove always lock a bucket in each table in turn, and
> as insert only inserts in the final table, there cannot be any races
> that are not covered by simply locking a bucket in each table in turn.
>
> When an insert call reaches that last table it can be sure that there
> is no match entry in any other table as it has searched them all, and
> insertion never happens anywhere but in the last table. The fact that
> code tests for the existence of future_tbl while holding a lock on
> the relevant bucket ensures that two threads inserting the same key
> will make compatible decisions about which is the "last" table.
>
> This simplifies the code and allows the ->rehash field to be
> discarded.
> We still need a way to ensure that a dead bucket_table is never
> re-linked by rhashtable_walk_stop(). This can be achieved by
> setting the ->size to 1. This still allows lookup code to work (and
> correctly not find anything) but can never happen on an active bucket
> table (as the minimum size is 4).
>
> Signed-off-by: NeilBrown <[email protected]>
I'm not convinced this is safe. There can be multiple tables
in existence. Unless you hold the lock on the first table, what
is to stop two parallel inserters each from inserting into their
"last" tables which may not be the same table?
Cheers,
--
Email: Herbert Xu <[email protected]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
On Fri, May 04, 2018 at 01:54:14PM +1000, NeilBrown wrote:
> If the sequence:
> obj = rhashtable_walk_next(iter);
> rhashtable_walk_stop(iter);
> rhashtable_remove_fast(ht, &obj->head, params);
> rhashtable_walk_start(iter);
>
> races with another thread inserting or removing
> an object on the same hash chain, a subsequent
> rhashtable_walk_next() is not guaranteed to get the "next"
> object. It is possible that an object could be
> repeated, or missed.
>
> This can be made more reliable by keeping the objects in a hash chain
> sorted by memory address. A subsequent rhashtable_walk_next()
> call can reliably find the correct position in the list, and thus
> find the 'next' object.
>
> It is not possible (certainly not so easy) to achieve this with an
> rhltable as keeping the hash chain in order is not so easy. When the
> first object with a given key is removed, it is replaced in the chain
> with the next object with the same key, and the address of that
> object may not be correctly ordered.
> No current user of rhltable_walk_enter() calls
> rhashtable_walk_start() more than once, so no current code
> could benefit from a more reliable walk of rhltables.
>
> This patch only attempts to improve walks for rhashtables.
> - a new object is always inserted after the last object with a
> smaller address, or at the start
> - when rhashtable_walk_start() is called, it records that 'p' is not
> 'safe', meaning that it cannot be dereferenced. The revalidation
> that was previously done here is moved to rhashtable_walk_next()
> - when rhashtable_walk_next() is called while p is not NULL and not
> safe, it walks the chain looking for the first object with an
> address greater than p and returns that. If there is none, it moves
> to the next hash chain.
>
> Signed-off-by: NeilBrown <[email protected]>
I'm a bit torn on this. On the one hand this is definitely an improvement
over the status quo. On the other this does not work on rhltable and
we do have a way of fixing it for both rhashtable and rhltable.
Cheers,
--
Email: Herbert Xu <[email protected]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
On Fri, May 04, 2018 at 01:54:14PM +1000, NeilBrown wrote:
> rhashtable_walk_prev() returns the object returned by
> the previous rhashtable_walk_next(), providing it is still in the
> table (or was during this grace period).
> This works even if rhashtable_walk_stop() and rhashtable_talk_start()
> have been called since the last rhashtable_walk_next().
>
> If there have been no calls to rhashtable_walk_next(), or if the
> object is gone from the table, then NULL is returned.
>
> This can usefully be used in a seq_file ->start() function.
> If the pos is the same as was returned by the last ->next() call,
> then rhashtable_walk_prev() can be used to re-establish the
> current location in the table. If it returns NULL, then
> rhashtable_walk_next() should be used.
>
> Signed-off-by: NeilBrown <[email protected]>
I will ack this if Tom is OK with replacing peek with it.
Cheers,
--
Email: Herbert Xu <[email protected]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
On Sat, May 5, 2018 at 2:43 AM, Herbert Xu <[email protected]> wrote:
> On Fri, May 04, 2018 at 01:54:14PM +1000, NeilBrown wrote:
>> rhashtable_walk_prev() returns the object returned by
>> the previous rhashtable_walk_next(), providing it is still in the
>> table (or was during this grace period).
>> This works even if rhashtable_walk_stop() and rhashtable_talk_start()
>> have been called since the last rhashtable_walk_next().
>>
>> If there have been no calls to rhashtable_walk_next(), or if the
>> object is gone from the table, then NULL is returned.
>>
>> This can usefully be used in a seq_file ->start() function.
>> If the pos is the same as was returned by the last ->next() call,
>> then rhashtable_walk_prev() can be used to re-establish the
>> current location in the table. If it returns NULL, then
>> rhashtable_walk_next() should be used.
>>
>> Signed-off-by: NeilBrown <[email protected]>
>
> I will ack this if Tom is OK with replacing peek with it.
>
I'm not following why this is any better than peek. The point of
having rhashtable_walk_peek is to allow the caller to get the
current element, not the next one. This is needed when the table is read in
multiple parts and we need to pick up with what was returned from the
last call to rhashtable_walk_next (which apparently is what this patch
is also trying to do).
There is one significant difference in that peek will return the
element in the table regardless of where the iterator is at (this is
why peek can move the iterator) and only returns NULL at end of table.
As mentioned above, rhashtable_walk_prev can return NULL, so the caller
then needs a fallback, and rhashtable_walk_next "should be used" to get the
previous element. Doing a peek is a cleaner and more straightforward API in
this regard.
Tom
> Cheers,
> --
> Email: Herbert Xu <[email protected]>
> Home Page: http://gondor.apana.org.au/~herbert/
> PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
On Sat, May 05 2018, Herbert Xu wrote:
> On Fri, May 04, 2018 at 01:54:14PM +1000, NeilBrown wrote:
>> This "feature" is unused, undocumented, and untested and so
>> doesn't really belong. If a use for the nulls marker
>> is found, all this code would need to be reviewed to
>> ensure it works as required. It would be just as easy to
>> just add the code if/when it is needed instead.
>>
>> This patch actually fixes a bug too. The table resizing allows a
>> table to grow to 2^31 buckets, but the hash is truncated to 27 bits -
>> any growth beyond 2^27 is wasteful an ineffective.
>>
>> This patch result in NULLS_MARKER(0) being used for all chains,
>> and leave the use of rht_is_a_null() to test for it.
>>
>> Signed-off-by: NeilBrown <[email protected]>
>
> I disagree. This is a fundamental requirement for the use of
> rhashtable in certain networking systems such as TCP/UDP. So
> we know that there will be a use for this.
I can see no evidence that this is required for anything, as it isn't
used, and I'm fairly sure that in its current form it cannot be used.
Based on my best guess about how you might intend to use it, I suspect
it would be simpler to store the address of the bucket head in the nulls
rather than the hash and a magic number. This would make it just as
easy to detect when a search reaches the end of the wrong chain, which I
presume is the purpose.
I would find that useful myself - if the search would repeat when that
happened - as I could then use SLAB_TYPESAFE_BY_RCU.
Were we to take this approach, all the code I've removed here would
still need to be removed.
>
> As to the bug fix, please separate it out of the patch and resubmit.
I don't know how to do that. I don't know what is safe to change
without "breaking" the nulls_base code because that code is undocumented and
unused, so unmaintainable.
In general the kernel has, I believe, a policy against keeping unused
interfaces. While that isn't firm and universal, it seems to apply
particularly well to unusable interfaces.
Thanks,
NeilBrown
>
> Thanks,
> --
> Email: Herbert Xu <[email protected]>
> Home Page: http://gondor.apana.org.au/~herbert/
> PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
On Sat, May 05 2018, Herbert Xu wrote:
> On Fri, May 04, 2018 at 01:54:14PM +1000, NeilBrown wrote:
>> Rather than borrowing one of the bucket locks to
>> protect ->future_tbl updates, use cmpxchg().
>> This gives more freedom to change how bucket locking
>> is implemented.
>>
>> Signed-off-by: NeilBrown <[email protected]>
>
> This looks nice.
>
>> - spin_unlock_bh(old_tbl->locks);
>> + rcu_assign_pointer(tmp, new_tbl);
>
> Do we need this barrier since cmpxchg is supposed to provide memory
> barrier semantics?
It's hard to find documentation even for what cmpxchg() is meant to do, let
alone what barriers it provides, but there does seem to be something
hidden in Documentation/core-api/atomic_ops.rst which suggests full
barrier semantics if the comparison succeeds. I'll replace the
rcu_assign_pointer with a comment saying why it isn't needed.
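Something along these lines, perhaps (just a sketch of what that respin
might look like, assuming the full-barrier-on-success semantics mentioned
above - it is not the actual follow-up patch):

	/* Make insertions go into the new, empty table right away. Deletions
	 * and lookups will be attempted in both tables until we synchronize.
	 * No extra barrier is needed here: a successful cmpxchg() implies
	 * full memory-barrier semantics.
	 */
	if (cmpxchg(&old_tbl->future_tbl, NULL, new_tbl) != NULL)
		return -EEXIST;

	return 0;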
Thanks,
NeilBrown
>
>> + if (cmpxchg(&old_tbl->future_tbl, NULL, tmp) != NULL)
>> + return -EEXIST;
>
> Thanks,
> --
> Email: Herbert Xu <[email protected]>
> Home Page: http://gondor.apana.org.au/~herbert/
> PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
On Sat, May 05 2018, Herbert Xu wrote:
> On Fri, May 04, 2018 at 01:54:14PM +1000, NeilBrown wrote:
>> If two threads run nested_table_alloc() at the same time
>> they could both allocate a new table.
>> Best case is that one of them will never be freed, leaking memory.
>> Worst case is hat entry get stored there before it leaks,
>> and the are lost from the table.
>>
>> So use cmpxchg to detect the race and free the unused table.
>>
>> Fixes: da20420f83ea ("rhashtable: Add nested tables")
>> Cc: [email protected] # 4.11+
>> Signed-off-by: NeilBrown <[email protected]>
>
> What about the spinlock that's meant to be held around this
> operation?
The spinlock protects 2 or more buckets. The nested table contains at
least 512 buckets, maybe more.
It is quite possible for two insertions into 2 different buckets to both
get their spinlock and both try to instantiate the same nested table.
Thanks,
NeilBrown
On Sat, May 05 2018, Herbert Xu wrote:
> On Fri, May 04, 2018 at 01:54:14PM +1000, NeilBrown wrote:
>> print_ht in rhashtable_test calls rht_dereference() with neither
>> RCU protection or the mutex. This triggers an RCU warning.
>> So take the mutex to silence the warning.
>>
>> Signed-off-by: NeilBrown <[email protected]>
>
> I don't think the mutex is actually needed but since we don't
> have rht_dereference_raw and this is just test code:
I agree there is no functional need for the mutex. It is just needed to
silence the warning, so that other warnings are easier to see.
>
> Acked-by: Herbert Xu <[email protected]>
thanks,
NeilBrown
> --
> Email: Herbert Xu <[email protected]>
> Home Page: http://gondor.apana.org.au/~herbert/
> PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
On Sat, May 05 2018, Herbert Xu wrote:
> On Fri, May 04, 2018 at 01:54:14PM +1000, NeilBrown wrote:
>> If the sequence:
>> obj = rhashtable_walk_next(iter);
>> rhashtable_walk_stop(iter);
>> rhashtable_remove_fast(ht, &obj->head, params);
>> rhashtable_walk_start(iter);
>>
>> races with another thread inserting or removing
>> an object on the same hash chain, a subsequent
>> rhashtable_walk_next() is not guaranteed to get the "next"
>> object. It is possible that an object could be
>> repeated, or missed.
>>
>> This can be made more reliable by keeping the objects in a hash chain
>> sorted by memory address. A subsequent rhashtable_walk_next()
>> call can reliably find the correct position in the list, and thus
>> find the 'next' object.
>>
>> It is not possible (certainly not so easy) to achieve this with an
>> rhltable as keeping the hash chain in order is not so easy. When the
>> first object with a given key is removed, it is replaced in the chain
>> with the next object with the same key, and the address of that
>> object may not be correctly ordered.
>> No current user of rhltable_walk_enter() calls
>> rhashtable_walk_start() more than once, so no current code
>> could benefit from a more reliable walk of rhltables.
>>
>> This patch only attempts to improve walks for rhashtables.
>> - a new object is always inserted after the last object with a
>> smaller address, or at the start
>> - when rhashtable_walk_start() is called, it records that 'p' is not
>> 'safe', meaning that it cannot be dereferenced. The revalidation
>> that was previously done here is moved to rhashtable_walk_next()
>> - when rhashtable_walk_next() is called while p is not NULL and not
>> safe, it walks the chain looking for the first object with an
>> address greater than p and returns that. If there is none, it moves
>> to the next hash chain.
>>
>> Signed-off-by: NeilBrown <[email protected]>
>
> I'm a bit torn on this. On the hand this is definitely an improvement
> over the status quo. On the other this does not work on rhltable and
> we do have a way of fixing it for both rhashtable and rhltable.
Do we? How could we fix it for both rhashtable and rhltable?
Thanks,
NeilBrown
On Sat, May 05 2018, Herbert Xu wrote:
> On Fri, May 04, 2018 at 01:54:14PM +1000, NeilBrown wrote:
>> rhashtable_try_insert() currently hold a lock on the bucket in
>> the first table, while also locking buckets in subsequent tables.
>> This is unnecessary and looks like a hold-over from some earlier
>> version of the implementation.
>>
>> As insert and remove always lock a bucket in each table in turn, and
>> as insert only inserts in the final table, there cannot be any races
>> that are not covered by simply locking a bucket in each table in turn.
>>
>> When an insert call reaches that last table it can be sure that there
>> is no match entry in any other table as it has searched them all, and
>> insertion never happens anywhere but in the last table. The fact that
>> code tests for the existence of future_tbl while holding a lock on
>> the relevant bucket ensures that two threads inserting the same key
>> will make compatible decisions about which is the "last" table.
>>
>> This simplifies the code and allows the ->rehash field to be
>> discarded.
>> We still need a way to ensure that a dead bucket_table is never
>> re-linked by rhashtable_walk_stop(). This can be achieved by
>> setting the ->size to 1. This still allows lookup code to work (and
>> correctly not find anything) but can never happen on an active bucket
>> table (as the minimum size is 4).
>>
>> Signed-off-by: NeilBrown <[email protected]>
>
> I'm not convinced this is safe. There can be multiple tables
> in existence. Unless you hold the lock on the first table, what
> is to stop two parallel inserters each from inserting into their
> "last" tables which may not be the same table?
The insert function must (and does) take the lock on the bucket before
testing if there is a "next" table.
If one inserter finds that it has locked the "last" table (because there
is no next) and successfully inserts, then the other inserter cannot
have locked that table yet, else it would have inserted. When it does,
it will find what the first inserter inserted.
Thanks,
NeilBrown
>
> Cheers,
> --
> Email: Herbert Xu <[email protected]>
> Home Page: http://gondor.apana.org.au/~herbert/
> PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
On Sun, May 06, 2018 at 08:00:49AM +1000, NeilBrown wrote:
>
> The insert function must (and does) take the lock on the bucket before
> testing if there is a "next" table.
> If one inserter finds that it has locked the "last" table (because there
> is no next) and successfully inserts, then the other inserter cannot
> have locked that table yet, else it would have inserted. When it does,
> it will find what the first inserter inserted.
If you release the lock to the first table then it may be deleted
by the resize thread. Hence the other inserter may not have even
started from the same place.
Cheers,
--
Email: Herbert Xu <[email protected]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
On Sun, May 06, 2018 at 07:48:20AM +1000, NeilBrown wrote:
>
> The spinlock protects 2 or more buckets. The nested table contains at
> least 512 buckets, maybe more.
> It is quite possible for two insertions into 2 different buckets to both
> get their spinlock and both try to instantiate the same nested table.
I think you missed the fact that when we use nested tables the spin
lock table is limited to just a single page and hence corresponds
to the first level in the nested table. Therefore it's always safe.
Cheers,
--
Email: Herbert Xu <[email protected]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
On Sun, May 06 2018, Herbert Xu wrote:
> On Sun, May 06, 2018 at 07:48:20AM +1000, NeilBrown wrote:
>>
>> The spinlock protects 2 or more buckets. The nested table contains at
>> least 512 buckets, maybe more.
>> It is quite possible for two insertions into 2 different buckets to both
>> get their spinlock and both try to instantiate the same nested table.
>
> I think you missed the fact that when we use nested tables the spin
> lock table is limited to just a single page and hence corresponds
> to the first level in the nested table. Therefore it's always safe.
Yes I had missed that - thanks for pointing it out.
In fact the lock table is limited to the number of nested_tables
in the second level.
And it is the same low-order bits that choose both the lock
and the set of nested tables.
So there isn't a bug here. So we don't need this patch. (I still like
it though - it seems more obviously correct).
Thanks,
NeilBrown
On Sat, May 05 2018, Tom Herbert wrote:
> On Sat, May 5, 2018 at 2:43 AM, Herbert Xu <[email protected]> wrote:
>> On Fri, May 04, 2018 at 01:54:14PM +1000, NeilBrown wrote:
>>> rhashtable_walk_prev() returns the object returned by
>>> the previous rhashtable_walk_next(), providing it is still in the
>>> table (or was during this grace period).
>>> This works even if rhashtable_walk_stop() and rhashtable_talk_start()
>>> have been called since the last rhashtable_walk_next().
>>>
>>> If there have been no calls to rhashtable_walk_next(), or if the
>>> object is gone from the table, then NULL is returned.
>>>
>>> This can usefully be used in a seq_file ->start() function.
>>> If the pos is the same as was returned by the last ->next() call,
>>> then rhashtable_walk_prev() can be used to re-establish the
>>> current location in the table. If it returns NULL, then
>>> rhashtable_walk_next() should be used.
>>>
>>> Signed-off-by: NeilBrown <[email protected]>
>>
>> I will ack this if Tom is OK with replacing peek with it.
>>
> I'm not following why this is any better than peek. The point of
> having rhashtable_walk_peek is to to allow the caller to get then
> current element not the next one. This is needed when table is read in
> multiple parts and we need to pick up with what was returned from the
> last call to rhashtable_walk_next (which apparently is what this patch
> is also trying to do).
>
> There is one significant difference in that peek will return the
> element in the table regardless of where the iterator is at (this is
> why peek can move the iterator) and only returns NULL at end of table.
> As mentioned above rhashtable_walk_prev can return NULL so then caller
> needs and so rhashtable_walk_next "should be used" to get the previous
> element. Doing a peek is a lot cleaner and more straightforward API in
> this regard.
Thanks for the review. I agree with a lot of what you say about the
behavior of the different implementations.
One important difference is the documentation. The current
documentation for rhashtable_walk_peek() is wrong. For example it says
that the function doesn't change the iterator, but sometimes it does.
The first rhashtable patch I submitted tried to fix this up, but it is
hard to document the function clearly because it really is doing one of
two different things. It returns the previous element if it still
exists, or it returns the next one. With my rhashtable_walk_prev(),
that can be done with
rhashtable_walk_prev() ?: rhashtable_walk_next();
Both of these functions can easily be documented clearly.
We could combine the two as you have done, but "peek" does seem like a
good name. "prev_or_next" is more honest, but somewhat clumsy.
Whether that is a good thing or not is partly a matter of taste, and we
seem to be on opposite sides of that fence.
There is a practical aspect to it though.
Lustre has a debugfs seq_file which shows all the cached pages of all
the cached object. The objects are in a hashtable (which I want to
change to an rhashtable). So the seq_file iterator has both an
rhashtable iterator an offset in the object.
When we restart a walk, we might be in the middle of some object - but
that object might have been removed from the cache, so we would need to
go on to the first page of the next object.
Using my interface I can do
	obj = rhashtable_walk_prev(&iter.rhash_iter);
	offset = iter.offset;
	if (!obj) {
		obj = rhashtable_walk_next(&iter.rhash_iter);
		offset = 0;
	}
I could achieve something similar with your interface if I kept an extra
copy of the previous object and compared with the value returned by
rhashtable_walk_peek(), but (to me) that seems like double handling.
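Roughly, that would look something like the following (a sketch only;
'prev_obj' is a hypothetical extra field I would have to add to my
iterator to remember what the last ->next() returned):

	obj = rhashtable_walk_peek(&iter.rhash_iter);
	if (obj && obj == iter.prev_obj) {
		/* Still on the object we stopped in: resume at our offset. */
		offset = iter.offset;
	} else {
		/* Old object is gone; peek has moved on to the next one. */
		offset = 0;
	}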
Thanks,
NeilBrown
On Sun, May 06 2018, Herbert Xu wrote:
> On Sun, May 06, 2018 at 08:00:49AM +1000, NeilBrown wrote:
>>
>> The insert function must (and does) take the lock on the bucket before
>> testing if there is a "next" table.
>> If one inserter finds that it has locked the "last" table (because there
>> is no next) and successfully inserts, then the other inserter cannot
>> have locked that table yet, else it would have inserted. When it does,
>> it will find what the first inserter inserted.
>
> If you release the lock to the first table then it may be deleted
> by the resize thread. Hence the other inserter may not have even
> started from the same place.
This is true, but I don't see how it is relevant.
At some point, each thread will find that the table they have just
locked for their search key, has a NULL 'future_tbl' pointer.
At that point, the thread can know that the key is not in any table,
and that no other thread can add the key until the lock is released.
Thanks,
NeilBrown
On Sun, May 06, 2018 at 07:37:49AM +1000, NeilBrown wrote:
>
> I can see no evidence that this is required for anything, as it isn't
> use and I'm fairly sure that in it's current form - it cannot be used.
Search for nulls in net/ipv4. This is definitely used throughout
the network stack. As the aim is to convert as many existing hash
tables to rhashtable as possible, we want to keep this.
Cheers,
--
Email: Herbert Xu <[email protected]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
On Mon, May 07, 2018 at 08:24:41AM +1000, NeilBrown wrote:
>
> This is true, but I don't see how it is relevant.
> At some point, each thread will find that the table they have just
> locked for their search key, has a NULL 'future_tbl' pointer.
> At the point, the thread can know that the key is not in any table,
> and that no other thread can add the key until the lock is released.
The updating of future_tbl is not synchronised with insert threads.
Therefore it is entirely possible that two inserters end up on
different tables as their "latest" table. This must not be allowed
to occur.
Cheers,
--
Email: Herbert Xu <[email protected]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
On Sun, May 06, 2018 at 07:50:54AM +1000, NeilBrown wrote:
>
> Do we? How could we fix it for both rhashtable and rhltable?
Well I suggested earlier to insert the walker object into the
hash table, which would be applicable regardless of whether it
is a normal rhashtable or an rhltable.
Cheers,
--
Email: Herbert Xu <[email protected]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
On Mon, May 07 2018, Herbert Xu wrote:
> On Mon, May 07, 2018 at 08:24:41AM +1000, NeilBrown wrote:
>>
>> This is true, but I don't see how it is relevant.
>> At some point, each thread will find that the table they have just
>> locked for their search key, has a NULL 'future_tbl' pointer.
>> At that point, the thread can know that the key is not in any table,
>> and that no other thread can add the key until the lock is released.
>
> The updating of future_tbl is not synchronised with insert threads.
> Therefore it is entirely possible that two inserters end up on
> different tables as their "latest" table. This must not be allowed
> to occur.
I disagree.
Certainly the update of future_tbl is not synchronised with insert
threads.
However there is only a single update to any given future_tbl (from NULL
to non-NULL) and two insert threads for the same key will see that
update in a synchronised way as they look at it while holding the bucket
lock for that key.
It is certainly true that two inserters can end up on different
tables as their "latest" table, but that doesn't result in a problem.
e.g. T1 might see A as the latest table, and T2 might see B, where
A.future_tbl == B.
In that case, T2 must have examined A (under the lock) *after* T1
examined A, and so will have seen if T1 inserted anything.
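To make that concrete, here is a rough sketch of the insert-side walk I
am describing. This is not the actual lib/rhashtable.c code, and
do_insert() is just a stand-in for the real slow-path insertion into
the locked bucket:

	/* Sketch only - helper do_insert() is hypothetical. */
	static int insert_into_last_table(struct rhashtable *ht,
					  struct rhash_head *obj)
	{
		struct bucket_table *tbl = rht_dereference_rcu(ht->tbl, ht);

		for (;;) {
			unsigned int hash = rht_head_hashfn(ht, tbl, obj, ht->p);
			spinlock_t *lock = rht_bucket_lock(tbl, hash);
			struct bucket_table *next;

			spin_lock_bh(lock);
			next = rht_dereference_rcu(tbl->future_tbl, ht);
			if (!next) {
				/* Last table: any other inserter of this key
				 * must take this same bucket lock, so either
				 * it already inserted (and we saw its object)
				 * or it will see ours when it gets the lock. */
				int err = do_insert(ht, tbl, hash, obj);

				spin_unlock_bh(lock);
				return err;
			}
			/* future_tbl only ever changes from NULL to non-NULL
			 * and is read under the bucket lock, so the T1/T2
			 * ordering above holds. */
			spin_unlock_bh(lock);
			tbl = next;
		}
	}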
Thanks,
NeilBrown
On Mon, May 07 2018, Herbert Xu wrote:
> On Sun, May 06, 2018 at 07:50:54AM +1000, NeilBrown wrote:
>>
>> Do we? How could we fix it for both rhashtable and rhltable?
>
> Well I suggested earlier to insert the walker object into the
> hash table, which would be applicable regardless of whether it
> is a normal rhashtable or an rhltable.
Oh yes, that idea. I had considered it and discarded it.
Moving a walker object around in an rcu-list is far from
straightforward. A lockless iteration of the list would need to be very careful
not to be tripped up by the walker-object. I don't think we want to
make the search code more complex.
I explored the idea a bit more and I think that any solution that we
came up with along those lines would look quite different for regular
rhash_tables than for rhl_tables. So it is probably best to keep them
quite separate.
i.e. use the scheme I proposed for rhashtables and use something else
entirely for rhltables.
My current thinking for a stable walk for rhl tables goes like this (a
code sketch of the cursor hookup follows the list):
- we embed
	struct rhl_cursor {
		struct rhl_cursor *next;
		int unseen;
	};
  in struct rhashtable_iter.
- on rhashtable_walk_stop(), we:
+ lock the current hash chain
+ find the next object that is still in the table (after ->p).
+ count the number of objects from there to the end of the
rhlist_head.next list.
+ store that count in rhl_cursor.unseen
+ change the ->next pointer at the end of the list to point to the
rhl_cursor, but with the lsb set. If there was already an
rhl_cursor there, they are chained together on ->next.
- on rhashtable_walk_start(), we lock the hash chain and search the hash
chain and sub-chains for ->p or the rhl_cursor.
If ->p is still there, that can be used as a reliable anchor.
If ->p isn't found but the rhl_cursor is, then the 'unseen' counter
tells us where to start in that rhl_list.
If neither are found, then we start at the beginning of the hash chain.
If the rhl_cursor is found, it is removed.
- lookup and insert can almost completely ignore the cursor.
rhl_for_each_rcu() needs to detect the end-of-list by looking for
lsb set, but that is the only change.
As _insert() adds new objects to the head of the rhlist, that
won't disturb the accuracy of the 'unseen' count. The newly inserted
object won't be seen by the walk, but that is expected.
- delete needs some extra handling.
+ When an object is deleted from an rhlist and the list does not become
empty, then it counts the number of objects to the end of the list,
and possibly decrements the 'unseen' counter on any rhl_cursors that
are at the end of the list.
+ when an object is deleted resulting in an empty rhlist, any
cursors at the end of the list needs to be moved to the next
list in the hash chain, and their 'unseen' count needs to be set to
the length of the list.
If there is no next object in the hash chain, then the iter.slot is
incremented and the rhl_cursor is left unconnected.
- rehash needs to remove any cursors from the end of an rhlist before
moving the objects to the new table. The cursors are left disconnected.
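As a rough sketch of how that could hang together (nothing below exists
yet, and the helper name is mine): the cursor is embedded in the
iterator, and list walkers treat any pointer with the lsb set as "not
an object", i.e. either a nulls marker or a cursor:

	struct rhashtable_iter {
		/* ... existing fields ... */
		struct rhl_cursor cursor;	/* proposed addition for rhltables */
	};

	/* used by rhl_for_each_rcu() to stop at a nulls marker or a cursor */
	static inline bool rhl_ptr_is_terminal(const void *p)
	{
		return ((unsigned long)p) & 1;
	}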
I'm happy to implement this if you think the approach is workable and
the functionality is necessary. However there are no current users of
rhltable_walk_enter that would benefit from this.
Thanks,
NeilBrown
On Mon, May 07 2018, Herbert Xu wrote:
> On Sun, May 06, 2018 at 07:37:49AM +1000, NeilBrown wrote:
>>
>> I can see no evidence that this is required for anything, as it isn't
>> used and I'm fairly sure that in its current form it cannot be used.
>
> Search for nulls in net/ipv4. This is definitely used throughout
> the network stack. As the aim is to convert as many existing hash
> tables to rhashtable as possible, we want to keep this.
Yes, I'm sure that nulls lists are very useful and I can see them used
in net/ipv4 (and I would like to use them myself). I don't want to
remove the nulls list concept from rhashtable, and my patch doesn't do
that.
It just removes nulls_base (as the subject says) and stops setting any
particular value in the nulls pointer, because the value is currently
never tested.
The point of nulls lists is, as I'm sure you know, to detect if an
object was moved to a different chain and to restart the search in that
case.
This means that on an unsuccessful search, we need to test
get_nulls_value() of the tail pointer.
Several times in net/ipv4/inet_hashtables.c (for example) we see code
like:
	/*
	 * if the nulls value we got at the end of this lookup is
	 * not the expected one, we must restart lookup.
	 * We probably met an item that was moved to another chain.
	 */
	if (get_nulls_value(node) != slot)
		goto begin;
which does exactly that. This test-and-restart cannot be done by a
caller to rhashtable_lookup_fast() as the caller doesn't even get to
the tail nulls.
It would need to be done in rhashtable.[ch].
If you like, I can make this functionality work rather than just
removing the useless parts.
I would change INIT_RHT_NULLS_HEAD() to be
	#define INIT_RHT_NULLS_HEAD(ptr, ht, hash) \
		((ptr) = (void *)NULLS_MARKER(((unsigned long)ptr) >> 1))
Then when __rhashtable_lookup() comes to the end of the chain
without finding anything it would do something like
	if (get_nulls_value(he) << 1 != (unsigned long)head)
		goto restart;
where 'head' is the head of the list that we would have saved at the
start.
i.e. instead of using a nulls_base and limiting the size of the hash, we
store the address of the bucket in the nulls.
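Sketched out (simplified, and with obj_matches_key() standing in for
the real key comparison), the tail of __rhashtable_lookup() would
become something like:

	struct rhash_head __rcu * const *head;
	struct rhash_head *he;
restart:
	head = rht_bucket(tbl, hash);
	rht_for_each_rcu_continue(he, *head, tbl, hash) {
		if (obj_matches_key(ht, he, key))	/* placeholder */
			return rht_obj(ht, he);
	}
	/* We ended on a nulls marker.  If it does not encode the bucket we
	 * started from, an object we traversed was moved to another chain
	 * by a rehash, so retry from the (possibly new) head. */
	if (get_nulls_value(he) << 1 != (unsigned long)head)
		goto restart;
	return NULL;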
In my mind that should be a separate patch. First remove the unused,
harmful code. Then add code to provide useful functionality. But
I can put the two together in one patch if you would prefer that.
Thanks,
NeilBrown