Hi Jens,
This series is against the for-3.16/blk-mq-tagging branch.
Cc: Ming Lei <[email protected]>
Cc: Jens Axboe <[email protected]>
Alexander Gordeev (3):
blk-mq: bitmap tag: fix races on shared ::wake_index fields
blk-mq: bitmap tag: fix race on blk_mq_bitmap_tags::wake_cnt
blk-mq: bitmap tag: clean-up bt_get() function
block/blk-mq-tag.c | 73 ++++++++++++++++++++++++++---------------------
block/blk-mq-tag.h | 2 +-
include/linux/blk-mq.h | 2 +-
3 files changed, 42 insertions(+), 35 deletions(-)
--
1.7.7.6
Fix racy updates of the shared blk_mq_bitmap_tags::wake_index
and blk_mq_hw_ctx::wait_index fields.
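To illustrate, below is a minimal standalone userspace model of the race
and of the cmpxchg-based advance this patch introduces (an illustrative
sketch only: C11 atomics and pthreads stand in for the kernel's atomic_t
API, and the test harness is invented for the example; the helper names
mirror the patch). With a plain shared index, two racing updaters can
both read the same value and store a stale result back; with the
compare-and-swap version, a failed exchange just means another CPU
already advanced the index, which is the desired outcome:

	/* Not kernel code: self-contained model of bt_index_atomic_inc() */
	#include <stdatomic.h>
	#include <pthread.h>
	#include <stdio.h>

	#define BT_WAIT_QUEUES 8

	static atomic_int wake_index;

	static int bt_index_inc(int index)
	{
		return (index + 1) & (BT_WAIT_QUEUES - 1);
	}

	/* Advance at most one step; if the cmpxchg fails, another thread
	 * advanced the index concurrently and we keep that value. */
	static void bt_index_atomic_inc(atomic_int *index)
	{
		int old = atomic_load(index);
		int new = bt_index_inc(old);

		atomic_compare_exchange_strong(index, &old, new);
	}

	static void *advance(void *arg)
	{
		for (int i = 0; i < 1000000; i++)
			bt_index_atomic_inc(&wake_index);
		return NULL;
	}

	int main(void)
	{
		pthread_t a, b;

		pthread_create(&a, NULL, advance, NULL);
		pthread_create(&b, NULL, advance, NULL);
		pthread_join(a, NULL);
		pthread_join(b, NULL);
		printf("wake_index = %d\n", atomic_load(&wake_index));
		return 0;
	}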
Cc: Ming Lei <[email protected]>
Cc: Jens Axboe <[email protected]>
Signed-off-by: Alexander Gordeev <[email protected]>
---
block/blk-mq-tag.c | 32 +++++++++++++++++++++-----------
block/blk-mq-tag.h | 2 +-
include/linux/blk-mq.h | 2 +-
3 files changed, 23 insertions(+), 13 deletions(-)
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index c80086c..efe9419 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -39,9 +39,16 @@ bool blk_mq_has_free_tags(struct blk_mq_tags *tags)
return bt_has_free_tags(&tags->bitmap_tags);
}
-static inline void bt_index_inc(unsigned int *index)
+static inline int bt_index_inc(int index)
{
- *index = (*index + 1) & (BT_WAIT_QUEUES - 1);
+ return (index + 1) & (BT_WAIT_QUEUES - 1);
+}
+
+static inline void bt_index_atomic_inc(atomic_t *index)
+{
+ int old = atomic_read(index);
+ int new = bt_index_inc(old);
+ atomic_cmpxchg(index, old, new);
}
/*
@@ -75,14 +82,14 @@ void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
* Will only throttle depth on non-reserved tags
*/
bt = &tags->bitmap_tags;
- wake_index = bt->wake_index;
+ wake_index = atomic_read(&bt->wake_index);
for (i = 0; i < BT_WAIT_QUEUES; i++) {
struct bt_wait_state *bs = &bt->bs[wake_index];
if (waitqueue_active(&bs->wait))
wake_up(&bs->wait);
- bt_index_inc(&wake_index);
+ wake_index = bt_index_inc(wake_index);
}
}
@@ -202,12 +209,14 @@ static struct bt_wait_state *bt_wait_ptr(struct blk_mq_bitmap_tags *bt,
struct blk_mq_hw_ctx *hctx)
{
struct bt_wait_state *bs;
+ int wait_index;
if (!hctx)
return &bt->bs[0];
- bs = &bt->bs[hctx->wait_index];
- bt_index_inc(&hctx->wait_index);
+ wait_index = atomic_read(&hctx->wait_index);
+ bs = &bt->bs[wait_index];
+ bt_index_atomic_inc(&hctx->wait_index);
return bs;
}
@@ -289,18 +298,19 @@ static struct bt_wait_state *bt_wake_ptr(struct blk_mq_bitmap_tags *bt)
{
int i, wake_index;
- wake_index = bt->wake_index;
+ wake_index = atomic_read(&bt->wake_index);
for (i = 0; i < BT_WAIT_QUEUES; i++) {
struct bt_wait_state *bs = &bt->bs[wake_index];
if (waitqueue_active(&bs->wait)) {
- if (wake_index != bt->wake_index)
- bt->wake_index = wake_index;
+ int o = atomic_read(&bt->wake_index);
+ if (wake_index != o)
+ atomic_cmpxchg(&bt->wake_index, o, wake_index);
return bs;
}
- bt_index_inc(&wake_index);
+ wake_index = bt_index_inc(wake_index);
}
return NULL;
@@ -320,7 +330,7 @@ static void bt_clear_tag(struct blk_mq_bitmap_tags *bt, unsigned int tag)
bs = bt_wake_ptr(bt);
if (bs && atomic_dec_and_test(&bs->wait_cnt)) {
atomic_set(&bs->wait_cnt, bt->wake_cnt);
- bt_index_inc(&bt->wake_index);
+ bt_index_atomic_inc(&bt->wake_index);
wake_up(&bs->wait);
}
}
diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h
index 0f5ec8b..96f13d7 100644
--- a/block/blk-mq-tag.h
+++ b/block/blk-mq-tag.h
@@ -27,7 +27,7 @@ struct blk_mq_bitmap_tags {
unsigned int map_nr;
struct blk_mq_bitmap *map;
- unsigned int wake_index;
+ atomic_t wake_index;
struct bt_wait_state *bs;
};
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 379f88d..9b86442 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -36,7 +36,7 @@ struct blk_mq_hw_ctx {
unsigned int nr_ctx;
struct blk_mq_ctx **ctxs;
- unsigned int wait_index;
+ atomic_t wait_index;
struct blk_mq_tags *tags;
--
1.7.7.6
The tag waiting loop in the bt_get() function is a mystery to me:
	do {
		bool was_empty;

1.		was_empty = list_empty(&wait.task_list);
		prepare_to_wait(&bs->wait, &wait, TASK_UNINTERRUPTIBLE);

		tag = __bt_get(hctx, bt, last_tag);
		if (tag != -1)
			break;

2.		if (was_empty)
3.			atomic_set(&bs->wait_cnt, bt->wake_cnt);

		io_schedule();
	} while (1);
[1] the list_empty(&wait.task_list) check is not protected;
[2] the was_empty check is always true, which results in *every* thread
    entering the loop resetting the bt_wait_state::wait_cnt counter,
    rather than every bt->wake_cnt'th thread (a standalone model of this
    follows below);
[3] the bt_wait_state::wait_cnt update seems redundant anyway, since it
    also gets reset in the bt_clear_tag() function.
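To make [2] concrete, here is a self-contained userspace model of what
the per-waiter reset does to the wake batching (an illustrative sketch,
not kernel code; WAKE_CNT and the helper names are local to the
example). The wake-up side only fires when the counter reaches zero, so
if every arriving waiter reseeds the counter to bt->wake_cnt, the
countdown never completes:

	#include <stdatomic.h>
	#include <stdbool.h>
	#include <stdio.h>

	#define WAKE_CNT 8

	static atomic_int wait_cnt = WAKE_CNT;

	/* What each pass of the loop effectively did: was_empty evaluates
	 * to true on every iteration, so every waiter reseeds the batch. */
	static void waiter_enter(void)
	{
		bool was_empty = true;	/* always true in practice */

		if (was_empty)
			atomic_store(&wait_cnt, WAKE_CNT);
	}

	/* The tag-release side: wake waiters only when the batch drains. */
	static bool tag_release_wakes(void)
	{
		return atomic_fetch_sub(&wait_cnt, 1) == 1;
	}

	int main(void)
	{
		int wakeups = 0;

		for (int i = 0; i < 64; i++) {
			waiter_enter();			/* a waiter arrives...   */
			if (tag_release_wakes())	/* ...then a tag is freed */
				wakeups++;
		}
		/* Prints 0 wake-ups instead of the expected 64 / 8 = 8. */
		printf("wake-ups issued: %d\n", wakeups);
		return 0;
	}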
Cc: Ming Lei <[email protected]>
Cc: Jens Axboe <[email protected]>
Signed-off-by: Alexander Gordeev <[email protected]>
---
block/blk-mq-tag.c | 27 +++++++--------------------
1 files changed, 7 insertions(+), 20 deletions(-)
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 5579fae..dc1f684 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -224,7 +224,6 @@ static int bt_get(struct blk_mq_bitmap_tags *bt, struct blk_mq_hw_ctx *hctx,
unsigned int *last_tag, gfp_t gfp)
{
struct bt_wait_state *bs;
- DEFINE_WAIT(wait);
int tag;
tag = __bt_get(hctx, bt, last_tag);
@@ -235,23 +234,9 @@ static int bt_get(struct blk_mq_bitmap_tags *bt, struct blk_mq_hw_ctx *hctx,
return -1;
bs = bt_wait_ptr(bt, hctx);
- do {
- bool was_empty;
-
- was_empty = list_empty(&wait.task_list);
- prepare_to_wait(&bs->wait, &wait, TASK_UNINTERRUPTIBLE);
-
- tag = __bt_get(hctx, bt, last_tag);
- if (tag != -1)
- break;
+ ___wait_event(bs->wait, (tag = __bt_get(hctx, bt, last_tag)) != -1,
+ TASK_UNINTERRUPTIBLE, 0, 0, io_schedule());
- if (was_empty)
- atomic_set(&bs->wait_cnt, bt->wake_cnt);
-
- io_schedule();
- } while (1);
-
- finish_wait(&bs->wait, &wait);
return tag;
}
@@ -477,13 +462,15 @@ static int bt_alloc(struct blk_mq_bitmap_tags *bt, unsigned int depth,
return -ENOMEM;
}
- for (i = 0; i < BT_WAIT_QUEUES; i++)
- init_waitqueue_head(&bt->bs[i].wait);
-
bt->wake_cnt = BT_WAIT_BATCH;
if (bt->wake_cnt > depth / 4)
bt->wake_cnt = max(1U, depth / 4);
+ for (i = 0; i < BT_WAIT_QUEUES; i++) {
+ init_waitqueue_head(&bt->bs[i].wait);
+ atomic_set(&bt->bs[i].wait_cnt, bt->wake_cnt);
+ }
+
bt->depth = depth;
return 0;
}
--
1.7.7.6
This piece of code in the bt_clear_tag() function is racy:
	bs = bt_wake_ptr(bt);
	if (bs && atomic_dec_and_test(&bs->wait_cnt)) {
		atomic_set(&bs->wait_cnt, bt->wake_cnt);
		wake_up(&bs->wait);
	}
Since nothing prevents bt_wake_ptr() from returning the very
same 'bs' address on multiple CPUs, the following scenario is
possible:
   CPU1                                        CPU2
   ----                                        ----
0. bs = bt_wake_ptr(bt);                       bs = bt_wake_ptr(bt);
1. atomic_dec_and_test(&bs->wait_cnt)
2.                                             atomic_dec_and_test(&bs->wait_cnt)
3. atomic_set(&bs->wait_cnt, bt->wake_cnt);
If the decrement in [1] yields zero then, for some amount of time, the
decrement in [2] leaves the counter with a negative/overflowed value,
which is not expected. The follow-up assignment in [3] overwrites the
invalid value with the batch value (and likely prevents the issue from
being severe), but it is still incorrect and should be avoided.
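The counter logic of the fix below can be modeled in isolation. This is
a standalone C11-atomics rendering of the new bt_clear_tag() behaviour
(an approximation, not the kernel code: the wake_index bump and the
actual wake_up() call are elided, and the driver loop in main() is
invented for the example):

	#include <stdatomic.h>
	#include <stdbool.h>
	#include <stdio.h>

	#define WAKE_CNT 8

	static atomic_int wait_cnt = WAKE_CNT;

	/* Returns true when the caller owns the wake-up duty. The counter
	 * may transiently go negative when several CPUs race past zero;
	 * the losers undo their decrement, and an undo that itself
	 * restores zero takes the wake-up path as well. */
	static bool clear_tag_should_wake(void)
	{
		int cnt = atomic_fetch_sub(&wait_cnt, 1) - 1; /* dec_return */

		if (cnt == 0)
			goto wake;
		if (cnt < 0) {
			cnt = atomic_fetch_add(&wait_cnt, 1) + 1; /* inc_return */
			if (cnt == 0)
				goto wake;
		}
		return false;
	wake:
		atomic_fetch_add(&wait_cnt, WAKE_CNT);	/* replenish batch */
		return true;
	}

	int main(void)
	{
		int wakeups = 0;

		for (int i = 0; i < 64; i++)
			wakeups += clear_tag_should_wake();
		printf("wake-ups: %d\n", wakeups);	/* 64 / 8 = 8 */
		return 0;
	}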
Cc: Ming Lei <[email protected]>
Cc: Jens Axboe <[email protected]>
Signed-off-by: Alexander Gordeev <[email protected]>
---
block/blk-mq-tag.c | 14 ++++++++++++--
1 files changed, 12 insertions(+), 2 deletions(-)
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index efe9419..5579fae 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -320,6 +320,7 @@ static void bt_clear_tag(struct blk_mq_bitmap_tags *bt, unsigned int tag)
{
const int index = TAG_TO_INDEX(bt, tag);
struct bt_wait_state *bs;
+ int wait_cnt;
/*
* The unlock memory barrier need to order access to req in free
@@ -328,10 +329,19 @@ static void bt_clear_tag(struct blk_mq_bitmap_tags *bt, unsigned int tag)
clear_bit_unlock(TAG_TO_BIT(bt, tag), &bt->map[index].word);
bs = bt_wake_ptr(bt);
- if (bs && atomic_dec_and_test(&bs->wait_cnt)) {
- atomic_set(&bs->wait_cnt, bt->wake_cnt);
+ if (!bs)
+ return;
+
+ wait_cnt = atomic_dec_return(&bs->wait_cnt);
+ if (wait_cnt == 0) {
+wake:
+ atomic_add(bt->wake_cnt, &bs->wait_cnt);
bt_index_atomic_inc(&bt->wake_index);
wake_up(&bs->wait);
+ } else if (wait_cnt < 0) {
+ wait_cnt = atomic_inc_return(&bs->wait_cnt);
+ if (!wait_cnt)
+ goto wake;
}
}
--
1.7.7.6
Jens, did you plan to pick these up? There are a few bits of low-hanging
fruit for cleanups / micro-optimizations in this area, but I don't want
to create conflicts with Alexander's work.
On Thu, Jun 12, 2014 at 05:05:35PM +0200, Alexander Gordeev wrote:
> Hi Jens,
>
> This series is against for-3.16/blk-mq-tagging branch.
>
> Cc: Ming Lei <[email protected]>
> Cc: Jens Axboe <[email protected]>
>
> Alexander Gordeev (3):
> blk-mq: bitmap tag: fix races on shared ::wake_index fields
> blk-mq: bitmap tag: fix race on blk_mq_bitmap_tags::wake_cnt
> blk-mq: bitmap tag: clean-up bt_get() function
>
> block/blk-mq-tag.c | 73 ++++++++++++++++++++++++++---------------------
> block/blk-mq-tag.h | 2 +-
> include/linux/blk-mq.h | 2 +-
> 3 files changed, 42 insertions(+), 35 deletions(-)
>
> --
> 1.7.7.6
>
---end quoted text---
On 2014-06-17 04:35, Christoph Hellwig wrote:
> Jens, did you plan to pick these up? There are a few bits of low-hanging
> fruit for cleanups / micro-optimizations in this area, but I don't want
> to create conflicts with Alexander's work.
I'm evaluating just getting rid of the cyclic waits, which is why I've
held off. But the patches don't apply to the current tree anyway, so
they would need a rediffing no matter what.
--
Jens Axboe
On Tue, Jun 17, 2014 at 07:21:19AM -0600, Jens Axboe wrote:
> On 2014-06-17 04:35, Christoph Hellwig wrote:
> >Jens, did you plan to pick these up? There are a few bits of low-hanging
> >fruit for cleanups / micro-optimizations in this area, but I don't want
> >to create conflicts with Alexander's work.
>
> I'm evaluating just getting rid of the cyclic waits, which is why
> I've held off. But the patches don't apply to the current tree
> anyway, so they would need a rediffing no matter what.
Hi Jens,
Patches 1/3 and 2/3 apply to Linus's tree and could go in without patch 3/3.
I'll post v2 of 3/3.
> --
> Jens Axboe
>
--
Regards,
Alexander Gordeev
[email protected]
This update fixes a few issues in the bt_get() function:
- the list_empty(&wait.task_list) check is not protected;
- the was_empty check is always true, which results in *every* thread
  entering the loop resetting the bt_wait_state::wait_cnt counter,
  rather than every bt->wake_cnt'th thread;
- the bt_wait_state::wait_cnt update is redundant, since it also gets
  reset in the bt_clear_tag() function.
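The bt_alloc() reshuffle below also fixes an ordering detail:
bt_update_count() has to run before the wait-queue loop so that
bt->wake_cnt already holds its final value when each wait_cnt is seeded
from it. A standalone sketch of that dependency (illustrative only; the
struct layout is reduced to the fields that matter here, and the
depth / 4 clamp follows the kernel code):

	#include <stdatomic.h>
	#include <stdio.h>

	#define BT_WAIT_QUEUES	8
	#define BT_WAIT_BATCH	8

	struct bt_wait_state {
		atomic_int wait_cnt;
	};

	struct bitmap_tags {
		unsigned int wake_cnt;
		struct bt_wait_state bs[BT_WAIT_QUEUES];
	};

	/* Same clamp as the kernel: batch size, capped at depth / 4. */
	static void update_count(struct bitmap_tags *bt, unsigned int depth)
	{
		bt->wake_cnt = BT_WAIT_BATCH;
		if (bt->wake_cnt > depth / 4)
			bt->wake_cnt = depth / 4 ? depth / 4 : 1;
	}

	static void alloc_init(struct bitmap_tags *bt, unsigned int depth)
	{
		update_count(bt, depth);	/* must precede the loop */

		for (int i = 0; i < BT_WAIT_QUEUES; i++)
			atomic_init(&bt->bs[i].wait_cnt, bt->wake_cnt);
	}

	int main(void)
	{
		struct bitmap_tags bt;

		alloc_init(&bt, 16);
		printf("wake_cnt=%u wait_cnt[0]=%d\n",
		       bt.wake_cnt, atomic_load(&bt.bs[0].wait_cnt));
		return 0;
	}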
Cc: Christoph Hellwig <[email protected]>
Cc: Ming Lei <[email protected]>
Cc: Jens Axboe <[email protected]>
Signed-off-by: Alexander Gordeev <[email protected]>
---
block/blk-mq-tag.c | 13 +++++--------
1 files changed, 5 insertions(+), 8 deletions(-)
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 08fc671..c1b9242 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -248,18 +248,12 @@ static int bt_get(struct blk_mq_alloc_data *data,
bs = bt_wait_ptr(bt, hctx);
do {
- bool was_empty;
-
- was_empty = list_empty(&wait.task_list);
prepare_to_wait(&bs->wait, &wait, TASK_UNINTERRUPTIBLE);
tag = __bt_get(hctx, bt, last_tag);
if (tag != -1)
break;
- if (was_empty)
- atomic_set(&bs->wait_cnt, bt->wake_cnt);
-
blk_mq_put_ctx(data->ctx);
io_schedule();
@@ -519,10 +513,13 @@ static int bt_alloc(struct blk_mq_bitmap_tags *bt, unsigned int depth,
return -ENOMEM;
}
- for (i = 0; i < BT_WAIT_QUEUES; i++)
+ bt_update_count(bt, depth);
+
+ for (i = 0; i < BT_WAIT_QUEUES; i++) {
init_waitqueue_head(&bt->bs[i].wait);
+ atomic_set(&bt->bs[i].wait_cnt, bt->wake_cnt);
+ }
- bt_update_count(bt, depth);
return 0;
}
--
1.7.7.6
--
Regards,
Alexander Gordeev
[email protected]
On 2014-06-17 10:39, Alexander Gordeev wrote:
> On Tue, Jun 17, 2014 at 07:21:19AM -0600, Jens Axboe wrote:
>> On 2014-06-17 04:35, Christoph Hellwig wrote:
>>> Jens, did you plan to pick these up? There are a few bits of low-hanging
>>> fruit for cleanups / micro-optimizations in this area, but I don't want
>>> to create conflicts with Alexander's work.
>>
>> I'm evaluating just getting rid of the cyclic waits, which is why
>> I've held off. But the patches don't apply to the current tree
>> anyway, so they would need a rediffing no matter what.
>
> Hi Jens,
>
> Patches 1/3 and 2/3 apply to Linus's tree and could go in without patch 3/3.
> I'll post v2 of 3/3.
As mentioned in another thread, I'm sort-of hoping we can just get rid
of the nasty multiple waitqueues. Will run some testing when I'm back.
If we still see a nice gain from having the cycling waitqueues, then I'll
test and apply your patches to close up the holes.
--
Jens Axboe