This patchset addresses several race conditions in CPU hotplug handling
for blk-mq. All of the problems are reproducible with the following script.
while true; do
echo 0 > /sys/devices/system/cpu/cpu1/online
echo 1 > /sys/devices/system/cpu/cpu1/online
done &
while true; do
modprobe -r null_blk
modprobe null_blk queue_mode=2 irqmode=1
sleep 0.1
done
* Changes from v1
- Release q->mq_map in blk_mq_release()
- Fix deadlock when reading cpu_list
- Fix race between freeze and unfreeze
Akinobu Mita (6):
blk-mq: fix sysfs registration/unregistration race
blk-mq: Fix use after free of q->mq_map
blk-mq: fix q->mq_usage_counter access race
blk-mq: avoid inserting requests before establishing new mapping
blk-mq: fix freeze queue race
blk-mq: fix deadlock when reading cpu_list
block/blk-core.c | 2 ++
block/blk-mq-cpumap.c | 9 +++---
block/blk-mq-sysfs.c | 50 ++++++++++++++++++++++---------
block/blk-mq.c | 81 ++++++++++++++++++++++++++++++--------------------
block/blk-mq.h | 5 ++--
include/linux/blkdev.h | 9 ++++++
6 files changed, 102 insertions(+), 54 deletions(-)
Cc: Jens Axboe <[email protected]>
Cc: Ming Lei <[email protected]>
--
1.9.1
There is a race between cpu hotplug handling and adding/deleting
gendisk for blk-mq, where both are trying to register and unregister
the same sysfs entries.
CPU hotplug handling for blk-mq (blk_mq_queue_reinit) unregisters and
re-registers the sysfs entries of each hctx for all request queues in
all_q_list. On the other hand, these entries for a request queue may
already have been unregistered, or may still be in the middle of
registration, when a CPU hotplug event occurs. This is because those
sysfs entries are registered by blk_mq_register_disk() only after the
request queue has been added to all_q_list, and similarly the request
queue is deleted from all_q_list only after those entries have been
unregistered by blk_mq_unregister_disk().
So we need proper mutual exclusion between those registration and
unregistration paths.
Signed-off-by: Akinobu Mita <[email protected]>
Cc: Jens Axboe <[email protected]>
Cc: Ming Lei <[email protected]>
---
block/blk-core.c | 1 +
block/blk-mq-sysfs.c | 44 ++++++++++++++++++++++++++++++++++----------
include/linux/blkdev.h | 3 +++
3 files changed, 38 insertions(+), 10 deletions(-)
diff --git a/block/blk-core.c b/block/blk-core.c
index 82819e6..bbf67cd 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -687,6 +687,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
__set_bit(QUEUE_FLAG_BYPASS, &q->queue_flags);
init_waitqueue_head(&q->mq_freeze_wq);
+ mutex_init(&q->mq_sysfs_lock);
if (blkcg_init_queue(q))
goto fail_bdi;
diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c
index b79685e..79a3e8d 100644
--- a/block/blk-mq-sysfs.c
+++ b/block/blk-mq-sysfs.c
@@ -332,13 +332,15 @@ static void blk_mq_unregister_hctx(struct blk_mq_hw_ctx *hctx)
struct blk_mq_ctx *ctx;
int i;
- if (!hctx->nr_ctx || !(hctx->flags & BLK_MQ_F_SYSFS_UP))
+ if (!(hctx->flags & BLK_MQ_F_SYSFS_UP))
return;
hctx_for_each_ctx(hctx, ctx, i)
kobject_del(&ctx->kobj);
kobject_del(&hctx->kobj);
+
+ hctx->flags &= ~BLK_MQ_F_SYSFS_UP;
}
static int blk_mq_register_hctx(struct blk_mq_hw_ctx *hctx)
@@ -347,7 +349,7 @@ static int blk_mq_register_hctx(struct blk_mq_hw_ctx *hctx)
struct blk_mq_ctx *ctx;
int i, ret;
- if (!hctx->nr_ctx || !(hctx->flags & BLK_MQ_F_SYSFS_UP))
+ if (!hctx->nr_ctx)
return 0;
ret = kobject_add(&hctx->kobj, &q->mq_kobj, "%u", hctx->queue_num);
@@ -357,10 +359,12 @@ static int blk_mq_register_hctx(struct blk_mq_hw_ctx *hctx)
hctx_for_each_ctx(hctx, ctx, i) {
ret = kobject_add(&ctx->kobj, &hctx->kobj, "cpu%u", ctx->cpu);
if (ret)
- break;
+ return ret;
}
- return ret;
+ hctx->flags |= BLK_MQ_F_SYSFS_UP;
+
+ return 0;
}
void blk_mq_unregister_disk(struct gendisk *disk)
@@ -370,6 +374,8 @@ void blk_mq_unregister_disk(struct gendisk *disk)
struct blk_mq_ctx *ctx;
int i, j;
+ mutex_lock(&q->mq_sysfs_lock);
+
queue_for_each_hw_ctx(q, hctx, i) {
blk_mq_unregister_hctx(hctx);
@@ -384,6 +390,9 @@ void blk_mq_unregister_disk(struct gendisk *disk)
kobject_put(&q->mq_kobj);
kobject_put(&disk_to_dev(disk)->kobj);
+
+ q->mq_sysfs_init_done = false;
+ mutex_unlock(&q->mq_sysfs_lock);
}
static void blk_mq_sysfs_init(struct request_queue *q)
@@ -414,27 +423,30 @@ int blk_mq_register_disk(struct gendisk *disk)
struct blk_mq_hw_ctx *hctx;
int ret, i;
+ mutex_lock(&q->mq_sysfs_lock);
+
blk_mq_sysfs_init(q);
ret = kobject_add(&q->mq_kobj, kobject_get(&dev->kobj), "%s", "mq");
if (ret < 0)
- return ret;
+ goto out;
kobject_uevent(&q->mq_kobj, KOBJ_ADD);
queue_for_each_hw_ctx(q, hctx, i) {
- hctx->flags |= BLK_MQ_F_SYSFS_UP;
ret = blk_mq_register_hctx(hctx);
if (ret)
break;
}
- if (ret) {
+ if (ret)
blk_mq_unregister_disk(disk);
- return ret;
- }
+ else
+ q->mq_sysfs_init_done = true;
+out:
+ mutex_unlock(&q->mq_sysfs_lock);
- return 0;
+ return ret;
}
EXPORT_SYMBOL_GPL(blk_mq_register_disk);
@@ -443,8 +455,14 @@ void blk_mq_sysfs_unregister(struct request_queue *q)
struct blk_mq_hw_ctx *hctx;
int i;
+ mutex_lock(&q->mq_sysfs_lock);
+ if (!q->mq_sysfs_init_done)
+ goto out;
+
queue_for_each_hw_ctx(q, hctx, i)
blk_mq_unregister_hctx(hctx);
+out:
+ mutex_unlock(&q->mq_sysfs_lock);
}
int blk_mq_sysfs_register(struct request_queue *q)
@@ -452,11 +470,17 @@ int blk_mq_sysfs_register(struct request_queue *q)
struct blk_mq_hw_ctx *hctx;
int i, ret = 0;
+ mutex_lock(&q->mq_sysfs_lock);
+ if (!q->mq_sysfs_init_done)
+ goto out;
+
queue_for_each_hw_ctx(q, hctx, i) {
ret = blk_mq_register_hctx(hctx);
if (ret)
break;
}
+out:
+ mutex_unlock(&q->mq_sysfs_lock);
return ret;
}
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index d4068c1..c56f5a6 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -462,6 +462,9 @@ struct request_queue {
struct blk_mq_tag_set *tag_set;
struct list_head tag_set_list;
+
+ struct mutex mq_sysfs_lock;
+ bool mq_sysfs_init_done;
};
#define QUEUE_FLAG_QUEUED 1 /* uses generic tag queueing */
--
1.9.1
CPU hotplug handling for blk-mq (blk_mq_queue_reinit) updates
q->mq_map by blk_mq_update_queue_map() for all request queues in
all_q_list. On the other hand, q->mq_map is released before deleting
the queue from all_q_list.
So if a CPU hotplug event occurs in the window, an invalid memory access
can happen. Fix it by releasing q->mq_map in blk_mq_release() so that it
happens later than the removal from all_q_list.
Signed-off-by: Akinobu Mita <[email protected]>
Suggested-by: Ming Lei <[email protected]>
Cc: Jens Axboe <[email protected]>
Cc: Ming Lei <[email protected]>
---
block/blk-mq.c | 8 +++-----
1 file changed, 3 insertions(+), 5 deletions(-)
diff --git a/block/blk-mq.c b/block/blk-mq.c
index f537796..c5da908 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1928,6 +1928,9 @@ void blk_mq_release(struct request_queue *q)
kfree(hctx);
}
+ kfree(q->mq_map);
+ q->mq_map = NULL;
+
kfree(q->queue_hw_ctx);
/* ctx kobj stays in queue_ctx */
@@ -2073,11 +2076,6 @@ void blk_mq_free_queue(struct request_queue *q)
blk_mq_free_hw_queues(q, set);
percpu_ref_exit(&q->mq_usage_counter);
-
- kfree(q->mq_map);
-
- q->mq_map = NULL;
-
mutex_lock(&all_q_mutex);
list_del_init(&q->all_q_node);
mutex_unlock(&all_q_mutex);
--
1.9.1
CPU hotplug handling for blk-mq (blk_mq_queue_reinit) accesses
q->mq_usage_counter while freezing all request queues in all_q_list.
On the other hand, q->mq_usage_counter is deinitialized in
blk_mq_free_queue() before deleting the queue from all_q_list.
So if a CPU hotplug event occurs in the window, percpu_ref_kill() is
called on q->mq_usage_counter, which has already been marked dead, and
that triggers a warning. Fix it by deleting the queue from all_q_list
before destroying q->mq_usage_counter.
Signed-off-by: Akinobu Mita <[email protected]>
Cc: Jens Axboe <[email protected]>
Cc: Ming Lei <[email protected]>
---
block/blk-mq.c | 7 ++++---
1 file changed, 4 insertions(+), 3 deletions(-)
diff --git a/block/blk-mq.c b/block/blk-mq.c
index c5da908..ff72e18 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2070,15 +2070,16 @@ void blk_mq_free_queue(struct request_queue *q)
{
struct blk_mq_tag_set *set = q->tag_set;
+ mutex_lock(&all_q_mutex);
+ list_del_init(&q->all_q_node);
+ mutex_unlock(&all_q_mutex);
+
blk_mq_del_queue_tag_set(q);
blk_mq_exit_hw_queues(q, set, set->nr_hw_queues);
blk_mq_free_hw_queues(q, set);
percpu_ref_exit(&q->mq_usage_counter);
- mutex_lock(&all_q_mutex);
- list_del_init(&q->all_q_node);
- mutex_unlock(&all_q_mutex);
}
/* Basically redo blk_mq_init_queue with queue frozen */
--
1.9.1
Notifier callbacks for the CPU_ONLINE action can run on a CPU other
than the one that has just been onlined. So it is possible for a
process running on the just-onlined CPU to insert a request and run a
hw queue before the new mapping is established by
blk_mq_queue_reinit_notify().
This can cause a problem when the CPU is onlined for the first time
since the request queue was initialized. At that point ctx->index_hw
for the CPU, which is the index into hctx->ctxs[] for this ctx, is
still zero, because blk_mq_queue_reinit_notify() has not yet been
called by the notifier callbacks for the CPU_ONLINE action.
For example, suppose there is a single hw queue (hctx) and two CPU
queues (ctx0 for CPU0 and ctx1 for CPU1). Now CPU1 is just onlined and
a request is inserted into ctx1->rq_list; bit0 is set in the pending
bitmap because ctx1->index_hw is still zero.
Then, while running the hw queue, flush_busy_ctxs() finds bit0 set in
the pending bitmap and tries to retrieve the requests in
hctx->ctxs[0]->rq_list. But hctx->ctxs[0] is a pointer to ctx0, so the
request in ctx1->rq_list is ignored.
Fix it by ensuring that the new mapping is established before the
onlined CPU starts running.
Signed-off-by: Akinobu Mita <[email protected]>
Cc: Jens Axboe <[email protected]>
Cc: Ming Lei <[email protected]>
---
block/blk-mq-cpumap.c | 9 +++++----
block/blk-mq.c | 41 ++++++++++++++++++++++++++---------------
block/blk-mq.h | 3 ++-
3 files changed, 33 insertions(+), 20 deletions(-)
diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c
index 1e28ddb..8764c24 100644
--- a/block/blk-mq-cpumap.c
+++ b/block/blk-mq-cpumap.c
@@ -31,7 +31,8 @@ static int get_first_sibling(unsigned int cpu)
return cpu;
}
-int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues)
+int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues,
+ const struct cpumask *online_mask)
{
unsigned int i, nr_cpus, nr_uniq_cpus, queue, first_sibling;
cpumask_var_t cpus;
@@ -41,7 +42,7 @@ int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues)
cpumask_clear(cpus);
nr_cpus = nr_uniq_cpus = 0;
- for_each_online_cpu(i) {
+ for_each_cpu(i, online_mask) {
nr_cpus++;
first_sibling = get_first_sibling(i);
if (!cpumask_test_cpu(first_sibling, cpus))
@@ -51,7 +52,7 @@ int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues)
queue = 0;
for_each_possible_cpu(i) {
- if (!cpu_online(i)) {
+ if (!cpumask_test_cpu(i, online_mask)) {
map[i] = 0;
continue;
}
@@ -95,7 +96,7 @@ unsigned int *blk_mq_make_queue_map(struct blk_mq_tag_set *set)
if (!map)
return NULL;
- if (!blk_mq_update_queue_map(map, set->nr_hw_queues))
+ if (!blk_mq_update_queue_map(map, set->nr_hw_queues, cpu_online_mask))
return map;
kfree(map);
diff --git a/block/blk-mq.c b/block/blk-mq.c
index ff72e18..ad07373 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1799,7 +1799,8 @@ static void blk_mq_init_cpu_queues(struct request_queue *q,
}
}
-static void blk_mq_map_swqueue(struct request_queue *q)
+static void blk_mq_map_swqueue(struct request_queue *q,
+ const struct cpumask *online_mask)
{
unsigned int i;
struct blk_mq_hw_ctx *hctx;
@@ -1816,7 +1817,7 @@ static void blk_mq_map_swqueue(struct request_queue *q)
*/
queue_for_each_ctx(q, ctx, i) {
/* If the cpu isn't online, the cpu is mapped to first hctx */
- if (!cpu_online(i))
+ if (!cpumask_test_cpu(i, online_mask))
continue;
hctx = q->mq_ops->map_queue(q, i);
@@ -2046,7 +2047,10 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
blk_mq_add_queue_tag_set(set, q);
- blk_mq_map_swqueue(q);
+ get_online_cpus();
+ blk_mq_update_queue_map(q->mq_map, q->nr_hw_queues, cpu_online_mask);
+ blk_mq_map_swqueue(q, cpu_online_mask);
+ put_online_cpus();
return q;
@@ -2083,13 +2087,14 @@ void blk_mq_free_queue(struct request_queue *q)
}
/* Basically redo blk_mq_init_queue with queue frozen */
-static void blk_mq_queue_reinit(struct request_queue *q)
+static void blk_mq_queue_reinit(struct request_queue *q,
+ const struct cpumask *online_mask)
{
WARN_ON_ONCE(!atomic_read(&q->mq_freeze_depth));
blk_mq_sysfs_unregister(q);
- blk_mq_update_queue_map(q->mq_map, q->nr_hw_queues);
+ blk_mq_update_queue_map(q->mq_map, q->nr_hw_queues, online_mask);
/*
* redo blk_mq_init_cpu_queues and blk_mq_init_hw_queues. FIXME: maybe
@@ -2097,25 +2102,31 @@ static void blk_mq_queue_reinit(struct request_queue *q)
* involves free and re-allocate memory, worthy doing?)
*/
- blk_mq_map_swqueue(q);
+ blk_mq_map_swqueue(q, online_mask);
blk_mq_sysfs_register(q);
}
+static struct cpumask online_new;
+
static int blk_mq_queue_reinit_notify(struct notifier_block *nb,
unsigned long action, void *hcpu)
{
struct request_queue *q;
+ int cpu = (unsigned long)hcpu;
- /*
- * Before new mappings are established, hotadded cpu might already
- * start handling requests. This doesn't break anything as we map
- * offline CPUs to first hardware queue. We will re-init the queue
- * below to get optimal settings.
- */
- if (action != CPU_DEAD && action != CPU_DEAD_FROZEN &&
- action != CPU_ONLINE && action != CPU_ONLINE_FROZEN)
+ switch (action & ~CPU_TASKS_FROZEN) {
+ case CPU_DEAD:
+ case CPU_UP_CANCELED:
+ cpumask_copy(&online_new, cpu_online_mask);
+ break;
+ case CPU_UP_PREPARE:
+ cpumask_copy(&online_new, cpu_online_mask);
+ cpumask_set_cpu(cpu, &online_new);
+ break;
+ default:
return NOTIFY_OK;
+ }
mutex_lock(&all_q_mutex);
@@ -2139,7 +2150,7 @@ static int blk_mq_queue_reinit_notify(struct notifier_block *nb,
}
list_for_each_entry(q, &all_q_list, all_q_node)
- blk_mq_queue_reinit(q);
+ blk_mq_queue_reinit(q, &online_new);
list_for_each_entry(q, &all_q_list, all_q_node)
blk_mq_unfreeze_queue(q);
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 6a48c4c..f4fea79 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -51,7 +51,8 @@ void blk_mq_disable_hotplug(void);
* CPU -> queue mappings
*/
extern unsigned int *blk_mq_make_queue_map(struct blk_mq_tag_set *set);
-extern int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues);
+extern int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues,
+ const struct cpumask *online_mask);
extern int blk_mq_hw_queue_to_node(unsigned int *map, unsigned int);
/*
--
1.9.1
There are several race conditions while freezing the queue.
When unfreezing the queue, there is a small window between decrementing
q->mq_freeze_depth to zero and the percpu_ref_reinit() call on
q->mq_usage_counter. If another context calls
blk_mq_freeze_queue_start() in that window, q->mq_freeze_depth is
increased from zero to one and percpu_ref_kill() is called on
q->mq_usage_counter, which has already been killed. The percpu
refcount should be re-initialized before being killed again.
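In code terms, the window is in blk_mq_unfreeze_queue(), which currently
does (slightly abridged; see the hunk below):
	freeze_depth = atomic_dec_return(&q->mq_freeze_depth);
	if (!freeze_depth) {
		/* another context can run blk_mq_freeze_queue_start() here,
		 * see mq_freeze_depth go 0 -> 1, and percpu_ref_kill() the
		 * counter before it is re-initialized below */
		percpu_ref_reinit(&q->mq_usage_counter);
		wake_up_all(&q->mq_freeze_wq);
	}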
Also, there is a race condition while switching to percpu mode.
percpu_ref_switch_to_percpu() and percpu_ref_kill() must not be
executed at the same time, because the following scenario is possible:
1. q->mq_usage_counter is initialized in atomic mode.
(atomic counter: 1)
2. After the disk registration, a process like systemd-udev starts
accessing the disk, and successfully increases the refcount
by percpu_ref_tryget_live() in blk_mq_queue_enter().
(atomic counter: 2)
3. In the final stage of initialization, q->mq_usage_counter is being
switched to percpu mode by percpu_ref_switch_to_percpu() in
blk_mq_finish_init(). But if CONFIG_PREEMPT_VOLUNTARY is enabled,
the process is rescheduled in the middle of switching when calling
wait_event() in __percpu_ref_switch_to_percpu().
(atomic counter: 2)
4. CPU hotplug handling for blk-mq calls percpu_ref_kill() to freeze
request queue. q->mq_usage_counter is decreased and marked as
DEAD. Wait until all requests have finished.
(atomic counter: 1)
5. The process rescheduled in step 3 is resumed and finishes
all remaining work in __percpu_ref_switch_to_percpu().
A bias value is added to atomic counter of q->mq_usage_counter.
(atomic counter: PERCPU_COUNT_BIAS + 1)
6. A request issued in step 2 is finished and q->mq_usage_counter
is decreased by blk_mq_queue_exit(). q->mq_usage_counter is DEAD,
so atomic counter is decreased and no release handler is called.
(atomic counter: PERCPU_COUNT_BIAS)
7. CPU hotplug handling in step 4 will wait forever as
q->mq_usage_counter will never be zero.
Also, percpu_ref_reinit() and percpu_ref_kill() must not be executed
at the same time, because both functions could call
__percpu_ref_switch_to_percpu(), which adds the bias value and
initializes the percpu counter.
Fix those races by serializing these operations with a per-queue mutex.
Signed-off-by: Akinobu Mita <[email protected]>
Cc: Jens Axboe <[email protected]>
Cc: Ming Lei <[email protected]>
---
block/blk-core.c | 1 +
block/blk-mq-sysfs.c | 2 ++
block/blk-mq.c | 8 ++++++++
include/linux/blkdev.h | 6 ++++++
4 files changed, 17 insertions(+)
diff --git a/block/blk-core.c b/block/blk-core.c
index bbf67cd..f3c5ae2 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -687,6 +687,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
__set_bit(QUEUE_FLAG_BYPASS, &q->queue_flags);
init_waitqueue_head(&q->mq_freeze_wq);
+ mutex_init(&q->mq_freeze_lock);
mutex_init(&q->mq_sysfs_lock);
if (blkcg_init_queue(q))
diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c
index 79a3e8d..8448513 100644
--- a/block/blk-mq-sysfs.c
+++ b/block/blk-mq-sysfs.c
@@ -413,7 +413,9 @@ static void blk_mq_sysfs_init(struct request_queue *q)
/* see blk_register_queue() */
void blk_mq_finish_init(struct request_queue *q)
{
+ mutex_lock(&q->mq_freeze_lock);
percpu_ref_switch_to_percpu(&q->mq_usage_counter);
+ mutex_unlock(&q->mq_freeze_lock);
}
int blk_mq_register_disk(struct gendisk *disk)
diff --git a/block/blk-mq.c b/block/blk-mq.c
index ad07373..f31de35 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -115,11 +115,15 @@ void blk_mq_freeze_queue_start(struct request_queue *q)
{
int freeze_depth;
+ mutex_lock(&q->mq_freeze_lock);
+
freeze_depth = atomic_inc_return(&q->mq_freeze_depth);
if (freeze_depth == 1) {
percpu_ref_kill(&q->mq_usage_counter);
blk_mq_run_hw_queues(q, false);
}
+
+ mutex_unlock(&q->mq_freeze_lock);
}
EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_start);
@@ -143,12 +147,16 @@ void blk_mq_unfreeze_queue(struct request_queue *q)
{
int freeze_depth;
+ mutex_lock(&q->mq_freeze_lock);
+
freeze_depth = atomic_dec_return(&q->mq_freeze_depth);
WARN_ON_ONCE(freeze_depth < 0);
if (!freeze_depth) {
percpu_ref_reinit(&q->mq_usage_counter);
wake_up_all(&q->mq_freeze_wq);
}
+
+ mutex_unlock(&q->mq_freeze_lock);
}
EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index c56f5a6..0bf8bea 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -457,6 +457,12 @@ struct request_queue {
#endif
struct rcu_head rcu_head;
wait_queue_head_t mq_freeze_wq;
+ /*
+ * Protect concurrent access to mq_usage_counter by
+ * percpu_ref_switch_to_percpu(), percpu_ref_kill(), and
+ * percpu_ref_reinit().
+ */
+ struct mutex mq_freeze_lock;
struct percpu_ref mq_usage_counter;
struct list_head all_q_node;
--
1.9.1
CPU hotplug handling for blk-mq (blk_mq_queue_reinit) acquires
all_q_mutex in blk_mq_queue_reinit_notify() and then removes sysfs
entries via blk_mq_sysfs_unregister(). Removing a sysfs entry blocks
until the active reference of its kernfs_node drops to zero.
On the other hand, reading blk_mq_hw_sysfs_cpu sysfs entry (e.g.
/sys/block/nullb0/mq/0/cpu_list) acquires all_q_mutex in
blk_mq_hw_sysfs_cpus_show().
If these happen at the same time, a deadlock can occur, because one
side waits for the active reference to drop to zero while holding
all_q_mutex, and the other side tries to acquire all_q_mutex while
holding the active reference.
The reason all_q_mutex is acquired in blk_mq_hw_sysfs_cpus_show() is
to avoid reading an incomplete hctx->cpumask. Since reading a blk-mq
sysfs entry already requires acquiring q->sysfs_lock, we can avoid both
the deadlock and reading an incomplete hctx->cpumask by holding
q->sysfs_lock while hctx->cpumask is being updated.
Signed-off-by: Akinobu Mita <[email protected]>
Cc: Jens Axboe <[email protected]>
Cc: Ming Lei <[email protected]>
---
block/blk-mq-sysfs.c | 4 ----
block/blk-mq.c | 17 +++++++----------
block/blk-mq.h | 2 --
3 files changed, 7 insertions(+), 16 deletions(-)
diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c
index 8448513..9549020 100644
--- a/block/blk-mq-sysfs.c
+++ b/block/blk-mq-sysfs.c
@@ -218,8 +218,6 @@ static ssize_t blk_mq_hw_sysfs_cpus_show(struct blk_mq_hw_ctx *hctx, char *page)
unsigned int i, first = 1;
ssize_t ret = 0;
- blk_mq_disable_hotplug();
-
for_each_cpu(i, hctx->cpumask) {
if (first)
ret += sprintf(ret + page, "%u", i);
@@ -229,8 +227,6 @@ static ssize_t blk_mq_hw_sysfs_cpus_show(struct blk_mq_hw_ctx *hctx, char *page)
first = 0;
}
- blk_mq_enable_hotplug();
-
ret += sprintf(ret + page, "\n");
return ret;
}
diff --git a/block/blk-mq.c b/block/blk-mq.c
index f31de35..76fd825 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1815,6 +1815,11 @@ static void blk_mq_map_swqueue(struct request_queue *q,
struct blk_mq_ctx *ctx;
struct blk_mq_tag_set *set = q->tag_set;
+ /*
+ * Avoid others reading imcomplete hctx->cpumask through sysfs
+ */
+ mutex_lock(&q->sysfs_lock);
+
queue_for_each_hw_ctx(q, hctx, i) {
cpumask_clear(hctx->cpumask);
hctx->nr_ctx = 0;
@@ -1835,6 +1840,8 @@ static void blk_mq_map_swqueue(struct request_queue *q,
hctx->ctxs[hctx->nr_ctx++] = ctx;
}
+ mutex_unlock(&q->sysfs_lock);
+
queue_for_each_hw_ctx(q, hctx, i) {
struct blk_mq_ctxmap *map = &hctx->ctx_map;
@@ -2321,16 +2328,6 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
return ret;
}
-void blk_mq_disable_hotplug(void)
-{
- mutex_lock(&all_q_mutex);
-}
-
-void blk_mq_enable_hotplug(void)
-{
- mutex_unlock(&all_q_mutex);
-}
-
static int __init blk_mq_init(void)
{
blk_mq_cpu_init();
diff --git a/block/blk-mq.h b/block/blk-mq.h
index f4fea79..2592682 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -44,8 +44,6 @@ void blk_mq_init_cpu_notifier(struct blk_mq_cpu_notifier *notifier,
void blk_mq_register_cpu_notifier(struct blk_mq_cpu_notifier *notifier);
void blk_mq_unregister_cpu_notifier(struct blk_mq_cpu_notifier *notifier);
void blk_mq_cpu_init(void);
-void blk_mq_enable_hotplug(void);
-void blk_mq_disable_hotplug(void);
/*
* CPU -> queue mappings
--
1.9.1
On Thu, Jul 2, 2015 at 10:29 PM, Akinobu Mita <[email protected]> wrote:
> There is a race between cpu hotplug handling and adding/deleting
> gendisk for blk-mq, where both are trying to register and unregister
> the same sysfs entries.
>
> CPU hotplug handling for blk-mq (blk_mq_queue_reinit) does unregister
> and register sysfs entries of hctx for all request queues in
> all_q_list. On the other hand, these entries for a request queue may
> have already been unregistered or in the middle of registration when
> CPU hotplug event occurs. Because those sysfs entries are registered
> by blk_mq_register_disk() after the request queue is added to
> all_q_list, and similarly the request queue is deleted from
> all_q_list after those are unregistered by blk_mq_unregister_disk().
>
> So we need proper mutual exclusion for those registration and
> unregistration.
But that may not be enough, and you missed another point:
blk_mq_sysfs_register() has to be called after the remapping is built,
so the introduced lock should also have been held in blk_mq_queue_reinit().
>
> Signed-off-by: Akinobu Mita <[email protected]>
> Cc: Jens Axboe <[email protected]>
> Cc: Ming Lei <[email protected]>
> ---
> block/blk-core.c | 1 +
> block/blk-mq-sysfs.c | 44 ++++++++++++++++++++++++++++++++++----------
> include/linux/blkdev.h | 3 +++
> 3 files changed, 38 insertions(+), 10 deletions(-)
>
> diff --git a/block/blk-core.c b/block/blk-core.c
> index 82819e6..bbf67cd 100644
> --- a/block/blk-core.c
> +++ b/block/blk-core.c
> @@ -687,6 +687,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
> __set_bit(QUEUE_FLAG_BYPASS, &q->queue_flags);
>
> init_waitqueue_head(&q->mq_freeze_wq);
> + mutex_init(&q->mq_sysfs_lock);
The above should be put into blk_mq_init_allocated_queue().
>
> if (blkcg_init_queue(q))
> goto fail_bdi;
> diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c
> index b79685e..79a3e8d 100644
> --- a/block/blk-mq-sysfs.c
> +++ b/block/blk-mq-sysfs.c
> @@ -332,13 +332,15 @@ static void blk_mq_unregister_hctx(struct blk_mq_hw_ctx *hctx)
> struct blk_mq_ctx *ctx;
> int i;
>
> - if (!hctx->nr_ctx || !(hctx->flags & BLK_MQ_F_SYSFS_UP))
> + if (!(hctx->flags & BLK_MQ_F_SYSFS_UP))
> return;
It isn't necessary to introduce the above change.
>
> hctx_for_each_ctx(hctx, ctx, i)
> kobject_del(&ctx->kobj);
>
> kobject_del(&hctx->kobj);
> +
> + hctx->flags &= ~BLK_MQ_F_SYSFS_UP;
I guess you misunderstand the flag, which just means the sysfs stuff
for the hctx has been set up, and you needn't clear it.
> }
>
> static int blk_mq_register_hctx(struct blk_mq_hw_ctx *hctx)
> @@ -347,7 +349,7 @@ static int blk_mq_register_hctx(struct blk_mq_hw_ctx *hctx)
> struct blk_mq_ctx *ctx;
> int i, ret;
>
> - if (!hctx->nr_ctx || !(hctx->flags & BLK_MQ_F_SYSFS_UP))
> + if (!hctx->nr_ctx)
> return 0;
>
> ret = kobject_add(&hctx->kobj, &q->mq_kobj, "%u", hctx->queue_num);
> @@ -357,10 +359,12 @@ static int blk_mq_register_hctx(struct blk_mq_hw_ctx *hctx)
> hctx_for_each_ctx(hctx, ctx, i) {
> ret = kobject_add(&ctx->kobj, &hctx->kobj, "cpu%u", ctx->cpu);
> if (ret)
> - break;
> + return ret;
> }
>
> - return ret;
> + hctx->flags |= BLK_MQ_F_SYSFS_UP;
Same with above.
> +
> + return 0;
> }
>
> void blk_mq_unregister_disk(struct gendisk *disk)
> @@ -370,6 +374,8 @@ void blk_mq_unregister_disk(struct gendisk *disk)
> struct blk_mq_ctx *ctx;
> int i, j;
>
> + mutex_lock(&q->mq_sysfs_lock);
> +
> queue_for_each_hw_ctx(q, hctx, i) {
> blk_mq_unregister_hctx(hctx);
>
> @@ -384,6 +390,9 @@ void blk_mq_unregister_disk(struct gendisk *disk)
> kobject_put(&q->mq_kobj);
>
> kobject_put(&disk_to_dev(disk)->kobj);
> +
> + q->mq_sysfs_init_done = false;
The flag isn't needed, but you should hold q->mq_sysfs_lock
inside blk_mq_queue_reinit().
> + mutex_unlock(&q->mq_sysfs_lock);
> }
>
> static void blk_mq_sysfs_init(struct request_queue *q)
> @@ -414,27 +423,30 @@ int blk_mq_register_disk(struct gendisk *disk)
> struct blk_mq_hw_ctx *hctx;
> int ret, i;
>
> + mutex_lock(&q->mq_sysfs_lock);
> +
> blk_mq_sysfs_init(q);
>
> ret = kobject_add(&q->mq_kobj, kobject_get(&dev->kobj), "%s", "mq");
> if (ret < 0)
> - return ret;
> + goto out;
>
> kobject_uevent(&q->mq_kobj, KOBJ_ADD);
>
> queue_for_each_hw_ctx(q, hctx, i) {
> - hctx->flags |= BLK_MQ_F_SYSFS_UP;
Same with above.
> ret = blk_mq_register_hctx(hctx);
> if (ret)
> break;
> }
>
> - if (ret) {
> + if (ret)
> blk_mq_unregister_disk(disk);
> - return ret;
> - }
> + else
> + q->mq_sysfs_init_done = true;
> +out:
> + mutex_unlock(&q->mq_sysfs_lock);
>
> - return 0;
> + return ret;
> }
> EXPORT_SYMBOL_GPL(blk_mq_register_disk);
>
> @@ -443,8 +455,14 @@ void blk_mq_sysfs_unregister(struct request_queue *q)
> struct blk_mq_hw_ctx *hctx;
> int i;
>
> + mutex_lock(&q->mq_sysfs_lock);
> + if (!q->mq_sysfs_init_done)
> + goto out;
> +
> queue_for_each_hw_ctx(q, hctx, i)
> blk_mq_unregister_hctx(hctx);
> +out:
> + mutex_unlock(&q->mq_sysfs_lock);
> }
>
> int blk_mq_sysfs_register(struct request_queue *q)
> @@ -452,11 +470,17 @@ int blk_mq_sysfs_register(struct request_queue *q)
> struct blk_mq_hw_ctx *hctx;
> int i, ret = 0;
>
> + mutex_lock(&q->mq_sysfs_lock);
> + if (!q->mq_sysfs_init_done)
> + goto out;
The above check isn't needed.
> +
> queue_for_each_hw_ctx(q, hctx, i) {
> ret = blk_mq_register_hctx(hctx);
> if (ret)
> break;
> }
> +out:
> + mutex_unlock(&q->mq_sysfs_lock);
>
> return ret;
> }
> diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
> index d4068c1..c56f5a6 100644
> --- a/include/linux/blkdev.h
> +++ b/include/linux/blkdev.h
> @@ -462,6 +462,9 @@ struct request_queue {
>
> struct blk_mq_tag_set *tag_set;
> struct list_head tag_set_list;
> +
> + struct mutex mq_sysfs_lock;
> + bool mq_sysfs_init_done;
Same with above.
> };
>
> #define QUEUE_FLAG_QUEUED 1 /* uses generic tag queueing */
> --
> 1.9.1
>
--
Ming Lei
On Thu, Jul 2, 2015 at 10:29 PM, Akinobu Mita <[email protected]> wrote:
> CPU hotplug handling for blk-mq (blk_mq_queue_reinit) accesses
> q->mq_usage_counter while freezing all request queues in all_q_list.
> On the other hand, q->mq_usage_counter is deinitialized in
> blk_mq_free_queue() before deleting the queue from all_q_list.
>
> So if CPU hotplug event occurs in the window, percpu_ref_kill() is
> called with q->mq_usage_counter which has already been marked dead,
> and it triggers warning. Fix it by deleting the queue from all_q_list
> earlier than destroying q->mq_usage_counter.
>
> Signed-off-by: Akinobu Mita <[email protected]>
> Cc: Jens Axboe <[email protected]>
> Cc: Ming Lei <[email protected]>
Reviewed-by: Ming Lei <[email protected]>
> ---
> block/blk-mq.c | 7 ++++---
> 1 file changed, 4 insertions(+), 3 deletions(-)
>
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index c5da908..ff72e18 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -2070,15 +2070,16 @@ void blk_mq_free_queue(struct request_queue *q)
> {
> struct blk_mq_tag_set *set = q->tag_set;
>
> + mutex_lock(&all_q_mutex);
> + list_del_init(&q->all_q_node);
> + mutex_unlock(&all_q_mutex);
> +
> blk_mq_del_queue_tag_set(q);
>
> blk_mq_exit_hw_queues(q, set, set->nr_hw_queues);
> blk_mq_free_hw_queues(q, set);
>
> percpu_ref_exit(&q->mq_usage_counter);
> - mutex_lock(&all_q_mutex);
> - list_del_init(&q->all_q_node);
> - mutex_unlock(&all_q_mutex);
> }
>
> /* Basically redo blk_mq_init_queue with queue frozen */
> --
> 1.9.1
>
--
Ming Lei
On Thu, Jul 2, 2015 at 10:29 PM, Akinobu Mita <[email protected]> wrote:
> CPU hotplug handling for blk-mq (blk_mq_queue_reinit) updates
> q->mq_map by blk_mq_update_queue_map() for all request queues in
> all_q_list. On the other hand, q->mq_map is released before deleting
> the queue from all_q_list.
>
> So if CPU hotplug event occurs in the window, invalid memory access
> can happen. Fix it by releasing q->mq_map in blk_mq_release() to make
> it happen later than removal from all_q_list.
>
> Signed-off-by: Akinobu Mita <[email protected]>
> Suggested-by: Ming Lei <[email protected]>
> Cc: Jens Axboe <[email protected]>
> Cc: Ming Lei <[email protected]>
Reviewed-by: Ming Lei <[email protected]>
> ---
> block/blk-mq.c | 8 +++-----
> 1 file changed, 3 insertions(+), 5 deletions(-)
>
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index f537796..c5da908 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -1928,6 +1928,9 @@ void blk_mq_release(struct request_queue *q)
> kfree(hctx);
> }
>
> + kfree(q->mq_map);
> + q->mq_map = NULL;
> +
> kfree(q->queue_hw_ctx);
>
> /* ctx kobj stays in queue_ctx */
> @@ -2073,11 +2076,6 @@ void blk_mq_free_queue(struct request_queue *q)
> blk_mq_free_hw_queues(q, set);
>
> percpu_ref_exit(&q->mq_usage_counter);
> -
> - kfree(q->mq_map);
> -
> - q->mq_map = NULL;
> -
> mutex_lock(&all_q_mutex);
> list_del_init(&q->all_q_node);
> mutex_unlock(&all_q_mutex);
> --
> 1.9.1
>
--
Ming Lei
On Thu, 2 Jul 2015 23:29:56 +0900
Akinobu Mita <[email protected]> wrote:
> There are several race conditions while freezing queue.
>
> When unfreezing queue, there is a small window between decrementing
> q->mq_freeze_depth to zero and percpu_ref_reinit() call with
> q->mq_usage_counter. If the other calls blk_mq_freeze_queue_start()
> in the window, q->mq_freeze_depth is increased from zero to one and
> percpu_ref_kill() is called with q->mq_usage_counter which is already
> killed. percpu refcount should be re-initialized before killed again.
>
> Also, there is a race condition while switching to percpu mode.
> percpu_ref_switch_to_percpu() and percpu_ref_kill() must not be
> executed at the same time as the following scenario is possible:
>
> 1. q->mq_usage_counter is initialized in atomic mode.
> (atomic counter: 1)
>
> 2. After the disk registration, a process like systemd-udev starts
> accessing the disk, and successfully increases refcount successfully
> by percpu_ref_tryget_live() in blk_mq_queue_enter().
> (atomic counter: 2)
>
> 3. In the final stage of initialization, q->mq_usage_counter is being
> switched to percpu mode by percpu_ref_switch_to_percpu() in
> blk_mq_finish_init(). But if CONFIG_PREEMPT_VOLUNTARY is enabled,
> the process is rescheduled in the middle of switching when calling
> wait_event() in __percpu_ref_switch_to_percpu().
> (atomic counter: 2)
>
> 4. CPU hotplug handling for blk-mq calls percpu_ref_kill() to freeze
> request queue. q->mq_usage_counter is decreased and marked as
> DEAD. Wait until all requests have finished.
> (atomic counter: 1)
>
> 5. The process rescheduled in the step 3. is resumed and finishes
> all remaining work in __percpu_ref_switch_to_percpu().
> A bias value is added to atomic counter of q->mq_usage_counter.
> (atomic counter: PERCPU_COUNT_BIAS + 1)
>
> 6. A request issued in the step 2. is finished and q->mq_usage_counter
> is decreased by blk_mq_queue_exit(). q->mq_usage_counter is DEAD,
> so atomic counter is decreased and no release handler is called.
> (atomic counter: PERCPU_COUNT_BIAS)
>
> 7. CPU hotplug handling in the step 4. will wait forever as
> q->mq_usage_counter will never be zero.
>
> Also, percpu_ref_reinit() and percpu_ref_kill() must not be executed
> at the same time. Because both functions could call
> __percpu_ref_switch_to_percpu() which adds the bias value and
> initialize percpu counter.
>
> Fix those races by serializing with per-queue mutex.
This patch looks fine, since at least changing the DEAD state of a
percpu ref should be synchronized by the caller.
Also it looks like __percpu_ref_switch_to_percpu() needs to check
whether the refcount became dead during the current switch, and it
seems something like the following is needed:
diff --git a/lib/percpu-refcount.c b/lib/percpu-refcount.c
index 6111bcb..2532949 100644
--- a/lib/percpu-refcount.c
+++ b/lib/percpu-refcount.c
@@ -235,6 +235,10 @@ static void __percpu_ref_switch_to_percpu(struct percpu_ref *ref)
wait_event(percpu_ref_switch_waitq, !ref->confirm_switch);
+ /* dying or dead ref can't be switched to percpu mode w/o reinit */
+ if (ref->percpu_count_ptr & __PERCPU_REF_DEAD)
+ return;
+
atomic_long_add(PERCPU_COUNT_BIAS, &ref->count);
/*
>
> Signed-off-by: Akinobu Mita <[email protected]>
> Cc: Jens Axboe <[email protected]>
> Cc: Ming Lei <[email protected]>
> ---
> block/blk-core.c | 1 +
> block/blk-mq-sysfs.c | 2 ++
> block/blk-mq.c | 8 ++++++++
> include/linux/blkdev.h | 6 ++++++
> 4 files changed, 17 insertions(+)
>
> diff --git a/block/blk-core.c b/block/blk-core.c
> index bbf67cd..f3c5ae2 100644
> --- a/block/blk-core.c
> +++ b/block/blk-core.c
> @@ -687,6 +687,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
> __set_bit(QUEUE_FLAG_BYPASS, &q->queue_flags);
>
> init_waitqueue_head(&q->mq_freeze_wq);
> + mutex_init(&q->mq_freeze_lock);
> mutex_init(&q->mq_sysfs_lock);
>
> if (blkcg_init_queue(q))
> diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c
> index 79a3e8d..8448513 100644
> --- a/block/blk-mq-sysfs.c
> +++ b/block/blk-mq-sysfs.c
> @@ -413,7 +413,9 @@ static void blk_mq_sysfs_init(struct request_queue *q)
> /* see blk_register_queue() */
> void blk_mq_finish_init(struct request_queue *q)
> {
> + mutex_lock(&q->mq_freeze_lock);
> percpu_ref_switch_to_percpu(&q->mq_usage_counter);
> + mutex_unlock(&q->mq_freeze_lock);
> }
>
> int blk_mq_register_disk(struct gendisk *disk)
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index ad07373..f31de35 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -115,11 +115,15 @@ void blk_mq_freeze_queue_start(struct request_queue *q)
> {
> int freeze_depth;
>
> + mutex_lock(&q->mq_freeze_lock);
> +
> freeze_depth = atomic_inc_return(&q->mq_freeze_depth);
> if (freeze_depth == 1) {
> percpu_ref_kill(&q->mq_usage_counter);
> blk_mq_run_hw_queues(q, false);
> }
> +
> + mutex_unlock(&q->mq_freeze_lock);
> }
> EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_start);
>
> @@ -143,12 +147,16 @@ void blk_mq_unfreeze_queue(struct request_queue *q)
> {
> int freeze_depth;
>
> + mutex_lock(&q->mq_freeze_lock);
> +
> freeze_depth = atomic_dec_return(&q->mq_freeze_depth);
> WARN_ON_ONCE(freeze_depth < 0);
> if (!freeze_depth) {
> percpu_ref_reinit(&q->mq_usage_counter);
> wake_up_all(&q->mq_freeze_wq);
> }
> +
> + mutex_unlock(&q->mq_freeze_lock);
> }
> EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue);
>
> diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
> index c56f5a6..0bf8bea 100644
> --- a/include/linux/blkdev.h
> +++ b/include/linux/blkdev.h
> @@ -457,6 +457,12 @@ struct request_queue {
> #endif
> struct rcu_head rcu_head;
> wait_queue_head_t mq_freeze_wq;
> + /*
> + * Protect concurrent access to mq_usage_counter by
> + * percpu_ref_switch_to_percpu(), percpu_ref_kill(), and
> + * percpu_ref_reinit().
> + */
> + struct mutex mq_freeze_lock;
> struct percpu_ref mq_usage_counter;
> struct list_head all_q_node;
>
On Thu, Jul 2, 2015 at 10:29 PM, Akinobu Mita <[email protected]> wrote:
> Notifier callbacks for CPU_ONLINE action can be run on the other CPU
> than the CPU which was just onlined. So it is possible for the
> process running on the just onlined CPU to insert request and run
> hw queue before establishing new mapping which is done by
> blk_mq_queue_reinit_notify().
>
> This can cause a problem when the CPU has just been onlined first time
> since the request queue was initialized. At this time ctx->index_hw
> for the CPU, which is the index in hctx->ctxs[] for this ctx, is still
> zero before blk_mq_queue_reinit_notify() is called by notifier
> callbacks for CPU_ONLINE action.
>
> For example, there is a single hw queue (hctx) and two CPU queues
> (ctx0 for CPU0, and ctx1 for CPU1). Now CPU1 is just onlined and
> a request is inserted into ctx1->rq_list and set bit0 in pending
> bitmap as ctx1->index_hw is still zero.
>
> And then while running hw queue, flush_busy_ctxs() finds bit0 is set
> in pending bitmap and tries to retrieve requests in
> hctx->ctxs[0]->rq_list. But hctx->ctxs[0] is a pointer to ctx0, so the
> request in ctx1->rq_list is ignored.
>
> Fix it by ensuring that new mapping is established before onlined cpu
> starts running.
>
> Signed-off-by: Akinobu Mita <[email protected]>
> Cc: Jens Axboe <[email protected]>
> Cc: Ming Lei <[email protected]>
> ---
> block/blk-mq-cpumap.c | 9 +++++----
> block/blk-mq.c | 41 ++++++++++++++++++++++++++---------------
> block/blk-mq.h | 3 ++-
> 3 files changed, 33 insertions(+), 20 deletions(-)
>
> diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c
> index 1e28ddb..8764c24 100644
> --- a/block/blk-mq-cpumap.c
> +++ b/block/blk-mq-cpumap.c
> @@ -31,7 +31,8 @@ static int get_first_sibling(unsigned int cpu)
> return cpu;
> }
>
> -int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues)
> +int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues,
> + const struct cpumask *online_mask)
> {
> unsigned int i, nr_cpus, nr_uniq_cpus, queue, first_sibling;
> cpumask_var_t cpus;
> @@ -41,7 +42,7 @@ int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues)
>
> cpumask_clear(cpus);
> nr_cpus = nr_uniq_cpus = 0;
> - for_each_online_cpu(i) {
> + for_each_cpu(i, online_mask) {
> nr_cpus++;
> first_sibling = get_first_sibling(i);
> if (!cpumask_test_cpu(first_sibling, cpus))
> @@ -51,7 +52,7 @@ int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues)
>
> queue = 0;
> for_each_possible_cpu(i) {
> - if (!cpu_online(i)) {
> + if (!cpumask_test_cpu(i, online_mask)) {
> map[i] = 0;
> continue;
> }
> @@ -95,7 +96,7 @@ unsigned int *blk_mq_make_queue_map(struct blk_mq_tag_set *set)
> if (!map)
> return NULL;
>
> - if (!blk_mq_update_queue_map(map, set->nr_hw_queues))
> + if (!blk_mq_update_queue_map(map, set->nr_hw_queues, cpu_online_mask))
> return map;
>
> kfree(map);
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index ff72e18..ad07373 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -1799,7 +1799,8 @@ static void blk_mq_init_cpu_queues(struct request_queue *q,
> }
> }
>
> -static void blk_mq_map_swqueue(struct request_queue *q)
> +static void blk_mq_map_swqueue(struct request_queue *q,
> + const struct cpumask *online_mask)
> {
> unsigned int i;
> struct blk_mq_hw_ctx *hctx;
> @@ -1816,7 +1817,7 @@ static void blk_mq_map_swqueue(struct request_queue *q)
> */
> queue_for_each_ctx(q, ctx, i) {
> /* If the cpu isn't online, the cpu is mapped to first hctx */
> - if (!cpu_online(i))
> + if (!cpumask_test_cpu(i, online_mask))
> continue;
>
> hctx = q->mq_ops->map_queue(q, i);
> @@ -2046,7 +2047,10 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
>
> blk_mq_add_queue_tag_set(set, q);
>
> - blk_mq_map_swqueue(q);
> + get_online_cpus();
> + blk_mq_update_queue_map(q->mq_map, q->nr_hw_queues, cpu_online_mask);
> + blk_mq_map_swqueue(q, cpu_online_mask);
> + put_online_cpus();
>
> return q;
>
> @@ -2083,13 +2087,14 @@ void blk_mq_free_queue(struct request_queue *q)
> }
>
> /* Basically redo blk_mq_init_queue with queue frozen */
> -static void blk_mq_queue_reinit(struct request_queue *q)
> +static void blk_mq_queue_reinit(struct request_queue *q,
> + const struct cpumask *online_mask)
> {
> WARN_ON_ONCE(!atomic_read(&q->mq_freeze_depth));
>
> blk_mq_sysfs_unregister(q);
>
> - blk_mq_update_queue_map(q->mq_map, q->nr_hw_queues);
> + blk_mq_update_queue_map(q->mq_map, q->nr_hw_queues, online_mask);
>
> /*
> * redo blk_mq_init_cpu_queues and blk_mq_init_hw_queues. FIXME: maybe
> @@ -2097,25 +2102,31 @@ static void blk_mq_queue_reinit(struct request_queue *q)
> * involves free and re-allocate memory, worthy doing?)
> */
>
> - blk_mq_map_swqueue(q);
> + blk_mq_map_swqueue(q, online_mask);
>
> blk_mq_sysfs_register(q);
> }
>
> +static struct cpumask online_new;
Better to comment on why there isn't a race on this single global
variable.
> +
> static int blk_mq_queue_reinit_notify(struct notifier_block *nb,
> unsigned long action, void *hcpu)
> {
> struct request_queue *q;
> + int cpu = (unsigned long)hcpu;
>
> - /*
> - * Before new mappings are established, hotadded cpu might already
> - * start handling requests. This doesn't break anything as we map
> - * offline CPUs to first hardware queue. We will re-init the queue
> - * below to get optimal settings.
> - */
You should put some comment here about why the change is introduced
because the race isn't very obvious.
> - if (action != CPU_DEAD && action != CPU_DEAD_FROZEN &&
> - action != CPU_ONLINE && action != CPU_ONLINE_FROZEN)
> + switch (action & ~CPU_TASKS_FROZEN) {
> + case CPU_DEAD:
> + case CPU_UP_CANCELED:
> + cpumask_copy(&online_new, cpu_online_mask);
> + break;
> + case CPU_UP_PREPARE:
> + cpumask_copy(&online_new, cpu_online_mask);
> + cpumask_set_cpu(cpu, &online_new);
> + break;
> + default:
> return NOTIFY_OK;
> + }
>
> mutex_lock(&all_q_mutex);
>
> @@ -2139,7 +2150,7 @@ static int blk_mq_queue_reinit_notify(struct notifier_block *nb,
> }
>
> list_for_each_entry(q, &all_q_list, all_q_node)
> - blk_mq_queue_reinit(q);
> + blk_mq_queue_reinit(q, &online_new);
>
> list_for_each_entry(q, &all_q_list, all_q_node)
> blk_mq_unfreeze_queue(q);
> diff --git a/block/blk-mq.h b/block/blk-mq.h
> index 6a48c4c..f4fea79 100644
> --- a/block/blk-mq.h
> +++ b/block/blk-mq.h
> @@ -51,7 +51,8 @@ void blk_mq_disable_hotplug(void);
> * CPU -> queue mappings
> */
> extern unsigned int *blk_mq_make_queue_map(struct blk_mq_tag_set *set);
> -extern int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues);
> +extern int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues,
> + const struct cpumask *online_mask);
> extern int blk_mq_hw_queue_to_node(unsigned int *map, unsigned int);
>
> /*
> --
> 1.9.1
>
--
Ming Lei
Hi Ming,
2015-07-08 20:48 GMT+09:00 Ming Lei <[email protected]>:
> On Thu, Jul 2, 2015 at 10:29 PM, Akinobu Mita <[email protected]> wrote:
>> There is a race between cpu hotplug handling and adding/deleting
>> gendisk for blk-mq, where both are trying to register and unregister
>> the same sysfs entries.
>>
>> CPU hotplug handling for blk-mq (blk_mq_queue_reinit) does unregister
>> and register sysfs entries of hctx for all request queues in
>> all_q_list. On the other hand, these entries for a request queue may
>> have already been unregistered or in the middle of registration when
>> CPU hotplug event occurs. Because those sysfs entries are registered
>> by blk_mq_register_disk() after the request queue is added to
>> all_q_list, and similarly the request queue is deleted from
>> all_q_list after those are unregistered by blk_mq_unregister_disk().
>>
>> So we need proper mutual exclusion for those registration and
>> unregistration.
>
> But that may not be enough, and you miss another point in which
> blk_mq_sysfs_register() has to be called after the remapping is built,
> so the introduced lock should have been held in blk_mq_queue_reinit().
OK.
Although q->mq_sysfs_lock is held in blk_mq_sysfs_register() by
this patch, it should also be held in blk_mq_queue_reinit() as you
suggested. Since hctx->nr_ctx is reset temporarily in
blk_mq_map_swqueue() while the cpu hotplug callback runs, we should
prevent blk_mq_register_disk() from seeing the temporary
hctx->nr_ctx value (i.e. zero) at the same time, by holding
q->mq_sysfs_lock in blk_mq_queue_reinit().
>> Signed-off-by: Akinobu Mita <[email protected]>
>> Cc: Jens Axboe <[email protected]>
>> Cc: Ming Lei <[email protected]>
>> ---
>> block/blk-core.c | 1 +
>> block/blk-mq-sysfs.c | 44 ++++++++++++++++++++++++++++++++++----------
>> include/linux/blkdev.h | 3 +++
>> 3 files changed, 38 insertions(+), 10 deletions(-)
>>
>> diff --git a/block/blk-core.c b/block/blk-core.c
>> index 82819e6..bbf67cd 100644
>> --- a/block/blk-core.c
>> +++ b/block/blk-core.c
>> @@ -687,6 +687,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
>> __set_bit(QUEUE_FLAG_BYPASS, &q->queue_flags);
>>
>> init_waitqueue_head(&q->mq_freeze_wq);
>> + mutex_init(&q->mq_sysfs_lock);
>
> The above should be put into blk_mq_init_allocated_queue().
OK, make sense.
>>
>> if (blkcg_init_queue(q))
>> goto fail_bdi;
>> diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c
>> index b79685e..79a3e8d 100644
>> --- a/block/blk-mq-sysfs.c
>> +++ b/block/blk-mq-sysfs.c
>> @@ -332,13 +332,15 @@ static void blk_mq_unregister_hctx(struct blk_mq_hw_ctx *hctx)
>> struct blk_mq_ctx *ctx;
>> int i;
>>
>> - if (!hctx->nr_ctx || !(hctx->flags & BLK_MQ_F_SYSFS_UP))
>> + if (!(hctx->flags & BLK_MQ_F_SYSFS_UP))
>> return;
>
> It isn't needed to introduce above change.
>
>>
>> hctx_for_each_ctx(hctx, ctx, i)
>> kobject_del(&ctx->kobj);
>>
>> kobject_del(&hctx->kobj);
>> +
>> + hctx->flags &= ~BLK_MQ_F_SYSFS_UP;
>
> I guess you misunderstand the flag, which just means the syfs stuff
> for the hcts has been setup, and you needn't clear it.
I've changed the meaning of the flag, which should have been
mentioned, and also introduced q->mq_sysfs_init_done. So I admit
that the code has become more complicated.
But after consideration, I think we only need q->mq_sysfs_init_done,
and BLK_MQ_F_SYSFS_UP can be removed. Please see below.
>> }
>>
>> static int blk_mq_register_hctx(struct blk_mq_hw_ctx *hctx)
>> @@ -347,7 +349,7 @@ static int blk_mq_register_hctx(struct blk_mq_hw_ctx *hctx)
>> struct blk_mq_ctx *ctx;
>> int i, ret;
>>
>> - if (!hctx->nr_ctx || !(hctx->flags & BLK_MQ_F_SYSFS_UP))
>> + if (!hctx->nr_ctx)
>> return 0;
>>
>> ret = kobject_add(&hctx->kobj, &q->mq_kobj, "%u", hctx->queue_num);
>> @@ -357,10 +359,12 @@ static int blk_mq_register_hctx(struct blk_mq_hw_ctx *hctx)
>> hctx_for_each_ctx(hctx, ctx, i) {
>> ret = kobject_add(&ctx->kobj, &hctx->kobj, "cpu%u", ctx->cpu);
>> if (ret)
>> - break;
>> + return ret;
>> }
>>
>> - return ret;
>> + hctx->flags |= BLK_MQ_F_SYSFS_UP;
>
> Same with above.
>
>> +
>> + return 0;
>> }
>>
>> void blk_mq_unregister_disk(struct gendisk *disk)
>> @@ -370,6 +374,8 @@ void blk_mq_unregister_disk(struct gendisk *disk)
>> struct blk_mq_ctx *ctx;
>> int i, j;
>>
>> + mutex_lock(&q->mq_sysfs_lock);
>> +
>> queue_for_each_hw_ctx(q, hctx, i) {
>> blk_mq_unregister_hctx(hctx);
>>
>> @@ -384,6 +390,9 @@ void blk_mq_unregister_disk(struct gendisk *disk)
>> kobject_put(&q->mq_kobj);
>>
>> kobject_put(&disk_to_dev(disk)->kobj);
>> +
>> + q->mq_sysfs_init_done = false;
>
> The flag isn't needed, but you should hold q->mq_sysfs_lock
> inside blk_mq_queue_reinit().
The reason for q->mq_sysfs_init_done is to avoid the following
scenario.
While the request queue is on 'all_q_list' (*), blk_mq_queue_reinit()
can be called for the queue at any time by the CPU hotplug callback.
But blk_mq_sysfs_unregister (-) and blk_mq_sysfs_register (+) in
blk_mq_queue_reinit() must not be called before blk_mq_register_disk
(++) has finished, nor after blk_mq_unregister_disk (--), because
'/sys/block/nullb*/mq/' does not exist then.
The BLK_MQ_F_SYSFS_UP flag in hctx->flags fixes this issue partially
now. In order to fix it completely, we just need q->mq_sysfs_init_done
instead of the per-hctx flag, with appropriate locking.
null_add_dev
--> blk_mq_init_queue
--> blk_mq_init_allocated_queue
--> add to 'all_q_list' (*)
--> add_disk
--> blk_register_queue
--> blk_mq_register_disk (++)
null_del_dev
--> del_gendisk
--> blk_unregister_queue
--> blk_mq_unregister_disk (--)
--> blk_cleanup_queue
--> blk_mq_free_queue
--> del from 'all_q_list' (*)
blk_mq_queue_reinit
--> blk_mq_sysfs_unregister (-)
--> blk_mq_sysfs_register (+)
On Thu, Jul 9, 2015 at 10:25 PM, Akinobu Mita <[email protected]> wrote:
> Hi Ming,
>
> 2015-07-08 20:48 GMT+09:00 Ming Lei <[email protected]>:
>> On Thu, Jul 2, 2015 at 10:29 PM, Akinobu Mita <[email protected]> wrote:
>>> There is a race between cpu hotplug handling and adding/deleting
>>> gendisk for blk-mq, where both are trying to register and unregister
>>> the same sysfs entries.
>>>
>>> CPU hotplug handling for blk-mq (blk_mq_queue_reinit) does unregister
>>> and register sysfs entries of hctx for all request queues in
>>> all_q_list. On the other hand, these entries for a request queue may
>>> have already been unregistered or in the middle of registration when
>>> CPU hotplug event occurs. Because those sysfs entries are registered
>>> by blk_mq_register_disk() after the request queue is added to
>>> all_q_list, and similarly the request queue is deleted from
>>> all_q_list after those are unregistered by blk_mq_unregister_disk().
>>>
>>> So we need proper mutual exclusion for those registration and
>>> unregistration.
>>
>> But that may not be enough, and you miss another point in which
>> blk_mq_sysfs_register() has to be called after the remapping is built,
>> so the introduced lock should have been held in blk_mq_queue_reinit().
>
> OK.
>
> Although q->mq_sysfs_lock is held in blk_mq_sysfs_register() by
> this patch, it should be held in blk_mq_queue_reinit() as you
> suggested. Since hctx->nr_ctx is reset temporarily in
> blk_mq_map_swqueue() while cpu hotplug callback is called, so we
> should avoid blk_mq_register_disk() seeing the temporary
> hctx->nr_ctx value (i.e. zero) at the same time by holding
> q->mq_sysfs_lock in blk_mq_queue_reinit().
>
>>> Signed-off-by: Akinobu Mita <[email protected]>
>>> Cc: Jens Axboe <[email protected]>
>>> Cc: Ming Lei <[email protected]>
>>> ---
>>> block/blk-core.c | 1 +
>>> block/blk-mq-sysfs.c | 44 ++++++++++++++++++++++++++++++++++----------
>>> include/linux/blkdev.h | 3 +++
>>> 3 files changed, 38 insertions(+), 10 deletions(-)
>>>
>>> diff --git a/block/blk-core.c b/block/blk-core.c
>>> index 82819e6..bbf67cd 100644
>>> --- a/block/blk-core.c
>>> +++ b/block/blk-core.c
>>> @@ -687,6 +687,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
>>> __set_bit(QUEUE_FLAG_BYPASS, &q->queue_flags);
>>>
>>> init_waitqueue_head(&q->mq_freeze_wq);
>>> + mutex_init(&q->mq_sysfs_lock);
>>
>> The above should be put into blk_mq_init_allocated_queue().
>
> OK, make sense.
>
>>>
>>> if (blkcg_init_queue(q))
>>> goto fail_bdi;
>>> diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c
>>> index b79685e..79a3e8d 100644
>>> --- a/block/blk-mq-sysfs.c
>>> +++ b/block/blk-mq-sysfs.c
>>> @@ -332,13 +332,15 @@ static void blk_mq_unregister_hctx(struct blk_mq_hw_ctx *hctx)
>>> struct blk_mq_ctx *ctx;
>>> int i;
>>>
>>> - if (!hctx->nr_ctx || !(hctx->flags & BLK_MQ_F_SYSFS_UP))
>>> + if (!(hctx->flags & BLK_MQ_F_SYSFS_UP))
>>> return;
>>
>> It isn't needed to introduce above change.
>>
>>>
>>> hctx_for_each_ctx(hctx, ctx, i)
>>> kobject_del(&ctx->kobj);
>>>
>>> kobject_del(&hctx->kobj);
>>> +
>>> + hctx->flags &= ~BLK_MQ_F_SYSFS_UP;
>>
>> I guess you misunderstand the flag, which just means the syfs stuff
>> for the hcts has been setup, and you needn't clear it.
>
> I've changed the meaning of the flag which should have been
> mentioned and also introduced q->mq_sysfs_init_done. So I admit
> that the code has become much complicated.
>
> But after consideration, I think we can only need q->mq_sysfs_init_done
> and BLK_MQ_F_SYSFS_UP can be removed. Please see below.
>
>>> }
>>>
>>> static int blk_mq_register_hctx(struct blk_mq_hw_ctx *hctx)
>>> @@ -347,7 +349,7 @@ static int blk_mq_register_hctx(struct blk_mq_hw_ctx *hctx)
>>> struct blk_mq_ctx *ctx;
>>> int i, ret;
>>>
>>> - if (!hctx->nr_ctx || !(hctx->flags & BLK_MQ_F_SYSFS_UP))
>>> + if (!hctx->nr_ctx)
>>> return 0;
>>>
>>> ret = kobject_add(&hctx->kobj, &q->mq_kobj, "%u", hctx->queue_num);
>>> @@ -357,10 +359,12 @@ static int blk_mq_register_hctx(struct blk_mq_hw_ctx *hctx)
>>> hctx_for_each_ctx(hctx, ctx, i) {
>>> ret = kobject_add(&ctx->kobj, &hctx->kobj, "cpu%u", ctx->cpu);
>>> if (ret)
>>> - break;
>>> + return ret;
>>> }
>>>
>>> - return ret;
>>> + hctx->flags |= BLK_MQ_F_SYSFS_UP;
>>
>> Same with above.
>>
>>> +
>>> + return 0;
>>> }
>>>
>>> void blk_mq_unregister_disk(struct gendisk *disk)
>>> @@ -370,6 +374,8 @@ void blk_mq_unregister_disk(struct gendisk *disk)
>>> struct blk_mq_ctx *ctx;
>>> int i, j;
>>>
>>> + mutex_lock(&q->mq_sysfs_lock);
>>> +
>>> queue_for_each_hw_ctx(q, hctx, i) {
>>> blk_mq_unregister_hctx(hctx);
>>>
>>> @@ -384,6 +390,9 @@ void blk_mq_unregister_disk(struct gendisk *disk)
>>> kobject_put(&q->mq_kobj);
>>>
>>> kobject_put(&disk_to_dev(disk)->kobj);
>>> +
>>> + q->mq_sysfs_init_done = false;
>>
>> The flag isn't needed, but you should hold q->mq_sysfs_lock
>> inside blk_mq_queue_reinit().
>
> The reason for q->mq_sysfs_init_done is to avoid the following
> scenario.
>
> While the request queue is added to 'all_q_list' (*),
> blk_mq_queue_reinit() can be called for the queue anytime by CPU
> hotplug callback. But blk_mq_sysfs_unregister (-) and
> blk_mq_sysfs_register (+) in blk_mq_queue_reinit must not be
> called before blk_mq_register_disk (++) is finished and after
> blk_mq_unregister_disk (--). Because '/sys/block/nullb*/mq/' is
> not exists.
Yes.
>
> BLK_MQ_F_SYSFS_UP flag in hctx->flags fixes this issue partially now.
> In order to fix it completely, we just need q->mq_sysfs_init_done
> instead of per hctx flag with appropriate locking.
One flag should be OK, but keeping both just makes things
complicated, and it looks like mq_sysfs_init_done is better.
>
> null_add_dev
> --> blk_mq_init_queue
> --> blk_mq_init_allocated_queue
> --> add to 'all_q_list' (*)
> --> add_disk
> --> blk_register_queue
> --> blk_mq_register_disk (++)
>
> null_del_dev
> --> del_gendisk
> --> blk_unregister_queue
> --> blk_mq_unregister_disk (--)
> --> blk_cleanup_queue
> --> blk_mq_free_queue
> --> del from 'all_q_list' (*)
>
> blk_mq_queue_reinit
> --> blk_mq_sysfs_unregister (-)
> --> blk_mq_sysfs_register (+)
--
Ming Lei
On Thu, Jul 02, 2015 at 11:29:56PM +0900, Akinobu Mita wrote:
> There are several race conditions while freezing queue.
...
> Also, percpu_ref_reinit() and percpu_ref_kill() must not be executed
> at the same time. Because both functions could call
> __percpu_ref_switch_to_percpu() which adds the bias value and
> initialize percpu counter.
>
> Fix those races by serializing with per-queue mutex.
>
> Signed-off-by: Akinobu Mita <[email protected]>
> Cc: Jens Axboe <[email protected]>
> Cc: Ming Lei <[email protected]>
Yeap, all of the kill/reinit/switch mode functions assume caller side
synchronization.
Acked-by: Tejun Heo <[email protected]>
Thanks.
--
tejun
On Thu, Jul 09, 2015 at 02:55:10PM +0800, Ming Lei wrote:
> This patch looks fine since at least changing DEAD state of percpu ref
> state should have been synchronized by caller.
>
> Also looks __percpu_ref_switch_to_percpu() need to check if the refcount
> becomes dead after current switching, and seems something like following
> is needed:
Hmmm... but the caller is responsible for synchronization for these
operations. If the state changes while waiting, it indicates a
synchronization bug and testing it one more time is kinda weird. What
if it changes right after that test?
Thanks.
--
tejun
On Fri, Jul 10, 2015 at 5:37 AM, Tejun Heo <[email protected]> wrote:
> On Thu, Jul 09, 2015 at 02:55:10PM +0800, Ming Lei wrote:
>> This patch looks fine since at least changing DEAD state of percpu ref
>> state should have been synchronized by caller.
>>
>> Also looks __percpu_ref_switch_to_percpu() need to check if the refcount
>> becomes dead after current switching, and seems something like following
>> is needed:
>
> Hmmm... but the caller is responsible for synchronization for these
> operations. If the state changes while waiting, it indicates a
> synchronization bug and testing it one more time is kinda weird. What
> if it changes right after that test?
OK.
Also it looks helpful to detect the bug early by adding
WARN_ON(ref->percpu_count_ptr & __PERCPU_REF_DEAD)
after the wait_event().
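E.g. something like the following (untested, just to illustrate the idea):
--- a/lib/percpu-refcount.c
+++ b/lib/percpu-refcount.c
@@ -235,6 +235,9 @@ static void __percpu_ref_switch_to_percpu(struct percpu_ref *ref)
 	wait_event(percpu_ref_switch_waitq, !ref->confirm_switch);
 
+	/* a dying/dead ref must not be switched to percpu mode w/o reinit */
+	WARN_ON(ref->percpu_count_ptr & __PERCPU_REF_DEAD);
+
 	atomic_long_add(PERCPU_COUNT_BIAS, &ref->count);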
--
Ming Lei