Drivers that need to perform synchronous, blocking operations to do I/O generally
want to defer all I/O to a driver-private workqueue. Examples are the loop
driver, rbd, and the ubi block driver, and probably many more that haven't
been evaluated yet.
We have various block drivers that need to execute long term blocking
operations during I/O submission like file system or network I/O.
Currently these drivers just queue up work to an internal workqueue
from their request_fn. With blk-mq we can make sure they always get
called on their own workqueue directly for I/O submission by:
1) adding a flag to prevent inline submission of I/O, and
2) allowing the driver to pass in a workqueue in the tag_set that
will be used instead of kblockd.
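As a minimal sketch of how a driver might wire this up once the patch is
applied (all "myblk" identifiers, the queue depth and the workqueue flags
are illustrative assumptions, not part of the patch):

#include <linux/blk-mq.h>
#include <linux/workqueue.h>

/*
 * Hypothetical blocking driver opting into its own submission workqueue
 * via the tag_set->wq field and BLK_MQ_F_WORKQUEUE flag added here.
 * myblk_mq_ops and struct myblk_cmd are assumed to be defined elsewhere
 * in the driver.
 */
static struct workqueue_struct *myblk_wq;
static struct blk_mq_tag_set myblk_tag_set;

static int myblk_setup_tag_set(void)
{
        myblk_wq = alloc_workqueue("myblk", WQ_MEM_RECLAIM, 0);
        if (!myblk_wq)
                return -ENOMEM;

        myblk_tag_set.ops = &myblk_mq_ops;
        myblk_tag_set.nr_hw_queues = 1;
        myblk_tag_set.queue_depth = 128;
        myblk_tag_set.numa_node = NUMA_NO_NODE;
        myblk_tag_set.cmd_size = sizeof(struct myblk_cmd);
        /* never run the hardware queue inline; always bounce to myblk_wq */
        myblk_tag_set.flags = BLK_MQ_F_WORKQUEUE;
        myblk_tag_set.wq = myblk_wq;

        return blk_mq_alloc_tag_set(&myblk_tag_set);
}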
Signed-off-by: Christoph Hellwig <[email protected]>
---
block/blk-core.c | 2 +-
block/blk-mq.c | 12 +++++++++---
block/blk.h | 1 +
include/linux/blk-mq.h | 4 ++++
4 files changed, 15 insertions(+), 4 deletions(-)
diff --git a/block/blk-core.c b/block/blk-core.c
index 0421b53..7f7249f 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -61,7 +61,7 @@ struct kmem_cache *blk_requestq_cachep;
/*
* Controlling structure to kblockd
*/
-static struct workqueue_struct *kblockd_workqueue;
+struct workqueue_struct *kblockd_workqueue;
void blk_queue_congestion_threshold(struct request_queue *q)
{
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 22e50a5..3d27d22 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -804,12 +804,13 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state)))
return;
- if (!async && cpumask_test_cpu(smp_processor_id(), hctx->cpumask)) {
+ if (!async && !(hctx->flags & BLK_MQ_F_WORKQUEUE) &&
+ cpumask_test_cpu(smp_processor_id(), hctx->cpumask)) {
__blk_mq_run_hw_queue(hctx);
return;
}
- kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx),
+ queue_delayed_work_on(blk_mq_hctx_next_cpu(hctx), hctx->wq,
&hctx->run_work, 0);
}
@@ -908,7 +909,7 @@ static void blk_mq_delay_work_fn(struct work_struct *work)
void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs)
{
- kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx),
+ queue_delayed_work_on(blk_mq_hctx_next_cpu(hctx), hctx->wq,
&hctx->delay_work, msecs_to_jiffies(msecs));
}
EXPORT_SYMBOL(blk_mq_delay_queue);
@@ -1581,6 +1582,11 @@ static int blk_mq_init_hctx(struct request_queue *q,
hctx->flags = set->flags;
hctx->cmd_size = set->cmd_size;
+ if (set->wq)
+ hctx->wq = set->wq;
+ else
+ hctx->wq = kblockd_workqueue;
+
blk_mq_init_cpu_notifier(&hctx->cpu_notifier,
blk_mq_hctx_notify, hctx);
blk_mq_register_cpu_notifier(&hctx->cpu_notifier);
diff --git a/block/blk.h b/block/blk.h
index 43b0361..fb46ad0 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -25,6 +25,7 @@ struct blk_flush_queue {
spinlock_t mq_flush_lock;
};
+extern struct workqueue_struct *kblockd_workqueue;
extern struct kmem_cache *blk_requestq_cachep;
extern struct kmem_cache *request_cachep;
extern struct kobj_type blk_queue_ktype;
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 5a901d0..ebe4699 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -37,6 +37,8 @@ struct blk_mq_hw_ctx {
unsigned int queue_num;
struct blk_flush_queue *fq;
+ struct workqueue_struct *wq;
+
void *driver_data;
struct blk_mq_ctxmap ctx_map;
@@ -64,6 +66,7 @@ struct blk_mq_hw_ctx {
struct blk_mq_tag_set {
struct blk_mq_ops *ops;
+ struct workqueue_struct *wq;
unsigned int nr_hw_queues;
unsigned int queue_depth; /* max hw supported */
unsigned int reserved_tags;
@@ -156,6 +159,7 @@ enum {
BLK_MQ_F_SG_MERGE = 1 << 2,
BLK_MQ_F_SYSFS_UP = 1 << 3,
BLK_MQ_F_DEFER_ISSUE = 1 << 4,
+ BLK_MQ_F_WORKQUEUE = 1 << 5,
BLK_MQ_S_STOPPED = 0,
BLK_MQ_S_TAG_ACTIVE = 1,
--
1.9.1
Don't duplicate the code that handles the case where work is not bounced
to a specific CPU in the callers; handle it inside blk_mq_hctx_next_cpu
instead.
Signed-off-by: Christoph Hellwig <[email protected]>
---
block/blk-mq.c | 34 +++++++++++++---------------------
1 file changed, 13 insertions(+), 21 deletions(-)
diff --git a/block/blk-mq.c b/block/blk-mq.c
index b355b59..22e50a5 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -780,10 +780,11 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
*/
static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)
{
- int cpu = hctx->next_cpu;
+ if (hctx->queue->nr_hw_queues == 1)
+ return WORK_CPU_UNBOUND;
if (--hctx->next_cpu_batch <= 0) {
- int next_cpu;
+ int cpu = hctx->next_cpu, next_cpu;
next_cpu = cpumask_next(hctx->next_cpu, hctx->cpumask);
if (next_cpu >= nr_cpu_ids)
@@ -791,9 +792,11 @@ static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)
hctx->next_cpu = next_cpu;
hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
+
+ return cpu;
}
- return cpu;
+ return hctx->next_cpu;
}
void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
@@ -801,16 +804,13 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state)))
return;
- if (!async && cpumask_test_cpu(smp_processor_id(), hctx->cpumask))
+ if (!async && cpumask_test_cpu(smp_processor_id(), hctx->cpumask)) {
__blk_mq_run_hw_queue(hctx);
- else if (hctx->queue->nr_hw_queues == 1)
- kblockd_schedule_delayed_work(&hctx->run_work, 0);
- else {
- unsigned int cpu;
-
- cpu = blk_mq_hctx_next_cpu(hctx);
- kblockd_schedule_delayed_work_on(cpu, &hctx->run_work, 0);
+ return;
}
+
+ kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx),
+ &hctx->run_work, 0);
}
void blk_mq_run_queues(struct request_queue *q, bool async)
@@ -908,16 +908,8 @@ static void blk_mq_delay_work_fn(struct work_struct *work)
void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs)
{
- unsigned long tmo = msecs_to_jiffies(msecs);
-
- if (hctx->queue->nr_hw_queues == 1)
- kblockd_schedule_delayed_work(&hctx->delay_work, tmo);
- else {
- unsigned int cpu;
-
- cpu = blk_mq_hctx_next_cpu(hctx);
- kblockd_schedule_delayed_work_on(cpu, &hctx->delay_work, tmo);
- }
+ kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx),
+ &hctx->delay_work, msecs_to_jiffies(msecs));
}
EXPORT_SYMBOL(blk_mq_delay_queue);
--
1.9.1
Hi Christoph,
On Mon, Nov 3, 2014 at 4:23 PM, Christoph Hellwig <[email protected]> wrote:
> We have various block drivers that need to execute long term blocking
> operations during I/O submission like file system or network I/O.
>
> Currently these drivers just queue up work to an internal workqueue
> from their request_fn. With blk-mq we can make sure they always get
> called on their own workqueue directly for I/O submission by:
>
> 1) adding a flag to prevent inline submission of I/O, and
> 2) allowing the driver to pass in a workqueue in the tag_set that
> will be used instead of kblockd.
The above two aren't enough, because the big problem is that
drivers need a per-request work structure instead of 'hctx->run_work';
otherwise there are at most NR_CPUS concurrent submissions.
So a per-request work structure should also be exposed to blk-mq
for this kind of usage, for example via a .blk_mq_req_work(req)
callback in the BLK_MQ_F_WORKQUEUE case.
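Roughly, the per-request scheme could look like the sketch below: the work
item lives in the driver's per-command pdu and is queued from ->queue_rq,
so concurrency is bounded by the number of outstanding requests rather than
by the number of hardware contexts. All names here are hypothetical and not
part of the posted patch.

#include <linux/blk-mq.h>
#include <linux/workqueue.h>

static struct workqueue_struct *myblk_wq;       /* allocated at driver init */

struct myblk_cmd {
        struct request *rq;
        struct work_struct work;
};

static void myblk_submit_work(struct work_struct *work)
{
        struct myblk_cmd *cmd = container_of(work, struct myblk_cmd, work);

        /* ... perform the blocking file system or network I/O here ... */
        blk_mq_end_request(cmd->rq, 0);
}

/* called from the driver's ->queue_rq implementation */
static void myblk_defer_request(struct request *rq)
{
        struct myblk_cmd *cmd = blk_mq_rq_to_pdu(rq);

        cmd->rq = rq;
        INIT_WORK(&cmd->work, myblk_submit_work);
        queue_work(myblk_wq, &cmd->work);
}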
Thanks,
>
> Signed-off-by: Christoph Hellwig <[email protected]>
> ---
> block/blk-core.c | 2 +-
> block/blk-mq.c | 12 +++++++++---
> block/blk.h | 1 +
> include/linux/blk-mq.h | 4 ++++
> 4 files changed, 15 insertions(+), 4 deletions(-)
>
> diff --git a/block/blk-core.c b/block/blk-core.c
> index 0421b53..7f7249f 100644
> --- a/block/blk-core.c
> +++ b/block/blk-core.c
> @@ -61,7 +61,7 @@ struct kmem_cache *blk_requestq_cachep;
> /*
> * Controlling structure to kblockd
> */
> -static struct workqueue_struct *kblockd_workqueue;
> +struct workqueue_struct *kblockd_workqueue;
>
> void blk_queue_congestion_threshold(struct request_queue *q)
> {
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index 22e50a5..3d27d22 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -804,12 +804,13 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
> if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state)))
> return;
>
> - if (!async && cpumask_test_cpu(smp_processor_id(), hctx->cpumask)) {
> + if (!async && !(hctx->flags & BLK_MQ_F_WORKQUEUE) &&
> + cpumask_test_cpu(smp_processor_id(), hctx->cpumask)) {
> __blk_mq_run_hw_queue(hctx);
> return;
> }
>
> - kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx),
> + queue_delayed_work_on(blk_mq_hctx_next_cpu(hctx), hctx->wq,
> &hctx->run_work, 0);
> }
>
> @@ -908,7 +909,7 @@ static void blk_mq_delay_work_fn(struct work_struct *work)
>
> void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs)
> {
> - kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx),
> + queue_delayed_work_on(blk_mq_hctx_next_cpu(hctx), hctx->wq,
> &hctx->delay_work, msecs_to_jiffies(msecs));
> }
> EXPORT_SYMBOL(blk_mq_delay_queue);
> @@ -1581,6 +1582,11 @@ static int blk_mq_init_hctx(struct request_queue *q,
> hctx->flags = set->flags;
> hctx->cmd_size = set->cmd_size;
>
> + if (set->wq)
> + hctx->wq = set->wq;
> + else
> + hctx->wq = kblockd_workqueue;
> +
> blk_mq_init_cpu_notifier(&hctx->cpu_notifier,
> blk_mq_hctx_notify, hctx);
> blk_mq_register_cpu_notifier(&hctx->cpu_notifier);
> diff --git a/block/blk.h b/block/blk.h
> index 43b0361..fb46ad0 100644
> --- a/block/blk.h
> +++ b/block/blk.h
> @@ -25,6 +25,7 @@ struct blk_flush_queue {
> spinlock_t mq_flush_lock;
> };
>
> +extern struct workqueue_struct *kblockd_workqueue;
> extern struct kmem_cache *blk_requestq_cachep;
> extern struct kmem_cache *request_cachep;
> extern struct kobj_type blk_queue_ktype;
> diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
> index 5a901d0..ebe4699 100644
> --- a/include/linux/blk-mq.h
> +++ b/include/linux/blk-mq.h
> @@ -37,6 +37,8 @@ struct blk_mq_hw_ctx {
> unsigned int queue_num;
> struct blk_flush_queue *fq;
>
> + struct workqueue_struct *wq;
> +
> void *driver_data;
>
> struct blk_mq_ctxmap ctx_map;
> @@ -64,6 +66,7 @@ struct blk_mq_hw_ctx {
>
> struct blk_mq_tag_set {
> struct blk_mq_ops *ops;
> + struct workqueue_struct *wq;
> unsigned int nr_hw_queues;
> unsigned int queue_depth; /* max hw supported */
> unsigned int reserved_tags;
> @@ -156,6 +159,7 @@ enum {
> BLK_MQ_F_SG_MERGE = 1 << 2,
> BLK_MQ_F_SYSFS_UP = 1 << 3,
> BLK_MQ_F_DEFER_ISSUE = 1 << 4,
> + BLK_MQ_F_WORKQUEUE = 1 << 5,
>
> BLK_MQ_S_STOPPED = 0,
> BLK_MQ_S_TAG_ACTIVE = 1,
> --
> 1.9.1
>
--
Ming Lei
On Mon, Nov 03, 2014 at 04:40:47PM +0800, Ming Lei wrote:
> The above two aren't enough, because the big problem is that
> drivers need a per-request work structure instead of 'hctx->run_work';
> otherwise there are at most NR_CPUS concurrent submissions.
>
> So a per-request work structure should also be exposed to blk-mq
> for this kind of usage, for example via a .blk_mq_req_work(req)
> callback in the BLK_MQ_F_WORKQUEUE case.
Hmm. Maybe a better option is to just add a flag to never defer
->queue_rq to a workqueue and let drivers handle it?
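For illustration only, such a flag might be checked in blk_mq_run_hw_queue()
along these lines; BLK_MQ_F_NO_DEFER is a hypothetical name, and this variant
has not been posted anywhere:

/*
 * Sketch of the idea above, based on the refactored blk_mq_run_hw_queue()
 * shown earlier in the thread: with the (hypothetical) BLK_MQ_F_NO_DEFER
 * flag set, blk-mq always runs the hardware queue from the submitting
 * context and never punts to a workqueue, leaving any deferral to the
 * driver's ->queue_rq itself.
 */
void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
{
        if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state)))
                return;

        if ((hctx->flags & BLK_MQ_F_NO_DEFER) ||
            (!async && cpumask_test_cpu(smp_processor_id(), hctx->cpumask))) {
                __blk_mq_run_hw_queue(hctx);
                return;
        }

        kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx),
                                         &hctx->run_work, 0);
}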
On Mon, Nov 3, 2014 at 6:10 PM, Christoph Hellwig <[email protected]> wrote:
> On Mon, Nov 03, 2014 at 04:40:47PM +0800, Ming Lei wrote:
>> The above two aren't enough, because the big problem is that
>> drivers need a per-request work structure instead of 'hctx->run_work';
>> otherwise there are at most NR_CPUS concurrent submissions.
>>
>> So a per-request work structure should also be exposed to blk-mq
>> for this kind of usage, for example via a .blk_mq_req_work(req)
>> callback in the BLK_MQ_F_WORKQUEUE case.
>
> Hmm. Maybe a better option is to just add a flag to never defer
> ->queue_rq to a workqueue and let drivers handle it?
That should work, but it might lose the potential merge benefit of deferring.
Thanks,