2014-11-03 08:24:03

by Christoph Hellwig

[permalink] [raw]
Subject: blk-mq: allow to defer ->queue_rq invocations to workqueue

Drivers that need to do synchronous, blocking operations to do I/O generally
want to defer all I/O to a drŅ–ver-private workqueue. Examples for that are
the loop driver, rbd, or ubi block driver, and probably lots more that haven't
been evaluated yet.


2014-11-03 08:23:29

by Christoph Hellwig

[permalink] [raw]
Subject: [PATCH 2/2] blk-mq: allow direct dispatch to a driver specific workqueue

We have various block drivers that need to execute long term blocking
operations during I/O submission like file system or network I/O.

Currently these drivers just queue up work to an internal workqueue
from their request_fn. With blk-mq we can make sure they always get
called on their own workqueue directly for I/O submission by:

1) adding a flag to prevent inline submission of I/O, and
2) allowing the driver to pass in a workqueue in the tag_set that
will be used instead of kblockd.

Signed-off-by: Christoph Hellwig <[email protected]>
---
block/blk-core.c | 2 +-
block/blk-mq.c | 12 +++++++++---
block/blk.h | 1 +
include/linux/blk-mq.h | 4 ++++
4 files changed, 15 insertions(+), 4 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 0421b53..7f7249f 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -61,7 +61,7 @@ struct kmem_cache *blk_requestq_cachep;
/*
* Controlling structure to kblockd
*/
-static struct workqueue_struct *kblockd_workqueue;
+struct workqueue_struct *kblockd_workqueue;

void blk_queue_congestion_threshold(struct request_queue *q)
{
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 22e50a5..3d27d22 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -804,12 +804,13 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state)))
return;

- if (!async && cpumask_test_cpu(smp_processor_id(), hctx->cpumask)) {
+ if (!async && !(hctx->flags & BLK_MQ_F_WORKQUEUE) &&
+ cpumask_test_cpu(smp_processor_id(), hctx->cpumask)) {
__blk_mq_run_hw_queue(hctx);
return;
}

- kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx),
+ queue_delayed_work_on(blk_mq_hctx_next_cpu(hctx), hctx->wq,
&hctx->run_work, 0);
}

@@ -908,7 +909,7 @@ static void blk_mq_delay_work_fn(struct work_struct *work)

void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs)
{
- kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx),
+ queue_delayed_work_on(blk_mq_hctx_next_cpu(hctx), hctx->wq,
&hctx->delay_work, msecs_to_jiffies(msecs));
}
EXPORT_SYMBOL(blk_mq_delay_queue);
@@ -1581,6 +1582,11 @@ static int blk_mq_init_hctx(struct request_queue *q,
hctx->flags = set->flags;
hctx->cmd_size = set->cmd_size;

+ if (set->wq)
+ hctx->wq = set->wq;
+ else
+ hctx->wq = kblockd_workqueue;
+
blk_mq_init_cpu_notifier(&hctx->cpu_notifier,
blk_mq_hctx_notify, hctx);
blk_mq_register_cpu_notifier(&hctx->cpu_notifier);
diff --git a/block/blk.h b/block/blk.h
index 43b0361..fb46ad0 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -25,6 +25,7 @@ struct blk_flush_queue {
spinlock_t mq_flush_lock;
};

+extern struct workqueue_struct *kblockd_workqueue;
extern struct kmem_cache *blk_requestq_cachep;
extern struct kmem_cache *request_cachep;
extern struct kobj_type blk_queue_ktype;
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 5a901d0..ebe4699 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -37,6 +37,8 @@ struct blk_mq_hw_ctx {
unsigned int queue_num;
struct blk_flush_queue *fq;

+ struct workqueue_struct *wq;
+
void *driver_data;

struct blk_mq_ctxmap ctx_map;
@@ -64,6 +66,7 @@ struct blk_mq_hw_ctx {

struct blk_mq_tag_set {
struct blk_mq_ops *ops;
+ struct workqueue_struct *wq;
unsigned int nr_hw_queues;
unsigned int queue_depth; /* max hw supported */
unsigned int reserved_tags;
@@ -156,6 +159,7 @@ enum {
BLK_MQ_F_SG_MERGE = 1 << 2,
BLK_MQ_F_SYSFS_UP = 1 << 3,
BLK_MQ_F_DEFER_ISSUE = 1 << 4,
+ BLK_MQ_F_WORKQUEUE = 1 << 5,

BLK_MQ_S_STOPPED = 0,
BLK_MQ_S_TAG_ACTIVE = 1,
--
1.9.1

2014-11-03 08:23:32

by Christoph Hellwig

[permalink] [raw]
Subject: [PATCH 1/2] blk-mq: handle single queue case in blk_mq_hctx_next_cpu

Don't duplicate the code to handle the not cpu bounce case in the
caller, do it inside blk_mq_hctx_next_cpu instead.

Signed-off-by: Christoph Hellwig <[email protected]>
---
block/blk-mq.c | 34 +++++++++++++---------------------
1 file changed, 13 insertions(+), 21 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index b355b59..22e50a5 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -780,10 +780,11 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
*/
static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)
{
- int cpu = hctx->next_cpu;
+ if (hctx->queue->nr_hw_queues == 1)
+ return WORK_CPU_UNBOUND;

if (--hctx->next_cpu_batch <= 0) {
- int next_cpu;
+ int cpu = hctx->next_cpu, next_cpu;

next_cpu = cpumask_next(hctx->next_cpu, hctx->cpumask);
if (next_cpu >= nr_cpu_ids)
@@ -791,9 +792,11 @@ static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)

hctx->next_cpu = next_cpu;
hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
+
+ return cpu;
}

- return cpu;
+ return hctx->next_cpu;
}

void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
@@ -801,16 +804,13 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state)))
return;

- if (!async && cpumask_test_cpu(smp_processor_id(), hctx->cpumask))
+ if (!async && cpumask_test_cpu(smp_processor_id(), hctx->cpumask)) {
__blk_mq_run_hw_queue(hctx);
- else if (hctx->queue->nr_hw_queues == 1)
- kblockd_schedule_delayed_work(&hctx->run_work, 0);
- else {
- unsigned int cpu;
-
- cpu = blk_mq_hctx_next_cpu(hctx);
- kblockd_schedule_delayed_work_on(cpu, &hctx->run_work, 0);
+ return;
}
+
+ kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx),
+ &hctx->run_work, 0);
}

void blk_mq_run_queues(struct request_queue *q, bool async)
@@ -908,16 +908,8 @@ static void blk_mq_delay_work_fn(struct work_struct *work)

void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs)
{
- unsigned long tmo = msecs_to_jiffies(msecs);
-
- if (hctx->queue->nr_hw_queues == 1)
- kblockd_schedule_delayed_work(&hctx->delay_work, tmo);
- else {
- unsigned int cpu;
-
- cpu = blk_mq_hctx_next_cpu(hctx);
- kblockd_schedule_delayed_work_on(cpu, &hctx->delay_work, tmo);
- }
+ kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx),
+ &hctx->delay_work, msecs_to_jiffies(msecs));
}
EXPORT_SYMBOL(blk_mq_delay_queue);

--
1.9.1

2014-11-03 08:40:53

by Ming Lei

[permalink] [raw]
Subject: Re: [PATCH 2/2] blk-mq: allow direct dispatch to a driver specific workqueue

Hi Christoph,

On Mon, Nov 3, 2014 at 4:23 PM, Christoph Hellwig <[email protected]> wrote:
> We have various block drivers that need to execute long term blocking
> operations during I/O submission like file system or network I/O.
>
> Currently these drivers just queue up work to an internal workqueue
> from their request_fn. With blk-mq we can make sure they always get
> called on their own workqueue directly for I/O submission by:
>
> 1) adding a flag to prevent inline submission of I/O, and
> 2) allowing the driver to pass in a workqueue in the tag_set that
> will be used instead of kblockd.

The above two aren't enough because the big problem is that
drivers need a per-request work structure instead of 'hctx->run_work',
otherwise there are at most NR_CPUS concurrent submissions.

So the per-request work structure should be exposed to blk-mq
too for the kind of usage, such as .blk_mq_req_work(req) callback
in case of BLK_MQ_F_WORKQUEUE.

Thanks,

>
> Signed-off-by: Christoph Hellwig <[email protected]>
> ---
> block/blk-core.c | 2 +-
> block/blk-mq.c | 12 +++++++++---
> block/blk.h | 1 +
> include/linux/blk-mq.h | 4 ++++
> 4 files changed, 15 insertions(+), 4 deletions(-)
>
> diff --git a/block/blk-core.c b/block/blk-core.c
> index 0421b53..7f7249f 100644
> --- a/block/blk-core.c
> +++ b/block/blk-core.c
> @@ -61,7 +61,7 @@ struct kmem_cache *blk_requestq_cachep;
> /*
> * Controlling structure to kblockd
> */
> -static struct workqueue_struct *kblockd_workqueue;
> +struct workqueue_struct *kblockd_workqueue;
>
> void blk_queue_congestion_threshold(struct request_queue *q)
> {
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index 22e50a5..3d27d22 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -804,12 +804,13 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
> if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state)))
> return;
>
> - if (!async && cpumask_test_cpu(smp_processor_id(), hctx->cpumask)) {
> + if (!async && !(hctx->flags & BLK_MQ_F_WORKQUEUE) &&
> + cpumask_test_cpu(smp_processor_id(), hctx->cpumask)) {
> __blk_mq_run_hw_queue(hctx);
> return;
> }
>
> - kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx),
> + queue_delayed_work_on(blk_mq_hctx_next_cpu(hctx), hctx->wq,
> &hctx->run_work, 0);
> }
>
> @@ -908,7 +909,7 @@ static void blk_mq_delay_work_fn(struct work_struct *work)
>
> void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs)
> {
> - kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx),
> + queue_delayed_work_on(blk_mq_hctx_next_cpu(hctx), hctx->wq,
> &hctx->delay_work, msecs_to_jiffies(msecs));
> }
> EXPORT_SYMBOL(blk_mq_delay_queue);
> @@ -1581,6 +1582,11 @@ static int blk_mq_init_hctx(struct request_queue *q,
> hctx->flags = set->flags;
> hctx->cmd_size = set->cmd_size;
>
> + if (set->wq)
> + hctx->wq = set->wq;
> + else
> + hctx->wq = kblockd_workqueue;
> +
> blk_mq_init_cpu_notifier(&hctx->cpu_notifier,
> blk_mq_hctx_notify, hctx);
> blk_mq_register_cpu_notifier(&hctx->cpu_notifier);
> diff --git a/block/blk.h b/block/blk.h
> index 43b0361..fb46ad0 100644
> --- a/block/blk.h
> +++ b/block/blk.h
> @@ -25,6 +25,7 @@ struct blk_flush_queue {
> spinlock_t mq_flush_lock;
> };
>
> +extern struct workqueue_struct *kblockd_workqueue;
> extern struct kmem_cache *blk_requestq_cachep;
> extern struct kmem_cache *request_cachep;
> extern struct kobj_type blk_queue_ktype;
> diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
> index 5a901d0..ebe4699 100644
> --- a/include/linux/blk-mq.h
> +++ b/include/linux/blk-mq.h
> @@ -37,6 +37,8 @@ struct blk_mq_hw_ctx {
> unsigned int queue_num;
> struct blk_flush_queue *fq;
>
> + struct workqueue_struct *wq;
> +
> void *driver_data;
>
> struct blk_mq_ctxmap ctx_map;
> @@ -64,6 +66,7 @@ struct blk_mq_hw_ctx {
>
> struct blk_mq_tag_set {
> struct blk_mq_ops *ops;
> + struct workqueue_struct *wq;
> unsigned int nr_hw_queues;
> unsigned int queue_depth; /* max hw supported */
> unsigned int reserved_tags;
> @@ -156,6 +159,7 @@ enum {
> BLK_MQ_F_SG_MERGE = 1 << 2,
> BLK_MQ_F_SYSFS_UP = 1 << 3,
> BLK_MQ_F_DEFER_ISSUE = 1 << 4,
> + BLK_MQ_F_WORKQUEUE = 1 << 5,
>
> BLK_MQ_S_STOPPED = 0,
> BLK_MQ_S_TAG_ACTIVE = 1,
> --
> 1.9.1
>



--
Ming Lei

2014-11-03 10:10:44

by Christoph Hellwig

[permalink] [raw]
Subject: Re: [PATCH 2/2] blk-mq: allow direct dispatch to a driver specific workqueue

On Mon, Nov 03, 2014 at 04:40:47PM +0800, Ming Lei wrote:
> The above two aren't enough because the big problem is that
> drivers need a per-request work structure instead of 'hctx->run_work',
> otherwise there are at most NR_CPUS concurrent submissions.
>
> So the per-request work structure should be exposed to blk-mq
> too for the kind of usage, such as .blk_mq_req_work(req) callback
> in case of BLK_MQ_F_WORKQUEUE.

Hmm. Maybe a better option is to just add a flag to never defer
->queue_rq to a workqueue and let drivers handle the it?

2014-11-03 11:54:52

by Ming Lei

[permalink] [raw]
Subject: Re: [PATCH 2/2] blk-mq: allow direct dispatch to a driver specific workqueue

On Mon, Nov 3, 2014 at 6:10 PM, Christoph Hellwig <[email protected]> wrote:
> On Mon, Nov 03, 2014 at 04:40:47PM +0800, Ming Lei wrote:
>> The above two aren't enough because the big problem is that
>> drivers need a per-request work structure instead of 'hctx->run_work',
>> otherwise there are at most NR_CPUS concurrent submissions.
>>
>> So the per-request work structure should be exposed to blk-mq
>> too for the kind of usage, such as .blk_mq_req_work(req) callback
>> in case of BLK_MQ_F_WORKQUEUE.
>
> Hmm. Maybe a better option is to just add a flag to never defer
> ->queue_rq to a workqueue and let drivers handle the it?

That should work, but might lose potential merge benefit of defer.


Thanks,