LinuxLists.cc - [PATCHSET/RFC] blk-mq scheduling framework

2016-12-07 23:10:12

[permalink] [raw]

Subject: [PATCHSET/RFC] blk-mq scheduling framework

I hacked up some support for registering blk-mq capable IO schedulers,
and when that was done, I adapted deadline to work with it as a new
mq-deadline scheduler.

Basically this is similar to the legacy scheduling patches I posted
recently, in that we setup a list of fake requests (I called them
shadows) that the IO scheduler works on. We then transform those
into a real blk-mq request, when we have to dispatch to hardware.

There's a hack in place to make this work with the flush/fua
requests, as those bypass the regular software queue insertion.
For now we simply ensure that we allocate a real request for
those. Howeer, I'd prefer if we simply inserted the request as
we usually would, and then start the flush state machinery when
we pull the request out of the queue. That would both be cleaner
from a flush perspective, and from the scheduling side as well.

I'm reusing the existing elevator interface, just augmenting
that with mq_ops and a ->uses_mq so we can tell which is which.
They show up automatically, for instance on a scsi-mq device:

$ cat /sys/block/sda/queue/scheduler
[mq-deadline] none

vs just a legacy device:

$ cat /sys/block/nullb0/queue/scheduler
noop deadline [cfq]

Changing schedulers is done in the same way as it always has,
by echoing into the 'scheduler' file. For MQ, there's a 'none'
setting as well that isn't a real scheduler, it simply turns off
the scheduler. Handy for comparison.

Obviously a direct deadline adaptation has performance implications,
so it can be improved. A _real_ MQ scheduler is forth coming, which
will sit on top of this interface. Paolo, for BFQ, this is the
interface you should target. Let me know if you have any questions
about how it works.

block/Kconfig.iosched | 6
block/Makefile | 3
block/blk-core.c | 3
block/blk-exec.c | 3
block/blk-flush.c | 7
block/blk-mq-sched.c | 243 ++++++++++++++++++
block/blk-mq-sched.h | 168 ++++++++++++
block/blk-mq-tag.c | 1
block/blk-mq.c | 241 ++++++++++--------
block/blk-mq.h | 33 --
block/elevator.c | 140 +++++++---
block/mq-deadline.c | 622 +++++++++++++++++++++++++++++++++++++++++++++++
include/linux/blk-mq.h | 3
include/linux/elevator.h | 30 ++
14 files changed, 1331 insertions(+), 172 deletions(-)

--
Jens Axboe

2016-12-07 23:10:16

[permalink] [raw]

Subject: [PATCH 2/7] blk-mq: abstract out blk_mq_dispatch_rq_list() helper

Takes a list of requests, and dispatches it. Moves any residual
requests to the dispatch list.

Signed-off-by: Jens Axboe <[email protected]>
---
block/blk-mq.c | 85 ++++++++++++++++++++++++++++++++--------------------------
block/blk-mq.h | 1 +
2 files changed, 48 insertions(+), 38 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index b216746be9d3..abbf7cca4d0d 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -821,41 +821,13 @@ static inline unsigned int queued_to_index(unsigned int queued)
return min(BLK_MQ_MAX_DISPATCH_ORDER - 1, ilog2(queued) + 1);
}

-/*
- * Run this hardware queue, pulling any software queues mapped to it in.
- * Note that this function currently has various problems around ordering
- * of IO. In particular, we'd like FIFO behaviour on handling existing
- * items on the hctx->dispatch list. Ignore that for now.
- */
-static void blk_mq_process_rq_list(struct blk_mq_hw_ctx *hctx)
+bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list)
{
struct request_queue *q = hctx->queue;
struct request *rq;
- LIST_HEAD(rq_list);
LIST_HEAD(driver_list);
struct list_head *dptr;
- int queued;
-
- if (unlikely(blk_mq_hctx_stopped(hctx)))
- return;
-
- hctx->run++;
-
- /*
- * Touch any software queue that has pending entries.
- */
- flush_busy_ctxs(hctx, &rq_list);
-
- /*
- * If we have previous entries on our dispatch list, grab them
- * and stuff them at the front for more fair dispatch.
- */
- if (!list_empty_careful(&hctx->dispatch)) {
- spin_lock(&hctx->lock);
- if (!list_empty(&hctx->dispatch))
- list_splice_init(&hctx->dispatch, &rq_list);
- spin_unlock(&hctx->lock);
- }
+ int queued, ret = BLK_MQ_RQ_QUEUE_OK;

/*
* Start off with dptr being NULL, so we start the first request
@@ -867,16 +839,15 @@ static void blk_mq_process_rq_list(struct blk_mq_hw_ctx *hctx)
* Now process all the entries, sending them to the driver.
*/
queued = 0;
- while (!list_empty(&rq_list)) {
+ while (!list_empty(list)) {
struct blk_mq_queue_data bd;
- int ret;

- rq = list_first_entry(&rq_list, struct request, queuelist);
+ rq = list_first_entry(list, struct request, queuelist);
list_del_init(&rq->queuelist);

bd.rq = rq;
bd.list = dptr;
- bd.last = list_empty(&rq_list);
+ bd.last = list_empty(list);

ret = q->mq_ops->queue_rq(hctx, &bd);
switch (ret) {
@@ -884,7 +855,7 @@ static void blk_mq_process_rq_list(struct blk_mq_hw_ctx *hctx)
queued++;
break;
case BLK_MQ_RQ_QUEUE_BUSY:
- list_add(&rq->queuelist, &rq_list);
+ list_add(&rq->queuelist, list);
__blk_mq_requeue_request(rq);
break;
default:
@@ -902,7 +873,7 @@ static void blk_mq_process_rq_list(struct blk_mq_hw_ctx *hctx)
* We've done the first request. If we have more than 1
* left in the list, set dptr to defer issue.
*/
- if (!dptr && rq_list.next != rq_list.prev)
+ if (!dptr && list->next != list->prev)
dptr = &driver_list;
}

@@ -912,10 +883,11 @@ static void blk_mq_process_rq_list(struct blk_mq_hw_ctx *hctx)
* Any items that need requeuing? Stuff them into hctx->dispatch,
* that is where we will continue on next queue run.
*/
- if (!list_empty(&rq_list)) {
+ if (!list_empty(list)) {
spin_lock(&hctx->lock);
- list_splice(&rq_list, &hctx->dispatch);
+ list_splice(list, &hctx->dispatch);
spin_unlock(&hctx->lock);
+
/*
* the queue is expected stopped with BLK_MQ_RQ_QUEUE_BUSY, but
* it's possible the queue is stopped and restarted again
@@ -927,6 +899,43 @@ static void blk_mq_process_rq_list(struct blk_mq_hw_ctx *hctx)
**/
blk_mq_run_hw_queue(hctx, true);
}
+
+ return ret != BLK_MQ_RQ_QUEUE_BUSY;
+}
+
+/*
+ * Run this hardware queue, pulling any software queues mapped to it in.
+ * Note that this function currently has various problems around ordering
+ * of IO. In particular, we'd like FIFO behaviour on handling existing
+ * items on the hctx->dispatch list. Ignore that for now.
+ */
+static void blk_mq_process_rq_list(struct blk_mq_hw_ctx *hctx)
+{
+ LIST_HEAD(rq_list);
+ LIST_HEAD(driver_list);
+
+ if (unlikely(blk_mq_hctx_stopped(hctx)))
+ return;
+
+ hctx->run++;
+
+ /*
+ * Touch any software queue that has pending entries.
+ */
+ flush_busy_ctxs(hctx, &rq_list);
+
+ /*
+ * If we have previous entries on our dispatch list, grab them
+ * and stuff them at the front for more fair dispatch.
+ */
+ if (!list_empty_careful(&hctx->dispatch)) {
+ spin_lock(&hctx->lock);
+ if (!list_empty(&hctx->dispatch))
+ list_splice_init(&hctx->dispatch, &rq_list);
+ spin_unlock(&hctx->lock);
+ }
+
+ blk_mq_dispatch_rq_list(hctx, &rq_list);
}

static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
diff --git a/block/blk-mq.h b/block/blk-mq.h
index b444370ae05b..3a54dd32a6fc 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -31,6 +31,7 @@ void blk_mq_freeze_queue(struct request_queue *q);
void blk_mq_free_queue(struct request_queue *q);
int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr);
void blk_mq_wake_waiters(struct request_queue *q);
+bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *, struct list_head *);

/*
* CPU hotplug helpers
--
2.7.4

2016-12-07 23:10:14

[permalink] [raw]

Subject: [PATCH 5/7] blk-mq-sched: add framework for MQ capable IO schedulers

Signed-off-by: Jens Axboe <[email protected]>
---
block/blk-mq-sched.c | 243 +++++++++++++++++++++++++++++++++++++++++++++++++++
block/blk-mq-sched.h | 168 +++++++++++++++++++++++++++++++++++
2 files changed, 411 insertions(+)
create mode 100644 block/blk-mq-sched.c
create mode 100644 block/blk-mq-sched.h

diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
new file mode 100644
index 000000000000..8317b26990f8
--- /dev/null
+++ b/block/blk-mq-sched.c
@@ -0,0 +1,243 @@
+#include <linux/kernel.h>
+#include <linux/module.h>
+
+#include <linux/blk-mq.h>
+#include "blk.h"
+#include "blk-mq.h"
+#include "blk-mq-sched.h"
+#include "blk-mq-tag.h"
+#include "blk-wbt.h"
+
+/*
+ * Empty set
+ */
+static struct blk_mq_ops mq_sched_tag_ops = {
+ .queue_rq = NULL,
+};
+
+void blk_mq_sched_free_requests(struct blk_mq_tags *tags)
+{
+ blk_mq_free_rq_map(NULL, tags, 0);
+}
+EXPORT_SYMBOL_GPL(blk_mq_sched_free_requests);
+
+struct blk_mq_tags *blk_mq_sched_alloc_requests(unsigned int depth,
+ unsigned int numa_node)
+{
+ struct blk_mq_tag_set set = {
+ .ops = &mq_sched_tag_ops,
+ .nr_hw_queues = 1,
+ .queue_depth = depth,
+ .numa_node = numa_node,
+ };
+
+ return blk_mq_init_rq_map(&set, 0);
+}
+EXPORT_SYMBOL_GPL(blk_mq_sched_alloc_requests);
+
+void blk_mq_sched_free_hctx_data(struct request_queue *q,
+ void (*exit)(struct blk_mq_hw_ctx *))
+{
+ struct blk_mq_hw_ctx *hctx;
+ int i;
+
+ queue_for_each_hw_ctx(q, hctx, i) {
+ if (exit)
+ exit(hctx);
+ kfree(hctx->sched_data);
+ hctx->sched_data = NULL;
+ }
+}
+EXPORT_SYMBOL_GPL(blk_mq_sched_free_hctx_data);
+
+int blk_mq_sched_init_hctx_data(struct request_queue *q, size_t size,
+ void (*init)(struct blk_mq_hw_ctx *))
+{
+ struct blk_mq_hw_ctx *hctx;
+ int i;
+
+ queue_for_each_hw_ctx(q, hctx, i) {
+ hctx->sched_data = kmalloc_node(size, GFP_KERNEL, hctx->numa_node);
+ if (!hctx->sched_data)
+ goto error;
+
+ if (init)
+ init(hctx);
+ }
+
+ return 0;
+error:
+ blk_mq_sched_free_hctx_data(q, NULL);
+ return -ENOMEM;
+}
+EXPORT_SYMBOL_GPL(blk_mq_sched_init_hctx_data);
+
+struct request *blk_mq_sched_alloc_shadow_request(struct request_queue *q,
+ struct blk_mq_alloc_data *data,
+ struct blk_mq_tags *tags,
+ atomic_t *wait_index)
+{
+ struct sbq_wait_state *ws;
+ DEFINE_WAIT(wait);
+ struct request *rq;
+ int tag;
+
+ tag = __sbitmap_queue_get(&tags->bitmap_tags);
+ if (tag != -1)
+ goto done;
+
+ if (data->flags & BLK_MQ_REQ_NOWAIT)
+ return NULL;
+
+ ws = sbq_wait_ptr(&tags->bitmap_tags, wait_index);
+ do {
+ prepare_to_wait(&ws->wait, &wait, TASK_UNINTERRUPTIBLE);
+
+ tag = __sbitmap_queue_get(&tags->bitmap_tags);
+ if (tag != -1)
+ break;
+
+ blk_mq_run_hw_queue(data->hctx, false);
+
+ tag = __sbitmap_queue_get(&tags->bitmap_tags);
+ if (tag != -1)
+ break;
+
+ blk_mq_put_ctx(data->ctx);
+ io_schedule();
+
+ data->ctx = blk_mq_get_ctx(data->q);
+ data->hctx = blk_mq_map_queue(data->q, data->ctx->cpu);
+ finish_wait(&ws->wait, &wait);
+ ws = sbq_wait_ptr(&tags->bitmap_tags, wait_index);
+ } while (1);
+
+ finish_wait(&ws->wait, &wait);
+done:
+ rq = tags->rqs[tag];
+ rq->tag = tag;
+ return rq;
+}
+EXPORT_SYMBOL_GPL(blk_mq_sched_alloc_shadow_request);
+
+void blk_mq_sched_free_shadow_request(struct blk_mq_tags *tags,
+ struct request *rq)
+{
+ sbitmap_queue_clear(&tags->bitmap_tags, rq->tag, rq->mq_ctx->cpu);
+}
+EXPORT_SYMBOL_GPL(blk_mq_sched_free_shadow_request);
+
+static void rq_copy(struct request *rq, struct request *src)
+{
+#define FIELD_COPY(dst, src, name) ((dst)->name = (src)->name)
+ FIELD_COPY(rq, src, cpu);
+ FIELD_COPY(rq, src, cmd_type);
+ FIELD_COPY(rq, src, cmd_flags);
+ rq->rq_flags |= (src->rq_flags & (RQF_PREEMPT | RQF_QUIET | RQF_PM | RQF_DONTPREP));
+ rq->rq_flags &= ~RQF_IO_STAT;
+ FIELD_COPY(rq, src, __data_len);
+ FIELD_COPY(rq, src, __sector);
+ FIELD_COPY(rq, src, bio);
+ FIELD_COPY(rq, src, biotail);
+ FIELD_COPY(rq, src, rq_disk);
+ FIELD_COPY(rq, src, part);
+ FIELD_COPY(rq, src, nr_phys_segments);
+#if defined(CONFIG_BLK_DEV_INTEGRITY)
+ FIELD_COPY(rq, src, nr_integrity_segments);
+#endif
+ FIELD_COPY(rq, src, ioprio);
+ FIELD_COPY(rq, src, timeout);
+
+ if (src->cmd_type == REQ_TYPE_BLOCK_PC) {
+ FIELD_COPY(rq, src, cmd);
+ FIELD_COPY(rq, src, cmd_len);
+ FIELD_COPY(rq, src, extra_len);
+ FIELD_COPY(rq, src, sense_len);
+ FIELD_COPY(rq, src, resid_len);
+ FIELD_COPY(rq, src, sense);
+ FIELD_COPY(rq, src, retries);
+ }
+
+ src->bio = src->biotail = NULL;
+}
+
+static void sched_rq_end_io(struct request *rq, int error)
+{
+ struct request *sched_rq = rq->end_io_data;
+
+ FIELD_COPY(sched_rq, rq, resid_len);
+ FIELD_COPY(sched_rq, rq, extra_len);
+ FIELD_COPY(sched_rq, rq, sense_len);
+ FIELD_COPY(sched_rq, rq, errors);
+ FIELD_COPY(sched_rq, rq, retries);
+
+ blk_account_io_completion(sched_rq, blk_rq_bytes(sched_rq));
+ blk_account_io_done(sched_rq);
+
+ wbt_done(sched_rq->q->rq_wb, &sched_rq->issue_stat);
+
+ if (sched_rq->end_io)
+ sched_rq->end_io(sched_rq, error);
+
+ blk_mq_free_request(rq);
+}
+
+struct request *
+blk_mq_sched_request_from_shadow(struct blk_mq_hw_ctx *hctx,
+ struct request *(*get_sched_rq)(struct blk_mq_hw_ctx *))
+{
+ struct blk_mq_alloc_data data;
+ struct request *sched_rq, *rq;
+
+ data.q = hctx->queue;
+ data.flags = BLK_MQ_REQ_NOWAIT;
+ data.ctx = blk_mq_get_ctx(hctx->queue);
+ data.hctx = hctx;
+
+ rq = __blk_mq_alloc_request(&data, 0);
+ blk_mq_put_ctx(data.ctx);
+
+ if (!rq) {
+ blk_mq_stop_hw_queue(hctx);
+ return NULL;
+ }
+
+ sched_rq = get_sched_rq(hctx);
+
+ if (!sched_rq) {
+ blk_queue_enter_live(hctx->queue);
+ __blk_mq_free_request(hctx, data.ctx, rq);
+ return NULL;
+ }
+
+ rq_copy(rq, sched_rq);
+ rq->end_io = sched_rq_end_io;
+ rq->end_io_data = sched_rq;
+
+ return rq;
+}
+EXPORT_SYMBOL_GPL(blk_mq_sched_request_from_shadow);
+
+void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
+{
+ struct elevator_queue *e = hctx->queue->elevator;
+ struct request *rq;
+ LIST_HEAD(rq_list);
+
+ if (unlikely(blk_mq_hctx_stopped(hctx)))
+ return;
+
+ hctx->run++;
+
+ if (!list_empty(&hctx->dispatch)) {
+ spin_lock(&hctx->lock);
+ if (!list_empty(&hctx->dispatch))
+ list_splice_init(&hctx->dispatch, &rq_list);
+ spin_unlock(&hctx->lock);
+ }
+
+ while ((rq = e->type->mq_ops.dispatch_request(hctx)) != NULL)
+ list_add_tail(&rq->queuelist, &rq_list);
+
+ blk_mq_dispatch_rq_list(hctx, &rq_list);
+}
diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h
new file mode 100644
index 000000000000..125e14e5274a
--- /dev/null
+++ b/block/blk-mq-sched.h
@@ -0,0 +1,168 @@
+#ifndef BLK_MQ_SCHED_H
+#define BLK_MQ_SCHED_H
+
+#include "blk-mq.h"
+
+struct blk_mq_hw_ctx;
+struct blk_mq_ctx;
+struct request_queue;
+
+struct blk_mq_tags *blk_mq_sched_alloc_requests(unsigned int depth, unsigned int numa_node);
+void blk_mq_sched_free_requests(struct blk_mq_tags *tags);
+
+int blk_mq_sched_init_hctx_data(struct request_queue *q, size_t size,
+ void (*init)(struct blk_mq_hw_ctx *));
+void blk_mq_sched_free_hctx_data(struct request_queue *q,
+ void (*exit)(struct blk_mq_hw_ctx *));
+
+void blk_mq_sched_free_shadow_request(struct blk_mq_tags *tags,
+ struct request *rq);
+struct request *blk_mq_sched_alloc_shadow_request(struct request_queue *q,
+ struct blk_mq_alloc_data *data,
+ struct blk_mq_tags *tags,
+ atomic_t *wait_index);
+struct request *
+blk_mq_sched_request_from_shadow(struct blk_mq_hw_ctx *hctx,
+ struct request *(*get_sched_rq)(struct blk_mq_hw_ctx *));
+
+
+struct blk_mq_alloc_data {
+ /* input parameter */
+ struct request_queue *q;
+ unsigned int flags;
+
+ /* input & output parameter */
+ struct blk_mq_ctx *ctx;
+ struct blk_mq_hw_ctx *hctx;
+};
+
+static inline void blk_mq_set_alloc_data(struct blk_mq_alloc_data *data,
+ struct request_queue *q, unsigned int flags,
+ struct blk_mq_ctx *ctx, struct blk_mq_hw_ctx *hctx)
+{
+ data->q = q;
+ data->flags = flags;
+ data->ctx = ctx;
+ data->hctx = hctx;
+}
+
+void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx);
+
+static inline bool
+blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio)
+{
+ struct elevator_queue *e = q->elevator;
+
+ if (blk_queue_nomerges(q) || !bio_mergeable(bio))
+ return false;
+
+ if (e) {
+ struct blk_mq_ctx *ctx = blk_mq_get_ctx(q);
+ struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
+
+ blk_mq_put_ctx(ctx);
+ return e->type->mq_ops.bio_merge(hctx, bio);
+ }
+
+ return false;
+}
+
+static inline struct request *
+blk_mq_sched_get_request(struct request_queue *q, struct bio *bio,
+ struct blk_mq_alloc_data *data)
+{
+ struct elevator_queue *e = q->elevator;
+ struct blk_mq_hw_ctx *hctx;
+ struct blk_mq_ctx *ctx;
+ struct request *rq;
+
+ blk_queue_enter_live(q);
+ ctx = blk_mq_get_ctx(q);
+ hctx = blk_mq_map_queue(q, ctx->cpu);
+
+ blk_mq_set_alloc_data(data, q, 0, ctx, hctx);
+
+ if (e)
+ rq = e->type->mq_ops.get_request(q, bio, data);
+ else
+ rq = __blk_mq_alloc_request(data, bio->bi_opf);
+
+ if (rq)
+ data->hctx->queued++;
+
+ return rq;
+
+}
+
+static inline void
+blk_mq_sched_insert_request(struct request *rq, bool at_head, bool run_queue,
+ bool async)
+{
+ struct request_queue *q = rq->q;
+ struct elevator_queue *e = q->elevator;
+ struct blk_mq_ctx *ctx = rq->mq_ctx;
+ struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
+
+ if (e)
+ e->type->mq_ops.insert_request(hctx, rq, at_head);
+ else {
+ spin_lock(&ctx->lock);
+ __blk_mq_insert_request(hctx, rq, at_head);
+ spin_unlock(&ctx->lock);
+ }
+
+ if (run_queue)
+ blk_mq_run_hw_queue(hctx, async);
+}
+
+static inline bool
+blk_mq_sched_allow_merge(struct request_queue *q, struct request *rq,
+ struct bio *bio)
+{
+ struct elevator_queue *e = q->elevator;
+
+ if (e && e->type->mq_ops.allow_merge)
+ return e->type->mq_ops.allow_merge(q, rq, bio);
+
+ return true;
+}
+
+static inline void
+blk_mq_sched_completed_request(struct blk_mq_hw_ctx *hctx, struct request *rq)
+{
+ struct elevator_queue *e = hctx->queue->elevator;
+
+ if (e && e->type->mq_ops.completed_request)
+ e->type->mq_ops.completed_request(hctx, rq);
+}
+
+static inline void blk_mq_sched_started_request(struct request *rq)
+{
+ struct request_queue *q = rq->q;
+ struct elevator_queue *e = q->elevator;
+
+ if (e && e->type->mq_ops.started_request)
+ e->type->mq_ops.started_request(rq);
+}
+
+static inline void blk_mq_sched_requeue_request(struct request *rq)
+{
+ struct request_queue *q = rq->q;
+ struct elevator_queue *e = q->elevator;
+
+ if (e && e->type->mq_ops.requeue_request)
+ e->type->mq_ops.requeue_request(rq);
+}
+
+static inline bool blk_mq_sched_has_work(struct blk_mq_hw_ctx *hctx)
+{
+ struct elevator_queue *e = hctx->queue->elevator;
+
+ if (e && e->type->mq_ops.has_work)
+ return e->type->mq_ops.has_work(hctx);
+
+ return false;
+}
+
+
+#endif
--
2.7.4

2016-12-07 23:11:00

[permalink] [raw]

Subject: [PATCH 7/7] mq-deadline: add blk-mq adaptation of the deadline IO scheduler

Signed-off-by: Jens Axboe <[email protected]>
---
block/Kconfig.iosched | 6 +
block/Makefile | 1 +
block/mq-deadline.c | 622 ++++++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 629 insertions(+)
create mode 100644 block/mq-deadline.c

diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched
index 421bef9c4c48..490ef2850fae 100644
--- a/block/Kconfig.iosched
+++ b/block/Kconfig.iosched
@@ -32,6 +32,12 @@ config IOSCHED_CFQ

This is the default I/O scheduler.

+config MQ_IOSCHED_DEADLINE
+ tristate "MQ deadline I/O scheduler"
+ default y
+ ---help---
+ MQ version of the deadline IO scheduler.
+
config CFQ_GROUP_IOSCHED
bool "CFQ Group Scheduling support"
depends on IOSCHED_CFQ && BLK_CGROUP
diff --git a/block/Makefile b/block/Makefile
index 2eee9e1bb6db..3ee0abd7205a 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -18,6 +18,7 @@ obj-$(CONFIG_BLK_DEV_THROTTLING) += blk-throttle.o
obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o
obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o
obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o
+obj-$(CONFIG_MQ_IOSCHED_DEADLINE) += mq-deadline.o

obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o
obj-$(CONFIG_BLK_CMDLINE_PARSER) += cmdline-parser.o
diff --git a/block/mq-deadline.c b/block/mq-deadline.c
new file mode 100644
index 000000000000..5083772c1352
--- /dev/null
+++ b/block/mq-deadline.c
@@ -0,0 +1,622 @@
+/*
+ * MQ Deadline i/o scheduler - adaptation of the legacy deadline scheduler,
+ * for the blk-mq scheduling framework
+ *
+ * Copyright (C) 2016 Jens Axboe <[email protected]>
+ */
+#include <linux/kernel.h>
+#include <linux/fs.h>
+#include <linux/blkdev.h>
+#include <linux/blk-mq.h>
+#include <linux/elevator.h>
+#include <linux/bio.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/compiler.h>
+#include <linux/rbtree.h>
+#include <linux/sbitmap.h>
+
+#include "blk.h"
+#include "blk-mq.h"
+#include "blk-mq-tag.h"
+#include "blk-mq-sched.h"
+
+/*
+ * Work-around for the flush machinery, since it intercepts before
+ * the request is properly added.
+ */
+static bool flush_fua_bypass(unsigned int op)
+{
+ return op & (REQ_PREFLUSH | REQ_FUA);
+}
+
+/*
+ * See Documentation/block/deadline-iosched.txt
+ */
+static const int read_expire = HZ / 2; /* max time before a read is submitted. */
+static const int write_expire = 5 * HZ; /* ditto for writes, these limits are SOFT! */
+static const int writes_starved = 2; /* max times reads can starve a write */
+static const int fifo_batch = 16; /* # of sequential requests treated as one
+ by the above parameters. For throughput. */
+
+struct deadline_data {
+ /*
+ * run time data
+ */
+
+ /*
+ * requests (deadline_rq s) are present on both sort_list and fifo_list
+ */
+ struct rb_root sort_list[2];
+ struct list_head fifo_list[2];
+
+ /*
+ * next in sort order. read, write or both are NULL
+ */
+ struct request *next_rq[2];
+ unsigned int batching; /* number of sequential requests made */
+ unsigned int starved; /* times reads have starved writes */
+
+ /*
+ * settings that change how the i/o scheduler behaves
+ */
+ int fifo_expire[2];
+ int fifo_batch;
+ int writes_starved;
+ int front_merges;
+
+ spinlock_t lock;
+ struct list_head dispatch;
+ struct blk_mq_tags *tags;
+ atomic_t wait_index;
+};
+
+static inline struct rb_root *
+deadline_rb_root(struct deadline_data *dd, struct request *rq)
+{
+ return &dd->sort_list[rq_data_dir(rq)];
+}
+
+/*
+ * get the request after `rq' in sector-sorted order
+ */
+static inline struct request *
+deadline_latter_request(struct request *rq)
+{
+ struct rb_node *node = rb_next(&rq->rb_node);
+
+ if (node)
+ return rb_entry_rq(node);
+
+ return NULL;
+}
+
+static void
+deadline_add_rq_rb(struct deadline_data *dd, struct request *rq)
+{
+ struct rb_root *root = deadline_rb_root(dd, rq);
+
+ elv_rb_add(root, rq);
+}
+
+static inline void
+deadline_del_rq_rb(struct deadline_data *dd, struct request *rq)
+{
+ const int data_dir = rq_data_dir(rq);
+
+ if (dd->next_rq[data_dir] == rq)
+ dd->next_rq[data_dir] = deadline_latter_request(rq);
+
+ elv_rb_del(deadline_rb_root(dd, rq), rq);
+}
+
+/*
+ * remove rq from rbtree and fifo.
+ */
+static void deadline_remove_request(struct request_queue *q, struct request *rq)
+{
+ struct deadline_data *dd = q->elevator->elevator_data;
+
+ list_del_init(&rq->queuelist);
+ deadline_del_rq_rb(dd, rq);
+
+ elv_rqhash_del(q, rq);
+ if (q->last_merge == rq)
+ q->last_merge = NULL;
+}
+
+#if 0
+static void
+deadline_merged_requests(struct request_queue *q, struct request *req,
+ struct request *next)
+{
+ /*
+ * if next expires before rq, assign its expire time to rq
+ * and move into next position (next will be deleted) in fifo
+ */
+ if (!list_empty(&req->queuelist) && !list_empty(&next->queuelist)) {
+ if (time_before((unsigned long)next->fifo_time,
+ (unsigned long)req->fifo_time)) {
+ list_move(&req->queuelist, &next->queuelist);
+ req->fifo_time = next->fifo_time;
+ }
+ }
+
+ /*
+ * kill knowledge of next, this one is a goner
+ */
+ deadline_remove_request(q, next);
+}
+#endif
+
+/*
+ * move an entry to dispatch queue
+ */
+static void
+deadline_move_request(struct deadline_data *dd, struct request *rq)
+{
+ const int data_dir = rq_data_dir(rq);
+
+ dd->next_rq[READ] = NULL;
+ dd->next_rq[WRITE] = NULL;
+ dd->next_rq[data_dir] = deadline_latter_request(rq);
+
+ /*
+ * take it off the sort and fifo list
+ */
+ deadline_remove_request(rq->q, rq);
+}
+
+/*
+ * deadline_check_fifo returns 0 if there are no expired requests on the fifo,
+ * 1 otherwise. Requires !list_empty(&dd->fifo_list[data_dir])
+ */
+static inline int deadline_check_fifo(struct deadline_data *dd, int ddir)
+{
+ struct request *rq = rq_entry_fifo(dd->fifo_list[ddir].next);
+
+ /*
+ * rq is expired!
+ */
+ if (time_after_eq(jiffies, (unsigned long)rq->fifo_time))
+ return 1;
+
+ return 0;
+}
+
+/*
+ * deadline_dispatch_requests selects the best request according to
+ * read/write expire, fifo_batch, etc
+ */
+static struct request *__dd_dispatch_request(struct blk_mq_hw_ctx *hctx)
+{
+ struct deadline_data *dd = hctx->queue->elevator->elevator_data;
+ struct request *rq;
+ bool reads, writes;
+ int data_dir;
+
+ spin_lock(&dd->lock);
+
+ if (!list_empty(&dd->dispatch)) {
+ rq = list_first_entry(&dd->dispatch, struct request, queuelist);
+ list_del_init(&rq->queuelist);
+ goto done;
+ }
+
+ reads = !list_empty(&dd->fifo_list[READ]);
+ writes = !list_empty(&dd->fifo_list[WRITE]);
+
+ /*
+ * batches are currently reads XOR writes
+ */
+ if (dd->next_rq[WRITE])
+ rq = dd->next_rq[WRITE];
+ else
+ rq = dd->next_rq[READ];
+
+ if (rq && dd->batching < dd->fifo_batch)
+ /* we have a next request are still entitled to batch */
+ goto dispatch_request;
+
+ /*
+ * at this point we are not running a batch. select the appropriate
+ * data direction (read / write)
+ */
+
+ if (reads) {
+ BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[READ]));
+
+ if (writes && (dd->starved++ >= dd->writes_starved))
+ goto dispatch_writes;
+
+ data_dir = READ;
+
+ goto dispatch_find_request;
+ }
+
+ /*
+ * there are either no reads or writes have been starved
+ */
+
+ if (writes) {
+dispatch_writes:
+ BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[WRITE]));
+
+ dd->starved = 0;
+
+ data_dir = WRITE;
+
+ goto dispatch_find_request;
+ }
+
+ spin_unlock(&dd->lock);
+ return NULL;
+
+dispatch_find_request:
+ /*
+ * we are not running a batch, find best request for selected data_dir
+ */
+ if (deadline_check_fifo(dd, data_dir) || !dd->next_rq[data_dir]) {
+ /*
+ * A deadline has expired, the last request was in the other
+ * direction, or we have run out of higher-sectored requests.
+ * Start again from the request with the earliest expiry time.
+ */
+ rq = rq_entry_fifo(dd->fifo_list[data_dir].next);
+ } else {
+ /*
+ * The last req was the same dir and we have a next request in
+ * sort order. No expired requests so continue on from here.
+ */
+ rq = dd->next_rq[data_dir];
+ }
+
+ dd->batching = 0;
+
+dispatch_request:
+ /*
+ * rq is the selected appropriate request.
+ */
+ dd->batching++;
+ deadline_move_request(dd, rq);
+done:
+ spin_unlock(&dd->lock);
+ return rq;
+}
+
+static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx)
+{
+ return blk_mq_sched_request_from_shadow(hctx, __dd_dispatch_request);
+}
+
+static void dd_exit_queue(struct elevator_queue *e)
+{
+ struct deadline_data *dd = e->elevator_data;
+
+ BUG_ON(!list_empty(&dd->fifo_list[READ]));
+ BUG_ON(!list_empty(&dd->fifo_list[WRITE]));
+
+ blk_mq_sched_free_requests(dd->tags);
+ kfree(dd);
+}
+
+/*
+ * initialize elevator private data (deadline_data).
+ */
+static int dd_init_queue(struct request_queue *q, struct elevator_type *e)
+{
+ struct deadline_data *dd;
+ struct elevator_queue *eq;
+
+ eq = elevator_alloc(q, e);
+ if (!eq)
+ return -ENOMEM;
+
+ dd = kzalloc_node(sizeof(*dd), GFP_KERNEL, q->node);
+ if (!dd) {
+ kobject_put(&eq->kobj);
+ return -ENOMEM;
+ }
+ eq->elevator_data = dd;
+
+ dd->tags = blk_mq_sched_alloc_requests(256, q->node);
+ if (!dd->tags) {
+ kfree(dd);
+ kobject_put(&eq->kobj);
+ return -ENOMEM;
+ }
+
+ INIT_LIST_HEAD(&dd->fifo_list[READ]);
+ INIT_LIST_HEAD(&dd->fifo_list[WRITE]);
+ dd->sort_list[READ] = RB_ROOT;
+ dd->sort_list[WRITE] = RB_ROOT;
+ dd->fifo_expire[READ] = read_expire;
+ dd->fifo_expire[WRITE] = write_expire;
+ dd->writes_starved = writes_starved;
+ dd->front_merges = 1;
+ dd->fifo_batch = fifo_batch;
+ spin_lock_init(&dd->lock);
+ INIT_LIST_HEAD(&dd->dispatch);
+ atomic_set(&dd->wait_index, 0);
+
+ q->elevator = eq;
+ return 0;
+}
+
+static int __dd_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio,
+ struct request **req)
+{
+ struct request_queue *q = hctx->queue;
+ struct deadline_data *dd = q->elevator->elevator_data;
+ struct request *__rq;
+ sector_t sector;
+ int ret;
+
+ /*
+ * First try one-hit cache.
+ */
+ if (q->last_merge && elv_bio_merge_ok(q->last_merge, bio)) {
+ ret = blk_try_merge(q->last_merge, bio);
+ if (ret != ELEVATOR_NO_MERGE) {
+ *req = q->last_merge;
+ return ret;
+ }
+ }
+
+ if (blk_queue_noxmerges(q))
+ return ELEVATOR_NO_MERGE;
+
+ /*
+ * See if our hash lookup can find a potential backmerge.
+ */
+ __rq = elv_rqhash_find(q, bio->bi_iter.bi_sector);
+ if (__rq && elv_bio_merge_ok(__rq, bio)) {
+ *req = __rq;
+ return ELEVATOR_BACK_MERGE;
+ }
+
+ if (!dd->front_merges)
+ return ELEVATOR_NO_MERGE;
+
+ sector = bio_end_sector(bio);
+
+ __rq = elv_rb_find(&dd->sort_list[bio_data_dir(bio)], sector);
+ if (__rq) {
+ BUG_ON(sector != blk_rq_pos(__rq));
+
+ if (elv_bio_merge_ok(__rq, bio)) {
+ *req = __rq;
+ return ELEVATOR_FRONT_MERGE;
+ }
+ }
+
+ return ELEVATOR_NO_MERGE;
+}
+
+static bool dd_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio)
+{
+ struct request_queue *q = hctx->queue;
+ struct deadline_data *dd = q->elevator->elevator_data;
+ struct request *rq;
+ int ret;
+
+ spin_lock(&dd->lock);
+
+ ret = __dd_bio_merge(hctx, bio, &rq);
+
+ if (ret == ELEVATOR_BACK_MERGE) {
+ if (bio_attempt_back_merge(q, rq, bio)) {
+ if (!attempt_back_merge(q, rq)) {
+ q->last_merge = rq;
+ elv_rqhash_reposition(q, rq);
+ goto done;
+ }
+ }
+ ret = ELEVATOR_NO_MERGE;
+ } else if (ret == ELEVATOR_FRONT_MERGE) {
+ if (bio_attempt_front_merge(q, rq, bio)) {
+ if (!attempt_front_merge(q, rq)) {
+ q->last_merge = rq;
+ elv_rb_del(deadline_rb_root(dd, rq), rq);
+ deadline_add_rq_rb(dd, rq);
+ goto done;
+ }
+ }
+ ret = ELEVATOR_NO_MERGE;
+ }
+
+done:
+ spin_unlock(&dd->lock);
+ return ret != ELEVATOR_NO_MERGE;
+}
+
+/*
+ * add rq to rbtree and fifo
+ */
+static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
+ bool at_head)
+{
+ struct request_queue *q = hctx->queue;
+ struct deadline_data *dd = q->elevator->elevator_data;
+ const int data_dir = rq_data_dir(rq);
+
+ if (unlikely(flush_fua_bypass(rq->cmd_flags))) {
+ spin_lock(&hctx->lock);
+ list_add_tail(&rq->queuelist, &hctx->dispatch);
+ spin_unlock(&hctx->lock);
+ return;
+ }
+
+ spin_lock(&dd->lock);
+
+ if (at_head || rq->cmd_type != REQ_TYPE_FS) {
+ if (at_head)
+ list_add(&rq->queuelist, &dd->dispatch);
+ else
+ list_add_tail(&rq->queuelist, &dd->dispatch);
+ } else {
+ deadline_add_rq_rb(dd, rq);
+
+ if (rq_mergeable(rq)) {
+ elv_rqhash_add(q, rq);
+ if (!q->last_merge)
+ q->last_merge = rq;
+ }
+
+ /*
+ * set expire time and add to fifo list
+ */
+ rq->fifo_time = jiffies + dd->fifo_expire[data_dir];
+ list_add_tail(&rq->queuelist, &dd->fifo_list[data_dir]);
+ }
+
+ spin_unlock(&dd->lock);
+}
+
+static struct request *dd_get_request(struct request_queue *q, struct bio *bio,
+ struct blk_mq_alloc_data *data)
+{
+ struct deadline_data *dd = q->elevator->elevator_data;
+ struct request *rq;
+
+ if (unlikely(flush_fua_bypass(bio->bi_opf)))
+ rq = __blk_mq_alloc_request(data, bio->bi_opf);
+ else {
+ rq = blk_mq_sched_alloc_shadow_request(q, data, dd->tags, &dd->wait_index);
+ if (rq)
+ blk_mq_rq_ctx_init(q, data->ctx, rq, bio->bi_opf);
+ }
+
+ return rq;
+}
+
+static void dd_completed_request(struct blk_mq_hw_ctx *hctx, struct request *rq)
+{
+ struct request *sched_rq = rq->end_io_data;
+
+ /*
+ * sched_rq can be NULL, if we haven't setup the shadow yet
+ * because we failed getting one.
+ */
+ if (sched_rq) {
+ struct deadline_data *dd = hctx->queue->elevator->elevator_data;
+
+ blk_mq_sched_free_shadow_request(dd->tags, sched_rq);
+ blk_mq_start_stopped_hw_queue(hctx, true);
+ }
+}
+
+static bool dd_has_work(struct blk_mq_hw_ctx *hctx)
+{
+ struct deadline_data *dd = hctx->queue->elevator->elevator_data;
+
+ return !list_empty_careful(&dd->dispatch) ||
+ !list_empty_careful(&dd->fifo_list[0]) ||
+ !list_empty_careful(&dd->fifo_list[1]);
+}
+
+/*
+ * sysfs parts below
+ */
+static ssize_t
+deadline_var_show(int var, char *page)
+{
+ return sprintf(page, "%d\n", var);
+}
+
+static ssize_t
+deadline_var_store(int *var, const char *page, size_t count)
+{
+ char *p = (char *) page;
+
+ *var = simple_strtol(p, &p, 10);
+ return count;
+}
+
+#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \
+static ssize_t __FUNC(struct elevator_queue *e, char *page) \
+{ \
+ struct deadline_data *dd = e->elevator_data; \
+ int __data = __VAR; \
+ if (__CONV) \
+ __data = jiffies_to_msecs(__data); \
+ return deadline_var_show(__data, (page)); \
+}
+SHOW_FUNCTION(deadline_read_expire_show, dd->fifo_expire[READ], 1);
+SHOW_FUNCTION(deadline_write_expire_show, dd->fifo_expire[WRITE], 1);
+SHOW_FUNCTION(deadline_writes_starved_show, dd->writes_starved, 0);
+SHOW_FUNCTION(deadline_front_merges_show, dd->front_merges, 0);
+SHOW_FUNCTION(deadline_fifo_batch_show, dd->fifo_batch, 0);
+#undef SHOW_FUNCTION
+
+#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \
+static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count) \
+{ \
+ struct deadline_data *dd = e->elevator_data; \
+ int __data; \
+ int ret = deadline_var_store(&__data, (page), count); \
+ if (__data < (MIN)) \
+ __data = (MIN); \
+ else if (__data > (MAX)) \
+ __data = (MAX); \
+ if (__CONV) \
+ *(__PTR) = msecs_to_jiffies(__data); \
+ else \
+ *(__PTR) = __data; \
+ return ret; \
+}
+STORE_FUNCTION(deadline_read_expire_store, &dd->fifo_expire[READ], 0, INT_MAX, 1);
+STORE_FUNCTION(deadline_write_expire_store, &dd->fifo_expire[WRITE], 0, INT_MAX, 1);
+STORE_FUNCTION(deadline_writes_starved_store, &dd->writes_starved, INT_MIN, INT_MAX, 0);
+STORE_FUNCTION(deadline_front_merges_store, &dd->front_merges, 0, 1, 0);
+STORE_FUNCTION(deadline_fifo_batch_store, &dd->fifo_batch, 0, INT_MAX, 0);
+#undef STORE_FUNCTION
+
+#define DD_ATTR(name) \
+ __ATTR(name, S_IRUGO|S_IWUSR, deadline_##name##_show, \
+ deadline_##name##_store)
+
+static struct elv_fs_entry deadline_attrs[] = {
+ DD_ATTR(read_expire),
+ DD_ATTR(write_expire),
+ DD_ATTR(writes_starved),
+ DD_ATTR(front_merges),
+ DD_ATTR(fifo_batch),
+ __ATTR_NULL
+};
+
+static struct elevator_type mq_deadline = {
+ .mq_ops = {
+ .get_request = dd_get_request,
+ .insert_request = dd_insert_request,
+ .dispatch_request = dd_dispatch_request,
+ .completed_request = dd_completed_request,
+ .bio_merge = dd_bio_merge,
+ .has_work = dd_has_work,
+ .init_sched = dd_init_queue,
+ .exit_sched = dd_exit_queue,
+ },
+
+ .uses_mq = true,
+ .elevator_attrs = deadline_attrs,
+ .elevator_name = "mq-deadline",
+ .elevator_owner = THIS_MODULE,
+};
+
+static int __init deadline_init(void)
+{
+ return elv_register(&mq_deadline);
+}
+
+static void __exit deadline_exit(void)
+{
+ elv_unregister(&mq_deadline);
+}
+
+module_init(deadline_init);
+module_exit(deadline_exit);
+
+MODULE_AUTHOR("Jens Axboe");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("MQ deadline IO scheduler");
--
2.7.4

2016-12-07 23:11:02

[permalink] [raw]

Subject: [PATCH 6/7] blk-mq-sched: add framework for MQ capable IO schedulers

Signed-off-by: Jens Axboe <[email protected]>
---
block/Makefile | 2 +-
block/blk-core.c | 3 +-
block/blk-exec.c | 3 +-
block/blk-flush.c | 7 +--
block/blk-mq-tag.c | 1 +
block/blk-mq.c | 138 +++++++++++++++++++++++++----------------------
block/blk-mq.h | 32 +++++------
block/elevator.c | 132 ++++++++++++++++++++++++++++++++++-----------
include/linux/blk-mq.h | 2 +-
include/linux/elevator.h | 25 ++++++++-
10 files changed, 222 insertions(+), 123 deletions(-)

diff --git a/block/Makefile b/block/Makefile
index a827f988c4e6..2eee9e1bb6db 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -6,7 +6,7 @@ obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-tag.o blk-sysfs.o \
blk-flush.o blk-settings.o blk-ioc.o blk-map.o \
blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \
blk-lib.o blk-mq.o blk-mq-tag.o blk-stat.o \
- blk-mq-sysfs.o blk-mq-cpumap.o ioctl.o \
+ blk-mq-sysfs.o blk-mq-cpumap.o blk-mq-sched.o ioctl.o \
genhd.o scsi_ioctl.o partition-generic.o ioprio.o \
badblocks.o partitions/

diff --git a/block/blk-core.c b/block/blk-core.c
index 4b7ec5958055..a6163d9bb8a6 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -39,6 +39,7 @@

#include "blk.h"
#include "blk-mq.h"
+#include "blk-mq-sched.h"
#include "blk-wbt.h"

EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap);
@@ -2173,7 +2174,7 @@ int blk_insert_cloned_request(struct request_queue *q, struct request *rq)
if (q->mq_ops) {
if (blk_queue_io_stat(q))
blk_account_io_start(rq, true);
- blk_mq_insert_request(rq, false, true, false);
+ blk_mq_sched_insert_request(rq, false, true, false);
return 0;
}

diff --git a/block/blk-exec.c b/block/blk-exec.c
index 3ecb00a6cf45..86656fdfa637 100644
--- a/block/blk-exec.c
+++ b/block/blk-exec.c
@@ -9,6 +9,7 @@
#include <linux/sched/sysctl.h>

#include "blk.h"
+#include "blk-mq-sched.h"

/*
* for max sense size
@@ -65,7 +66,7 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk,
* be reused after dying flag is set
*/
if (q->mq_ops) {
- blk_mq_insert_request(rq, at_head, true, false);
+ blk_mq_sched_insert_request(rq, at_head, true, false);
return;
}

diff --git a/block/blk-flush.c b/block/blk-flush.c
index 27a42dab5a36..63b91697d167 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -74,6 +74,7 @@
#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-tag.h"
+#include "blk-mq-sched.h"

/* FLUSH/FUA sequences */
enum {
@@ -425,9 +426,9 @@ void blk_insert_flush(struct request *rq)
*/
if ((policy & REQ_FSEQ_DATA) &&
!(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) {
- if (q->mq_ops) {
- blk_mq_insert_request(rq, false, true, false);
- } else
+ if (q->mq_ops)
+ blk_mq_sched_insert_request(rq, false, true, false);
+ else
list_add_tail(&rq->queuelist, &q->queue_head);
return;
}
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index dcf5ce3ba4bf..bbd494e23d57 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -12,6 +12,7 @@
#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-tag.h"
+#include "blk-mq-sched.h"

bool blk_mq_has_free_tags(struct blk_mq_tags *tags)
{
diff --git a/block/blk-mq.c b/block/blk-mq.c
index abbf7cca4d0d..102ef2945dfc 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -32,6 +32,7 @@
#include "blk-mq-tag.h"
#include "blk-stat.h"
#include "blk-wbt.h"
+#include "blk-mq-sched.h"

static DEFINE_MUTEX(all_q_mutex);
static LIST_HEAD(all_q_list);
@@ -41,7 +42,8 @@ static LIST_HEAD(all_q_list);
*/
static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx)
{
- return sbitmap_any_bit_set(&hctx->ctx_map);
+ return sbitmap_any_bit_set(&hctx->ctx_map) ||
+ blk_mq_sched_has_work(hctx);
}

/*
@@ -167,8 +169,8 @@ bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx)
}
EXPORT_SYMBOL(blk_mq_can_queue);

-static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx,
- struct request *rq, unsigned int op)
+void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx,
+ struct request *rq, unsigned int op)
{
INIT_LIST_HEAD(&rq->queuelist);
/* csd/requeue_work/fifo_time is initialized before use */
@@ -214,8 +216,8 @@ static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx,
ctx->rq_dispatched[op_is_sync(op)]++;
}

-static struct request *
-__blk_mq_alloc_request(struct blk_mq_alloc_data *data, unsigned int op)
+struct request *__blk_mq_alloc_request(struct blk_mq_alloc_data *data,
+ unsigned int op)
{
struct request *rq;
unsigned int tag;
@@ -319,12 +321,14 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q, int rw,
}
EXPORT_SYMBOL_GPL(blk_mq_alloc_request_hctx);

-static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx,
- struct blk_mq_ctx *ctx, struct request *rq)
+void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
+ struct request *rq)
{
const int tag = rq->tag;
struct request_queue *q = rq->q;

+ blk_mq_sched_completed_request(hctx, rq);
+
if (rq->rq_flags & RQF_MQ_INFLIGHT)
atomic_dec(&hctx->nr_active);

@@ -467,6 +471,8 @@ void blk_mq_start_request(struct request *rq)
{
struct request_queue *q = rq->q;

+ blk_mq_sched_started_request(rq);
+
trace_block_rq_issue(q, rq);

rq->resid_len = blk_rq_bytes(rq);
@@ -515,6 +521,7 @@ static void __blk_mq_requeue_request(struct request *rq)

trace_block_rq_requeue(q, rq);
wbt_requeue(q->rq_wb, &rq->issue_stat);
+ blk_mq_sched_requeue_request(rq);

if (test_and_clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) {
if (q->dma_drain_size && blk_rq_bytes(rq))
@@ -549,13 +556,13 @@ static void blk_mq_requeue_work(struct work_struct *work)

rq->rq_flags &= ~RQF_SOFTBARRIER;
list_del_init(&rq->queuelist);
- blk_mq_insert_request(rq, true, false, false);
+ blk_mq_sched_insert_request(rq, true, false, false);
}

while (!list_empty(&rq_list)) {
rq = list_entry(rq_list.next, struct request, queuelist);
list_del_init(&rq->queuelist);
- blk_mq_insert_request(rq, false, false, false);
+ blk_mq_sched_insert_request(rq, false, false, false);
}

blk_mq_run_hw_queues(q, false);
@@ -761,8 +768,16 @@ static bool blk_mq_attempt_merge(struct request_queue *q,

if (!blk_rq_merge_ok(rq, bio))
continue;
+ if (!blk_mq_sched_allow_merge(q, rq, bio))
+ break;

el_ret = blk_try_merge(rq, bio);
+ if (el_ret == ELEVATOR_NO_MERGE)
+ continue;
+
+ if (!blk_mq_sched_allow_merge(q, rq, bio))
+ break;
+
if (el_ret == ELEVATOR_BACK_MERGE) {
if (bio_attempt_back_merge(q, rq, bio)) {
ctx->rq_merged++;
@@ -947,11 +962,17 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)

if (!(hctx->flags & BLK_MQ_F_BLOCKING)) {
rcu_read_lock();
- blk_mq_process_rq_list(hctx);
+ if (hctx->queue->elevator)
+ blk_mq_sched_dispatch_requests(hctx);
+ else
+ blk_mq_process_rq_list(hctx);
rcu_read_unlock();
} else {
srcu_idx = srcu_read_lock(&hctx->queue_rq_srcu);
- blk_mq_process_rq_list(hctx);
+ if (hctx->queue->elevator)
+ blk_mq_sched_dispatch_requests(hctx);
+ else
+ blk_mq_process_rq_list(hctx);
srcu_read_unlock(&hctx->queue_rq_srcu, srcu_idx);
}
}
@@ -1135,8 +1156,8 @@ static inline void __blk_mq_insert_req_list(struct blk_mq_hw_ctx *hctx,
list_add_tail(&rq->queuelist, &ctx->rq_list);
}

-static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx,
- struct request *rq, bool at_head)
+void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
+ bool at_head)
{
struct blk_mq_ctx *ctx = rq->mq_ctx;

@@ -1144,21 +1165,6 @@ static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx,
blk_mq_hctx_mark_pending(hctx, ctx);
}

-void blk_mq_insert_request(struct request *rq, bool at_head, bool run_queue,
- bool async)
-{
- struct blk_mq_ctx *ctx = rq->mq_ctx;
- struct request_queue *q = rq->q;
- struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
-
- spin_lock(&ctx->lock);
- __blk_mq_insert_request(hctx, rq, at_head);
- spin_unlock(&ctx->lock);
-
- if (run_queue)
- blk_mq_run_hw_queue(hctx, async);
-}
-
static void blk_mq_insert_requests(struct request_queue *q,
struct blk_mq_ctx *ctx,
struct list_head *list,
@@ -1174,17 +1180,14 @@ static void blk_mq_insert_requests(struct request_queue *q,
* preemption doesn't flush plug list, so it's possible ctx->cpu is
* offline now
*/
- spin_lock(&ctx->lock);
while (!list_empty(list)) {
struct request *rq;

rq = list_first_entry(list, struct request, queuelist);
BUG_ON(rq->mq_ctx != ctx);
list_del_init(&rq->queuelist);
- __blk_mq_insert_req_list(hctx, rq, false);
+ blk_mq_sched_insert_request(rq, false, false, false);
}
- blk_mq_hctx_mark_pending(hctx, ctx);
- spin_unlock(&ctx->lock);

blk_mq_run_hw_queue(hctx, from_schedule);
}
@@ -1285,41 +1288,27 @@ static inline bool blk_mq_merge_queue_io(struct blk_mq_hw_ctx *hctx,
}
}

-static struct request *blk_mq_map_request(struct request_queue *q,
- struct bio *bio,
- struct blk_mq_alloc_data *data)
-{
- struct blk_mq_hw_ctx *hctx;
- struct blk_mq_ctx *ctx;
- struct request *rq;
-
- blk_queue_enter_live(q);
- ctx = blk_mq_get_ctx(q);
- hctx = blk_mq_map_queue(q, ctx->cpu);
-
- trace_block_getrq(q, bio, bio->bi_opf);
- blk_mq_set_alloc_data(data, q, 0, ctx, hctx);
- rq = __blk_mq_alloc_request(data, bio->bi_opf);
-
- data->hctx->queued++;
- return rq;
-}
-
static void blk_mq_try_issue_directly(struct request *rq, blk_qc_t *cookie)
{
- int ret;
struct request_queue *q = rq->q;
- struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, rq->mq_ctx->cpu);
struct blk_mq_queue_data bd = {
.rq = rq,
.list = NULL,
.last = 1
};
- blk_qc_t new_cookie = blk_tag_to_qc_t(rq->tag, hctx->queue_num);
+ struct blk_mq_hw_ctx *hctx;
+ blk_qc_t new_cookie;
+ int ret;
+
+ if (q->elevator)
+ goto insert;

+ hctx = blk_mq_map_queue(q, rq->mq_ctx->cpu);
if (blk_mq_hctx_stopped(hctx))
goto insert;

+ new_cookie = blk_tag_to_qc_t(rq->tag, hctx->queue_num);
+
/*
* For OK queue, we are done. For error, kill it. Any other
* error (busy), just add it to our list as we previously
@@ -1341,7 +1330,7 @@ static void blk_mq_try_issue_directly(struct request *rq, blk_qc_t *cookie)
}

insert:
- blk_mq_insert_request(rq, false, true, true);
+ blk_mq_sched_insert_request(rq, false, true, true);
}

/*
@@ -1374,9 +1363,14 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
blk_attempt_plug_merge(q, bio, &request_count, &same_queue_rq))
return BLK_QC_T_NONE;

+ if (blk_mq_sched_bio_merge(q, bio))
+ return BLK_QC_T_NONE;
+
wb_acct = wbt_wait(q->rq_wb, bio, NULL);

- rq = blk_mq_map_request(q, bio, &data);
+ trace_block_getrq(q, bio, bio->bi_opf);
+
+ rq = blk_mq_sched_get_request(q, bio, &data);
if (unlikely(!rq)) {
__wbt_done(q->rq_wb, wb_acct);
return BLK_QC_T_NONE;
@@ -1438,6 +1432,12 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
goto done;
}

+ if (q->elevator) {
+ blk_mq_put_ctx(data.ctx);
+ blk_mq_bio_to_request(rq, bio);
+ blk_mq_sched_insert_request(rq, false, true, true);
+ goto done;
+ }
if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) {
/*
* For a SYNC request, send it to the hardware immediately. For
@@ -1483,9 +1483,14 @@ static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio)
} else
request_count = blk_plug_queued_count(q);

+ if (blk_mq_sched_bio_merge(q, bio))
+ return BLK_QC_T_NONE;
+
wb_acct = wbt_wait(q->rq_wb, bio, NULL);

- rq = blk_mq_map_request(q, bio, &data);
+ trace_block_getrq(q, bio, bio->bi_opf);
+
+ rq = blk_mq_sched_get_request(q, bio, &data);
if (unlikely(!rq)) {
__wbt_done(q->rq_wb, wb_acct);
return BLK_QC_T_NONE;
@@ -1535,6 +1540,12 @@ static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio)
return cookie;
}

+ if (q->elevator) {
+ blk_mq_put_ctx(data.ctx);
+ blk_mq_bio_to_request(rq, bio);
+ blk_mq_sched_insert_request(rq, false, true, true);
+ goto done;
+ }
if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) {
/*
* For a SYNC request, send it to the hardware immediately. For
@@ -1547,15 +1558,16 @@ static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio)
}

blk_mq_put_ctx(data.ctx);
+done:
return cookie;
}

-static void blk_mq_free_rq_map(struct blk_mq_tag_set *set,
- struct blk_mq_tags *tags, unsigned int hctx_idx)
+void blk_mq_free_rq_map(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
+ unsigned int hctx_idx)
{
struct page *page;

- if (tags->rqs && set->ops->exit_request) {
+ if (tags->rqs && set && set->ops->exit_request) {
int i;

for (i = 0; i < tags->nr_tags; i++) {
@@ -1588,8 +1600,8 @@ static size_t order_to_size(unsigned int order)
return (size_t)PAGE_SIZE << order;
}

-static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set,
- unsigned int hctx_idx)
+struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set,
+ unsigned int hctx_idx)
{
struct blk_mq_tags *tags;
unsigned int i, j, entries_per_page, max_order = 4;
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 3a54dd32a6fc..9284b1e0bccb 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -84,26 +84,6 @@ static inline void blk_mq_put_ctx(struct blk_mq_ctx *ctx)
put_cpu();
}

-struct blk_mq_alloc_data {
- /* input parameter */
- struct request_queue *q;
- unsigned int flags;
-
- /* input & output parameter */
- struct blk_mq_ctx *ctx;
- struct blk_mq_hw_ctx *hctx;
-};
-
-static inline void blk_mq_set_alloc_data(struct blk_mq_alloc_data *data,
- struct request_queue *q, unsigned int flags,
- struct blk_mq_ctx *ctx, struct blk_mq_hw_ctx *hctx)
-{
- data->q = q;
- data->flags = flags;
- data->ctx = ctx;
- data->hctx = hctx;
-}
-
static inline bool blk_mq_hctx_stopped(struct blk_mq_hw_ctx *hctx)
{
return test_bit(BLK_MQ_S_STOPPED, &hctx->state);
@@ -114,4 +94,16 @@ static inline bool blk_mq_hw_queue_mapped(struct blk_mq_hw_ctx *hctx)
return hctx->nr_ctx && hctx->tags;
}

+void blk_mq_free_rq_map(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
+ unsigned int hctx_idx);
+struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set,
+ unsigned int hctx_idx);
+void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx,
+ struct request *rq, unsigned int op);
+void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
+ struct request *rq);
+struct request *__blk_mq_alloc_request(struct blk_mq_alloc_data *data,
+ unsigned int op);
+void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
+ bool at_head);
#endif
diff --git a/block/elevator.c b/block/elevator.c
index 40f0c04e5ad3..d6d0f76bbe51 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -40,6 +40,7 @@
#include <trace/events/block.h>

#include "blk.h"
+#include "blk-mq-sched.h"

static DEFINE_SPINLOCK(elv_list_lock);
static LIST_HEAD(elv_list);
@@ -58,7 +59,9 @@ static int elv_iosched_allow_bio_merge(struct request *rq, struct bio *bio)
struct request_queue *q = rq->q;
struct elevator_queue *e = q->elevator;

- if (e->type->ops.elevator_allow_bio_merge_fn)
+ if (e->uses_mq && e->type->mq_ops.allow_merge)
+ return e->type->mq_ops.allow_merge(q, rq, bio);
+ else if (!e->uses_mq && e->type->ops.elevator_allow_bio_merge_fn)
return e->type->ops.elevator_allow_bio_merge_fn(q, rq, bio);

return 1;
@@ -224,7 +227,10 @@ int elevator_init(struct request_queue *q, char *name)
}
}

- err = e->ops.elevator_init_fn(q, e);
+ if (e->uses_mq)
+ err = e->mq_ops.init_sched(q, e);
+ else
+ err = e->ops.elevator_init_fn(q, e);
if (err)
elevator_put(e);
return err;
@@ -234,7 +240,9 @@ EXPORT_SYMBOL(elevator_init);
void elevator_exit(struct elevator_queue *e)
{
mutex_lock(&e->sysfs_lock);
- if (e->type->ops.elevator_exit_fn)
+ if (e->uses_mq && e->type->mq_ops.exit_sched)
+ e->type->mq_ops.exit_sched(e);
+ else if (!e->uses_mq && e->type->ops.elevator_exit_fn)
e->type->ops.elevator_exit_fn(e);
mutex_unlock(&e->sysfs_lock);

@@ -803,7 +811,8 @@ int elv_register_queue(struct request_queue *q)
}
kobject_uevent(&e->kobj, KOBJ_ADD);
e->registered = 1;
- if (e->type->ops.elevator_registered_fn)
+ e->uses_mq = q->mq_ops != NULL;
+ if (!e->uses_mq && e->type->ops.elevator_registered_fn)
e->type->ops.elevator_registered_fn(q);
}
return error;
@@ -891,9 +900,14 @@ EXPORT_SYMBOL_GPL(elv_unregister);
static int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
{
struct elevator_queue *old = q->elevator;
- bool registered = old->registered;
+ bool old_registered = false;
int err;

+ if (q->mq_ops) {
+ blk_mq_freeze_queue(q);
+ blk_mq_quiesce_queue(q);
+ }
+
/*
* Turn on BYPASS and drain all requests w/ elevator private data.
* Block layer doesn't call into a quiesced elevator - all requests
@@ -901,32 +915,54 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
* using INSERT_BACK. All requests have SOFTBARRIER set and no
* merge happens either.
*/
- blk_queue_bypass_start(q);
+ if (old) {
+ old_registered = old->registered;

- /* unregister and clear all auxiliary data of the old elevator */
- if (registered)
- elv_unregister_queue(q);
+ if (!q->mq_ops)
+ blk_queue_bypass_start(q);

- spin_lock_irq(q->queue_lock);
- ioc_clear_queue(q);
- spin_unlock_irq(q->queue_lock);
+ /* unregister and clear all auxiliary data of the old elevator */
+ if (old_registered)
+ elv_unregister_queue(q);
+
+ if (q->queue_lock) {
+ spin_lock_irq(q->queue_lock);
+ ioc_clear_queue(q);
+ spin_unlock_irq(q->queue_lock);
+ }
+ }

/* allocate, init and register new elevator */
- err = new_e->ops.elevator_init_fn(q, new_e);
- if (err)
- goto fail_init;
+ if (new_e) {
+ if (new_e->uses_mq)
+ err = new_e->mq_ops.init_sched(q, new_e);
+ else
+ err = new_e->ops.elevator_init_fn(q, new_e);
+ if (err)
+ goto fail_init;

- if (registered) {
err = elv_register_queue(q);
if (err)
goto fail_register;
- }
+ } else
+ q->elevator = NULL;

/* done, kill the old one and finish */
- elevator_exit(old);
- blk_queue_bypass_end(q);
+ if (old) {
+ elevator_exit(old);
+ if (!q->mq_ops)
+ blk_queue_bypass_end(q);
+ }
+
+ if (q->mq_ops) {
+ blk_mq_unfreeze_queue(q);
+ blk_mq_start_stopped_hw_queues(q, true);
+ }

- blk_add_trace_msg(q, "elv switch: %s", new_e->elevator_name);
+ if (new_e)
+ blk_add_trace_msg(q, "elv switch: %s", new_e->elevator_name);
+ else
+ blk_add_trace_msg(q, "elv switch: none");

return 0;

@@ -934,9 +970,16 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
elevator_exit(q->elevator);
fail_init:
/* switch failed, restore and re-register old elevator */
- q->elevator = old;
- elv_register_queue(q);
- blk_queue_bypass_end(q);
+ if (old) {
+ q->elevator = old;
+ elv_register_queue(q);
+ if (!q->mq_ops)
+ blk_queue_bypass_end(q);
+ }
+ if (q->mq_ops) {
+ blk_mq_unfreeze_queue(q);
+ blk_mq_start_stopped_hw_queues(q, true);
+ }

return err;
}
@@ -949,8 +992,11 @@ static int __elevator_change(struct request_queue *q, const char *name)
char elevator_name[ELV_NAME_MAX];
struct elevator_type *e;

- if (!q->elevator)
- return -ENXIO;
+ /*
+ * Special case for mq, turn off scheduling
+ */
+ if (q->mq_ops && !strncmp(name, "none", 4))
+ return elevator_switch(q, NULL);

strlcpy(elevator_name, name, sizeof(elevator_name));
e = elevator_get(strstrip(elevator_name), true);
@@ -959,11 +1005,23 @@ static int __elevator_change(struct request_queue *q, const char *name)
return -EINVAL;
}

- if (!strcmp(elevator_name, q->elevator->type->elevator_name)) {
+ if (q->elevator &&
+ !strcmp(elevator_name, q->elevator->type->elevator_name)) {
elevator_put(e);
return 0;
}

+ if (!e->uses_mq && q->mq_ops) {
+ printk(KERN_ERR "blk-mq-sched: elv %s does not support mq\n", elevator_name);
+ elevator_put(e);
+ return -EINVAL;
+ }
+ if (e->uses_mq && !q->mq_ops) {
+ printk(KERN_ERR "blk-mq-sched: elv %s is for mq\n", elevator_name);
+ elevator_put(e);
+ return -EINVAL;
+ }
+
return elevator_switch(q, e);
}

@@ -985,7 +1043,7 @@ ssize_t elv_iosched_store(struct request_queue *q, const char *name,
{
int ret;

- if (!q->elevator)
+ if (!q->mq_ops || q->request_fn)
return count;

ret = __elevator_change(q, name);
@@ -999,24 +1057,34 @@ ssize_t elv_iosched_store(struct request_queue *q, const char *name,
ssize_t elv_iosched_show(struct request_queue *q, char *name)
{
struct elevator_queue *e = q->elevator;
- struct elevator_type *elv;
+ struct elevator_type *elv = NULL;
struct elevator_type *__e;
int len = 0;

- if (!q->elevator || !blk_queue_stackable(q))
+ if (!blk_queue_stackable(q))
return sprintf(name, "none\n");

- elv = e->type;
+ if (!q->elevator)
+ len += sprintf(name+len, "[none] ");
+ else
+ elv = e->type;

spin_lock(&elv_list_lock);
list_for_each_entry(__e, &elv_list, list) {
- if (!strcmp(elv->elevator_name, __e->elevator_name))
+ if (elv && !strcmp(elv->elevator_name, __e->elevator_name)) {
len += sprintf(name+len, "[%s] ", elv->elevator_name);
- else
+ continue;
+ }
+ if (__e->uses_mq && q->mq_ops)
+ len += sprintf(name+len, "%s ", __e->elevator_name);
+ else if (!__e->uses_mq && !q->mq_ops)
len += sprintf(name+len, "%s ", __e->elevator_name);
}
spin_unlock(&elv_list_lock);

+ if (q->mq_ops && q->elevator)
+ len += sprintf(name+len, "none");
+
len += sprintf(len+name, "\n");
return len;
}
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 87e404aae267..c86b314dde97 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -22,6 +22,7 @@ struct blk_mq_hw_ctx {

unsigned long flags; /* BLK_MQ_F_* flags */

+ void *sched_data;
struct request_queue *queue;
struct blk_flush_queue *fq;

@@ -179,7 +180,6 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set);

void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule);

-void blk_mq_insert_request(struct request *, bool, bool, bool);
void blk_mq_free_request(struct request *rq);
void blk_mq_free_hctx_request(struct blk_mq_hw_ctx *, struct request *rq);
bool blk_mq_can_queue(struct blk_mq_hw_ctx *);
diff --git a/include/linux/elevator.h b/include/linux/elevator.h
index b276e9ef0e0b..b270b1f75767 100644
--- a/include/linux/elevator.h
+++ b/include/linux/elevator.h
@@ -77,6 +77,24 @@ struct elevator_ops
elevator_registered_fn *elevator_registered_fn;
};

+struct blk_mq_alloc_data;
+struct blk_mq_hw_ctx;
+
+struct elevator_mq_ops {
+ int (*init_sched)(struct request_queue *, struct elevator_type *);
+ void (*exit_sched)(struct elevator_queue *);
+
+ bool (*allow_merge)(struct request_queue *, struct request *, struct bio *);
+ bool (*bio_merge)(struct blk_mq_hw_ctx *, struct bio *);
+ struct request *(*get_request)(struct request_queue *, struct bio *, struct blk_mq_alloc_data *);
+ void (*insert_request)(struct blk_mq_hw_ctx *, struct request *, bool);
+ struct request *(*dispatch_request)(struct blk_mq_hw_ctx *);
+ bool (*has_work)(struct blk_mq_hw_ctx *);
+ void (*completed_request)(struct blk_mq_hw_ctx *, struct request *);
+ void (*started_request)(struct request *);
+ void (*requeue_request)(struct request *);
+};
+
#define ELV_NAME_MAX (16)

struct elv_fs_entry {
@@ -94,12 +112,16 @@ struct elevator_type
struct kmem_cache *icq_cache;

/* fields provided by elevator implementation */
- struct elevator_ops ops;
+ union {
+ struct elevator_ops ops;
+ struct elevator_mq_ops mq_ops;
+ };
size_t icq_size; /* see iocontext.h */
size_t icq_align; /* ditto */
struct elv_fs_entry *elevator_attrs;
char elevator_name[ELV_NAME_MAX];
struct module *elevator_owner;
+ bool uses_mq;

/* managed by elevator core */
char icq_cache_name[ELV_NAME_MAX + 5]; /* elvname + "_io_cq" */
@@ -123,6 +145,7 @@ struct elevator_queue
struct kobject kobj;
struct mutex sysfs_lock;
unsigned int registered:1;
+ unsigned int uses_mq:1;
DECLARE_HASHTABLE(hash, ELV_HASH_BITS);
};

--
2.7.4

2016-12-07 23:11:29

[permalink] [raw]

Subject: [PATCH 3/7] elevator: make the rqhash helpers exported

Signed-off-by: Jens Axboe <[email protected]>
---
block/elevator.c | 8 ++++----
include/linux/elevator.h | 5 +++++
2 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/block/elevator.c b/block/elevator.c
index a18a5db274e4..40f0c04e5ad3 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -248,13 +248,13 @@ static inline void __elv_rqhash_del(struct request *rq)
rq->rq_flags &= ~RQF_HASHED;
}

-static void elv_rqhash_del(struct request_queue *q, struct request *rq)
+void elv_rqhash_del(struct request_queue *q, struct request *rq)
{
if (ELV_ON_HASH(rq))
__elv_rqhash_del(rq);
}

-static void elv_rqhash_add(struct request_queue *q, struct request *rq)
+void elv_rqhash_add(struct request_queue *q, struct request *rq)
{
struct elevator_queue *e = q->elevator;

@@ -263,13 +263,13 @@ static void elv_rqhash_add(struct request_queue *q, struct request *rq)
rq->rq_flags |= RQF_HASHED;
}

-static void elv_rqhash_reposition(struct request_queue *q, struct request *rq)
+void elv_rqhash_reposition(struct request_queue *q, struct request *rq)
{
__elv_rqhash_del(rq);
elv_rqhash_add(q, rq);
}

-static struct request *elv_rqhash_find(struct request_queue *q, sector_t offset)
+struct request *elv_rqhash_find(struct request_queue *q, sector_t offset)
{
struct elevator_queue *e = q->elevator;
struct hlist_node *next;
diff --git a/include/linux/elevator.h b/include/linux/elevator.h
index f219c9aed360..b276e9ef0e0b 100644
--- a/include/linux/elevator.h
+++ b/include/linux/elevator.h
@@ -108,6 +108,11 @@ struct elevator_type

#define ELV_HASH_BITS 6

+void elv_rqhash_del(struct request_queue *q, struct request *rq);
+void elv_rqhash_add(struct request_queue *q, struct request *rq);
+void elv_rqhash_reposition(struct request_queue *q, struct request *rq);
+struct request *elv_rqhash_find(struct request_queue *q, sector_t offset);
+
/*
* each queue has an elevator_queue associated with it
*/
--
2.7.4

2016-12-07 23:11:44

[permalink] [raw]

Subject: [PATCH 4/7] blk-flush: run the queue when inserting blk-mq flush

Currently we pass in to run the queue async, but don't flag the
queue to be run. We don't need to run it async here, but we should
run it. So fixup the parameters.

Signed-off-by: Jens Axboe <[email protected]>
---
block/blk-flush.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/block/blk-flush.c b/block/blk-flush.c
index 1bdbb3d3e5f5..27a42dab5a36 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -426,7 +426,7 @@ void blk_insert_flush(struct request *rq)
if ((policy & REQ_FSEQ_DATA) &&
!(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) {
if (q->mq_ops) {
- blk_mq_insert_request(rq, false, false, true);
+ blk_mq_insert_request(rq, false, true, false);
} else
list_add_tail(&rq->queuelist, &q->queue_head);
return;
--
2.7.4

2016-12-07 23:10:09

[permalink] [raw]

Subject: [PATCH 1/7] blk-mq: add blk_mq_start_stopped_hw_queue()

We have a variant for all hardware queues, but not one for a single
hardware queue.

Signed-off-by: Jens Axboe <[email protected]>
---
block/blk-mq.c | 18 +++++++++++-------
include/linux/blk-mq.h | 1 +
2 files changed, 12 insertions(+), 7 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 90db5b490df9..b216746be9d3 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1064,18 +1064,22 @@ void blk_mq_start_hw_queues(struct request_queue *q)
}
EXPORT_SYMBOL(blk_mq_start_hw_queues);

+void blk_mq_start_stopped_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
+{
+ if (!blk_mq_hctx_stopped(hctx))
+ return;
+
+ clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
+ blk_mq_run_hw_queue(hctx, async);
+}
+
void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async)
{
struct blk_mq_hw_ctx *hctx;
int i;

- queue_for_each_hw_ctx(q, hctx, i) {
- if (!blk_mq_hctx_stopped(hctx))
- continue;
-
- clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
- blk_mq_run_hw_queue(hctx, async);
- }
+ queue_for_each_hw_ctx(q, hctx, i)
+ blk_mq_start_stopped_hw_queue(hctx, async);
}
EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues);

diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 35a0af5ede6d..87e404aae267 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -231,6 +231,7 @@ void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx);
void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx);
void blk_mq_stop_hw_queues(struct request_queue *q);
void blk_mq_start_hw_queues(struct request_queue *q);
+void blk_mq_start_stopped_hw_queue(struct blk_mq_hw_ctx *hctx, bool async);
void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async);
void blk_mq_run_hw_queues(struct request_queue *q, bool async);
void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs);
--
2.7.4