2014-04-14 08:28:07

by Christoph Hellwig

[permalink] [raw]
Subject:

This is the majority of the blk-mq work still required for switching
over SCSI. There are a few more bits for I/O completion and requeueing
pending, but they will need further work.


2014-04-14 08:28:21

by Christoph Hellwig

[permalink] [raw]
Subject: [PATCH 6/7] blk-mq: split out tag initialization, support shared tags

Add a new blk_mq_tag_set structure that gets set up before we initialize
the queue. A single blk_mq_tag_set structure can be shared by multiple
queues.

Signed-off-by: Christoph Hellwig <[email protected]>
---
block/blk-mq-cpumap.c | 6 +-
block/blk-mq-tag.c | 14 ---
block/blk-mq-tag.h | 19 +++-
block/blk-mq.c | 242 ++++++++++++++++++++++++--------------------
block/blk-mq.h | 5 +-
drivers/block/null_blk.c | 92 ++++++++++-------
drivers/block/virtio_blk.c | 48 +++++----
include/linux/blk-mq.h | 34 +++----
8 files changed, 260 insertions(+), 200 deletions(-)

diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c
index 0979213..5d0f93c 100644
--- a/block/blk-mq-cpumap.c
+++ b/block/blk-mq-cpumap.c
@@ -80,17 +80,17 @@ int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues)
return 0;
}

-unsigned int *blk_mq_make_queue_map(struct blk_mq_reg *reg)
+unsigned int *blk_mq_make_queue_map(struct blk_mq_tag_set *set)
{
unsigned int *map;

/* If cpus are offline, map them to first hctx */
map = kzalloc_node(sizeof(*map) * num_possible_cpus(), GFP_KERNEL,
- reg->numa_node);
+ set->numa_node);
if (!map)
return NULL;

- if (!blk_mq_update_queue_map(map, reg->nr_hw_queues))
+ if (!blk_mq_update_queue_map(map, set->nr_hw_queues))
return map;

kfree(map);
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 83ae96c..7a799c4 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -1,25 +1,11 @@
#include <linux/kernel.h>
#include <linux/module.h>
-#include <linux/percpu_ida.h>

#include <linux/blk-mq.h>
#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-tag.h"

-/*
- * Per tagged queue (tag address space) map
- */
-struct blk_mq_tags {
- unsigned int nr_tags;
- unsigned int nr_reserved_tags;
- unsigned int nr_batch_move;
- unsigned int nr_max_cache;
-
- struct percpu_ida free_tags;
- struct percpu_ida reserved_tags;
-};
-
void blk_mq_wait_for_tags(struct blk_mq_tags *tags)
{
int tag = blk_mq_get_tag(tags, __GFP_WAIT, false);
diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h
index 947ba2c..b602e3f 100644
--- a/block/blk-mq-tag.h
+++ b/block/blk-mq-tag.h
@@ -1,7 +1,24 @@
#ifndef INT_BLK_MQ_TAG_H
#define INT_BLK_MQ_TAG_H

-struct blk_mq_tags;
+#include <linux/percpu_ida.h>
+
+/*
+ * Tag address space map.
+ */
+struct blk_mq_tags {
+ unsigned int nr_tags;
+ unsigned int nr_reserved_tags;
+ unsigned int nr_batch_move;
+ unsigned int nr_max_cache;
+
+ struct percpu_ida free_tags;
+ struct percpu_ida reserved_tags;
+
+ struct request **rqs;
+ struct list_head page_list;
+};
+

extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags, unsigned int reserved_tags, int node);
extern void blk_mq_free_tags(struct blk_mq_tags *tags);
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 1cb7618..c3b1810 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -81,7 +81,7 @@ static struct request *__blk_mq_alloc_request(struct blk_mq_hw_ctx *hctx,

tag = blk_mq_get_tag(hctx->tags, gfp, reserved);
if (tag != BLK_MQ_TAG_FAIL) {
- rq = hctx->rqs[tag];
+ rq = hctx->tags->rqs[tag];
blk_rq_init(hctx->queue, rq);
rq->tag = tag;

@@ -401,6 +401,12 @@ static void blk_mq_requeue_request(struct request *rq)
rq->nr_phys_segments--;
}

+struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag)
+{
+ return tags->rqs[tag];
+}
+EXPORT_SYMBOL(blk_mq_tag_to_rq);
+
struct blk_mq_timeout_data {
struct blk_mq_hw_ctx *hctx;
unsigned long *next;
@@ -422,12 +428,13 @@ static void blk_mq_timeout_check(void *__data, unsigned long *free_tags)
do {
struct request *rq;

- tag = find_next_zero_bit(free_tags, hctx->queue_depth, tag);
- if (tag >= hctx->queue_depth)
+ tag = find_next_zero_bit(free_tags, hctx->tags->nr_tags, tag);
+ if (tag >= hctx->tags->nr_tags)
break;

- rq = hctx->rqs[tag++];
-
+ rq = blk_mq_tag_to_rq(hctx->tags, tag++);
+ if (rq->q != hctx->queue)
+ continue;
if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
continue;

@@ -947,11 +954,11 @@ struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q, const int cpu)
}
EXPORT_SYMBOL(blk_mq_map_queue);

-struct blk_mq_hw_ctx *blk_mq_alloc_single_hw_queue(struct blk_mq_reg *reg,
+struct blk_mq_hw_ctx *blk_mq_alloc_single_hw_queue(struct blk_mq_tag_set *set,
unsigned int hctx_index)
{
return kmalloc_node(sizeof(struct blk_mq_hw_ctx),
- GFP_KERNEL | __GFP_ZERO, reg->numa_node);
+ GFP_KERNEL | __GFP_ZERO, set->numa_node);
}
EXPORT_SYMBOL(blk_mq_alloc_single_hw_queue);

@@ -1008,31 +1015,31 @@ static void blk_mq_hctx_notify(void *data, unsigned long action,
blk_mq_run_hw_queue(hctx, true);
}

-static void blk_mq_free_rq_map(struct blk_mq_hw_ctx *hctx, void *driver_data)
+static void blk_mq_free_rq_map(struct blk_mq_tag_set *set,
+ struct blk_mq_tags *tags, unsigned int hctx_idx)
{
struct page *page;

- if (hctx->rqs && hctx->queue->mq_ops->exit_request) {
+ if (tags->rqs && set->ops->exit_request) {
int i;

- for (i = 0; i < hctx->queue_depth; i++) {
- if (!hctx->rqs[i])
+ for (i = 0; i < tags->nr_tags; i++) {
+ if (!tags->rqs[i])
continue;
- hctx->queue->mq_ops->exit_request(driver_data, hctx,
- hctx->rqs[i], i);
+ set->ops->exit_request(set->driver_data, tags->rqs[i],
+ hctx_idx, i);
}
}

- while (!list_empty(&hctx->page_list)) {
- page = list_first_entry(&hctx->page_list, struct page, lru);
+ while (!list_empty(&tags->page_list)) {
+ page = list_first_entry(&tags->page_list, struct page, lru);
list_del_init(&page->lru);
__free_pages(page, page->private);
}

- kfree(hctx->rqs);
+ kfree(tags->rqs);

- if (hctx->tags)
- blk_mq_free_tags(hctx->tags);
+ blk_mq_free_tags(tags);
}

static size_t order_to_size(unsigned int order)
@@ -1045,30 +1052,36 @@ static size_t order_to_size(unsigned int order)
return ret;
}

-static int blk_mq_init_rq_map(struct blk_mq_hw_ctx *hctx,
- struct blk_mq_reg *reg, void *driver_data, int node)
+static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set,
+ unsigned int hctx_idx)
{
- unsigned int reserved_tags = reg->reserved_tags;
+ struct blk_mq_tags *tags;
unsigned int i, j, entries_per_page, max_order = 4;
size_t rq_size, left;
- int error;

- INIT_LIST_HEAD(&hctx->page_list);
+ tags = blk_mq_init_tags(set->queue_depth, set->reserved_tags,
+ set->numa_node);
+ if (!tags)
+ return NULL;

- hctx->rqs = kmalloc_node(hctx->queue_depth * sizeof(struct request *),
- GFP_KERNEL, node);
- if (!hctx->rqs)
- return -ENOMEM;
+ INIT_LIST_HEAD(&tags->page_list);
+
+ tags->rqs = kmalloc_node(set->queue_depth * sizeof(struct request *),
+ GFP_KERNEL, set->numa_node);
+ if (!tags->rqs) {
+ blk_mq_free_tags(tags);
+ return NULL;
+ }

/*
* rq_size is the size of the request plus driver payload, rounded
* to the cacheline size
*/
- rq_size = round_up(sizeof(struct request) + hctx->cmd_size,
+ rq_size = round_up(sizeof(struct request) + set->cmd_size,
cache_line_size());
- left = rq_size * hctx->queue_depth;
+ left = rq_size * set->queue_depth;

- for (i = 0; i < hctx->queue_depth;) {
+ for (i = 0; i < set->queue_depth; ) {
int this_order = max_order;
struct page *page;
int to_do;
@@ -1078,7 +1091,8 @@ static int blk_mq_init_rq_map(struct blk_mq_hw_ctx *hctx,
this_order--;

do {
- page = alloc_pages_node(node, GFP_KERNEL, this_order);
+ page = alloc_pages_node(set->numa_node, GFP_KERNEL,
+ this_order);
if (page)
break;
if (!this_order--)
@@ -1088,22 +1102,22 @@ static int blk_mq_init_rq_map(struct blk_mq_hw_ctx *hctx,
} while (1);

if (!page)
- break;
+ goto fail;

page->private = this_order;
- list_add_tail(&page->lru, &hctx->page_list);
+ list_add_tail(&page->lru, &tags->page_list);

p = page_address(page);
entries_per_page = order_to_size(this_order) / rq_size;
- to_do = min(entries_per_page, hctx->queue_depth - i);
+ to_do = min(entries_per_page, set->queue_depth - i);
left -= to_do * rq_size;
for (j = 0; j < to_do; j++) {
- hctx->rqs[i] = p;
- if (reg->ops->init_request) {
- error = reg->ops->init_request(driver_data,
- hctx, hctx->rqs[i], i);
- if (error)
- goto err_rq_map;
+ tags->rqs[i] = p;
+ if (set->ops->init_request) {
+ if (set->ops->init_request(set->driver_data,
+ tags->rqs[i], hctx_idx, i,
+ set->numa_node))
+ goto fail;
}

p += rq_size;
@@ -1111,30 +1125,16 @@ static int blk_mq_init_rq_map(struct blk_mq_hw_ctx *hctx,
}
}

- if (i < (reserved_tags + BLK_MQ_TAG_MIN)) {
- error = -ENOMEM;
- goto err_rq_map;
- }
- if (i != hctx->queue_depth) {
- hctx->queue_depth = i;
- pr_warn("%s: queue depth set to %u because of low memory\n",
- __func__, i);
- }
+ return tags;

- hctx->tags = blk_mq_init_tags(hctx->queue_depth, reserved_tags, node);
- if (!hctx->tags) {
- error = -ENOMEM;
- goto err_rq_map;
- }
-
- return 0;
-err_rq_map:
- blk_mq_free_rq_map(hctx, driver_data);
- return error;
+fail:
+ pr_warn("%s: failed to allocate requests\n", __func__);
+ blk_mq_free_rq_map(set, tags, hctx_idx);
+ return NULL;
}

static int blk_mq_init_hw_queues(struct request_queue *q,
- struct blk_mq_reg *reg, void *driver_data)
+ struct blk_mq_tag_set *set)
{
struct blk_mq_hw_ctx *hctx;
unsigned int i, j;
@@ -1148,23 +1148,21 @@ static int blk_mq_init_hw_queues(struct request_queue *q,

node = hctx->numa_node;
if (node == NUMA_NO_NODE)
- node = hctx->numa_node = reg->numa_node;
+ node = hctx->numa_node = set->numa_node;

INIT_DELAYED_WORK(&hctx->delayed_work, blk_mq_work_fn);
spin_lock_init(&hctx->lock);
INIT_LIST_HEAD(&hctx->dispatch);
hctx->queue = q;
hctx->queue_num = i;
- hctx->flags = reg->flags;
- hctx->queue_depth = reg->queue_depth;
- hctx->cmd_size = reg->cmd_size;
+ hctx->flags = set->flags;
+ hctx->cmd_size = set->cmd_size;

blk_mq_init_cpu_notifier(&hctx->cpu_notifier,
blk_mq_hctx_notify, hctx);
blk_mq_register_cpu_notifier(&hctx->cpu_notifier);

- if (blk_mq_init_rq_map(hctx, reg, driver_data, node))
- break;
+ hctx->tags = set->tags[i];

/*
* Allocate space for all possible cpus to avoid allocation in
@@ -1184,8 +1182,8 @@ static int blk_mq_init_hw_queues(struct request_queue *q,
hctx->nr_ctx_map = num_maps;
hctx->nr_ctx = 0;

- if (reg->ops->init_hctx &&
- reg->ops->init_hctx(hctx, driver_data, i))
+ if (set->ops->init_hctx &&
+ set->ops->init_hctx(hctx, set->driver_data, i))
break;
}

@@ -1199,11 +1197,10 @@ static int blk_mq_init_hw_queues(struct request_queue *q,
if (i == j)
break;

- if (reg->ops->exit_hctx)
- reg->ops->exit_hctx(hctx, j);
+ if (set->ops->exit_hctx)
+ set->ops->exit_hctx(hctx, j);

blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier);
- blk_mq_free_rq_map(hctx, driver_data);
kfree(hctx->ctxs);
}

@@ -1262,41 +1259,25 @@ static void blk_mq_map_swqueue(struct request_queue *q)
}
}

-struct request_queue *blk_mq_init_queue(struct blk_mq_reg *reg,
- void *driver_data)
+struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
{
struct blk_mq_hw_ctx **hctxs;
struct blk_mq_ctx *ctx;
struct request_queue *q;
int i;

- if (!reg->nr_hw_queues ||
- !reg->ops->queue_rq || !reg->ops->map_queue ||
- !reg->ops->alloc_hctx || !reg->ops->free_hctx)
- return ERR_PTR(-EINVAL);
-
- if (!reg->queue_depth)
- reg->queue_depth = BLK_MQ_MAX_DEPTH;
- else if (reg->queue_depth > BLK_MQ_MAX_DEPTH) {
- pr_err("blk-mq: queuedepth too large (%u)\n", reg->queue_depth);
- reg->queue_depth = BLK_MQ_MAX_DEPTH;
- }
-
- if (reg->queue_depth < (reg->reserved_tags + BLK_MQ_TAG_MIN))
- return ERR_PTR(-EINVAL);
-
ctx = alloc_percpu(struct blk_mq_ctx);
if (!ctx)
return ERR_PTR(-ENOMEM);

- hctxs = kmalloc_node(reg->nr_hw_queues * sizeof(*hctxs), GFP_KERNEL,
- reg->numa_node);
+ hctxs = kmalloc_node(set->nr_hw_queues * sizeof(*hctxs), GFP_KERNEL,
+ set->numa_node);

if (!hctxs)
goto err_percpu;

- for (i = 0; i < reg->nr_hw_queues; i++) {
- hctxs[i] = reg->ops->alloc_hctx(reg, i);
+ for (i = 0; i < set->nr_hw_queues; i++) {
+ hctxs[i] = set->ops->alloc_hctx(set, i);
if (!hctxs[i])
goto err_hctxs;

@@ -1304,11 +1285,11 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_reg *reg,
hctxs[i]->queue_num = i;
}

- q = blk_alloc_queue_node(GFP_KERNEL, reg->numa_node);
+ q = blk_alloc_queue_node(GFP_KERNEL, set->numa_node);
if (!q)
goto err_hctxs;

- q->mq_map = blk_mq_make_queue_map(reg);
+ q->mq_map = blk_mq_make_queue_map(set);
if (!q->mq_map)
goto err_map;

@@ -1316,33 +1297,34 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_reg *reg,
blk_queue_rq_timeout(q, 30000);

q->nr_queues = nr_cpu_ids;
- q->nr_hw_queues = reg->nr_hw_queues;
+ q->nr_hw_queues = set->nr_hw_queues;

q->queue_ctx = ctx;
q->queue_hw_ctx = hctxs;

- q->mq_ops = reg->ops;
+ q->mq_ops = set->ops;
q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT;

q->sg_reserved_size = INT_MAX;

blk_queue_make_request(q, blk_mq_make_request);
- blk_queue_rq_timed_out(q, reg->ops->timeout);
- if (reg->timeout)
- blk_queue_rq_timeout(q, reg->timeout);
+ blk_queue_rq_timed_out(q, set->ops->timeout);
+ if (set->timeout)
+ blk_queue_rq_timeout(q, set->timeout);

- if (reg->ops->complete)
- blk_queue_softirq_done(q, reg->ops->complete);
+ if (set->ops->complete)
+ blk_queue_softirq_done(q, set->ops->complete);

blk_mq_init_flush(q);
- blk_mq_init_cpu_queues(q, reg->nr_hw_queues);
+ blk_mq_init_cpu_queues(q, set->nr_hw_queues);

- q->flush_rq = kzalloc(round_up(sizeof(struct request) + reg->cmd_size,
- cache_line_size()), GFP_KERNEL);
+ q->flush_rq = kzalloc(round_up(sizeof(struct request) +
+ set->cmd_size, cache_line_size()),
+ GFP_KERNEL);
if (!q->flush_rq)
goto err_hw;

- if (blk_mq_init_hw_queues(q, reg, driver_data))
+ if (blk_mq_init_hw_queues(q, set))
goto err_flush_rq;

blk_mq_map_swqueue(q);
@@ -1360,10 +1342,10 @@ err_hw:
err_map:
blk_cleanup_queue(q);
err_hctxs:
- for (i = 0; i < reg->nr_hw_queues; i++) {
+ for (i = 0; i < set->nr_hw_queues; i++) {
if (!hctxs[i])
break;
- reg->ops->free_hctx(hctxs[i], i);
+ set->ops->free_hctx(hctxs[i], i);
}
kfree(hctxs);
err_percpu:
@@ -1380,7 +1362,6 @@ void blk_mq_free_queue(struct request_queue *q)
queue_for_each_hw_ctx(q, hctx, i) {
kfree(hctx->ctx_map);
kfree(hctx->ctxs);
- blk_mq_free_rq_map(hctx, q->queuedata);
blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier);
if (q->mq_ops->exit_hctx)
q->mq_ops->exit_hctx(hctx, i);
@@ -1440,6 +1421,51 @@ static int blk_mq_queue_reinit_notify(struct notifier_block *nb,
return NOTIFY_OK;
}

+int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
+{
+ int i;
+
+ if (!set->nr_hw_queues)
+ return -EINVAL;
+ if (!set->queue_depth || set->queue_depth > BLK_MQ_MAX_DEPTH)
+ return -EINVAL;
+ if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN)
+ return -EINVAL;
+
+ if (!set->nr_hw_queues ||
+ !set->ops->queue_rq || !set->ops->map_queue ||
+ !set->ops->alloc_hctx || !set->ops->free_hctx)
+ return -EINVAL;
+
+
+ set->tags = kmalloc_node(set->nr_hw_queues * sizeof(struct blk_mq_tags),
+ GFP_KERNEL, set->numa_node);
+ if (!set->tags)
+ goto out;
+
+ for (i = 0; i < set->nr_hw_queues; i++) {
+ set->tags[i] = blk_mq_init_rq_map(set, i);
+ if (!set->tags[i])
+ goto out_unwind;
+ }
+
+ return 0;
+
+out_unwind:
+ while (--i >= 0)
+ blk_mq_free_rq_map(set, set->tags[i], i);
+out:
+ return -ENOMEM;
+}
+
+void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
+{
+ int i;
+
+ for (i = 0; i < set->nr_hw_queues; i++)
+ blk_mq_free_rq_map(set, set->tags[i], i);
+}
+
void blk_mq_disable_hotplug(void)
{
mutex_lock(&all_q_mutex);
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 7964dad..5fa14f1 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -1,6 +1,8 @@
#ifndef INT_BLK_MQ_H
#define INT_BLK_MQ_H

+struct blk_mq_tag_set;
+
struct blk_mq_ctx {
struct {
spinlock_t lock;
@@ -46,8 +48,7 @@ void blk_mq_disable_hotplug(void);
/*
* CPU -> queue mappings
*/
-struct blk_mq_reg;
-extern unsigned int *blk_mq_make_queue_map(struct blk_mq_reg *reg);
+extern unsigned int *blk_mq_make_queue_map(struct blk_mq_tag_set *set);
extern int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues);

void blk_mq_add_timer(struct request *rq);
diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c
index 71df69d..8e7e3a0 100644
--- a/drivers/block/null_blk.c
+++ b/drivers/block/null_blk.c
@@ -32,6 +32,7 @@ struct nullb {
unsigned int index;
struct request_queue *q;
struct gendisk *disk;
+ struct blk_mq_tag_set tag_set;
struct hrtimer timer;
unsigned int queue_depth;
spinlock_t lock;
@@ -320,10 +321,11 @@ static int null_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq)
return BLK_MQ_RQ_QUEUE_OK;
}

-static struct blk_mq_hw_ctx *null_alloc_hctx(struct blk_mq_reg *reg, unsigned int hctx_index)
+static struct blk_mq_hw_ctx *null_alloc_hctx(struct blk_mq_tag_set *set,
+ unsigned int hctx_index)
{
- int b_size = DIV_ROUND_UP(reg->nr_hw_queues, nr_online_nodes);
- int tip = (reg->nr_hw_queues % nr_online_nodes);
+ int b_size = DIV_ROUND_UP(set->nr_hw_queues, nr_online_nodes);
+ int tip = (set->nr_hw_queues % nr_online_nodes);
int node = 0, i, n;

/*
@@ -338,7 +340,7 @@ static struct blk_mq_hw_ctx *null_alloc_hctx(struct blk_mq_reg *reg, unsigned in

tip--;
if (!tip)
- b_size = reg->nr_hw_queues / nr_online_nodes;
+ b_size = set->nr_hw_queues / nr_online_nodes;
}
}

@@ -387,13 +389,17 @@ static struct blk_mq_ops null_mq_ops = {
.map_queue = blk_mq_map_queue,
.init_hctx = null_init_hctx,
.complete = null_softirq_done_fn,
+ .alloc_hctx = blk_mq_alloc_single_hw_queue,
+ .free_hctx = blk_mq_free_single_hw_queue,
};

-static struct blk_mq_reg null_mq_reg = {
- .ops = &null_mq_ops,
- .queue_depth = 64,
- .cmd_size = sizeof(struct nullb_cmd),
- .flags = BLK_MQ_F_SHOULD_MERGE,
+static struct blk_mq_ops null_mq_ops_pernode = {
+ .queue_rq = null_queue_rq,
+ .map_queue = blk_mq_map_queue,
+ .init_hctx = null_init_hctx,
+ .complete = null_softirq_done_fn,
+ .alloc_hctx = null_alloc_hctx,
+ .free_hctx = null_free_hctx,
};

static void null_del_dev(struct nullb *nullb)
@@ -402,6 +408,8 @@ static void null_del_dev(struct nullb *nullb)

del_gendisk(nullb->disk);
blk_cleanup_queue(nullb->q);
+ if (queue_mode == NULL_Q_MQ)
+ blk_mq_free_tag_set(&nullb->tag_set);
put_disk(nullb->disk);
kfree(nullb);
}
@@ -506,7 +514,7 @@ static int null_add_dev(void)

nullb = kzalloc_node(sizeof(*nullb), GFP_KERNEL, home_node);
if (!nullb)
- return -ENOMEM;
+ goto out;

spin_lock_init(&nullb->lock);

@@ -514,49 +522,47 @@ static int null_add_dev(void)
submit_queues = nr_online_nodes;

if (setup_queues(nullb))
- goto err;
+ goto out_free_nullb;

if (queue_mode == NULL_Q_MQ) {
- null_mq_reg.numa_node = home_node;
- null_mq_reg.queue_depth = hw_queue_depth;
- null_mq_reg.nr_hw_queues = submit_queues;
-
- if (use_per_node_hctx) {
- null_mq_reg.ops->alloc_hctx = null_alloc_hctx;
- null_mq_reg.ops->free_hctx = null_free_hctx;
- } else {
- null_mq_reg.ops->alloc_hctx = blk_mq_alloc_single_hw_queue;
- null_mq_reg.ops->free_hctx = blk_mq_free_single_hw_queue;
- }
-
- nullb->q = blk_mq_init_queue(&null_mq_reg, nullb);
+ if (use_per_node_hctx)
+ nullb->tag_set.ops = &null_mq_ops_pernode;
+ else
+ nullb->tag_set.ops = &null_mq_ops;
+ nullb->tag_set.nr_hw_queues = submit_queues;
+ nullb->tag_set.queue_depth = hw_queue_depth;
+ nullb->tag_set.numa_node = home_node;
+ nullb->tag_set.cmd_size = sizeof(struct nullb_cmd);
+ nullb->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
+ nullb->tag_set.driver_data = nullb;
+
+ if (blk_mq_alloc_tag_set(&nullb->tag_set))
+ goto out_cleanup_queues;
+
+ nullb->q = blk_mq_init_queue(&nullb->tag_set);
+ if (!nullb->q)
+ goto out_cleanup_tags;
} else if (queue_mode == NULL_Q_BIO) {
nullb->q = blk_alloc_queue_node(GFP_KERNEL, home_node);
+ if (!nullb->q)
+ goto out_cleanup_queues;
blk_queue_make_request(nullb->q, null_queue_bio);
init_driver_queues(nullb);
} else {
nullb->q = blk_init_queue_node(null_request_fn, &nullb->lock, home_node);
+ if (!nullb->q)
+ goto out_cleanup_queues;
blk_queue_prep_rq(nullb->q, null_rq_prep_fn);
- if (nullb->q)
- blk_queue_softirq_done(nullb->q, null_softirq_done_fn);
+ blk_queue_softirq_done(nullb->q, null_softirq_done_fn);
init_driver_queues(nullb);
}

- if (!nullb->q)
- goto queue_fail;
-
nullb->q->queuedata = nullb;
queue_flag_set_unlocked(QUEUE_FLAG_NONROT, nullb->q);

disk = nullb->disk = alloc_disk_node(1, home_node);
- if (!disk) {
-queue_fail:
- blk_cleanup_queue(nullb->q);
- cleanup_queues(nullb);
-err:
- kfree(nullb);
- return -ENOMEM;
- }
+ if (!disk)
+ goto out_cleanup_blk_queue;

mutex_lock(&lock);
list_add_tail(&nullb->list, &nullb_list);
@@ -579,6 +585,18 @@ err:
sprintf(disk->disk_name, "nullb%d", nullb->index);
add_disk(disk);
return 0;
+
+out_cleanup_blk_queue:
+ blk_cleanup_queue(nullb->q);
+out_cleanup_tags:
+ if (queue_mode == NULL_Q_MQ)
+ blk_mq_free_tag_set(&nullb->tag_set);
+out_cleanup_queues:
+ cleanup_queues(nullb);
+out_free_nullb:
+ kfree(nullb);
+out:
+ return -ENOMEM;
}

static int __init null_init(void)
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index d06206a..f909a88 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -30,6 +30,9 @@ struct virtio_blk
/* The disk structure for the kernel. */
struct gendisk *disk;

+ /* Block layer tags. */
+ struct blk_mq_tag_set tag_set;
+
/* Process context for config space updates */
struct work_struct config_work;

@@ -480,8 +483,9 @@ static const struct device_attribute dev_attr_cache_type_rw =
__ATTR(cache_type, S_IRUGO|S_IWUSR,
virtblk_cache_type_show, virtblk_cache_type_store);

-static int virtblk_init_request(void *data, struct blk_mq_hw_ctx *hctx,
- struct request *rq, unsigned int nr)
+static int virtblk_init_request(void *data, struct request *rq,
+ unsigned int hctx_idx, unsigned int request_idx,
+ unsigned int numa_node)
{
struct virtio_blk *vblk = data;
struct virtblk_req *vbr = blk_mq_rq_to_pdu(rq);
@@ -495,18 +499,12 @@ static struct blk_mq_ops virtio_mq_ops = {
.map_queue = blk_mq_map_queue,
.alloc_hctx = blk_mq_alloc_single_hw_queue,
.free_hctx = blk_mq_free_single_hw_queue,
- .init_request = virtblk_init_request,
.complete = virtblk_request_done,
+ .init_request = virtblk_init_request,
};

-static struct blk_mq_reg virtio_mq_reg = {
- .ops = &virtio_mq_ops,
- .nr_hw_queues = 1,
- .queue_depth = 0, /* Set in virtblk_probe */
- .numa_node = NUMA_NO_NODE,
- .flags = BLK_MQ_F_SHOULD_MERGE,
-};
-module_param_named(queue_depth, virtio_mq_reg.queue_depth, uint, 0444);
+static unsigned int virtblk_queue_depth;
+module_param_named(queue_depth, virtblk_queue_depth, uint, 0444);

static int virtblk_probe(struct virtio_device *vdev)
{
@@ -562,20 +560,32 @@ static int virtblk_probe(struct virtio_device *vdev)
}

/* Default queue sizing is to fill the ring. */
- if (!virtio_mq_reg.queue_depth) {
- virtio_mq_reg.queue_depth = vblk->vq->num_free;
+ if (!virtblk_queue_depth) {
+ virtblk_queue_depth = vblk->vq->num_free;
/* ... but without indirect descs, we use 2 descs per req */
if (!virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC))
- virtio_mq_reg.queue_depth /= 2;
+ virtblk_queue_depth /= 2;
}
- virtio_mq_reg.cmd_size =
+
+ memset(&vblk->tag_set, 0, sizeof(vblk->tag_set));
+ vblk->tag_set.ops = &virtio_mq_ops;
+ vblk->tag_set.nr_hw_queues = 1;
+ vblk->tag_set.queue_depth = virtblk_queue_depth;
+ vblk->tag_set.numa_node = NUMA_NO_NODE;
+ vblk->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
+ vblk->tag_set.cmd_size =
sizeof(struct virtblk_req) +
sizeof(struct scatterlist) * sg_elems;
+ vblk->tag_set.driver_data = vblk;

- q = vblk->disk->queue = blk_mq_init_queue(&virtio_mq_reg, vblk);
+ err = blk_mq_alloc_tag_set(&vblk->tag_set);
+ if (err)
+ goto out_put_disk;
+
+ q = vblk->disk->queue = blk_mq_init_queue(&vblk->tag_set);
if (!q) {
err = -ENOMEM;
- goto out_put_disk;
+ goto out_free_tags;
}

q->queuedata = vblk;
@@ -678,6 +688,8 @@ static int virtblk_probe(struct virtio_device *vdev)
out_del_disk:
del_gendisk(vblk->disk);
blk_cleanup_queue(vblk->disk->queue);
+out_free_tags:
+ blk_mq_free_tag_set(&vblk->tag_set);
out_put_disk:
put_disk(vblk->disk);
out_free_vq:
@@ -704,6 +716,8 @@ static void virtblk_remove(struct virtio_device *vdev)
del_gendisk(vblk->disk);
blk_cleanup_queue(vblk->disk->queue);

+ blk_mq_free_tag_set(&vblk->tag_set);
+
/* Stop all the virtqueues. */
vdev->config->reset(vdev);

diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 897ca1a..e3e1f41 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -32,8 +32,6 @@ struct blk_mq_hw_ctx {
unsigned int nr_ctx_map;
unsigned long *ctx_map;

- struct request **rqs;
- struct list_head page_list;
struct blk_mq_tags *tags;

unsigned long queued;
@@ -41,7 +39,6 @@ struct blk_mq_hw_ctx {
#define BLK_MQ_MAX_DISPATCH_ORDER 10
unsigned long dispatched[BLK_MQ_MAX_DISPATCH_ORDER];

- unsigned int queue_depth;
unsigned int numa_node;
unsigned int cmd_size; /* per-request extra data */

@@ -49,7 +46,7 @@ struct blk_mq_hw_ctx {
struct kobject kobj;
};

-struct blk_mq_reg {
+struct blk_mq_tag_set {
struct blk_mq_ops *ops;
unsigned int nr_hw_queues;
unsigned int queue_depth;
@@ -58,18 +55,22 @@ struct blk_mq_reg {
int numa_node;
unsigned int timeout;
unsigned int flags; /* BLK_MQ_F_* */
+ void *driver_data;
+
+ struct blk_mq_tags **tags;
};

typedef int (queue_rq_fn)(struct blk_mq_hw_ctx *, struct request *);
typedef struct blk_mq_hw_ctx *(map_queue_fn)(struct request_queue *, const int);
-typedef struct blk_mq_hw_ctx *(alloc_hctx_fn)(struct blk_mq_reg *,unsigned int);
+typedef struct blk_mq_hw_ctx *(alloc_hctx_fn)(struct blk_mq_tag_set *,
+ unsigned int);
typedef void (free_hctx_fn)(struct blk_mq_hw_ctx *, unsigned int);
typedef int (init_hctx_fn)(struct blk_mq_hw_ctx *, void *, unsigned int);
typedef void (exit_hctx_fn)(struct blk_mq_hw_ctx *, unsigned int);
-typedef int (init_request_fn)(void *, struct blk_mq_hw_ctx *,
- struct request *, unsigned int);
-typedef void (exit_request_fn)(void *, struct blk_mq_hw_ctx *,
- struct request *, unsigned int);
+typedef int (init_request_fn)(void *, struct request *, unsigned int,
+ unsigned int, unsigned int);
+typedef void (exit_request_fn)(void *, struct request *, unsigned int,
+ unsigned int);

struct blk_mq_ops {
/*
@@ -126,10 +127,13 @@ enum {
BLK_MQ_MAX_DEPTH = 2048,
};

-struct request_queue *blk_mq_init_queue(struct blk_mq_reg *, void *);
+struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *);
int blk_mq_register_disk(struct gendisk *);
void blk_mq_unregister_disk(struct gendisk *);

+int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set);
+void blk_mq_free_tag_set(struct blk_mq_tag_set *set);
+
void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule);

void blk_mq_insert_request(struct request *, bool, bool, bool);
@@ -138,10 +142,10 @@ void blk_mq_free_request(struct request *rq);
bool blk_mq_can_queue(struct blk_mq_hw_ctx *);
struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp);
struct request *blk_mq_alloc_reserved_request(struct request_queue *q, int rw, gfp_t gfp);
-struct request *blk_mq_rq_from_tag(struct request_queue *q, unsigned int tag);
+struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag);

struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *, const int ctx_index);
-struct blk_mq_hw_ctx *blk_mq_alloc_single_hw_queue(struct blk_mq_reg *, unsigned int);
+struct blk_mq_hw_ctx *blk_mq_alloc_single_hw_queue(struct blk_mq_tag_set *, unsigned int);
void blk_mq_free_single_hw_queue(struct blk_mq_hw_ctx *, unsigned int);

bool blk_mq_end_io_partial(struct request *rq, int error,
@@ -172,12 +176,6 @@ static inline void *blk_mq_rq_to_pdu(struct request *rq)
return (void *) rq + sizeof(*rq);
}

-static inline struct request *blk_mq_tag_to_rq(struct blk_mq_hw_ctx *hctx,
- unsigned int tag)
-{
- return hctx->rqs[tag];
-}
-
#define queue_for_each_hw_ctx(q, hctx, i) \
for ((i) = 0; (i) < (q)->nr_hw_queues && \
({ hctx = (q)->queue_hw_ctx[i]; 1; }); (i)++)
--
1.7.10.4

2014-04-14 08:28:16

by Christoph Hellwig

[permalink] [raw]
Subject: [PATCH 7/7] block: all blk-mq requests are tagged

Instead of setting the REQ_QUEUED flag on each of them just take it into
account in the only macro checking it.

Signed-off-by: Christoph Hellwig <[email protected]>
---
include/linux/blkdev.h | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 99617cf..7c0e72f 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1102,7 +1102,8 @@ static inline bool blk_needs_flush_plug(struct task_struct *tsk)
/*
* tag stuff
*/
-#define blk_rq_tagged(rq) ((rq)->cmd_flags & REQ_QUEUED)
+#define blk_rq_tagged(rq) \
+ ((rq)->mq_ctx || ((rq)->cmd_flags & REQ_QUEUED))
extern int blk_queue_start_tag(struct request_queue *, struct request *);
extern struct request *blk_queue_find_tag(struct request_queue *, int);
extern void blk_queue_end_tag(struct request_queue *, struct request *);
--
1.7.10.4

2014-04-14 08:30:13

by Christoph Hellwig

[permalink] [raw]
Subject: [PATCH 4/7] blk-mq: add ->init_request and ->exit_request methods

The current blk_mq_init_commands/blk_mq_free_commands interface has a two
problems:

1) Because only the constructor is passed to blk_mq_init_commands there
is no easy way to clean up when a comman initialization failed. The
current code simply leaks the allocations done in the constructor.
2) There is no good place to call blk_mq_free_commands: before
blk_cleanup_queue there is no guarantee that all outstanding commands
have completed, so we can't free them yet. After blk_cleanup_queue
the queue has usually been freed. This can be worked around by
grabbing an unconditional reference before calling blk_cleanup_queue
and dropping it after blk_mq_free_commands is done, although that's
not exatly pretty and driver writers are guaranteed to get it wrong
sooner or later.

Both issues are easily fixed by making the request constructor and
destructor normal blk_mq_ops methods.

Signed-off-by: Christoph Hellwig <[email protected]>
---
block/blk-mq.c | 105 ++++++++++++++------------------------------
drivers/block/virtio_blk.c | 23 +++++-----
include/linux/blk-mq.h | 14 +++++-
3 files changed, 55 insertions(+), 87 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 67859e9..ef7acc0 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1009,74 +1009,20 @@ static void blk_mq_hctx_notify(void *data, unsigned long action,
blk_mq_run_hw_queue(hctx, true);
}

-static int blk_mq_init_hw_commands(struct blk_mq_hw_ctx *hctx,
- int (*init)(void *, struct blk_mq_hw_ctx *,
- struct request *, unsigned int),
- void *data)
+static void blk_mq_free_rq_map(struct blk_mq_hw_ctx *hctx, void *driver_data)
{
- unsigned int i;
- int ret = 0;
-
- for (i = 0; i < hctx->queue_depth; i++) {
- struct request *rq = hctx->rqs[i];
-
- ret = init(data, hctx, rq, i);
- if (ret)
- break;
- }
-
- return ret;
-}
-
-int blk_mq_init_commands(struct request_queue *q,
- int (*init)(void *, struct blk_mq_hw_ctx *,
- struct request *, unsigned int),
- void *data)
-{
- struct blk_mq_hw_ctx *hctx;
- unsigned int i;
- int ret = 0;
-
- queue_for_each_hw_ctx(q, hctx, i) {
- ret = blk_mq_init_hw_commands(hctx, init, data);
- if (ret)
- break;
- }
-
- return ret;
-}
-EXPORT_SYMBOL(blk_mq_init_commands);
-
-static void blk_mq_free_hw_commands(struct blk_mq_hw_ctx *hctx,
- void (*free)(void *, struct blk_mq_hw_ctx *,
- struct request *, unsigned int),
- void *data)
-{
- unsigned int i;
+ struct page *page;

- for (i = 0; i < hctx->queue_depth; i++) {
- struct request *rq = hctx->rqs[i];
+ if (hctx->rqs && hctx->queue->mq_ops->exit_request) {
+ int i;

- free(data, hctx, rq, i);
+ for (i = 0; i < hctx->queue_depth; i++) {
+ if (!hctx->rqs[i])
+ continue;
+ hctx->queue->mq_ops->exit_request(driver_data, hctx,
+ hctx->rqs[i], i);
+ }
}
-}
-
-void blk_mq_free_commands(struct request_queue *q,
- void (*free)(void *, struct blk_mq_hw_ctx *,
- struct request *, unsigned int),
- void *data)
-{
- struct blk_mq_hw_ctx *hctx;
- unsigned int i;
-
- queue_for_each_hw_ctx(q, hctx, i)
- blk_mq_free_hw_commands(hctx, free, data);
-}
-EXPORT_SYMBOL(blk_mq_free_commands);
-
-static void blk_mq_free_rq_map(struct blk_mq_hw_ctx *hctx)
-{
- struct page *page;

while (!list_empty(&hctx->page_list)) {
page = list_first_entry(&hctx->page_list, struct page, lru);
@@ -1101,10 +1047,12 @@ static size_t order_to_size(unsigned int order)
}

static int blk_mq_init_rq_map(struct blk_mq_hw_ctx *hctx,
- unsigned int reserved_tags, int node)
+ struct blk_mq_reg *reg, void *driver_data, int node)
{
+ unsigned int reserved_tags = reg->reserved_tags;
unsigned int i, j, entries_per_page, max_order = 4;
size_t rq_size, left;
+ int error;

INIT_LIST_HEAD(&hctx->page_list);

@@ -1153,14 +1101,23 @@ static int blk_mq_init_rq_map(struct blk_mq_hw_ctx *hctx,
for (j = 0; j < to_do; j++) {
hctx->rqs[i] = p;
blk_rq_init(hctx->queue, hctx->rqs[i]);
+ if (reg->ops->init_request) {
+ error = reg->ops->init_request(driver_data,
+ hctx, hctx->rqs[i], i);
+ if (error)
+ goto err_rq_map;
+ }
+
p += rq_size;
i++;
}
}

- if (i < (reserved_tags + BLK_MQ_TAG_MIN))
+ if (i < (reserved_tags + BLK_MQ_TAG_MIN)) {
+ error = -ENOMEM;
goto err_rq_map;
- else if (i != hctx->queue_depth) {
+ }
+ if (i != hctx->queue_depth) {
hctx->queue_depth = i;
pr_warn("%s: queue depth set to %u because of low memory\n",
__func__, i);
@@ -1168,12 +1125,14 @@ static int blk_mq_init_rq_map(struct blk_mq_hw_ctx *hctx,

hctx->tags = blk_mq_init_tags(hctx->queue_depth, reserved_tags, node);
if (!hctx->tags) {
-err_rq_map:
- blk_mq_free_rq_map(hctx);
- return -ENOMEM;
+ error = -ENOMEM;
+ goto err_rq_map;
}

return 0;
+err_rq_map:
+ blk_mq_free_rq_map(hctx, driver_data);
+ return error;
}

static int blk_mq_init_hw_queues(struct request_queue *q,
@@ -1206,7 +1165,7 @@ static int blk_mq_init_hw_queues(struct request_queue *q,
blk_mq_hctx_notify, hctx);
blk_mq_register_cpu_notifier(&hctx->cpu_notifier);

- if (blk_mq_init_rq_map(hctx, reg->reserved_tags, node))
+ if (blk_mq_init_rq_map(hctx, reg, driver_data, node))
break;

/*
@@ -1246,7 +1205,7 @@ static int blk_mq_init_hw_queues(struct request_queue *q,
reg->ops->exit_hctx(hctx, j);

blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier);
- blk_mq_free_rq_map(hctx);
+ blk_mq_free_rq_map(hctx, driver_data);
kfree(hctx->ctxs);
}

@@ -1423,7 +1382,7 @@ void blk_mq_free_queue(struct request_queue *q)
queue_for_each_hw_ctx(q, hctx, i) {
kfree(hctx->ctx_map);
kfree(hctx->ctxs);
- blk_mq_free_rq_map(hctx);
+ blk_mq_free_rq_map(hctx, q->queuedata);
blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier);
if (q->mq_ops->exit_hctx)
q->mq_ops->exit_hctx(hctx, i);
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index c7d02bc..d06206a 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -480,11 +480,22 @@ static const struct device_attribute dev_attr_cache_type_rw =
__ATTR(cache_type, S_IRUGO|S_IWUSR,
virtblk_cache_type_show, virtblk_cache_type_store);

+static int virtblk_init_request(void *data, struct blk_mq_hw_ctx *hctx,
+ struct request *rq, unsigned int nr)
+{
+ struct virtio_blk *vblk = data;
+ struct virtblk_req *vbr = blk_mq_rq_to_pdu(rq);
+
+ sg_init_table(vbr->sg, vblk->sg_elems);
+ return 0;
+}
+
static struct blk_mq_ops virtio_mq_ops = {
.queue_rq = virtio_queue_rq,
.map_queue = blk_mq_map_queue,
.alloc_hctx = blk_mq_alloc_single_hw_queue,
.free_hctx = blk_mq_free_single_hw_queue,
+ .init_request = virtblk_init_request,
.complete = virtblk_request_done,
};

@@ -497,16 +508,6 @@ static struct blk_mq_reg virtio_mq_reg = {
};
module_param_named(queue_depth, virtio_mq_reg.queue_depth, uint, 0444);

-static int virtblk_init_vbr(void *data, struct blk_mq_hw_ctx *hctx,
- struct request *rq, unsigned int nr)
-{
- struct virtio_blk *vblk = data;
- struct virtblk_req *vbr = blk_mq_rq_to_pdu(rq);
-
- sg_init_table(vbr->sg, vblk->sg_elems);
- return 0;
-}
-
static int virtblk_probe(struct virtio_device *vdev)
{
struct virtio_blk *vblk;
@@ -577,8 +578,6 @@ static int virtblk_probe(struct virtio_device *vdev)
goto out_put_disk;
}

- blk_mq_init_commands(q, virtblk_init_vbr, vblk);
-
q->queuedata = vblk;

virtblk_name_format("vd", index, vblk->disk->disk_name, DISK_NAME_LEN);
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 0120451..897ca1a 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -66,6 +66,10 @@ typedef struct blk_mq_hw_ctx *(alloc_hctx_fn)(struct blk_mq_reg *,unsigned int);
typedef void (free_hctx_fn)(struct blk_mq_hw_ctx *, unsigned int);
typedef int (init_hctx_fn)(struct blk_mq_hw_ctx *, void *, unsigned int);
typedef void (exit_hctx_fn)(struct blk_mq_hw_ctx *, unsigned int);
+typedef int (init_request_fn)(void *, struct blk_mq_hw_ctx *,
+ struct request *, unsigned int);
+typedef void (exit_request_fn)(void *, struct blk_mq_hw_ctx *,
+ struct request *, unsigned int);

struct blk_mq_ops {
/*
@@ -98,6 +102,14 @@ struct blk_mq_ops {
*/
init_hctx_fn *init_hctx;
exit_hctx_fn *exit_hctx;
+
+ /*
+ * Called for every command allocated by the block layer to allow
+ * the driver to set up driver specific data.
+ * Ditto for exit/teardown.
+ */
+ init_request_fn *init_request;
+ exit_request_fn *exit_request;
};

enum {
@@ -117,8 +129,6 @@ enum {
struct request_queue *blk_mq_init_queue(struct blk_mq_reg *, void *);
int blk_mq_register_disk(struct gendisk *);
void blk_mq_unregister_disk(struct gendisk *);
-int blk_mq_init_commands(struct request_queue *, int (*init)(void *data, struct blk_mq_hw_ctx *, struct request *, unsigned int), void *data);
-void blk_mq_free_commands(struct request_queue *, void (*free)(void *data, struct blk_mq_hw_ctx *, struct request *, unsigned int), void *data);

void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule);

--
1.7.10.4

2014-04-14 08:30:11

by Christoph Hellwig

[permalink] [raw]
Subject: [PATCH 5/7] blk-mq: initialize request on allocation

If we want to share tag and request allocation between queues we cannot
initialize the request at init/free time, but need to initialize it
at allocation time as it might get used for different queues over its
lifetime.

Signed-off-by: Christoph Hellwig <[email protected]>
---
block/blk-mq.c | 4 +---
1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index ef7acc0..1cb7618 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -82,6 +82,7 @@ static struct request *__blk_mq_alloc_request(struct blk_mq_hw_ctx *hctx,
tag = blk_mq_get_tag(hctx->tags, gfp, reserved);
if (tag != BLK_MQ_TAG_FAIL) {
rq = hctx->rqs[tag];
+ blk_rq_init(hctx->queue, rq);
rq->tag = tag;

return rq;
@@ -254,9 +255,7 @@ static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx,
const int tag = rq->tag;
struct request_queue *q = rq->q;

- blk_rq_init(hctx->queue, rq);
blk_mq_put_tag(hctx->tags, tag);
-
blk_mq_queue_exit(q);
}

@@ -1100,7 +1099,6 @@ static int blk_mq_init_rq_map(struct blk_mq_hw_ctx *hctx,
left -= to_do * rq_size;
for (j = 0; j < to_do; j++) {
hctx->rqs[i] = p;
- blk_rq_init(hctx->queue, hctx->rqs[i]);
if (reg->ops->init_request) {
error = reg->ops->init_request(driver_data,
hctx, hctx->rqs[i], i);
--
1.7.10.4

2014-04-14 08:31:36

by Christoph Hellwig

[permalink] [raw]
Subject: [PATCH 1/7] blk-mq: initialize resid_len

Signed-off-by: Christoph Hellwig <[email protected]>
---
block/blk-mq.c | 2 ++
1 file changed, 2 insertions(+)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 1d2a9bd..c1f4736 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -350,6 +350,8 @@ static void blk_mq_start_request(struct request *rq, bool last)

trace_block_rq_issue(q, rq);

+ rq->resid_len = blk_rq_bytes(rq);
+
/*
* Just mark start time and set the started bit. Due to memory
* ordering, we know we'll see the correct deadline as long as
--
1.7.10.4

2014-04-14 08:31:35

by Christoph Hellwig

[permalink] [raw]
Subject: [PATCH 2/7] blk-mq: do not initialize req->special

Drivers can reach their private data easily using the blk_mq_rq_to_pdu
helper and don't need req->special. By not initializing it code can
be simplified nicely, and we also shave off a few more instructions from
the I/O path.

Signed-off-by: Christoph Hellwig <[email protected]>
---
block/blk-flush.c | 10 ++--------
block/blk-mq.c | 15 ++-------------
block/blk-mq.h | 1 -
drivers/block/null_blk.c | 4 ++--
drivers/block/virtio_blk.c | 6 +++---
5 files changed, 9 insertions(+), 27 deletions(-)

diff --git a/block/blk-flush.c b/block/blk-flush.c
index 43e6b47..9a0c427 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -306,22 +306,16 @@ static bool blk_kick_flush(struct request_queue *q)
*/
q->flush_pending_idx ^= 1;

+ blk_rq_init(q, q->flush_rq);
if (q->mq_ops) {
- struct blk_mq_ctx *ctx = first_rq->mq_ctx;
- struct blk_mq_hw_ctx *hctx = q->mq_ops->map_queue(q, ctx->cpu);
-
- blk_mq_rq_init(hctx, q->flush_rq);
- q->flush_rq->mq_ctx = ctx;
-
/*
* Reuse the tag value from the fist waiting request,
* with blk-mq the tag is generated during request
* allocation and drivers can rely on it being inside
* the range they asked for.
*/
+ q->flush_rq->mq_ctx = first_rq->mq_ctx;
q->flush_rq->tag = first_rq->tag;
- } else {
- blk_rq_init(q, q->flush_rq);
}

q->flush_rq->cmd_type = REQ_TYPE_FS;
diff --git a/block/blk-mq.c b/block/blk-mq.c
index c1f4736..747feb9 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -248,24 +248,13 @@ struct request *blk_mq_alloc_reserved_request(struct request_queue *q, int rw,
}
EXPORT_SYMBOL(blk_mq_alloc_reserved_request);

-/*
- * Re-init and set pdu, if we have it
- */
-void blk_mq_rq_init(struct blk_mq_hw_ctx *hctx, struct request *rq)
-{
- blk_rq_init(hctx->queue, rq);
-
- if (hctx->cmd_size)
- rq->special = blk_mq_rq_to_pdu(rq);
-}
-
static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx,
struct blk_mq_ctx *ctx, struct request *rq)
{
const int tag = rq->tag;
struct request_queue *q = rq->q;

- blk_mq_rq_init(hctx, rq);
+ blk_rq_init(hctx->queue, rq);
blk_mq_put_tag(hctx->tags, tag);

blk_mq_queue_exit(q);
@@ -1143,7 +1132,7 @@ static int blk_mq_init_rq_map(struct blk_mq_hw_ctx *hctx,
left -= to_do * rq_size;
for (j = 0; j < to_do; j++) {
hctx->rqs[i] = p;
- blk_mq_rq_init(hctx, hctx->rqs[i]);
+ blk_rq_init(hctx->queue, hctx->rqs[i]);
p += rq_size;
i++;
}
diff --git a/block/blk-mq.h b/block/blk-mq.h
index ebbe6ba..238379a 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -27,7 +27,6 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async);
void blk_mq_init_flush(struct request_queue *q);
void blk_mq_drain_queue(struct request_queue *q);
void blk_mq_free_queue(struct request_queue *q);
-void blk_mq_rq_init(struct blk_mq_hw_ctx *hctx, struct request *rq);

/*
* CPU hotplug helpers
diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c
index 091b9ea..71df69d 100644
--- a/drivers/block/null_blk.c
+++ b/drivers/block/null_blk.c
@@ -226,7 +226,7 @@ static void null_cmd_end_timer(struct nullb_cmd *cmd)

static void null_softirq_done_fn(struct request *rq)
{
- end_cmd(rq->special);
+ end_cmd(blk_mq_rq_to_pdu(rq));
}

static inline void null_handle_cmd(struct nullb_cmd *cmd)
@@ -311,7 +311,7 @@ static void null_request_fn(struct request_queue *q)

static int null_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq)
{
- struct nullb_cmd *cmd = rq->special;
+ struct nullb_cmd *cmd = blk_mq_rq_to_pdu(rq);

cmd->rq = rq;
cmd->nq = hctx->driver_data;
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 6d8a87f..c7d02bc 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -112,7 +112,7 @@ static int __virtblk_add_req(struct virtqueue *vq,

static inline void virtblk_request_done(struct request *req)
{
- struct virtblk_req *vbr = req->special;
+ struct virtblk_req *vbr = blk_mq_rq_to_pdu(req);
int error = virtblk_result(vbr);

if (req->cmd_type == REQ_TYPE_BLOCK_PC) {
@@ -154,7 +154,7 @@ static void virtblk_done(struct virtqueue *vq)
static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req)
{
struct virtio_blk *vblk = hctx->queue->queuedata;
- struct virtblk_req *vbr = req->special;
+ struct virtblk_req *vbr = blk_mq_rq_to_pdu(req);
unsigned long flags;
unsigned int num;
const bool last = (req->cmd_flags & REQ_END) != 0;
@@ -501,7 +501,7 @@ static int virtblk_init_vbr(void *data, struct blk_mq_hw_ctx *hctx,
struct request *rq, unsigned int nr)
{
struct virtio_blk *vblk = data;
- struct virtblk_req *vbr = rq->special;
+ struct virtblk_req *vbr = blk_mq_rq_to_pdu(rq);

sg_init_table(vbr->sg, vblk->sg_elems);
return 0;
--
1.7.10.4

2014-04-14 08:31:33

by Christoph Hellwig

[permalink] [raw]
Subject: [PATCH 3/7] blk-mq: make ->flush_rq fully transparent to drivers

Drivers shouldn't have to care about the block layer setting aside a
request to implement the flush state machine. We already override the
mq context and tag to make it more transparent, but so far haven't deal
with the driver private data in the request. Make sure to override this
as well, and while we're at it add a proper helper sitting in blk-mq.c
that implements the full impersonation.

Signed-off-by: Christoph Hellwig <[email protected]>
---
block/blk-flush.c | 12 ++----------
block/blk-mq.c | 20 ++++++++++++++++++++
block/blk-mq.h | 2 ++
3 files changed, 24 insertions(+), 10 deletions(-)

diff --git a/block/blk-flush.c b/block/blk-flush.c
index 9a0c427..b218f61 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -307,16 +307,8 @@ static bool blk_kick_flush(struct request_queue *q)
q->flush_pending_idx ^= 1;

blk_rq_init(q, q->flush_rq);
- if (q->mq_ops) {
- /*
- * Reuse the tag value from the fist waiting request,
- * with blk-mq the tag is generated during request
- * allocation and drivers can rely on it being inside
- * the range they asked for.
- */
- q->flush_rq->mq_ctx = first_rq->mq_ctx;
- q->flush_rq->tag = first_rq->tag;
- }
+ if (q->mq_ops)
+ blk_mq_clone_flush_request(q->flush_rq, first_rq);

q->flush_rq->cmd_type = REQ_TYPE_FS;
q->flush_rq->cmd_flags = WRITE_FLUSH | REQ_FLUSH_SEQ;
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 747feb9..67859e9 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -272,6 +272,26 @@ void blk_mq_free_request(struct request *rq)
__blk_mq_free_request(hctx, ctx, rq);
}

+/*
+ * Clone all relevant state from a request that has been put on hold in
+ * the flush state machine into the preallocated flush request that hangs
+ * off the request queue.
+ *
+ * For a driver the flush request should be invisible, that's why we are
+ * impersonating the original request here.
+ */
+void blk_mq_clone_flush_request(struct request *flush_rq,
+ struct request *orig_rq)
+{
+ struct blk_mq_hw_ctx *hctx =
+ orig_rq->q->mq_ops->map_queue(orig_rq->q, orig_rq->mq_ctx->cpu);
+
+ flush_rq->mq_ctx = orig_rq->mq_ctx;
+ flush_rq->tag = orig_rq->tag;
+ memcpy(blk_mq_rq_to_pdu(flush_rq), blk_mq_rq_to_pdu(orig_rq),
+ hctx->cmd_size);
+}
+
bool blk_mq_end_io_partial(struct request *rq, int error, unsigned int nr_bytes)
{
if (blk_update_request(rq, error, blk_rq_bytes(rq)))
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 238379a..7964dad 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -27,6 +27,8 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async);
void blk_mq_init_flush(struct request_queue *q);
void blk_mq_drain_queue(struct request_queue *q);
void blk_mq_free_queue(struct request_queue *q);
+void blk_mq_clone_flush_request(struct request *flush_rq,
+ struct request *orig_rq);

/*
* CPU hotplug helpers
--
1.7.10.4

2014-04-15 20:16:10

by Jens Axboe

[permalink] [raw]
Subject: Re:

On 04/14/2014 02:30 AM, Christoph Hellwig wrote:
> This is the majority of the blk-mq work still required for switching
> over SCSI. There are a few more bits for I/O completion and requeueing
> pending, but they will need further work.

Looks OK to me, I have applied them all. Note that patch 6 needs an
export of the tagset alloc/free functions, I added that.

--
Jens Axboe

2014-04-17 14:54:36

by Ming Lei

[permalink] [raw]
Subject: Re: [PATCH 5/7] blk-mq: initialize request on allocation

On Mon, Apr 14, 2014 at 4:30 PM, Christoph Hellwig <[email protected]> wrote:
> If we want to share tag and request allocation between queues we cannot
> initialize the request at init/free time, but need to initialize it
> at allocation time as it might get used for different queues over its
> lifetime.

Could you explain the use pattern? Looks you mean there are
still users of the tag/req even after it is freed, that looks a bit
weird since the tag/req can still be reallocated in another path
after it is freed.

>
> Signed-off-by: Christoph Hellwig <[email protected]>
> ---
> block/blk-mq.c | 4 +---
> 1 file changed, 1 insertion(+), 3 deletions(-)
>
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index ef7acc0..1cb7618 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -82,6 +82,7 @@ static struct request *__blk_mq_alloc_request(struct blk_mq_hw_ctx *hctx,
> tag = blk_mq_get_tag(hctx->tags, gfp, reserved);
> if (tag != BLK_MQ_TAG_FAIL) {
> rq = hctx->rqs[tag];
> + blk_rq_init(hctx->queue, rq);
> rq->tag = tag;
>
> return rq;
> @@ -254,9 +255,7 @@ static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx,
> const int tag = rq->tag;
> struct request_queue *q = rq->q;
>
> - blk_rq_init(hctx->queue, rq);
> blk_mq_put_tag(hctx->tags, tag);
> -
> blk_mq_queue_exit(q);
> }
>
> @@ -1100,7 +1099,6 @@ static int blk_mq_init_rq_map(struct blk_mq_hw_ctx *hctx,
> left -= to_do * rq_size;
> for (j = 0; j < to_do; j++) {
> hctx->rqs[i] = p;
> - blk_rq_init(hctx->queue, hctx->rqs[i]);
> if (reg->ops->init_request) {
> error = reg->ops->init_request(driver_data,
> hctx, hctx->rqs[i], i);
> --
> 1.7.10.4
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/


Thanks,
--
Ming Lei

2014-04-17 14:57:11

by Christoph Hellwig

[permalink] [raw]
Subject: Re: [PATCH 5/7] blk-mq: initialize request on allocation

On Thu, Apr 17, 2014 at 10:54:23PM +0800, Ming Lei wrote:
> On Mon, Apr 14, 2014 at 4:30 PM, Christoph Hellwig <[email protected]> wrote:
> > If we want to share tag and request allocation between queues we cannot
> > initialize the request at init/free time, but need to initialize it
> > at allocation time as it might get used for different queues over its
> > lifetime.
>
> Could you explain the use pattern? Looks you mean there are
> still users of the tag/req even after it is freed, that looks a bit
> weird since the tag/req can still be reallocated in another path
> after it is freed.

No difference in use pattern. But blk_rq_init initializes the rq->q field,
and a request might get reused for a different queue.

2014-04-17 15:09:34

by Ming Lei

[permalink] [raw]
Subject: Re: [PATCH 5/7] blk-mq: initialize request on allocation

On Thu, Apr 17, 2014 at 10:57 PM, Christoph Hellwig <[email protected]> wrote:
> On Thu, Apr 17, 2014 at 10:54:23PM +0800, Ming Lei wrote:
>> On Mon, Apr 14, 2014 at 4:30 PM, Christoph Hellwig <[email protected]> wrote:
>> > If we want to share tag and request allocation between queues we cannot
>> > initialize the request at init/free time, but need to initialize it
>> > at allocation time as it might get used for different queues over its
>> > lifetime.
>>
>> Could you explain the use pattern? Looks you mean there are
>> still users of the tag/req even after it is freed, that looks a bit
>> weird since the tag/req can still be reallocated in another path
>> after it is freed.
>
> No difference in use pattern. But blk_rq_init initializes the rq->q field,
> and a request might get reused for a different queue.

If so, it may be better to only initialize the rq->q in allocation
because Jens said 'It's done that way because of presumed cache
hotness on completion'.


Thanks,
--
Ming Lei