2015-07-13 09:56:16

by Bob Liu

[permalink] [raw]
Subject: [PATCH v3] xen/blkfront: convert to blk-mq APIs

Note: This patch is based on the original work from Arianna's internship for
GNOME's Outreach Program for Women.

Only one hardware queue is used now, so there is no performance change.

The legacy non-mq code is deleted completely which is the same as other
drivers like virtio, mtip, and nvme.

Also dropped one unnecessary holding of info->io_lock when calling
blk_mq_stop_hw_queues().

Changes in v2:
- Reorganized blk_mq_queue_rq()
- Restored most io_locks in place

Change in v3:
- Rename blk_mq_queue_rq to blkif_queue_rq

Signed-off-by: Arianna Avanzini <[email protected]>
Signed-off-by: Bob Liu <[email protected]>
Reviewed-by: Christoph Hellwig <[email protected]>
Acked-by: Jens Axboe <[email protected]>
---
drivers/block/xen-blkfront.c | 146 +++++++++++++++++-------------------------
1 file changed, 60 insertions(+), 86 deletions(-)

diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index 6d89ed3..5b45ee5 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -37,6 +37,7 @@

#include <linux/interrupt.h>
#include <linux/blkdev.h>
+#include <linux/blk-mq.h>
#include <linux/hdreg.h>
#include <linux/cdrom.h>
#include <linux/module.h>
@@ -148,6 +149,7 @@ struct blkfront_info
unsigned int feature_persistent:1;
unsigned int max_indirect_segments;
int is_ready;
+ struct blk_mq_tag_set tag_set;
};

static unsigned int nr_minors;
@@ -616,54 +618,41 @@ static inline bool blkif_request_flush_invalid(struct request *req,
!(info->feature_flush & REQ_FUA)));
}

-/*
- * do_blkif_request
- * read a block; request is in a request queue
- */
-static void do_blkif_request(struct request_queue *rq)
+static int blkif_queue_rq(struct blk_mq_hw_ctx *hctx,
+ const struct blk_mq_queue_data *qd)
{
- struct blkfront_info *info = NULL;
- struct request *req;
- int queued;
-
- pr_debug("Entered do_blkif_request\n");
-
- queued = 0;
+ struct blkfront_info *info = qd->rq->rq_disk->private_data;

- while ((req = blk_peek_request(rq)) != NULL) {
- info = req->rq_disk->private_data;
-
- if (RING_FULL(&info->ring))
- goto wait;
+ blk_mq_start_request(qd->rq);
+ spin_lock_irq(&info->io_lock);
+ if (RING_FULL(&info->ring))
+ goto out_busy;

- blk_start_request(req);
+ if (blkif_request_flush_invalid(qd->rq, info))
+ goto out_err;

- if (blkif_request_flush_invalid(req, info)) {
- __blk_end_request_all(req, -EOPNOTSUPP);
- continue;
- }
+ if (blkif_queue_request(qd->rq))
+ goto out_busy;

- pr_debug("do_blk_req %p: cmd %p, sec %lx, "
- "(%u/%u) [%s]\n",
- req, req->cmd, (unsigned long)blk_rq_pos(req),
- blk_rq_cur_sectors(req), blk_rq_sectors(req),
- rq_data_dir(req) ? "write" : "read");
-
- if (blkif_queue_request(req)) {
- blk_requeue_request(rq, req);
-wait:
- /* Avoid pointless unplugs. */
- blk_stop_queue(rq);
- break;
- }
+ flush_requests(info);
+ spin_unlock_irq(&info->io_lock);
+ return BLK_MQ_RQ_QUEUE_OK;

- queued++;
- }
+out_err:
+ spin_unlock_irq(&info->io_lock);
+ return BLK_MQ_RQ_QUEUE_ERROR;

- if (queued != 0)
- flush_requests(info);
+out_busy:
+ spin_unlock_irq(&info->io_lock);
+ blk_mq_stop_hw_queue(hctx);
+ return BLK_MQ_RQ_QUEUE_BUSY;
}

+static struct blk_mq_ops blkfront_mq_ops = {
+ .queue_rq = blkif_queue_rq,
+ .map_queue = blk_mq_map_queue,
+};
+
static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size,
unsigned int physical_sector_size,
unsigned int segments)
@@ -671,9 +660,22 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size,
struct request_queue *rq;
struct blkfront_info *info = gd->private_data;

- rq = blk_init_queue(do_blkif_request, &info->io_lock);
- if (rq == NULL)
+ memset(&info->tag_set, 0, sizeof(info->tag_set));
+ info->tag_set.ops = &blkfront_mq_ops;
+ info->tag_set.nr_hw_queues = 1;
+ info->tag_set.queue_depth = BLK_RING_SIZE(info);
+ info->tag_set.numa_node = NUMA_NO_NODE;
+ info->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
+ info->tag_set.cmd_size = 0;
+ info->tag_set.driver_data = info;
+
+ if (blk_mq_alloc_tag_set(&info->tag_set))
return -1;
+ rq = blk_mq_init_queue(&info->tag_set);
+ if (IS_ERR(rq)) {
+ blk_mq_free_tag_set(&info->tag_set);
+ return -1;
+ }

queue_flag_set_unlocked(QUEUE_FLAG_VIRT, rq);

@@ -901,19 +903,15 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
static void xlvbd_release_gendisk(struct blkfront_info *info)
{
unsigned int minor, nr_minors;
- unsigned long flags;

if (info->rq == NULL)
return;

- spin_lock_irqsave(&info->io_lock, flags);
-
/* No more blkif_request(). */
- blk_stop_queue(info->rq);
+ blk_mq_stop_hw_queues(info->rq);

/* No more gnttab callback work. */
gnttab_cancel_free_callback(&info->callback);
- spin_unlock_irqrestore(&info->io_lock, flags);

/* Flush gnttab callback work. Must be done with no locks held. */
flush_work(&info->work);
@@ -925,20 +923,18 @@ static void xlvbd_release_gendisk(struct blkfront_info *info)
xlbd_release_minors(minor, nr_minors);

blk_cleanup_queue(info->rq);
+ blk_mq_free_tag_set(&info->tag_set);
info->rq = NULL;

put_disk(info->gd);
info->gd = NULL;
}

+/* Must be called with io_lock held */
static void kick_pending_request_queues(struct blkfront_info *info)
{
- if (!RING_FULL(&info->ring)) {
- /* Re-enable calldowns. */
- blk_start_queue(info->rq);
- /* Kick things off immediately. */
- do_blkif_request(info->rq);
- }
+ if (!RING_FULL(&info->ring))
+ blk_mq_start_stopped_hw_queues(info->rq, true);
}

static void blkif_restart_queue(struct work_struct *work)
@@ -963,7 +959,7 @@ static void blkif_free(struct blkfront_info *info, int suspend)
BLKIF_STATE_SUSPENDED : BLKIF_STATE_DISCONNECTED;
/* No more blkif_request(). */
if (info->rq)
- blk_stop_queue(info->rq);
+ blk_mq_stop_hw_queues(info->rq);

/* Remove all persistent grants */
if (!list_empty(&info->grants)) {
@@ -1144,7 +1140,6 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
RING_IDX i, rp;
unsigned long flags;
struct blkfront_info *info = (struct blkfront_info *)dev_id;
- int error;

spin_lock_irqsave(&info->io_lock, flags);

@@ -1185,37 +1180,37 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
continue;
}

- error = (bret->status == BLKIF_RSP_OKAY) ? 0 : -EIO;
+ req->errors = (bret->status == BLKIF_RSP_OKAY) ? 0 : -EIO;
switch (bret->operation) {
case BLKIF_OP_DISCARD:
if (unlikely(bret->status == BLKIF_RSP_EOPNOTSUPP)) {
struct request_queue *rq = info->rq;
printk(KERN_WARNING "blkfront: %s: %s op failed\n",
info->gd->disk_name, op_name(bret->operation));
- error = -EOPNOTSUPP;
+ req->errors = -EOPNOTSUPP;
info->feature_discard = 0;
info->feature_secdiscard = 0;
queue_flag_clear(QUEUE_FLAG_DISCARD, rq);
queue_flag_clear(QUEUE_FLAG_SECDISCARD, rq);
}
- __blk_end_request_all(req, error);
+ blk_mq_complete_request(req);
break;
case BLKIF_OP_FLUSH_DISKCACHE:
case BLKIF_OP_WRITE_BARRIER:
if (unlikely(bret->status == BLKIF_RSP_EOPNOTSUPP)) {
printk(KERN_WARNING "blkfront: %s: %s op failed\n",
info->gd->disk_name, op_name(bret->operation));
- error = -EOPNOTSUPP;
+ req->errors = -EOPNOTSUPP;
}
if (unlikely(bret->status == BLKIF_RSP_ERROR &&
info->shadow[id].req.u.rw.nr_segments == 0)) {
printk(KERN_WARNING "blkfront: %s: empty %s op failed\n",
info->gd->disk_name, op_name(bret->operation));
- error = -EOPNOTSUPP;
+ req->errors = -EOPNOTSUPP;
}
- if (unlikely(error)) {
- if (error == -EOPNOTSUPP)
- error = 0;
+ if (unlikely(req->errors)) {
+ if (req->errors == -EOPNOTSUPP)
+ req->errors = 0;
info->feature_flush = 0;
xlvbd_flush(info);
}
@@ -1226,7 +1221,7 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
dev_dbg(&info->xbdev->dev, "Bad return from blkdev data "
"request: %x\n", bret->status);

- __blk_end_request_all(req, error);
+ blk_mq_complete_request(req);
break;
default:
BUG();
@@ -1555,28 +1550,6 @@ static int blkif_recover(struct blkfront_info *info)

kfree(copy);

- /*
- * Empty the queue, this is important because we might have
- * requests in the queue with more segments than what we
- * can handle now.
- */
- spin_lock_irq(&info->io_lock);
- while ((req = blk_fetch_request(info->rq)) != NULL) {
- if (req->cmd_flags &
- (REQ_FLUSH | REQ_FUA | REQ_DISCARD | REQ_SECURE)) {
- list_add(&req->queuelist, &requests);
- continue;
- }
- merge_bio.head = req->bio;
- merge_bio.tail = req->biotail;
- bio_list_merge(&bio_list, &merge_bio);
- req->bio = NULL;
- if (req->cmd_flags & (REQ_FLUSH | REQ_FUA))
- pr_alert("diskcache flush request found!\n");
- __blk_end_request_all(req, 0);
- }
- spin_unlock_irq(&info->io_lock);
-
xenbus_switch_state(info->xbdev, XenbusStateConnected);

spin_lock_irq(&info->io_lock);
@@ -1591,9 +1564,10 @@ static int blkif_recover(struct blkfront_info *info)
/* Requeue pending requests (flush or discard) */
list_del_init(&req->queuelist);
BUG_ON(req->nr_phys_segments > segs);
- blk_requeue_request(info->rq, req);
+ blk_mq_requeue_request(req);
}
spin_unlock_irq(&info->io_lock);
+ blk_mq_kick_requeue_list(info->rq);

while ((bio = bio_list_pop(&bio_list)) != NULL) {
/* Traverse the list of pending bios and re-queue them */
--
1.7.10.4


2015-07-20 14:18:20

by David Vrabel

[permalink] [raw]
Subject: Re: [Xen-devel] [PATCH v3] xen/blkfront: convert to blk-mq APIs

On 13/07/15 10:55, Bob Liu wrote:
> Note: This patch is based on original work of Arianna's internship for
> GNOME's Outreach Program for Women.
>
> Only one hardware queue is used now, so there is no performance change.
>
> The legacy non-mq code is deleted completely which is the same as other
> drivers like virtio, mtip, and nvme.
>
> Also dropped one unnecessary holding of info->io_lock when calling
> blk_mq_stop_hw_queues().

Applied to for-linus-4.3, thanks.

> Changes in v2:
> - Reorganized blk_mq_queue_rq()
> - Restored most io_locks in place
>
> Change in v3:
> - Rename blk_mq_queue_rq to blkif_queue_rq

Next time, please put changes after a --- marker so they're
automatically dropped when applied.

David

2015-08-19 11:12:17

by Bob Liu

[permalink] [raw]
Subject: Re: [PATCH v3] xen/blkfront: convert to blk-mq APIs

Hi Jens & Christoph,

Rafal reported an issue with this patch: once it is applied, no more merges
happen and performance drops when null_blk is loaded with
"modprobe null_blk irqmode=2 completion_nsec=1000000",
but everything works fine with plain "modprobe null_blk".

I'm not sure whether this is expected or not.
Do you have any suggestions? Thank you!

Here is the test result:

fio --name=test --ioengine=libaio --rw=read --numjobs=8 --iodepth=32 \
--time_based=1 --runtime=30 --bs=4KB --filename=/dev/xvdb \
--direct=1 --group_reporting=1 --iodepth_batch=16

========================================================================
modprobe null_blk
========================================================================
------------------------------------------------------------------------
*no patch* (avgrq-sz = 8.00 avgqu-sz=5.00)
------------------------------------------------------------------------
READ: io=10655MB, aggrb=363694KB/s, minb=363694KB/s, maxb=363694KB/s, mint=30001msec, maxt=30001msec

Disk stats (read/write):
xvdb: ios=2715852/0, merge=1089/0, ticks=126572/0, in_queue=127456, util=100.00%

------------------------------------------------------------------------
*with patch* (avgrq-sz = 8.00 avgqu-sz=8.00)
------------------------------------------------------------------------
READ: io=20655MB, aggrb=705010KB/s, minb=705010KB/s, maxb=705010KB/s, mint=30001msec, maxt=30001msec

Disk stats (read/write):
xvdb: ios=5274633/0, merge=22/0, ticks=243208/0, in_queue=242908, util=99.98%

========================================================================
modprobe null_blk irqmode=2 completion_nsec=1000000
========================================================================
------------------------------------------------------------------------
*no patch* (avgrq-sz = 34.00 avgqu-sz=38.00)
------------------------------------------------------------------------
READ: io=10372MB, aggrb=354008KB/s, minb=354008KB/s, maxb=354008KB/s, mint=30003msec, maxt=30003msec

Disk stats (read/write):
xvdb: ios=621760/0, *merge=1988170/0*, ticks=1136700/0, in_queue=1146020, util=99.76%

------------------------------------------------------------------------
*with patch* (avgrq-sz = 8.00 avgqu-sz=28.00)
------------------------------------------------------------------------
READ: io=2876.8MB, aggrb=98187KB/s, minb=98187KB/s, maxb=98187KB/s, mint=30002msec, maxt=30002msec

Disk stats (read/write):
xvdb: ios=734048/0, merge=0/0, ticks=843584/0, in_queue=843080, util=99.72%

Regards,
-Bob

On 07/13/2015 05:55 PM, Bob Liu wrote:
> Note: This patch is based on original work of Arianna's internship for
> GNOME's Outreach Program for Women.
>
> Only one hardware queue is used now, so there is no performance change.
>
> The legacy non-mq code is deleted completely which is the same as other
> drivers like virtio, mtip, and nvme.
>
> Also dropped one unnecessary holding of info->io_lock when calling
> blk_mq_stop_hw_queues().
>
> Changes in v2:
> - Reorganized blk_mq_queue_rq()
> - Restored most io_locks in place
>
> Change in v3:
> - Rename blk_mq_queue_rq to blkif_queue_rq
>
> Signed-off-by: Arianna Avanzini <[email protected]>
> Signed-off-by: Bob Liu <[email protected]>
> Reviewed-by: Christoph Hellwig <[email protected]>
> Acked-by: Jens Axboe <[email protected]>
> ---
> drivers/block/xen-blkfront.c | 146 +++++++++++++++++-------------------------
> 1 file changed, 60 insertions(+), 86 deletions(-)
>
> diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
> index 6d89ed3..5b45ee5 100644
> --- a/drivers/block/xen-blkfront.c
> +++ b/drivers/block/xen-blkfront.c
> @@ -37,6 +37,7 @@
>
> #include <linux/interrupt.h>
> #include <linux/blkdev.h>
> +#include <linux/blk-mq.h>
> #include <linux/hdreg.h>
> #include <linux/cdrom.h>
> #include <linux/module.h>
> @@ -148,6 +149,7 @@ struct blkfront_info
> unsigned int feature_persistent:1;
> unsigned int max_indirect_segments;
> int is_ready;
> + struct blk_mq_tag_set tag_set;
> };
>
> static unsigned int nr_minors;
> @@ -616,54 +618,41 @@ static inline bool blkif_request_flush_invalid(struct request *req,
> !(info->feature_flush & REQ_FUA)));
> }
>
> -/*
> - * do_blkif_request
> - * read a block; request is in a request queue
> - */
> -static void do_blkif_request(struct request_queue *rq)
> +static int blkif_queue_rq(struct blk_mq_hw_ctx *hctx,
> + const struct blk_mq_queue_data *qd)
> {
> - struct blkfront_info *info = NULL;
> - struct request *req;
> - int queued;
> -
> - pr_debug("Entered do_blkif_request\n");
> -
> - queued = 0;
> + struct blkfront_info *info = qd->rq->rq_disk->private_data;
>
> - while ((req = blk_peek_request(rq)) != NULL) {
> - info = req->rq_disk->private_data;
> -
> - if (RING_FULL(&info->ring))
> - goto wait;
> + blk_mq_start_request(qd->rq);
> + spin_lock_irq(&info->io_lock);
> + if (RING_FULL(&info->ring))
> + goto out_busy;
>
> - blk_start_request(req);
> + if (blkif_request_flush_invalid(qd->rq, info))
> + goto out_err;
>
> - if (blkif_request_flush_invalid(req, info)) {
> - __blk_end_request_all(req, -EOPNOTSUPP);
> - continue;
> - }
> + if (blkif_queue_request(qd->rq))
> + goto out_busy;
>
> - pr_debug("do_blk_req %p: cmd %p, sec %lx, "
> - "(%u/%u) [%s]\n",
> - req, req->cmd, (unsigned long)blk_rq_pos(req),
> - blk_rq_cur_sectors(req), blk_rq_sectors(req),
> - rq_data_dir(req) ? "write" : "read");
> -
> - if (blkif_queue_request(req)) {
> - blk_requeue_request(rq, req);
> -wait:
> - /* Avoid pointless unplugs. */
> - blk_stop_queue(rq);
> - break;
> - }
> + flush_requests(info);
> + spin_unlock_irq(&info->io_lock);
> + return BLK_MQ_RQ_QUEUE_OK;
>
> - queued++;
> - }
> +out_err:
> + spin_unlock_irq(&info->io_lock);
> + return BLK_MQ_RQ_QUEUE_ERROR;
>
> - if (queued != 0)
> - flush_requests(info);
> +out_busy:
> + spin_unlock_irq(&info->io_lock);
> + blk_mq_stop_hw_queue(hctx);
> + return BLK_MQ_RQ_QUEUE_BUSY;
> }
>
> +static struct blk_mq_ops blkfront_mq_ops = {
> + .queue_rq = blkif_queue_rq,
> + .map_queue = blk_mq_map_queue,
> +};
> +
> static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size,
> unsigned int physical_sector_size,
> unsigned int segments)
> @@ -671,9 +660,22 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size,
> struct request_queue *rq;
> struct blkfront_info *info = gd->private_data;
>
> - rq = blk_init_queue(do_blkif_request, &info->io_lock);
> - if (rq == NULL)
> + memset(&info->tag_set, 0, sizeof(info->tag_set));
> + info->tag_set.ops = &blkfront_mq_ops;
> + info->tag_set.nr_hw_queues = 1;
> + info->tag_set.queue_depth = BLK_RING_SIZE(info);
> + info->tag_set.numa_node = NUMA_NO_NODE;
> + info->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
> + info->tag_set.cmd_size = 0;
> + info->tag_set.driver_data = info;
> +
> + if (blk_mq_alloc_tag_set(&info->tag_set))
> return -1;
> + rq = blk_mq_init_queue(&info->tag_set);
> + if (IS_ERR(rq)) {
> + blk_mq_free_tag_set(&info->tag_set);
> + return -1;
> + }
>
> queue_flag_set_unlocked(QUEUE_FLAG_VIRT, rq);
>
> @@ -901,19 +903,15 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
> static void xlvbd_release_gendisk(struct blkfront_info *info)
> {
> unsigned int minor, nr_minors;
> - unsigned long flags;
>
> if (info->rq == NULL)
> return;
>
> - spin_lock_irqsave(&info->io_lock, flags);
> -
> /* No more blkif_request(). */
> - blk_stop_queue(info->rq);
> + blk_mq_stop_hw_queues(info->rq);
>
> /* No more gnttab callback work. */
> gnttab_cancel_free_callback(&info->callback);
> - spin_unlock_irqrestore(&info->io_lock, flags);
>
> /* Flush gnttab callback work. Must be done with no locks held. */
> flush_work(&info->work);
> @@ -925,20 +923,18 @@ static void xlvbd_release_gendisk(struct blkfront_info *info)
> xlbd_release_minors(minor, nr_minors);
>
> blk_cleanup_queue(info->rq);
> + blk_mq_free_tag_set(&info->tag_set);
> info->rq = NULL;
>
> put_disk(info->gd);
> info->gd = NULL;
> }
>
> +/* Must be called with io_lock holded */
> static void kick_pending_request_queues(struct blkfront_info *info)
> {
> - if (!RING_FULL(&info->ring)) {
> - /* Re-enable calldowns. */
> - blk_start_queue(info->rq);
> - /* Kick things off immediately. */
> - do_blkif_request(info->rq);
> - }
> + if (!RING_FULL(&info->ring))
> + blk_mq_start_stopped_hw_queues(info->rq, true);
> }
>
> static void blkif_restart_queue(struct work_struct *work)
> @@ -963,7 +959,7 @@ static void blkif_free(struct blkfront_info *info, int suspend)
> BLKIF_STATE_SUSPENDED : BLKIF_STATE_DISCONNECTED;
> /* No more blkif_request(). */
> if (info->rq)
> - blk_stop_queue(info->rq);
> + blk_mq_stop_hw_queues(info->rq);
>
> /* Remove all persistent grants */
> if (!list_empty(&info->grants)) {
> @@ -1144,7 +1140,6 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
> RING_IDX i, rp;
> unsigned long flags;
> struct blkfront_info *info = (struct blkfront_info *)dev_id;
> - int error;
>
> spin_lock_irqsave(&info->io_lock, flags);
>
> @@ -1185,37 +1180,37 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
> continue;
> }
>
> - error = (bret->status == BLKIF_RSP_OKAY) ? 0 : -EIO;
> + req->errors = (bret->status == BLKIF_RSP_OKAY) ? 0 : -EIO;
> switch (bret->operation) {
> case BLKIF_OP_DISCARD:
> if (unlikely(bret->status == BLKIF_RSP_EOPNOTSUPP)) {
> struct request_queue *rq = info->rq;
> printk(KERN_WARNING "blkfront: %s: %s op failed\n",
> info->gd->disk_name, op_name(bret->operation));
> - error = -EOPNOTSUPP;
> + req->errors = -EOPNOTSUPP;
> info->feature_discard = 0;
> info->feature_secdiscard = 0;
> queue_flag_clear(QUEUE_FLAG_DISCARD, rq);
> queue_flag_clear(QUEUE_FLAG_SECDISCARD, rq);
> }
> - __blk_end_request_all(req, error);
> + blk_mq_complete_request(req);
> break;
> case BLKIF_OP_FLUSH_DISKCACHE:
> case BLKIF_OP_WRITE_BARRIER:
> if (unlikely(bret->status == BLKIF_RSP_EOPNOTSUPP)) {
> printk(KERN_WARNING "blkfront: %s: %s op failed\n",
> info->gd->disk_name, op_name(bret->operation));
> - error = -EOPNOTSUPP;
> + req->errors = -EOPNOTSUPP;
> }
> if (unlikely(bret->status == BLKIF_RSP_ERROR &&
> info->shadow[id].req.u.rw.nr_segments == 0)) {
> printk(KERN_WARNING "blkfront: %s: empty %s op failed\n",
> info->gd->disk_name, op_name(bret->operation));
> - error = -EOPNOTSUPP;
> + req->errors = -EOPNOTSUPP;
> }
> - if (unlikely(error)) {
> - if (error == -EOPNOTSUPP)
> - error = 0;
> + if (unlikely(req->errors)) {
> + if (req->errors == -EOPNOTSUPP)
> + req->errors = 0;
> info->feature_flush = 0;
> xlvbd_flush(info);
> }
> @@ -1226,7 +1221,7 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
> dev_dbg(&info->xbdev->dev, "Bad return from blkdev data "
> "request: %x\n", bret->status);
>
> - __blk_end_request_all(req, error);
> + blk_mq_complete_request(req);
> break;
> default:
> BUG();
> @@ -1555,28 +1550,6 @@ static int blkif_recover(struct blkfront_info *info)
>
> kfree(copy);
>
> - /*
> - * Empty the queue, this is important because we might have
> - * requests in the queue with more segments than what we
> - * can handle now.
> - */
> - spin_lock_irq(&info->io_lock);
> - while ((req = blk_fetch_request(info->rq)) != NULL) {
> - if (req->cmd_flags &
> - (REQ_FLUSH | REQ_FUA | REQ_DISCARD | REQ_SECURE)) {
> - list_add(&req->queuelist, &requests);
> - continue;
> - }
> - merge_bio.head = req->bio;
> - merge_bio.tail = req->biotail;
> - bio_list_merge(&bio_list, &merge_bio);
> - req->bio = NULL;
> - if (req->cmd_flags & (REQ_FLUSH | REQ_FUA))
> - pr_alert("diskcache flush request found!\n");
> - __blk_end_request_all(req, 0);
> - }
> - spin_unlock_irq(&info->io_lock);
> -
> xenbus_switch_state(info->xbdev, XenbusStateConnected);
>
> spin_lock_irq(&info->io_lock);
> @@ -1591,9 +1564,10 @@ static int blkif_recover(struct blkfront_info *info)
> /* Requeue pending requests (flush or discard) */
> list_del_init(&req->queuelist);
> BUG_ON(req->nr_phys_segments > segs);
> - blk_requeue_request(info->rq, req);
> + blk_mq_requeue_request(req);
> }
> spin_unlock_irq(&info->io_lock);
> + blk_mq_kick_requeue_list(info->rq);
>
> while ((bio = bio_list_pop(&bio_list)) != NULL) {
> /* Traverse the list of pending bios and re-queue them */
>

2015-08-21 08:46:47

by Rafal Mielniczuk

[permalink] [raw]
Subject: Re: [PATCH v3] xen/blkfront: convert to blk-mq APIs

On 19/08/15 12:12, Bob Liu wrote:
> Hi Jens & Christoph,
>
> Rafal reported an issue about this patch, that's after this patch no more
> merges happen and the performance dropped if "modprobe null_blk irqmode=2 completion_nsec=1000000",
> but works fine if "modprobe null_blk".
>
> I'm not sure whether it's as expect or not.
> Do you have any suggestions? Thank you!
>
> Here is the test result:
>
> fio --name=test --ioengine=libaio --rw=read --numjobs=8 --iodepth=32 \
> --time_based=1 --runtime=30 --bs=4KB --filename=/dev/xvdb \
> --direct=1 --group_reporting=1 --iodepth_batch=16
>
> ========================================================================
> modprobe null_blk
> ========================================================================
> ------------------------------------------------------------------------
> *no patch* (avgrq-sz = 8.00 avgqu-sz=5.00)
> ------------------------------------------------------------------------
> READ: io=10655MB, aggrb=363694KB/s, minb=363694KB/s, maxb=363694KB/s, mint=30001msec, maxt=30001msec
>
> Disk stats (read/write):
> xvdb: ios=2715852/0, merge=1089/0, ticks=126572/0, in_queue=127456, util=100.00%
>
> ------------------------------------------------------------------------
> *with patch* (avgrq-sz = 8.00 avgqu-sz=8.00)
> ------------------------------------------------------------------------
> READ: io=20655MB, aggrb=705010KB/s, minb=705010KB/s, maxb=705010KB/s, mint=30001msec, maxt=30001msec
>
> Disk stats (read/write):
> xvdb: ios=5274633/0, merge=22/0, ticks=243208/0, in_queue=242908, util=99.98%
>
> ========================================================================
> modprobe null_blk irqmode=2 completion_nsec=1000000
> ========================================================================
> ------------------------------------------------------------------------
> *no patch* (avgrq-sz = 34.00 avgqu-sz=38.00)
> ------------------------------------------------------------------------
> READ: io=10372MB, aggrb=354008KB/s, minb=354008KB/s, maxb=354008KB/s, mint=30003msec, maxt=30003msec
>
> Disk stats (read/write):
> xvdb: ios=621760/0, *merge=1988170/0*, ticks=1136700/0, in_queue=1146020, util=99.76%
>
> ------------------------------------------------------------------------
> *with patch* (avgrq-sz = 8.00 avgqu-sz=28.00)
> ------------------------------------------------------------------------
> READ: io=2876.8MB, aggrb=98187KB/s, minb=98187KB/s, maxb=98187KB/s, mint=30002msec, maxt=30002msec
>
> Disk stats (read/write):
> xvdb: ios=734048/0, merge=0/0, ticks=843584/0, in_queue=843080, util=99.72%
>
> Regards,
> -Bob

Hello,

We also saw the lack of merges when we tested on a null_blk device in dom0 directly.
When we enabled the multi-queue block layer we got no merges, even when we set the number of submission queues to 1.

If I'm not missing anything, that could suggest the problem lies somewhere in the blk-mq layer itself?

Please take a look at the results below:

fio --name=test --ioengine=libaio --rw=read --numjobs=8 --iodepth=32 \
--time_based=1 --runtime=30 --bs=4KB --filename=/dev/nullb0 \
--direct=1 --group_reporting=1

========================================================================
modprobe null_blk irqmode=2 completion_nsec=1000000 queue_mode=1 submit_queues=1
========================================================================
READ: io=13692MB, aggrb=467320KB/s, minb=467320KB/s, maxb=467320KB/s, mint=30002msec, maxt=30002msec

Disk stats (read/write):
nullb0: ios=991026/0, merge=2499524/0, ticks=1846952/0, in_queue=900012, util=100.00%

========================================================================
modprobe null_blk irqmode=2 completion_nsec=1000000 queue_mode=2 submit_queues=1
========================================================================
READ: io=6839.1MB, aggrb=233452KB/s, minb=233452KB/s, maxb=233452KB/s, mint=30002msec, maxt=30002msec

Disk stats (read/write):
nullb0: ios=1743967/0, merge=0/0, ticks=1712900/0, in_queue=1839072, util=100.00%

Thanks,
Rafal

>
> On 07/13/2015 05:55 PM, Bob Liu wrote:
>> Note: This patch is based on original work of Arianna's internship for
>> GNOME's Outreach Program for Women.
>>
>> Only one hardware queue is used now, so there is no performance change.
>>
>> The legacy non-mq code is deleted completely which is the same as other
>> drivers like virtio, mtip, and nvme.
>>
>> Also dropped one unnecessary holding of info->io_lock when calling
>> blk_mq_stop_hw_queues().
>>
>> Changes in v2:
>> - Reorganized blk_mq_queue_rq()
>> - Restored most io_locks in place
>>
>> Change in v3:
>> - Rename blk_mq_queue_rq to blkif_queue_rq
>>
>> Signed-off-by: Arianna Avanzini <[email protected]>
>> Signed-off-by: Bob Liu <[email protected]>
>> Reviewed-by: Christoph Hellwig <[email protected]>
>> Acked-by: Jens Axboe <[email protected]>
>> ---
>> drivers/block/xen-blkfront.c | 146 +++++++++++++++++-------------------------
>> 1 file changed, 60 insertions(+), 86 deletions(-)
>>
>> diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
>> index 6d89ed3..5b45ee5 100644
>> --- a/drivers/block/xen-blkfront.c
>> +++ b/drivers/block/xen-blkfront.c
>> @@ -37,6 +37,7 @@
>>
>> #include <linux/interrupt.h>
>> #include <linux/blkdev.h>
>> +#include <linux/blk-mq.h>
>> #include <linux/hdreg.h>
>> #include <linux/cdrom.h>
>> #include <linux/module.h>
>> @@ -148,6 +149,7 @@ struct blkfront_info
>> unsigned int feature_persistent:1;
>> unsigned int max_indirect_segments;
>> int is_ready;
>> + struct blk_mq_tag_set tag_set;
>> };
>>
>> static unsigned int nr_minors;
>> @@ -616,54 +618,41 @@ static inline bool blkif_request_flush_invalid(struct request *req,
>> !(info->feature_flush & REQ_FUA)));
>> }
>>
>> -/*
>> - * do_blkif_request
>> - * read a block; request is in a request queue
>> - */
>> -static void do_blkif_request(struct request_queue *rq)
>> +static int blkif_queue_rq(struct blk_mq_hw_ctx *hctx,
>> + const struct blk_mq_queue_data *qd)
>> {
>> - struct blkfront_info *info = NULL;
>> - struct request *req;
>> - int queued;
>> -
>> - pr_debug("Entered do_blkif_request\n");
>> -
>> - queued = 0;
>> + struct blkfront_info *info = qd->rq->rq_disk->private_data;
>>
>> - while ((req = blk_peek_request(rq)) != NULL) {
>> - info = req->rq_disk->private_data;
>> -
>> - if (RING_FULL(&info->ring))
>> - goto wait;
>> + blk_mq_start_request(qd->rq);
>> + spin_lock_irq(&info->io_lock);
>> + if (RING_FULL(&info->ring))
>> + goto out_busy;
>>
>> - blk_start_request(req);
>> + if (blkif_request_flush_invalid(qd->rq, info))
>> + goto out_err;
>>
>> - if (blkif_request_flush_invalid(req, info)) {
>> - __blk_end_request_all(req, -EOPNOTSUPP);
>> - continue;
>> - }
>> + if (blkif_queue_request(qd->rq))
>> + goto out_busy;
>>
>> - pr_debug("do_blk_req %p: cmd %p, sec %lx, "
>> - "(%u/%u) [%s]\n",
>> - req, req->cmd, (unsigned long)blk_rq_pos(req),
>> - blk_rq_cur_sectors(req), blk_rq_sectors(req),
>> - rq_data_dir(req) ? "write" : "read");
>> -
>> - if (blkif_queue_request(req)) {
>> - blk_requeue_request(rq, req);
>> -wait:
>> - /* Avoid pointless unplugs. */
>> - blk_stop_queue(rq);
>> - break;
>> - }
>> + flush_requests(info);
>> + spin_unlock_irq(&info->io_lock);
>> + return BLK_MQ_RQ_QUEUE_OK;
>>
>> - queued++;
>> - }
>> +out_err:
>> + spin_unlock_irq(&info->io_lock);
>> + return BLK_MQ_RQ_QUEUE_ERROR;
>>
>> - if (queued != 0)
>> - flush_requests(info);
>> +out_busy:
>> + spin_unlock_irq(&info->io_lock);
>> + blk_mq_stop_hw_queue(hctx);
>> + return BLK_MQ_RQ_QUEUE_BUSY;
>> }
>>
>> +static struct blk_mq_ops blkfront_mq_ops = {
>> + .queue_rq = blkif_queue_rq,
>> + .map_queue = blk_mq_map_queue,
>> +};
>> +
>> static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size,
>> unsigned int physical_sector_size,
>> unsigned int segments)
>> @@ -671,9 +660,22 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size,
>> struct request_queue *rq;
>> struct blkfront_info *info = gd->private_data;
>>
>> - rq = blk_init_queue(do_blkif_request, &info->io_lock);
>> - if (rq == NULL)
>> + memset(&info->tag_set, 0, sizeof(info->tag_set));
>> + info->tag_set.ops = &blkfront_mq_ops;
>> + info->tag_set.nr_hw_queues = 1;
>> + info->tag_set.queue_depth = BLK_RING_SIZE(info);
>> + info->tag_set.numa_node = NUMA_NO_NODE;
>> + info->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
>> + info->tag_set.cmd_size = 0;
>> + info->tag_set.driver_data = info;
>> +
>> + if (blk_mq_alloc_tag_set(&info->tag_set))
>> return -1;
>> + rq = blk_mq_init_queue(&info->tag_set);
>> + if (IS_ERR(rq)) {
>> + blk_mq_free_tag_set(&info->tag_set);
>> + return -1;
>> + }
>>
>> queue_flag_set_unlocked(QUEUE_FLAG_VIRT, rq);
>>
>> @@ -901,19 +903,15 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
>> static void xlvbd_release_gendisk(struct blkfront_info *info)
>> {
>> unsigned int minor, nr_minors;
>> - unsigned long flags;
>>
>> if (info->rq == NULL)
>> return;
>>
>> - spin_lock_irqsave(&info->io_lock, flags);
>> -
>> /* No more blkif_request(). */
>> - blk_stop_queue(info->rq);
>> + blk_mq_stop_hw_queues(info->rq);
>>
>> /* No more gnttab callback work. */
>> gnttab_cancel_free_callback(&info->callback);
>> - spin_unlock_irqrestore(&info->io_lock, flags);
>>
>> /* Flush gnttab callback work. Must be done with no locks held. */
>> flush_work(&info->work);
>> @@ -925,20 +923,18 @@ static void xlvbd_release_gendisk(struct blkfront_info *info)
>> xlbd_release_minors(minor, nr_minors);
>>
>> blk_cleanup_queue(info->rq);
>> + blk_mq_free_tag_set(&info->tag_set);
>> info->rq = NULL;
>>
>> put_disk(info->gd);
>> info->gd = NULL;
>> }
>>
>> +/* Must be called with io_lock holded */
>> static void kick_pending_request_queues(struct blkfront_info *info)
>> {
>> - if (!RING_FULL(&info->ring)) {
>> - /* Re-enable calldowns. */
>> - blk_start_queue(info->rq);
>> - /* Kick things off immediately. */
>> - do_blkif_request(info->rq);
>> - }
>> + if (!RING_FULL(&info->ring))
>> + blk_mq_start_stopped_hw_queues(info->rq, true);
>> }
>>
>> static void blkif_restart_queue(struct work_struct *work)
>> @@ -963,7 +959,7 @@ static void blkif_free(struct blkfront_info *info, int suspend)
>> BLKIF_STATE_SUSPENDED : BLKIF_STATE_DISCONNECTED;
>> /* No more blkif_request(). */
>> if (info->rq)
>> - blk_stop_queue(info->rq);
>> + blk_mq_stop_hw_queues(info->rq);
>>
>> /* Remove all persistent grants */
>> if (!list_empty(&info->grants)) {
>> @@ -1144,7 +1140,6 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
>> RING_IDX i, rp;
>> unsigned long flags;
>> struct blkfront_info *info = (struct blkfront_info *)dev_id;
>> - int error;
>>
>> spin_lock_irqsave(&info->io_lock, flags);
>>
>> @@ -1185,37 +1180,37 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
>> continue;
>> }
>>
>> - error = (bret->status == BLKIF_RSP_OKAY) ? 0 : -EIO;
>> + req->errors = (bret->status == BLKIF_RSP_OKAY) ? 0 : -EIO;
>> switch (bret->operation) {
>> case BLKIF_OP_DISCARD:
>> if (unlikely(bret->status == BLKIF_RSP_EOPNOTSUPP)) {
>> struct request_queue *rq = info->rq;
>> printk(KERN_WARNING "blkfront: %s: %s op failed\n",
>> info->gd->disk_name, op_name(bret->operation));
>> - error = -EOPNOTSUPP;
>> + req->errors = -EOPNOTSUPP;
>> info->feature_discard = 0;
>> info->feature_secdiscard = 0;
>> queue_flag_clear(QUEUE_FLAG_DISCARD, rq);
>> queue_flag_clear(QUEUE_FLAG_SECDISCARD, rq);
>> }
>> - __blk_end_request_all(req, error);
>> + blk_mq_complete_request(req);
>> break;
>> case BLKIF_OP_FLUSH_DISKCACHE:
>> case BLKIF_OP_WRITE_BARRIER:
>> if (unlikely(bret->status == BLKIF_RSP_EOPNOTSUPP)) {
>> printk(KERN_WARNING "blkfront: %s: %s op failed\n",
>> info->gd->disk_name, op_name(bret->operation));
>> - error = -EOPNOTSUPP;
>> + req->errors = -EOPNOTSUPP;
>> }
>> if (unlikely(bret->status == BLKIF_RSP_ERROR &&
>> info->shadow[id].req.u.rw.nr_segments == 0)) {
>> printk(KERN_WARNING "blkfront: %s: empty %s op failed\n",
>> info->gd->disk_name, op_name(bret->operation));
>> - error = -EOPNOTSUPP;
>> + req->errors = -EOPNOTSUPP;
>> }
>> - if (unlikely(error)) {
>> - if (error == -EOPNOTSUPP)
>> - error = 0;
>> + if (unlikely(req->errors)) {
>> + if (req->errors == -EOPNOTSUPP)
>> + req->errors = 0;
>> info->feature_flush = 0;
>> xlvbd_flush(info);
>> }
>> @@ -1226,7 +1221,7 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
>> dev_dbg(&info->xbdev->dev, "Bad return from blkdev data "
>> "request: %x\n", bret->status);
>>
>> - __blk_end_request_all(req, error);
>> + blk_mq_complete_request(req);
>> break;
>> default:
>> BUG();
>> @@ -1555,28 +1550,6 @@ static int blkif_recover(struct blkfront_info *info)
>>
>> kfree(copy);
>>
>> - /*
>> - * Empty the queue, this is important because we might have
>> - * requests in the queue with more segments than what we
>> - * can handle now.
>> - */
>> - spin_lock_irq(&info->io_lock);
>> - while ((req = blk_fetch_request(info->rq)) != NULL) {
>> - if (req->cmd_flags &
>> - (REQ_FLUSH | REQ_FUA | REQ_DISCARD | REQ_SECURE)) {
>> - list_add(&req->queuelist, &requests);
>> - continue;
>> - }
>> - merge_bio.head = req->bio;
>> - merge_bio.tail = req->biotail;
>> - bio_list_merge(&bio_list, &merge_bio);
>> - req->bio = NULL;
>> - if (req->cmd_flags & (REQ_FLUSH | REQ_FUA))
>> - pr_alert("diskcache flush request found!\n");
>> - __blk_end_request_all(req, 0);
>> - }
>> - spin_unlock_irq(&info->io_lock);
>> -
>> xenbus_switch_state(info->xbdev, XenbusStateConnected);
>>
>> spin_lock_irq(&info->io_lock);
>> @@ -1591,9 +1564,10 @@ static int blkif_recover(struct blkfront_info *info)
>> /* Requeue pending requests (flush or discard) */
>> list_del_init(&req->queuelist);
>> BUG_ON(req->nr_phys_segments > segs);
>> - blk_requeue_request(info->rq, req);
>> + blk_mq_requeue_request(req);
>> }
>> spin_unlock_irq(&info->io_lock);
>> + blk_mq_kick_requeue_list(info->rq);
>>
>> while ((bio = bio_list_pop(&bio_list)) != NULL) {
>> /* Traverse the list of pending bios and re-queue them */
>>

2015-08-25 02:14:46

by Bob Liu

[permalink] [raw]
Subject: Re: [PATCH v3] xen/blkfront: convert to blk-mq APIs

Hi Rafal,

Please try adding "--iodepth_batch=32 --iodepth_batch_complete=32" to the fio command line.
With these options I no longer see this issue, and not in domU either.

Thanks,
-Bob

On 08/21/2015 04:46 PM, Rafal Mielniczuk wrote:
> On 19/08/15 12:12, Bob Liu wrote:
>> Hi Jens & Christoph,
>>
>> Rafal reported an issue with this patch: after applying it, no more merges
>> happen and performance drops when using "modprobe null_blk irqmode=2 completion_nsec=1000000",
>> but it works fine when using "modprobe null_blk".
>>
>> I'm not sure whether this is expected or not.
>> Do you have any suggestions? Thank you!
>>
>> Here is the test result:
>>
>> fio --name=test --ioengine=libaio --rw=read --numjobs=8 --iodepth=32 \
>> --time_based=1 --runtime=30 --bs=4KB --filename=/dev/xvdb \
>> --direct=1 --group_reporting=1 --iodepth_batch=16
>>
>> ========================================================================
>> modprobe null_blk
>> ========================================================================
>> ------------------------------------------------------------------------
>> *no patch* (avgrq-sz = 8.00 avgqu-sz=5.00)
>> ------------------------------------------------------------------------
>> READ: io=10655MB, aggrb=363694KB/s, minb=363694KB/s, maxb=363694KB/s, mint=30001msec, maxt=30001msec
>>
>> Disk stats (read/write):
>> xvdb: ios=2715852/0, merge=1089/0, ticks=126572/0, in_queue=127456, util=100.00%
>>
>> ------------------------------------------------------------------------
>> *with patch* (avgrq-sz = 8.00 avgqu-sz=8.00)
>> ------------------------------------------------------------------------
>> READ: io=20655MB, aggrb=705010KB/s, minb=705010KB/s, maxb=705010KB/s, mint=30001msec, maxt=30001msec
>>
>> Disk stats (read/write):
>> xvdb: ios=5274633/0, merge=22/0, ticks=243208/0, in_queue=242908, util=99.98%
>>
>> ========================================================================
>> modprobe null_blk irqmode=2 completion_nsec=1000000
>> ========================================================================
>> ------------------------------------------------------------------------
>> *no patch* (avgrq-sz = 34.00 avgqu-sz=38.00)
>> ------------------------------------------------------------------------
>> READ: io=10372MB, aggrb=354008KB/s, minb=354008KB/s, maxb=354008KB/s, mint=30003msec, maxt=30003msec
>>
>> Disk stats (read/write):
>> xvdb: ios=621760/0, *merge=1988170/0*, ticks=1136700/0, in_queue=1146020, util=99.76%
>>
>> ------------------------------------------------------------------------
>> *with patch* (avgrq-sz = 8.00 avgqu-sz=28.00)
>> ------------------------------------------------------------------------
>> READ: io=2876.8MB, aggrb=98187KB/s, minb=98187KB/s, maxb=98187KB/s, mint=30002msec, maxt=30002msec
>>
>> Disk stats (read/write):
>> xvdb: ios=734048/0, merge=0/0, ticks=843584/0, in_queue=843080, util=99.72%
>>
>> Regards,
>> -Bob
>
> Hello,
>
> We also saw the lack of merges when we tested on a null_blk device in dom0 directly.
> When we enabled the multi-queue block layer we got no merges, even when we set the number of submission queues to 1.
>
> If I'm not missing anything, that could suggest the problem lies somewhere in the blk-mq layer itself?
>
> Please take a look at the results below:
>
> fio --name=test --ioengine=libaio --rw=read --numjobs=8 --iodepth=32 \
> --time_based=1 --runtime=30 --bs=4KB --filename=/dev/nullb0 \
> --direct=1 --group_reporting=1
>
> ========================================================================
> modprobe null_blk irqmode=2 completion_nsec=1000000 queue_mode=1 submit_queues=1
> ========================================================================
> READ: io=13692MB, aggrb=467320KB/s, minb=467320KB/s, maxb=467320KB/s, mint=30002msec, maxt=30002msec
>
> Disk stats (read/write):
> nullb0: ios=991026/0, merge=2499524/0, ticks=1846952/0, in_queue=900012, util=100.00%
>
> ========================================================================
> modprobe null_blk irqmode=2 completion_nsec=1000000 queue_mode=2 submit_queues=1
> ========================================================================
> READ: io=6839.1MB, aggrb=233452KB/s, minb=233452KB/s, maxb=233452KB/s, mint=30002msec, maxt=30002msec
>
> Disk stats (read/write):
> nullb0: ios=1743967/0, merge=0/0, ticks=1712900/0, in_queue=1839072, util=100.00%
>
> Thanks,
> Rafal
>
>>
>> On 07/13/2015 05:55 PM, Bob Liu wrote:
>>> Note: This patch is based on original work of Arianna's internship for
>>> GNOME's Outreach Program for Women.
>>>
>>> Only one hardware queue is used now, so there is no performance change.
>>>
>>> The legacy non-mq code is deleted completely which is the same as other
>>> drivers like virtio, mtip, and nvme.
>>>
>>> Also dropped one unnecessary holding of info->io_lock when calling
>>> blk_mq_stop_hw_queues().
>>>
>>> Changes in v2:
>>> - Reorganized blk_mq_queue_rq()
>>> - Restored most io_locks in place
>>>
>>> Change in v3:
>>> - Rename blk_mq_queue_rq to blkif_queue_rq
>>>
>>> Signed-off-by: Arianna Avanzini <[email protected]>
>>> Signed-off-by: Bob Liu <[email protected]>
>>> Reviewed-by: Christoph Hellwig <[email protected]>
>>> Acked-by: Jens Axboe <[email protected]>
>>> ---
>>> drivers/block/xen-blkfront.c | 146 +++++++++++++++++-------------------------
>>> 1 file changed, 60 insertions(+), 86 deletions(-)
>>>
>>> diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
>>> index 6d89ed3..5b45ee5 100644
>>> --- a/drivers/block/xen-blkfront.c
>>> +++ b/drivers/block/xen-blkfront.c
>>> @@ -37,6 +37,7 @@
>>>
>>> #include <linux/interrupt.h>
>>> #include <linux/blkdev.h>
>>> +#include <linux/blk-mq.h>
>>> #include <linux/hdreg.h>
>>> #include <linux/cdrom.h>
>>> #include <linux/module.h>
>>> @@ -148,6 +149,7 @@ struct blkfront_info
>>> unsigned int feature_persistent:1;
>>> unsigned int max_indirect_segments;
>>> int is_ready;
>>> + struct blk_mq_tag_set tag_set;
>>> };
>>>
>>> static unsigned int nr_minors;
>>> @@ -616,54 +618,41 @@ static inline bool blkif_request_flush_invalid(struct request *req,
>>> !(info->feature_flush & REQ_FUA)));
>>> }
>>>
>>> -/*
>>> - * do_blkif_request
>>> - * read a block; request is in a request queue
>>> - */
>>> -static void do_blkif_request(struct request_queue *rq)
>>> +static int blkif_queue_rq(struct blk_mq_hw_ctx *hctx,
>>> + const struct blk_mq_queue_data *qd)
>>> {
>>> - struct blkfront_info *info = NULL;
>>> - struct request *req;
>>> - int queued;
>>> -
>>> - pr_debug("Entered do_blkif_request\n");
>>> -
>>> - queued = 0;
>>> + struct blkfront_info *info = qd->rq->rq_disk->private_data;
>>>
>>> - while ((req = blk_peek_request(rq)) != NULL) {
>>> - info = req->rq_disk->private_data;
>>> -
>>> - if (RING_FULL(&info->ring))
>>> - goto wait;
>>> + blk_mq_start_request(qd->rq);
>>> + spin_lock_irq(&info->io_lock);
>>> + if (RING_FULL(&info->ring))
>>> + goto out_busy;
>>>
>>> - blk_start_request(req);
>>> + if (blkif_request_flush_invalid(qd->rq, info))
>>> + goto out_err;
>>>
>>> - if (blkif_request_flush_invalid(req, info)) {
>>> - __blk_end_request_all(req, -EOPNOTSUPP);
>>> - continue;
>>> - }
>>> + if (blkif_queue_request(qd->rq))
>>> + goto out_busy;
>>>
>>> - pr_debug("do_blk_req %p: cmd %p, sec %lx, "
>>> - "(%u/%u) [%s]\n",
>>> - req, req->cmd, (unsigned long)blk_rq_pos(req),
>>> - blk_rq_cur_sectors(req), blk_rq_sectors(req),
>>> - rq_data_dir(req) ? "write" : "read");
>>> -
>>> - if (blkif_queue_request(req)) {
>>> - blk_requeue_request(rq, req);
>>> -wait:
>>> - /* Avoid pointless unplugs. */
>>> - blk_stop_queue(rq);
>>> - break;
>>> - }
>>> + flush_requests(info);
>>> + spin_unlock_irq(&info->io_lock);
>>> + return BLK_MQ_RQ_QUEUE_OK;
>>>
>>> - queued++;
>>> - }
>>> +out_err:
>>> + spin_unlock_irq(&info->io_lock);
>>> + return BLK_MQ_RQ_QUEUE_ERROR;
>>>
>>> - if (queued != 0)
>>> - flush_requests(info);
>>> +out_busy:
>>> + spin_unlock_irq(&info->io_lock);
>>> + blk_mq_stop_hw_queue(hctx);
>>> + return BLK_MQ_RQ_QUEUE_BUSY;
>>> }
>>>
>>> +static struct blk_mq_ops blkfront_mq_ops = {
>>> + .queue_rq = blkif_queue_rq,
>>> + .map_queue = blk_mq_map_queue,
>>> +};
>>> +
>>> static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size,
>>> unsigned int physical_sector_size,
>>> unsigned int segments)
>>> @@ -671,9 +660,22 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size,
>>> struct request_queue *rq;
>>> struct blkfront_info *info = gd->private_data;
>>>
>>> - rq = blk_init_queue(do_blkif_request, &info->io_lock);
>>> - if (rq == NULL)
>>> + memset(&info->tag_set, 0, sizeof(info->tag_set));
>>> + info->tag_set.ops = &blkfront_mq_ops;
>>> + info->tag_set.nr_hw_queues = 1;
>>> + info->tag_set.queue_depth = BLK_RING_SIZE(info);
>>> + info->tag_set.numa_node = NUMA_NO_NODE;
>>> + info->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
>>> + info->tag_set.cmd_size = 0;
>>> + info->tag_set.driver_data = info;
>>> +
>>> + if (blk_mq_alloc_tag_set(&info->tag_set))
>>> return -1;
>>> + rq = blk_mq_init_queue(&info->tag_set);
>>> + if (IS_ERR(rq)) {
>>> + blk_mq_free_tag_set(&info->tag_set);
>>> + return -1;
>>> + }
>>>
>>> queue_flag_set_unlocked(QUEUE_FLAG_VIRT, rq);
>>>
>>> @@ -901,19 +903,15 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
>>> static void xlvbd_release_gendisk(struct blkfront_info *info)
>>> {
>>> unsigned int minor, nr_minors;
>>> - unsigned long flags;
>>>
>>> if (info->rq == NULL)
>>> return;
>>>
>>> - spin_lock_irqsave(&info->io_lock, flags);
>>> -
>>> /* No more blkif_request(). */
>>> - blk_stop_queue(info->rq);
>>> + blk_mq_stop_hw_queues(info->rq);
>>>
>>> /* No more gnttab callback work. */
>>> gnttab_cancel_free_callback(&info->callback);
>>> - spin_unlock_irqrestore(&info->io_lock, flags);
>>>
>>> /* Flush gnttab callback work. Must be done with no locks held. */
>>> flush_work(&info->work);
>>> @@ -925,20 +923,18 @@ static void xlvbd_release_gendisk(struct blkfront_info *info)
>>> xlbd_release_minors(minor, nr_minors);
>>>
>>> blk_cleanup_queue(info->rq);
>>> + blk_mq_free_tag_set(&info->tag_set);
>>> info->rq = NULL;
>>>
>>> put_disk(info->gd);
>>> info->gd = NULL;
>>> }
>>>
>>> +/* Must be called with io_lock holded */
>>> static void kick_pending_request_queues(struct blkfront_info *info)
>>> {
>>> - if (!RING_FULL(&info->ring)) {
>>> - /* Re-enable calldowns. */
>>> - blk_start_queue(info->rq);
>>> - /* Kick things off immediately. */
>>> - do_blkif_request(info->rq);
>>> - }
>>> + if (!RING_FULL(&info->ring))
>>> + blk_mq_start_stopped_hw_queues(info->rq, true);
>>> }
>>>
>>> static void blkif_restart_queue(struct work_struct *work)
>>> @@ -963,7 +959,7 @@ static void blkif_free(struct blkfront_info *info, int suspend)
>>> BLKIF_STATE_SUSPENDED : BLKIF_STATE_DISCONNECTED;
>>> /* No more blkif_request(). */
>>> if (info->rq)
>>> - blk_stop_queue(info->rq);
>>> + blk_mq_stop_hw_queues(info->rq);
>>>
>>> /* Remove all persistent grants */
>>> if (!list_empty(&info->grants)) {
>>> @@ -1144,7 +1140,6 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
>>> RING_IDX i, rp;
>>> unsigned long flags;
>>> struct blkfront_info *info = (struct blkfront_info *)dev_id;
>>> - int error;
>>>
>>> spin_lock_irqsave(&info->io_lock, flags);
>>>
>>> @@ -1185,37 +1180,37 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
>>> continue;
>>> }
>>>
>>> - error = (bret->status == BLKIF_RSP_OKAY) ? 0 : -EIO;
>>> + req->errors = (bret->status == BLKIF_RSP_OKAY) ? 0 : -EIO;
>>> switch (bret->operation) {
>>> case BLKIF_OP_DISCARD:
>>> if (unlikely(bret->status == BLKIF_RSP_EOPNOTSUPP)) {
>>> struct request_queue *rq = info->rq;
>>> printk(KERN_WARNING "blkfront: %s: %s op failed\n",
>>> info->gd->disk_name, op_name(bret->operation));
>>> - error = -EOPNOTSUPP;
>>> + req->errors = -EOPNOTSUPP;
>>> info->feature_discard = 0;
>>> info->feature_secdiscard = 0;
>>> queue_flag_clear(QUEUE_FLAG_DISCARD, rq);
>>> queue_flag_clear(QUEUE_FLAG_SECDISCARD, rq);
>>> }
>>> - __blk_end_request_all(req, error);
>>> + blk_mq_complete_request(req);
>>> break;
>>> case BLKIF_OP_FLUSH_DISKCACHE:
>>> case BLKIF_OP_WRITE_BARRIER:
>>> if (unlikely(bret->status == BLKIF_RSP_EOPNOTSUPP)) {
>>> printk(KERN_WARNING "blkfront: %s: %s op failed\n",
>>> info->gd->disk_name, op_name(bret->operation));
>>> - error = -EOPNOTSUPP;
>>> + req->errors = -EOPNOTSUPP;
>>> }
>>> if (unlikely(bret->status == BLKIF_RSP_ERROR &&
>>> info->shadow[id].req.u.rw.nr_segments == 0)) {
>>> printk(KERN_WARNING "blkfront: %s: empty %s op failed\n",
>>> info->gd->disk_name, op_name(bret->operation));
>>> - error = -EOPNOTSUPP;
>>> + req->errors = -EOPNOTSUPP;
>>> }
>>> - if (unlikely(error)) {
>>> - if (error == -EOPNOTSUPP)
>>> - error = 0;
>>> + if (unlikely(req->errors)) {
>>> + if (req->errors == -EOPNOTSUPP)
>>> + req->errors = 0;
>>> info->feature_flush = 0;
>>> xlvbd_flush(info);
>>> }
>>> @@ -1226,7 +1221,7 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
>>> dev_dbg(&info->xbdev->dev, "Bad return from blkdev data "
>>> "request: %x\n", bret->status);
>>>
>>> - __blk_end_request_all(req, error);
>>> + blk_mq_complete_request(req);
>>> break;
>>> default:
>>> BUG();
>>> @@ -1555,28 +1550,6 @@ static int blkif_recover(struct blkfront_info *info)
>>>
>>> kfree(copy);
>>>
>>> - /*
>>> - * Empty the queue, this is important because we might have
>>> - * requests in the queue with more segments than what we
>>> - * can handle now.
>>> - */
>>> - spin_lock_irq(&info->io_lock);
>>> - while ((req = blk_fetch_request(info->rq)) != NULL) {
>>> - if (req->cmd_flags &
>>> - (REQ_FLUSH | REQ_FUA | REQ_DISCARD | REQ_SECURE)) {
>>> - list_add(&req->queuelist, &requests);
>>> - continue;
>>> - }
>>> - merge_bio.head = req->bio;
>>> - merge_bio.tail = req->biotail;
>>> - bio_list_merge(&bio_list, &merge_bio);
>>> - req->bio = NULL;
>>> - if (req->cmd_flags & (REQ_FLUSH | REQ_FUA))
>>> - pr_alert("diskcache flush request found!\n");
>>> - __blk_end_request_all(req, 0);
>>> - }
>>> - spin_unlock_irq(&info->io_lock);
>>> -
>>> xenbus_switch_state(info->xbdev, XenbusStateConnected);
>>>
>>> spin_lock_irq(&info->io_lock);
>>> @@ -1591,9 +1564,10 @@ static int blkif_recover(struct blkfront_info *info)
>>> /* Requeue pending requests (flush or discard) */
>>> list_del_init(&req->queuelist);
>>> BUG_ON(req->nr_phys_segments > segs);
>>> - blk_requeue_request(info->rq, req);
>>> + blk_mq_requeue_request(req);
>>> }
>>> spin_unlock_irq(&info->io_lock);
>>> + blk_mq_kick_requeue_list(info->rq);
>>>
>>> while ((bio = bio_list_pop(&bio_list)) != NULL) {
>>> /* Traverse the list of pending bios and re-queue them */
>>>

2015-08-27 11:06:20

by Rafal Mielniczuk

[permalink] [raw]
Subject: Re: [PATCH v3] xen/blkfront: convert to blk-mq APIs

On 25/08/15 03:14, Bob Liu wrote:
> Hi Rafal,
>
> Please have a try adding "--iodepth_batch=32 --iodepth_batch_complete=32" to the fio command line.
> I didn't see this issue any more, neither for domU.
>
> Thanks,
> -Bob

Hello,

Using the 4.2-rc8 kernel, I can confirm that merges happen after adding these options,
both in dom0 and in the guest using a null_blk device.

Using the latest stable kernel 4.1.6 with the same configuration, there are no merges.

Do you know which change might have caused this difference?

Below are my results for dom0:

====================================================================================
modprobe null_blk irqmode=2 completion_nsec=1000000 queue_mode=1 submit_queues=1
====================================================================================

READ: io=22111MB, aggrb=754705KB/s, minb=754705KB/s, maxb=754705KB/s, mint=30001msec, maxt=30001msec

Disk stats (read/write):
nullb0: ios=352340/0, merge=5285340/0, ticks=397340/0, in_queue=396112, util=95.87%

====================================================================================
modprobe null_blk irqmode=2 completion_nsec=1000000 queue_mode=2 submit_queues=1
====================================================================================

READ: io=22409MB, aggrb=764838KB/s, minb=764838KB/s, maxb=764838KB/s, mint=30002msec, maxt=30002msec

Disk stats (read/write):
nullb0: ios=357070/0, merge=5356290/0, ticks=420812/0, in_queue=450772, util=99.32%

Thanks,
Rafal
>
> On 08/21/2015 04:46 PM, Rafal Mielniczuk wrote:
>> On 19/08/15 12:12, Bob Liu wrote:
>>> Hi Jens & Christoph,
>>>
>>> Rafal reported an issue with this patch: after applying it, no more merges
>>> happen and performance drops when using "modprobe null_blk irqmode=2 completion_nsec=1000000",
>>> but it works fine when using "modprobe null_blk".
>>>
>>> I'm not sure whether this is expected or not.
>>> Do you have any suggestions? Thank you!
>>>
>>> Here is the test result:
>>>
>>> fio --name=test --ioengine=libaio --rw=read --numjobs=8 --iodepth=32 \
>>> --time_based=1 --runtime=30 --bs=4KB --filename=/dev/xvdb \
>>> --direct=1 --group_reporting=1 --iodepth_batch=16
>>>
>>> ========================================================================
>>> modprobe null_blk
>>> ========================================================================
>>> ------------------------------------------------------------------------
>>> *no patch* (avgrq-sz = 8.00 avgqu-sz=5.00)
>>> ------------------------------------------------------------------------
>>> READ: io=10655MB, aggrb=363694KB/s, minb=363694KB/s, maxb=363694KB/s, mint=30001msec, maxt=30001msec
>>>
>>> Disk stats (read/write):
>>> xvdb: ios=2715852/0, merge=1089/0, ticks=126572/0, in_queue=127456, util=100.00%
>>>
>>> ------------------------------------------------------------------------
>>> *with patch* (avgrq-sz = 8.00 avgqu-sz=8.00)
>>> ------------------------------------------------------------------------
>>> READ: io=20655MB, aggrb=705010KB/s, minb=705010KB/s, maxb=705010KB/s, mint=30001msec, maxt=30001msec
>>>
>>> Disk stats (read/write):
>>> xvdb: ios=5274633/0, merge=22/0, ticks=243208/0, in_queue=242908, util=99.98%
>>>
>>> ========================================================================
>>> modprobe null_blk irqmode=2 completion_nsec=1000000
>>> ========================================================================
>>> ------------------------------------------------------------------------
>>> *no patch* (avgrq-sz = 34.00 avgqu-sz=38.00)
>>> ------------------------------------------------------------------------
>>> READ: io=10372MB, aggrb=354008KB/s, minb=354008KB/s, maxb=354008KB/s, mint=30003msec, maxt=30003msec
>>>
>>> Disk stats (read/write):
>>> xvdb: ios=621760/0, *merge=1988170/0*, ticks=1136700/0, in_queue=1146020, util=99.76%
>>>
>>> ------------------------------------------------------------------------
>>> *with patch* (avgrq-sz = 8.00 avgqu-sz=28.00)
>>> ------------------------------------------------------------------------
>>> READ: io=2876.8MB, aggrb=98187KB/s, minb=98187KB/s, maxb=98187KB/s, mint=30002msec, maxt=30002msec
>>>
>>> Disk stats (read/write):
>>> xvdb: ios=734048/0, merge=0/0, ticks=843584/0, in_queue=843080, util=99.72%
>>>
>>> Regards,
>>> -Bob
>> Hello,
>>
>> We also saw the lack of merges when we tested on a null_blk device in dom0 directly.
>> When we enabled the multi-queue block layer we got no merges, even when we set the number of submission queues to 1.
>>
>> If I'm not missing anything, that could suggest the problem lies somewhere in the blk-mq layer itself?
>>
>> Please take a look at the results below:
>>
>> fio --name=test --ioengine=libaio --rw=read --numjobs=8 --iodepth=32 \
>> --time_based=1 --runtime=30 --bs=4KB --filename=/dev/nullb0 \
>> --direct=1 --group_reporting=1
>>
>> ========================================================================
>> modprobe null_blk irqmode=2 completion_nsec=1000000 queue_mode=1 submit_queues=1
>> ========================================================================
>> READ: io=13692MB, aggrb=467320KB/s, minb=467320KB/s, maxb=467320KB/s, mint=30002msec, maxt=30002msec
>>
>> Disk stats (read/write):
>> nullb0: ios=991026/0, merge=2499524/0, ticks=1846952/0, in_queue=900012, util=100.00%
>>
>> ========================================================================
>> modprobe null_blk irqmode=2 completion_nsec=1000000 queue_mode=2 submit_queues=1
>> ========================================================================
>> READ: io=6839.1MB, aggrb=233452KB/s, minb=233452KB/s, maxb=233452KB/s, mint=30002msec, maxt=30002msec
>>
>> Disk stats (read/write):
>> nullb0: ios=1743967/0, merge=0/0, ticks=1712900/0, in_queue=1839072, util=100.00%
>>
>> Thanks,
>> Rafal
>>
>>> On 07/13/2015 05:55 PM, Bob Liu wrote:
>>>> Note: This patch is based on original work of Arianna's internship for
>>>> GNOME's Outreach Program for Women.
>>>>
>>>> Only one hardware queue is used now, so there is no performance change.
>>>>
>>>> The legacy non-mq code is deleted completely which is the same as other
>>>> drivers like virtio, mtip, and nvme.
>>>>
>>>> Also dropped one unnecessary holding of info->io_lock when calling
>>>> blk_mq_stop_hw_queues().
>>>>
>>>> Changes in v2:
>>>> - Reorganized blk_mq_queue_rq()
>>>> - Restored most io_locks in place
>>>>
>>>> Change in v3:
>>>> - Rename blk_mq_queue_rq to blkif_queue_rq
>>>>
>>>> Signed-off-by: Arianna Avanzini <[email protected]>
>>>> Signed-off-by: Bob Liu <[email protected]>
>>>> Reviewed-by: Christoph Hellwig <[email protected]>
>>>> Acked-by: Jens Axboe <[email protected]>
>>>> ---
>>>> drivers/block/xen-blkfront.c | 146 +++++++++++++++++-------------------------
>>>> 1 file changed, 60 insertions(+), 86 deletions(-)
>>>>
>>>> diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
>>>> index 6d89ed3..5b45ee5 100644
>>>> --- a/drivers/block/xen-blkfront.c
>>>> +++ b/drivers/block/xen-blkfront.c
>>>> @@ -37,6 +37,7 @@
>>>>
>>>> #include <linux/interrupt.h>
>>>> #include <linux/blkdev.h>
>>>> +#include <linux/blk-mq.h>
>>>> #include <linux/hdreg.h>
>>>> #include <linux/cdrom.h>
>>>> #include <linux/module.h>
>>>> @@ -148,6 +149,7 @@ struct blkfront_info
>>>> unsigned int feature_persistent:1;
>>>> unsigned int max_indirect_segments;
>>>> int is_ready;
>>>> + struct blk_mq_tag_set tag_set;
>>>> };
>>>>
>>>> static unsigned int nr_minors;
>>>> @@ -616,54 +618,41 @@ static inline bool blkif_request_flush_invalid(struct request *req,
>>>> !(info->feature_flush & REQ_FUA)));
>>>> }
>>>>
>>>> -/*
>>>> - * do_blkif_request
>>>> - * read a block; request is in a request queue
>>>> - */
>>>> -static void do_blkif_request(struct request_queue *rq)
>>>> +static int blkif_queue_rq(struct blk_mq_hw_ctx *hctx,
>>>> + const struct blk_mq_queue_data *qd)
>>>> {
>>>> - struct blkfront_info *info = NULL;
>>>> - struct request *req;
>>>> - int queued;
>>>> -
>>>> - pr_debug("Entered do_blkif_request\n");
>>>> -
>>>> - queued = 0;
>>>> + struct blkfront_info *info = qd->rq->rq_disk->private_data;
>>>>
>>>> - while ((req = blk_peek_request(rq)) != NULL) {
>>>> - info = req->rq_disk->private_data;
>>>> -
>>>> - if (RING_FULL(&info->ring))
>>>> - goto wait;
>>>> + blk_mq_start_request(qd->rq);
>>>> + spin_lock_irq(&info->io_lock);
>>>> + if (RING_FULL(&info->ring))
>>>> + goto out_busy;
>>>>
>>>> - blk_start_request(req);
>>>> + if (blkif_request_flush_invalid(qd->rq, info))
>>>> + goto out_err;
>>>>
>>>> - if (blkif_request_flush_invalid(req, info)) {
>>>> - __blk_end_request_all(req, -EOPNOTSUPP);
>>>> - continue;
>>>> - }
>>>> + if (blkif_queue_request(qd->rq))
>>>> + goto out_busy;
>>>>
>>>> - pr_debug("do_blk_req %p: cmd %p, sec %lx, "
>>>> - "(%u/%u) [%s]\n",
>>>> - req, req->cmd, (unsigned long)blk_rq_pos(req),
>>>> - blk_rq_cur_sectors(req), blk_rq_sectors(req),
>>>> - rq_data_dir(req) ? "write" : "read");
>>>> -
>>>> - if (blkif_queue_request(req)) {
>>>> - blk_requeue_request(rq, req);
>>>> -wait:
>>>> - /* Avoid pointless unplugs. */
>>>> - blk_stop_queue(rq);
>>>> - break;
>>>> - }
>>>> + flush_requests(info);
>>>> + spin_unlock_irq(&info->io_lock);
>>>> + return BLK_MQ_RQ_QUEUE_OK;
>>>>
>>>> - queued++;
>>>> - }
>>>> +out_err:
>>>> + spin_unlock_irq(&info->io_lock);
>>>> + return BLK_MQ_RQ_QUEUE_ERROR;
>>>>
>>>> - if (queued != 0)
>>>> - flush_requests(info);
>>>> +out_busy:
>>>> + spin_unlock_irq(&info->io_lock);
>>>> + blk_mq_stop_hw_queue(hctx);
>>>> + return BLK_MQ_RQ_QUEUE_BUSY;
>>>> }
>>>>
>>>> +static struct blk_mq_ops blkfront_mq_ops = {
>>>> + .queue_rq = blkif_queue_rq,
>>>> + .map_queue = blk_mq_map_queue,
>>>> +};
>>>> +
>>>> static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size,
>>>> unsigned int physical_sector_size,
>>>> unsigned int segments)
>>>> @@ -671,9 +660,22 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size,
>>>> struct request_queue *rq;
>>>> struct blkfront_info *info = gd->private_data;
>>>>
>>>> - rq = blk_init_queue(do_blkif_request, &info->io_lock);
>>>> - if (rq == NULL)
>>>> + memset(&info->tag_set, 0, sizeof(info->tag_set));
>>>> + info->tag_set.ops = &blkfront_mq_ops;
>>>> + info->tag_set.nr_hw_queues = 1;
>>>> + info->tag_set.queue_depth = BLK_RING_SIZE(info);
>>>> + info->tag_set.numa_node = NUMA_NO_NODE;
>>>> + info->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
>>>> + info->tag_set.cmd_size = 0;
>>>> + info->tag_set.driver_data = info;
>>>> +
>>>> + if (blk_mq_alloc_tag_set(&info->tag_set))
>>>> return -1;
>>>> + rq = blk_mq_init_queue(&info->tag_set);
>>>> + if (IS_ERR(rq)) {
>>>> + blk_mq_free_tag_set(&info->tag_set);
>>>> + return -1;
>>>> + }
>>>>
>>>> queue_flag_set_unlocked(QUEUE_FLAG_VIRT, rq);
>>>>
>>>> @@ -901,19 +903,15 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
>>>> static void xlvbd_release_gendisk(struct blkfront_info *info)
>>>> {
>>>> unsigned int minor, nr_minors;
>>>> - unsigned long flags;
>>>>
>>>> if (info->rq == NULL)
>>>> return;
>>>>
>>>> - spin_lock_irqsave(&info->io_lock, flags);
>>>> -
>>>> /* No more blkif_request(). */
>>>> - blk_stop_queue(info->rq);
>>>> + blk_mq_stop_hw_queues(info->rq);
>>>>
>>>> /* No more gnttab callback work. */
>>>> gnttab_cancel_free_callback(&info->callback);
>>>> - spin_unlock_irqrestore(&info->io_lock, flags);
>>>>
>>>> /* Flush gnttab callback work. Must be done with no locks held. */
>>>> flush_work(&info->work);
>>>> @@ -925,20 +923,18 @@ static void xlvbd_release_gendisk(struct blkfront_info *info)
>>>> xlbd_release_minors(minor, nr_minors);
>>>>
>>>> blk_cleanup_queue(info->rq);
>>>> + blk_mq_free_tag_set(&info->tag_set);
>>>> info->rq = NULL;
>>>>
>>>> put_disk(info->gd);
>>>> info->gd = NULL;
>>>> }
>>>>
>>>> +/* Must be called with io_lock holded */
>>>> static void kick_pending_request_queues(struct blkfront_info *info)
>>>> {
>>>> - if (!RING_FULL(&info->ring)) {
>>>> - /* Re-enable calldowns. */
>>>> - blk_start_queue(info->rq);
>>>> - /* Kick things off immediately. */
>>>> - do_blkif_request(info->rq);
>>>> - }
>>>> + if (!RING_FULL(&info->ring))
>>>> + blk_mq_start_stopped_hw_queues(info->rq, true);
>>>> }
>>>>
>>>> static void blkif_restart_queue(struct work_struct *work)
>>>> @@ -963,7 +959,7 @@ static void blkif_free(struct blkfront_info *info, int suspend)
>>>> BLKIF_STATE_SUSPENDED : BLKIF_STATE_DISCONNECTED;
>>>> /* No more blkif_request(). */
>>>> if (info->rq)
>>>> - blk_stop_queue(info->rq);
>>>> + blk_mq_stop_hw_queues(info->rq);
>>>>
>>>> /* Remove all persistent grants */
>>>> if (!list_empty(&info->grants)) {
>>>> @@ -1144,7 +1140,6 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
>>>> RING_IDX i, rp;
>>>> unsigned long flags;
>>>> struct blkfront_info *info = (struct blkfront_info *)dev_id;
>>>> - int error;
>>>>
>>>> spin_lock_irqsave(&info->io_lock, flags);
>>>>
>>>> @@ -1185,37 +1180,37 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
>>>> continue;
>>>> }
>>>>
>>>> - error = (bret->status == BLKIF_RSP_OKAY) ? 0 : -EIO;
>>>> + req->errors = (bret->status == BLKIF_RSP_OKAY) ? 0 : -EIO;
>>>> switch (bret->operation) {
>>>> case BLKIF_OP_DISCARD:
>>>> if (unlikely(bret->status == BLKIF_RSP_EOPNOTSUPP)) {
>>>> struct request_queue *rq = info->rq;
>>>> printk(KERN_WARNING "blkfront: %s: %s op failed\n",
>>>> info->gd->disk_name, op_name(bret->operation));
>>>> - error = -EOPNOTSUPP;
>>>> + req->errors = -EOPNOTSUPP;
>>>> info->feature_discard = 0;
>>>> info->feature_secdiscard = 0;
>>>> queue_flag_clear(QUEUE_FLAG_DISCARD, rq);
>>>> queue_flag_clear(QUEUE_FLAG_SECDISCARD, rq);
>>>> }
>>>> - __blk_end_request_all(req, error);
>>>> + blk_mq_complete_request(req);
>>>> break;
>>>> case BLKIF_OP_FLUSH_DISKCACHE:
>>>> case BLKIF_OP_WRITE_BARRIER:
>>>> if (unlikely(bret->status == BLKIF_RSP_EOPNOTSUPP)) {
>>>> printk(KERN_WARNING "blkfront: %s: %s op failed\n",
>>>> info->gd->disk_name, op_name(bret->operation));
>>>> - error = -EOPNOTSUPP;
>>>> + req->errors = -EOPNOTSUPP;
>>>> }
>>>> if (unlikely(bret->status == BLKIF_RSP_ERROR &&
>>>> info->shadow[id].req.u.rw.nr_segments == 0)) {
>>>> printk(KERN_WARNING "blkfront: %s: empty %s op failed\n",
>>>> info->gd->disk_name, op_name(bret->operation));
>>>> - error = -EOPNOTSUPP;
>>>> + req->errors = -EOPNOTSUPP;
>>>> }
>>>> - if (unlikely(error)) {
>>>> - if (error == -EOPNOTSUPP)
>>>> - error = 0;
>>>> + if (unlikely(req->errors)) {
>>>> + if (req->errors == -EOPNOTSUPP)
>>>> + req->errors = 0;
>>>> info->feature_flush = 0;
>>>> xlvbd_flush(info);
>>>> }
>>>> @@ -1226,7 +1221,7 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
>>>> dev_dbg(&info->xbdev->dev, "Bad return from blkdev data "
>>>> "request: %x\n", bret->status);
>>>>
>>>> - __blk_end_request_all(req, error);
>>>> + blk_mq_complete_request(req);
>>>> break;
>>>> default:
>>>> BUG();
>>>> @@ -1555,28 +1550,6 @@ static int blkif_recover(struct blkfront_info *info)
>>>>
>>>> kfree(copy);
>>>>
>>>> - /*
>>>> - * Empty the queue, this is important because we might have
>>>> - * requests in the queue with more segments than what we
>>>> - * can handle now.
>>>> - */
>>>> - spin_lock_irq(&info->io_lock);
>>>> - while ((req = blk_fetch_request(info->rq)) != NULL) {
>>>> - if (req->cmd_flags &
>>>> - (REQ_FLUSH | REQ_FUA | REQ_DISCARD | REQ_SECURE)) {
>>>> - list_add(&req->queuelist, &requests);
>>>> - continue;
>>>> - }
>>>> - merge_bio.head = req->bio;
>>>> - merge_bio.tail = req->biotail;
>>>> - bio_list_merge(&bio_list, &merge_bio);
>>>> - req->bio = NULL;
>>>> - if (req->cmd_flags & (REQ_FLUSH | REQ_FUA))
>>>> - pr_alert("diskcache flush request found!\n");
>>>> - __blk_end_request_all(req, 0);
>>>> - }
>>>> - spin_unlock_irq(&info->io_lock);
>>>> -
>>>> xenbus_switch_state(info->xbdev, XenbusStateConnected);
>>>>
>>>> spin_lock_irq(&info->io_lock);
>>>> @@ -1591,9 +1564,10 @@ static int blkif_recover(struct blkfront_info *info)
>>>> /* Requeue pending requests (flush or discard) */
>>>> list_del_init(&req->queuelist);
>>>> BUG_ON(req->nr_phys_segments > segs);
>>>> - blk_requeue_request(info->rq, req);
>>>> + blk_mq_requeue_request(req);
>>>> }
>>>> spin_unlock_irq(&info->io_lock);
>>>> + blk_mq_kick_requeue_list(info->rq);
>>>>
>>>> while ((bio = bio_list_pop(&bio_list)) != NULL) {
>>>> /* Traverse the list of pending bios and re-queue them */
>>>>