2020-04-23 21:09:05

by Salman Qazi

[permalink] [raw]
Subject: [PATCH v2] block: Limit number of items taken from the I/O scheduler in one go

Flushes bypass the I/O scheduler and get added to hctx->dispatch
in blk_mq_sched_bypass_insert. This can happen while a kworker is running
hctx->run_work work item and is past the point in
blk_mq_sched_dispatch_requests where hctx->dispatch is checked.

The blk_mq_do_dispatch_sched call is not guaranteed to end in bounded time,
because the I/O scheduler can feed an arbitrary number of commands.

Since we have only one hctx->run_work, the commands waiting in
hctx->dispatch will wait an arbitrary length of time for run_work to be
rerun.

A similar phenomenon exists with dispatches from the software queue.

The solution is to poll hctx->dispatch in blk_mq_do_dispatch_sched and
blk_mq_do_dispatch_ctx and return from the run_work handler and let it
rerun.

Signed-off-by: Salman Qazi <[email protected]>
Reviewed-by: Ming Lei <[email protected]>
---
block/blk-mq-sched.c | 49 ++++++++++++++++++++++++++++++++++++++------
1 file changed, 43 insertions(+), 6 deletions(-)

diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index 74cedea56034..b69b780351c1 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -84,12 +84,17 @@ void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx)
* Only SCSI implements .get_budget and .put_budget, and SCSI restarts
* its queue by itself in its completion handler, so we don't need to
* restart queue if .get_budget() returns BLK_STS_NO_RESOURCE.
+ *
+ * Returns true if hctx->dispatch was found non-empty and
+ * run_work has to be run again. This is necessary to avoid
+ * starving flushes.
*/
-static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
+static bool blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
{
struct request_queue *q = hctx->queue;
struct elevator_queue *e = q->elevator;
LIST_HEAD(rq_list);
+ bool ret = false;

do {
struct request *rq;
@@ -97,6 +102,11 @@ static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
if (e->type->ops.has_work && !e->type->ops.has_work(hctx))
break;

+ if (!list_empty_careful(&hctx->dispatch)) {
+ ret = true;
+ break;
+ }
+
if (!blk_mq_get_dispatch_budget(hctx))
break;

@@ -113,6 +123,8 @@ static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
*/
list_add(&rq->queuelist, &rq_list);
} while (blk_mq_dispatch_rq_list(q, &rq_list, true));
+
+ return ret;
}

static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx,
@@ -130,16 +142,26 @@ static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx,
* Only SCSI implements .get_budget and .put_budget, and SCSI restarts
* its queue by itself in its completion handler, so we don't need to
* restart queue if .get_budget() returns BLK_STS_NO_RESOURCE.
+ *
+ * Returns true if hctx->dispatch was found non-empty and
+ * run_work has to be run again. This is necessary to avoid
+ * starving flushes.
*/
-static void blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
+static bool blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
{
struct request_queue *q = hctx->queue;
LIST_HEAD(rq_list);
struct blk_mq_ctx *ctx = READ_ONCE(hctx->dispatch_from);
+ bool ret = false;

do {
struct request *rq;

+ if (!list_empty_careful(&hctx->dispatch)) {
+ ret = true;
+ break;
+ }
+
if (!sbitmap_any_bit_set(&hctx->ctx_map))
break;

@@ -165,6 +187,7 @@ static void blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
} while (blk_mq_dispatch_rq_list(q, &rq_list, true));

WRITE_ONCE(hctx->dispatch_from, ctx);
+ return ret;
}

void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
@@ -172,6 +195,8 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
struct request_queue *q = hctx->queue;
struct elevator_queue *e = q->elevator;
const bool has_sched_dispatch = e && e->type->ops.dispatch_request;
+ bool run_again;
+ bool restarted = false;
LIST_HEAD(rq_list);

/* RCU or SRCU read lock is needed before checking quiesced flag */
@@ -180,6 +205,9 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)

hctx->run++;

+again:
+ run_again = false;
+
/*
* If we have previous entries on our dispatch list, grab them first for
* more fair dispatch.
@@ -208,19 +236,28 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
blk_mq_sched_mark_restart_hctx(hctx);
if (blk_mq_dispatch_rq_list(q, &rq_list, false)) {
if (has_sched_dispatch)
- blk_mq_do_dispatch_sched(hctx);
+ run_again = blk_mq_do_dispatch_sched(hctx);
else
- blk_mq_do_dispatch_ctx(hctx);
+ run_again = blk_mq_do_dispatch_ctx(hctx);
}
} else if (has_sched_dispatch) {
- blk_mq_do_dispatch_sched(hctx);
+ run_again = blk_mq_do_dispatch_sched(hctx);
} else if (hctx->dispatch_busy) {
/* dequeue request one by one from sw queue if queue is busy */
- blk_mq_do_dispatch_ctx(hctx);
+ run_again = blk_mq_do_dispatch_ctx(hctx);
} else {
blk_mq_flush_busy_ctxs(hctx, &rq_list);
blk_mq_dispatch_rq_list(q, &rq_list, false);
}
+
+ if (run_again) {
+ if (!restarted) {
+ restarted = true;
+ goto again;
+ }
+
+ blk_mq_run_hw_queue(hctx, true);
+ }
}

bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio,
--
2.26.1.301.g55bc3eb7cb9-goog


2020-04-23 21:35:21

by Jens Axboe

[permalink] [raw]
Subject: Re: [PATCH v2] block: Limit number of items taken from the I/O scheduler in one go

On 4/23/20 3:05 PM, Salman Qazi wrote:
> Flushes bypass the I/O scheduler and get added to hctx->dispatch
> in blk_mq_sched_bypass_insert. This can happen while a kworker is running
> hctx->run_work work item and is past the point in
> blk_mq_sched_dispatch_requests where hctx->dispatch is checked.
>
> The blk_mq_do_dispatch_sched call is not guaranteed to end in bounded time,
> because the I/O scheduler can feed an arbitrary number of commands.
>
> Since we have only one hctx->run_work, the commands waiting in
> hctx->dispatch will wait an arbitrary length of time for run_work to be
> rerun.
>
> A similar phenomenon exists with dispatches from the software queue.
>
> The solution is to poll hctx->dispatch in blk_mq_do_dispatch_sched and
> blk_mq_do_dispatch_ctx and return from the run_work handler and let it
> rerun.

Any changes since v1? It's customary to put that in here too, below
the --- lines.

--
Jens Axboe

2020-04-24 01:44:41

by Salman Qazi

[permalink] [raw]
Subject: Re: [PATCH v2] block: Limit number of items taken from the I/O scheduler in one go

I remailed it with the changes since v1 added. But just to answer directly:

Changes since v1:

* Removed max_sched_batch.
* Extended the fix to the software queue.
* Use a return value from blk_mq_do_dispatch_sched to indicate if
the dispatch should be rerun.
* Some comments added.

On Thu, Apr 23, 2020 at 2:30 PM Jens Axboe <[email protected]> wrote:
>
> On 4/23/20 3:05 PM, Salman Qazi wrote:
> > Flushes bypass the I/O scheduler and get added to hctx->dispatch
> > in blk_mq_sched_bypass_insert. This can happen while a kworker is running
> > hctx->run_work work item and is past the point in
> > blk_mq_sched_dispatch_requests where hctx->dispatch is checked.
> >
> > The blk_mq_do_dispatch_sched call is not guaranteed to end in bounded time,
> > because the I/O scheduler can feed an arbitrary number of commands.
> >
> > Since we have only one hctx->run_work, the commands waiting in
> > hctx->dispatch will wait an arbitrary length of time for run_work to be
> > rerun.
> >
> > A similar phenomenon exists with dispatches from the software queue.
> >
> > The solution is to poll hctx->dispatch in blk_mq_do_dispatch_sched and
> > blk_mq_do_dispatch_ctx and return from the run_work handler and let it
> > rerun.
>
> Any changes since v1? It's customary to put that in here too, below
> the --- lines.
>
> --
> Jens Axboe
>

2020-04-24 06:17:43

by Christoph Hellwig

[permalink] [raw]
Subject: Re: [PATCH v2] block: Limit number of items taken from the I/O scheduler in one go

> index 74cedea56034..b69b780351c1 100644
> --- a/block/blk-mq-sched.c
> +++ b/block/blk-mq-sched.c
> @@ -84,12 +84,17 @@ void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx)
> * Only SCSI implements .get_budget and .put_budget, and SCSI restarts
> * its queue by itself in its completion handler, so we don't need to
> * restart queue if .get_budget() returns BLK_STS_NO_RESOURCE.
> + *
> + * Returns true if hctx->dispatch was found non-empty and
> + * run_work has to be run again. This is necessary to avoid
> + * starving flushes.
> */

Please use up all 80 chars for your comments (just like the existing
part of the comment).

> * Only SCSI implements .get_budget and .put_budget, and SCSI restarts
> * its queue by itself in its completion handler, so we don't need to
> * restart queue if .get_budget() returns BLK_STS_NO_RESOURCE.
> + *
> + * Returns true if hctx->dispatch was found non-empty and
> + * run_work has to be run again. This is necessary to avoid
> + * starving flushes.

Same here.

> +again:
> + run_again = false;
> +
> /*
> * If we have previous entries on our dispatch list, grab them first for
> * more fair dispatch.
> @@ -208,19 +236,28 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
> blk_mq_sched_mark_restart_hctx(hctx);
> if (blk_mq_dispatch_rq_list(q, &rq_list, false)) {
> if (has_sched_dispatch)
> - blk_mq_do_dispatch_sched(hctx);
> + run_again = blk_mq_do_dispatch_sched(hctx);
> else
> - blk_mq_do_dispatch_ctx(hctx);
> + run_again = blk_mq_do_dispatch_ctx(hctx);
> }
> } else if (has_sched_dispatch) {
> - blk_mq_do_dispatch_sched(hctx);
> + run_again = blk_mq_do_dispatch_sched(hctx);
> } else if (hctx->dispatch_busy) {
> /* dequeue request one by one from sw queue if queue is busy */
> - blk_mq_do_dispatch_ctx(hctx);
> + run_again = blk_mq_do_dispatch_ctx(hctx);
> } else {
> blk_mq_flush_busy_ctxs(hctx, &rq_list);
> blk_mq_dispatch_rq_list(q, &rq_list, false);
> }
> +
> + if (run_again) {
> + if (!restarted) {
> + restarted = true;
> + goto again;
> + }
> +
> + blk_mq_run_hw_queue(hctx, true);
> + }

This is a weird loop. I'd split the code betweem the again label and
the run_again check here into a __blk_mq_sched_dispatch_requests
helper, and then you can do:

if (__blk_mq_sched_dispatch_requests()) {
if (__blk_mq_sched_dispatch_requests())
blk_mq_run_hw_queue(hctx, true);
}

here. Preferably with ha good comment explaining the logic.

2020-04-24 06:43:56

by Christoph Hellwig

[permalink] [raw]
Subject: Re: [PATCH v2] block: Limit number of items taken from the I/O scheduler in one go

On Fri, Apr 24, 2020 at 08:15:29AM +0200, Christoph Hellwig wrote:
> This is a weird loop. I'd split the code betweem the again label and
> the run_again check here into a __blk_mq_sched_dispatch_requests
> helper, and then you can do:
>
> if (__blk_mq_sched_dispatch_requests()) {
> if (__blk_mq_sched_dispatch_requests())
> blk_mq_run_hw_queue(hctx, true);
> }
>
> here. Preferably with ha good comment explaining the logic.

Also I wonder if inverting the return values in the lower level function
would make things a little more readable - a true return suggests
everything worked fine. Alternative 0 for sucess and -EAGAIN for needs
a retry also would be pretty self-documenting.