I've just run into an issue where I ran out of blk-mq tags on my virtio
setup under what appears to be a heavy fsync workload. When drilling
into it, it looked like a tag leak. This reminded me of a commit I had
seen in the scsi-mq tree but never on any mailing list (nor in my inbox,
despite the Cc tag), and which turned out not to be in Jens' blk-mq tree
either:
https://git.kernel.org/cgit/linux/kernel/git/nab/target-pending.git/commit/?h=scsi-mq&id=620fe5a562440a823e7a25ae99dc19a6fbd53949
From a quick look over the patch I'm not entirely sure it's correct, but
it's at least going in the right direction. The reason for the issue
that Alexander and I saw is the following:
The flush state machine takes in a struct request, which is then
submitted multiple times to the underlying driver. The old block code
requeues the same request for each of those steps, so it never needs to
tap into the request pool again. The new code, on the other hand,
allocates a new request for each of the actual steps of the flush
sequence.
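To make the failure mode concrete, here's a toy user-space model (not
kernel code; the depth and the single extra allocation are made up for
illustration) of how per-step allocation can exhaust the tag pool:

#include <stdbool.h>
#include <stdio.h>

#define DEPTH 32	/* tags available per hardware queue */

static int free_tags = DEPTH;

static bool get_tag(void)
{
	if (free_tags == 0)
		return false;
	free_tags--;
	return true;
}

int main(void)
{
	int i;

	/* DEPTH flushes are in flight, each holding its original tag */
	for (i = 0; i < DEPTH; i++)
		get_tag();

	/*
	 * The flush machinery now wants a second tag per flush for the
	 * next sequence step; the legacy path would requeue the
	 * original request (keeping its tag) instead.
	 */
	if (!get_tag())
		printf("flush sequence step stalls: tag pool empty\n");
	return 0;
}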
In theory this should make the patch from Alexander the right fix,
except that we have one corner case where we skip the above steps: if
the state machine doesn't need to be run we issue the request directly.
This happens either when the driver doesn't need any cache flushes, or
when we only requested force unit access and the driver supports that
natively.
It seems like we should simply not require a tag at all for those
sub-requests and solve the issue that way.
And Alexander and Nick: how did this patch end up in Nick's tree? I'm a
bit curious, as there's been no visibility at all despite the Cc tags to
Jens and me.
I think this variant of the patch from Alexander should fix the issue
in a minimally invasive way. Longer term I'd prefer to use q->flush_rq
like in the non-mq case by copying over the context and tag information.
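Before the patch itself, here's a toy user-space model of what reserving
a tag buys us (layout and numbers are made up):

#include <stdbool.h>
#include <stdio.h>

#define HW_TAGS  32	/* what the hardware can handle */
#define RESERVED  1
#define DEPTH    (HW_TAGS + RESERVED)

static bool tag_used[DEPTH];

/*
 * Normal allocations draw from [0, HW_TAGS); reserved ones from
 * [HW_TAGS, DEPTH).  A flush can therefore always make progress,
 * even when normal I/O holds every regular tag.
 */
static int alloc_tag(bool reserved)
{
	int i;
	int start = reserved ? HW_TAGS : 0;
	int end = reserved ? DEPTH : HW_TAGS;

	for (i = start; i < end; i++) {
		if (!tag_used[i]) {
			tag_used[i] = true;
			return i;
		}
	}
	return -1;	/* pool exhausted */
}

int main(void)
{
	/* exhaust all the normal tags ... */
	while (alloc_tag(false) >= 0)
		;
	/*
	 * ... and a flush still gets its reserved tag (32 here; note
	 * that number is outside the hardware's 0-31 range if tags
	 * map 1:1 to hardware tags)
	 */
	printf("flush tag: %d\n", alloc_tag(true));
	return 0;
}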
diff --git a/block/blk-core.c b/block/blk-core.c
index 3bb9e9f..9677c65 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1102,7 +1102,7 @@ static struct request *blk_old_get_request(struct request_queue *q, int rw,
struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask)
{
if (q->mq_ops)
- return blk_mq_alloc_request(q, rw, gfp_mask);
+ return blk_mq_alloc_request(q, rw, gfp_mask, false);
else
return blk_old_get_request(q, rw, gfp_mask);
}
diff --git a/block/blk-flush.c b/block/blk-flush.c
index 3e4cc9c..331e627 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -286,7 +286,7 @@ static void mq_flush_work(struct work_struct *work)
/* We don't need set REQ_FLUSH_SEQ, it's for consistency */
rq = blk_mq_alloc_request(q, WRITE_FLUSH|REQ_FLUSH_SEQ,
- __GFP_WAIT|GFP_ATOMIC);
+ __GFP_WAIT|GFP_ATOMIC, true);
rq->cmd_type = REQ_TYPE_FS;
rq->end_io = flush_end_io;
diff --git a/block/blk-mq.c b/block/blk-mq.c
index c88c0de..134100d1 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -204,14 +204,15 @@ static struct request *blk_mq_alloc_request_pinned(struct request_queue *q,
return rq;
}
-struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp)
+struct request *blk_mq_alloc_request(struct request_queue *q, int rw,
+ gfp_t gfp, bool reserved)
{
struct request *rq;
if (blk_mq_queue_enter(q))
return NULL;
- rq = blk_mq_alloc_request_pinned(q, rw, gfp, false);
+ rq = blk_mq_alloc_request_pinned(q, rw, gfp, reserved);
blk_mq_put_ctx(rq->mq_ctx);
return rq;
}
@@ -1286,6 +1287,15 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_reg *reg,
reg->queue_depth = BLK_MQ_MAX_DEPTH;
}
+ /*
+ * Set aside a tag for flush requests. It will only be used while
+ * another flush request is in progress but outside the driver.
+ *
+ * TODO: only allocate if flushes are supported
+ */
+ reg->queue_depth++;
+ reg->reserved_tags++;
+
if (reg->queue_depth < (reg->reserved_tags + BLK_MQ_TAG_MIN))
return ERR_PTR(-EINVAL);
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 158da52..3eb1beb 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -123,7 +123,7 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule);
void blk_mq_run_queues(struct request_queue *q, bool async);
void blk_mq_free_request(struct request *rq);
bool blk_mq_can_queue(struct blk_mq_hw_ctx *);
-struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp);
+struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp, bool reserved);
struct request *blk_mq_alloc_reserved_request(struct request_queue *q, int rw, gfp_t gfp);
struct request *blk_mq_rq_from_tag(struct request_queue *q, unsigned int tag);
On Sat, Oct 26 2013, Christoph Hellwig wrote:
> I think this variant of the patch from Alexander should fix the issue
> in a minimally invasive way. Longer term I'd prefer to use q->flush_rq
> like in the non-mq case by copying over the context and tag information.
This one is pretty simple, we could definitely use it as a band aid. I
too would greatly prefer using the static ->flush_rq instead. Just have
it marked to bypass most of the free logic.
I'll add this one.
--
Jens Axboe
On Sun, Oct 27, 2013 at 10:29:25PM +0000, Jens Axboe wrote:
> On Sat, Oct 26 2013, Christoph Hellwig wrote:
> > I think this variant of the patch from Alexander should fix the issue
> > in a minimally invasive way. Longer term I'd prefer to use q->flush_rq
> > like in the non-mq case by copying over the context and tag information.
>
> This one is pretty simple, we could definitely use it as a band aid. I
> too would greatly prefer using the static ->flush_rq instead. Just have
> it marked to bypass most of the free logic.
We already bypass the free logic by setting an end_io callback for a
while, similar to what the old code does. Maybe it's not all that hard
to preallocate the request, let me give it a spin. Using the statically
allocated one will be hard due to the driver-specific extra data,
though.
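For what it's worth, a schematic of the idea (toy types and hypothetical
names, not a real implementation), showing where the driver-specific
data gets awkward:

struct toy_ctx;			/* stands in for struct blk_mq_ctx */

struct toy_request {
	struct toy_ctx *mq_ctx;
	int tag;
	/*
	 * The real request is followed by cmd_size bytes of
	 * driver-specific data; a statically allocated flush_rq has
	 * no sensible copy of that, which is the hard part.
	 */
};

/* hypothetical: inherit context and tag so no new tag is consumed */
static void toy_prep_flush_rq(struct toy_request *flush_rq,
			      const struct toy_request *orig)
{
	flush_rq->mq_ctx = orig->mq_ctx;
	flush_rq->tag = orig->tag;
}

int main(void)
{
	struct toy_request orig = { .mq_ctx = 0, .tag = 5 };
	static struct toy_request flush_rq;	/* the static one */

	toy_prep_flush_rq(&flush_rq, &orig);
	return flush_rq.tag == 5 ? 0 : 1;
}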
> I'll add this one.
Gimme another day or so to figure this out.
On 10/28/2013 02:48 AM, Christoph Hellwig wrote:
> On Sun, Oct 27, 2013 at 10:29:25PM +0000, Jens Axboe wrote:
>> On Sat, Oct 26 2013, Christoph Hellwig wrote:
>>> I think this variant of the patch from Alexander should fix the issue
>>> in a minimally invasive way. Longer term I'd prefer to use q->flush_rq
>>> like in the non-mq case by copying over the context and tag information.
>>
>> This one is pretty simple, we could definitely use it as a band aid. I
>> too would greatly prefer using the static ->flush_rq instead. Just have
>> it marked to bypass most of the free logic.
>
> We already bypass the free logic by setting an end_io callback for a
> while, similar to what the old code does. Maybe it's not all that hard
> to preallocate the request, let me give it a spin. Using the statically
> allocated one will be hard due to the driver-specific extra data,
> though.
It's not that I think the existing patch is THAT bad, it fits in alright
with the reserved tagging and works regardless of whether a driver uses
reserved tags or not. And it does have the upside of not requiring
special checks or logic for this special non-tagged request that using
the preallocated one might need.
>> I'll add this one.
>
> Gimme another day or so to figure this out.
OK, holding off.
--
Jens Axboe
On Mon, Oct 28, 2013 at 10:29:11AM -0600, Jens Axboe wrote:
> It's not that I think the existing patch is THAT bad, it fits in alright
> with the reserved tagging and works regardless of whether a driver uses
> reserved tags or not. And it does have the upside of not requiring
> special checks or logic for this special non-tagged request that using
> the preallocated one might need.
The problem with it is that it will pass a tag number to the low level
driver which it doesn't expect. In case the tags are used 1:1 as
hardware tags that would lead to nasty bugs.
At the very least we'd need to mess with ->tag for this special request.
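Schematically (a compilable toy; the fixup function is hypothetical),
"messing with ->tag" would mean remapping it before the request reaches
the driver:

#include <stdio.h>

#define HW_TAGS 32	/* device limit; a reserved tag would be 32 */

struct toy_request {
	int tag;	/* blk-mq tag as allocated */
};

/*
 * Hypothetical fixup: the reserved flush tag is outside the
 * hardware's range, so hand the driver the originating request's
 * tag, which sits idle while its flush runs.
 */
static void fixup_flush_tag(struct toy_request *flush_rq,
			    const struct toy_request *orig)
{
	flush_rq->tag = orig->tag;
}

int main(void)
{
	struct toy_request orig = { .tag = 7 };
	struct toy_request flush = { .tag = HW_TAGS };	/* reserved */

	fixup_flush_tag(&flush, &orig);
	printf("flush goes out with hardware-valid tag %d\n", flush.tag);
	return 0;
}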
On 10/28/2013 10:46 AM, Christoph Hellwig wrote:
> On Mon, Oct 28, 2013 at 10:29:11AM -0600, Jens Axboe wrote:
>> It's not that I think the existing patch is THAT bad, it fits in alright
>> with the reserved tagging and works regardless of whether a driver uses
>> reserved tags or not. And it does have the upside of not requiring
>> special checks or logic for this special non-tagged request that using
>> the preallocated one might need.
>
> The problem with it is that it will pass a tag number to the low level
> driver which it doesn't expect. In case the tags are used 1:1 as
> hardware tags that would lead to nasty bugs.
>
> At the very least we'd need to mess with ->tag for this special request.
Yes indeed. Actually the more I think about it, the better I like just
using the normal tagging infrastructure and punting to a reserved tag
for the flush. Just needs the addition of a check for whether it
actually needs it or not.
--
Jens Axboe
On Mon, Oct 28, 2013 at 10:59:32AM -0600, Jens Axboe wrote:
> > The problem with it is that it will pass a tag number to the low level
> > driver which it doesn't expect. In case the tags are used 1:1 as
> > hardware tags that would lead to nasty bugs.
> >
> > At the very least we'd need to mess with ->tag for this special request.
>
> Yes indeed. Actually the more I think about it, the better I like just
> using the normal tagging infrastructure and punting to a reserved tag
> for the flush. Just needs the addition of a check for whether it
> actually needs it or not.
That issue happens when using reserved tags as-is: e.g. the device
supports 32 hardware tags and the first reserved one is 33, something
the hardware can't handle.
So either way we'll have to copy over the tag from the original request.
I don't think I can come up with something sane for that ASAP, so I'd
suggest you actually take my patch for now, and we'll sort something
out when we're in less of a hurry.
On 10/28/2013 10:57 AM, Shaohua Li wrote:
> 2013/10/28 Jens Axboe <[email protected]>
>
> On 10/28/2013 02:48 AM, Christoph Hellwig wrote:
> > On Sun, Oct 27, 2013 at 10:29:25PM +0000, Jens Axboe wrote:
> >> On Sat, Oct 26 2013, Christoph Hellwig wrote:
> >>> I think this variant of the patch from Alexander should fix the issue
> >>> in a minimally invasive way. Longer term I'd prefer to use q->flush_rq
> >>> like in the non-mq case by copying over the context and tag information.
> >>
> >> This one is pretty simple, we could definitely use it as a band aid. I
> >> too would greatly prefer using the static ->flush_rq instead. Just have
> >> it marked to bypass most of the free logic.
> >
> > We already bypass the free logic by setting an end_io callback for
> > a while, similar to what the old code does. Maybe it's not all that
> > hard to preallocate the request, let me give it a spin. Using the
> > statically allocated one will be hard due to the driver-specific
> > extra data, though.
>
> It's not that I think the existing patch is THAT bad, it fits in alright
> with the reserved tagging and works regardless of whether a driver uses
> reserved tags or not. And it does have the upside of not requiring
> special checks or logic for this special non-tagged request that using
> the preallocated one might need.
>
> >> I'll add this one.
> >
> > Gimme another day or so to figure this out.
>
> OK, holding off.
>
>
> Another option: we could throttle flush-request allocation in
> blk_mq_alloc_request(); for example, if flush_req_nr >= max_tags - 1,
> make the allocation wait.
That could work too. If we back off, then we could restart it once a
request completes. That does, however, require checking for that and
potentially kicking all the queues on completion when it happens.
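A toy model of that throttle (counter and limit are made up for
illustration; a rejected caller would fall back to a waiting
allocation):

#include <stdbool.h>
#include <stdio.h>

#define DEPTH 32

static int pending_flush;

/*
 * Let flushes take at most DEPTH - 1 tags so that at least one tag
 * always remains for the requests that complete a flush sequence.
 */
static bool may_alloc_flush(void)
{
	if (pending_flush >= DEPTH - 1)
		return false;
	pending_flush++;
	return true;
}

int main(void)
{
	int granted = 0;

	while (may_alloc_flush())
		granted++;
	printf("%d of %d tags grantable to flushes\n", granted, DEPTH);
	return 0;
}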
--
Jens Axboe
On 10/28/2013 01:30 PM, Christoph Hellwig wrote:
> On Mon, Oct 28, 2013 at 10:59:32AM -0600, Jens Axboe wrote:
>>> The problem with it is that it will pass a tag number to the low level
>>> driver which it doesn't expect. In case the tags are used 1:1 as
>>> hardware tags that would lead to nasty bugs.
>>>
>>> At the very least we'd need to mess with ->tag for this special request.
>>
>> Yes indeed. Actually the more I think about it, the better I like just
>> using the normal tagging infrastructure and punting to a reserved tag
>> for the flush. Just needs the addition of a check for whether it
>> actually needs it or not.
>
> That issue happens when using reserved tags as-is: e.g. the device
> supports 32 hardware tags and the first reserved one is 33, something
> the hardware can't handle.
>
> So either way we'll have to copy over the tag from the original request.
>
> I don't think I can come up with something sane for that ASAP, so I'd
> suggest you actually take my patch for now, and we'll sort something
> out when we're in less of a hurry.
Agree, I will queue it up.
--
Jens Axboe
On Mon, Oct 28, 2013 at 01:38:57PM -0600, Jens Axboe wrote:
> On 10/28/2013 10:57 AM, Shaohua Li wrote:
> > 2013/10/28 Jens Axboe <[email protected]>
> >
> > On 10/28/2013 02:48 AM, Christoph Hellwig wrote:
> > > On Sun, Oct 27, 2013 at 10:29:25PM +0000, Jens Axboe wrote:
> > >> On Sat, Oct 26 2013, Christoph Hellwig wrote:
> > >>> I think this variant of the patch from Alexander should fix the issue
> > >>> in a minimally invasive way. Longer term I'd prefer to use q->flush_rq
> > >>> like in the non-mq case by copying over the context and tag information.
> > >>
> > >> This one is pretty simple, we could definitely use it as a band aid. I
> > >> too would greatly prefer using the static ->flush_rq instead. Just have
> > >> it marked to bypass most of the free logic.
> > >
> > > We already bypass the free logic by setting an end_io callback for
> > > a while, similar to what the old code does. Maybe it's not all that
> > > hard to preallocate the request, let me give it a spin. Using the
> > > statically allocated one will be hard due to the driver-specific
> > > extra data, though.
> >
> > It's not that I think the existing patch is THAT bad, it fits in alright
> > with the reserved tagging and works regardless of whether a driver uses
> > reserved tags or not. And it does have the upside of not requiring
> > special checks or logic for this special non-tagged request that using
> > the preallocated one might need.
> >
> > >> I'll add this one.
> > >
> > > Gimme another day or so to figure this out.
> >
> > OK, holding off.
> >
> >
> > Another option: we could throttle flush-request allocation in
> > blk_mq_alloc_request(); for example, if flush_req_nr >= max_tags - 1,
> > make the allocation wait.
>
> That could work too. If we back off, then we could restart it once a
> request completes. That does, however, require checking for that and
> potentially kicking all the queues on completion when it happens.
That doesn't sound like a big problem, because the case where flush
requests use up all the tags is very rare. The good side is we can avoid
reserving a tag, which is precious.
I cooked up a patch to demonstrate the idea; it's only compile-tested so
far.
diff --git a/block/blk-flush.c b/block/blk-flush.c
index 3e4cc9c..192c2aa 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -284,7 +284,6 @@ static void mq_flush_work(struct work_struct *work)
q = container_of(work, struct request_queue, mq_flush_work);
- /* We don't need set REQ_FLUSH_SEQ, it's for consistency */
rq = blk_mq_alloc_request(q, WRITE_FLUSH|REQ_FLUSH_SEQ,
__GFP_WAIT|GFP_ATOMIC);
rq->cmd_type = REQ_TYPE_FS;
diff --git a/block/blk-mq.c b/block/blk-mq.c
index ac804c6..fbbe0cc 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -180,8 +180,21 @@ static void blk_mq_rq_ctx_init(struct blk_mq_ctx *ctx, struct request *rq,
}
static struct request *__blk_mq_alloc_request(struct blk_mq_hw_ctx *hctx,
- gfp_t gfp, bool reserved)
+ gfp_t gfp, bool reserved,
+ int rw)
{
+
+ /*
+ * a flush needs to allocate a request; leave at least one request
+ * for non-flush I/O to avoid deadlock
+ */
+ if ((rw & REQ_FLUSH) && !(rw & REQ_FLUSH_SEQ)) {
+ atomic_inc(&hctx->pending_flush);
+ /* fallback to a wait allocation */
+ if (atomic_read(&hctx->pending_flush) >= hctx->queue_depth -
+ hctx->reserved_tags - 1)
+ return NULL;
+ }
return blk_mq_alloc_rq(hctx, gfp, reserved);
}
@@ -195,7 +208,7 @@ static struct request *blk_mq_alloc_request_pinned(struct request_queue *q,
struct blk_mq_ctx *ctx = blk_mq_get_ctx(q);
struct blk_mq_hw_ctx *hctx = q->mq_ops->map_queue(q, ctx->cpu);
- rq = __blk_mq_alloc_request(hctx, gfp & ~__GFP_WAIT, reserved);
+ rq = __blk_mq_alloc_request(hctx, gfp & ~__GFP_WAIT, reserved, rw);
if (rq) {
blk_mq_rq_ctx_init(ctx, rq, rw);
break;
@@ -253,6 +266,10 @@ static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx,
const int tag = rq->tag;
struct request_queue *q = rq->q;
+ if ((rq->cmd_flags & REQ_FLUSH) && !(rq->cmd_flags & REQ_FLUSH_SEQ)) {
+ atomic_dec(&hctx->pending_flush);
+ }
+
blk_mq_rq_init(hctx, rq);
blk_mq_put_tag(hctx->tags, tag);
@@ -918,7 +935,7 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
hctx = q->mq_ops->map_queue(q, ctx->cpu);
trace_block_getrq(q, bio, rw);
- rq = __blk_mq_alloc_request(hctx, GFP_ATOMIC, false);
+ rq = __blk_mq_alloc_request(hctx, GFP_ATOMIC, false, rw);
if (likely(rq))
blk_mq_rq_ctx_init(ctx, rq, rw);
else {
@@ -1202,6 +1219,7 @@ static int blk_mq_init_hw_queues(struct request_queue *q,
hctx->queue_num = i;
hctx->flags = reg->flags;
hctx->queue_depth = reg->queue_depth;
+ hctx->reserved_tags = reg->reserved_tags;
hctx->cmd_size = reg->cmd_size;
blk_mq_init_cpu_notifier(&hctx->cpu_notifier,
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 3368b97..0f81528 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -36,12 +36,15 @@ struct blk_mq_hw_ctx {
struct list_head page_list;
struct blk_mq_tags *tags;
+ atomic_t pending_flush;
+
unsigned long queued;
unsigned long run;
#define BLK_MQ_MAX_DISPATCH_ORDER 10
unsigned long dispatched[BLK_MQ_MAX_DISPATCH_ORDER];
unsigned int queue_depth;
+ unsigned int reserved_tags;
unsigned int numa_node;
unsigned int cmd_size; /* per-request extra data */