Hi Tejun,
On Fri, Jan 21 2011 at 10:59am -0500,
Tejun Heo <[email protected]> wrote:
>
> * As flush requests are never put on the IO scheduler, request fields
> used for flush share space with rq->rb_node. rq->completion_data is
> moved out of the union. This increases the request size by one
> pointer.
>
> As rq->elevator_private* are used only by the iosched too, it is
> possible to reduce the request size further. However, to do that,
> we need to modify request allocation path such that iosched data is
> not allocated for flush requests.
I decided to take a crack at using rq->elevator_private* and came up
with the following patch.
Unfortunately, in testing I found that flush requests that have data do
in fact eventually get added to the queue as normal requests, via:
1) "data but flush is not necessary" case in blk_insert_flush
2) REQ_FSEQ_DATA case in blk_flush_complete_seq
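For reference, case 1 is this branch in blk_insert_flush() (quoting from
memory, so the exact details may be off):

	/* "data but flush is not necessary": queue for normal execution */
	if ((policy & REQ_FSEQ_DATA) &&
	    !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) {
		list_add(&rq->queuelist, &q->queue_head);
		return;
	}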
I know this because my following get_request() change to _not_ call
elv_set_request() for flush requests hit cfq_put_request()'s
BUG_ON(!cfqq->allocated[rw]). cfqq->allocated[rw] gets set via
elv_set_request()'s call to cfq_set_request().
So this seems to call into question the running theory that flush
requests can share 'struct request' space with elevator-specific members
(via union) -- be it rq->rb_node or rq->elevator_private*.
Please advise, thanks!
Mike
From: Mike Snitzer <[email protected]>
Skip elevator initialization for flush requests by passing priv=0 to
blk_alloc_request() in get_request(). As such, elv_set_request() is
never called for flush requests.
Move the elevator_private* pointers into an 'elevator' struct within
'struct request' and have the flush fields share a union with it.
Reclaim the space lost in 'struct request' by moving 'completion_data'
back in to the union with 'rb_node'.
Signed-off-by: Mike Snitzer <[email protected]>
---
block/blk-core.c | 13 +++++++++----
block/cfq-iosched.c | 18 +++++++++---------
block/elevator.c | 2 +-
include/linux/blkdev.h | 26 +++++++++++++++-----------
4 files changed, 34 insertions(+), 25 deletions(-)
diff --git a/block/blk-core.c b/block/blk-core.c
index 72dd23b..f507888 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -764,7 +764,7 @@ static struct request *get_request(struct request_queue *q, int rw_flags,
struct request_list *rl = &q->rq;
struct io_context *ioc = NULL;
const bool is_sync = rw_is_sync(rw_flags) != 0;
- int may_queue, priv;
+ int may_queue, priv = 0;
may_queue = elv_may_queue(q, rw_flags);
if (may_queue == ELV_MQUEUE_NO)
@@ -808,9 +808,14 @@ static struct request *get_request(struct request_queue *q, int rw_flags,
rl->count[is_sync]++;
rl->starved[is_sync] = 0;
- priv = !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags);
- if (priv)
- rl->elvpriv++;
+ /*
+ * Skip elevator initialization for flush requests
+ */
+ if (!(bio && (bio->bi_rw & (REQ_FLUSH | REQ_FUA)))) {
+ priv = !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags);
+ if (priv)
+ rl->elvpriv++;
+ }
if (blk_queue_io_stat(q))
rw_flags |= REQ_IO_STAT;
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 501ffdf..580ae0a 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -54,9 +54,9 @@ static const int cfq_hist_divisor = 4;
#define CFQQ_SEEKY(cfqq) (hweight32(cfqq->seek_history) > 32/8)
#define RQ_CIC(rq) \
- ((struct cfq_io_context *) (rq)->elevator_private)
-#define RQ_CFQQ(rq) (struct cfq_queue *) ((rq)->elevator_private2)
-#define RQ_CFQG(rq) (struct cfq_group *) ((rq)->elevator_private3)
+ ((struct cfq_io_context *) (rq)->elevator.private)
+#define RQ_CFQQ(rq) (struct cfq_queue *) ((rq)->elevator.private2)
+#define RQ_CFQG(rq) (struct cfq_group *) ((rq)->elevator.private3)
static struct kmem_cache *cfq_pool;
static struct kmem_cache *cfq_ioc_pool;
@@ -3609,12 +3609,12 @@ static void cfq_put_request(struct request *rq)
put_io_context(RQ_CIC(rq)->ioc);
- rq->elevator_private = NULL;
- rq->elevator_private2 = NULL;
+ rq->elevator.private = NULL;
+ rq->elevator.private2 = NULL;
/* Put down rq reference on cfqg */
cfq_put_cfqg(RQ_CFQG(rq));
- rq->elevator_private3 = NULL;
+ rq->elevator.private3 = NULL;
cfq_put_queue(cfqq);
}
@@ -3702,9 +3702,9 @@ new_queue:
cfqq->allocated[rw]++;
cfqq->ref++;
- rq->elevator_private = cic;
- rq->elevator_private2 = cfqq;
- rq->elevator_private3 = cfq_ref_get_cfqg(cfqq->cfqg);
+ rq->elevator.private = cic;
+ rq->elevator.private2 = cfqq;
+ rq->elevator.private3 = cfq_ref_get_cfqg(cfqq->cfqg);
spin_unlock_irqrestore(q->queue_lock, flags);
diff --git a/block/elevator.c b/block/elevator.c
index 270e097..02b66be 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -764,7 +764,7 @@ int elv_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask)
if (e->ops->elevator_set_req_fn)
return e->ops->elevator_set_req_fn(q, rq, gfp_mask);
- rq->elevator_private = NULL;
+ rq->elevator.private = NULL;
return 0;
}
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 8a082a5..0c569ec 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -99,25 +99,29 @@ struct request {
/*
* The rb_node is only used inside the io scheduler, requests
* are pruned when moved to the dispatch queue. So let the
- * flush fields share space with the rb_node.
+ * completion_data share space with the rb_node.
*/
union {
struct rb_node rb_node; /* sort/lookup */
- struct {
- unsigned int seq;
- struct list_head list;
- } flush;
+ void *completion_data;
};
- void *completion_data;
On Tue, Jan 25 2011 at 3:41pm -0500,
Mike Snitzer <[email protected]> wrote:
> Hi Tejun,
>
> On Fri, Jan 21 2011 at 10:59am -0500,
> Tejun Heo <[email protected]> wrote:
> >
> > * As flush requests are never put on the IO scheduler, request fields
> > used for flush share space with rq->rb_node. rq->completion_data is
> > moved out of the union. This increases the request size by one
> > pointer.
> >
> > As rq->elevator_private* are used only by the iosched too, it is
> > possible to reduce the request size further. However, to do that,
> > we need to modify request allocation path such that iosched data is
> > not allocated for flush requests.
>
> I decided to take a crack at using rq->elevator_private* and came up
> with the following patch.
>
> Unfortunately, in testing I found that flush requests that have data do
> in fact eventually get added to the queue as normal requests, via:
> 1) "data but flush is not necessary" case in blk_insert_flush
> 2) REQ_FSEQ_DATA case in blk_flush_complete_seq
Vivek helped me understand that adding the request to the queue doesn't
mean it goes to the elevator; the request is inserted directly into
the underlying queue.
That embarrassing oversight aside, the flush request is still getting to
the elevator somehow -- even though elv_set_request() was clearly not
called.
It is an interesting duality that:
1) REQ_ELVPRIV is never set because priv=0 is passed to blk_alloc_request
2) yet when blk_free_request() checks rq->cmd_flags, REQ_ELVPRIV is set,
resulting in the call to elv_put_request()
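For context, the alloc/free pair in question looks roughly like this
(paraphrased from blk-core.c, from memory):

	static struct request *
	blk_alloc_request(struct request_queue *q, int flags, int priv,
			  gfp_t gfp_mask)
	{
		struct request *rq = mempool_alloc(q->rq.rq_pool, gfp_mask);

		if (!rq)
			return NULL;

		blk_rq_init(q, rq);
		rq->cmd_flags = flags | REQ_ALLOCED;

		/* REQ_ELVPRIV is only ever set here, and only if priv != 0 */
		if (priv) {
			if (unlikely(elv_set_request(q, rq, gfp_mask))) {
				mempool_free(rq, q->rq.rq_pool);
				return NULL;
			}
			rq->cmd_flags |= REQ_ELVPRIV;
		}

		return rq;
	}

	static void blk_free_request(struct request_queue *q, struct request *rq)
	{
		/* elv_put_request() should only be reachable if priv != 0 above */
		if (rq->cmd_flags & REQ_ELVPRIV)
			elv_put_request(q, rq);

		mempool_free(rq, q->rq.rq_pool);
	}

So REQ_ELVPRIV should only ever end up on a request when priv != 0 was
passed in.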
> I know this because my following get_request() change to _not_ call
> elv_set_request() for flush requests hit cfq_put_request()'s
> BUG_ON(!cfqq->allocated[rw]).
FYI, here is the backtrace:
PID: 0 TASK: ffff88007ccd6b30 CPU: 1 COMMAND: "swapper"
#0 [ffff880002103930] show_trace_log_lvl at ffffffff8100f3ec
#1 [ffff880002103980] delay_tsc at ffffffff8125e62a
#2 [ffff8800021039b0] __const_udelay at ffffffff8125e5d6
#3 [ffff8800021039c0] panic at ffffffff814c3604
#4 [ffff880002103a40] oops_end at ffffffff814c7622
#5 [ffff880002103a70] die at ffffffff8100f33b
#6 [ffff880002103aa0] do_trap at ffffffff814c6ec4
#7 [ffff880002103b00] do_invalid_op at ffffffff8100cee5
#8 [ffff880002103ba0] invalid_op at ffffffff8100bf5b
[exception RIP: cfq_put_request+128]
RIP: ffffffff8124f000 RSP: ffff880002103c58 RFLAGS: 00010046
RAX: 0000000000000019 RBX: ffff88007b5668a0 RCX: 0000000000002c7b
RDX: 0000000000000000 RSI: ffff88007b5668a0 RDI: ffff88007b5668a0
RBP: ffff880002103c68 R8: 0000000000000003 R9: 0000000000000001
R10: 0000000000000003 R11: 0000000000000003 R12: ffff88007b566940
R13: 00000000018c2441 R14: 0000000000000001 R15: 00000000000000a5
ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0018
#9 [ffff880002103c70] elv_put_request at ffffffff8123208e
#10 [ffff880002103c80] __blk_put_request at ffffffff8123ae23
#11 [ffff880002103cb0] blk_finish_request at ffffffff8123b049
#12 [ffff880002103d00] __blk_end_request_all at ffffffff8123b0fb
#13 [ffff880002103d20] blk_flush_complete_seq at ffffffff8123de4c
#14 [ffff880002103d50] flush_end_io at ffffffff8123e095
#15 [ffff880002103da0] blk_finish_request at ffffffff8123aedb
#16 [ffff880002103df0] __blk_end_request_all at ffffffff8123b0fb
#17 [ffff880002103e10] blk_done at ffffffffa002e085
#18 [ffff880002103e50] vring_interrupt at ffffffffa001f19c
#19 [ffff880002103e70] vp_vring_interrupt at ffffffffa00264bb
#20 [ffff880002103ec0] vp_interrupt at ffffffffa0026544
#21 [ffff880002103ee0] handle_IRQ_event at ffffffff810d10b0
#22 [ffff880002103f30] handle_fasteoi_irq at ffffffff810d38a9
#23 [ffff880002103f60] handle_irq at ffffffff8100dfb9
#24 [ffff880002103f80] do_IRQ at ffffffff814cb32c
--- <IRQ stack> ---
#25 [ffff88007ccf9e28] ret_from_intr at ffffffff8100bad3
[exception RIP: native_safe_halt+11]
RIP: ffffffff81033f0b RSP: ffff88007ccf9ed8 RFLAGS: 00000246
RAX: 0000000000000000 RBX: ffff88007ccf9ed8 RCX: 0000000000000000
RDX: 0000000000000000 RSI: 0000000000000001 RDI: ffffffff81d0f1e8
RBP: ffffffff8100bace R8: 0000000000000000 R9: 0000000000000001
R10: 0000000000000000 R11: 00000000fffba7c1 R12: 0000000000000001
R13: ffff88007ccf9e68 R14: 0000000281075d93 R15: ffff88007ccf9e98
ORIG_RAX: ffffffffffffffc4 CS: 0010 SS: 0018
#26 [ffff88007ccf9ee0] default_idle at ffffffff81013e0d
#27 [ffff88007ccf9f00] cpu_idle at ffffffff81009e96
On Tue, Jan 25 2011 at 4:55pm -0500,
Mike Snitzer <[email protected]> wrote:
> On Tue, Jan 25 2011 at 3:41pm -0500,
> Mike Snitzer <[email protected]> wrote:
>
> > Hi Tejun,
> >
> > On Fri, Jan 21 2011 at 10:59am -0500,
> > Tejun Heo <[email protected]> wrote:
> > >
> > > * As flush requests are never put on the IO scheduler, request fields
> > > used for flush share space with rq->rb_node. rq->completion_data is
> > > moved out of the union. This increases the request size by one
> > > pointer.
> > >
> > > As rq->elevator_private* are used only by the iosched too, it is
> > > possible to reduce the request size further. However, to do that,
> > > we need to modify request allocation path such that iosched data is
> > > not allocated for flush requests.
> >
> > I decided to take a crack at using rq->elevator_private* and came up
> > with the following patch.
...
> It is an interesting duality that:
> 1) REQ_ELVPRIV is never set because priv=0 is passed to blk_alloc_request
> 2) yet when blk_free_request() checks rq->cmd_flags, REQ_ELVPRIV is set,
> resulting in the call to elv_put_request()
Turns out priv=0 was never actually being established in get_request()
on the kernel I was testing (see below).
> > I know this because my following get_request() change to _not_ call
> > elv_set_request() for flush requests hit cfq_put_request()'s
> > BUG_ON(!cfqq->allocated[rw]).
>
> FYI, here is the backtrace:
That backtrace was from a RHEL6 port of your recent flush merge work.
One RHELism associated with the RHEL6 flush+fua port in general (and now
this flush merge port) is that bio and request flags are _not_ shared.
As such, I introduced BIO_FLUSH and BIO_FUA flags in RHEL6. Long story
short, my 4/3 patch works just fine on top of your 3 patches that Jens
just staged for 2.6.39.
So sorry for the noise! I'm kicking myself for having tainted my patch
(and your work indirectly) by making an issue out of nothing.
Taking a step back, what do you think of my proposed patch?
Mike
Hello,
On Tue, Jan 25, 2011 at 03:41:58PM -0500, Mike Snitzer wrote:
> Unfortunately, in testing I found that flush requests that have data do
> in fact eventually get added to the queue as normal requests, via:
> 1) "data but flush is not necessary" case in blk_insert_flush
> 2) REQ_FSEQ_DATA case in blk_flush_complete_seq
>
> I know this because my following get_request() change to _not_ call
> elv_set_request() for flush requests hit cfq_put_request()'s
> BUG_ON(!cfqq->allocated[rw]). cfqq->allocated[rw] gets set via
> elv_set_request()'s call to cfq_set_request().
>
> So this seems to call into question the running theory that flush
> requests can share 'struct request' space with elevator-specific members
> (via union) -- be it rq->rb_node or rq->elevator_private*.
As this part seems to have already been resolved, I'm skipping it.
> diff --git a/block/blk-core.c b/block/blk-core.c
> index 72dd23b..f507888 100644
> --- a/block/blk-core.c
> +++ b/block/blk-core.c
> @@ -764,7 +764,7 @@ static struct request *get_request(struct request_queue *q, int rw_flags,
> struct request_list *rl = &q->rq;
> struct io_context *ioc = NULL;
> const bool is_sync = rw_is_sync(rw_flags) != 0;
> - int may_queue, priv;
> + int may_queue, priv = 0;
>
> may_queue = elv_may_queue(q, rw_flags);
> if (may_queue == ELV_MQUEUE_NO)
> @@ -808,9 +808,14 @@ static struct request *get_request(struct request_queue *q, int rw_flags,
> rl->count[is_sync]++;
> rl->starved[is_sync] = 0;
>
> - priv = !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags);
> - if (priv)
> - rl->elvpriv++;
> + /*
> + * Skip elevator initialization for flush requests
> + */
> + if (!(bio && (bio->bi_rw & (REQ_FLUSH | REQ_FUA)))) {
> + priv = !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags);
> + if (priv)
> + rl->elvpriv++;
> + }
I thought about doing it this way but I think we're burying the
REQ_FLUSH|REQ_FUA test logic too deep. get_request() shouldn't
"magically" know not to allocate elevator data. The decision should
be made higher in the stack and passed down to get_request(). e.g. if
REQ_SORTED is set in @rw, elevator data is allocated; otherwise, not.
> diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
> index 8a082a5..0c569ec 100644
> --- a/include/linux/blkdev.h
> +++ b/include/linux/blkdev.h
> @@ -99,25 +99,29 @@ struct request {
> /*
> * The rb_node is only used inside the io scheduler, requests
> * are pruned when moved to the dispatch queue. So let the
> - * flush fields share space with the rb_node.
> + * completion_data share space with the rb_node.
> */
> union {
> struct rb_node rb_node; /* sort/lookup */
> - struct {
> - unsigned int seq;
> - struct list_head list;
> - } flush;
> + void *completion_data;
> };
>
> - void *completion_data;
> -
> /*
> * Three pointers are available for the IO schedulers, if they need
> - * more they have to dynamically allocate it.
> + * more they have to dynamically allocate it. Let the flush fields
> + * share space with these three pointers.
> */
> - void *elevator_private;
> - void *elevator_private2;
> - void *elevator_private3;
> + union {
> + struct {
> + void *private;
> + void *private2;
> + void *private3;
> + } elevator;
> + struct {
> + unsigned int seq;
> + struct list_head list;
> + } flush;
> + };
Another thing is, can we please make private* an array? The number
postfixes are irksome. It's even one-based instead of zero-based!
Also, it would be great to better describe the lifetime difference
between the first and the second unions and why it has to be organized
this way (rb_node and completion_data can live together but rb_node
and flush can't).
Thank you.
--
tejun
On Wed, Jan 26, 2011 at 11:03:22AM +0100, Tejun Heo wrote:
> Also, it would be great to better describe the lifetime difference
> between the first and the second unions and why it has to be organized
> this way (rb_node and completion_data can live together but rb_node
> and flush can't).
Oops, what can't live together are elevator_private* and
completion_data.
Thanks.
--
tejun
On Wed, Jan 26 2011 at 5:03am -0500,
Tejun Heo <[email protected]> wrote:
> > diff --git a/block/blk-core.c b/block/blk-core.c
> > index 72dd23b..f507888 100644
> > --- a/block/blk-core.c
> > +++ b/block/blk-core.c
> > @@ -764,7 +764,7 @@ static struct request *get_request(struct request_queue *q, int rw_flags,
> > struct request_list *rl = &q->rq;
> > struct io_context *ioc = NULL;
> > const bool is_sync = rw_is_sync(rw_flags) != 0;
> > - int may_queue, priv;
> > + int may_queue, priv = 0;
> >
> > may_queue = elv_may_queue(q, rw_flags);
> > if (may_queue == ELV_MQUEUE_NO)
> > @@ -808,9 +808,14 @@ static struct request *get_request(struct request_queue *q, int rw_flags,
> > rl->count[is_sync]++;
> > rl->starved[is_sync] = 0;
> >
> > - priv = !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags);
> > - if (priv)
> > - rl->elvpriv++;
> > + /*
> > + * Skip elevator initialization for flush requests
> > + */
> > + if (!(bio && (bio->bi_rw & (REQ_FLUSH | REQ_FUA)))) {
> > + priv = !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags);
> > + if (priv)
> > + rl->elvpriv++;
> > + }
>
> I thought about doing it this way but I think we're burying the
> REQ_FLUSH|REQ_FUA test logic too deep. get_request() shouldn't
> "magically" know not to allocate elevator data.
There is already a considerable amount of REQ_FLUSH|REQ_FUA special
casing magic sprinkled throughout the block layer. Why is this
get_request() change the one that goes too far?
> The decision should
> be made higher in the stack and passed down to get_request(). e.g. if
> REQ_SORTED is set in @rw, elevator data is allocated; otherwise, not.
Considering REQ_SORTED is set in elv_insert(), well after get_request()
is called, I'm not seeing what you're suggesting.
Anyway, I agree that ideally we'd have a mechanism to explicitly
short-circuit elevator initialization. But doing so in a meaningful way
would likely require a fair amount of refactoring of get_request* and
its callers. I'll come back to this and have another look, but my gut is
that this interface churn wouldn't _really_ help -- all things considered.
> > diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
> > index 8a082a5..0c569ec 100644
> > --- a/include/linux/blkdev.h
> > +++ b/include/linux/blkdev.h
> > @@ -99,25 +99,29 @@ struct request {
> > /*
> > * The rb_node is only used inside the io scheduler, requests
> > * are pruned when moved to the dispatch queue. So let the
> > - * flush fields share space with the rb_node.
> > + * completion_data share space with the rb_node.
> > */
> > union {
> > struct rb_node rb_node; /* sort/lookup */
> > - struct {
> > - unsigned int seq;
> > - struct list_head list;
> > - } flush;
> > + void *completion_data;
> > };
> >
> > - void *completion_data;
> > -
> > /*
> > * Three pointers are available for the IO schedulers, if they need
> > - * more they have to dynamically allocate it.
> > + * more they have to dynamically allocate it. Let the flush fields
> > + * share space with these three pointers.
> > */
> > - void *elevator_private;
> > - void *elevator_private2;
> > - void *elevator_private3;
> > + union {
> > + struct {
> > + void *private;
> > + void *private2;
> > + void *private3;
> > + } elevator;
> > + struct {
> > + unsigned int seq;
> > + struct list_head list;
> > + } flush;
> > + };
>
> Another thing is, can we please make private* an array? The number
> postfixes are irksome. It's even one-based instead of zero-based!
Sure, I can sort that out.
> > Also, it would be great to better describe the lifetime difference
> > between the first and the second unions and why it has to be organized
> > this way (rb_node and completion_data can live together but rb_node
> > and flush can't).
>
> Oops, what can't live together are elevator_private* and
> completion_data.
I'll better describe the 2nd union's sharing in the next revision.
Mike
Hello,
On Tue, Feb 01, 2011 at 12:38:46PM -0500, Mike Snitzer wrote:
> > I thought about doing it this way but I think we're burying the
> > REQ_FLUSH|REQ_FUA test logic too deep. get_request() shouldn't
> > "magically" know not to allocate elevator data.
>
> There is already a considerable amount of REQ_FLUSH|REQ_FUA special
> casing magic sprinkled throughout the block layer. Why is this
> get_request() change the one that goes too far?
After the reimplementation, the FLUSH implementation seems to be pretty
well isolated. Also, having REQ_FLUSH logic in the issue and
completion paths is logical, and preventing it from leaking to other
places sounds like a good idea.
> > The decision should
> > be made higher in the stack and passed down to get_request(). e.g. if
> > REQ_SORTED is set in @rw, elevator data is allocated; otherwise, not.
>
> Considering REQ_SORTED is set in elv_insert(), well after get_request()
> is called, I'm not seeing what you're suggesting.
I was suggesting using REQ_SORTED in the @rw parameter to indicate "this
request may be sorted and thus needs elevator data allocation".
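IOW, something along these lines in __make_request() before calling the
allocator (just a sketch):

	/* tell the allocator this request may be sorted into the elevator */
	if (!(bio->bi_rw & (REQ_FLUSH | REQ_FUA)))
		rw_flags |= REQ_SORTED;

	req = get_request_wait(q, rw_flags, bio);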
> Anyway, I agree that ideally we'd have a mechanism to explicitly
> short-circuit elevator initialization. But doing so in a meaningful way
> would likely require a fair amount of refactoring of get_request* and
> its callers. I'll come back to this and have another look but my gut is
> this interface churn wouldn't _really_ help -- all things considered.
I don't know. I agree that it's not a critical issue but, to me,
subjectively of course, it feels a bit too subtle. The sharing of
fields using unions is already subtle enough. I wish that at least
the allocation switching would be obvious and explicit. The combined
subtleties scare me.
Thank you.
--
tejun
Flush requests are never put on the IO scheduler. Convert request
structure's elevator_private* into an array and have the flush fields
share a union with it.
Reclaim the space lost in 'struct request' by moving 'completion_data'
back in the union with 'rb_node'.
Signed-off-by: Mike Snitzer <[email protected]>
---
block/cfq-iosched.c | 18 +++++++++---------
block/elevator.c | 2 +-
include/linux/blkdev.h | 23 ++++++++++++-----------
3 files changed, 22 insertions(+), 21 deletions(-)
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 501ffdf..8692958 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -54,9 +54,9 @@ static const int cfq_hist_divisor = 4;
#define CFQQ_SEEKY(cfqq) (hweight32(cfqq->seek_history) > 32/8)
#define RQ_CIC(rq) \
- ((struct cfq_io_context *) (rq)->elevator_private)
-#define RQ_CFQQ(rq) (struct cfq_queue *) ((rq)->elevator_private2)
-#define RQ_CFQG(rq) (struct cfq_group *) ((rq)->elevator_private3)
+ ((struct cfq_io_context *) (rq)->elevator_private[0])
+#define RQ_CFQQ(rq) (struct cfq_queue *) ((rq)->elevator_private[1])
+#define RQ_CFQG(rq) (struct cfq_group *) ((rq)->elevator_private[2])
static struct kmem_cache *cfq_pool;
static struct kmem_cache *cfq_ioc_pool;
@@ -3609,12 +3609,12 @@ static void cfq_put_request(struct request *rq)
put_io_context(RQ_CIC(rq)->ioc);
- rq->elevator_private = NULL;
- rq->elevator_private2 = NULL;
+ rq->elevator_private[0] = NULL;
+ rq->elevator_private[1] = NULL;
/* Put down rq reference on cfqg */
cfq_put_cfqg(RQ_CFQG(rq));
- rq->elevator_private3 = NULL;
+ rq->elevator_private[2] = NULL;
cfq_put_queue(cfqq);
}
@@ -3702,9 +3702,9 @@ new_queue:
cfqq->allocated[rw]++;
cfqq->ref++;
- rq->elevator_private = cic;
- rq->elevator_private2 = cfqq;
- rq->elevator_private3 = cfq_ref_get_cfqg(cfqq->cfqg);
+ rq->elevator_private[0] = cic;
+ rq->elevator_private[1] = cfqq;
+ rq->elevator_private[2] = cfq_ref_get_cfqg(cfqq->cfqg);
spin_unlock_irqrestore(q->queue_lock, flags);
diff --git a/block/elevator.c b/block/elevator.c
index 270e097..f98e92e 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -764,7 +764,7 @@ int elv_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask)
if (e->ops->elevator_set_req_fn)
return e->ops->elevator_set_req_fn(q, rq, gfp_mask);
- rq->elevator_private = NULL;
+ rq->elevator_private[0] = NULL;
return 0;
}
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 8a082a5..e3ee74f 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -99,25 +99,26 @@ struct request {
/*
* The rb_node is only used inside the io scheduler, requests
* are pruned when moved to the dispatch queue. So let the
- * flush fields share space with the rb_node.
+ * completion_data share space with the rb_node.
*/
union {
struct rb_node rb_node; /* sort/lookup */
- struct {
- unsigned int seq;
- struct list_head list;
- } flush;
+ void *completion_data;
};
- void *completion_data;
Skip elevator initialization during request allocation if REQ_SORTED
is not set in the @rw_flags passed to the request allocator.
Set REQ_SORTED for all requests that may be put on the IO scheduler. Flush
requests are not put on the IO scheduler, so REQ_SORTED is not set for
them.
Signed-off-by: Mike Snitzer <[email protected]>
---
block/blk-core.c | 24 +++++++++++++++++++-----
1 files changed, 19 insertions(+), 5 deletions(-)
diff --git a/block/blk-core.c b/block/blk-core.c
index 72dd23b..f6fcc64 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -764,7 +764,7 @@ static struct request *get_request(struct request_queue *q, int rw_flags,
struct request_list *rl = &q->rq;
struct io_context *ioc = NULL;
const bool is_sync = rw_is_sync(rw_flags) != 0;
- int may_queue, priv;
+ int may_queue, priv = 0;
may_queue = elv_may_queue(q, rw_flags);
if (may_queue == ELV_MQUEUE_NO)
@@ -808,9 +808,14 @@ static struct request *get_request(struct request_queue *q, int rw_flags,
rl->count[is_sync]++;
rl->starved[is_sync] = 0;
- priv = !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags);
- if (priv)
- rl->elvpriv++;
+ /*
+ * Only initialize elevator data if REQ_SORTED is set.
+ */
+ if (rw_flags & REQ_SORTED) {
+ priv = !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags);
+ if (priv)
+ rl->elvpriv++;
+ }
if (blk_queue_io_stat(q))
rw_flags |= REQ_IO_STAT;
@@ -1197,6 +1202,7 @@ static int __make_request(struct request_queue *q, struct bio *bio)
const unsigned short prio = bio_prio(bio);
const bool sync = !!(bio->bi_rw & REQ_SYNC);
const bool unplug = !!(bio->bi_rw & REQ_UNPLUG);
+ const bool flush = !!(bio->bi_rw & (REQ_FLUSH | REQ_FUA));
const unsigned long ff = bio->bi_rw & REQ_FAILFAST_MASK;
int where = ELEVATOR_INSERT_SORT;
int rw_flags;
@@ -1210,7 +1216,7 @@ static int __make_request(struct request_queue *q, struct bio *bio)
spin_lock_irq(q->queue_lock);
- if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) {
+ if (flush) {
where = ELEVATOR_INSERT_FLUSH;
goto get_rq;
}
@@ -1293,6 +1299,14 @@ get_rq:
rw_flags |= REQ_SYNC;
/*
+ * Set REQ_SORTED for all requests that may be put on IO scheduler.
+ * The request allocator's IO scheduler initialization will be skipped
+ * if REQ_SORTED is not set.
+ */
+ if (!flush)
+ rw_flags |= REQ_SORTED;
+
+ /*
* Grab a free request. This is might sleep but can not fail.
* Returns with the queue unlocked.
*/
--
1.7.3.4
On Tue, Feb 01, 2011 at 05:46:12PM -0500, Mike Snitzer wrote:
> Skip elevator initialization during request allocation if REQ_SORTED
> is not set in the @rw_flags passed to the request allocator.
>
> Set REQ_SORTED for all requests that may be put on the IO scheduler. Flush
> requests are not put on the IO scheduler, so REQ_SORTED is not set for
> them.
So we are doing all this so that elevator_private and flush data can
share the space through a union, and we can avoid increasing the size
of struct rq by 1 pointer (4 or 8 bytes depending on arch)?
Looks good to me. One minor comment inline.
Acked-by: Vivek Goyal <[email protected]>
Vivek
>
> Signed-off-by: Mike Snitzer <[email protected]>
> ---
> block/blk-core.c | 24 +++++++++++++++++++-----
> 1 files changed, 19 insertions(+), 5 deletions(-)
>
> diff --git a/block/blk-core.c b/block/blk-core.c
> index 72dd23b..f6fcc64 100644
> --- a/block/blk-core.c
> +++ b/block/blk-core.c
> @@ -764,7 +764,7 @@ static struct request *get_request(struct request_queue *q, int rw_flags,
> struct request_list *rl = &q->rq;
> struct io_context *ioc = NULL;
> const bool is_sync = rw_is_sync(rw_flags) != 0;
> - int may_queue, priv;
> + int may_queue, priv = 0;
>
> may_queue = elv_may_queue(q, rw_flags);
> if (may_queue == ELV_MQUEUE_NO)
> @@ -808,9 +808,14 @@ static struct request *get_request(struct request_queue *q, int rw_flags,
> rl->count[is_sync]++;
> rl->starved[is_sync] = 0;
>
> - priv = !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags);
> - if (priv)
> - rl->elvpriv++;
> + /*
> + * Only initialize elevator data if REQ_SORTED is set.
> + */
> + if (rw_flags & REQ_SORTED) {
> + priv = !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags);
> + if (priv)
> + rl->elvpriv++;
> + }
>
> if (blk_queue_io_stat(q))
> rw_flags |= REQ_IO_STAT;
> @@ -1197,6 +1202,7 @@ static int __make_request(struct request_queue *q, struct bio *bio)
> const unsigned short prio = bio_prio(bio);
> const bool sync = !!(bio->bi_rw & REQ_SYNC);
> const bool unplug = !!(bio->bi_rw & REQ_UNPLUG);
> + const bool flush = !!(bio->bi_rw & (REQ_FLUSH | REQ_FUA));
> const unsigned long ff = bio->bi_rw & REQ_FAILFAST_MASK;
> int where = ELEVATOR_INSERT_SORT;
> int rw_flags;
> @@ -1210,7 +1216,7 @@ static int __make_request(struct request_queue *q, struct bio *bio)
>
> spin_lock_irq(q->queue_lock);
>
> - if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) {
> + if (flush) {
> where = ELEVATOR_INSERT_FLUSH;
> goto get_rq;
> }
> @@ -1293,6 +1299,14 @@ get_rq:
> rw_flags |= REQ_SYNC;
>
> /*
> + * Set REQ_SORTED for all requests that may be put on IO scheduler.
> + * The request allocator's IO scheduler initialization will be skipped
> + * if REQ_SORTED is not set.
> + */
Do you want to mention here why we want to avoid IO scheduler
initialization? Specifically, mention that set_request() is avoided so
that elevator_private[*] are not initialized and that space can be
used by flush request data.
> + if (!flush)
> + rw_flags |= REQ_SORTED;
> +
> + /*
> * Grab a free request. This is might sleep but can not fail.
> * Returns with the queue unlocked.
> */
> --
> 1.7.3.4
On Tue, Feb 01, 2011 at 05:46:13PM -0500, Mike Snitzer wrote:
> Flush requests are never put on the IO scheduler. Convert request
> structure's elevator_private* into an array and have the flush fields
> share a union with it.
>
> Reclaim the space lost in 'struct request' by moving 'completion_data'
> back in the union with 'rb_node'.
>
Looks good to me.
Acked-by: Vivek Goyal <[email protected]>
Vivek
> Signed-off-by: Mike Snitzer <[email protected]>
> ---
> block/cfq-iosched.c | 18 +++++++++---------
> block/elevator.c | 2 +-
> include/linux/blkdev.h | 23 ++++++++++++-----------
> 3 files changed, 22 insertions(+), 21 deletions(-)
>
> diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
> index 501ffdf..8692958 100644
> --- a/block/cfq-iosched.c
> +++ b/block/cfq-iosched.c
> @@ -54,9 +54,9 @@ static const int cfq_hist_divisor = 4;
> #define CFQQ_SEEKY(cfqq) (hweight32(cfqq->seek_history) > 32/8)
>
> #define RQ_CIC(rq) \
> - ((struct cfq_io_context *) (rq)->elevator_private)
> -#define RQ_CFQQ(rq) (struct cfq_queue *) ((rq)->elevator_private2)
> -#define RQ_CFQG(rq) (struct cfq_group *) ((rq)->elevator_private3)
> + ((struct cfq_io_context *) (rq)->elevator_private[0])
> +#define RQ_CFQQ(rq) (struct cfq_queue *) ((rq)->elevator_private[1])
> +#define RQ_CFQG(rq) (struct cfq_group *) ((rq)->elevator_private[2])
>
> static struct kmem_cache *cfq_pool;
> static struct kmem_cache *cfq_ioc_pool;
> @@ -3609,12 +3609,12 @@ static void cfq_put_request(struct request *rq)
>
> put_io_context(RQ_CIC(rq)->ioc);
>
> - rq->elevator_private = NULL;
> - rq->elevator_private2 = NULL;
> + rq->elevator_private[0] = NULL;
> + rq->elevator_private[1] = NULL;
>
> /* Put down rq reference on cfqg */
> cfq_put_cfqg(RQ_CFQG(rq));
> - rq->elevator_private3 = NULL;
> + rq->elevator_private[2] = NULL;
>
> cfq_put_queue(cfqq);
> }
> @@ -3702,9 +3702,9 @@ new_queue:
>
> cfqq->allocated[rw]++;
> cfqq->ref++;
> - rq->elevator_private = cic;
> - rq->elevator_private2 = cfqq;
> - rq->elevator_private3 = cfq_ref_get_cfqg(cfqq->cfqg);
> + rq->elevator_private[0] = cic;
> + rq->elevator_private[1] = cfqq;
> + rq->elevator_private[2] = cfq_ref_get_cfqg(cfqq->cfqg);
>
> spin_unlock_irqrestore(q->queue_lock, flags);
>
> diff --git a/block/elevator.c b/block/elevator.c
> index 270e097..f98e92e 100644
> --- a/block/elevator.c
> +++ b/block/elevator.c
> @@ -764,7 +764,7 @@ int elv_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask)
> if (e->ops->elevator_set_req_fn)
> return e->ops->elevator_set_req_fn(q, rq, gfp_mask);
>
> - rq->elevator_private = NULL;
> + rq->elevator_private[0] = NULL;
> return 0;
> }
>
> diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
> index 8a082a5..e3ee74f 100644
> --- a/include/linux/blkdev.h
> +++ b/include/linux/blkdev.h
> @@ -99,25 +99,26 @@ struct request {
> /*
> * The rb_node is only used inside the io scheduler, requests
> * are pruned when moved to the dispatch queue. So let the
> - * flush fields share space with the rb_node.
> + * completion_data share space with the rb_node.
> */
> union {
> struct rb_node rb_node; /* sort/lookup */
> - struct {
> - unsigned int seq;
> - struct list_head list;
> - } flush;
> + void *completion_data;
> };
>
> - void *completion_data;
> -
> /*
> * Three pointers are available for the IO schedulers, if they need
> - * more they have to dynamically allocate it.
> + * more they have to dynamically allocate it. Flush requests are
> + * never put on the IO scheduler. So let the flush fields share
> + * space with the three elevator_private pointers.
> */
> - void *elevator_private;
> - void *elevator_private2;
> - void *elevator_private3;
> + union {
> + void *elevator_private[3];
> + struct {
> + unsigned int seq;
> + struct list_head list;
> + } flush;
> + };
>
> struct gendisk *rq_disk;
> struct hd_struct *part;
> --
> 1.7.3.4
On Wed, Feb 02 2011 at 4:51pm -0500,
Vivek Goyal <[email protected]> wrote:
> On Tue, Feb 01, 2011 at 05:46:12PM -0500, Mike Snitzer wrote:
> > Skip elevator initialization during request allocation if REQ_SORTED
> > is not set in the @rw_flags passed to the request allocator.
> >
> > Set REQ_SORTED for all requests that may be put on the IO scheduler. Flush
> > requests are not put on the IO scheduler, so REQ_SORTED is not set for
> > them.
>
> So we are doing all this so that elevator_private and flush data can
> share the space through a union, and we can avoid increasing the size
> of struct rq by 1 pointer (4 or 8 bytes depending on arch)?
Correct.
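Back-of-the-envelope, the overlay is also a perfect fit: on 64-bit the
three pointers occupy 24 bytes, and the flush struct (4-byte seq, 4 bytes
of padding, 16-byte list_head) also comes to 24; on 32-bit it is 12 bytes
either way. So the union adds nothing beyond the larger of its members.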
> > diff --git a/block/blk-core.c b/block/blk-core.c
> > index 72dd23b..f6fcc64 100644
> > --- a/block/blk-core.c
> > +++ b/block/blk-core.c
> > @@ -1197,6 +1202,7 @@ static int __make_request(struct request_queue *q, struct bio *bio)
> > const unsigned short prio = bio_prio(bio);
> > const bool sync = !!(bio->bi_rw & REQ_SYNC);
> > const bool unplug = !!(bio->bi_rw & REQ_UNPLUG);
> > + const bool flush = !!(bio->bi_rw & (REQ_FLUSH | REQ_FUA));
> > const unsigned long ff = bio->bi_rw & REQ_FAILFAST_MASK;
> > int where = ELEVATOR_INSERT_SORT;
> > int rw_flags;
> > @@ -1210,7 +1216,7 @@ static int __make_request(struct request_queue *q, struct bio *bio)
> >
> > spin_lock_irq(q->queue_lock);
> >
> > - if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) {
> > + if (flush) {
> > where = ELEVATOR_INSERT_FLUSH;
> > goto get_rq;
> > }
> > @@ -1293,6 +1299,14 @@ get_rq:
> > rw_flags |= REQ_SYNC;
> >
> > /*
> > + * Set REQ_SORTED for all requests that may be put on IO scheduler.
> > + * The request allocator's IO scheduler initialization will be skipped
> > + * if REQ_SORTED is not set.
> > + */
>
> Do you want to mention here why we want to avoid IO scheduler
> initialization? Specifically, mention that set_request() is avoided so
> that elevator_private[*] are not initialized and that space can be
> used by flush request data.
Sure, I'll post a v3 for this patch with that edit and your Acked-by.
Thanks,
Mike
Set REQ_SORTED, in the @rw_flags passed to the request allocator, for
any request that may be put on the IO scheduler. Skip elevator data
initialization during request allocation if REQ_SORTED is not set.
REQ_SORTED is not set for flush requests because they are never put on
the IO scheduler.
Signed-off-by: Mike Snitzer <[email protected]>
Acked-by: Vivek Goyal <[email protected]>
---
block/blk-core.c | 26 +++++++++++++++++++++-----
1 file changed, 21 insertions(+), 5 deletions(-)
v3: edits to patch header and __make_request() comment
Index: linux-2.6/block/blk-core.c
===================================================================
--- linux-2.6.orig/block/blk-core.c
+++ linux-2.6/block/blk-core.c
@@ -764,7 +764,7 @@ static struct request *get_request(struc
struct request_list *rl = &q->rq;
struct io_context *ioc = NULL;
const bool is_sync = rw_is_sync(rw_flags) != 0;
- int may_queue, priv;
+ int may_queue, priv = 0;
may_queue = elv_may_queue(q, rw_flags);
if (may_queue == ELV_MQUEUE_NO)
@@ -808,9 +808,14 @@ static struct request *get_request(struc
rl->count[is_sync]++;
rl->starved[is_sync] = 0;
- priv = !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags);
- if (priv)
- rl->elvpriv++;
+ /*
+ * Only initialize elevator data if REQ_SORTED is set.
+ */
+ if (rw_flags & REQ_SORTED) {
+ priv = !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags);
+ if (priv)
+ rl->elvpriv++;
+ }
if (blk_queue_io_stat(q))
rw_flags |= REQ_IO_STAT;
@@ -1197,6 +1202,7 @@ static int __make_request(struct request
const unsigned short prio = bio_prio(bio);
const bool sync = !!(bio->bi_rw & REQ_SYNC);
const bool unplug = !!(bio->bi_rw & REQ_UNPLUG);
+ const bool flush = !!(bio->bi_rw & (REQ_FLUSH | REQ_FUA));
const unsigned long ff = bio->bi_rw & REQ_FAILFAST_MASK;
int where = ELEVATOR_INSERT_SORT;
int rw_flags;
@@ -1210,7 +1216,7 @@ static int __make_request(struct request
spin_lock_irq(q->queue_lock);
- if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) {
+ if (flush) {
where = ELEVATOR_INSERT_FLUSH;
goto get_rq;
}
@@ -1293,6 +1299,16 @@ get_rq:
rw_flags |= REQ_SYNC;
/*
+ * Set REQ_SORTED for all requests that may be put on IO scheduler.
+ * The request allocator's IO scheduler initialization will be skipped
+ * if REQ_SORTED is not set -- elv_set_request() is avoided so that
+ * the allocated request's elevator_private pointers are not
+ * initialized and that space can be used by flush request data.
+ */
+ if (!flush)
+ rw_flags |= REQ_SORTED;
+
+ /*
* Grab a free request. This is might sleep but can not fail.
* Returns with the queue unlocked.
*/
On Tue, Feb 01, 2011 at 05:46:13PM -0500, Mike Snitzer wrote:
> Flush requests are never put on the IO scheduler. Convert request
> structure's elevator_private* into an array and have the flush fields
> share a union with it.
>
> Reclaim the space lost in 'struct request' by moving 'completion_data'
> back in the union with 'rb_node'.
>
> Signed-off-by: Mike Snitzer <[email protected]>
Acked-by: Tejun Heo <[email protected]>
Thanks.
--
tejun
Hello,
On Wed, Feb 02, 2011 at 05:55:49PM -0500, Mike Snitzer wrote:
> @@ -808,9 +808,14 @@ static struct request *get_request(struc
> rl->count[is_sync]++;
> rl->starved[is_sync] = 0;
>
> - priv = !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags);
> - if (priv)
> - rl->elvpriv++;
> + /*
> + * Only initialize elevator data if REQ_SORTED is set.
> + */
> + if (rw_flags & REQ_SORTED) {
> + priv = !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags);
> + if (priv)
> + rl->elvpriv++;
> + }
This isn't enough. Now the allocated requests would have REQ_SORTED
set before going into the elevator. You probably want to filter out
REQ_SORTED before passing the flags to elv_may_queue() too.
Also, it would be great if you update the comment on top of
get_request() to explain what @rw_flags does.
Thank you.
--
tejun
On 2011-02-02 23:55, Mike Snitzer wrote:
> Set REQ_SORTED, in the @rw_flags passed to the request allocator, for
> any request that may be put on the IO scheduler. Skip elevator data
> initialization during request allocation if REQ_SORTED is not set.
>
> REQ_SORTED is not set for flush requests because they are never put on
> the IO scheduler.
That looks very wrong. REQ_SORTED gets set _when_ the request is sorted
into the IO scheduler. This is gross misuse, a bad hack.
--
Jens Axboe
Hey, Jens.
On Thu, Feb 03, 2011 at 02:24:42PM +0100, Jens Axboe wrote:
> On 2011-02-02 23:55, Mike Snitzer wrote:
> > REQ_SORTED is not set for flush requests because they are never put on
> > the IO scheduler.
>
> That looks very wrong. REQ_SORTED gets set _when_ the request is sorted
> into the IO scheduler. This is gross misuse, a bad hack.
The rationale behind the suggestion was that it indicates to the allocator
that the request may be sorted, since how the request will be used is
communicated to the allocator using @rw_flags. The patch is buggy in
that the flag actually ends up on the request. Any better ideas on how
to communicate it?
Thanks.
--
tejun
Set REQ_SORTED, in the @rw_flags passed to the request allocator, for
all requests that may be put on the IO scheduler. REQ_SORTED is not set for
flush requests because they are never put on the IO scheduler.
Skip elevator data initialization during request allocation if
REQ_SORTED is not set in the @rw_flags passed to get_request().
Signed-off-by: Mike Snitzer <[email protected]>
Acked-by: Vivek Goyal <[email protected]>
---
block/blk-core.c | 33 ++++++++++++++++++++++++++++-----
1 file changed, 28 insertions(+), 5 deletions(-)
v4: fixed a bug where REQ_SORTED wasn't cleared on entry to get_request
- and Jens, yes I agree this is still a hack
Index: linux-2.6/block/blk-core.c
===================================================================
--- linux-2.6.orig/block/blk-core.c
+++ linux-2.6/block/blk-core.c
@@ -754,6 +754,9 @@ static void freed_request(struct request
/*
* Get a free request, queue_lock must be held.
+ * @rw_flags: may be overloaded to convey additional request features;
+ * any overloaded feature flags must be cleared immediately.
+ *
* Returns NULL on failure, with queue_lock held.
* Returns !NULL on success, with queue_lock *not held*.
*/
@@ -764,7 +767,11 @@ static struct request *get_request(struc
struct request_list *rl = &q->rq;
struct io_context *ioc = NULL;
const bool is_sync = rw_is_sync(rw_flags) != 0;
- int may_queue, priv;
+ const bool init_elv_data = !!(rw_flags & REQ_SORTED);
+ int may_queue, priv = 0;
+
+ if (init_elv_data)
+ rw_flags &= ~REQ_SORTED;
may_queue = elv_may_queue(q, rw_flags);
if (may_queue == ELV_MQUEUE_NO)
@@ -808,9 +815,14 @@ static struct request *get_request(struc
rl->count[is_sync]++;
rl->starved[is_sync] = 0;
- priv = !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags);
- if (priv)
- rl->elvpriv++;
+ /*
+ * Only initialize elevator data if IO scheduler may be used.
+ */
+ if (init_elv_data) {
+ priv = !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags);
+ if (priv)
+ rl->elvpriv++;
+ }
if (blk_queue_io_stat(q))
rw_flags |= REQ_IO_STAT;
@@ -1197,6 +1209,7 @@ static int __make_request(struct request
const unsigned short prio = bio_prio(bio);
const bool sync = !!(bio->bi_rw & REQ_SYNC);
const bool unplug = !!(bio->bi_rw & REQ_UNPLUG);
+ const bool flush = !!(bio->bi_rw & (REQ_FLUSH | REQ_FUA));
const unsigned long ff = bio->bi_rw & REQ_FAILFAST_MASK;
int where = ELEVATOR_INSERT_SORT;
int rw_flags;
@@ -1210,7 +1223,7 @@ static int __make_request(struct request
spin_lock_irq(q->queue_lock);
- if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) {
+ if (flush) {
where = ELEVATOR_INSERT_FLUSH;
goto get_rq;
}
@@ -1293,6 +1306,16 @@ get_rq:
rw_flags |= REQ_SYNC;
/*
+ * Set REQ_SORTED for all requests that may be put on IO scheduler.
+ * The request allocator's IO scheduler initialization will be skipped
+ * if REQ_SORTED is not set -- elv_set_request() is avoided so that
+ * the allocated request's elevator_private pointers are not
+ * initialized and that space can be used by flush request data.
+ */
+ if (!flush)
+ rw_flags |= REQ_SORTED;
+
+ /*
* Grab a free request. This is might sleep but can not fail.
* Returns with the queue unlocked.
*/
On Thu, Feb 03 2011 at 8:24am -0500,
Jens Axboe <[email protected]> wrote:
> On 2011-02-02 23:55, Mike Snitzer wrote:
> > Set REQ_SORTED, in the @rw_flags passed to the request allocator, for
> > any request that may be put on the IO scheduler. Skip elevator data
> > initialization during request allocation if REQ_SORTED is not set.
> >
> > REQ_SORTED is not set for flush requests because they are never put on
> > the IO scheduler.
>
> That looks very wrong. REQ_SORTED gets set _when_ the request is sorted
> into the IO scheduler.
Yes, a considerable oversight on my part.
> This is gross misuse, a bad hack.
As my recently posted v4 stated, even without the bug it is still a hack.
My initial version didn't get so cute with overloading flags, etc. For example:
diff --git a/block/blk-core.c b/block/blk-core.c
index 72dd23b..e098598 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -764,7 +764,7 @@ static struct request *get_request(struct request_queue *q, int rw_flags,
struct request_list *rl = &q->rq;
struct io_context *ioc = NULL;
const bool is_sync = rw_is_sync(rw_flags) != 0;
- int may_queue, priv;
+ int may_queue, priv = 0;
may_queue = elv_may_queue(q, rw_flags);
if (may_queue == ELV_MQUEUE_NO)
@@ -808,9 +808,14 @@ static struct request *get_request(struct request_queue *q, int rw_flags,
rl->count[is_sync]++;
rl->starved[is_sync] = 0;
- priv = !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags);
- if (priv)
- rl->elvpriv++;
+ /*
+ * Skip elevator data initialization for flush requests.
+ */
+ if (!(bio && (bio->bi_rw & (REQ_FLUSH | REQ_FUA)))) {
+ priv = !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags);
+ if (priv)
+ rl->elvpriv++;
+ }
if (blk_queue_io_stat(q))
rw_flags |= REQ_IO_STAT;
I ran with Tejun's suggestion of overloading @rw_flags when it became
clear he wasn't happy with the above (because it buried the FLUSH/FUA
special casing too deep in the allocator).
Regardless, other ideas for how to convey this info to get_request()
would be appreciated.
Mike
On Thu, Feb 03, 2011 at 02:38:20PM +0100, Tejun Heo wrote:
> Hey, Jens.
>
> On Thu, Feb 03, 2011 at 02:24:42PM +0100, Jens Axboe wrote:
> > On 2011-02-02 23:55, Mike Snitzer wrote:
> > > REQ_SORTED is not set for flush requests because they are never put on
> > > the IO scheduler.
> >
> > That looks very wrong. REQ_SORTED gets set _when_ the request is sorted
> > into the IO scheduler. This is gross misuse, a bad hack.
>
> The rationale behind the suggestion was that it indicates to the allocator
> that the request may be sorted, since how the request will be used is
> communicated to the allocator using @rw_flags. The patch is buggy in
> that the flag actually ends up on the request. Any better ideas on how
> to communicate it?
Though you did not like v1 of the patch, personally I also liked just
parsing the FLUSH or FUA flag in get_request().
Or how about introducing a helper function, blk_rq_should_init_elevator()
or something like that, which will parse the FLUSH/FUA flags?
Thanks
Vivek
Hello,
On Fri, Feb 04, 2011 at 10:04:57AM -0500, Vivek Goyal wrote:
> > The rationale behind suggesting was that it indicates to the allocator
> > that the request may be sorted as how the request will be used is
> > communicated using @rw_flags to the allocator. The patch is buggy
> > that the flag actually ends up on the request. Any better idea how to
> > communicate it?
>
> Though you did not like v1 of the patch, personally I also liked just
> parsing the FLUSH or FUA flag in get_request().
>
> Or how about introducing a helper function, blk_rq_should_init_elevator()
> or something like that, which will parse the FLUSH/FUA flags?
I suppose it's Jens' call, but if the FLUSH/FUA testing goes inside
the alloc function, please decorate it with a big fat comment and mention
it in the comment for the union definition too.
Thank you.
--
tejun
Skip elevator initialization for flush requests by passing priv=0 to
blk_alloc_request() in get_request(). As such, elv_set_request() is
never called for flush requests.
Signed-off-by: Mike Snitzer <[email protected]>
---
block/blk-core.c | 29 +++++++++++++++++++++++++----
1 file changed, 25 insertions(+), 4 deletions(-)
v5: introduces blk_rq_should_init_elevator() to abstract the elevator
initialization decision, promoting future reuse and clarity.
Index: linux-2.6/block/blk-core.c
===================================================================
--- linux-2.6.orig/block/blk-core.c
+++ linux-2.6/block/blk-core.c
@@ -753,6 +753,25 @@ static void freed_request(struct request
}
/*
+ * Determine if elevator data should be initialized when allocating the
+ * request associated with @bio.
+ */
+static bool blk_rq_should_init_elevator(struct bio *bio)
+{
+ if (!bio)
+ return true;
+
+ /*
+ * Flush requests do not use the elevator so skip initialization.
+ * This allows a request to share the flush and elevator data.
+ */
+ if (bio->bi_rw & (REQ_FLUSH | REQ_FUA))
+ return false;
+
+ return true;
+}
+
+/*
* Get a free request, queue_lock must be held.
* Returns NULL on failure, with queue_lock held.
* Returns !NULL on success, with queue_lock *not held*.
@@ -764,7 +783,7 @@ static struct request *get_request(struc
struct request_list *rl = &q->rq;
struct io_context *ioc = NULL;
const bool is_sync = rw_is_sync(rw_flags) != 0;
- int may_queue, priv;
+ int may_queue, priv = 0;
may_queue = elv_may_queue(q, rw_flags);
if (may_queue == ELV_MQUEUE_NO)
@@ -808,9 +827,11 @@ static struct request *get_request(struc
rl->count[is_sync]++;
rl->starved[is_sync] = 0;
- priv = !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags);
- if (priv)
- rl->elvpriv++;
+ if (blk_rq_should_init_elevator(bio)) {
+ priv = !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags);
+ if (priv)
+ rl->elvpriv++;
+ }
if (blk_queue_io_stat(q))
rw_flags |= REQ_IO_STAT;
On 2011-02-01 23:46, Mike Snitzer wrote:
> Flush requests are never put on the IO scheduler. Convert request
> structure's elevator_private* into an array and have the flush fields
> share a union with it.
>
> Reclaim the space lost in 'struct request' by moving 'completion_data'
> back in the union with 'rb_node'.
Thanks Mike, I've merged this one and v5 1/2 as well.
--
Jens Axboe