2005-10-19 12:36:58

by Tejun Heo

Subject: [PATCH linux-2.6-block:master] blk: reimplement elevator switch

Hello, Jens.

This patch reimplements the elevator switch. It assumes the generic
dispatch queue patchset is applied.

* Each request is tagged with the REQ_ELVPRIV flag if it has its
elevator private data set.
* Requests which don't have the REQ_ELVPRIV flag set never enter the
iosched; they are always inserted directly into the dispatch queue.
Of course, elevator_put_req_fn is called only for requests which
have REQ_ELVPRIV set.
* The request queue maintains the current number of requests which
have their elevator data set (elevator_set_req_fn called) in
q->rq.elvpriv.
* If a request queue has QUEUE_FLAG_BYPASS set, elevator private data
is not allocated for new requests.

To switch to another iosched, we set QUEUE_FLAG_BYPASS and wait until
elvpriv drops to zero; then we attach the new iosched and clear
QUEUE_FLAG_BYPASS. The new implementation is much simpler and the main
code paths are less cluttered, IMHO.
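
In outline, the allocation path and the switch sequence boil down to
roughly the following (a condensed sketch of the hunks below; locking
and error handling trimmed):

        /* request allocation: skip iosched private data while bypassing */
        priv = !test_bit(QUEUE_FLAG_BYPASS, &q->queue_flags);
        if (priv)
                rl->elvpriv++;
        ...
        rq = blk_alloc_request(q, rw, bio, priv, gfp_mask); /* sets REQ_ELVPRIV iff priv */

        /* elevator switch: drain the old iosched, then swap */
        set_bit(QUEUE_FLAG_BYPASS, &q->queue_flags);
        while (q->elevator->ops->elevator_dispatch_fn(q, 1))
                ;               /* flush the iosched into the dispatch queue */
        while (q->rq.elvpriv)
                msleep(100);    /* wait for elv-private requests to finish */
        /* ... exit old elevator, attach new_e ... */
        clear_bit(QUEUE_FLAG_BYPASS, &q->queue_flags);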

Signed-off-by: Tejun Heo <[email protected]>

Index: blk-fixes/drivers/block/elevator.c
===================================================================
--- blk-fixes.orig/drivers/block/elevator.c 2005-10-17 03:57:33.000000000 +0900
+++ blk-fixes/drivers/block/elevator.c 2005-10-17 21:13:18.000000000 +0900
@@ -34,6 +34,7 @@
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/compiler.h>
+#include <linux/delay.h>

#include <asm/uaccess.h>

@@ -135,13 +136,7 @@ static int elevator_attach(request_queue
memset(eq, 0, sizeof(*eq));
eq->ops = &e->ops;
eq->elevator_type = e;
-
- INIT_LIST_HEAD(&q->queue_head);
- q->last_merge = NULL;
q->elevator = eq;
- q->last_sector = 0;
- q->boundary_rq = NULL;
- q->max_back_kb = 0;

if (eq->ops->elevator_init_fn)
ret = eq->ops->elevator_init_fn(q, eq);
@@ -190,6 +185,12 @@ int elevator_init(request_queue_t *q, ch
struct elevator_queue *eq;
int ret = 0;

+ INIT_LIST_HEAD(&q->queue_head);
+ q->last_merge = NULL;
+ q->last_sector = 0;
+ q->boundary_rq = NULL;
+ q->max_back_kb = 0;
+
elevator_setup_default();

if (!name)
@@ -353,23 +354,14 @@ void __elv_add_request(request_queue_t *
q->last_sector = rq_last_sector(rq);
q->boundary_rq = rq;
}
- }
+ } else if (!(rq->flags & REQ_ELVPRIV) && where == ELEVATOR_INSERT_SORT)
+ where = ELEVATOR_INSERT_BACK;

if (plug)
blk_plug_device(q);

rq->q = q;

- if (unlikely(test_bit(QUEUE_FLAG_DRAIN, &q->queue_flags))) {
- /*
- * if drain is set, store the request "locally". when the drain
- * is finished, the requests will be handed ordered to the io
- * scheduler
- */
- list_add_tail(&rq->queuelist, &q->drain_list);
- return;
- }
-
switch (where) {
case ELEVATOR_INSERT_FRONT:
rq->flags |= REQ_SOFTBARRIER;
@@ -676,25 +668,36 @@ EXPORT_SYMBOL_GPL(elv_unregister);
* switch to new_e io scheduler. be careful not to introduce deadlocks -
* we don't free the old io scheduler, before we have allocated what we
* need for the new one. this way we have a chance of going back to the old
- * one, if the new one fails init for some reason. we also do an intermediate
- * switch to noop to ensure safety with stack-allocated requests, since they
- * don't originate from the block layer allocator. noop is safe here, because
- * it never needs to touch the elevator itself for completion events. DRAIN
- * flags will make sure we don't touch it for additions either.
+ * one, if the new one fails init for some reason.
*/
static void elevator_switch(request_queue_t *q, struct elevator_type *new_e)
{
- elevator_t *e = kmalloc(sizeof(elevator_t), GFP_KERNEL);
- struct elevator_type *noop_elevator = NULL;
- elevator_t *old_elevator;
+ elevator_t *old_elevator, *e;

+ /*
+ * Allocate new elevator
+ */
+ e = kmalloc(sizeof(elevator_t), GFP_KERNEL);
if (!e)
goto error;

/*
- * first step, drain requests from the block freelist
+ * Turn on BYPASS and drain all requests w/ elevator private data
*/
- blk_wait_queue_drained(q, 0);
+ spin_lock_irq(q->queue_lock);
+
+ set_bit(QUEUE_FLAG_BYPASS, &q->queue_flags);
+
+ while (q->elevator->ops->elevator_dispatch_fn(q, 1))
+ ;
+
+ while (q->rq.elvpriv) {
+ spin_unlock_irq(q->queue_lock);
+ msleep(100);
+ spin_lock_irq(q->queue_lock);
+ }
+
+ spin_unlock_irq(q->queue_lock);

/*
* unregister old elevator data
@@ -703,18 +706,6 @@ static void elevator_switch(request_queu
old_elevator = q->elevator;

/*
- * next step, switch to noop since it uses no private rq structures
- * and doesn't allocate any memory for anything. then wait for any
- * non-fs requests in-flight
- */
- noop_elevator = elevator_get("noop");
- spin_lock_irq(q->queue_lock);
- elevator_attach(q, noop_elevator, e);
- spin_unlock_irq(q->queue_lock);
-
- blk_wait_queue_drained(q, 1);
-
- /*
* attach and start new elevator
*/
if (elevator_attach(q, new_e, e))
@@ -724,11 +715,10 @@ static void elevator_switch(request_queu
goto fail_register;

/*
- * finally exit old elevator and start queue again
+ * finally exit old elevator and turn off BYPASS.
*/
elevator_exit(old_elevator);
- blk_finish_queue_drain(q);
- elevator_put(noop_elevator);
+ clear_bit(QUEUE_FLAG_BYPASS, &q->queue_flags);
return;

fail_register:
@@ -737,13 +727,13 @@ fail_register:
* one again (along with re-adding the sysfs dir)
*/
elevator_exit(e);
+ e = NULL;
fail:
q->elevator = old_elevator;
elv_register_queue(q);
- blk_finish_queue_drain(q);
+ clear_bit(QUEUE_FLAG_BYPASS, &q->queue_flags);
+ kfree(e);
error:
- if (noop_elevator)
- elevator_put(noop_elevator);
elevator_put(new_e);
printk(KERN_ERR "elevator: switch to %s failed\n",new_e->elevator_name);
}
Index: blk-fixes/drivers/block/ll_rw_blk.c
===================================================================
--- blk-fixes.orig/drivers/block/ll_rw_blk.c 2005-10-17 03:57:33.000000000 +0900
+++ blk-fixes/drivers/block/ll_rw_blk.c 2005-10-17 21:13:18.000000000 +0900
@@ -263,8 +263,6 @@ void blk_queue_make_request(request_queu
blk_queue_bounce_limit(q, BLK_BOUNCE_HIGH);

blk_queue_activity_fn(q, NULL, NULL);
-
- INIT_LIST_HEAD(&q->drain_list);
}

EXPORT_SYMBOL(blk_queue_make_request);
@@ -1050,6 +1048,7 @@ static char *rq_flags[] = {
"REQ_STARTED",
"REQ_DONTPREP",
"REQ_QUEUED",
+ "REQ_ELVPRIV",
"REQ_PC",
"REQ_BLOCK_PC",
"REQ_SENSE",
@@ -1640,9 +1639,9 @@ static int blk_init_free_list(request_qu

rl->count[READ] = rl->count[WRITE] = 0;
rl->starved[READ] = rl->starved[WRITE] = 0;
+ rl->elvpriv = 0;
init_waitqueue_head(&rl->wait[READ]);
init_waitqueue_head(&rl->wait[WRITE]);
- init_waitqueue_head(&rl->drain);

rl->rq_pool = mempool_create_node(BLKDEV_MIN_RQ, mempool_alloc_slab,
mempool_free_slab, request_cachep, q->node);
@@ -1785,12 +1784,14 @@ EXPORT_SYMBOL(blk_get_queue);

static inline void blk_free_request(request_queue_t *q, struct request *rq)
{
- elv_put_request(q, rq);
+ if (rq->flags & REQ_ELVPRIV)
+ elv_put_request(q, rq);
mempool_free(rq, q->rq.rq_pool);
}

static inline struct request *
-blk_alloc_request(request_queue_t *q, int rw, struct bio *bio, int gfp_mask)
+blk_alloc_request(request_queue_t *q, int rw, struct bio *bio,
+ int priv, int gfp_mask)
{
struct request *rq = mempool_alloc(q->rq.rq_pool, gfp_mask);

@@ -1803,11 +1804,15 @@ blk_alloc_request(request_queue_t *q, in
*/
rq->flags = rw;

- if (!elv_set_request(q, rq, bio, gfp_mask))
- return rq;
+ if (priv) {
+ if (unlikely(elv_set_request(q, rq, bio, gfp_mask))) {
+ mempool_free(rq, q->rq.rq_pool);
+ return NULL;
+ }
+ rq->flags |= REQ_ELVPRIV;
+ }

- mempool_free(rq, q->rq.rq_pool);
- return NULL;
+ return rq;
}

/*
@@ -1863,22 +1868,18 @@ static void __freed_request(request_queu
* A request has just been released. Account for it, update the full and
* congestion status, wake up any waiters. Called under q->queue_lock.
*/
-static void freed_request(request_queue_t *q, int rw)
+static void freed_request(request_queue_t *q, int rw, int priv)
{
struct request_list *rl = &q->rq;

rl->count[rw]--;
+ if (priv)
+ rl->elvpriv--;

__freed_request(q, rw);

if (unlikely(rl->starved[rw ^ 1]))
__freed_request(q, rw ^ 1);
-
- if (!rl->count[READ] && !rl->count[WRITE]) {
- smp_mb();
- if (unlikely(waitqueue_active(&rl->drain)))
- wake_up(&rl->drain);
- }
}

#define blkdev_free_rq(list) list_entry((list)->next, struct request, queuelist)
@@ -1893,9 +1894,7 @@ static struct request *get_request(reque
struct request *rq = NULL;
struct request_list *rl = &q->rq;
struct io_context *ioc = current_io_context(GFP_ATOMIC);
-
- if (unlikely(test_bit(QUEUE_FLAG_DRAIN, &q->queue_flags)))
- goto out;
+ int priv;

if (rl->count[rw]+1 >= q->nr_requests) {
/*
@@ -1940,9 +1939,14 @@ get_rq:
rl->starved[rw] = 0;
if (rl->count[rw] >= queue_congestion_on_threshold(q))
set_queue_congested(q, rw);
+
+ priv = !test_bit(QUEUE_FLAG_BYPASS, &q->queue_flags);
+ if (priv)
+ rl->elvpriv++;
+
spin_unlock_irq(q->queue_lock);

- rq = blk_alloc_request(q, rw, bio, gfp_mask);
+ rq = blk_alloc_request(q, rw, bio, priv, gfp_mask);
if (!rq) {
/*
* Allocation failed presumably due to memory. Undo anything
@@ -1952,7 +1956,7 @@ get_rq:
* wait queue, but this is pretty rare.
*/
spin_lock_irq(q->queue_lock);
- freed_request(q, rw);
+ freed_request(q, rw, priv);

/*
* in the very unlikely event that allocation failed and no
@@ -2468,11 +2472,12 @@ static void __blk_put_request(request_qu
*/
if (rl) {
int rw = rq_data_dir(req);
+ int priv = req->flags & REQ_ELVPRIV;

BUG_ON(!list_empty(&req->queuelist));

blk_free_request(q, req);
- freed_request(q, rw);
+ freed_request(q, rw, priv);
}
}

@@ -2800,97 +2805,6 @@ static inline void blk_partition_remap(s
}
}

-void blk_finish_queue_drain(request_queue_t *q)
-{
- struct request_list *rl = &q->rq;
- struct request *rq;
- int requeued = 0;
-
- spin_lock_irq(q->queue_lock);
- clear_bit(QUEUE_FLAG_DRAIN, &q->queue_flags);
-
- while (!list_empty(&q->drain_list)) {
- rq = list_entry_rq(q->drain_list.next);
-
- list_del_init(&rq->queuelist);
- elv_requeue_request(q, rq);
- requeued++;
- }
-
- if (requeued)
- q->request_fn(q);
-
- spin_unlock_irq(q->queue_lock);
-
- wake_up(&rl->wait[0]);
- wake_up(&rl->wait[1]);
- wake_up(&rl->drain);
-}
-
-static int wait_drain(request_queue_t *q, struct request_list *rl, int dispatch)
-{
- int wait = rl->count[READ] + rl->count[WRITE];
-
- if (dispatch)
- wait += !list_empty(&q->queue_head);
-
- return wait;
-}
-
-/*
- * We rely on the fact that only requests allocated through blk_alloc_request()
- * have io scheduler private data structures associated with them. Any other
- * type of request (allocated on stack or through kmalloc()) should not go
- * to the io scheduler core, but be attached to the queue head instead.
- */
-void blk_wait_queue_drained(request_queue_t *q, int wait_dispatch)
-{
- struct request_list *rl = &q->rq;
- DEFINE_WAIT(wait);
-
- spin_lock_irq(q->queue_lock);
- set_bit(QUEUE_FLAG_DRAIN, &q->queue_flags);
-
- while (wait_drain(q, rl, wait_dispatch)) {
- prepare_to_wait(&rl->drain, &wait, TASK_UNINTERRUPTIBLE);
-
- if (wait_drain(q, rl, wait_dispatch)) {
- __generic_unplug_device(q);
- spin_unlock_irq(q->queue_lock);
- io_schedule();
- spin_lock_irq(q->queue_lock);
- }
-
- finish_wait(&rl->drain, &wait);
- }
-
- spin_unlock_irq(q->queue_lock);
-}
-
-/*
- * block waiting for the io scheduler being started again.
- */
-static inline void block_wait_queue_running(request_queue_t *q)
-{
- DEFINE_WAIT(wait);
-
- while (unlikely(test_bit(QUEUE_FLAG_DRAIN, &q->queue_flags))) {
- struct request_list *rl = &q->rq;
-
- prepare_to_wait_exclusive(&rl->drain, &wait,
- TASK_UNINTERRUPTIBLE);
-
- /*
- * re-check the condition. avoids using prepare_to_wait()
- * in the fast path (queue is running)
- */
- if (test_bit(QUEUE_FLAG_DRAIN, &q->queue_flags))
- io_schedule();
-
- finish_wait(&rl->drain, &wait);
- }
-}
-
static void handle_bad_sector(struct bio *bio)
{
char b[BDEVNAME_SIZE];
@@ -2986,8 +2900,6 @@ end_io:
if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)))
goto end_io;

- block_wait_queue_running(q);
-
/*
* If this device has partitions, remap block n
* of partition p to block n+start(p) of the disk.
Index: blk-fixes/include/linux/blkdev.h
===================================================================
--- blk-fixes.orig/include/linux/blkdev.h 2005-10-17 03:57:33.000000000 +0900
+++ blk-fixes/include/linux/blkdev.h 2005-10-17 21:13:34.000000000 +0900
@@ -107,9 +107,9 @@ typedef void (rq_end_io_fn)(struct reque
struct request_list {
int count[2];
int starved[2];
+ int elvpriv;
mempool_t *rq_pool;
wait_queue_head_t wait[2];
- wait_queue_head_t drain;
};

#define BLK_MAX_CDB 16
@@ -211,6 +211,7 @@ enum rq_flag_bits {
__REQ_STARTED, /* drive already may have started this one */
__REQ_DONTPREP, /* don't call prep for this one */
__REQ_QUEUED, /* uses queueing */
+ __REQ_ELVPRIV, /* elevator private data attached */
/*
* for ATA/ATAPI devices
*/
@@ -244,6 +245,7 @@ enum rq_flag_bits {
#define REQ_STARTED (1 << __REQ_STARTED)
#define REQ_DONTPREP (1 << __REQ_DONTPREP)
#define REQ_QUEUED (1 << __REQ_QUEUED)
+#define REQ_ELVPRIV (1 << __REQ_ELVPRIV)
#define REQ_PC (1 << __REQ_PC)
#define REQ_BLOCK_PC (1 << __REQ_BLOCK_PC)
#define REQ_SENSE (1 << __REQ_SENSE)
@@ -414,8 +416,6 @@ struct request_queue
unsigned int sg_reserved_size;
int node;

- struct list_head drain_list;
-
/*
* reserved for flush operations
*/
@@ -443,7 +443,7 @@ enum {
#define QUEUE_FLAG_DEAD 5 /* queue being torn down */
#define QUEUE_FLAG_REENTER 6 /* Re-entrancy avoidance */
#define QUEUE_FLAG_PLUGGED 7 /* queue is plugged */
-#define QUEUE_FLAG_DRAIN 8 /* draining queue for sched switch */
+#define QUEUE_FLAG_BYPASS 8 /* don't use elevator, just do FIFO */
#define QUEUE_FLAG_FLUSH 9 /* doing barrier flush sequence */

#define blk_queue_plugged(q) test_bit(QUEUE_FLAG_PLUGGED, &(q)->queue_flags)
@@ -655,8 +655,6 @@ extern void blk_dump_rq_flags(struct req
extern void generic_unplug_device(request_queue_t *);
extern void __generic_unplug_device(request_queue_t *);
extern long nr_blockdev_pages(void);
-extern void blk_wait_queue_drained(request_queue_t *, int);
-extern void blk_finish_queue_drain(request_queue_t *);

int blk_get_queue(request_queue_t *);
request_queue_t *blk_alloc_queue(int gfp_mask);


2005-10-20 12:24:19

by Jens Axboe

Subject: Re: [PATCH linux-2.6-block:master] blk: reimplement elevator switch

On Wed, Oct 19 2005, Tejun Heo wrote:
> Hello, Jens.
>
> This patch reimplements the elevator switch. It assumes the generic
> dispatch queue patchset is applied.
>
> * Each request is tagged with the REQ_ELVPRIV flag if it has its
> elevator private data set.
> * Requests which don't have the REQ_ELVPRIV flag set never enter the
> iosched; they are always inserted directly into the dispatch queue.
> Of course, elevator_put_req_fn is called only for requests which
> have REQ_ELVPRIV set.
> * The request queue maintains the current number of requests which
> have their elevator data set (elevator_set_req_fn called) in
> q->rq.elvpriv.
> * If a request queue has QUEUE_FLAG_BYPASS set, elevator private data
> is not allocated for new requests.
>
> To switch to another iosched, we set QUEUE_FLAG_BYPASS and wait until
> elvpriv drops to zero; then we attach the new iosched and clear
> QUEUE_FLAG_BYPASS. The new implementation is much simpler and the main
> code paths are less cluttered, IMHO.

Wonderful! Applied as-is, I didn't make any changes to this one. I agree
it's much cleaner than the previous approach, both in the code and in
killing the request_queue and request_list members.

I'm going to make a few little tweaks:

- The naming: QUEUE_FLAG_BYPASS isn't really clear. I don't know what
this means without looking at specific parts of the code. Testing the
same flag in various locations would also be preferable to
passing priv around and cluttering the function parameters; however, we
should split the queue flags a little for this, basically into an
atomic and a non-atomic part. So I'll leave that alone for now.

- The msleep(100) seems a little too slow. With the switching being more
efficient now, in 100 msec we can complete lots of requests.

--
Jens Axboe

2005-10-20 13:54:51

by Tejun Heo

Subject: Re: [PATCH linux-2.6-block:master] blk: reimplement elevator switch

On Thu, Oct 20, 2005 at 02:25:05PM +0200, Jens Axboe wrote:
> On Wed, Oct 19 2005, Tejun Heo wrote:
> > Hello, Jens.
> >
> > This patch reimplements the elevator switch. It assumes the generic
> > dispatch queue patchset is applied.
> >
> > * Each request is tagged with the REQ_ELVPRIV flag if it has its
> > elevator private data set.
> > * Requests which don't have the REQ_ELVPRIV flag set never enter the
> > iosched; they are always inserted directly into the dispatch queue.
> > Of course, elevator_put_req_fn is called only for requests which
> > have REQ_ELVPRIV set.
> > * The request queue maintains the current number of requests which
> > have their elevator data set (elevator_set_req_fn called) in
> > q->rq.elvpriv.
> > * If a request queue has QUEUE_FLAG_BYPASS set, elevator private data
> > is not allocated for new requests.
> >
> > To switch to another iosched, we set QUEUE_FLAG_BYPASS and wait until
> > elvpriv drops to zero; then we attach the new iosched and clear
> > QUEUE_FLAG_BYPASS. The new implementation is much simpler and the main
> > code paths are less cluttered, IMHO.
>
> Wonderful! Applied as-is, I didn't make any changes to this one. I agree
> it's much cleaner than the previous approach, both in the code and in
> killing the request_queue and request_list members.
>
> I'm going to make a few little tweaks:
>
> - The naming: QUEUE_FLAG_BYPASS isn't really clear. I don't know what
> this means without looking at specific parts of the code. Testing the

It means bypassing the ioscheds and going directly to the dispatch queue.
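
Concretely, it's this check that the patch adds to __elv_add_request():
a request without elevator private data is never sort-inserted into the
iosched, it gets redirected to back insertion on the dispatch queue:

        } else if (!(rq->flags & REQ_ELVPRIV) && where == ELEVATOR_INSERT_SORT)
                where = ELEVATOR_INSERT_BACK;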

> same flag in various locations would also be preferable to
> passing priv around and cluttering the function parameters; however, we
> should split the queue flags a little for this, basically into an
> atomic and a non-atomic part. So I'll leave that alone for now.

Hmmm...

> - The msleep(100) seems a little too slow. With the switching being more
> efficient now, in 100 msec we can complete lots of requests.

Yeap, agreed.

--
tejun

2005-10-20 14:06:49

by Jens Axboe

Subject: Re: [PATCH linux-2.6-block:master] blk: reimplement elevator switch

On Thu, Oct 20 2005, Tejun Heo wrote:
> On Thu, Oct 20, 2005 at 02:25:05PM +0200, Jens Axboe wrote:
> > On Wed, Oct 19 2005, Tejun Heo wrote:
> > > Hello, Jens.
> > >
> > > This patch reimplements the elevator switch. It assumes the generic
> > > dispatch queue patchset is applied.
> > >
> > > * Each request is tagged with the REQ_ELVPRIV flag if it has its
> > > elevator private data set.
> > > * Requests which don't have the REQ_ELVPRIV flag set never enter the
> > > iosched; they are always inserted directly into the dispatch queue.
> > > Of course, elevator_put_req_fn is called only for requests which
> > > have REQ_ELVPRIV set.
> > > * The request queue maintains the current number of requests which
> > > have their elevator data set (elevator_set_req_fn called) in
> > > q->rq.elvpriv.
> > > * If a request queue has QUEUE_FLAG_BYPASS set, elevator private data
> > > is not allocated for new requests.
> > >
> > > To switch to another iosched, we set QUEUE_FLAG_BYPASS and wait until
> > > elvpriv drops to zero; then we attach the new iosched and clear
> > > QUEUE_FLAG_BYPASS. The new implementation is much simpler and the main
> > > code paths are less cluttered, IMHO.
> >
> > Wonderful! Applied as-is, I didn't make any changes to this one. I agree
> > it's much cleaner than the previous approach, both in the code and in
> > killing the request_queue and request_list members.
> >
> > I'm going to make a few little tweaks:
> >
> > - The naming: QUEUE_FLAG_BYPASS isn't really clear. I don't know what
> > this means without looking at specific parts of the code. Testing the
>
> It means bypassing the ioscheds and going directly to the dispatch queue.

Certainly :-) I renamed it to QUEUE_FLAG_ELVSWITCH for now; if another
use comes of this, we can give it an appropriately generic name then.
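
In blkdev.h the define then reads something like:

        #define QUEUE_FLAG_ELVSWITCH 8 /* don't use elevator, just do FIFO */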

> > same flag in various locations would also be preferable to
> > passing priv around and cluttering the function parameters; however, we
> > should split the queue flags a little for this, basically into an
> > atomic and a non-atomic part. So I'll leave that alone for now.
>
> Hmmm...

There's room for optimization there: in lots of places we check (and
set) queue flags inside the queue lock, so we don't need to use the
atomic bit operations for those. But we cannot safely mix the two either.
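
Something along these lines, for example (hypothetical helpers, names
made up, just to illustrate the split):

        /*
         * Non-atomic variants, only valid for flags that are always
         * modified with q->queue_lock held.
         */
        static inline void queue_flag_set_locked(unsigned int flag,
                                                 request_queue_t *q)
        {
                __set_bit(flag, &q->queue_flags);
        }

        static inline void queue_flag_clear_locked(unsigned int flag,
                                                   request_queue_t *q)
        {
                __clear_bit(flag, &q->queue_flags);
        }

        /* flags touched outside the lock keep using set_bit()/clear_bit() */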

> > - The msleep(100) seems a little too slow. With the switching being more
> > efficient now, in 100 msec we can complete lots of requests.
>
> Yeap, agreed.

Cool

--
Jens Axboe