2002-10-31 13:37:12

by Jens Axboe

Subject: [patch] deadline-ioscheduler rb-tree sort

Hi,

This is something I whipped together the other day for testing the
overhead of the O(N) insert scan that the deadline io scheduler does. So
instead of using a plain linked list, I adapted it to use an rb tree,
since a generic implementation was already available in the kernel.

Results are quite promising; overhead is down a lot. I'm not pushing
this for inclusion, just sending it out so others can test, etc. The
patch is against 2.5.45.

===== drivers/block/ll_rw_blk.c 1.135 vs edited =====
--- 1.135/drivers/block/ll_rw_blk.c Mon Oct 28 20:57:59 2002
+++ edited/drivers/block/ll_rw_blk.c Thu Oct 31 14:31:09 2002
@@ -2175,8 +2183,8 @@
queue_nr_requests = (total_ram >> 9) & ~7;
if (queue_nr_requests < 16)
queue_nr_requests = 16;
- if (queue_nr_requests > 128)
- queue_nr_requests = 128;
+ if (queue_nr_requests > 512)
+ queue_nr_requests = 512;

batch_requests = queue_nr_requests / 8;
if (batch_requests > 8)
===== drivers/block/deadline-iosched.c 1.9 vs edited =====
--- 1.9/drivers/block/deadline-iosched.c Tue Oct 29 12:19:59 2002
+++ edited/drivers/block/deadline-iosched.c Thu Oct 31 10:48:09 2002
@@ -17,6 +17,7 @@
#include <linux/init.h>
#include <linux/compiler.h>
#include <linux/hash.h>
+#include <linux/rbtree.h>

/*
* feel free to try other values :-). read_expire value is the timeout for
@@ -33,7 +34,7 @@
*/
static int writes_starved = 2;

-static const int deadline_hash_shift = 8;
+static const int deadline_hash_shift = 10;
#define DL_HASH_BLOCK(sec) ((sec) >> 3)
#define DL_HASH_FN(sec) (hash_long(DL_HASH_BLOCK((sec)), deadline_hash_shift))
#define DL_HASH_ENTRIES (1 << deadline_hash_shift)
@@ -48,19 +49,20 @@
/*
* run time data
*/
- struct list_head sort_list[2]; /* sorted listed */
+ struct rb_root rb_list[2];
struct list_head read_fifo; /* fifo list */
struct list_head *dispatch; /* driver dispatch queue */
struct list_head *hash; /* request hash */
sector_t last_sector; /* last sector sent to drive */
unsigned long hash_valid_count; /* barrier hash count */
unsigned int starved; /* writes starved */
+ unsigned int rb_entries;

/*
* settings that change how the i/o scheduler behaves
*/
unsigned int fifo_batch;
- unsigned long read_expire;
+ unsigned int read_expire;
unsigned int seek_cost;
unsigned int writes_starved;
};
@@ -69,10 +71,24 @@
* pre-request data.
*/
struct deadline_rq {
- struct list_head fifo;
+ /*
+ * rbtree index, key is the starting offset
+ */
+ struct rb_node rb_node;
+ sector_t rb_key;
+
+ struct request *request;
+
+ /*
+ * request hash, key is the ending offset (for back merge lookup)
+ */
struct list_head hash;
unsigned long hash_valid_count;
- struct request *request;
+
+ /*
+ * expire fifo
+ */
+ struct list_head fifo;
unsigned long expires;
};

@@ -81,23 +97,23 @@
#define RQ_DATA(rq) ((struct deadline_rq *) (rq)->elevator_private)

/*
- * rq hash
+ * the back merge hash support functions
*/
-static inline void __deadline_del_rq_hash(struct deadline_rq *drq)
+static inline void __deadline_hash_del(struct deadline_rq *drq)
{
drq->hash_valid_count = 0;
list_del_init(&drq->hash);
}

#define ON_HASH(drq) (drq)->hash_valid_count
-static inline void deadline_del_rq_hash(struct deadline_rq *drq)
+static inline void deadline_hash_del(struct deadline_rq *drq)
{
if (ON_HASH(drq))
- __deadline_del_rq_hash(drq);
+ __deadline_hash_del(drq);
}

static inline void
-deadline_add_rq_hash(struct deadline_data *dd, struct deadline_rq *drq)
+deadline_hash_add(struct deadline_data *dd, struct deadline_rq *drq)
{
struct request *rq = drq->request;

@@ -109,33 +125,30 @@

#define list_entry_hash(ptr) list_entry((ptr), struct deadline_rq, hash)
static struct request *
-deadline_find_hash(struct deadline_data *dd, sector_t offset)
+deadline_hash_find(struct deadline_data *dd, sector_t offset)
{
struct list_head *hash_list = &dd->hash[DL_HASH_FN(offset)];
struct list_head *entry, *next = hash_list->next;
- struct deadline_rq *drq;
- struct request *rq = NULL;

while ((entry = next) != hash_list) {
+ struct deadline_rq *drq = list_entry_hash(entry);
+ struct request *__rq = drq->request;
+
next = entry->next;

- drq = list_entry_hash(entry);
-
BUG_ON(!drq->hash_valid_count);

- if (!rq_mergeable(drq->request)
+ if (!rq_mergeable(__rq)
|| drq->hash_valid_count != dd->hash_valid_count) {
- __deadline_del_rq_hash(drq);
+ __deadline_hash_del(drq);
continue;
}

- if (drq->request->sector + drq->request->nr_sectors == offset) {
- rq = drq->request;
- break;
- }
+ if (__rq->sector + __rq->nr_sectors == offset)
+ return __rq;
}

- return rq;
+ return NULL;
}

static sector_t deadline_get_last_sector(struct deadline_data *dd)
@@ -154,86 +167,135 @@
return last_sec;
}

+/*
+ * rb tree support functions
+ */
+#define RB_NONE (2)
+#define RB_EMPTY(root) ((root)->rb_node == NULL)
+#define ON_RB(node) ((node)->rb_color != RB_NONE)
+#define RB_CLEAR(node) ((node)->rb_color = RB_NONE)
+#define deadline_rb_entry(node) rb_entry((node), struct deadline_rq, rb_node)
+#define DRQ_RB_ROOT(dd, drq) (&(dd)->rb_list[rq_data_dir((drq)->request)])
+
+static inline int
+__deadline_rb_add(struct deadline_data *dd, struct deadline_rq *drq)
+{
+ struct rb_node **p = &DRQ_RB_ROOT(dd, drq)->rb_node;
+ struct rb_node *parent = NULL;
+ struct deadline_rq *__drq;
+
+ while (*p) {
+ parent = *p;
+ __drq = deadline_rb_entry(parent);
+
+ if (drq->rb_key < __drq->rb_key)
+ p = &(*p)->rb_left;
+ else if (drq->rb_key > __drq->rb_key)
+ p = &(*p)->rb_right;
+ else
+ return 1;
+ }
+
+ rb_link_node(&drq->rb_node, parent, p);
+ return 0;
+}
+
+static void deadline_rb_add(struct deadline_data *dd, struct deadline_rq *drq)
+{
+ drq->rb_key = drq->request->sector;
+
+ if (!__deadline_rb_add(dd, drq)) {
+ rb_insert_color(&drq->rb_node, DRQ_RB_ROOT(dd, drq));
+ dd->rb_entries++;
+ return;
+ }
+
+ /*
+ * this cannot happen
+ */
+ blk_dump_rq_flags(drq->request, "deadline_rb_add alias");
+ list_add_tail(&drq->request->queuelist, dd->dispatch);
+}
+
+static inline void
+deadline_rb_del(struct deadline_data *dd, struct deadline_rq *drq)
+{
+ if (ON_RB(&drq->rb_node)) {
+ rb_erase(&drq->rb_node, DRQ_RB_ROOT(dd, drq));
+ RB_CLEAR(&drq->rb_node);
+ dd->rb_entries--;
+ BUG_ON(dd->rb_entries < 0);
+ }
+}
+
+static struct request *
+deadline_rb_find(struct deadline_data *dd, sector_t sector, int data_dir)
+{
+ struct rb_node *n = dd->rb_list[data_dir].rb_node;
+ struct deadline_rq *drq;
+
+ while (n) {
+ drq = deadline_rb_entry(n);
+
+ if (sector < drq->rb_key)
+ n = n->rb_left;
+ else if (sector > drq->rb_key)
+ n = n->rb_right;
+ else
+ return drq->request;
+ }
+
+ return NULL;
+}
+
static int
deadline_merge(request_queue_t *q, struct list_head **insert, struct bio *bio)
{
struct deadline_data *dd = q->elevator.elevator_data;
- const int data_dir = bio_data_dir(bio);
- struct list_head *entry, *sort_list;
- struct request *__rq;
int ret = ELEVATOR_NO_MERGE;
+ struct request *__rq;

/*
* try last_merge to avoid going to hash
*/
ret = elv_try_last_merge(q, bio);
if (ret != ELEVATOR_NO_MERGE) {
- *insert = q->last_merge;
+ __rq = list_entry_rq(q->last_merge);
goto out;
}

/*
* see if the merge hash can satisfy a back merge
*/
- if ((__rq = deadline_find_hash(dd, bio->bi_sector))) {
+ if ((__rq = deadline_hash_find(dd, bio->bi_sector))) {
BUG_ON(__rq->sector + __rq->nr_sectors != bio->bi_sector);

if (elv_rq_merge_ok(__rq, bio)) {
- *insert = &__rq->queuelist;
ret = ELEVATOR_BACK_MERGE;
goto out;
}
}

/*
- * scan list from back to find insertion point.
+ * probably not worth the extra overhead
*/
- entry = sort_list = &dd->sort_list[data_dir];
- while ((entry = entry->prev) != sort_list) {
- __rq = list_entry_rq(entry);
-
- BUG_ON(__rq->flags & REQ_STARTED);
-
- if (!(__rq->flags & REQ_CMD))
- continue;
+ __rq = deadline_rb_find(dd, bio->bi_sector + bio_sectors(bio), bio_data_dir(bio));
+ if (__rq) {
+ BUG_ON(bio->bi_sector + bio_sectors(bio) != __rq->sector);

- /*
- * it's not necessary to break here, and in fact it could make
- * us loose a front merge. emperical evidence shows this to
- * be a big waste of cycles though, so quit scanning
- */
- if (!*insert && bio_rq_in_between(bio, __rq, sort_list)) {
- *insert = &__rq->queuelist;
- break;
- }
-
- if (__rq->flags & REQ_BARRIER)
- break;
-
- /*
- * checking for a front merge, hash will miss those
- */
- if (__rq->sector - bio_sectors(bio) == bio->bi_sector) {
- ret = elv_try_merge(__rq, bio);
- if (ret != ELEVATOR_NO_MERGE) {
- *insert = &__rq->queuelist;
- break;
- }
+ if (elv_rq_merge_ok(__rq, bio)) {
+ ret = ELEVATOR_FRONT_MERGE;
+ goto out;
}
}

- /*
- * no insertion point found, check the very front
- */
- if (!*insert && !list_empty(sort_list)) {
- __rq = list_entry_rq(sort_list->next);
-
- if (bio->bi_sector + bio_sectors(bio) < __rq->sector &&
- bio->bi_sector > deadline_get_last_sector(dd))
- *insert = sort_list;
+ *insert = NULL;
+out:
+ if (ret != ELEVATOR_NO_MERGE) {
+ *insert = &__rq->queuelist;
+ q->last_merge = &__rq->queuelist;
}

-out:
return ret;
}

@@ -242,8 +304,16 @@
struct deadline_data *dd = q->elevator.elevator_data;
struct deadline_rq *drq = RQ_DATA(req);

- deadline_del_rq_hash(drq);
- deadline_add_rq_hash(dd, drq);
+ deadline_hash_del(drq);
+ deadline_hash_add(dd, drq);
+
+ /*
+ * the merge was a front merge, we need to reposition request
+ */
+ if (req->sector != drq->rb_key) {
+ deadline_rb_del(dd, drq);
+ deadline_rb_add(dd, drq);
+ }

q->last_merge = &req->queuelist;
}
@@ -258,8 +328,13 @@
BUG_ON(!drq);
BUG_ON(!dnext);

- deadline_del_rq_hash(drq);
- deadline_add_rq_hash(dd, drq);
+ deadline_hash_del(drq);
+ deadline_hash_add(dd, drq);
+
+ if (req->sector != drq->rb_key) {
+ deadline_rb_del(dd, drq);
+ deadline_rb_add(dd, drq);
+ }

/*
* if dnext expires before drq, assign it's expire time to drq
@@ -274,16 +349,16 @@
}

/*
- * move request from sort list to dispatch queue. maybe remove from rq hash
- * here too?
+ * move request from sort list to dispatch queue.
*/
static inline void
deadline_move_to_dispatch(struct deadline_data *dd, struct request *rq)
{
struct deadline_rq *drq = RQ_DATA(rq);

- list_move_tail(&rq->queuelist, dd->dispatch);
+ deadline_rb_del(dd, drq);
list_del_init(&drq->fifo);
+ list_add_tail(&rq->queuelist, dd->dispatch);
}

/*
@@ -291,12 +366,12 @@
*/
static void deadline_move_requests(struct deadline_data *dd, struct request *rq)
{
- struct list_head *sort_head = &dd->sort_list[rq_data_dir(rq)];
sector_t last_sec = deadline_get_last_sector(dd);
int batch_count = dd->fifo_batch;
+ struct deadline_rq *drq = RQ_DATA(rq);

do {
- struct list_head *nxt = rq->queuelist.next;
+ struct rb_node *rbnext = rb_next(&drq->rb_node);
int this_rq_cost;

/*
@@ -308,7 +383,7 @@
/*
* if this is the last entry, don't bother doing accounting
*/
- if (nxt == sort_head)
+ if (rbnext == NULL)
break;

this_rq_cost = dd->seek_cost;
@@ -320,7 +395,8 @@
break;

last_sec = rq->sector + rq->nr_sectors;
- rq = list_entry_rq(nxt);
+ drq = deadline_rb_entry(rbnext);
+ rq = drq->request;
} while (1);
}

@@ -343,25 +419,11 @@
return 0;
}

-static struct request *deadline_next_request(request_queue_t *q)
+static int deadline_dispatch_requests(struct deadline_data *dd)
{
- struct deadline_data *dd = q->elevator.elevator_data;
+ const int writes = !RB_EMPTY(&dd->rb_list[WRITE]);
struct deadline_rq *drq;
- struct list_head *nxt;
- struct request *rq;
- int writes;
-
- /*
- * if still requests on the dispatch queue, just grab the first one
- */
- if (!list_empty(&q->queue_head)) {
-dispatch:
- rq = list_entry_rq(q->queue_head.next);
- dd->last_sector = rq->sector + rq->nr_sectors;
- return rq;
- }
-
- writes = !list_empty(&dd->sort_list[WRITE]);
+ int ret = 0;

/*
* if we have expired entries on the fifo list, move some to dispatch
@@ -370,19 +432,20 @@
if (writes && (dd->starved++ >= dd->writes_starved))
goto dispatch_writes;

- nxt = dd->read_fifo.next;
- drq = list_entry_fifo(nxt);
+ drq = list_entry_fifo(dd->read_fifo.next);
deadline_move_requests(dd, drq->request);
- goto dispatch;
+ ret = 1;
+ goto out;
}

- if (!list_empty(&dd->sort_list[READ])) {
+ if (!RB_EMPTY(&dd->rb_list[READ])) {
if (writes && (dd->starved++ >= dd->writes_starved))
goto dispatch_writes;

- nxt = dd->sort_list[READ].next;
- deadline_move_requests(dd, list_entry_rq(nxt));
- goto dispatch;
+ drq = deadline_rb_entry(rb_first(&dd->rb_list[READ]));
+ deadline_move_requests(dd, drq->request);
+ ret = 1;
+ goto out;
}

/*
@@ -391,14 +454,41 @@
*/
if (writes) {
dispatch_writes:
- nxt = dd->sort_list[WRITE].next;
- deadline_move_requests(dd, list_entry_rq(nxt));
+ drq = deadline_rb_entry(rb_first(&dd->rb_list[WRITE]));
+ deadline_move_requests(dd, drq->request);
dd->starved = 0;
- goto dispatch;
+ ret = 1;
}

- BUG_ON(!list_empty(&dd->sort_list[READ]));
- BUG_ON(writes);
+out:
+ return ret;
+}
+
+static struct request *deadline_next_request(request_queue_t *q)
+{
+ struct deadline_data *dd = q->elevator.elevator_data;
+ struct request *rq;
+
+ /*
+ * if there are still requests on the dispatch queue, grab the first one
+ */
+ if (!list_empty(&q->queue_head)) {
+dispatch:
+ rq = list_entry_rq(q->queue_head.next);
+ dd->last_sector = rq->sector + rq->nr_sectors;
+ return rq;
+ }
+
+ if (deadline_dispatch_requests(dd))
+ goto dispatch;
+
+ /*
+ * if we have entries on the read or write sorted list, its a bug
+ * if deadline_dispatch_requests() didn't move any
+ */
+ BUG_ON(!RB_EMPTY(&dd->rb_list[READ]));
+ BUG_ON(!RB_EMPTY(&dd->rb_list[WRITE]));
+
return NULL;
}

@@ -409,28 +499,23 @@
struct deadline_rq *drq = RQ_DATA(rq);
const int data_dir = rq_data_dir(rq);

- /*
- * flush hash on barrier insert, as not to allow merges before a
- * barrier.
- */
if (unlikely(rq->flags & REQ_BARRIER)) {
DL_INVALIDATE_HASH(dd);
q->last_merge = NULL;
}

- /*
- * add to sort list
- */
- if (!insert_here)
- insert_here = dd->sort_list[data_dir].prev;
+ if (unlikely(!(rq->flags & REQ_CMD))) {
+ if (!insert_here)
+ insert_here = dd->dispatch->prev;

- list_add(&rq->queuelist, insert_here);
-
- if (unlikely(!(rq->flags & REQ_CMD)))
+ list_add(&rq->queuelist, insert_here);
return;
+ }
+
+ deadline_rb_add(dd, drq);

if (rq_mergeable(rq)) {
- deadline_add_rq_hash(dd, drq);
+ deadline_hash_add(dd, drq);

if (!q->last_merge)
q->last_merge = &rq->queuelist;
@@ -443,15 +528,27 @@
drq->expires = jiffies + dd->read_expire;
list_add_tail(&drq->fifo, &dd->read_fifo);
}
+
+ /*
+ * heuristic to offload the time spent moving entries interrupt
+ * context. this happens when a driver queues a new request(s) from
+ * its isr
+ */
+#if 0
+ if (dd->rb_entries >= 8)
+ deadline_dispatch_requests(dd);
+#endif
}

static void deadline_remove_request(request_queue_t *q, struct request *rq)
{
+ struct deadline_data *dd = q->elevator.elevator_data;
struct deadline_rq *drq = RQ_DATA(rq);

if (drq) {
list_del_init(&drq->fifo);
- deadline_del_rq_hash(drq);
+ deadline_hash_del(drq);
+ deadline_rb_del(dd, drq);
}
}

@@ -459,8 +556,8 @@
{
struct deadline_data *dd = q->elevator.elevator_data;

- if (!list_empty(&dd->sort_list[WRITE]) ||
- !list_empty(&dd->sort_list[READ]) ||
+ if (!RB_EMPTY(&dd->rb_list[WRITE]) ||
+ !RB_EMPTY(&dd->rb_list[READ]) ||
!list_empty(&q->queue_head))
return 0;

@@ -473,7 +570,7 @@
{
struct deadline_data *dd = q->elevator.elevator_data;

- return &dd->sort_list[rq_data_dir(rq)];
+ return dd->dispatch;
}

static void deadline_exit(request_queue_t *q, elevator_t *e)
@@ -484,8 +581,8 @@
int i;

BUG_ON(!list_empty(&dd->read_fifo));
- BUG_ON(!list_empty(&dd->sort_list[READ]));
- BUG_ON(!list_empty(&dd->sort_list[WRITE]));
+ BUG_ON(!RB_EMPTY(&dd->rb_list[READ]));
+ BUG_ON(!RB_EMPTY(&dd->rb_list[WRITE]));

for (i = READ; i <= WRITE; i++) {
struct request_list *rl = &q->rq[i];
@@ -538,8 +635,8 @@
INIT_LIST_HEAD(&dd->hash[i]);

INIT_LIST_HEAD(&dd->read_fifo);
- INIT_LIST_HEAD(&dd->sort_list[READ]);
- INIT_LIST_HEAD(&dd->sort_list[WRITE]);
+ dd->rb_list[READ] = RB_ROOT;
+ dd->rb_list[WRITE] = RB_ROOT;
dd->dispatch = &q->queue_head;
dd->fifo_batch = fifo_batch;
dd->read_expire = read_expire;
@@ -567,6 +664,7 @@
memset(drq, 0, sizeof(*drq));
INIT_LIST_HEAD(&drq->fifo);
INIT_LIST_HEAD(&drq->hash);
+ RB_CLEAR(&drq->rb_node);
drq->request = rq;
rq->elevator_private = drq;
}


--
Jens Axboe


2002-11-01 11:27:51

by Jens Axboe

Subject: rbtree scores (was Re: [patch] deadline-ioscheduler rb-tree sort)

On Thu, Oct 31 2002, Jens Axboe wrote:
> Hi,
>
> This is something I whipped together the other day for testing the
> overhead of the O(N) insert scan that the deadline io scheduler does. So
> instead of using a plain linked list, I adapted it to use an rb tree,
> since a generic implementation was already available in the kernel.
>
> Results are quite promising; overhead is down a lot. I'm not pushing
> this for inclusion, just sending it out so others can test, etc. The
> patch is against 2.5.45.

I completed a couple of runs of tiobench and thought I would share a
few notes. The test was a 128-thread tiobench, using varying queue
sizes: 128 per list (the default), 512 per list, and 4096 per list. IO
performance generally increased with bigger queues; for now, let's just
look at the profiles, though.

Testing was done on a 9GB SCSI drive in an SMP machine with 512MB of
RAM. The file system used was ext2, and the kernel was 2.5.45-vanilla,
with and without the deadline-rbtree patch.

With 128 requests per list, the stock kernel has the following relevant
oprofile results:

c0236000 1972 0.0233741 deadline_merge
c0232cd0 683 0.00809559 __make_request
c0235f30 315 0.00373369 deadline_find_hash
c02364a0 283 0.0033544 deadline_add_request
c0236310 248 0.00293954 deadline_move_requests
c02363b0 196 0.00232319 deadline_next_request

The rbtree version has the following highscore list:

c0232cd0 560 0.00680737 __make_request
c02361d0 267 0.00324566 deadline_rb_find
c0235f80 234 0.00284451 deadline_hash_add
c0235fe0 225 0.0027351 deadline_hash_find
c0236100 204 0.00247983 deadline_rb_add
c0236660 157 0.00190849 deadline_next_request

No regression compared to the stock settings; in fact, it looks
pretty darn good.

With 512 requests per list, stock kernel scores:

c0236000 6094 0.0860452 deadline_merge
c0232cd0 714 0.0100814 __make_request
c0235f30 545 0.00769521 deadline_find_hash
c02364a0 307 0.00433474 deadline_add_request
c0236310 234 0.003304 deadline_move_requests
c02363b0 163 0.0023015 deadline_next_request

and rbtree version:

c0232cd0 665 0.00950867 __make_request
c0235fe0 350 0.00500456 deadline_hash_find
c02361d0 287 0.00410374 deadline_rb_find
c0235f80 236 0.0033745 deadline_hash_add
c0236100 197 0.00281685 deadline_rb_add
c0236660 121 0.00173015 deadline_next_request

The stock kernel shows worse behaviour with 512 requests than with 128;
the rbtree version is basically the same. The final test was with 4096
requests per list. Stock kernel highscore:

c0236000 11753 0.228824 deadline_merge
c0232cd0 707 0.0137649 __make_request
c0235f30 625 0.0121684 deadline_find_hash
c02364a0 309 0.00601604 deadline_add_request
c02363b0 134 0.0026089 deadline_next_request

and rbtree for 4096 requests:

c0232cd0 1152 0.0163495 __make_request
c0235fe0 931 0.013213 deadline_hash_find
c02361d0 523 0.00742254 deadline_rb_find
c0235f80 424 0.00601751 deadline_hash_add
c0236100 293 0.00415832 deadline_rb_add
c0236660 166 0.00235591 deadline_next_request

Now the back merge hash table is getting too small, obviously. It's
worth noting that with 4096 requests, deadline_merge in the stock
kernel is the biggest cpu user in the kernel apart from
copy_to/from_user and mark_offset_tsc.
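
As a back-of-the-envelope check on why the hash chains get long, here is
a bit of plain userspace C just to make the arithmetic explicit. The
constants are the ones from the patch above (deadline_hash_shift = 10,
i.e. DL_HASH_ENTRIES = 1024 buckets); the request counts are the three
queue sizes tested:

#include <stdio.h>

int main(void)
{
	const int deadline_hash_shift = 10;		/* from the patch */
	const int buckets = 1 << deadline_hash_shift;	/* DL_HASH_ENTRIES */
	const int requests[] = { 128, 512, 4096 };
	int i;

	for (i = 0; i < 3; i++)
		printf("%4d requests/list -> ~%d.%02d entries per bucket\n",
		       requests[i], requests[i] / buckets,
		       (requests[i] % buckets) * 100 / buckets);
	return 0;
}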

As expected, the stock version O(N) insertion scan really hurts. Even
with 128 requests per list, rbtree version is far superior. Once bigger
lists are used, there's just no comparison whatsoever.

Full profile numbers are available from:

http://www.kernel.org/pub/linux/kernel/people/axboe/misc/

--
Jens Axboe

2002-11-01 19:27:49

by Andrew Morton

Subject: Re: rbtree scores (was Re: [patch] deadline-ioscheduler rb-tree sort)

Jens Axboe wrote:
>
> As expected, the stock version O(N) insertion scan really hurts. Even
> with 128 requests per list, rbtree version is far superior. Once bigger
> lists are used, there's just no comparison whatsoever.
>

Jens, the tree just makes sense.

Remember that we added the blk_grow_request_list() thing to 2.4 for
the Dolphin SCI and other high-end hardware. High throughput, long
request queue, uh-oh. Probably they're not using the stock merge/insert
engine, but I bet someone will want to (Hi, Bill).

And we do know that for some classes of workloads, a larger request
queue is worth a 10x boost in throughput.

The length of the request queue is really a very important VM and
block parameter. Varying it will have much less impact on the VM than
it used to, but it is still there...

When we get on to making the block tunables tunable, request queue
length should be there, and we will be needing that tree.

Have I convinced you yet?

2002-11-01 20:49:19

by Jamie Lokier

Subject: Re: rbtree scores (was Re: [patch] deadline-ioscheduler rb-tree sort)

Andrew Morton wrote:
> Jens Axboe wrote:
> >
> > As expected, the stock version O(N) insertion scan really hurts. Even
> > with 128 requests per list, rbtree version is far superior. Once bigger
> > lists are used, there's just no comparison whatsoever.
> >
>
> Jens, the tree just makes sense.

Just a few comments about data structures - not important.

Technically I think that a priority queue, i.e. a heap (partially
ordered tree) is sufficient for the request queue. I don't know the
request queue code well enough to be sure, though.

Both heaps and trees are O(N log N) for N insertions, the difference
being that an rbtree does a bit more constant-time work to balance the
tree while maintaining a stricter ordering.

If it was worth it (I suspect not), you can make a data structure
which has O(1) amortised insertion time for a number of common cases,
such as runs of ascending block numbers. Seems a likely pattern for a
filesystem...

Implementing the latter would likely be a lot of work for little gain
though.
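
For concreteness, here is a minimal sketch of just the ascending-run
fast path, built on the kernel's generic rbtree (the types and names are
illustrative, not code from the patch): cache the rightmost node, and
when a new key is not smaller than the current maximum, link the new
node directly as that node's right child instead of descending from the
root. rb_insert_color() then does at most two rotations, so purely
ascending insertions are O(1) amortised. A real version would also have
to refresh the cached node on removal.

#include <linux/rbtree.h>
#include <linux/types.h>

struct sorted_rq {
	struct rb_node rb_node;
	sector_t key;			/* starting sector */
};

struct sorted_list {
	struct rb_root root;
	struct rb_node *rightmost;	/* cached maximum, NULL if empty */
};

static void sorted_insert(struct sorted_list *sl, struct sorted_rq *srq)
{
	struct rb_node **p = &sl->root.rb_node;
	struct rb_node *parent = NULL;
	struct sorted_rq *__srq;
	int new_max = 1;

	if (sl->rightmost) {
		__srq = rb_entry(sl->rightmost, struct sorted_rq, rb_node);
		if (srq->key >= __srq->key) {
			/* ascending run: hang the new node off the old
			 * maximum, no descent needed */
			parent = sl->rightmost;
			p = &parent->rb_right;
		} else {
			/* ordinary O(log n) descent */
			new_max = 0;
			while (*p) {
				parent = *p;
				__srq = rb_entry(parent, struct sorted_rq, rb_node);
				if (srq->key < __srq->key)
					p = &parent->rb_left;
				else
					p = &parent->rb_right;
			}
		}
	}

	rb_link_node(&srq->rb_node, parent, p);
	rb_insert_color(&srq->rb_node, &sl->root);
	if (new_max)
		sl->rightmost = &srq->rb_node;
}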

-- Jamie

2002-11-02 08:53:33

by Jens Axboe

Subject: Re: rbtree scores (was Re: [patch] deadline-ioscheduler rb-tree sort)

On Fri, Nov 01 2002, Jamie Lokier wrote:
> Andrew Morton wrote:
> > Jens Axboe wrote:
> > >
> > > As expected, the stock version O(N) insertion scan really hurts. Even
> > > with 128 requests per list, rbtree version is far superior. Once bigger
> > > lists are used, there's just no comparison whatsoever.
> > >
> >
> > Jens, the tree just makes sense.
>
> Just a few comments about data structures - not important.
>
> Technically I think that a priority queue, i.e. a heap (partially
> ordered tree) is sufficient for the request queue. I don't know the
> request queue code well enough to be sure, though.

I looked into that as well, and I do have a generic binomial heap
implementation that I plan on testing, too.

> If it was worth it (I suspect not), you can make a data structure
> which has O(1) amortised insertion time for a number of common cases,
> such as runs of ascending block numbers. Seems a likely pattern for a
> filesystem...

Fibonacci heaps, for instance. I looked into that as well. However, it's
actually more important to have (if possible) O(1) extraction than
insert. Extraction is typically run from interrupt context, when a
driver wants to requeue more requests because it has completed one (or
some). That was a worry with the rbtree solution. The linked list may
have had sucky O(N) insert, but extraction was a nice O(1). So far I
haven't been able to notice any regression in this area, regardless.
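
For reference, the extraction path in the rbtree version boils down to
something like the sketch below (a simplified standalone version, not
the patch's exact code; the node type is a stand-in for struct
deadline_rq). rb_first() walks at most the tree height, roughly
2 x log2(4096), about 24 pointer hops at the largest queue size tested,
and rb_erase() does at most three rotations, which is why no regression
shows up in the profiles.

#include <linux/rbtree.h>
#include <linux/types.h>

struct sketch_rq {
	struct rb_node rb_node;
	sector_t sector;
};

/* pull the request with the lowest starting sector (sector-sorted order) */
static struct sketch_rq *sketch_extract_first(struct rb_root *root)
{
	struct rb_node *n = rb_first(root);	/* leftmost node */
	struct sketch_rq *srq;

	if (!n)
		return NULL;

	srq = rb_entry(n, struct sketch_rq, rb_node);
	rb_erase(n, root);			/* at most three rotations */
	return srq;
}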

> Implementing the latter would likely be a lot of work for little gain
> though.

Indeed

--
Jens Axboe

2002-11-02 08:51:11

by Jens Axboe

Subject: Re: rbtree scores (was Re: [patch] deadline-ioscheduler rb-tree sort)

On Fri, Nov 01 2002, Andrew Morton wrote:
> Jens Axboe wrote:
> >
> > As expected, the stock version O(N) insertion scan really hurts. Even
> > with 128 requests per list, rbtree version is far superior. Once bigger
> > lists are used, there's just no comparison whatsoever.
> >
>
> Jens, the tree just makes sense.

Agree

> Remember that we added the blk_grow_request_list() thing to 2.4 for
> the Dolphin SCI and other high-end hardware. High throughput, long
> request queue, uh-oh. Probably they're not using the stock merge/insert
> engine, but I bet someone will want to (Hi, Bill).
>
> And we do know that for some classes of workloads, a larger request
> queue is worth a 10x boost in throughput.

Indeed

> The length of the request queue is really a very important VM and
> block parameter. Varying it will have much less impact on the VM than
> it used to, but it is still there...

Well yes, cue long story about vm flushing acting up with longer
queues...

> When we get on to making the block tunables tunable, request queue
> length should be there, and we will be needing that tree.
>
> Have I convinced you yet?

I think there are two sides to this. One is that some devices just don't
ever need long queues. For a floppy or cdrom it's just a huge waste of
memory; they should always be limited to a few entries. Some devices
might benefit from nice long queues, and we want to give them the option
of having them. That's the hardware side, and I suspect such hints are
best passed in by the drivers, probably by specifying a device class.

Then there's the vm side, where we don't want to waste large amounts of
memory on requests... So based on the class of a device, select a
default depth that fits with the current system.

But yes, definitely needs to be tweakable.
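
Just to make the idea concrete, something along these lines (purely
illustrative; the enum, helper, and numbers are hypothetical, not an
existing interface):

enum blk_queue_class {
	BLK_CLASS_SLOW,		/* floppy, cdrom: a handful of requests */
	BLK_CLASS_DISK,		/* ordinary disks */
	BLK_CLASS_HIGHEND,	/* arrays, SCI-attached storage, ... */
};

static int default_queue_depth(enum blk_queue_class class,
			       unsigned long total_ram_kb)
{
	switch (class) {
	case BLK_CLASS_SLOW:
		return 4;
	case BLK_CLASS_HIGHEND:
		/* long queues only where the vm can afford them */
		return total_ram_kb > 256 * 1024 ? 1024 : 256;
	case BLK_CLASS_DISK:
	default:
		return 128;
	}
}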

--
Jens Axboe

2002-11-02 20:15:46

by Jamie Lokier

Subject: Re: rbtree scores (was Re: [patch] deadline-ioscheduler rb-tree sort)

Jens Axboe wrote:
> > If it was worth it (I suspect not), you can make a data structure
> > which has O(1) amortised insertion time for a number of common cases,
> > such as runs of ascending block numbers. Seems a likely pattern for a
> > filesystem...
>
> Fibonacci heaps, for instance. I looked into that as well. However, it's
> actually more important to have (if possible) O(1) extraction than
> insert. Extraction is typically run from interrupt context, when a
> driver wants to requeue more requests because it has completed one (or
> some). That was a worry with the rbtree solution. The linked list may
> have had sucky O(N) insert, but extraction was a nice O(1). So far I
> haven't been able to notice any regression in this area, regardless.

There's a skip list variant which offers O(1) extraction from the head
of the list, and probabilistic O(log n) insertion, fwiw.
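
Rough shape of that head extraction, for the curious (a plain C sketch
of a generic skip list, nothing kernel-specific): unlinking the first
node only touches the levels that node participates in, which is
expected O(1) for the usual geometric level distribution.

#include <stddef.h>

#define SKIP_MAX_LEVEL	16

struct skip_node {
	unsigned long long key;			/* e.g. starting sector */
	int level;				/* number of forward links */
	struct skip_node *next[SKIP_MAX_LEVEL];
};

struct skip_list {
	struct skip_node head;			/* sentinel, spans all levels */
};

/* extract the smallest key: always the node right after the sentinel */
static struct skip_node *skip_pop_min(struct skip_list *sl)
{
	struct skip_node *min = sl->head.next[0];
	int i;

	if (!min)
		return NULL;

	/* the first node is also first on every level it appears on */
	for (i = 0; i < min->level; i++)
		sl->head.next[i] = min->next[i];

	return min;
}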

-- Jamie