2003-05-30 21:53:04

by Neil Schemenauer

Subject: [PATCH][CFT] new IO scheduler for 2.4.20

The major benefit of this patch is that read latency is much lower while
lots of writes are occurring. On my machine, running:

while :; do dd if=/dev/zero of=foo bs=1M count=1000 conv=notrunc; done

makes 2.4.20 unusable. With this patch the "write bomb" causes no
particular problems.
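
(A rough way to put a number on "unusable", offered only as an
illustration and not part of the patch: time a small uncached read while
the dd loop above is running. A minimal sketch in C; the file name is
hypothetical and the file should be big enough that it is not already in
the page cache.)

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/time.h>

/* read_timer.c -- reads a file sequentially and reports the elapsed
 * wall-clock time; on stock 2.4.20 this balloons while the write bomb
 * runs, while with the patch it should stay near the idle figure. */
int main(int argc, char **argv)
{
	char buf[64 * 1024];
	struct timeval t0, t1;
	long ms;
	const char *path = argc > 1 ? argv[1] : "/tmp/readtest";
	int fd = open(path, O_RDONLY);

	if (fd < 0) {
		perror(path);
		return 1;
	}
	gettimeofday(&t0, NULL);
	while (read(fd, buf, sizeof(buf)) > 0)
		;	/* just pull the data through the block layer */
	gettimeofday(&t1, NULL);
	ms = (t1.tv_sec - t0.tv_sec) * 1000 + (t1.tv_usec - t0.tv_usec) / 1000;
	printf("%s: read in %ld ms\n", path, ms);
	close(fd);
	return 0;
}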

With this version of the patch I've improved the bulk read performance
of the elevator. The bonnie++ results are now:

                 ------Sequential Output------ --Sequential Input--
                 -Per Chr- --Block-- -Rewrite- -Per Chr- --Block--
           Size  K/sec %CP K/sec %CP K/sec %CP K/sec %CP K/sec %CP
2.4.20       1G  13001  97 34939  18 13034   7 12175  92 34112  14
2.4.20-nas   1G  12923  98 36471  17 13340   8 10809  83 35569  13

Note that the "rewrite" and "per-char read" stats are slightly bogus for
2.4.20-nas. Reads get a boost in priority over writes, and when the
"per-char read" test starts there is still some writing happening from
the rewrite test. I think the net effect is that the "rewrite" number is
too high and the "per-char read" number is too low.

I would be very pleased if someone could run some tests using bonnie,
contest, or their other favorite benchmarks and post the results.

Neil


diff -u -ur linux-2.4.20/Makefile linux-iosched-2/Makefile
--- linux-2.4.20/Makefile 2003-04-14 14:47:20.000000000 -0400
+++ linux-iosched-2/Makefile 2003-05-30 17:27:16.000000000 -0400
@@ -1,7 +1,7 @@
 VERSION = 2
 PATCHLEVEL = 4
 SUBLEVEL = 20
-EXTRAVERSION =
+EXTRAVERSION = -nas
 
 KERNELRELEASE=$(VERSION).$(PATCHLEVEL).$(SUBLEVEL)$(EXTRAVERSION)

diff -u -ur linux-2.4.20/drivers/block/elevator.c linux-iosched-2/drivers/block/elevator.c
--- linux-2.4.20/drivers/block/elevator.c 2003-04-14 14:47:22.000000000 -0400
+++ linux-iosched-2/drivers/block/elevator.c 2003-05-30 17:28:57.000000000 -0400
@@ -74,6 +74,81 @@
 	return 0;
 }
 
+int elevator_neil_merge(request_queue_t *q, struct request **req,
+			struct list_head * head,
+			struct buffer_head *bh, int rw,
+			int max_sectors)
+{
+	struct list_head *entry = &q->queue_head;
+	struct request *__rq;
+	unsigned int count = bh->b_size >> 9, ret = ELEVATOR_NO_MERGE;
+	unsigned int reads = 0, writes = 0;
+	/* XXX make tunable? */
+	unsigned int expire_time = jiffies - 1*HZ;
+
+	if (list_empty(&q->queue_head))
+		goto out;
+
+	/* try to merge requests, fall back to ordering them by sector */
+	while ((entry = entry->prev) != head) {
+		__rq = blkdev_entry_to_request(entry);
+
+		if (__rq->elevator_sequence <= 0)
+			break;
+		if (__rq->cmd == READ)
+			++reads;
+		else
+			++writes;
+		if (__rq->rq_dev != bh->b_rdev)
+			continue;
+		if (__rq->waiting)
+			continue;
+		if (__rq->cmd != rw)
+			continue;
+		if (time_before(__rq->start_time, expire_time))
+			break;
+		if (bh->b_rsector > __rq->sector)
+			*req = __rq;
+		if (__rq->nr_sectors + count > max_sectors)
+			continue;
+		if (__rq->sector + __rq->nr_sectors == bh->b_rsector) {
+			ret = ELEVATOR_BACK_MERGE;
+			*req = __rq;
+			goto out;
+		} else if (__rq->sector - count == bh->b_rsector) {
+			ret = ELEVATOR_FRONT_MERGE;
+			*req = __rq;
+			goto out;
+		}
+	}
+	if (!*req && rw == READ) {
+		int extra_writes = writes - reads;
+		/*
+		 * If there are more writes than reads in the queue then put
+		 * read requests ahead of the extra writes. This prevents
+		 * writes from starving reads.
+		 */
+		entry = q->queue_head.prev;
+		while (extra_writes > 0 && entry != head) {
+			__rq = blkdev_entry_to_request(entry);
+			if (__rq->cmd == WRITE)
+				--extra_writes;
+			else if (time_before(__rq->start_time, expire_time))
+				break;
+			entry = entry->prev;
+		}
+		*req = blkdev_entry_to_request(entry);
+	}
+out:
+	return ret;
+}
+
+void elevator_neil_merge_req(struct request *req, struct request *next)
+{
+	if (time_before(next->start_time, req->start_time))
+		req->start_time = next->start_time;
+}
+
 
 int elevator_linus_merge(request_queue_t *q, struct request **req,
 			 struct list_head * head,
diff -u -ur linux-2.4.20/drivers/block/ll_rw_blk.c linux-iosched-2/drivers/block/ll_rw_blk.c
--- linux-2.4.20/drivers/block/ll_rw_blk.c 2003-04-14 14:47:22.000000000 -0400
+++ linux-iosched-2/drivers/block/ll_rw_blk.c 2003-05-30 17:27:16.000000000 -0400
@@ -480,7 +480,7 @@
 void blk_init_queue(request_queue_t * q, request_fn_proc * rfn)
 {
 	INIT_LIST_HEAD(&q->queue_head);
-	elevator_init(&q->elevator, ELEVATOR_LINUS);
+	elevator_init(&q->elevator, ELEVATOR_NEIL);
 	blk_init_free_list(q);
 	q->request_fn = rfn;
 	q->back_merge_fn = ll_back_merge_fn;
@@ -922,7 +922,8 @@
 			rw = READ;	/* drop into READ */
 		case READ:
 		case WRITE:
-			latency = elevator_request_latency(elevator, rw);
+			/* latency = elevator_request_latency(elevator, rw); */
+			latency = 1;
 			break;
 		default:
 			BUG();
diff -u -ur linux-2.4.20/include/linux/elevator.h linux-iosched-2/include/linux/elevator.h
--- linux-2.4.20/include/linux/elevator.h 2003-04-14 14:47:24.000000000 -0400
+++ linux-iosched-2/include/linux/elevator.h 2003-05-30 17:27:16.000000000 -0400
@@ -31,6 +31,9 @@
 void elevator_linus_merge_cleanup(request_queue_t *, struct request *, int);
 void elevator_linus_merge_req(struct request *, struct request *);
 
+int elevator_neil_merge(request_queue_t *, struct request **, struct list_head *, struct buffer_head *, int, int);
+void elevator_neil_merge_req(struct request *, struct request *);
+
 typedef struct blkelv_ioctl_arg_s {
 	int queue_ID;
 	int read_latency;
@@ -101,3 +104,12 @@
 })
 
 #endif
+
+#define ELEVATOR_NEIL						\
+((elevator_t) {							\
+	0,				/* read_latency */	\
+	0,				/* write_latency */	\
+								\
+	elevator_neil_merge,		/* elevator_merge_fn */	\
+	elevator_neil_merge_req,	/* elevator_merge_req_fn */ \
+	})


2003-05-30 23:26:11

by Con Kolivas

Subject: Re: [PATCH][CFT] new IO scheduler for 2.4.20

On Sat, 31 May 2003 08:09, Neil Schemenauer wrote:
> The major benefit of this patch is that read latency is much lower while
> lots of writes are occuring. On my machine, running:
>
> while :; do dd if=/dev/zero of=foo bs=1M count=1000 conv=notrunc; done
>
> makes 2.4.20 unusable. With this patch the "write bomb" causes no
> particular problems.
>
> With this version of the patch I've improved the bulk read performance
> of the elevator. The bonnie++ results are now:
>
> -Per Chr- --Block-- -Rewrite- -Per Chr- --Block--
> Size K/sec %CP K/sec %CP K/sec %CP K/sec %CP K/sec %CP
> 2.4.20 1G 13001 97 34939 18 13034 7 12175 92 34112 14
> 2.4.20-nas 1G 12923 98 36471 17 13340 8 10809 83 35569 13
>
> Note that the "rewrite" and "per-char read" stats are slightly bogus for
> 2.4.20-nas. Reads get a boost in priority over writes. When the
> "per-char read" test has started there is still some writing happening
> from the rewrite test. I think the net effect is that the "rewrite"
> number is too high and the "per-char read" number is too low.
>
> I would be very pleased if someone could run some tests on using bonnie,
> contest, or their other favorite benchmarks and post the results.

Nice to see 2.4 getting some attention. I'll try and get around to contesting
it.

How does this compare to akpm's read-latency2 patch that he posted some
time ago? That seems to make a massive difference but was knocked back
for style or approach.

Con

2003-05-31 00:36:24

by Neil Schemenauer

Subject: Re: [PATCH][CFT] new IO scheduler for 2.4.20

Con Kolivas wrote:
> How does this compare to akpm's read-latency2 patch that he posed some
> time ago? That seems to make a massive difference but was knocked back
> for style or approach.

It looks like they do fairly similar things. Andrew's patch puts
unmergeable read requests at a fixed distance from the front of the
queue. My patch lets unmerged reads skip max((writes - reads), 0)
requests. That's probably more fair when lots of reads and writes are
in the queue.
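
(A standalone restatement of that rule, as an illustration only; this
helper exists in neither patch:)

/* Illustration: a new, unmergeable read is promoted past the queue's
 * surplus of writes over reads, i.e. up to max(writes - reads, 0) of
 * the queued write requests. */
static inline unsigned int writes_to_skip(unsigned int reads,
					  unsigned int writes)
{
	return writes > reads ? writes - reads : 0;
}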

Andrew's idea of always allowing a merge is probably a good one and
could be adopted.

My patch uses a fixed deadline for requests (similar to Jens' deadline
scheduler). I'm not sure whether that's an advantage or not. Note that
the deadlines of writes are ignored when inserting a read.
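
(Restated from the patch above for clarity; this helper is not in the
patch and would sit next to elevator_neil_merge() in elevator.c:)

/* A request counts as expired once it has been queued for more than
 * one second (the hard-coded 1*HZ above).  When a new read is being
 * inserted, the patch only ever applies this test to queued reads,
 * never to queued writes, which is why write deadlines are ignored at
 * read insertion time. */
static inline int request_expired(struct request *rq)
{
	return time_before(rq->start_time, jiffies - 1*HZ);
}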

I didn't change the size of the request queue. I can't find where that
gets set in 2.4.20. :-(

Sorry for the hand-waving. I didn't know about Andrew's patch and I
obviously haven't done enough testing yet.

Neil

2003-05-31 00:45:34

by Robert Love

Subject: Re: [PATCH][CFT] new IO scheduler for 2.4.20

On Sat, 2003-05-31 at 00:52, Neil Schemenauer wrote:

> I didn't change the size of the request queue. I can't find where that
> gets set in 2.4.20. :-(

in drivers/block/ll_rw_blk.c :: blk_init_free_list()

I think the main reason (aside from memory consumption) 2.4 has such
small queues is to minimize read latency. If your patch fixes that, you
can probably get away with larger queues, which will help merging. But
then you definitely want to look into always merging reads, like Andrew
does.

A maximum of ~1000 request entries seems nice if reads otherwise really
do get a boost.
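
(A back-of-the-envelope illustration of why queue depth and read
boosting go together -- my numbers, not Robert's: the 255-sector request
size and the stock ~128 request count are assumptions, and the ~35 MB/s
write rate is taken from the bonnie++ block-write figures above:)

#include <stdio.h>

/* Rough worst-case backlog a FIFO read would sit behind, as a function
 * of queue depth, assuming every queued request is a full 255-sector
 * (~127 KB) write draining at ~35 MB/s. */
int main(void)
{
	const double mb_per_rq = 255.0 * 512.0 / (1024.0 * 1024.0);
	const double write_mb_per_sec = 35.0;
	const int depths[] = { 128, 1024 };
	int i;

	for (i = 0; i < 2; i++) {
		double backlog_mb = depths[i] * mb_per_rq;
		printf("%4d requests: ~%3.0f MB queued, ~%.1f s of write "
		       "backlog ahead of a read\n",
		       depths[i], backlog_mb, backlog_mb / write_mb_per_sec);
	}
	return 0;
}

With ~128 requests that is roughly half a second of writes ahead of a
read; at ~1000 it is several seconds, which is why bigger queues only
make sense once reads can jump past queued writes.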

Robert Love

2003-06-02 17:11:23

by Andreas Dilger

Subject: Re: [PATCH][CFT] new IO scheduler for 2.4.20

On Sat, 31 May 2003 08:09, Neil Schemenauer wrote:
> The major benefit of this patch is that read latency is much lower while
> lots of writes are occuring. On my machine, running:
>
> while :; do dd if=/dev/zero of=foo bs=1M count=1000 conv=notrunc; done
>
> makes 2.4.20 unusable. With this patch the "write bomb" causes no
> particular problems.
>
> With this version of the patch I've improved the bulk read performance
> of the elevator. The bonnie++ results are now:
>
> -Per Chr- --Block-- -Rewrite- -Per Chr- --Block--
> Size K/sec %CP K/sec %CP K/sec %CP K/sec %CP K/sec %CP
> 2.4.20 1G 13001 97 34939 18 13034 7 12175 92 34112 14
> 2.4.20-nas 1G 12923 98 36471 17 13340 8 10809 83 35569 13
>
> Note that the "rewrite" and "per-char read" stats are slightly bogus for
> 2.4.20-nas. Reads get a boost in priority over writes. When the
> "per-char read" test has started there is still some writing happening
> from the rewrite test. I think the net effect is that the "rewrite"
> number is too high and the "per-char read" number is too low.
>
> I would be very pleased if someone could run some tests on using bonnie,
> contest, or their other favorite benchmarks and post the results.

In local testing we hit the following problem at boot:

SCSI device sda: 35565080 512-byte hdwr sectors (18209 MB)
Partition check:
sda:elevator returned crap (-1069467520)
------------[ cut here ]------------
kernel BUG at ll_rw_blk.c:1043!
invalid operand: 0000

CPU: 0
EIP: 0060:[<c01d40ab>] Not tainted
EFLAGS: 00010086

EIP is at __make_request [kernel] 0x3bb
(2.4.20-9smp-l18-b_devel-30-05-2003-fercher-nas)
eax: 00000025 ebx: c466a2e0 ecx: 00000001 edx: 00000001
esi: 00000000 edi: c464b638 ebp: c44c9d30 esp: c44c9c90
ds: 0068 es: 0068 ss: 0068
Process swapper (pid: 1, stackpage=c44c9000)
Stack: c02c7ea6 c0413880 c464b640 c463b880 00000400 00000000 00000002 00000002
       00000000 c44c8000 00000000 c0413880 c03e1fc4 c011ac62 c44c9d10 c0413880
       00000000 c44c9cec c020707b c4623a38 c4623a38 c4695980 c4652400 c4623a00
Call Trace: [<c011ac62>] schedule [kernel] 0x362 (0xc44c9cc4))
[<c020707b>] scsi_insert_special_req [kernel] 0x1b (0xc44c9cd8))
[<c020723a>] scsi_queue_next_request [kernel] 0x4a (0xc44c9d04))
[<c01d43f9>] generic_make_request [kernel] 0x119 (0xc44c9d34))
[<c01d445f>] submit_bh [kernel] 0x4f (0xc44c9d68))
[<c014f140>] block_read_full_page [kernel] 0x2e0 (0xc44c9d84))
[<c012411d>] bh_action [kernel] 0x4d (0xc44c9dc8))
[<c0123fc4>] tasklet_hi_action [kernel] 0x74 (0xc44c9dd4))
[<c0135981>] add_to_page_cache_unique [kernel] 0x81 (0xc44c9e14))
[<c0138459>] read_cache_page [kernel] 0x89 (0xc44c9e30))
[<c0152a20>] blkdev_get_block [kernel] 0x0 (0xc44c9e38))
[<c014367c>] __alloc_pages [kernel] 0x7c (0xc44c9e44))
[<c0174d46>] read_dev_sector [kernel] 0x36 (0xc44c9e5c))
[<c0152ab0>] blkdev_readpage [kernel] 0x0 (0xc44c9e68))
[<c01756e9>] handle_ide_mess [kernel] 0x29 (0xc44c9e90))
[<c01758c5>] msdos_partition [kernel] 0x65 (0xc44c9ebc))
[<c0163cfc>] new_inode [kernel] 0xc (0xc44c9ed0))
[<c0174b4f>] check_partition [kernel] 0xff (0xc44c9ef0))
[<c02236fa>] sd_init_onedisk [kernel] 0x7ba (0xc44c9f24))
[<c0174cd2>] grok_partitions [kernel] 0xc2 (0xc44c9f5c))
[<c0223d76>] sd_finish [kernel] 0x106 (0xc44c9f80))
[<c0201985>] scsi_register_device_module [kernel] 0x155 (0xc44c9fa8))
[<c0105094>] init [kernel] 0x34 (0xc44c9fdc))
[<c0105060>] init [kernel] 0x0 (0xc44c9fe0))
[<c0107459>] kernel_thread_helper [kernel] 0x5 (0xc44c9ff0))


Code: 0f 0b 13 04 9a 7e 2c c0 59 5f 8b 95 74 ff ff ff 85 d2 74 15
<0>Kernel panic: Attempted to kill init!

Cheers, Andreas
--
Andreas Dilger
http://sourceforge.net/projects/ext2resize/
http://www-mddsp.enel.ucalgary.ca/People/adilger/