Date: Wed, 25 Sep 2002 19:20:24 +0200
From: Jens Axboe <axboe@suse.de>
To: Linux Kernel <linux-kernel@vger.kernel.org>
Subject: [PATCH] deadline io scheduler
Message-ID: <20020925172024.GH15479@suse.de>
Mime-Version: 1.0
Content-Type: multipart/mixed; boundary="IJpNTDwzlM2Ie8A6"
Content-Disposition: inline
Sender: linux-kernel-owner@vger.kernel.org
Content-Length: 18092
Lines: 679


--IJpNTDwzlM2Ie8A6
Content-Type: text/plain; charset=us-ascii
Content-Disposition: inline

Hi,

Due to recent "problems" (well the vm being just too damn good at keep
disks busy these days), it's become even more apparent that our current
io scheduler just cannot cope with some work loads. Repeated starvartion
of reads is the most important one. The Andrew Morton Interactive
Workload (AMIW) [1] rates the current kernel poorly, on my test machine
it completes in 1-2 minutes depending on your luck. 2.5.38-BK does a lot
better, but mainly because it's being extremely unfair. This deadline io
scheduler finishes the AMIW in anywhere from ~0.5 seconds to ~3-4
seconds, depending on the io load.

I'd like folks to give it a test spin. Make two kernels, a 2.5.38
pristine and a 2.5.38 with this patch applied. Now beat on each of them,
while listening to mp3's. Or read mails and change folders. Or anything
else that gives you a feel for the interactiveness of the machine. Then
report your findings. I'm interested in _anything_.

There are a few tunables, but I'd suggest trying the defaults first.
Then expirement with these two:

static int read_expire = HZ / 2;

This defines the read expire time, current default is 500ms.

static int writes_starved = 2;

This defines how many times reads can starve writes. 2 means that we can
do two rounds of reads for 1 write.

If you are curious how deadline-iosched works, search lkml archives for
previous announcements. I might make a new one if there's any
interesting in a big detailed analysis, since there has been some
changes since last release.

[1] Flush lots of stuff to disk (I start a dbench xxx, or do a dd
if=/dev/zero of=test_file bs=64k), and then time a cat dir/*.c where
dir/ holds lots of source files.

-- 
Jens Axboe


--IJpNTDwzlM2Ie8A6
Content-Type: text/plain; charset=us-ascii
Content-Disposition: attachment; filename=deadline-iosched-11

# This is a BitKeeper generated patch for the following project:
# Project Name: Linux kernel tree
# This patch format is intended for GNU patch command version 2.5 or higher.
# This patch includes the following deltas:
#	           ChangeSet	1.607   -> 1.608  
#	include/linux/elevator.h	1.11    -> 1.12   
#	drivers/block/ll_rw_blk.c	1.109   -> 1.110  
#	drivers/block/Makefile	1.9     -> 1.10   
#	               (new)	        -> 1.1     drivers/block/deadline-iosched.c
#
# The following is the BitKeeper ChangeSet Log
# --------------------------------------------
# 02/09/25	axboe@burns.home.kernel.dk	1.608
# deadline io scheduler
# --------------------------------------------
#
diff -Nru a/drivers/block/Makefile b/drivers/block/Makefile
--- a/drivers/block/Makefile	Wed Sep 25 19:16:26 2002
+++ b/drivers/block/Makefile	Wed Sep 25 19:16:26 2002
@@ -9,9 +9,9 @@
 #
 
 export-objs	:= elevator.o ll_rw_blk.o loop.o genhd.o acsi.o \
-		   block_ioctl.o
+		   block_ioctl.o deadline-iosched.o
 
-obj-y	:= elevator.o ll_rw_blk.o blkpg.o genhd.o block_ioctl.o
+obj-y	:= elevator.o ll_rw_blk.o blkpg.o genhd.o block_ioctl.o deadline-iosched.o
 
 obj-$(CONFIG_MAC_FLOPPY)	+= swim3.o
 obj-$(CONFIG_BLK_DEV_FD)	+= floppy.o
diff -Nru a/drivers/block/deadline-iosched.c b/drivers/block/deadline-iosched.c
--- /dev/null	Wed Dec 31 16:00:00 1969
+++ b/drivers/block/deadline-iosched.c	Wed Sep 25 19:16:26 2002
@@ -0,0 +1,557 @@
+/*
+ *  linux/drivers/block/deadline-iosched.c
+ *
+ *  Deadline i/o scheduler.
+ *
+ *  Copyright (C) 2002 Jens Axboe <axboe@suse.de>
+ */
+#include <linux/kernel.h>
+#include <linux/fs.h>
+#include <linux/blkdev.h>
+#include <linux/elevator.h>
+#include <linux/bio.h>
+#include <linux/blk.h>
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/compiler.h>
+#include <linux/hash.h>
+
+/*
+ * feel free to try other values :-). read_expire value is the timeout for
+ * reads, our goal is to start a request "around" the time when it expires.
+ * fifo_batch is how many steps along the sorted list we will take when the
+ * front fifo request expires.
+ */
+static int read_expire = HZ / 2;	/* 500ms start timeout */
+static int fifo_batch = 64;		/* 4 seeks, or 64 contig */
+static int seek_cost = 16;		/* seek is 16 times more expensive */
+
+/*
+ * how many times reads are allowed to starve writes
+ */
+static int writes_starved = 2;
+
+static const int deadline_hash_shift = 8;
+#define DL_HASH_BLOCK(sec)	((sec) >> 3)
+#define DL_HASH_FN(sec)		(hash_long(DL_HASH_BLOCK((sec)), deadline_hash_shift))
+#define DL_HASH_ENTRIES		(1 << deadline_hash_shift)
+
+#define DL_INVALIDATE_HASH(dd)				\
+	do {						\
+		if (!++(dd)->hash_valid_count)		\
+			(dd)->hash_valid_count = 1;	\
+	} while (0)
+
+struct deadline_data {
+	/*
+	 * run time data
+	 */
+	struct list_head sort_list[2];	/* sorted listed */
+	struct list_head read_fifo;	/* fifo list */
+	struct list_head *dispatch;	/* driver dispatch queue */
+	struct list_head *hash;		/* request hash */
+	sector_t last_sector;		/* last sector sent to drive */
+	unsigned long hash_valid_count;	/* barrier hash count */
+	unsigned int starved;		/* writes starved */
+
+	/*
+	 * settings that change how the i/o scheduler behaves
+	 */
+	unsigned int fifo_batch;
+	unsigned long read_expire;
+	unsigned int seek_cost;
+	unsigned int writes_starved;
+};
+
+/*
+ * pre-request data.
+ */
+struct deadline_rq {
+	struct list_head fifo;
+	struct list_head hash;
+	unsigned long hash_valid_count;
+	struct request *request;
+	unsigned long expires;
+};
+
+static kmem_cache_t *drq_pool;
+
+#define RQ_DATA(rq)	((struct deadline_rq *) (rq)->elevator_private)
+
+/*
+ * rq hash
+ */
+static inline void __deadline_del_rq_hash(struct deadline_rq *drq)
+{
+	drq->hash_valid_count = 0;
+	list_del_init(&drq->hash);
+}
+
+#define ON_HASH(drq)	(drq)->hash_valid_count
+static inline void deadline_del_rq_hash(struct deadline_rq *drq)
+{
+	if (ON_HASH(drq))
+		__deadline_del_rq_hash(drq);
+}
+
+static inline void
+deadline_add_rq_hash(struct deadline_data *dd, struct deadline_rq *drq)
+{
+	struct request *rq = drq->request;
+
+	BUG_ON(ON_HASH(drq));
+
+	drq->hash_valid_count = dd->hash_valid_count;
+	list_add(&drq->hash, &dd->hash[DL_HASH_FN(rq->sector +rq->nr_sectors)]);
+}
+
+#define list_entry_hash(ptr)	list_entry((ptr), struct deadline_rq, hash)
+static struct request *
+deadline_find_hash(struct deadline_data *dd, sector_t offset)
+{
+	struct list_head *hash_list = &dd->hash[DL_HASH_FN(offset)];
+	struct list_head *entry, *next = hash_list->next;
+	struct deadline_rq *drq;
+	struct request *rq = NULL;
+
+	while ((entry = next) != hash_list) {
+		next = entry->next;
+		drq = list_entry_hash(entry);
+
+		BUG_ON(!drq->hash_valid_count);
+
+		if (!rq_mergeable(drq->request)
+		    || drq->hash_valid_count != dd->hash_valid_count) {
+			__deadline_del_rq_hash(drq);
+			continue;
+		}
+
+		if (drq->request->sector + drq->request->nr_sectors == offset) {
+			rq = drq->request;
+			break;
+		}
+	}
+
+	return rq;
+}
+
+static int
+deadline_merge(request_queue_t *q, struct request **req, struct bio *bio)
+{
+	struct deadline_data *dd = q->elevator.elevator_data;
+	const int data_dir = bio_data_dir(bio);
+	struct list_head *entry, *sort_list;
+	struct deadline_rq *drq;
+	struct request *__rq;
+	int ret = ELEVATOR_NO_MERGE;
+
+	/*
+	 * try last_merge to avoid going to hash
+	 */
+	ret = elv_try_last_merge(q, req, bio);
+	if (ret != ELEVATOR_NO_MERGE)
+		goto out;
+
+	/*
+	 * see if the merge hash can satisfy a back merge
+	 */
+	if ((__rq = deadline_find_hash(dd, bio->bi_sector))) {
+		BUG_ON(__rq->sector + __rq->nr_sectors != bio->bi_sector);
+
+		if (elv_rq_merge_ok(__rq, bio)) {
+			*req = __rq;
+			q->last_merge = &__rq->queuelist;
+			ret = ELEVATOR_BACK_MERGE;
+			goto out_ret;
+		}
+	}
+
+	entry = sort_list = &dd->sort_list[data_dir];
+	while ((entry = entry->prev) != sort_list) {
+		__rq = list_entry_rq(entry);
+		drq = RQ_DATA(__rq);
+
+		BUG_ON(__rq->flags & REQ_STARTED);
+
+		if (!(__rq->flags & REQ_CMD))
+			continue;
+
+		if (!*req && bio_rq_in_between(bio, __rq, sort_list))
+			*req = __rq;
+
+		if (__rq->flags & REQ_BARRIER)
+			break;
+
+		/*
+		 * checking for a front merge, hash will miss those
+		 */
+		if (__rq->sector - bio_sectors(bio) == bio->bi_sector) {
+			ret = elv_try_merge(__rq, bio);
+			if (ret != ELEVATOR_NO_MERGE) {
+				*req = __rq;
+				q->last_merge = &__rq->queuelist;
+				break;
+			}
+		}
+	}
+
+out:
+	if (ret != ELEVATOR_NO_MERGE) {
+		struct deadline_rq *drq = RQ_DATA(*req);
+
+		deadline_del_rq_hash(drq);
+		deadline_add_rq_hash(dd, drq);
+	}
+out_ret:
+	return ret;
+}
+
+static void
+deadline_merge_request(request_queue_t *q, struct request *req, struct request *next)
+{
+	struct deadline_data *dd = q->elevator.elevator_data;
+	struct deadline_rq *drq = RQ_DATA(req);
+	struct deadline_rq *dnext = RQ_DATA(next);
+
+	BUG_ON(!drq);
+	BUG_ON(!dnext);
+
+	deadline_del_rq_hash(drq);
+	deadline_add_rq_hash(dd, drq);
+
+	/*
+	 * if dnext expires before drq, assign it's expire time to drq
+	 * and move into dnext position (dnext will be deleted) in fifo
+	 */
+	if (!list_empty(&drq->fifo) && !list_empty(&dnext->fifo)) {
+		if (time_before(dnext->expires, drq->expires)) {
+			list_move(&drq->fifo, &dnext->fifo);
+			drq->expires = dnext->expires;
+		}
+	}
+}
+
+/*
+ * move request from sort list to dispatch queue. maybe remove from rq hash
+ * here too?
+ */
+static inline void
+deadline_move_to_dispatch(struct deadline_data *dd, struct request *rq)
+{
+	struct deadline_rq *drq = RQ_DATA(rq);
+
+	list_move_tail(&rq->queuelist, dd->dispatch);
+	list_del_init(&drq->fifo);
+}
+
+/*
+ * move along sort list and move entries to dispatch queue, starting from rq
+ */
+static void deadline_move_requests(struct deadline_data *dd, struct request *rq)
+{
+	struct list_head *sort_head = &dd->sort_list[rq_data_dir(rq)];
+	sector_t last_sec = dd->last_sector;
+	int batch_count = dd->fifo_batch;
+
+	do {
+		struct list_head *nxt = rq->queuelist.next;
+
+		/*
+		 * take it off the sort and fifo list, move
+		 * to dispatch queue
+		 */
+		deadline_move_to_dispatch(dd, rq);
+
+		if (rq->sector == last_sec)
+			batch_count--;
+		else
+			batch_count -= dd->seek_cost;
+
+		if (nxt == sort_head)
+			break;
+
+		last_sec = rq->sector + rq->nr_sectors;
+		rq = list_entry_rq(nxt);
+	} while (batch_count > 0);
+}
+
+/*
+ * returns 0 if there are no expired reads on the fifo, 1 otherwise
+ */
+#define list_entry_fifo(ptr)	list_entry((ptr), struct deadline_rq, fifo)
+static inline int deadline_check_fifo(struct deadline_data *dd)
+{
+	struct deadline_rq *drq;
+
+	if (list_empty(&dd->read_fifo))
+		return 0;
+
+	drq = list_entry_fifo(dd->read_fifo.next);
+	if (time_before(jiffies, drq->expires))
+		return 0;
+
+	return 1;
+}
+
+static struct request *deadline_next_request(request_queue_t *q)
+{
+	struct deadline_data *dd = q->elevator.elevator_data;
+	struct deadline_rq *drq;
+	struct list_head *nxt;
+	struct request *rq;
+	int writes;
+
+	/*
+	 * if still requests on the dispatch queue, just grab the first one
+	 */
+	if (!list_empty(&q->queue_head)) {
+dispatch:
+		rq = list_entry_rq(q->queue_head.next);
+		dd->last_sector = rq->sector + rq->nr_sectors;
+		return rq;
+	}
+
+	writes = !list_empty(&dd->sort_list[WRITE]);
+
+	/*
+	 * if we have expired entries on the fifo list, move some to dispatch
+	 */
+	if (deadline_check_fifo(dd)) {
+		if (writes && (dd->starved++ >= dd->writes_starved))
+			goto dispatch_writes;
+
+		nxt = dd->read_fifo.next;
+		drq = list_entry_fifo(nxt);
+		deadline_move_requests(dd, drq->request);
+		goto dispatch;
+	}
+
+	if (!list_empty(&dd->sort_list[READ])) {
+		if (writes && (dd->starved++ >= dd->writes_starved))
+			goto dispatch_writes;
+
+		nxt = dd->sort_list[READ].next;
+		deadline_move_requests(dd, list_entry_rq(nxt));
+		goto dispatch;
+	}
+
+	/*
+	 * either there are no reads expired or on sort list, or the reads
+	 * have starved writes for too long. dispatch some writes
+	 */
+	if (writes) {
+dispatch_writes:
+		nxt = dd->sort_list[WRITE].next;
+		deadline_move_requests(dd, list_entry_rq(nxt));
+		dd->starved = 0;
+		goto dispatch;
+	}
+
+	BUG_ON(!list_empty(&dd->sort_list[READ]));
+	BUG_ON(writes);
+	return NULL;
+}
+
+static void
+deadline_add_request(request_queue_t *q, struct request *rq, struct list_head *insert_here)
+{
+	struct deadline_data *dd = q->elevator.elevator_data;
+	struct deadline_rq *drq = RQ_DATA(rq);
+	const int data_dir = rq_data_dir(rq);
+
+	/*
+	 * flush hash on barrier insert, as not to allow merges before a
+	 * barrier.
+	 */
+	if (unlikely(rq->flags & REQ_BARRIER)) {
+		DL_INVALIDATE_HASH(dd);
+		q->last_merge = NULL;
+	}
+
+	/*
+	 * add to sort list
+	 */
+	if (!insert_here)
+		insert_here = dd->sort_list[data_dir].prev;
+
+	list_add(&rq->queuelist, insert_here);
+
+	if (unlikely(!(rq->flags & REQ_CMD)))
+		return;
+
+	if (rq_mergeable(rq)) {
+		deadline_add_rq_hash(dd, drq);
+
+		if (!q->last_merge)
+			q->last_merge = &rq->queuelist;
+	}
+
+	if (data_dir == READ) {
+		/*
+		 * set expire time and add to fifo list
+		 */
+		drq->expires = jiffies + dd->read_expire;
+		list_add_tail(&drq->fifo, &dd->read_fifo);
+	}
+}
+
+static void deadline_remove_request(request_queue_t *q, struct request *rq)
+{
+	struct deadline_rq *drq = RQ_DATA(rq);
+
+	if (drq) {
+		list_del_init(&drq->fifo);
+		deadline_del_rq_hash(drq);
+	}
+}
+
+static int deadline_queue_empty(request_queue_t *q)
+{
+	struct deadline_data *dd = q->elevator.elevator_data;
+
+	if (!list_empty(&q->queue_head) || !list_empty(&dd->sort_list[READ])
+	    || !list_empty(&dd->sort_list[WRITE]))
+		return 0;
+
+	BUG_ON(!list_empty(&dd->read_fifo));
+	return 1;
+}
+
+static struct list_head *
+deadline_get_sort_head(request_queue_t *q, struct request *rq)
+{
+	struct deadline_data *dd = q->elevator.elevator_data;
+
+	return &dd->sort_list[rq_data_dir(rq)];
+}
+
+static void deadline_exit(request_queue_t *q, elevator_t *e)
+{
+	struct deadline_data *dd = e->elevator_data;
+	struct deadline_rq *drq;
+	struct request *rq;
+	int i;
+
+	BUG_ON(!list_empty(&dd->read_fifo));
+	BUG_ON(!list_empty(&dd->sort_list[READ]));
+	BUG_ON(!list_empty(&dd->sort_list[WRITE]));
+
+	for (i = READ; i <= WRITE; i++) {
+		struct request_list *rl = &q->rq[i];
+		struct list_head *entry = &rl->free;
+
+		if (list_empty(&rl->free))
+			continue;
+	
+		while ((entry = entry->next) != &rl->free) {
+			rq = list_entry_rq(entry);
+
+			if ((drq = RQ_DATA(rq)) == NULL)
+				continue;
+
+			rq->elevator_private = NULL;
+			kmem_cache_free(drq_pool, drq);
+		}
+	}
+
+	kfree(dd->hash);
+	kfree(dd);
+}
+
+/*
+ * initialize elevator private data (deadline_data), and alloc a drq for
+ * each request on the free lists
+ */
+static int deadline_init(request_queue_t *q, elevator_t *e)
+{
+	struct deadline_data *dd;
+	struct deadline_rq *drq;
+	struct request *rq;
+	int i, ret = 0;
+
+	if (!drq_pool)
+		return -ENOMEM;
+
+	dd = kmalloc(sizeof(*dd), GFP_KERNEL);
+	if (!dd)
+		return -ENOMEM;
+	memset(dd, 0, sizeof(*dd));
+
+	dd->hash = kmalloc(sizeof(struct list_head)*DL_HASH_ENTRIES,GFP_KERNEL);
+	if (!dd->hash) {
+		kfree(dd);
+		return -ENOMEM;
+	}
+
+	for (i = 0; i < DL_HASH_ENTRIES; i++)
+		INIT_LIST_HEAD(&dd->hash[i]);
+
+	INIT_LIST_HEAD(&dd->read_fifo);
+	INIT_LIST_HEAD(&dd->sort_list[READ]);
+	INIT_LIST_HEAD(&dd->sort_list[WRITE]);
+	dd->dispatch = &q->queue_head;
+	dd->fifo_batch = fifo_batch;
+	dd->read_expire = read_expire;
+	dd->seek_cost = seek_cost;
+	dd->hash_valid_count = 1;
+	dd->writes_starved = writes_starved;
+	e->elevator_data = dd;
+
+	for (i = READ; i <= WRITE; i++) {
+		struct request_list *rl = &q->rq[i];
+		struct list_head *entry = &rl->free;
+
+		if (list_empty(&rl->free))
+			continue;
+	
+		while ((entry = entry->next) != &rl->free) {
+			rq = list_entry_rq(entry);
+
+			drq = kmem_cache_alloc(drq_pool, GFP_KERNEL);
+			if (!drq) {
+				ret = -ENOMEM;
+				break;
+			}
+
+			memset(drq, 0, sizeof(*drq));
+			INIT_LIST_HEAD(&drq->fifo);
+			INIT_LIST_HEAD(&drq->hash);
+			drq->request = rq;
+			rq->elevator_private = drq;
+		}
+	}
+
+	if (ret)
+		deadline_exit(q, e);
+
+	return ret;
+}
+
+static int __init deadline_slab_setup(void)
+{
+	drq_pool = kmem_cache_create("deadline_drq", sizeof(struct deadline_rq),
+				     0, SLAB_HWCACHE_ALIGN, NULL, NULL);
+
+	if (!drq_pool)
+		panic("deadline: can't init slab pool\n");
+
+	return 0;
+}
+
+module_init(deadline_slab_setup);
+
+elevator_t iosched_deadline = {
+	.elevator_merge_fn = 		deadline_merge,
+	.elevator_merge_req_fn =	deadline_merge_request,
+	.elevator_next_req_fn =		deadline_next_request,
+	.elevator_add_req_fn =		deadline_add_request,
+	.elevator_remove_req_fn =	deadline_remove_request,
+	.elevator_queue_empty_fn =	deadline_queue_empty,
+	.elevator_get_sort_head_fn =	deadline_get_sort_head,
+	.elevator_init_fn =		deadline_init,
+	.elevator_exit_fn =		deadline_exit,
+};
+
+EXPORT_SYMBOL(iosched_deadline);
diff -Nru a/drivers/block/ll_rw_blk.c b/drivers/block/ll_rw_blk.c
--- a/drivers/block/ll_rw_blk.c	Wed Sep 25 19:16:26 2002
+++ b/drivers/block/ll_rw_blk.c	Wed Sep 25 19:16:26 2002
@@ -1175,7 +1175,7 @@
 	if (blk_init_free_list(q))
 		return -ENOMEM;
 
-	if ((ret = elevator_init(q, &q->elevator, elevator_linus))) {
+	if ((ret = elevator_init(q, &q->elevator, iosched_deadline))) {
 		blk_cleanup_queue(q);
 		return ret;
 	}
diff -Nru a/include/linux/elevator.h b/include/linux/elevator.h
--- a/include/linux/elevator.h	Wed Sep 25 19:16:26 2002
+++ b/include/linux/elevator.h	Wed Sep 25 19:16:26 2002
@@ -60,6 +60,12 @@
 #define ELV_LINUS_SEEK_COST	16
 
 /*
+ * deadline i/o scheduler. uses request time outs to prevent indefinite
+ * starvation
+ */
+extern elevator_t iosched_deadline;
+
+/*
  * use the /proc/iosched interface, all the below is history ->
  */
 typedef struct blkelv_ioctl_arg_s {

--IJpNTDwzlM2Ie8A6--
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/