From: Vivek Goyal <vgoyal@redhat.com>
To: linux-kernel@vger.kernel.org, containers@lists.linux-foundation.org,
       dm-devel@redhat.com, jens.axboe@oracle.com, nauman@google.com,
       dpshah@google.com, lizf@cn.fujitsu.com, mikew@google.com,
       fchecconi@gmail.com, paolo.valente@unimore.it, ryov@valinux.co.jp,
       fernando@oss.ntt.co.jp, s-uchida@ap.jp.nec.com, taka@valinux.co.jp,
       guijianfeng@cn.fujitsu.com, jmoyer@redhat.com,
       dhaval@linux.vnet.ibm.com, balbir@linux.vnet.ibm.com,
       righi.andrea@gmail.com, m-ikeda@ds.jp.nec.com, jbaron@redhat.com
Cc: agk@redhat.com, snitzer@redhat.com, vgoyal@redhat.com,
       akpm@linux-foundation.org, peterz@infradead.org
Subject: [PATCH 19/19] io-controller: experimental debug patch for async queue wait before expiry
Date: Mon,  8 Jun 2009 22:09:02 -0400
Message-Id: <1244513342-11758-20-git-send-email-vgoyal@redhat.com>
In-Reply-To: <1244513342-11758-1-git-send-email-vgoyal@redhat.com>
References: <1244513342-11758-1-git-send-email-vgoyal@redhat.com>
Sender: linux-kernel-owner@vger.kernel.org
Content-Length: 7788
Lines: 209

o A debug patch which waits for the next IO from an async queue once it
  becomes empty.

o For async writes, traffic seen by IO scheduler is not in proportion to
  the weight of the cgroup the task/page belongs to. So if there are two processes
  doing heavy writeouts in two cgroups with weights 1000 and 500 respectively,
  then IO scheduler does not see more traffic/IO from higher weight cgroup
  even if IO scheduler tries to give it higher disk time. Effectively, the
  async queue belonging to higher weight cgroup becomes empty, and gets out
  of contention for disk and lower weight cgroup gets to use disk giving
  an impression in user space that higher weight cgroup did not get higher
  time to disk.

o This is more of a problem at page cache level where a higher weight
  process might be writing out the pages of lower weight process etc and
  should be fixed there.

o While we fix those issues, introducing this debug patch which allows one
  to idle on async queue (tunable via /sys/block/<disk>/queue/async_slice_idle)
  so that once a higher weight queue becomes empty, instead of expiring it
  we try to wait for next request to come from that queue hence giving it
  higher disk time. A higher value of async_slice_idle, around 300ms, helps
  me get some right numbers for my setup. Note: higher disk time would not
  necessarily translate into more IO done as higher weight group is not pushing
  enough IO to io scheduler. It is just a debugging aid to prove correctness
  of IO controller by providing higher disk times to higher weight cgroup.

Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
---
 block/blk-sysfs.c   |    7 +++++
 block/elevator-fq.c |   65 +++++++++++++++++++++++++++++++++++++++++++++++---
 block/elevator-fq.h |    7 +++++
 3 files changed, 75 insertions(+), 4 deletions(-)

diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index b60b76e..f245f33 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -314,6 +314,12 @@ static struct queue_sysfs_entry queue_slice_idle_entry = {
 	.store = elv_slice_idle_store,
 };
 
+static struct queue_sysfs_entry queue_async_slice_idle_entry = {
+	.attr = {.name = "async_slice_idle", .mode = S_IRUGO | S_IWUSR },
+	.show = elv_async_slice_idle_show,
+	.store = elv_async_slice_idle_store,
+};
+
 static struct queue_sysfs_entry queue_slice_sync_entry = {
 	.attr = {.name = "slice_sync", .mode = S_IRUGO | S_IWUSR },
 	.show = elv_slice_sync_show,
@@ -349,6 +355,7 @@ static struct attribute *default_attrs[] = {
 	&queue_iostats_entry.attr,
 #ifdef CONFIG_ELV_FAIR_QUEUING
 	&queue_slice_idle_entry.attr,
+	&queue_async_slice_idle_entry.attr,
 	&queue_slice_sync_entry.attr,
 	&queue_slice_async_entry.attr,
 	&queue_fairness_entry.attr,
diff --git a/block/elevator-fq.c b/block/elevator-fq.c
index 207bdf1..7a9f196 100644
--- a/block/elevator-fq.c
+++ b/block/elevator-fq.c
@@ -22,6 +22,7 @@ const int elv_slice_sync = HZ / 10;
 int elv_slice_async = HZ / 25;
 const int elv_slice_async_rq = 2;
 int elv_slice_idle = HZ / 125;
+int elv_async_slice_idle = 0;
 static struct kmem_cache *elv_ioq_pool;
 
 /* Maximum Window length for updating average disk rate */
@@ -2695,6 +2696,46 @@ ssize_t elv_slice_idle_store(struct request_queue *q, const char *name,
 	return count;
 }
 
+/* Functions to show and store elv_async_slice_idle value through sysfs */
+ssize_t elv_async_slice_idle_show(struct request_queue *q, char *name)
+{
+	struct elv_fq_data *efqd;
+	unsigned int data;
+	unsigned long flags;
+
+	spin_lock_irqsave(q->queue_lock, flags);
+	efqd = &q->elevator->efqd;
+	data = jiffies_to_msecs(efqd->elv_async_slice_idle);
+	spin_unlock_irqrestore(q->queue_lock, flags);
+	return sprintf(name, "%d\n", data);
+}
+
+ssize_t elv_async_slice_idle_store(struct request_queue *q, const char *name,
+			  size_t count)
+{
+	struct elv_fq_data *efqd;
+	unsigned int data;
+	unsigned long flags;
+
+	char *p = (char *)name;
+
+	data = simple_strtoul(p, &p, 10);
+
+	if (data < 0)
+		data = 0;
+	else if (data > INT_MAX)
+		data = INT_MAX;
+
+	data = msecs_to_jiffies(data);
+
+	spin_lock_irqsave(q->queue_lock, flags);
+	efqd = &q->elevator->efqd;
+	efqd->elv_async_slice_idle = data;
+	spin_unlock_irqrestore(q->queue_lock, flags);
+
+	return count;
+}
+
 /* Functions to show and store elv_slice_sync value through sysfs */
 ssize_t elv_slice_sync_show(struct request_queue *q, char *name)
 {
@@ -2945,8 +2986,8 @@ int elv_init_ioq(struct elevator_queue *eq, struct io_queue *ioq,
 		ioq->pid = current->pid;
 
 	ioq->sched_queue = sched_queue;
-	if (is_sync && !elv_ioq_class_idle(ioq))
-		elv_mark_ioq_idle_window(ioq);
+	if (!elv_ioq_class_idle(ioq) && (is_sync || efqd->fairness))
+			elv_mark_ioq_idle_window(ioq);
 	bfq_init_entity(&ioq->entity, iog);
 	ioq->entity.budget = elv_prio_to_slice(efqd, ioq);
 	return 0;
@@ -3568,7 +3609,12 @@ void elv_ioq_arm_slice_timer(struct request_queue *q, int wait_for_busy)
 	/*
 	 * idle is disabled, either manually or by past process history
 	 */
-	if (!efqd->elv_slice_idle || !elv_ioq_idle_window(ioq))
+	if ((elv_ioq_sync(ioq) && !efqd->elv_slice_idle) ||
+			!elv_ioq_idle_window(ioq))
+		return;
+
+	/* If this is async queue and async_slice_idle is disabled, return */
+	if (!elv_ioq_sync(ioq) && !efqd->elv_async_slice_idle)
 		return;
 
 	/*
@@ -3577,7 +3623,10 @@ void elv_ioq_arm_slice_timer(struct request_queue *q, int wait_for_busy)
 	 */
 	if (wait_for_busy) {
 		elv_mark_ioq_wait_busy(ioq);
-		sl = efqd->elv_slice_idle;
+		if (elv_ioq_sync(ioq))
+			sl = efqd->elv_slice_idle;
+		else
+			sl = efqd->elv_async_slice_idle;
 		mod_timer(&efqd->idle_slice_timer, jiffies + sl);
 		elv_log_ioq(efqd, ioq, "arm idle: %lu wait busy=1", sl);
 		return;
@@ -3959,6 +4008,13 @@ void elv_ioq_completed_request(struct request_queue *q, struct request *rq)
 			goto done;
 		}
 
+		/* For async queue try to do wait busy */
+		if (efqd->fairness && !elv_ioq_sync(ioq) && !ioq->nr_queued
+		    && (elv_iog_nr_active(iog) <= 1)) {
+			elv_ioq_arm_slice_timer(q, 1);
+			goto done;
+		}
+
 		/*
 		 * If there are no requests waiting in this queue, and
 		 * there are other queues ready to issue requests, AND
@@ -4087,6 +4143,7 @@ int elv_init_fq_data(struct request_queue *q, struct elevator_queue *e)
 	efqd->elv_slice[0] = elv_slice_async;
 	efqd->elv_slice[1] = elv_slice_sync;
 	efqd->elv_slice_idle = elv_slice_idle;
+	efqd->elv_async_slice_idle = elv_async_slice_idle;
 	efqd->hw_tag = 1;
 
 	/* For the time being keep fairness enabled by default */
diff --git a/block/elevator-fq.h b/block/elevator-fq.h
index b5cff90..2022210 100644
--- a/block/elevator-fq.h
+++ b/block/elevator-fq.h
@@ -344,6 +344,8 @@ struct elv_fq_data {
 	 * users of this functionality.
 	 */
 	unsigned int elv_slice_idle;
+	/* idle slice for async queue */
+	unsigned int elv_async_slice_idle;
 	struct timer_list idle_slice_timer;
 	struct work_struct unplug_work;
 
@@ -655,6 +657,11 @@ extern ssize_t elv_slice_idle_store(struct request_queue *q, const char *name,
 extern ssize_t elv_slice_sync_show(struct request_queue *q, char *name);
 extern ssize_t elv_slice_sync_store(struct request_queue *q, const char *name,
 						size_t count);
+
+extern ssize_t elv_async_slice_idle_show(struct request_queue *q, char *name);
+extern ssize_t elv_async_slice_idle_store(struct request_queue *q,
+					const char *name, size_t count);
+
 extern ssize_t elv_slice_async_show(struct request_queue *q, char *name);
 extern ssize_t elv_slice_async_store(struct request_queue *q, const char *name,
 						size_t count);
-- 
1.6.0.6

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/