Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1756264AbZGBUEn (ORCPT ); Thu, 2 Jul 2009 16:04:43 -0400 Received: (majordomo@vger.kernel.org) by vger.kernel.org id S1755388AbZGBUDQ (ORCPT ); Thu, 2 Jul 2009 16:03:16 -0400 Received: from mx2.redhat.com ([66.187.237.31]:52975 "EHLO mx2.redhat.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1754466AbZGBUDG (ORCPT ); Thu, 2 Jul 2009 16:03:06 -0400 From: Vivek Goyal To: linux-kernel@vger.kernel.org, containers@lists.linux-foundation.org, dm-devel@redhat.com, jens.axboe@oracle.com, nauman@google.com, dpshah@google.com, lizf@cn.fujitsu.com, mikew@google.com, fchecconi@gmail.com, paolo.valente@unimore.it, ryov@valinux.co.jp, fernando@oss.ntt.co.jp, s-uchida@ap.jp.nec.com, taka@valinux.co.jp, guijianfeng@cn.fujitsu.com, jmoyer@redhat.com, dhaval@linux.vnet.ibm.com, balbir@linux.vnet.ibm.com, righi.andrea@gmail.com, m-ikeda@ds.jp.nec.com, jbaron@redhat.com Cc: agk@redhat.com, snitzer@redhat.com, vgoyal@redhat.com, akpm@linux-foundation.org, peterz@infradead.org Subject: [PATCH 05/25] io-controller: Charge for time slice based on average disk rate Date: Thu, 2 Jul 2009 16:01:37 -0400 Message-Id: <1246564917-19603-6-git-send-email-vgoyal@redhat.com> In-Reply-To: <1246564917-19603-1-git-send-email-vgoyal@redhat.com> References: <1246564917-19603-1-git-send-email-vgoyal@redhat.com> Sender: linux-kernel-owner@vger.kernel.org List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Content-Length: 8560 Lines: 236 o There are situations where a queue gets expired very soon and it looks as if time slice used by that queue is zero. For example, if an async queue dispatches a bunch of requests and queue is expired before first request completes. Another example is where a queue is expired as soon as first request completes and queue has no more requests (sync queues on SSD). o Currently we just charge 25% of slice length in such cases. 
This patch tries to improve on that approximation by keeping track of average disk rate and charging for time by nr_sectors/disk_rate. o This is still experimental, not very sure if it gives measurable improvement or not. Maybe a better scheme is to use something more granular than jiffies for time keeping for io queues. Signed-off-by: Vivek Goyal --- block/elevator-fq.c | 97 +++++++++++++++++++++++++++++++++++++++++++++++---- block/elevator-fq.h | 11 ++++++ 2 files changed, 101 insertions(+), 7 deletions(-) diff --git a/block/elevator-fq.c b/block/elevator-fq.c index 6f23d7e..67c02b9 100644 --- a/block/elevator-fq.c +++ b/block/elevator-fq.c @@ -23,6 +23,9 @@ const int elv_slice_async_rq = 2; int elv_slice_idle = HZ / 125; static struct kmem_cache *elv_ioq_pool; +/* Maximum Window length for updating average disk rate */ +static int elv_rate_sampling_window = HZ / 10; + #define ELV_SLICE_SCALE (5) #define ELV_HW_QUEUE_MIN (5) @@ -941,6 +944,47 @@ static void elv_ioq_update_io_thinktime(struct io_queue *ioq) ioq->ttime_mean = (ioq->ttime_total + 128) / ioq->ttime_samples; } +static void elv_update_io_rate(struct elv_fq_data *efqd, struct request *rq) +{ + long elapsed = jiffies - efqd->rate_sampling_start; + unsigned long total; + + /* sampling window is off */ + if (!efqd->rate_sampling_start) + return; + + efqd->rate_sectors_current += blk_rq_sectors(rq); + + if (efqd->rq_in_driver && (elapsed < elv_rate_sampling_window)) + return; + + efqd->rate_sectors = (7*efqd->rate_sectors + + 256*efqd->rate_sectors_current) / 8; + + if (!elapsed) { + /* + * updating rate before a jiffy could complete. Could be a + * problem with fast queuing/non-queuing hardware. Should we + * look at higher resolution time source? + * + * In case of non-queuing hardware we will probably not try to + * dispatch from multiple queues and will be able to account + * for disk time used and will not need this approximation + * anyway? 
+ */ + elapsed = 1; + } + + efqd->rate_time = (7*efqd->rate_time + 256*elapsed) / 8; + total = efqd->rate_sectors + (efqd->rate_time/2); + efqd->mean_rate = total/efqd->rate_time; + + elv_log(efqd, "mean_rate=%d, t=%d s=%d", efqd->mean_rate, + elapsed, efqd->rate_sectors_current); + efqd->rate_sampling_start = 0; + efqd->rate_sectors_current = 0; +} + /* * Disable idle window if the process thinks too long. * This idle flag can also be updated by io scheduler. @@ -1231,6 +1275,34 @@ static void elv_del_ioq_busy(struct elevator_queue *e, struct io_queue *ioq, } /* + * Calculate the effective disk time used by the queue based on how many + * sectors queue has dispatched and what is the average disk rate. + * Returns disk time in jiffies. + */ +static inline unsigned long elv_disk_time_used(struct request_queue *q, + struct io_queue *ioq) +{ + struct elv_fq_data *efqd = &q->elevator->efqd; + struct io_entity *entity = &ioq->entity; + unsigned long jiffies_used = 0; + + if (!efqd->mean_rate) + return entity->budget/4; + + /* Charge the queue based on average disk rate */ + jiffies_used = ioq->nr_sectors/efqd->mean_rate; + + if (!jiffies_used) + jiffies_used = 1; + + elv_log_ioq(efqd, ioq, "disk time=%ldms sect=%lu rate=%ld", + jiffies_to_msecs(jiffies_used), + ioq->nr_sectors, efqd->mean_rate); + + return jiffies_used; +} + +/* * Do the accounting. Determine how much service (in terms of time slices) * current queue used and adjust the start, finish time of queue and vtime * of the tree accordingly. @@ -1248,8 +1320,10 @@ static void elv_del_ioq_busy(struct elevator_queue *e, struct io_queue *ioq, * from next queue. * * Not sure how to determine the time consumed by queue in such scenarios. - * Currently as a crude approximation, we are charging 25% of time slice - * for such cases. A better mechanism is needed for accurate accounting. 
+ * Currently as a crude approximation, try to keep track of average disk rate + * and charge the queue based on number of sectors transferred. If sufficient + * disk rate data is not available then we are charging 25% of time slice + * for such cases. A better mechanism is needed for accurate accounting. */ void __elv_ioq_slice_expired(struct request_queue *q, struct io_queue *ioq) { @@ -1270,9 +1344,9 @@ void __elv_ioq_slice_expired(struct request_queue *q, struct io_queue *ioq) * reuqest from the queue got completed. Of course we are not planning * to idle on the queue otherwise we would not have expired it. * - * Charge for the 25% slice in such cases. This is not the best thing - * to do but at the same time not very sure what's the next best - * thing to do. + * Charge the queue based on average disk rate or the 25% slice if + * mean rate is 0. This is not the best thing to do but at the same + * time not very sure what's the next best thing to do. * * This arises from that fact that we don't have the notion of * one queue being operational at one time. io scheduler can dispatch @@ -1282,7 +1356,7 @@ void __elv_ioq_slice_expired(struct request_queue *q, struct io_queue *ioq) * the requests to finish. But this will reduce throughput. */ if (!ioq->slice_end) - slice_used = entity->budget/4; + slice_used = elv_disk_time_used(q, ioq); else { if (time_after(ioq->slice_end, jiffies)) { slice_unused = ioq->slice_end - jiffies; @@ -1292,7 +1366,7 @@ void __elv_ioq_slice_expired(struct request_queue *q, struct io_queue *ioq) * completing first request. Charge 25% of * slice. */ - slice_used = entity->budget/4; + slice_used = elv_disk_time_used(q, ioq); } else slice_used = entity->budget - slice_unused; } else { @@ -1310,6 +1384,8 @@ void __elv_ioq_slice_expired(struct request_queue *q, struct io_queue *ioq) BUG_ON(ioq != efqd->active_queue); elv_reset_active_ioq(efqd); + /* Queue is being expired. 
Reset number of sectors dispatched */ + ioq->nr_sectors = 0; if (!ioq->nr_queued) elv_del_ioq_busy(q->elevator, ioq, 1); else @@ -1671,6 +1747,7 @@ void elv_fq_dispatched_request(struct elevator_queue *e, struct request *rq) BUG_ON(!ioq); elv_ioq_request_dispatched(ioq); + ioq->nr_sectors += blk_rq_sectors(rq); elv_ioq_request_removed(e, rq); elv_clear_ioq_must_dispatch(ioq); } @@ -1683,6 +1760,10 @@ void elv_fq_activate_rq(struct request_queue *q, struct request *rq) return; efqd->rq_in_driver++; + + if (!efqd->rate_sampling_start) + efqd->rate_sampling_start = jiffies; + elv_log_ioq(efqd, rq->ioq, "activate rq, drv=%d", efqd->rq_in_driver); } @@ -1746,6 +1827,8 @@ void elv_ioq_completed_request(struct request_queue *q, struct request *rq) efqd->rq_in_driver--; ioq->dispatched--; + elv_update_io_rate(efqd, rq); + if (sync) ioq->last_end_request = jiffies; diff --git a/block/elevator-fq.h b/block/elevator-fq.h index a7cbc0f..4b69239 100644 --- a/block/elevator-fq.h +++ b/block/elevator-fq.h @@ -165,6 +165,9 @@ struct io_queue { /* Requests dispatched from this queue */ int dispatched; + /* Number of sectors dispatched in current dispatch round */ + unsigned long nr_sectors; + /* Keep a track of think time of processes in this queue */ unsigned long last_end_request; unsigned long ttime_total; @@ -228,6 +231,14 @@ struct elv_fq_data { /* Base slice length for sync and async queues */ unsigned int elv_slice[2]; + + /* Fields for keeping track of average disk rate */ + unsigned long rate_sectors; /* number of sectors finished */ + unsigned long rate_time; /* jiffies elapsed */ + unsigned long mean_rate; /* sectors per jiffy */ + unsigned long long rate_sampling_start; /* sampling window start jiffies */ + /* number of sectors finished io during current sampling window */ + unsigned long rate_sectors_current; }; /* Logging facilities. 
*/ -- 1.6.0.6 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/