When the number of processes performing I/O concurrently increases,
a fixed time slice per process will cause large latencies.
This patch, if low_latency mode is enabled, will scale the time slice
assigned to each process according to a 300ms target latency.
In order to keep fairness among processes:
* The number of active processes is computed using a special form of
running average that quickly follows sudden increases (to keep latency low)
and decreases slowly (to preserve fairness in spite of rapid decreases of
this value); see the worked example below.
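Concretely, using the patch's update formula with cfq_hist_divisor = 4
(new_avg = (3 * max(avg, busy) + min(avg, busy) + 2) / 4): if the average
is 2 and the instantaneous queue count jumps to 10, the average immediately
rises to (3 * 10 + 2 + 2) / 4 = 8; if the count then drops back to 2, it
decays only one step per update, to (3 * 8 + 2 + 2) / 4 = 7.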
To safeguard sequential bandwidth, we impose a minimum time slice
(computed using 2*cfq_slice_idle as a base, adjusted according to priority
and async-ness).
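For instance, assuming HZ=1000 (so cfq_slice_idle = 8 and cfq_slice_sync =
100 with the defaults in this patch), a sync queue at default priority is
never scaled below 2 * 100 * 8 / 100 = 16 jiffies.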
Signed-off-by: Corrado Zoccolo <[email protected]>
---
block/cfq-iosched.c | 46 ++++++++++++++++++++++++++++++++++++++++++++--
1 files changed, 44 insertions(+), 2 deletions(-)
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 069a610..25a3f46 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -27,6 +27,8 @@ static const int cfq_slice_sync = HZ / 10;
static int cfq_slice_async = HZ / 25;
static const int cfq_slice_async_rq = 2;
static int cfq_slice_idle = HZ / 125;
+static const int cfq_target_latency = HZ * 3/10; /* 300 ms */
+static const int cfq_hist_divisor = 4;
/*
* offset from end of service tree
@@ -134,6 +136,8 @@ struct cfq_data {
struct rb_root prio_trees[CFQ_PRIO_LISTS];
unsigned int busy_queues;
+ unsigned int busy_rt_queues;
+ unsigned int busy_queues_avg[2];
int rq_in_driver[2];
int sync_flight;
@@ -301,10 +305,45 @@ cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
return cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio);
}
+/*
+ * get the averaged number of queues of RT/BE priority.
+ * the average is updated with a formula that gives more weight to higher
+ * numbers, so that it quickly follows sudden increases and decays slowly
+ */
+
+static inline unsigned
+cfq_get_avg_queues(struct cfq_data *cfqd, bool rt)
+{
+ unsigned min_q, max_q;
+ unsigned mult = cfq_hist_divisor - 1;
+ unsigned round = cfq_hist_divisor / 2;
+ unsigned busy = cfqd->busy_rt_queues;
+
+ if (!rt)
+ busy = cfqd->busy_queues - cfqd->busy_rt_queues;
+
+ min_q = min(cfqd->busy_queues_avg[rt], busy);
+ max_q = max(cfqd->busy_queues_avg[rt], busy);
+ cfqd->busy_queues_avg[rt] = (mult * max_q + min_q + round) /
+ cfq_hist_divisor;
+ return cfqd->busy_queues_avg[rt];
+}
+
static inline void
cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
{
- cfqq->slice_end = cfq_prio_to_slice(cfqd, cfqq) + jiffies;
+ unsigned slice = cfq_prio_to_slice(cfqd, cfqq);
+ if (cfqd->cfq_latency) {
+ unsigned iq = cfq_get_avg_queues(cfqd, cfq_class_rt(cfqq));
+ unsigned process_thr = cfq_target_latency / cfqd->cfq_slice[1];
+ if (iq > process_thr) {
+ unsigned low_slice = 2 * slice * cfqd->cfq_slice_idle
+ / cfqd->cfq_slice[1];
+ slice = max(slice * cfq_target_latency /
+ (cfqd->cfq_slice[1] * iq),
+ min(slice, low_slice));
+ }
+ }
+ cfqq->slice_end = jiffies + slice;
cfq_log_cfqq(cfqd, cfqq, "set_slice=%lu", cfqq->slice_end - jiffies);
}
@@ -655,7 +694,8 @@ static void cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
BUG_ON(cfq_cfqq_on_rr(cfqq));
cfq_mark_cfqq_on_rr(cfqq);
cfqd->busy_queues++;
-
+ if (cfq_class_rt(cfqq))
+ cfqd->busy_rt_queues++;
cfq_resort_rr_list(cfqd, cfqq);
}
@@ -678,6 +718,8 @@ static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
BUG_ON(!cfqd->busy_queues);
cfqd->busy_queues--;
+ if (cfq_class_rt(cfqq))
+ cfqd->busy_rt_queues--;
}
/*
--
1.6.2.5
Hi, Corrado!
Sorry if folks receive this twice, but my mailer and I had an argument
about this message. ;-)
Corrado Zoccolo <[email protected]> writes:
> When the number of processes performing I/O concurrently increases,
> a fixed time slice per process will cause large latencies.
>
> This patch, if low_latency mode is enabled, will scale the time slice
> assigned to each process according to a 300ms target latency.
>
> In order to keep fairness among processes:
> * The number of active processes is computed using a special form of
> running average that quickly follows sudden increases (to keep latency low)
> and decreases slowly (to preserve fairness in spite of rapid decreases of
> this value); see the worked example below.
>
> To safeguard sequential bandwidth, we impose a minimum time slice
> (computed using 2*cfq_slice_idle as a base, adjusted according to priority
> and async-ness).
I like the idea as well, but I have a question and some nits to pick.
> static inline void
> cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
> {
> - cfqq->slice_end = cfq_prio_to_slice(cfqd, cfqq) + jiffies;
> + unsigned slice = cfq_prio_to_slice(cfqd, cfqq);
> + if (cfqd->cfq_latency) {
> + unsigned iq = cfq_get_avg_queues(cfqd, cfq_class_rt(cfqq));
> + unsigned process_thr = cfq_target_latency / cfqd->cfq_slice[1];
> + if (iq > process_thr) {
> + unsigned low_slice = 2 * slice * cfqd->cfq_slice_idle
> + / cfqd->cfq_slice[1];
> + slice = max(slice * cfq_target_latency /
> + (cfqd->cfq_slice[1] * iq),
Couldn't you have just divided the slice by iq? And why iq? Why not
nr_qs or avg_qlen or something? It's a minor nit; I can live with it.
> + min(slice, low_slice));
> + }
> + }
> + cfqq->slice_end = jiffies + slice;
> cfq_log_cfqq(cfqd, cfqq, "set_slice=%lu", cfqq->slice_end - jiffies);
Wow. That function is *dense*. I tried to write it in a more
readable fashion, but please chime in if I misinterpreted anything.
static inline void
cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
{
unsigned slice = cfq_prio_to_slice(cfqd, cfqq);
if (cfqd->cfq_latency) {
unsigned iq = cfq_get_avg_queues(cfqd, cfq_class_rt(cfqq));
unsigned slice_sync = cfqd->cfq_slice[1];
unsigned process_thr = cfq_target_latency / slice_sync;
if (iq > process_thr) {
/*
* Minimum slice is computed using 2*slice_idle as
* a base, and then scaling it by priority and
* async-ness.
*/
unsigned total_sync = slice_sync * iq;
unsigned slice_fraction = cfq_target_latency / total_sync;
unsigned min_slice = (2 * cfqd->cfq_slice_idle) *
(slice / slice_sync);
min_slice = min(slice, min_slice);
slice *= slice_fraction;
slice = max(slice, min_slice);
}
}
cfqq->slice_end = jiffies + slice;
cfq_log_cfqq(cfqd, cfqq, "set_slice=%lu", cfqq->slice_end - jiffies);
}
Cheers,
Jeff
Hi Jeff,
On Wed, Oct 21, 2009 at 5:57 PM, Jeff Moyer <[email protected]> wrote:
> Hi, Corrado!
>
> Sorry if folks receive this twice, but my mailer and I had an argument
> about this message. ;-)
>
> Corrado Zoccolo <[email protected]> writes:
>
>> When the number of processes performing I/O concurrently increases,
>> a fixed time slice per process will cause large latencies.
>>
>> This patch, if low_latency mode is enabled, will scale the time slice
>> assigned to each process according to a 300ms target latency.
>>
>> In order to keep fairness among processes:
>> * The number of active processes is computed using a special form of
>> running average that quickly follows sudden increases (to keep latency low),
>> and decreases slowly (to preserve fairness in spite of rapid decreases of
>> this value); see the worked example below.
>>
>> To safeguard sequential bandwidth, we impose a minimum time slice
>> (computed using 2*cfq_slice_idle as a base, adjusted according to priority
>> and async-ness).
>
> I like the idea as well, but I have a question and some nits to pick.
>
>> static inline void
>> cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
>> {
>> - cfqq->slice_end = cfq_prio_to_slice(cfqd, cfqq) + jiffies;
>> + unsigned slice = cfq_prio_to_slice(cfqd, cfqq);
>> + if (cfqd->cfq_latency) {
>> + unsigned iq = cfq_get_avg_queues(cfqd, cfq_class_rt(cfqq));
>> + unsigned process_thr = cfq_target_latency / cfqd->cfq_slice[1];
>> + if (iq > process_thr) {
>> + unsigned low_slice = 2 * slice * cfqd->cfq_slice_idle
>> + / cfqd->cfq_slice[1];
>> + slice = max(slice * cfq_target_latency /
>> + (cfqd->cfq_slice[1] * iq),
>
> Couldn't you have just divided the slice by iq? And why iq? Why not
> nr_qs or avg_qlen or something? It's a minor nit; I can live with it.
iq stands for interested queues, because we are restricting the count
just to the same priority class, not all queues in the system.
>
>> + min(slice, low_slice));
>> + }
>> + }
>> + cfqq->slice_end = jiffies + slice;
>> cfq_log_cfqq(cfqd, cfqq, "set_slice=%lu", cfqq->slice_end - jiffies);
>
> Wow. That function is *dense*. I tried to write it in a more
> readable fashion, but please chime in if I misinterpreted anything.
>
> static inline void
> cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
> {
> unsigned slice = cfq_prio_to_slice(cfqd, cfqq);
>
> if (cfqd->cfq_latency) {
> unsigned iq = cfq_get_avg_queues(cfqd, cfq_class_rt(cfqq));
> unsigned slice_sync = cfqd->cfq_slice[1];
> unsigned process_thr = cfq_target_latency / slice_sync;
>
> if (iq > process_thr) {
> /*
> * Minimum slice is computed using 2*slice_idle as
> * a base, and then scaling it by priority and
> * async-ness.
> */
> unsigned total_sync = slice_sync * iq;
> unsigned slice_fraction = cfq_target_latency / total_sync;
> unsigned min_slice = (2 * cfqd->cfq_slice_idle) *
> (slice / slice_sync);
> min_slice = min(slice, min_slice);
> slice *= slice_fraction;
> slice = max(slice, min_slice);
> }
> }
> cfqq->slice_end = jiffies + slice;
> cfq_log_cfqq(cfqd, cfqq, "set_slice=%lu", cfqq->slice_end - jiffies);
> }
>
I don't think this is equivalent. You seem to compute some of the
divisions too early, losing precision to integer arithmetic:
slice * cfq_target_latency / (cfqd->cfq_slice[1] * iq)
is not generally equivalent to:
slice * (cfq_target_latency / (cfqd->cfq_slice[1] * iq)),
which is what you are computing.
There is another such case in your simplification.
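To make the difference concrete (assuming HZ=1000, so cfq_target_latency =
300 and cfq_slice[1] = 100 with the defaults): with slice = 100 and iq = 5,
the first form gives 100 * 300 / 500 = 60 jiffies, while the second gives
100 * (300 / 500) = 100 * 0 = 0 under integer division.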
Corrado
>
> Cheers,
> Jeff
>
Corrado Zoccolo <[email protected]> writes:
>> Couldn't you have just divided the slice by iq? And why iq? Why not
>> nr_qs or avg_qlen or something? It's a minor nit; I can live with it.
>
> iq stands for interested queues, because we are restricting the count
> just to the same priority class, not all queues in the system.
Ah, that makes sense. A comment would make that clear.
> I don't think this is equivalent. You seem to compute some of the
> divisions too early, losing precision to integer arithmetic:
OK, I figured as much.
> slice * cfq_target_latency / (cfqd->cfq_slice[1] * iq)
> is not generally equivalent to:
> slice * (cfq_target_latency / (cfqd->cfq_slice[1] * iq)),
> which is what you are computing.
> There is another such case in your simplification.
OK. The idea was to try to make the thing a bit more digestible.
Honestly, I had to rewrite it to figure out what it was doing. Could
you rework it so the logic is more obvious to others and still correct?
Thanks!
Jeff
Hi Jeff,
On Wed, Oct 21, 2009 at 6:36 PM, Jeff Moyer <[email protected]> wrote:
> Corrado Zoccolo <[email protected]> writes:
>
> OK. The idea was to try to make the thing a bit more digestible.
> Honestly, I had to rewrite it to figure out what it was doing. Could
> you rework it so the logic is more obvious to others and still correct?
What about:
/* interested queues (we consider only the ones with the same priority class) */
unsigned iq = cfq_get_avg_queues(cfqd, cfq_class_rt(cfqq));
unsigned sync_slice = cfqd->cfq_slice[1];
unsigned expected_latency = sync_slice * iq;
if (expected_latency > cfq_target_latency) {
unsigned base_low_slice = 2 * cfqd->cfq_slice_idle;
/* scale low_slice according to IO priority and sync vs async */
unsigned low_slice = min(slice, base_low_slice * slice / sync_slice);
/* the adapted slice value is scaled to fit all iqs into the target latency */
slice = max(slice * cfq_target_latency / expected_latency, low_slice);
}
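Plugging in concrete numbers (assuming HZ=1000 defaults, so sync_slice = 100,
cfq_slice_idle = 8, and cfq_target_latency = 300): with iq = 8 and a default
sync queue (slice = 100), expected_latency = 800 > 300, base_low_slice = 16,
low_slice = min(100, 16 * 100 / 100) = 16, and the adapted slice becomes
max(100 * 300 / 800, 16) = 37 jiffies.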
BTW, the other complex part of the patch, i.e. the definition of
cfq_get_avg_queues, could be simplified if we hard-coded the divisor,
which is currently defined outside to allow for some experimentation.
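Something like this (only a sketch, with the divisor fixed at 4, so
mult = 3 and round = 2):

static inline unsigned cfq_get_avg_queues(struct cfq_data *cfqd, bool rt)
{
	unsigned busy = rt ? cfqd->busy_rt_queues :
			     cfqd->busy_queues - cfqd->busy_rt_queues;
	unsigned min_q = min(cfqd->busy_queues_avg[rt], busy);
	unsigned max_q = max(cfqd->busy_queues_avg[rt], busy);

	/* weighted average, biased 3:1 towards the larger value and
	 * rounded to nearest: new_avg = (3 * max + min + 2) / 4 */
	cfqd->busy_queues_avg[rt] = (3 * max_q + min_q + 2) / 4;
	return cfqd->busy_queues_avg[rt];
}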
Thanks for your comments
Corrado
>
> Thanks!
> Jeff
>
--
__________________________________________________________________________
dott. Corrado Zoccolo mailto:[email protected]
PhD - Department of Computer Science - University of Pisa, Italy
--------------------------------------------------------------------------
Corrado Zoccolo <[email protected]> writes:
> Hi Jeff,
> On Wed, Oct 21, 2009 at 6:36 PM, Jeff Moyer <[email protected]> wrote:
>> Corrado Zoccolo <[email protected]> writes:
>>
>> OK. The idea was to try to make the thing a bit more digestible.
>> Honestly, I had to rewrite it to figure out what it was doing. Could
>> you rework it so the logic is more obvious to others and still correct?
>
> What about:
> /* interested queues (we consider only the ones with the same priority class) */
> unsigned iq = cfq_get_avg_queues(cfqd, cfq_class_rt(cfqq));
> unsigned sync_slice = cfqd->cfq_slice[1];
> unsigned expected_latency = sync_slice * iq;
> if (expected_latency > cfq_target_latency) {
> unsigned base_low_slice = 2 * cfqd->cfq_slice_idle;
> /* scale low_slice according to IO priority and sync vs async */
> unsigned low_slice = min(slice, base_low_slice * slice / sync_slice);
> /* the adapted slice value is scaled to fit all iqs into the target latency */
> slice = max(slice * cfq_target_latency / expected_latency, low_slice);
> }
Yeah, that looks a ton better. Thanks for doing that!
Cheers,
Jeff
On Mon, Oct 26, 2009 at 4:10 PM, Jeff Moyer <[email protected]> wrote:
> Corrado Zoccolo <[email protected]> writes:
>>
>> What about:
>> /* interested queues (we consider only the ones with the same priority class) */
>> unsigned iq = cfq_get_avg_queues(cfqd, cfq_class_rt(cfqq));
>> unsigned sync_slice = cfqd->cfq_slice[1];
>> unsigned expected_latency = sync_slice * iq;
>> if (expected_latency > cfq_target_latency) {
>> unsigned base_low_slice = 2 * cfqd->cfq_slice_idle;
>> /* scale low_slice according to IO priority and sync vs async */
>> unsigned low_slice = min(slice, base_low_slice * slice / sync_slice);
>> /* the adapted slice value is scaled to fit all iqs into the target latency */
>> slice = max(slice * cfq_target_latency / expected_latency, low_slice);
>> }
>
> Yeah, that looks a ton better. Thanks for doing that!
Jens, if you like the new version of the code (integrating this last
change), I can rebase my patch set on top of Jeff's, since it is
already queued for 2.6.33.
Corrado
>
> Cheers,
> Jeff
>