2009-10-19 20:21:49

by Corrado Zoccolo

Subject: [RFC PATCH 5/5] cfq-iosched: fairness for sync no-idle queues

Currently no-idle queues in cfq are not serviced fairly:
even if they can only dispatch a small number of requests at a time,
they have to compete with idling queues to be serviced, experiencing
large latencies.

We should notice, instead, that no-idle queues are the ones that would
benefit most from low latency; in fact they are any of:
* processes with large think times (e.g. interactive ones like file
managers)
* seeky (e.g. programs faulting in their code at startup)
* or marked as no-idle from upper levels, to improve latencies of those
requests.

This patch improves fairness and latency for those queues by:
* separating sync idle, sync no-idle and async queues into separate
service_trees, for each priority
* servicing all no-idle queues together
* idling when the last no-idle queue has been serviced, to
anticipate more no-idle work
* computing the timeslices allotted to the idle and no-idle
service_trees proportionally to the number of processes in each set.

Servicing all no-idle queues together should give a performance boost
on NCQ-capable drives, without compromising fairness.

Signed-off-by: Corrado Zoccolo <[email protected]>
---
block/cfq-iosched.c | 166 +++++++++++++++++++++++++++++++++++++++++++--------
1 files changed, 141 insertions(+), 25 deletions(-)

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index c6dbefc..1fe1a2c 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -127,6 +127,12 @@ enum wl_prio_t {
RT_WL = 1
};

+enum wl_type_t {
+ ASYNC_WL = 0,
+ SYNC_NOIDLE_WL = 1,
+ SYNC_WL = 2
+};
+
/*
* Per block device queue structure
*/
@@ -136,10 +142,12 @@ struct cfq_data {
/*
* rr list of queues with requests and the count of them
*/
- struct cfq_rb_root service_trees[2];
+ struct cfq_rb_root service_trees[2][3];
struct cfq_rb_root service_tree_idle;

enum wl_prio_t serving_prio;
+ enum wl_type_t serving_type;
+ unsigned long workload_expires;

/*
* Each priority tree is sorted by next_request position. These
@@ -202,10 +210,11 @@ struct cfq_data {
};

static struct cfq_rb_root *service_tree_for(enum wl_prio_t prio,
+ enum wl_type_t type,
struct cfq_data *cfqd)
{
return prio == IDLE_WL ? &cfqd->service_tree_idle :
- &cfqd->service_trees[prio];
+ &cfqd->service_trees[prio][type];
}

enum cfqq_state_flags {
@@ -252,10 +261,16 @@ CFQ_CFQQ_FNS(coop);
#define cfq_log(cfqd, fmt, args...) \
blk_add_trace_msg((cfqd)->queue, "cfq " fmt, ##args)

+#define CIC_SEEK_THR (8 * 1024)
+#define CIC_SEEKY(cic) ((cic)->seek_mean > CIC_SEEK_THR)
+#define CFQQ_SEEKY(cfqq) (!cfqq->cic || CIC_SEEKY(cfqq->cic))
+
static inline int cfq_busy_queues_wl(enum wl_prio_t wl, struct cfq_data *cfqd)
{
return wl == IDLE_WL ? cfqd->service_tree_idle.count :
- cfqd->service_trees[wl].count;
+ cfqd->service_trees[wl][ASYNC_WL].count
+ + cfqd->service_trees[wl][SYNC_NOIDLE_WL].count
+ + cfqd->service_trees[wl][SYNC_WL].count;
}

static void cfq_dispatch_insert(struct request_queue *, struct request *);
@@ -350,7 +365,7 @@ cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
unsigned slice = cfq_prio_to_slice(cfqd, cfqq);
if (cfqd->cfq_latency) {
unsigned iq = cfq_get_avg_queues(cfqd, cfq_class_rt(cfqq));
- unsigned process_thr = cfqd->cfq_target_latency
+ unsigned process_thr = cfq_target_latency
/ cfqd->cfq_slice[1];
if (iq > process_thr) {
unsigned low_slice = 2 * slice * cfqd->cfq_slice_idle
@@ -564,7 +579,11 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
rb_key += jiffies;
} else if (!add_front) {
enum wl_prio_t prio = cfq_class_rt(cfqq) ? RT_WL : BE_WL;
- service_tree = service_tree_for(prio, cfqd);
+ enum wl_type_t type = cfq_cfqq_sync(cfqq) ? SYNC_WL : ASYNC_WL;
+ if (type == SYNC_WL && (CFQQ_SEEKY(cfqq) ||
+ !cfq_cfqq_idle_window(cfqq)))
+ type = SYNC_NOIDLE_WL;
+ service_tree = service_tree_for(prio, type, cfqd);
/*
* Get our rb key offset. Subtract any residual slice
* value carried from last service. A negative resid
@@ -576,7 +595,11 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
cfqq->slice_resid = 0;
} else {
enum wl_prio_t prio = cfq_class_rt(cfqq) ? RT_WL : BE_WL;
- service_tree = service_tree_for(prio, cfqd);
+ enum wl_type_t type = cfq_cfqq_sync(cfqq) ? SYNC_WL : ASYNC_WL;
+ if (type == SYNC_WL && (CFQQ_SEEKY(cfqq) ||
+ !cfq_cfqq_idle_window(cfqq)))
+ type = SYNC_NOIDLE_WL;
+ service_tree = service_tree_for(prio, type, cfqd);
rb_key = -HZ;
__cfqq = cfq_rb_first(service_tree);
rb_key += __cfqq ? __cfqq->rb_key : jiffies;
@@ -990,7 +1013,7 @@ static inline void cfq_slice_expired(struct cfq_data *cfqd, bool timed_out)
static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd)
{
struct cfq_rb_root *service_tree =
- service_tree_for(cfqd->serving_prio, cfqd);
+ service_tree_for(cfqd->serving_prio, cfqd->serving_type, cfqd);

if (RB_EMPTY_ROOT(&service_tree->rb))
return NULL;
@@ -1022,9 +1045,6 @@ static inline sector_t cfq_dist_from_last(struct cfq_data *cfqd,
return cfqd->last_position - blk_rq_pos(rq);
}

-#define CIC_SEEK_THR 8 * 1024
-#define CIC_SEEKY(cic) ((cic)->seek_mean > CIC_SEEK_THR)
-
static inline int cfq_rq_close(struct cfq_data *cfqd, struct request *rq)
{
struct cfq_io_context *cic = cfqd->active_cic;
@@ -1128,8 +1148,14 @@ static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq)
if (cfq_cfqq_idle_window(cfqq))
return true;
{
- enum wl_prio_t prio = cfq_class_rt(cfqq) ? RT_WL : BE_WL;
- struct cfq_rb_root *service_tree = service_tree_for(prio, cfqd);
+ struct cfq_rb_root *service_tree = cfqq->service_tree;
+ if (!service_tree) {
+ enum wl_prio_t prio =
+ cfq_class_rt(cfqq) ? RT_WL : BE_WL;
+ enum wl_type_t type =
+ cfq_cfqq_sync(cfqq) ? SYNC_NOIDLE_WL : ASYNC_WL;
+ service_tree = service_tree_for(prio, type, cfqd);
+ }
return service_tree->count == 0 ||
(service_tree->count == 1
&& cfq_rb_first(service_tree) == cfqq);
@@ -1183,14 +1209,20 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd)

cfq_mark_cfqq_wait_request(cfqq);

- /*
- * we don't want to idle for seeks, but we do want to allow
- * fair distribution of slice time for a process doing back-to-back
- * seeks. so allow a little bit of time for him to submit a new rq
- */
sl = cfqd->cfq_slice_idle;
- if (sample_valid(cic->seek_samples) && CIC_SEEKY(cic))
+ /* are we servicing noidle tree, and there are more queues?
+ * non-rotational or NCQ: no idle
+ * non-NCQ rotational : very small idle, to allow
+ * fair distribution of slice time for a process doing back-to-back
+ * seeks.
+ */
+ if (cfqd->serving_type == SYNC_NOIDLE_WL &&
+ service_tree_for(cfqd->serving_prio, SYNC_NOIDLE_WL, cfqd)
+ ->count > 0) {
+ if (blk_queue_nonrot(cfqd->queue) || cfqd->hw_tag)
+ return;
sl = min(sl, msecs_to_jiffies(CFQ_MIN_TT));
+ }

mod_timer(&cfqd->idle_slice_timer, jiffies + sl);
cfq_log_cfqq(cfqd, cfqq, "arm_idle: %lu", sl);
@@ -1248,6 +1280,29 @@ cfq_prio_to_maxrq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
return 2 * (base_rq + base_rq * (CFQ_PRIO_LISTS - 1 - cfqq->ioprio));
}

+enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd, enum wl_prio_t prio)
+{
+ struct cfq_queue *queues[3];
+ int i;
+ unsigned long lowest_key = 0;
+ enum wl_type_t cur_best = SYNC_NOIDLE_WL;
+ for (i = 0; i < 3; ++i) {
+ queues[i] = cfq_rb_first(service_tree_for(prio, i, cfqd));
+ if (queues[i]) {
+ lowest_key = queues[i]->rb_key;
+ cur_best = i;
+ }
+ }
+ for (i = 0; i < 3; ++i) {
+ if (queues[i] && time_before(queues[i]->rb_key, lowest_key)) {
+ lowest_key = queues[i]->rb_key;
+ cur_best = i;
+ }
+ }
+
+ return cur_best;
+}
+
/*
* Select a queue for service. If we have a current active queue,
* check whether to continue servicing it, or retrieve and set a new one.
@@ -1298,13 +1353,67 @@ expire:
cfq_slice_expired(cfqd, 0);
new_queue:
if (!new_cfqq) {
+ enum wl_prio_t previous_prio = cfqd->serving_prio;
+
if (cfq_busy_queues_wl(RT_WL, cfqd))
cfqd->serving_prio = RT_WL;
else if (cfq_busy_queues_wl(BE_WL, cfqd))
cfqd->serving_prio = BE_WL;
- else
+ else {
cfqd->serving_prio = IDLE_WL;
+ cfqd->workload_expires = jiffies + 1;
+ }
+
+ if (cfqd->serving_prio != IDLE_WL) {
+ int counts[] = {
+ service_tree_for(cfqd->serving_prio,
+ ASYNC_WL, cfqd)->count,
+ service_tree_for(cfqd->serving_prio,
+ SYNC_NOIDLE_WL, cfqd)->count,
+ service_tree_for(cfqd->serving_prio,
+ SYNC_WL, cfqd)->count
+ };
+ int nonzero_counts = !!counts[0] + !!counts[1]
+ + !!counts[2];
+
+ if (previous_prio != cfqd->serving_prio ||
+ (nonzero_counts == 1)) {
+ cfqd->serving_type = counts[1] ? SYNC_NOIDLE_WL
+ : counts[2] ? SYNC_WL : ASYNC_WL;
+ } else {
+ if (!counts[cfqd->serving_type] ||
+ time_after(jiffies, cfqd->workload_expires))
+ cfqd->serving_type = cfq_choose_wl
+ (cfqd, cfqd->serving_prio);
+ else
+ goto same_wl;
+ }
+
+ {
+ unsigned slice = cfq_target_latency;
+ slice = slice * counts[cfqd->serving_type] /
+ max_t(unsigned, cfqd->busy_queues_avg
+ [cfqd->serving_prio],
+ counts[SYNC_WL] +
+ counts[SYNC_NOIDLE_WL] +
+ counts[ASYNC_WL]);
+
+ if (cfqd->serving_type == ASYNC_WL)
+ slice = max(1U, slice
+ * cfqd->cfq_slice[0]
+ / cfqd->cfq_slice[1]);
+ else {
+ unsigned slice_min =
+ 2 * cfqd->cfq_slice_idle;
+ slice = max(slice, slice_min);
+ slice = max_t(unsigned, slice,
+ CFQ_MIN_TT);
+ }
+ cfqd->workload_expires = jiffies + slice;
+ }
+ }
}
+ same_wl:
cfqq = cfq_set_active_queue(cfqd, new_cfqq);
keep_queue:
return cfqq;
@@ -1331,10 +1440,12 @@ static int cfq_forced_dispatch(struct cfq_data *cfqd)
{
struct cfq_queue *cfqq;
int dispatched = 0;
- int i;
+ int i, j;
for (i = 0; i < 2; ++i)
- while ((cfqq = cfq_rb_first(&cfqd->service_trees[i])) != NULL)
- dispatched += __cfq_forced_dispatch_cfqq(cfqq);
+ for (j = 0; j < 3; ++j)
+ while ((cfqq = cfq_rb_first(&cfqd->service_trees[i][j]))
+ != NULL)
+ dispatched += __cfq_forced_dispatch_cfqq(cfqq);

while ((cfqq = cfq_rb_first(&cfqd->service_tree_idle)) != NULL)
dispatched += __cfq_forced_dispatch_cfqq(cfqq);
@@ -2086,7 +2197,7 @@ cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq,
enable_idle = old_idle = cfq_cfqq_idle_window(cfqq);

if (!atomic_read(&cic->ioc->nr_tasks) || !cfqd->cfq_slice_idle ||
- (!cfqd->cfq_latency && cfqd->hw_tag && CIC_SEEKY(cic)))
+ (sample_valid(cic->seek_samples) && CIC_SEEKY(cic)))
enable_idle = 0;
else if (sample_valid(cic->ttime_samples)) {
unsigned int slice_idle = cfqd->cfq_slice_idle;
@@ -2130,6 +2241,10 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
if (cfq_class_idle(cfqq))
return true;

+ if (cfqd->serving_type == SYNC_NOIDLE_WL
+ && new_cfqq->service_tree == cfqq->service_tree)
+ return true;
+
/*
* if the new request is sync, but the currently running queue is
* not, let the sync request have priority.
@@ -2577,14 +2692,15 @@ static void cfq_exit_queue(struct elevator_queue *e)
static void *cfq_init_queue(struct request_queue *q)
{
struct cfq_data *cfqd;
- int i;
+ int i, j;

cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node);
if (!cfqd)
return NULL;

for (i = 0; i < 2; ++i)
- cfqd->service_trees[i] = CFQ_RB_ROOT;
+ for (j = 0; j < 3; ++j)
+ cfqd->service_trees[i][j] = CFQ_RB_ROOT;
cfqd->service_tree_idle = CFQ_RB_ROOT;

/*
--
1.6.2.5
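
(A minimal illustrative sketch, not part of the patch: the classification test
that the diff above open-codes twice in cfq_service_tree_add() could be read as
a single hypothetical helper, here called cfqq_workload_type().)

/*
 * Illustrative only: how a queue maps onto the three workload types
 * introduced above. The helper name is hypothetical; the patch
 * open-codes the same test at both call sites.
 */
static enum wl_type_t cfqq_workload_type(struct cfq_queue *cfqq)
{
	if (!cfq_cfqq_sync(cfqq))
		return ASYNC_WL;	/* async queues get their own tree */

	/* sync but seeky, or told not to idle: group with the no-idle set */
	if (CFQQ_SEEKY(cfqq) || !cfq_cfqq_idle_window(cfqq))
		return SYNC_NOIDLE_WL;

	return SYNC_WL;			/* sync and worth idling for */
}

With such a helper, both call sites would reduce to
service_tree_for(prio, cfqq_workload_type(cfqq), cfqd).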


2009-10-20 01:03:11

by Jens Axboe

Subject: Re: [RFC PATCH 5/5] cfq-iosched: fairness for sync no-idle queues

On Mon, Oct 19 2009, Corrado Zoccolo wrote:
> Currently no-idle queues in cfq are not serviced fairly:
> even if they can only dispatch a small number of requests at a time,
> they have to compete with idling queues to be serviced, experiencing
> large latencies.
>
> We should notice, instead, that no-idle queues are the ones that would
> benefit most from low latency; in fact they are any of:
> * processes with large think times (e.g. interactive ones like file
> managers)
> * seeky (e.g. programs faulting in their code at startup)
> * or marked as no-idle from upper levels, to improve latencies of those
> requests.
>
> This patch improves fairness and latency for those queues by:
> * separating sync idle, sync no-idle and async queues into separate
> service_trees, for each priority
> * servicing all no-idle queues together
> * idling when the last no-idle queue has been serviced, to
> anticipate more no-idle work
> * computing the timeslices allotted to the idle and no-idle
> service_trees proportionally to the number of processes in each set.
>
> Servicing all no-idle queues together should give a performance boost
> on NCQ-capable drives, without compromising fairness.

Good stuff!

> +enum wl_type_t {
> + ASYNC_WL = 0,
> + SYNC_NOIDLE_WL = 1,
> + SYNC_WL = 2
> +};

Again the _WL. WL what?

> static inline int cfq_busy_queues_wl(enum wl_prio_t wl, struct cfq_data *cfqd)
> {
> return wl == IDLE_WL ? cfqd->service_tree_idle.count :
> - cfqd->service_trees[wl].count;
> + cfqd->service_trees[wl][ASYNC_WL].count
> + + cfqd->service_trees[wl][SYNC_NOIDLE_WL].count
> + + cfqd->service_trees[wl][SYNC_WL].count;

Unreadable.
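
(For illustration only, not code from the thread: one arguably more readable
formulation would sum the three per-type trees in a loop.)

static inline int cfq_busy_queues_wl(enum wl_prio_t wl, struct cfq_data *cfqd)
{
	int type, count = 0;

	if (wl == IDLE_WL)
		return cfqd->service_tree_idle.count;

	/* sum the async, sync-noidle and sync trees for this priority */
	for (type = 0; type < 3; type++)
		count += cfqd->service_trees[wl][type].count;

	return count;
}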

> cfq_slice_expired(cfqd, 0);
> new_queue:
> if (!new_cfqq) {
> + enum wl_prio_t previous_prio = cfqd->serving_prio;
> +
> if (cfq_busy_queues_wl(RT_WL, cfqd))
> cfqd->serving_prio = RT_WL;
> else if (cfq_busy_queues_wl(BE_WL, cfqd))
> cfqd->serving_prio = BE_WL;
> - else
> + else {
> cfqd->serving_prio = IDLE_WL;
> + cfqd->workload_expires = jiffies + 1;
> + }
> +
> + if (cfqd->serving_prio != IDLE_WL) {
> + int counts[] = {
> + service_tree_for(cfqd->serving_prio,
> + ASYNC_WL, cfqd)->count,
> + service_tree_for(cfqd->serving_prio,
> + SYNC_NOIDLE_WL, cfqd)->count,
> + service_tree_for(cfqd->serving_prio,
> + SYNC_WL, cfqd)->count
> + };
> + int nonzero_counts = !!counts[0] + !!counts[1]
> + + !!counts[2];
> +
> + if (previous_prio != cfqd->serving_prio ||
> + (nonzero_counts == 1)) {
> + cfqd->serving_type = counts[1] ? SYNC_NOIDLE_WL
> + : counts[2] ? SYNC_WL : ASYNC_WL;
> + } else {
> + if (!counts[cfqd->serving_type] ||
> + time_after(jiffies, cfqd->workload_expires))
> + cfqd->serving_type = cfq_choose_wl
> + (cfqd, cfqd->serving_prio);
> + else
> + goto same_wl;
> + }
> +
> + {
> + unsigned slice = cfq_target_latency;
> + slice = slice * counts[cfqd->serving_type] /
> + max_t(unsigned, cfqd->busy_queues_avg
> + [cfqd->serving_prio],
> + counts[SYNC_WL] +
> + counts[SYNC_NOIDLE_WL] +
> + counts[ASYNC_WL]);
> +
> + if (cfqd->serving_type == ASYNC_WL)
> + slice = max(1U, slice
> + * cfqd->cfq_slice[0]
> + / cfqd->cfq_slice[1]);
> + else {
> + unsigned slice_min =
> + 2 * cfqd->cfq_slice_idle;
> + slice = max(slice, slice_min);
> + slice = max_t(unsigned, slice,
> + CFQ_MIN_TT);
> + }
> + cfqd->workload_expires = jiffies + slice;
> + }
> + }

This so badly needs comments and code cleanups it's not even funny. It's
completely unreadable.

Goes for most of this patch. Your goal is great, but please spend some
time commenting and making this code actually readable and mergeable.

Remember that reviewer time is valuable. You should provide patches that
are easily digestible. Huge chunks of unreadable code like the above
really want to go into a separate function and be nicely commented.
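
(As a purely illustrative sketch of that kind of cleanup, not code from the
thread: the slice computation in the quoted block could be pulled into a small
commented helper, here given the hypothetical name cfq_workload_slice().)

/*
 * Sketch only: how long to serve the currently selected workload type,
 * proportional to the share of busy queues it holds, as the quoted
 * block computes inline.
 */
static unsigned cfq_workload_slice(struct cfq_data *cfqd, int counts[3])
{
	unsigned busy = max_t(unsigned,
			      cfqd->busy_queues_avg[cfqd->serving_prio],
			      counts[ASYNC_WL] + counts[SYNC_NOIDLE_WL] +
			      counts[SYNC_WL]);
	unsigned slice = cfq_target_latency * counts[cfqd->serving_type] / busy;

	if (cfqd->serving_type == ASYNC_WL)
		/* async is deprioritized by the async/sync slice ratio */
		return max(1U, slice * cfqd->cfq_slice[0] / cfqd->cfq_slice[1]);

	/* sync workloads get at least a couple of idle windows */
	slice = max(slice, 2 * cfqd->cfq_slice_idle);
	return max_t(unsigned, slice, CFQ_MIN_TT);
}

The caller would then just do
cfqd->workload_expires = jiffies + cfq_workload_slice(cfqd, counts);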

--
Jens Axboe