2010-07-07 23:05:52

by Vivek Goyal

Subject: [RFC/RFT PATCH] cfq-iosched: Implement cfq group idling

Currently we idle on sequential queues and allow dispatch from a single
queue, and that can become a bottleneck on higher end storage. For example,
on my HP EVA I can run multiple sequential streams and achieve a top BW
of around 350 MB/s. But with CFQ, dispatching from a single queue does not
keep the array busy (it limits to 150-180 MB/s with 4 or 8 processes).

One approach to solve this issue is to simply use slice_idle = 0. But this
also takes away any service differentiation between groups.

This patch implements a new tunable "group_idle". This is similar to
slice_idle but it forces idling at the cfq group level rather than at the
cfq queue level. The idea is that one can run with slice_idle = 0 and
group_idle = 8, so that we don't idle on individual queues in the group
but do idle on the group, and still keep the IO controller working.
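
For reference, both knobs sit in the per-device iosched directory in sysfs,
so the proposed configuration would look roughly like this (sdf is just the
test device used below; values are in milliseconds):

  echo 0 > /sys/block/sdf/queue/iosched/slice_idle
  echo 8 > /sys/block/sdf/queue/iosched/group_idle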

Not idling on individual queues in the group means we dispatch requests from
multiple queues in the group at the same time and achieve higher throughput
on higher end storage.

I have done some testing with multiple sequential readers using fio in two
groups of weights 100 and 200. I ran 1, 2, 4 and 8 sequential readers in the
two groups. Group names are test1 and test2 and throughputs are in KB/s.
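
The setup was along these lines (a sketch rather than the exact scripts; the
cgroup mount point is illustrative, the fio parameters match the config
lines below):

  # blkio cgroup setup, weights as in the test
  mount -t cgroup -o blkio none /cgroup
  mkdir /cgroup/test1 /cgroup/test2
  echo 100 > /cgroup/test1/blkio.weight
  echo 200 > /cgroup/test2/blkio.weight

  # run the sequential readers from within a group (similarly for test2)
  echo $$ > /cgroup/test1/tasks
  fio --name=bsr --rw=read --bs=4K --size=512M \
      --directory=/mnt/iostestmnt/fio &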

Default CFQ
===========
Kernel=2.6.35-rc4-ioc+
DIR=/mnt/iostestmnt/fio DEV=/dev/sdf1
Workload=bsr iosched=cfq Filesz=512M bs=4K
group_isolation=1 slice_idle=8 group_idle=8 quantum=8
=============================================================
job Set NR test1 test2
--- --- -- --------------
bsr 1 1 61629 135182
bsr 1 2 62073 121222
bsr 1 4 51386 105694
bsr 1 8 50883 82450

Note how the total BW is really low (around 150-180 MB/s) while the array
can support up to 350 MB/s.

CFQ (slice_idle = 0)
====================
Kernel=2.6.35-rc4-ioc+
DIR=/mnt/iostestmnt/fio DEV=/dev/sdf1
Workload=bsr iosched=cfq Filesz=512M bs=4K
group_isolation=1 slice_idle=0 group_idle=8 quantum=8
=========================================================================
job Set NR test1 test2
--- --- -- --------------
bsr 1 1 62952 139499
bsr 1 2 95775 186426
bsr 1 4 115410 235632
bsr 1 8 125036 227859

With slice_idle=0, we can almost touch 350 MB/s and still get the service
differentiation.

CFQ (slice_idle = 0, group_idle = 0)
====================================
Kernel=2.6.35-rc4-ioc+
DIR=/mnt/iostestmnt/fio DEV=/dev/sdf1
Workload=bsr iosched=cfq Filesz=512M bs=4K
group_isolation=1 slice_idle=0 group_idle=0 quantum=8
=========================================================================
job Set NR test1 test2
--- --- -- --------------
bsr 1 1 155274 153544
bsr 1 2 173448 174263
bsr 1 4 177629 178458
bsr 1 8 179846 179226

With both slice_idle = 0 and group_idle = 0, it is almost like deadline.
We lose service differentiation but achieve high throughput.

Signed-off-by: Vivek Goyal <[email protected]>

---
block/cfq-iosched.c | 46 ++++++++++++++++++++++++++++++++++++++++------
1 files changed, 40 insertions(+), 6 deletions(-)

Index: linux-2.6/block/cfq-iosched.c
===================================================================
--- linux-2.6.orig/block/cfq-iosched.c
+++ linux-2.6/block/cfq-iosched.c
@@ -30,6 +30,7 @@ static const int cfq_slice_sync = HZ / 1
static int cfq_slice_async = HZ / 25;
static const int cfq_slice_async_rq = 2;
static int cfq_slice_idle = HZ / 125;
+static int cfq_group_idle = HZ / 125;
static const int cfq_target_latency = HZ * 3/10; /* 300 ms */
static const int cfq_hist_divisor = 4;

@@ -198,6 +199,8 @@ struct cfq_group {
struct hlist_node cfqd_node;
atomic_t ref;
#endif
+ /* number of requests that are on the dispatch list or inside driver */
+ int dispatched;
};

/*
@@ -271,6 +274,7 @@ struct cfq_data {
unsigned int cfq_slice[2];
unsigned int cfq_slice_async_rq;
unsigned int cfq_slice_idle;
+ unsigned int cfq_group_idle;
unsigned int cfq_latency;
unsigned int cfq_group_isolation;

@@ -1838,6 +1842,9 @@ static bool cfq_should_idle(struct cfq_d
BUG_ON(!service_tree);
BUG_ON(!service_tree->count);

+ if (!cfqd->cfq_slice_idle)
+ return false;
+
/* We never do for idle class queues. */
if (prio == IDLE_WORKLOAD)
return false;
@@ -1862,7 +1869,7 @@ static void cfq_arm_slice_timer(struct c
{
struct cfq_queue *cfqq = cfqd->active_queue;
struct cfq_io_context *cic;
- unsigned long sl;
+ unsigned long sl, group_idle = 0;

/*
* SSD device without seek penalty, disable idling. But only do so
@@ -1878,15 +1885,19 @@ static void cfq_arm_slice_timer(struct c
/*
* idle is disabled, either manually or by past process history
*/
- if (!cfqd->cfq_slice_idle || !cfq_should_idle(cfqd, cfqq))
- return;
+ if (!cfqd->cfq_slice_idle || !cfq_should_idle(cfqd, cfqq)) {
+ /* no queue idling. Check for group idling */
+ if (cfqd->cfq_group_idle)
+ group_idle = cfqd->cfq_group_idle;
+ else
+ return;
+ }

/*
* still active requests from this queue, don't idle
*/
if (cfqq->dispatched)
return;
-
/*
* task has exited, don't wait
*/
@@ -1899,7 +1910,7 @@ static void cfq_arm_slice_timer(struct c
* slice, then don't idle. This avoids overrunning the allotted
* time slice.
*/
- if (sample_valid(cic->ttime_samples) &&
+ if (!group_idle && sample_valid(cic->ttime_samples) &&
(cfqq->slice_end - jiffies < cic->ttime_mean)) {
cfq_log_cfqq(cfqd, cfqq, "Not idling. think_time:%d",
cic->ttime_mean);
@@ -1908,11 +1919,15 @@ static void cfq_arm_slice_timer(struct c

cfq_mark_cfqq_wait_request(cfqq);

- sl = cfqd->cfq_slice_idle;
+ if (group_idle)
+ sl = cfqd->cfq_group_idle;
+ else
+ sl = cfqd->cfq_slice_idle;

mod_timer(&cfqd->idle_slice_timer, jiffies + sl);
cfq_blkiocg_update_set_idle_time_stats(&cfqq->cfqg->blkg);
- cfq_log_cfqq(cfqd, cfqq, "arm_idle: %lu", sl);
+ cfq_log_cfqq(cfqd, cfqq, "arm_idle: %lu group_idle: %d", sl,
+ group_idle ? 1 : 0);
}

/*
@@ -1928,6 +1943,7 @@ static void cfq_dispatch_insert(struct r
cfqq->next_rq = cfq_find_next_rq(cfqd, cfqq, rq);
cfq_remove_request(rq);
cfqq->dispatched++;
+ (RQ_CFQG(rq))->dispatched++;
elv_dispatch_sort(q, rq);

cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]++;
@@ -2231,6 +2247,16 @@ static struct cfq_queue *cfq_select_queu
goto keep_queue;
}

+ /*
+ * If group idle is enabled and there are requests dispatched from
+ * this group, wait for requests to complete.
+ */
+ if (cfqd->cfq_group_idle && cfqq->cfqg->nr_cfqq == 1
+ && cfqq->cfqg->dispatched) {
+ cfqq = NULL;
+ goto keep_queue;
+ }
+
expire:
cfq_slice_expired(cfqd, 0);
new_queue:
@@ -3373,6 +3399,7 @@ static void cfq_completed_request(struct
WARN_ON(!cfqq->dispatched);
cfqd->rq_in_driver--;
cfqq->dispatched--;
+ (RQ_CFQG(rq))->dispatched--;
cfq_blkiocg_update_completion_stats(&cfqq->cfqg->blkg,
rq_start_time_ns(rq), rq_io_start_time_ns(rq),
rq_data_dir(rq), rq_is_sync(rq));
@@ -3847,6 +3874,7 @@ static void *cfq_init_queue(struct reque
cfqd->cfq_slice[1] = cfq_slice_sync;
cfqd->cfq_slice_async_rq = cfq_slice_async_rq;
cfqd->cfq_slice_idle = cfq_slice_idle;
+ cfqd->cfq_group_idle = cfq_group_idle;
cfqd->cfq_latency = 1;
cfqd->cfq_group_isolation = 0;
cfqd->hw_tag = -1;
@@ -3919,6 +3947,7 @@ SHOW_FUNCTION(cfq_fifo_expire_async_show
SHOW_FUNCTION(cfq_back_seek_max_show, cfqd->cfq_back_max, 0);
SHOW_FUNCTION(cfq_back_seek_penalty_show, cfqd->cfq_back_penalty, 0);
SHOW_FUNCTION(cfq_slice_idle_show, cfqd->cfq_slice_idle, 1);
+SHOW_FUNCTION(cfq_group_idle_show, cfqd->cfq_group_idle, 1);
SHOW_FUNCTION(cfq_slice_sync_show, cfqd->cfq_slice[1], 1);
SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1);
SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0);
@@ -3951,6 +3980,7 @@ STORE_FUNCTION(cfq_back_seek_max_store,
STORE_FUNCTION(cfq_back_seek_penalty_store, &cfqd->cfq_back_penalty, 1,
UINT_MAX, 0);
STORE_FUNCTION(cfq_slice_idle_store, &cfqd->cfq_slice_idle, 0, UINT_MAX, 1);
+STORE_FUNCTION(cfq_group_idle_store, &cfqd->cfq_group_idle, 0, UINT_MAX, 1);
STORE_FUNCTION(cfq_slice_sync_store, &cfqd->cfq_slice[1], 1, UINT_MAX, 1);
STORE_FUNCTION(cfq_slice_async_store, &cfqd->cfq_slice[0], 1, UINT_MAX, 1);
STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1,
@@ -3972,6 +4002,7 @@ static struct elv_fs_entry cfq_attrs[] =
CFQ_ATTR(slice_async),
CFQ_ATTR(slice_async_rq),
CFQ_ATTR(slice_idle),
+ CFQ_ATTR(group_idle),
CFQ_ATTR(low_latency),
CFQ_ATTR(group_isolation),
__ATTR_NULL
@@ -4025,6 +4056,12 @@ static int __init cfq_init(void)
if (!cfq_slice_idle)
cfq_slice_idle = 1;

+#ifdef CONFIG_CFQ_GROUP_IOSCHED
+ if (!cfq_group_idle)
+ cfq_group_idle = 1;
+#else
+ cfq_group_idle = 0;
+#endif
if (cfq_slab_setup())
return -ENOMEM;


2010-07-08 05:54:13

by Nauman Rafique

Subject: Re: [RFC/RFT PATCH] cfq-iosched: Implement cfq group idling

If group idling is enabled, wouldn't it lower the throughput on
traditional drives that handle one IO at a time?
Also, it would be useful to have the option of controlling idling on the
queue and idling on the group independently. Idling on the group would also
be useful for achieving isolation in cases where a group uses more than
one thread to send down IOs. In that case, if we idle on the group, the
group might send an IO down from another thread within the idling period.

On Wed, Jul 7, 2010 at 4:05 PM, Vivek Goyal <[email protected]> wrote:
> Currently we idle on sequential queues and allow dispatch from a single
> queue, and that can become a bottleneck on higher end storage. For example,
> on my HP EVA I can run multiple sequential streams and achieve a top BW
> of around 350 MB/s. But with CFQ, dispatching from a single queue does not
> keep the array busy (it limits to 150-180 MB/s with 4 or 8 processes).
>
> One approach to solve this issue is to simply use slice_idle = 0. But this
> also takes away any service differentiation between groups.
>
> [snip]

2010-07-08 13:13:23

by Vivek Goyal

Subject: Re: [RFC/RFT PATCH] cfq-iosched: Implement cfq group idling

On Wed, Jul 07, 2010 at 10:53:48PM -0700, Nauman Rafique wrote:
> If group idling is enabled, wouldn't it lower the throughput on
> traditional drives that handle one IO at a time?

I did not understand why group idling is bad for SATA disks.

In fact group idling will take place only if queue idling is not taking
place. Most of the time it will kick in only if slice_idle = 0. If
there are other cases where it kicks in, let's get rid of those.

I am wondering about the case of a sync-noidle queue, where queue idling
will not kick in if other sync-noidle queues are on the tree. I need to
make sure group idle also does not come into the picture there. I guess I
need to check for the only-one-queue-in-the-group condition to make sure
the group timer is not armed. I will look into it...

But the point remains that if slice_idle is not zero, group_idle will not
kick in most of the time. If group_idle kicks in once in a while, we can
look into it to see whether it is the right thing from a fairness
perspective, or just get rid of the additional group idle under certain
conditions.
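
To make that interaction concrete, the decision in cfq_arm_slice_timer()
after this patch boils down to something like the sketch below (condensed,
not the exact code; the stats update and the think time check are omitted):

/* Condensed sketch of the idling decision after this patch. */
static void cfq_arm_idle_sketch(struct cfq_data *cfqd, struct cfq_queue *cfqq)
{
	unsigned long sl, group_idle = 0;

	if (!cfqd->cfq_slice_idle || !cfq_should_idle(cfqd, cfqq)) {
		/* no queue idling; fall back to group idling if enabled */
		if (!cfqd->cfq_group_idle)
			return;
		group_idle = cfqd->cfq_group_idle;
	}

	/* queue still has requests in flight, don't idle */
	if (cfqq->dispatched)
		return;

	sl = group_idle ? cfqd->cfq_group_idle : cfqd->cfq_slice_idle;
	mod_timer(&cfqd->idle_slice_timer, jiffies + sl);
}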

Thanks
Vivek

> Also, it would be useful to have the option of controlling idling on the
> queue and idling on the group independently. Idling on the group would also
> be useful for achieving isolation in cases where a group uses more than
> one thread to send down IOs. In that case, if we idle on the group, the
> group might send an IO down from another thread within the idling period.
>
> [snip]

2010-07-08 13:40:03

by Jeff Moyer

Subject: Re: [RFC/RFT PATCH] cfq-iosched: Implement cfq group idling

Vivek Goyal <[email protected]> writes:

> Currently we idle on sequential queues and allow dispatch from a single
> queue, and that can become a bottleneck on higher end storage. For example,
> on my HP EVA I can run multiple sequential streams and achieve a top BW
> of around 350 MB/s. But with CFQ, dispatching from a single queue does not
> keep the array busy (it limits to 150-180 MB/s with 4 or 8 processes).
>
> One approach to solve this issue is to simply use slice_idle = 0. But this
> also takes away any service differentiation between groups.

That also takes away service differentiation between queues. If you
want to maintain that at all, then this is really just pushing the
problem to another layer.

This is the crux of my issues with CFQ. It works really well for SATA
disks. Once you try running it on enterprise storage, it falls flat.
Is it a design goal of CFQ to get it to run well on enterprise storage?

Jens?

Cheers,
Jeff

2010-07-08 13:56:39

by Vivek Goyal

Subject: Re: [RFC/RFT PATCH] cfq-iosched: Implement cfq group idling

On Thu, Jul 08, 2010 at 09:39:45AM -0400, Jeff Moyer wrote:
> Vivek Goyal <[email protected]> writes:
>
> > Currently we idle on sequential queues and allow dispatch from a single
> > queue, and that can become a bottleneck on higher end storage. For example,
> > on my HP EVA I can run multiple sequential streams and achieve a top BW
> > of around 350 MB/s. But with CFQ, dispatching from a single queue does not
> > keep the array busy (it limits to 150-180 MB/s with 4 or 8 processes).
> >
> > One approach to solve this issue is to simply use slice_idle = 0. But this
> > also takes away any service differentiation between groups.
>
> That also takes away service differentiation between queues. If you
> want to maintain that at all, then this is really just pushing the
> problem to another layer.
>

Yes, it does take away the IO priority within a group. But I think that's
the trade-off, and it's not the default. So those who don't require ioprio
to work within a group, and who know that they have got faster storage,
will set slice_idle=0. For the rest of the SATA users the default is still
slice_idle=8.

So I am just creating a path so that high end storage users don't suffer.
Previously they could easily switch to deadline. But because they also
want to use the IO control feature and deadline does not support that, we
need to create some paths in CFQ so that it gives deadline-like
performance but also provides the IO control facility.

Storage behavior varies so much: typically idling works very well on
single SATA disks and on software RAIDs of SATA disks, but in my testing
deadline is outperforming CFQ on HBA-based hardware RAID and storage
arrays. Now the ideal way to address this situation would be some kind of
auto-tuning where CFQ realizes the capability of the LUN it is operating
on and changes behavior automatically.

But getting auto-tuning right is hard, so I am trying to address the issue
with the help of tunables in a static manner. If an admin knows the storage
configuration, he can change the tunables statically. Once a reliable
auto-tuning infrastructure is in, we can get rid of these static paths.
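
As an illustration of that static approach (not part of this patch), the
tunables could be pinned per device with a udev rule along these lines;
the rule below is hypothetical, including the device match:

  # enterprise LUN: no queue idling, keep group idling
  ACTION=="add|change", KERNEL=="sdf", ATTR{queue/scheduler}="cfq", \
      ATTR{queue/iosched/slice_idle}="0", ATTR{queue/iosched/group_idle}="8"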

Thanks
Vivek

> This is the crux of my issues with CFQ. It works really well for SATA
> disks. Once you try running it on enterprise storage, it falls flat.
> Is it a design goal of CFQ to get it to run well on enterprise storage?
>
> Jens?
>
> Cheers,
> Jeff

2010-07-08 14:09:59

by Jeff Moyer

Subject: Re: [RFC/RFT PATCH] cfq-iosched: Implement cfq group idling

Vivek Goyal <[email protected]> writes:

> On Thu, Jul 08, 2010 at 09:39:45AM -0400, Jeff Moyer wrote:
>> Vivek Goyal <[email protected]> writes:
>>
>> > Currently we idle on sequential queues and allow dispatch from a single
>> > queue, and that can become a bottleneck on higher end storage. For example,
>> > on my HP EVA I can run multiple sequential streams and achieve a top BW
>> > of around 350 MB/s. But with CFQ, dispatching from a single queue does not
>> > keep the array busy (it limits to 150-180 MB/s with 4 or 8 processes).
>> >
>> > One approach to solve this issue is to simply use slice_idle = 0. But this
>> > also takes away any service differentiation between groups.
>>
>> That also takes away service differentiation between queues. If you
>> want to maintain that at all, then this is really just pushing the
>> problem to another layer.
>>
>
> Yes, it does take away the IO priority within a group. But I think that's
> the trade-off, and it's not the default. So those who don't require ioprio
> to work within a group, and who know that they have got faster storage,
> will set slice_idle=0. For the rest of the SATA users the default is still
> slice_idle=8.

[snip]

Sorry, Vivek, I'm actually hijacking your thread. ;-) I know what the
alternatives are; what I'm looking for is guidance on what Jens wants to
do with CFQ. We can discuss the merits of different approaches once we
agree on a set of requirements.

Cheers,
Jeff

2010-07-08 15:15:26

by Corrado Zoccolo

Subject: Re: [RFC/RFT PATCH] cfq-iosched: Implement cfq group idling

On Thu, Jul 8, 2010 at 4:08 PM, Jeff Moyer <[email protected]> wrote:
> Vivek Goyal <[email protected]> writes:
>
>> On Thu, Jul 08, 2010 at 09:39:45AM -0400, Jeff Moyer wrote:
>>> Vivek Goyal <[email protected]> writes:
>>>
>>> > Currently we idle on sequential queues and allow dispatch from a single
>>> > queue, and that can become a bottleneck on higher end storage. For example,
>>> > on my HP EVA I can run multiple sequential streams and achieve a top BW
>>> > of around 350 MB/s. But with CFQ, dispatching from a single queue does not
>>> > keep the array busy (it limits to 150-180 MB/s with 4 or 8 processes).
>>> >
>>> > One approach to solve this issue is to simply use slice_idle = 0. But this
>>> > also takes away any service differentiation between groups.
>>>
>>> That also takes away service differentiation between queues.  If you
>>> want to maintain that at all, then this is really just pushing the
>>> problem to another layer.
>>>
>>
>> Yes, it does take away the IO priority within a group. But I think that's
>> the trade-off, and it's not the default. So those who don't require ioprio
>> to work within a group, and who know that they have got faster storage,
>> will set slice_idle=0. For the rest of the SATA users the default is still
>> slice_idle=8.
>
> [snip]
>
> Sorry, Vivek, I'm actually hijacking your thread.  ;-)  I know what the
> alternatives are, what I'm looking for is guidance on what Jens wants to
> do with CFQ.  We can discuss the merits of different approaches once we
> agree on a set of requirements.
>
> Cheers,
> Jeff
>

Hi Jeff,
did you have a look at:
http://lkml.indiana.edu/hypermail/linux/kernel/1004.3/00082.html (and the
following msgs)?
We tried to achieve better throughput on multi-spindle disks by merging
queues. I think the idea is promising, but we still need a lot of work to
make it concrete.
If you want to hack on it, feel free, since I don't have the hardware to
test it.

Corrado

__________________________________________________________________________

dott. Corrado Zoccolo mailto:[email protected]
PhD - Department of Computer Science - University of Pisa, Italy
--------------------------------------------------------------------------
The self-confidence of a warrior is not the self-confidence of the average
man. The average man seeks certainty in the eyes of the onlooker and calls
that self-confidence. The warrior seeks impeccability in his own eyes and
calls that humbleness.
Tales of Power - C. Castaneda

2010-07-08 15:33:05

by Vivek Goyal

Subject: Re: [RFC/RFT PATCH] cfq-iosched: Implement cfq group idling

On Thu, Jul 08, 2010 at 09:13:05AM -0400, Vivek Goyal wrote:
> On Wed, Jul 07, 2010 at 10:53:48PM -0700, Nauman Rafique wrote:
> > If group idling is enabled, wouldn't it lower the throughput on
> > traditional drives that handle one IO at a time?
>
> I did not understand why group idling is bad for SATA disks.
>
> In fact group idling will take place only if queue idling is not taking
> place. Most of the time it will kick in only if slice_idle = 0. If
> there are other cases where it kicks in, let's get rid of those.
>
> I am wondering about the case of a sync-noidle queue, where queue idling
> will not kick in if other sync-noidle queues are on the tree. I need to
> make sure group idle also does not come into the picture there. I guess I
> need to check for the only-one-queue-in-the-group condition to make sure
> the group timer is not armed. I will look into it...
>

I tried it but can't see it happening due to various other conditions. But
yes, theoretically for sync-noidle queues a hole existed where we did
not idle on the queue but could have idled on the group.

I have put in another check to make sure we are the last queue in the group
before we arm the group idle timer.

So now there is practically no case where slice_idle is enabled and the
group idle timer kicks in. For async queues we will not even call
arm_timer. For sync-idle queues we always idle. For sync-noidle queues we
idle anyway on the last queue in the service tree.

Hence group idle will kick in only if slice_idle=0 and should not impact
any of the slow SATA storage.

Attaching the revised version of the patch.

Signed-off-by: Vivek Goyal <[email protected]>
---
block/cfq-iosched.c | 46 ++++++++++++++++++++++++++++++++++++++++------
1 files changed, 40 insertions(+), 6 deletions(-)

Index: linux-2.6/block/cfq-iosched.c
===================================================================
--- linux-2.6.orig/block/cfq-iosched.c
+++ linux-2.6/block/cfq-iosched.c
@@ -30,6 +30,7 @@ static const int cfq_slice_sync = HZ / 1
static int cfq_slice_async = HZ / 25;
static const int cfq_slice_async_rq = 2;
static int cfq_slice_idle = HZ / 125;
+static int cfq_group_idle = HZ / 125;
static const int cfq_target_latency = HZ * 3/10; /* 300 ms */
static const int cfq_hist_divisor = 4;

@@ -198,6 +199,8 @@ struct cfq_group {
struct hlist_node cfqd_node;
atomic_t ref;
#endif
+ /* number of requests that are on the dispatch list or inside driver */
+ int dispatched;
};

/*
@@ -271,6 +274,7 @@ struct cfq_data {
unsigned int cfq_slice[2];
unsigned int cfq_slice_async_rq;
unsigned int cfq_slice_idle;
+ unsigned int cfq_group_idle;
unsigned int cfq_latency;
unsigned int cfq_group_isolation;

@@ -1838,6 +1842,9 @@ static bool cfq_should_idle(struct cfq_d
BUG_ON(!service_tree);
BUG_ON(!service_tree->count);

+ if (!cfqd->cfq_slice_idle)
+ return false;
+
/* We never do for idle class queues. */
if (prio == IDLE_WORKLOAD)
return false;
@@ -1862,7 +1869,7 @@ static void cfq_arm_slice_timer(struct c
{
struct cfq_queue *cfqq = cfqd->active_queue;
struct cfq_io_context *cic;
- unsigned long sl;
+ unsigned long sl, group_idle = 0;

/*
* SSD device without seek penalty, disable idling. But only do so
@@ -1878,8 +1885,13 @@ static void cfq_arm_slice_timer(struct c
/*
* idle is disabled, either manually or by past process history
*/
- if (!cfqd->cfq_slice_idle || !cfq_should_idle(cfqd, cfqq))
- return;
+ if (!cfqd->cfq_slice_idle || !cfq_should_idle(cfqd, cfqq)) {
+ /* no queue idling. Check for group idling */
+ if (cfqd->cfq_group_idle)
+ group_idle = cfqd->cfq_group_idle;
+ else
+ return;
+ }

/*
* still active requests from this queue, don't idle
@@ -1906,13 +1918,21 @@ static void cfq_arm_slice_timer(struct c
return;
}

+ /* There are other queues in the group, don't do group idle */
+ if (cfqq->cfqg->nr_cfqq > 1)
+ return;
+
cfq_mark_cfqq_wait_request(cfqq);

- sl = cfqd->cfq_slice_idle;
+ if (group_idle)
+ sl = cfqd->cfq_group_idle;
+ else
+ sl = cfqd->cfq_slice_idle;

mod_timer(&cfqd->idle_slice_timer, jiffies + sl);
cfq_blkiocg_update_set_idle_time_stats(&cfqq->cfqg->blkg);
- cfq_log_cfqq(cfqd, cfqq, "arm_idle: %lu", sl);
+ cfq_log_cfqq(cfqd, cfqq, "arm_idle: %lu group_idle: %d", sl,
+ group_idle ? 1 : 0);
}

/*
@@ -1928,6 +1948,7 @@ static void cfq_dispatch_insert(struct r
cfqq->next_rq = cfq_find_next_rq(cfqd, cfqq, rq);
cfq_remove_request(rq);
cfqq->dispatched++;
+ (RQ_CFQG(rq))->dispatched++;
elv_dispatch_sort(q, rq);

cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]++;
@@ -2231,6 +2252,16 @@ static struct cfq_queue *cfq_select_queu
goto keep_queue;
}

+ /*
+ * If group idle is enabled and there are requests dispatched from
+ * this group, wait for requests to complete.
+ */
+ if (cfqd->cfq_group_idle && cfqq->cfqg->nr_cfqq == 1
+ && cfqq->cfqg->dispatched) {
+ cfqq = NULL;
+ goto keep_queue;
+ }
+
expire:
cfq_slice_expired(cfqd, 0);
new_queue:
@@ -3373,6 +3404,7 @@ static void cfq_completed_request(struct
WARN_ON(!cfqq->dispatched);
cfqd->rq_in_driver--;
cfqq->dispatched--;
+ (RQ_CFQG(rq))->dispatched--;
cfq_blkiocg_update_completion_stats(&cfqq->cfqg->blkg,
rq_start_time_ns(rq), rq_io_start_time_ns(rq),
rq_data_dir(rq), rq_is_sync(rq));
@@ -3847,6 +3879,7 @@ static void *cfq_init_queue(struct reque
cfqd->cfq_slice[1] = cfq_slice_sync;
cfqd->cfq_slice_async_rq = cfq_slice_async_rq;
cfqd->cfq_slice_idle = cfq_slice_idle;
+ cfqd->cfq_group_idle = cfq_group_idle;
cfqd->cfq_latency = 1;
cfqd->cfq_group_isolation = 0;
cfqd->hw_tag = -1;
@@ -3919,6 +3952,7 @@ SHOW_FUNCTION(cfq_fifo_expire_async_show
SHOW_FUNCTION(cfq_back_seek_max_show, cfqd->cfq_back_max, 0);
SHOW_FUNCTION(cfq_back_seek_penalty_show, cfqd->cfq_back_penalty, 0);
SHOW_FUNCTION(cfq_slice_idle_show, cfqd->cfq_slice_idle, 1);
+SHOW_FUNCTION(cfq_group_idle_show, cfqd->cfq_group_idle, 1);
SHOW_FUNCTION(cfq_slice_sync_show, cfqd->cfq_slice[1], 1);
SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1);
SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0);
@@ -3951,6 +3985,7 @@ STORE_FUNCTION(cfq_back_seek_max_store,
STORE_FUNCTION(cfq_back_seek_penalty_store, &cfqd->cfq_back_penalty, 1,
UINT_MAX, 0);
STORE_FUNCTION(cfq_slice_idle_store, &cfqd->cfq_slice_idle, 0, UINT_MAX, 1);
+STORE_FUNCTION(cfq_group_idle_store, &cfqd->cfq_group_idle, 0, UINT_MAX, 1);
STORE_FUNCTION(cfq_slice_sync_store, &cfqd->cfq_slice[1], 1, UINT_MAX, 1);
STORE_FUNCTION(cfq_slice_async_store, &cfqd->cfq_slice[0], 1, UINT_MAX, 1);
STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1,
@@ -3972,6 +4007,7 @@ static struct elv_fs_entry cfq_attrs[] =
CFQ_ATTR(slice_async),
CFQ_ATTR(slice_async_rq),
CFQ_ATTR(slice_idle),
+ CFQ_ATTR(group_idle),
CFQ_ATTR(low_latency),
CFQ_ATTR(group_isolation),
__ATTR_NULL
@@ -4025,6 +4061,12 @@ static int __init cfq_init(void)
if (!cfq_slice_idle)
cfq_slice_idle = 1;

+#ifdef CONFIG_CFQ_GROUP_IOSCHED
+ if (!cfq_group_idle)
+ cfq_group_idle = 1;
+#else
+ cfq_group_idle = 0;
+#endif
if (cfq_slab_setup())
return -ENOMEM;

2010-07-08 22:53:16

by Vivek Goyal

Subject: Re: [RFC/RFT PATCH] cfq-iosched: Implement cfq group idling

On Thu, Jul 08, 2010 at 11:32:34AM -0400, Vivek Goyal wrote:
> On Thu, Jul 08, 2010 at 09:13:05AM -0400, Vivek Goyal wrote:
> > On Wed, Jul 07, 2010 at 10:53:48PM -0700, Nauman Rafique wrote:
> > > If group idling is enabled, wouldn't it lower the throughput on
> > > traditional drives that handle one IO at a time?
> >
> > I did not understand why group idling is bad for SATA disks.
> >
> > In fact group idling will take place only if queue idling is not taking
> > place. Most of the time it will kick in only if slice_idle = 0. If
> > there are other cases where it kicks in, let's get rid of those.
> >
> > I am wondering about the case of a sync-noidle queue, where queue idling
> > will not kick in if other sync-noidle queues are on the tree. I need to
> > make sure group idle also does not come into the picture there. I guess I
> > need to check for the only-one-queue-in-the-group condition to make sure
> > the group timer is not armed. I will look into it...
> >
>
> I tried it but can't see it happening due to various other conditions. But
> yes, theoretically for sync-noidle queues a hole existed where we did
> not idle on the queue but could have idled on the group.
>
> I have put in another check to make sure we are the last queue in the group
> before we arm the group idle timer.
>
> So now there is practically no case where slice_idle is enabled and the
> group idle timer kicks in. For async queues we will not even call
> arm_timer. For sync-idle queues we always idle. For sync-noidle queues we
> idle anyway on the last queue in the service tree.
>
> Hence group idle will kick in only if slice_idle=0 and should not impact
> any of the slow SATA storage.
>
> Attaching the revised version of the patch.

This version had a bug. While checking for the last queue in the group,
I also need to check that I am doing group idle; otherwise normal cfq
queue idling also gets disabled. This new patch fixes it.

By not idling on the queues, we switch between queues across groups very
nicely. That way the request queue gets requests from multiple groups at
the same time, and latencies for groups are low. Of course all this is
true only for HBA hardware RAID and storage arrays.

What we lose in the process is the capability to measure time accurately,
but I guess that is a secondary thing on high end storage as long as we
get decent service differentiation between groups while maintaining good
throughput.

Signed-off-by: Vivek Goyal <[email protected]>
---
block/cfq-iosched.c | 46 ++++++++++++++++++++++++++++++++++++++++------
1 files changed, 40 insertions(+), 6 deletions(-)

Index: linux-2.6/block/cfq-iosched.c
===================================================================
--- linux-2.6.orig/block/cfq-iosched.c
+++ linux-2.6/block/cfq-iosched.c
@@ -30,6 +30,7 @@ static const int cfq_slice_sync = HZ / 1
static int cfq_slice_async = HZ / 25;
static const int cfq_slice_async_rq = 2;
static int cfq_slice_idle = HZ / 125;
+static int cfq_group_idle = HZ / 125;
static const int cfq_target_latency = HZ * 3/10; /* 300 ms */
static const int cfq_hist_divisor = 4;

@@ -198,6 +199,8 @@ struct cfq_group {
struct hlist_node cfqd_node;
atomic_t ref;
#endif
+ /* number of requests that are on the dispatch list or inside driver */
+ int dispatched;
};

/*
@@ -271,6 +274,7 @@ struct cfq_data {
unsigned int cfq_slice[2];
unsigned int cfq_slice_async_rq;
unsigned int cfq_slice_idle;
+ unsigned int cfq_group_idle;
unsigned int cfq_latency;
unsigned int cfq_group_isolation;

@@ -1838,6 +1842,9 @@ static bool cfq_should_idle(struct cfq_d
BUG_ON(!service_tree);
BUG_ON(!service_tree->count);

+ if (!cfqd->cfq_slice_idle)
+ return false;
+
/* We never do for idle class queues. */
if (prio == IDLE_WORKLOAD)
return false;
@@ -1862,7 +1869,7 @@ static void cfq_arm_slice_timer(struct c
{
struct cfq_queue *cfqq = cfqd->active_queue;
struct cfq_io_context *cic;
- unsigned long sl;
+ unsigned long sl, group_idle = 0;

/*
* SSD device without seek penalty, disable idling. But only do so
@@ -1878,8 +1885,13 @@ static void cfq_arm_slice_timer(struct c
/*
* idle is disabled, either manually or by past process history
*/
- if (!cfqd->cfq_slice_idle || !cfq_should_idle(cfqd, cfqq))
- return;
+ if (!cfqd->cfq_slice_idle || !cfq_should_idle(cfqd, cfqq)) {
+ /* no queue idling. Check for group idling */
+ if (cfqd->cfq_group_idle)
+ group_idle = cfqd->cfq_group_idle;
+ else
+ return;
+ }

/*
* still active requests from this queue, don't idle
@@ -1906,13 +1918,21 @@ static void cfq_arm_slice_timer(struct c
return;
}

+ /* There are other queues in the group, don't do group idle */
+ if (group_idle && cfqq->cfqg->nr_cfqq > 1)
+ return;
+
cfq_mark_cfqq_wait_request(cfqq);

- sl = cfqd->cfq_slice_idle;
+ if (group_idle)
+ sl = cfqd->cfq_group_idle;
+ else
+ sl = cfqd->cfq_slice_idle;

mod_timer(&cfqd->idle_slice_timer, jiffies + sl);
cfq_blkiocg_update_set_idle_time_stats(&cfqq->cfqg->blkg);
- cfq_log_cfqq(cfqd, cfqq, "arm_idle: %lu", sl);
+ cfq_log_cfqq(cfqd, cfqq, "arm_idle: %lu group_idle: %d", sl,
+ group_idle ? 1 : 0);
}

/*
@@ -1928,6 +1948,7 @@ static void cfq_dispatch_insert(struct r
cfqq->next_rq = cfq_find_next_rq(cfqd, cfqq, rq);
cfq_remove_request(rq);
cfqq->dispatched++;
+ (RQ_CFQG(rq))->dispatched++;
elv_dispatch_sort(q, rq);

cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]++;
@@ -2231,6 +2252,16 @@ static struct cfq_queue *cfq_select_queu
goto keep_queue;
}

+ /*
+ * If group idle is enabled and there are requests dispatched from
+ * this group, wait for requests to complete.
+ */
+ if (cfqd->cfq_group_idle && cfqq->cfqg->nr_cfqq == 1
+ && cfqq->cfqg->dispatched) {
+ cfqq = NULL;
+ goto keep_queue;
+ }
+
expire:
cfq_slice_expired(cfqd, 0);
new_queue:
@@ -3373,6 +3404,7 @@ static void cfq_completed_request(struct
WARN_ON(!cfqq->dispatched);
cfqd->rq_in_driver--;
cfqq->dispatched--;
+ (RQ_CFQG(rq))->dispatched--;
cfq_blkiocg_update_completion_stats(&cfqq->cfqg->blkg,
rq_start_time_ns(rq), rq_io_start_time_ns(rq),
rq_data_dir(rq), rq_is_sync(rq));
@@ -3847,6 +3879,7 @@ static void *cfq_init_queue(struct reque
cfqd->cfq_slice[1] = cfq_slice_sync;
cfqd->cfq_slice_async_rq = cfq_slice_async_rq;
cfqd->cfq_slice_idle = cfq_slice_idle;
+ cfqd->cfq_group_idle = cfq_group_idle;
cfqd->cfq_latency = 1;
cfqd->cfq_group_isolation = 0;
cfqd->hw_tag = -1;
@@ -3919,6 +3952,7 @@ SHOW_FUNCTION(cfq_fifo_expire_async_show
SHOW_FUNCTION(cfq_back_seek_max_show, cfqd->cfq_back_max, 0);
SHOW_FUNCTION(cfq_back_seek_penalty_show, cfqd->cfq_back_penalty, 0);
SHOW_FUNCTION(cfq_slice_idle_show, cfqd->cfq_slice_idle, 1);
+SHOW_FUNCTION(cfq_group_idle_show, cfqd->cfq_group_idle, 1);
SHOW_FUNCTION(cfq_slice_sync_show, cfqd->cfq_slice[1], 1);
SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1);
SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0);
@@ -3951,6 +3985,7 @@ STORE_FUNCTION(cfq_back_seek_max_store,
STORE_FUNCTION(cfq_back_seek_penalty_store, &cfqd->cfq_back_penalty, 1,
UINT_MAX, 0);
STORE_FUNCTION(cfq_slice_idle_store, &cfqd->cfq_slice_idle, 0, UINT_MAX, 1);
+STORE_FUNCTION(cfq_group_idle_store, &cfqd->cfq_group_idle, 0, UINT_MAX, 1);
STORE_FUNCTION(cfq_slice_sync_store, &cfqd->cfq_slice[1], 1, UINT_MAX, 1);
STORE_FUNCTION(cfq_slice_async_store, &cfqd->cfq_slice[0], 1, UINT_MAX, 1);
STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1,
@@ -3972,6 +4007,7 @@ static struct elv_fs_entry cfq_attrs[] =
CFQ_ATTR(slice_async),
CFQ_ATTR(slice_async_rq),
CFQ_ATTR(slice_idle),
+ CFQ_ATTR(group_idle),
CFQ_ATTR(low_latency),
CFQ_ATTR(group_isolation),
__ATTR_NULL
@@ -4025,6 +4061,12 @@ static int __init cfq_init(void)
if (!cfq_slice_idle)
cfq_slice_idle = 1;

+#ifdef CONFIG_CFQ_GROUP_IOSCHED
+ if (!cfq_group_idle)
+ cfq_group_idle = 1;
+#else
+ cfq_group_idle = 0;
+#endif
if (cfq_slab_setup())
return -ENOMEM;