2011-02-10 07:47:26

by Gui, Jianfeng/归 剑峰

[permalink] [raw]
Subject: [PATCH 3/6 v4] cfq-iosched: Introduce vdisktime and io weight for CFQ queue

Introduce vdisktime and io weight for CFQ queue scheduling. Currently, io priority
maps to a range [100,1000]. It also gets rid of the cfq_slice_offset() logic and makes
use of the same scheduling algorithm as the CFQ group does. This helps with CFQ queue and
group scheduling on the same service tree.

Signed-off-by: Gui Jianfeng <[email protected]>
---
block/cfq-iosched.c | 219 +++++++++++++++++++++++++++++++++++++++------------
1 files changed, 167 insertions(+), 52 deletions(-)

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index f3a126e..41cef2e 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -39,6 +39,13 @@ static const int cfq_hist_divisor = 4;
*/
#define CFQ_IDLE_DELAY (HZ / 5)

+/*
+ * The base boosting value.
+ */
+#define CFQ_BOOST_SYNC_BASE (HZ / 10)
+#define CFQ_BOOST_ASYNC_BASE (HZ / 25)
+
+
/*
* below this threshold, we consider thinktime immediate
*/
@@ -99,10 +106,7 @@ struct cfq_entity {
struct cfq_rb_root *service_tree;
/* service_tree member */
struct rb_node rb_node;
- /* service_tree key, represent the position on the tree */
- unsigned long rb_key;
-
- /* group service_tree key */
+ /* service_tree key */
u64 vdisktime;
bool is_group_entity;
unsigned int weight;
@@ -114,6 +118,8 @@ struct cfq_entity {
struct cfq_queue {
/* The schedule entity */
struct cfq_entity cfqe;
+ /* Reposition time */
+ unsigned long reposition_time;
/* reference count */
int ref;
/* various state flags, see below */
@@ -312,6 +318,24 @@ struct cfq_data {
struct rcu_head rcu;
};

+/*
+ * Map io priority(7 ~ 0) to io weight(100 ~ 1000) as follows
+ * prio 0 1 2 3 4 5 6 7
+ * weight 1000 868 740 612 484 356 228 100
+ */
+static inline unsigned int cfq_prio_to_weight(unsigned short ioprio)
+{
+ unsigned int step;
+
+ BUG_ON(ioprio >= IOPRIO_BE_NR);
+
+ step = (BLKIO_WEIGHT_MAX - BLKIO_WEIGHT_MIN) / (IOPRIO_BE_NR - 1);
+ if (ioprio == 0)
+ return BLKIO_WEIGHT_MAX;
+
+ return BLKIO_WEIGHT_MIN + (IOPRIO_BE_NR - ioprio - 1) * step;
+}
+
static inline struct cfq_queue *
cfqq_of_entity(struct cfq_entity *cfqe)
{
@@ -840,16 +864,6 @@ cfq_find_next_rq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
return cfq_choose_req(cfqd, next, prev, blk_rq_pos(last));
}

-static unsigned long cfq_slice_offset(struct cfq_data *cfqd,
- struct cfq_queue *cfqq)
-{
- /*
- * just an approximation, should be ok.
- */
- return (cfqq->cfqg->nr_cfqq - 1) * (cfq_prio_slice(cfqd, 1, 0) -
- cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio));
-}
-
static inline s64
entity_key(struct cfq_rb_root *st, struct cfq_entity *entity)
{
@@ -1199,6 +1213,21 @@ static inline void cfq_put_cfqg(struct cfq_group *cfqg) {}

#endif /* GROUP_IOSCHED */

+static inline u64 cfq_get_boost(struct cfq_data *cfqd,
+ struct cfq_queue *cfqq)
+{
+ u64 d;
+
+ if (cfq_cfqq_sync(cfqq))
+ d = CFQ_BOOST_SYNC_BASE << CFQ_SERVICE_SHIFT;
+ else
+ d = CFQ_BOOST_ASYNC_BASE << CFQ_SERVICE_SHIFT;
+
+ d = d * BLKIO_WEIGHT_DEFAULT;
+ do_div(d, cfqq->cfqe.weight);
+ return d;
+}
+
/*
* The cfqd->service_trees holds all pending cfq_queue's that have
* requests waiting to be processed. It is sorted in the order that
@@ -1210,13 +1239,14 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
struct cfq_entity *cfqe;
struct rb_node **p, *parent;
struct cfq_entity *__cfqe;
- unsigned long rb_key;
- struct cfq_rb_root *service_tree;
+ struct cfq_rb_root *service_tree, *orig_st;
int left;
int new_cfqq = 1;
int group_changed = 0;
+ s64 key;

cfqe = &cfqq->cfqe;
+ orig_st = cfqe->service_tree;

#ifdef CONFIG_CFQ_GROUP_IOSCHED
if (!cfqd->cfq_group_isolation
@@ -1224,8 +1254,15 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
&& cfqq->cfqg && cfqq->cfqg != &cfqd->root_group) {
/* Move this cfq to root group */
cfq_log_cfqq(cfqd, cfqq, "moving to root group");
- if (!RB_EMPTY_NODE(&cfqe->rb_node))
+ if (!RB_EMPTY_NODE(&cfqe->rb_node)) {
cfq_group_service_tree_del(cfqd, cfqq->cfqg);
+ /*
+ * Group changed, dequeue this CFQ queue from the
+ * original service tree.
+ */
+ cfq_rb_erase(&cfqe->rb_node, cfqe->service_tree);
+ orig_st->total_weight -= cfqe->weight;
+ }
cfqq->orig_cfqg = cfqq->cfqg;
cfqq->cfqg = &cfqd->root_group;
cfqd->root_group.ref++;
@@ -1234,8 +1271,15 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
&& cfqq_type(cfqq) == SYNC_WORKLOAD && cfqq->orig_cfqg) {
/* cfqq is sequential now needs to go to its original group */
BUG_ON(cfqq->cfqg != &cfqd->root_group);
- if (!RB_EMPTY_NODE(&cfqe->rb_node))
+ if (!RB_EMPTY_NODE(&cfqe->rb_node)) {
cfq_group_service_tree_del(cfqd, cfqq->cfqg);
+ /*
+ * Group changed, dequeue this CFQ queue from the
+ * original service tree.
+ */
+ cfq_rb_erase(&cfqe->rb_node, cfqe->service_tree);
+ orig_st->total_weight -= cfqe->weight;
+ }
cfq_put_cfqg(cfqq->cfqg);
cfqq->cfqg = cfqq->orig_cfqg;
cfqq->orig_cfqg = NULL;
@@ -1246,47 +1290,68 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,

service_tree = service_tree_for(cfqq->cfqg, cfqq_prio(cfqq),
cfqq_type(cfqq));
+ if (RB_EMPTY_NODE(&cfqe->rb_node)) {
+ /*
+ * If this CFQ queue moves to another group, the original
+ * vdisktime makes no sense any more, reset the vdisktime
+ * here.
+ */
+ parent = rb_last(&service_tree->rb);
+ if (parent) {
+ u64 pos_offset;
+
+ /*
+ * Estimate the position according to its weight and
+ * ioprio.
+ */
+ pos_offset = cfq_get_boost(cfqd, cfqq);
+ /* Debug purpose, should remove. */
+ cfq_log_cfqq(cfqd, cfqq, "pos_offset: %llu",
+ pos_offset);
+ cfqe->vdisktime = service_tree->min_vdisktime +
+ pos_offset;
+ } else
+ cfqe->vdisktime = service_tree->min_vdisktime;
+
+ goto insert;
+ }
+
+ /*
+ * Ok, we get here, this CFQ queue is on the service tree, dequeue it
+ * firstly.
+ */
+ cfq_rb_erase(&cfqe->rb_node, cfqe->service_tree);
+ orig_st->total_weight -= cfqe->weight;
+
+ new_cfqq = 0;
+
if (cfq_class_idle(cfqq)) {
- rb_key = CFQ_IDLE_DELAY;
parent = rb_last(&service_tree->rb);
if (parent && parent != &cfqe->rb_node) {
__cfqe = rb_entry(parent, struct cfq_entity, rb_node);
- rb_key += __cfqe->rb_key;
+ cfqe->vdisktime = __cfqe->vdisktime + CFQ_IDLE_DELAY;
} else
- rb_key += jiffies;
+ cfqe->vdisktime = service_tree->min_vdisktime;
} else if (!add_front) {
/*
- * Get our rb key offset. Subtract any residual slice
- * value carried from last service. A negative resid
- * count indicates slice overrun, and this should position
- * the next service time further away in the tree.
+ * We charge the CFQ queue by the time this queue runs, and
+ * reposition it on the service tree.
*/
- rb_key = cfq_slice_offset(cfqd, cfqq) + jiffies;
- rb_key -= cfqq->slice_resid;
+ unsigned int used_sl;
+
+ used_sl = cfq_cfqq_slice_usage(cfqq);
+ cfqe->vdisktime += cfq_scale_slice(used_sl, cfqe);
cfqq->slice_resid = 0;
} else {
- rb_key = -HZ;
- __cfqe = cfq_rb_first(service_tree);
- rb_key += __cfqe ? __cfqe->rb_key : jiffies;
- }
-
- if (!RB_EMPTY_NODE(&cfqe->rb_node)) {
- new_cfqq = 0;
- /*
- * same position, nothing more to do
- */
- if (rb_key == cfqe->rb_key &&
- cfqe->service_tree == service_tree)
- return;
-
- cfq_rb_erase(&cfqe->rb_node, cfqe->service_tree);
- cfqe->service_tree = NULL;
+ cfqe->vdisktime = service_tree->min_vdisktime;
}

+insert:
left = 1;
parent = NULL;
cfqe->service_tree = service_tree;
p = &service_tree->rb.rb_node;
+ key = entity_key(service_tree, cfqe);
while (*p) {
struct rb_node **n;

@@ -1296,7 +1361,7 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
/*
* sort by key, that represents service time.
*/
- if (time_before(rb_key, __cfqe->rb_key))
+ if (key < entity_key(service_tree, __cfqe))
n = &(*p)->rb_left;
else {
n = &(*p)->rb_right;
@@ -1309,10 +1374,12 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
if (left)
service_tree->left = &cfqe->rb_node;

- cfqe->rb_key = rb_key;
rb_link_node(&cfqe->rb_node, parent, p);
rb_insert_color(&cfqe->rb_node, &service_tree->rb);
+ update_min_vdisktime(service_tree);
service_tree->count++;
+ service_tree->total_weight += cfqe->weight;
+ cfqq->reposition_time = jiffies;
if ((add_front || !new_cfqq) && !group_changed)
return;
cfq_group_service_tree_add(cfqd, cfqq->cfqg);
@@ -1414,14 +1481,18 @@ static void cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
{
struct cfq_entity *cfqe;
+ struct cfq_rb_root *service_tree;
+
cfq_log_cfqq(cfqd, cfqq, "del_from_rr");
BUG_ON(!cfq_cfqq_on_rr(cfqq));
cfq_clear_cfqq_on_rr(cfqq);

cfqe = &cfqq->cfqe;
+ service_tree = cfqe->service_tree;

if (!RB_EMPTY_NODE(&cfqe->rb_node)) {
cfq_rb_erase(&cfqe->rb_node, cfqe->service_tree);
+ service_tree->total_weight -= cfqe->weight;
cfqe->service_tree = NULL;
}
if (cfqq->p_root) {
@@ -2120,23 +2191,36 @@ static void cfq_setup_merge(struct cfq_queue *cfqq, struct cfq_queue *new_cfqq)
}
}

+/*
+ * The time when a CFQ queue is put onto a service tree is recorded in
+ * cfqq->reposition_time. Currently, we check the first priority CFQ queues
+ * on each service tree, and select the workload type that contains the lowest
+ * reposition_time CFQ queue among them.
+ */
static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd,
struct cfq_group *cfqg, enum wl_prio_t prio)
{
struct cfq_entity *cfqe;
+ struct cfq_queue *cfqq;
+ unsigned long lowest_start_time;
int i;
- bool key_valid = false;
- unsigned long lowest_key = 0;
+ bool time_valid = false;
enum wl_type_t cur_best = SYNC_NOIDLE_WORKLOAD;

+ /*
+ * TODO: We may take io priority and io class into account when
+ * choosing a workload type. But for the time being just make use of
+ * reposition_time only.
+ */
for (i = 0; i <= SYNC_WORKLOAD; ++i) {
- /* select the one with lowest rb_key */
cfqe = cfq_rb_first(service_tree_for(cfqg, prio, i));
- if (cfqe &&
- (!key_valid || time_before(cfqe->rb_key, lowest_key))) {
- lowest_key = cfqe->rb_key;
+ cfqq = cfqq_of_entity(cfqe);
+ if (cfqe && (!time_valid ||
+ time_before(cfqq->reposition_time,
+ lowest_start_time))) {
+ lowest_start_time = cfqq->reposition_time;
cur_best = i;
- key_valid = true;
+ time_valid = true;
}
}

@@ -2808,10 +2892,13 @@ static void cfq_init_prio_data(struct cfq_queue *cfqq, struct io_context *ioc)
{
struct task_struct *tsk = current;
int ioprio_class;
+ struct cfq_entity *cfqe;

if (!cfq_cfqq_prio_changed(cfqq))
return;

+ cfqe = &cfqq->cfqe;
+
ioprio_class = IOPRIO_PRIO_CLASS(ioc->ioprio);
switch (ioprio_class) {
default:
@@ -2838,6 +2925,17 @@ static void cfq_init_prio_data(struct cfq_queue *cfqq, struct io_context *ioc)
break;
}

+ if (!RB_EMPTY_NODE(&cfqe->rb_node)) {
+ /*
+ * If this CFQ entity is already on service tree, we need to
+ * adjust service tree's total weight accordingly.
+ */
+ cfqe->service_tree->total_weight -= cfqe->weight;
+ cfqe->weight = cfq_prio_to_weight(cfqq->ioprio);
+ cfqe->service_tree->total_weight += cfqe->weight;
+ } else
+ cfqe->weight = cfq_prio_to_weight(cfqq->ioprio);
+
/*
* keep track of original prio settings in case we have to temporarily
* elevate the priority of this queue
@@ -3572,6 +3670,9 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq)
*/
static void cfq_prio_boost(struct cfq_queue *cfqq)
{
+ struct cfq_entity *cfqe;
+
+ cfqe = &cfqq->cfqe;
if (has_fs_excl()) {
/*
* boost idle prio on transactions that would lock out other
@@ -3588,6 +3689,20 @@ static void cfq_prio_boost(struct cfq_queue *cfqq)
cfqq->ioprio_class = cfqq->org_ioprio_class;
cfqq->ioprio = cfqq->org_ioprio;
}
+
+ /*
+ * update the io weight if io priority gets changed.
+ */
+ if (!RB_EMPTY_NODE(&cfqe->rb_node)) {
+ /*
+ * If this CFQ entity is already on service tree, we need to
+ * adjust service tree's total weight accordingly.
+ */
+ cfqe->service_tree->total_weight -= cfqe->weight;
+ cfqe->weight = cfq_prio_to_weight(cfqq->ioprio);
+ cfqe->service_tree->total_weight += cfqe->weight;
+ } else
+ cfqe->weight = cfq_prio_to_weight(cfqq->ioprio);
}

static inline int __cfq_may_queue(struct cfq_queue *cfqq)
--
1.7.1




2011-02-10 19:29:49

by Vivek Goyal

[permalink] [raw]
Subject: Re: [PATCH 3/6 v4] cfq-iosched: Introduce vdisktime and io weight for CFQ queue

On Thu, Feb 10, 2011 at 03:47:16PM +0800, Gui Jianfeng wrote:
> Introduce vdisktime and io weight for CFQ queue scheduling. Currently, io priority
> maps to a range [100,1000]. It also gets rid of cfq_slice_offset() logic and makes
> use the same scheduling algorithm as CFQ group does. This helps for CFQ queue and
> group scheduling on the same service tree.
>
> Signed-off-by: Gui Jianfeng <[email protected]>
> ---
> block/cfq-iosched.c | 219 +++++++++++++++++++++++++++++++++++++++------------
> 1 files changed, 167 insertions(+), 52 deletions(-)
>
> diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
> index f3a126e..41cef2e 100644
> --- a/block/cfq-iosched.c
> +++ b/block/cfq-iosched.c
> @@ -39,6 +39,13 @@ static const int cfq_hist_divisor = 4;
> */
> #define CFQ_IDLE_DELAY (HZ / 5)
>
> +/*
> + * The base boosting value.
> + */
> +#define CFQ_BOOST_SYNC_BASE (HZ / 10)
> +#define CFQ_BOOST_ASYNC_BASE (HZ / 25)
> +

These are the same as cfq_slice_sync and cfq_slice_async. Looking at the
boost logic, this is the equivalent of starting a new queue/group as
if it is being requeued after consuming a full slice. So maybe we can divide
it by some const number, say 4 or something like that. This is a minor
point though, as this algorithm will kind of evolve and we will learn
what works best.

Secondly, I think the SYNC vs ASYNC logic seems to be reversed.
We would like to give ASYNC queues a higher boost (put these farther in
the tree) and a lesser boost to SYNC queues. It looks like the above
constants will do the reverse?


[..]
> + if (RB_EMPTY_NODE(&cfqe->rb_node)) {
> + /*
> + * If this CFQ queue moves to another group, the original
> + * vdisktime makes no sense any more, reset the vdisktime
> + * here.
> + */
> + parent = rb_last(&service_tree->rb);
> + if (parent) {
> + u64 pos_offset;
> +
> + /*
> + * Estimate the position according to its weight and
> + * ioprio.
> + */
> + pos_offset = cfq_get_boost(cfqd, cfqq);
> + /* Debug purpose, should remove. */
> + cfq_log_cfqq(cfqd, cfqq, "pos_offset: %llu",
> + pos_offset);

You wanted to get rid of above debugging comment?

Thanks
Vivek

2011-02-12 01:21:04

by Gui, Jianfeng/归 剑峰

[permalink] [raw]
Subject: Re: [PATCH 3/6 v4] cfq-iosched: Introduce vdisktime and io weight for CFQ queue

Vivek Goyal wrote:
> On Thu, Feb 10, 2011 at 03:47:16PM +0800, Gui Jianfeng wrote:
>> Introduce vdisktime and io weight for CFQ queue scheduling. Currently, io priority
>> maps to a range [100,1000]. It also gets rid of cfq_slice_offset() logic and makes
>> use the same scheduling algorithm as CFQ group does. This helps for CFQ queue and
>> group scheduling on the same service tree.
>>
>> Signed-off-by: Gui Jianfeng <[email protected]>
>> ---
>> block/cfq-iosched.c | 219 +++++++++++++++++++++++++++++++++++++++------------
>> 1 files changed, 167 insertions(+), 52 deletions(-)
>>
>> diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
>> index f3a126e..41cef2e 100644
>> --- a/block/cfq-iosched.c
>> +++ b/block/cfq-iosched.c
>> @@ -39,6 +39,13 @@ static const int cfq_hist_divisor = 4;
>> */
>> #define CFQ_IDLE_DELAY (HZ / 5)
>>
>> +/*
>> + * The base boosting value.
>> + */
>> +#define CFQ_BOOST_SYNC_BASE (HZ / 10)
>> +#define CFQ_BOOST_ASYNC_BASE (HZ / 25)
>> +
>
> These are same as cfq_slice_sync and cfq_slice_async. Looking at
> boost logic, this is equivalent of starting a new queue/group as
> if it is being requeued after conuming a full slice. So may be we can divide
> it by some const number say 4 or something like that. This is a minor
> point though as this algorimthm will kind of evolve and we will learn
> what works best.
>
> Secondly, I think you wanted to SYNC vs ASYNC logic seem to be reversed.
> We would like to give ASYNC queues higher boost (Put these farther in
> tree) and lesser boost to SYNC queues. Looks like above constants will
> do the reverse?

Hi Vivek,

Currently, SYNC and ASYNC queues are on different service trees, so they don't
impact each other. Here, I really want to use this logic.

Thanks,
Gui

>
>
> [..]
>> + if (RB_EMPTY_NODE(&cfqe->rb_node)) {
>> + /*
>> + * If this CFQ queue moves to another group, the original
>> + * vdisktime makes no sense any more, reset the vdisktime
>> + * here.
>> + */
>> + parent = rb_last(&service_tree->rb);
>> + if (parent) {
>> + u64 pos_offset;
>> +
>> + /*
>> + * Estimate the position according to its weight and
>> + * ioprio.
>> + */
>> + pos_offset = cfq_get_boost(cfqd, cfqq);
>> + /* Debug purpose, should remove. */
>> + cfq_log_cfqq(cfqd, cfqq, "pos_offset: %llu",
>> + pos_offset);
>
> You wanted to get rid of above debugging comment?
>
> Thanks
> Vivek
>

--
Regards
Gui Jianfeng

2011-02-14 16:58:18

by Vivek Goyal

[permalink] [raw]
Subject: Re: [PATCH 3/6 v4] cfq-iosched: Introduce vdisktime and io weight for CFQ queue

On Sat, Feb 12, 2011 at 09:20:58AM +0800, Gui Jianfeng wrote:
> Vivek Goyal wrote:
> > On Thu, Feb 10, 2011 at 03:47:16PM +0800, Gui Jianfeng wrote:
> >> Introduce vdisktime and io weight for CFQ queue scheduling. Currently, io priority
> >> maps to a range [100,1000]. It also gets rid of cfq_slice_offset() logic and makes
> >> use the same scheduling algorithm as CFQ group does. This helps for CFQ queue and
> >> group scheduling on the same service tree.
> >>
> >> Signed-off-by: Gui Jianfeng <[email protected]>
> >> ---
> >> block/cfq-iosched.c | 219 +++++++++++++++++++++++++++++++++++++++------------
> >> 1 files changed, 167 insertions(+), 52 deletions(-)
> >>
> >> diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
> >> index f3a126e..41cef2e 100644
> >> --- a/block/cfq-iosched.c
> >> +++ b/block/cfq-iosched.c
> >> @@ -39,6 +39,13 @@ static const int cfq_hist_divisor = 4;
> >> */
> >> #define CFQ_IDLE_DELAY (HZ / 5)
> >>
> >> +/*
> >> + * The base boosting value.
> >> + */
> >> +#define CFQ_BOOST_SYNC_BASE (HZ / 10)
> >> +#define CFQ_BOOST_ASYNC_BASE (HZ / 25)
> >> +
> >
> > These are same as cfq_slice_sync and cfq_slice_async. Looking at
> > boost logic, this is equivalent of starting a new queue/group as
> > if it is being requeued after conuming a full slice. So may be we can divide
> > it by some const number say 4 or something like that. This is a minor
> > point though as this algorimthm will kind of evolve and we will learn
> > what works best.
> >
> > Secondly, I think you wanted to SYNC vs ASYNC logic seem to be reversed.
> > We would like to give ASYNC queues higher boost (Put these farther in
> > tree) and lesser boost to SYNC queues. Looks like above constants will
> > do the reverse?
>
> Hi Vivek,
>
> Currently, SYNC and ASYNC queues are in different service tree, they don't
> impact each other. Here, I Really want use this logic.

Ok, SYNC and ASYNC are on separate service trees so their vtimes are not
comparable (as of today; down the line one might want to look at those for
better workload selection logic).

Anyway, because the two are on separate trees, why should we have separate
boosting constants for them? How does it help?

Thanks
Vivek

2011-02-14 18:13:31

by Vivek Goyal

[permalink] [raw]
Subject: Re: [PATCH 3/6 v4] cfq-iosched: Introduce vdisktime and io weight for CFQ queue

On Thu, Feb 10, 2011 at 03:47:16PM +0800, Gui Jianfeng wrote:

[..]
> +/*
> + * The time when a CFQ queue is put onto a service tree is recoreded in
> + * cfqq->reposition_time. Currently, we check the first priority CFQ queues
> + * on each service tree, and select the workload type that contains the lowest
> + * reposition_time CFQ queue among them.
> + */
> static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd,
> struct cfq_group *cfqg, enum wl_prio_t prio)
> {
> struct cfq_entity *cfqe;
> + struct cfq_queue *cfqq;
> + unsigned long lowest_start_time;
> int i;
> - bool key_valid = false;
> - unsigned long lowest_key = 0;
> + bool time_valid = false;
> enum wl_type_t cur_best = SYNC_NOIDLE_WORKLOAD;
>
> + /*
> + * TODO: We may take io priority and io class into account when
> + * choosing a workload type. But for the time being just make use of
> + * reposition_time only.
> + */
> for (i = 0; i <= SYNC_WORKLOAD; ++i) {
> - /* select the one with lowest rb_key */
> cfqe = cfq_rb_first(service_tree_for(cfqg, prio, i));
> - if (cfqe &&
> - (!key_valid || time_before(cfqe->rb_key, lowest_key))) {
> - lowest_key = cfqe->rb_key;
> + cfqq = cfqq_of_entity(cfqe);
> + if (cfqe && (!time_valid ||
> + time_before(cfqq->reposition_time,
> + lowest_start_time))) {
> + lowest_start_time = cfqq->reposition_time;

Gui,

Have you had a chance to run some mixed workloads in a group (some sync,
some async and some sync-idle queues), and see how latency and throughput
of the sync-idle workload changes due to this "reposition_time" logic. I
just want to make sure that latency of sync-noidle workload does not
go up as that's the workload that people care and gets noticed first.

Thanks
Vivek

2011-02-14 23:33:18

by Justin TerAvest

[permalink] [raw]
Subject: Re: [PATCH 3/6 v4] cfq-iosched: Introduce vdisktime and io weight for CFQ queue

On Wed, Feb 9, 2011 at 11:47 PM, Gui Jianfeng
<[email protected]> wrote:
> Introduce vdisktime and io weight for CFQ queue scheduling. Currently, io priority
> maps to a range [100,1000]. It also gets rid of cfq_slice_offset() logic and makes
> use the same scheduling algorithm as CFQ group does. This helps for CFQ queue and
> group scheduling on the same service tree.

Hi Gui,
I have a couple of questions inline.

Thanks,
Justin

>
> Signed-off-by: Gui Jianfeng <[email protected]>
> ---
> ?block/cfq-iosched.c | ?219 +++++++++++++++++++++++++++++++++++++++------------
> ?1 files changed, 167 insertions(+), 52 deletions(-)
>
> diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
> index f3a126e..41cef2e 100644
> --- a/block/cfq-iosched.c
> +++ b/block/cfq-iosched.c
> @@ -39,6 +39,13 @@ static const int cfq_hist_divisor = 4;
> ?*/
> ?#define CFQ_IDLE_DELAY ? ? ? ? (HZ / 5)
>
> +/*
> + * The base boosting value.
> + */
> +#define CFQ_BOOST_SYNC_BASE ? ? ? ? ?(HZ / 10)
> +#define CFQ_BOOST_ASYNC_BASE ? ? ? ? ?(HZ / 25)
> +
> +
> ?/*
> ?* below this threshold, we consider thinktime immediate
> ?*/
> @@ -99,10 +106,7 @@ struct cfq_entity {
> ? ? ? ?struct cfq_rb_root *service_tree;
> ? ? ? ?/* service_tree member */
> ? ? ? ?struct rb_node rb_node;
> - ? ? ? /* service_tree key, represent the position on the tree */
> - ? ? ? unsigned long rb_key;
> -
> - ? ? ? /* group service_tree key */
> + ? ? ? /* service_tree key */
> ? ? ? ?u64 vdisktime;
> ? ? ? ?bool is_group_entity;
> ? ? ? ?unsigned int weight;
> @@ -114,6 +118,8 @@ struct cfq_entity {
> ?struct cfq_queue {
> ? ? ? ?/* The schedule entity */
> ? ? ? ?struct cfq_entity cfqe;
> + ? ? ? /* Reposition time */
> + ? ? ? unsigned long reposition_time;

Can this be addition time or something else instead? This is set, even
when we are not repositioning among service trees.

> ? ? ? ?/* reference count */
> ? ? ? ?int ref;
> ? ? ? ?/* various state flags, see below */
> @@ -312,6 +318,24 @@ struct cfq_data {
> ? ? ? ?struct rcu_head rcu;
> ?};
>
> +/*
> + * Map io priority(7 ~ 0) to io weight(100 ~ 1000) as follows
> + * ? ? prio ? ? ? 0 ? ?1 ? ? 2 ? ?3 ? ?4 ? ?5 ? ?6 ? ? 7
> + * ? ? weight ?1000 ?868 ? 740 ?612 ?484 ?356 ?228 ? 100
> + */
> +static inline unsigned int cfq_prio_to_weight(unsigned short ioprio)
> +{
> + ? ? ? unsigned int step;
> +
> + ? ? ? BUG_ON(ioprio >= IOPRIO_BE_NR);
> +
> + ? ? ? step = (BLKIO_WEIGHT_MAX - BLKIO_WEIGHT_MIN) / (IOPRIO_BE_NR - 1);
> + ? ? ? if (ioprio == 0)
> + ? ? ? ? ? ? ? return BLKIO_WEIGHT_MAX;
> +
> + ? ? ? return BLKIO_WEIGHT_MIN + (IOPRIO_BE_NR - ioprio - 1) * step;
> +}
> +
> ?static inline struct cfq_queue *
> ?cfqq_of_entity(struct cfq_entity *cfqe)
> ?{
> @@ -840,16 +864,6 @@ cfq_find_next_rq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
> ? ? ? ?return cfq_choose_req(cfqd, next, prev, blk_rq_pos(last));
> ?}
>
> -static unsigned long cfq_slice_offset(struct cfq_data *cfqd,
> - ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? struct cfq_queue *cfqq)
> -{
> - ? ? ? /*
> - ? ? ? ?* just an approximation, should be ok.
> - ? ? ? ?*/
> - ? ? ? return (cfqq->cfqg->nr_cfqq - 1) * (cfq_prio_slice(cfqd, 1, 0) -
> - ? ? ? ? ? ? ? ? ? ? ?cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio));
> -}
> -
> ?static inline s64
> ?entity_key(struct cfq_rb_root *st, struct cfq_entity *entity)
> ?{
> @@ -1199,6 +1213,21 @@ static inline void cfq_put_cfqg(struct cfq_group *cfqg) {}
>
> ?#endif /* GROUP_IOSCHED */
>
> +static inline u64 cfq_get_boost(struct cfq_data *cfqd,
> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?struct cfq_queue *cfqq)
> +{
> + ? ? ? u64 d;
> +
> + ? ? ? if (cfq_cfqq_sync(cfqq))
> + ? ? ? ? ? ? ? d = CFQ_BOOST_SYNC_BASE << CFQ_SERVICE_SHIFT;
> + ? ? ? else
> + ? ? ? ? ? ? ? d = CFQ_BOOST_ASYNC_BASE << CFQ_SERVICE_SHIFT;
> +
> + ? ? ? d = d * BLKIO_WEIGHT_DEFAULT;
> + ? ? ? do_div(d, cfqq->cfqe.weight);
> + ? ? ? return d;
> +}

The logic for cfq_get_boost() looks a lot like cfq_scale_slice().
Instead of duplicating code, can't it just be
u64 d;
if (cfq_cfqq_sync(cfqq))
return cfq_scale_slice(CFQ_BOOST_SYNC_BASE, cfqq->cfqe);
else
return cfq_scale_slice(CFQ_BOOST_ASYNC_BASE, cfqq->cfqe);


> +
> ?/*
> ?* The cfqd->service_trees holds all pending cfq_queue's that have
> ?* requests waiting to be processed. It is sorted in the order that
> @@ -1210,13 +1239,14 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
> ? ? ? ?struct cfq_entity *cfqe;
> ? ? ? ?struct rb_node **p, *parent;
> ? ? ? ?struct cfq_entity *__cfqe;
> - ? ? ? unsigned long rb_key;
> - ? ? ? struct cfq_rb_root *service_tree;
> + ? ? ? struct cfq_rb_root *service_tree, *orig_st;
> ? ? ? ?int left;
> ? ? ? ?int new_cfqq = 1;
> ? ? ? ?int group_changed = 0;
> + ? ? ? s64 key;
>
> ? ? ? ?cfqe = &cfqq->cfqe;
> + ? ? ? orig_st = cfqe->service_tree;
>
> ?#ifdef CONFIG_CFQ_GROUP_IOSCHED
> ? ? ? ?if (!cfqd->cfq_group_isolation
> @@ -1224,8 +1254,15 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
> ? ? ? ? ? ?&& cfqq->cfqg && cfqq->cfqg != &cfqd->root_group) {
> ? ? ? ? ? ? ? ?/* Move this cfq to root group */
> ? ? ? ? ? ? ? ?cfq_log_cfqq(cfqd, cfqq, "moving to root group");
> - ? ? ? ? ? ? ? if (!RB_EMPTY_NODE(&cfqe->rb_node))
> + ? ? ? ? ? ? ? if (!RB_EMPTY_NODE(&cfqe->rb_node)) {
> ? ? ? ? ? ? ? ? ? ? ? ?cfq_group_service_tree_del(cfqd, cfqq->cfqg);
> + ? ? ? ? ? ? ? ? ? ? ? /*
> + ? ? ? ? ? ? ? ? ? ? ? ?* Group changed, dequeue this CFQ queue from the
> + ? ? ? ? ? ? ? ? ? ? ? ?* original service tree.
> + ? ? ? ? ? ? ? ? ? ? ? ?*/
> + ? ? ? ? ? ? ? ? ? ? ? cfq_rb_erase(&cfqe->rb_node, cfqe->service_tree);
> + ? ? ? ? ? ? ? ? ? ? ? orig_st->total_weight -= cfqe->weight;
> + ? ? ? ? ? ? ? }
> ? ? ? ? ? ? ? ?cfqq->orig_cfqg = cfqq->cfqg;
> ? ? ? ? ? ? ? ?cfqq->cfqg = &cfqd->root_group;
> ? ? ? ? ? ? ? ?cfqd->root_group.ref++;
> @@ -1234,8 +1271,15 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
> ? ? ? ? ? ? ? ? ? && cfqq_type(cfqq) == SYNC_WORKLOAD && cfqq->orig_cfqg) {
> ? ? ? ? ? ? ? ?/* cfqq is sequential now needs to go to its original group */
> ? ? ? ? ? ? ? ?BUG_ON(cfqq->cfqg != &cfqd->root_group);
> - ? ? ? ? ? ? ? if (!RB_EMPTY_NODE(&cfqe->rb_node))
> + ? ? ? ? ? ? ? if (!RB_EMPTY_NODE(&cfqe->rb_node)) {
> ? ? ? ? ? ? ? ? ? ? ? ?cfq_group_service_tree_del(cfqd, cfqq->cfqg);
> + ? ? ? ? ? ? ? ? ? ? ? /*
> + ? ? ? ? ? ? ? ? ? ? ? ?* Group changed, dequeue this CFQ queue from the
> + ? ? ? ? ? ? ? ? ? ? ? ?* original service tree.
> + ? ? ? ? ? ? ? ? ? ? ? ?*/
> + ? ? ? ? ? ? ? ? ? ? ? cfq_rb_erase(&cfqe->rb_node, cfqe->service_tree);
> + ? ? ? ? ? ? ? ? ? ? ? orig_st->total_weight -= cfqe->weight;
> + ? ? ? ? ? ? ? }
> ? ? ? ? ? ? ? ?cfq_put_cfqg(cfqq->cfqg);
> ? ? ? ? ? ? ? ?cfqq->cfqg = cfqq->orig_cfqg;
> ? ? ? ? ? ? ? ?cfqq->orig_cfqg = NULL;
> @@ -1246,47 +1290,68 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
>
> ? ? ? ?service_tree = service_tree_for(cfqq->cfqg, cfqq_prio(cfqq),
> ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?cfqq_type(cfqq));
> + ? ? ? if (RB_EMPTY_NODE(&cfqe->rb_node)) {
> + ? ? ? ? ? ? ? /*
> + ? ? ? ? ? ? ? ?* If this CFQ queue moves to another group, the original
> + ? ? ? ? ? ? ? ?* vdisktime makes no sense any more, reset the vdisktime
> + ? ? ? ? ? ? ? ?* here.
> + ? ? ? ? ? ? ? ?*/
> + ? ? ? ? ? ? ? parent = rb_last(&service_tree->rb);
> + ? ? ? ? ? ? ? if (parent) {
> + ? ? ? ? ? ? ? ? ? ? ? u64 pos_offset;
> +
> + ? ? ? ? ? ? ? ? ? ? ? /*
> + ? ? ? ? ? ? ? ? ? ? ? ?* Estimate the position according to its weight and
> + ? ? ? ? ? ? ? ? ? ? ? ?* ioprio.
> + ? ? ? ? ? ? ? ? ? ? ? ?*/
> + ? ? ? ? ? ? ? ? ? ? ? pos_offset = cfq_get_boost(cfqd, cfqq);
> + ? ? ? ? ? ? ? ? ? ? ? /* Debug purpose, should remove. */
> + ? ? ? ? ? ? ? ? ? ? ? cfq_log_cfqq(cfqd, cfqq, "pos_offset: %llu",
> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?pos_offset);
> + ? ? ? ? ? ? ? ? ? ? ? cfqe->vdisktime = service_tree->min_vdisktime +
> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? pos_offset;
> + ? ? ? ? ? ? ? } else
> + ? ? ? ? ? ? ? ? ? ? ? cfqe->vdisktime = service_tree->min_vdisktime;
> +
> + ? ? ? ? ? ? ? goto insert;
> + ? ? ? }
> +
> + ? ? ? /*
> + ? ? ? ?* Ok, we get here, this CFQ queue is on the service tree, dequeue it
> + ? ? ? ?* firstly.
> + ? ? ? ?*/
> + ? ? ? cfq_rb_erase(&cfqe->rb_node, cfqe->service_tree);
> + ? ? ? orig_st->total_weight -= cfqe->weight;
> +
> + ? ? ? new_cfqq = 0;
> +
> ? ? ? ?if (cfq_class_idle(cfqq)) {
> - ? ? ? ? ? ? ? rb_key = CFQ_IDLE_DELAY;
> ? ? ? ? ? ? ? ?parent = rb_last(&service_tree->rb);
> ? ? ? ? ? ? ? ?if (parent && parent != &cfqe->rb_node) {
> ? ? ? ? ? ? ? ? ? ? ? ?__cfqe = rb_entry(parent, struct cfq_entity, rb_node);
> - ? ? ? ? ? ? ? ? ? ? ? rb_key += __cfqe->rb_key;
> + ? ? ? ? ? ? ? ? ? ? ? cfqe->vdisktime = __cfqe->vdisktime + CFQ_IDLE_DELAY;
> ? ? ? ? ? ? ? ?} else
> - ? ? ? ? ? ? ? ? ? ? ? rb_key += jiffies;
> + ? ? ? ? ? ? ? ? ? ? ? cfqe->vdisktime = service_tree->min_vdisktime;
> ? ? ? ?} else if (!add_front) {
> ? ? ? ? ? ? ? ?/*
> - ? ? ? ? ? ? ? ?* Get our rb key offset. Subtract any residual slice
> - ? ? ? ? ? ? ? ?* value carried from last service. A negative resid
> - ? ? ? ? ? ? ? ?* count indicates slice overrun, and this should position
> - ? ? ? ? ? ? ? ?* the next service time further away in the tree.
> + ? ? ? ? ? ? ? ?* We charge the CFQ queue by the time this queue runs, and
> + ? ? ? ? ? ? ? ?* repsition it on the service tree.
> ? ? ? ? ? ? ? ? */
> - ? ? ? ? ? ? ? rb_key = cfq_slice_offset(cfqd, cfqq) + jiffies;
> - ? ? ? ? ? ? ? rb_key -= cfqq->slice_resid;
> + ? ? ? ? ? ? ? unsigned int used_sl;
> +
> + ? ? ? ? ? ? ? used_sl = cfq_cfqq_slice_usage(cfqq);
> + ? ? ? ? ? ? ? cfqe->vdisktime += cfq_scale_slice(used_sl, cfqe);
> ? ? ? ? ? ? ? ?cfqq->slice_resid = 0;
> ? ? ? ?} else {
> - ? ? ? ? ? ? ? rb_key = -HZ;
> - ? ? ? ? ? ? ? __cfqe = cfq_rb_first(service_tree);
> - ? ? ? ? ? ? ? rb_key += __cfqe ? __cfqe->rb_key : jiffies;
> - ? ? ? }
> -
> - ? ? ? if (!RB_EMPTY_NODE(&cfqe->rb_node)) {
> - ? ? ? ? ? ? ? new_cfqq = 0;
> - ? ? ? ? ? ? ? /*
> - ? ? ? ? ? ? ? ?* same position, nothing more to do
> - ? ? ? ? ? ? ? ?*/
> - ? ? ? ? ? ? ? if (rb_key == cfqe->rb_key &&
> - ? ? ? ? ? ? ? ? ? cfqe->service_tree == service_tree)
> - ? ? ? ? ? ? ? ? ? ? ? return;
> -
> - ? ? ? ? ? ? ? cfq_rb_erase(&cfqe->rb_node, cfqe->service_tree);
> - ? ? ? ? ? ? ? cfqe->service_tree = NULL;
> + ? ? ? ? ? ? ? cfqe->vdisktime = service_tree->min_vdisktime;
> ? ? ? ?}
>
> +insert:
> ? ? ? ?left = 1;
> ? ? ? ?parent = NULL;
> ? ? ? ?cfqe->service_tree = service_tree;
> ? ? ? ?p = &service_tree->rb.rb_node;
> + ? ? ? key = entity_key(service_tree, cfqe);
> ? ? ? ?while (*p) {
> ? ? ? ? ? ? ? ?struct rb_node **n;
>
> @@ -1296,7 +1361,7 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
> ? ? ? ? ? ? ? ?/*
> ? ? ? ? ? ? ? ? * sort by key, that represents service time.
> ? ? ? ? ? ? ? ? */
> - ? ? ? ? ? ? ? if (time_before(rb_key, __cfqe->rb_key))
> + ? ? ? ? ? ? ? if (key < entity_key(service_tree, __cfqe))
> ? ? ? ? ? ? ? ? ? ? ? ?n = &(*p)->rb_left;
> ? ? ? ? ? ? ? ?else {
> ? ? ? ? ? ? ? ? ? ? ? ?n = &(*p)->rb_right;
> @@ -1309,10 +1374,12 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
> ? ? ? ?if (left)
> ? ? ? ? ? ? ? ?service_tree->left = &cfqe->rb_node;
>
> - ? ? ? cfqe->rb_key = rb_key;
> ? ? ? ?rb_link_node(&cfqe->rb_node, parent, p);
> ? ? ? ?rb_insert_color(&cfqe->rb_node, &service_tree->rb);
> + ? ? ? update_min_vdisktime(service_tree);
> ? ? ? ?service_tree->count++;
> + ? ? ? service_tree->total_weight += cfqe->weight;

I'm confused by this line. Why are we doing some adjustment for cfqe
weight that we weren't doing previously? I think
cfq_group_service_tree_add below will still do the total_weight
adjustment.

> + ? ? ? cfqq->reposition_time = jiffies;
> ? ? ? ?if ((add_front || !new_cfqq) && !group_changed)
> ? ? ? ? ? ? ? ?return;
> ? ? ? ?cfq_group_service_tree_add(cfqd, cfqq->cfqg);
> @@ -1414,14 +1481,18 @@ static void cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
> ?static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
> ?{
> ? ? ? ?struct cfq_entity *cfqe;
> + ? ? ? struct cfq_rb_root *service_tree;
> +
> ? ? ? ?cfq_log_cfqq(cfqd, cfqq, "del_from_rr");
> ? ? ? ?BUG_ON(!cfq_cfqq_on_rr(cfqq));
> ? ? ? ?cfq_clear_cfqq_on_rr(cfqq);
>
> ? ? ? ?cfqe = &cfqq->cfqe;
> + ? ? ? service_tree = cfqe->service_tree;
>
> ? ? ? ?if (!RB_EMPTY_NODE(&cfqe->rb_node)) {
> ? ? ? ? ? ? ? ?cfq_rb_erase(&cfqe->rb_node, cfqe->service_tree);
> + ? ? ? ? ? ? ? service_tree->total_weight -= cfqe->weight;
> ? ? ? ? ? ? ? ?cfqe->service_tree = NULL;
> ? ? ? ?}
> ? ? ? ?if (cfqq->p_root) {
> @@ -2120,23 +2191,36 @@ static void cfq_setup_merge(struct cfq_queue *cfqq, struct cfq_queue *new_cfqq)
> ? ? ? ?}
> ?}
>
> +/*
> + * The time when a CFQ queue is put onto a service tree is recorded in
> + * cfqq->reposition_time. Currently, we check the first priority CFQ queues
> + * on each service tree, and select the workload type that contains the lowest
> + * reposition_time CFQ queue among them.
> + */
> ?static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd,
> ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?struct cfq_group *cfqg, enum wl_prio_t prio)
> ?{
> ? ? ? ?struct cfq_entity *cfqe;
> + ? ? ? struct cfq_queue *cfqq;
> + ? ? ? unsigned long lowest_start_time;
> ? ? ? ?int i;
> - ? ? ? bool key_valid = false;
> - ? ? ? unsigned long lowest_key = 0;
> + ? ? ? bool time_valid = false;
> ? ? ? ?enum wl_type_t cur_best = SYNC_NOIDLE_WORKLOAD;
>
> + ? ? ? /*
> + ? ? ? ?* TODO: We may take io priority and io class into account when
> + ? ? ? ?* choosing a workload type. But for the time being just make use of
> + ? ? ? ?* reposition_time only.
> + ? ? ? ?*/
> ? ? ? ?for (i = 0; i <= SYNC_WORKLOAD; ++i) {
> - ? ? ? ? ? ? ? /* select the one with lowest rb_key */
> ? ? ? ? ? ? ? ?cfqe = cfq_rb_first(service_tree_for(cfqg, prio, i));
> - ? ? ? ? ? ? ? if (cfqe &&
> - ? ? ? ? ? ? ? ? ? (!key_valid || time_before(cfqe->rb_key, lowest_key))) {
> - ? ? ? ? ? ? ? ? ? ? ? lowest_key = cfqe->rb_key;
> + ? ? ? ? ? ? ? cfqq = cfqq_of_entity(cfqe);
> + ? ? ? ? ? ? ? if (cfqe && (!time_valid ||
> + ? ? ? ? ? ? ? ? ? ? ? ? ? ?time_before(cfqq->reposition_time,
> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?lowest_start_time))) {
> + ? ? ? ? ? ? ? ? ? ? ? lowest_start_time = cfqq->reposition_time;
> ? ? ? ? ? ? ? ? ? ? ? ?cur_best = i;
> - ? ? ? ? ? ? ? ? ? ? ? key_valid = true;
> + ? ? ? ? ? ? ? ? ? ? ? time_valid = true;
> ? ? ? ? ? ? ? ?}
> ? ? ? ?}
>
> @@ -2808,10 +2892,13 @@ static void cfq_init_prio_data(struct cfq_queue *cfqq, struct io_context *ioc)
> ?{
> ? ? ? ?struct task_struct *tsk = current;
> ? ? ? ?int ioprio_class;
> + ? ? ? struct cfq_entity *cfqe;
>
> ? ? ? ?if (!cfq_cfqq_prio_changed(cfqq))
> ? ? ? ? ? ? ? ?return;
>
> + ? ? ? cfqe = &cfqq->cfqe;
> +
> ? ? ? ?ioprio_class = IOPRIO_PRIO_CLASS(ioc->ioprio);
> ? ? ? ?switch (ioprio_class) {
> ? ? ? ?default:
> @@ -2838,6 +2925,17 @@ static void cfq_init_prio_data(struct cfq_queue *cfqq, struct io_context *ioc)
> ? ? ? ? ? ? ? ?break;
> ? ? ? ?}
>
> + ? ? ? if (!RB_EMPTY_NODE(&cfqe->rb_node)) {
> + ? ? ? ? ? ? ? /*
> + ? ? ? ? ? ? ? ?* If this CFQ entity is already on service tree, we need to
> + ? ? ? ? ? ? ? ?* adjust service tree's total weight accordingly.
> + ? ? ? ? ? ? ? ?*/
> + ? ? ? ? ? ? ? cfqe->service_tree->total_weight -= cfqe->weight;
> + ? ? ? ? ? ? ? cfqe->weight = cfq_prio_to_weight(cfqq->ioprio);
> + ? ? ? ? ? ? ? cfqe->service_tree->total_weight += cfqe->weight;
> + ? ? ? } else
> + ? ? ? ? ? ? ? cfqe->weight = cfq_prio_to_weight(cfqq->ioprio);
> +
> ? ? ? ?/*
> ? ? ? ? * keep track of original prio settings in case we have to temporarily
> ? ? ? ? * elevate the priority of this queue
> @@ -3572,6 +3670,9 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq)
> ?*/
> ?static void cfq_prio_boost(struct cfq_queue *cfqq)
> ?{
> + ? ? ? struct cfq_entity *cfqe;
> +
> + ? ? ? cfqe = &cfqq->cfqe;
> ? ? ? ?if (has_fs_excl()) {
> ? ? ? ? ? ? ? ?/*
> ? ? ? ? ? ? ? ? * boost idle prio on transactions that would lock out other
> @@ -3588,6 +3689,20 @@ static void cfq_prio_boost(struct cfq_queue *cfqq)
> ? ? ? ? ? ? ? ?cfqq->ioprio_class = cfqq->org_ioprio_class;
> ? ? ? ? ? ? ? ?cfqq->ioprio = cfqq->org_ioprio;
> ? ? ? ?}
> +
> + ? ? ? /*
> + ? ? ? ?* update the io weight if io priority gets changed.
> + ? ? ? ?*/
> + ? ? ? if (!RB_EMPTY_NODE(&cfqe->rb_node)) {
> + ? ? ? ? ? ? ? /*
> + ? ? ? ? ? ? ? ?* If this CFQ entity is already on service tree, we need to
> + ? ? ? ? ? ? ? ?* adjust service tree's total weight accordingly.
> + ? ? ? ? ? ? ? ?*/
> + ? ? ? ? ? ? ? cfqe->service_tree->total_weight -= cfqe->weight;
> + ? ? ? ? ? ? ? cfqe->weight = cfq_prio_to_weight(cfqq->ioprio);
> + ? ? ? ? ? ? ? cfqe->service_tree->total_weight += cfqe->weight;
> + ? ? ? } else
> + ? ? ? ? ? ? ? cfqe->weight = cfq_prio_to_weight(cfqq->ioprio);
> ?}
>
> ?static inline int __cfq_may_queue(struct cfq_queue *cfqq)
> --
> 1.7.1
>
>
>
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/
>

2011-02-15 01:44:56

by Gui, Jianfeng/归 剑峰

[permalink] [raw]
Subject: Re: [PATCH 3/6 v4] cfq-iosched: Introduce vdisktime and io weight for CFQ queue

Justin TerAvest wrote:
> On Wed, Feb 9, 2011 at 11:47 PM, Gui Jianfeng
> <[email protected]> wrote:
>> Introduce vdisktime and io weight for CFQ queue scheduling. Currently, io priority
>> maps to a range [100,1000]. It also gets rid of cfq_slice_offset() logic and makes
>> use the same scheduling algorithm as CFQ group does. This helps for CFQ queue and
>> group scheduling on the same service tree.
>
> Hi Gui,
> I have a couple of questions inline.
>
> Thanks,
> Justin
>
>> Signed-off-by: Gui Jianfeng <[email protected]>
>> ---
>> block/cfq-iosched.c | 219 +++++++++++++++++++++++++++++++++++++++------------
>> 1 files changed, 167 insertions(+), 52 deletions(-)
>>
>> diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
>> index f3a126e..41cef2e 100644
>> --- a/block/cfq-iosched.c
>> +++ b/block/cfq-iosched.c
>> @@ -39,6 +39,13 @@ static const int cfq_hist_divisor = 4;
>> */
>> #define CFQ_IDLE_DELAY (HZ / 5)
>>
>> +/*
>> + * The base boosting value.
>> + */
>> +#define CFQ_BOOST_SYNC_BASE (HZ / 10)
>> +#define CFQ_BOOST_ASYNC_BASE (HZ / 25)
>> +
>> +
>> /*
>> * below this threshold, we consider thinktime immediate
>> */
>> @@ -99,10 +106,7 @@ struct cfq_entity {
>> struct cfq_rb_root *service_tree;
>> /* service_tree member */
>> struct rb_node rb_node;
>> - /* service_tree key, represent the position on the tree */
>> - unsigned long rb_key;
>> -
>> - /* group service_tree key */
>> + /* service_tree key */
>> u64 vdisktime;
>> bool is_group_entity;
>> unsigned int weight;
>> @@ -114,6 +118,8 @@ struct cfq_entity {
>> struct cfq_queue {
>> /* The schedule entity */
>> struct cfq_entity cfqe;
>> + /* Reposition time */
>> + unsigned long reposition_time;
>
> Can this be addition time or something else instead? This is set, even
> when we are not repositioning among service trees.

Hi Justin,

how about position_time :)

>
>> /* reference count */
>> int ref;
>> /* various state flags, see below */
>> @@ -312,6 +318,24 @@ struct cfq_data {
>> struct rcu_head rcu;
>> };
>>
>> +/*
>> + * Map io priority(7 ~ 0) to io weight(100 ~ 1000) as follows
>> + * prio 0 1 2 3 4 5 6 7
>> + * weight 1000 868 740 612 484 356 228 100
>> + */
>> +static inline unsigned int cfq_prio_to_weight(unsigned short ioprio)
>> +{
>> + unsigned int step;
>> +
>> + BUG_ON(ioprio >= IOPRIO_BE_NR);
>> +
>> + step = (BLKIO_WEIGHT_MAX - BLKIO_WEIGHT_MIN) / (IOPRIO_BE_NR - 1);
>> + if (ioprio == 0)
>> + return BLKIO_WEIGHT_MAX;
>> +
>> + return BLKIO_WEIGHT_MIN + (IOPRIO_BE_NR - ioprio - 1) * step;
>> +}
>> +
>> static inline struct cfq_queue *
>> cfqq_of_entity(struct cfq_entity *cfqe)
>> {
>> @@ -840,16 +864,6 @@ cfq_find_next_rq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
>> return cfq_choose_req(cfqd, next, prev, blk_rq_pos(last));
>> }
>>
>> -static unsigned long cfq_slice_offset(struct cfq_data *cfqd,
>> - struct cfq_queue *cfqq)
>> -{
>> - /*
>> - * just an approximation, should be ok.
>> - */
>> - return (cfqq->cfqg->nr_cfqq - 1) * (cfq_prio_slice(cfqd, 1, 0) -
>> - cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio));
>> -}
>> -
>> static inline s64
>> entity_key(struct cfq_rb_root *st, struct cfq_entity *entity)
>> {
>> @@ -1199,6 +1213,21 @@ static inline void cfq_put_cfqg(struct cfq_group *cfqg) {}
>>
>> #endif /* GROUP_IOSCHED */
>>
>> +static inline u64 cfq_get_boost(struct cfq_data *cfqd,
>> + struct cfq_queue *cfqq)
>> +{
>> + u64 d;
>> +
>> + if (cfq_cfqq_sync(cfqq))
>> + d = CFQ_BOOST_SYNC_BASE << CFQ_SERVICE_SHIFT;
>> + else
>> + d = CFQ_BOOST_ASYNC_BASE << CFQ_SERVICE_SHIFT;
>> +
>> + d = d * BLKIO_WEIGHT_DEFAULT;
>> + do_div(d, cfqq->cfqe.weight);
>> + return d;
>> +}
>
> The logic for cfq_get_boost() looks a lot like cfq_scale_slice().
> Instead of duplicating code, can't it just be
> u64 d;
> if (cfq_cfqq_sync(cfqq))
> return cfq_scale_slice(CFQ_BOOST_SYNC_BASE, cfqq->cfqe);
> else
> return cfq_scale_slice(CFQ_BOOST_ASYNC_BASE, cfqq->cfqe);
>

Ok, I think this should work.

>
>> +
>> /*
>> * The cfqd->service_trees holds all pending cfq_queue's that have
...

> I'm confused by this line. Why are we doing some adjustment for cfqe
> weight that we weren't doing previously? I think
> cfq_group_service_tree_add below will still do the total_weight
> adjustment.

later patch does the integration for cfqq and cfqg.

Thanks,
Gui

>
>> + cfqq->reposition_time = jiffies;
>> if ((add_front || !new_cfqq) && !group_changed)
>> return;
>> cfq_group_service_tree_add(cfqd, cfqq->cfqg);
>> @@ -1414,14 +1481,18 @@ static void cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
>> static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
>> {

2011-02-15 01:46:35

by Gui, Jianfeng/归 剑峰

[permalink] [raw]
Subject: Re: [PATCH 3/6 v4] cfq-iosched: Introduce vdisktime and io weight for CFQ queue

Vivek Goyal wrote:
> On Thu, Feb 10, 2011 at 03:47:16PM +0800, Gui Jianfeng wrote:
>
> [..]
>> +/*
>> + * The time when a CFQ queue is put onto a service tree is recorded in
>> + * cfqq->reposition_time. Currently, we check the first priority CFQ queues
>> + * on each service tree, and select the workload type that contains the lowest
>> + * reposition_time CFQ queue among them.
>> + */
>> static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd,
>> struct cfq_group *cfqg, enum wl_prio_t prio)
>> {
>> struct cfq_entity *cfqe;
>> + struct cfq_queue *cfqq;
>> + unsigned long lowest_start_time;
>> int i;
>> - bool key_valid = false;
>> - unsigned long lowest_key = 0;
>> + bool time_valid = false;
>> enum wl_type_t cur_best = SYNC_NOIDLE_WORKLOAD;
>>
>> + /*
>> + * TODO: We may take io priority and io class into account when
>> + * choosing a workload type. But for the time being just make use of
>> + * reposition_time only.
>> + */
>> for (i = 0; i <= SYNC_WORKLOAD; ++i) {
>> - /* select the one with lowest rb_key */
>> cfqe = cfq_rb_first(service_tree_for(cfqg, prio, i));
>> - if (cfqe &&
>> - (!key_valid || time_before(cfqe->rb_key, lowest_key))) {
>> - lowest_key = cfqe->rb_key;
>> + cfqq = cfqq_of_entity(cfqe);
>> + if (cfqe && (!time_valid ||
>> + time_before(cfqq->reposition_time,
>> + lowest_start_time))) {
>> + lowest_start_time = cfqq->reposition_time;
>
> Gui,
>
> Have you had a chance to run some mixed workloads in a group (some sync,
> some async and some sync-idle queues), and see how latency and throughput
> of sync-idle workload changes due to this "resposition_time" logic. I
> just want to make sure that latency of sync-noidle workload does not
> go up as that's the workload that people care and gets noticed first.

Hi Vivek

Will do some tests.

Thanks,
Gui

>
> Thanks
> Vivek
>

2011-02-15 01:54:08

by Gui, Jianfeng/归 剑峰

[permalink] [raw]
Subject: Re: [PATCH 3/6 v4] cfq-iosched: Introduce vdisktime and io weight for CFQ queue

Vivek Goyal wrote:
> On Sat, Feb 12, 2011 at 09:20:58AM +0800, Gui Jianfeng wrote:
>> Vivek Goyal wrote:
>>> On Thu, Feb 10, 2011 at 03:47:16PM +0800, Gui Jianfeng wrote:
>>>> Introduce vdisktime and io weight for CFQ queue scheduling. Currently, io priority
>>>> maps to a range [100,1000]. It also gets rid of cfq_slice_offset() logic and makes
>>>> use the same scheduling algorithm as CFQ group does. This helps for CFQ queue and
>>>> group scheduling on the same service tree.
>>>>
>>>> Signed-off-by: Gui Jianfeng <[email protected]>
>>>> ---
>>>> block/cfq-iosched.c | 219 +++++++++++++++++++++++++++++++++++++++------------
>>>> 1 files changed, 167 insertions(+), 52 deletions(-)
>>>>
>>>> diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
>>>> index f3a126e..41cef2e 100644
>>>> --- a/block/cfq-iosched.c
>>>> +++ b/block/cfq-iosched.c
>>>> @@ -39,6 +39,13 @@ static const int cfq_hist_divisor = 4;
>>>> */
>>>> #define CFQ_IDLE_DELAY (HZ / 5)
>>>>
>>>> +/*
>>>> + * The base boosting value.
>>>> + */
>>>> +#define CFQ_BOOST_SYNC_BASE (HZ / 10)
>>>> +#define CFQ_BOOST_ASYNC_BASE (HZ / 25)
>>>> +
>>> These are same as cfq_slice_sync and cfq_slice_async. Looking at
>>> boost logic, this is equivalent of starting a new queue/group as
>>> if it is being requeued after consuming a full slice. So maybe we can divide
>>> it by some const number say 4 or something like that. This is a minor
>>> point though as this algorithm will kind of evolve and we will learn
>>> what works best.
>>>
>>> Secondly, I think the SYNC vs ASYNC logic you wanted seems to be reversed.
>>> We would like to give ASYNC queues higher boost (Put these farther in
>>> tree) and lesser boost to SYNC queues. Looks like above constants will
>>> do the reverse?
>> Hi Vivek,
>>
>> Currently, SYNC and ASYNC queues are in different service tree, they don't
>> impact each other. Here, I really want to use this logic.
>
> Ok, SYNC and ASYNC are on separate service tree so their vtime are not
> comparable (as of today, down the line one might want to look at those for
> better workload selection logic).
>
> Anyway, because the two are on separate trees, why should we have separate
> boosting constants for them? How does it help?

Here if we are using CFQ_BOOST_SYNC_BASE for both, I think it might boost
too much for an ASYNC cfqe as compared to others on the same service tree (async).
So I make charging and boosting follow the same base.

Thanks,
Gui

>
> Thanks
> Vivek
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/
>

2011-02-15 14:30:40

by Vivek Goyal

[permalink] [raw]
Subject: Re: [PATCH 3/6 v4] cfq-iosched: Introduce vdisktime and io weight for CFQ queue

On Tue, Feb 15, 2011 at 09:53:58AM +0800, Gui Jianfeng wrote:
> Vivek Goyal wrote:
> > On Sat, Feb 12, 2011 at 09:20:58AM +0800, Gui Jianfeng wrote:
> >> Vivek Goyal wrote:
> >>> On Thu, Feb 10, 2011 at 03:47:16PM +0800, Gui Jianfeng wrote:
> >>>> Introduce vdisktime and io weight for CFQ queue scheduling. Currently, io priority
> >>>> maps to a range [100,1000]. It also gets rid of cfq_slice_offset() logic and makes
> >>>> use the same scheduling algorithm as CFQ group does. This helps for CFQ queue and
> >>>> group scheduling on the same service tree.
> >>>>
> >>>> Signed-off-by: Gui Jianfeng <[email protected]>
> >>>> ---
> >>>> block/cfq-iosched.c | 219 +++++++++++++++++++++++++++++++++++++++------------
> >>>> 1 files changed, 167 insertions(+), 52 deletions(-)
> >>>>
> >>>> diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
> >>>> index f3a126e..41cef2e 100644
> >>>> --- a/block/cfq-iosched.c
> >>>> +++ b/block/cfq-iosched.c
> >>>> @@ -39,6 +39,13 @@ static const int cfq_hist_divisor = 4;
> >>>> */
> >>>> #define CFQ_IDLE_DELAY (HZ / 5)
> >>>>
> >>>> +/*
> >>>> + * The base boosting value.
> >>>> + */
> >>>> +#define CFQ_BOOST_SYNC_BASE (HZ / 10)
> >>>> +#define CFQ_BOOST_ASYNC_BASE (HZ / 25)
> >>>> +
> >>> These are same as cfq_slice_sync and cfq_slice_async. Looking at
> >>> boost logic, this is equivalent of starting a new queue/group as
> >>> if it is being requeued after conuming a full slice. So may be we can divide
> >>> it by some const number say 4 or something like that. This is a minor
> >>> point though as this algorimthm will kind of evolve and we will learn
> >>> what works best.
> >>>
> >>> Secondly, I think you wanted to SYNC vs ASYNC logic seem to be reversed.
> >>> We would like to give ASYNC queues higher boost (Put these farther in
> >>> tree) and lesser boost to SYNC queues. Looks like above constants will
> >>> do the reverse?
> >> Hi Vivek,
> >>
> >> Currently, SYNC and ASYNC queues are in different service tree, they don't
> >> impact each other. Here, I Really want use this logic.
> >
> > Ok, SYNC and ASYNC are on separate service tree so their vtime are not
> > comparable (as of today, down the line one might want to look at those for
> > better workload selection logic).
> >
> > Anyway, because two are on seprate tree so why should we have separate
> > boosting constants for them? How does it help?
>
> Here if we are using CFQ_BOOST_SYNC_BASE for both, I think it might boost
> too much for an ASYNC cfqe as compare to others on the same service tree(async).
> So I make charging and boosting follow the same base.

Ok, that makes sense. So as suggested in other mails, lets use a even
smaller base so that freshly backlogged queues get smaller vdisktimes
as compared to existing queues which are using disks for longer time.

Thanks
Vivek

2011-02-15 15:03:51

by Vivek Goyal

[permalink] [raw]
Subject: Re: [PATCH 3/6 v4] cfq-iosched: Introduce vdisktime and io weight for CFQ queue

On Tue, Feb 15, 2011 at 09:44:44AM +0800, Gui Jianfeng wrote:

[..]
> >> +static inline u64 cfq_get_boost(struct cfq_data *cfqd,
> >> + struct cfq_queue *cfqq)
> >> +{
> >> + u64 d;
> >> +
> >> + if (cfq_cfqq_sync(cfqq))
> >> + d = CFQ_BOOST_SYNC_BASE << CFQ_SERVICE_SHIFT;
> >> + else
> >> + d = CFQ_BOOST_ASYNC_BASE << CFQ_SERVICE_SHIFT;
> >> +
> >> + d = d * BLKIO_WEIGHT_DEFAULT;
> >> + do_div(d, cfqq->cfqe.weight);
> >> + return d;
> >> +}
> >
> > The logic for cfq_get_boost() looks a lot like cfq_scale_slice().
> > Instead of duplicating code, can't it just be
> > u64 d;
> > if (cfq_cfqq_sync(cfqq))
> > return cfq_scale_slice(CFQ_BOOST_SYNC_BASE, cfqq->cfqe);
> > else
> > return cfq_scale_slice(CFQ_BOOST_ASYNC_BASE, cfqq->cfqe);
> >

I still think that we should use smaller values for CFQ_BOOST_SYNC_BASE
because otherwise what it means is that for freshly backlogged queues
we assume that these have already used one slice and then requeue these
accordingly. Instead it should be reverse where freshly backlogged queues
should get preference over already queues which are hogging the disk
for long time.

Thanks
Vivek

2011-02-16 01:06:35

by Gui, Jianfeng/归 剑峰

[permalink] [raw]
Subject: Re: [PATCH 3/6 v4] cfq-iosched: Introduce vdisktime and io weight for CFQ queue

Vivek Goyal wrote:
> On Tue, Feb 15, 2011 at 09:53:58AM +0800, Gui Jianfeng wrote:
>> Vivek Goyal wrote:
>>> On Sat, Feb 12, 2011 at 09:20:58AM +0800, Gui Jianfeng wrote:
>>>> Vivek Goyal wrote:
>>>>> On Thu, Feb 10, 2011 at 03:47:16PM +0800, Gui Jianfeng wrote:
>>>>>> Introduce vdisktime and io weight for CFQ queue scheduling. Currently, io priority
>>>>>> maps to a range [100,1000]. It also gets rid of cfq_slice_offset() logic and makes
>>>>>> use the same scheduling algorithm as CFQ group does. This helps for CFQ queue and
>>>>>> group scheduling on the same service tree.
>>>>>>
>>>>>> Signed-off-by: Gui Jianfeng <[email protected]>
>>>>>> ---
>>>>>> block/cfq-iosched.c | 219 +++++++++++++++++++++++++++++++++++++++------------
>>>>>> 1 files changed, 167 insertions(+), 52 deletions(-)
>>>>>>
>>>>>> diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
>>>>>> index f3a126e..41cef2e 100644
>>>>>> --- a/block/cfq-iosched.c
>>>>>> +++ b/block/cfq-iosched.c
>>>>>> @@ -39,6 +39,13 @@ static const int cfq_hist_divisor = 4;
>>>>>> */
>>>>>> #define CFQ_IDLE_DELAY (HZ / 5)
>>>>>>
>>>>>> +/*
>>>>>> + * The base boosting value.
>>>>>> + */
>>>>>> +#define CFQ_BOOST_SYNC_BASE (HZ / 10)
>>>>>> +#define CFQ_BOOST_ASYNC_BASE (HZ / 25)
>>>>>> +
>>>>> These are same as cfq_slice_sync and cfq_slice_async. Looking at
>>>>> boost logic, this is equivalent of starting a new queue/group as
>>>>> if it is being requeued after conuming a full slice. So may be we can divide
>>>>> it by some const number say 4 or something like that. This is a minor
>>>>> point though as this algorimthm will kind of evolve and we will learn
>>>>> what works best.
>>>>>
>>>>> Secondly, I think you wanted to SYNC vs ASYNC logic seem to be reversed.
>>>>> We would like to give ASYNC queues higher boost (Put these farther in
>>>>> tree) and lesser boost to SYNC queues. Looks like above constants will
>>>>> do the reverse?
>>>> Hi Vivek,
>>>>
>>>> Currently, SYNC and ASYNC queues are in different service tree, they don't
>>>> impact each other. Here, I Really want use this logic.
>>> Ok, SYNC and ASYNC are on separate service tree so their vtime are not
>>> comparable (as of today, down the line one might want to look at those for
>>> better workload selection logic).
>>>
>>> Anyway, because two are on seprate tree so why should we have separate
>>> boosting constants for them? How does it help?
>> Here if we are using CFQ_BOOST_SYNC_BASE for both, I think it might boost
>> too much for an ASYNC cfqe as compare to others on the same service tree(async).
>> So I make charging and boosting follow the same base.
>
> Ok, that makes sense. So as suggested in other mails, lets use a even
> smaller base so that freshly backlogged queues get smaller vdisktimes
> as compared to existing queues which are using disks for longer time.

OK, that makes sense.

Thanks
Gui

>
> Thanks
> Vivek
>

2011-02-18 06:04:38

by Gui, Jianfeng/归 剑峰

[permalink] [raw]
Subject: Re: [PATCH 3/6 v4] cfq-iosched: Introduce vdisktime and io weight for CFQ queue

Vivek Goyal wrote:
> On Thu, Feb 10, 2011 at 03:47:16PM +0800, Gui Jianfeng wrote:
>
> [..]
>> +/*
>> + * The time when a CFQ queue is put onto a service tree is recorded in
>> + * cfqq->reposition_time. Currently, we check the first priority CFQ queues
>> + * on each service tree, and select the workload type that contains the lowest
>> + * reposition_time CFQ queue among them.
>> + */
>> static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd,
>> struct cfq_group *cfqg, enum wl_prio_t prio)
>> {
>> struct cfq_entity *cfqe;
>> + struct cfq_queue *cfqq;
>> + unsigned long lowest_start_time;
>> int i;
>> - bool key_valid = false;
>> - unsigned long lowest_key = 0;
>> + bool time_valid = false;
>> enum wl_type_t cur_best = SYNC_NOIDLE_WORKLOAD;
>>
>> + /*
>> + * TODO: We may take io priority and io class into account when
>> + * choosing a workload type. But for the time being just make use of
>> + * reposition_time only.
>> + */
>> for (i = 0; i <= SYNC_WORKLOAD; ++i) {
>> - /* select the one with lowest rb_key */
>> cfqe = cfq_rb_first(service_tree_for(cfqg, prio, i));
>> - if (cfqe &&
>> - (!key_valid || time_before(cfqe->rb_key, lowest_key))) {
>> - lowest_key = cfqe->rb_key;
>> + cfqq = cfqq_of_entity(cfqe);
>> + if (cfqe && (!time_valid ||
>> + time_before(cfqq->reposition_time,
>> + lowest_start_time))) {
>> + lowest_start_time = cfqq->reposition_time;
>
> Gui,
>
> Have you had a chance to run some mixed workloads in a group (some sync,
> some async and some sync-idle queues), and see how latency and throughput
> of sync-idle workload changes due to this "resposition_time" logic. I
> just want to make sure that latency of sync-noidle workload does not
> go up as that's the workload that people care and gets noticed first.

Hi Vivek,

I made a quick test by using fio. It seems the number changes little
between vanilla kernel and patched kernel.


Vanilla: SYNC read SYNC-NOIDLE read ASYNC write
1. 23,640KB/s 5.40 ---- 6,696KB/s 19.07 ---- 50,142KB/s 128.00
2. 24,459KB/s 5.22 ---- 6,775KB/s 18.86 ---- 47,349KB/s 129.89
3. 25,929KB/s 4.93 ---- 7,378KB/s 17.32 ---- 32,350KB/s 131.88

Patched: SYNC read SYNC-NOIDLE read ASYNC write
1. 24,000KB/s 5.32 ---- 6,942KB/s 18.39 ---- 30,860KB/s 135.95
2. 23,678KB/s 5.40 ---- 7,274KB/s 17.58 ---- 67,432KB/s 120.44
3. 23,004KB/s 5.55 ---- 6,621KB/s 19.30 ---- 36,536KB/s 148.64

Thanks,
Gui

2011-02-18 14:54:43

by Vivek Goyal

[permalink] [raw]
Subject: Re: [PATCH 3/6 v4] cfq-iosched: Introduce vdisktime and io weight for CFQ queue

On Fri, Feb 18, 2011 at 02:04:18PM +0800, Gui Jianfeng wrote:
> Vivek Goyal wrote:
> > On Thu, Feb 10, 2011 at 03:47:16PM +0800, Gui Jianfeng wrote:
> >
> > [..]
> >> +/*
> >> + * The time when a CFQ queue is put onto a service tree is recoreded in
> >> + * cfqq->reposition_time. Currently, we check the first priority CFQ queues
> >> + * on each service tree, and select the workload type that contains the lowest
> >> + * reposition_time CFQ queue among them.
> >> + */
> >> static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd,
> >> struct cfq_group *cfqg, enum wl_prio_t prio)
> >> {
> >> struct cfq_entity *cfqe;
> >> + struct cfq_queue *cfqq;
> >> + unsigned long lowest_start_time;
> >> int i;
> >> - bool key_valid = false;
> >> - unsigned long lowest_key = 0;
> >> + bool time_valid = false;
> >> enum wl_type_t cur_best = SYNC_NOIDLE_WORKLOAD;
> >>
> >> + /*
> >> + * TODO: We may take io priority and io class into account when
> >> + * choosing a workload type. But for the time being just make use of
> >> + * reposition_time only.
> >> + */
> >> for (i = 0; i <= SYNC_WORKLOAD; ++i) {
> >> - /* select the one with lowest rb_key */
> >> cfqe = cfq_rb_first(service_tree_for(cfqg, prio, i));
> >> - if (cfqe &&
> >> - (!key_valid || time_before(cfqe->rb_key, lowest_key))) {
> >> - lowest_key = cfqe->rb_key;
> >> + cfqq = cfqq_of_entity(cfqe);
> >> + if (cfqe && (!time_valid ||
> >> + time_before(cfqq->reposition_time,
> >> + lowest_start_time))) {
> >> + lowest_start_time = cfqq->reposition_time;
> >
> > Gui,
> >
> > Have you had a chance to run some mixed workloads in a group (some sync,
> > some async and some sync-idle queues), and see how latency and throughput
> > of sync-idle workload changes due to this "resposition_time" logic. I
> > just want to make sure that latency of sync-noidle workload does not
> > go up as that's the workload that people care and gets noticed first.
>
> Hi Vivek,
>
> I made a quick test by using fio. It seems the number changes little
> between vanilla kernel and patched kernel.
>
>
> Vanilla: SYNC read SYNC-NOIDLE read ASYNC write
> 1. 23,640KB/s 5.40 ---- 6,696KB/s 19.07 ---- 50,142KB/s 128.00
> 2. 24,459KB/s 5.22 ---- 6,775KB/s 18.86 ---- 47,349KB/s 129.89
> 3. 25,929KB/s 4.93 ---- 7,378KB/s 17.32 ---- 32,350KB/s 131.88
>
> Patched: SYNC read SYNC-NOIDLE read ASYNC write
> 1. 24,000KB/s 5.32 ---- 6,942KB/s 18.39 ---- 30,860KB/s 135.95
> 2. 23,678KB/s 5.40 ---- 7,274KB/s 17.58 ---- 67,432KB/s 120.44
> 3. 23,004KB/s 5.55 ---- 6,621KB/s 19.30 ---- 36,536KB/s 148.64

Hi Gui,

Do you also have latency numbers? I am especially interested max completion
latencies of SYNC-NOIDLE workload.

Thanks
Vivek

2011-02-21 01:22:37

by Gui, Jianfeng/归 剑峰

[permalink] [raw]
Subject: Re: [PATCH 3/6 v4] cfq-iosched: Introduce vdisktime and io weight for CFQ queue

Vivek Goyal wrote:
> On Fri, Feb 18, 2011 at 02:04:18PM +0800, Gui Jianfeng wrote:
>> Vivek Goyal wrote:
>>> On Thu, Feb 10, 2011 at 03:47:16PM +0800, Gui Jianfeng wrote:
>>>
>>> [..]
>>>> +/*
>>>> + * The time when a CFQ queue is put onto a service tree is recoreded in
>>>> + * cfqq->reposition_time. Currently, we check the first priority CFQ queues
>>>> + * on each service tree, and select the workload type that contains the lowest
>>>> + * reposition_time CFQ queue among them.
>>>> + */
>>>> static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd,
>>>> struct cfq_group *cfqg, enum wl_prio_t prio)
>>>> {
>>>> struct cfq_entity *cfqe;
>>>> + struct cfq_queue *cfqq;
>>>> + unsigned long lowest_start_time;
>>>> int i;
>>>> - bool key_valid = false;
>>>> - unsigned long lowest_key = 0;
>>>> + bool time_valid = false;
>>>> enum wl_type_t cur_best = SYNC_NOIDLE_WORKLOAD;
>>>>
>>>> + /*
>>>> + * TODO: We may take io priority and io class into account when
>>>> + * choosing a workload type. But for the time being just make use of
>>>> + * reposition_time only.
>>>> + */
>>>> for (i = 0; i <= SYNC_WORKLOAD; ++i) {
>>>> - /* select the one with lowest rb_key */
>>>> cfqe = cfq_rb_first(service_tree_for(cfqg, prio, i));
>>>> - if (cfqe &&
>>>> - (!key_valid || time_before(cfqe->rb_key, lowest_key))) {
>>>> - lowest_key = cfqe->rb_key;
>>>> + cfqq = cfqq_of_entity(cfqe);
>>>> + if (cfqe && (!time_valid ||
>>>> + time_before(cfqq->reposition_time,
>>>> + lowest_start_time))) {
>>>> + lowest_start_time = cfqq->reposition_time;
>>> Gui,
>>>
>>> Have you had a chance to run some mixed workloads in a group (some sync,
>>> some async and some sync-idle queues), and see how latency and throughput
>>> of sync-idle workload changes due to this "resposition_time" logic. I
>>> just want to make sure that latency of sync-noidle workload does not
>>> go up as that's the workload that people care and gets noticed first.
>> Hi Vivek,
>>
>> I made a quick test by using fio. It seems the number changes little
>> between vanilla kernel and patched kernel.
>>
>>
>> Vanilla: SYNC read SYNC-NOIDLE read ASYNC write
>> 1. 23,640KB/s 5.40 ---- 6,696KB/s 19.07 ---- 50,142KB/s 128.00
>> 2. 24,459KB/s 5.22 ---- 6,775KB/s 18.86 ---- 47,349KB/s 129.89
>> 3. 25,929KB/s 4.93 ---- 7,378KB/s 17.32 ---- 32,350KB/s 131.88
>>
>> Patched: SYNC read SYNC-NOIDLE read ASYNC write
>> 1. 24,000KB/s 5.32 ---- 6,942KB/s 18.39 ---- 30,860KB/s 135.95
>> 2. 23,678KB/s 5.40 ---- 7,274KB/s 17.58 ---- 67,432KB/s 120.44
>> 3. 23,004KB/s 5.55 ---- 6,621KB/s 19.30 ---- 36,536KB/s 148.64
>
> Hi Gui,
>
> Do you also have latency numbers? I am especially interested max completion
> latencies of SYNC-NOIDLE workload.

Vivek,

The number behind bandwidth is the average completion latency which is extracted
from the fio output.
For example, 23,640KB/s 5.40, the average completion latency is 5.40 usec.
I'll re-test to check *max* completion latencies.

Thanks,
Gui

>
> Thanks
> Vivek
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/
>

2011-02-21 05:56:02

by Gui, Jianfeng/归 剑峰

[permalink] [raw]
Subject: Re: [PATCH 3/6 v4] cfq-iosched: Introduce vdisktime and io weight for CFQ queue

Vivek Goyal wrote:
> On Fri, Feb 18, 2011 at 02:04:18PM +0800, Gui Jianfeng wrote:
>> Vivek Goyal wrote:
>>> On Thu, Feb 10, 2011 at 03:47:16PM +0800, Gui Jianfeng wrote:
>>>
>>> [..]
>>>> +/*
>>>> + * The time when a CFQ queue is put onto a service tree is recorded in
>>>> + * cfqq->reposition_time. Currently, we check the first priority CFQ queues
>>>> + * on each service tree, and select the workload type that contains the lowest
>>>> + * reposition_time CFQ queue among them.
>>>> + */
>>>> static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd,
>>>> struct cfq_group *cfqg, enum wl_prio_t prio)
>>>> {
>>>> struct cfq_entity *cfqe;
>>>> + struct cfq_queue *cfqq;
>>>> + unsigned long lowest_start_time;
>>>> int i;
>>>> - bool key_valid = false;
>>>> - unsigned long lowest_key = 0;
>>>> + bool time_valid = false;
>>>> enum wl_type_t cur_best = SYNC_NOIDLE_WORKLOAD;
>>>>
>>>> + /*
>>>> + * TODO: We may take io priority and io class into account when
>>>> + * choosing a workload type. But for the time being just make use of
>>>> + * reposition_time only.
>>>> + */
>>>> for (i = 0; i <= SYNC_WORKLOAD; ++i) {
>>>> - /* select the one with lowest rb_key */
>>>> cfqe = cfq_rb_first(service_tree_for(cfqg, prio, i));
>>>> - if (cfqe &&
>>>> - (!key_valid || time_before(cfqe->rb_key, lowest_key))) {
>>>> - lowest_key = cfqe->rb_key;
>>>> + cfqq = cfqq_of_entity(cfqe);
>>>> + if (cfqe && (!time_valid ||
>>>> + time_before(cfqq->reposition_time,
>>>> + lowest_start_time))) {
>>>> + lowest_start_time = cfqq->reposition_time;
>>> Gui,
>>>
>>> Have you had a chance to run some mixed workloads in a group (some sync,
>>> some async and some sync-idle queues), and see how latency and throughput
>>> of sync-idle workload changes due to this "reposition_time" logic. I
>>> just want to make sure that latency of sync-noidle workload does not
>>> go up as that's the workload that people care and gets noticed first.
>> Hi Vivek,
>>
>> I made a quick test by using fio. It seems the number changes little
>> between vanilla kernel and patched kernel.
>>
>>
>> Vanilla: SYNC read SYNC-NOIDLE read ASYNC write
>> 1. 23,640KB/s 5.40 ---- 6,696KB/s 19.07 ---- 50,142KB/s 128.00
>> 2. 24,459KB/s 5.22 ---- 6,775KB/s 18.86 ---- 47,349KB/s 129.89
>> 3. 25,929KB/s 4.93 ---- 7,378KB/s 17.32 ---- 32,350KB/s 131.88
>>
>> Patched: SYNC read SYNC-NOIDLE read ASYNC write
>> 1. 24,000KB/s 5.32 ---- 6,942KB/s 18.39 ---- 30,860KB/s 135.95
>> 2. 23,678KB/s 5.40 ---- 7,274KB/s 17.58 ---- 67,432KB/s 120.44
>> 3. 23,004KB/s 5.55 ---- 6,621KB/s 19.30 ---- 36,536KB/s 148.64
>
> Hi Gui,
>
> Do you also have latency numbers? I am especially interested max completion
> latencies of SYNC-NOIDLE workload.

Vivek,

Here are some numbers about latency between the vanilla and patched kernels.
I tested 4 times for each. It seems no latency regression happens.

Vanilla:
1. clat (msec): min=1, max=302, avg=18.19, stdev=39.80
2. clat (msec): min=1, max=201, avg=17.76, stdev=31.90
3. clat (msec): min=1, max=303, avg=18.64, stdev=41.30
4. clat (msec): min=1, max=370, avg=17.43, stdev=35.09

Patched:
1. clat (msec): min=1, max=176, avg=19.00, stdev=32.98
2. clat (msec): min=1, max=175, avg=17.75, stdev=32.41
3. clat (msec): min=1, max=191, avg=19.11, stdev=33.28
4. clat (msec): min=1, max=176, avg=17.11, stdev=32.99

Thanks,
Gui

>
> Thanks
> Vivek
>

2011-02-21 15:41:36

by Vivek Goyal

[permalink] [raw]
Subject: Re: [PATCH 3/6 v4] cfq-iosched: Introduce vdisktime and io weight for CFQ queue

On Mon, Feb 21, 2011 at 01:55:38PM +0800, Gui Jianfeng wrote:
> Vivek Goyal wrote:
> > On Fri, Feb 18, 2011 at 02:04:18PM +0800, Gui Jianfeng wrote:
> >> Vivek Goyal wrote:
> >>> On Thu, Feb 10, 2011 at 03:47:16PM +0800, Gui Jianfeng wrote:
> >>>
> >>> [..]
> >>>> +/*
> >>>> + * The time when a CFQ queue is put onto a service tree is recorded in
> >>>> + * cfqq->reposition_time. Currently, we check the first priority CFQ queues
> >>>> + * on each service tree, and select the workload type that contains the lowest
> >>>> + * reposition_time CFQ queue among them.
> >>>> + */
> >>>> static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd,
> >>>> struct cfq_group *cfqg, enum wl_prio_t prio)
> >>>> {
> >>>> struct cfq_entity *cfqe;
> >>>> + struct cfq_queue *cfqq;
> >>>> + unsigned long lowest_start_time;
> >>>> int i;
> >>>> - bool key_valid = false;
> >>>> - unsigned long lowest_key = 0;
> >>>> + bool time_valid = false;
> >>>> enum wl_type_t cur_best = SYNC_NOIDLE_WORKLOAD;
> >>>>
> >>>> + /*
> >>>> + * TODO: We may take io priority and io class into account when
> >>>> + * choosing a workload type. But for the time being just make use of
> >>>> + * reposition_time only.
> >>>> + */
> >>>> for (i = 0; i <= SYNC_WORKLOAD; ++i) {
> >>>> - /* select the one with lowest rb_key */
> >>>> cfqe = cfq_rb_first(service_tree_for(cfqg, prio, i));
> >>>> - if (cfqe &&
> >>>> - (!key_valid || time_before(cfqe->rb_key, lowest_key))) {
> >>>> - lowest_key = cfqe->rb_key;
> >>>> + cfqq = cfqq_of_entity(cfqe);
> >>>> + if (cfqe && (!time_valid ||
> >>>> + time_before(cfqq->reposition_time,
> >>>> + lowest_start_time))) {
> >>>> + lowest_start_time = cfqq->reposition_time;
> >>> Gui,
> >>>
> >>> Have you had a chance to run some mixed workloads in a group (some sync,
> >>> some async and some sync-idle queues), and see how latency and throughput
> >>> of sync-idle workload changes due to this "reposition_time" logic. I
> >>> just want to make sure that latency of sync-noidle workload does not
> >>> go up as that's the workload that people care and gets noticed first.
> >> Hi Vivek,
> >>
> >> I made a quick test by using fio. It seems the number changes little
> >> between vanilla kernel and patched kernel.
> >>
> >>
> >> Vanilla: SYNC read SYNC-NOIDLE read ASYNC write
> >> 1. 23,640KB/s 5.40 ---- 6,696KB/s 19.07 ---- 50,142KB/s 128.00
> >> 2. 24,459KB/s 5.22 ---- 6,775KB/s 18.86 ---- 47,349KB/s 129.89
> >> 3. 25,929KB/s 4.93 ---- 7,378KB/s 17.32 ---- 32,350KB/s 131.88
> >>
> >> Patched: SYNC read SYNC-NOIDLE read ASYNC write
> >> 1. 24,000KB/s 5.32 ---- 6,942KB/s 18.39 ---- 30,860KB/s 135.95
> >> 2. 23,678KB/s 5.40 ---- 7,274KB/s 17.58 ---- 67,432KB/s 120.44
> >> 3. 23,004KB/s 5.55 ---- 6,621KB/s 19.30 ---- 36,536KB/s 148.64
> >
> > Hi Gui,
> >
> > Do you also have latency numbers? I am especially interested max completion
> > latencies of SYNC-NOIDLE workload.
>
> Vivek,
>
> Here some numbers about latency between vanilla and patched kernel.
> I tested 4 times for each. It seems no latency regression happens.
>
> Vanilla:
> 1. clat (msec): min=1, max=302, avg=18.19, stdev=39.80
> 2. clat (msec): min=1, max=201, avg=17.76, stdev=31.90
> 3. clat (msec): min=1, max=303, avg=18.64, stdev=41.30
> 4. clat (msec): min=1, max=370, avg=17.43, stdev=35.09
>
> Patched:
> 1. clat (msec): min=1, max=176, avg=19.00, stdev=32.98
> 2. clat (msec): min=1, max=175, avg=17.75, stdev=32.41
> 3. clat (msec): min=1, max=191, avg=19.11, stdev=33.28
> 4. clat (msec): min=1, max=176, avg=17.11, stdev=32.99

Thanks Gui. In fact, they seem to have improved a bit for the sync-noidle
workload. So there are no major issues in the presence of other SYNC-IDLE
and ASYNC workloads. I wanted to be sure of that. If we run into issues,
we will tweak the workload selection logic further.

Thanks
Vivek