This patch makes CFQ queues and CFQ groups schedule at the same level.
Consider the following hierarchy:
          Root
         /  |  \
       q1   q2  G1
               /  \
             q3    G2
q1, q2 and q3 are CFQ queues; G1 and G2 are CFQ groups. With this patch, q1,
q2 and G1 are scheduled on the same service tree in the Root CFQ group, and
q3 and G2 are scheduled under G1. Note that, for the time being, a CFQ group
is treated as "BE and SYNC" workload and is put on the "BE and SYNC" service
tree. That means service differentiation only happens on the "BE and SYNC"
service tree. Later, we may introduce an "IO Class" for CFQ groups.
Signed-off-by: Gui Jianfeng <[email protected]>
---
block/cfq-iosched.c | 473 ++++++++++++++++++++++++++++++++++----------------
1 files changed, 321 insertions(+), 152 deletions(-)
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 6486956..d90627e 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -105,6 +105,9 @@ struct cfq_entity {
u64 vdisktime;
bool is_group_entity;
unsigned int weight;
+ struct cfq_entity *parent;
+ /* Reposition time */
+ unsigned long reposition_time;
};
/*
@@ -113,8 +116,6 @@ struct cfq_entity {
struct cfq_queue {
/* The schedule entity */
struct cfq_entity cfqe;
- /* Reposition time */
- unsigned long reposition_time;
/* reference count */
atomic_t ref;
/* various state flags, see below */
@@ -194,6 +195,9 @@ struct cfq_group {
/* number of cfqq currently on this group */
int nr_cfqq;
+ /* number of sub cfq groups */
+ int nr_subgp;
+
/*
* Per group busy queus average. Useful for workload slice calc. We
* create the array for each prio class but at run time it is used
@@ -229,8 +233,6 @@ struct cfq_group {
*/
struct cfq_data {
struct request_queue *queue;
- /* Root service tree for cfq_groups */
- struct cfq_rb_root grp_service_tree;
struct cfq_group root_group;
/*
@@ -347,8 +349,6 @@ cfqg_of_entity(struct cfq_entity *cfqe)
return NULL;
}
-static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd);
-
static struct cfq_rb_root *service_tree_for(struct cfq_group *cfqg,
enum wl_prio_t prio,
enum wl_type_t type)
@@ -638,10 +638,15 @@ static inline unsigned cfq_group_get_avg_queues(struct cfq_data *cfqd,
static inline unsigned
cfq_group_slice(struct cfq_data *cfqd, struct cfq_group *cfqg)
{
- struct cfq_rb_root *st = &cfqd->grp_service_tree;
struct cfq_entity *cfqe = &cfqg->cfqe;
+ struct cfq_rb_root *st = cfqe->service_tree;
- return cfq_target_latency * cfqe->weight / st->total_weight;
+ if (st)
+ return cfq_target_latency * cfqe->weight
+ / st->total_weight;
+ else
+ /* If this is the root group, give it a full slice. */
+ return cfq_target_latency;
}
static inline void
@@ -804,17 +809,6 @@ static struct cfq_entity *cfq_rb_first(struct cfq_rb_root *root)
return NULL;
}
-static struct cfq_entity *cfq_rb_first_entity(struct cfq_rb_root *root)
-{
- if (!root->left)
- root->left = rb_first(&root->rb);
-
- if (root->left)
- return rb_entry_entity(root->left);
-
- return NULL;
-}
-
static void rb_erase_init(struct rb_node *n, struct rb_root *root)
{
rb_erase(n, root);
@@ -888,12 +882,15 @@ __cfq_entity_service_tree_add(struct cfq_rb_root *st, struct cfq_entity *cfqe)
rb_link_node(&cfqe->rb_node, parent, node);
rb_insert_color(&cfqe->rb_node, &st->rb);
+
+ update_min_vdisktime(st);
}
static void
cfq_entity_service_tree_add(struct cfq_rb_root *st, struct cfq_entity *cfqe)
{
__cfq_entity_service_tree_add(st, cfqe);
+ cfqe->reposition_time = jiffies;
st->count++;
st->total_weight += cfqe->weight;
}
@@ -901,34 +898,57 @@ cfq_entity_service_tree_add(struct cfq_rb_root *st, struct cfq_entity *cfqe)
static void
cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
{
- struct cfq_rb_root *st = &cfqd->grp_service_tree;
struct cfq_entity *cfqe = &cfqg->cfqe;
- struct cfq_entity *__cfqe;
struct rb_node *n;
+ struct cfq_entity *entity;
+ struct cfq_rb_root *st;
+ struct cfq_group *__cfqg;
cfqg->nr_cfqq++;
+
+ /*
+ * Root group doesn't belong to any service tree
+ */
+ if (cfqg == &cfqd->root_group)
+ return;
+
if (!RB_EMPTY_NODE(&cfqe->rb_node))
return;
/*
- * Currently put the group at the end. Later implement something
- * so that groups get lesser vtime based on their weights, so that
- * if group does not loose all if it was not continously backlogged.
+ * Enqueue this group and its ancestors onto their service tree.
*/
- n = rb_last(&st->rb);
- if (n) {
- __cfqe = rb_entry_entity(n);
- cfqe->vdisktime = __cfqe->vdisktime + CFQ_IDLE_DELAY;
- } else
- cfqe->vdisktime = st->min_vdisktime;
+ while (cfqe && cfqe->parent) {
+ if (!RB_EMPTY_NODE(&cfqe->rb_node))
+ return;
+
+ /*
+ * Currently put the group at the end. Later implement
+ * something so that groups get lesser vtime based on their
+ * weights, so that a group does not lose all of its share
+ * if it was not continuously backlogged.
+ */
+ st = cfqe->service_tree;
+ n = rb_last(&st->rb);
+ if (n) {
+ entity = rb_entry_entity(n);
+ cfqe->vdisktime = entity->vdisktime +
+ CFQ_IDLE_DELAY;
+ } else
+ cfqe->vdisktime = st->min_vdisktime;
- cfq_entity_service_tree_add(st, cfqe);
+ cfq_entity_service_tree_add(st, cfqe);
+ cfqe = cfqe->parent;
+ __cfqg = cfqg_of_entity(cfqe);
+ __cfqg->nr_subgp++;
+ }
}
static void
__cfq_entity_service_tree_del(struct cfq_rb_root *st, struct cfq_entity *cfqe)
{
cfq_rb_erase(&cfqe->rb_node, st);
+ update_min_vdisktime(st);
}
static void
@@ -937,27 +957,47 @@ cfq_entity_service_tree_del(struct cfq_rb_root *st, struct cfq_entity *cfqe)
if (!RB_EMPTY_NODE(&cfqe->rb_node)) {
__cfq_entity_service_tree_del(st, cfqe);
st->total_weight -= cfqe->weight;
- cfqe->service_tree = NULL;
}
}
static void
cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg)
{
- struct cfq_rb_root *st = &cfqd->grp_service_tree;
struct cfq_entity *cfqe = &cfqg->cfqe;
+ struct cfq_group *__cfqg, *p_cfqg;
BUG_ON(cfqg->nr_cfqq < 1);
cfqg->nr_cfqq--;
+ /*
+ * Root group doesn't belong to any service tree
+ */
+ if (cfqg == &cfqd->root_group)
+ return;
+
/* If there are other cfq queues under this group, don't delete it */
if (cfqg->nr_cfqq)
return;
- cfq_log_cfqg(cfqd, cfqg, "del_from_rr group");
- cfq_entity_service_tree_del(st, cfqe);
- cfqg->saved_workload_slice = 0;
- cfq_blkiocg_update_dequeue_stats(&cfqg->blkg, 1);
+ /* If child group exists, don't dequeue it */
+ if (cfqg->nr_subgp)
+ return;
+
+ /*
+ * Dequeue this group and its ancestors from their service tree.
+ */
+ while (cfqe && cfqe->parent) {
+ __cfqg = cfqg_of_entity(cfqe);
+ p_cfqg = cfqg_of_entity(cfqe->parent);
+ cfq_entity_service_tree_del(cfqe->service_tree, cfqe);
+ cfq_blkiocg_update_dequeue_stats(&__cfqg->blkg, 1);
+ cfq_log_cfqg(cfqd, __cfqg, "del_from_rr group");
+ __cfqg->saved_workload_slice = 0;
+ cfqe = cfqe->parent;
+ p_cfqg->nr_subgp--;
+ if (p_cfqg->nr_cfqq || p_cfqg->nr_subgp)
+ return;
+ }
}
static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq)
@@ -989,7 +1029,6 @@ static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq)
static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
struct cfq_queue *cfqq)
{
- struct cfq_rb_root *st = &cfqd->grp_service_tree;
unsigned int used_sl, charge;
int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg)
- cfqg->service_tree_idle.count;
@@ -1003,10 +1042,21 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
else if (!cfq_cfqq_sync(cfqq) && !nr_sync)
charge = cfqq->allocated_slice;
- /* Can't update vdisktime while group is on service tree */
- __cfq_entity_service_tree_del(st, cfqe);
- cfqe->vdisktime += cfq_scale_slice(charge, cfqe);
- __cfq_entity_service_tree_add(st, cfqe);
+ /*
+ * Update the vdisktime on the whole chain.
+ */
+ while (cfqe && cfqe->parent) {
+ struct cfq_rb_root *st = cfqe->service_tree;
+
+ /* Can't update vdisktime while group is on service tree */
+ __cfq_entity_service_tree_del(st, cfqe);
+ cfqe->vdisktime += cfq_scale_slice(charge, cfqe);
+ __cfq_entity_service_tree_add(st, cfqe);
+ st->count++;
+ cfqe->reposition_time = jiffies;
+ cfqe = cfqe->parent;
+ }
+
/* This group is being expired. Save the context */
if (time_after(cfqd->workload_expires, jiffies)) {
@@ -1018,7 +1068,8 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
cfqg->saved_workload_slice = 0;
cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu",
- cfqe->vdisktime, st->min_vdisktime);
+ cfqg->cfqe.vdisktime,
+ cfqg->cfqe.service_tree->min_vdisktime);
cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u disp=%u charge=%u iops=%u"
" sect=%u", used_sl, cfqq->slice_dispatch, charge,
iops_mode(cfqd), cfqq->nr_sectors);
@@ -1040,35 +1091,27 @@ void cfq_update_blkio_group_weight(void *key, struct blkio_group *blkg,
cfqg_of_blkg(blkg)->cfqe.weight = weight;
}
-static struct cfq_group *
-cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
+static void init_cfqe(struct blkio_cgroup *blkcg,
+ struct cfq_group *cfqg)
+{
+ struct cfq_entity *cfqe = &cfqg->cfqe;
+
+ cfqe->weight = blkcg_get_weight(blkcg, cfqg->blkg.dev);
+ RB_CLEAR_NODE(&cfqe->rb_node);
+ cfqe->is_group_entity = true;
+ cfqe->parent = NULL;
+}
+
+static void init_cfqg(struct cfq_data *cfqd, struct blkio_cgroup *blkcg,
+ struct cfq_group *cfqg)
{
- struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
- struct cfq_group *cfqg = NULL;
- void *key = cfqd;
int i, j;
struct cfq_rb_root *st;
- struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
unsigned int major, minor;
-
- cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
- if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
- sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
- cfqg->blkg.dev = MKDEV(major, minor);
- goto done;
- }
- if (cfqg || !create)
- goto done;
-
- cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC, cfqd->queue->node);
- if (!cfqg)
- goto done;
+ struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
for_each_cfqg_st(cfqg, i, j, st)
*st = CFQ_RB_ROOT;
- RB_CLEAR_NODE(&cfqg->cfqe.rb_node);
-
- cfqg->cfqe.is_group_entity = true;
/*
* Take the initial reference that will be released on destroy
@@ -1078,24 +1121,119 @@ cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
*/
atomic_set(&cfqg->ref, 1);
+ /* Add group onto cgroup list */
+ sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
+ cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd,
+ MKDEV(major, minor));
+ /* Initialize the group entity */
+ init_cfqe(blkcg, cfqg);
+ /* Add group on cfqd list */
+ hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list);
+}
+
+static void cfq_destroy_cfqg(struct cfq_data *cfqd, struct cfq_group *cfqg);
+
+static void uninit_cfqg(struct cfq_data *cfqd, struct cfq_group *cfqg)
+{
+ if (!cfq_blkiocg_del_blkio_group(&cfqg->blkg))
+ cfq_destroy_cfqg(cfqd, cfqg);
+}
+
+static void cfqg_set_parent(struct cfq_data *cfqd, struct cfq_group *cfqg,
+ struct cfq_group *p_cfqg)
+{
+ struct cfq_entity *cfqe, *p_cfqe;
+
+ cfqe = &cfqg->cfqe;
+
+ p_cfqe = &p_cfqg->cfqe;
+
+ cfqe->parent = p_cfqe;
+
/*
- * Add group onto cgroup list. It might happen that bdi->dev is
- * not initiliazed yet. Initialize this new group without major
- * and minor info and this info will be filled in once a new thread
- * comes for IO. See code above.
+ * Currently, just put cfq group entity on "BE:SYNC" workload
+ * service tree.
*/
- if (bdi->dev) {
- sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
- cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd,
- MKDEV(major, minor));
- } else
- cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd,
- 0);
+ cfqe->service_tree = service_tree_for(p_cfqg, BE_WORKLOAD,
+ SYNC_WORKLOAD);
+ /* child reference */
+ atomic_inc(&p_cfqg->ref);
+}
- cfqg->cfqe.weight = blkcg_get_weight(blkcg, cfqg->blkg.dev);
+int cfqg_chain_alloc(struct cfq_data *cfqd, struct cgroup *cgroup)
+{
+ struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
+ struct blkio_cgroup *p_blkcg;
+ struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
+ unsigned int major, minor;
+ struct cfq_group *cfqg, *p_cfqg;
+ void *key = cfqd;
+ int ret;
- /* Add group on cfqd list */
- hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list);
+ cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
+ if (cfqg) {
+ if (!cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
+ sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
+ cfqg->blkg.dev = MKDEV(major, minor);
+ }
+ /* chain has already been built */
+ return 0;
+ }
+
+ cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC, cfqd->queue->node);
+ if (!cfqg)
+ return -1;
+
+ init_cfqg(cfqd, blkcg, cfqg);
+
+ /* Already at the top group */
+ if (!cgroup->parent)
+ return 0;
+
+ /* Allocate CFQ groups on the chain */
+ ret = cfqg_chain_alloc(cfqd, cgroup->parent);
+ if (ret == -1) {
+ uninit_cfqg(cfqd, cfqg);
+ return -1;
+ }
+
+ p_blkcg = cgroup_to_blkio_cgroup(cgroup->parent);
+ p_cfqg = cfqg_of_blkg(blkiocg_lookup_group(p_blkcg, key));
+ BUG_ON(p_cfqg == NULL);
+
+ cfqg_set_parent(cfqd, cfqg, p_cfqg);
+ return 0;
+}
+
+static struct cfq_group *
+cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
+{
+ struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
+ struct cfq_group *cfqg = NULL;
+ void *key = cfqd;
+ struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
+ unsigned int major, minor;
+ int ret;
+
+ cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
+ if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
+ sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
+ cfqg->blkg.dev = MKDEV(major, minor);
+ goto done;
+ }
+ if (cfqg || !create)
+ goto done;
+
+ /*
+ * For hierarchical cfq group scheduling, we need to allocate
+ * the whole cfq group chain.
+ */
+ ret = cfqg_chain_alloc(cfqd, cgroup);
+ if (!ret) {
+ cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
+ BUG_ON(cfqg == NULL);
+ goto done;
+ }
done:
return cfqg;
@@ -1140,12 +1278,22 @@ static void cfq_put_cfqg(struct cfq_group *cfqg)
{
struct cfq_rb_root *st;
int i, j;
+ struct cfq_entity *cfqe;
+ struct cfq_group *p_cfqg;
BUG_ON(atomic_read(&cfqg->ref) <= 0);
if (!atomic_dec_and_test(&cfqg->ref))
return;
for_each_cfqg_st(cfqg, i, j, st)
BUG_ON(!RB_EMPTY_ROOT(&st->rb));
+
+ cfqe = &cfqg->cfqe;
+ if (cfqe->parent) {
+ p_cfqg = cfqg_of_entity(cfqe->parent);
+ /* Drop the reference taken by children */
+ atomic_dec(&p_cfqg->ref);
+ }
+
kfree(cfqg);
}
@@ -1358,8 +1506,6 @@ insert:
/* Add cfqq onto service tree. */
cfq_entity_service_tree_add(service_tree, cfqe);
- update_min_vdisktime(service_tree);
- cfqq->reposition_time = jiffies;
if ((add_front || !new_cfqq) && !group_changed)
return;
cfq_group_service_tree_add(cfqd, cfqq->cfqg);
@@ -1802,28 +1948,30 @@ static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd)
return cfqq_of_entity(cfq_rb_first(service_tree));
}
-static struct cfq_queue *cfq_get_next_queue_forced(struct cfq_data *cfqd)
+static struct cfq_entity *
+cfq_get_next_entity_forced(struct cfq_data *cfqd, struct cfq_group *cfqg)
{
- struct cfq_group *cfqg;
- struct cfq_entity *cfqe;
+ struct cfq_entity *entity;
int i, j;
struct cfq_rb_root *st;
if (!cfqd->rq_queued)
return NULL;
- cfqg = cfq_get_next_cfqg(cfqd);
- if (!cfqg)
- return NULL;
-
for_each_cfqg_st(cfqg, i, j, st) {
- cfqe = cfq_rb_first(st);
- if (cfqe != NULL)
- return cfqq_of_entity(cfqe);
+ entity = cfq_rb_first(st);
+
+ if (entity && !entity->is_group_entity)
+ return entity;
+ else if (entity && entity->is_group_entity) {
+ cfqg = cfqg_of_entity(entity);
+ return cfq_get_next_entity_forced(cfqd, cfqg);
+ }
}
return NULL;
}
+
/*
* Get and set a new active queue for service.
*/
@@ -2179,7 +2327,6 @@ static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd,
struct cfq_group *cfqg, enum wl_prio_t prio)
{
struct cfq_entity *cfqe;
- struct cfq_queue *cfqq;
unsigned long lowest_start_time;
int i;
bool time_valid = false;
@@ -2191,10 +2338,9 @@ static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd,
*/
for (i = 0; i <= SYNC_WORKLOAD; ++i) {
cfqe = cfq_rb_first(service_tree_for(cfqg, prio, i));
- cfqq = cfqq_of_entity(cfqe);
if (cfqe && (!time_valid ||
- cfqq->reposition_time < lowest_start_time)) {
- lowest_start_time = cfqq->reposition_time;
+ cfqe->reposition_time < lowest_start_time)) {
+ lowest_start_time = cfqe->reposition_time;
cur_best = i;
time_valid = true;
}
@@ -2203,47 +2349,13 @@ static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd,
return cur_best;
}
-static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg)
+static void set_workload_expire(struct cfq_data *cfqd, struct cfq_group *cfqg)
{
unsigned slice;
unsigned count;
struct cfq_rb_root *st;
unsigned group_slice;
- if (!cfqg) {
- cfqd->serving_prio = IDLE_WORKLOAD;
- cfqd->workload_expires = jiffies + 1;
- return;
- }
-
- /* Choose next priority. RT > BE > IDLE */
- if (cfq_group_busy_queues_wl(RT_WORKLOAD, cfqd, cfqg))
- cfqd->serving_prio = RT_WORKLOAD;
- else if (cfq_group_busy_queues_wl(BE_WORKLOAD, cfqd, cfqg))
- cfqd->serving_prio = BE_WORKLOAD;
- else {
- cfqd->serving_prio = IDLE_WORKLOAD;
- cfqd->workload_expires = jiffies + 1;
- return;
- }
-
- /*
- * For RT and BE, we have to choose also the type
- * (SYNC, SYNC_NOIDLE, ASYNC), and to compute a workload
- * expiration time
- */
- st = service_tree_for(cfqg, cfqd->serving_prio, cfqd->serving_type);
- count = st->count;
-
- /*
- * check workload expiration, and that we still have other queues ready
- */
- if (count && !time_after(jiffies, cfqd->workload_expires))
- return;
-
- /* otherwise select new workload type */
- cfqd->serving_type =
- cfq_choose_wl(cfqd, cfqg, cfqd->serving_prio);
st = service_tree_for(cfqg, cfqd->serving_prio, cfqd->serving_type);
count = st->count;
@@ -2284,26 +2396,51 @@ static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg)
cfqd->workload_expires = jiffies + slice;
}
-static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd)
+static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg)
{
- struct cfq_rb_root *st = &cfqd->grp_service_tree;
- struct cfq_group *cfqg;
- struct cfq_entity *cfqe;
+ struct cfq_rb_root *st;
+ unsigned count;
- if (RB_EMPTY_ROOT(&st->rb))
- return NULL;
- cfqe = cfq_rb_first_entity(st);
- cfqg = cfqg_of_entity(cfqe);
- BUG_ON(!cfqg);
- update_min_vdisktime(st);
- return cfqg;
+ if (!cfqg) {
+ cfqd->serving_prio = IDLE_WORKLOAD;
+ cfqd->workload_expires = jiffies + 1;
+ return;
+ }
+
+ /* Choose next priority. RT > BE > IDLE */
+ if (cfq_group_busy_queues_wl(RT_WORKLOAD, cfqd, cfqg))
+ cfqd->serving_prio = RT_WORKLOAD;
+ else if (cfq_group_busy_queues_wl(BE_WORKLOAD, cfqd, cfqg))
+ cfqd->serving_prio = BE_WORKLOAD;
+ else {
+ cfqd->serving_prio = IDLE_WORKLOAD;
+ cfqd->workload_expires = jiffies + 1;
+ return;
+ }
+
+ /*
+ * For RT and BE, we have to choose also the type
+ * (SYNC, SYNC_NOIDLE, ASYNC), and to compute a workload
+ * expiration time
+ */
+ st = service_tree_for(cfqg, cfqd->serving_prio, cfqd->serving_type);
+ count = st->count;
+
+ /*
+ * check workload expiration, and that we still have other queues ready
+ */
+ if (count && !time_after(jiffies, cfqd->workload_expires))
+ return;
+
+ /* otherwise select new workload type */
+ cfqd->serving_type =
+ cfq_choose_wl(cfqd, cfqg, cfqd->serving_prio);
}
-static void cfq_choose_cfqg(struct cfq_data *cfqd)
+struct cfq_entity *choose_serving_entity(struct cfq_data *cfqd,
+ struct cfq_group *cfqg)
{
- struct cfq_group *cfqg = cfq_get_next_cfqg(cfqd);
-
- cfqd->serving_group = cfqg;
+ struct cfq_rb_root *service_tree;
/* Restore the workload type data */
if (cfqg->saved_workload_slice) {
@@ -2314,8 +2451,21 @@ static void cfq_choose_cfqg(struct cfq_data *cfqd)
cfqd->workload_expires = jiffies - 1;
choose_service_tree(cfqd, cfqg);
-}
+ service_tree = service_tree_for(cfqg, cfqd->serving_prio,
+ cfqd->serving_type);
+
+ if (!cfqd->rq_queued)
+ return NULL;
+
+ /* There is nothing to dispatch */
+ if (!service_tree)
+ return NULL;
+ if (RB_EMPTY_ROOT(&service_tree->rb))
+ return NULL;
+
+ return cfq_rb_first(service_tree);
+}
/*
* Select a queue for service. If we have a current active queue,
* check whether to continue servicing it, or retrieve and set a new one.
@@ -2323,6 +2473,8 @@ static void cfq_choose_cfqg(struct cfq_data *cfqd)
static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
{
struct cfq_queue *cfqq, *new_cfqq = NULL;
+ struct cfq_group *cfqg;
+ struct cfq_entity *entity;
cfqq = cfqd->active_queue;
if (!cfqq)
@@ -2422,8 +2574,23 @@ new_queue:
* Current queue expired. Check if we have to switch to a new
* service tree
*/
- if (!new_cfqq)
- cfq_choose_cfqg(cfqd);
+ cfqg = &cfqd->root_group;
+
+ if (!new_cfqq) {
+ do {
+ entity = choose_serving_entity(cfqd, cfqg);
+ if (entity && !entity->is_group_entity) {
+ /* This is the CFQ queue that should run */
+ new_cfqq = cfqq_of_entity(entity);
+ cfqd->serving_group = cfqg;
+ set_workload_expire(cfqd, cfqg);
+ break;
+ } else if (entity && entity->is_group_entity) {
+ /* Continue to lookup in this CFQ group */
+ cfqg = cfqg_of_entity(entity);
+ }
+ } while (entity && entity->is_group_entity);
+ }
cfqq = cfq_set_active_queue(cfqd, new_cfqq);
keep_queue:
@@ -2454,10 +2621,14 @@ static int cfq_forced_dispatch(struct cfq_data *cfqd)
{
struct cfq_queue *cfqq;
int dispatched = 0;
+ struct cfq_entity *entity;
+ struct cfq_group *root = &cfqd->root_group;
/* Expire the timeslice of the current active queue first */
cfq_slice_expired(cfqd, 0);
- while ((cfqq = cfq_get_next_queue_forced(cfqd)) != NULL) {
+ while ((entity = cfq_get_next_entity_forced(cfqd, root)) != NULL) {
+ BUG_ON(entity->is_group_entity);
+ cfqq = cfqq_of_entity(entity);
__cfq_set_active_queue(cfqd, cfqq);
dispatched += __cfq_forced_dispatch_cfqq(cfqq);
}
@@ -3991,9 +4162,6 @@ static void *cfq_init_queue(struct request_queue *q)
cfqd->cic_index = i;
- /* Init root service tree */
- cfqd->grp_service_tree = CFQ_RB_ROOT;
-
/* Init root group */
cfqg = &cfqd->root_group;
for_each_cfqg_st(cfqg, i, j, st)
@@ -4003,6 +4171,7 @@ static void *cfq_init_queue(struct request_queue *q)
/* Give preference to root group over other groups */
cfqg->cfqe.weight = 2*BLKIO_WEIGHT_DEFAULT;
cfqg->cfqe.is_group_entity = true;
+ cfqg->cfqe.parent = NULL;
#ifdef CONFIG_CFQ_GROUP_IOSCHED
/*
--
1.6.5.2
On Mon, Dec 13, 2010 at 09:45:01AM +0800, Gui Jianfeng wrote:
> This patch makes CFQ queues and CFQ groups schedule at the same level.
> Consider the following hierarchy:
>
>           Root
>          /  |  \
>        q1   q2  G1
>                /  \
>              q3    G2
>
> q1, q2 and q3 are CFQ queues; G1 and G2 are CFQ groups. With this patch, q1,
> q2 and G1 are scheduled on the same service tree in the Root CFQ group, and
> q3 and G2 are scheduled under G1. Note that, for the time being, a CFQ group
> is treated as "BE and SYNC" workload and is put on the "BE and SYNC" service
> tree. That means service differentiation only happens on the "BE and SYNC"
> service tree. Later, we may introduce an "IO Class" for CFQ groups.
>
> Signed-off-by: Gui Jianfeng <[email protected]>
> ---
> block/cfq-iosched.c | 473 ++++++++++++++++++++++++++++++++++----------------
> 1 files changed, 321 insertions(+), 152 deletions(-)
>
> diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
> index 6486956..d90627e 100644
> --- a/block/cfq-iosched.c
> +++ b/block/cfq-iosched.c
> @@ -105,6 +105,9 @@ struct cfq_entity {
> u64 vdisktime;
> bool is_group_entity;
> unsigned int weight;
> + struct cfq_entity *parent;
> + /* Reposition time */
> + unsigned long reposition_time;
> };
>
> /*
> @@ -113,8 +116,6 @@ struct cfq_entity {
> struct cfq_queue {
> /* The schedule entity */
> struct cfq_entity cfqe;
> - /* Reposition time */
> - unsigned long reposition_time;
> /* reference count */
> atomic_t ref;
> /* various state flags, see below */
> @@ -194,6 +195,9 @@ struct cfq_group {
> /* number of cfqq currently on this group */
> int nr_cfqq;
>
> + /* number of sub cfq groups */
> + int nr_subgp;
> +
Do you really have to maintain separate counts for child queues and child
groups? Would a common count, something like nr_children, not be sufficient?
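A minimal sketch of what that merged counter could look like (nr_children is
a hypothetical name, not something in the patch):

        /* in struct cfq_group: one counter for all busy children */
        int nr_children;        /* busy child cfqqs plus busy child cfqgs */

        /* enqueue path, for any child entity type */
        cfqg->nr_children++;

        /* dequeue path: the group can leave its service tree only when
         * its last child, queue or group, goes idle (cfqe = &cfqg->cfqe) */
        cfqg->nr_children--;
        if (!cfqg->nr_children)
                cfq_entity_service_tree_del(cfqe->service_tree, cfqe);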
> /*
> * Per group busy queus average. Useful for workload slice calc. We
> * create the array for each prio class but at run time it is used
> @@ -229,8 +233,6 @@ struct cfq_group {
> */
> struct cfq_data {
> struct request_queue *queue;
> - /* Root service tree for cfq_groups */
> - struct cfq_rb_root grp_service_tree;
I see that you are removing this service tree here and then adding it
back in patch 7. I think that is confusing. In fact, the title of patch 7
is "add flat mode", but flat mode is already supported and we are just
adding hierarchical mode on top of it. I think this is just a matter of
better naming and patch organization so that it is clearer.
> struct cfq_group root_group;
>
> /*
> @@ -347,8 +349,6 @@ cfqg_of_entity(struct cfq_entity *cfqe)
> return NULL;
> }
>
> -static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd);
> -
> static struct cfq_rb_root *service_tree_for(struct cfq_group *cfqg,
> enum wl_prio_t prio,
> enum wl_type_t type)
> @@ -638,10 +638,15 @@ static inline unsigned cfq_group_get_avg_queues(struct cfq_data *cfqd,
> static inline unsigned
> cfq_group_slice(struct cfq_data *cfqd, struct cfq_group *cfqg)
> {
> - struct cfq_rb_root *st = &cfqd->grp_service_tree;
> struct cfq_entity *cfqe = &cfqg->cfqe;
> + struct cfq_rb_root *st = cfqe->service_tree;
>
> - return cfq_target_latency * cfqe->weight / st->total_weight;
> + if (st)
> + return cfq_target_latency * cfqe->weight
> + / st->total_weight;
Is this still true in hierarchical mode? Previously, groups used to be at
the top and there was only one service tree for groups, so st->total_weight
represented the total weight in the system.
With a hierarchy this will not/should not be true, so shouldn't the group
slice calculation be different?
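One possible hierarchical variant (only a sketch; it assumes the slice should
be scaled by the entity's share of total_weight at every level up to the
root):

        static inline unsigned
        cfq_group_slice(struct cfq_data *cfqd, struct cfq_group *cfqg)
        {
                struct cfq_entity *cfqe = &cfqg->cfqe;
                unsigned slice = cfq_target_latency;

                /* scale by this entity's share at each level of the tree */
                while (cfqe && cfqe->service_tree) {
                        slice = slice * cfqe->weight /
                                cfqe->service_tree->total_weight;
                        cfqe = cfqe->parent;
                }
                return slice;
        }

A deep chain can make this product very small, so some lower bound on the
slice would probably be needed; see Gui's reply on this point below.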
> + else
> + /* If this is the root group, give it a full slice. */
> + return cfq_target_latency;
> }
>
> static inline void
> @@ -804,17 +809,6 @@ static struct cfq_entity *cfq_rb_first(struct cfq_rb_root *root)
> return NULL;
> }
>
> -static struct cfq_entity *cfq_rb_first_entity(struct cfq_rb_root *root)
> -{
> - if (!root->left)
> - root->left = rb_first(&root->rb);
> -
> - if (root->left)
> - return rb_entry_entity(root->left);
> -
> - return NULL;
> -}
> -
> static void rb_erase_init(struct rb_node *n, struct rb_root *root)
> {
> rb_erase(n, root);
> @@ -888,12 +882,15 @@ __cfq_entity_service_tree_add(struct cfq_rb_root *st, struct cfq_entity *cfqe)
>
> rb_link_node(&cfqe->rb_node, parent, node);
> rb_insert_color(&cfqe->rb_node, &st->rb);
> +
> + update_min_vdisktime(st);
> }
>
> static void
> cfq_entity_service_tree_add(struct cfq_rb_root *st, struct cfq_entity *cfqe)
> {
> __cfq_entity_service_tree_add(st, cfqe);
> + cfqe->reposition_time = jiffies;
> st->count++;
> st->total_weight += cfqe->weight;
> }
> @@ -901,34 +898,57 @@ cfq_entity_service_tree_add(struct cfq_rb_root *st, struct cfq_entity *cfqe)
> static void
> cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
> {
> - struct cfq_rb_root *st = &cfqd->grp_service_tree;
> struct cfq_entity *cfqe = &cfqg->cfqe;
> - struct cfq_entity *__cfqe;
> struct rb_node *n;
> + struct cfq_entity *entity;
> + struct cfq_rb_root *st;
> + struct cfq_group *__cfqg;
>
> cfqg->nr_cfqq++;
> +
> + /*
> + * Root group doesn't belong to any service tree
> + */
> + if (cfqg == &cfqd->root_group)
> + return;
Can we keep the root group on cfqd->grp_service_tree? In hierarchical mode
there will be only one group on the grp service tree, and in flat mode there
can be many.
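A rough sketch of that, assuming grp_service_tree stays in cfq_data as in the
flat code; cfq_group_service_tree_add() would enqueue the root group there
instead of returning early:

        if (cfqg == &cfqd->root_group) {
                if (RB_EMPTY_NODE(&cfqe->rb_node)) {
                        cfqe->service_tree = &cfqd->grp_service_tree;
                        cfqe->vdisktime = cfqe->service_tree->min_vdisktime;
                        cfq_entity_service_tree_add(cfqe->service_tree, cfqe);
                }
                return;
        }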
> +
> if (!RB_EMPTY_NODE(&cfqe->rb_node))
> return;
>
> /*
> - * Currently put the group at the end. Later implement something
> - * so that groups get lesser vtime based on their weights, so that
> - * if group does not loose all if it was not continously backlogged.
> + * Enqueue this group and its ancestors onto their service tree.
> */
> - n = rb_last(&st->rb);
> - if (n) {
> - __cfqe = rb_entry_entity(n);
> - cfqe->vdisktime = __cfqe->vdisktime + CFQ_IDLE_DELAY;
> - } else
> - cfqe->vdisktime = st->min_vdisktime;
> + while (cfqe && cfqe->parent) {
> + if (!RB_EMPTY_NODE(&cfqe->rb_node))
> + return;
> +
> + /*
> + * Currently put the group at the end. Later implement
> + * something so that groups get lesser vtime based on their
> + * weights, so that a group does not lose all of its share
> + * if it was not continuously backlogged.
> + */
> + st = cfqe->service_tree;
> + n = rb_last(&st->rb);
> + if (n) {
> + entity = rb_entry_entity(n);
> + cfqe->vdisktime = entity->vdisktime +
> + CFQ_IDLE_DELAY;
> + } else
> + cfqe->vdisktime = st->min_vdisktime;
>
> - cfq_entity_service_tree_add(st, cfqe);
> + cfq_entity_service_tree_add(st, cfqe);
> + cfqe = cfqe->parent;
> + __cfqg = cfqg_of_entity(cfqe);
> + __cfqg->nr_subgp++;
> + }
> }
>
> static void
> __cfq_entity_service_tree_del(struct cfq_rb_root *st, struct cfq_entity *cfqe)
> {
> cfq_rb_erase(&cfqe->rb_node, st);
> + update_min_vdisktime(st);
> }
>
> static void
> @@ -937,27 +957,47 @@ cfq_entity_service_tree_del(struct cfq_rb_root *st, struct cfq_entity *cfqe)
> if (!RB_EMPTY_NODE(&cfqe->rb_node)) {
> __cfq_entity_service_tree_del(st, cfqe);
> st->total_weight -= cfqe->weight;
> - cfqe->service_tree = NULL;
> }
> }
>
> static void
> cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg)
> {
> - struct cfq_rb_root *st = &cfqd->grp_service_tree;
> struct cfq_entity *cfqe = &cfqg->cfqe;
> + struct cfq_group *__cfqg, *p_cfqg;
>
> BUG_ON(cfqg->nr_cfqq < 1);
> cfqg->nr_cfqq--;
>
> + /*
> + * Root group doesn't belong to any service tree
> + */
> + if (cfqg == &cfqd->root_group)
> + return;
> +
> /* If there are other cfq queues under this group, don't delete it */
> if (cfqg->nr_cfqq)
> return;
>
> - cfq_log_cfqg(cfqd, cfqg, "del_from_rr group");
> - cfq_entity_service_tree_del(st, cfqe);
> - cfqg->saved_workload_slice = 0;
> - cfq_blkiocg_update_dequeue_stats(&cfqg->blkg, 1);
> + /* If child group exists, don't dequeue it */
> + if (cfqg->nr_subgp)
> + return;
> +
> + /*
> + * Dequeue this group and its ancestors from their service tree.
> + */
> + while (cfqe && cfqe->parent) {
> + __cfqg = cfqg_of_entity(cfqe);
> + p_cfqg = cfqg_of_entity(cfqe->parent);
> + cfq_entity_service_tree_del(cfqe->service_tree, cfqe);
> + cfq_blkiocg_update_dequeue_stats(&__cfqg->blkg, 1);
> + cfq_log_cfqg(cfqd, __cfqg, "del_from_rr group");
> + __cfqg->saved_workload_slice = 0;
> + cfqe = cfqe->parent;
> + p_cfqg->nr_subgp--;
> + if (p_cfqg->nr_cfqq || p_cfqg->nr_subgp)
> + return;
> + }
> }
I think once you merge the queue/group algorithms, you can use the same
functions for adding/deleting queue and group entities and don't have to
use separate functions for groups.
[..]
> - cfqg->cfqe.weight = blkcg_get_weight(blkcg, cfqg->blkg.dev);
> +int cfqg_chain_alloc(struct cfq_data *cfqd, struct cgroup *cgroup)
> +{
> + struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
> + struct blkio_cgroup *p_blkcg;
> + struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
> + unsigned int major, minor;
> + struct cfq_group *cfqg, *p_cfqg;
> + void *key = cfqd;
> + int ret;
>
> - /* Add group on cfqd list */
> - hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list);
> + cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
> + if (cfqg) {
> + if (!cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
> + sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
> + cfqg->blkg.dev = MKDEV(major, minor);
> + }
> + /* chain has already been built */
> + return 0;
> + }
> +
> + cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC, cfqd->queue->node);
> + if (!cfqg)
> + return -1;
> +
> + init_cfqg(cfqd, blkcg, cfqg);
> +
> + /* Already at the top group */
> + if (!cgroup->parent)
> + return 0;
> +
> + /* Allocate CFQ groups on the chain */
> + ret = cfqg_chain_alloc(cfqd, cgroup->parent);
Can you avoid recursion and use for/while loops to initialize the chain?
We don't want to push multiple stack frames in case of a deep hierarchy.
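Something like the following could build the chain iteratively (a rough
sketch reusing the patch's helpers; cfqg_chain_alloc_iter is a hypothetical
name):

        static int cfqg_chain_alloc_iter(struct cfq_data *cfqd,
                                         struct cgroup *cgroup)
        {
                struct cgroup *pos;
                struct blkio_cgroup *blkcg;
                struct cfq_group *cfqg, *p_cfqg;
                void *key = cfqd;

                /* each pass allocates the highest ancestor that still
                 * lacks a cfq_group, so no recursion is needed */
                while (!cfqg_of_blkg(blkiocg_lookup_group(
                                cgroup_to_blkio_cgroup(cgroup), key))) {
                        pos = cgroup;
                        while (pos->parent &&
                               !cfqg_of_blkg(blkiocg_lookup_group(
                                        cgroup_to_blkio_cgroup(pos->parent),
                                        key)))
                                pos = pos->parent;

                        blkcg = cgroup_to_blkio_cgroup(pos);
                        cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC,
                                            cfqd->queue->node);
                        if (!cfqg)
                                return -1;
                        init_cfqg(cfqd, blkcg, cfqg);

                        if (pos->parent) {
                                p_cfqg = cfqg_of_blkg(blkiocg_lookup_group(
                                        cgroup_to_blkio_cgroup(pos->parent),
                                        key));
                                cfqg_set_parent(cfqd, cfqg, p_cfqg);
                        }
                }
                return 0;
        }

This is O(depth^2) lookups in the worst case, but stack usage stays constant
no matter how deep the cgroup hierarchy is.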
> + if (ret == -1) {
> + uninit_cfqg(cfqd, cfqg);
> + return -1;
> + }
> +
> + p_blkcg = cgroup_to_blkio_cgroup(cgroup->parent);
> + p_cfqg = cfqg_of_blkg(blkiocg_lookup_group(p_blkcg, key));
> + BUG_ON(p_cfqg == NULL);
> +
> + cfqg_set_parent(cfqd, cfqg, p_cfqg);
> + return 0;
> +}
> +
> +static struct cfq_group *
> +cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
> +{
> + struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
> + struct cfq_group *cfqg = NULL;
> + void *key = cfqd;
> + struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
> + unsigned int major, minor;
> + int ret;
> +
> + cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
> + if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
> + sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
> + cfqg->blkg.dev = MKDEV(major, minor);
> + goto done;
> + }
> + if (cfqg || !create)
> + goto done;
> +
> + /*
> + * For hierarchical cfq group scheduling, we need to allocate
> + * the whole cfq group chain.
> + */
> + ret = cfqg_chain_alloc(cfqd, cgroup);
> + if (!ret) {
> + cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
> + BUG_ON(cfqg == NULL);
> + goto done;
> + }
>
> done:
> return cfqg;
> @@ -1140,12 +1278,22 @@ static void cfq_put_cfqg(struct cfq_group *cfqg)
> {
> struct cfq_rb_root *st;
> int i, j;
> + struct cfq_entity *cfqe;
> + struct cfq_group *p_cfqg;
>
> BUG_ON(atomic_read(&cfqg->ref) <= 0);
> if (!atomic_dec_and_test(&cfqg->ref))
> return;
> for_each_cfqg_st(cfqg, i, j, st)
> BUG_ON(!RB_EMPTY_ROOT(&st->rb));
> +
> + cfqe = &cfqg->cfqe;
> + if (cfqe->parent) {
> + p_cfqg = cfqg_of_entity(cfqe->parent);
> + /* Drop the reference taken by children */
> + atomic_dec(&p_cfqg->ref);
> + }
> +
Is this the right way to free up the whole parent chain? Just consider a
hierarchy of test1->test2->test3 where somebody drops the reference to test3
and test1 and test2 don't have any other children. In that case, after
freeing up test3, we should be freeing up test2 and test1 as well.
I was thinking that we can achieve this by freeing up groups in a loop:
        do {
                cfqe = cfqg->cfqe.parent;
                if (!atomic_dec_and_test(&cfqg->ref))
                        return;
                kfree(cfqg);
                cfqg = cfqg_of_entity(cfqe);
        } while (cfqg);
> kfree(cfqg);
> }
>
> @@ -1358,8 +1506,6 @@ insert:
> /* Add cfqq onto service tree. */
> cfq_entity_service_tree_add(service_tree, cfqe);
>
> - update_min_vdisktime(service_tree);
> - cfqq->reposition_time = jiffies;
> if ((add_front || !new_cfqq) && !group_changed)
> return;
> cfq_group_service_tree_add(cfqd, cfqq->cfqg);
> @@ -1802,28 +1948,30 @@ static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd)
> return cfqq_of_entity(cfq_rb_first(service_tree));
> }
>
> -static struct cfq_queue *cfq_get_next_queue_forced(struct cfq_data *cfqd)
> +static struct cfq_entity *
> +cfq_get_next_entity_forced(struct cfq_data *cfqd, struct cfq_group *cfqg)
> {
> - struct cfq_group *cfqg;
> - struct cfq_entity *cfqe;
> + struct cfq_entity *entity;
> int i, j;
> struct cfq_rb_root *st;
>
> if (!cfqd->rq_queued)
> return NULL;
>
> - cfqg = cfq_get_next_cfqg(cfqd);
> - if (!cfqg)
> - return NULL;
> -
> for_each_cfqg_st(cfqg, i, j, st) {
> - cfqe = cfq_rb_first(st);
> - if (cfqe != NULL)
> - return cfqq_of_entity(cfqe);
> + entity = cfq_rb_first(st);
> +
> + if (entity && !entity->is_group_entity)
> + return entity;
> + else if (entity && entity->is_group_entity) {
> + cfqg = cfqg_of_entity(entity);
> + return cfq_get_next_entity_forced(cfqd, cfqg);
> + }
> }
> return NULL;
> }
Can the above be simplified by just taking cfqd as a parameter? That will
work for both hierarchical and flat mode. I wanted to avoid recursion, as
somebody can create a deep cgroup hierarchy and push lots of frames on the
stack.
struct cfq_entity *cfq_get_next_entity_forced(struct cfq_data *cfqd)
{
        struct cfq_rb_root *st = &cfqd->grp_service_tree;
        struct cfq_entity *cfqe;

        do {
                cfqe = cfq_rb_first(st);
                if (is_cfqe_cfqq(cfqe))
                        return cfqe;
                st = choose_service_tree_forced(cfqg);
        } while (st);
}

And choose_service_tree_forced() can be something like:

choose_service_tree_forced(cfqg)
{
        for_each_cfqg_st() {
                if (st->count != 0)
                        return st;
        }
}
>
> +
> /*
> * Get and set a new active queue for service.
> */
> @@ -2179,7 +2327,6 @@ static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd,
> struct cfq_group *cfqg, enum wl_prio_t prio)
> {
> struct cfq_entity *cfqe;
> - struct cfq_queue *cfqq;
> unsigned long lowest_start_time;
> int i;
> bool time_valid = false;
> @@ -2191,10 +2338,9 @@ static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd,
> */
> for (i = 0; i <= SYNC_WORKLOAD; ++i) {
> cfqe = cfq_rb_first(service_tree_for(cfqg, prio, i));
> - cfqq = cfqq_of_entity(cfqe);
> if (cfqe && (!time_valid ||
> - cfqq->reposition_time < lowest_start_time)) {
> - lowest_start_time = cfqq->reposition_time;
> + cfqe->reposition_time < lowest_start_time)) {
> + lowest_start_time = cfqe->reposition_time;
> cur_best = i;
> time_valid = true;
> }
> @@ -2203,47 +2349,13 @@ static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd,
> return cur_best;
> }
>
> -static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg)
> +static void set_workload_expire(struct cfq_data *cfqd, struct cfq_group *cfqg)
> {
> unsigned slice;
> unsigned count;
> struct cfq_rb_root *st;
> unsigned group_slice;
>
> - if (!cfqg) {
> - cfqd->serving_prio = IDLE_WORKLOAD;
> - cfqd->workload_expires = jiffies + 1;
> - return;
> - }
> -
> - /* Choose next priority. RT > BE > IDLE */
> - if (cfq_group_busy_queues_wl(RT_WORKLOAD, cfqd, cfqg))
> - cfqd->serving_prio = RT_WORKLOAD;
> - else if (cfq_group_busy_queues_wl(BE_WORKLOAD, cfqd, cfqg))
> - cfqd->serving_prio = BE_WORKLOAD;
> - else {
> - cfqd->serving_prio = IDLE_WORKLOAD;
> - cfqd->workload_expires = jiffies + 1;
> - return;
> - }
> -
> - /*
> - * For RT and BE, we have to choose also the type
> - * (SYNC, SYNC_NOIDLE, ASYNC), and to compute a workload
> - * expiration time
> - */
> - st = service_tree_for(cfqg, cfqd->serving_prio, cfqd->serving_type);
> - count = st->count;
> -
> - /*
> - * check workload expiration, and that we still have other queues ready
> - */
> - if (count && !time_after(jiffies, cfqd->workload_expires))
> - return;
> -
> - /* otherwise select new workload type */
> - cfqd->serving_type =
> - cfq_choose_wl(cfqd, cfqg, cfqd->serving_prio);
> st = service_tree_for(cfqg, cfqd->serving_prio, cfqd->serving_type);
> count = st->count;
>
> @@ -2284,26 +2396,51 @@ static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg)
> cfqd->workload_expires = jiffies + slice;
> }
>
> -static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd)
> +static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg)
> {
> - struct cfq_rb_root *st = &cfqd->grp_service_tree;
> - struct cfq_group *cfqg;
> - struct cfq_entity *cfqe;
> + struct cfq_rb_root *st;
> + unsigned count;
>
> - if (RB_EMPTY_ROOT(&st->rb))
> - return NULL;
> - cfqe = cfq_rb_first_entity(st);
> - cfqg = cfqg_of_entity(cfqe);
> - BUG_ON(!cfqg);
> - update_min_vdisktime(st);
> - return cfqg;
> + if (!cfqg) {
> + cfqd->serving_prio = IDLE_WORKLOAD;
> + cfqd->workload_expires = jiffies + 1;
> + return;
> + }
I am wondering where we use the above code. Do we ever call
choose_service_tree() with cfqg == NULL? I can't seem to figure it out by
looking at the code.
> +
> + /* Choose next priority. RT > BE > IDLE */
> + if (cfq_group_busy_queues_wl(RT_WORKLOAD, cfqd, cfqg))
> + cfqd->serving_prio = RT_WORKLOAD;
> + else if (cfq_group_busy_queues_wl(BE_WORKLOAD, cfqd, cfqg))
> + cfqd->serving_prio = BE_WORKLOAD;
> + else {
> + cfqd->serving_prio = IDLE_WORKLOAD;
> + cfqd->workload_expires = jiffies + 1;
> + return;
> + }
> +
> + /*
> + * For RT and BE, we have to choose also the type
> + * (SYNC, SYNC_NOIDLE, ASYNC), and to compute a workload
> + * expiration time
> + */
> + st = service_tree_for(cfqg, cfqd->serving_prio, cfqd->serving_type);
> + count = st->count;
> +
> + /*
> + * check workload expiration, and that we still have other queues ready
> + */
> + if (count && !time_after(jiffies, cfqd->workload_expires))
> + return;
> +
> + /* otherwise select new workload type */
> + cfqd->serving_type =
> + cfq_choose_wl(cfqd, cfqg, cfqd->serving_prio);
> }
>
> -static void cfq_choose_cfqg(struct cfq_data *cfqd)
> +struct cfq_entity *choose_serving_entity(struct cfq_data *cfqd,
> + struct cfq_group *cfqg)
I think for this function you don't have to pass cfqg as a parameter. You
can just use cfqd as the parameter and then make all decisions based on the
service tree.
So at the top we can continue to have grp_service_tree in cfqd. In
hierarchical mode it will only have the root group queued there, and in
flat mode it can have multiple groups queued.
Also, I am looking forward to simplifying and organizing the CFQ code a
little better so that it is easier to read. Can the choose_serving_entity()
function be organized something like the following? This should work for
both flat and hierarchical modes. The following is only pseudocode:
struct cfq_entity *select_entity(struct cfq_data *cfqd)
{
        struct cfq_rb_root *st = &cfqd->grp_service_tree;
        struct cfq_entity *cfqe;

        do {
                cfqe = cfq_rb_first(st);
                if (is_cfqe_cfqq(cfqe))
                        /* We found the next queue to dispatch from */
                        break;
                else
                        st = choose_service_tree();
        } while (st);

        return cfqe;
}
> {
> - struct cfq_group *cfqg = cfq_get_next_cfqg(cfqd);
> -
> - cfqd->serving_group = cfqg;
> + struct cfq_rb_root *service_tree;
>
> /* Restore the workload type data */
> if (cfqg->saved_workload_slice) {
> @@ -2314,8 +2451,21 @@ static void cfq_choose_cfqg(struct cfq_data *cfqd)
> cfqd->workload_expires = jiffies - 1;
>
> choose_service_tree(cfqd, cfqg);
> -}
>
> + service_tree = service_tree_for(cfqg, cfqd->serving_prio,
> + cfqd->serving_type);
> +
> + if (!cfqd->rq_queued)
> + return NULL;
> +
> + /* There is nothing to dispatch */
> + if (!service_tree)
> + return NULL;
> + if (RB_EMPTY_ROOT(&service_tree->rb))
> + return NULL;
> +
> + return cfq_rb_first(service_tree);
> +}
> /*
> * Select a queue for service. If we have a current active queue,
> * check whether to continue servicing it, or retrieve and set a new one.
> @@ -2323,6 +2473,8 @@ static void cfq_choose_cfqg(struct cfq_data *cfqd)
> static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
> {
> struct cfq_queue *cfqq, *new_cfqq = NULL;
> + struct cfq_group *cfqg;
> + struct cfq_entity *entity;
>
> cfqq = cfqd->active_queue;
> if (!cfqq)
> @@ -2422,8 +2574,23 @@ new_queue:
> * Current queue expired. Check if we have to switch to a new
> * service tree
> */
> - if (!new_cfqq)
> - cfq_choose_cfqg(cfqd);
> + cfqg = &cfqd->root_group;
> +
> + if (!new_cfqq) {
> + do {
> + entity = choose_serving_entity(cfqd, cfqg);
> + if (entity && !entity->is_group_entity) {
> + /* This is the CFQ queue that should run */
> + new_cfqq = cfqq_of_entity(entity);
> + cfqd->serving_group = cfqg;
> + set_workload_expire(cfqd, cfqg);
> + break;
> + } else if (entity && entity->is_group_entity) {
> + /* Continue to lookup in this CFQ group */
> + cfqg = cfqg_of_entity(entity);
> + }
> + } while (entity && entity->is_group_entity);
I think you should move the above logic into a separate function, otherwise
select_queue() is becoming complicated.
Secondly, for traversing the hierarchy you can introduce macros like
for_each_entity() or for_each_cfqe().
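A minimal sketch of such a macro (hypothetical; it mirrors the
while (cfqe && cfqe->parent) pattern the patch repeats in several places):

        #define for_each_cfqe(cfqe) \
                for (; cfqe && cfqe->parent; cfqe = cfqe->parent)

        /* e.g. the vdisktime update in cfq_group_served() becomes: */
        for_each_cfqe(cfqe) {
                struct cfq_rb_root *st = cfqe->service_tree;

                /* can't update vdisktime while on the service tree */
                __cfq_entity_service_tree_del(st, cfqe);
                cfqe->vdisktime += cfq_scale_slice(charge, cfqe);
                __cfq_entity_service_tree_add(st, cfqe);
                st->count++;
                cfqe->reposition_time = jiffies;
        }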
Thirdly, I would again say that flat mode is already supported. Build on
top of it instead of first getting rid of it and then adding it back with
the help of a separate patch. If that is too complicated, then let it be a
single patch instead of separating it out into two patches.
> + }
>
> cfqq = cfq_set_active_queue(cfqd, new_cfqq);
> keep_queue:
> @@ -2454,10 +2621,14 @@ static int cfq_forced_dispatch(struct cfq_data *cfqd)
> {
> struct cfq_queue *cfqq;
> int dispatched = 0;
> + struct cfq_entity *entity;
> + struct cfq_group *root = &cfqd->root_group;
>
> /* Expire the timeslice of the current active queue first */
> cfq_slice_expired(cfqd, 0);
> - while ((cfqq = cfq_get_next_queue_forced(cfqd)) != NULL) {
> + while ((entity = cfq_get_next_entity_forced(cfqd, root)) != NULL) {
> + BUG_ON(entity->is_group_entity);
> + cfqq = cfqq_of_entity(entity);
> __cfq_set_active_queue(cfqd, cfqq);
> dispatched += __cfq_forced_dispatch_cfqq(cfqq);
> }
> @@ -3991,9 +4162,6 @@ static void *cfq_init_queue(struct request_queue *q)
>
> cfqd->cic_index = i;
>
> - /* Init root service tree */
> - cfqd->grp_service_tree = CFQ_RB_ROOT;
> -
> /* Init root group */
> cfqg = &cfqd->root_group;
> for_each_cfqg_st(cfqg, i, j, st)
> @@ -4003,6 +4171,7 @@ static void *cfq_init_queue(struct request_queue *q)
> /* Give preference to root group over other groups */
> cfqg->cfqe.weight = 2*BLKIO_WEIGHT_DEFAULT;
> cfqg->cfqe.is_group_entity = true;
> + cfqg->cfqe.parent = NULL;
>
> #ifdef CONFIG_CFQ_GROUP_IOSCHED
> /*
> --
> 1.6.5.2
>
>
Vivek Goyal wrote:
...
>> -static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd)
>> +static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg)
>> {
>> - struct cfq_rb_root *st = &cfqd->grp_service_tree;
>> - struct cfq_group *cfqg;
>> - struct cfq_entity *cfqe;
>> + struct cfq_rb_root *st;
>> + unsigned count;
>>
>> - if (RB_EMPTY_ROOT(&st->rb))
>> - return NULL;
>> - cfqe = cfq_rb_first_entity(st);
>> - cfqg = cfqg_of_entity(cfqe);
>> - BUG_ON(!cfqg);
>> - update_min_vdisktime(st);
>> - return cfqg;
>> + if (!cfqg) {
>> + cfqd->serving_prio = IDLE_WORKLOAD;
>> + cfqd->workload_expires = jiffies + 1;
>> + return;
>> + }
>
> I am wondering where do we use above code. Do we ever call
> choose_service_tree() with cfqg == NULL? Can't seem to figure out by
> looking at the code.
>
Vivek,
This piece of code comes from the original CFQ code. Thinking more about it,
it seems redundant. When cfq_choose_cfqg() is called in select_queue(), there
must be at least one backlogged CFQ queue waiting for dispatch, hence there
must be at least one backlogged CFQ group on the service tree. So we never
call choose_service_tree() with cfqg == NULL.
I'd like to post a separate patch to get rid of this piece.
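The cleanup could look something like this in choose_service_tree() (just a
sketch of that hypothetical follow-up patch):

-       if (!cfqg) {
-               cfqd->serving_prio = IDLE_WORKLOAD;
-               cfqd->workload_expires = jiffies + 1;
-               return;
-       }
+       BUG_ON(!cfqg);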
Thanks,
Gui
Vivek Goyal wrote:
> On Mon, Dec 13, 2010 at 09:45:01AM +0800, Gui Jianfeng wrote:
>> This patch makes CFQ queues and CFQ groups schedule at the same level.
>> Consider the following hierarchy:
>>
>>           Root
>>          /  |  \
>>        q1   q2  G1
>>                /  \
>>              q3    G2
>>
>> q1, q2 and q3 are CFQ queues; G1 and G2 are CFQ groups. With this patch, q1,
>> q2 and G1 are scheduled on the same service tree in the Root CFQ group, and
>> q3 and G2 are scheduled under G1. Note that, for the time being, a CFQ group
>> is treated as "BE and SYNC" workload and is put on the "BE and SYNC" service
>> tree. That means service differentiation only happens on the "BE and SYNC"
>> service tree. Later, we may introduce an "IO Class" for CFQ groups.
>>
>> Signed-off-by: Gui Jianfeng <[email protected]>
>> ---
>> block/cfq-iosched.c | 473 ++++++++++++++++++++++++++++++++++----------------
>> 1 files changed, 321 insertions(+), 152 deletions(-)
>>
>> diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
>> index 6486956..d90627e 100644
>> --- a/block/cfq-iosched.c
>> +++ b/block/cfq-iosched.c
>> @@ -105,6 +105,9 @@ struct cfq_entity {
>> u64 vdisktime;
>> bool is_group_entity;
>> unsigned int weight;
>> + struct cfq_entity *parent;
>> + /* Reposition time */
>> + unsigned long reposition_time;
>> };
>>
>> /*
>> @@ -113,8 +116,6 @@ struct cfq_entity {
>> struct cfq_queue {
>> /* The schedule entity */
>> struct cfq_entity cfqe;
>> - /* Reposition time */
>> - unsigned long reposition_time;
>> /* reference count */
>> atomic_t ref;
>> /* various state flags, see below */
>> @@ -194,6 +195,9 @@ struct cfq_group {
>> /* number of cfqq currently on this group */
>> int nr_cfqq;
>>
>> + /* number of sub cfq groups */
>> + int nr_subgp;
>> +
>
> Do you really have to maintain separate counts for child queues and
> child groups? Would a common count, something like nr_children, not be
> sufficient?
Currently, nr_subgp is only effective in hierarchical mode, but nr_cfqq works
for both hierarchical and flat mode. So for the time being, we need separate
counts for child queues and groups.
>
>> /*
>> * Per group busy queus average. Useful for workload slice calc. We
>> * create the array for each prio class but at run time it is used
>> @@ -229,8 +233,6 @@ struct cfq_group {
>> */
>> struct cfq_data {
>> struct request_queue *queue;
>> - /* Root service tree for cfq_groups */
>> - struct cfq_rb_root grp_service_tree;
>
> I see that you are removing this service tree here and then adding it
> back in patch 7. I think that is confusing. In fact, the title of patch 7
> is "add flat mode", but flat mode is already supported and we are just
> adding hierarchical mode on top of it. I think this is just a matter of
> better naming and patch organization so that it is clearer.
OK, I will merge the hierarchical and flat patches together.
>
>> struct cfq_group root_group;
>>
>> /*
>> @@ -347,8 +349,6 @@ cfqg_of_entity(struct cfq_entity *cfqe)
>> return NULL;
>> }
>>
>> -static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd);
>> -
>> static struct cfq_rb_root *service_tree_for(struct cfq_group *cfqg,
>> enum wl_prio_t prio,
>> enum wl_type_t type)
>> @@ -638,10 +638,15 @@ static inline unsigned cfq_group_get_avg_queues(struct cfq_data *cfqd,
>> static inline unsigned
>> cfq_group_slice(struct cfq_data *cfqd, struct cfq_group *cfqg)
>> {
>> - struct cfq_rb_root *st = &cfqd->grp_service_tree;
>> struct cfq_entity *cfqe = &cfqg->cfqe;
>> + struct cfq_rb_root *st = cfqe->service_tree;
>>
>> - return cfq_target_latency * cfqe->weight / st->total_weight;
>> + if (st)
>> + return cfq_target_latency * cfqe->weight
>> + / st->total_weight;
>
> Is this still true in hierarchical mode? Previously, groups used to be at
> the top and there was only one service tree for groups, so st->total_weight
> represented the total weight in the system.
>
> With a hierarchy this will not/should not be true, so shouldn't the group
> slice calculation be different?
I just kept the original group slice calculation here. I was thinking that
calculating the group slice in a hierarchical way might yield a really small
group slice, and I am not sure how well that would work. So I just kept the
original calculation. Any thoughts?
>
>> + else
>> + /* If this is the root group, give it a full slice. */
>> + return cfq_target_latency;
>> }
>>
>> static inline void
>> @@ -804,17 +809,6 @@ static struct cfq_entity *cfq_rb_first(struct cfq_rb_root *root)
>> return NULL;
>> }
>>
>> -static struct cfq_entity *cfq_rb_first_entity(struct cfq_rb_root *root)
>> -{
>> - if (!root->left)
>> - root->left = rb_first(&root->rb);
>> -
>> - if (root->left)
>> - return rb_entry_entity(root->left);
>> -
>> - return NULL;
>> -}
>> -
>> static void rb_erase_init(struct rb_node *n, struct rb_root *root)
>> {
>> rb_erase(n, root);
>> @@ -888,12 +882,15 @@ __cfq_entity_service_tree_add(struct cfq_rb_root *st, struct cfq_entity *cfqe)
>>
>> rb_link_node(&cfqe->rb_node, parent, node);
>> rb_insert_color(&cfqe->rb_node, &st->rb);
>> +
>> + update_min_vdisktime(st);
>> }
>>
>> static void
>> cfq_entity_service_tree_add(struct cfq_rb_root *st, struct cfq_entity *cfqe)
>> {
>> __cfq_entity_service_tree_add(st, cfqe);
>> + cfqe->reposition_time = jiffies;
>> st->count++;
>> st->total_weight += cfqe->weight;
>> }
>> @@ -901,34 +898,57 @@ cfq_entity_service_tree_add(struct cfq_rb_root *st, struct cfq_entity *cfqe)
>> static void
>> cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
>> {
>> - struct cfq_rb_root *st = &cfqd->grp_service_tree;
>> struct cfq_entity *cfqe = &cfqg->cfqe;
>> - struct cfq_entity *__cfqe;
>> struct rb_node *n;
>> + struct cfq_entity *entity;
>> + struct cfq_rb_root *st;
>> + struct cfq_group *__cfqg;
>>
>> cfqg->nr_cfqq++;
>> +
>> + /*
>> + * Root group doesn't belong to any service tree
>> + */
>> + if (cfqg == &cfqd->root_group)
>> + return;
>
> Can we keep the root group on cfqd->grp_service_tree? In hierarchical mode
> there will be only one group on the grp service tree, and in flat mode
> there can be many.
Keeping the top service tree different for hierarchical mode and flat mode is
just fine to me. If you don't strongly object, I'd like to keep the current
way. :)
>
>> +
>> if (!RB_EMPTY_NODE(&cfqe->rb_node))
>> return;
>>
>> /*
>> - * Currently put the group at the end. Later implement something
>> - * so that groups get lesser vtime based on their weights, so that
>> - * if group does not loose all if it was not continously backlogged.
>> + * Enqueue this group and its ancestors onto their service tree.
>> */
>> - n = rb_last(&st->rb);
>> - if (n) {
>> - __cfqe = rb_entry_entity(n);
>> - cfqe->vdisktime = __cfqe->vdisktime + CFQ_IDLE_DELAY;
>> - } else
>> - cfqe->vdisktime = st->min_vdisktime;
>> + while (cfqe && cfqe->parent) {
>> + if (!RB_EMPTY_NODE(&cfqe->rb_node))
>> + return;
>> +
>> + /*
>> + * Currently put the group at the end. Later implement
>> + * something so that groups get lesser vtime based on their
>> + * weights, so that a group does not lose all of its share
>> + * if it was not continuously backlogged.
>> + */
>> + st = cfqe->service_tree;
>> + n = rb_last(&st->rb);
>> + if (n) {
>> + entity = rb_entry_entity(n);
>> + cfqe->vdisktime = entity->vdisktime +
>> + CFQ_IDLE_DELAY;
>> + } else
>> + cfqe->vdisktime = st->min_vdisktime;
>>
>> - cfq_entity_service_tree_add(st, cfqe);
>> + cfq_entity_service_tree_add(st, cfqe);
>> + cfqe = cfqe->parent;
>> + __cfqg = cfqg_of_entity(cfqe);
>> + __cfqg->nr_subgp++;
>> + }
>> }
>>
>> static void
>> __cfq_entity_service_tree_del(struct cfq_rb_root *st, struct cfq_entity *cfqe)
>> {
>> cfq_rb_erase(&cfqe->rb_node, st);
>> + update_min_vdisktime(st);
>> }
>>
>> static void
>> @@ -937,27 +957,47 @@ cfq_entity_service_tree_del(struct cfq_rb_root *st, struct cfq_entity *cfqe)
>> if (!RB_EMPTY_NODE(&cfqe->rb_node)) {
>> __cfq_entity_service_tree_del(st, cfqe);
>> st->total_weight -= cfqe->weight;
>> - cfqe->service_tree = NULL;
>> }
>> }
>>
>> static void
>> cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg)
>> {
>> - struct cfq_rb_root *st = &cfqd->grp_service_tree;
>> struct cfq_entity *cfqe = &cfqg->cfqe;
>> + struct cfq_group *__cfqg, *p_cfqg;
>>
>> BUG_ON(cfqg->nr_cfqq < 1);
>> cfqg->nr_cfqq--;
>>
>> + /*
>> + * Root group doesn't belong to any service tree
>> + */
>> + if (cfqg == &cfqd->root_group)
>> + return;
>> +
>> /* If there are other cfq queues under this group, don't delete it */
>> if (cfqg->nr_cfqq)
>> return;
>>
>> - cfq_log_cfqg(cfqd, cfqg, "del_from_rr group");
>> - cfq_entity_service_tree_del(st, cfqe);
>> - cfqg->saved_workload_slice = 0;
>> - cfq_blkiocg_update_dequeue_stats(&cfqg->blkg, 1);
>> + /* If child group exists, don't dequeue it */
>> + if (cfqg->nr_subgp)
>> + return;
>> +
>> + /*
>> + * Dequeue this group and its ancestors from their service tree.
>> + */
>> + while (cfqe && cfqe->parent) {
>> + __cfqg = cfqg_of_entity(cfqe);
>> + p_cfqg = cfqg_of_entity(cfqe->parent);
>> + cfq_entity_service_tree_del(cfqe->service_tree, cfqe);
>> + cfq_blkiocg_update_dequeue_stats(&__cfqg->blkg, 1);
>> + cfq_log_cfqg(cfqd, __cfqg, "del_from_rr group");
>> + __cfqg->saved_workload_slice = 0;
>> + cfqe = cfqe->parent;
>> + p_cfqg->nr_subgp--;
>> + if (p_cfqg->nr_cfqq || p_cfqg->nr_subgp)
>> + return;
>> + }
>> }
>
> I think once you merge the queue/group algorithms, you can use the same
> functions for adding/deleting queue and group entities and don't have to
> use separate functions for groups.
The CFQ entity adding/deleting for queues and groups are almost the same, and
I've already extracted common functions to handle it:
cfq_entity_service_tree_add() and cfq_entity_service_tree_del().
>
> [..]
>> - cfqg->cfqe.weight = blkcg_get_weight(blkcg, cfqg->blkg.dev);
>> +int cfqg_chain_alloc(struct cfq_data *cfqd, struct cgroup *cgroup)
>> +{
>> + struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
>> + struct blkio_cgroup *p_blkcg;
>> + struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
>> + unsigned int major, minor;
>> + struct cfq_group *cfqg, *p_cfqg;
>> + void *key = cfqd;
>> + int ret;
>>
>> - /* Add group on cfqd list */
>> - hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list);
>> + cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
>> + if (cfqg) {
>> + if (!cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
>> + sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
>> + cfqg->blkg.dev = MKDEV(major, minor);
>> + }
>> + /* chain has already been built */
>> + return 0;
>> + }
>> +
>> + cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC, cfqd->queue->node);
>> + if (!cfqg)
>> + return -1;
>> +
>> + init_cfqg(cfqd, blkcg, cfqg);
>> +
>> + /* Already at the top group */
>> + if (!cgroup->parent)
>> + return 0;
>> +
>> + /* Allocate CFQ groups on the chain */
>> + ret = cfqg_chain_alloc(cfqd, cgroup->parent);
>
> Can you avoid recursion and use for/while loops to initialize the chain?
> We don't want to push multiple stack frames in case of a deep hierarchy.
>
OK, will change.
>> + if (ret == -1) {
>> + uninit_cfqg(cfqd, cfqg);
>> + return -1;
>> + }
>> +
>> + p_blkcg = cgroup_to_blkio_cgroup(cgroup->parent);
>> + p_cfqg = cfqg_of_blkg(blkiocg_lookup_group(p_blkcg, key));
>> + BUG_ON(p_cfqg == NULL);
>> +
>> + cfqg_set_parent(cfqd, cfqg, p_cfqg);
>> + return 0;
>> +}
>> +
>> +static struct cfq_group *
>> +cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
>> +{
>> + struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
>> + struct cfq_group *cfqg = NULL;
>> + void *key = cfqd;
>> + struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
>> + unsigned int major, minor;
>> + int ret;
>> +
>> + cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
>> + if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
>> + sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
>> + cfqg->blkg.dev = MKDEV(major, minor);
>> + goto done;
>> + }
>> + if (cfqg || !create)
>> + goto done;
>> +
>> + /*
>> + * For hierarchical cfq group scheduling, we need to allocate
>> + * the whole cfq group chain.
>> + */
>> + ret = cfqg_chain_alloc(cfqd, cgroup);
>> + if (!ret) {
>> + cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
>> + BUG_ON(cfqg == NULL);
>> + goto done;
>> + }
>>
>> done:
>> return cfqg;
>> @@ -1140,12 +1278,22 @@ static void cfq_put_cfqg(struct cfq_group *cfqg)
>> {
>> struct cfq_rb_root *st;
>> int i, j;
>> + struct cfq_entity *cfqe;
>> + struct cfq_group *p_cfqg;
>>
>> BUG_ON(atomic_read(&cfqg->ref) <= 0);
>> if (!atomic_dec_and_test(&cfqg->ref))
>> return;
>> for_each_cfqg_st(cfqg, i, j, st)
>> BUG_ON(!RB_EMPTY_ROOT(&st->rb));
>> +
>> + cfqe = &cfqg->cfqe;
>> + if (cfqe->parent) {
>> + p_cfqg = cfqg_of_entity(cfqe->parent);
>> + /* Drop the reference taken by children */
>> + atomic_dec(&p_cfqg->ref);
>> + }
>> +
>
> Is this the right way to free up the whole parent chain? Just think of a
> hierarchy of test1->test2->test3 where somebody drops the reference to test3
> and test1 and test2 don't have any other children. In that case, after
> freeing up test3, we should be freeing up test2 and test1 also.
>
> I was thinking that we can achieve this by freeing up the groups in a
> loop.
>
> 	do {
> 		cfqe = cfqg->cfqe.parent;
> 		if (!atomic_dec_and_test(&cfqg->ref))
> 			return;
> 		kfree(cfqg);
> 		cfqg = cfqe ? cfqg_of_entity(cfqe) : NULL;
> 	} while (cfqg);
>
OK
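Fleshed out, that loop could look like this (sketch; it assumes, as in this
patch, that each child group holds one reference on its parent cfq_group):

	static void cfq_put_cfqg(struct cfq_group *cfqg)
	{
		struct cfq_entity *p_cfqe;
		struct cfq_rb_root *st;
		int i, j;

		do {
			BUG_ON(atomic_read(&cfqg->ref) <= 0);
			if (!atomic_dec_and_test(&cfqg->ref))
				return;
			for_each_cfqg_st(cfqg, i, j, st)
				BUG_ON(!RB_EMPTY_ROOT(&st->rb));
			/* remember the parent before freeing this group */
			p_cfqe = cfqg->cfqe.parent;
			kfree(cfqg);
			/* dropping our reference may free the parent too */
			cfqg = p_cfqe ? cfqg_of_entity(p_cfqe) : NULL;
		} while (cfqg);
	}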
>> kfree(cfqg);
>> }
>>
>> @@ -1358,8 +1506,6 @@ insert:
>> /* Add cfqq onto service tree. */
>> cfq_entity_service_tree_add(service_tree, cfqe);
>>
>> - update_min_vdisktime(service_tree);
>> - cfqq->reposition_time = jiffies;
>> if ((add_front || !new_cfqq) && !group_changed)
>> return;
>> cfq_group_service_tree_add(cfqd, cfqq->cfqg);
>> @@ -1802,28 +1948,30 @@ static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd)
>> return cfqq_of_entity(cfq_rb_first(service_tree));
>> }
>>
>> -static struct cfq_queue *cfq_get_next_queue_forced(struct cfq_data *cfqd)
>> +static struct cfq_entity *
>> +cfq_get_next_entity_forced(struct cfq_data *cfqd, struct cfq_group *cfqg)
>> {
>> - struct cfq_group *cfqg;
>> - struct cfq_entity *cfqe;
>> + struct cfq_entity *entity;
>> int i, j;
>> struct cfq_rb_root *st;
>>
>> if (!cfqd->rq_queued)
>> return NULL;
>>
>> - cfqg = cfq_get_next_cfqg(cfqd);
>> - if (!cfqg)
>> - return NULL;
>> -
>> for_each_cfqg_st(cfqg, i, j, st) {
>> - cfqe = cfq_rb_first(st);
>> - if (cfqe != NULL)
>> - return cfqq_of_entity(cfqe);
>> + entity = cfq_rb_first(st);
>> +
>> + if (entity && !entity->is_group_entity)
>> + return entity;
>> + else if (entity && entity->is_group_entity) {
>> + cfqg = cfqg_of_entity(entity);
>> + return cfq_get_next_entity_forced(cfqd, cfqg);
>> + }
>> }
>> return NULL;
>> }
>
> Can the above be simplified by just taking cfqd as a parameter? It will
> work both for hierarchical and flat modes. I wanted to avoid recursion, as
> somebody can create a deep cgroup hierarchy and push lots of frames on the
> stack.
Ok, will change.
>
> struct cfq_entity *cfq_get_next_entity_forced(struct cfq_data *cfqd)
> {
> 	struct cfq_rb_root *st = &cfqd->grp_service_tree;
> 	struct cfq_entity *cfqe;
>
> 	do {
> 		cfqe = cfq_rb_first(st);
> 		if (is_cfqe_cfqq(cfqe))
> 			return cfqe;
> 		st = choose_service_tree_forced(cfqg_of_entity(cfqe));
> 	} while (st);
>
> 	return NULL;
> }
>
> And choose_service_tree_forced() can be something like:
>
> static struct cfq_rb_root *choose_service_tree_forced(struct cfq_group *cfqg)
> {
> 	int i, j;
> 	struct cfq_rb_root *st;
>
> 	for_each_cfqg_st(cfqg, i, j, st) {
> 		if (st->count != 0)
> 			return st;
> 	}
> 	return NULL;
> }
>
>>
>> +
>> /*
>> * Get and set a new active queue for service.
>> */
>> @@ -2179,7 +2327,6 @@ static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd,
>> struct cfq_group *cfqg, enum wl_prio_t prio)
>> {
>> struct cfq_entity *cfqe;
>> - struct cfq_queue *cfqq;
>> unsigned long lowest_start_time;
>> int i;
>> bool time_valid = false;
>> @@ -2191,10 +2338,9 @@ static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd,
>> */
>> for (i = 0; i <= SYNC_WORKLOAD; ++i) {
>> cfqe = cfq_rb_first(service_tree_for(cfqg, prio, i));
>> - cfqq = cfqq_of_entity(cfqe);
>> if (cfqe && (!time_valid ||
>> - cfqq->reposition_time < lowest_start_time)) {
>> - lowest_start_time = cfqq->reposition_time;
>> + cfqe->reposition_time < lowest_start_time)) {
>> + lowest_start_time = cfqe->reposition_time;
>> cur_best = i;
>> time_valid = true;
>> }
>> @@ -2203,47 +2349,13 @@ static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd,
>> return cur_best;
>> }
>>
>> -static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg)
>> +static void set_workload_expire(struct cfq_data *cfqd, struct cfq_group *cfqg)
>> {
>> unsigned slice;
>> unsigned count;
>> struct cfq_rb_root *st;
>> unsigned group_slice;
>>
>> - if (!cfqg) {
>> - cfqd->serving_prio = IDLE_WORKLOAD;
>> - cfqd->workload_expires = jiffies + 1;
>> - return;
>> - }
>> -
>> - /* Choose next priority. RT > BE > IDLE */
>> - if (cfq_group_busy_queues_wl(RT_WORKLOAD, cfqd, cfqg))
>> - cfqd->serving_prio = RT_WORKLOAD;
>> - else if (cfq_group_busy_queues_wl(BE_WORKLOAD, cfqd, cfqg))
>> - cfqd->serving_prio = BE_WORKLOAD;
>> - else {
>> - cfqd->serving_prio = IDLE_WORKLOAD;
>> - cfqd->workload_expires = jiffies + 1;
>> - return;
>> - }
>> -
>> - /*
>> - * For RT and BE, we have to choose also the type
>> - * (SYNC, SYNC_NOIDLE, ASYNC), and to compute a workload
>> - * expiration time
>> - */
>> - st = service_tree_for(cfqg, cfqd->serving_prio, cfqd->serving_type);
>> - count = st->count;
>> -
>> - /*
>> - * check workload expiration, and that we still have other queues ready
>> - */
>> - if (count && !time_after(jiffies, cfqd->workload_expires))
>> - return;
>> -
>> - /* otherwise select new workload type */
>> - cfqd->serving_type =
>> - cfq_choose_wl(cfqd, cfqg, cfqd->serving_prio);
>> st = service_tree_for(cfqg, cfqd->serving_prio, cfqd->serving_type);
>> count = st->count;
>>
>> @@ -2284,26 +2396,51 @@ static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg)
>> cfqd->workload_expires = jiffies + slice;
>> }
>>
>> -static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd)
>> +static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg)
>> {
>> - struct cfq_rb_root *st = &cfqd->grp_service_tree;
>> - struct cfq_group *cfqg;
>> - struct cfq_entity *cfqe;
>> + struct cfq_rb_root *st;
>> + unsigned count;
>>
>> - if (RB_EMPTY_ROOT(&st->rb))
>> - return NULL;
>> - cfqe = cfq_rb_first_entity(st);
>> - cfqg = cfqg_of_entity(cfqe);
>> - BUG_ON(!cfqg);
>> - update_min_vdisktime(st);
>> - return cfqg;
>> + if (!cfqg) {
>> + cfqd->serving_prio = IDLE_WORKLOAD;
>> + cfqd->workload_expires = jiffies + 1;
>> + return;
>> + }
>
> I am wondering where we use the above code. Do we ever call
> choose_service_tree() with cfqg == NULL? I can't seem to figure it out by
> looking at the code.
Already fixed by a separate patch. ;)
>
>> +
>> + /* Choose next priority. RT > BE > IDLE */
>> + if (cfq_group_busy_queues_wl(RT_WORKLOAD, cfqd, cfqg))
>> + cfqd->serving_prio = RT_WORKLOAD;
>> + else if (cfq_group_busy_queues_wl(BE_WORKLOAD, cfqd, cfqg))
>> + cfqd->serving_prio = BE_WORKLOAD;
>> + else {
>> + cfqd->serving_prio = IDLE_WORKLOAD;
>> + cfqd->workload_expires = jiffies + 1;
>> + return;
>> + }
>> +
>> + /*
>> + * For RT and BE, we have to choose also the type
>> + * (SYNC, SYNC_NOIDLE, ASYNC), and to compute a workload
>> + * expiration time
>> + */
>> + st = service_tree_for(cfqg, cfqd->serving_prio, cfqd->serving_type);
>> + count = st->count;
>> +
>> + /*
>> + * check workload expiration, and that we still have other queues ready
>> + */
>> + if (count && !time_after(jiffies, cfqd->workload_expires))
>> + return;
>> +
>> + /* otherwise select new workload type */
>> + cfqd->serving_type =
>> + cfq_choose_wl(cfqd, cfqg, cfqd->serving_prio);
>> }
>>
>> -static void cfq_choose_cfqg(struct cfq_data *cfqd)
>> +struct cfq_entity *choose_serving_entity(struct cfq_data *cfqd,
>> + struct cfq_group *cfqg)
>
> I think for this function you don't have to pass cfqg as a parameter. You
> can just use cfqd as the parameter and then take all decisions based on the
> service tree.
>
> So at the top we can continue to have grp_service_tree in cfqd. In
> hierarchical mode it will only have the root group queued there, and in
> flat mode it can have multiple groups queued.
>
> Also, I am looking forward to simplifying and organizing the CFQ code a
> little better so that it is easy to read. Can the choose_serving_entity()
> function be organized something like the following? This should work both
> for flat and hierarchical modes. The following is only pseudocode.
>
> struct cfq_entity *select_entity(struct cfq_data *cfqd)
> {
> 	struct cfq_rb_root *st = &cfqd->grp_service_tree;
> 	struct cfq_entity *cfqe;
>
> 	do {
> 		cfqe = cfq_rb_first(st);
> 		if (is_cfqe_cfqq(cfqe))
> 			/* We found the next queue to dispatch from */
> 			break;
> 		else
> 			st = choose_service_tree(cfqd, cfqg_of_entity(cfqe));
> 	} while (st);
>
> 	return cfqe;
> }
>
Ok, I'll refine the code to make it easier to read.
>> {
>> - struct cfq_group *cfqg = cfq_get_next_cfqg(cfqd);
>> -
>> - cfqd->serving_group = cfqg;
>> + struct cfq_rb_root *service_tree;
>>
>> /* Restore the workload type data */
>> if (cfqg->saved_workload_slice) {
>> @@ -2314,8 +2451,21 @@ static void cfq_choose_cfqg(struct cfq_data *cfqd)
>> cfqd->workload_expires = jiffies - 1;
>>
>> choose_service_tree(cfqd, cfqg);
>> -}
>>
>> + service_tree = service_tree_for(cfqg, cfqd->serving_prio,
>> + cfqd->serving_type);
>> +
>> + if (!cfqd->rq_queued)
>> + return NULL;
>> +
>> + /* There is nothing to dispatch */
>> + if (!service_tree)
>> + return NULL;
>> + if (RB_EMPTY_ROOT(&service_tree->rb))
>> + return NULL;
>> +
>> + return cfq_rb_first(service_tree);
>> +}
>> /*
>> * Select a queue for service. If we have a current active queue,
>> * check whether to continue servicing it, or retrieve and set a new one.
>> @@ -2323,6 +2473,8 @@ static void cfq_choose_cfqg(struct cfq_data *cfqd)
>> static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
>> {
>> struct cfq_queue *cfqq, *new_cfqq = NULL;
>> + struct cfq_group *cfqg;
>> + struct cfq_entity *entity;
>>
>> cfqq = cfqd->active_queue;
>> if (!cfqq)
>> @@ -2422,8 +2574,23 @@ new_queue:
>> * Current queue expired. Check if we have to switch to a new
>> * service tree
>> */
>> - if (!new_cfqq)
>> - cfq_choose_cfqg(cfqd);
>> + cfqg = &cfqd->root_group;
>> +
>> + if (!new_cfqq) {
>> + do {
>> + entity = choose_serving_entity(cfqd, cfqg);
>> + if (entity && !entity->is_group_entity) {
>> + /* This is the CFQ queue that should run */
>> + new_cfqq = cfqq_of_entity(entity);
>> + cfqd->serving_group = cfqg;
>> + set_workload_expire(cfqd, cfqg);
>> + break;
>> + } else if (entity && entity->is_group_entity) {
>> +			/* Continue looking up in this CFQ group */
>> + cfqg = cfqg_of_entity(entity);
>> + }
>> + } while (entity && entity->is_group_entity);
>
> I think you should move the above logic into a separate function; otherwise
> select_queue() is becoming complicated.
>
> Secondly, for traversing the hierarchy, you can introduce macros like
> for_each_entity() or for_each_cfqe().
>
> Thirdly, I would again say that flat mode is already supported. Build on
> top of it instead of first getting rid of it and then adding it back with
> the help of a separate patch. If that is too complicated, then let it be a
> single patch instead of separating it out into two patches.
Ok
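For the traversal macro and the separate function, maybe something like this
(rough sketch, modeled on the descent loop above; helper names as in this
patch):

	/* walk a cfq_entity and all its ancestors up to the root */
	#define for_each_cfqe(cfqe) \
		for (; (cfqe) != NULL; (cfqe) = (cfqe)->parent)

	/*
	 * Sketch: descend from the root group until a queue entity is
	 * found, keeping cfq_select_queue() itself small.
	 */
	static struct cfq_queue *cfq_select_queue_hier(struct cfq_data *cfqd)
	{
		struct cfq_group *cfqg = &cfqd->root_group;
		struct cfq_entity *entity;

		for (;;) {
			entity = choose_serving_entity(cfqd, cfqg);
			if (!entity)
				return NULL;
			if (!entity->is_group_entity) {
				/* found the CFQ queue that should run */
				cfqd->serving_group = cfqg;
				set_workload_expire(cfqd, cfqg);
				return cfqq_of_entity(entity);
			}
			/* group entity: descend one level and retry */
			cfqg = cfqg_of_entity(entity);
		}
	}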
Thanks for reviewing, Vivek. I will post an updated patch.
Gui
>
>> + }
>
>>
>> cfqq = cfq_set_active_queue(cfqd, new_cfqq);
>> keep_queue:
>> @@ -2454,10 +2621,14 @@ static int cfq_forced_dispatch(struct cfq_data *cfqd)
>> {
>> struct cfq_queue *cfqq;
>> int dispatched = 0;
>> + struct cfq_entity *entity;
>> + struct cfq_group *root = &cfqd->root_group;
>>
>> /* Expire the timeslice of the current active queue first */
>> cfq_slice_expired(cfqd, 0);
>> - while ((cfqq = cfq_get_next_queue_forced(cfqd)) != NULL) {
>> + while ((entity = cfq_get_next_entity_forced(cfqd, root)) != NULL) {
>> + BUG_ON(entity->is_group_entity);
>> + cfqq = cfqq_of_entity(entity);
>> __cfq_set_active_queue(cfqd, cfqq);
>> dispatched += __cfq_forced_dispatch_cfqq(cfqq);
>> }
>> @@ -3991,9 +4162,6 @@ static void *cfq_init_queue(struct request_queue *q)
>>
>> cfqd->cic_index = i;
>>
>> - /* Init root service tree */
>> - cfqd->grp_service_tree = CFQ_RB_ROOT;
>> -
>> /* Init root group */
>> cfqg = &cfqd->root_group;
>> for_each_cfqg_st(cfqg, i, j, st)
>> @@ -4003,6 +4171,7 @@ static void *cfq_init_queue(struct request_queue *q)
>> /* Give preference to root group over other groups */
>> cfqg->cfqe.weight = 2*BLKIO_WEIGHT_DEFAULT;
>> cfqg->cfqe.is_group_entity = true;
>> + cfqg->cfqe.parent = NULL;
>>
>> #ifdef CONFIG_CFQ_GROUP_IOSCHED
>> /*
>> --
>> 1.6.5.2
>>
>>
>
On Wed, Dec 15, 2010 at 03:02:36PM +0800, Gui Jianfeng wrote:
[..]
> >> static inline unsigned
> >> cfq_group_slice(struct cfq_data *cfqd, struct cfq_group *cfqg)
> >> {
> >> - struct cfq_rb_root *st = &cfqd->grp_service_tree;
> >> struct cfq_entity *cfqe = &cfqg->cfqe;
> >> + struct cfq_rb_root *st = cfqe->service_tree;
> >>
> >> - return cfq_target_latency * cfqe->weight / st->total_weight;
> >> + if (st)
> >> + return cfq_target_latency * cfqe->weight
> >> + / st->total_weight;
> >
> > Is it still true in hierarchical mode? Previously, groups used to be
> > at the top and there used to be only one service tree for groups, so
> > st->total_weight represented the total weight in the system.
> >
> > Now with hierarchy this will not/should not be true. So the group slice
> > calculation should be different?
>
> I just kept the original group slice calculation here. I was thinking that
> calculating the group slice in a hierarchical way might yield a really
> small group slice, and I'm not sure how that would work. So I just kept the
> original calculation. Any thoughts?
Corrado already added minimum per-queue limits (16ms or something), so don't
worry about it getting too small. But we have to do a hierarchical group
share calculation; otherwise, what's the point of writing this code and all
the logic of trying to meet the soft latency target of 300ms?
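For instance, the share could be compounded at each level, something like
this (sketch only; as in this patch, the root group's entity is assumed to
have a NULL service_tree, so the root keeps a full slice):

	static inline unsigned
	cfq_group_slice(struct cfq_data *cfqd, struct cfq_group *cfqg)
	{
		struct cfq_entity *cfqe = &cfqg->cfqe;
		unsigned slice = cfq_target_latency;

		/* scale by this entity's share at every level of the path */
		while (cfqe && cfqe->service_tree) {
			slice = slice * cfqe->weight
				/ cfqe->service_tree->total_weight;
			cfqe = cfqe->parent;
		}
		return slice;
	}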
>
> >
> >> + else
> >> + /* If this is the root group, give it a full slice. */
> >> + return cfq_target_latency;
> >> }
> >>
> >> static inline void
> >> @@ -804,17 +809,6 @@ static struct cfq_entity *cfq_rb_first(struct cfq_rb_root *root)
> >> return NULL;
> >> }
> >>
> >> -static struct cfq_entity *cfq_rb_first_entity(struct cfq_rb_root *root)
> >> -{
> >> - if (!root->left)
> >> - root->left = rb_first(&root->rb);
> >> -
> >> - if (root->left)
> >> - return rb_entry_entity(root->left);
> >> -
> >> - return NULL;
> >> -}
> >> -
> >> static void rb_erase_init(struct rb_node *n, struct rb_root *root)
> >> {
> >> rb_erase(n, root);
> >> @@ -888,12 +882,15 @@ __cfq_entity_service_tree_add(struct cfq_rb_root *st, struct cfq_entity *cfqe)
> >>
> >> rb_link_node(&cfqe->rb_node, parent, node);
> >> rb_insert_color(&cfqe->rb_node, &st->rb);
> >> +
> >> + update_min_vdisktime(st);
> >> }
> >>
> >> static void
> >> cfq_entity_service_tree_add(struct cfq_rb_root *st, struct cfq_entity *cfqe)
> >> {
> >> __cfq_entity_service_tree_add(st, cfqe);
> >> + cfqe->reposition_time = jiffies;
> >> st->count++;
> >> st->total_weight += cfqe->weight;
> >> }
> >> @@ -901,34 +898,57 @@ cfq_entity_service_tree_add(struct cfq_rb_root *st, struct cfq_entity *cfqe)
> >> static void
> >> cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
> >> {
> >> - struct cfq_rb_root *st = &cfqd->grp_service_tree;
> >> struct cfq_entity *cfqe = &cfqg->cfqe;
> >> - struct cfq_entity *__cfqe;
> >> struct rb_node *n;
> >> + struct cfq_entity *entity;
> >> + struct cfq_rb_root *st;
> >> + struct cfq_group *__cfqg;
> >>
> >> cfqg->nr_cfqq++;
> >> +
> >> + /*
> >> +	 * Root group doesn't belong to any service tree
> >> + */
> >> + if (cfqg == &cfqd->root_group)
> >> + return;
> >
> > Can we keep the root group on cfqd->grp_service_tree? In hierarchical
> > mode there will be only one group on the grp service tree, and in flat
> > mode there can be many.
>
> Keeping the top service tree different for hierarchical mode and flat mode
> is just fine to me. If you don't strongly object, I'd like to keep the
> current way. :)
I am saying that we should keep one top tree for both hierarchical and flat
modes, not separate trees.

For flat mode, everything goes on cfqd->grp_service_tree:
    grp_service_tree
     /      |      \
  root    test1   test2
For hierarchical mode it will look as follows:
    grp_service_tree
           |
         root
         /    \
     test1    test2
Or it could look as follows if the user has set use_hier=1 in test2 only:
    grp_service_tree
      |      |      |
    root   test1  test2
                    |
                  test3
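A sketch of the queuing decision under this scheme (use_hier here is a
hypothetical per-group flag, not something this patch implements; helper
names otherwise follow the patch):

	static struct cfq_rb_root *
	cfq_entity_target_st(struct cfq_data *cfqd, struct cfq_group *cfqg)
	{
		struct cfq_entity *p_cfqe = cfqg->cfqe.parent;
		struct cfq_group *p_cfqg;

		/* the root group always sits on the top-level tree */
		if (!p_cfqe)
			return &cfqd->grp_service_tree;

		p_cfqg = cfqg_of_entity(p_cfqe);
		/* flat mode: children are promoted to the top level */
		if (!p_cfqg->use_hier)
			return &cfqd->grp_service_tree;

		/* hierarchical mode: queue under the parent's BE/SYNC tree */
		return service_tree_for(p_cfqg, BE_WORKLOAD, SYNC_WORKLOAD);
	}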
Thanks
Vivek