2010-12-27 08:51:12

From: Gui Jianfeng (归 剑峰)

Subject: [PATCH 5/6 v3] cfq-iosched: CFQ group hierarchical scheduling and use_hierarchy interface

This patch enables hierarchical scheduling of CFQ groups and adds a per-cgroup
blkio.use_hierarchy interface. When use_hierarchy is set, a child cgroup's CFQ
group is scheduled on its parent group's service tree; when it is unset, groups
stay on the flat, global service tree as before.

Signed-off-by: Gui Jianfeng <[email protected]>
---
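Note: a minimal user-space sketch of how the new blkio.use_hierarchy file is
meant to be used. The /cgroup/blkio mount point and the group names are
assumptions for illustration only. Per blkio_use_hierarchy_write() below, the
flag can only be changed while the cgroup has no children (otherwise -EBUSY),
it cannot be changed under a parent whose use_hierarchy is already set
(-EINVAL), and children inherit the parent's value at creation time.

#include <stdio.h>
#include <stdlib.h>
#include <sys/stat.h>
#include <sys/types.h>

static void write_str(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f) {
		perror(path);
		exit(1);
	}
	fputs(val, f);
	fclose(f);
}

int main(void)
{
	/* Hypothetical mount point; adjust to wherever blkio is mounted. */
	mkdir("/cgroup/blkio/grp_parent", 0755);

	/* Must be done while grp_parent has no children, else -EBUSY. */
	write_str("/cgroup/blkio/grp_parent/blkio.use_hierarchy", "1");

	/* Children created afterwards inherit use_hierarchy=1, and their
	 * CFQ groups are scheduled under grp_parent's service tree. */
	mkdir("/cgroup/blkio/grp_parent/grp_child", 0755);
	write_str("/cgroup/blkio/grp_parent/grp_child/blkio.weight", "300");

	return 0;
}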
block/blk-cgroup.c | 61 ++++++-
block/blk-cgroup.h | 3 +
block/cfq-iosched.c | 570 ++++++++++++++++++++++++++++++++++++---------------
3 files changed, 469 insertions(+), 165 deletions(-)
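Also for reference: the hierarchical group slice computed by the reworked
cfq_group_slice() scales the target latency by weight/total_weight at each
level, walking from the group's own service tree up to the root. Below is a
stand-alone sketch of that walk, assuming a 300ms target latency and made-up
weights; struct entity is a stand-in for cfq_entity, not kernel code.

#include <stdio.h>

struct entity {
	unsigned int weight;		/* this entity's weight */
	unsigned int tree_weight;	/* total weight of its service tree */
	struct entity *parent;		/* NULL at the root service tree */
};

static unsigned int group_slice(struct entity *e, unsigned int target_latency)
{
	unsigned int slice = target_latency;

	/* Same walk as cfq_group_slice(): scale by weight/total_weight
	 * at every level up to the root service tree. */
	while (e) {
		slice = slice * e->weight / e->tree_weight;
		e = e->parent;
	}
	return slice;
}

int main(void)
{
	/* root service tree: two groups of weight 500 each */
	struct entity parent = { .weight = 500, .tree_weight = 1000, .parent = NULL };
	/* child group: weight 200 on a tree with total weight 400 */
	struct entity child = { .weight = 200, .tree_weight = 400, .parent = &parent };

	/* 300 * (200/400) * (500/1000) = 75 */
	printf("child group slice: %ums\n", group_slice(&child, 300));
	return 0;
}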

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 455768a..c55fecd 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -25,7 +25,10 @@
static DEFINE_SPINLOCK(blkio_list_lock);
static LIST_HEAD(blkio_list);

-struct blkio_cgroup blkio_root_cgroup = { .weight = 2*BLKIO_WEIGHT_DEFAULT };
+struct blkio_cgroup blkio_root_cgroup = {
+ .weight = 2*BLKIO_WEIGHT_DEFAULT,
+ .use_hierarchy = 0
+};
EXPORT_SYMBOL_GPL(blkio_root_cgroup);

static struct cgroup_subsys_state *blkiocg_create(struct cgroup_subsys *,
@@ -454,6 +457,7 @@ static void __blkiocg_del_blkio_group(struct blkio_group *blkg)
blkg->blkcg_id = 0;
}

+
/*
* returns 0 if blkio_group was still on cgroup list. Otherwise returns 1
* indicating that blk_group was unhashed by the time we got to it.
@@ -765,6 +769,12 @@ unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg,
}
EXPORT_SYMBOL_GPL(blkcg_get_weight);

+unsigned int blkcg_get_use_hierarchy(struct blkio_cgroup *blkcg)
+{
+ return blkcg->use_hierarchy;
+}
+EXPORT_SYMBOL_GPL(blkcg_get_use_hierarchy);
+
uint64_t blkcg_get_read_bps(struct blkio_cgroup *blkcg, dev_t dev)
{
struct blkio_policy_node *pn;
@@ -1202,6 +1212,8 @@ static u64 blkiocg_file_read_u64 (struct cgroup *cgrp, struct cftype *cft) {
switch(name) {
case BLKIO_PROP_weight:
return (u64)blkcg->weight;
+ case BLKIO_PROP_use_hierarchy:
+ return (u64)blkcg->use_hierarchy;
}
break;
default:
@@ -1210,6 +1222,36 @@ static u64 blkiocg_file_read_u64 (struct cgroup *cgrp, struct cftype *cft) {
return 0;
}

+static int blkio_use_hierarchy_write(struct cgroup *cgrp, u64 val)
+{
+ struct cgroup *parent = cgrp->parent;
+ struct blkio_cgroup *blkcg, *parent_blkcg = NULL;
+ int ret = 0;
+
+ if (val != 0 && val != 1)
+ return -EINVAL;
+
+ blkcg = cgroup_to_blkio_cgroup(cgrp);
+ if (parent)
+ parent_blkcg = cgroup_to_blkio_cgroup(parent);
+
+ cgroup_lock();
+ /*
+ * If parent's use_hierarchy is set, we can't make any modifications
+ * in the child subtrees. If it is unset, then the change can occur,
+ * provided the current cgroup has no children.
+ */
+ if (!parent_blkcg || !parent_blkcg->use_hierarchy) {
+ if (list_empty(&cgrp->children))
+ blkcg->use_hierarchy = val;
+ else
+ ret = -EBUSY;
+ } else
+ ret = -EINVAL;
+ cgroup_unlock();
+ return ret;
+}
+
static int
blkiocg_file_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
{
@@ -1224,6 +1266,8 @@ blkiocg_file_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
switch(name) {
case BLKIO_PROP_weight:
return blkio_weight_write(blkcg, val);
+ case BLKIO_PROP_use_hierarchy:
+ return blkio_use_hierarchy_write(cgrp, val);
}
break;
default:
@@ -1301,6 +1345,13 @@ struct cftype blkio_files[] = {
.name = "reset_stats",
.write_u64 = blkiocg_reset_stats,
},
+ {
+ .name = "use_hierarchy",
+ .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
+ BLKIO_PROP_use_hierarchy),
+ .read_u64 = blkiocg_file_read_u64,
+ .write_u64 = blkiocg_file_write_u64,
+ },
#ifdef CONFIG_BLK_DEV_THROTTLING
{
.name = "throttle.read_bps_device",
@@ -1444,7 +1495,7 @@ static void blkiocg_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup)
static struct cgroup_subsys_state *
blkiocg_create(struct cgroup_subsys *subsys, struct cgroup *cgroup)
{
- struct blkio_cgroup *blkcg;
+ struct blkio_cgroup *blkcg, *parent_blkcg = NULL;
struct cgroup *parent = cgroup->parent;

if (!parent) {
@@ -1452,6 +1503,7 @@ blkiocg_create(struct cgroup_subsys *subsys, struct cgroup *cgroup)
goto done;
}

+ parent_blkcg = cgroup_to_blkio_cgroup(parent);
blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
if (!blkcg)
return ERR_PTR(-ENOMEM);
@@ -1462,6 +1514,11 @@ done:
INIT_HLIST_HEAD(&blkcg->blkg_list);

INIT_LIST_HEAD(&blkcg->policy_list);
+ if (parent)
+ blkcg->use_hierarchy = parent_blkcg->use_hierarchy;
+ else
+ blkcg->use_hierarchy = 0;
+
return &blkcg->css;
}

diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index ea4861b..5b4b351 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -90,6 +90,7 @@ enum blkcg_file_name_prop {
BLKIO_PROP_idle_time,
BLKIO_PROP_empty_time,
BLKIO_PROP_dequeue,
+ BLKIO_PROP_use_hierarchy,
};

/* cgroup files owned by throttle policy */
@@ -105,6 +106,7 @@ enum blkcg_file_name_throtl {
struct blkio_cgroup {
struct cgroup_subsys_state css;
unsigned int weight;
+ bool use_hierarchy;
spinlock_t lock;
struct hlist_head blkg_list;
struct list_head policy_list; /* list of blkio_policy_node */
@@ -179,6 +181,7 @@ struct blkio_policy_node {

extern unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg,
dev_t dev);
+extern unsigned int blkcg_get_use_hierarchy(struct blkio_cgroup *blkcg);
extern uint64_t blkcg_get_read_bps(struct blkio_cgroup *blkcg,
dev_t dev);
extern uint64_t blkcg_get_write_bps(struct blkio_cgroup *blkcg,
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 1f21418..727d6e6 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -104,6 +104,9 @@ struct cfq_entity {
u64 vdisktime;
bool is_group_entity;
unsigned int weight;
+ struct cfq_entity *parent;
+ /* Reposition time */
+ unsigned long reposition_time;
};

/*
@@ -112,8 +115,6 @@ struct cfq_entity {
struct cfq_queue {
/* The schedule entity */
struct cfq_entity cfqe;
- /* Reposition time */
- unsigned long reposition_time;
/* reference count */
atomic_t ref;
/* various state flags, see below */
@@ -193,6 +194,9 @@ struct cfq_group {
/* number of cfqq currently on this group */
int nr_cfqq;

+ /* number of sub cfq groups */
+ int nr_subgp;
+
/*
* Per group busy queus average. Useful for workload slice calc. We
* create the array for each prio class but at run time it is used
@@ -228,10 +232,11 @@ struct cfq_group {
*/
struct cfq_data {
struct request_queue *queue;
- /* Root service tree for cfq_groups */
- struct cfq_rb_root grp_service_tree;
struct cfq_group root_group;

+ /* Whether cfq groups are scheduled in a flat or hierarchical manner. */
+ bool use_hierarchy;
+
/*
* The priority currently being served
*/
@@ -240,6 +245,9 @@ struct cfq_data {
unsigned long workload_expires;
struct cfq_group *serving_group;

+ /* Service tree for cfq group flat scheduling mode. */
+ struct cfq_rb_root grp_service_tree;
+
/*
* Each priority tree is sorted by next_request position. These
* trees are used when determining if two or more queues are
@@ -349,8 +357,6 @@ cfqg_of_entity(struct cfq_entity *cfqe)
}


-static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd);
-
static struct cfq_rb_root *service_tree_for(struct cfq_group *cfqg,
enum wl_prio_t prio,
enum wl_type_t type)
@@ -640,10 +646,19 @@ static inline unsigned cfq_group_get_avg_queues(struct cfq_data *cfqd,
static inline unsigned
cfq_group_slice(struct cfq_data *cfqd, struct cfq_group *cfqg)
{
- struct cfq_rb_root *st = &cfqd->grp_service_tree;
struct cfq_entity *cfqe = &cfqg->cfqe;
+ struct cfq_rb_root *st = cfqe->service_tree;
+ int group_slice = cfq_target_latency;
+
+ /* Calculate group slice in a hierarchical way */
+ do {
+ group_slice = group_slice * cfqe->weight / st->total_weight;
+ cfqe = cfqe->parent;
+ if (cfqe)
+ st = cfqe->service_tree;
+ } while (cfqe);

- return cfq_target_latency * cfqe->weight / st->total_weight;
+ return group_slice;
}

static inline void
@@ -666,7 +681,8 @@ cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
/* scale low_slice according to IO priority
* and sync vs async */
unsigned low_slice =
- min(slice, base_low_slice * slice / sync_slice);
+ min(slice, base_low_slice * slice /
+ sync_slice);
/* the adapted slice value is scaled to fit all iqs
* into the target latency */
slice = max(slice * group_slice / expect_latency,
@@ -806,17 +822,6 @@ static struct cfq_entity *cfq_rb_first(struct cfq_rb_root *root)
return NULL;
}

-static struct cfq_entity *cfq_rb_first_entity(struct cfq_rb_root *root)
-{
- if (!root->left)
- root->left = rb_first(&root->rb);
-
- if (root->left)
- return rb_entry_entity(root->left);
-
- return NULL;
-}
-
static void rb_erase_init(struct rb_node *n, struct rb_root *root)
{
rb_erase(n, root);
@@ -890,12 +895,15 @@ __cfq_entity_service_tree_add(struct cfq_rb_root *st, struct cfq_entity *cfqe)

rb_link_node(&cfqe->rb_node, parent, node);
rb_insert_color(&cfqe->rb_node, &st->rb);
+
+ update_min_vdisktime(st);
}

static void
cfq_entity_service_tree_add(struct cfq_rb_root *st, struct cfq_entity *cfqe)
{
__cfq_entity_service_tree_add(st, cfqe);
+ cfqe->reposition_time = jiffies;
st->count++;
st->total_weight += cfqe->weight;
}
@@ -903,34 +911,52 @@ cfq_entity_service_tree_add(struct cfq_rb_root *st, struct cfq_entity *cfqe)
static void
cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
{
- struct cfq_rb_root *st = &cfqd->grp_service_tree;
struct cfq_entity *cfqe = &cfqg->cfqe;
- struct cfq_entity *__cfqe;
struct rb_node *n;
+ struct cfq_entity *entity;
+ struct cfq_rb_root *st;
+ struct cfq_group *__cfqg;

cfqg->nr_cfqq++;
+
if (!RB_EMPTY_NODE(&cfqe->rb_node))
return;

/*
- * Currently put the group at the end. Later implement something
- * so that groups get lesser vtime based on their weights, so that
- * if group does not loose all if it was not continously backlogged.
+ * Enqueue this group and its ancestors onto their service tree.
*/
- n = rb_last(&st->rb);
- if (n) {
- __cfqe = rb_entry_entity(n);
- cfqe->vdisktime = __cfqe->vdisktime + CFQ_IDLE_DELAY;
- } else
- cfqe->vdisktime = st->min_vdisktime;
+ while (cfqe) {
+ if (!RB_EMPTY_NODE(&cfqe->rb_node))
+ return;
+
+ /*
+ * Currently put the group at the end. Later implement
+ * something so that groups get lesser vtime based on
+ * their weights, so that a group does not lose everything
+ * if it was not continuously backlogged.
+ */
+ st = cfqe->service_tree;
+ n = rb_last(&st->rb);
+ if (n) {
+ entity = rb_entry_entity(n);
+ cfqe->vdisktime = entity->vdisktime +
+ CFQ_IDLE_DELAY;
+ } else
+ cfqe->vdisktime = st->min_vdisktime;

- cfq_entity_service_tree_add(st, cfqe);
+ cfq_entity_service_tree_add(st, cfqe);
+ cfqe = cfqe->parent;
+ __cfqg = cfqg_of_entity(cfqe);
+ if (__cfqg)
+ __cfqg->nr_subgp++;
+ }
}

static void
__cfq_entity_service_tree_del(struct cfq_rb_root *st, struct cfq_entity *cfqe)
{
cfq_rb_erase(&cfqe->rb_node, st);
+ update_min_vdisktime(st);
}

static void
@@ -939,27 +965,43 @@ cfq_entity_service_tree_del(struct cfq_rb_root *st, struct cfq_entity *cfqe)
if (!RB_EMPTY_NODE(&cfqe->rb_node)) {
__cfq_entity_service_tree_del(st, cfqe);
st->total_weight -= cfqe->weight;
- cfqe->service_tree = NULL;
}
}

static void
cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg)
{
- struct cfq_rb_root *st = &cfqd->grp_service_tree;
struct cfq_entity *cfqe = &cfqg->cfqe;
+ struct cfq_group *__cfqg, *p_cfqg;

BUG_ON(cfqg->nr_cfqq < 1);
cfqg->nr_cfqq--;

- /* If there are other cfq queues under this group, don't delete it */
- if (cfqg->nr_cfqq)
+ /*
+ * If there are other cfq queues under this group, or there are other
+ * cfq groups under this group, don't delete it.
+ */
+ if (cfqg->nr_cfqq || cfqg->nr_subgp)
return;

- cfq_log_cfqg(cfqd, cfqg, "del_from_rr group");
- cfq_entity_service_tree_del(st, cfqe);
- cfqg->saved_workload_slice = 0;
- cfq_blkiocg_update_dequeue_stats(&cfqg->blkg, 1);
+ /*
+ * Dequeue this group and its ancestors from their service
+ * tree.
+ */
+ while (cfqe) {
+ __cfqg = cfqg_of_entity(cfqe);
+ p_cfqg = cfqg_of_entity(cfqe->parent);
+ cfq_entity_service_tree_del(cfqe->service_tree, cfqe);
+ cfq_blkiocg_update_dequeue_stats(&__cfqg->blkg, 1);
+ cfq_log_cfqg(cfqd, __cfqg, "del_from_rr group");
+ __cfqg->saved_workload_slice = 0;
+ cfqe = cfqe->parent;
+ if (p_cfqg) {
+ p_cfqg->nr_subgp--;
+ if (p_cfqg->nr_cfqq || p_cfqg->nr_subgp)
+ return;
+ }
+ }
}

static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq)
@@ -991,7 +1033,6 @@ static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq)
static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
struct cfq_queue *cfqq)
{
- struct cfq_rb_root *st = &cfqd->grp_service_tree;
unsigned int used_sl, charge;
int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg)
- cfqg->service_tree_idle.count;
@@ -1005,10 +1046,23 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
else if (!cfq_cfqq_sync(cfqq) && !nr_sync)
charge = cfqq->allocated_slice;

- /* Can't update vdisktime while group is on service tree */
- __cfq_entity_service_tree_del(st, cfqe);
- cfqe->vdisktime += cfq_scale_slice(charge, cfqe);
- __cfq_entity_service_tree_add(st, cfqe);
+ /*
+ * Update the vdisktime on the whole chain.
+ */
+ while (cfqe) {
+ struct cfq_rb_root *st = cfqe->service_tree;
+
+ /*
+ * Can't update vdisktime while group is on service
+ * tree.
+ */
+ __cfq_entity_service_tree_del(st, cfqe);
+ cfqe->vdisktime += cfq_scale_slice(charge, cfqe);
+ __cfq_entity_service_tree_add(st, cfqe);
+ st->count++;
+ cfqe->reposition_time = jiffies;
+ cfqe = cfqe->parent;
+ }

/* This group is being expired. Save the context */
if (time_after(cfqd->workload_expires, jiffies)) {
@@ -1020,7 +1074,8 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
cfqg->saved_workload_slice = 0;

cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu",
- cfqe->vdisktime, st->min_vdisktime);
+ cfqg->cfqe.vdisktime,
+ cfqg->cfqe.service_tree->min_vdisktime);
cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u disp=%u charge=%u iops=%u"
" sect=%u", used_sl, cfqq->slice_dispatch, charge,
iops_mode(cfqd), cfqq->nr_sectors);
@@ -1042,35 +1097,27 @@ void cfq_update_blkio_group_weight(void *key, struct blkio_group *blkg,
cfqg_of_blkg(blkg)->cfqe.weight = weight;
}

-static struct cfq_group *
-cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
+static void init_cfqe(struct blkio_cgroup *blkcg,
+ struct cfq_group *cfqg)
+{
+ struct cfq_entity *cfqe = &cfqg->cfqe;
+
+ cfqe->weight = blkcg_get_weight(blkcg, cfqg->blkg.dev);
+ RB_CLEAR_NODE(&cfqe->rb_node);
+ cfqe->is_group_entity = true;
+ cfqe->parent = NULL;
+}
+
+static void init_cfqg(struct cfq_data *cfqd, struct blkio_cgroup *blkcg,
+ struct cfq_group *cfqg)
{
- struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
- struct cfq_group *cfqg = NULL;
- void *key = cfqd;
int i, j;
struct cfq_rb_root *st;
- struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
unsigned int major, minor;
-
- cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
- if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
- sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
- cfqg->blkg.dev = MKDEV(major, minor);
- goto done;
- }
- if (cfqg || !create)
- goto done;
-
- cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC, cfqd->queue->node);
- if (!cfqg)
- goto done;
+ struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;

for_each_cfqg_st(cfqg, i, j, st)
*st = CFQ_RB_ROOT;
- RB_CLEAR_NODE(&cfqg->cfqe.rb_node);
-
- cfqg->cfqe.is_group_entity = true;

/*
* Take the initial reference that will be released on destroy
@@ -1080,24 +1127,199 @@ cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
*/
atomic_set(&cfqg->ref, 1);

+ /* Add group onto cgroup list */
+ sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
+ cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd,
+ MKDEV(major, minor));
+ /* Initialize group entity */
+ init_cfqe(blkcg, cfqg);
+ /* Add group on cfqd list */
+ hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list);
+}
+
+static void cfq_destroy_cfqg(struct cfq_data *cfqd, struct cfq_group *cfqg);
+
+static void uninit_cfqg(struct cfq_data *cfqd, struct cfq_group *cfqg)
+{
+ if (!cfq_blkiocg_del_blkio_group(&cfqg->blkg))
+ cfq_destroy_cfqg(cfqd, cfqg);
+}
+
+static void cfqg_set_parent(struct cfq_data *cfqd, struct cfq_group *cfqg,
+ struct cfq_group *p_cfqg)
+{
+ struct cfq_entity *cfqe, *p_cfqe;
+
+ cfqe = &cfqg->cfqe;
+
/*
- * Add group onto cgroup list. It might happen that bdi->dev is
- * not initiliazed yet. Initialize this new group without major
- * and minor info and this info will be filled in once a new thread
- * comes for IO. See code above.
+ * 1. If use_hierarchy is not set in the CGroup that cfqg's parent
+ * belongs to, put this cfqg onto the global service tree.
+ * 2. If cfqg is the root cfqg, put it onto the global service tree.
*/
- if (bdi->dev) {
- sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
- cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd,
- MKDEV(major, minor));
- } else
- cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd,
- 0);
+ if (!p_cfqg) {
+ cfqe->service_tree = &cfqd->grp_service_tree;
+ cfqe->parent = NULL;
+ return;
+ }

- cfqg->cfqe.weight = blkcg_get_weight(blkcg, cfqg->blkg.dev);
+ p_cfqe = &p_cfqg->cfqe;

- /* Add group on cfqd list */
- hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list);
+ cfqe->parent = p_cfqe;
+
+ /*
+ * Currently, just put cfq group entity on "BE:SYNC" workload
+ * service tree.
+ */
+ cfqe->service_tree = service_tree_for(p_cfqg, BE_WORKLOAD,
+ SYNC_WORKLOAD);
+ /* child reference */
+ atomic_inc(&p_cfqg->ref);
+}
+
+static struct cfq_group *cfqg_get_parent(struct cfq_group * cfqg)
+{
+ struct cfq_entity *cfqe, *p_cfqe;
+
+ if (!cfqg)
+ return NULL;
+
+ cfqe = &cfqg->cfqe;
+ p_cfqe = cfqe->parent;
+ if (!p_cfqe)
+ return NULL;
+
+ return cfqg_of_entity(p_cfqe);
+}
+
+static struct cfq_group *
+cfqg_chain_alloc(struct cfq_data *cfqd, struct cgroup *cgroup)
+{
+ struct blkio_cgroup *blkcg;
+ struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
+ unsigned int major, minor;
+ struct cfq_group *cfqg, *leaf_cfqg, *child_cfqg, *tmp_cfqg;
+ void *key = cfqd;
+
+ /*
+ * If the CGroup's use_hierarchy is unset, we only need to allocate
+ * one CFQ group, and this group will be put onto the "grp_service_tree".
+ * We don't need to check whether the cfqg already exists; the caller
+ * has already checked it.
+ */
+ blkcg = cgroup_to_blkio_cgroup(cgroup);
+ if (!blkcg_get_use_hierarchy(blkcg)) {
+ cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC,
+ cfqd->queue->node);
+ if (!cfqg)
+ return NULL;
+
+ init_cfqg(cfqd, blkcg, cfqg);
+ cfqg_set_parent(cfqd, cfqg, NULL);
+ return cfqg;
+ }
+
+ /*
+ * Allocate the CFQ group chain until we meet a group we've already
+ * allocated before, or reach a CGroup whose use_hierarchy is not set.
+ */
+ leaf_cfqg = NULL;
+ child_cfqg = NULL;
+ for (; cgroup != NULL; cgroup = cgroup->parent) {
+ blkcg = cgroup_to_blkio_cgroup(cgroup);
+ cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
+ if (cfqg) {
+ if (!cfqg->blkg.dev && bdi->dev &&
+ dev_name(bdi->dev)) {
+ sscanf(dev_name(bdi->dev), "%u:%u",
+ &major, &minor);
+ cfqg->blkg.dev = MKDEV(major, minor);
+ }
+
+ /*
+ * The parent's initialization isn't finished yet; get
+ * it done.
+ */
+ if (child_cfqg) {
+ if (blkcg_get_use_hierarchy(blkcg))
+ cfqg_set_parent(cfqd, child_cfqg,
+ cfqg);
+ else
+ cfqg_set_parent(cfqd, child_cfqg,
+ NULL);
+ }
+
+ /* chain has already been built */
+ break;
+ }
+
+ /*
+ * We only allocate a cfqg if the corresponding cgroup's
+ * use_hierarchy is set.
+ */
+ if (blkcg_get_use_hierarchy(blkcg)) {
+ cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC,
+ cfqd->queue->node);
+ if (!cfqg)
+ goto clean_up;
+
+ if (!leaf_cfqg)
+ leaf_cfqg = cfqg;
+
+ init_cfqg(cfqd, blkcg, cfqg);
+ } else {
+ cfqg = NULL;
+ }
+
+ if (child_cfqg)
+ cfqg_set_parent(cfqd, child_cfqg, cfqg);
+
+ /*
+ * This CGroup's use_hierarchy isn't set, which means the CFQ
+ * group chain has been fully built.
+ */
+ if (!blkcg_get_use_hierarchy(blkcg))
+ break;
+
+ child_cfqg = cfqg;
+ }
+
+ return leaf_cfqg;
+
+clean_up:
+ /* clean up the allocated cfq groups. */
+ while (leaf_cfqg) {
+ tmp_cfqg = leaf_cfqg;
+ leaf_cfqg = cfqg_get_parent(leaf_cfqg);
+ uninit_cfqg(cfqd, tmp_cfqg);
+ }
+
+ return NULL;
+}
+
+static struct cfq_group *
+cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
+{
+ struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
+ struct cfq_group *cfqg = NULL;
+ void *key = cfqd;
+ struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
+ unsigned int major, minor;
+
+ cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
+ if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
+ sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
+ cfqg->blkg.dev = MKDEV(major, minor);
+ goto done;
+ }
+ if (cfqg || !create)
+ goto done;
+
+ /*
+ * Allocate the CFQ group chain up to the root group, or up to the
+ * first CGroup with use_hierarchy disabled.
+ */
+ cfqg = cfqg_chain_alloc(cfqd, cgroup);

done:
return cfqg;
@@ -1142,13 +1364,26 @@ static void cfq_put_cfqg(struct cfq_group *cfqg)
{
struct cfq_rb_root *st;
int i, j;
+ struct cfq_group *p_cfqg;

BUG_ON(atomic_read(&cfqg->ref) <= 0);
if (!atomic_dec_and_test(&cfqg->ref))
return;
+
for_each_cfqg_st(cfqg, i, j, st)
BUG_ON(!RB_EMPTY_ROOT(&st->rb));
- kfree(cfqg);
+
+ do {
+ p_cfqg = cfqg_get_parent(cfqg);
+ kfree(cfqg);
+ cfqg = NULL;
+ /*
+ * Drop the reference taken by the child. If nobody else references
+ * the parent group, delete the parent as well.
+ */
+ if (p_cfqg && atomic_dec_and_test(&p_cfqg->ref))
+ cfqg = p_cfqg;
+ } while (cfqg);
}

static void cfq_destroy_cfqg(struct cfq_data *cfqd, struct cfq_group *cfqg)
@@ -1356,9 +1591,8 @@ insert:
cfqe->service_tree = service_tree;

/* Add cfqq onto service tree. */
+
cfq_entity_service_tree_add(service_tree, cfqe);
- update_min_vdisktime(service_tree);
- cfqq->reposition_time = jiffies;
if ((add_front || !new_cfqq) && !group_changed)
return;
cfq_group_service_tree_add(cfqd, cfqq->cfqg);
@@ -1801,28 +2035,43 @@ static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd)
return cfqq_of_entity(cfq_rb_first(service_tree));
}

-static struct cfq_queue *cfq_get_next_queue_forced(struct cfq_data *cfqd)
+static struct cfq_rb_root *choose_service_tree_forced(struct cfq_group *cfqg)
{
- struct cfq_group *cfqg;
- struct cfq_entity *cfqe;
int i, j;
struct cfq_rb_root *st;

- if (!cfqd->rq_queued)
- return NULL;
+ for_each_cfqg_st(cfqg, i, j, st) {
+ if (st->count != 0)
+ return st;
+ }

- cfqg = cfq_get_next_cfqg(cfqd);
- if (!cfqg)
+ return NULL;
+}
+
+static struct cfq_entity *
+cfq_get_next_entity_forced(struct cfq_data *cfqd)
+{
+ struct cfq_entity *cfqe;
+ struct cfq_rb_root *st = &cfqd->grp_service_tree;
+ struct cfq_group *cfqg;
+
+ if (!cfqd->rq_queued)
return NULL;

- for_each_cfqg_st(cfqg, i, j, st) {
+ do {
cfqe = cfq_rb_first(st);
- if (cfqe != NULL)
- return cfqq_of_entity(cfqe);
- }
+ if (cfqe && !cfqe->is_group_entity)
+ return cfqe;
+ else if (cfqe && cfqe->is_group_entity)
+ cfqg = cfqg_of_entity(cfqe);
+
+ st = choose_service_tree_forced(cfqg);
+ } while (st);
+
return NULL;
}

+
/*
* Get and set a new active queue for service.
*/
@@ -2178,7 +2427,6 @@ static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd,
struct cfq_group *cfqg, enum wl_prio_t prio)
{
struct cfq_entity *cfqe;
- struct cfq_queue *cfqq;
unsigned long lowest_start_time;
int i;
bool time_valid = false;
@@ -2190,11 +2438,10 @@ static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd,
*/
for (i = 0; i <= SYNC_WORKLOAD; ++i) {
cfqe = cfq_rb_first(service_tree_for(cfqg, prio, i));
- cfqq = cfqq_of_entity(cfqe);
if (cfqe && (!time_valid ||
- time_before(cfqq->reposition_time,
+ time_before(cfqe->reposition_time,
lowest_start_time))) {
- lowest_start_time = cfqq->reposition_time;
+ lowest_start_time = cfqe->reposition_time;
cur_best = i;
time_valid = true;
}
@@ -2203,46 +2450,13 @@ static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd,
return cur_best;
}

-static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg)
+static void set_workload_expire(struct cfq_data *cfqd, struct cfq_group *cfqg)
{
unsigned slice;
unsigned count;
struct cfq_rb_root *st;
unsigned group_slice;
- enum wl_prio_t original_prio = cfqd->serving_prio;
-
- /* Choose next priority. RT > BE > IDLE */
- if (cfq_group_busy_queues_wl(RT_WORKLOAD, cfqd, cfqg))
- cfqd->serving_prio = RT_WORKLOAD;
- else if (cfq_group_busy_queues_wl(BE_WORKLOAD, cfqd, cfqg))
- cfqd->serving_prio = BE_WORKLOAD;
- else {
- cfqd->serving_prio = IDLE_WORKLOAD;
- cfqd->workload_expires = jiffies + 1;
- return;
- }
-
- if (original_prio != cfqd->serving_prio)
- goto new_workload;
-
- /*
- * For RT and BE, we have to choose also the type
- * (SYNC, SYNC_NOIDLE, ASYNC), and to compute a workload
- * expiration time
- */
- st = service_tree_for(cfqg, cfqd->serving_prio, cfqd->serving_type);
- count = st->count;
-
- /*
- * check workload expiration, and that we still have other queues ready
- */
- if (count && !time_after(jiffies, cfqd->workload_expires))
- return;

-new_workload:
- /* otherwise select new workload type */
- cfqd->serving_type =
- cfq_choose_wl(cfqd, cfqg, cfqd->serving_prio);
st = service_tree_for(cfqg, cfqd->serving_prio, cfqd->serving_type);
count = st->count;

@@ -2281,38 +2495,63 @@ new_workload:
slice = max_t(unsigned, slice, CFQ_MIN_TT);
cfq_log(cfqd, "workload slice:%d", slice);
cfqd->workload_expires = jiffies + slice;
+ /* Restore the previous saved slice. */
+ if (cfqg->saved_workload_slice)
+ cfqd->workload_expires = jiffies + cfqg->saved_workload_slice;
}

-static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd)
+static struct cfq_rb_root *choose_service_tree(struct cfq_data *cfqd,
+ struct cfq_group *cfqg)
{
- struct cfq_rb_root *st = &cfqd->grp_service_tree;
- struct cfq_group *cfqg;
- struct cfq_entity *cfqe;
-
- if (RB_EMPTY_ROOT(&st->rb))
+ if (!cfqg) {
+ cfqd->serving_prio = IDLE_WORKLOAD;
+ cfqd->workload_expires = jiffies + 1;
return NULL;
- cfqe = cfq_rb_first_entity(st);
- cfqg = cfqg_of_entity(cfqe);
- BUG_ON(!cfqg);
- update_min_vdisktime(st);
- return cfqg;
+ }
+
+ /* Choose next priority. RT > BE > IDLE */
+ if (cfq_group_busy_queues_wl(RT_WORKLOAD, cfqd, cfqg))
+ cfqd->serving_prio = RT_WORKLOAD;
+ else if (cfq_group_busy_queues_wl(BE_WORKLOAD, cfqd, cfqg))
+ cfqd->serving_prio = BE_WORKLOAD;
+ else {
+ cfqd->serving_prio = IDLE_WORKLOAD;
+ cfqd->workload_expires = jiffies + 1;
+ return service_tree_for(cfqg, cfqd->serving_prio,
+ cfqd->serving_type);
+ }
+
+ /* otherwise select new workload type */
+ cfqd->serving_type =
+ cfq_choose_wl(cfqd, cfqg, cfqd->serving_prio);
+
+ return service_tree_for(cfqg, cfqd->serving_prio, cfqd->serving_type);
}

-static void cfq_choose_cfqg(struct cfq_data *cfqd)
+static struct cfq_entity *cfq_select_entity(struct cfq_data *cfqd)
{
- struct cfq_group *cfqg = cfq_get_next_cfqg(cfqd);
+ struct cfq_rb_root *st = &cfqd->grp_service_tree;
+ struct cfq_entity *cfqe;
+ struct cfq_group *cfqg = NULL;

- cfqd->serving_group = cfqg;
+ if (!cfqd->rq_queued)
+ return NULL;

- /* Restore the workload type data */
- if (cfqg->saved_workload_slice) {
- cfqd->workload_expires = jiffies + cfqg->saved_workload_slice;
- cfqd->serving_type = cfqg->saved_workload;
- cfqd->serving_prio = cfqg->saved_serving_prio;
- } else
- cfqd->workload_expires = jiffies - 1;
+ do {
+ cfqe = cfq_rb_first(st);
+ if (!cfqe->is_group_entity) {
+ set_workload_expire(cfqd, cfqg);
+ cfqd->serving_group = cfqg;
+ return cfqe;
+ } else {
+ cfqg = cfqg_of_entity(cfqe);
+ st = choose_service_tree(cfqd, cfqg);
+ if (!st || RB_EMPTY_ROOT(&st->rb))
+ return NULL;
+ }
+ } while (st);

- choose_service_tree(cfqd, cfqg);
+ return NULL;
}

/*
@@ -2322,6 +2561,7 @@ static void cfq_choose_cfqg(struct cfq_data *cfqd)
static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
{
struct cfq_queue *cfqq, *new_cfqq = NULL;
+ struct cfq_entity *entity;

cfqq = cfqd->active_queue;
if (!cfqq)
@@ -2421,9 +2661,9 @@ new_queue:
* Current queue expired. Check if we have to switch to a new
* service tree
*/
- if (!new_cfqq)
- cfq_choose_cfqg(cfqd);
-
+ entity = cfq_select_entity(cfqd);
+ BUG_ON(entity->is_group_entity);
+ new_cfqq = cfqq_of_entity(entity);
cfqq = cfq_set_active_queue(cfqd, new_cfqq);
keep_queue:
return cfqq;
@@ -2453,10 +2693,14 @@ static int cfq_forced_dispatch(struct cfq_data *cfqd)
{
struct cfq_queue *cfqq;
int dispatched = 0;
+ struct cfq_entity *cfqe;

/* Expire the timeslice of the current active queue first */
cfq_slice_expired(cfqd, 0);
- while ((cfqq = cfq_get_next_queue_forced(cfqd)) != NULL) {
+
+ while ((cfqe = cfq_get_next_entity_forced(cfqd)) != NULL) {
+ BUG_ON(cfqe->is_group_entity);
+ cfqq = cfqq_of_entity(cfqe);
__cfq_set_active_queue(cfqd, cfqq);
dispatched += __cfq_forced_dispatch_cfqq(cfqq);
}
@@ -2464,6 +2708,7 @@ static int cfq_forced_dispatch(struct cfq_data *cfqd)
BUG_ON(cfqd->busy_queues);

cfq_log(cfqd, "forced_dispatch=%d", dispatched);
+
return dispatched;
}

@@ -4008,9 +4253,6 @@ static void *cfq_init_queue(struct request_queue *q)

cfqd->cic_index = i;

- /* Init root service tree */
- cfqd->grp_service_tree = CFQ_RB_ROOT;
-
/* Init root group */
cfqg = &cfqd->root_group;
for_each_cfqg_st(cfqg, i, j, st)
@@ -4020,6 +4262,7 @@ static void *cfq_init_queue(struct request_queue *q)
/* Give preference to root group over other groups */
cfqg->cfqe.weight = 2*BLKIO_WEIGHT_DEFAULT;
cfqg->cfqe.is_group_entity = true;
+ cfqg_set_parent(cfqd, cfqg, NULL);

#ifdef CONFIG_CFQ_GROUP_IOSCHED
/*
@@ -4072,6 +4315,7 @@ static void *cfq_init_queue(struct request_queue *q)
cfqd->cfq_latency = 1;
cfqd->cfq_group_isolation = 0;
cfqd->hw_tag = -1;
+
/*
* we optimistically start assuming sync ops weren't delayed in last
* second, in order to have larger depth for async operations.
--
1.6.5.2