Message-ID: <4D539821.1090703@cn.fujitsu.com>
Date: Thu, 10 Feb 2011 15:47:45 +0800
From: Gui Jianfeng <guijianfeng@cn.fujitsu.com>
User-Agent: Thunderbird 2.0.0.24 (Windows/20100228)
MIME-Version: 1.0
To: Vivek Goyal <vgoyal@redhat.com>, Jens Axboe <axboe@kernel.dk>
CC: Gui Jianfeng <guijianfeng@cn.fujitsu.com>,
        Shaohua Li <shaohua.li@intel.com>, lkml <linux-kernel@vger.kernel.org>,
        Chad Talbott <ctalbott@google.com>, Divyesh Shah <dpshah@google.com>
Subject: [PATCH 5/6 v4] cfq-iosched: CFQ group hierarchical scheduling and
 use_hierarchy interface
References: <4D51ED26.8050809@cn.fujitsu.com>
In-Reply-To: <4D51ED26.8050809@cn.fujitsu.com>
Content-Transfer-Encoding: 7bit
Content-Type: text/plain; charset=UTF-8
Sender: linux-kernel-owner@vger.kernel.org
Content-Length: 24996
Lines: 875

CFQ group hierarchical scheduling and use_hierarchy interface.

Signed-off-by: Gui Jianfeng <guijianfeng@cn.fujitsu.com>
---
 block/blk-cgroup.c  |   61 +++++-
 block/blk-cgroup.h  |    3 +
 block/cfq-iosched.c |  603 +++++++++++++++++++++++++++++++++++++--------------
 3 files changed, 500 insertions(+), 167 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 455768a..c55fecd 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -25,7 +25,10 @@
 static DEFINE_SPINLOCK(blkio_list_lock);
 static LIST_HEAD(blkio_list);
 
-struct blkio_cgroup blkio_root_cgroup = { .weight = 2*BLKIO_WEIGHT_DEFAULT };
+struct blkio_cgroup blkio_root_cgroup = {
+	.weight = 2*BLKIO_WEIGHT_DEFAULT,
+	.use_hierarchy = 0
+};
 EXPORT_SYMBOL_GPL(blkio_root_cgroup);
 
 static struct cgroup_subsys_state *blkiocg_create(struct cgroup_subsys *,
@@ -454,6 +457,7 @@ static void __blkiocg_del_blkio_group(struct blkio_group *blkg)
 	blkg->blkcg_id = 0;
 }
 
+
 /*
  * returns 0 if blkio_group was still on cgroup list. Otherwise returns 1
  * indicating that blk_group was unhashed by the time we got to it.
@@ -765,6 +769,12 @@ unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg,
 }
 EXPORT_SYMBOL_GPL(blkcg_get_weight);
 
+unsigned int blkcg_get_use_hierarchy(struct blkio_cgroup *blkcg)
+{
+	return blkcg->use_hierarchy;
+}
+EXPORT_SYMBOL_GPL(blkcg_get_use_hierarchy);
+
 uint64_t blkcg_get_read_bps(struct blkio_cgroup *blkcg, dev_t dev)
 {
 	struct blkio_policy_node *pn;
@@ -1202,6 +1212,8 @@ static u64 blkiocg_file_read_u64 (struct cgroup *cgrp, struct cftype *cft) {
 		switch(name) {
 		case BLKIO_PROP_weight:
 			return (u64)blkcg->weight;
+		case BLKIO_PROP_use_hierarchy:
+			return (u64)blkcg->use_hierarchy;
 		}
 		break;
 	default:
@@ -1210,6 +1222,36 @@ static u64 blkiocg_file_read_u64 (struct cgroup *cgrp, struct cftype *cft) {
 	return 0;
 }
 
+static int blkio_use_hierarchy_write(struct cgroup *cgrp, u64 val)
+{
+	struct cgroup *parent = cgrp->parent;
+	struct blkio_cgroup *blkcg, *parent_blkcg = NULL;
+	int ret = 0;
+
+	if (val != 0 && val != 1)
+		return -EINVAL;
+
+	blkcg = cgroup_to_blkio_cgroup(cgrp);
+	if (parent)
+		parent_blkcg = cgroup_to_blkio_cgroup(parent);
+
+	cgroup_lock();
+	/*
+	 * If parent's use_hierarchy is set, we can't make any modifications
+	 * in the child subtrees. If it is unset, then the change can occur,
+	 * provided the current cgroup has no children.
+	 */
+	if (!parent_blkcg || !parent_blkcg->use_hierarchy) {
+		if (list_empty(&cgrp->children))
+			blkcg->use_hierarchy = val;
+		else
+			ret = -EBUSY;
+	} else
+		ret = -EINVAL;
+	cgroup_unlock();
+	return ret;
+}
+
 static int
 blkiocg_file_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
 {
@@ -1224,6 +1266,8 @@ blkiocg_file_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
 		switch(name) {
 		case BLKIO_PROP_weight:
 			return blkio_weight_write(blkcg, val);
+		case BLKIO_PROP_use_hierarchy:
+			return blkio_use_hierarchy_write(cgrp, val);
 		}
 		break;
 	default:
@@ -1301,6 +1345,13 @@ struct cftype blkio_files[] = {
 		.name = "reset_stats",
 		.write_u64 = blkiocg_reset_stats,
 	},
+	{
+		.name = "use_hierarchy",
+		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
+					     BLKIO_PROP_use_hierarchy),
+		.read_u64 = blkiocg_file_read_u64,
+		.write_u64 = blkiocg_file_write_u64,
+	},
 #ifdef CONFIG_BLK_DEV_THROTTLING
 	{
 		.name = "throttle.read_bps_device",
@@ -1444,7 +1495,7 @@ static void blkiocg_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup)
 static struct cgroup_subsys_state *
 blkiocg_create(struct cgroup_subsys *subsys, struct cgroup *cgroup)
 {
-	struct blkio_cgroup *blkcg;
+	struct blkio_cgroup *blkcg, *parent_blkcg = NULL;
 	struct cgroup *parent = cgroup->parent;
 
 	if (!parent) {
@@ -1452,6 +1503,7 @@ blkiocg_create(struct cgroup_subsys *subsys, struct cgroup *cgroup)
 		goto done;
 	}
 
+	parent_blkcg = cgroup_to_blkio_cgroup(parent);
 	blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
 	if (!blkcg)
 		return ERR_PTR(-ENOMEM);
@@ -1462,6 +1514,11 @@ done:
 	INIT_HLIST_HEAD(&blkcg->blkg_list);
 
 	INIT_LIST_HEAD(&blkcg->policy_list);
+	if (parent)
+		blkcg->use_hierarchy = parent_blkcg->use_hierarchy;
+	else
+		blkcg->use_hierarchy = 0;
+
 	return &blkcg->css;
 }
 
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index ea4861b..5b4b351 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -90,6 +90,7 @@ enum blkcg_file_name_prop {
 	BLKIO_PROP_idle_time,
 	BLKIO_PROP_empty_time,
 	BLKIO_PROP_dequeue,
+	BLKIO_PROP_use_hierarchy,
 };
 
 /* cgroup files owned by throttle policy */
@@ -105,6 +106,7 @@ enum blkcg_file_name_throtl {
 struct blkio_cgroup {
 	struct cgroup_subsys_state css;
 	unsigned int weight;
+	bool use_hierarchy;
 	spinlock_t lock;
 	struct hlist_head blkg_list;
 	struct list_head policy_list; /* list of blkio_policy_node */
@@ -179,6 +181,7 @@ struct blkio_policy_node {
 
 extern unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg,
 				     dev_t dev);
+extern unsigned int blkcg_get_use_hierarchy(struct blkio_cgroup *blkcg);
 extern uint64_t blkcg_get_read_bps(struct blkio_cgroup *blkcg,
 				     dev_t dev);
 extern uint64_t blkcg_get_write_bps(struct blkio_cgroup *blkcg,
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index aa3eda8..0e21d27 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -110,6 +110,9 @@ struct cfq_entity {
 	u64 vdisktime;
 	bool is_group_entity;
 	unsigned int weight;
+	struct cfq_entity *parent;
+	/* Reposition time */
+	unsigned long reposition_time;
 };
 
 /*
@@ -118,8 +121,6 @@ struct cfq_entity {
 struct cfq_queue {
 	/* The schedule entity */
 	struct cfq_entity cfqe;
-	/* Reposition time */
-	unsigned long reposition_time;
 	/* reference count */
 	int ref;
 	/* various state flags, see below */
@@ -199,6 +200,9 @@ struct cfq_group {
 	/* number of cfqq currently on this group */
 	int nr_cfqq;
 
+	/* number of sub cfq groups */
+	int nr_subgp;
+
 	/*
 	 * Per group busy queus average. Useful for workload slice calc. We
 	 * create the array for each prio class but at run time it is used
@@ -234,10 +238,11 @@ struct cfq_group {
  */
 struct cfq_data {
 	struct request_queue *queue;
-	/* Root service tree for cfq_groups */
-	struct cfq_rb_root grp_service_tree;
 	struct cfq_group root_group;
 
+	/* cfq group schedule in flat or hierarchy manner. */
+	bool use_hierarchy;
+
 	/*
 	 * The priority currently being served
 	 */
@@ -246,6 +251,9 @@ struct cfq_data {
 	unsigned long workload_expires;
 	struct cfq_group *serving_group;
 
+	/* Service tree for cfq group flat scheduling mode. */
+	struct cfq_rb_root grp_service_tree;
+
 	/*
 	 * Each priority tree is sorted by next_request position.  These
 	 * trees are used when determining if two or more queues are
@@ -355,8 +363,6 @@ cfqg_of_entity(struct cfq_entity *cfqe)
 }
 
 
-static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd);
-
 static struct cfq_rb_root *service_tree_for(struct cfq_group *cfqg,
 					    enum wl_prio_t prio,
 					    enum wl_type_t type)
@@ -643,13 +649,50 @@ static inline unsigned cfq_group_get_avg_queues(struct cfq_data *cfqd,
 	return cfqg->busy_queues_avg[rt];
 }
 
+static inline unsigned int
+cfq_group_get_total_weight(struct cfq_group *cfqg)
+{
+	int i, j;
+	struct cfq_rb_root *st;
+	unsigned int total_weight = 0;
+
+	for_each_cfqg_st(cfqg, i, j, st) {
+		total_weight += st->total_weight;
+	}
+
+	return total_weight;
+}
+
 static inline unsigned
 cfq_group_slice(struct cfq_data *cfqd, struct cfq_group *cfqg)
 {
-	struct cfq_rb_root *st = &cfqd->grp_service_tree;
 	struct cfq_entity *cfqe = &cfqg->cfqe;
+	struct cfq_rb_root *st;
+	int group_slice = cfq_target_latency;
+	unsigned int grp_total_weight;
+	struct cfq_group *p_cfqg;
+
+	/*
+	 * Calculate group slice in a hierarchical way.
+	 * Note, the calculation is cross all service trees under a group.
+	 */
+	do {
+		if (cfqe->parent) {
+			p_cfqg = cfqg_of_entity(cfqe->parent);
+			grp_total_weight = cfq_group_get_total_weight(p_cfqg);
+			group_slice = group_slice * cfqe->weight /
+					grp_total_weight;
+		} else {
+			/* For top level groups */
+			st = cfqe->service_tree;
+			group_slice = group_slice * cfqe->weight /
+					st->total_weight;
+		}
 
-	return cfq_target_latency * cfqe->weight / st->total_weight;
+		cfqe = cfqe->parent;
+	} while (cfqe);
+
+	return group_slice;
 }
 
 static inline void
@@ -672,7 +715,8 @@ cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 			/* scale low_slice according to IO priority
 			 * and sync vs async */
 			unsigned low_slice =
-				min(slice, base_low_slice * slice / sync_slice);
+				min(slice, base_low_slice * slice /
+				    sync_slice);
 			/* the adapted slice value is scaled to fit all iqs
 			 * into the target latency */
 			slice = max(slice * group_slice / expect_latency,
@@ -812,17 +856,6 @@ static struct cfq_entity *cfq_rb_first(struct cfq_rb_root *root)
 	return NULL;
 }
 
-static struct cfq_entity *cfq_rb_first_entity(struct cfq_rb_root *root)
-{
-	if (!root->left)
-		root->left = rb_first(&root->rb);
-
-	if (root->left)
-		return rb_entry_entity(root->left);
-
-	return NULL;
-}
-
 static void rb_erase_init(struct rb_node *n, struct rb_root *root)
 {
 	rb_erase(n, root);
@@ -896,12 +929,15 @@ __cfq_entity_service_tree_add(struct cfq_rb_root *st, struct cfq_entity *cfqe)
 
 	rb_link_node(&cfqe->rb_node, parent, node);
 	rb_insert_color(&cfqe->rb_node, &st->rb);
+
+	update_min_vdisktime(st);
 }
 
 static void
 cfq_entity_service_tree_add(struct cfq_rb_root *st, struct cfq_entity *cfqe)
 {
 	__cfq_entity_service_tree_add(st, cfqe);
+	cfqe->reposition_time = jiffies;
 	st->count++;
 	st->total_weight += cfqe->weight;
 }
@@ -909,34 +945,52 @@ cfq_entity_service_tree_add(struct cfq_rb_root *st, struct cfq_entity *cfqe)
 static void
 cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
 {
-	struct cfq_rb_root *st = &cfqd->grp_service_tree;
 	struct cfq_entity *cfqe = &cfqg->cfqe;
-	struct cfq_entity *__cfqe;
 	struct rb_node *n;
+	struct cfq_entity *entity;
+	struct cfq_rb_root *st;
+	struct cfq_group *__cfqg;
 
 	cfqg->nr_cfqq++;
+
 	if (!RB_EMPTY_NODE(&cfqe->rb_node))
 		return;
 
 	/*
-	 * Currently put the group at the end. Later implement something
-	 * so that groups get lesser vtime based on their weights, so that
-	 * if group does not loose all if it was not continously backlogged.
+	 * Enqueue this group and its ancestors onto their service tree.
 	 */
-	n = rb_last(&st->rb);
-	if (n) {
-		__cfqe = rb_entry_entity(n);
-		cfqe->vdisktime = __cfqe->vdisktime + CFQ_IDLE_DELAY;
-	} else
-		cfqe->vdisktime = st->min_vdisktime;
+	while (cfqe) {
+		if (!RB_EMPTY_NODE(&cfqe->rb_node))
+			return;
 
-	cfq_entity_service_tree_add(st, cfqe);
+		/*
+		 * Currently put the group at the end. Later implement
+		 * something so that groups get lesser vtime based on
+		 * their weights, so that if group does not loose all
+		 * if it was not continously backlogged.
+		 */
+		st = cfqe->service_tree;
+		n = rb_last(&st->rb);
+		if (n) {
+			entity = rb_entry_entity(n);
+			cfqe->vdisktime = entity->vdisktime +
+				CFQ_IDLE_DELAY;
+		} else
+			cfqe->vdisktime = st->min_vdisktime;
+
+		cfq_entity_service_tree_add(st, cfqe);
+		cfqe = cfqe->parent;
+		__cfqg = cfqg_of_entity(cfqe);
+		if (__cfqg)
+			__cfqg->nr_subgp++;
+	}
 }
 
 static void
 __cfq_entity_service_tree_del(struct cfq_rb_root *st, struct cfq_entity *cfqe)
 {
 	cfq_rb_erase(&cfqe->rb_node, st);
+	update_min_vdisktime(st);
 }
 
 static void
@@ -945,27 +999,43 @@ cfq_entity_service_tree_del(struct cfq_rb_root *st, struct cfq_entity *cfqe)
 	if (!RB_EMPTY_NODE(&cfqe->rb_node)) {
 		__cfq_entity_service_tree_del(st, cfqe);
 		st->total_weight -= cfqe->weight;
-		cfqe->service_tree = NULL;
 	}
 }
 
 static void
 cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg)
 {
-	struct cfq_rb_root *st = &cfqd->grp_service_tree;
 	struct cfq_entity *cfqe = &cfqg->cfqe;
+	struct cfq_group *__cfqg, *p_cfqg;
 
 	BUG_ON(cfqg->nr_cfqq < 1);
 	cfqg->nr_cfqq--;
 
-	/* If there are other cfq queues under this group, don't delete it */
-	if (cfqg->nr_cfqq)
+	/*
+	 * If there are other cfq queues under this group, or there are other
+	 * cfq groups under this group, don't delete it.
+	 */
+	if (cfqg->nr_cfqq || cfqg->nr_subgp)
 		return;
 
-	cfq_log_cfqg(cfqd, cfqg, "del_from_rr group");
-	cfq_entity_service_tree_del(st, cfqe);
-	cfqg->saved_workload_slice = 0;
-	cfq_blkiocg_update_dequeue_stats(&cfqg->blkg, 1);
+	/*
+	 * Dequeue this group and its ancestors from their service
+	 * tree.
+	 */
+	while (cfqe) {
+		__cfqg = cfqg_of_entity(cfqe);
+		p_cfqg = cfqg_of_entity(cfqe->parent);
+		cfq_entity_service_tree_del(cfqe->service_tree, cfqe);
+		cfq_blkiocg_update_dequeue_stats(&__cfqg->blkg, 1);
+		cfq_log_cfqg(cfqd, __cfqg, "del_from_rr group");
+		__cfqg->saved_workload_slice = 0;
+		cfqe = cfqe->parent;
+		if (p_cfqg) {
+			p_cfqg->nr_subgp--;
+			if (p_cfqg->nr_cfqq || p_cfqg->nr_subgp)
+				return;
+		}
+	}
 }
 
 static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq)
@@ -997,7 +1067,6 @@ static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq)
 static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
 				struct cfq_queue *cfqq)
 {
-	struct cfq_rb_root *st = &cfqd->grp_service_tree;
 	unsigned int used_sl, charge;
 	int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg)
 			- cfqg->service_tree_idle.count;
@@ -1011,10 +1080,23 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
 	else if (!cfq_cfqq_sync(cfqq) && !nr_sync)
 		charge = cfqq->allocated_slice;
 
-	/* Can't update vdisktime while group is on service tree */
-	__cfq_entity_service_tree_del(st, cfqe);
-	cfqe->vdisktime += cfq_scale_slice(charge, cfqe);
-	__cfq_entity_service_tree_add(st, cfqe);
+	/*
+	 * Update the vdisktime on the whole chain.
+	 */
+	while (cfqe) {
+		struct cfq_rb_root *st = cfqe->service_tree;
+
+		/*
+		 * Can't update vdisktime while group is on service
+		 * tree.
+		 */
+		__cfq_entity_service_tree_del(st, cfqe);
+		cfqe->vdisktime += cfq_scale_slice(charge, cfqe);
+		__cfq_entity_service_tree_add(st, cfqe);
+		st->count++;
+		cfqe->reposition_time = jiffies;
+		cfqe = cfqe->parent;
+	}
 
 	/* This group is being expired. Save the context */
 	if (time_after(cfqd->workload_expires, jiffies)) {
@@ -1026,7 +1108,8 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
 		cfqg->saved_workload_slice = 0;
 
 	cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu",
-		     cfqe->vdisktime, st->min_vdisktime);
+		     cfqg->cfqe.vdisktime,
+		     cfqg->cfqe.service_tree->min_vdisktime);
 	cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u disp=%u charge=%u iops=%u"
 			" sect=%u", used_sl, cfqq->slice_dispatch, charge,
 			iops_mode(cfqd), cfqq->nr_sectors);
@@ -1048,35 +1131,27 @@ void cfq_update_blkio_group_weight(void *key, struct blkio_group *blkg,
 	cfqg_of_blkg(blkg)->cfqe.weight = weight;
 }
 
-static struct cfq_group *
-cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
+static void init_cfqe(struct blkio_cgroup *blkcg,
+				    struct cfq_group *cfqg)
+{
+	struct cfq_entity *cfqe = &cfqg->cfqe;
+
+	cfqe->weight = blkcg_get_weight(blkcg, cfqg->blkg.dev);
+	RB_CLEAR_NODE(&cfqe->rb_node);
+	cfqe->is_group_entity = true;
+	cfqe->parent = NULL;
+}
+
+static void init_cfqg(struct cfq_data *cfqd, struct blkio_cgroup *blkcg,
+		      struct cfq_group *cfqg)
 {
-	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
-	struct cfq_group *cfqg = NULL;
-	void *key = cfqd;
 	int i, j;
 	struct cfq_rb_root *st;
-	struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
 	unsigned int major, minor;
-
-	cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
-	if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
-		sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
-		cfqg->blkg.dev = MKDEV(major, minor);
-		goto done;
-	}
-	if (cfqg || !create)
-		goto done;
-
-	cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC, cfqd->queue->node);
-	if (!cfqg)
-		goto done;
+	struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
 
 	for_each_cfqg_st(cfqg, i, j, st)
 		*st = CFQ_RB_ROOT;
-	RB_CLEAR_NODE(&cfqg->cfqe.rb_node);
-
-	cfqg->cfqe.is_group_entity = true;
 
 	/*
 	 * Take the initial reference that will be released on destroy
@@ -1086,24 +1161,199 @@ cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
 	 */
 	cfqg->ref = 1;
 
+	/* Add group onto cgroup list */
+	sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
+	cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd,
+				    MKDEV(major, minor));
+	/* Initiate group entity */
+	init_cfqe(blkcg, cfqg);
+	/* Add group on cfqd list */
+	hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list);
+}
+
+static void cfq_destroy_cfqg(struct cfq_data *cfqd, struct cfq_group *cfqg);
+
+static void uninit_cfqg(struct cfq_data *cfqd, struct cfq_group *cfqg)
+{
+	if (!cfq_blkiocg_del_blkio_group(&cfqg->blkg))
+		cfq_destroy_cfqg(cfqd, cfqg);
+}
+
+static void cfqg_set_parent(struct cfq_data *cfqd, struct cfq_group *cfqg,
+			    struct cfq_group *p_cfqg)
+{
+	struct cfq_entity *cfqe, *p_cfqe;
+
+	cfqe = &cfqg->cfqe;
+
 	/*
-	 * Add group onto cgroup list. It might happen that bdi->dev is
-	 * not initiliazed yet. Initialize this new group without major
-	 * and minor info and this info will be filled in once a new thread
-	 * comes for IO. See code above.
+	 * 1. If use_hierarchy of the CGroup where cfqg's parent stays is not
+	 *    set, we put this cfqg onto global service tree.
+	 * 2. If cfqg is root cfqg, put it onto global service tree.
 	 */
-	if (bdi->dev) {
-		sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
-		cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd,
-					MKDEV(major, minor));
-	} else
-		cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd,
-					0);
+	if (!p_cfqg) {
+		cfqe->service_tree = &cfqd->grp_service_tree;
+		cfqe->parent = NULL;
+		return;
+	}
 
-	cfqg->cfqe.weight = blkcg_get_weight(blkcg, cfqg->blkg.dev);
+	p_cfqe = &p_cfqg->cfqe;
 
-	/* Add group on cfqd list */
-	hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list);
+	cfqe->parent = p_cfqe;
+
+	/*
+	 * Currently, just put cfq group entity on "BE:SYNC" workload
+	 * service tree.
+	 */
+	cfqe->service_tree = service_tree_for(p_cfqg, BE_WORKLOAD,
+						      SYNC_WORKLOAD);
+	/* child reference */
+	p_cfqg->ref++;
+}
+
+static struct cfq_group *cfqg_get_parent(struct cfq_group * cfqg)
+{
+	struct cfq_entity *cfqe, *p_cfqe;
+
+	if (!cfqg)
+		return NULL;
+
+	cfqe = &cfqg->cfqe;
+	p_cfqe = cfqe->parent;
+	if (!p_cfqe)
+		return NULL;
+
+	return cfqg_of_entity(p_cfqe);
+}
+
+static struct cfq_group *
+cfqg_chain_alloc(struct cfq_data *cfqd, struct cgroup *cgroup)
+{
+	struct blkio_cgroup *blkcg;
+	struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
+	unsigned int major, minor;
+	struct cfq_group *cfqg, *leaf_cfqg, *child_cfqg, *tmp_cfqg;
+	void *key = cfqd;
+
+	/*
+	 * If CGroup's use_hierarchy is unset, we just need to allocate only
+	 * one CFQ group, and this group will put onto the "grp_service_tree".
+	 * We don't need to check whether the cfqg exists, the caller has
+	 * already checked it.
+	 */
+	blkcg = cgroup_to_blkio_cgroup(cgroup);
+	if (!blkcg_get_use_hierarchy(blkcg)) {
+		cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC,
+				    cfqd->queue->node);
+		if (!cfqg)
+			return NULL;
+
+		init_cfqg(cfqd, blkcg, cfqg);
+		cfqg_set_parent(cfqd, cfqg, NULL);
+		return cfqg;
+	}
+
+	/*
+	 * Allocate the CFQ group chain until we meet the group we'v already
+	 * allocated before, or to the CGroup whose use_hierarchy is not set.
+	 */
+	leaf_cfqg = NULL;
+	child_cfqg = NULL;
+	for (; cgroup != NULL; cgroup = cgroup->parent) {
+		blkcg = cgroup_to_blkio_cgroup(cgroup);
+		cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
+		if (cfqg) {
+			if (!cfqg->blkg.dev && bdi->dev &&
+			    dev_name(bdi->dev)) {
+				sscanf(dev_name(bdi->dev), "%u:%u",
+				       &major, &minor);
+				cfqg->blkg.dev = MKDEV(major, minor);
+			}
+
+			/*
+			 * Initialization of parent doesn't finish yet, get
+			 * it done.
+			 */
+			if (child_cfqg) {
+				if (blkcg_get_use_hierarchy(blkcg))
+					cfqg_set_parent(cfqd, child_cfqg,
+							cfqg);
+				else
+					cfqg_set_parent(cfqd, child_cfqg,
+							NULL);
+			}
+
+			/* chain has already been built */
+			break;
+		}
+
+		/*
+		 * We only allocate a cfqg that the corresponding cgroup's
+		 * use_hierarchy is set.
+		 */
+		if (blkcg_get_use_hierarchy(blkcg)) {
+			cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC,
+					    cfqd->queue->node);
+			if (!cfqg)
+				goto clean_up;
+
+			if (!leaf_cfqg)
+				leaf_cfqg = cfqg;
+
+			init_cfqg(cfqd, blkcg, cfqg);
+		} else {
+			cfqg = NULL;
+		}
+
+		if (child_cfqg)
+			cfqg_set_parent(cfqd, child_cfqg, cfqg);
+
+		/*
+		 * This CGroup's use_hierarchy isn't set, this means the CFQ
+		 * group chain has been built.
+		 */
+		if (!blkcg_get_use_hierarchy(blkcg))
+			break;
+
+		child_cfqg = cfqg;
+	}
+
+	return leaf_cfqg;
+
+clean_up:
+	/* clean up the allocated cfq groups. */
+	while (leaf_cfqg) {
+		tmp_cfqg = leaf_cfqg;
+		leaf_cfqg = cfqg_get_parent(leaf_cfqg);
+		uninit_cfqg(cfqd, tmp_cfqg);
+	}
+
+	return NULL;
+}
+
+static struct cfq_group *
+cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
+{
+	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
+	struct cfq_group *cfqg = NULL;
+	void *key = cfqd;
+	struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
+	unsigned int major, minor;
+
+	cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
+	if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
+		sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
+		cfqg->blkg.dev = MKDEV(major, minor);
+		goto done;
+	}
+	if (cfqg || !create)
+		goto done;
+
+	/*
+	 * Allocate CFQ group chain to the root group or we meet the CGroup
+	 * with use_hierarchy disabled.
+	 */
+	cfqg = cfqg_chain_alloc(cfqd, cgroup);
 
 done:
 	return cfqg;
@@ -1148,6 +1398,7 @@ static void cfq_put_cfqg(struct cfq_group *cfqg)
 {
 	struct cfq_rb_root *st;
 	int i, j;
+	struct cfq_group *p_cfqg;
 
 	BUG_ON(cfqg->ref <= 0);
 	cfqg->ref--;
@@ -1155,6 +1406,22 @@ static void cfq_put_cfqg(struct cfq_group *cfqg)
 		return;
 	for_each_cfqg_st(cfqg, i, j, st)
 		BUG_ON(!RB_EMPTY_ROOT(&st->rb));
+
+	do {
+		p_cfqg = cfqg_get_parent(cfqg);
+		kfree(cfqg);
+		cfqg = NULL;
+		/*
+		 * Drop the reference taken by children, if nobody references
+		 * parent group, we need delete the parent also.
+		 */
+		if (p_cfqg) {
+			p_cfqg->ref--;
+			if (p_cfqg->ref == 0)
+				cfqg = p_cfqg;
+		}
+	} while (cfqg);
+
 	kfree(cfqg);
 }
 
@@ -1321,9 +1588,6 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 			 * ioprio.
 			 */
 			pos_offset = cfq_get_boost(cfqd, cfqq);
-			/* Debug purpose, should remove. */
-			cfq_log_cfqq(cfqd, cfqq, "pos_offset: %llu",
-				     pos_offset);
 			cfqe->vdisktime = service_tree->min_vdisktime +
 						pos_offset;
 		} else
@@ -1365,9 +1629,8 @@ insert:
 	cfqe->service_tree = service_tree;
 
 	/* Add cfqq onto service tree. */
+
 	cfq_entity_service_tree_add(service_tree, cfqe);
-	update_min_vdisktime(service_tree);
-	cfqq->reposition_time = jiffies;
 	if ((add_front || !new_cfqq) && !group_changed)
 		return;
 	cfq_group_service_tree_add(cfqd, cfqq->cfqg);
@@ -1810,28 +2073,43 @@ static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd)
 	return cfqq_of_entity(cfq_rb_first(service_tree));
 }
 
-static struct cfq_queue *cfq_get_next_queue_forced(struct cfq_data *cfqd)
+struct cfq_rb_root *choose_service_tree_forced(struct cfq_group *cfqg)
 {
-	struct cfq_group *cfqg;
-	struct cfq_entity *cfqe;
 	int i, j;
 	struct cfq_rb_root *st;
 
-	if (!cfqd->rq_queued)
-		return NULL;
+	for_each_cfqg_st(cfqg, i, j, st) {
+		if (st->count != 0)
+			return st;
+	}
 
-	cfqg = cfq_get_next_cfqg(cfqd);
-	if (!cfqg)
+	return NULL;
+}
+
+static struct cfq_entity *
+cfq_get_next_entity_forced(struct cfq_data *cfqd)
+{
+	struct cfq_entity *cfqe;
+	struct cfq_rb_root *st = &cfqd->grp_service_tree;
+	struct cfq_group *cfqg;
+
+	if (!cfqd->rq_queued)
 		return NULL;
 
-	for_each_cfqg_st(cfqg, i, j, st) {
+	do {
 		cfqe = cfq_rb_first(st);
-		if (cfqe != NULL)
-			return cfqq_of_entity(cfqe);
-	}
+		if (cfqe && !cfqe->is_group_entity)
+			return cfqe;
+		else if (cfqe && cfqe->is_group_entity)
+			cfqg = cfqg_of_entity(cfqe);
+
+		st = choose_service_tree_forced(cfqg);
+	} while (st);
+
 	return NULL;
 }
 
+
 /*
  * Get and set a new active qu


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/