Message-ID: <4CF31307.9090004@cn.fujitsu.com>
Date: Mon, 29 Nov 2010 10:42:15 +0800
From: Gui Jianfeng
To: Vivek Goyal
CC: Jens Axboe, Corrado Zoccolo, Chad Talbott, Nauman Rafique, Divyesh Shah,
    linux kernel mailing list
Subject: Re: [RFC] [PATCH 8/8] cfq-iosched: Introduce hierarchical scheduling
    with CFQ queue and group at the same level
References: <4CDF7BC5.9080803@cn.fujitsu.com> <4CDF9D0D.4060806@cn.fujitsu.com>
    <20101115204459.GF3396@redhat.com>
In-Reply-To: <20101115204459.GF3396@redhat.com>

Vivek Goyal wrote:
> On Sun, Nov 14, 2010 at 04:25:49PM +0800, Gui Jianfeng wrote:
>> This patch makes CFQ queues and CFQ groups schedule at the same level.
>> Consider the following hierarchy:
>>
>>                Root
>>               / |  \
>>             q1  q2  G1
>>                    /  \
>>                  q3    G2
>>
>> q1, q2 and q3 are CFQ queues; G1 and G2 are CFQ groups. Currently, q1, q2
>> and G1 are scheduled on the same service tree in the Root CFQ group, and
>> q3 and G2 are scheduled under G1. Note that, for the time being, a CFQ
>> group is treated as "BE and SYNC" workload and is put on the "BE and SYNC"
>> service tree. That means service differentiation only happens on the
>> "BE and SYNC" service tree. Later, we may introduce an "IO Class" for CFQ
>> groups.
>>
> 
> Have you got rid of flat mode (the existing default mode)? IOW, I don't see
> the introduction of "use_hierarchy", which would differentiate whether to
> treat a hierarchy as flat or not.

Vivek,

As I said in [PATCH 0/8], yes, for the time being I got rid of flat mode.
Hierarchical cgroup creation has only just been merged into block-tree and
is not in mainline yet, so I think that's OK. I'd like to post a
"use_hierarchy" patchset separately once this patchset gets merged. What do
you say, Vivek?

Gui
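For readers following the thread, the scheme described above can be condensed
into a minimal user-space sketch: queue and group entities live on the same
per-group service tree, and selection descends into group entities until it
reaches a queue. Every name and type here is an illustrative stand-in, not
the kernel's real code:

#include <stdio.h>

struct entity {
        int is_group;                 /* mirrors is_group_entity */
        unsigned long long vdisktime; /* virtual disk time */
        struct entity *children[8];   /* stand-in for the rb-tree */
        int nr_children;
        const char *name;
};

/* Pick the child with the smallest vdisktime (an rb_first() stand-in). */
static struct entity *first_entity(struct entity *group)
{
        struct entity *best = NULL;
        for (int i = 0; i < group->nr_children; i++)
                if (!best || group->children[i]->vdisktime < best->vdisktime)
                        best = group->children[i];
        return best;
}

/* Descend from the root until a queue entity is found, in the spirit of
 * cfq_select_queue() in the patch below. */
static struct entity *select_queue(struct entity *root)
{
        struct entity *e = first_entity(root);
        while (e && e->is_group)
                e = first_entity(e);
        return e;
}

int main(void)
{
        struct entity q1   = { 0, 100, { 0 },        0, "q1" };
        struct entity q3   = { 0,  50, { 0 },        0, "q3" };
        struct entity G1   = { 1,  80, { &q3 },      1, "G1" };
        struct entity root = { 1,   0, { &q1, &G1 }, 2, "root" };

        printf("next: %s\n", select_queue(&root)->name);
        return 0;
}

Run against the hierarchy from the changelog, this picks G1 (vdisktime 80)
over q1 (100) at the root level, then q3 inside G1, printing "next: q3" --
the descend-until-queue behavior the patch implements.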
> 
> Vivek
> 
>> Signed-off-by: Gui Jianfeng
>> ---
>>  block/cfq-iosched.c |  483 ++++++++++++++++++++++++++++++++++-----------------
>>  1 files changed, 324 insertions(+), 159 deletions(-)
>>
>> diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
>> index 1df0928..9def3a2 100644
>> --- a/block/cfq-iosched.c
>> +++ b/block/cfq-iosched.c
>> @@ -105,6 +105,9 @@ struct io_sched_entity {
>>          u64 vdisktime;
>>          bool is_group_entity;
>>          unsigned int weight;
>> +        struct io_sched_entity *parent;
>> +        /* Reposition time */
>> +        unsigned long reposition_time;
>>  };
>>
>>  /*
>> @@ -113,8 +116,6 @@ struct io_sched_entity {
>>  struct cfq_queue {
>>          /* The schedule entity */
>>          struct io_sched_entity queue_entity;
>> -        /* Reposition time */
>> -        unsigned long reposition_time;
>>          /* reference count */
>>          atomic_t ref;
>>          /* various state flags, see below */
>> @@ -193,6 +194,9 @@ struct cfq_group {
>>          /* number of cfqq currently on this group */
>>          int nr_cfqq;
>>
>> +        /* number of sub cfq groups */
>> +        int nr_subgp;
>> +
>>          /* Per group busy queus average. Useful for workload slice calc. */
>>          unsigned int busy_queues_avg[2];
>>          /*
>> @@ -219,8 +223,6 @@ struct cfq_group {
>>   */
>>  struct cfq_data {
>>          struct request_queue *queue;
>> -        /* Root service tree for cfq_groups */
>> -        struct cfq_rb_root grp_service_tree;
>>          struct cfq_group root_group;
>>
>>          /*
>> @@ -337,8 +339,6 @@ cfqg_of_entity(struct io_sched_entity *io_entity)
>>          return NULL;
>>  }
>>
>> -static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd);
>> -
>>  static struct cfq_rb_root *service_tree_for(struct cfq_group *cfqg,
>>                                              enum wl_prio_t prio,
>>                                              enum wl_type_t type)
>> @@ -629,10 +629,15 @@ static inline unsigned cfq_group_get_avg_queues(struct cfq_data *cfqd,
>>  static inline unsigned
>>  cfq_group_slice(struct cfq_data *cfqd, struct cfq_group *cfqg)
>>  {
>> -        struct cfq_rb_root *st = &cfqd->grp_service_tree;
>>          struct io_sched_entity *group_entity = &cfqg->group_entity;
>> +        struct cfq_rb_root *st = group_entity->service_tree;
>>
>> -        return cfq_target_latency * group_entity->weight / st->total_weight;
>> +        if (st)
>> +                return cfq_target_latency * group_entity->weight
>> +                        / st->total_weight;
>> +        else
>> +                /* If this is the root group, give it a full slice. */
>> +                return cfq_target_latency;
>>  }
>>
>>  static inline void
>> @@ -795,17 +800,6 @@ static struct io_sched_entity *cfq_rb_first(struct cfq_rb_root *root)
>>          return NULL;
>>  }
>>
>> -static struct io_sched_entity *cfq_rb_first_entity(struct cfq_rb_root *root)
>> -{
>> -        if (!root->left)
>> -                root->left = rb_first(&root->rb);
>> -
>> -        if (root->left)
>> -                return rb_entry_entity(root->left);
>> -
>> -        return NULL;
>> -}
>> -
>>  static void rb_erase_init(struct rb_node *n, struct rb_root *root)
>>  {
>>          rb_erase(n, root);
>> @@ -887,6 +881,7 @@ io_entity_service_tree_add(struct cfq_rb_root *st,
>>                             struct io_sched_entity *io_entity)
>>  {
>>          __io_entity_service_tree_add(st, io_entity);
>> +        io_entity->reposition_time = jiffies;
>>          st->count++;
>>          st->total_weight += io_entity->weight;
>>  }
>> @@ -894,29 +889,49 @@ io_entity_service_tree_add(struct cfq_rb_root *st,
>>  static void
>>  cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
>>  {
>> -        struct cfq_rb_root *st = &cfqd->grp_service_tree;
>>          struct rb_node *n;
>>          struct io_sched_entity *group_entity = &cfqg->group_entity;
>> -        struct io_sched_entity *__group_entity;
>> +        struct io_sched_entity *entity;
>> +        struct cfq_rb_root *st;
>> +        struct cfq_group *__cfqg;
>>
>>          cfqg->nr_cfqq++;
>> +
>> +        /*
>> +         * Root group doesn't belongs to any service
>> +         */
>> +        if (cfqg == &cfqd->root_group)
>> +                return;
>> +
>>          if (!RB_EMPTY_NODE(&group_entity->rb_node))
>>                  return;
>>
>> -        /*
>> -         * Currently put the group at the end. Later implement something
>> -         * so that groups get lesser vtime based on their weights, so that
>> -         * if group does not loose all if it was not continously backlogged.
>> +        /*
>> +         * Enqueue this group and its ancestors onto their service tree.
>>           */
>> -        n = rb_last(&st->rb);
>> -        if (n) {
>> -                __group_entity = rb_entry_entity(n);
>> -                group_entity->vdisktime = __group_entity->vdisktime +
>> -                                          CFQ_IDLE_DELAY;
>> -        } else
>> -                group_entity->vdisktime = st->min_vdisktime;
>> +        while (group_entity && group_entity->parent) {
>> +                if (!RB_EMPTY_NODE(&group_entity->rb_node))
>> +                        return;
>> +                /*
>> +                 * Currently put the group at the end. Later implement
>> +                 * something so that groups get lesser vtime based on their
>> +                 * weights, so that if group does not loose all if it was not
>> +                 * continously backlogged.
>> +                 */
>> +                st = group_entity->service_tree;
>> +                n = rb_last(&st->rb);
>> +                if (n) {
>> +                        entity = rb_entry_entity(n);
>> +                        group_entity->vdisktime = entity->vdisktime +
>> +                                                  CFQ_IDLE_DELAY;
>> +                } else
>> +                        group_entity->vdisktime = st->min_vdisktime;
>>
>> -        io_entity_service_tree_add(st, group_entity);
>> +                io_entity_service_tree_add(st, group_entity);
>> +                group_entity = group_entity->parent;
>> +                __cfqg = cfqg_of_entity(group_entity);
>> +                __cfqg->nr_subgp++;
>> +        }
>>  }
>>
>>  static void
>> @@ -933,27 +948,47 @@ io_entity_service_tree_del(struct cfq_rb_root *st,
>>          if (!RB_EMPTY_NODE(&io_entity->rb_node)) {
>>                  __io_entity_service_tree_del(st, io_entity);
>>                  st->total_weight -= io_entity->weight;
>> -                io_entity->service_tree = NULL;
>>          }
>>  }
>>
>>  static void
>>  cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg)
>>  {
>> -        struct cfq_rb_root *st = &cfqd->grp_service_tree;
>>          struct io_sched_entity *group_entity = &cfqg->group_entity;
>> +        struct cfq_group *__cfqg, *p_cfqg;
>>
>>          BUG_ON(cfqg->nr_cfqq < 1);
>>          cfqg->nr_cfqq--;
>>
>> +        /*
>> +         * Root group doesn't belongs to any service
>> +         */
>> +        if (cfqg == &cfqd->root_group)
>> +                return;
>> +
>>          /* If there are other cfq queues under this group, don't delete it */
>>          if (cfqg->nr_cfqq)
>>                  return;
>> -
>> -        cfq_log_cfqg(cfqd, cfqg, "del_from_rr group");
>> -        io_entity_service_tree_del(st, group_entity);
>> -        cfqg->saved_workload_slice = 0;
>> -        cfq_blkiocg_update_dequeue_stats(&cfqg->blkg, 1);
>> +        /* If child group exists, don't dequeue it */
>> +        if (cfqg->nr_subgp)
>> +                return;
>> +
>> +        /*
>> +         * Dequeue this group and its ancestors from their service tree.
>> +         */
>> +        while (group_entity && group_entity->parent) {
>> +                __cfqg = cfqg_of_entity(group_entity);
>> +                p_cfqg = cfqg_of_entity(group_entity->parent);
>> +                io_entity_service_tree_del(group_entity->service_tree,
>> +                                           group_entity);
>> +                cfq_blkiocg_update_dequeue_stats(&__cfqg->blkg, 1);
>> +                cfq_log_cfqg(cfqd, __cfqg, "del_from_rr group");
>> +                __cfqg->saved_workload_slice = 0;
>> +                group_entity = group_entity->parent;
>> +                p_cfqg->nr_subgp--;
>> +                if (p_cfqg->nr_cfqq || p_cfqg->nr_subgp)
>> +                        return;
>> +        }
>>  }
>>
>>  static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq)
>> @@ -985,7 +1020,6 @@ static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq)
>>  static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
>>                               struct cfq_queue *cfqq)
>>  {
>> -        struct cfq_rb_root *st = &cfqd->grp_service_tree;
>>          unsigned int used_sl, charge;
>>          int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg)
>>                          - cfqg->service_tree_idle.count;
>> @@ -999,10 +1033,21 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
>>          else if (!cfq_cfqq_sync(cfqq) && !nr_sync)
>>                  charge = cfqq->allocated_slice;
>>
>> -        /* Can't update vdisktime while group is on service tree */
>> -        __io_entity_service_tree_del(st, group_entity);
>> -        group_entity->vdisktime += cfq_scale_slice(charge, group_entity);
>> -        __io_entity_service_tree_add(st, group_entity);
>> +        /*
>> +         * Update the vdisktime on the whole chain.
>> +         */
>> +        while (group_entity && group_entity->parent) {
>> +                struct cfq_rb_root *st = group_entity->service_tree;
>> +
>> +                /* Can't update vdisktime while group is on service tree */
>> +                __io_entity_service_tree_del(st, group_entity);
>> +                group_entity->vdisktime += cfq_scale_slice(charge,
>> +                                                           group_entity);
>> +                __io_entity_service_tree_add(st, group_entity);
>> +                st->count++;
>> +                group_entity->reposition_time = jiffies;
>> +                group_entity = group_entity->parent;
>> +        }
>>
>>          /* This group is being expired. Save the context */
>>          if (time_after(cfqd->workload_expires, jiffies)) {
>> @@ -1014,7 +1059,8 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
>>                  cfqg->saved_workload_slice = 0;
>>
>>          cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu",
>> -                     group_entity->vdisktime, st->min_vdisktime);
>> +                     cfqg->group_entity.vdisktime,
>> +                     cfqg->group_entity.service_tree->min_vdisktime);
>>          cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u disp=%u charge=%u iops=%u"
>>                       " sect=%u", used_sl, cfqq->slice_dispatch, charge,
>>                       iops_mode(cfqd), cfqq->nr_sectors);
>> @@ -1036,35 +1082,27 @@ void cfq_update_blkio_group_weight(void *key, struct blkio_group *blkg,
>>          cfqg_of_blkg(blkg)->group_entity.weight = weight;
>>  }
>>
>> -static struct cfq_group *
>> -cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
>> +static void init_group_entity(struct blkio_cgroup *blkcg,
>> +                              struct cfq_group *cfqg)
>> +{
>> +        struct io_sched_entity *group_entity = &cfqg->group_entity;
>> +
>> +        group_entity->weight = blkcg_get_weight(blkcg, cfqg->blkg.dev);
>> +        RB_CLEAR_NODE(&group_entity->rb_node);
>> +        group_entity->is_group_entity = true;
>> +        group_entity->parent = NULL;
>> +}
>> +
>> +static void init_cfqg(struct cfq_data *cfqd, struct blkio_cgroup *blkcg,
>> +                      struct cfq_group *cfqg)
>>  {
>> -        struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
>> -        struct cfq_group *cfqg = NULL;
>> -        void *key = cfqd;
>>          int i, j;
>>          struct cfq_rb_root *st;
>> -        struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
>>          unsigned int major, minor;
>> -
>> -        cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
>> -        if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
>> -                sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
>> -                cfqg->blkg.dev = MKDEV(major, minor);
>> -                goto done;
>> -        }
>> -        if (cfqg || !create)
>> -                goto done;
>> -
>> -        cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC, cfqd->queue->node);
>> -        if (!cfqg)
>> -                goto done;
>> +        struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
>>
>>          for_each_cfqg_st(cfqg, i, j, st)
>>                  *st = CFQ_RB_ROOT;
>> -        RB_CLEAR_NODE(&cfqg->group_entity.rb_node);
>> -
>> -        cfqg->group_entity.is_group_entity = true;
>>
>>          /*
>>           * Take the initial reference that will be released on destroy
>> @@ -1074,24 +1112,119 @@ cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
>>           */
>>          atomic_set(&cfqg->ref, 1);
>>
>> +        /* Add group onto cgroup list */
>> +        sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
>> +        cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd,
>> +                                    MKDEV(major, minor));
>> +        /* Initiate group entity */
>> +        init_group_entity(blkcg, cfqg);
>> +        /* Add group on cfqd list */
>> +        hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list);
>> +}
>> +
>> +static void cfq_destroy_cfqg(struct cfq_data *cfqd, struct cfq_group *cfqg);
>> +
>> +static void uninit_cfqg(struct cfq_data *cfqd, struct cfq_group *cfqg)
>> +{
>> +        if (!cfq_blkiocg_del_blkio_group(&cfqg->blkg))
>> +                cfq_destroy_cfqg(cfqd, cfqg);
>> +}
>> +
>> +static void cfqg_set_parent(struct cfq_data *cfqd, struct cfq_group *cfqg,
>> +                            struct cfq_group *p_cfqg)
>> +{
>> +        struct io_sched_entity *group_entity, *p_group_entity;
>> +
>> +        group_entity = &cfqg->group_entity;
>> +
>> +        p_group_entity = &p_cfqg->group_entity;
>> +
>> +        group_entity->parent = p_group_entity;
>> +
>>          /*
>> -         * Add group onto cgroup list. It might happen that bdi->dev is
>> -         * not initiliazed yet. Initialize this new group without major
>> -         * and minor info and this info will be filled in once a new thread
>> -         * comes for IO. See code above.
>> +         * Currently, just put cfq group entity on "BE:SYNC" workload
>> +         * service tree.
>>           */
>> -        if (bdi->dev) {
>> -                sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
>> -                cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd,
>> -                                            MKDEV(major, minor));
>> -        } else
>> -                cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd,
>> -                                            0);
>> +        group_entity->service_tree = service_tree_for(p_cfqg, BE_WORKLOAD,
>> +                                                      SYNC_WORKLOAD);
>> +        /* child reference */
>> +        atomic_inc(&p_cfqg->ref);
>> +}
>>
>> -        cfqg->group_entity.weight = blkcg_get_weight(blkcg, cfqg->blkg.dev);
>> +int cfqg_chain_alloc(struct cfq_data *cfqd, struct cgroup *cgroup)
>> +{
>> +        struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
>> +        struct blkio_cgroup *p_blkcg;
>> +        struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
>> +        unsigned int major, minor;
>> +        struct cfq_group *cfqg, *p_cfqg;
>> +        void *key = cfqd;
>> +        int ret;
>>
>> -        /* Add group on cfqd list */
>> -        hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list);
>> +        cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
>> +        if (cfqg) {
>> +                if (!cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
>> +                        sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
>> +                        cfqg->blkg.dev = MKDEV(major, minor);
>> +                }
>> +                /* chain has already been built */
>> +                return 0;
>> +        }
>> +
>> +        cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC, cfqd->queue->node);
>> +        if (!cfqg)
>> +                return -1;
>> +
>> +        init_cfqg(cfqd, blkcg, cfqg);
>> +
>> +        /* Already to the top group */
>> +        if (!cgroup->parent)
>> +                return 0;
>> +
>> +        /* Allocate CFQ groups on the chain */
>> +        ret = cfqg_chain_alloc(cfqd, cgroup->parent);
>> +        if (ret == -1) {
>> +                uninit_cfqg(cfqd, cfqg);
>> +                return -1;
>> +        }
>> +
>> +        p_blkcg = cgroup_to_blkio_cgroup(cgroup->parent);
>> +        p_cfqg = cfqg_of_blkg(blkiocg_lookup_group(p_blkcg, key));
>> +        BUG_ON(p_cfqg == NULL);
>> +
>> +        cfqg_set_parent(cfqd, cfqg, p_cfqg);
>> +        return 0;
>> +}
>> +
>> +static struct cfq_group *
>> +cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
>> +{
>> +        struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
>> +        struct cfq_group *cfqg = NULL;
>> +        void *key = cfqd;
>> +        struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
>> +        unsigned int major, minor;
>> +        int ret;
>> +
>> +        cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
>> +        if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
>> +                sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
>> +                cfqg->blkg.dev = MKDEV(major, minor);
>> +                goto done;
>> +        }
>> +        if (cfqg || !create)
>> +                goto done;
>> +
>> +        /*
>> +         * For hierarchical cfq group scheduling, we need to allocate
>> +         * the whole cfq group chain.
>> +         */
>> +        ret = cfqg_chain_alloc(cfqd, cgroup);
>> +        if (!ret) {
>> +                cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
>> +                BUG_ON(cfqg == NULL);
>> +                goto done;
>> +        }
>>
>>  done:
>>          return cfqg;
>> @@ -1136,12 +1269,22 @@ static void cfq_put_cfqg(struct cfq_group *cfqg)
>>  {
>>          struct cfq_rb_root *st;
>>          int i, j;
>> +        struct io_sched_entity *group_entity;
>> +        struct cfq_group *p_cfqg;
>>
>>          BUG_ON(atomic_read(&cfqg->ref) <= 0);
>>          if (!atomic_dec_and_test(&cfqg->ref))
>>                  return;
>>          for_each_cfqg_st(cfqg, i, j, st)
>>                  BUG_ON(!RB_EMPTY_ROOT(&st->rb));
>> +
>> +        group_entity = &cfqg->group_entity;
>> +        if (group_entity->parent) {
>> +                p_cfqg = cfqg_of_entity(group_entity->parent);
>> +                /* Drop the reference taken by children */
>> +                atomic_dec(&p_cfqg->ref);
>> +        }
>> +
>>          kfree(cfqg);
>>  }
>>
>> @@ -1336,7 +1479,6 @@ insert:
>>          io_entity_service_tree_add(service_tree, queue_entity);
>>
>>          update_min_vdisktime(service_tree);
>> -        cfqq->reposition_time = jiffies;
>>          if ((add_front || !new_cfqq) && !group_changed)
>>                  return;
>>          cfq_group_service_tree_add(cfqd, cfqq->cfqg);
>> @@ -1779,28 +1921,30 @@ static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd)
>>          return cfqq_of_entity(cfq_rb_first(service_tree));
>>  }
>>
>> -static struct cfq_queue *cfq_get_next_queue_forced(struct cfq_data *cfqd)
>> +static struct io_sched_entity *
>> +cfq_get_next_entity_forced(struct cfq_data *cfqd, struct cfq_group *cfqg)
>>  {
>> -        struct cfq_group *cfqg;
>> -        struct io_sched_entity *queue_entity;
>> +        struct io_sched_entity *entity;
>>          int i, j;
>>          struct cfq_rb_root *st;
>>
>>          if (!cfqd->rq_queued)
>>                  return NULL;
>>
>> -        cfqg = cfq_get_next_cfqg(cfqd);
>> -        if (!cfqg)
>> -                return NULL;
>> -
>>          for_each_cfqg_st(cfqg, i, j, st) {
>> -                queue_entity = cfq_rb_first(st);
>> -                if (queue_entity != NULL)
>> -                        return cfqq_of_entity(queue_entity);
>> +                entity = cfq_rb_first(st);
>> +
>> +                if (entity && !entity->is_group_entity)
>> +                        return entity;
>> +                else if (entity && entity->is_group_entity) {
>> +                        cfqg = cfqg_of_entity(entity);
>> +                        return cfq_get_next_entity_forced(cfqd, cfqg);
>> +                }
>>          }
>>          return NULL;
>>  }
>>
>> +
>>  /*
>>   * Get and set a new active queue for service.
>>   */
>> @@ -2155,8 +2299,7 @@ static void cfq_setup_merge(struct cfq_queue *cfqq, struct cfq_queue *new_cfqq)
>>  static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd,
>>                                      struct cfq_group *cfqg, enum wl_prio_t prio)
>>  {
>> -        struct io_sched_entity *queue_entity;
>> -        struct cfq_queue *cfqq;
>> +        struct io_sched_entity *entity;
>>          unsigned long lowest_start_time;
>>          int i;
>>          bool time_valid = false;
>> @@ -2167,12 +2310,11 @@ static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd,
>>           * type. But for the time being just make use of reposition_time only.
>>           */
>>          for (i = 0; i <= SYNC_WORKLOAD; ++i) {
>> -                queue_entity = cfq_rb_first(service_tree_for(cfqg, prio, i));
>> -                cfqq = cfqq_of_entity(queue_entity);
>> -                if (queue_entity &&
>> +                entity = cfq_rb_first(service_tree_for(cfqg, prio, i));
>> +                if (entity &&
>>                      (!time_valid ||
>> -                     cfqq->reposition_time < lowest_start_time)) {
>> -                        lowest_start_time = cfqq->reposition_time;
>> +                     entity->reposition_time < lowest_start_time)) {
>> +                        lowest_start_time = entity->reposition_time;
>>                          cur_best = i;
>>                          time_valid = true;
>>                  }
>> @@ -2181,47 +2323,13 @@ static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd,
>>          return cur_best;
>>  }
>>
>> -static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg)
>> +static void set_workload_expire(struct cfq_data *cfqd, struct cfq_group *cfqg)
>>  {
>>          unsigned slice;
>>          unsigned count;
>>          struct cfq_rb_root *st;
>>          unsigned group_slice;
>>
>> -        if (!cfqg) {
>> -                cfqd->serving_prio = IDLE_WORKLOAD;
>> -                cfqd->workload_expires = jiffies + 1;
>> -                return;
>> -        }
>> -
>> -        /* Choose next priority. RT > BE > IDLE */
>> -        if (cfq_group_busy_queues_wl(RT_WORKLOAD, cfqd, cfqg))
>> -                cfqd->serving_prio = RT_WORKLOAD;
>> -        else if (cfq_group_busy_queues_wl(BE_WORKLOAD, cfqd, cfqg))
>> -                cfqd->serving_prio = BE_WORKLOAD;
>> -        else {
>> -                cfqd->serving_prio = IDLE_WORKLOAD;
>> -                cfqd->workload_expires = jiffies + 1;
>> -                return;
>> -        }
>> -
>> -        /*
>> -         * For RT and BE, we have to choose also the type
>> -         * (SYNC, SYNC_NOIDLE, ASYNC), and to compute a workload
>> -         * expiration time
>> -         */
>> -        st = service_tree_for(cfqg, cfqd->serving_prio, cfqd->serving_type);
>> -        count = st->count;
>> -
>> -        /*
>> -         * check workload expiration, and that we still have other queues ready
>> -         */
>> -        if (count && !time_after(jiffies, cfqd->workload_expires))
>> -                return;
>> -
>> -        /* otherwise select new workload type */
>> -        cfqd->serving_type =
>> -                cfq_choose_wl(cfqd, cfqg, cfqd->serving_prio);
>>          st = service_tree_for(cfqg, cfqd->serving_prio, cfqd->serving_type);
>>          count = st->count;
>>
>> @@ -2262,26 +2370,51 @@ static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg)
>>          cfqd->workload_expires = jiffies + slice;
>>  }
>>
>> -static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd)
>> +static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg)
>>  {
>> -        struct cfq_rb_root *st = &cfqd->grp_service_tree;
>> -        struct cfq_group *cfqg;
>> -        struct io_sched_entity *group_entity;
>> +        struct cfq_rb_root *st;
>> +        unsigned count;
>>
>> -        if (RB_EMPTY_ROOT(&st->rb))
>> -                return NULL;
>> -        group_entity = cfq_rb_first_entity(st);
>> -        cfqg = cfqg_of_entity(group_entity);
>> -        BUG_ON(!cfqg);
>> -        update_min_vdisktime(st);
>> -        return cfqg;
>> +        if (!cfqg) {
>> +                cfqd->serving_prio = IDLE_WORKLOAD;
>> +                cfqd->workload_expires = jiffies + 1;
>> +                return;
>> +        }
>> +
>> +        /* Choose next priority. RT > BE > IDLE */
>> +        if (cfq_group_busy_queues_wl(RT_WORKLOAD, cfqd, cfqg))
>> +                cfqd->serving_prio = RT_WORKLOAD;
>> +        else if (cfq_group_busy_queues_wl(BE_WORKLOAD, cfqd, cfqg))
>> +                cfqd->serving_prio = BE_WORKLOAD;
>> +        else {
>> +                cfqd->serving_prio = IDLE_WORKLOAD;
>> +                cfqd->workload_expires = jiffies + 1;
>> +                return;
>> +        }
>> +
>> +        /*
>> +         * For RT and BE, we have to choose also the type
>> +         * (SYNC, SYNC_NOIDLE, ASYNC), and to compute a workload
>> +         * expiration time
>> +         */
>> +        st = service_tree_for(cfqg, cfqd->serving_prio, cfqd->serving_type);
>> +        count = st->count;
>> +
>> +        /*
>> +         * check workload expiration, and that we still have other queues ready
>> +         */
>> +        if (count && !time_after(jiffies, cfqd->workload_expires))
>> +                return;
>> +
>> +        /* otherwise select new workload type */
>> +        cfqd->serving_type =
>> +                cfq_choose_wl(cfqd, cfqg, cfqd->serving_prio);
>>  }
>>
>> -static void cfq_choose_cfqg(struct cfq_data *cfqd)
>> +struct io_sched_entity *choose_serving_entity(struct cfq_data *cfqd,
>> +                                              struct cfq_group *cfqg)
>>  {
>> -        struct cfq_group *cfqg = cfq_get_next_cfqg(cfqd);
>> -
>> -        cfqd->serving_group = cfqg;
>> +        struct cfq_rb_root *service_tree;
>>
>>          /* Restore the workload type data */
>>          if (cfqg->saved_workload_slice) {
>> @@ -2292,8 +2425,21 @@ static void cfq_choose_cfqg(struct cfq_data *cfqd)
>>                  cfqd->workload_expires = jiffies - 1;
>>
>>          choose_service_tree(cfqd, cfqg);
>> -}
>>
>> +        service_tree = service_tree_for(cfqg, cfqd->serving_prio,
>> +                                        cfqd->serving_type);
>> +
>> +        if (!cfqd->rq_queued)
>> +                return NULL;
>> +
>> +        /* There is nothing to dispatch */
>> +        if (!service_tree)
>> +                return NULL;
>> +        if (RB_EMPTY_ROOT(&service_tree->rb))
>> +                return NULL;
>> +
>> +        return cfq_rb_first(service_tree);
>> +}
>>  /*
>>   * Select a queue for service. If we have a current active queue,
>>   * check whether to continue servicing it, or retrieve and set a new one.
>> @@ -2301,6 +2447,8 @@ static void cfq_choose_cfqg(struct cfq_data *cfqd)
>>  static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
>>  {
>>          struct cfq_queue *cfqq, *new_cfqq = NULL;
>> +        struct cfq_group *cfqg;
>> +        struct io_sched_entity *entity;
>>
>>          cfqq = cfqd->active_queue;
>>          if (!cfqq)
>> @@ -2389,8 +2537,23 @@ new_queue:
>>           * Current queue expired. Check if we have to switch to a new
>>           * service tree
>>           */
>> -        if (!new_cfqq)
>> -                cfq_choose_cfqg(cfqd);
>> +        cfqg = &cfqd->root_group;
>> +
>> +        if (!new_cfqq) {
>> +                do {
>> +                        entity = choose_serving_entity(cfqd, cfqg);
>> +                        if (entity && !entity->is_group_entity) {
>> +                                /* This is the CFQ queue that should run */
>> +                                new_cfqq = cfqq_of_entity(entity);
>> +                                cfqd->serving_group = cfqg;
>> +                                set_workload_expire(cfqd, cfqg);
>> +                                break;
>> +                        } else if (entity && entity->is_group_entity) {
>> +                                /* Continue to lookup in this CFQ group */
>> +                                cfqg = cfqg_of_entity(entity);
>> +                        }
>> +                } while (entity && entity->is_group_entity);
>> +        }
>>
>>          cfqq = cfq_set_active_queue(cfqd, new_cfqq);
>>  keep_queue:
>> @@ -2421,10 +2584,14 @@ static int cfq_forced_dispatch(struct cfq_data *cfqd)
>>  {
>>          struct cfq_queue *cfqq;
>>          int dispatched = 0;
>> +        struct io_sched_entity *entity;
>> +        struct cfq_group *root = &cfqd->root_group;
>>
>>          /* Expire the timeslice of the current active queue first */
>>          cfq_slice_expired(cfqd, 0);
>> -        while ((cfqq = cfq_get_next_queue_forced(cfqd)) != NULL) {
>> +        while ((entity = cfq_get_next_entity_forced(cfqd, root)) != NULL) {
>> +                BUG_ON(entity->is_group_entity);
>> +                cfqq = cfqq_of_entity(entity);
>>                  __cfq_set_active_queue(cfqd, cfqq);
>>                  dispatched += __cfq_forced_dispatch_cfqq(cfqq);
>>          }
>> @@ -3954,9 +4121,6 @@ static void *cfq_init_queue(struct request_queue *q)
>>
>>          cfqd->cic_index = i;
>>
>> -        /* Init root service tree */
>> -        cfqd->grp_service_tree = CFQ_RB_ROOT;
>> -
>>          /* Init root group */
>>          cfqg = &cfqd->root_group;
>>          for_each_cfqg_st(cfqg, i, j, st)
>> @@ -3966,6 +4130,7 @@ static void *cfq_init_queue(struct request_queue *q)
>>          /* Give preference to root group over other groups */
>>          cfqg->group_entity.weight = 2*BLKIO_WEIGHT_DEFAULT;
>>          cfqg->group_entity.is_group_entity = true;
>> +        cfqg->group_entity.parent = NULL;
>>
>>  #ifdef CONFIG_CFQ_GROUP_IOSCHED
>>          /*
>> --
>> 1.6.5.2
>>
>>
> 

-- 
Regards
Gui Jianfeng
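As a closing aside for readers: the fairness bookkeeping at the heart of this
patch is cfq_group_served()'s walk up the parent chain, where the served
group and each ancestor are charged vdisktime scaled inversely to their
weight. Below is a compressed user-space model of that walk. All names are
illustrative; the kernel's fixed-point cfq_scale_slice() and the rb-tree
removal/re-insertion are elided, and the base weight of 500 is an assumption
standing in for the default blkio weight:

#include <stdio.h>

struct entity {
        unsigned long long vdisktime;
        unsigned int weight;
        struct entity *parent;
        const char *name;
};

/* cfq_scale_slice() stand-in: heavier groups accrue vtime more slowly. */
static unsigned long long scale_slice(unsigned int charge, struct entity *e)
{
        const unsigned int base_weight = 500; /* assumed default weight */
        return (unsigned long long)charge * base_weight / e->weight;
}

static void group_served(struct entity *e, unsigned int charge)
{
        /* In the kernel each entity is removed from its service tree,
         * updated, and re-inserted; here we just bump vdisktime. The loop
         * stops before the root, which is never on a service tree. */
        for (; e && e->parent; e = e->parent)
                e->vdisktime += scale_slice(charge, e);
}

int main(void)
{
        struct entity root = { 0, 1000, NULL,  "root" };
        struct entity G1   = { 0,  500, &root, "G1" };
        struct entity G2   = { 0,  250, &G1,   "G2" };

        group_served(&G2, 100); /* G2 was served for 100 time units */
        printf("G2=%llu G1=%llu root=%llu\n",
               G2.vdisktime, G1.vdisktime, root.vdisktime);
        return 0;
}

This prints "G2=200 G1=100 root=0": the half-weight G2 pays double, G1 pays
par, and the root group is never charged, matching the patch's treatment of
the root group as belonging to no service tree.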