From: Gui Jianfeng
Date: Thu, 10 Feb 2011 15:47:45 +0800
To: Vivek Goyal, Jens Axboe
Cc: Gui Jianfeng, Shaohua Li, lkml, Chad Talbott, Divyesh Shah
Subject: [PATCH 5/6 v4] cfq-iosched: CFQ group hierarchical scheduling and use_hierarchy interface
Message-ID: <4D539821.1090703@cn.fujitsu.com>
In-Reply-To: <4D51ED26.8050809@cn.fujitsu.com>
References: <4D51ED26.8050809@cn.fujitsu.com>

Enable hierarchical scheduling of CFQ groups and add a "use_hierarchy"
interface to the blkio cgroup. When use_hierarchy is set in a cgroup, the
CFQ groups in that subtree are scheduled against their siblings level by
level; when it is unset, groups are scheduled flat on the global group
service tree, as before.

Signed-off-by: Gui Jianfeng
---
 block/blk-cgroup.c  |   61 +++++-
 block/blk-cgroup.h  |    3 +
 block/cfq-iosched.c |  603 +++++++++++++++++++++++++++++++++++++--------------
 3 files changed, 500 insertions(+), 167 deletions(-)
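For context, the new knob is an ordinary cgroup file. A minimal userspace
sketch of toggling it (illustrative only: the /cgroup/blkio mount point and
the "test1" group are assumptions about the local setup, not part of the
patch):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	/* Assumed path: blkio controller mounted at /cgroup/blkio. */
	int fd = open("/cgroup/blkio/test1/blkio.use_hierarchy", O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* Writing "1" enables hierarchical scheduling for this subtree. */
	if (write(fd, "1", 1) != 1)
		perror("write");
	close(fd);
	return 0;
}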
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 455768a..c55fecd 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -25,7 +25,10 @@
 static DEFINE_SPINLOCK(blkio_list_lock);
 static LIST_HEAD(blkio_list);
 
-struct blkio_cgroup blkio_root_cgroup = { .weight = 2*BLKIO_WEIGHT_DEFAULT };
+struct blkio_cgroup blkio_root_cgroup = {
+	.weight = 2*BLKIO_WEIGHT_DEFAULT,
+	.use_hierarchy = 0
+};
 EXPORT_SYMBOL_GPL(blkio_root_cgroup);
 
 static struct cgroup_subsys_state *blkiocg_create(struct cgroup_subsys *,
@@ -454,6 +457,7 @@ static void __blkiocg_del_blkio_group(struct blkio_group *blkg)
 	blkg->blkcg_id = 0;
 }
 
+
 /*
  * returns 0 if blkio_group was still on cgroup list. Otherwise returns 1
  * indicating that blk_group was unhashed by the time we got to it.
@@ -765,6 +769,12 @@ unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg,
 }
 EXPORT_SYMBOL_GPL(blkcg_get_weight);
 
+unsigned int blkcg_get_use_hierarchy(struct blkio_cgroup *blkcg)
+{
+	return blkcg->use_hierarchy;
+}
+EXPORT_SYMBOL_GPL(blkcg_get_use_hierarchy);
+
 uint64_t blkcg_get_read_bps(struct blkio_cgroup *blkcg, dev_t dev)
 {
 	struct blkio_policy_node *pn;
@@ -1202,6 +1212,8 @@ static u64 blkiocg_file_read_u64 (struct cgroup *cgrp, struct cftype *cft) {
 		switch(name) {
 		case BLKIO_PROP_weight:
 			return (u64)blkcg->weight;
+		case BLKIO_PROP_use_hierarchy:
+			return (u64)blkcg->use_hierarchy;
 		}
 		break;
 	default:
@@ -1210,6 +1222,36 @@ static u64 blkiocg_file_read_u64 (struct cgroup *cgrp, struct cftype *cft) {
 	return 0;
 }
 
+static int blkio_use_hierarchy_write(struct cgroup *cgrp, u64 val)
+{
+	struct cgroup *parent = cgrp->parent;
+	struct blkio_cgroup *blkcg, *parent_blkcg = NULL;
+	int ret = 0;
+
+	if (val != 0 && val != 1)
+		return -EINVAL;
+
+	blkcg = cgroup_to_blkio_cgroup(cgrp);
+	if (parent)
+		parent_blkcg = cgroup_to_blkio_cgroup(parent);
+
+	cgroup_lock();
+	/*
+	 * If the parent's use_hierarchy is set, we can't make any
+	 * modifications in the child subtrees. If it is unset, the change
+	 * can occur, provided the current cgroup has no children.
+	 */
+	if (!parent_blkcg || !parent_blkcg->use_hierarchy) {
+		if (list_empty(&cgrp->children))
+			blkcg->use_hierarchy = val;
+		else
+			ret = -EBUSY;
+	} else
+		ret = -EINVAL;
+	cgroup_unlock();
+	return ret;
+}
+
 static int
 blkiocg_file_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
 {
@@ -1224,6 +1266,8 @@ blkiocg_file_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
 		switch(name) {
 		case BLKIO_PROP_weight:
 			return blkio_weight_write(blkcg, val);
+		case BLKIO_PROP_use_hierarchy:
+			return blkio_use_hierarchy_write(cgrp, val);
 		}
 		break;
 	default:
@@ -1301,6 +1345,13 @@ struct cftype blkio_files[] = {
 		.name = "reset_stats",
 		.write_u64 = blkiocg_reset_stats,
 	},
+	{
+		.name = "use_hierarchy",
+		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
+					     BLKIO_PROP_use_hierarchy),
+		.read_u64 = blkiocg_file_read_u64,
+		.write_u64 = blkiocg_file_write_u64,
+	},
 #ifdef CONFIG_BLK_DEV_THROTTLING
 	{
 		.name = "throttle.read_bps_device",
@@ -1444,7 +1495,7 @@ static void blkiocg_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup)
 static struct cgroup_subsys_state *
 blkiocg_create(struct cgroup_subsys *subsys, struct cgroup *cgroup)
 {
-	struct blkio_cgroup *blkcg;
+	struct blkio_cgroup *blkcg, *parent_blkcg = NULL;
 	struct cgroup *parent = cgroup->parent;
 
 	if (!parent) {
@@ -1452,6 +1503,7 @@ blkiocg_create(struct cgroup_subsys *subsys, struct cgroup *cgroup)
 		goto done;
 	}
 
+	parent_blkcg = cgroup_to_blkio_cgroup(parent);
 	blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
 	if (!blkcg)
 		return ERR_PTR(-ENOMEM);
@@ -1462,6 +1514,11 @@ done:
 	INIT_HLIST_HEAD(&blkcg->blkg_list);
 	INIT_LIST_HEAD(&blkcg->policy_list);
 
+	if (parent)
+		blkcg->use_hierarchy = parent_blkcg->use_hierarchy;
+	else
+		blkcg->use_hierarchy = 0;
+
 	return &blkcg->css;
 }
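The write-side rule above is the subtle part: a cgroup may flip
use_hierarchy only if no parent already enforces a hierarchy and it has no
children yet. A standalone model of just that check (simplified userspace
types, not kernel code):

#include <stdbool.h>
#include <stdio.h>

struct fake_blkcg {
	bool use_hierarchy;
	struct fake_blkcg *parent;
	int nr_children;
};

/* Returns 0 on success, -1 where the kernel returns -EINVAL/-EBUSY. */
static int set_use_hierarchy(struct fake_blkcg *cg, bool val)
{
	if (cg->parent && cg->parent->use_hierarchy)
		return -1;	/* parent owns the subtree: -EINVAL */
	if (cg->nr_children)
		return -1;	/* children already exist: -EBUSY */
	cg->use_hierarchy = val;
	return 0;
}

int main(void)
{
	struct fake_blkcg root = { .use_hierarchy = false };
	struct fake_blkcg child = { .parent = &root };

	root.nr_children = 1;
	printf("root:  %d\n", set_use_hierarchy(&root, true));  /* -1, has a child */
	printf("child: %d\n", set_use_hierarchy(&child, true)); /*  0, leaf under a flat parent */
	return 0;
}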
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index ea4861b..5b4b351 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -90,6 +90,7 @@ enum blkcg_file_name_prop {
 	BLKIO_PROP_idle_time,
 	BLKIO_PROP_empty_time,
 	BLKIO_PROP_dequeue,
+	BLKIO_PROP_use_hierarchy,
 };
 
 /* cgroup files owned by throttle policy */
@@ -105,6 +106,7 @@ enum blkcg_file_name_throtl {
 struct blkio_cgroup {
 	struct cgroup_subsys_state css;
 	unsigned int weight;
+	bool use_hierarchy;
 	spinlock_t lock;
 	struct hlist_head blkg_list;
 	struct list_head policy_list; /* list of blkio_policy_node */
@@ -179,6 +181,7 @@ struct blkio_policy_node {
 
 extern unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg,
 				     dev_t dev);
+extern unsigned int blkcg_get_use_hierarchy(struct blkio_cgroup *blkcg);
 extern uint64_t blkcg_get_read_bps(struct blkio_cgroup *blkcg,
 				   dev_t dev);
 extern uint64_t blkcg_get_write_bps(struct blkio_cgroup *blkcg,
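The cfq-iosched.c changes below lean on struct cfq_entity being embedded in
both cfq_queue and cfq_group, with is_group_entity deciding which container
to recover. A compact userspace sketch of that pattern (types cut down to
the essentials):

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct cfq_entity {
	bool is_group_entity;
	struct cfq_entity *parent;
};

struct cfq_queue { struct cfq_entity cfqe; int ref; };
struct cfq_group { struct cfq_entity cfqe; int nr_cfqq; };

/* Recover the owning group, or NULL if the entity belongs to a queue. */
static struct cfq_group *cfqg_of_entity(struct cfq_entity *cfqe)
{
	if (cfqe && cfqe->is_group_entity)
		return container_of(cfqe, struct cfq_group, cfqe);
	return NULL;
}

int main(void)
{
	struct cfq_group grp = { .cfqe = { .is_group_entity = true } };
	struct cfq_queue q = { .cfqe = { .parent = &grp.cfqe } };

	/* Walk from a queue's entity up to its owning group. */
	printf("group found: %d\n", cfqg_of_entity(q.cfqe.parent) == &grp);
	return 0;
}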
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index aa3eda8..0e21d27 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -110,6 +110,9 @@ struct cfq_entity {
 	u64 vdisktime;
 	bool is_group_entity;
 	unsigned int weight;
+	struct cfq_entity *parent;
+	/* Reposition time */
+	unsigned long reposition_time;
 };
 
 /*
@@ -118,8 +121,6 @@ struct cfq_entity {
 struct cfq_queue {
 	/* The schedule entity */
 	struct cfq_entity cfqe;
-	/* Reposition time */
-	unsigned long reposition_time;
 	/* reference count */
 	int ref;
 	/* various state flags, see below */
@@ -199,6 +200,9 @@ struct cfq_group {
 	/* number of cfqq currently on this group */
 	int nr_cfqq;
 
+	/* number of sub cfq groups */
+	int nr_subgp;
+
 	/*
 	 * Per group busy queus average. Useful for workload slice calc. We
 	 * create the array for each prio class but at run time it is used
@@ -234,10 +238,11 @@ struct cfq_group {
  */
struct cfq_data {
 	struct request_queue *queue;
-	/* Root service tree for cfq_groups */
-	struct cfq_rb_root grp_service_tree;
 	struct cfq_group root_group;
 
+	/* cfq group scheduling in flat or hierarchical manner. */
+	bool use_hierarchy;
+
 	/*
 	 * The priority currently being served
 	 */
@@ -246,6 +251,9 @@ struct cfq_data {
 	unsigned long workload_expires;
 	struct cfq_group *serving_group;
 
+	/* Service tree for cfq group flat scheduling mode. */
+	struct cfq_rb_root grp_service_tree;
+
 	/*
 	 * Each priority tree is sorted by next_request position. These
 	 * trees are used when determining if two or more queues are
@@ -355,8 +363,6 @@ cfqg_of_entity(struct cfq_entity *cfqe)
 }
 
-static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd);
-
 static struct cfq_rb_root *service_tree_for(struct cfq_group *cfqg,
 					    enum wl_prio_t prio,
 					    enum wl_type_t type)
@@ -643,13 +649,50 @@ static inline unsigned cfq_group_get_avg_queues(struct cfq_data *cfqd,
 	return cfqg->busy_queues_avg[rt];
 }
 
+static inline unsigned int
+cfq_group_get_total_weight(struct cfq_group *cfqg)
+{
+	int i, j;
+	struct cfq_rb_root *st;
+	unsigned int total_weight = 0;
+
+	for_each_cfqg_st(cfqg, i, j, st) {
+		total_weight += st->total_weight;
+	}
+
+	return total_weight;
+}
+
 static inline unsigned
 cfq_group_slice(struct cfq_data *cfqd, struct cfq_group *cfqg)
 {
-	struct cfq_rb_root *st = &cfqd->grp_service_tree;
 	struct cfq_entity *cfqe = &cfqg->cfqe;
+	struct cfq_rb_root *st;
+	int group_slice = cfq_target_latency;
+	unsigned int grp_total_weight;
+	struct cfq_group *p_cfqg;
+
+	/*
+	 * Calculate the group slice in a hierarchical way.
+	 * Note, the calculation is across all service trees under a group.
+	 */
+	do {
+		if (cfqe->parent) {
+			p_cfqg = cfqg_of_entity(cfqe->parent);
+			grp_total_weight = cfq_group_get_total_weight(p_cfqg);
+			group_slice = group_slice * cfqe->weight /
+					grp_total_weight;
+		} else {
+			/* For top level groups */
+			st = cfqe->service_tree;
+			group_slice = group_slice * cfqe->weight /
+					st->total_weight;
+		}
 
-	return cfq_target_latency * cfqe->weight / st->total_weight;
+		cfqe = cfqe->parent;
+	} while (cfqe);
+
+	return group_slice;
 }
 
 static inline void
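To make the loop above concrete, a worked example with made-up weights:
each level scales the slice by weight / total weight at that level, so a
group's slice is the product of its share at every level (the 300ms default
of cfq_target_latency is assumed):

#include <stdio.h>

#define CFQ_TARGET_LATENCY 300	/* ms */

struct level { unsigned int weight, total_weight; };

int main(void)
{
	/*
	 * Leaf-to-root chain: the leaf holds weight 500 of 1000 under its
	 * parent, the parent holds weight 1000 of 1500 at the top level.
	 */
	struct level chain[] = { { 500, 1000 }, { 1000, 1500 } };
	unsigned int slice = CFQ_TARGET_LATENCY;
	int i;

	for (i = 0; i < 2; i++)
		slice = slice * chain[i].weight / chain[i].total_weight;

	printf("group slice: %ums\n", slice);	/* 300 * 1/2 * 2/3 = 100ms */
	return 0;
}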
@@ -672,7 +715,8 @@ cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 			/* scale low_slice according to IO priority
 			 * and sync vs async */
 			unsigned low_slice =
-				min(slice, base_low_slice * slice / sync_slice);
+				min(slice, base_low_slice * slice /
+						sync_slice);
 			/* the adapted slice value is scaled to fit all iqs
 			 * into the target latency */
 			slice = max(slice * group_slice / expect_latency,
@@ -812,17 +856,6 @@ static struct cfq_entity *cfq_rb_first(struct cfq_rb_root *root)
 	return NULL;
 }
 
-static struct cfq_entity *cfq_rb_first_entity(struct cfq_rb_root *root)
-{
-	if (!root->left)
-		root->left = rb_first(&root->rb);
-
-	if (root->left)
-		return rb_entry_entity(root->left);
-
-	return NULL;
-}
-
 static void rb_erase_init(struct rb_node *n, struct rb_root *root)
 {
 	rb_erase(n, root);
@@ -896,12 +929,15 @@ __cfq_entity_service_tree_add(struct cfq_rb_root *st, struct cfq_entity *cfqe)
 
 	rb_link_node(&cfqe->rb_node, parent, node);
 	rb_insert_color(&cfqe->rb_node, &st->rb);
+
+	update_min_vdisktime(st);
 }
 
 static void
 cfq_entity_service_tree_add(struct cfq_rb_root *st, struct cfq_entity *cfqe)
 {
 	__cfq_entity_service_tree_add(st, cfqe);
+	cfqe->reposition_time = jiffies;
 	st->count++;
 	st->total_weight += cfqe->weight;
 }
@@ -909,34 +945,52 @@ cfq_entity_service_tree_add(struct cfq_rb_root *st, struct cfq_entity *cfqe)
 static void
 cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
 {
-	struct cfq_rb_root *st = &cfqd->grp_service_tree;
 	struct cfq_entity *cfqe = &cfqg->cfqe;
-	struct cfq_entity *__cfqe;
 	struct rb_node *n;
+	struct cfq_entity *entity;
+	struct cfq_rb_root *st;
+	struct cfq_group *__cfqg;
 
 	cfqg->nr_cfqq++;
+
 	if (!RB_EMPTY_NODE(&cfqe->rb_node))
 		return;
 
 	/*
-	 * Currently put the group at the end. Later implement something
-	 * so that groups get lesser vtime based on their weights, so that
-	 * if group does not loose all if it was not continously backlogged.
+	 * Enqueue this group and its ancestors onto their service trees.
 	 */
-	n = rb_last(&st->rb);
-	if (n) {
-		__cfqe = rb_entry_entity(n);
-		cfqe->vdisktime = __cfqe->vdisktime + CFQ_IDLE_DELAY;
-	} else
-		cfqe->vdisktime = st->min_vdisktime;
+	while (cfqe) {
+		if (!RB_EMPTY_NODE(&cfqe->rb_node))
+			return;
 
-	cfq_entity_service_tree_add(st, cfqe);
+		/*
+		 * Currently put the group at the end. Later implement
+		 * something so that groups get lesser vtime based on
+		 * their weights, so that a group does not lose everything
+		 * if it was not continuously backlogged.
+		 */
+		st = cfqe->service_tree;
+		n = rb_last(&st->rb);
+		if (n) {
+			entity = rb_entry_entity(n);
+			cfqe->vdisktime = entity->vdisktime +
+				CFQ_IDLE_DELAY;
+		} else
+			cfqe->vdisktime = st->min_vdisktime;
+
+		cfq_entity_service_tree_add(st, cfqe);
+		cfqe = cfqe->parent;
+		__cfqg = cfqg_of_entity(cfqe);
+		if (__cfqg)
+			__cfqg->nr_subgp++;
+	}
 }
 
 static void
 __cfq_entity_service_tree_del(struct cfq_rb_root *st, struct cfq_entity *cfqe)
 {
 	cfq_rb_erase(&cfqe->rb_node, st);
+	update_min_vdisktime(st);
 }
 
 static void
@@ -945,27 +999,43 @@ cfq_entity_service_tree_del(struct cfq_rb_root *st, struct cfq_entity *cfqe)
 	if (!RB_EMPTY_NODE(&cfqe->rb_node)) {
 		__cfq_entity_service_tree_del(st, cfqe);
 		st->total_weight -= cfqe->weight;
-		cfqe->service_tree = NULL;
 	}
 }
 
 static void
 cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg)
 {
-	struct cfq_rb_root *st = &cfqd->grp_service_tree;
 	struct cfq_entity *cfqe = &cfqg->cfqe;
+	struct cfq_group *__cfqg, *p_cfqg;
 
 	BUG_ON(cfqg->nr_cfqq < 1);
 	cfqg->nr_cfqq--;
 
-	/* If there are other cfq queues under this group, don't delete it */
-	if (cfqg->nr_cfqq)
+	/*
+	 * If there are other cfq queues under this group, or there are other
+	 * cfq groups under this group, don't delete it.
+	 */
+	if (cfqg->nr_cfqq || cfqg->nr_subgp)
 		return;
 
-	cfq_log_cfqg(cfqd, cfqg, "del_from_rr group");
-	cfq_entity_service_tree_del(st, cfqe);
-	cfqg->saved_workload_slice = 0;
-	cfq_blkiocg_update_dequeue_stats(&cfqg->blkg, 1);
+	/*
+	 * Dequeue this group and its ancestors from their service
+	 * trees.
+	 */
+	while (cfqe) {
+		__cfqg = cfqg_of_entity(cfqe);
+		p_cfqg = cfqg_of_entity(cfqe->parent);
+		cfq_entity_service_tree_del(cfqe->service_tree, cfqe);
+		cfq_blkiocg_update_dequeue_stats(&__cfqg->blkg, 1);
+		cfq_log_cfqg(cfqd, __cfqg, "del_from_rr group");
+		__cfqg->saved_workload_slice = 0;
+		cfqe = cfqe->parent;
+		if (p_cfqg) {
+			p_cfqg->nr_subgp--;
+			if (p_cfqg->nr_cfqq || p_cfqg->nr_subgp)
+				return;
+		}
+	}
 }
 
 static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq)
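A simplified model of the bookkeeping the add/del paths above maintain: a
group may leave its service tree only when it has neither busy queues
(nr_cfqq) nor busy child groups (nr_subgp), and emptiness propagates upward
(userspace sketch, not kernel code):

#include <stdbool.h>
#include <stdio.h>

struct group {
	int nr_cfqq;		/* busy queues directly in this group */
	int nr_subgp;		/* busy child groups */
	struct group *parent;
};

static bool group_busy(struct group *g)
{
	return g->nr_cfqq || g->nr_subgp;
}

/* One queue goes idle: dequeue every ancestor that became empty. */
static void del_queue(struct group *g)
{
	g->nr_cfqq--;
	while (g && !group_busy(g)) {
		printf("dequeue group %p\n", (void *)g);
		if (g->parent)
			g->parent->nr_subgp--;
		g = g->parent;
	}
}

int main(void)
{
	struct group root = { .nr_subgp = 1 };
	struct group leaf = { .nr_cfqq = 1, .parent = &root };

	del_queue(&leaf);	/* dequeues leaf, then the now-idle root */
	return 0;
}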
@@ -997,7 +1067,6 @@ static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq)
 static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
 				struct cfq_queue *cfqq)
 {
-	struct cfq_rb_root *st = &cfqd->grp_service_tree;
 	unsigned int used_sl, charge;
 	int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg)
 			- cfqg->service_tree_idle.count;
@@ -1011,10 +1080,23 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
 	else if (!cfq_cfqq_sync(cfqq) && !nr_sync)
 		charge = cfqq->allocated_slice;
 
-	/* Can't update vdisktime while group is on service tree */
-	__cfq_entity_service_tree_del(st, cfqe);
-	cfqe->vdisktime += cfq_scale_slice(charge, cfqe);
-	__cfq_entity_service_tree_add(st, cfqe);
+	/*
+	 * Update the vdisktime on the whole chain.
+	 */
+	while (cfqe) {
+		struct cfq_rb_root *st = cfqe->service_tree;
+
+		/*
+		 * Can't update vdisktime while group is on service
+		 * tree.
+		 */
+		__cfq_entity_service_tree_del(st, cfqe);
+		cfqe->vdisktime += cfq_scale_slice(charge, cfqe);
+		__cfq_entity_service_tree_add(st, cfqe);
+		st->count++;
+		cfqe->reposition_time = jiffies;
+		cfqe = cfqe->parent;
+	}
 
 	/* This group is being expired. Save the context */
 	if (time_after(cfqd->workload_expires, jiffies)) {
@@ -1026,7 +1108,8 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
 		cfqg->saved_workload_slice = 0;
 
 	cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu",
-			cfqe->vdisktime, st->min_vdisktime);
+			cfqg->cfqe.vdisktime,
+			cfqg->cfqe.service_tree->min_vdisktime);
 	cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u disp=%u charge=%u iops=%u"
 			" sect=%u", used_sl, cfqq->slice_dispatch, charge,
 			iops_mode(cfqd), cfqq->nr_sectors);
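The per-level charge goes through cfq_scale_slice(), which in mainline CFQ
scales the used time by BLKIO_WEIGHT_DEFAULT / weight (the fixed-point
shift is omitted here), so heavier entities accrue vdisktime more slowly
and win the tree again sooner. Illustrative numbers only:

#include <stdio.h>

#define BLKIO_WEIGHT_DEFAULT 500

static unsigned long long scale_slice(unsigned long charge, unsigned int weight)
{
	return (unsigned long long)charge * BLKIO_WEIGHT_DEFAULT / weight;
}

int main(void)
{
	/* The same 20ms charge advances vdisktime differently per weight. */
	printf("weight  250: +%llu\n", scale_slice(20, 250));	/* 40 */
	printf("weight 1000: +%llu\n", scale_slice(20, 1000));	/* 10 */
	return 0;
}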
@@ -1048,35 +1131,27 @@ void cfq_update_blkio_group_weight(void *key, struct blkio_group *blkg,
 	cfqg_of_blkg(blkg)->cfqe.weight = weight;
 }
 
-static struct cfq_group *
-cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
+static void init_cfqe(struct blkio_cgroup *blkcg,
+		      struct cfq_group *cfqg)
+{
+	struct cfq_entity *cfqe = &cfqg->cfqe;
+
+	cfqe->weight = blkcg_get_weight(blkcg, cfqg->blkg.dev);
+	RB_CLEAR_NODE(&cfqe->rb_node);
+	cfqe->is_group_entity = true;
+	cfqe->parent = NULL;
+}
+
+static void init_cfqg(struct cfq_data *cfqd, struct blkio_cgroup *blkcg,
+		      struct cfq_group *cfqg)
 {
-	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
-	struct cfq_group *cfqg = NULL;
-	void *key = cfqd;
 	int i, j;
 	struct cfq_rb_root *st;
-	struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
 	unsigned int major, minor;
-
-	cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
-	if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
-		sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
-		cfqg->blkg.dev = MKDEV(major, minor);
-		goto done;
-	}
-	if (cfqg || !create)
-		goto done;
-
-	cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC, cfqd->queue->node);
-	if (!cfqg)
-		goto done;
+	struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
 
 	for_each_cfqg_st(cfqg, i, j, st)
 		*st = CFQ_RB_ROOT;
-	RB_CLEAR_NODE(&cfqg->cfqe.rb_node);
-
-	cfqg->cfqe.is_group_entity = true;
 
 	/*
 	 * Take the initial reference that will be released on destroy
@@ -1086,24 +1161,199 @@ cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
 	 */
 	cfqg->ref = 1;
 
+	/* Add group onto cgroup list */
+	sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
+	cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd,
+				    MKDEV(major, minor));
+	/* Initialize the group entity */
+	init_cfqe(blkcg, cfqg);
+
+	/* Add group on cfqd list */
+	hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list);
+}
+
+static void cfq_destroy_cfqg(struct cfq_data *cfqd, struct cfq_group *cfqg);
+
+static void uninit_cfqg(struct cfq_data *cfqd, struct cfq_group *cfqg)
+{
+	if (!cfq_blkiocg_del_blkio_group(&cfqg->blkg))
+		cfq_destroy_cfqg(cfqd, cfqg);
+}
+
+static void cfqg_set_parent(struct cfq_data *cfqd, struct cfq_group *cfqg,
+			    struct cfq_group *p_cfqg)
+{
+	struct cfq_entity *cfqe, *p_cfqe;
+
+	cfqe = &cfqg->cfqe;
+
 	/*
-	 * Add group onto cgroup list. It might happen that bdi->dev is
-	 * not initiliazed yet. Initialize this new group without major
-	 * and minor info and this info will be filled in once a new thread
-	 * comes for IO. See code above.
+	 * 1. If use_hierarchy of the CGroup that cfqg's parent belongs to is
+	 *    not set, we put this cfqg onto the global service tree.
+	 * 2. If cfqg is the root cfqg, put it onto the global service tree.
 	 */
-	if (bdi->dev) {
-		sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
-		cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd,
-					MKDEV(major, minor));
-	} else
-		cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd,
-					0);
+	if (!p_cfqg) {
+		cfqe->service_tree = &cfqd->grp_service_tree;
+		cfqe->parent = NULL;
+		return;
+	}
 
-	cfqg->cfqe.weight = blkcg_get_weight(blkcg, cfqg->blkg.dev);
+	p_cfqe = &p_cfqg->cfqe;
 
-	/* Add group on cfqd list */
-	hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list);
+	cfqe->parent = p_cfqe;
+
+	/*
+	 * Currently, just put the cfq group entity on the "BE:SYNC" workload
+	 * service tree.
+	 */
+	cfqe->service_tree = service_tree_for(p_cfqg, BE_WORKLOAD,
+					      SYNC_WORKLOAD);
+	/* child reference */
+	p_cfqg->ref++;
+}
+
+static struct cfq_group *cfqg_get_parent(struct cfq_group *cfqg)
+{
+	struct cfq_entity *cfqe, *p_cfqe;
+
+	if (!cfqg)
+		return NULL;
+
+	cfqe = &cfqg->cfqe;
+	p_cfqe = cfqe->parent;
+	if (!p_cfqe)
+		return NULL;
+
+	return cfqg_of_entity(p_cfqe);
+}
+
+static struct cfq_group *
+cfqg_chain_alloc(struct cfq_data *cfqd, struct cgroup *cgroup)
+{
+	struct blkio_cgroup *blkcg;
+	struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
+	unsigned int major, minor;
+	struct cfq_group *cfqg, *leaf_cfqg, *child_cfqg, *tmp_cfqg;
+	void *key = cfqd;
+
+	/*
+	 * If the CGroup's use_hierarchy is unset, we need to allocate only
+	 * one CFQ group, and this group will be put onto the
+	 * "grp_service_tree". We don't need to check whether the cfqg
+	 * exists; the caller has already checked it.
+	 */
+	blkcg = cgroup_to_blkio_cgroup(cgroup);
+	if (!blkcg_get_use_hierarchy(blkcg)) {
+		cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC,
+				    cfqd->queue->node);
+		if (!cfqg)
+			return NULL;
+
+		init_cfqg(cfqd, blkcg, cfqg);
+		cfqg_set_parent(cfqd, cfqg, NULL);
+		return cfqg;
+	}
+
+	/*
+	 * Allocate the CFQ group chain until we meet a group we've already
+	 * allocated before, or reach a CGroup whose use_hierarchy is not
+	 * set.
+	 */
+	leaf_cfqg = NULL;
+	child_cfqg = NULL;
+	for (; cgroup != NULL; cgroup = cgroup->parent) {
+		blkcg = cgroup_to_blkio_cgroup(cgroup);
+		cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
+		if (cfqg) {
+			if (!cfqg->blkg.dev && bdi->dev &&
+			    dev_name(bdi->dev)) {
+				sscanf(dev_name(bdi->dev), "%u:%u",
+				       &major, &minor);
+				cfqg->blkg.dev = MKDEV(major, minor);
+			}
+
+			/*
+			 * Initialization of the parent hasn't finished yet;
+			 * get it done.
+			 */
+			if (child_cfqg) {
+				if (blkcg_get_use_hierarchy(blkcg))
+					cfqg_set_parent(cfqd, child_cfqg,
+							cfqg);
+				else
+					cfqg_set_parent(cfqd, child_cfqg,
+							NULL);
+			}
+
+			/* chain has already been built */
+			break;
+		}
+
+		/*
+		 * We only allocate a cfqg if the corresponding cgroup's
+		 * use_hierarchy is set.
+		 */
+		if (blkcg_get_use_hierarchy(blkcg)) {
+			cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC,
+					    cfqd->queue->node);
+			if (!cfqg)
+				goto clean_up;
+
+			if (!leaf_cfqg)
+				leaf_cfqg = cfqg;
+
+			init_cfqg(cfqd, blkcg, cfqg);
+		} else {
+			cfqg = NULL;
+		}
+
+		if (child_cfqg)
+			cfqg_set_parent(cfqd, child_cfqg, cfqg);
+
+		/*
+		 * This CGroup's use_hierarchy isn't set; this means the CFQ
+		 * group chain has been built.
+		 */
+		if (!blkcg_get_use_hierarchy(blkcg))
+			break;
+
+		child_cfqg = cfqg;
+	}
+
+	return leaf_cfqg;
+
+clean_up:
+	/* Clean up the allocated cfq groups. */
+	while (leaf_cfqg) {
+		tmp_cfqg = leaf_cfqg;
+		leaf_cfqg = cfqg_get_parent(leaf_cfqg);
+		uninit_cfqg(cfqd, tmp_cfqg);
+	}
+
+	return NULL;
+}
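cfqg_chain_alloc() builds the chain leaf-upward and, if an allocation fails
partway, unwinds the partial chain through the parent pointers. The shape
of that pattern reduced to a userspace sketch (hypothetical names, no cfq
types):

#include <stdio.h>
#include <stdlib.h>

struct node { struct node *parent; };

static struct node *chain_alloc(int depth)
{
	struct node *leaf = NULL, *child = NULL, *n;

	while (depth--) {
		n = calloc(1, sizeof(*n));
		if (!n)
			goto clean_up;
		if (!leaf)
			leaf = n;		/* first allocation is the leaf */
		if (child)
			child->parent = n;	/* link the previous level up */
		child = n;
	}
	return leaf;

clean_up:
	/* Walk from the leaf up, freeing whatever was already allocated. */
	while (leaf) {
		n = leaf;
		leaf = leaf->parent;
		free(n);
	}
	return NULL;
}

int main(void)
{
	struct node *leaf = chain_alloc(3), *n;

	printf("allocated: %s\n", leaf ? "yes" : "no");
	while (leaf) {			/* free the chain the same way */
		n = leaf;
		leaf = leaf->parent;
		free(n);
	}
	return 0;
}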
+static struct cfq_group *
+cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
+{
+	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
+	struct cfq_group *cfqg = NULL;
+	void *key = cfqd;
+	struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
+	unsigned int major, minor;
+
+	cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
+	if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
+		sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
+		cfqg->blkg.dev = MKDEV(major, minor);
+		goto done;
+	}
+	if (cfqg || !create)
+		goto done;
+
+	/*
+	 * Allocate a CFQ group chain up to the root group, or until we meet
+	 * a CGroup with use_hierarchy disabled.
+	 */
+	cfqg = cfqg_chain_alloc(cfqd, cgroup);
 
 done:
 	return cfqg;
@@ -1148,6 +1398,7 @@ static void cfq_put_cfqg(struct cfq_group *cfqg)
 {
 	struct cfq_rb_root *st;
 	int i, j;
+	struct cfq_group *p_cfqg;
 
 	BUG_ON(cfqg->ref <= 0);
 	cfqg->ref--;
@@ -1155,6 +1406,22 @@ static void cfq_put_cfqg(struct cfq_group *cfqg)
 		return;
 	for_each_cfqg_st(cfqg, i, j, st)
 		BUG_ON(!RB_EMPTY_ROOT(&st->rb));
+
+	do {
+		p_cfqg = cfqg_get_parent(cfqg);
+		kfree(cfqg);
+		cfqg = NULL;
+		/*
+		 * Drop the reference taken by the child. If nobody else
+		 * references the parent group, we need to delete the
+		 * parent as well.
+		 */
+		if (p_cfqg) {
+			p_cfqg->ref--;
+			if (p_cfqg->ref == 0)
+				cfqg = p_cfqg;
+		}
+	} while (cfqg);
+
 	kfree(cfqg);
 }
 
@@ -1321,9 +1588,6 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 		 * ioprio.
 		 */
 		pos_offset = cfq_get_boost(cfqd, cfqq);
-		/* Debug purpose, should remove. */
-		cfq_log_cfqq(cfqd, cfqq, "pos_offset: %llu",
-				pos_offset);
 		cfqe->vdisktime = service_tree->min_vdisktime +
 					pos_offset;
 	} else
@@ -1365,9 +1629,8 @@ insert:
 	cfqe->service_tree = service_tree;
 
 	/* Add cfqq onto service tree. */
+	cfq_entity_service_tree_add(service_tree, cfqe);
-	update_min_vdisktime(service_tree);
-	cfqq->reposition_time = jiffies;
 	if ((add_front || !new_cfqq) && !group_changed)
 		return;
 	cfq_group_service_tree_add(cfqd, cfqq->cfqg);
 }
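Since each child group holds a reference on its parent (taken in
cfqg_set_parent()), dropping the last reference on a leaf can cascade frees
up the chain. A minimal model of cfq_put_cfqg()'s loop (userspace sketch):

#include <stdio.h>
#include <stdlib.h>

struct group { int ref; struct group *parent; };

static void put_group(struct group *g)
{
	while (g && --g->ref == 0) {
		struct group *p = g->parent;

		printf("freeing group %p\n", (void *)g);
		free(g);
		g = p;	/* the freed child held one reference on p */
	}
}

int main(void)
{
	struct group *parent = calloc(1, sizeof(*parent));
	struct group *child = calloc(1, sizeof(*child));

	if (!parent || !child)
		return 1;
	parent->ref = 1;	/* the child's reference */
	child->ref = 1;
	child->parent = parent;

	put_group(child);	/* frees the child, then the orphaned parent */
	return 0;
}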
@@ -1810,28 +2073,43 @@ static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd)
 	return cfqq_of_entity(cfq_rb_first(service_tree));
 }
 
-static struct cfq_queue *cfq_get_next_queue_forced(struct cfq_data *cfqd)
+struct cfq_rb_root *choose_service_tree_forced(struct cfq_group *cfqg)
 {
-	struct cfq_group *cfqg;
-	struct cfq_entity *cfqe;
 	int i, j;
 	struct cfq_rb_root *st;
 
-	if (!cfqd->rq_queued)
-		return NULL;
+	for_each_cfqg_st(cfqg, i, j, st) {
+		if (st->count != 0)
+			return st;
+	}
 
-	cfqg = cfq_get_next_cfqg(cfqd);
-	if (!cfqg)
+	return NULL;
+}
+
+static struct cfq_entity *
+cfq_get_next_entity_forced(struct cfq_data *cfqd)
+{
+	struct cfq_entity *cfqe;
+	struct cfq_rb_root *st = &cfqd->grp_service_tree;
+	struct cfq_group *cfqg;
+
+	if (!cfqd->rq_queued)
 		return NULL;
 
-	for_each_cfqg_st(cfqg, i, j, st) {
+	do {
 		cfqe = cfq_rb_first(st);
-		if (cfqe != NULL)
-			return cfqq_of_entity(cfqe);
-	}
+		if (cfqe && !cfqe->is_group_entity)
+			return cfqe;
+		else if (cfqe && cfqe->is_group_entity)
+			cfqg = cfqg_of_entity(cfqe);
+
+		st = choose_service_tree_forced(cfqg);
+	} while (st);
+
 	return NULL;
 }
 
+
 /*
  * Get and set a new active qu