2011-02-10 07:48:01

by Gui, Jianfeng/归 剑峰

[permalink] [raw]
Subject: [PATCH 5/6 v4] cfq-iosched: CFQ group hierarchical scheduling and use_hierarchy interface

CFQ group hierarchical scheduling and use_hierarchy interface.

Signed-off-by: Gui Jianfeng <[email protected]>
---
block/blk-cgroup.c | 61 +++++-
block/blk-cgroup.h | 3 +
block/cfq-iosched.c | 603 +++++++++++++++++++++++++++++++++++++--------------
3 files changed, 500 insertions(+), 167 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 455768a..c55fecd 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -25,7 +25,10 @@
static DEFINE_SPINLOCK(blkio_list_lock);
static LIST_HEAD(blkio_list);

-struct blkio_cgroup blkio_root_cgroup = { .weight = 2*BLKIO_WEIGHT_DEFAULT };
+struct blkio_cgroup blkio_root_cgroup = {
+ .weight = 2*BLKIO_WEIGHT_DEFAULT,
+ .use_hierarchy = 0
+};
EXPORT_SYMBOL_GPL(blkio_root_cgroup);

static struct cgroup_subsys_state *blkiocg_create(struct cgroup_subsys *,
@@ -454,6 +457,7 @@ static void __blkiocg_del_blkio_group(struct blkio_group *blkg)
blkg->blkcg_id = 0;
}

+
/*
* returns 0 if blkio_group was still on cgroup list. Otherwise returns 1
* indicating that blk_group was unhashed by the time we got to it.
@@ -765,6 +769,12 @@ unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg,
}
EXPORT_SYMBOL_GPL(blkcg_get_weight);

+unsigned int blkcg_get_use_hierarchy(struct blkio_cgroup *blkcg)
+{
+ return blkcg->use_hierarchy;
+}
+EXPORT_SYMBOL_GPL(blkcg_get_use_hierarchy);
+
uint64_t blkcg_get_read_bps(struct blkio_cgroup *blkcg, dev_t dev)
{
struct blkio_policy_node *pn;
@@ -1202,6 +1212,8 @@ static u64 blkiocg_file_read_u64 (struct cgroup *cgrp, struct cftype *cft) {
switch(name) {
case BLKIO_PROP_weight:
return (u64)blkcg->weight;
+ case BLKIO_PROP_use_hierarchy:
+ return (u64)blkcg->use_hierarchy;
}
break;
default:
@@ -1210,6 +1222,36 @@ static u64 blkiocg_file_read_u64 (struct cgroup *cgrp, struct cftype *cft) {
return 0;
}

+static int blkio_use_hierarchy_write(struct cgroup *cgrp, u64 val)
+{
+ struct cgroup *parent = cgrp->parent;
+ struct blkio_cgroup *blkcg, *parent_blkcg = NULL;
+ int ret = 0;
+
+ if (val != 0 && val != 1)
+ return -EINVAL;
+
+ blkcg = cgroup_to_blkio_cgroup(cgrp);
+ if (parent)
+ parent_blkcg = cgroup_to_blkio_cgroup(parent);
+
+ cgroup_lock();
+ /*
+ * If parent's use_hierarchy is set, we can't make any modifications
+ * in the child subtrees. If it is unset, then the change can occur,
+ * provided the current cgroup has no children.
+ */
+ if (!parent_blkcg || !parent_blkcg->use_hierarchy) {
+ if (list_empty(&cgrp->children))
+ blkcg->use_hierarchy = val;
+ else
+ ret = -EBUSY;
+ } else
+ ret = -EINVAL;
+ cgroup_unlock();
+ return ret;
+}
+
static int
blkiocg_file_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
{
@@ -1224,6 +1266,8 @@ blkiocg_file_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
switch(name) {
case BLKIO_PROP_weight:
return blkio_weight_write(blkcg, val);
+ case BLKIO_PROP_use_hierarchy:
+ return blkio_use_hierarchy_write(cgrp, val);
}
break;
default:
@@ -1301,6 +1345,13 @@ struct cftype blkio_files[] = {
.name = "reset_stats",
.write_u64 = blkiocg_reset_stats,
},
+ {
+ .name = "use_hierarchy",
+ .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
+ BLKIO_PROP_use_hierarchy),
+ .read_u64 = blkiocg_file_read_u64,
+ .write_u64 = blkiocg_file_write_u64,
+ },
#ifdef CONFIG_BLK_DEV_THROTTLING
{
.name = "throttle.read_bps_device",
@@ -1444,7 +1495,7 @@ static void blkiocg_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup)
static struct cgroup_subsys_state *
blkiocg_create(struct cgroup_subsys *subsys, struct cgroup *cgroup)
{
- struct blkio_cgroup *blkcg;
+ struct blkio_cgroup *blkcg, *parent_blkcg = NULL;
struct cgroup *parent = cgroup->parent;

if (!parent) {
@@ -1452,6 +1503,7 @@ blkiocg_create(struct cgroup_subsys *subsys, struct cgroup *cgroup)
goto done;
}

+ parent_blkcg = cgroup_to_blkio_cgroup(parent);
blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
if (!blkcg)
return ERR_PTR(-ENOMEM);
@@ -1462,6 +1514,11 @@ done:
INIT_HLIST_HEAD(&blkcg->blkg_list);

INIT_LIST_HEAD(&blkcg->policy_list);
+ if (parent)
+ blkcg->use_hierarchy = parent_blkcg->use_hierarchy;
+ else
+ blkcg->use_hierarchy = 0;
+
return &blkcg->css;
}

diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index ea4861b..5b4b351 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -90,6 +90,7 @@ enum blkcg_file_name_prop {
BLKIO_PROP_idle_time,
BLKIO_PROP_empty_time,
BLKIO_PROP_dequeue,
+ BLKIO_PROP_use_hierarchy,
};

/* cgroup files owned by throttle policy */
@@ -105,6 +106,7 @@ enum blkcg_file_name_throtl {
struct blkio_cgroup {
struct cgroup_subsys_state css;
unsigned int weight;
+ bool use_hierarchy;
spinlock_t lock;
struct hlist_head blkg_list;
struct list_head policy_list; /* list of blkio_policy_node */
@@ -179,6 +181,7 @@ struct blkio_policy_node {

extern unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg,
dev_t dev);
+extern unsigned int blkcg_get_use_hierarchy(struct blkio_cgroup *blkcg);
extern uint64_t blkcg_get_read_bps(struct blkio_cgroup *blkcg,
dev_t dev);
extern uint64_t blkcg_get_write_bps(struct blkio_cgroup *blkcg,
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index aa3eda8..0e21d27 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -110,6 +110,9 @@ struct cfq_entity {
u64 vdisktime;
bool is_group_entity;
unsigned int weight;
+ struct cfq_entity *parent;
+ /* Reposition time */
+ unsigned long reposition_time;
};

/*
@@ -118,8 +121,6 @@ struct cfq_entity {
struct cfq_queue {
/* The schedule entity */
struct cfq_entity cfqe;
- /* Reposition time */
- unsigned long reposition_time;
/* reference count */
int ref;
/* various state flags, see below */
@@ -199,6 +200,9 @@ struct cfq_group {
/* number of cfqq currently on this group */
int nr_cfqq;

+ /* number of sub cfq groups */
+ int nr_subgp;
+
/*
* Per group busy queus average. Useful for workload slice calc. We
* create the array for each prio class but at run time it is used
@@ -234,10 +238,11 @@ struct cfq_group {
*/
struct cfq_data {
struct request_queue *queue;
- /* Root service tree for cfq_groups */
- struct cfq_rb_root grp_service_tree;
struct cfq_group root_group;

+ /* cfq group schedule in flat or hierarchy manner. */
+ bool use_hierarchy;
+
/*
* The priority currently being served
*/
@@ -246,6 +251,9 @@ struct cfq_data {
unsigned long workload_expires;
struct cfq_group *serving_group;

+ /* Service tree for cfq group flat scheduling mode. */
+ struct cfq_rb_root grp_service_tree;
+
/*
* Each priority tree is sorted by next_request position. These
* trees are used when determining if two or more queues are
@@ -355,8 +363,6 @@ cfqg_of_entity(struct cfq_entity *cfqe)
}


-static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd);
-
static struct cfq_rb_root *service_tree_for(struct cfq_group *cfqg,
enum wl_prio_t prio,
enum wl_type_t type)
@@ -643,13 +649,50 @@ static inline unsigned cfq_group_get_avg_queues(struct cfq_data *cfqd,
return cfqg->busy_queues_avg[rt];
}

+static inline unsigned int
+cfq_group_get_total_weight(struct cfq_group *cfqg)
+{
+ int i, j;
+ struct cfq_rb_root *st;
+ unsigned int total_weight = 0;
+
+ for_each_cfqg_st(cfqg, i, j, st) {
+ total_weight += st->total_weight;
+ }
+
+ return total_weight;
+}
+
static inline unsigned
cfq_group_slice(struct cfq_data *cfqd, struct cfq_group *cfqg)
{
- struct cfq_rb_root *st = &cfqd->grp_service_tree;
struct cfq_entity *cfqe = &cfqg->cfqe;
+ struct cfq_rb_root *st;
+ int group_slice = cfq_target_latency;
+ unsigned int grp_total_weight;
+ struct cfq_group *p_cfqg;
+
+ /*
+ * Calculate group slice in a hierarchical way.
+ * Note, the calculation is cross all service trees under a group.
+ */
+ do {
+ if (cfqe->parent) {
+ p_cfqg = cfqg_of_entity(cfqe->parent);
+ grp_total_weight = cfq_group_get_total_weight(p_cfqg);
+ group_slice = group_slice * cfqe->weight /
+ grp_total_weight;
+ } else {
+ /* For top level groups */
+ st = cfqe->service_tree;
+ group_slice = group_slice * cfqe->weight /
+ st->total_weight;
+ }

- return cfq_target_latency * cfqe->weight / st->total_weight;
+ cfqe = cfqe->parent;
+ } while (cfqe);
+
+ return group_slice;
}

static inline void
@@ -672,7 +715,8 @@ cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
/* scale low_slice according to IO priority
* and sync vs async */
unsigned low_slice =
- min(slice, base_low_slice * slice / sync_slice);
+ min(slice, base_low_slice * slice /
+ sync_slice);
/* the adapted slice value is scaled to fit all iqs
* into the target latency */
slice = max(slice * group_slice / expect_latency,
@@ -812,17 +856,6 @@ static struct cfq_entity *cfq_rb_first(struct cfq_rb_root *root)
return NULL;
}

-static struct cfq_entity *cfq_rb_first_entity(struct cfq_rb_root *root)
-{
- if (!root->left)
- root->left = rb_first(&root->rb);
-
- if (root->left)
- return rb_entry_entity(root->left);
-
- return NULL;
-}
-
static void rb_erase_init(struct rb_node *n, struct rb_root *root)
{
rb_erase(n, root);
@@ -896,12 +929,15 @@ __cfq_entity_service_tree_add(struct cfq_rb_root *st, struct cfq_entity *cfqe)

rb_link_node(&cfqe->rb_node, parent, node);
rb_insert_color(&cfqe->rb_node, &st->rb);
+
+ update_min_vdisktime(st);
}

static void
cfq_entity_service_tree_add(struct cfq_rb_root *st, struct cfq_entity *cfqe)
{
__cfq_entity_service_tree_add(st, cfqe);
+ cfqe->reposition_time = jiffies;
st->count++;
st->total_weight += cfqe->weight;
}
@@ -909,34 +945,52 @@ cfq_entity_service_tree_add(struct cfq_rb_root *st, struct cfq_entity *cfqe)
static void
cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
{
- struct cfq_rb_root *st = &cfqd->grp_service_tree;
struct cfq_entity *cfqe = &cfqg->cfqe;
- struct cfq_entity *__cfqe;
struct rb_node *n;
+ struct cfq_entity *entity;
+ struct cfq_rb_root *st;
+ struct cfq_group *__cfqg;

cfqg->nr_cfqq++;
+
if (!RB_EMPTY_NODE(&cfqe->rb_node))
return;

/*
- * Currently put the group at the end. Later implement something
- * so that groups get lesser vtime based on their weights, so that
- * if group does not loose all if it was not continously backlogged.
+ * Enqueue this group and its ancestors onto their service tree.
*/
- n = rb_last(&st->rb);
- if (n) {
- __cfqe = rb_entry_entity(n);
- cfqe->vdisktime = __cfqe->vdisktime + CFQ_IDLE_DELAY;
- } else
- cfqe->vdisktime = st->min_vdisktime;
+ while (cfqe) {
+ if (!RB_EMPTY_NODE(&cfqe->rb_node))
+ return;

- cfq_entity_service_tree_add(st, cfqe);
+ /*
+ * Currently put the group at the end. Later implement
+ * something so that groups get lesser vtime based on
+ * their weights, so that if group does not loose all
+ * if it was not continously backlogged.
+ */
+ st = cfqe->service_tree;
+ n = rb_last(&st->rb);
+ if (n) {
+ entity = rb_entry_entity(n);
+ cfqe->vdisktime = entity->vdisktime +
+ CFQ_IDLE_DELAY;
+ } else
+ cfqe->vdisktime = st->min_vdisktime;
+
+ cfq_entity_service_tree_add(st, cfqe);
+ cfqe = cfqe->parent;
+ __cfqg = cfqg_of_entity(cfqe);
+ if (__cfqg)
+ __cfqg->nr_subgp++;
+ }
}

static void
__cfq_entity_service_tree_del(struct cfq_rb_root *st, struct cfq_entity *cfqe)
{
cfq_rb_erase(&cfqe->rb_node, st);
+ update_min_vdisktime(st);
}

static void
@@ -945,27 +999,43 @@ cfq_entity_service_tree_del(struct cfq_rb_root *st, struct cfq_entity *cfqe)
if (!RB_EMPTY_NODE(&cfqe->rb_node)) {
__cfq_entity_service_tree_del(st, cfqe);
st->total_weight -= cfqe->weight;
- cfqe->service_tree = NULL;
}
}

static void
cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg)
{
- struct cfq_rb_root *st = &cfqd->grp_service_tree;
struct cfq_entity *cfqe = &cfqg->cfqe;
+ struct cfq_group *__cfqg, *p_cfqg;

BUG_ON(cfqg->nr_cfqq < 1);
cfqg->nr_cfqq--;

- /* If there are other cfq queues under this group, don't delete it */
- if (cfqg->nr_cfqq)
+ /*
+ * If there are other cfq queues under this group, or there are other
+ * cfq groups under this group, don't delete it.
+ */
+ if (cfqg->nr_cfqq || cfqg->nr_subgp)
return;

- cfq_log_cfqg(cfqd, cfqg, "del_from_rr group");
- cfq_entity_service_tree_del(st, cfqe);
- cfqg->saved_workload_slice = 0;
- cfq_blkiocg_update_dequeue_stats(&cfqg->blkg, 1);
+ /*
+ * Dequeue this group and its ancestors from their service
+ * tree.
+ */
+ while (cfqe) {
+ __cfqg = cfqg_of_entity(cfqe);
+ p_cfqg = cfqg_of_entity(cfqe->parent);
+ cfq_entity_service_tree_del(cfqe->service_tree, cfqe);
+ cfq_blkiocg_update_dequeue_stats(&__cfqg->blkg, 1);
+ cfq_log_cfqg(cfqd, __cfqg, "del_from_rr group");
+ __cfqg->saved_workload_slice = 0;
+ cfqe = cfqe->parent;
+ if (p_cfqg) {
+ p_cfqg->nr_subgp--;
+ if (p_cfqg->nr_cfqq || p_cfqg->nr_subgp)
+ return;
+ }
+ }
}

static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq)
@@ -997,7 +1067,6 @@ static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq)
static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
struct cfq_queue *cfqq)
{
- struct cfq_rb_root *st = &cfqd->grp_service_tree;
unsigned int used_sl, charge;
int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg)
- cfqg->service_tree_idle.count;
@@ -1011,10 +1080,23 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
else if (!cfq_cfqq_sync(cfqq) && !nr_sync)
charge = cfqq->allocated_slice;

- /* Can't update vdisktime while group is on service tree */
- __cfq_entity_service_tree_del(st, cfqe);
- cfqe->vdisktime += cfq_scale_slice(charge, cfqe);
- __cfq_entity_service_tree_add(st, cfqe);
+ /*
+ * Update the vdisktime on the whole chain.
+ */
+ while (cfqe) {
+ struct cfq_rb_root *st = cfqe->service_tree;
+
+ /*
+ * Can't update vdisktime while group is on service
+ * tree.
+ */
+ __cfq_entity_service_tree_del(st, cfqe);
+ cfqe->vdisktime += cfq_scale_slice(charge, cfqe);
+ __cfq_entity_service_tree_add(st, cfqe);
+ st->count++;
+ cfqe->reposition_time = jiffies;
+ cfqe = cfqe->parent;
+ }

/* This group is being expired. Save the context */
if (time_after(cfqd->workload_expires, jiffies)) {
@@ -1026,7 +1108,8 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
cfqg->saved_workload_slice = 0;

cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu",
- cfqe->vdisktime, st->min_vdisktime);
+ cfqg->cfqe.vdisktime,
+ cfqg->cfqe.service_tree->min_vdisktime);
cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u disp=%u charge=%u iops=%u"
" sect=%u", used_sl, cfqq->slice_dispatch, charge,
iops_mode(cfqd), cfqq->nr_sectors);
@@ -1048,35 +1131,27 @@ void cfq_update_blkio_group_weight(void *key, struct blkio_group *blkg,
cfqg_of_blkg(blkg)->cfqe.weight = weight;
}

-static struct cfq_group *
-cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
+static void init_cfqe(struct blkio_cgroup *blkcg,
+ struct cfq_group *cfqg)
+{
+ struct cfq_entity *cfqe = &cfqg->cfqe;
+
+ cfqe->weight = blkcg_get_weight(blkcg, cfqg->blkg.dev);
+ RB_CLEAR_NODE(&cfqe->rb_node);
+ cfqe->is_group_entity = true;
+ cfqe->parent = NULL;
+}
+
+static void init_cfqg(struct cfq_data *cfqd, struct blkio_cgroup *blkcg,
+ struct cfq_group *cfqg)
{
- struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
- struct cfq_group *cfqg = NULL;
- void *key = cfqd;
int i, j;
struct cfq_rb_root *st;
- struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
unsigned int major, minor;
-
- cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
- if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
- sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
- cfqg->blkg.dev = MKDEV(major, minor);
- goto done;
- }
- if (cfqg || !create)
- goto done;
-
- cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC, cfqd->queue->node);
- if (!cfqg)
- goto done;
+ struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;

for_each_cfqg_st(cfqg, i, j, st)
*st = CFQ_RB_ROOT;
- RB_CLEAR_NODE(&cfqg->cfqe.rb_node);
-
- cfqg->cfqe.is_group_entity = true;

/*
* Take the initial reference that will be released on destroy
@@ -1086,24 +1161,199 @@ cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
*/
cfqg->ref = 1;

+ /* Add group onto cgroup list */
+ sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
+ cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd,
+ MKDEV(major, minor));
+ /* Initiate group entity */
+ init_cfqe(blkcg, cfqg);
+ /* Add group on cfqd list */
+ hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list);
+}
+
+static void cfq_destroy_cfqg(struct cfq_data *cfqd, struct cfq_group *cfqg);
+
+static void uninit_cfqg(struct cfq_data *cfqd, struct cfq_group *cfqg)
+{
+ if (!cfq_blkiocg_del_blkio_group(&cfqg->blkg))
+ cfq_destroy_cfqg(cfqd, cfqg);
+}
+
+static void cfqg_set_parent(struct cfq_data *cfqd, struct cfq_group *cfqg,
+ struct cfq_group *p_cfqg)
+{
+ struct cfq_entity *cfqe, *p_cfqe;
+
+ cfqe = &cfqg->cfqe;
+
/*
- * Add group onto cgroup list. It might happen that bdi->dev is
- * not initiliazed yet. Initialize this new group without major
- * and minor info and this info will be filled in once a new thread
- * comes for IO. See code above.
+ * 1. If use_hierarchy of the CGroup where cfqg's parent stays is not
+ * set, we put this cfqg onto global service tree.
+ * 2. If cfqg is root cfqg, put it onto global service tree.
*/
- if (bdi->dev) {
- sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
- cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd,
- MKDEV(major, minor));
- } else
- cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd,
- 0);
+ if (!p_cfqg) {
+ cfqe->service_tree = &cfqd->grp_service_tree;
+ cfqe->parent = NULL;
+ return;
+ }

- cfqg->cfqe.weight = blkcg_get_weight(blkcg, cfqg->blkg.dev);
+ p_cfqe = &p_cfqg->cfqe;

- /* Add group on cfqd list */
- hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list);
+ cfqe->parent = p_cfqe;
+
+ /*
+ * Currently, just put cfq group entity on "BE:SYNC" workload
+ * service tree.
+ */
+ cfqe->service_tree = service_tree_for(p_cfqg, BE_WORKLOAD,
+ SYNC_WORKLOAD);
+ /* child reference */
+ p_cfqg->ref++;
+}
+
+static struct cfq_group *cfqg_get_parent(struct cfq_group * cfqg)
+{
+ struct cfq_entity *cfqe, *p_cfqe;
+
+ if (!cfqg)
+ return NULL;
+
+ cfqe = &cfqg->cfqe;
+ p_cfqe = cfqe->parent;
+ if (!p_cfqe)
+ return NULL;
+
+ return cfqg_of_entity(p_cfqe);
+}
+
+static struct cfq_group *
+cfqg_chain_alloc(struct cfq_data *cfqd, struct cgroup *cgroup)
+{
+ struct blkio_cgroup *blkcg;
+ struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
+ unsigned int major, minor;
+ struct cfq_group *cfqg, *leaf_cfqg, *child_cfqg, *tmp_cfqg;
+ void *key = cfqd;
+
+ /*
+ * If CGroup's use_hierarchy is unset, we just need to allocate only
+ * one CFQ group, and this group will put onto the "grp_service_tree".
+ * We don't need to check whether the cfqg exists, the caller has
+ * already checked it.
+ */
+ blkcg = cgroup_to_blkio_cgroup(cgroup);
+ if (!blkcg_get_use_hierarchy(blkcg)) {
+ cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC,
+ cfqd->queue->node);
+ if (!cfqg)
+ return NULL;
+
+ init_cfqg(cfqd, blkcg, cfqg);
+ cfqg_set_parent(cfqd, cfqg, NULL);
+ return cfqg;
+ }
+
+ /*
+ * Allocate the CFQ group chain until we meet the group we'v already
+ * allocated before, or to the CGroup whose use_hierarchy is not set.
+ */
+ leaf_cfqg = NULL;
+ child_cfqg = NULL;
+ for (; cgroup != NULL; cgroup = cgroup->parent) {
+ blkcg = cgroup_to_blkio_cgroup(cgroup);
+ cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
+ if (cfqg) {
+ if (!cfqg->blkg.dev && bdi->dev &&
+ dev_name(bdi->dev)) {
+ sscanf(dev_name(bdi->dev), "%u:%u",
+ &major, &minor);
+ cfqg->blkg.dev = MKDEV(major, minor);
+ }
+
+ /*
+ * Initialization of parent doesn't finish yet, get
+ * it done.
+ */
+ if (child_cfqg) {
+ if (blkcg_get_use_hierarchy(blkcg))
+ cfqg_set_parent(cfqd, child_cfqg,
+ cfqg);
+ else
+ cfqg_set_parent(cfqd, child_cfqg,
+ NULL);
+ }
+
+ /* chain has already been built */
+ break;
+ }
+
+ /*
+ * We only allocate a cfqg that the corresponding cgroup's
+ * use_hierarchy is set.
+ */
+ if (blkcg_get_use_hierarchy(blkcg)) {
+ cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC,
+ cfqd->queue->node);
+ if (!cfqg)
+ goto clean_up;
+
+ if (!leaf_cfqg)
+ leaf_cfqg = cfqg;
+
+ init_cfqg(cfqd, blkcg, cfqg);
+ } else {
+ cfqg = NULL;
+ }
+
+ if (child_cfqg)
+ cfqg_set_parent(cfqd, child_cfqg, cfqg);
+
+ /*
+ * This CGroup's use_hierarchy isn't set, this means the CFQ
+ * group chain has been built.
+ */
+ if (!blkcg_get_use_hierarchy(blkcg))
+ break;
+
+ child_cfqg = cfqg;
+ }
+
+ return leaf_cfqg;
+
+clean_up:
+ /* clean up the allocated cfq groups. */
+ while (leaf_cfqg) {
+ tmp_cfqg = leaf_cfqg;
+ leaf_cfqg = cfqg_get_parent(leaf_cfqg);
+ uninit_cfqg(cfqd, tmp_cfqg);
+ }
+
+ return NULL;
+}
+
+static struct cfq_group *
+cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
+{
+ struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
+ struct cfq_group *cfqg = NULL;
+ void *key = cfqd;
+ struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
+ unsigned int major, minor;
+
+ cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
+ if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
+ sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
+ cfqg->blkg.dev = MKDEV(major, minor);
+ goto done;
+ }
+ if (cfqg || !create)
+ goto done;
+
+ /*
+ * Allocate CFQ group chain to the root group or we meet the CGroup
+ * with use_hierarchy disabled.
+ */
+ cfqg = cfqg_chain_alloc(cfqd, cgroup);

done:
return cfqg;
@@ -1148,6 +1398,7 @@ static void cfq_put_cfqg(struct cfq_group *cfqg)
{
struct cfq_rb_root *st;
int i, j;
+ struct cfq_group *p_cfqg;

BUG_ON(cfqg->ref <= 0);
cfqg->ref--;
@@ -1155,6 +1406,22 @@ static void cfq_put_cfqg(struct cfq_group *cfqg)
return;
for_each_cfqg_st(cfqg, i, j, st)
BUG_ON(!RB_EMPTY_ROOT(&st->rb));
+
+ do {
+ p_cfqg = cfqg_get_parent(cfqg);
+ kfree(cfqg);
+ cfqg = NULL;
+ /*
+ * Drop the reference taken by children, if nobody references
+ * parent group, we need delete the parent also.
+ */
+ if (p_cfqg) {
+ p_cfqg->ref--;
+ if (p_cfqg->ref == 0)
+ cfqg = p_cfqg;
+ }
+ } while (cfqg);
+
kfree(cfqg);
}

@@ -1321,9 +1588,6 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
* ioprio.
*/
pos_offset = cfq_get_boost(cfqd, cfqq);
- /* Debug purpose, should remove. */
- cfq_log_cfqq(cfqd, cfqq, "pos_offset: %llu",
- pos_offset);
cfqe->vdisktime = service_tree->min_vdisktime +
pos_offset;
} else
@@ -1365,9 +1629,8 @@ insert:
cfqe->service_tree = service_tree;

/* Add cfqq onto service tree. */
+
cfq_entity_service_tree_add(service_tree, cfqe);
- update_min_vdisktime(service_tree);
- cfqq->reposition_time = jiffies;
if ((add_front || !new_cfqq) && !group_changed)
return;
cfq_group_service_tree_add(cfqd, cfqq->cfqg);
@@ -1810,28 +2073,43 @@ static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd)
return cfqq_of_entity(cfq_rb_first(service_tree));
}

-static struct cfq_queue *cfq_get_next_queue_forced(struct cfq_data *cfqd)
+struct cfq_rb_root *choose_service_tree_forced(struct cfq_group *cfqg)
{
- struct cfq_group *cfqg;
- struct cfq_entity *cfqe;
int i, j;
struct cfq_rb_root *st;

- if (!cfqd->rq_queued)
- return NULL;
+ for_each_cfqg_st(cfqg, i, j, st) {
+ if (st->count != 0)
+ return st;
+ }

- cfqg = cfq_get_next_cfqg(cfqd);
- if (!cfqg)
+ return NULL;
+}
+
+static struct cfq_entity *
+cfq_get_next_entity_forced(struct cfq_data *cfqd)
+{
+ struct cfq_entity *cfqe;
+ struct cfq_rb_root *st = &cfqd->grp_service_tree;
+ struct cfq_group *cfqg;
+
+ if (!cfqd->rq_queued)
return NULL;

- for_each_cfqg_st(cfqg, i, j, st) {
+ do {
cfqe = cfq_rb_first(st);
- if (cfqe != NULL)
- return cfqq_of_entity(cfqe);
- }
+ if (cfqe && !cfqe->is_group_entity)
+ return cfqe;
+ else if (cfqe && cfqe->is_group_entity)
+ cfqg = cfqg_of_entity(cfqe);
+
+ st = choose_service_tree_forced(cfqg);
+ } while (st);
+
return NULL;
}

+
/*
* Get and set a new active qu


2011-02-10 20:57:35

by Vivek Goyal

[permalink] [raw]
Subject: Re: [PATCH 5/6 v4] cfq-iosched: CFQ group hierarchical scheduling and use_hierarchy interface

On Thu, Feb 10, 2011 at 03:47:45PM +0800, Gui Jianfeng wrote:
> CFQ group hierarchical scheduling and use_hierarchy interface.
>

Hi Gui,

I have done a quick high level review. Some minor comments inline.

[..]
> struct cfq_data {
> struct request_queue *queue;
> - /* Root service tree for cfq_groups */
> - struct cfq_rb_root grp_service_tree;
> struct cfq_group root_group;
>
> + /* cfq group schedule in flat or hierarchy manner. */
> + bool use_hierarchy;
> +

This seems to be redundant now? Nobody is using it?

> /*
> * The priority currently being served
> */
> @@ -246,6 +251,9 @@ struct cfq_data {
> unsigned long workload_expires;
> struct cfq_group *serving_group;
>
> + /* Service tree for cfq group flat scheduling mode. */
> + struct cfq_rb_root grp_service_tree;

Above comment is misleading. This service tree is now used for both
flat and hierarchical modes.

[..]
> static void
> cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
> {
> - struct cfq_rb_root *st = &cfqd->grp_service_tree;
> struct cfq_entity *cfqe = &cfqg->cfqe;
> - struct cfq_entity *__cfqe;
> struct rb_node *n;
> + struct cfq_entity *entity;
> + struct cfq_rb_root *st;
> + struct cfq_group *__cfqg;
>
> cfqg->nr_cfqq++;
> +
> if (!RB_EMPTY_NODE(&cfqe->rb_node))
> return;
>
> /*
> - * Currently put the group at the end. Later implement something
> - * so that groups get lesser vtime based on their weights, so that
> - * if group does not loose all if it was not continously backlogged.
> + * Enqueue this group and its ancestors onto their service tree.
> */
> - n = rb_last(&st->rb);
> - if (n) {
> - __cfqe = rb_entry_entity(n);
> - cfqe->vdisktime = __cfqe->vdisktime + CFQ_IDLE_DELAY;
> - } else
> - cfqe->vdisktime = st->min_vdisktime;
> + while (cfqe) {
> + if (!RB_EMPTY_NODE(&cfqe->rb_node))
> + return;
>
> - cfq_entity_service_tree_add(st, cfqe);
> + /*
> + * Currently put the group at the end. Later implement
> + * something so that groups get lesser vtime based on
> + * their weights, so that if group does not loose all
> + * if it was not continously backlogged.
> + */

Can we use vdisktime boost logic for groups also? I think it can be a separate
patch in the series (the last one). Keeping it as a separate patch will
also help you to coordinate with chad's patch.

> + st = cfqe->service_tree;

Group entities set their service tree when they get allocated and retain
this pointer even when they get deleted from the service tree. Queue
entities seem to have it NULL when they get deleted from the service tree,
and it gets set again when the queue is being inserted. It would be nice if
we could fix this discrepancy and keep it consistent. I think clearing up
cfqe->service_tree is a better idea, and then calculating it again for
groups also.

[..]
>
> -static struct cfq_group *
> -cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
> +static void init_cfqe(struct blkio_cgroup *blkcg,
> + struct cfq_group *cfqg)

As you are using this function for initializing group entity, possibly
rename it to init_group_entity() or init_group_cfqe() etc.

[..]
> +static struct cfq_group *
> +cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
> +{
> + struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
> + struct cfq_group *cfqg = NULL;
> + void *key = cfqd;
> + struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
> + unsigned int major, minor;
> +
> + cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
> + if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
> + sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
> + cfqg->blkg.dev = MKDEV(major, minor);
> + goto done;
> + }

Should we make the update of this info hierarchical?

Thanks
Vivek

2011-02-12 02:21:50

by Gui, Jianfeng/归 剑峰

[permalink] [raw]
Subject: Re: [PATCH 5/6 v4] cfq-iosched: CFQ group hierarchical scheduling and use_hierarchy interface

Vivek Goyal wrote:
> On Thu, Feb 10, 2011 at 03:47:45PM +0800, Gui Jianfeng wrote:
>> CFQ group hierarchical scheduling and use_hierarchy interface.
>>
>
> Hi Gui,
>
> I have done a quick high level review. Some minor comments inline.
>
> [..]
>> struct cfq_data {
>> struct request_queue *queue;
>> - /* Root service tree for cfq_groups */
>> - struct cfq_rb_root grp_service_tree;
>> struct cfq_group root_group;
>>
>> + /* cfq group schedule in flat or hierarchy manner. */
>> + bool use_hierarchy;
>> +
>
> This seems to be redundant now? Nobody is using it?

Ahh, I think so.

>
>> /*
>> * The priority currently being served
>> */
>> @@ -246,6 +251,9 @@ struct cfq_data {
>> unsigned long workload_expires;
>> struct cfq_group *serving_group;
>>
>> + /* Service tree for cfq group flat scheduling mode. */
>> + struct cfq_rb_root grp_service_tree;
>
> Above comment is misleading. This service tree is now used both for
> flat as well as hierarhical mode.

Will modify.

>
> [..]
>> static void
>> cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
>> {
>> - struct cfq_rb_root *st = &cfqd->grp_service_tree;
>> struct cfq_entity *cfqe = &cfqg->cfqe;
>> - struct cfq_entity *__cfqe;
>> struct rb_node *n;
>> + struct cfq_entity *entity;
>> + struct cfq_rb_root *st;
>> + struct cfq_group *__cfqg;
>>
>> cfqg->nr_cfqq++;
>> +
>> if (!RB_EMPTY_NODE(&cfqe->rb_node))
>> return;
>>
>> /*
>> - * Currently put the group at the end. Later implement something
>> - * so that groups get lesser vtime based on their weights, so that
>> - * if group does not loose all if it was not continously backlogged.
>> + * Enqueue this group and its ancestors onto their service tree.
>> */
>> - n = rb_last(&st->rb);
>> - if (n) {
>> - __cfqe = rb_entry_entity(n);
>> - cfqe->vdisktime = __cfqe->vdisktime + CFQ_IDLE_DELAY;
>> - } else
>> - cfqe->vdisktime = st->min_vdisktime;
>> + while (cfqe) {
>> + if (!RB_EMPTY_NODE(&cfqe->rb_node))
>> + return;
>>
>> - cfq_entity_service_tree_add(st, cfqe);
>> + /*
>> + * Currently put the group at the end. Later implement
>> + * something so that groups get lesser vtime based on
>> + * their weights, so that if group does not loose all
>> + * if it was not continously backlogged.
>> + */
>
> Can we use vdisktime boost logic for groups also? I think it can be a separate
> patch in the series (the last one). Keeping it as a separate patch will
> also help you to coordinate with chad's patch.

Making a separate patch makes more sense; I will do it as soon as this series gets merged.

>
>> + st = cfqe->service_tree;
>
> Group entity set their service tree when they get allocated and retain
> this pointer even when they get deleted from serivce tree. Queue entities
> seem to have it NULL when they get deleted from service tree and it
> gets set again when queue is getting inserted. It would be nice if we
> can fix this discrepancy and keep it consistent. I think clearing up
> cfqe->service_tree is a better idea and then calculate it again for
> group also.

Ok, will consider to change.

>
> [..]
>>
>> -static struct cfq_group *
>> -cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
>> +static void init_cfqe(struct blkio_cgroup *blkcg,
>> + struct cfq_group *cfqg)
>
> As you are using this function for initializing group entity, possibly
> rename it to init_group_entity() or init_group_cfqe() etc.

Sure.

>
> [..]
>> +static struct cfq_group *
>> +cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
>> +{
>> + struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
>> + struct cfq_group *cfqg = NULL;
>> + void *key = cfqd;
>> + struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
>> + unsigned int major, minor;
>> +
>> + cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
>> + if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
>> + sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
>> + cfqg->blkg.dev = MKDEV(major, minor);
>> + goto done;
>> + }
>
> Should we make this updation of this info hierarhical?

IMHO, it's fine to defer the updation when we really get the cfqg.

Will post an updated version.

Thanks,
Gui

>
> Thanks
> Vivek
>

2011-02-14 03:20:44

by Gui, Jianfeng/归 剑峰

[permalink] [raw]
Subject: Re: [PATCH 5/6 v4] cfq-iosched: CFQ group hierarchical scheduling and use_hierarchy interface

Vivek Goyal wrote:
> On Thu, Feb 10, 2011 at 03:47:45PM +0800, Gui Jianfeng wrote:
>> CFQ group hierarchical scheduling and use_hierarchy interface.
>>
>
> Hi Gui,
>
> I have done a quick high level review. Some minor comments inline.
>
> [..]
>> struct cfq_data {
>> struct request_queue *queue;
>> - /* Root service tree for cfq_groups */
>> - struct cfq_rb_root grp_service_tree;
>> struct cfq_group root_group;
>>
>> + /* cfq group schedule in flat or hierarchy manner. */
>> + bool use_hierarchy;
>> +
>
> This seems to be redundant now? Nobody is using it?
>
>> /*
>> * The priority currently being served
>> */
>> @@ -246,6 +251,9 @@ struct cfq_data {
>> unsigned long workload_expires;
>> struct cfq_group *serving_group;
>>
>> + /* Service tree for cfq group flat scheduling mode. */
>> + struct cfq_rb_root grp_service_tree;
>
> Above comment is misleading. This service tree is now used both for
> flat as well as hierarhical mode.
>
> [..]
>> static void
>> cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
>> {
>> - struct cfq_rb_root *st = &cfqd->grp_service_tree;
>> struct cfq_entity *cfqe = &cfqg->cfqe;
>> - struct cfq_entity *__cfqe;
>> struct rb_node *n;
>> + struct cfq_entity *entity;
>> + struct cfq_rb_root *st;
>> + struct cfq_group *__cfqg;
>>
>> cfqg->nr_cfqq++;
>> +
>> if (!RB_EMPTY_NODE(&cfqe->rb_node))
>> return;
>>
>> /*
>> - * Currently put the group at the end. Later implement something
>> - * so that groups get lesser vtime based on their weights, so that
>> - * if group does not loose all if it was not continously backlogged.
>> + * Enqueue this group and its ancestors onto their service tree.
>> */
>> - n = rb_last(&st->rb);
>> - if (n) {
>> - __cfqe = rb_entry_entity(n);
>> - cfqe->vdisktime = __cfqe->vdisktime + CFQ_IDLE_DELAY;
>> - } else
>> - cfqe->vdisktime = st->min_vdisktime;
>> + while (cfqe) {
>> + if (!RB_EMPTY_NODE(&cfqe->rb_node))
>> + return;
>>
>> - cfq_entity_service_tree_add(st, cfqe);
>> + /*
>> + * Currently put the group at the end. Later implement
>> + * something so that groups get lesser vtime based on
>> + * their weights, so that if group does not loose all
>> + * if it was not continously backlogged.
>> + */
>
> Can we use vdisktime boost logic for groups also? I think it can be a separate
> patch in the series (the last one). Keeping it as a separate patch will
> also help you to coordinate with chad's patch.
>
>> + st = cfqe->service_tree;
>
> Group entity set their service tree when they get allocated and retain
> this pointer even when they get deleted from serivce tree. Queue entities
> seem to have it NULL when they get deleted from service tree and it
> gets set again when queue is getting inserted. It would be nice if we
> can fix this discrepancy and keep it consistent. I think clearing up
> cfqe->service_tree is a better idea and then calculate it again for
> group also.

Vivek,

Currently, cfq queue might change workload type and io class, so we need to recalculate
its service_tree. But for cfq groups, IMHO we don't need to add this complexity for the
time being.
I think we can add this change as soon as different io classes or workload types are
introduced. How do you think?

Thanks,
Gui

2011-02-14 18:04:44

by Vivek Goyal

[permalink] [raw]
Subject: Re: [PATCH 5/6 v4] cfq-iosched: CFQ group hierarchical scheduling and use_hierarchy interface

On Sat, Feb 12, 2011 at 10:21:47AM +0800, Gui Jianfeng wrote:
[..]
> >> +static struct cfq_group *
> >> +cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
> >> +{
> >> + struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
> >> + struct cfq_group *cfqg = NULL;
> >> + void *key = cfqd;
> >> + struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
> >> + unsigned int major, minor;
> >> +
> >> + cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
> >> + if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
> >> + sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
> >> + cfqg->blkg.dev = MKDEV(major, minor);
> >> + goto done;
> >> + }
> >
> > Should we make this updation of this info hierarhical?
>
> IMHO, it's fine to defer the updation when we really get the cfqg.

But if cfqg is alrady present, we will never hit the allocation path
again. So if somebody creates 2-3 level deep hierarchy and does IO
only in the children cgroup, parent cgroups will potentially not get
device info updated hence no visible stats?

Thanks
Vivek

2011-02-14 18:10:35

by Vivek Goyal

[permalink] [raw]
Subject: Re: [PATCH 5/6 v4] cfq-iosched: CFQ group hierarchical scheduling and use_hierarchy interface

On Mon, Feb 14, 2011 at 11:20:33AM +0800, Gui Jianfeng wrote:
> Vivek Goyal wrote:
> > On Thu, Feb 10, 2011 at 03:47:45PM +0800, Gui Jianfeng wrote:
> >> CFQ group hierarchical scheduling and use_hierarchy interface.
> >>
> >
> > Hi Gui,
> >
> > I have done a quick high level review. Some minor comments inline.
> >
> > [..]
> >> struct cfq_data {
> >> struct request_queue *queue;
> >> - /* Root service tree for cfq_groups */
> >> - struct cfq_rb_root grp_service_tree;
> >> struct cfq_group root_group;
> >>
> >> + /* cfq group schedule in flat or hierarchy manner. */
> >> + bool use_hierarchy;
> >> +
> >
> > This seems to be redundant now? Nobody is using it?
> >
> >> /*
> >> * The priority currently being served
> >> */
> >> @@ -246,6 +251,9 @@ struct cfq_data {
> >> unsigned long workload_expires;
> >> struct cfq_group *serving_group;
> >>
> >> + /* Service tree for cfq group flat scheduling mode. */
> >> + struct cfq_rb_root grp_service_tree;
> >
> > Above comment is misleading. This service tree is now used both for
> > flat as well as hierarhical mode.
> >
> > [..]
> >> static void
> >> cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
> >> {
> >> - struct cfq_rb_root *st = &cfqd->grp_service_tree;
> >> struct cfq_entity *cfqe = &cfqg->cfqe;
> >> - struct cfq_entity *__cfqe;
> >> struct rb_node *n;
> >> + struct cfq_entity *entity;
> >> + struct cfq_rb_root *st;
> >> + struct cfq_group *__cfqg;
> >>
> >> cfqg->nr_cfqq++;
> >> +
> >> if (!RB_EMPTY_NODE(&cfqe->rb_node))
> >> return;
> >>
> >> /*
> >> - * Currently put the group at the end. Later implement something
> >> - * so that groups get lesser vtime based on their weights, so that
> >> - * if group does not loose all if it was not continously backlogged.
> >> + * Enqueue this group and its ancestors onto their service tree.
> >> */
> >> - n = rb_last(&st->rb);
> >> - if (n) {
> >> - __cfqe = rb_entry_entity(n);
> >> - cfqe->vdisktime = __cfqe->vdisktime + CFQ_IDLE_DELAY;
> >> - } else
> >> - cfqe->vdisktime = st->min_vdisktime;
> >> + while (cfqe) {
> >> + if (!RB_EMPTY_NODE(&cfqe->rb_node))
> >> + return;
> >>
> >> - cfq_entity_service_tree_add(st, cfqe);
> >> + /*
> >> + * Currently put the group at the end. Later implement
> >> + * something so that groups get lesser vtime based on
> >> + * their weights, so that if group does not loose all
> >> + * if it was not continously backlogged.
> >> + */
> >
> > Can we use vdisktime boost logic for groups also? I think it can be a separate
> > patch in the series (the last one). Keeping it as a separate patch will
> > also help you to coordinate with chad's patch.
> >
> >> + st = cfqe->service_tree;
> >
> > Group entity set their service tree when they get allocated and retain
> > this pointer even when they get deleted from serivce tree. Queue entities
> > seem to have it NULL when they get deleted from service tree and it
> > gets set again when queue is getting inserted. It would be nice if we
> > can fix this discrepancy and keep it consistent. I think clearing up
> > cfqe->service_tree is a better idea and then calculate it again for
> > group also.
>
> Vivek,
>
> Currently, cfq queue might change workload type and io class, so we need to recalculate
> its service_tree. But for cfq groups, IMHO we don't need to add this complexity for the
> time being.
> I think we can add this change as soon as different io classes or workload types are
> introduced. How do you think?

Ok, that's fine.

Thanks
Vivek

2011-02-15 02:38:42

by Gui, Jianfeng/归 剑峰

[permalink] [raw]
Subject: Re: [PATCH 5/6 v4] cfq-iosched: CFQ group hierarchical scheduling and use_hierarchy interface

Vivek Goyal wrote:
> On Sat, Feb 12, 2011 at 10:21:47AM +0800, Gui Jianfeng wrote:
> [..]
>>>> +static struct cfq_group *
>>>> +cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
>>>> +{
>>>> + struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
>>>> + struct cfq_group *cfqg = NULL;
>>>> + void *key = cfqd;
>>>> + struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
>>>> + unsigned int major, minor;
>>>> +
>>>> + cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
>>>> + if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
>>>> + sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
>>>> + cfqg->blkg.dev = MKDEV(major, minor);
>>>> + goto done;
>>>> + }
>>> Should we make this updation of this info hierarhical?
>> IMHO, it's fine to defer the updation when we really get the cfqg.
>
> But if cfqg is alrady present, we will never hit the allocation path
> again. So if somebody creates 2-3 level deep hierarchy and does IO
> only in the children cgroup, parent cgroups will potentially not get
> device info updated hence no visible stats?

Ahh, I see your concern. But do we really need to show the stats even if
a cgroup doesn't issue any IO on a given device?
For example, on a vanilla kernel, when we create a new cgroup and check the
stats, say blkio.io_service_bytes, we will get "Total 0" and no disk-specific
stats.
Currently, as soon as a cgroup issues one IO request, cfqg->blkg.dev will
be updated.

Thanks,
Gui

>
> Thanks
> Vivek
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/
>

2011-02-15 14:35:29

by Vivek Goyal

[permalink] [raw]
Subject: Re: [PATCH 5/6 v4] cfq-iosched: CFQ group hierarchical scheduling and use_hierarchy interface

On Tue, Feb 15, 2011 at 10:38:32AM +0800, Gui Jianfeng wrote:
> Vivek Goyal wrote:
> > On Sat, Feb 12, 2011 at 10:21:47AM +0800, Gui Jianfeng wrote:
> > [..]
> >>>> +static struct cfq_group *
> >>>> +cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
> >>>> +{
> >>>> + struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
> >>>> + struct cfq_group *cfqg = NULL;
> >>>> + void *key = cfqd;
> >>>> + struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
> >>>> + unsigned int major, minor;
> >>>> +
> >>>> + cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
> >>>> + if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
> >>>> + sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
> >>>> + cfqg->blkg.dev = MKDEV(major, minor);
> >>>> + goto done;
> >>>> + }
> >>> Should we make this updation of this info hierarhical?
> >> IMHO, it's fine to defer the updation when we really get the cfqg.
> >
> > But if cfqg is alrady present, we will never hit the allocation path
> > again. So if somebody creates 2-3 level deep hierarchy and does IO
> > only in the children cgroup, parent cgroups will potentially not get
> > device info updated hence no visible stats?
>
> Ahh, I see your concern. But do we really need to show the stats even if
> a cgroup doesn't issue any IO on a given device?

I am assuming that once use_hierarchy=1, you are aggregating the stats
in parent cgroups? So if a child services 5 IOs, these are accounted
to parent group also when user_hier=1?

What happens in case of memoy cgroup controller?

Thanks
Vivek

2011-02-16 01:44:55

by Gui, Jianfeng/归 剑峰

[permalink] [raw]
Subject: Re: [PATCH 5/6 v4] cfq-iosched: CFQ group hierarchical scheduling and use_hierarchy interface

Vivek Goyal wrote:
> On Tue, Feb 15, 2011 at 10:38:32AM +0800, Gui Jianfeng wrote:
>> Vivek Goyal wrote:
>>> On Sat, Feb 12, 2011 at 10:21:47AM +0800, Gui Jianfeng wrote:
>>> [..]
>>>>>> +static struct cfq_group *
>>>>>> +cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
>>>>>> +{
>>>>>> + struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
>>>>>> + struct cfq_group *cfqg = NULL;
>>>>>> + void *key = cfqd;
>>>>>> + struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
>>>>>> + unsigned int major, minor;
>>>>>> +
>>>>>> + cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
>>>>>> + if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
>>>>>> + sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
>>>>>> + cfqg->blkg.dev = MKDEV(major, minor);
>>>>>> + goto done;
>>>>>> + }
>>>>> Should we make this updation of this info hierarhical?
>>>> IMHO, it's fine to defer the updation when we really get the cfqg.
>>> But if cfqg is alrady present, we will never hit the allocation path
>>> again. So if somebody creates 2-3 level deep hierarchy and does IO
>>> only in the children cgroup, parent cgroups will potentially not get
>>> device info updated hence no visible stats?
>> Ahh, I see your concern. But do we really need to show the stats even if
>> a cgroup doesn't issue any IO on a given device?
>
> I am assuming that once use_hierarchy=1, you are aggregating the stats
> in parent cgroups? So if a child services 5 IOs, these are accounted
> to parent group also when user_hier=1?
>
> What happens in case of memoy cgroup controller?

Hmm, it seems memcg aggregating stats in parent group.
But do we really need to do that in kernel? I think it's easier to do it in
userland, and it makes kernel much simpler.

Thanks,
Gui

>
> Thanks
> Vivek
>

2011-02-16 14:17:59

by Vivek Goyal

[permalink] [raw]
Subject: Re: [PATCH 5/6 v4] cfq-iosched: CFQ group hierarchical scheduling and use_hierarchy interface

On Wed, Feb 16, 2011 at 09:44:39AM +0800, Gui Jianfeng wrote:
> Vivek Goyal wrote:
> > On Tue, Feb 15, 2011 at 10:38:32AM +0800, Gui Jianfeng wrote:
> >> Vivek Goyal wrote:
> >>> On Sat, Feb 12, 2011 at 10:21:47AM +0800, Gui Jianfeng wrote:
> >>> [..]
> >>>>>> +static struct cfq_group *
> >>>>>> +cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
> >>>>>> +{
> >>>>>> + struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
> >>>>>> + struct cfq_group *cfqg = NULL;
> >>>>>> + void *key = cfqd;
> >>>>>> + struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
> >>>>>> + unsigned int major, minor;
> >>>>>> +
> >>>>>> + cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
> >>>>>> + if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
> >>>>>> + sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
> >>>>>> + cfqg->blkg.dev = MKDEV(major, minor);
> >>>>>> + goto done;
> >>>>>> + }
> >>>>> Should we make this updation of this info hierarhical?
> >>>> IMHO, it's fine to defer the updation when we really get the cfqg.
> >>> But if cfqg is alrady present, we will never hit the allocation path
> >>> again. So if somebody creates 2-3 level deep hierarchy and does IO
> >>> only in the children cgroup, parent cgroups will potentially not get
> >>> device info updated hence no visible stats?
> >> Ahh, I see your concern. But do we really need to show the stats even if
> >> a cgroup doesn't issue any IO on a given device?
> >
> > I am assuming that once use_hierarchy=1, you are aggregating the stats
> > in parent cgroups? So if a child services 5 IOs, these are accounted
> > to parent group also when user_hier=1?
> >
> > What happens in case of memoy cgroup controller?
>
> Hmm, it seems memcg aggregating stats in parent group.
> But do we really need to do that in kernel? I think it's easier to do it in
> userland, and it makes kernel much simpler.

I think at some point hierarchical aggregated stats will also be
required. I am also looking at the "memory.stat" file of the memory controller,
and it seems to report both aggregated as well as individual group
stats.

So we can probably skip implementing hierarchical stats in this patchset
and implement them on an as-needed basis in the future.

Thanks
Vivek

2011-02-16 17:22:53

by Divyesh Shah

[permalink] [raw]
Subject: Re: [PATCH 5/6 v4] cfq-iosched: CFQ group hierarchical scheduling and use_hierarchy interface

On Tue, Feb 15, 2011 at 5:44 PM, Gui Jianfeng
<[email protected]> wrote:
> Hmm, it seems memcg aggregating stats in parent group.
> But do we really need to do that in kernel? I think it's easier to do it in
> userland, and it makes kernel much simpler.
>

I would prefer having stats aggregated up the hierarchy. One trick we
used at Google earlier was to do lazy updates for most stats. So we
would accumulate stats for a given timeslice and at the end of that
timeslice propagate those counts all the way up to the parent. For
device info like this, the update can be instantaneous since its not a
very frequent event. It would also be very useful to distinguish
between stats for the cgroup itself vs total values accumulated from
the subtree rooted at that cgroup like memcg does.

--
Thanks,
Divyesh

2011-02-16 17:28:33

by Divyesh Shah

[permalink] [raw]
Subject: Re: [PATCH 5/6 v4] cfq-iosched: CFQ group hierarchical scheduling and use_hierarchy interface

On Wed, Feb 16, 2011 at 9:22 AM, Divyesh Shah <[email protected]> wrote:
> On Tue, Feb 15, 2011 at 5:44 PM, Gui Jianfeng
> <[email protected]> wrote:
>> Hmm, it seems memcg aggregating stats in parent group.
>> But do we really need to do that in kernel? I think it's easier to do it in
>> userland, and it makes kernel much simpler.
>>
>
> I would prefer having stats aggregated up the hierarchy. One trick we
> used at Google earlier was to do lazy updates for most stats. So we

Note that this was on ancient version of the blkio controller :). As
Vivek mentioned,
it may be ok to add the hierarchical accounting later.

> would accumulate stats for a given timeslice and at the end of that
> timeslice propagate those counts all the way up to the parent. For
> device info like this, the update can be instantaneous since its not a
> very frequent event. It would also be very useful to distinguish
> between stats for the cgroup itself vs total values accumulated from
> the subtree rooted at that cgroup like memcg does.
>
> --
> Thanks,
> Divyesh
>



--
Thanks,
Divyesh

2011-02-16 18:06:39

by Vivek Goyal

[permalink] [raw]
Subject: Re: [PATCH 5/6 v4] cfq-iosched: CFQ group hierarchical scheduling and use_hierarchy interface

On Wed, Feb 16, 2011 at 09:28:07AM -0800, Divyesh Shah wrote:
> On Wed, Feb 16, 2011 at 9:22 AM, Divyesh Shah <[email protected]> wrote:
> > On Tue, Feb 15, 2011 at 5:44 PM, Gui Jianfeng
> > <[email protected]> wrote:
> >> Hmm, it seems memcg aggregating stats in parent group.
> >> But do we really need to do that in kernel? I think it's easier to do it in
> >> userland, and it makes kernel much simpler.
> >>
> >
> > I would prefer having stats aggregated up the hierarchy. One trick we
> > used at Google earlier was to do lazy updates for most stats. So we
>
> Note that this was on ancient version of the blkio controller :). As
> Vivek mentioned,
> it may be ok to add the hierarchical accounting later.

One improvement we can probably do is make accounting per cpu and make
it lockless.

Thanks
Vivek

2011-02-17 00:31:57

by Justin TerAvest

[permalink] [raw]
Subject: Re: [PATCH 5/6 v4] cfq-iosched: CFQ group hierarchical scheduling and use_hierarchy interface

After a quick read,

It's sad that we have to have so many use_hierarchy checks; it seems
like we're asking for bugs, especially in the future when one codepath
gets updated but not the other.

CodingStyle says we should only have one declaration per line.

I feel like there is an implicit assumption that groups and tasks
should not be children of the same parent; that is, a group should
contain only groups, or only tasks, but I don't see this enforced;
there's just an assumption that BE:SYNC is "good enough" for that
comparison. This smells like something that will be tweaked/tuned for
fairness later. :( Why don't we just prevent this from happening?

The clean_up label in chain_alloc() is strange; I don't think the goto
is necessary at all. I found that method generally hard to understand.
It's doing a lot.

It's possible that some of these can't be worked around.


On Wed, Feb 9, 2011 at 11:47 PM, Gui Jianfeng
<[email protected]> wrote:
> CFQ group hierarchical scheduling and use_hierarchy interface.
>
> Signed-off-by: Gui Jianfeng <[email protected]>
> ---
> ?block/blk-cgroup.c ?| ? 61 +++++-
> ?block/blk-cgroup.h ?| ? ?3 +
> ?block/cfq-iosched.c | ?603 +++++++++++++++++++++++++++++++++++++--------------
> ?3 files changed, 500 insertions(+), 167 deletions(-)
>
> diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
> index 455768a..c55fecd 100644
> --- a/block/blk-cgroup.c
> +++ b/block/blk-cgroup.c
> @@ -25,7 +25,10 @@
> ?static DEFINE_SPINLOCK(blkio_list_lock);
> ?static LIST_HEAD(blkio_list);
>
> -struct blkio_cgroup blkio_root_cgroup = { .weight = 2*BLKIO_WEIGHT_DEFAULT };
> +struct blkio_cgroup blkio_root_cgroup = {
> + ? ? ? .weight = 2*BLKIO_WEIGHT_DEFAULT,
> + ? ? ? .use_hierarchy = 0
> +};
> ?EXPORT_SYMBOL_GPL(blkio_root_cgroup);
>
> ?static struct cgroup_subsys_state *blkiocg_create(struct cgroup_subsys *,
> @@ -454,6 +457,7 @@ static void __blkiocg_del_blkio_group(struct blkio_group *blkg)
> ? ? ? ?blkg->blkcg_id = 0;
> ?}
>
> +
> ?/*
> ?* returns 0 if blkio_group was still on cgroup list. Otherwise returns 1
> ?* indicating that blk_group was unhashed by the time we got to it.
> @@ -765,6 +769,12 @@ unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg,
> ?}
> ?EXPORT_SYMBOL_GPL(blkcg_get_weight);
>
> +unsigned int blkcg_get_use_hierarchy(struct blkio_cgroup *blkcg)
> +{
> + ? ? ? return blkcg->use_hierarchy;
> +}
> +EXPORT_SYMBOL_GPL(blkcg_get_use_hierarchy);
> +
> ?uint64_t blkcg_get_read_bps(struct blkio_cgroup *blkcg, dev_t dev)
> ?{
> ? ? ? ?struct blkio_policy_node *pn;
> @@ -1202,6 +1212,8 @@ static u64 blkiocg_file_read_u64 (struct cgroup *cgrp, struct cftype *cft) {
> ? ? ? ? ? ? ? ?switch(name) {
> ? ? ? ? ? ? ? ?case BLKIO_PROP_weight:
> ? ? ? ? ? ? ? ? ? ? ? ?return (u64)blkcg->weight;
> + ? ? ? ? ? ? ? case BLKIO_PROP_use_hierarchy:
> + ? ? ? ? ? ? ? ? ? ? ? return (u64)blkcg->use_hierarchy;
> ? ? ? ? ? ? ? ?}
> ? ? ? ? ? ? ? ?break;
> ? ? ? ?default:
> @@ -1210,6 +1222,36 @@ static u64 blkiocg_file_read_u64 (struct cgroup *cgrp, struct cftype *cft) {
> ? ? ? ?return 0;
> ?}
>
> +static int blkio_use_hierarchy_write(struct cgroup *cgrp, u64 val)
> +{
> + ? ? ? struct cgroup *parent = cgrp->parent;
> + ? ? ? struct blkio_cgroup *blkcg, *parent_blkcg = NULL;
> + ? ? ? int ret = 0;
> +
> + ? ? ? if (val != 0 && val != 1)
> + ? ? ? ? ? ? ? return -EINVAL;
> +
> + ? ? ? blkcg = cgroup_to_blkio_cgroup(cgrp);
> + ? ? ? if (parent)
> + ? ? ? ? ? ? ? parent_blkcg = cgroup_to_blkio_cgroup(parent);
> +
> + ? ? ? cgroup_lock();
> + ? ? ? /*
> + ? ? ? ?* If parent's use_hierarchy is set, we can't make any modifications
> + ? ? ? ?* in the child subtrees. If it is unset, then the change can occur,
> + ? ? ? ?* provided the current cgroup has no children.
> + ? ? ? ?*/
> + ? ? ? if (!parent_blkcg || !parent_blkcg->use_hierarchy) {
> + ? ? ? ? ? ? ? if (list_empty(&cgrp->children))
> + ? ? ? ? ? ? ? ? ? ? ? blkcg->use_hierarchy = val;
> + ? ? ? ? ? ? ? else
> + ? ? ? ? ? ? ? ? ? ? ? ret = -EBUSY;
> + ? ? ? } else
> + ? ? ? ? ? ? ? ret = -EINVAL;
> + ? ? ? cgroup_unlock();
> + ? ? ? return ret;
> +}
> +
> ?static int
> ?blkiocg_file_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
> ?{
> @@ -1224,6 +1266,8 @@ blkiocg_file_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
> ? ? ? ? ? ? ? ?switch(name) {
> ? ? ? ? ? ? ? ?case BLKIO_PROP_weight:
> ? ? ? ? ? ? ? ? ? ? ? ?return blkio_weight_write(blkcg, val);
> + ? ? ? ? ? ? ? case BLKIO_PROP_use_hierarchy:
> + ? ? ? ? ? ? ? ? ? ? ? return blkio_use_hierarchy_write(cgrp, val);
> ? ? ? ? ? ? ? ?}
> ? ? ? ? ? ? ? ?break;
> ? ? ? ?default:
> @@ -1301,6 +1345,13 @@ struct cftype blkio_files[] = {
> ? ? ? ? ? ? ? ?.name = "reset_stats",
> ? ? ? ? ? ? ? ?.write_u64 = blkiocg_reset_stats,
> ? ? ? ?},
> + ? ? ? {
> + ? ? ? ? ? ? ? .name = "use_hierarchy",
> + ? ? ? ? ? ? ? .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?BLKIO_PROP_use_hierarchy),
> + ? ? ? ? ? ? ? .read_u64 = blkiocg_file_read_u64,
> + ? ? ? ? ? ? ? .write_u64 = blkiocg_file_write_u64,
> + ? ? ? },
> ?#ifdef CONFIG_BLK_DEV_THROTTLING
> ? ? ? ?{
> ? ? ? ? ? ? ? ?.name = "throttle.read_bps_device",
> @@ -1444,7 +1495,7 @@ static void blkiocg_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup)
> ?static struct cgroup_subsys_state *
> ?blkiocg_create(struct cgroup_subsys *subsys, struct cgroup *cgroup)
> ?{
> - ? ? ? struct blkio_cgroup *blkcg;
> + ? ? ? struct blkio_cgroup *blkcg, *parent_blkcg = NULL;
> ? ? ? ?struct cgroup *parent = cgroup->parent;
>
> ? ? ? ?if (!parent) {
> @@ -1452,6 +1503,7 @@ blkiocg_create(struct cgroup_subsys *subsys, struct cgroup *cgroup)
> ? ? ? ? ? ? ? ?goto done;
> ? ? ? ?}
>
> + ? ? ? parent_blkcg = cgroup_to_blkio_cgroup(parent);
> ? ? ? ?blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
> ? ? ? ?if (!blkcg)
> ? ? ? ? ? ? ? ?return ERR_PTR(-ENOMEM);
> @@ -1462,6 +1514,11 @@ done:
> ? ? ? ?INIT_HLIST_HEAD(&blkcg->blkg_list);
>
> ? ? ? ?INIT_LIST_HEAD(&blkcg->policy_list);
> + ? ? ? if (parent)
> + ? ? ? ? ? ? ? blkcg->use_hierarchy = parent_blkcg->use_hierarchy;
> + ? ? ? else
> + ? ? ? ? ? ? ? blkcg->use_hierarchy = 0;
> +
> ? ? ? ?return &blkcg->css;
> ?}
>
> diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
> index ea4861b..5b4b351 100644
> --- a/block/blk-cgroup.h
> +++ b/block/blk-cgroup.h
> @@ -90,6 +90,7 @@ enum blkcg_file_name_prop {
> ? ? ? ?BLKIO_PROP_idle_time,
> ? ? ? ?BLKIO_PROP_empty_time,
> ? ? ? ?BLKIO_PROP_dequeue,
> + ? ? ? BLKIO_PROP_use_hierarchy,
> ?};
>
> ?/* cgroup files owned by throttle policy */
> @@ -105,6 +106,7 @@ enum blkcg_file_name_throtl {
> ?struct blkio_cgroup {
> ? ? ? ?struct cgroup_subsys_state css;
> ? ? ? ?unsigned int weight;
> + ? ? ? bool use_hierarchy;
> ? ? ? ?spinlock_t lock;
> ? ? ? ?struct hlist_head blkg_list;
> ? ? ? ?struct list_head policy_list; /* list of blkio_policy_node */
> @@ -179,6 +181,7 @@ struct blkio_policy_node {
>
> ?extern unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg,
> ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? dev_t dev);
> +extern unsigned int blkcg_get_use_hierarchy(struct blkio_cgroup *blkcg);
> ?extern uint64_t blkcg_get_read_bps(struct blkio_cgroup *blkcg,
> ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? dev_t dev);
> ?extern uint64_t blkcg_get_write_bps(struct blkio_cgroup *blkcg,
> diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
> index aa3eda8..0e21d27 100644
> --- a/block/cfq-iosched.c
> +++ b/block/cfq-iosched.c
> @@ -110,6 +110,9 @@ struct cfq_entity {
> ? ? ? ?u64 vdisktime;
> ? ? ? ?bool is_group_entity;
> ? ? ? ?unsigned int weight;
> + ? ? ? struct cfq_entity *parent;
> + ? ? ? /* Reposition time */
> + ? ? ? unsigned long reposition_time;
> ?};
>
> ?/*
> @@ -118,8 +121,6 @@ struct cfq_entity {
> ?struct cfq_queue {
> ? ? ? ?/* The schedule entity */
> ? ? ? ?struct cfq_entity cfqe;
> - ? ? ? /* Reposition time */
> - ? ? ? unsigned long reposition_time;
> ? ? ? ?/* reference count */
> ? ? ? ?int ref;
> ? ? ? ?/* various state flags, see below */
> @@ -199,6 +200,9 @@ struct cfq_group {
> ? ? ? ?/* number of cfqq currently on this group */
> ? ? ? ?int nr_cfqq;
>
> + ? ? ? /* number of sub cfq groups */
> + ? ? ? int nr_subgp;
> +
> ? ? ? ?/*
> ? ? ? ? * Per group busy queus average. Useful for workload slice calc. We
> ? ? ? ? * create the array for each prio class but at run time it is used
> @@ -234,10 +238,11 @@ struct cfq_group {
> ?*/
> ?struct cfq_data {
> ? ? ? ?struct request_queue *queue;
> - ? ? ? /* Root service tree for cfq_groups */
> - ? ? ? struct cfq_rb_root grp_service_tree;
> ? ? ? ?struct cfq_group root_group;
>
> + ? ? ? /* cfq group schedule in flat or hierarchy manner. */
> + ? ? ? bool use_hierarchy;
> +
> ? ? ? ?/*
> ? ? ? ? * The priority currently being served
> ? ? ? ? */
> @@ -246,6 +251,9 @@ struct cfq_data {
> ? ? ? ?unsigned long workload_expires;
> ? ? ? ?struct cfq_group *serving_group;
>
> + ? ? ? /* Service tree for cfq group flat scheduling mode. */
> + ? ? ? struct cfq_rb_root grp_service_tree;
> +
> ? ? ? ?/*
> ? ? ? ? * Each priority tree is sorted by next_request position. ?These
> ? ? ? ? * trees are used when determining if two or more queues are
> @@ -355,8 +363,6 @@ cfqg_of_entity(struct cfq_entity *cfqe)
> ?}
>
>
> -static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd);
> -
> ?static struct cfq_rb_root *service_tree_for(struct cfq_group *cfqg,
> ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?enum wl_prio_t prio,
> ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?enum wl_type_t type)
> @@ -643,13 +649,50 @@ static inline unsigned cfq_group_get_avg_queues(struct cfq_data *cfqd,
> ? ? ? ?return cfqg->busy_queues_avg[rt];
> ?}
>
> +static inline unsigned int
> +cfq_group_get_total_weight(struct cfq_group *cfqg)
> +{
> + ? ? ? int i, j;
> + ? ? ? struct cfq_rb_root *st;
> + ? ? ? unsigned int total_weight = 0;
> +
> + ? ? ? for_each_cfqg_st(cfqg, i, j, st) {
> + ? ? ? ? ? ? ? total_weight += st->total_weight;
> + ? ? ? }
> +
> + ? ? ? return total_weight;
> +}
> +
> ?static inline unsigned
> ?cfq_group_slice(struct cfq_data *cfqd, struct cfq_group *cfqg)
> ?{
> - ? ? ? struct cfq_rb_root *st = &cfqd->grp_service_tree;
> ? ? ? ?struct cfq_entity *cfqe = &cfqg->cfqe;
> + ? ? ? struct cfq_rb_root *st;
> + ? ? ? int group_slice = cfq_target_latency;
> + ? ? ? unsigned int grp_total_weight;
> + ? ? ? struct cfq_group *p_cfqg;
> +
> + ? ? ? /*
> + ? ? ? ?* Calculate group slice in a hierarchical way.
> + ? ? ? ?* Note, the calculation is cross all service trees under a group.
> + ? ? ? ?*/
> + ? ? ? do {
> + ? ? ? ? ? ? ? if (cfqe->parent) {
> + ? ? ? ? ? ? ? ? ? ? ? p_cfqg = cfqg_of_entity(cfqe->parent);
> + ? ? ? ? ? ? ? ? ? ? ? grp_total_weight = cfq_group_get_total_weight(p_cfqg);
> + ? ? ? ? ? ? ? ? ? ? ? group_slice = group_slice * cfqe->weight /
> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? grp_total_weight;
> + ? ? ? ? ? ? ? } else {
> + ? ? ? ? ? ? ? ? ? ? ? /* For top level groups */
> + ? ? ? ? ? ? ? ? ? ? ? st = cfqe->service_tree;
> + ? ? ? ? ? ? ? ? ? ? ? group_slice = group_slice * cfqe->weight /
> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? st->total_weight;
> + ? ? ? ? ? ? ? }
>
> - ? ? ? return cfq_target_latency * cfqe->weight / st->total_weight;
> + ? ? ? ? ? ? ? cfqe = cfqe->parent;
> + ? ? ? } while (cfqe);
> +
> + ? ? ? return group_slice;
> ?}
>
> ?static inline void
> @@ -672,7 +715,8 @@ cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
> ? ? ? ? ? ? ? ? ? ? ? ?/* scale low_slice according to IO priority
> ? ? ? ? ? ? ? ? ? ? ? ? * and sync vs async */
> ? ? ? ? ? ? ? ? ? ? ? ?unsigned low_slice =
> - ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? min(slice, base_low_slice * slice / sync_slice);
> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? min(slice, base_low_slice * slice /
> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? sync_slice);
> ? ? ? ? ? ? ? ? ? ? ? ?/* the adapted slice value is scaled to fit all iqs
> ? ? ? ? ? ? ? ? ? ? ? ? * into the target latency */
> ? ? ? ? ? ? ? ? ? ? ? ?slice = max(slice * group_slice / expect_latency,
> @@ -812,17 +856,6 @@ static struct cfq_entity *cfq_rb_first(struct cfq_rb_root *root)
> ? ? ? ?return NULL;
> ?}
>
> -static struct cfq_entity *cfq_rb_first_entity(struct cfq_rb_root *root)
> -{
> - ? ? ? if (!root->left)
> - ? ? ? ? ? ? ? root->left = rb_first(&root->rb);
> -
> - ? ? ? if (root->left)
> - ? ? ? ? ? ? ? return rb_entry_entity(root->left);
> -
> - ? ? ? return NULL;
> -}
> -
> ?static void rb_erase_init(struct rb_node *n, struct rb_root *root)
> ?{
> ? ? ? ?rb_erase(n, root);
> @@ -896,12 +929,15 @@ __cfq_entity_service_tree_add(struct cfq_rb_root *st, struct cfq_entity *cfqe)
>
> ? ? ? ?rb_link_node(&cfqe->rb_node, parent, node);
> ? ? ? ?rb_insert_color(&cfqe->rb_node, &st->rb);
> +
> + ? ? ? update_min_vdisktime(st);
> ?}
>
> ?static void
> ?cfq_entity_service_tree_add(struct cfq_rb_root *st, struct cfq_entity *cfqe)
> ?{
> ? ? ? ?__cfq_entity_service_tree_add(st, cfqe);
> + ? ? ? cfqe->reposition_time = jiffies;
> ? ? ? ?st->count++;
> ? ? ? ?st->total_weight += cfqe->weight;
> ?}
> @@ -909,34 +945,52 @@ cfq_entity_service_tree_add(struct cfq_rb_root *st, struct cfq_entity *cfqe)
> ?static void
> ?cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
> ?{
> - ? ? ? struct cfq_rb_root *st = &cfqd->grp_service_tree;
> ? ? ? ?struct cfq_entity *cfqe = &cfqg->cfqe;
> - ? ? ? struct cfq_entity *__cfqe;
> ? ? ? ?struct rb_node *n;
> + ? ? ? struct cfq_entity *entity;
> + ? ? ? struct cfq_rb_root *st;
> + ? ? ? struct cfq_group *__cfqg;
>
> ? ? ? ?cfqg->nr_cfqq++;
> +
> ? ? ? ?if (!RB_EMPTY_NODE(&cfqe->rb_node))
> ? ? ? ? ? ? ? ?return;
>
> ? ? ? ?/*
> - ? ? ? ?* Currently put the group at the end. Later implement something
> - ? ? ? ?* so that groups get lesser vtime based on their weights, so that
> - ? ? ? ?* if group does not loose all if it was not continously backlogged.
> + ? ? ? ?* Enqueue this group and its ancestors onto their service tree.
> ? ? ? ? */
> - ? ? ? n = rb_last(&st->rb);
> - ? ? ? if (n) {
> - ? ? ? ? ? ? ? __cfqe = rb_entry_entity(n);
> - ? ? ? ? ? ? ? cfqe->vdisktime = __cfqe->vdisktime + CFQ_IDLE_DELAY;
> - ? ? ? } else
> - ? ? ? ? ? ? ? cfqe->vdisktime = st->min_vdisktime;
> + ? ? ? while (cfqe) {
> + ? ? ? ? ? ? ? if (!RB_EMPTY_NODE(&cfqe->rb_node))
> + ? ? ? ? ? ? ? ? ? ? ? return;
>
> - ? ? ? cfq_entity_service_tree_add(st, cfqe);
> + ? ? ? ? ? ? ? /*
> + ? ? ? ? ? ? ? ?* Currently put the group at the end. Later implement
> + ? ? ? ? ? ? ? ?* something so that groups get lesser vtime based on
> + ? ? ? ? ? ? ? ?* their weights, so that if group does not loose all
> + ? ? ? ? ? ? ? ?* if it was not continously backlogged.
> + ? ? ? ? ? ? ? ?*/
> + ? ? ? ? ? ? ? st = cfqe->service_tree;
> + ? ? ? ? ? ? ? n = rb_last(&st->rb);
> + ? ? ? ? ? ? ? if (n) {
> + ? ? ? ? ? ? ? ? ? ? ? entity = rb_entry_entity(n);
> + ? ? ? ? ? ? ? ? ? ? ? cfqe->vdisktime = entity->vdisktime +
> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? CFQ_IDLE_DELAY;
> + ? ? ? ? ? ? ? } else
> + ? ? ? ? ? ? ? ? ? ? ? cfqe->vdisktime = st->min_vdisktime;
> +
> + ? ? ? ? ? ? ? cfq_entity_service_tree_add(st, cfqe);
> + ? ? ? ? ? ? ? cfqe = cfqe->parent;
> + ? ? ? ? ? ? ? __cfqg = cfqg_of_entity(cfqe);
> + ? ? ? ? ? ? ? if (__cfqg)
> + ? ? ? ? ? ? ? ? ? ? ? __cfqg->nr_subgp++;
> + ? ? ? }
> ?}
>
> ?static void
> ?__cfq_entity_service_tree_del(struct cfq_rb_root *st, struct cfq_entity *cfqe)
> ?{
> ? ? ? ?cfq_rb_erase(&cfqe->rb_node, st);
> + ? ? ? update_min_vdisktime(st);
> ?}
>
> ?static void
> @@ -945,27 +999,43 @@ cfq_entity_service_tree_del(struct cfq_rb_root *st, struct cfq_entity *cfqe)
> ? ? ? ?if (!RB_EMPTY_NODE(&cfqe->rb_node)) {
> ? ? ? ? ? ? ? ?__cfq_entity_service_tree_del(st, cfqe);
> ? ? ? ? ? ? ? ?st->total_weight -= cfqe->weight;
> - ? ? ? ? ? ? ? cfqe->service_tree = NULL;
> ? ? ? ?}
> ?}
>
> ?static void
> ?cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg)
> ?{
> - ? ? ? struct cfq_rb_root *st = &cfqd->grp_service_tree;
> ? ? ? ?struct cfq_entity *cfqe = &cfqg->cfqe;
> + ? ? ? struct cfq_group *__cfqg, *p_cfqg;
>
> ? ? ? ?BUG_ON(cfqg->nr_cfqq < 1);
> ? ? ? ?cfqg->nr_cfqq--;
>
> - ? ? ? /* If there are other cfq queues under this group, don't delete it */
> - ? ? ? if (cfqg->nr_cfqq)
> + ? ? ? /*
> + ? ? ? ?* If there are other cfq queues under this group, or there are other
> + ? ? ? ?* cfq groups under this group, don't delete it.
> + ? ? ? ?*/
> + ? ? ? if (cfqg->nr_cfqq || cfqg->nr_subgp)
> ? ? ? ? ? ? ? ?return;
>
> - ? ? ? cfq_log_cfqg(cfqd, cfqg, "del_from_rr group");
> - ? ? ? cfq_entity_service_tree_del(st, cfqe);
> - ? ? ? cfqg->saved_workload_slice = 0;
> - ? ? ? cfq_blkiocg_update_dequeue_stats(&cfqg->blkg, 1);
> + ? ? ? /*
> + ? ? ? ?* Dequeue this group and its ancestors from their service
> + ? ? ? ?* tree.
> + ? ? ? ?*/
> + ? ? ? while (cfqe) {
> + ? ? ? ? ? ? ? __cfqg = cfqg_of_entity(cfqe);
> + ? ? ? ? ? ? ? p_cfqg = cfqg_of_entity(cfqe->parent);
> + ? ? ? ? ? ? ? cfq_entity_service_tree_del(cfqe->service_tree, cfqe);
> + ? ? ? ? ? ? ? cfq_blkiocg_update_dequeue_stats(&__cfqg->blkg, 1);
> + ? ? ? ? ? ? ? cfq_log_cfqg(cfqd, __cfqg, "del_from_rr group");
> + ? ? ? ? ? ? ? __cfqg->saved_workload_slice = 0;
> + ? ? ? ? ? ? ? cfqe = cfqe->parent;
> + ? ? ? ? ? ? ? if (p_cfqg) {
> + ? ? ? ? ? ? ? ? ? ? ? p_cfqg->nr_subgp--;
> + ? ? ? ? ? ? ? ? ? ? ? if (p_cfqg->nr_cfqq || p_cfqg->nr_subgp)
> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? return;
> + ? ? ? ? ? ? ? }
> + ? ? ? }
> ?}
>
> ?static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq)
> @@ -997,7 +1067,6 @@ static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq)
> ?static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
> ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?struct cfq_queue *cfqq)
> ?{
> - ? ? ? struct cfq_rb_root *st = &cfqd->grp_service_tree;
> ? ? ? ?unsigned int used_sl, charge;
> ? ? ? ?int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg)
> ? ? ? ? ? ? ? ? ? ? ? ?- cfqg->service_tree_idle.count;
> @@ -1011,10 +1080,23 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
> ? ? ? ?else if (!cfq_cfqq_sync(cfqq) && !nr_sync)
> ? ? ? ? ? ? ? ?charge = cfqq->allocated_slice;
>
> - ? ? ? /* Can't update vdisktime while group is on service tree */
> - ? ? ? __cfq_entity_service_tree_del(st, cfqe);
> - ? ? ? cfqe->vdisktime += cfq_scale_slice(charge, cfqe);
> - ? ? ? __cfq_entity_service_tree_add(st, cfqe);
> + ? ? ? /*
> + ? ? ? ?* Update the vdisktime on the whole chain.
> + ? ? ? ?*/
> + ? ? ? while (cfqe) {
> + ? ? ? ? ? ? ? struct cfq_rb_root *st = cfqe->service_tree;
> +
> + ? ? ? ? ? ? ? /*
> + ? ? ? ? ? ? ? ?* Can't update vdisktime while group is on service
> + ? ? ? ? ? ? ? ?* tree.
> + ? ? ? ? ? ? ? ?*/
> + ? ? ? ? ? ? ? __cfq_entity_service_tree_del(st, cfqe);
> + ? ? ? ? ? ? ? cfqe->vdisktime += cfq_scale_slice(charge, cfqe);
> + ? ? ? ? ? ? ? __cfq_entity_service_tree_add(st, cfqe);
> + ? ? ? ? ? ? ? st->count++;
> + ? ? ? ? ? ? ? cfqe->reposition_time = jiffies;
> + ? ? ? ? ? ? ? cfqe = cfqe->parent;
> + ? ? ? }
>
> ? ? ? ?/* This group is being expired. Save the context */
> ? ? ? ?if (time_after(cfqd->workload_expires, jiffies)) {
> @@ -1026,7 +1108,8 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
> ? ? ? ? ? ? ? ?cfqg->saved_workload_slice = 0;
>
> ? ? ? ?cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu",
> - ? ? ? ? ? ? ? ? ? ?cfqe->vdisktime, st->min_vdisktime);
> + ? ? ? ? ? ? ? ? ? ?cfqg->cfqe.vdisktime,
> + ? ? ? ? ? ? ? ? ? ?cfqg->cfqe.service_tree->min_vdisktime);
> ? ? ? ?cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u disp=%u charge=%u iops=%u"
> ? ? ? ? ? ? ? ? ? ? ? ?" sect=%u", used_sl, cfqq->slice_dispatch, charge,
> ? ? ? ? ? ? ? ? ? ? ? ?iops_mode(cfqd), cfqq->nr_sectors);
> @@ -1048,35 +1131,27 @@ void cfq_update_blkio_group_weight(void *key, struct blkio_group *blkg,
> ? ? ? ?cfqg_of_blkg(blkg)->cfqe.weight = weight;
> ?}
>
> -static struct cfq_group *
> -cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
> +static void init_cfqe(struct blkio_cgroup *blkcg,
> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? struct cfq_group *cfqg)
> +{
> + ? ? ? struct cfq_entity *cfqe = &cfqg->cfqe;
> +
> + ? ? ? cfqe->weight = blkcg_get_weight(blkcg, cfqg->blkg.dev);
> + ? ? ? RB_CLEAR_NODE(&cfqe->rb_node);
> + ? ? ? cfqe->is_group_entity = true;
> + ? ? ? cfqe->parent = NULL;
> +}
> +
> +static void init_cfqg(struct cfq_data *cfqd, struct blkio_cgroup *blkcg,
> + ? ? ? ? ? ? ? ? ? ? struct cfq_group *cfqg)
> ?{
> - ? ? ? struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
> - ? ? ? struct cfq_group *cfqg = NULL;
> - ? ? ? void *key = cfqd;
> ? ? ? ?int i, j;
> ? ? ? ?struct cfq_rb_root *st;
> - ? ? ? struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
> ? ? ? ?unsigned int major, minor;
> -
> - ? ? ? cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
> - ? ? ? if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
> - ? ? ? ? ? ? ? sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
> - ? ? ? ? ? ? ? cfqg->blkg.dev = MKDEV(major, minor);
> - ? ? ? ? ? ? ? goto done;
> - ? ? ? }
> - ? ? ? if (cfqg || !create)
> - ? ? ? ? ? ? ? goto done;
> -
> - ? ? ? cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC, cfqd->queue->node);
> - ? ? ? if (!cfqg)
> - ? ? ? ? ? ? ? goto done;
> + ? ? ? struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
>
> ? ? ? ?for_each_cfqg_st(cfqg, i, j, st)
> ? ? ? ? ? ? ? ?*st = CFQ_RB_ROOT;
> - ? ? ? RB_CLEAR_NODE(&cfqg->cfqe.rb_node);
> -
> - ? ? ? cfqg->cfqe.is_group_entity = true;
>
> ? ? ? ?/*
> ? ? ? ? * Take the initial reference that will be released on destroy
> @@ -1086,24 +1161,199 @@ cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
> ? ? ? ? */
> ? ? ? ?cfqg->ref = 1;
>
> + ? ? ? /* Add group onto cgroup list */
> + ? ? ? sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
> + ? ? ? cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd,
> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? MKDEV(major, minor));
> + ? ? ? /* Initiate group entity */
> + ? ? ? init_cfqe(blkcg, cfqg);
> + ? ? ? /* Add group on cfqd list */
> + ? ? ? hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list);
> +}
> +
> +static void cfq_destroy_cfqg(struct cfq_data *cfqd, struct cfq_group *cfqg);
> +
> +static void uninit_cfqg(struct cfq_data *cfqd, struct cfq_group *cfqg)
> +{
> + ? ? ? if (!cfq_blkiocg_del_blkio_group(&cfqg->blkg))
> + ? ? ? ? ? ? ? cfq_destroy_cfqg(cfqd, cfqg);
> +}
> +
> +static void cfqg_set_parent(struct cfq_data *cfqd, struct cfq_group *cfqg,
> + ? ? ? ? ? ? ? ? ? ? ? ? ? struct cfq_group *p_cfqg)
> +{
> + ? ? ? struct cfq_entity *cfqe, *p_cfqe;
> +
> + ? ? ? cfqe = &cfqg->cfqe;
> +
> ? ? ? ?/*
> - ? ? ? ?* Add group onto cgroup list. It might happen that bdi->dev is
> - ? ? ? ?* not initiliazed yet. Initialize this new group without major
> - ? ? ? ?* and minor info and this info will be filled in once a new thread
> - ? ? ? ?* comes for IO. See code above.
> + ? ? ? ?* 1. If use_hierarchy of the CGroup where cfqg's parent stays is not
> + ? ? ? ?* ? ?set, we put this cfqg onto global service tree.
> + ? ? ? ?* 2. If cfqg is root cfqg, put it onto global service tree.
> ? ? ? ? */
> - ? ? ? if (bdi->dev) {
> - ? ? ? ? ? ? ? sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
> - ? ? ? ? ? ? ? cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd,
> - ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? MKDEV(major, minor));
> - ? ? ? } else
> - ? ? ? ? ? ? ? cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd,
> - ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? 0);
> + ? ? ? if (!p_cfqg) {
> + ? ? ? ? ? ? ? cfqe->service_tree = &cfqd->grp_service_tree;
> + ? ? ? ? ? ? ? cfqe->parent = NULL;
> + ? ? ? ? ? ? ? return;
> + ? ? ? }
>
> - ? ? ? cfqg->cfqe.weight = blkcg_get_weight(blkcg, cfqg->blkg.dev);
> + ? ? ? p_cfqe = &p_cfqg->cfqe;
>
> - ? ? ? /* Add group on cfqd list */
> - ? ? ? hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list);
> + ? ? ? cfqe->parent = p_cfqe;
> +
> + ? ? ? /*
> + ? ? ? ?* Currently, just put cfq group entity on "BE:SYNC" workload
> + ? ? ? ?* service tree.
> + ? ? ? ?*/
> + ? ? ? cfqe->service_tree = service_tree_for(p_cfqg, BE_WORKLOAD,
> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? SYNC_WORKLOAD);
> + ? ? ? /* child reference */
> + ? ? ? p_cfqg->ref++;
> +}
> +
> +static struct cfq_group *cfqg_get_parent(struct cfq_group * cfqg)
> +{
> + ? ? ? struct cfq_entity *cfqe, *p_cfqe;
> +
> + ? ? ? if (!cfqg)
> + ? ? ? ? ? ? ? return NULL;
> +
> + ? ? ? cfqe = &cfqg->cfqe;
> + ? ? ? p_cfqe = cfqe->parent;
> + ? ? ? if (!p_cfqe)
> + ? ? ? ? ? ? ? return NULL;
> +
> + ? ? ? return cfqg_of_entity(p_cfqe);
> +}
> +
> +static struct cfq_group *
> +cfqg_chain_alloc(struct cfq_data *cfqd, struct cgroup *cgroup)
> +{
> + ? ? ? struct blkio_cgroup *blkcg;
> + ? ? ? struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
> + ? ? ? unsigned int major, minor;
> + ? ? ? struct cfq_group *cfqg, *leaf_cfqg, *child_cfqg, *tmp_cfqg;
> + ? ? ? void *key = cfqd;
> +
> + ? ? ? /*
> + ? ? ? ?* If CGroup's use_hierarchy is unset, we just need to allocate only
> + ? ? ? ?* one CFQ group, and this group will put onto the "grp_service_tree".
> + ? ? ? ?* We don't need to check whether the cfqg exists, the caller has
> + ? ? ? ?* already checked it.
> + ? ? ? ?*/
> + ? ? ? blkcg = cgroup_to_blkio_cgroup(cgroup);
> + ? ? ? if (!blkcg_get_use_hierarchy(blkcg)) {
> + ? ? ? ? ? ? ? cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC,
> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? cfqd->queue->node);
> + ? ? ? ? ? ? ? if (!cfqg)
> + ? ? ? ? ? ? ? ? ? ? ? return NULL;
> +
> + ? ? ? ? ? ? ? init_cfqg(cfqd, blkcg, cfqg);
> + ? ? ? ? ? ? ? cfqg_set_parent(cfqd, cfqg, NULL);
> + ? ? ? ? ? ? ? return cfqg;
> + ? ? ? }
> +
> + ? ? ? /*
> + ? ? ? ?* Allocate the CFQ group chain until we meet the group we'v already
> + ? ? ? ?* allocated before, or to the CGroup whose use_hierarchy is not set.
> + ? ? ? ?*/
> + ? ? ? leaf_cfqg = NULL;
> + ? ? ? child_cfqg = NULL;
> + ? ? ? for (; cgroup != NULL; cgroup = cgroup->parent) {
> + ? ? ? ? ? ? ? blkcg = cgroup_to_blkio_cgroup(cgroup);
> + ? ? ? ? ? ? ? cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
> + ? ? ? ? ? ? ? if (cfqg) {
> + ? ? ? ? ? ? ? ? ? ? ? if (!cfqg->blkg.dev && bdi->dev &&
> + ? ? ? ? ? ? ? ? ? ? ? ? ? dev_name(bdi->dev)) {
> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? sscanf(dev_name(bdi->dev), "%u:%u",
> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?&major, &minor);
> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? cfqg->blkg.dev = MKDEV(major, minor);
> + ? ? ? ? ? ? ? ? ? ? ? }
> +
> + ? ? ? ? ? ? ? ? ? ? ? /*
> + ? ? ? ? ? ? ? ? ? ? ? ?* Initialization of parent doesn't finish yet, get
> + ? ? ? ? ? ? ? ? ? ? ? ?* it done.
> + ? ? ? ? ? ? ? ? ? ? ? ?*/
> + ? ? ? ? ? ? ? ? ? ? ? if (child_cfqg) {
> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? if (blkcg_get_use_hierarchy(blkcg))
> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? cfqg_set_parent(cfqd, child_cfqg,
> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? cfqg);
> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? else
> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? cfqg_set_parent(cfqd, child_cfqg,
> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? NULL);
> + ? ? ? ? ? ? ? ? ? ? ? }
> +
> + ? ? ? ? ? ? ? ? ? ? ? /* chain has already been built */
> + ? ? ? ? ? ? ? ? ? ? ? break;
> + ? ? ? ? ? ? ? }
> +
> + ? ? ? ? ? ? ? /*
> + ? ? ? ? ? ? ? ?* We only allocate a cfqg that the corresponding cgroup's
> + ? ? ? ? ? ? ? ?* use_hierarchy is set.
> + ? ? ? ? ? ? ? ?*/
> + ? ? ? ? ? ? ? if (blkcg_get_use_hierarchy(blkcg)) {
> + ? ? ? ? ? ? ? ? ? ? ? cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC,
> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? cfqd->queue->node);
> + ? ? ? ? ? ? ? ? ? ? ? if (!cfqg)
> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? goto clean_up;
> +
> + ? ? ? ? ? ? ? ? ? ? ? if (!leaf_cfqg)
> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? leaf_cfqg = cfqg;
> +
> + ? ? ? ? ? ? ? ? ? ? ? init_cfqg(cfqd, blkcg, cfqg);
> + ? ? ? ? ? ? ? } else {
> + ? ? ? ? ? ? ? ? ? ? ? cfqg = NULL;
> + ? ? ? ? ? ? ? }
> +
> + ? ? ? ? ? ? ? if (child_cfqg)
> + ? ? ? ? ? ? ? ? ? ? ? cfqg_set_parent(cfqd, child_cfqg, cfqg);
> +
> + ? ? ? ? ? ? ? /*
> + ? ? ? ? ? ? ? ?* This CGroup's use_hierarchy isn't set, this means the CFQ
> + ? ? ? ? ? ? ? ?* group chain has been built.
> + ? ? ? ? ? ? ? ?*/
> + ? ? ? ? ? ? ? if (!blkcg_get_use_hierarchy(blkcg))
> + ? ? ? ? ? ? ? ? ? ? ? break;
> +
> + ? ? ? ? ? ? ? child_cfqg = cfqg;
> + ? ? ? }
> +
> + ? ? ? return leaf_cfqg;
> +
> +clean_up:
> + ? ? ? /* clean up the allocated cfq groups. */
> + ? ? ? while (leaf_cfqg) {
> + ? ? ? ? ? ? ? tmp_cfqg = leaf_cfqg;
> + ? ? ? ? ? ? ? leaf_cfqg = cfqg_get_parent(leaf_cfqg);
> + ? ? ? ? ? ? ? uninit_cfqg(cfqd, tmp_cfqg);
> + ? ? ? }
> +
> + ? ? ? return NULL;
> +}
> +
> +static struct cfq_group *
> +cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
> +{
> + ? ? ? struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
> + ? ? ? struct cfq_group *cfqg = NULL;
> + ? ? ? void *key = cfqd;
> + ? ? ? struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
> + ? ? ? unsigned int major, minor;
> +
> + ? ? ? cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
> + ? ? ? if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
> + ? ? ? ? ? ? ? sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
> + ? ? ? ? ? ? ? cfqg->blkg.dev = MKDEV(major, minor);
> + ? ? ? ? ? ? ? goto done;
> + ? ? ? }
> + ? ? ? if (cfqg || !create)
> + ? ? ? ? ? ? ? goto done;
> +
> + ? ? ? /*
> + ? ? ? ?* Allocate CFQ group chain to the root group or we meet the CGroup
> + ? ? ? ?* with use_hierarchy disabled.
> + ? ? ? ?*/
> + ? ? ? cfqg = cfqg_chain_alloc(cfqd, cgroup);
>
> ?done:
> ? ? ? ?return cfqg;
> @@ -1148,6 +1398,7 @@ static void cfq_put_cfqg(struct cfq_group *cfqg)
> ?{
> ? ? ? ?struct cfq_rb_root *st;
> ? ? ? ?int i, j;
> + ? ? ? struct cfq_group *p_cfqg;
>
> ? ? ? ?BUG_ON(cfqg->ref <= 0);
> ? ? ? ?cfqg->ref--;
> @@ -1155,6 +1406,22 @@ static void cfq_put_cfqg(struct cfq_group *cfqg)
> ? ? ? ? ? ? ? ?return;
> ? ? ? ?for_each_cfqg_st(cfqg, i, j, st)
> ? ? ? ? ? ? ? ?BUG_ON(!RB_EMPTY_ROOT(&st->rb));
> +
> + ? ? ? do {
> + ? ? ? ? ? ? ? p_cfqg = cfqg_get_parent(cfqg);
> + ? ? ? ? ? ? ? kfree(cfqg);
> + ? ? ? ? ? ? ? cfqg = NULL;
> + ? ? ? ? ? ? ? /*
> + ? ? ? ? ? ? ? ?* Drop the reference taken by children, if nobody references
> + ? ? ? ? ? ? ? ?* parent group, we need delete the parent also.
> + ? ? ? ? ? ? ? ?*/
> + ? ? ? ? ? ? ? if (p_cfqg) {
> + ? ? ? ? ? ? ? ? ? ? ? p_cfqg->ref--;
> + ? ? ? ? ? ? ? ? ? ? ? if (p_cfqg->ref == 0)
> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? cfqg = p_cfqg;
> + ? ? ? ? ? ? ? }
> + ? ? ? } while (cfqg);
> +
> ? ? ? ?kfree(cfqg);
> ?}
>
> @@ -1321,9 +1588,6 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
> ? ? ? ? ? ? ? ? ? ? ? ? * ioprio.
> ? ? ? ? ? ? ? ? ? ? ? ? */
> ? ? ? ? ? ? ? ? ? ? ? ?pos_offset = cfq_get_boost(cfqd, cfqq);
> - ? ? ? ? ? ? ? ? ? ? ? /* Debug purpose, should remove. */
> - ? ? ? ? ? ? ? ? ? ? ? cfq_log_cfqq(cfqd, cfqq, "pos_offset: %llu",
> - ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?pos_offset);
> ? ? ? ? ? ? ? ? ? ? ? ?cfqe->vdisktime = service_tree->min_vdisktime +
> ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?pos_offset;
> ? ? ? ? ? ? ? ?} else
> @@ -1365,9 +1629,8 @@ insert:
> ? ? ? ?cfqe->service_tree = service_tree;
>
> ? ? ? ?/* Add cfqq onto service tree. */
> +
> ? ? ? ?cfq_entity_service_tree_add(service_tree, cfqe);
> - ? ? ? update_min_vdisktime(service_tree);
> - ? ? ? cfqq->reposition_time = jiffies;
> ? ? ? ?if ((add_front || !new_cfqq) && !group_changed)
> ? ? ? ? ? ? ? ?return;
> ? ? ? ?cfq_group_service_tree_add(cfqd, cfqq->cfqg);
> @@ -1810,28 +2073,43 @@ static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd)
> ? ? ? ?return cfqq_of_entity(cfq_rb_first(service_tree));
> ?}
>
> -static struct cfq_queue *cfq_get_next_queue_forced(struct cfq_data *cfqd)
> +struct cfq_rb_root *choose_service_tree_forced(struct cfq_group *cfqg)
> ?{
> - ? ? ? struct cfq_group *cfqg;
> - ? ? ? struct cfq_entity *cfqe;
> ? ? ? ?int i, j;
> ? ? ? ?struct cfq_rb_root *st;
>
> - ? ? ? if (!cfqd->rq_queued)
> - ? ? ? ? ? ? ? return NULL;
> + ? ? ? for_each_cfqg_st(cfqg, i, j, st) {
> + ? ? ? ? ? ? ? if (st->count != 0)
> + ? ? ? ? ? ? ? ? ? ? ? return st;
> + ? ? ? }
>
> - ? ? ? cfqg = cfq_get_next_cfqg(cfqd);
> - ? ? ? if (!cfqg)
> + ? ? ? return NULL;
> +}
> +
> +static struct cfq_entity *
> +cfq_get_next_entity_forced(struct cfq_data *cfqd)
> +{
> + ? ? ? struct cfq_entity *cfqe;
> + ? ? ? struct cfq_rb_root *st = &cfqd->grp_service_tree;
> + ? ? ? struct cfq_group *cfqg;
> +
> + ? ? ? if (!cfqd->rq_queued)
> ? ? ? ? ? ? ? ?return NULL;
>
> - ? ? ? for_each_cfqg_st(cfqg, i, j, st) {
> + ? ? ? do {
> ? ? ? ? ? ? ? ?cfqe = cfq_rb_first(st);
> - ? ? ? ? ? ? ? if (cfqe != NULL)
> - ? ? ? ? ? ? ? ? ? ? ? return cfqq_of_entity(cfqe);
> - ? ? ? }
> + ? ? ? ? ? ? ? if (cfqe && !cfqe->is_group_entity)
> + ? ? ? ? ? ? ? ? ? ? ? return cfqe;
> + ? ? ? ? ? ? ? else if (cfqe && cfqe->is_group_entity)
> + ? ? ? ? ? ? ? ? ? ? ? cfqg = cfqg_of_entity(cfqe);
> +
> + ? ? ? ? ? ? ? st = choose_service_tree_forced(cfqg);
> + ? ? ? } while (st);
> +
> ? ? ? ?return NULL;
> ?}
>
> +
> ?/*
> ?* Get and set a new active qu
>
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/
>

2011-02-17 01:22:07

by Gui, Jianfeng/归 剑峰

[permalink] [raw]
Subject: Re: [PATCH 5/6 v4] cfq-iosched: CFQ group hierarchical scheduling and use_hierarchy interface

Justin TerAvest wrote:
> After a quick read,
>
> It's sad that we have to have so many use_hierarchy checks; it seems
> like we're asking for bugs, especially in the future when one codepath
> gets updated but not the other.
>
> CodingStyle says we should only have one declaration per line.
>
> I feel like there is an implicit assumption that groups and tasks
> should not be children of the same parent; that is, a group should
> contain only groups, or only tasks, but I don't see this enforced;
> there's just an assumption that BE:SYNC is "good enough" for that
> comparison. This smells like something that will be tweaked/tuned for
> fairness later. :( Why don't we just prevent this from happening?

Hi Justin,

Thanks for reviewing.

Previously, I posted the very first version, which made a group contain only
groups or only tasks. But I think it's more flexible to treat groups and
tasks at the same level, and I think Vivek and Jens share that opinion.
We discussed this in the thread http://lkml.org/lkml/2010/8/30/30

>
> The clean_up label in chain_alloc() is strange; I don't think the goto
> is necessary at all. I found that method generally hard to understand.
> It's doing a lot.

I don't understand why clean_up isn't needed.
When we fail to allocate a cfq group at some level, we have to clean up
all groups in the chain that we have already allocated.

Thanks,
Gui

>
> It's possible that some of these can't be worked around.
>
>
> On Wed, Feb 9, 2011 at 11:47 PM, Gui Jianfeng
> <[email protected]> wrote:
>> CFQ group hierarchical scheduling and use_hierarchy interface.
>>
>> Signed-off-by: Gui Jianfeng <[email protected]>
>> ---
>> block/blk-cgroup.c | 61 +++++-
>> block/blk-cgroup.h | 3 +
>> block/cfq-iosched.c | 603 +++++++++++++++++++++++++++++++++++++--------------
>> 3 files changed, 500 insertions(+), 167 deletions(-)
>>
>> diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
>> index 455768a..c55fecd 100644
>> --- a/block/blk-cgroup.c
>> +++ b/block/blk-cgroup.c
>> @@ -25,7 +25,10 @@
>> static DEFINE_SPINLOCK(blkio_list_lock);
>> static LIST_HEAD(blkio_list);
>>
>> -struct blkio_cgroup blkio_root_cgroup = { .weight = 2*BLKIO_WEIGHT_DEFAULT };
>> +struct blkio_cgroup blkio_root_cgroup = {
>> + .weight = 2*BLKIO_WEIGHT_DEFAULT,
>> + .use_hierarchy = 0
>> +};
>> EXPORT_SYMBOL_GPL(blkio_root_cgroup);
>>
>> static struct cgroup_subsys_state *blkiocg_create(struct cgroup_subsys *,
>> @@ -454,6 +457,7 @@ static void __blkiocg_del_blkio_group(struct blkio_group *blkg)
>> blkg->blkcg_id = 0;
>> }
>>
>> +
>> /*
>> * returns 0 if blkio_group was still on cgroup list. Otherwise returns 1
>> * indicating that blk_group was unhashed by the time we got to it.
>> @@ -765,6 +769,12 @@ unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg,
>> }
>> EXPORT_SYMBOL_GPL(blkcg_get_weight);
>>
>> +unsigned int blkcg_get_use_hierarchy(struct blkio_cgroup *blkcg)
>> +{
>> + return blkcg->use_hierarchy;
>> +}
>> +EXPORT_SYMBOL_GPL(blkcg_get_use_hierarchy);
>> +
>> uint64_t blkcg_get_read_bps(struct blkio_cgroup *blkcg, dev_t dev)
>> {
>> struct blkio_policy_node *pn;
>> @@ -1202,6 +1212,8 @@ static u64 blkiocg_file_read_u64 (struct cgroup *cgrp, struct cftype *cft) {
>> switch(name) {
>> case BLKIO_PROP_weight:
>> return (u64)blkcg->weight;
>> + case BLKIO_PROP_use_hierarchy:
>> + return (u64)blkcg->use_hierarchy;
>> }
>> break;
>> default:
>> @@ -1210,6 +1222,36 @@ static u64 blkiocg_file_read_u64 (struct cgroup *cgrp, struct cftype *cft) {
>> return 0;
>> }
>>
>> +static int blkio_use_hierarchy_write(struct cgroup *cgrp, u64 val)
>> +{
>> + struct cgroup *parent = cgrp->parent;
>> + struct blkio_cgroup *blkcg, *parent_blkcg = NULL;
>> + int ret = 0;
>> +
>> + if (val != 0 && val != 1)
>> + return -EINVAL;
>> +
>> + blkcg = cgroup_to_blkio_cgroup(cgrp);
>> + if (parent)
>> + parent_blkcg = cgroup_to_blkio_cgroup(parent);
>> +
>> + cgroup_lock();
>> + /*
>> + * If parent's use_hierarchy is set, we can't make any modifications
>> + * in the child subtrees. If it is unset, then the change can occur,
>> + * provided the current cgroup has no children.
>> + */
>> + if (!parent_blkcg || !parent_blkcg->use_hierarchy) {
>> + if (list_empty(&cgrp->children))
>> + blkcg->use_hierarchy = val;
>> + else
>> + ret = -EBUSY;
>> + } else
>> + ret = -EINVAL;
>> + cgroup_unlock();
>> + return ret;
>> +}
>> +
>> static int
>> blkiocg_file_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
>> {
>> @@ -1224,6 +1266,8 @@ blkiocg_file_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
>> switch(name) {
>> case BLKIO_PROP_weight:
>> return blkio_weight_write(blkcg, val);
>> + case BLKIO_PROP_use_hierarchy:
>> + return blkio_use_hierarchy_write(cgrp, val);
>> }
>> break;
>> default:
>> @@ -1301,6 +1345,13 @@ struct cftype blkio_files[] = {
>> .name = "reset_stats",
>> .write_u64 = blkiocg_reset_stats,
>> },
>> + {
>> + .name = "use_hierarchy",
>> + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
>> + BLKIO_PROP_use_hierarchy),
>> + .read_u64 = blkiocg_file_read_u64,
>> + .write_u64 = blkiocg_file_write_u64,
>> + },
>> #ifdef CONFIG_BLK_DEV_THROTTLING
>> {
>> .name = "throttle.read_bps_device",
>> @@ -1444,7 +1495,7 @@ static void blkiocg_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup)
>> static struct cgroup_subsys_state *
>> blkiocg_create(struct cgroup_subsys *subsys, struct cgroup *cgroup)
>> {
>> - struct blkio_cgroup *blkcg;
>> + struct blkio_cgroup *blkcg, *parent_blkcg = NULL;
>> struct cgroup *parent = cgroup->parent;
>>
>> if (!parent) {
>> @@ -1452,6 +1503,7 @@ blkiocg_create(struct cgroup_subsys *subsys, struct cgroup *cgroup)
>> goto done;
>> }
>>
>> + parent_blkcg = cgroup_to_blkio_cgroup(parent);
>> blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
>> if (!blkcg)
>> return ERR_PTR(-ENOMEM);
>> @@ -1462,6 +1514,11 @@ done:
>> INIT_HLIST_HEAD(&blkcg->blkg_list);
>>
>> INIT_LIST_HEAD(&blkcg->policy_list);
>> + if (parent)
>> + blkcg->use_hierarchy = parent_blkcg->use_hierarchy;
>> + else
>> + blkcg->use_hierarchy = 0;
>> +
>> return &blkcg->css;
>> }
>>
>> diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
>> index ea4861b..5b4b351 100644
>> --- a/block/blk-cgroup.h
>> +++ b/block/blk-cgroup.h
>> @@ -90,6 +90,7 @@ enum blkcg_file_name_prop {
>> BLKIO_PROP_idle_time,
>> BLKIO_PROP_empty_time,
>> BLKIO_PROP_dequeue,
>> + BLKIO_PROP_use_hierarchy,
>> };
>>
>> /* cgroup files owned by throttle policy */
>> @@ -105,6 +106,7 @@ enum blkcg_file_name_throtl {
>> struct blkio_cgroup {
>> struct cgroup_subsys_state css;
>> unsigned int weight;
>> + bool use_hierarchy;
>> spinlock_t lock;
>> struct hlist_head blkg_list;
>> struct list_head policy_list; /* list of blkio_policy_node */
>> @@ -179,6 +181,7 @@ struct blkio_policy_node {
>>
>> extern unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg,
>> dev_t dev);
>> +extern unsigned int blkcg_get_use_hierarchy(struct blkio_cgroup *blkcg);
>> extern uint64_t blkcg_get_read_bps(struct blkio_cgroup *blkcg,
>> dev_t dev);
>> extern uint64_t blkcg_get_write_bps(struct blkio_cgroup *blkcg,
>> diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
>> index aa3eda8..0e21d27 100644
>> --- a/block/cfq-iosched.c
>> +++ b/block/cfq-iosched.c
>> @@ -110,6 +110,9 @@ struct cfq_entity {
>> u64 vdisktime;
>> bool is_group_entity;
>> unsigned int weight;
>> + struct cfq_entity *parent;
>> + /* Reposition time */
>> + unsigned long reposition_time;
>> };
>>
>> /*
>> @@ -118,8 +121,6 @@ struct cfq_entity {
>> struct cfq_queue {
>> /* The schedule entity */
>> struct cfq_entity cfqe;
>> - /* Reposition time */
>> - unsigned long reposition_time;
>> /* reference count */
>> int ref;
>> /* various state flags, see below */
>> @@ -199,6 +200,9 @@ struct cfq_group {
>> /* number of cfqq currently on this group */
>> int nr_cfqq;
>>
>> + /* number of sub cfq groups */
>> + int nr_subgp;
>> +
>> /*
>> * Per group busy queus average. Useful for workload slice calc. We
>> * create the array for each prio class but at run time it is used
>> @@ -234,10 +238,11 @@ struct cfq_group {
>> */
>> struct cfq_data {
>> struct request_queue *queue;
>> - /* Root service tree for cfq_groups */
>> - struct cfq_rb_root grp_service_tree;
>> struct cfq_group root_group;
>>
>> + /* cfq group schedule in flat or hierarchy manner. */
>> + bool use_hierarchy;
>> +
>> /*
>> * The priority currently being served
>> */
>> @@ -246,6 +251,9 @@ struct cfq_data {
>> unsigned long workload_expires;
>> struct cfq_group *serving_group;
>>
>> + /* Service tree for cfq group flat scheduling mode. */
>> + struct cfq_rb_root grp_service_tree;
>> +
>> /*
>> * Each priority tree is sorted by next_request position. These
>> * trees are used when determining if two or more queues are
>> @@ -355,8 +363,6 @@ cfqg_of_entity(struct cfq_entity *cfqe)
>> }
>>
>>
>> -static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd);
>> -
>> static struct cfq_rb_root *service_tree_for(struct cfq_group *cfqg,
>> enum wl_prio_t prio,
>> enum wl_type_t type)
>> @@ -643,13 +649,50 @@ static inline unsigned cfq_group_get_avg_queues(struct cfq_data *cfqd,
>> return cfqg->busy_queues_avg[rt];
>> }
>>
>> +static inline unsigned int
>> +cfq_group_get_total_weight(struct cfq_group *cfqg)
>> +{
>> + int i, j;
>> + struct cfq_rb_root *st;
>> + unsigned int total_weight = 0;
>> +
>> + for_each_cfqg_st(cfqg, i, j, st) {
>> + total_weight += st->total_weight;
>> + }
>> +
>> + return total_weight;
>> +}
>> +
>> static inline unsigned
>> cfq_group_slice(struct cfq_data *cfqd, struct cfq_group *cfqg)
>> {
>> - struct cfq_rb_root *st = &cfqd->grp_service_tree;
>> struct cfq_entity *cfqe = &cfqg->cfqe;
>> + struct cfq_rb_root *st;
>> + int group_slice = cfq_target_latency;
>> + unsigned int grp_total_weight;
>> + struct cfq_group *p_cfqg;
>> +
>> + /*
>> + * Calculate group slice in a hierarchical way.
>> + * Note, the calculation is cross all service trees under a group.
>> + */
>> + do {
>> + if (cfqe->parent) {
>> + p_cfqg = cfqg_of_entity(cfqe->parent);
>> + grp_total_weight = cfq_group_get_total_weight(p_cfqg);
>> + group_slice = group_slice * cfqe->weight /
>> + grp_total_weight;
>> + } else {
>> + /* For top level groups */
>> + st = cfqe->service_tree;
>> + group_slice = group_slice * cfqe->weight /
>> + st->total_weight;
>> + }
>>
>> - return cfq_target_latency * cfqe->weight / st->total_weight;
>> + cfqe = cfqe->parent;
>> + } while (cfqe);
>> +
>> + return group_slice;
>> }
>>
>> static inline void
>> @@ -672,7 +715,8 @@ cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
>> /* scale low_slice according to IO priority
>> * and sync vs async */
>> unsigned low_slice =
>> - min(slice, base_low_slice * slice / sync_slice);
>> + min(slice, base_low_slice * slice /
>> + sync_slice);
>> /* the adapted slice value is scaled to fit all iqs
>> * into the target latency */
>> slice = max(slice * group_slice / expect_latency,
>> @@ -812,17 +856,6 @@ static struct cfq_entity *cfq_rb_first(struct cfq_rb_root *root)
>> return NULL;
>> }
>>
>> -static struct cfq_entity *cfq_rb_first_entity(struct cfq_rb_root *root)
>> -{
>> - if (!root->left)
>> - root->left = rb_first(&root->rb);
>> -
>> - if (root->left)
>> - return rb_entry_entity(root->left);
>> -
>> - return NULL;
>> -}
>> -
>> static void rb_erase_init(struct rb_node *n, struct rb_root *root)
>> {
>> rb_erase(n, root);
>> @@ -896,12 +929,15 @@ __cfq_entity_service_tree_add(struct cfq_rb_root *st, struct cfq_entity *cfqe)
>>
>> rb_link_node(&cfqe->rb_node, parent, node);
>> rb_insert_color(&cfqe->rb_node, &st->rb);
>> +
>> + update_min_vdisktime(st);
>> }
>>
>> static void
>> cfq_entity_service_tree_add(struct cfq_rb_root *st, struct cfq_entity *cfqe)
>> {
>> __cfq_entity_service_tree_add(st, cfqe);
>> + cfqe->reposition_time = jiffies;
>> st->count++;
>> st->total_weight += cfqe->weight;
>> }
>> @@ -909,34 +945,52 @@ cfq_entity_service_tree_add(struct cfq_rb_root *st, struct cfq_entity *cfqe)
>> static void
>> cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
>> {
>> - struct cfq_rb_root *st = &cfqd->grp_service_tree;
>> struct cfq_entity *cfqe = &cfqg->cfqe;
>> - struct cfq_entity *__cfqe;
>> struct rb_node *n;
>> + struct cfq_entity *entity;
>> + struct cfq_rb_root *st;
>> + struct cfq_group *__cfqg;
>>
>> cfqg->nr_cfqq++;
>> +
>> if (!RB_EMPTY_NODE(&cfqe->rb_node))
>> return;
>>
>> /*
>> - * Currently put the group at the end. Later implement something
>> - * so that groups get lesser vtime based on their weights, so that
>> - * if group does not loose all if it was not continously backlogged.
>> + * Enqueue this group and its ancestors onto their service tree.
>> */
>> - n = rb_last(&st->rb);
>> - if (n) {
>> - __cfqe = rb_entry_entity(n);
>> - cfqe->vdisktime = __cfqe->vdisktime + CFQ_IDLE_DELAY;
>> - } else
>> - cfqe->vdisktime = st->min_vdisktime;
>> + while (cfqe) {
>> + if (!RB_EMPTY_NODE(&cfqe->rb_node))
>> + return;
>>
>> - cfq_entity_service_tree_add(st, cfqe);
>> + /*
>> + * Currently put the group at the end. Later implement
>> + * something so that groups get lesser vtime based on
>> + * their weights, so that a group does not lose all
>> + * if it was not continuously backlogged.
>> + */
>> + st = cfqe->service_tree;
>> + n = rb_last(&st->rb);
>> + if (n) {
>> + entity = rb_entry_entity(n);
>> + cfqe->vdisktime = entity->vdisktime +
>> + CFQ_IDLE_DELAY;
>> + } else
>> + cfqe->vdisktime = st->min_vdisktime;
>> +
>> + cfq_entity_service_tree_add(st, cfqe);
>> + cfqe = cfqe->parent;
>> + __cfqg = cfqg_of_entity(cfqe);
>> + if (__cfqg)
>> + __cfqg->nr_subgp++;
>> + }
>> }
>>
>> static void
>> __cfq_entity_service_tree_del(struct cfq_rb_root *st, struct cfq_entity *cfqe)
>> {
>> cfq_rb_erase(&cfqe->rb_node, st);
>> + update_min_vdisktime(st);
>> }
>>
>> static void
>> @@ -945,27 +999,43 @@ cfq_entity_service_tree_del(struct cfq_rb_root *st, struct cfq_entity *cfqe)
>> if (!RB_EMPTY_NODE(&cfqe->rb_node)) {
>> __cfq_entity_service_tree_del(st, cfqe);
>> st->total_weight -= cfqe->weight;
>> - cfqe->service_tree = NULL;
>> }
>> }
>>
>> static void
>> cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg)
>> {
>> - struct cfq_rb_root *st = &cfqd->grp_service_tree;
>> struct cfq_entity *cfqe = &cfqg->cfqe;
>> + struct cfq_group *__cfqg, *p_cfqg;
>>
>> BUG_ON(cfqg->nr_cfqq < 1);
>> cfqg->nr_cfqq--;
>>
>> - /* If there are other cfq queues under this group, don't delete it */
>> - if (cfqg->nr_cfqq)
>> + /*
>> + * If there are other cfq queues under this group, or there are other
>> + * cfq groups under this group, don't delete it.
>> + */
>> + if (cfqg->nr_cfqq || cfqg->nr_subgp)
>> return;
>>
>> - cfq_log_cfqg(cfqd, cfqg, "del_from_rr group");
>> - cfq_entity_service_tree_del(st, cfqe);
>> - cfqg->saved_workload_slice = 0;
>> - cfq_blkiocg_update_dequeue_stats(&cfqg->blkg, 1);
>> + /*
>> + * Dequeue this group and its ancestors from their service
>> + * tree.
>> + */
>> + while (cfqe) {
>> + __cfqg = cfqg_of_entity(cfqe);
>> + p_cfqg = cfqg_of_entity(cfqe->parent);
>> + cfq_entity_service_tree_del(cfqe->service_tree, cfqe);
>> + cfq_blkiocg_update_dequeue_stats(&__cfqg->blkg, 1);
>> + cfq_log_cfqg(cfqd, __cfqg, "del_from_rr group");
>> + __cfqg->saved_workload_slice = 0;
>> + cfqe = cfqe->parent;
>> + if (p_cfqg) {
>> + p_cfqg->nr_subgp--;
>> + if (p_cfqg->nr_cfqq || p_cfqg->nr_subgp)
>> + return;
>> + }
>> + }
>> }
>>
>> static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq)
>> @@ -997,7 +1067,6 @@ static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq)
>> static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
>> struct cfq_queue *cfqq)
>> {
>> - struct cfq_rb_root *st = &cfqd->grp_service_tree;
>> unsigned int used_sl, charge;
>> int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg)
>> - cfqg->service_tree_idle.count;
>> @@ -1011,10 +1080,23 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
>> else if (!cfq_cfqq_sync(cfqq) && !nr_sync)
>> charge = cfqq->allocated_slice;
>>
>> - /* Can't update vdisktime while group is on service tree */
>> - __cfq_entity_service_tree_del(st, cfqe);
>> - cfqe->vdisktime += cfq_scale_slice(charge, cfqe);
>> - __cfq_entity_service_tree_add(st, cfqe);
>> + /*
>> + * Update the vdisktime on the whole chain.
>> + */
>> + while (cfqe) {
>> + struct cfq_rb_root *st = cfqe->service_tree;
>> +
>> + /*
>> + * Can't update vdisktime while group is on service
>> + * tree.
>> + */
>> + __cfq_entity_service_tree_del(st, cfqe);
>> + cfqe->vdisktime += cfq_scale_slice(charge, cfqe);
>> + __cfq_entity_service_tree_add(st, cfqe);
>> + st->count++;
>> + cfqe->reposition_time = jiffies;
>> + cfqe = cfqe->parent;
>> + }
>>
>> /* This group is being expired. Save the context */
>> if (time_after(cfqd->workload_expires, jiffies)) {
>> @@ -1026,7 +1108,8 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
>> cfqg->saved_workload_slice = 0;
>>
>> cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu",
>> - cfqe->vdisktime, st->min_vdisktime);
>> + cfqg->cfqe.vdisktime,
>> + cfqg->cfqe.service_tree->min_vdisktime);
>> cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u disp=%u charge=%u iops=%u"
>> " sect=%u", used_sl, cfqq->slice_dispatch, charge,
>> iops_mode(cfqd), cfqq->nr_sectors);
>> @@ -1048,35 +1131,27 @@ void cfq_update_blkio_group_weight(void *key, struct blkio_group *blkg,
>> cfqg_of_blkg(blkg)->cfqe.weight = weight;
>> }
>>
>> -static struct cfq_group *
>> -cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
>> +static void init_cfqe(struct blkio_cgroup *blkcg,
>> + struct cfq_group *cfqg)
>> +{
>> + struct cfq_entity *cfqe = &cfqg->cfqe;
>> +
>> + cfqe->weight = blkcg_get_weight(blkcg, cfqg->blkg.dev);
>> + RB_CLEAR_NODE(&cfqe->rb_node);
>> + cfqe->is_group_entity = true;
>> + cfqe->parent = NULL;
>> +}
>> +
>> +static void init_cfqg(struct cfq_data *cfqd, struct blkio_cgroup *blkcg,
>> + struct cfq_group *cfqg)
>> {
>> - struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
>> - struct cfq_group *cfqg = NULL;
>> - void *key = cfqd;
>> int i, j;
>> struct cfq_rb_root *st;
>> - struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
>> unsigned int major, minor;
>> -
>> - cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
>> - if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
>> - sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
>> - cfqg->blkg.dev = MKDEV(major, minor);
>> - goto done;
>> - }
>> - if (cfqg || !create)
>> - goto done;
>> -
>> - cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC, cfqd->queue->node);
>> - if (!cfqg)
>> - goto done;
>> + struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
>>
>> for_each_cfqg_st(cfqg, i, j, st)
>> *st = CFQ_RB_ROOT;
>> - RB_CLEAR_NODE(&cfqg->cfqe.rb_node);
>> -
>> - cfqg->cfqe.is_group_entity = true;
>>
>> /*
>> * Take the initial reference that will be released on destroy
>> @@ -1086,24 +1161,199 @@ cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
>> */
>> cfqg->ref = 1;
>>
>> + /* Add group onto cgroup list */
>> + sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
>> + cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd,
>> + MKDEV(major, minor));
>> + /* Initiate group entity */
>> + init_cfqe(blkcg, cfqg);
>> + /* Add group on cfqd list */
>> + hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list);
>> +}
>> +
>> +static void cfq_destroy_cfqg(struct cfq_data *cfqd, struct cfq_group *cfqg);
>> +
>> +static void uninit_cfqg(struct cfq_data *cfqd, struct cfq_group *cfqg)
>> +{
>> + if (!cfq_blkiocg_del_blkio_group(&cfqg->blkg))
>> + cfq_destroy_cfqg(cfqd, cfqg);
>> +}
>> +
>> +static void cfqg_set_parent(struct cfq_data *cfqd, struct cfq_group *cfqg,
>> + struct cfq_group *p_cfqg)
>> +{
>> + struct cfq_entity *cfqe, *p_cfqe;
>> +
>> + cfqe = &cfqg->cfqe;
>> +
>> /*
>> - * Add group onto cgroup list. It might happen that bdi->dev is
>> - * not initiliazed yet. Initialize this new group without major
>> - * and minor info and this info will be filled in once a new thread
>> - * comes for IO. See code above.
>> + * 1. If use_hierarchy of the CGroup where cfqg's parent stays is not
>> + * set, we put this cfqg onto global service tree.
>> + * 2. If cfqg is root cfqg, put it onto global service tree.
>> */
>> - if (bdi->dev) {
>> - sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
>> - cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd,
>> - MKDEV(major, minor));
>> - } else
>> - cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd,
>> - 0);
>> + if (!p_cfqg) {
>> + cfqe->service_tree = &cfqd->grp_service_tree;
>> + cfqe->parent = NULL;
>> + return;
>> + }
>>
>> - cfqg->cfqe.weight = blkcg_get_weight(blkcg, cfqg->blkg.dev);
>> + p_cfqe = &p_cfqg->cfqe;
>>
>> - /* Add group on cfqd list */
>> - hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list);
>> + cfqe->parent = p_cfqe;
>> +
>> + /*
>> + * Currently, just put cfq group entity on "BE:SYNC" workload
>> + * service tree.
>> + */
>> + cfqe->service_tree = service_tree_for(p_cfqg, BE_WORKLOAD,
>> + SYNC_WORKLOAD);
>> + /* child reference */
>> + p_cfqg->ref++;
>> +}
>> +
>> +static struct cfq_group *cfqg_get_parent(struct cfq_group * cfqg)
>> +{
>> + struct cfq_entity *cfqe, *p_cfqe;
>> +
>> + if (!cfqg)
>> + return NULL;
>> +
>> + cfqe = &cfqg->cfqe;
>> + p_cfqe = cfqe->parent;
>> + if (!p_cfqe)
>> + return NULL;
>> +
>> + return cfqg_of_entity(p_cfqe);
>> +}
>> +
>> +static struct cfq_group *
>> +cfqg_chain_alloc(struct cfq_data *cfqd, struct cgroup *cgroup)
>> +{
>> + struct blkio_cgroup *blkcg;
>> + struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
>> + unsigned int major, minor;
>> + struct cfq_group *cfqg, *leaf_cfqg, *child_cfqg, *tmp_cfqg;
>> + void *key = cfqd;
>> +
>> + /*
>> + * If CGroup's use_hierarchy is unset, we just need to allocate only
>> + * one CFQ group, and this group will put onto the "grp_service_tree".
>> + * We don't need to check whether the cfqg exists, the caller has
>> + * already checked it.
>> + */
>> + blkcg = cgroup_to_blkio_cgroup(cgroup);
>> + if (!blkcg_get_use_hierarchy(blkcg)) {
>> + cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC,
>> + cfqd->queue->node);
>> + if (!cfqg)
>> + return NULL;
>> +
>> + init_cfqg(cfqd, blkcg, cfqg);
>> + cfqg_set_parent(cfqd, cfqg, NULL);
>> + return cfqg;
>> + }
>> +
>> + /*
>> + * Allocate the CFQ group chain until we meet the group we've already
>> + * allocated before, or to the CGroup whose use_hierarchy is not set.
>> + */
>> + leaf_cfqg = NULL;
>> + child_cfqg = NULL;
>> + for (; cgroup != NULL; cgroup = cgroup->parent) {
>> + blkcg = cgroup_to_blkio_cgroup(cgroup);
>> + cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
>> + if (cfqg) {
>> + if (!cfqg->blkg.dev && bdi->dev &&
>> + dev_name(bdi->dev)) {
>> + sscanf(dev_name(bdi->dev), "%u:%u",
>> + &major, &minor);
>> + cfqg->blkg.dev = MKDEV(major, minor);
>> + }
>> +
>> + /*
>> + * Initialization of parent doesn't finish yet, get
>> + * it done.
>> + */
>> + if (child_cfqg) {
>> + if (blkcg_get_use_hierarchy(blkcg))
>> + cfqg_set_parent(cfqd, child_cfqg,
>> + cfqg);
>> + else
>> + cfqg_set_parent(cfqd, child_cfqg,
>> + NULL);
>> + }
>> +
>> + /* chain has already been built */
>> + break;
>> + }
>> +
>> + /*
>> + * We only allocate a cfqg that the corresponding cgroup's
>> + * use_hierarchy is set.
>> + */
>> + if (blkcg_get_use_hierarchy(blkcg)) {
>> + cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC,
>> + cfqd->queue->node);
>> + if (!cfqg)
>> + goto clean_up;
>> +
>> + if (!leaf_cfqg)
>> + leaf_cfqg = cfqg;
>> +
>> + init_cfqg(cfqd, blkcg, cfqg);
>> + } else {
>> + cfqg = NULL;
>> + }
>> +
>> + if (child_cfqg)
>> + cfqg_set_parent(cfqd, child_cfqg, cfqg);
>> +
>> + /*
>> + * This CGroup's use_hierarchy isn't set, this means the CFQ
>> + * group chain has been built.
>> + */
>> + if (!blkcg_get_use_hierarchy(blkcg))
>> + break;
>> +
>> + child_cfqg = cfqg;
>> + }
>> +
>> + return leaf_cfqg;
>> +
>> +clean_up:
>> + /* clean up the allocated cfq groups. */
>> + while (leaf_cfqg) {
>> + tmp_cfqg = leaf_cfqg;
>> + leaf_cfqg = cfqg_get_parent(leaf_cfqg);
>> + uninit_cfqg(cfqd, tmp_cfqg);
>> + }
>> +
>> + return NULL;
>> +}
>> +
>> +static struct cfq_group *
>> +cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
>> +{
>> + struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
>> + struct cfq_group *cfqg = NULL;
>> + void *key = cfqd;
>> + struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
>> + unsigned int major, minor;
>> +
>> + cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
>> + if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
>> + sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
>> + cfqg->blkg.dev = MKDEV(major, minor);
>> + goto done;
>> + }
>> + if (cfqg || !create)
>> + goto done;
>> +
>> + /*
>> + * Allocate CFQ group chain to the root group or we meet the CGroup
>> + * with use_hierarchy disabled.
>> + */
>> + cfqg = cfqg_chain_alloc(cfqd, cgroup);
>>
>> done:
>> return cfqg;
>> @@ -1148,6 +1398,7 @@ static void cfq_put_cfqg(struct cfq_group *cfqg)
>> {
>> struct cfq_rb_root *st;
>> int i, j;
>> + struct cfq_group *p_cfqg;
>>
>> BUG_ON(cfqg->ref <= 0);
>> cfqg->ref--;
>> @@ -1155,6 +1406,22 @@ static void cfq_put_cfqg(struct cfq_group *cfqg)
>> return;
>> for_each_cfqg_st(cfqg, i, j, st)
>> BUG_ON(!RB_EMPTY_ROOT(&st->rb));
>> +
>> + do {
>> + p_cfqg = cfqg_get_parent(cfqg);
>> + kfree(cfqg);
>> + cfqg = NULL;
>> + /*
>> + * Drop the reference taken by children, if nobody references
>> + * parent group, we need delete the parent also.
>> + */
>> + if (p_cfqg) {
>> + p_cfqg->ref--;
>> + if (p_cfqg->ref == 0)
>> + cfqg = p_cfqg;
>> + }
>> + } while (cfqg);
>> +
>> kfree(cfqg);
>> }
>>
>> @@ -1321,9 +1588,6 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
>> * ioprio.
>> */
>> pos_offset = cfq_get_boost(cfqd, cfqq);
>> - /* Debug purpose, should remove. */
>> - cfq_log_cfqq(cfqd, cfqq, "pos_offset: %llu",
>> - pos_offset);
>> cfqe->vdisktime = service_tree->min_vdisktime +
>> pos_offset;
>> } else
>> @@ -1365,9 +1629,8 @@ insert:
>> cfqe->service_tree = service_tree;
>>
>> /* Add cfqq onto service tree. */
>> +
>> cfq_entity_service_tree_add(service_tree, cfqe);
>> - update_min_vdisktime(service_tree);
>> - cfqq->reposition_time = jiffies;
>> if ((add_front || !new_cfqq) && !group_changed)
>> return;
>> cfq_group_service_tree_add(cfqd, cfqq->cfqg);
>> @@ -1810,28 +2073,43 @@ static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd)
>> return cfqq_of_entity(cfq_rb_first(service_tree));
>> }
>>
>> -static struct cfq_queue *cfq_get_next_queue_forced(struct cfq_data *cfqd)
>> +struct cfq_rb_root *choose_service_tree_forced(struct cfq_group *cfqg)
>> {
>> - struct cfq_group *cfqg;
>> - struct cfq_entity *cfqe;
>> int i, j;
>> struct cfq_rb_root *st;
>>
>> - if (!cfqd->rq_queued)
>> - return NULL;
>> + for_each_cfqg_st(cfqg, i, j, st) {
>> + if (st->count != 0)
>> + return st;
>> + }
>>
>> - cfqg = cfq_get_next_cfqg(cfqd);
>> - if (!cfqg)
>> + return NULL;
>> +}
>> +
>> +static struct cfq_entity *
>> +cfq_get_next_entity_forced(struct cfq_data *cfqd)
>> +{
>> + struct cfq_entity *cfqe;
>> + struct cfq_rb_root *st = &cfqd->grp_service_tree;
>> + struct cfq_group *cfqg;
>> +
>> + if (!cfqd->rq_queued)
>> return NULL;
>>
>> - for_each_cfqg_st(cfqg, i, j, st) {
>> + do {
>> cfqe = cfq_rb_first(st);
>> - if (cfqe != NULL)
>> - return cfqq_of_entity(cfqe);
>> - }
>> + if (cfqe && !cfqe->is_group_entity)
>> + return cfqe;
>> + else if (cfqe && cfqe->is_group_entity)
>> + cfqg = cfqg_of_entity(cfqe);
>> +
>> + st = choose_service_tree_forced(cfqg);
>> + } while (st);
>> +
>> return NULL;
>> }
>>
>> +
>> /*
>> * Get and set a new active qu
>>
>>
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
>> the body of a message to [email protected]
>> More majordomo info at http://vger.kernel.org/majordomo-info.html
>> Please read the FAQ at http://www.tux.org/lkml/
>>
>

--
Regards
Gui Jianfeng

2011-02-17 01:22:53

by Gui, Jianfeng/归 剑峰

[permalink] [raw]
Subject: Re: [PATCH 5/6 v4] cfq-iosched: CFQ group hierarchical scheduling and use_hierarchy interface

Vivek Goyal wrote:
> On Wed, Feb 16, 2011 at 09:44:39AM +0800, Gui Jianfeng wrote:
>> Vivek Goyal wrote:
>>> On Tue, Feb 15, 2011 at 10:38:32AM +0800, Gui Jianfeng wrote:
>>>> Vivek Goyal wrote:
>>>>> On Sat, Feb 12, 2011 at 10:21:47AM +0800, Gui Jianfeng wrote:
>>>>> [..]
>>>>>>>> +static struct cfq_group *
>>>>>>>> +cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
>>>>>>>> +{
>>>>>>>> + struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
>>>>>>>> + struct cfq_group *cfqg = NULL;
>>>>>>>> + void *key = cfqd;
>>>>>>>> + struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
>>>>>>>> + unsigned int major, minor;
>>>>>>>> +
>>>>>>>> + cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
>>>>>>>> + if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
>>>>>>>> + sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
>>>>>>>> + cfqg->blkg.dev = MKDEV(major, minor);
>>>>>>>> + goto done;
>>>>>>>> + }
>>>>>>> Should we make this update of this info hierarchical?
>>>>>> IMHO, it's fine to defer the update until we really get the cfqg.
>>>>> But if cfqg is alrady present, we will never hit the allocation path
>>>>> again. So if somebody creates 2-3 level deep hierarchy and does IO
>>>>> only in the children cgroup, parent cgroups will potentially not get
>>>>> device info updated hence no visible stats?
>>>> Ahh, I see your concern. But do we really need to show the stats even if
>>>> a cgroup doesn't issue any IO on a given device?
>>> I am assuming that once use_hierarchy=1, you are aggregating the stats
>>> in parent cgroups? So if a child services 5 IOs, these are accounted
>>> to parent group also when user_hier=1?
>>>
>>> What happens in case of memoy cgroup controller?
>> Hmm, it seems memcg aggregates stats in the parent group.
>> But do we really need to do that in the kernel? I think it's easier to do it in
>> userland, and it makes the kernel much simpler.
>
> I think at some point of time hierarchical aggregated stats will also be
> required. I am also looking at "memory.stat" file of meomory controller
> and they seem to be reporting both aggregated as well as individual group
> stats.
>
> So we can probably skip implementing hierarchical stats in this patchset
> and implement them on an as-needed basis in the future.

Ok, I agree.

Thanks,
Gui

>
> Thanks
> Vivek
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/
>

2011-02-17 10:39:39

by Alan

[permalink] [raw]
Subject: Re: [PATCH 5/6 v4] cfq-iosched: CFQ group hierarchical scheduling and use_hierarchy interface

> CodingStyle says we should only have one declaration per line.

CodingStyle is a guide not a religious document at over which common
sense deviations should be sacrificed

2011-02-17 17:36:43

by Justin TerAvest

[permalink] [raw]
Subject: Re: [PATCH 5/6 v4] cfq-iosched: CFQ group hierarchical scheduling and use_hierarchy interface

On Wed, Feb 16, 2011 at 5:21 PM, Gui Jianfeng
<[email protected]> wrote:
> Justin TerAvest wrote:
>> After a quick read,
>>
>> It's sad that we have to have so many use_hierarchy checks; it seems
>> like we're asking for bugs, especially in the future when one codepath
>> gets updated but not the other.
>>
>> CodingStyle says we should only have one declaration per line.
>>
>> I feel like there is an implicit assumption that groups and tasks
>> should not be children of the same parent; that is, a group should
>> contain only groups, or only tasks, but I don't see this enforced;
>> there's just an assumption that BE:SYNC is "good enough" for that
>> comparison. This smells like something that will be tweaked/tuned for
>> fairness later. :( Why don't we just prevent this from happening?
>
> Hi Justin,
>
> Thanks for reviewing.
>
> Previously, I posted very first version that makes a group containing only
> groups or only tasks. But I think it's more flexible to treat groups and
> tasks at the same level. I think Vivek and Jens have the same opinion.
> We had discussed in this thread http://lkml.org/lkml/2010/8/30/30

Hi Gui,
Thanks for pointing me at the earlier discussion, the decisions make a
lot more sense now.

>
>>
>> The clean_up label in chain_alloc() is strange; I don't think the goto
>> is necessary at all. I found that method generally hard to understand.
>> It's doing a lot.
>
> I don't understand why clean_up isn't needed.
> When we fail to allocate a cfq group at some level, we have to clean up
> all groups in the chain that we have already allocated.

Cleaning up is necessary, but the label is only used from one place.
Why add the goto and the label when the code below "clean_up" can just
be moved inside the condition
+ if (!cfqg)



Thanks,
Justin

>
> Thanks,
> Gui
>
>>
>> It's possible that some of these can't be worked around.
>>
>>
>> On Wed, Feb 9, 2011 at 11:47 PM, Gui Jianfeng
>> <[email protected]> wrote:
>>> CFQ group hierarchical scheduling and use_hierarchy interface.
>>>
>>> Signed-off-by: Gui Jianfeng <[email protected]>
>>> ---
>>>  block/blk-cgroup.c  |   61 +++++-
>>>  block/blk-cgroup.h  |    3 +
>>>  block/cfq-iosched.c |  603 +++++++++++++++++++++++++++++++++++++--------------
>>>  3 files changed, 500 insertions(+), 167 deletions(-)
>>>
>>> diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
>>> index 455768a..c55fecd 100644
>>> --- a/block/blk-cgroup.c
>>> +++ b/block/blk-cgroup.c
>>> @@ -25,7 +25,10 @@
>>> ?static DEFINE_SPINLOCK(blkio_list_lock);
>>> ?static LIST_HEAD(blkio_list);
>>>
>>> -struct blkio_cgroup blkio_root_cgroup = { .weight = 2*BLKIO_WEIGHT_DEFAULT };
>>> +struct blkio_cgroup blkio_root_cgroup = {
>>> +	.weight = 2*BLKIO_WEIGHT_DEFAULT,
>>> +	.use_hierarchy = 0
>>> +};
>>> ?EXPORT_SYMBOL_GPL(blkio_root_cgroup);
>>>
>>> ?static struct cgroup_subsys_state *blkiocg_create(struct cgroup_subsys *,
>>> @@ -454,6 +457,7 @@ static void __blkiocg_del_blkio_group(struct blkio_group *blkg)
>>> ? ? ? ?blkg->blkcg_id = 0;
>>> ?}
>>>
>>> +
>>> ?/*
>>> ?* returns 0 if blkio_group was still on cgroup list. Otherwise returns 1
>>> ?* indicating that blk_group was unhashed by the time we got to it.
>>> @@ -765,6 +769,12 @@ unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg,
>>> ?}
>>> ?EXPORT_SYMBOL_GPL(blkcg_get_weight);
>>>
>>> +unsigned int blkcg_get_use_hierarchy(struct blkio_cgroup *blkcg)
>>> +{
>>> +	return blkcg->use_hierarchy;
>>> +}
>>> +EXPORT_SYMBOL_GPL(blkcg_get_use_hierarchy);
>>> +
>>> ?uint64_t blkcg_get_read_bps(struct blkio_cgroup *blkcg, dev_t dev)
>>> ?{
>>> ? ? ? ?struct blkio_policy_node *pn;
>>> @@ -1202,6 +1212,8 @@ static u64 blkiocg_file_read_u64 (struct cgroup *cgrp, struct cftype *cft) {
>>> ? ? ? ? ? ? ? ?switch(name) {
>>> ? ? ? ? ? ? ? ?case BLKIO_PROP_weight:
>>> ? ? ? ? ? ? ? ? ? ? ? ?return (u64)blkcg->weight;
>>> + ? ? ? ? ? ? ? case BLKIO_PROP_use_hierarchy:
>>> + ? ? ? ? ? ? ? ? ? ? ? return (u64)blkcg->use_hierarchy;
>>> ? ? ? ? ? ? ? ?}
>>> ? ? ? ? ? ? ? ?break;
>>> ? ? ? ?default:
>>> @@ -1210,6 +1222,36 @@ static u64 blkiocg_file_read_u64 (struct cgroup *cgrp, struct cftype *cft) {
>>> ? ? ? ?return 0;
>>> ?}
>>>
>>> +static int blkio_use_hierarchy_write(struct cgroup *cgrp, u64 val)
>>> +{
>>> +	struct cgroup *parent = cgrp->parent;
>>> +	struct blkio_cgroup *blkcg, *parent_blkcg = NULL;
>>> +	int ret = 0;
>>> +
>>> +	if (val != 0 && val != 1)
>>> +		return -EINVAL;
>>> +
>>> +	blkcg = cgroup_to_blkio_cgroup(cgrp);
>>> +	if (parent)
>>> +		parent_blkcg = cgroup_to_blkio_cgroup(parent);
>>> +
>>> +	cgroup_lock();
>>> +	/*
>>> +	 * If parent's use_hierarchy is set, we can't make any modifications
>>> +	 * in the child subtrees. If it is unset, then the change can occur,
>>> +	 * provided the current cgroup has no children.
>>> +	 */
>>> +	if (!parent_blkcg || !parent_blkcg->use_hierarchy) {
>>> +		if (list_empty(&cgrp->children))
>>> +			blkcg->use_hierarchy = val;
>>> +		else
>>> +			ret = -EBUSY;
>>> +	} else
>>> +		ret = -EINVAL;
>>> +	cgroup_unlock();
>>> +	return ret;
>>> +}
>>> +
>>> ?static int
>>> ?blkiocg_file_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
>>> ?{
>>> @@ -1224,6 +1266,8 @@ blkiocg_file_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
>>> ? ? ? ? ? ? ? ?switch(name) {
>>> ? ? ? ? ? ? ? ?case BLKIO_PROP_weight:
>>> ? ? ? ? ? ? ? ? ? ? ? ?return blkio_weight_write(blkcg, val);
>>> + ? ? ? ? ? ? ? case BLKIO_PROP_use_hierarchy:
>>> + ? ? ? ? ? ? ? ? ? ? ? return blkio_use_hierarchy_write(cgrp, val);
>>> ? ? ? ? ? ? ? ?}
>>> ? ? ? ? ? ? ? ?break;
>>> ? ? ? ?default:
>>> @@ -1301,6 +1345,13 @@ struct cftype blkio_files[] = {
>>> ? ? ? ? ? ? ? ?.name = "reset_stats",
>>> ? ? ? ? ? ? ? ?.write_u64 = blkiocg_reset_stats,
>>> ? ? ? ?},
>>> + ? ? ? {
>>> + ? ? ? ? ? ? ? .name = "use_hierarchy",
>>> + ? ? ? ? ? ? ? .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
>>> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?BLKIO_PROP_use_hierarchy),
>>> + ? ? ? ? ? ? ? .read_u64 = blkiocg_file_read_u64,
>>> + ? ? ? ? ? ? ? .write_u64 = blkiocg_file_write_u64,
>>> + ? ? ? },
>>> ?#ifdef CONFIG_BLK_DEV_THROTTLING
>>> ? ? ? ?{
>>> ? ? ? ? ? ? ? ?.name = "throttle.read_bps_device",
>>> @@ -1444,7 +1495,7 @@ static void blkiocg_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup)
>>> ?static struct cgroup_subsys_state *
>>> ?blkiocg_create(struct cgroup_subsys *subsys, struct cgroup *cgroup)
>>> ?{
>>> - ? ? ? struct blkio_cgroup *blkcg;
>>> + ? ? ? struct blkio_cgroup *blkcg, *parent_blkcg = NULL;
>>> ? ? ? ?struct cgroup *parent = cgroup->parent;
>>>
>>> ? ? ? ?if (!parent) {
>>> @@ -1452,6 +1503,7 @@ blkiocg_create(struct cgroup_subsys *subsys, struct cgroup *cgroup)
>>> ? ? ? ? ? ? ? ?goto done;
>>> ? ? ? ?}
>>>
>>> + ? ? ? parent_blkcg = cgroup_to_blkio_cgroup(parent);
>>> ? ? ? ?blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
>>> ? ? ? ?if (!blkcg)
>>> ? ? ? ? ? ? ? ?return ERR_PTR(-ENOMEM);
>>> @@ -1462,6 +1514,11 @@ done:
>>> ? ? ? ?INIT_HLIST_HEAD(&blkcg->blkg_list);
>>>
>>> ? ? ? ?INIT_LIST_HEAD(&blkcg->policy_list);
>>> + ? ? ? if (parent)
>>> + ? ? ? ? ? ? ? blkcg->use_hierarchy = parent_blkcg->use_hierarchy;
>>> + ? ? ? else
>>> + ? ? ? ? ? ? ? blkcg->use_hierarchy = 0;
>>> +
>>> ? ? ? ?return &blkcg->css;
>>> ?}
>>>
>>> diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
>>> index ea4861b..5b4b351 100644
>>> --- a/block/blk-cgroup.h
>>> +++ b/block/blk-cgroup.h
>>> @@ -90,6 +90,7 @@ enum blkcg_file_name_prop {
>>> ? ? ? ?BLKIO_PROP_idle_time,
>>> ? ? ? ?BLKIO_PROP_empty_time,
>>> ? ? ? ?BLKIO_PROP_dequeue,
>>> + ? ? ? BLKIO_PROP_use_hierarchy,
>>> ?};
>>>
>>> ?/* cgroup files owned by throttle policy */
>>> @@ -105,6 +106,7 @@ enum blkcg_file_name_throtl {
>>> ?struct blkio_cgroup {
>>> ? ? ? ?struct cgroup_subsys_state css;
>>> ? ? ? ?unsigned int weight;
>>> + ? ? ? bool use_hierarchy;
>>> ? ? ? ?spinlock_t lock;
>>> ? ? ? ?struct hlist_head blkg_list;
>>> ? ? ? ?struct list_head policy_list; /* list of blkio_policy_node */
>>> @@ -179,6 +181,7 @@ struct blkio_policy_node {
>>>
>>> ?extern unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg,
>>> ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? dev_t dev);
>>> +extern unsigned int blkcg_get_use_hierarchy(struct blkio_cgroup *blkcg);
>>> ?extern uint64_t blkcg_get_read_bps(struct blkio_cgroup *blkcg,
>>> ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? dev_t dev);
>>> ?extern uint64_t blkcg_get_write_bps(struct blkio_cgroup *blkcg,
>>> diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
>>> index aa3eda8..0e21d27 100644
>>> --- a/block/cfq-iosched.c
>>> +++ b/block/cfq-iosched.c
>>> @@ -110,6 +110,9 @@ struct cfq_entity {
>>> ? ? ? ?u64 vdisktime;
>>> ? ? ? ?bool is_group_entity;
>>> ? ? ? ?unsigned int weight;
>>> + ? ? ? struct cfq_entity *parent;
>>> + ? ? ? /* Reposition time */
>>> + ? ? ? unsigned long reposition_time;
>>> ?};
>>>
>>> ?/*
>>> @@ -118,8 +121,6 @@ struct cfq_entity {
>>> ?struct cfq_queue {
>>> ? ? ? ?/* The schedule entity */
>>> ? ? ? ?struct cfq_entity cfqe;
>>> - ? ? ? /* Reposition time */
>>> - ? ? ? unsigned long reposition_time;
>>> ? ? ? ?/* reference count */
>>> ? ? ? ?int ref;
>>> ? ? ? ?/* various state flags, see below */
>>> @@ -199,6 +200,9 @@ struct cfq_group {
>>> ? ? ? ?/* number of cfqq currently on this group */
>>> ? ? ? ?int nr_cfqq;
>>>
>>> + ? ? ? /* number of sub cfq groups */
>>> + ? ? ? int nr_subgp;
>>> +
>>> ? ? ? ?/*
>>> ? ? ? ? * Per group busy queus average. Useful for workload slice calc. We
>>> ? ? ? ? * create the array for each prio class but at run time it is used
>>> @@ -234,10 +238,11 @@ struct cfq_group {
>>> ?*/
>>> ?struct cfq_data {
>>> ? ? ? ?struct request_queue *queue;
>>> - ? ? ? /* Root service tree for cfq_groups */
>>> - ? ? ? struct cfq_rb_root grp_service_tree;
>>> ? ? ? ?struct cfq_group root_group;
>>>
>>> + ? ? ? /* cfq group schedule in flat or hierarchy manner. */
>>> + ? ? ? bool use_hierarchy;
>>> +
>>> ? ? ? ?/*
>>> ? ? ? ? * The priority currently being served
>>> ? ? ? ? */
>>> @@ -246,6 +251,9 @@ struct cfq_data {
>>> ? ? ? ?unsigned long workload_expires;
>>> ? ? ? ?struct cfq_group *serving_group;
>>>
>>> + ? ? ? /* Service tree for cfq group flat scheduling mode. */
>>> + ? ? ? struct cfq_rb_root grp_service_tree;
>>> +
>>> ? ? ? ?/*
>>> ? ? ? ? * Each priority tree is sorted by next_request position. ?These
>>> ? ? ? ? * trees are used when determining if two or more queues are
>>> @@ -355,8 +363,6 @@ cfqg_of_entity(struct cfq_entity *cfqe)
>>> ?}
>>>
>>>
>>> -static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd);
>>> -
>>> ?static struct cfq_rb_root *service_tree_for(struct cfq_group *cfqg,
>>> ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?enum wl_prio_t prio,
>>> ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?enum wl_type_t type)
>>> @@ -643,13 +649,50 @@ static inline unsigned cfq_group_get_avg_queues(struct cfq_data *cfqd,
>>> ? ? ? ?return cfqg->busy_queues_avg[rt];
>>> ?}
>>>
>>> +static inline unsigned int
>>> +cfq_group_get_total_weight(struct cfq_group *cfqg)
>>> +{
>>> + ? ? ? int i, j;
>>> + ? ? ? struct cfq_rb_root *st;
>>> + ? ? ? unsigned int total_weight = 0;
>>> +
>>> + ? ? ? for_each_cfqg_st(cfqg, i, j, st) {
>>> + ? ? ? ? ? ? ? total_weight += st->total_weight;
>>> + ? ? ? }
>>> +
>>> + ? ? ? return total_weight;
>>> +}
>>> +
>>> ?static inline unsigned
>>> ?cfq_group_slice(struct cfq_data *cfqd, struct cfq_group *cfqg)
>>> ?{
>>> - ? ? ? struct cfq_rb_root *st = &cfqd->grp_service_tree;
>>> ? ? ? ?struct cfq_entity *cfqe = &cfqg->cfqe;
>>> + ? ? ? struct cfq_rb_root *st;
>>> + ? ? ? int group_slice = cfq_target_latency;
>>> + ? ? ? unsigned int grp_total_weight;
>>> + ? ? ? struct cfq_group *p_cfqg;
>>> +
>>> + ? ? ? /*
>>> + ? ? ? ?* Calculate group slice in a hierarchical way.
>>> + ? ? ? ?* Note, the calculation is cross all service trees under a group.
>>> + ? ? ? ?*/
>>> + ? ? ? do {
>>> + ? ? ? ? ? ? ? if (cfqe->parent) {
>>> + ? ? ? ? ? ? ? ? ? ? ? p_cfqg = cfqg_of_entity(cfqe->parent);
>>> + ? ? ? ? ? ? ? ? ? ? ? grp_total_weight = cfq_group_get_total_weight(p_cfqg);
>>> + ? ? ? ? ? ? ? ? ? ? ? group_slice = group_slice * cfqe->weight /
>>> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? grp_total_weight;
>>> + ? ? ? ? ? ? ? } else {
>>> + ? ? ? ? ? ? ? ? ? ? ? /* For top level groups */
>>> + ? ? ? ? ? ? ? ? ? ? ? st = cfqe->service_tree;
>>> + ? ? ? ? ? ? ? ? ? ? ? group_slice = group_slice * cfqe->weight /
>>> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? st->total_weight;
>>> + ? ? ? ? ? ? ? }
>>>
>>> - ? ? ? return cfq_target_latency * cfqe->weight / st->total_weight;
>>> + ? ? ? ? ? ? ? cfqe = cfqe->parent;
>>> + ? ? ? } while (cfqe);
>>> +
>>> + ? ? ? return group_slice;
>>> ?}
>>>
>>> ?static inline void
>>> @@ -672,7 +715,8 @@ cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
>>> ? ? ? ? ? ? ? ? ? ? ? ?/* scale low_slice according to IO priority
>>> ? ? ? ? ? ? ? ? ? ? ? ? * and sync vs async */
>>> ? ? ? ? ? ? ? ? ? ? ? ?unsigned low_slice =
>>> - ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? min(slice, base_low_slice * slice / sync_slice);
>>> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? min(slice, base_low_slice * slice /
>>> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? sync_slice);
>>> ? ? ? ? ? ? ? ? ? ? ? ?/* the adapted slice value is scaled to fit all iqs
>>> ? ? ? ? ? ? ? ? ? ? ? ? * into the target latency */
>>> ? ? ? ? ? ? ? ? ? ? ? ?slice = max(slice * group_slice / expect_latency,
>>> @@ -812,17 +856,6 @@ static struct cfq_entity *cfq_rb_first(struct cfq_rb_root *root)
>>> ? ? ? ?return NULL;
>>> ?}
>>>
>>> -static struct cfq_entity *cfq_rb_first_entity(struct cfq_rb_root *root)
>>> -{
>>> - ? ? ? if (!root->left)
>>> - ? ? ? ? ? ? ? root->left = rb_first(&root->rb);
>>> -
>>> - ? ? ? if (root->left)
>>> - ? ? ? ? ? ? ? return rb_entry_entity(root->left);
>>> -
>>> - ? ? ? return NULL;
>>> -}
>>> -
>>> ?static void rb_erase_init(struct rb_node *n, struct rb_root *root)
>>> ?{
>>> ? ? ? ?rb_erase(n, root);
>>> @@ -896,12 +929,15 @@ __cfq_entity_service_tree_add(struct cfq_rb_root *st, struct cfq_entity *cfqe)
>>>
>>> ? ? ? ?rb_link_node(&cfqe->rb_node, parent, node);
>>> ? ? ? ?rb_insert_color(&cfqe->rb_node, &st->rb);
>>> +
>>> + ? ? ? update_min_vdisktime(st);
>>> ?}
>>>
>>> ?static void
>>> ?cfq_entity_service_tree_add(struct cfq_rb_root *st, struct cfq_entity *cfqe)
>>> ?{
>>> ? ? ? ?__cfq_entity_service_tree_add(st, cfqe);
>>> + ? ? ? cfqe->reposition_time = jiffies;
>>> ? ? ? ?st->count++;
>>> ? ? ? ?st->total_weight += cfqe->weight;
>>> ?}
>>> @@ -909,34 +945,52 @@ cfq_entity_service_tree_add(struct cfq_rb_root *st, struct cfq_entity *cfqe)
>>> ?static void
>>> ?cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
>>> ?{
>>> - ? ? ? struct cfq_rb_root *st = &cfqd->grp_service_tree;
>>> ? ? ? ?struct cfq_entity *cfqe = &cfqg->cfqe;
>>> - ? ? ? struct cfq_entity *__cfqe;
>>> ? ? ? ?struct rb_node *n;
>>> + ? ? ? struct cfq_entity *entity;
>>> + ? ? ? struct cfq_rb_root *st;
>>> + ? ? ? struct cfq_group *__cfqg;
>>>
>>> ? ? ? ?cfqg->nr_cfqq++;
>>> +
>>> ? ? ? ?if (!RB_EMPTY_NODE(&cfqe->rb_node))
>>> ? ? ? ? ? ? ? ?return;
>>>
>>> ? ? ? ?/*
>>> - ? ? ? ?* Currently put the group at the end. Later implement something
>>> - ? ? ? ?* so that groups get lesser vtime based on their weights, so that
>>> - ? ? ? ?* if group does not loose all if it was not continously backlogged.
>>> + ? ? ? ?* Enqueue this group and its ancestors onto their service tree.
>>> ? ? ? ? */
>>> - ? ? ? n = rb_last(&st->rb);
>>> - ? ? ? if (n) {
>>> - ? ? ? ? ? ? ? __cfqe = rb_entry_entity(n);
>>> - ? ? ? ? ? ? ? cfqe->vdisktime = __cfqe->vdisktime + CFQ_IDLE_DELAY;
>>> - ? ? ? } else
>>> - ? ? ? ? ? ? ? cfqe->vdisktime = st->min_vdisktime;
>>> + ? ? ? while (cfqe) {
>>> + ? ? ? ? ? ? ? if (!RB_EMPTY_NODE(&cfqe->rb_node))
>>> + ? ? ? ? ? ? ? ? ? ? ? return;
>>>
>>> - ? ? ? cfq_entity_service_tree_add(st, cfqe);
>>> + ? ? ? ? ? ? ? /*
>>> + ? ? ? ? ? ? ? ?* Currently put the group at the end. Later implement
>>> + ? ? ? ? ? ? ? ?* something so that groups get lesser vtime based on
>>> + ? ? ? ? ? ? ? ?* their weights, so that if group does not loose all
>>> + ? ? ? ? ? ? ? ?* if it was not continously backlogged.
>>> + ? ? ? ? ? ? ? ?*/
>>> + ? ? ? ? ? ? ? st = cfqe->service_tree;
>>> + ? ? ? ? ? ? ? n = rb_last(&st->rb);
>>> + ? ? ? ? ? ? ? if (n) {
>>> + ? ? ? ? ? ? ? ? ? ? ? entity = rb_entry_entity(n);
>>> + ? ? ? ? ? ? ? ? ? ? ? cfqe->vdisktime = entity->vdisktime +
>>> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? CFQ_IDLE_DELAY;
>>> + ? ? ? ? ? ? ? } else
>>> + ? ? ? ? ? ? ? ? ? ? ? cfqe->vdisktime = st->min_vdisktime;
>>> +
>>> + ? ? ? ? ? ? ? cfq_entity_service_tree_add(st, cfqe);
>>> + ? ? ? ? ? ? ? cfqe = cfqe->parent;
>>> + ? ? ? ? ? ? ? __cfqg = cfqg_of_entity(cfqe);
>>> + ? ? ? ? ? ? ? if (__cfqg)
>>> + ? ? ? ? ? ? ? ? ? ? ? __cfqg->nr_subgp++;
>>> + ? ? ? }
>>> ?}
>>>
>>> ?static void
>>> ?__cfq_entity_service_tree_del(struct cfq_rb_root *st, struct cfq_entity *cfqe)
>>> ?{
>>> ? ? ? ?cfq_rb_erase(&cfqe->rb_node, st);
>>> + ? ? ? update_min_vdisktime(st);
>>> ?}
>>>
>>> ?static void
>>> @@ -945,27 +999,43 @@ cfq_entity_service_tree_del(struct cfq_rb_root *st, struct cfq_entity *cfqe)
>>> ? ? ? ?if (!RB_EMPTY_NODE(&cfqe->rb_node)) {
>>> ? ? ? ? ? ? ? ?__cfq_entity_service_tree_del(st, cfqe);
>>> ? ? ? ? ? ? ? ?st->total_weight -= cfqe->weight;
>>> - ? ? ? ? ? ? ? cfqe->service_tree = NULL;
>>> ? ? ? ?}
>>> ?}
>>>
>>> ?static void
>>> ?cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg)
>>> ?{
>>> - ? ? ? struct cfq_rb_root *st = &cfqd->grp_service_tree;
>>> ? ? ? ?struct cfq_entity *cfqe = &cfqg->cfqe;
>>> + ? ? ? struct cfq_group *__cfqg, *p_cfqg;
>>>
>>> ? ? ? ?BUG_ON(cfqg->nr_cfqq < 1);
>>> ? ? ? ?cfqg->nr_cfqq--;
>>>
>>> - ? ? ? /* If there are other cfq queues under this group, don't delete it */
>>> - ? ? ? if (cfqg->nr_cfqq)
>>> + ? ? ? /*
>>> + ? ? ? ?* If there are other cfq queues under this group, or there are other
>>> + ? ? ? ?* cfq groups under this group, don't delete it.
>>> + ? ? ? ?*/
>>> + ? ? ? if (cfqg->nr_cfqq || cfqg->nr_subgp)
>>> ? ? ? ? ? ? ? ?return;
>>>
>>> - ? ? ? cfq_log_cfqg(cfqd, cfqg, "del_from_rr group");
>>> - ? ? ? cfq_entity_service_tree_del(st, cfqe);
>>> - ? ? ? cfqg->saved_workload_slice = 0;
>>> - ? ? ? cfq_blkiocg_update_dequeue_stats(&cfqg->blkg, 1);
>>> + ? ? ? /*
>>> + ? ? ? ?* Dequeue this group and its ancestors from their service
>>> + ? ? ? ?* tree.
>>> + ? ? ? ?*/
>>> + ? ? ? while (cfqe) {
>>> + ? ? ? ? ? ? ? __cfqg = cfqg_of_entity(cfqe);
>>> + ? ? ? ? ? ? ? p_cfqg = cfqg_of_entity(cfqe->parent);
>>> + ? ? ? ? ? ? ? cfq_entity_service_tree_del(cfqe->service_tree, cfqe);
>>> + ? ? ? ? ? ? ? cfq_blkiocg_update_dequeue_stats(&__cfqg->blkg, 1);
>>> + ? ? ? ? ? ? ? cfq_log_cfqg(cfqd, __cfqg, "del_from_rr group");
>>> + ? ? ? ? ? ? ? __cfqg->saved_workload_slice = 0;
>>> + ? ? ? ? ? ? ? cfqe = cfqe->parent;
>>> + ? ? ? ? ? ? ? if (p_cfqg) {
>>> + ? ? ? ? ? ? ? ? ? ? ? p_cfqg->nr_subgp--;
>>> + ? ? ? ? ? ? ? ? ? ? ? if (p_cfqg->nr_cfqq || p_cfqg->nr_subgp)
>>> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? return;
>>> + ? ? ? ? ? ? ? }
>>> + ? ? ? }
>>> ?}
>>>
>>> ?static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq)
>>> @@ -997,7 +1067,6 @@ static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq)
>>> ?static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
>>> ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?struct cfq_queue *cfqq)
>>> ?{
>>> - ? ? ? struct cfq_rb_root *st = &cfqd->grp_service_tree;
>>> ? ? ? ?unsigned int used_sl, charge;
>>> ? ? ? ?int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg)
>>> ? ? ? ? ? ? ? ? ? ? ? ?- cfqg->service_tree_idle.count;
>>> @@ -1011,10 +1080,23 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
>>> ? ? ? ?else if (!cfq_cfqq_sync(cfqq) && !nr_sync)
>>> ? ? ? ? ? ? ? ?charge = cfqq->allocated_slice;
>>>
>>> - ? ? ? /* Can't update vdisktime while group is on service tree */
>>> - ? ? ? __cfq_entity_service_tree_del(st, cfqe);
>>> - ? ? ? cfqe->vdisktime += cfq_scale_slice(charge, cfqe);
>>> - ? ? ? __cfq_entity_service_tree_add(st, cfqe);
>>> + ? ? ? /*
>>> + ? ? ? ?* Update the vdisktime on the whole chain.
>>> + ? ? ? ?*/
>>> + ? ? ? while (cfqe) {
>>> + ? ? ? ? ? ? ? struct cfq_rb_root *st = cfqe->service_tree;
>>> +
>>> + ? ? ? ? ? ? ? /*
>>> + ? ? ? ? ? ? ? ?* Can't update vdisktime while group is on service
>>> + ? ? ? ? ? ? ? ?* tree.
>>> + ? ? ? ? ? ? ? ?*/
>>> + ? ? ? ? ? ? ? __cfq_entity_service_tree_del(st, cfqe);
>>> + ? ? ? ? ? ? ? cfqe->vdisktime += cfq_scale_slice(charge, cfqe);
>>> + ? ? ? ? ? ? ? __cfq_entity_service_tree_add(st, cfqe);
>>> + ? ? ? ? ? ? ? st->count++;
>>> + ? ? ? ? ? ? ? cfqe->reposition_time = jiffies;
>>> + ? ? ? ? ? ? ? cfqe = cfqe->parent;
>>> + ? ? ? }
>>>
>>> ? ? ? ?/* This group is being expired. Save the context */
>>> ? ? ? ?if (time_after(cfqd->workload_expires, jiffies)) {
>>> @@ -1026,7 +1108,8 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
>>> ? ? ? ? ? ? ? ?cfqg->saved_workload_slice = 0;
>>>
>>> ? ? ? ?cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu",
>>> - ? ? ? ? ? ? ? ? ? ?cfqe->vdisktime, st->min_vdisktime);
>>> + ? ? ? ? ? ? ? ? ? ?cfqg->cfqe.vdisktime,
>>> + ? ? ? ? ? ? ? ? ? ?cfqg->cfqe.service_tree->min_vdisktime);
>>> ? ? ? ?cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u disp=%u charge=%u iops=%u"
>>> ? ? ? ? ? ? ? ? ? ? ? ?" sect=%u", used_sl, cfqq->slice_dispatch, charge,
>>> ? ? ? ? ? ? ? ? ? ? ? ?iops_mode(cfqd), cfqq->nr_sectors);
>>> @@ -1048,35 +1131,27 @@ void cfq_update_blkio_group_weight(void *key, struct blkio_group *blkg,
>>> ? ? ? ?cfqg_of_blkg(blkg)->cfqe.weight = weight;
>>> ?}
>>>
>>> -static struct cfq_group *
>>> -cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
>>> +static void init_cfqe(struct blkio_cgroup *blkcg,
>>> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? struct cfq_group *cfqg)
>>> +{
>>> + ? ? ? struct cfq_entity *cfqe = &cfqg->cfqe;
>>> +
>>> + ? ? ? cfqe->weight = blkcg_get_weight(blkcg, cfqg->blkg.dev);
>>> + ? ? ? RB_CLEAR_NODE(&cfqe->rb_node);
>>> + ? ? ? cfqe->is_group_entity = true;
>>> + ? ? ? cfqe->parent = NULL;
>>> +}
>>> +
>>> +static void init_cfqg(struct cfq_data *cfqd, struct blkio_cgroup *blkcg,
>>> + ? ? ? ? ? ? ? ? ? ? struct cfq_group *cfqg)
>>> ?{
>>> - ? ? ? struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
>>> - ? ? ? struct cfq_group *cfqg = NULL;
>>> - ? ? ? void *key = cfqd;
>>> ? ? ? ?int i, j;
>>> ? ? ? ?struct cfq_rb_root *st;
>>> - ? ? ? struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
>>> ? ? ? ?unsigned int major, minor;
>>> -
>>> - ? ? ? cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
>>> - ? ? ? if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
>>> - ? ? ? ? ? ? ? sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
>>> - ? ? ? ? ? ? ? cfqg->blkg.dev = MKDEV(major, minor);
>>> - ? ? ? ? ? ? ? goto done;
>>> - ? ? ? }
>>> - ? ? ? if (cfqg || !create)
>>> - ? ? ? ? ? ? ? goto done;
>>> -
>>> - ? ? ? cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC, cfqd->queue->node);
>>> - ? ? ? if (!cfqg)
>>> - ? ? ? ? ? ? ? goto done;
>>> + ? ? ? struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
>>>
>>> ? ? ? ?for_each_cfqg_st(cfqg, i, j, st)
>>> ? ? ? ? ? ? ? ?*st = CFQ_RB_ROOT;
>>> - ? ? ? RB_CLEAR_NODE(&cfqg->cfqe.rb_node);
>>> -
>>> - ? ? ? cfqg->cfqe.is_group_entity = true;
>>>
>>> ? ? ? ?/*
>>> ? ? ? ? * Take the initial reference that will be released on destroy
>>> @@ -1086,24 +1161,199 @@ cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
>>> ? ? ? ? */
>>> ? ? ? ?cfqg->ref = 1;
>>>
>>> + ? ? ? /* Add group onto cgroup list */
>>> + ? ? ? sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
>>> + ? ? ? cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd,
>>> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? MKDEV(major, minor));
>>> + ? ? ? /* Initiate group entity */
>>> + ? ? ? init_cfqe(blkcg, cfqg);
>>> + ? ? ? /* Add group on cfqd list */
>>> + ? ? ? hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list);
>>> +}
>>> +
>>> +static void cfq_destroy_cfqg(struct cfq_data *cfqd, struct cfq_group *cfqg);
>>> +
>>> +static void uninit_cfqg(struct cfq_data *cfqd, struct cfq_group *cfqg)
>>> +{
>>> + ? ? ? if (!cfq_blkiocg_del_blkio_group(&cfqg->blkg))
>>> + ? ? ? ? ? ? ? cfq_destroy_cfqg(cfqd, cfqg);
>>> +}
>>> +
>>> +static void cfqg_set_parent(struct cfq_data *cfqd, struct cfq_group *cfqg,
>>> + ? ? ? ? ? ? ? ? ? ? ? ? ? struct cfq_group *p_cfqg)
>>> +{
>>> + ? ? ? struct cfq_entity *cfqe, *p_cfqe;
>>> +
>>> + ? ? ? cfqe = &cfqg->cfqe;
>>> +
>>> ? ? ? ?/*
>>> - ? ? ? ?* Add group onto cgroup list. It might happen that bdi->dev is
>>> - ? ? ? ?* not initiliazed yet. Initialize this new group without major
>>> - ? ? ? ?* and minor info and this info will be filled in once a new thread
>>> - ? ? ? ?* comes for IO. See code above.
>>> + ? ? ? ?* 1. If use_hierarchy of the CGroup where cfqg's parent stays is not
>>> + ? ? ? ?* ? ?set, we put this cfqg onto global service tree.
>>> + ? ? ? ?* 2. If cfqg is root cfqg, put it onto global service tree.
>>> ? ? ? ? */
>>> - ? ? ? if (bdi->dev) {
>>> - ? ? ? ? ? ? ? sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
>>> - ? ? ? ? ? ? ? cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd,
>>> - ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? MKDEV(major, minor));
>>> - ? ? ? } else
>>> - ? ? ? ? ? ? ? cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd,
>>> - ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? 0);
>>> + ? ? ? if (!p_cfqg) {
>>> + ? ? ? ? ? ? ? cfqe->service_tree = &cfqd->grp_service_tree;
>>> + ? ? ? ? ? ? ? cfqe->parent = NULL;
>>> + ? ? ? ? ? ? ? return;
>>> + ? ? ? }
>>>
>>> - ? ? ? cfqg->cfqe.weight = blkcg_get_weight(blkcg, cfqg->blkg.dev);
>>> + ? ? ? p_cfqe = &p_cfqg->cfqe;
>>>
>>> - ? ? ? /* Add group on cfqd list */
>>> - ? ? ? hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list);
>>> + ? ? ? cfqe->parent = p_cfqe;
>>> +
>>> + ? ? ? /*
>>> + ? ? ? ?* Currently, just put cfq group entity on "BE:SYNC" workload
>>> + ? ? ? ?* service tree.
>>> + ? ? ? ?*/
>>> + ? ? ? cfqe->service_tree = service_tree_for(p_cfqg, BE_WORKLOAD,
>>> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? SYNC_WORKLOAD);
>>> + ? ? ? /* child reference */
>>> + ? ? ? p_cfqg->ref++;
>>> +}
>>> +
>>> +static struct cfq_group *cfqg_get_parent(struct cfq_group * cfqg)
>>> +{
>>> + ? ? ? struct cfq_entity *cfqe, *p_cfqe;
>>> +
>>> + ? ? ? if (!cfqg)
>>> + ? ? ? ? ? ? ? return NULL;
>>> +
>>> + ? ? ? cfqe = &cfqg->cfqe;
>>> + ? ? ? p_cfqe = cfqe->parent;
>>> + ? ? ? if (!p_cfqe)
>>> + ? ? ? ? ? ? ? return NULL;
>>> +
>>> + ? ? ? return cfqg_of_entity(p_cfqe);
>>> +}
>>> +
>>> +static struct cfq_group *
>>> +cfqg_chain_alloc(struct cfq_data *cfqd, struct cgroup *cgroup)
>>> +{
>>> + ? ? ? struct blkio_cgroup *blkcg;
>>> + ? ? ? struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
>>> + ? ? ? unsigned int major, minor;
>>> + ? ? ? struct cfq_group *cfqg, *leaf_cfqg, *child_cfqg, *tmp_cfqg;
>>> + ? ? ? void *key = cfqd;
>>> +
>>> + ? ? ? /*
>>> + ? ? ? ?* If CGroup's use_hierarchy is unset, we just need to allocate only
>>> + ? ? ? ?* one CFQ group, and this group will put onto the "grp_service_tree".
>>> + ? ? ? ?* We don't need to check whether the cfqg exists, the caller has
>>> + ? ? ? ?* already checked it.
>>> + ? ? ? ?*/
>>> + ? ? ? blkcg = cgroup_to_blkio_cgroup(cgroup);
>>> + ? ? ? if (!blkcg_get_use_hierarchy(blkcg)) {
>>> + ? ? ? ? ? ? ? cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC,
>>> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? cfqd->queue->node);
>>> + ? ? ? ? ? ? ? if (!cfqg)
>>> + ? ? ? ? ? ? ? ? ? ? ? return NULL;
>>> +
>>> + ? ? ? ? ? ? ? init_cfqg(cfqd, blkcg, cfqg);
>>> + ? ? ? ? ? ? ? cfqg_set_parent(cfqd, cfqg, NULL);
>>> + ? ? ? ? ? ? ? return cfqg;
>>> + ? ? ? }
>>> +
>>> + ? ? ? /*
>>> + ? ? ? ?* Allocate the CFQ group chain until we meet the group we'v already
>>> + ? ? ? ?* allocated before, or to the CGroup whose use_hierarchy is not set.
>>> + ? ? ? ?*/
>>> + ? ? ? leaf_cfqg = NULL;
>>> + ? ? ? child_cfqg = NULL;
>>> + ? ? ? for (; cgroup != NULL; cgroup = cgroup->parent) {
>>> + ? ? ? ? ? ? ? blkcg = cgroup_to_blkio_cgroup(cgroup);
>>> + ? ? ? ? ? ? ? cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
>>> + ? ? ? ? ? ? ? if (cfqg) {
>>> + ? ? ? ? ? ? ? ? ? ? ? if (!cfqg->blkg.dev && bdi->dev &&
>>> + ? ? ? ? ? ? ? ? ? ? ? ? ? dev_name(bdi->dev)) {
>>> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? sscanf(dev_name(bdi->dev), "%u:%u",
>>> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?&major, &minor);
>>> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? cfqg->blkg.dev = MKDEV(major, minor);
>>> + ? ? ? ? ? ? ? ? ? ? ? }
>>> +
>>> + ? ? ? ? ? ? ? ? ? ? ? /*
>>> + ? ? ? ? ? ? ? ? ? ? ? ?* Initialization of parent doesn't finish yet, get
>>> + ? ? ? ? ? ? ? ? ? ? ? ?* it done.
>>> + ? ? ? ? ? ? ? ? ? ? ? ?*/
>>> + ? ? ? ? ? ? ? ? ? ? ? if (child_cfqg) {
>>> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? if (blkcg_get_use_hierarchy(blkcg))
>>> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? cfqg_set_parent(cfqd, child_cfqg,
>>> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? cfqg);
>>> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? else
>>> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? cfqg_set_parent(cfqd, child_cfqg,
>>> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? NULL);
>>> + ? ? ? ? ? ? ? ? ? ? ? }
>>> +
>>> + ? ? ? ? ? ? ? ? ? ? ? /* chain has already been built */
>>> + ? ? ? ? ? ? ? ? ? ? ? break;
>>> + ? ? ? ? ? ? ? }
>>> +
>>> + ? ? ? ? ? ? ? /*
>>> + ? ? ? ? ? ? ? ?* We only allocate a cfqg that the corresponding cgroup's
>>> + ? ? ? ? ? ? ? ?* use_hierarchy is set.
>>> + ? ? ? ? ? ? ? ?*/
>>> + ? ? ? ? ? ? ? if (blkcg_get_use_hierarchy(blkcg)) {
>>> + ? ? ? ? ? ? ? ? ? ? ? cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC,
>>> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? cfqd->queue->node);
>>> + ? ? ? ? ? ? ? ? ? ? ? if (!cfqg)
>>> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? goto clean_up;
>>> +
>>> + ? ? ? ? ? ? ? ? ? ? ? if (!leaf_cfqg)
>>> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? leaf_cfqg = cfqg;
>>> +
>>> + ? ? ? ? ? ? ? ? ? ? ? init_cfqg(cfqd, blkcg, cfqg);
>>> + ? ? ? ? ? ? ? } else {
>>> + ? ? ? ? ? ? ? ? ? ? ? cfqg = NULL;
>>> + ? ? ? ? ? ? ? }
>>> +
>>> + ? ? ? ? ? ? ? if (child_cfqg)
>>> + ? ? ? ? ? ? ? ? ? ? ? cfqg_set_parent(cfqd, child_cfqg, cfqg);
>>> +
>>> + ? ? ? ? ? ? ? /*
>>> + ? ? ? ? ? ? ? ?* This CGroup's use_hierarchy isn't set, this means the CFQ
>>> + ? ? ? ? ? ? ? ?* group chain has been built.
>>> + ? ? ? ? ? ? ? ?*/
>>> + ? ? ? ? ? ? ? if (!blkcg_get_use_hierarchy(blkcg))
>>> + ? ? ? ? ? ? ? ? ? ? ? break;
>>> +
>>> + ? ? ? ? ? ? ? child_cfqg = cfqg;
>>> + ? ? ? }
>>> +
>>> + ? ? ? return leaf_cfqg;
>>> +
>>> +clean_up:
>>> + ? ? ? /* clean up the allocated cfq groups. */
>>> + ? ? ? while (leaf_cfqg) {
>>> + ? ? ? ? ? ? ? tmp_cfqg = leaf_cfqg;
>>> + ? ? ? ? ? ? ? leaf_cfqg = cfqg_get_parent(leaf_cfqg);
>>> + ? ? ? ? ? ? ? uninit_cfqg(cfqd, tmp_cfqg);
>>> + ? ? ? }
>>> +
>>> + ? ? ? return NULL;
>>> +}
>>> +
>>> +static struct cfq_group *
>>> +cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
>>> +{
>>> + ? ? ? struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
>>> + ? ? ? struct cfq_group *cfqg = NULL;
>>> + ? ? ? void *key = cfqd;
>>> + ? ? ? struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
>>> + ? ? ? unsigned int major, minor;
>>> +
>>> + ? ? ? cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
>>> + ? ? ? if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
>>> + ? ? ? ? ? ? ? sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
>>> + ? ? ? ? ? ? ? cfqg->blkg.dev = MKDEV(major, minor);
>>> + ? ? ? ? ? ? ? goto done;
>>> + ? ? ? }
>>> + ? ? ? if (cfqg || !create)
>>> + ? ? ? ? ? ? ? goto done;
>>> +
>>> + ? ? ? /*
>>> + ? ? ? ?* Allocate CFQ group chain to the root group or we meet the CGroup
>>> + ? ? ? ?* with use_hierarchy disabled.
>>> + ? ? ? ?*/
>>> + ? ? ? cfqg = cfqg_chain_alloc(cfqd, cgroup);
>>>
>>> ?done:
>>> ? ? ? ?return cfqg;
>>> @@ -1148,6 +1398,7 @@ static void cfq_put_cfqg(struct cfq_group *cfqg)
>>> ?{
>>> ? ? ? ?struct cfq_rb_root *st;
>>> ? ? ? ?int i, j;
>>> + ? ? ? struct cfq_group *p_cfqg;
>>>
>>> ? ? ? ?BUG_ON(cfqg->ref <= 0);
>>> ? ? ? ?cfqg->ref--;
>>> @@ -1155,6 +1406,22 @@ static void cfq_put_cfqg(struct cfq_group *cfqg)
>>> ? ? ? ? ? ? ? ?return;
>>> ? ? ? ?for_each_cfqg_st(cfqg, i, j, st)
>>> ? ? ? ? ? ? ? ?BUG_ON(!RB_EMPTY_ROOT(&st->rb));
>>> +
>>> + ? ? ? do {
>>> + ? ? ? ? ? ? ? p_cfqg = cfqg_get_parent(cfqg);
>>> + ? ? ? ? ? ? ? kfree(cfqg);
>>> + ? ? ? ? ? ? ? cfqg = NULL;
>>> + ? ? ? ? ? ? ? /*
>>> + ? ? ? ? ? ? ? ?* Drop the reference taken by children, if nobody references
>>> + ? ? ? ? ? ? ? ?* parent group, we need delete the parent also.
>>> + ? ? ? ? ? ? ? ?*/
>>> + ? ? ? ? ? ? ? if (p_cfqg) {
>>> + ? ? ? ? ? ? ? ? ? ? ? p_cfqg->ref--;
>>> + ? ? ? ? ? ? ? ? ? ? ? if (p_cfqg->ref == 0)
>>> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? cfqg = p_cfqg;
>>> + ? ? ? ? ? ? ? }
>>> + ? ? ? } while (cfqg);
>>> +
>>> ? ? ? ?kfree(cfqg);
>>> ?}
>>>
>>> @@ -1321,9 +1588,6 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
>>> ? ? ? ? ? ? ? ? ? ? ? ? * ioprio.
>>> ? ? ? ? ? ? ? ? ? ? ? ? */
>>> ? ? ? ? ? ? ? ? ? ? ? ?pos_offset = cfq_get_boost(cfqd, cfqq);
>>> - ? ? ? ? ? ? ? ? ? ? ? /* Debug purpose, should remove. */
>>> - ? ? ? ? ? ? ? ? ? ? ? cfq_log_cfqq(cfqd, cfqq, "pos_offset: %llu",
>>> - ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?pos_offset);
>>> ? ? ? ? ? ? ? ? ? ? ? ?cfqe->vdisktime = service_tree->min_vdisktime +
>>> ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?pos_offset;
>>> ? ? ? ? ? ? ? ?} else
>>> @@ -1365,9 +1629,8 @@ insert:
>>> ? ? ? ?cfqe->service_tree = service_tree;
>>>
>>> ? ? ? ?/* Add cfqq onto service tree. */
>>> +
>>> ? ? ? ?cfq_entity_service_tree_add(service_tree, cfqe);
>>> - ? ? ? update_min_vdisktime(service_tree);
>>> - ? ? ? cfqq->reposition_time = jiffies;
>>> ? ? ? ?if ((add_front || !new_cfqq) && !group_changed)
>>> ? ? ? ? ? ? ? ?return;
>>> ? ? ? ?cfq_group_service_tree_add(cfqd, cfqq->cfqg);
>>> @@ -1810,28 +2073,43 @@ static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd)
>>> ? ? ? ?return cfqq_of_entity(cfq_rb_first(service_tree));
>>> ?}
>>>
>>> -static struct cfq_queue *cfq_get_next_queue_forced(struct cfq_data *cfqd)
>>> +struct cfq_rb_root *choose_service_tree_forced(struct cfq_group *cfqg)
>>> ?{
>>> - ? ? ? struct cfq_group *cfqg;
>>> - ? ? ? struct cfq_entity *cfqe;
>>> ? ? ? ?int i, j;
>>> ? ? ? ?struct cfq_rb_root *st;
>>>
>>> - ? ? ? if (!cfqd->rq_queued)
>>> - ? ? ? ? ? ? ? return NULL;
>>> + ? ? ? for_each_cfqg_st(cfqg, i, j, st) {
>>> + ? ? ? ? ? ? ? if (st->count != 0)
>>> + ? ? ? ? ? ? ? ? ? ? ? return st;
>>> + ? ? ? }
>>>
>>> - ? ? ? cfqg = cfq_get_next_cfqg(cfqd);
>>> - ? ? ? if (!cfqg)
>>> + ? ? ? return NULL;
>>> +}
>>> +
>>> +static struct cfq_entity *
>>> +cfq_get_next_entity_forced(struct cfq_data *cfqd)
>>> +{
>>> + ? ? ? struct cfq_entity *cfqe;
>>> + ? ? ? struct cfq_rb_root *st = &cfqd->grp_service_tree;
>>> + ? ? ? struct cfq_group *cfqg;
>>> +
>>> + ? ? ? if (!cfqd->rq_queued)
>>> ? ? ? ? ? ? ? ?return NULL;
>>>
>>> - ? ? ? for_each_cfqg_st(cfqg, i, j, st) {
>>> + ? ? ? do {
>>> ? ? ? ? ? ? ? ?cfqe = cfq_rb_first(st);
>>> - ? ? ? ? ? ? ? if (cfqe != NULL)
>>> - ? ? ? ? ? ? ? ? ? ? ? return cfqq_of_entity(cfqe);
>>> - ? ? ? }
>>> + ? ? ? ? ? ? ? if (cfqe && !cfqe->is_group_entity)
>>> + ? ? ? ? ? ? ? ? ? ? ? return cfqe;
>>> + ? ? ? ? ? ? ? else if (cfqe && cfqe->is_group_entity)
>>> + ? ? ? ? ? ? ? ? ? ? ? cfqg = cfqg_of_entity(cfqe);
>>> +
>>> + ? ? ? ? ? ? ? st = choose_service_tree_forced(cfqg);
>>> + ? ? ? } while (st);
>>> +
>>> ? ? ? ?return NULL;
>>> ?}
>>>
>>> +
>>> ?/*
>>> ?* Get and set a new active qu
>>>
>>>
>>> --
>>> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
>>> the body of a message to [email protected]
>>> More majordomo info at ?http://vger.kernel.org/majordomo-info.html
>>> Please read the FAQ at ?http://www.tux.org/lkml/
>>>
>>
>
> --
> Regards
> Gui Jianfeng
>

2011-02-18 01:15:01

by Gui, Jianfeng/归 剑峰

[permalink] [raw]
Subject: Re: [PATCH 5/6 v4] cfq-iosched: CFQ group hierarchical scheduling and use_hierarchy interface

Justin TerAvest wrote:
> On Wed, Feb 16, 2011 at 5:21 PM, Gui Jianfeng
> <[email protected]> wrote:
>> Justin TerAvest wrote:
>>> After a quick read,
>>>
>>> It's sad that we have to have so many use_hierarchy checks; it seems
>>> like we're asking for bugs, especially in the future when one codepath
>>> gets updated but not the other.
>>>
>>> CodingStyle says we should only have one declaration per line.
>>>
>>> I feel like there is an implicit assumption that groups and tasks
>>> should not be children of the same parent; that is, a group should
>>> contain only groups, or only tasks, but I don't see this enforced;
>>> there's just an assumption that BE:SYNC is "good enough" for that
>>> comparison. This smells like something that will be tweaked/tuned for
>>> fairness later. :( Why don't we just prevent this from happening?
>> Hi Justin,
>>
>> Thanks for reviewing.
>>
>> Previously, I posted the very first version that makes a group containing only
>> groups or only tasks. But I think it's more flexible to treat groups and
>> tasks at the same level. I think Vivek and Jens have the same opinion.
>> We had discussed in this thread http://lkml.org/lkml/2010/8/30/30
>
> Hi Gui,
> Thanks for pointing me at the earlier discussion, the decisions make a
> lot more sense now.
>
>>> The clean_up label in chain_alloc() is strange; I don't think the goto
>>> is necessary at all. I found that method generally hard to understand.
>>> It's doing a lot.
>> I don't understand why clean_up isn't needed.
>> When we fail to allocate a cfq group at some level, we have to clean up
>> all groups in the chain that we have already allocated.
>
> Cleaning up is necessary, but the label is only used from one place.
> Why add the goto and the label when the code below "clean_up" can just
> be moved inside the condition
> + if (!cfqg)

It's common in the kernel to put error processing at the end of a function. ;)

Thanks,
Gui