From: Vivek Goyal <vgoyal@redhat.com>
To: linux-kernel@vger.kernel.org, containers@lists.linux-foundation.org,
	dm-devel@redhat.com, jens.axboe@oracle.com, nauman@google.com,
	dpshah@google.com, ryov@valinux.co.jp, guijianfeng@cn.fujitsu.com,
	balbir@linux.vnet.ibm.com, righi.andrea@gmail.com
Cc: lizf@cn.fujitsu.com, mikew@google.com, fchecconi@gmail.com,
	paolo.valente@unimore.it, fernando@oss.ntt.co.jp, s-uchida@ap.jp.nec.com,
	taka@valinux.co.jp, jmoyer@redhat.com, dhaval@linux.vnet.ibm.com,
	m-ikeda@ds.jp.nec.com, agk@redhat.com, vgoyal@redhat.com,
	akpm@linux-foundation.org, peterz@infradead.org
Subject: [PATCH 06/24] io-controller: core bfq scheduler changes for hierarchical setup
Date: Fri, 24 Jul 2009 16:27:36 -0400
Message-Id: <1248467274-32073-7-git-send-email-vgoyal@redhat.com>
In-Reply-To: <1248467274-32073-1-git-send-email-vgoyal@redhat.com>
References: <1248467274-32073-1-git-send-email-vgoyal@redhat.com>

o Some of the core bfq scheduler changes for hierarchical groups.

Signed-off-by: Fabio Checconi <fchecconi@gmail.com>
Signed-off-by: Paolo Valente <paolo.valente@unimore.it>
Signed-off-by: Nauman Rafique <nauman@google.com>
Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
---
 block/elevator-fq.c |  169 ++++++++++++++++++++++++++++++++++++++++++++-----
 block/elevator-fq.h |    4 +
 init/Kconfig        |    8 +++
 3 files changed, 165 insertions(+), 16 deletions(-)
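
A brief note before the diff itself: with hierarchical groups, activation,
deactivation and service accounting no longer touch a single entity; they
walk from a leaf entity (a queue) up through its ancestor group entities
via the new for_each_entity() helper. The stand-alone user-space sketch
below illustrates just that traversal pattern; struct entity and its field
names are simplified stand-ins for illustration, not the kernel types:

	#include <stdio.h>
	#include <stddef.h>

	/* Simplified stand-in for struct io_entity: each entity points
	 * at the entity of its parent group; the root has no parent. */
	struct entity {
		const char *name;
		struct entity *parent;
	};

	#define for_each_entity(e) \
		for (; (e) != NULL; (e) = (e)->parent)

	int main(void)
	{
		struct entity root  = { "root group",  NULL };
		struct entity group = { "child group", &root };
		struct entity queue = { "io queue",    &group };
		struct entity *e = &queue;

		/* Charge service to the queue and to every ancestor
		 * group, mirroring what entity_served() does below. */
		for_each_entity(e)
			printf("charging %s\n", e->name);
		return 0;
	}
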
diff --git a/block/elevator-fq.c b/block/elevator-fq.c
index e302ca0..8f8fe9a 100644
--- a/block/elevator-fq.c
+++ b/block/elevator-fq.c
@@ -38,6 +38,69 @@ static struct kmem_cache *elv_ioq_pool;
  */
 #define WFQ_SERVICE_SHIFT	22
 
+#ifdef CONFIG_GROUP_IOSCHED
+#define for_each_entity(entity)	\
+	for (; entity != NULL; entity = entity->parent)
+
+#define for_each_entity_safe(entity, parent) \
+	for (; entity && ({ parent = entity->parent; 1; }); entity = parent)
+
+
+static struct io_entity *bfq_lookup_next_entity(struct io_sched_data *sd,
+						int extract);
+
+static int bfq_update_next_active(struct io_sched_data *sd)
+{
+	struct io_group *iog;
+	struct io_entity *entity, *next_active;
+
+	if (sd->active_entity != NULL)
+		/* will update/requeue at the end of service */
+		return 0;
+
+	/*
+	 * NOTE: this can be improved in many ways, such as returning
+	 * 1 (and thus propagating upwards the update) only when the
+	 * budget changes, or caching the bfqq that will be scheduled
+	 * next from this subtree.  For now we worry more about
+	 * correctness than about performance...
+	 */
+	next_active = bfq_lookup_next_entity(sd, 0);
+	sd->next_active = next_active;
+
+	if (next_active != NULL) {
+		iog = container_of(sd, struct io_group, sched_data);
+		entity = iog->my_entity;
+		if (entity != NULL)
+			entity->budget = next_active->budget;
+	}
+
+	return 1;
+}
+
+static inline void bfq_check_next_active(struct io_sched_data *sd,
+					 struct io_entity *entity)
+{
+	BUG_ON(sd->next_active != entity);
+}
+#else /* GROUP_IOSCHED */
+#define for_each_entity(entity)	\
+	for (; entity != NULL; entity = NULL)
+
+#define for_each_entity_safe(entity, parent) \
+	for (parent = NULL; entity != NULL; entity = parent)
+
+static inline int bfq_update_next_active(struct io_sched_data *sd)
+{
+	return 0;
+}
+
+static inline void bfq_check_next_active(struct io_sched_data *sd,
+					 struct io_entity *entity)
+{
+}
+#endif /* GROUP_IOSCHED */
+
 static inline int elv_prio_slice(struct elv_fq_data *efqd, int sync,
 					unsigned short prio)
 {
@@ -582,8 +645,10 @@ static struct io_entity *bfq_lookup_next_entity(struct io_sched_data *sd,
 		entity = __bfq_lookup_next_entity(st);
 		if (entity != NULL) {
 			if (extract) {
+				bfq_check_next_active(sd, entity);
 				bfq_active_remove(st, entity);
 				sd->active_entity = entity;
+				sd->next_active = NULL;
 			}
 			break;
 		}
@@ -660,11 +725,8 @@ static void __bfq_activate_entity(struct io_entity *entity, int add_front)
 	if (add_front) {
 		struct io_entity *next_entity;
 
-		/*
-		 * Determine the entity which will be dispatched next
-		 * Use sd->next_active once hierarchical patch is applied
-		 */
-		next_entity = bfq_lookup_next_entity(sd, 0);
+		/* Determine the entity which will be dispatched next */
+		next_entity = sd->next_active;
 
 		if (next_entity && next_entity != entity) {
 			struct io_service_tree *new_st;
@@ -696,7 +758,21 @@ static void __bfq_activate_entity(struct io_entity *entity, int add_front)
  */
 static void bfq_activate_entity(struct io_entity *entity, int add_front)
 {
-	__bfq_activate_entity(entity, add_front);
+	struct io_sched_data *sd;
+
+	for_each_entity(entity) {
+		__bfq_activate_entity(entity, add_front);
+
+		add_front = 0;
+		sd = entity->sched_data;
+		if (!bfq_update_next_active(sd))
+			/*
+			 * No need to propagate the activation to the
+			 * upper entities, as they will be updated when
+			 * the active entity is rescheduled.
+			 */
+			break;
+	}
 }
 
 /**
@@ -731,6 +807,8 @@ static int __bfq_deactivate_entity(struct io_entity *entity, int requeue)
 		bfq_idle_remove(st, entity);
 	else if (entity->tree != NULL)
 		BUG();
+	if (was_active || sd->next_active == entity)
+		ret = bfq_update_next_active(sd);
 
 	if (!requeue || !bfq_gt(entity->finish, st->vtime))
 		bfq_forget_entity(st, entity);
@@ -738,6 +816,7 @@ static int __bfq_deactivate_entity(struct io_entity *entity, int requeue)
 		bfq_idle_insert(st, entity);
 
 	BUG_ON(sd->active_entity == entity);
+	BUG_ON(sd->next_active == entity);
 
 	return ret;
 }
@@ -749,18 +828,62 @@ static int __bfq_deactivate_entity(struct io_entity *entity, int requeue)
  */
 static void bfq_deactivate_entity(struct io_entity *entity, int requeue)
 {
-	__bfq_deactivate_entity(entity, requeue);
+	struct io_sched_data *sd;
+	struct io_entity *parent;
+
+	for_each_entity_safe(entity, parent) {
+		sd = entity->sched_data;
+
+		if (!__bfq_deactivate_entity(entity, requeue))
+			/*
+			 * The parent entity is still backlogged, and
+			 * we don't need to update it as it is still
+			 * under service.
+			 */
+			break;
+
+		if (sd->next_active != NULL) {
+			/*
+			 * The parent entity is still backlogged and
+			 * the budgets on the path towards the root
+			 * need to be updated.
+			 */
+			goto update;
+		}
+
+		/*
+		 * If we get here, the parent is no longer backlogged
+		 * and we want to propagate the dequeue upwards.
+		 */
+		requeue = 1;
+	}
+
+	return;
+
+update:
+	entity = parent;
+	for_each_entity(entity) {
+		__bfq_activate_entity(entity, 0);
+
+		sd = entity->sched_data;
+		if (!bfq_update_next_active(sd))
+			break;
+	}
 }
 
 static void entity_served(struct io_entity *entity, unsigned long served)
 {
 	struct io_service_tree *st;
 
-	st = io_entity_service_tree(entity);
-	entity->service += served;
-	BUG_ON(st->wsum == 0);
-	st->vtime += bfq_delta(served, st->wsum);
-	bfq_forget_idle(st);
+	for_each_entity(entity) {
+		st = io_entity_service_tree(entity);
+		entity->service += served;
+		BUG_ON(st->wsum == 0);
+		st->vtime += bfq_delta(served, st->wsum);
+		bfq_forget_idle(st);
+	}
 }
 
 /**
@@ -1068,11 +1191,25 @@ static struct io_queue *elv_get_next_ioq(struct request_queue *q, int extract)
 		return NULL;
 
 	sd = &efqd->root_group->sched_data;
-	entity = bfq_lookup_next_entity(sd, 1);
-	BUG_ON(!entity);
-	if (extract)
-		entity->service = 0;
+	for (; sd != NULL; sd = entity->my_sched_data) {
+		entity = bfq_lookup_next_entity(sd, 1);
+		/*
+		 * entity can be NULL, despite the fact that there are
+		 * busy queues, if all the busy queues are under a group
+		 * which is currently under service.  So if we are just
+		 * looking for the next ioq while something is being
+		 * served, a NULL entity is not an error.
+		 */
+		BUG_ON(!entity && extract);
+
+		if (extract)
+			entity->service = 0;
+
+		if (!entity)
+			return NULL;
+	}
+
 	ioq = io_entity_to_ioq(entity);
 
 	return ioq;
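
Before the header changes, a short aside on the design choice above: each
struct io_sched_data now caches next_active, the entity that would be
dispatched next from that level, so __bfq_activate_entity() can read it
instead of re-running the service-tree lookup, and bfq_activate_entity()
stops walking upward as soon as bfq_update_next_active() reports that a
level's cached choice cannot have changed. Below is a minimal user-space
sketch of that cut-off with invented names (struct level,
update_next_active); it is not the kernel implementation:

	#include <stdbool.h>
	#include <stddef.h>

	struct level {
		struct level *parent;
		bool in_service;  /* an entity here is being served */
	};

	/* Stand-in for bfq_update_next_active(): a level under service
	 * will recompute its choice at requeue time anyway, so return 0
	 * and let the caller stop propagating the update. */
	static int update_next_active(struct level *l)
	{
		if (l->in_service)
			return 0;
		/* ... recompute and cache the next entity here ... */
		return 1;
	}

	static void activate(struct level *l)
	{
		for (; l != NULL; l = l->parent)
			if (!update_next_active(l))
				break;	/* upper levels unaffected */
	}

	int main(void)
	{
		struct level root  = { NULL, true };	/* busy serving */
		struct level child = { &root, false };

		/* Updates the child level, then stops at the busy root;
		 * levels above the root would never be visited. */
		activate(&child);
		return 0;
	}
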
diff --git a/block/elevator-fq.h b/block/elevator-fq.h
index d870360..ed65a87 100644
--- a/block/elevator-fq.h
+++ b/block/elevator-fq.h
@@ -72,6 +72,7 @@ struct io_service_tree {
  */
 struct io_sched_data {
 	struct io_entity *active_entity;
+	struct io_entity *next_active;
 	struct io_service_tree service_tree[IO_IOPRIO_CLASSES];
 };
 
@@ -178,7 +179,10 @@ struct io_queue {
 };
 
 struct io_group {
+	struct io_entity entity;
 	struct io_sched_data sched_data;
+	struct io_entity *my_entity;
+
 	/*
 	 * async queue for each priority case for RT and BE class.
 	 * Used only for cfq.
diff --git a/init/Kconfig b/init/Kconfig
index cb2c092..fa3edd6 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -612,6 +612,14 @@ config CGROUP_MEM_RES_CTLR_SWAP
 	  Now, memory usage of swap_cgroup is 2 bytes per entry. If swap page
 	  size is 4096bytes, 512k per 1Gbytes of swap.
 
+config GROUP_IOSCHED
+	bool "Group IO Scheduler"
+	depends on CGROUPS && ELV_FAIR_QUEUING
+	default n
+	---help---
+	  This feature lets the IO scheduler recognize task groups and
+	  control disk bandwidth allocation to such task groups.
+
 endif # CGROUPS
 
 config MM_OWNER
-- 
1.6.0.6
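
A closing note on the data-structure change: struct io_group now embeds
its own io_entity, and my_entity points at that embedded entity for child
groups (bfq_update_next_active() checks it for NULL, which covers the root
group). The owning group is recovered from a struct io_sched_data pointer
with container_of(). The user-space sketch below mirrors that embedding;
the field layout and values are illustrative only. The hierarchical paths
themselves are compiled only when the new GROUP_IOSCHED option is enabled,
which requires both CGROUPS and ELV_FAIR_QUEUING and defaults to n.

	#include <stddef.h>
	#include <stdio.h>

	#define container_of(ptr, type, member) \
		((type *)((char *)(ptr) - offsetof(type, member)))

	struct io_entity { int budget; };
	struct io_sched_data { int unused; };

	/* Simplified mirror of struct io_group from elevator-fq.h: the
	 * group embeds its own entity and keeps a pointer to it. */
	struct io_group {
		struct io_entity entity;
		struct io_sched_data sched_data;
		struct io_entity *my_entity;	/* NULL for root group */
	};

	int main(void)
	{
		struct io_group g = { .entity = { .budget = 100 } };
		g.my_entity = &g.entity;

		/* Recover the group from its sched_data, the way
		 * bfq_update_next_active() does. */
		struct io_sched_data *sd = &g.sched_data;
		struct io_group *iog =
			container_of(sd, struct io_group, sched_data);

		printf("budget = %d\n", iog->my_entity->budget);
		return 0;
	}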