Return-Path:
Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand
	id S1753357AbZI3Mxo (ORCPT ); Wed, 30 Sep 2009 08:53:44 -0400
Received: (majordomo@vger.kernel.org) by vger.kernel.org
	id S1752282AbZI3Mxn (ORCPT ); Wed, 30 Sep 2009 08:53:43 -0400
Received: from e36.co.us.ibm.com ([32.97.110.154]:39194 "EHLO e36.co.us.ibm.com"
	rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP
	id S1751120AbZI3Mxm (ORCPT ); Wed, 30 Sep 2009 08:53:42 -0400
Date: Wed, 30 Sep 2009 18:22:04 +0530
From: Bharata B Rao
To: linux-kernel@vger.kernel.org
Cc: Dhaval Giani, Balbir Singh, Vaidyanathan Srinivasan, Gautham R Shenoy,
	Srivatsa Vaddagiri, Ingo Molnar, Peter Zijlstra, Pavel Emelyanov,
	Herbert Poetzl, Avi Kivity, Chris Friesen, Paul Menage, Mike Waychison
Subject: [RFC v2 PATCH 3/8] sched: Bandwidth initialization for fair task groups
Message-ID: <20090930125204.GD19951@in.ibm.com>
Reply-To: bharata@linux.vnet.ibm.com
References: <20090930124919.GA19951@in.ibm.com>
MIME-Version: 1.0
Content-Type: text/plain; charset=us-ascii
Content-Disposition: inline
In-Reply-To: <20090930124919.GA19951@in.ibm.com>
User-Agent: Mutt/1.5.18 (2008-05-17)
Sender: linux-kernel-owner@vger.kernel.org
List-ID:
X-Mailing-List: linux-kernel@vger.kernel.org
Content-Length: 11442
Lines: 448

sched: Bandwidth initialization for fair task groups.

From: Bharata B Rao

Introduce the notion of hard limiting for CFS groups by bringing in
the concept of runtime and period for them. Add cgroup files to control
runtime and period.

Signed-off-by: Bharata B Rao
---
 init/Kconfig   |   13 ++
 kernel/sched.c |  317 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 330 insertions(+), 0 deletions(-)

diff --git a/init/Kconfig b/init/Kconfig
index 3f7e609..e93282f 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -492,6 +492,19 @@ config CGROUP_SCHED
 
 endchoice
 
+config CFS_HARD_LIMITS
+	bool "Hard Limits for CFS Group Scheduler"
+	depends on EXPERIMENTAL
+	depends on FAIR_GROUP_SCHED && CGROUP_SCHED
+	default n
+	help
+	  This option enables hard limiting of CPU time obtained by
+	  a fair task group. Use this if you want to throttle a group of tasks
+	  based on its CPU usage. For more details refer to
+	  Documentation/scheduler/sched-cfs-hard-limits.txt
+
+	  Say N if unsure.
+
 menuconfig CGROUPS
 	boolean "Control Group support"
 	help
diff --git a/kernel/sched.c b/kernel/sched.c
index c283d0f..0147f6f 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -262,6 +262,15 @@ static DEFINE_MUTEX(sched_domains_mutex);
 
 #include <linux/cgroup.h>
 
+#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_CFS_HARD_LIMITS)
+struct cfs_bandwidth {
+	spinlock_t	cfs_runtime_lock;
+	ktime_t		cfs_period;
+	u64		cfs_runtime;
+	struct hrtimer	cfs_period_timer;
+};
+#endif
+
 struct cfs_rq;
 
 static LIST_HEAD(task_groups);
@@ -282,6 +291,11 @@ struct task_group {
 	/* runqueue "owned" by this group on each cpu */
 	struct cfs_rq **cfs_rq;
 	unsigned long shares;
+#ifdef CONFIG_CFS_HARD_LIMITS
+	struct cfs_bandwidth cfs_bandwidth;
+	/* If set, throttle when the group exceeds its bandwidth */
+	int hard_limit_enabled;
+#endif
 #endif
 
 #ifdef CONFIG_RT_GROUP_SCHED
@@ -477,6 +491,16 @@ struct cfs_rq {
 	unsigned long rq_weight;
 #endif
 #endif
+#ifdef CONFIG_CFS_HARD_LIMITS
+	/* set when the group is throttled on this cpu */
+	int cfs_throttled;
+
+	/* runtime currently consumed by the group on this rq */
+	u64 cfs_time;
+
+	/* runtime available to the group on this rq */
+	u64 cfs_runtime;
+#endif
 	/*
 	 * Number of tasks at this heirarchy.
 	 */
@@ -665,6 +689,11 @@ struct rq {
 	/* BKL stats */
 	unsigned int bkl_count;
 #endif
+	/*
+	 * Protects the cfs runtime related fields of all cfs_rqs under
+	 * this rq
+	 */
+	spinlock_t runtime_lock;
 };
 
 static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
@@ -1759,6 +1788,150 @@ static inline const struct cpumask *sched_bw_period_mask(void)
 
 #endif
 
+#ifdef CONFIG_FAIR_GROUP_SCHED
+#ifdef CONFIG_CFS_HARD_LIMITS
+
+/*
+ * Runtime allowed for a cfs group before it is hard limited.
+ * default: Infinite which means no hard limiting.
+ */
+u64 sched_cfs_runtime = RUNTIME_INF;
+
+/*
+ * period over which we hard limit the cfs group's bandwidth.
+ * default: 0.5s
+ */
+u64 sched_cfs_period = 500000;
+
+static inline u64 global_cfs_period(void)
+{
+	return sched_cfs_period * NSEC_PER_USEC;
+}
+
+static inline u64 global_cfs_runtime(void)
+{
+	return RUNTIME_INF;
+}
+
+static inline int cfs_bandwidth_enabled(struct task_group *tg)
+{
+	return tg->hard_limit_enabled;
+}
+
+static inline void rq_runtime_lock(struct rq *rq)
+{
+	spin_lock(&rq->runtime_lock);
+}
+
+static inline void rq_runtime_unlock(struct rq *rq)
+{
+	spin_unlock(&rq->runtime_lock);
+}
+
+/*
+ * Refresh the runtimes of the throttled groups.
+ * But nothing much to do now, will populate this in later patches.
+ */
+static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
+{
+	struct cfs_bandwidth *cfs_b =
+		container_of(timer, struct cfs_bandwidth, cfs_period_timer);
+
+	hrtimer_add_expires_ns(timer, ktime_to_ns(cfs_b->cfs_period));
+	return HRTIMER_RESTART;
+}
+
+/*
+ * TODO: Check if this kind of timer setup is sufficient for cfs or
+ * should we do what rt is doing.
+ */
+static void start_cfs_bandwidth(struct task_group *tg)
+{
+	struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
+
+	/*
+	 * Timer isn't setup for groups with infinite runtime or for groups
+	 * for which hard limiting isn't enabled.
+	 */
+	if (!cfs_bandwidth_enabled(tg) || (cfs_b->cfs_runtime == RUNTIME_INF))
+		return;
+
+	if (hrtimer_active(&cfs_b->cfs_period_timer))
+		return;
+
+	hrtimer_start_range_ns(&cfs_b->cfs_period_timer, cfs_b->cfs_period,
+			0, HRTIMER_MODE_REL);
+}
+
+static void init_cfs_bandwidth(struct task_group *tg)
+{
+	struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
+
+	cfs_b->cfs_period = ns_to_ktime(global_cfs_period());
+	cfs_b->cfs_runtime = global_cfs_runtime();
+
+	spin_lock_init(&cfs_b->cfs_runtime_lock);
+
+	hrtimer_init(&cfs_b->cfs_period_timer,
+			CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	cfs_b->cfs_period_timer.function = &sched_cfs_period_timer;
+}
+
+static inline void destroy_cfs_bandwidth(struct task_group *tg)
+{
+	hrtimer_cancel(&tg->cfs_bandwidth.cfs_period_timer);
+}
+
+static void init_cfs_hard_limits(struct cfs_rq *cfs_rq, struct task_group *tg)
+{
+	cfs_rq->cfs_time = 0;
+	cfs_rq->cfs_throttled = 0;
+	cfs_rq->cfs_runtime = tg->cfs_bandwidth.cfs_runtime;
+	tg->hard_limit_enabled = 0;
+}
+
+#else /* !CONFIG_CFS_HARD_LIMITS */
+
+static void init_cfs_bandwidth(struct task_group *tg)
+{
+	return;
+}
+
+static inline void destroy_cfs_bandwidth(struct task_group *tg)
+{
+	return;
+}
+
+static void init_cfs_hard_limits(struct cfs_rq *cfs_rq, struct task_group *tg)
+{
+	return;
+}
+
+static inline void rq_runtime_lock(struct rq *rq)
+{
+	return;
+}
+
+static inline void rq_runtime_unlock(struct rq *rq)
+{
+	return;
+}
+
+#endif /* CONFIG_CFS_HARD_LIMITS */
+#else /* !CONFIG_FAIR_GROUP_SCHED */
+
+static inline void rq_runtime_lock(struct rq *rq)
+{
+	return;
+}
+
+static inline void rq_runtime_unlock(struct rq *rq)
+{
+	return;
+}
+
+#endif /* CONFIG_FAIR_GROUP_SCHED */
+
 #include "sched_stats.h"
 #include "sched_idletask.c"
 #include "sched_fair.c"
@@ -9146,6 +9319,7 @@ static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
 	struct rq *rq = cpu_rq(cpu);
 	tg->cfs_rq[cpu] = cfs_rq;
 	init_cfs_rq(cfs_rq, rq);
+	init_cfs_hard_limits(cfs_rq, tg);
 	cfs_rq->tg = tg;
 	if (add)
 		list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
@@ -9275,6 +9449,10 @@ void __init sched_init(void)
 #endif /* CONFIG_USER_SCHED */
 #endif /* CONFIG_RT_GROUP_SCHED */
 
+#ifdef CONFIG_FAIR_GROUP_SCHED
+	init_cfs_bandwidth(&init_task_group);
+#endif
+
 #ifdef CONFIG_GROUP_SCHED
 	list_add(&init_task_group.list, &task_groups);
 	INIT_LIST_HEAD(&init_task_group.children);
@@ -9291,6 +9469,7 @@ void __init sched_init(void)
 
 		rq = cpu_rq(i);
 		spin_lock_init(&rq->lock);
+		spin_lock_init(&rq->runtime_lock);
 		rq->nr_running = 0;
 		rq->calc_load_active = 0;
 		rq->calc_load_update = jiffies + LOAD_FREQ;
@@ -9564,6 +9743,7 @@ static void free_fair_sched_group(struct task_group *tg)
 {
 	int i;
 
+	destroy_cfs_bandwidth(tg);
 	for_each_possible_cpu(i) {
 		if (tg->cfs_rq)
 			kfree(tg->cfs_rq[i]);
@@ -9590,6 +9770,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
 	if (!tg->se)
 		goto err;
 
+	init_cfs_bandwidth(tg);
 	tg->shares = NICE_0_LOAD;
 
 	for_each_possible_cpu(i) {
@@ -10284,6 +10465,125 @@ static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)
 
 	return (u64) tg->shares;
 }
+
+#ifdef CONFIG_CFS_HARD_LIMITS
+
+static int tg_set_cfs_bandwidth(struct task_group *tg,
+		u64 cfs_period, u64 cfs_runtime)
+{
+	int i, err = 0;
+
+	spin_lock_irq(&tg->cfs_bandwidth.cfs_runtime_lock);
+	tg->cfs_bandwidth.cfs_period = ns_to_ktime(cfs_period);
+	tg->cfs_bandwidth.cfs_runtime = cfs_runtime;
+
+	for_each_possible_cpu(i) {
+		struct cfs_rq *cfs_rq = tg->cfs_rq[i];
+
+		rq_runtime_lock(rq_of(cfs_rq));
+		cfs_rq->cfs_runtime = cfs_runtime;
+		rq_runtime_unlock(rq_of(cfs_rq));
+	}
+
+	start_cfs_bandwidth(tg);
+	spin_unlock_irq(&tg->cfs_bandwidth.cfs_runtime_lock);
+	return err;
+}
+
+int tg_set_cfs_runtime(struct task_group *tg, long cfs_runtime_us)
+{
+	u64 cfs_runtime, cfs_period;
+
+	cfs_period = ktime_to_ns(tg->cfs_bandwidth.cfs_period);
+	cfs_runtime = (u64)cfs_runtime_us * NSEC_PER_USEC;
+	if (cfs_runtime_us < 0)
+		cfs_runtime = RUNTIME_INF;
+
+	return tg_set_cfs_bandwidth(tg, cfs_period, cfs_runtime);
+}
+
+long tg_get_cfs_runtime(struct task_group *tg)
+{
+	u64 cfs_runtime_us;
+
+	if (tg->cfs_bandwidth.cfs_runtime == RUNTIME_INF)
+		return -1;
+
+	cfs_runtime_us = tg->cfs_bandwidth.cfs_runtime;
+	do_div(cfs_runtime_us, NSEC_PER_USEC);
+	return cfs_runtime_us;
+}
+
+int tg_set_cfs_period(struct task_group *tg, long cfs_period_us)
+{
+	u64 cfs_runtime, cfs_period;
+
+	cfs_period = (u64)cfs_period_us * NSEC_PER_USEC;
+	cfs_runtime = tg->cfs_bandwidth.cfs_runtime;
+
+	if (cfs_period == 0)
+		return -EINVAL;
+
+	return tg_set_cfs_bandwidth(tg, cfs_period, cfs_runtime);
+}
+
+long tg_get_cfs_period(struct task_group *tg)
+{
+	u64 cfs_period_us;
+
+	cfs_period_us = ktime_to_ns(tg->cfs_bandwidth.cfs_period);
+	do_div(cfs_period_us, NSEC_PER_USEC);
+	return cfs_period_us;
+}
+
+int tg_set_hard_limit_enabled(struct task_group *tg, u64 val)
+{
+	spin_lock_irq(&tg->cfs_bandwidth.cfs_runtime_lock);
+	if (val > 0) {
+		tg->hard_limit_enabled = 1;
+		start_cfs_bandwidth(tg);
+	} else {
+		destroy_cfs_bandwidth(tg);
+		tg->hard_limit_enabled = 0;
+	}
+	spin_unlock_irq(&tg->cfs_bandwidth.cfs_runtime_lock);
+	return 0;
+}
+
+static s64 cpu_cfs_runtime_read_s64(struct cgroup *cgrp, struct cftype *cft)
+{
+	return tg_get_cfs_runtime(cgroup_tg(cgrp));
+}
+
+static int cpu_cfs_runtime_write_s64(struct cgroup *cgrp, struct cftype *cftype,
+		s64 cfs_runtime_us)
+{
+	return tg_set_cfs_runtime(cgroup_tg(cgrp), cfs_runtime_us);
+}
+
+static u64 cpu_cfs_period_read_u64(struct cgroup *cgrp, struct cftype *cft)
+{
+	return tg_get_cfs_period(cgroup_tg(cgrp));
+}
+
+static int cpu_cfs_period_write_u64(struct cgroup *cgrp, struct cftype *cftype,
+		u64 cfs_period_us)
+{
+	return tg_set_cfs_period(cgroup_tg(cgrp), cfs_period_us);
+}
+
+static u64 cpu_cfs_hard_limit_read_u64(struct cgroup *cgrp, struct cftype *cft)
+{
+	return cfs_bandwidth_enabled(cgroup_tg(cgrp));
+}
+
+static int cpu_cfs_hard_limit_write_u64(struct cgroup *cgrp,
+		struct cftype *cftype, u64 val)
+{
+	return tg_set_hard_limit_enabled(cgroup_tg(cgrp), val);
+}
+
+#endif /* CONFIG_CFS_HARD_LIMITS */
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 
 #ifdef CONFIG_RT_GROUP_SCHED
@@ -10317,6 +10617,23 @@ static struct cftype cpu_files[] = {
 		.read_u64 = cpu_shares_read_u64,
 		.write_u64 = cpu_shares_write_u64,
 	},
+#ifdef CONFIG_CFS_HARD_LIMITS
+	{
+		.name = "cfs_runtime_us",
+		.read_s64 = cpu_cfs_runtime_read_s64,
+		.write_s64 = cpu_cfs_runtime_write_s64,
+	},
+	{
+		.name = "cfs_period_us",
+		.read_u64 = cpu_cfs_period_read_u64,
+		.write_u64 = cpu_cfs_period_write_u64,
+	},
+	{
+		.name = "cfs_hard_limit",
+		.read_u64 = cpu_cfs_hard_limit_read_u64,
+		.write_u64 = cpu_cfs_hard_limit_write_u64,
+	},
+#endif /* CONFIG_CFS_HARD_LIMITS */
 #endif
 #ifdef CONFIG_RT_GROUP_SCHED
 	{
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/
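
[Editorial illustration, not part of the patch or the original posting.]
A minimal userspace sketch of how the control files added by this patch
might be exercised once CONFIG_CFS_HARD_LIMITS is enabled. The /cgroup
mount point, the grp1 group, the "cpu." prefix on the file names and the
250ms/500ms values are assumptions made for the example; adjust them to
the actual cgroup hierarchy on the test system.

/* limit grp1 to 250ms of CPU time every 500ms period (values in usecs) */
#include <stdio.h>
#include <stdlib.h>

static int write_val(const char *path, long long val)
{
	FILE *f = fopen(path, "w");

	if (!f) {
		perror(path);
		return -1;
	}
	fprintf(f, "%lld\n", val);
	return fclose(f);
}

int main(void)
{
	/* hypothetical paths; both period and runtime are in microseconds */
	if (write_val("/cgroup/grp1/cpu.cfs_period_us", 500000))
		return EXIT_FAILURE;
	if (write_val("/cgroup/grp1/cpu.cfs_runtime_us", 250000))
		return EXIT_FAILURE;
	/* a non-zero write turns hard limiting on for the group */
	if (write_val("/cgroup/grp1/cpu.cfs_hard_limit", 1))
		return EXIT_FAILURE;
	return EXIT_SUCCESS;
}

Per the setter functions in the patch, writing a negative value to
cfs_runtime_us maps to RUNTIME_INF (read back as -1), i.e. the group is
not throttled, and a zero cfs_period_us is rejected with -EINVAL.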