Subject: Re: [RFC PATCH v1 1/4] sched: introduce primitives to account for CFS bandwidth tracking
From: Paul Turner
To: bharata@linux.vnet.ibm.com
Cc: linux-kernel@vger.kernel.org, Paul Menage, Srivatsa Vaddagiri, Peter Zijlstra,
    Gautham R Shenoy, Dhaval Giani, Balbir Singh, Herbert Poetzl, Chris Friesen,
    Avi Kivity, Nikhil Rao, Ingo Molnar, Kamalesh Babulal, Mike Waychison,
    Vaidyanathan Srinivasan, Pavel Emelyanov
Date: Thu, 25 Feb 2010 02:30:44 -0800

On Thu, Feb 25, 2010 at 12:14 AM, Bharata B Rao wrote:
> On Fri, Feb 12, 2010 at 06:54:57PM -0800, Paul wrote:
>> --- a/kernel/sched.c
>> +++ b/kernel/sched.c
>> @@ -190,10 +190,28 @@ static inline int rt_bandwidth_enabled(void)
>>      return sysctl_sched_rt_runtime >= 0;
>>  }
>>
>> -static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
>> +static void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
>>  {
>> -    ktime_t now;
>> +    unsigned long delta;
>> +    ktime_t soft, hard, now;
>> +
>> +    for (;;) {
>> +            if (hrtimer_active(period_timer))
>> +                    break;
>> +
>> +            now = hrtimer_cb_get_time(period_timer);
>> +            hrtimer_forward(period_timer, now, period);
>> +
>> +            soft = hrtimer_get_softexpires(period_timer);
>> +            hard = hrtimer_get_expires(period_timer);
>> +            delta = ktime_to_ns(ktime_sub(hard, soft));
>> +            __hrtimer_start_range_ns(period_timer, soft, delta,
>> +                                     HRTIMER_MODE_ABS_PINNED, 0);
>> +    }
>> +}
>>
>> +static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
>> +{
>>      if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
>>              return;
>>
>> @@ -201,22 +219,7 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
>>              return;
>>
>>      raw_spin_lock(&rt_b->rt_runtime_lock);
>> -    for (;;) {
>> -            unsigned long delta;
>> -            ktime_t soft, hard;
>> -
>> -            if (hrtimer_active(&rt_b->rt_period_timer))
>> -                    break;
>> -
>> -            now = hrtimer_cb_get_time(&rt_b->rt_period_timer);
>> -            hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period);
>> -
>> -            soft = hrtimer_get_softexpires(&rt_b->rt_period_timer);
>> -            hard = hrtimer_get_expires(&rt_b->rt_period_timer);
>> -            delta = ktime_to_ns(ktime_sub(hard, soft));
>> -            __hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta,
>> -                            HRTIMER_MODE_ABS_PINNED, 0);
>> -    }
>> +    start_bandwidth_timer(&rt_b->rt_period_timer, rt_b->rt_period);
>>      raw_spin_unlock(&rt_b->rt_runtime_lock);
>>  }
>>
>> @@ -241,6 +244,15 @@ struct cfs_rq;
>>
>>  static LIST_HEAD(task_groups);
>>
>> +#ifdef CONFIG_CFS_BANDWIDTH
>> +struct cfs_bandwidth {
>> +    raw_spinlock_t          lock;
>> +    ktime_t                 period;
>> +    u64                     runtime, quota;
>> +    struct hrtimer          period_timer;
>> +};
>> +#endif
>> +
>>  /* task group related information */
>>  struct task_group {
>>  #ifdef CONFIG_CGROUP_SCHED
>> @@ -272,6 +284,10 @@ struct task_group {
>>      struct task_group *parent;
>>      struct list_head siblings;
>>      struct list_head children;
>> +
>> +#ifdef CONFIG_CFS_BANDWIDTH
>> +    struct cfs_bandwidth cfs_bandwidth;
>> +#endif
>>  };
>>
>>  #ifdef CONFIG_USER_SCHED
>> @@ -445,9 +461,76 @@ struct cfs_rq {
>>       */
>>      unsigned long rq_weight;
>>  #endif
>> +#ifdef CONFIG_CFS_BANDWIDTH
>> +    u64 quota_assigned, quota_used;
>> +#endif
>>  #endif
>>  };
>>
>> +#ifdef CONFIG_CFS_BANDWIDTH
>> +static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun);
>> +
>> +static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
>> +{
>> +    struct cfs_bandwidth *cfs_b =
>> +            container_of(timer, struct cfs_bandwidth, period_timer);
>> +    ktime_t now;
>> +    int overrun;
>> +    int idle = 0;
>> +
>> +    for (;;) {
>> +            now = hrtimer_cb_get_time(timer);
>> +            overrun = hrtimer_forward(timer, now, cfs_b->period);
>> +
>> +            if (!overrun)
>> +                    break;
>> +
>> +            idle = do_sched_cfs_period_timer(cfs_b, overrun);
>> +    }
>> +
>> +    return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
>> +}
>> +
>> +static
>> +void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b, u64 quota, u64 period)
>> +{
>> +    raw_spin_lock_init(&cfs_b->lock);
>> +    cfs_b->quota = cfs_b->runtime = quota;
>> +    cfs_b->period = ns_to_ktime(period);
>> +
>> +    hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
>> +    cfs_b->period_timer.function = sched_cfs_period_timer;
>> +}
>> +
>> +static
>> +void init_cfs_rq_quota(struct cfs_rq *cfs_rq)
>> +{
>> +    cfs_rq->quota_used = 0;
>> +    if (cfs_rq->tg->cfs_bandwidth.quota == RUNTIME_INF)
>> +            cfs_rq->quota_assigned = RUNTIME_INF;
>> +    else
>> +            cfs_rq->quota_assigned = 0;
>> +}
>> +
>> +static void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
>> +{
>> +    if (cfs_b->quota == RUNTIME_INF)
>> +            return;
>> +
>> +    if (hrtimer_active(&cfs_b->period_timer))
>> +            return;
>> +
>> +    raw_spin_lock(&cfs_b->lock);
>> +    start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period);
>> +    raw_spin_unlock(&cfs_b->lock);
>> +}
>> +
>> +static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
>> +{
>> +    hrtimer_cancel(&cfs_b->period_timer);
>> +}
>> +#endif
>
> May be you could define some of this functions for !CONFIG_CFS_BANDWIDTH case
> and avoid them calling under #ifdef ? I was given this comment during my
> initial iterations.
>

Was it for init or run-time functions?  We try to maintain the empty
def style for most of our run-time functions (e.g.
cfs_throttled); for init it seems more descriptive (and in keeping with
the rest of sched init) to #ifdef the specific initialization.
Regardless, I will definitely give this another pass to see what I can
clean up.

>> +
>>  /* Real-Time classes' related field in a runqueue: */
>>  struct rt_rq {
>>      struct rt_prio_array active;
>> @@ -1834,6 +1917,14 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
>>  #endif
>>  }
>>
>> +#ifdef CONFIG_CFS_BANDWIDTH
>> +/*
>> + * default period for cfs group bandwidth.
>> + * default: 0.5s
>> + */
>> +static u64 sched_cfs_bandwidth_period = 500000000ULL;
>> +#endif
>> +
>>  #include "sched_stats.h"
>>  #include "sched_idletask.c"
>>  #include "sched_fair.c"
>> @@ -9422,6 +9513,9 @@ static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
>>      tg->cfs_rq[cpu] = cfs_rq;
>>      init_cfs_rq(cfs_rq, rq);
>>      cfs_rq->tg = tg;
>> +#ifdef CONFIG_CFS_BANDWIDTH
>> +    init_cfs_rq_quota(cfs_rq);
>> +#endif
>>      if (add)
>>              list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
>>
>> @@ -9594,6 +9688,10 @@ void __init sched_init(void)
>>               * We achieve this by letting init_task_group's tasks sit
>>               * directly in rq->cfs (i.e init_task_group->se[] = NULL).
>>               */
>> +#ifdef CONFIG_CFS_BANDWIDTH
>> +            init_cfs_bandwidth(&init_task_group.cfs_bandwidth,
>> +                            RUNTIME_INF, sched_cfs_bandwidth_period);
>> +#endif
>>              init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL);
>>  #elif defined CONFIG_USER_SCHED
>>              root_task_group.shares = NICE_0_LOAD;
>> @@ -9851,6 +9949,10 @@ static void free_fair_sched_group(struct task_group *tg)
>>  {
>>      int i;
>>
>> +#ifdef CONFIG_CFS_BANDWIDTH
>> +    destroy_cfs_bandwidth(&tg->cfs_bandwidth);
>> +#endif
>> +
>>      for_each_possible_cpu(i) {
>>              if (tg->cfs_rq)
>>                      kfree(tg->cfs_rq[i]);
>> @@ -9878,7 +9980,10 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
>>              goto err;
>>
>>      tg->shares = NICE_0_LOAD;
>> -
>> +#ifdef CONFIG_CFS_BANDWIDTH
>> +    init_cfs_bandwidth(&tg->cfs_bandwidth, RUNTIME_INF,
>> +                    sched_cfs_bandwidth_period);
>> +#endif
>>      for_each_possible_cpu(i) {
>>              rq = cpu_rq(i);
>>
>> @@ -10333,7 +10438,7 @@ static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
>>      return walk_tg_tree(tg_schedulable, tg_nop, &data);
>>  }
>>
>> -static int tg_set_bandwidth(struct task_group *tg,
>> +static int tg_set_rt_bandwidth(struct task_group *tg,
>>              u64 rt_period, u64 rt_runtime)
>>  {
>>      int i, err = 0;
>> @@ -10372,7 +10477,7 @@ int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
>>      if (rt_runtime_us < 0)
>>              rt_runtime = RUNTIME_INF;
>>
>> -    return tg_set_bandwidth(tg, rt_period, rt_runtime);
>> +    return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
>>  }
>>
>>  long sched_group_rt_runtime(struct task_group *tg)
>> @@ -10397,7 +10502,7 @@ int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
>>      if (rt_period == 0)
>>              return -EINVAL;
>>
>> -    return tg_set_bandwidth(tg, rt_period, rt_runtime);
>> +    return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
>>  }
>>
>>  long sched_group_rt_period(struct task_group *tg)
>> @@ -10604,6 +10709,120 @@ static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)
>>
>>      return (u64) tg->shares;
>>  }
>> +
>> +#ifdef CONFIG_CFS_BANDWIDTH
>> +static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
>> +{
>> +    int i;
>> +    static DEFINE_MUTEX(mutex);
>> +
>> +    if (tg == &init_task_group)
>> +            return -EINVAL;
>> +
>> +    if (!period)
>> +            return -EINVAL;
>> +
>> +    mutex_lock(&mutex);
>
> What is this mutex for ? So you essentially serializing the bandwidth
> setting of all groups ? While that iself isn't an issue, just wondering if
> cfs_bandwidth.lock isn't suffient ?
>

The idea isn't to synchronize quota updates across all groups, but to
synchronize the updates within a single group.  Consider the case of two
parallel writes, one setting infinite bandwidth and the other setting
finite bandwidth.  Depending on rq->lock contention it's possible for
both values to propagate to some of the cpus, leaving the group in a
mixed state.

We can't just sit on the bandwidth lock for this, because then there is
an inversion with update_curr, e.g.

  cpu1: rq->lock held, update_curr -> request bandwidth -> acquire
        cfs_bandwidth.lock
  cpu2: tg_set_cfs_bandwidth, cfs_bandwidth.lock held -> try to acquire
        cpu1's rq->lock

This mutex could be per-cgroup, but users shouldn't be updating quota
frequently enough to need that; a shared mutex also reduces rq->lock
slamming when users update several cgroups in parallel.  (A small
userspace sketch of the two lock orderings follows at the end of this
mail.)

>> +    /*
>> +     * Ensure we have at least one tick of bandwidth every period.  This is
>> +     * to prevent reaching a state of large arrears when throttled via
>> +     * entity_tick() resulting in prolonged exit starvation.
>> +     */
>> +    if (NS_TO_JIFFIES(quota) < 1)
>> +            return -EINVAL;
>
> Return with mutex held ?

Whoops, that check should be above the mutex_lock(); fixed, thanks!

>
>> +
>> +    raw_spin_lock_irq(&tg->cfs_bandwidth.lock);
>> +    tg->cfs_bandwidth.period = ns_to_ktime(period);
>> +    tg->cfs_bandwidth.runtime = tg->cfs_bandwidth.quota = quota;
>> +    raw_spin_unlock_irq(&tg->cfs_bandwidth.lock);
>> +
>> +    for_each_possible_cpu(i) {
>> +            struct cfs_rq *cfs_rq = tg->cfs_rq[i];
>> +            struct rq *rq = rq_of(cfs_rq);
>> +
>> +            raw_spin_lock_irq(&rq->lock);
>> +            cfs_rq->quota_used = 0;
>> +            if (quota == RUNTIME_INF)
>> +                    cfs_rq->quota_assigned = RUNTIME_INF;
>> +            else
>> +                    cfs_rq->quota_assigned = 0;
>> +            raw_spin_unlock_irq(&rq->lock);
>> +    }
>> +    mutex_unlock(&mutex);
>> +
>> +    return 0;
>> +}
>> +
>>  static void task_waking_fair(struct rq *rq, struct task_struct *p)
>> @@ -1172,7 +1180,7 @@ static void task_waking_fair(struct rq *rq, struct task_struct *p)
>>   * We still saw a performance dip, some tracing learned us that between
>>   * cgroup:/ and cgroup:/foo balancing the number of affine wakeups increased
>>   * significantly. Therefore try to bias the error in direction of failing
>> - * the affine wakeup.
>> + * the affie wakeup.
>
> Unintended change ?
>

Most definitely, fixed!

> Regards,
> Bharata.
>

Thanks for the comments!
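P.S. In case the ordering argument above is easier to follow in code, here is a
minimal userspace sketch (plain C + pthreads, builds with gcc -pthread).  It is
only an illustration, not kernel code: fake_bandwidth, fake_rq, set_quota,
tick, set_mutex and NCPU are made-up stand-ins for the kernel structures.  The
writer path drops the group lock before it walks the per-cpu runqueues, the
tick path takes its runqueue lock first and the group lock second, and the
static mutex serializes whole writer operations so two concurrent writes can't
interleave their per-cpu propagation.

    /* Illustration only -- NOT kernel code. */
    #include <pthread.h>
    #include <stdint.h>
    #include <stdio.h>

    #define NCPU      4
    #define QUOTA_INF UINT64_MAX          /* stands in for RUNTIME_INF */

    struct fake_bandwidth {
            pthread_mutex_t lock;         /* "cfs_bandwidth.lock" */
            uint64_t quota;
    };

    struct fake_rq {
            pthread_mutex_t lock;         /* "rq->lock" */
            uint64_t quota_assigned;
    };

    static pthread_mutex_t set_mutex = PTHREAD_MUTEX_INITIALIZER;
    static struct fake_bandwidth bw = { PTHREAD_MUTEX_INITIALIZER, QUOTA_INF };
    static struct fake_rq rq[NCPU];

    /* Writer: serialize whole updates; never hold bw.lock across an rq lock. */
    static void set_quota(uint64_t quota)
    {
            int i;

            pthread_mutex_lock(&set_mutex);   /* one writer at a time */

            pthread_mutex_lock(&bw.lock);     /* update the group-wide value... */
            bw.quota = quota;
            pthread_mutex_unlock(&bw.lock);   /* ...and drop it before touching any
                                                 rq lock; keeping it held here is
                                                 exactly the ABBA case above */

            for (i = 0; i < NCPU; i++) {      /* propagate per cpu under rq lock */
                    pthread_mutex_lock(&rq[i].lock);
                    rq[i].quota_assigned = (quota == QUOTA_INF) ? QUOTA_INF : 0;
                    pthread_mutex_unlock(&rq[i].lock);
            }

            pthread_mutex_unlock(&set_mutex);
    }

    /* "Tick": rq lock first, group lock second -- like update_curr. */
    static void tick(int cpu)
    {
            pthread_mutex_lock(&rq[cpu].lock);
            if (rq[cpu].quota_assigned != QUOTA_INF) {
                    pthread_mutex_lock(&bw.lock);
                    /* ...hand a slice of bw.quota to this cpu here... */
                    pthread_mutex_unlock(&bw.lock);
            }
            pthread_mutex_unlock(&rq[cpu].lock);
    }

    int main(void)
    {
            int i;

            for (i = 0; i < NCPU; i++) {
                    pthread_mutex_init(&rq[i].lock, NULL);
                    rq[i].quota_assigned = QUOTA_INF;
            }

            set_quota(250000000ULL);          /* a finite quota, 250ms in ns */
            for (i = 0; i < NCPU; i++)
                    tick(i);
            set_quota(QUOTA_INF);             /* back to unconstrained */

            printf("cpu0 quota_assigned = %llu\n",
                   (unsigned long long)rq[0].quota_assigned);
            return 0;
    }

Run as-is it is single-threaded and only demonstrates the structure; the
comment in set_quota() marks where the deadlock would appear if the group lock
were kept held across the per-cpu loop.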