From: bsegall@google.com
To: Peter Zijlstra
Cc: tglx@linutronix.de, mingo@kernel.org, linux-kernel@vger.kernel.org,
    Paul Turner, Roman Gushchin
Subject: Re: [PATCH 2/3] sched: Cleanup bandwidth timers
References: <20150415094155.601987867@infradead.org>
    <20150415095011.804589208@infradead.org>
Date: Thu, 16 Apr 2015 13:03:33 -0700
In-Reply-To: <20150415095011.804589208@infradead.org> (Peter Zijlstra's message of "Wed, 15 Apr 2015 11:41:57 +0200")

Peter Zijlstra writes:

> Roman reported a 3 cpu lockup scenario involving __start_cfs_bandwidth().
>
> The more I look at that code the more I'm convinced its crack, that
> entire __start_cfs_bandwidth() thing is brain melting, we don't need to
> cancel a timer before starting it, *hrtimer_start*() will happily remove
> the timer for you if its still enqueued.
>
> Removing that, removes a big part of the problem, no more ugly cancel
> loop to get stuck in.
>
> So now, if I understand things right, the entire reason you have this
> cfs_b->lock guarded ->timer_active nonsense is to make sure we don't
> accidentally loose the timer.
>
> It appears to me that it should be possible to guarantee that same by
> unconditionally (re)starting the timer when !queued. Because regardless
> what hrtimer::function will return, if we beat it to (re)enqueue the
> timer, it doesn't matter.
>
> Now, because hrtimers don't come with any serialization guarantees we
> must ensure both handler and (re)start loop serialize their access to
> the hrtimer to avoid both trying to forward the timer at the same
> time.
>
> Update the rt bandwidth timer to match.
>
> This effectively reverts: 09dc4ab03936 ("sched/fair: Fix
> tg_set_cfs_bandwidth() deadlock on rq->lock").
>
> Cc: Thomas Gleixner
> Cc: Paul Turner
> Cc: Ben Segall
> Reported-by: Roman Gushchin

Reviewed-by: Ben Segall

So much nicer. Docs/comment issue only: s/loose/lose/
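FWIW, the hrtimer property relied on above (a second hrtimer_start() on a
still-enqueued timer removes the pending instance and re-queues it with the
new expiry, so no cancel loop is needed) is easy to see in isolation. Below
is a minimal, illustration-only module sketch; the demo_* names are made up
for the example, only the hrtimer/ktime calls are existing API:

#include <linux/module.h>
#include <linux/hrtimer.h>
#include <linux/ktime.h>
#include <linux/printk.h>

static struct hrtimer demo_timer;

static enum hrtimer_restart demo_fn(struct hrtimer *timer)
{
        pr_info("demo timer fired\n");
        return HRTIMER_NORESTART;
}

static int __init demo_init(void)
{
        hrtimer_init(&demo_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
        demo_timer.function = demo_fn;

        /* queue the timer a full second out ... */
        hrtimer_start(&demo_timer, ms_to_ktime(1000), HRTIMER_MODE_REL);

        /*
         * ... then start it again while it is still enqueued:
         * hrtimer_start() removes the pending instance and re-queues it
         * with the new expiry, no hrtimer_cancel()/try_to_cancel() loop
         * is required first.
         */
        hrtimer_start(&demo_timer, ms_to_ktime(10), HRTIMER_MODE_REL);

        return 0;
}

static void __exit demo_exit(void)
{
        hrtimer_cancel(&demo_timer);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");

Loading that should print a single "demo timer fired" roughly 10ms after
init, even though the timer was first queued a second out. With that in
place the cancel loop really is redundant; the only thing left to protect
is the forwarding of the expiry, which the patch does by keeping both the
handler and the (re)start path under cfs_b->lock.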
> Signed-off-by: Peter Zijlstra (Intel)
> ---
>  kernel/sched/core.c  | 15 ++++++------
>  kernel/sched/fair.c  | 59 ++++++++++++---------------------------------------
>  kernel/sched/rt.c    | 14 +++++-------
>  kernel/sched/sched.h |  4 +--
>  4 files changed, 31 insertions(+), 61 deletions(-)
>
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -92,10 +92,13 @@
>
>  void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
>  {
> -        if (hrtimer_active(period_timer))
> -                return;
> +        /*
> +         * Do not forward the expiration time of active timers;
> +         * we do not want to loose an overrun.
> +         */
> +        if (!hrtimer_active(period_timer))
> +                hrtimer_forward_now(period_timer, period);
>
> -        hrtimer_forward_now(period_timer, period);
>          hrtimer_start_expires(period_timer, HRTIMER_MODE_ABS_PINNED);
>  }
>
> @@ -8098,10 +8101,8 @@ static int tg_set_cfs_bandwidth(struct t
>
>          __refill_cfs_bandwidth_runtime(cfs_b);
>          /* restart the period timer (if active) to handle new period expiry */
> -        if (runtime_enabled && cfs_b->timer_active) {
> -                /* force a reprogram */
> -                __start_cfs_bandwidth(cfs_b, true);
> -        }
> +        if (runtime_enabled)
> +                start_cfs_bandwidth(cfs_b);
>          raw_spin_unlock_irq(&cfs_b->lock);
>
>          for_each_online_cpu(i) {
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -3476,16 +3476,7 @@ static int assign_cfs_rq_runtime(struct
>          if (cfs_b->quota == RUNTIME_INF)
>                  amount = min_amount;
>          else {
> -                /*
> -                 * If the bandwidth pool has become inactive, then at least one
> -                 * period must have elapsed since the last consumption.
> -                 * Refresh the global state and ensure bandwidth timer becomes
> -                 * active.
> -                 */
> -                if (!cfs_b->timer_active) {
> -                        __refill_cfs_bandwidth_runtime(cfs_b);
> -                        __start_cfs_bandwidth(cfs_b, false);
> -                }
> +                start_cfs_bandwidth(cfs_b);
>
>                  if (cfs_b->runtime > 0) {
>                          amount = min(cfs_b->runtime, min_amount);
> @@ -3634,6 +3625,7 @@ static void throttle_cfs_rq(struct cfs_r
>          struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
>          struct sched_entity *se;
>          long task_delta, dequeue = 1;
> +        bool empty;
>
>          se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
>
> @@ -3663,13 +3655,21 @@ static void throttle_cfs_rq(struct cfs_r
>          cfs_rq->throttled = 1;
>          cfs_rq->throttled_clock = rq_clock(rq);
>          raw_spin_lock(&cfs_b->lock);
> +        empty = list_empty(&cfs_rq->throttled_list);
> +
>          /*
>           * Add to the _head_ of the list, so that an already-started
>           * distribute_cfs_runtime will not see us
>           */
>          list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
> -        if (!cfs_b->timer_active)
> -                __start_cfs_bandwidth(cfs_b, false);
> +
> +        /*
> +         * If we're the first throttled task, make sure the bandwidth
> +         * timer is running.
> +         */
> +        if (empty)
> +                start_cfs_bandwidth(cfs_b);
> +
>          raw_spin_unlock(&cfs_b->lock);
>  }
>
> @@ -3784,13 +3784,6 @@ static int do_sched_cfs_period_timer(str
>          if (cfs_b->idle && !throttled)
>                  goto out_deactivate;
>
> -        /*
> -         * if we have relooped after returning idle once, we need to update our
> -         * status as actually running, so that other cpus doing
> -         * __start_cfs_bandwidth will stop trying to cancel us.
> -         */
> -        cfs_b->timer_active = 1;
> -
>          __refill_cfs_bandwidth_runtime(cfs_b);
>
>          if (!throttled) {
> @@ -3835,7 +3828,6 @@ static int do_sched_cfs_period_timer(str
>          return 0;
>
>  out_deactivate:
> -        cfs_b->timer_active = 0;
>          return 1;
>  }
>
> @@ -3999,6 +3991,7 @@ static enum hrtimer_restart sched_cfs_sl
>  {
>          struct cfs_bandwidth *cfs_b =
>                  container_of(timer, struct cfs_bandwidth, slack_timer);
> +
>          do_sched_cfs_slack_timer(cfs_b);
>
>          return HRTIMER_NORESTART;
> @@ -4008,15 +4001,12 @@ static enum hrtimer_restart sched_cfs_pe
>  {
>          struct cfs_bandwidth *cfs_b =
>                  container_of(timer, struct cfs_bandwidth, period_timer);
> -        ktime_t now;
>          int overrun;
>          int idle = 0;
>
>          raw_spin_lock(&cfs_b->lock);
>          for (;;) {
> -                now = hrtimer_cb_get_time(timer);
> -                overrun = hrtimer_forward(timer, now, cfs_b->period);
> -
> +                overrun = hrtimer_forward_now(timer, cfs_b->period);
>                  if (!overrun)
>                          break;
>
> @@ -4047,27 +4037,8 @@ static void init_cfs_rq_runtime(struct c
>          INIT_LIST_HEAD(&cfs_rq->throttled_list);
>  }
>
> -/* requires cfs_b->lock, may release to reprogram timer */
> -void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b, bool force)
> +void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
>  {
> -        /*
> -         * The timer may be active because we're trying to set a new bandwidth
> -         * period or because we're racing with the tear-down path
> -         * (timer_active==0 becomes visible before the hrtimer call-back
> -         * terminates). In either case we ensure that it's re-programmed
> -         */
> -        while (unlikely(hrtimer_active(&cfs_b->period_timer)) &&
> -               hrtimer_try_to_cancel(&cfs_b->period_timer) < 0) {
> -                /* bounce the lock to allow do_sched_cfs_period_timer to run */
> -                raw_spin_unlock(&cfs_b->lock);
> -                cpu_relax();
> -                raw_spin_lock(&cfs_b->lock);
> -                /* if someone else restarted the timer then we're done */
> -                if (!force && cfs_b->timer_active)
> -                        return;
> -        }
> -
> -        cfs_b->timer_active = 1;
>          start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period);
>  }
>
> --- a/kernel/sched/rt.c
> +++ b/kernel/sched/rt.c
> @@ -18,19 +18,20 @@ static enum hrtimer_restart sched_rt_per
>  {
>          struct rt_bandwidth *rt_b =
>                  container_of(timer, struct rt_bandwidth, rt_period_timer);
> -        ktime_t now;
> -        int overrun;
>          int idle = 0;
> +        int overrun;
>
> +        raw_spin_lock(&rt_b->rt_runtime_lock);
>          for (;;) {
> -                now = hrtimer_cb_get_time(timer);
> -                overrun = hrtimer_forward(timer, now, rt_b->rt_period);
> -
> +                overrun = hrtimer_forward_now(timer, rt_b->rt_period);
>                  if (!overrun)
>                          break;
>
> +                raw_spin_unlock(&rt_b->rt_runtime_lock);
>                  idle = do_sched_rt_period_timer(rt_b, overrun);
> +                raw_spin_lock(&rt_b->rt_runtime_lock);
>          }
> +        raw_spin_unlock(&rt_b->rt_runtime_lock);
>
>          return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
>  }
> @@ -52,9 +53,6 @@ static void start_rt_bandwidth(struct rt
>          if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
>                  return;
>
> -        if (hrtimer_active(&rt_b->rt_period_timer))
> -                return;
> -
>          raw_spin_lock(&rt_b->rt_runtime_lock);
>          start_bandwidth_timer(&rt_b->rt_period_timer, rt_b->rt_period);
>          raw_spin_unlock(&rt_b->rt_runtime_lock);
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -215,7 +215,7 @@ struct cfs_bandwidth {
>          s64 hierarchical_quota;
>          u64 runtime_expires;
>
> -        int idle, timer_active;
> +        int idle;
>          struct hrtimer period_timer, slack_timer;
>          struct list_head throttled_cfs_rq;
>
> @@ -306,7 +306,7 @@ extern void init_cfs_bandwidth(struct cf
>  extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
>
>  extern void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b);
> -extern void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b, bool force);
> +extern void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b);
>  extern void unthrottle_cfs_rq(struct cfs_rq *cfs_rq);
>
>  extern void free_rt_sched_group(struct task_group *tg);
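As a side note, the tg_set_cfs_bandwidth() hunk above is reachable from
userspace through the cpu cgroup files, so the restart path can be
exercised without any kernel changes. A rough, illustration-only sketch of
a stress loop; the cgroup-v1 mount point and the "test" group name are
assumptions, adjust them for the local setup:

#include <stdio.h>
#include <stdlib.h>

/*
 * Repeatedly rewrite the CFS bandwidth settings of an existing cgroup;
 * each write lands in tg_set_cfs_bandwidth() and, when quota is enabled,
 * in start_cfs_bandwidth().
 */
static void write_val(const char *path, long val)
{
        FILE *f = fopen(path, "w");

        if (!f) {
                perror(path);
                exit(1);
        }
        fprintf(f, "%ld\n", val);
        fclose(f);
}

int main(void)
{
        int i;

        for (i = 0; i < 100000; i++) {
                write_val("/sys/fs/cgroup/cpu/test/cpu.cfs_period_us", 100000);
                /* alternate between enabling and disabling the quota */
                write_val("/sys/fs/cgroup/cpu/test/cpu.cfs_quota_us",
                          (i & 1) ? 50000 : -1);
        }
        return 0;
}

Run alongside a CPU hog in the same group, this keeps the period timer
being restarted while it is also firing, which is exactly the handler vs.
(re)start interleaving that cfs_b->lock now serializes.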