Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1751728AbdCZReG (ORCPT ); Sun, 26 Mar 2017 13:34:06 -0400 Received: from mail-yw0-f177.google.com ([209.85.161.177]:32982 "EHLO mail-yw0-f177.google.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1751483AbdCZRdB (ORCPT ); Sun, 26 Mar 2017 13:33:01 -0400 MIME-Version: 1.0 In-Reply-To: <1490327582-4376-3-git-send-email-luca.abeni@santannapisa.it> References: <1490327582-4376-1-git-send-email-luca.abeni@santannapisa.it> <1490327582-4376-3-git-send-email-luca.abeni@santannapisa.it> From: Mathieu Poirier Date: Sun, 26 Mar 2017 11:32:59 -0600 Message-ID: Subject: Re: [RFC v5 2/9] sched/deadline: improve the tracking of active utilization To: luca abeni Cc: "linux-kernel@vger.kernel.org" , Peter Zijlstra , Ingo Molnar , Juri Lelli , Claudio Scordino , Steven Rostedt , Tommaso Cucinotta , Daniel Bristot de Oliveira , Joel Fernandes Content-Type: text/plain; charset=UTF-8 Sender: linux-kernel-owner@vger.kernel.org List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Content-Length: 15965 Lines: 405 On 23 March 2017 at 21:52, luca abeni wrote: > From: Luca Abeni > > This patch implements a more theoretically sound algorithm for > tracking active utilization: instead of decreasing it when a > task blocks, use a timer (the "inactive timer", named after the > "Inactive" task state of the GRUB algorithm) to decrease the > active utilization at the so called "0-lag time". 
> > Signed-off-by: Luca Abeni > Tested-by: Claudio Scordino > Tested-by: Daniel Bristot de Oliveira > --- > include/linux/sched.h | 17 ++++ > kernel/sched/core.c | 3 + > kernel/sched/deadline.c | 208 ++++++++++++++++++++++++++++++++++++++++++++---- > kernel/sched/sched.h | 2 + > 4 files changed, 215 insertions(+), 15 deletions(-) > > diff --git a/include/linux/sched.h b/include/linux/sched.h > index d67eee8..952cac8 100644 > --- a/include/linux/sched.h > +++ b/include/linux/sched.h > @@ -445,16 +445,33 @@ struct sched_dl_entity { > * > * @dl_yielded tells if task gave up the CPU before consuming > * all its available runtime during the last job. > + * > + * @dl_non_contending tells if task is inactive while still > + * contributing to the active utilization. In other words, it > + * indicates if the inactive timer has been armed and its handler > + * has not been executed yet. This flag is useful to avoid race > + * conditions between the inactive timer handler and the wakeup > + * code. > */ > int dl_throttled; > int dl_boosted; > int dl_yielded; > + int dl_non_contending; > > /* > * Bandwidth enforcement timer. Each -deadline task has its > * own bandwidth to be enforced, thus we need one timer per task. > */ > struct hrtimer dl_timer; > + > + /* > + * Inactive timer, responsible for decreasing the active utilization > + * at the "0-lag time". When a -deadline task blocks, it contributes > + * to GRUB's active utilization until the "0-lag time", hence a > + * timer is needed to decrease the active utilization at the correct > + * time. 
> + */ > + struct hrtimer inactive_timer; > }; > > union rcu_special { > diff --git a/kernel/sched/core.c b/kernel/sched/core.c > index 6d6cad9..bf0b0b9 100644 > --- a/kernel/sched/core.c > +++ b/kernel/sched/core.c > @@ -2165,6 +2165,7 @@ void __dl_clear_params(struct task_struct *p) > > dl_se->dl_throttled = 0; > dl_se->dl_yielded = 0; > + dl_se->dl_non_contending = 0; > } > > /* > @@ -2196,6 +2197,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) > > RB_CLEAR_NODE(&p->dl.rb_node); > init_dl_task_timer(&p->dl); > + init_inactive_task_timer(&p->dl); > __dl_clear_params(p); > > INIT_LIST_HEAD(&p->rt.run_list); > @@ -2518,6 +2520,7 @@ static int dl_overflow(struct task_struct *p, int policy, > !__dl_overflow(dl_b, cpus, p->dl.dl_bw, new_bw)) { > __dl_clear(dl_b, p->dl.dl_bw); > __dl_add(dl_b, new_bw); > + dl_change_utilization(p, new_bw); > err = 0; > } else if (!dl_policy(policy) && task_has_dl_policy(p)) { > __dl_clear(dl_b, p->dl.dl_bw); > diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c > index cef9adb..86aed82 100644 > --- a/kernel/sched/deadline.c > +++ b/kernel/sched/deadline.c > @@ -65,6 +65,107 @@ void sub_running_bw(u64 dl_bw, struct dl_rq *dl_rq) > dl_rq->running_bw = 0; > } > > +void dl_change_utilization(struct task_struct *p, u64 new_bw) > +{ > + if (!task_on_rq_queued(p)) { > + struct rq *rq = task_rq(p); > + > + if (p->dl.dl_non_contending) { > + sub_running_bw(p->dl.dl_bw, &rq->dl); > + p->dl.dl_non_contending = 0; > + /* > + * If the timer handler is currently running and the > + * timer cannot be cancelled, inactive_task_timer() > + * will see that dl_non_contending is not set, and > + * will not touch the rq's active utilization, > + * so we are still safe. 
> + */ > + if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1) > + put_task_struct(p); > + } > + } > +} > + > +static void task_non_contending(struct task_struct *p) > +{ > + struct sched_dl_entity *dl_se = &p->dl; > + struct hrtimer *timer = &dl_se->inactive_timer; > + struct dl_rq *dl_rq = dl_rq_of_se(dl_se); > + struct rq *rq = rq_of_dl_rq(dl_rq); > + s64 zerolag_time; > + > + /* > + * If this is a non-deadline task that has been boosted, > + * do nothing > + */ > + if (dl_se->dl_runtime == 0) > + return; > + > + WARN_ON(hrtimer_active(&dl_se->inactive_timer)); > + WARN_ON(dl_se->dl_non_contending); > + > + zerolag_time = dl_se->deadline - > + div64_long((dl_se->runtime * dl_se->dl_period), > + dl_se->dl_runtime); > + > + /* > + * Using relative times instead of the absolute "0-lag time" > + * allows to simplify the code > + */ > + zerolag_time -= rq_clock(rq); > + > + /* > + * If the "0-lag time" already passed, decrease the active > + * utilization now, instead of starting a timer > + */ > + if (zerolag_time < 0) { > + if (dl_task(p)) > + sub_running_bw(dl_se->dl_bw, dl_rq); > + if (!dl_task(p) || p->state == TASK_DEAD) > + __dl_clear_params(p); > + > + return; > + } > + > + dl_se->dl_non_contending = 1; > + get_task_struct(p); > + hrtimer_start(timer, ns_to_ktime(zerolag_time), HRTIMER_MODE_REL); > +} > + > +static void task_contending(struct sched_dl_entity *dl_se) > +{ > + struct dl_rq *dl_rq = dl_rq_of_se(dl_se); > + > + /* > + * If this is a non-deadline task that has been boosted, > + * do nothing > + */ > + if (dl_se->dl_runtime == 0) > + return; > + > + if (dl_se->dl_non_contending) { > + /* > + * If the timer handler is currently running and the > + * timer cannot be cancelled, inactive_task_timer() > + * will see that dl_non_contending is not set, and > + * will not touch the rq's active utilization, > + * so we are still safe. 
> + */ > + if (hrtimer_try_to_cancel(&dl_se->inactive_timer) == 1) > + put_task_struct(dl_task_of(dl_se)); > + dl_se->dl_non_contending = 0; > + } else { > + /* > + * Since "dl_non_contending" is not set, the > + * task's utilization has already been removed from > + * active utilization (either when the task blocked, > + * or when the "inactive timer" fired). > + * So, add it back. > + */ > + add_running_bw(dl_se->dl_bw, dl_rq); > + } > +} > + > static inline int is_leftmost(struct task_struct *p, struct dl_rq *dl_rq) > { > struct sched_dl_entity *dl_se = &p->dl; > @@ -615,10 +716,8 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) > * The task might have changed its scheduling policy to something > * different than SCHED_DEADLINE (through switched_from_dl()). > */ > - if (!dl_task(p)) { > - __dl_clear_params(p); > + if (!dl_task(p)) > goto unlock; > - } > > /* > * The task might have been boosted by someone else and might be in the > @@ -837,6 +936,49 @@ static void update_curr_dl(struct rq *rq) > } > } > > +static enum hrtimer_restart inactive_task_timer(struct hrtimer *timer) > +{ > + struct sched_dl_entity *dl_se = container_of(timer, > + struct sched_dl_entity, > + inactive_timer); > + struct task_struct *p = dl_task_of(dl_se); > + struct rq_flags rf; > + struct rq *rq; > + > + rq = task_rq_lock(p, &rf); > + > + if (!dl_task(p) || p->state == TASK_DEAD) { > + if (p->state == TASK_DEAD && dl_se->dl_non_contending) { > + sub_running_bw(p->dl.dl_bw, dl_rq_of_se(&p->dl)); > + dl_se->dl_non_contending = 0; > + } > + __dl_clear_params(p); > + > + goto unlock; > + } > + if (dl_se->dl_non_contending == 0) > + goto unlock; > + > + sched_clock_tick(); > + update_rq_clock(rq); > + > + sub_running_bw(dl_se->dl_bw, &rq->dl); > + dl_se->dl_non_contending = 0; > +unlock: > + task_rq_unlock(rq, p, &rf); > + put_task_struct(p); > + > + return HRTIMER_NORESTART; > +} > + > +void init_inactive_task_timer(struct sched_dl_entity *dl_se) To be consistent with the 
other DL related functions: s/init_inactive_task_timer(...)/init_dl_inactive_task_timer(...) > +{ > + struct hrtimer *timer = &dl_se->inactive_timer; > + > + hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); > + timer->function = inactive_task_timer; > +} > + > #ifdef CONFIG_SMP > > static void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline) > @@ -969,9 +1111,7 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se, > * we want a replenishment of its runtime. > */ > if (flags & ENQUEUE_WAKEUP) { > - struct dl_rq *dl_rq = dl_rq_of_se(dl_se); > - > - add_running_bw(dl_se->dl_bw, dl_rq); > + task_contending(dl_se); > update_dl_entity(dl_se, pi_se); > } > else if (flags & ENQUEUE_REPLENISH) > @@ -1040,7 +1180,9 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) > * add_running_bw(). > */ > if (p->dl.dl_throttled && !(flags & ENQUEUE_REPLENISH)) { > - add_running_bw(p->dl.dl_bw, &rq->dl); > + if (flags & ENQUEUE_WAKEUP) > + task_contending(&p->dl); > + > return; > } > > @@ -1065,7 +1207,8 @@ static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags) > sub_running_bw(p->dl.dl_bw, &rq->dl); > > /* > - * This check allows to decrease the active utilization in two cases: > + * This check allows to start the inactive timer (or to immediately > + * decrease the active utilization, if needed) in two cases: > * when the task blocks and when it is terminating > * (p->state == TASK_DEAD). 
We can handle the two cases in the same > way, because from GRUB's point of view the same thing is happening > @@ -1073,7 +1216,7 @@ static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags) > * or "inactive") > */ > if (flags & DEQUEUE_SLEEP) > - sub_running_bw(p->dl.dl_bw, &rq->dl); > + task_non_contending(p); > } > > /* > @@ -1151,6 +1294,28 @@ select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags) > return cpu; > } > > +static void migrate_task_rq_dl(struct task_struct *p) > +{ > + if ((p->state == TASK_WAKING) && (p->dl.dl_non_contending)) { > + struct rq *rq = task_rq(p); > + > + raw_spin_lock(&rq->lock); > + sub_running_bw(p->dl.dl_bw, &rq->dl); > + p->dl.dl_non_contending = 0; > + /* > + * If the timer handler is currently running and the > + * timer cannot be cancelled, inactive_task_timer() > + * will see that dl_non_contending is not set, and > + * will not touch the rq's active utilization, > + * so we are still safe. > + */ > + if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1) > + put_task_struct(p); > + > + raw_spin_unlock(&rq->lock); > + } > +} > + > static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p) > { > /* > @@ -1792,13 +1957,23 @@ void __init init_sched_dl_class(void) > static void switched_from_dl(struct rq *rq, struct task_struct *p) > { > /* > - * Start the deadline timer; if we switch back to dl before this we'll > - * continue consuming our current CBS slice. If we stay outside of > - * SCHED_DEADLINE until the deadline passes, the timer will reset the > - * task. > + * task_non_contending() can start the "inactive timer" (if the 0-lag > + * time is in the future). If the task switches back to dl before > + * the "inactive timer" fires, it can continue to consume its current > + * runtime using its current deadline. If it stays outside of > + * SCHED_DEADLINE until the 0-lag time passes, inactive_task_timer() > + * will reset the task parameters. 
> */ > - if (!start_dl_timer(p)) > - __dl_clear_params(p); > + if (task_on_rq_queued(p) && p->dl.dl_runtime) > + task_non_contending(p); > + > + /* > + * We cannot use inactive_task_timer() to invoke sub_running_bw() > + * at the 0-lag time, because the task could have been migrated > + * while SCHED_OTHER in the meanwhile. > + */ > + if (p->dl.dl_non_contending) > + p->dl.dl_non_contending = 0; > > /* > * Since this might be the only -deadline task on the rq, > @@ -1817,6 +1992,8 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p) > */ > static void switched_to_dl(struct rq *rq, struct task_struct *p) > { > + if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1) > + put_task_struct(p); > > /* If p is not queued we will update its parameters at next wakeup. */ > if (!task_on_rq_queued(p)) > @@ -1891,6 +2068,7 @@ const struct sched_class dl_sched_class = { > > #ifdef CONFIG_SMP > .select_task_rq = select_task_rq_dl, > + .migrate_task_rq = migrate_task_rq_dl, > .set_cpus_allowed = set_cpus_allowed_dl, > .rq_online = rq_online_dl, > .rq_offline = rq_offline_dl, > diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h > index caaa7d3..57bb79b 100644 > --- a/kernel/sched/sched.h > +++ b/kernel/sched/sched.h > @@ -244,6 +244,7 @@ bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw) > dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw; > } > > +void dl_change_utilization(struct task_struct *p, u64 new_bw); > extern void init_dl_bw(struct dl_bw *dl_b); > > #ifdef CONFIG_CGROUP_SCHED > @@ -1490,6 +1491,7 @@ extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime > extern struct dl_bandwidth def_dl_bandwidth; > extern void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime); > extern void init_dl_task_timer(struct sched_dl_entity *dl_se); > +extern void init_inactive_task_timer(struct sched_dl_entity *dl_se); > > unsigned long to_ratio(u64 period, u64 runtime); > > -- > 2.7.4 >