Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1755601AbdCWTxD (ORCPT ); Thu, 23 Mar 2017 15:53:03 -0400 Received: from mail.santannapisa.it ([193.205.80.99]:26882 "EHLO mail.santannapisa.it" rhost-flags-OK-FAIL-OK-OK) by vger.kernel.org with ESMTP id S1753821AbdCWTxB (ORCPT ); Thu, 23 Mar 2017 15:53:01 -0400 From: luca abeni To: linux-kernel@vger.kernel.org Cc: Peter Zijlstra , Ingo Molnar , Juri Lelli , Claudio Scordino , Steven Rostedt , Tommaso Cucinotta , Daniel Bristot de Oliveira , Joel Fernandes , Mathieu Poirier , Luca Abeni Subject: [RFC v5 2/9] sched/deadline: improve the tracking of active utilization Date: Fri, 24 Mar 2017 04:52:55 +0100 Message-Id: <1490327582-4376-3-git-send-email-luca.abeni@santannapisa.it> X-Mailer: git-send-email 2.7.4 In-Reply-To: <1490327582-4376-1-git-send-email-luca.abeni@santannapisa.it> References: <1490327582-4376-1-git-send-email-luca.abeni@santannapisa.it> Sender: linux-kernel-owner@vger.kernel.org List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Content-Length: 12607 Lines: 397 From: Luca Abeni This patch implements a more theoretically sound algorithm for tracking active utilization: instead of decreasing it when a task blocks, use a timer (the "inactive timer", named after the "Inactive" task state of the GRUB algorithm) to decrease the active utilization at the so called "0-lag time". Signed-off-by: Luca Abeni Tested-by: Claudio Scordino Tested-by: Daniel Bristot de Oliveira --- include/linux/sched.h | 17 ++++ kernel/sched/core.c | 3 + kernel/sched/deadline.c | 208 ++++++++++++++++++++++++++++++++++++++++++++---- kernel/sched/sched.h | 2 + 4 files changed, 215 insertions(+), 15 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index d67eee8..952cac8 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -445,16 +445,33 @@ struct sched_dl_entity { * * @dl_yielded tells if task gave up the CPU before consuming * all its available runtime during the last job. + * + * @dl_non_contending tells if task is inactive while still + * contributing to the active utilization. In other words, it + * indicates if the inactive timer has been armed and its handler + * has not been executed yet. This flag is useful to avoid race + * conditions between the inactive timer handler and the wakeup + * code. */ int dl_throttled; int dl_boosted; int dl_yielded; + int dl_non_contending; /* * Bandwidth enforcement timer. Each -deadline task has its * own bandwidth to be enforced, thus we need one timer per task. */ struct hrtimer dl_timer; + + /* + * Inactive timer, responsible for decreasing the active utilization + * at the "0-lag time". When a -deadline task blocks, it contributes + * to GRUB's active utilization until the "0-lag time", hence a + * timer is needed to decrease the active utilization at the correct + * time. + */ + struct hrtimer inactive_timer; }; union rcu_special { diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 6d6cad9..bf0b0b9 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2165,6 +2165,7 @@ void __dl_clear_params(struct task_struct *p) dl_se->dl_throttled = 0; dl_se->dl_yielded = 0; + dl_se->dl_non_contending = 0; } /* @@ -2196,6 +2197,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) RB_CLEAR_NODE(&p->dl.rb_node); init_dl_task_timer(&p->dl); + init_inactive_task_timer(&p->dl); __dl_clear_params(p); INIT_LIST_HEAD(&p->rt.run_list); @@ -2518,6 +2520,7 @@ static int dl_overflow(struct task_struct *p, int policy, !__dl_overflow(dl_b, cpus, p->dl.dl_bw, new_bw)) { __dl_clear(dl_b, p->dl.dl_bw); __dl_add(dl_b, new_bw); + dl_change_utilization(p, new_bw); err = 0; } else if (!dl_policy(policy) && task_has_dl_policy(p)) { __dl_clear(dl_b, p->dl.dl_bw); diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index cef9adb..86aed82 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -65,6 +65,107 @@ void sub_running_bw(u64 dl_bw, struct dl_rq *dl_rq) dl_rq->running_bw = 0; } +void dl_change_utilization(struct task_struct *p, u64 new_bw) +{ + if (!task_on_rq_queued(p)) { + struct rq *rq = task_rq(p); + + if (p->dl.dl_non_contending) { + sub_running_bw(p->dl.dl_bw, &rq->dl); + p->dl.dl_non_contending = 0; + /* + * If the timer handler is currently running and the + * timer cannot be cancelled, inactive_task_timer() + * will see that dl_not_contending is not set, and + * will not touch the rq's active utilization, + * so we are still safe. + */ + if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1) + put_task_struct(p); + } + } +} + +static void task_non_contending(struct task_struct *p) +{ + struct sched_dl_entity *dl_se = &p->dl; + struct hrtimer *timer = &dl_se->inactive_timer; + struct dl_rq *dl_rq = dl_rq_of_se(dl_se); + struct rq *rq = rq_of_dl_rq(dl_rq); + s64 zerolag_time; + + /* + * If this is a non-deadline task that has been boosted, + * do nothing + */ + if (dl_se->dl_runtime == 0) + return; + + WARN_ON(hrtimer_active(&dl_se->inactive_timer)); + WARN_ON(dl_se->dl_non_contending); + + zerolag_time = dl_se->deadline - + div64_long((dl_se->runtime * dl_se->dl_period), + dl_se->dl_runtime); + + /* + * Using relative times instead of the absolute "0-lag time" + * allows to simplify the code + */ + zerolag_time -= rq_clock(rq); + + /* + * If the "0-lag time" already passed, decrease the active + * utilization now, instead of starting a timer + */ + if (zerolag_time < 0) { + if (dl_task(p)) + sub_running_bw(dl_se->dl_bw, dl_rq); + if (!dl_task(p) || p->state == TASK_DEAD) + __dl_clear_params(p); + + return; + } + + dl_se->dl_non_contending = 1; + get_task_struct(p); + hrtimer_start(timer, ns_to_ktime(zerolag_time), HRTIMER_MODE_REL); +} + +static void task_contending(struct sched_dl_entity *dl_se) +{ + struct dl_rq *dl_rq = dl_rq_of_se(dl_se); + + /* + * If this is a non-deadline task that has been boosted, + * do nothing + */ + if (dl_se->dl_runtime == 0) + return; + + if (dl_se->dl_non_contending) { + /* + * If the timer handler is currently running and the + * timer cannot be cancelled, inactive_task_timer() + * will see that dl_not_contending is not set, and + * will not touch the rq's active utilization, + * so we are still safe. + */ + if (hrtimer_try_to_cancel(&dl_se->inactive_timer) == 1) + put_task_struct(dl_task_of(dl_se)); + dl_se->dl_non_contending = 0; + } else { + /* + * Since "dl_non_contending" is not set, the + * task's utilization has already been removed from + * active utilization (either when the task blocked, + * when the "inactive timer" fired). + * So, add it back. + */ + add_running_bw(dl_se->dl_bw, dl_rq); + } +} + static inline int is_leftmost(struct task_struct *p, struct dl_rq *dl_rq) { struct sched_dl_entity *dl_se = &p->dl; @@ -615,10 +716,8 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) * The task might have changed its scheduling policy to something * different than SCHED_DEADLINE (through switched_from_dl()). */ - if (!dl_task(p)) { - __dl_clear_params(p); + if (!dl_task(p)) goto unlock; - } /* * The task might have been boosted by someone else and might be in the @@ -837,6 +936,49 @@ static void update_curr_dl(struct rq *rq) } } +static enum hrtimer_restart inactive_task_timer(struct hrtimer *timer) +{ + struct sched_dl_entity *dl_se = container_of(timer, + struct sched_dl_entity, + inactive_timer); + struct task_struct *p = dl_task_of(dl_se); + struct rq_flags rf; + struct rq *rq; + + rq = task_rq_lock(p, &rf); + + if (!dl_task(p) || p->state == TASK_DEAD) { + if (p->state == TASK_DEAD && dl_se->dl_non_contending) { + sub_running_bw(p->dl.dl_bw, dl_rq_of_se(&p->dl)); + dl_se->dl_non_contending = 0; + } + __dl_clear_params(p); + + goto unlock; + } + if (dl_se->dl_non_contending == 0) + goto unlock; + + sched_clock_tick(); + update_rq_clock(rq); + + sub_running_bw(dl_se->dl_bw, &rq->dl); + dl_se->dl_non_contending = 0; +unlock: + task_rq_unlock(rq, p, &rf); + put_task_struct(p); + + return HRTIMER_NORESTART; +} + +void init_inactive_task_timer(struct sched_dl_entity *dl_se) +{ + struct hrtimer *timer = &dl_se->inactive_timer; + + hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + timer->function = inactive_task_timer; +} + #ifdef CONFIG_SMP static void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline) @@ -969,9 +1111,7 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se, * we want a replenishment of its runtime. */ if (flags & ENQUEUE_WAKEUP) { - struct dl_rq *dl_rq = dl_rq_of_se(dl_se); - - add_running_bw(dl_se->dl_bw, dl_rq); + task_contending(dl_se); update_dl_entity(dl_se, pi_se); } else if (flags & ENQUEUE_REPLENISH) @@ -1040,7 +1180,9 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) * add_running_bw(). */ if (p->dl.dl_throttled && !(flags & ENQUEUE_REPLENISH)) { - add_running_bw(p->dl.dl_bw, &rq->dl); + if (flags & ENQUEUE_WAKEUP) + task_contending(&p->dl); + return; } @@ -1065,7 +1207,8 @@ static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags) sub_running_bw(p->dl.dl_bw, &rq->dl); /* - * This check allows to decrease the active utilization in two cases: + * This check allows to start the inactive timer (or to immediately + * decrease the active utilization, if needed) in two cases: * when the task blocks and when it is terminating * (p->state == TASK_DEAD). We can handle the two cases in the same * way, because from GRUB's point of view the same thing is happening @@ -1073,7 +1216,7 @@ static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags) * or "inactive") */ if (flags & DEQUEUE_SLEEP) - sub_running_bw(p->dl.dl_bw, &rq->dl); + task_non_contending(p); } /* @@ -1151,6 +1294,28 @@ select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags) return cpu; } +static void migrate_task_rq_dl(struct task_struct *p) +{ + if ((p->state == TASK_WAKING) && (p->dl.dl_non_contending)) { + struct rq *rq = task_rq(p); + + raw_spin_lock(&rq->lock); + sub_running_bw(p->dl.dl_bw, &rq->dl); + p->dl.dl_non_contending = 0; + /* + * If the timer handler is currently running and the + * timer cannot be cancelled, inactive_task_timer() + * will see that dl_not_contending is not set, and + * will not touch the rq's active utilization, + * so we are still safe. + */ + if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1) + put_task_struct(p); + + raw_spin_unlock(&rq->lock); + } +} + static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p) { /* @@ -1792,13 +1957,23 @@ void __init init_sched_dl_class(void) static void switched_from_dl(struct rq *rq, struct task_struct *p) { /* - * Start the deadline timer; if we switch back to dl before this we'll - * continue consuming our current CBS slice. If we stay outside of - * SCHED_DEADLINE until the deadline passes, the timer will reset the - * task. + * task_non_contending() can start the "inactive timer" (if the 0-lag + * time is in the future). If the task switches back to dl before + * the "inactive timer" fires, it can continue to consume its current + * runtime using its current deadline. If it stays outside of + * SCHED_DEADLINE until the 0-lag time passes, inactive_task_timer() + * will reset the task parameters. */ - if (!start_dl_timer(p)) - __dl_clear_params(p); + if (task_on_rq_queued(p) && p->dl.dl_runtime) + task_non_contending(p); + + /* + * We cannot use inactive_task_timer() to invoke sub_running_bw() + * at the 0-lag time, because the task could have been migrated + * while SCHED_OTHER in the meanwhile. + */ + if (p->dl.dl_non_contending) + p->dl.dl_non_contending = 0; /* * Since this might be the only -deadline task on the rq, @@ -1817,6 +1992,8 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p) */ static void switched_to_dl(struct rq *rq, struct task_struct *p) { + if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1) + put_task_struct(p); /* If p is not queued we will update its parameters at next wakeup. */ if (!task_on_rq_queued(p)) @@ -1891,6 +2068,7 @@ const struct sched_class dl_sched_class = { #ifdef CONFIG_SMP .select_task_rq = select_task_rq_dl, + .migrate_task_rq = migrate_task_rq_dl, .set_cpus_allowed = set_cpus_allowed_dl, .rq_online = rq_online_dl, .rq_offline = rq_offline_dl, diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index caaa7d3..57bb79b 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -244,6 +244,7 @@ bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw) dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw; } +void dl_change_utilization(struct task_struct *p, u64 new_bw); extern void init_dl_bw(struct dl_bw *dl_b); #ifdef CONFIG_CGROUP_SCHED @@ -1490,6 +1491,7 @@ extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime extern struct dl_bandwidth def_dl_bandwidth; extern void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime); extern void init_dl_task_timer(struct sched_dl_entity *dl_se); +extern void init_inactive_task_timer(struct sched_dl_entity *dl_se); unsigned long to_ratio(u64 period, u64 runtime); -- 2.7.4