Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S934099AbbELTln (ORCPT ); Tue, 12 May 2015 15:41:43 -0400 Received: from foss.arm.com ([217.140.101.70]:33977 "EHLO foss.arm.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S933978AbbELTiv (ORCPT ); Tue, 12 May 2015 15:38:51 -0400 From: Morten Rasmussen To: peterz@infradead.org, mingo@redhat.com Cc: vincent.guittot@linaro.org, Dietmar Eggemann , yuyang.du@intel.com, preeti@linux.vnet.ibm.com, mturquette@linaro.org, rjw@rjwysocki.net, Juri Lelli , sgurrappadi@nvidia.com, pang.xunlei@zte.com.cn, linux-kernel@vger.kernel.org, linux-pm@vger.kernel.org, morten.rasmussen@arm.com Subject: [RFCv4 PATCH 25/34] sched: Add over-utilization/tipping point indicator Date: Tue, 12 May 2015 20:39:00 +0100 Message-Id: <1431459549-18343-26-git-send-email-morten.rasmussen@arm.com> X-Mailer: git-send-email 1.9.1 In-Reply-To: <1431459549-18343-1-git-send-email-morten.rasmussen@arm.com> References: <1431459549-18343-1-git-send-email-morten.rasmussen@arm.com> Sender: linux-kernel-owner@vger.kernel.org List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Content-Length: 5785 Lines: 163 Energy-aware scheduling is only meant to be active while the system is _not_ over-utilized. That is, there are spare cycles available to shift tasks around based on their actual utilization to get a more energy-efficient task distribution without depriving any tasks. When above the tipping point task placement is done the traditional way, spreading the tasks across as many cpus as possible based on priority scaled load to preserve smp_nice. The over-utilization condition is conservatively chosen to indicate over-utilization as soon as one cpu is fully utilized at it's highest frequency. We don't consider groups as lumping usage and capacity together for a group of cpus may hide the fact that one or more cpus in the group are over-utilized while group-siblings are partially idle. The tasks could be served better if moved to another group with completely idle cpus. This is particularly problematic if some cpus have a significantly reduced capacity due to RT/IRQ pressure or if the system has cpus of different capacity (e.g. ARM big.LITTLE). cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Morten Rasmussen --- kernel/sched/fair.c | 35 +++++++++++++++++++++++++++++++---- kernel/sched/sched.h | 3 +++ 2 files changed, 34 insertions(+), 4 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f36ab2f3..5b7bc28 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4266,6 +4266,8 @@ static inline void hrtick_update(struct rq *rq) } #endif +static bool cpu_overutilized(int cpu); + /* * The enqueue_task method is called before nr_running is * increased. Here we update the fair scheduling stats and @@ -4276,6 +4278,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) { struct cfs_rq *cfs_rq; struct sched_entity *se = &p->se; + int task_new = !(flags & ENQUEUE_WAKEUP); for_each_sched_entity(se) { if (se->on_rq) @@ -4310,6 +4313,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (!se) { update_rq_runnable_avg(rq, rq->nr_running); add_nr_running(rq, 1); + if (!task_new && !rq->rd->overutilized && + cpu_overutilized(rq->cpu)) + rq->rd->overutilized = true; } hrtick_update(rq); } @@ -4937,6 +4943,14 @@ static int find_new_capacity(struct energy_env *eenv, return idx; } +static unsigned int capacity_margin = 1280; /* ~20% margin */ + +static bool cpu_overutilized(int cpu) +{ + return (capacity_of(cpu) * 1024) < + (get_cpu_usage(cpu) * capacity_margin); +} + /* * sched_group_energy(): Returns absolute energy consumption of cpus belonging * to the sched_group including shared resources shared only by members of the @@ -6732,11 +6746,12 @@ static enum group_type group_classify(struct lb_env *env, * @local_group: Does group contain this_cpu. * @sgs: variable to hold the statistics for this group. * @overload: Indicate more than one runnable task for any CPU. + * @overutilized: Indicate overutilization for any CPU. */ static inline void update_sg_lb_stats(struct lb_env *env, struct sched_group *group, int load_idx, int local_group, struct sg_lb_stats *sgs, - bool *overload) + bool *overload, bool *overutilized) { unsigned long load; int i; @@ -6766,6 +6781,9 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->sum_weighted_load += weighted_cpuload(i); if (idle_cpu(i)) sgs->idle_cpus++; + + if (cpu_overutilized(i)) + *overutilized = true; } /* Adjust by relative CPU capacity of the group */ @@ -6871,7 +6889,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd struct sched_group *sg = env->sd->groups; struct sg_lb_stats tmp_sgs; int load_idx, prefer_sibling = 0; - bool overload = false; + bool overload = false, overutilized = false; if (child && child->flags & SD_PREFER_SIBLING) prefer_sibling = 1; @@ -6893,7 +6911,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd } update_sg_lb_stats(env, sg, load_idx, local_group, sgs, - &overload); + &overload, &overutilized); if (local_group) goto next_group; @@ -6935,8 +6953,14 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd /* update overload indicator if we are at root domain */ if (env->dst_rq->rd->overload != overload) env->dst_rq->rd->overload = overload; - } + /* Update over-utilization (tipping point, U >= 0) indicator */ + if (env->dst_rq->rd->overutilized != overutilized) + env->dst_rq->rd->overutilized = overutilized; + } else { + if (!env->dst_rq->rd->overutilized && overutilized) + env->dst_rq->rd->overutilized = true; + } } /** @@ -8300,6 +8324,9 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) task_tick_numa(rq, curr); update_rq_runnable_avg(rq, 1); + + if (!rq->rd->overutilized && cpu_overutilized(task_cpu(curr))) + rq->rd->overutilized = true; } /* diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index b627dfa..a5d2d69 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -535,6 +535,9 @@ struct root_domain { /* Indicate more than one runnable task for any CPU */ bool overload; + /* Indicate one or more cpus over-utilized (tipping point) */ + bool overutilized; + /* * The bit corresponding to a CPU gets set here if such CPU has more * than one runnable -deadline task (as it is below for RT tasks). -- 1.9.1 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/