From: Vaidyanathan Srinivasan
Subject: [RFC PATCH v2 1/5] sched: load calculation for each group in sched domain
To: Linux Kernel, Suresh B Siddha, Venkatesh Pallipadi, Peter Zijlstra
Cc: Ingo Molnar, Dipankar Sarma, Balbir Singh, Vatsa, Gautham R Shenoy,
    Andi Kleen, David Collier-Brown, Tim Connors, Max Krasnyansky,
    Vaidyanathan Srinivasan
Date: Thu, 09 Oct 2008 17:39:25 +0530
Message-ID: <20081009120924.27010.59999.stgit@drishya.in.ibm.com>
In-Reply-To: <20081009120705.27010.12857.stgit@drishya.in.ibm.com>
References: <20081009120705.27010.12857.stgit@drishya.in.ibm.com>
User-Agent: StGIT/0.14.2

Add data structures for per-group stats, and a function to calculate the
required per-group stats.

Signed-off-by: Vaidyanathan Srinivasan
---

 kernel/sched.c |  114 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 files changed, 114 insertions(+), 0 deletions(-)

diff --git a/kernel/sched.c b/kernel/sched.c
index ad1962d..ab77937 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3064,6 +3064,120 @@ static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
 	return 0;
 }
 
+/* Helper functions for find_busiest_group */
+
+int get_load_idx(struct sched_domain *sd, enum cpu_idle_type idle)
+{
+	if (idle == CPU_NOT_IDLE)
+		return sd->busy_idx;
+	else if (idle == CPU_NEWLY_IDLE)
+		return sd->newidle_idx;
+	else
+		return sd->idle_idx;
+}
+
+/* Struct to return group stats */
+
+struct group_loads {
+	struct sched_group *group;
+	unsigned long nr_running;
+	unsigned long load;		 /* Decayed average load */
+	unsigned long load_per_cpu;	 /* Decayed load / cpu_power */
+	unsigned long weighted_load;	 /* Instantaneous load (load.weight) */
+	unsigned long avg_load_per_task; /* Instantaneous load / nr_running */
+	unsigned int group_imbalance;
+	int local_group;
+	int balance_cpu;
+};
+
+/* Helper function to calculate basic group level stats */
+
+int get_group_loads(struct sched_group *group, int this_cpu,
+		    const cpumask_t *valid_cpus, enum cpu_idle_type idle,
+		    int load_idx,
+		    struct group_loads *gl)
+{
+	struct rq *rq;
+	unsigned long load, min_load, max_load, avg_load_per_task_per_cpu;
+	int cpu;
+	int local_group = 0;
+	int first_idle_cpu = -1;
+	int need_balance = 1;
+
+	gl->group = group;
+	gl->nr_running = 0;
+	gl->load = 0;
+	gl->weighted_load = 0;
+	gl->avg_load_per_task = 0;
+	gl->group_imbalance = 0;
+	gl->balance_cpu = -1;
+	max_load = 0;
+	min_load = ~0UL;
+
+	gl->local_group = local_group = cpu_isset(this_cpu, group->cpumask);
+
+	for_each_cpu_mask_nr(cpu, group->cpumask) {
+		if (!cpu_isset(cpu, *valid_cpus))
+			continue;
+
+		rq = cpu_rq(cpu);
+
+		/* Bias balancing toward cpus of our domain */
+		if (gl->local_group) {
+			if (idle_cpu(cpu) && first_idle_cpu == -1)
+				first_idle_cpu = cpu;
+
+			load = target_load(cpu, load_idx);
+		} else {
+			load = source_load(cpu, load_idx);
+			if (load > max_load)
+				max_load = load;
+			if (load < min_load)
+				min_load = load;
+		}
+		gl->nr_running += rq->nr_running;
+		gl->load += load;
+		gl->weighted_load += weighted_cpuload(cpu);
+		gl->avg_load_per_task += cpu_avg_load_per_task(cpu);
+	}
+
+	/*
+	 * Consider the group unbalanced when the imbalance is larger
+	 * than the average weight of two tasks.
+	 *
+	 * APZ: with cgroup the avg task weight can vary wildly and
+	 * might not be a suitable number - should we keep a
+	 * normalized nr_running number somewhere that negates
+	 * the hierarchy?
+	 */
+
+	avg_load_per_task_per_cpu = sg_div_cpu_power(group,
+				gl->avg_load_per_task * SCHED_LOAD_SCALE);
+
+	if (!gl->local_group &&
+	    ((max_load - min_load) > 2 * avg_load_per_task_per_cpu))
+		gl->group_imbalance = 1;
+
+	if (local_group) {
+		if (first_idle_cpu != -1)
+			gl->balance_cpu = first_idle_cpu;
+		else
+			gl->balance_cpu = first_cpu(group->cpumask);
+
+		/*
+		 * First idle cpu or the first cpu(busiest) in this sched group
+		 * is eligible for doing load balancing at this and above
+		 * domains. In the newly idle case, we will allow all the cpu's
+		 * to do the newly idle load balance.
+		 */
+		if (idle != CPU_NEWLY_IDLE && gl->balance_cpu != this_cpu)
+			need_balance = 0;
+	}
+	gl->load_per_cpu = sg_div_cpu_power(group, gl->load * SCHED_LOAD_SCALE);
+
+	return need_balance;
+}
+
 /*
  * find_busiest_group finds and returns the busiest CPU group within the
  * domain. It calculates and returns the amount of weighted load which
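
For readers who want to see the arithmetic in isolation, below is a small
self-contained userspace sketch of what get_group_loads() computes for one
group. The CPU count, per-cpu load samples, nr_running values and the group
cpu_power are made-up assumptions for illustration only; in the kernel the
decayed loads come from source_load()/target_load(), the per-task averages
from cpu_avg_load_per_task(), and the scaling from sg_div_cpu_power().

/*
 * Standalone illustration only (not kernel code).  It mirrors the
 * aggregation done by get_group_loads() for a single remote group:
 * sum the per-cpu loads, scale by the group's cpu_power, and flag the
 * group imbalanced when the load spread exceeds roughly two tasks.
 * All input numbers below are invented for the example.
 */
#include <stdio.h>

#define SCHED_LOAD_SCALE	1024UL

struct group_loads_demo {
	unsigned long nr_running;
	unsigned long load;		/* sum of decayed per-cpu loads */
	unsigned long load_per_cpu;	/* load scaled by group cpu_power */
	unsigned long avg_load_per_task;
	unsigned int group_imbalance;
};

int main(void)
{
	/* Hypothetical 4-cpu group: decayed load and task count per cpu */
	unsigned long cpu_load[4]       = { 900, 1100, 300, 2800 };
	unsigned long cpu_nr_running[4] = { 1, 1, 0, 3 };
	unsigned long cpu_power = 4 * SCHED_LOAD_SCALE; /* sum of per-cpu power */

	struct group_loads_demo gl = { 0 };
	unsigned long min_load = ~0UL, max_load = 0, avg_load_per_task_per_cpu;
	int cpu;

	for (cpu = 0; cpu < 4; cpu++) {
		unsigned long load = cpu_load[cpu];

		if (load > max_load)
			max_load = load;
		if (load < min_load)
			min_load = load;

		gl.nr_running += cpu_nr_running[cpu];
		gl.load += load;
		gl.avg_load_per_task += cpu_nr_running[cpu] ?
					load / cpu_nr_running[cpu] : 0;
	}

	/* Same scaling idea as sg_div_cpu_power(): value * SCHED_LOAD_SCALE / power */
	gl.load_per_cpu = gl.load * SCHED_LOAD_SCALE / cpu_power;
	avg_load_per_task_per_cpu = gl.avg_load_per_task * SCHED_LOAD_SCALE / cpu_power;

	/* Imbalance heuristic from the patch: spread larger than ~2 task weights */
	if (max_load - min_load > 2 * avg_load_per_task_per_cpu)
		gl.group_imbalance = 1;

	printf("load=%lu load_per_cpu=%lu nr_running=%lu imbalance=%u\n",
	       gl.load, gl.load_per_cpu, gl.nr_running, gl.group_imbalance);
	return 0;
}

With these sample numbers the group load is 5100 (load_per_cpu 1275 after
scaling), the spread between the busiest and least loaded CPU is 2500, and
twice the scaled per-task average is about 1466, so the sketch flags the
group imbalanced, which is the same condition get_group_loads() uses to set
group_imbalance for remote groups.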