From: Vaidyanathan Srinivasan
Subject: [RFC PATCH v2 1/5] sched: load calculation for each group in sched domain
To: Linux Kernel, Suresh B Siddha, Venkatesh Pallipadi, Peter Zijlstra
Cc: Ingo Molnar, Dipankar Sarma, Balbir Singh, Vatsa, Gautham R Shenoy,
    Andi Kleen, David Collier-Brown, Tim Connors, Max Krasnyansky,
    Vaidyanathan Srinivasan
Date: Thu, 09 Oct 2008 17:39:25 +0530
Message-ID: <20081009120924.27010.59999.stgit@drishya.in.ibm.com>
In-Reply-To: <20081009120705.27010.12857.stgit@drishya.in.ibm.com>
References: <20081009120705.27010.12857.stgit@drishya.in.ibm.com>
User-Agent: StGIT/0.14.2

Add data structures for per-group stats, and a function to calculate the
required per-group stats.

Signed-off-by: Vaidyanathan Srinivasan
---

 kernel/sched.c |  114 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 files changed, 114 insertions(+), 0 deletions(-)

diff --git a/kernel/sched.c b/kernel/sched.c
index ad1962d..ab77937 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3064,6 +3064,120 @@ static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
 	return 0;
 }
 
+/* Helper functions for find_busiest_group */
+
+int get_load_idx(struct sched_domain *sd, enum cpu_idle_type idle)
+{
+	if (idle == CPU_NOT_IDLE)
+		return sd->busy_idx;
+	else if (idle == CPU_NEWLY_IDLE)
+		return sd->newidle_idx;
+	else
+		return sd->idle_idx;
+}
+
+/* Struct to return group stats */
+
+struct group_loads {
+	struct sched_group *group;
+	unsigned long nr_running;
+	unsigned long load;		 /* Decayed average load */
+	unsigned long load_per_cpu;	 /* Decayed load / cpu_power */
+	unsigned long weighted_load;	 /* Instantaneous load (load.weight) */
+	unsigned long avg_load_per_task; /* Instantaneous load / nr_running */
+	unsigned int group_imbalance;
+	int local_group;
+	int balance_cpu;
+};
+
+/* Helper function to calculate basic group level stats */
+
+int get_group_loads(struct sched_group *group, int this_cpu,
+		    const cpumask_t *valid_cpus, enum cpu_idle_type idle,
+		    int load_idx,
+		    struct group_loads *gl)
+{
+	struct rq *rq;
+	unsigned long load, min_load, max_load, avg_load_per_task_per_cpu;
+	int cpu;
+	int local_group = 0;
+	int first_idle_cpu = -1;
+	int need_balance = 1;
+
+	gl->group = group;
+	gl->nr_running = 0;
+	gl->load = 0;
+	gl->weighted_load = 0;
+	gl->avg_load_per_task = 0;
+	gl->group_imbalance = 0;
+	gl->balance_cpu = -1;
+	max_load = 0;
+	min_load = ~0UL;
+
+	gl->local_group = local_group = cpu_isset(this_cpu, group->cpumask);
+
+	for_each_cpu_mask_nr(cpu, group->cpumask) {
+		if (!cpu_isset(cpu, *valid_cpus))
+			continue;
+
+		rq = cpu_rq(cpu);
+
+		/* Bias balancing toward cpus of our domain */
+		if (gl->local_group) {
+			if (idle_cpu(cpu) && first_idle_cpu == -1)
+				first_idle_cpu = cpu;
+
+			load = target_load(cpu, load_idx);
+		} else {
+			load = source_load(cpu, load_idx);
+			if (load > max_load)
+				max_load = load;
+			if (load < min_load)
+				min_load = load;
+		}
+		gl->nr_running += rq->nr_running;
+		gl->load += load;
+		gl->weighted_load += weighted_cpuload(cpu);
+		gl->avg_load_per_task += cpu_avg_load_per_task(cpu);
+	}
+
+	/*
+	 * Consider the group unbalanced when the imbalance is larger
+	 * than the average weight of two tasks.
+	 *
+	 * APZ: with cgroup the avg task weight can vary wildly and
+	 * might not be a suitable number - should we keep a
+	 * normalized nr_running number somewhere that negates
+	 * the hierarchy?
+	 */
+
+	avg_load_per_task_per_cpu = sg_div_cpu_power(group,
+				gl->avg_load_per_task * SCHED_LOAD_SCALE);
+
+	if (!gl->local_group &&
+	    ((max_load - min_load) > 2 * avg_load_per_task_per_cpu))
+		gl->group_imbalance = 1;
+
+	if (local_group) {
+		if (first_idle_cpu != -1)
+			gl->balance_cpu = first_idle_cpu;
+		else
+			gl->balance_cpu = first_cpu(group->cpumask);
+
+		/*
+		 * First idle cpu or the first cpu(busiest) in this sched group
+		 * is eligible for doing load balancing at this and above
+		 * domains. In the newly idle case, we will allow all the cpu's
+		 * to do the newly idle load balance.
+		 */
+		if (idle != CPU_NEWLY_IDLE && gl->balance_cpu != this_cpu)
+			need_balance = 0;
+	}
+	gl->load_per_cpu = sg_div_cpu_power(group, gl->load * SCHED_LOAD_SCALE);
+
+	return need_balance;
+}
+
 /*
  * find_busiest_group finds and returns the busiest CPU group within the
  * domain. It calculates and returns the amount of weighted load which
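
For readers who want to see the arithmetic in isolation, below is a small
self-contained userspace sketch of what get_group_loads() computes for one
group. The CPU count, per-cpu load samples, nr_running values and the group
cpu_power are made-up assumptions for illustration only; in the kernel the
decayed loads come from source_load()/target_load(), the per-task averages
from cpu_avg_load_per_task(), and the scaling from sg_div_cpu_power().

/*
 * Standalone illustration only (not kernel code).  It mirrors the
 * aggregation done by get_group_loads() for a single remote group:
 * sum the per-cpu loads, scale by the group's cpu_power, and flag the
 * group imbalanced when the load spread exceeds roughly two tasks.
 * All input numbers below are invented for the example.
 */
#include <stdio.h>

#define SCHED_LOAD_SCALE	1024UL

struct group_loads_demo {
	unsigned long nr_running;
	unsigned long load;		/* sum of decayed per-cpu loads */
	unsigned long load_per_cpu;	/* load scaled by group cpu_power */
	unsigned long avg_load_per_task;
	unsigned int group_imbalance;
};

int main(void)
{
	/* Hypothetical 4-cpu group: decayed load and task count per cpu */
	unsigned long cpu_load[4]       = { 900, 1100, 300, 2800 };
	unsigned long cpu_nr_running[4] = { 1, 1, 0, 3 };
	unsigned long cpu_power = 4 * SCHED_LOAD_SCALE; /* sum of per-cpu power */

	struct group_loads_demo gl = { 0 };
	unsigned long min_load = ~0UL, max_load = 0, avg_load_per_task_per_cpu;
	int cpu;

	for (cpu = 0; cpu < 4; cpu++) {
		unsigned long load = cpu_load[cpu];

		if (load > max_load)
			max_load = load;
		if (load < min_load)
			min_load = load;

		gl.nr_running += cpu_nr_running[cpu];
		gl.load += load;
		gl.avg_load_per_task += cpu_nr_running[cpu] ?
					load / cpu_nr_running[cpu] : 0;
	}

	/* Same scaling idea as sg_div_cpu_power(): value * SCHED_LOAD_SCALE / power */
	gl.load_per_cpu = gl.load * SCHED_LOAD_SCALE / cpu_power;
	avg_load_per_task_per_cpu = gl.avg_load_per_task * SCHED_LOAD_SCALE / cpu_power;

	/* Imbalance heuristic from the patch: spread larger than ~2 task weights */
	if (max_load - min_load > 2 * avg_load_per_task_per_cpu)
		gl.group_imbalance = 1;

	printf("load=%lu load_per_cpu=%lu nr_running=%lu imbalance=%u\n",
	       gl.load, gl.load_per_cpu, gl.nr_running, gl.group_imbalance);
	return 0;
}

With these sample numbers the group load is 5100 (load_per_cpu 1275 after
scaling), the spread between the busiest and least loaded CPU is 2500, and
twice the scaled per-task average is about 1466, so the sketch flags the
group imbalanced, which is the same condition get_group_loads() uses to set
group_imbalance for remote groups.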