From: Brendan Jackman
To: linux-kernel@vger.kernel.org
Cc: Joel Fernandes, Andres Oportus, Ingo Molnar, Morten Rasmussen,
 Peter Zijlstra, Dietmar Eggemann, Vincent Guittot
Subject: [PATCH 2/2] sched/fair: Fix use of NULL with find_idlest_group
Date: Mon, 21 Aug 2017 16:21:28 +0100
Message-Id: <20170821152128.14418-3-brendan.jackman@arm.com>
X-Mailer: git-send-email 2.14.1
In-Reply-To: <20170821152128.14418-1-brendan.jackman@arm.com>
References: <20170821152128.14418-1-brendan.jackman@arm.com>

The current use of returning NULL from find_idlest_group is broken in
the following cases:

a1) The local group is not allowed.

    In this case we currently do not change this_runnable_load or
    this_avg_load from their initial value of 0, which means we return
    NULL regardless of the load of the other, allowed groups. This
    results in pointlessly continuing the find_idlest_group search
    within the local group and then returning prev_cpu from
    select_task_rq_fair.

a2) No CPUs in the sched_domain are allowed.

    In this case we also return NULL and again pointlessly continue the
    search.

b) smp_processor_id() is the "idlest" and != prev_cpu.

    find_idlest_group also returns NULL when the local group is allowed
    and is the idlest. The caller then continues the find_idlest_group
    search at a lower level of the current CPU's sched_domain
    hierarchy. However, new_cpu is not updated, so the search is
    pointless and we return prev_cpu from select_task_rq_fair.

This is fixed by:

1. Returning NULL from find_idlest_group only when _no_ CPUs in the
   current sched_domain are allowed. In this case we now break out of
   the while(sd) loop and immediately return prev_cpu. This fixes
   case a2).

2. Initializing this_runnable_load and this_avg_load to ULONG_MAX
   instead of 0. This means that in case a1) we now return the idlest
   non-local group.

3. Explicitly updating new_cpu when find_idlest_group returns the
   local group, fixing case b).

This patch also reworks the check for whether the group under
consideration is local, under the assumption that the first group in a
sched_domain is always the local one.

Signed-off-by: Brendan Jackman
Cc: Ingo Molnar
Cc: Morten Rasmussen
Cc: Peter Zijlstra
Cc: Dietmar Eggemann
Cc: Vincent Guittot
---
 kernel/sched/fair.c | 34 +++++++++++++++++++---------------
 1 file changed, 19 insertions(+), 15 deletions(-)
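As a review aid only (this is not kernel code, and every toy_* name
below is invented for the illustration), here is a small standalone C
sketch of the contract this patch establishes between
find_idlest_group and its caller: NULL now means "no CPU in this
domain is allowed" (the caller breaks out and keeps prev_cpu), the
local group is returned when it is the idlest (the caller records the
current CPU in new_cpu before descending), and otherwise a remote
group is returned. The remote-group handling is heavily simplified
compared to select_task_rq_fair.

/*
 * toy_idlest.c - standalone model of the new find_idlest_group contract.
 * Every toy_* name is invented for this sketch; none of it is kernel API.
 */
#include <stdio.h>
#include <stddef.h>

struct toy_group {
        int first_cpu;                  /* representative CPU of the group */
        unsigned long load;             /* pretend runnable load */
};

struct toy_domain {
        struct toy_domain *child;       /* lower level of the hierarchy, or NULL */
        struct toy_group *groups;       /* groups[0] is the local group */
        int nr_groups;
        unsigned long allowed_mask;     /* bit i set => CPU i allowed for the task */
};

/*
 * Models the new return contract: NULL only when nothing in the domain is
 * allowed; otherwise the idlest allowed group, which may be the local one.
 */
static struct toy_group *toy_find_idlest_group(const struct toy_domain *sd)
{
        struct toy_group *idlest = NULL;

        for (int i = 0; i < sd->nr_groups; i++) {
                struct toy_group *g = &sd->groups[i];

                /* Skip groups whose CPUs are all disallowed. */
                if (!(sd->allowed_mask & (1UL << g->first_cpu)))
                        continue;
                if (!idlest || g->load < idlest->load)
                        idlest = g;
        }
        return idlest;
}

/* Models the fixed caller loop; this_cpu plays the role of "cpu" above. */
static int toy_select_cpu(const struct toy_domain *sd, int prev_cpu, int this_cpu)
{
        int new_cpu = prev_cpu;

        while (sd) {
                struct toy_group *group = toy_find_idlest_group(sd);

                if (!group)
                        break;                  /* nothing allowed: keep prev_cpu */

                if (group == &sd->groups[0]) {
                        new_cpu = this_cpu;     /* local group is idlest: record it */
                        sd = sd->child;         /* ...then search at a lower level */
                        continue;
                }

                /*
                 * A remote group won: take its representative CPU.  The real
                 * code refines this choice further; that part is elided here.
                 */
                new_cpu = group->first_cpu;
                break;
        }
        return new_cpu;
}

int main(void)
{
        struct toy_group groups[2] = {
                { .first_cpu = 0, .load = 100 },        /* local group (CPU 0) */
                { .first_cpu = 1, .load = 400 },        /* remote group (CPU 1) */
        };
        struct toy_domain top = {
                .child = NULL,
                .groups = groups,
                .nr_groups = 2,
                .allowed_mask = 0x3,                    /* CPUs 0 and 1 allowed */
        };

        /* Local group is idlest: we get this_cpu (0), not prev_cpu (1). */
        printf("selected CPU: %d\n", toy_select_cpu(&top, 1, 0));
        return 0;
}

Building and running this prints "selected CPU: 0", i.e. case b)
above: the local group is the idlest, so the chosen CPU ends up being
this_cpu rather than silently falling back to prev_cpu.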
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 64618d768546..7cb5ed719cf9 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5382,26 +5382,29 @@ static unsigned long capacity_spare_wake(int cpu, struct task_struct *p)
  * domain.
  */
 static struct sched_group *
-find_idlest_group(struct sched_domain *sd, struct task_struct *p,
-                  int this_cpu, int sd_flag)
+find_idlest_group(struct sched_domain *sd, struct task_struct *p, int sd_flag)
 {
         struct sched_group *idlest = NULL, *group = sd->groups;
+        struct sched_group *local_group = sd->groups;
         struct sched_group *most_spare_sg = NULL;
-        unsigned long min_runnable_load = ULONG_MAX, this_runnable_load = 0;
-        unsigned long min_avg_load = ULONG_MAX, this_avg_load = 0;
+        unsigned long min_runnable_load = ULONG_MAX, this_runnable_load = ULONG_MAX;
+        unsigned long min_avg_load = ULONG_MAX, this_avg_load = ULONG_MAX;
         unsigned long most_spare = 0, this_spare = 0;
         int load_idx = sd->forkexec_idx;
         int imbalance_scale = 100 + (sd->imbalance_pct-100)/2;
         unsigned long imbalance = scale_load_down(NICE_0_LOAD) *
                                 (sd->imbalance_pct-100) / 100;
 
+        if (!cpumask_intersects(sched_domain_span(sd), &p->cpus_allowed))
+                return NULL;
+
         if (sd_flag & SD_BALANCE_WAKE)
                 load_idx = sd->wake_idx;
 
         do {
                 unsigned long load, avg_load, runnable_load;
                 unsigned long spare_cap, max_spare_cap;
-                int local_group;
+                bool group_is_local = group == local_group;
                 int i;
 
                 /* Skip over this group if it has no CPUs allowed */
@@ -5409,9 +5412,6 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
                                         &p->cpus_allowed))
                         continue;
 
-                local_group = cpumask_test_cpu(this_cpu,
-                                               sched_group_span(group));
-
                 /*
                  * Tally up the load of all CPUs in the group and find
                  * the group containing the CPU with most spare capacity.
@@ -5422,7 +5422,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
 
                 for_each_cpu(i, sched_group_span(group)) {
                         /* Bias balancing toward cpus of our domain */
-                        if (local_group)
+                        if (group_is_local)
                                 load = source_load(i, load_idx);
                         else
                                 load = target_load(i, load_idx);
@@ -5443,7 +5443,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
                 runnable_load = (runnable_load * SCHED_CAPACITY_SCALE) /
                                         group->sgc->capacity;
 
-                if (local_group) {
+                if (group_is_local) {
                         this_runnable_load = runnable_load;
                         this_avg_load = avg_load;
                         this_spare = max_spare_cap;
@@ -5489,21 +5489,21 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
 
         if (this_spare > task_util(p) / 2 &&
             imbalance_scale*this_spare > 100*most_spare)
-                return NULL;
+                return local_group;
 
         if (most_spare > task_util(p) / 2)
                 return most_spare_sg;
 
 skip_spare:
         if (!idlest)
-                return NULL;
+                return local_group;
 
         if (min_runnable_load > (this_runnable_load + imbalance))
-                return NULL;
+                return local_group;
 
         if ((this_runnable_load < (min_runnable_load + imbalance)) &&
              (100*this_avg_load < imbalance_scale*min_avg_load))
-                return NULL;
+                return local_group;
 
         return idlest;
 }
@@ -5927,8 +5927,12 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
                         continue;
                 }
 
-                group = find_idlest_group(sd, p, cpu, sd_flag);
+                group = find_idlest_group(sd, p, sd_flag);
                 if (!group) {
+                        break;
+                } else if (group == sd->groups) {
+                        new_cpu = cpu;
+                        /* Now try balancing at a lower domain level of cpu */
                         sd = sd->child;
                         continue;
                 }
-- 
2.14.1