Return-Path: 
Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand
	id S1751229AbdCNVeM (ORCPT ); Tue, 14 Mar 2017 17:34:12 -0400
Received: from mail-lf0-f67.google.com ([209.85.215.67]:36314 "EHLO
	mail-lf0-f67.google.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org
	with ESMTP id S1750846AbdCNVeL (ORCPT );
	Tue, 14 Mar 2017 17:34:11 -0400
Date: Tue, 14 Mar 2017 23:34:06 +0200
From: Jarmo Tiitto 
Reply-To: Jarmo Tiitto 
Subject: [PATCH v4.9] sched/fair: Overload-On-Wakeup fix.
To: Ingo Molnar , Peter Zijlstra 
Cc: linux-kernel@vger.kernel.org
Message-Id: <1489527246.3961.3@smtp.gmail.com>
X-Mailer: geary/makepkg~g00718198
MIME-Version: 1.0
Content-Type: text/plain; charset=us-ascii; format=flowed
Sender: linux-kernel-owner@vger.kernel.org
List-ID: 
X-Mailing-List: linux-kernel@vger.kernel.org
Content-Length: 5948
Lines: 211

Try to improve multi-core scaling. The changes are experimental - the fix
works, but possible regressions are unknown. Please read the original
author's paper to understand the overall problem this patch tries to solve.

See the original patches at:
https://github.com/jplozi/wastedcores/tree/master/patches

And the related paper at:
http://www.i3s.unice.fr/~jplozi/wastedcores/

v1: the patch was rewritten for v4.9 to remove the goto statement.
v2: improved version of the initial wastedcores code; try to take the cpu
    wake-up latency into account and choose the most recently idled cpu
    instead.

I have tested the code and it really does improve multi-core performance
on my machine.

Reported-by: 
Signed-off-by: 
---
 kernel/sched/fair.c | 138 ++++++++++++++++++++++++++++++++--------------------
 1 file changed, 86 insertions(+), 52 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c242944f5cbd..6569e76294d4 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5630,6 +5630,10 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
 	int new_cpu = prev_cpu;
 	int want_affine = 0;
 	int sync = wake_flags & WF_SYNC;
+	int chk_cpu;
+	u64 idle_stamp = 0;
+	unsigned int idle_exit_lat = UINT_MAX;
+	int idle_stamp_cpu = -1;
 
 	if (sd_flag & SD_BALANCE_WAKE) {
 		record_wakee(p);
@@ -5638,69 +5642,99 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
 	}
 
 	rcu_read_lock();
-	for_each_domain(cpu, tmp) {
-		if (!(tmp->flags & SD_LOAD_BALANCE))
-			break;
+	/* fast wake up on current idle cpu */
+	if (!cpu_rq(prev_cpu)->nr_running) {
+		new_cpu = prev_cpu;
+	} else {
+		/* Check for suitable unused cores */
+		for_each_cpu_and(chk_cpu, cpu_online_mask, tsk_cpus_allowed(p)) {
+			struct rq *rq = cpu_rq(chk_cpu);
+			struct cpuidle_state *idle = idle_get_state(rq);
 
-		/*
-		 * If both cpu and prev_cpu are part of this domain,
-		 * cpu is a valid SD_WAKE_AFFINE target.
-		 */
-		if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
-		    cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
-			affine_sd = tmp;
-			break;
+			if (!rq->nr_running) {
+				if (idle && idle->exit_latency < idle_exit_lat) {
+					/* Always pick lower latency idle cpu */
+					idle_exit_lat = idle->exit_latency;
+					idle_stamp = rq->idle_stamp;
+					idle_stamp_cpu = chk_cpu;
+				} else if ((!idle || idle->exit_latency == idle_exit_lat) &&
+					   rq->idle_stamp > idle_stamp) {
+					/* Pick most recent idle cpu. */
+					idle_stamp = rq->idle_stamp;
+					idle_stamp_cpu = chk_cpu;
+				}
+			}
 		}
 
-		if (tmp->flags & sd_flag)
-			sd = tmp;
-		else if (!want_affine)
-			break;
-	}
+		if (idle_stamp_cpu != -1) {
+			/* Found suitable unused cpu */
+			new_cpu = idle_stamp_cpu;
+		} else {
+			/* There was no idle cpus available, must load balance. */
+			for_each_domain(cpu, tmp) {
+				if (!(tmp->flags & SD_LOAD_BALANCE))
+					break;
 
-	if (affine_sd) {
-		sd = NULL; /* Prefer wake_affine over balance flags */
-		if (cpu != prev_cpu && wake_affine(affine_sd, p, prev_cpu, sync))
-			new_cpu = cpu;
-	}
+				/* If both cpu and prev_cpu are part of this domain,
+				 * cpu is a valid SD_WAKE_AFFINE target.
+				 */
+				if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
+				    cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
+					affine_sd = tmp;
+					break;
+				}
+
+				if (tmp->flags & sd_flag)
+					sd = tmp;
+				else if (!want_affine)
+					break;
+			}
 
-	if (!sd) {
-		if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */
-			new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
+			if (affine_sd) {
+				sd = NULL; /* Prefer wake_affine over balance flags */
+				if (cpu != prev_cpu && wake_affine(affine_sd, p, prev_cpu, sync))
+					new_cpu = cpu;
+			}
 
-	} else while (sd) {
-		struct sched_group *group;
-		int weight;
+			if (!sd) {
+				if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */
+					new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
 
-		if (!(sd->flags & sd_flag)) {
-			sd = sd->child;
-			continue;
-		}
+			} else while (sd) {
+				struct sched_group *group;
+				int weight;
 
-		group = find_idlest_group(sd, p, cpu, sd_flag);
-		if (!group) {
-			sd = sd->child;
-			continue;
-		}
+				if (!(sd->flags & sd_flag)) {
+					sd = sd->child;
+					continue;
+				}
 
-		new_cpu = find_idlest_cpu(group, p, cpu);
-		if (new_cpu == -1 || new_cpu == cpu) {
-			/* Now try balancing at a lower domain level of cpu */
-			sd = sd->child;
-			continue;
-		}
+				group = find_idlest_group(sd, p, cpu, sd_flag);
+				if (!group) {
+					sd = sd->child;
+					continue;
+				}
 
-		/* Now try balancing at a lower domain level of new_cpu */
-		cpu = new_cpu;
-		weight = sd->span_weight;
-		sd = NULL;
-		for_each_domain(cpu, tmp) {
-			if (weight <= tmp->span_weight)
-				break;
-			if (tmp->flags & sd_flag)
-				sd = tmp;
+				new_cpu = find_idlest_cpu(group, p, cpu);
+				if (new_cpu == -1 || new_cpu == cpu) {
+					/* Now try balancing at a lower domain level of cpu */
+					sd = sd->child;
+					continue;
+				}
+
+				/* Now try balancing at a lower domain level of new_cpu */
+				cpu = new_cpu;
+				weight = sd->span_weight;
+				sd = NULL;
+				for_each_domain(cpu, tmp) {
+					if (weight <= tmp->span_weight)
+						break;
+					if (tmp->flags & sd_flag)
+						sd = tmp;
+				}
+				/* while loop will break here if sd == NULL */
+			}
 		}
-		/* while loop will break here if sd == NULL */
 	}
 	rcu_read_unlock();
 
-- 
2.12.0

^--- I'm a fairly new kernel developer, so please be nice. I hope this
email and the attached patch are useful. I did not write the original
code, but I have tinkered with it quite a lot as an experiment, and the
attached patch is the result of that tinkering. The code is most
probably incorrect and has side effects/regressions. However, I have run
multiple kernels with this applied and it never caused any severe
problems.
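
P.S. In case the restructured diff is hard to follow, below is a small
userspace sketch of the selection rule the patch adds: among the idle
cpus the task is allowed on, prefer the one with the lowest cpuidle exit
latency, and break ties by the most recent idle timestamp. This is only
an illustration - struct cpu_model, pick_idle_cpu() and the sample data
are made up for this email and are not kernel code, and the "no cpuidle
state" (!idle) case from the real patch is left out.

#include <limits.h>
#include <stdio.h>

struct cpu_model {                     /* stand-in for struct rq + its cpuidle state */
	unsigned int nr_running;       /* 0 means the cpu is idle */
	unsigned int exit_latency;     /* cost of waking from its idle state */
	unsigned long long idle_stamp; /* when the cpu last went idle */
};

/* Return the index of the preferred idle cpu, or -1 if none is idle. */
static int pick_idle_cpu(const struct cpu_model *cpus, int nr_cpus)
{
	unsigned int best_lat = UINT_MAX;
	unsigned long long best_stamp = 0;
	int best = -1;
	int i;

	for (i = 0; i < nr_cpus; i++) {
		if (cpus[i].nr_running)
			continue;                       /* busy cpu, skip it */
		if (cpus[i].exit_latency < best_lat) {
			/* a cheaper-to-wake idle cpu always wins */
			best_lat = cpus[i].exit_latency;
			best_stamp = cpus[i].idle_stamp;
			best = i;
		} else if (cpus[i].exit_latency == best_lat &&
			   cpus[i].idle_stamp > best_stamp) {
			/* same wake-up cost: prefer the most recently idled cpu */
			best_stamp = cpus[i].idle_stamp;
			best = i;
		}
	}
	return best;
}

int main(void)
{
	/* cpus 1 and 3 are idle at equal latency; cpu 3 idled later, so it wins */
	struct cpu_model cpus[] = {
		{ .nr_running = 2, .exit_latency = 10, .idle_stamp = 0   },
		{ .nr_running = 0, .exit_latency = 10, .idle_stamp = 100 },
		{ .nr_running = 1, .exit_latency = 50, .idle_stamp = 0   },
		{ .nr_running = 0, .exit_latency = 10, .idle_stamp = 200 },
	};

	printf("picked cpu %d\n", pick_idle_cpu(cpus, 4));   /* prints 3 */
	return 0;
}

The actual patch applies this rule inside select_task_rq_fair() and only
falls back to the existing sched-domain balancing when no idle cpu is
found at all.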