From: Atish Patra
To: linux-kernel@vger.kernel.org
Cc: joelaf@google.com, jbacik@fb.com, mingo@redhat.com, peterz@infradead.org, efault@gmx.de, urezki@gmail.com, atish.patra@oracle.com
Subject: [PATCH RFC v2] sched: Minimize the idle cpu selection race window.
Date: Tue, 5 Dec 2017 13:09:07 -0600
Message-Id: <1512500947-24444-2-git-send-email-atish.patra@oracle.com>
X-Mailer: git-send-email 2.7.4
In-Reply-To: <1512500947-24444-1-git-send-email-atish.patra@oracle.com>
References: <1512500947-24444-1-git-send-email-atish.patra@oracle.com>

Currently, multiple tasks can wake up on the same cpu from the
select_idle_sibling() path if they wake up simultaneously and last ran
on the same LLC. This happens because an idle cpu is not marked as busy
until the idle task is scheduled out, so any task waking during that
window may select that cpu as a wakeup candidate.

Introduce a per-cpu variable that is set as soon as a cpu is selected
for the wakeup of any task, preventing other tasks from selecting the
same cpu again.

Note: this does not close the race window entirely, but narrows it to
the access of the per-cpu variable. If two wakee tasks read the
variable at the same time, they may still select the same cpu; the
window is, however, considerably smaller.
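To illustrate why a window remains (userspace sketch only, not part of
the patch; every name in it is invented for the demo): the claim is a
plain check followed by a plain store, so two wakers can both observe a
cpu as unclaimed before either stores. An atomic exchange would close
the window completely, at the cost of an atomic op in the wakeup path:

/* demo.c: build with "cc -std=c11 demo.c" */
#include <stdatomic.h>
#include <stdio.h>

static atomic_int claimed;	/* stands in for per_cpu(claim_wakeup, cpu) */

/* Mirrors the patch: the check (idle_cpu()) and the claim are separate. */
static int try_claim_racy(void)
{
	if (atomic_load(&claimed))	/* like the claim_wakeup test */
		return 0;
	/* another waker can slip in here and also claim the cpu */
	atomic_store(&claimed, 1);	/* like per_cpu(claim_wakeup, cpu) = 1 */
	return 1;
}

/* Alternative: check and claim in one indivisible step, no window. */
static int try_claim_atomic(void)
{
	return atomic_exchange(&claimed, 1) == 0;
}

int main(void)
{
	printf("racy claim:   %d\n", try_claim_racy());		/* 1: got it */
	printf("atomic claim: %d\n", try_claim_atomic());	/* 0: already taken */
	return 0;
}

The patch keeps the plain-int variant, presumably to avoid adding an
atomic operation to a very hot path while still filtering out the vast
majority of collisions.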
Here are some performance numbers:

Hardware config: 20 core (40 hyperthreaded cpus) x86 box.

uperf config: ping pong test on loopback interface with message size = 8k

		Baseline (4.14)		Baseline + pcpu
Threads		Mean	stdev		Mean	stdev	Improvement(%)
1		9.056	0.02		8.966	0.083	-0.993
2		17.664	0.13		17.448	0.303	-1.222
4		32.03	0.22		31.972	0.129	-0.181
8		58.198	0.31		58.588	0.198	 0.670
16		101.018	0.67		100.056	0.455	-0.952
32		148.1	15.41		164.494	2.312	11.069
64		203.66	1.16		203.042	1.348	-0.3073
128		197.12	1.04		194.722	1.174	-1.2165

schbench config: message threads = 2; time = 180s, worker threads = 19

		Baseline (4.14)		Baseline + pcpu
		Mean	stdev		Mean	stdev	Improvement(%)
Max Latency	16457	2046.51		12985.4	48.46	21.0949747828

Latency in usec (lower is better):

		Mean	stdev		Mean	stdev	Improvement(%)
50%		63	4.774		58	4	7.936
75%		81.6	1.2		79.4	0.8	2.696
90%		92.8	0.979		90.6	0.489	2.370
95%		102.6	1.624		99	0.894	3.508
99%		126.25	4.573		120.4	3.872	4.63
99.5%		2712.2	4772.883	133.6	5.571	95.074

Reported by Joel:

Jankbench (Android): count 16304 (@60 fps, 4.5 minutes)

		Without patch	With patch
mean		5.196633	4.429641 (+14.75%)
std		2.030054	2.310025
25%		5.606810	1.991017 (+64.48%)
50%		5.824013	5.716631 (+1.84%)
75%		5.987102	5.932751 (+0.90%)
95%		6.461230	6.301318 (+2.47%)
99%		9.828959	9.697076 (+1.34%)

The details are at: https://lkml.org/lkml/2017/11/22/5

(Example invocations matching the benchmark configs above are sketched
after the diffstat below.)

Suggested-by: Peter Zijlstra
Tested-by: Joel Fernandes
Signed-off-by: Atish Patra
---
 kernel/sched/core.c      |  4 ++++
 kernel/sched/fair.c      | 12 +++++++++---
 kernel/sched/idle_task.c |  1 +
 kernel/sched/sched.h     |  1 +
 4 files changed, 15 insertions(+), 3 deletions(-)
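For reference, the benchmark configs above correspond roughly to
invocations like the ones below; the exact flags and the uperf profile
file name are my reconstruction, not the exact commands used:

  # schbench: 2 message threads, 19 workers each, 180 second runtime
  ./schbench -m 2 -t 19 -r 180

  # uperf is driven by an XML profile; an 8k ping-pong profile over
  # loopback would be passed like this (profile contents not shown)
  ./uperf -m pingpong-8k.xml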
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 2288a14..d9d501c 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3896,6 +3896,7 @@ int task_prio(const struct task_struct *p)
 	return p->prio - MAX_RT_PRIO;
 }
 
+DEFINE_PER_CPU(int, claim_wakeup);
 /**
  * idle_cpu - is a given CPU idle currently?
  * @cpu: the processor in question.
@@ -3917,6 +3918,9 @@ int idle_cpu(int cpu)
 		return 0;
 #endif
 
+	if (per_cpu(claim_wakeup, cpu))
+		return 0;
+
 	return 1;
 }
 
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 13393bb..885023a 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6077,8 +6077,10 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target)
 				idle = false;
 		}
 
-		if (idle)
+		if (idle) {
+			per_cpu(claim_wakeup, core) = 1;
 			return core;
+		}
 	}
 
 	/*
@@ -6102,8 +6104,10 @@ static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
 	for_each_cpu(cpu, cpu_smt_mask(target)) {
 		if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
 			continue;
-		if (idle_cpu(cpu))
+		if (idle_cpu(cpu)) {
+			per_cpu(claim_wakeup, cpu) = 1;
 			return cpu;
+		}
 	}
 
 	return -1;
@@ -6165,8 +6169,10 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target)
 			return -1;
 		if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
 			continue;
-		if (idle_cpu(cpu))
+		if (idle_cpu(cpu)) {
+			per_cpu(claim_wakeup, cpu) = 1;
 			break;
+		}
 	}
 
 	time = local_clock() - time;
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index 0c00172..64d6495 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -28,6 +28,7 @@ pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
 {
 	put_prev_task(rq, prev);
 	update_idle_core(rq);
+	this_cpu_write(claim_wakeup, 0);
 	schedstat_inc(rq->sched_goidle);
 	return rq->idle;
 }
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 8aa24b4..5f70b98 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1068,6 +1068,7 @@ DECLARE_PER_CPU(int, sd_llc_id);
 DECLARE_PER_CPU(struct sched_domain_shared *, sd_llc_shared);
 DECLARE_PER_CPU(struct sched_domain *, sd_numa);
 DECLARE_PER_CPU(struct sched_domain *, sd_asym);
+DECLARE_PER_CPU(int, claim_wakeup);
 
 struct sched_group_capacity {
 	atomic_t ref;
-- 
2.7.4