From: Yuyang Du <yuyang.du@intel.com>
To: peterz@infradead.org, mingo@kernel.org, linux-kernel@vger.kernel.org
Cc: bsegall@google.com, pjt@google.com, morten.rasmussen@arm.com,
        vincent.guittot@linaro.org, dietmar.eggemann@arm.com,
        juri.lelli@arm.com, Yuyang Du <yuyang.du@intel.com>
Subject: [PATCH 5/6] sched/fair: Optimize __update_sched_avg()
Date: Thu, 28 Apr 2016 10:56:12 +0800
Message-Id: <1461812173-32439-6-git-send-email-yuyang.du@intel.com>
In-Reply-To: <1461812173-32439-1-git-send-email-yuyang.du@intel.com>
References: <1461812173-32439-1-git-send-email-yuyang.du@intel.com>
Sender: linux-kernel-owner@vger.kernel.org
Content-Length: 8158
Lines: 256

Previously, __update_sched_avg() has these (full) steps:
(1) add the left of the last incomplete period
(2) decay old sum
(3) accumulate new sum since last_update_time
(4) add the current incomplete period

If we separately compute (1), (3), and (4), each one of them is
ugly in codes and costly in overhead. But actually they do the same
thing, so we combine them together.

Signed-off-by: Yuyang Du <yuyang.du@intel.com>
---
 kernel/sched/fair.c |  185 +++++++++++++++++++++++++--------------------------
 1 file changed, 92 insertions(+), 93 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 486f098..7dba259 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -668,7 +668,7 @@ static unsigned long task_h_load(struct task_struct *p);
  */
 #define SCHED_AVG_HALFLIFE 32	/* number of periods as a half-life */
 #define SCHED_AVG_MAX 47742	/* maximum possible sched avg */
-#define SCHED_AVG_MAX_N 345	/* number of full periods to produce SCHED_AVG_MAX */
+#define SCHED_AVG_MAX_N 347	/* number of full periods to produce SCHED_AVG_MAX */
 
 /* Give new sched_entity start runnable values to heavy its load in infant time */
 void init_entity_runnable_average(struct sched_entity *se)
@@ -2606,7 +2606,7 @@ static const u32 __accumulated_sum_N[] = {
 
 /*
  * Precomputed \Sum y^k { 1<=k<=n, where n%32=0). Values are rolled down to
- * lower integers.
+ * lower integers. Since n < SCHED_AVG_MAX_N, n/SCHED_AVG_HALFLIFE < 11
  */
 static const u32 __accumulated_sum_N32[] = {
 	    0, 23371, 35056, 40899, 43820, 45281,
@@ -2649,20 +2649,31 @@ static __always_inline u64 __decay_sum(u64 val, u32 n)
  * We can compute this efficiently by combining:
  * y^32 = 1/2 with precomputed \Sum 1024*y^n   (where n < 32)
  */
-static __always_inline u32 __accumulate_sum(u32 n)
+static __always_inline u32
+__accumulate_sum(u32 periods, u32 period_contrib, u32 remainder)
 {
-	u32 contrib = 0;
+	u32 contrib;
 
-	if (likely(n <= SCHED_AVG_HALFLIFE))
-		return __accumulated_sum_N[n];
-	else if (unlikely(n >= SCHED_AVG_MAX_N))
+	if (!periods)
+		return remainder - period_contrib;
+
+	if (unlikely(periods >= SCHED_AVG_MAX_N))
 		return SCHED_AVG_MAX;
 
-	/* Since n < SCHED_AVG_MAX_N, n/SCHED_AVG_HALFLIFE < 11 */
-	contrib = __accumulated_sum_N32[n>>5]; /* =n/SCHED_AVG_HALFLIFE */
-	n %= SCHED_AVG_HALFLIFE;
-	contrib = __decay_sum(contrib, n);
-	return contrib + __accumulated_sum_N[n];
+	remainder += __decay_sum((u64)(1024 - period_contrib), periods);
+
+	periods -= 1;
+	if (likely(periods <= SCHED_AVG_HALFLIFE))
+		contrib = __accumulated_sum_N[periods];
+	else {
+		/*(periods>>5) = (periods/SCHED_AVG_HALFLIFE) */
+		contrib = __accumulated_sum_N32[periods>>5];
+		periods %= SCHED_AVG_HALFLIFE;
+		contrib = __decay_sum(contrib, periods);
+		contrib += __accumulated_sum_N[periods];
+	}
+
+	return contrib + remainder;
 }
 
 #if (SCHED_LOAD_SHIFT - SCHED_LOAD_RESOLUTION) != 10 || SCHED_CAPACITY_SHIFT != 10
@@ -2671,6 +2682,55 @@ static __always_inline u32 __accumulate_sum(u32 n)
 
 #define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
 
+static __always_inline u32 accumulate_sum(u64 delta, struct sched_avg *sa,
+	struct cfs_rq *cfs_rq, int cpu, unsigned long weight, int running)
+{
+	u32 contrib, periods;
+	unsigned long scale_freq, scale_cpu;
+
+	scale_freq = arch_scale_freq_capacity(NULL, cpu);
+	scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
+
+	delta += sa->period_contrib;
+	periods = delta >> 10; /* A period is 1024us (~1ms) */
+
+	/*
+	 * Accumulating *_sum has two steps.
+	 *
+	 * Step 1: decay old *_sum if we crossed period boundaries.
+	 */
+	if (periods) {
+		sa->load_sum = __decay_sum(sa->load_sum, periods);
+		if (cfs_rq) {
+			cfs_rq->runnable_load_sum =
+				__decay_sum(cfs_rq->runnable_load_sum, periods);
+		}
+		sa->util_sum = __decay_sum((u64)(sa->util_sum), periods);
+	}
+
+	/*
+	 * Step 2: accumulate new *_sum since last_update_time. This at most has
+	 * three parts (at least one part): (1) remainder of incomplete last
+	 * period, (2) full periods since (1), and (3) incomplete current period.
+	 *
+	 * Fortunately, we can (and should) do all these three at once.
+	 */
+	delta %= 1024;
+	contrib = __accumulate_sum(periods, sa->period_contrib, delta);
+	sa->period_contrib = delta;
+
+	contrib = cap_scale(contrib, scale_freq);
+	if (weight) {
+		sa->load_sum += weight * contrib;
+		if (cfs_rq)
+			cfs_rq->runnable_load_sum += weight * contrib;
+	}
+	if (running)
+		sa->util_sum += contrib * scale_cpu;
+
+	return periods;
+}
+
 /*
  * We can represent the historical contribution to sched average as the
  * coefficients of a geometric series.  To do this we divide the history
@@ -2701,12 +2761,9 @@ static __always_inline u32 __accumulate_sum(u32 n)
  */
 static __always_inline int
 __update_sched_avg(u64 now, int cpu, struct sched_avg *sa,
-		  unsigned long weight, int running, struct cfs_rq *cfs_rq)
+		   unsigned long weight, int running, struct cfs_rq *cfs_rq)
 {
-	u64 delta, scaled_delta;
-	u32 contrib, periods;
-	unsigned int delta_w, scaled_delta_w, decayed = 0;
-	unsigned long scale_freq, scale_cpu;
+	u64 delta;
 
 	delta = now - sa->last_update_time;
 	/*
@@ -2727,85 +2784,27 @@ __update_sched_avg(u64 now, int cpu, struct sched_avg *sa,
 		return 0;
 	sa->last_update_time = now;
 
-	scale_freq = arch_scale_freq_capacity(NULL, cpu);
-	scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
-
-	/* delta_w is the amount already accumulated against our next period */
-	delta_w = sa->period_contrib;
-	if (delta + delta_w >= 1024) {
-		decayed = 1;
-
-		/* how much left for next period will start over, we don't know yet */
-		sa->period_contrib = 0;
-
-		/*
-		 * Now that we know we're crossing a period boundary, figure
-		 * out how much from delta we need to complete the current
-		 * period and accrue it.
-		 */
-		delta_w = 1024 - delta_w;
-		scaled_delta_w = cap_scale(delta_w, scale_freq);
-		if (weight) {
-			sa->load_sum += weight * scaled_delta_w;
-			if (cfs_rq) {
-				cfs_rq->runnable_load_sum +=
-						weight * scaled_delta_w;
-			}
-		}
-		if (running)
-			sa->util_sum += scaled_delta_w * scale_cpu;
-
-		delta -= delta_w;
-
-		/*
-		 * Figure out how many additional periods this update spans.
-		 * A period is 1024*1024ns or ~1ms, so a 32bit integer can hold
-		 * approximately a maximum of 49 (=2^32/1000/3600/24) days.
-		 */
-		periods = delta / 1024;
-		delta %= 1024;
-
-		sa->load_sum = __decay_sum(sa->load_sum, periods + 1);
-		if (cfs_rq) {
-			cfs_rq->runnable_load_sum =
-				__decay_sum(cfs_rq->runnable_load_sum, periods + 1);
-		}
-		sa->util_sum = __decay_sum((u64)(sa->util_sum), periods + 1);
-
-		/* Efficiently calculate \sum (1..n_period) 1024*y^i */
-		contrib = __accumulate_sum(periods);
-		contrib = cap_scale(contrib, scale_freq);
-		if (weight) {
-			sa->load_sum += weight * contrib;
-			if (cfs_rq)
-				cfs_rq->runnable_load_sum += weight * contrib;
-		}
-		if (running)
-			sa->util_sum += contrib * scale_cpu;
-	}
-
-	/* Remainder of delta accrued against u_0` */
-	scaled_delta = cap_scale(delta, scale_freq);
-	if (weight) {
-		sa->load_sum += weight * scaled_delta;
-		if (cfs_rq)
-			cfs_rq->runnable_load_sum += weight * scaled_delta;
-	}
-	if (running)
-		sa->util_sum += scaled_delta * scale_cpu;
-
-	sa->period_contrib += delta;
+	/*
+	 * Now we know we crossed measurement unit boundaries. The *_avg
+	 * accrues by two steps:
+	 *
+	 * Step 1: accumulate *_sum since last_update_tim. If we haven't
+	 * crossed period boundaries, finish.
+	 */
+	if (!accumulate_sum(delta, sa, cfs_rq, cpu, weight, running))
+		return 0;
 
-	if (decayed) {
-		sa->load_avg = div_u64(sa->load_sum, SCHED_AVG_MAX);
-		if (cfs_rq) {
-			cfs_rq->runnable_load_avg =
-				div_u64(cfs_rq->runnable_load_sum, SCHED_AVG_MAX);
-		}
-		sa->util_avg = sa->util_sum / SCHED_AVG_MAX;
+	/*
+	 * Step 2: update *_avg.
+	 */
+	sa->load_avg = div_u64(sa->load_sum, SCHED_AVG_MAX);
+	if (cfs_rq) {
+		cfs_rq->runnable_load_avg =
+			div_u64(cfs_rq->runnable_load_sum, SCHED_AVG_MAX);
 	}
+	sa->util_avg = sa->util_sum / SCHED_AVG_MAX;
 
-	return decayed;
+	return 1;
 }
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
-- 
1.7.9.5