From: Yuyang Du <yuyang.du@intel.com>
To: peterz@infradead.org, mingo@kernel.org, linux-kernel@vger.kernel.org
Cc: bsegall@google.com, pjt@google.com, morten.rasmussen@arm.com,
        vincent.guittot@linaro.org, dietmar.eggemann@arm.com,
        lizefan@huawei.com, umgwanakikbuti@gmail.com,
        Yuyang Du <yuyang.du@intel.com>
Subject: [PATCH RESEND v2 4/6] sched/fair: Remove scale_load_down() for load_avg
Date: Thu, 31 Mar 2016 04:16:53 +0800
Message-Id: <1459369015-28375-5-git-send-email-yuyang.du@intel.com>
In-Reply-To: <1459369015-28375-1-git-send-email-yuyang.du@intel.com>
References: <1459369015-28375-1-git-send-email-yuyang.du@intel.com>
Sender: linux-kernel-owner@vger.kernel.org
Content-Length: 5621
Lines: 133

Currently, load_avg = scale_load_down(load) * runnable%. The extra scaling
down of load does not make much sense, because load_avg is primarily THE
load and on top of that, we take runnable time into account.

We therefore remove scale_load_down() for load_avg. But we need to
carefully consider the overflow risk if load has higher range
(2*SCHED_FIXEDPOINT_SHIFT). The only case an overflow may occur due
to us is on 64bit kernel with increased load range. In that case,
the 64bit load_sum can afford 4251057 (=2^64/47742/88761/1024)
entities with the highest load (=88761*1024) always runnable on one
single cfs_rq, which may be an issue, but should be fine. Even if this
occurs at the end of day, on the condition where it occurs, the
load average will not be useful anyway.

Signed-off-by: Yuyang Du <yuyang.du@intel.com>
[update calculate_imbalance]
Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
---
 include/linux/sched.h | 19 ++++++++++++++-----
 kernel/sched/fair.c   | 19 +++++++++----------
 2 files changed, 23 insertions(+), 15 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index db3c6e1..8df6d69 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1213,7 +1213,7 @@ struct load_weight {
  *
  * [load_avg definition]
  *
- * load_avg = runnable% * scale_load_down(load)
+ * load_avg = runnable% * load
  *
  * where runnable% is the time ratio that a sched_entity is runnable.
  * For cfs_rq, it is the aggregated such load_avg of all runnable and
@@ -1221,7 +1221,7 @@ struct load_weight {
  *
  * load_avg may also take frequency scaling into account:
  *
- * load_avg = runnable% * scale_load_down(load) * freq%
+ * load_avg = runnable% * load * freq%
  *
  * where freq% is the CPU frequency normalize to the highest frequency
  *
@@ -1247,9 +1247,18 @@ struct load_weight {
  *
  * [Overflow issue]
  *
- * The 64bit load_sum can have 4353082796 (=2^64/47742/88761) entities
- * with the highest load (=88761) always runnable on a single cfs_rq, we
- * should not overflow as the number already hits PID_MAX_LIMIT.
+ * On 64bit kernel:
+ *
+ * When load has small fixed point range (SCHED_FIXEDPOINT_SHIFT), the
+ * 64bit load_sum can have 4353082796 (=2^64/47742/88761) tasks with
+ * the highest load (=88761) always runnable on a cfs_rq, we should
+ * not overflow as the number already hits PID_MAX_LIMIT.
+ *
+ * When load has large fixed point range (2*SCHED_FIXEDPOINT_SHIFT),
+ * the 64bit load_sum can have 4251057 (=2^64/47742/88761/1024) tasks
+ * with the highest load (=88761*1024) always runnable on ONE cfs_rq,
+ * we should be fine. Even if the overflow occurs at the end of day,
+ * at the time the load_avg won't be useful anyway in that situation.
  *
  * For all other cases (including 32bit kernel), struct load_weight's
  * weight will overflow first before we do, because:
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index bf835b5..da6642f 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -680,7 +680,7 @@ void init_entity_runnable_average(struct sched_entity *se)
 	 * will definitely be update (after enqueue).
 	 */
 	sa->period_contrib = 1023;
-	sa->load_avg = scale_load_down(se->load.weight);
+	sa->load_avg = se->load.weight;
 	sa->load_sum = sa->load_avg * LOAD_AVG_MAX;
 	sa->util_avg = SCHED_CAPACITY_SCALE;
 	sa->util_sum = sa->util_avg * LOAD_AVG_MAX;
@@ -2837,7 +2837,7 @@ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
 	}
 
 	decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
-		scale_load_down(cfs_rq->load.weight), cfs_rq->curr != NULL, cfs_rq);
+		cfs_rq->load.weight, cfs_rq->curr != NULL, cfs_rq);
 
 #ifndef CONFIG_64BIT
 	smp_wmb();
@@ -2858,8 +2858,7 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg)
 	 * Track task load average for carrying it to new CPU after migrated, and
 	 * track group sched_entity load average for task_h_load calc in migration
 	 */
-	__update_load_avg(now, cpu, &se->avg,
-			  se->on_rq * scale_load_down(se->load.weight),
+	__update_load_avg(now, cpu, &se->avg, se->on_rq * se->load.weight,
 			  cfs_rq->curr == se, NULL);
 
 	if (update_cfs_rq_load_avg(now, cfs_rq) && update_tg)
@@ -2896,7 +2895,7 @@ skip_aging:
 static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 	__update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
-			  &se->avg, se->on_rq * scale_load_down(se->load.weight),
+			  &se->avg, se->on_rq * se->load.weight,
 			  cfs_rq->curr == se, NULL);
 
 	cfs_rq->avg.load_avg = max_t(long, cfs_rq->avg.load_avg - se->avg.load_avg, 0);
@@ -2916,7 +2915,7 @@ enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	migrated = !sa->last_update_time;
 	if (!migrated) {
 		__update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
-			se->on_rq * scale_load_down(se->load.weight),
+			se->on_rq * se->load.weight,
 			cfs_rq->curr == se, NULL);
 	}
 
@@ -6876,10 +6875,10 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
 	 */
 	if (busiest->group_type == group_overloaded &&
 	    local->group_type   == group_overloaded) {
-		load_above_capacity = busiest->sum_nr_running *
-				      scale_load_down(NICE_0_LOAD);
-		if (load_above_capacity > busiest->group_capacity)
-			load_above_capacity -= busiest->group_capacity;
+		load_above_capacity = busiest->sum_nr_running * NICE_0_LOAD;
+		if (load_above_capacity > scale_load(busiest->group_capacity))
+			load_above_capacity -=
+				scale_load(busiest->group_capacity);
 		else
 			load_above_capacity = ~0UL;
 	}
-- 
2.1.4