From: bsegall@google.com
To: Yuyang Du <yuyang.du@intel.com>
Cc: peterz@infradead.org, mingo@kernel.org, linux-kernel@vger.kernel.org,
        pjt@google.com, morten.rasmussen@arm.com, vincent.guittot@linaro.org,
        dietmar.eggemann@arm.com, lizefan@huawei.com, umgwanakikbuti@gmail.com
Subject: Re: [PATCH RESEND v2 4/6] sched/fair: Remove scale_load_down() for load_avg
References: <1459369015-28375-1-git-send-email-yuyang.du@intel.com>
	<1459369015-28375-5-git-send-email-yuyang.du@intel.com>
Date: Thu, 31 Mar 2016 10:45:57 -0700
In-Reply-To: <1459369015-28375-5-git-send-email-yuyang.du@intel.com> (Yuyang
	Du's message of "Thu, 31 Mar 2016 04:16:53 +0800")
Message-ID: <xm26lh4ya6cq.fsf@bsegall-linux.mtv.corp.google.com>
User-Agent: Gnus/5.13 (Gnus v5.13) Emacs/24.3 (gnu/linux)
MIME-Version: 1.0
Content-Type: text/plain
Sender: linux-kernel-owner@vger.kernel.org
Content-Length: 6878
Lines: 157

Yuyang Du <yuyang.du@intel.com> writes:

> Currently, load_avg = scale_load_down(load) * runnable%. The extra scaling
> down of load does not make much sense, because load_avg is primarily THE
> load and on top of that, we take runnable time into account.
>
> We therefore remove scale_load_down() for load_avg. But we need to
> carefully consider the overflow risk if load has higher range
> (2*SCHED_FIXEDPOINT_SHIFT). The only case an overflow may occur due
> to us is on 64bit kernel with increased load range. In that case,
> the 64bit load_sum can afford 4251057 (=2^64/47742/88761/1024)
> entities with the highest load (=88761*1024) always runnable on one


This is true (ignoring that the cgroup max is a bit higher at 262144,
but that's only ~3x, hardly an issue), but I wondered when the uses of
load_avg in h_load and effective_load hit issues, under the assumption
that maybe the old old code used scale_load_down originally, so here's
the math:

effective_load multiplies load_avg by tg->shares, allowing a maximum of
(2^18)*load_avg=(2^18)*((2^18)*1024)=2^46 per child cgroup, allowing
131072 child cgroups, or ~3x that in tasks. (Half of normal because it
is signed)

h_load however is cfs_rq_load_avg * se->avg.load_avg, so we have an
extra factor of 1024, allowing only 256 max weight cgroups or 9x that in
tasks.

That said, when I checked the old code, it didn't use scale_load_down
back h_load involved cfs_rq->load.weight * se->load.weight, so this is
only new in a recent sense.

TLDR: you're correct that load_avg hits any issues first, and it did
so before any of the new average computations were added.

> single cfs_rq, which may be an issue, but should be fine. Even if this
> occurs at the end of day, on the condition where it occurs, the
> load average will not be useful anyway.
>
> Signed-off-by: Yuyang Du <yuyang.du@intel.com>
> [update calculate_imbalance]
> Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
> ---
>  include/linux/sched.h | 19 ++++++++++++++-----
>  kernel/sched/fair.c   | 19 +++++++++----------
>  2 files changed, 23 insertions(+), 15 deletions(-)
>
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index db3c6e1..8df6d69 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -1213,7 +1213,7 @@ struct load_weight {
>   *
>   * [load_avg definition]
>   *
> - * load_avg = runnable% * scale_load_down(load)
> + * load_avg = runnable% * load
>   *
>   * where runnable% is the time ratio that a sched_entity is runnable.
>   * For cfs_rq, it is the aggregated such load_avg of all runnable and
> @@ -1221,7 +1221,7 @@ struct load_weight {
>   *
>   * load_avg may also take frequency scaling into account:
>   *
> - * load_avg = runnable% * scale_load_down(load) * freq%
> + * load_avg = runnable% * load * freq%
>   *
>   * where freq% is the CPU frequency normalize to the highest frequency
>   *
> @@ -1247,9 +1247,18 @@ struct load_weight {
>   *
>   * [Overflow issue]
>   *
> - * The 64bit load_sum can have 4353082796 (=2^64/47742/88761) entities
> - * with the highest load (=88761) always runnable on a single cfs_rq, we
> - * should not overflow as the number already hits PID_MAX_LIMIT.
> + * On 64bit kernel:
> + *
> + * When load has small fixed point range (SCHED_FIXEDPOINT_SHIFT), the
> + * 64bit load_sum can have 4353082796 (=2^64/47742/88761) tasks with
> + * the highest load (=88761) always runnable on a cfs_rq, we should
> + * not overflow as the number already hits PID_MAX_LIMIT.
> + *
> + * When load has large fixed point range (2*SCHED_FIXEDPOINT_SHIFT),
> + * the 64bit load_sum can have 4251057 (=2^64/47742/88761/1024) tasks
> + * with the highest load (=88761*1024) always runnable on ONE cfs_rq,
> + * we should be fine. Even if the overflow occurs at the end of day,
> + * at the time the load_avg won't be useful anyway in that situation.
>   *
>   * For all other cases (including 32bit kernel), struct load_weight's
>   * weight will overflow first before we do, because:
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index bf835b5..da6642f 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -680,7 +680,7 @@ void init_entity_runnable_average(struct sched_entity *se)
>  	 * will definitely be update (after enqueue).
>  	 */
>  	sa->period_contrib = 1023;
> -	sa->load_avg = scale_load_down(se->load.weight);
> +	sa->load_avg = se->load.weight;
>  	sa->load_sum = sa->load_avg * LOAD_AVG_MAX;
>  	sa->util_avg = SCHED_CAPACITY_SCALE;
>  	sa->util_sum = sa->util_avg * LOAD_AVG_MAX;
> @@ -2837,7 +2837,7 @@ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
>  	}
>  
>  	decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
> -		scale_load_down(cfs_rq->load.weight), cfs_rq->curr != NULL, cfs_rq);
> +		cfs_rq->load.weight, cfs_rq->curr != NULL, cfs_rq);
>  
>  #ifndef CONFIG_64BIT
>  	smp_wmb();
> @@ -2858,8 +2858,7 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg)
>  	 * Track task load average for carrying it to new CPU after migrated, and
>  	 * track group sched_entity load average for task_h_load calc in migration
>  	 */
> -	__update_load_avg(now, cpu, &se->avg,
> -			  se->on_rq * scale_load_down(se->load.weight),
> +	__update_load_avg(now, cpu, &se->avg, se->on_rq * se->load.weight,
>  			  cfs_rq->curr == se, NULL);
>  
>  	if (update_cfs_rq_load_avg(now, cfs_rq) && update_tg)
> @@ -2896,7 +2895,7 @@ skip_aging:
>  static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
>  {
>  	__update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
> -			  &se->avg, se->on_rq * scale_load_down(se->load.weight),
> +			  &se->avg, se->on_rq * se->load.weight,
>  			  cfs_rq->curr == se, NULL);
>  
>  	cfs_rq->avg.load_avg = max_t(long, cfs_rq->avg.load_avg - se->avg.load_avg, 0);
> @@ -2916,7 +2915,7 @@ enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
>  	migrated = !sa->last_update_time;
>  	if (!migrated) {
>  		__update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
> -			se->on_rq * scale_load_down(se->load.weight),
> +			se->on_rq * se->load.weight,
>  			cfs_rq->curr == se, NULL);
>  	}
>  
> @@ -6876,10 +6875,10 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
>  	 */
>  	if (busiest->group_type == group_overloaded &&
>  	    local->group_type   == group_overloaded) {
> -		load_above_capacity = busiest->sum_nr_running *
> -				      scale_load_down(NICE_0_LOAD);
> -		if (load_above_capacity > busiest->group_capacity)
> -			load_above_capacity -= busiest->group_capacity;
> +		load_above_capacity = busiest->sum_nr_running * NICE_0_LOAD;
> +		if (load_above_capacity > scale_load(busiest->group_capacity))
> +			load_above_capacity -=
> +				scale_load(busiest->group_capacity);
>  		else
>  			load_above_capacity = ~0UL;
>  	}