Subject: Re: [RESEND PATCH 2/2] sched/fair: Optimize __update_sched_avg()
From: Dietmar Eggemann
To: Peter Zijlstra, Paul Turner
Cc: Yuyang Du, Ingo Molnar, LKML, Benjamin Segall, Morten Rasmussen,
 Vincent Guittot, Matt Fleming, Mike Galbraith
Date: Mon, 10 Apr 2017 08:39:10 +0100
Message-ID: <2f4c2d4e-f56e-8681-1a32-621eb2ad8c35@arm.com>
In-Reply-To: <20170331112358.422kb5aosbq3k5tn@hirez.programming.kicks-ass.net>
References: <1486935863-25251-3-git-send-email-yuyang.du@intel.com>
 <20170328144625.GX3093@worktop>
 <20170329000442.GC2459@ydu19desktop>
 <20170329104126.lg6ismevfbqywpcj@hirez.programming.kicks-ass.net>
 <20170330121658.6mo7datma4ssw7st@hirez.programming.kicks-ass.net>
 <20170330141428.deiduft5btwid6ov@hirez.programming.kicks-ass.net>
 <20170331070141.ygghjaamv6bswbyl@hirez.programming.kicks-ass.net>
 <20170331112358.422kb5aosbq3k5tn@hirez.programming.kicks-ass.net>

On 31/03/17 12:23, Peter Zijlstra wrote:
> On Fri, Mar 31, 2017 at 02:58:57AM -0700, Paul Turner wrote:
>>> So lets pull it out again -- but I don't think we need to undo all of
>>> yuyang's patches for that. So please, look at the patch I proposed for
>>> the problem you spotted. Lets fix the current state and take it from
>>> there, ok?
>>
>> Oh, absolutely. I'm really not proposing re-vulcanizing any rubber
>> here. Just saying that this particular problem is just one facet of
>> the weight mixing. 100% agree on fixing this as is and iterating. I
>> was actually thinking that these changes (which really nicely
>> simplify the calculation) greatly simplify restoring the separation
>> there.
>
> So on top of the previous patch; I've arrived (without testing and
> considering numerical overflow much) at the following patch.

I did some testing (performance governor, no big.LITTLE) with this, and
for a single task the load_avg values are much higher now on 64-bit
because of the missing scale_load_down() on weight.

Are you planning to get rid of scale_load_down() for weight because of
the correctness problems of the signal mentioned by Paul in this thread?

"... c) It doesn't work with scale_load_down and fractional shares
below SCHED_LOAD_SCALE [we multiply in a zero -> zero rq load] ..."

> Its known broken for things like update_cfs_rq_load_avg() where doing
> this will unconditionally flatten load_sum, which seems undesired. Need
> to think more on this.
>
> Also, I've pulled out the cpufreq scaling, which I'm not sure is the
> right thing to do. Vincent wants to scale time, which would be
> persistent in the history, unlike this now.

So this patch moves the freq and cpu scaling from accumulate_sum() to
___update_load_avg()?

[...]
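Coming back to the scale_load_down() point: a minimal stand-alone sketch
(plain user-space C with kernel-like constants; the "tiny" weight is a
made-up example, none of this is the actual kernel code) of why the
per-period contribution blows up by roughly 2^SCHED_FIXEDPOINT_SHIFT on
64-bit once weight is no longer down-scaled, and of the zero-contribution
case from Paul's point (c):

/*
 * Stand-alone user-space sketch, not kernel code. The constants mirror
 * the 64-bit kernel values; the "tiny" weight below is a made-up example.
 */
#include <stdio.h>

#define SCHED_FIXEDPOINT_SHIFT  10
#define NICE_0_LOAD             (1024UL << SCHED_FIXEDPOINT_SHIFT)  /* 1048576 on 64-bit */

static unsigned long scale_load_down(unsigned long w)
{
        return w >> SCHED_FIXEDPOINT_SHIFT;     /* drop the extra resolution bits */
}

int main(void)
{
        unsigned long contrib = 1024;   /* one fully-weighted PELT period */
        unsigned long tiny = 512;       /* weight below 1 << SCHED_FIXEDPOINT_SHIFT */

        /* nice-0 task: without scale_load_down() the per-period contribution
         * is ~1024 times larger, which is the kind of inflation I see in
         * load_avg on 64-bit. */
        printf("nice-0 scaled  : %lu\n", scale_load_down(NICE_0_LOAD) * contrib);
        printf("nice-0 unscaled: %lu\n", NICE_0_LOAD * contrib);

        /* Paul's point (c): a small weight down-scales to 0, so it multiplies
         * a zero into the rq load sum. */
        printf("tiny weight    : %lu\n", scale_load_down(tiny) * contrib);

        return 0;
}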
> @@ -2820,15 +2820,11 @@ static u32 __accumulate_pelt_segments(u6
>   */
>  static __always_inline u32
>  accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
> -               unsigned long weight, int running, struct cfs_rq *cfs_rq)
> +               bool load, bool running, struct cfs_rq *cfs_rq)
>  {
> -        unsigned long scale_freq, scale_cpu;
> -        u32 contrib = (u32)delta; /* p == 0 -> delta < 1024 */
> +        u32 contrib = (u32)delta;
>          u64 periods;
> 
> -        scale_freq = arch_scale_freq_capacity(NULL, cpu);
> -        scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
> -
>          delta += sa->period_contrib;
>          periods = delta / 1024; /* A period is 1024us (~1ms) */
> 
> @@ -2852,14 +2848,13 @@ accumulate_sum(u64 delta, int cpu, struc
>          }
>          sa->period_contrib = delta;
> 
> -        contrib = cap_scale(contrib, scale_freq);
> -        if (weight) {
> -                sa->load_sum += weight * contrib;
> +        if (load) {
> +                sa->load_sum += contrib;
>                  if (cfs_rq)
> -                        cfs_rq->runnable_load_sum += weight * contrib;
> +                        cfs_rq->runnable_load_sum += contrib;
>          }
>          if (running)
> -                sa->util_sum += contrib * scale_cpu;
> +                sa->util_sum += contrib;
> 
>          return periods;
>  }
> @@ -2894,9 +2889,10 @@ accumulate_sum(u64 delta, int cpu, struc
>   */
>  static __always_inline int
>  ___update_load_avg(u64 now, int cpu, struct sched_avg *sa,
> -                  unsigned long weight, int running, struct cfs_rq *cfs_rq)
> +                  unsigned long weight, bool running, struct cfs_rq *cfs_rq)
>  {
> -        u64 delta;
> +        unsigned long scale_freq, scale_cpu;
> +        u64 delta, load;
> 
>          delta = now - sa->last_update_time;
>          /*
> @@ -2924,18 +2920,22 @@ ___update_load_avg(u64 now, int cpu, str
>           * Step 1: accumulate *_sum since last_update_time. If we haven't
>           * crossed period boundaries, finish.
>           */
> -        if (!accumulate_sum(delta, cpu, sa, weight, running, cfs_rq))
> +        if (!accumulate_sum(delta, cpu, sa, !!weight, running, cfs_rq))
>                  return 0;
> 
> +        scale_freq = arch_scale_freq_capacity(NULL, cpu);
> +        scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
> +
>          /*
>           * Step 2: update *_avg.
>           */
> -        sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX);
> +        load = cap_scale(weight, scale_freq);
> +        sa->load_avg = div_u64(load * sa->load_sum, LOAD_AVG_MAX);
>          if (cfs_rq) {
>                  cfs_rq->runnable_load_avg =
> -                        div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX);
> +                        div_u64(load * cfs_rq->runnable_load_sum, LOAD_AVG_MAX);
>          }
> -        sa->util_avg = sa->util_sum / LOAD_AVG_MAX;
> +        sa->util_avg = (cap_scale(scale_cpu, scale_freq) * sa->util_sum) / LOAD_AVG_MAX;
> 
>          return 1;
>  }

[...]
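To make sure I read the structural change right, here is a toy user-space
comparison (made-up capacity trace and a simplified decay factor, so purely
illustrative, not the kernel code) of scaling each contribution inside
accumulate_sum() versus scaling only once in ___update_load_avg():

/*
 * Toy sketch, not the kernel implementation: contrast where the
 * frequency/cpu capacity scaling is applied.
 */
#include <stdio.h>
#include <stdint.h>

/* decay the running sum by one 1024us period: y^32 = 0.5, y ~= 1002/1024 */
static uint64_t decay(uint64_t sum)
{
        return (sum * 1002) >> 10;
}

int main(void)
{
        /* hypothetical capacity trace: half capacity first, then full */
        unsigned int cap[8] = { 512, 512, 512, 512, 1024, 1024, 1024, 1024 };
        uint64_t sum_scaled_in = 0;     /* scale each contribution (old accumulate_sum) */
        uint64_t sum_raw = 0;           /* keep the sum unscaled (this patch) */
        unsigned int i;

        for (i = 0; i < 8; i++) {
                uint64_t contrib = 1024;        /* one fully-running period */

                sum_scaled_in = decay(sum_scaled_in) + ((contrib * cap[i]) >> 10);
                sum_raw = decay(sum_raw) + contrib;
        }

        /* variant A: the low-capacity half stays baked into the history */
        printf("scaled while accumulating : %llu\n",
               (unsigned long long)sum_scaled_in);

        /* variant B: only the capacity at update time is applied */
        printf("scaled once at update time: %llu\n",
               (unsigned long long)((sum_raw * cap[7]) >> 10));

        return 0;
}

With the capacity change in the middle of the trace the two variants
diverge, which as far as I can see is exactly the "persistent in the
history" distinction mentioned above.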