Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1754334AbdC1Oqu (ORCPT ); Tue, 28 Mar 2017 10:46:50 -0400 Received: from bombadil.infradead.org ([65.50.211.133]:50136 "EHLO bombadil.infradead.org" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1752805AbdC1Oqq (ORCPT ); Tue, 28 Mar 2017 10:46:46 -0400 Date: Tue, 28 Mar 2017 16:46:25 +0200 From: Peter Zijlstra To: Yuyang Du Cc: mingo@kernel.org, linux-kernel@vger.kernel.org, pjt@google.com, bsegall@google.com, morten.rasmussen@arm.com, vincent.guittot@linaro.org, dietmar.eggemann@arm.com, matt@codeblueprint.co.uk, umgwanakikbuti@gmail.com Subject: Re: [RESEND PATCH 2/2] sched/fair: Optimize __update_sched_avg() Message-ID: <20170328144625.GX3093@worktop> References: <1486935863-25251-1-git-send-email-yuyang.du@intel.com> <1486935863-25251-3-git-send-email-yuyang.du@intel.com> MIME-Version: 1.0 Content-Type: text/plain; charset=us-ascii Content-Disposition: inline In-Reply-To: <1486935863-25251-3-git-send-email-yuyang.du@intel.com> User-Agent: Mutt/1.5.22.1 (2013-10-16) Sender: linux-kernel-owner@vger.kernel.org List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Content-Length: 8540 Lines: 293 On Mon, Feb 13, 2017 at 05:44:23AM +0800, Yuyang Du wrote: > __update_load_avg() has the following steps: > > 1. add the remainder of the last incomplete period > 2. decay old sum > 3. accumulate new sum in full periods since last_update_time > 4. accumulate the current incomplete period > 5. update averages > > However, there is no need to separately compute steps 1, 3, and 4. > > Illustation: > > c1 c3 c4 > ^ ^ ^ > | | | > |<->|<----------------->|<--->| > ... |---x---|------| ... |------|-----x (now) > > c1, c3, and c4 are the accumulated (meanwhile decayed) contributions > in timing in steps 1, 3, and 4 respectively. > > With them, the accumulated contribution to load_sum, for example, is: > > contrib = c1 * weight * freq_scaled; > contrib += c3 * weight * freq_scaled; > contrib += c4 * weight * freq_scaled; > > Obviously, we can optimize the above and they equate to: > > contrib = c1 + c3 + c4; > contrib *= weight * freq_scaled; > So I figured out what it is you're doing, how's this? I still need to rewrite the Changelog to make this cleared, but I think the code now has understandable comments. --- --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2767,7 +2767,7 @@ static const u32 __accumulated_sum_N32[] * Approximate: * val * y^n, where y^32 ~= 0.5 (~1 scheduling period) */ -static __always_inline u64 decay_load(u64 val, u64 n) +static u64 decay_load(u64 val, u64 n) { unsigned int local_n; @@ -2795,32 +2795,111 @@ static __always_inline u64 decay_load(u6 return val; } -/* - * For updates fully spanning n periods, the contribution to runnable - * average will be: \Sum 1024*y^n - * - * We can compute this reasonably efficiently by combining: - * y^PERIOD = 1/2 with precomputed \Sum 1024*y^n {for n = LOAD_AVG_MAX_N)) + if (unlikely(periods >= LOAD_AVG_MAX_N)) return LOAD_AVG_MAX; - /* Since n < LOAD_AVG_MAX_N, n/LOAD_AVG_PERIOD < 11 */ - contrib = __accumulated_sum_N32[n/LOAD_AVG_PERIOD]; - n %= LOAD_AVG_PERIOD; - contrib = decay_load(contrib, n); - return contrib + runnable_avg_yN_sum[n]; + /* + * c1 y^(p+1) + c3 y^0 + */ + remainder += decay_load((u64)(1024 - period_contrib), periods); + + periods -= 1; + /* + * For updates fully spanning n periods, the contribution to runnable + * average will be: 1024 \Sum y^n + * + * We can compute this reasonably efficiently by combining: + * + * y^PERIOD = 1/2 with precomputed 1024 \Sum y^n {for: n < PERIOD} + */ + if (likely(periods <= LOAD_AVG_PERIOD)) { + contrib = runnable_avg_yN_sum[periods]; + } else { + contrib = __accumulated_sum_N32[periods/LOAD_AVG_PERIOD]; + periods %= LOAD_AVG_PERIOD; + contrib = decay_load(contrib, periods); + contrib += runnable_avg_yN_sum[periods]; + } + + return contrib + remainder; } #define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT) /* + * Accumulate the three separate parts of the sum; c1 the remainder + * of the last (incomplete) period, c2 the span of full periods and c3 + * the remainder of the (incomplete) current period. + * + * c1 c2 c3 + * ^ ^ ^ + * | | | + * |<->|<----------------->|<--->| + * ... |---x---|------| ... |------|-----x (now) + * + * p + * u' = (u + c1) y^(p+1) + 1024 \Sum y^n + c3 y^0 + * n=1 + * + * = u y^(p+1) + (Step 1) + * + * p + * c1 y^(p+1) + 1024 \Sum y^n + c3 y^0 (Step 2) + * n=1 + */ +static __always_inline u32 +accumulate_sum(u64 delta, int cpu, struct sched_avg *sa, + unsigned long weight, int running, struct cfs_rq *cfs_rq) +{ + unsigned long scale_freq, scale_cpu; + u64 periods; + u32 contrib; + + scale_freq = arch_scale_freq_capacity(NULL, cpu); + scale_cpu = arch_scale_cpu_capacity(NULL, cpu); + + delta += sa->period_contrib; + periods = delta / 1024; /* A period is 1024us (~1ms) */ + + /* + * Step 1: decay old *_sum if we crossed period boundaries. + */ + if (periods) { + sa->load_sum = decay_load(sa->load_sum, periods); + if (cfs_rq) { + cfs_rq->runnable_load_sum = + decay_load(cfs_rq->runnable_load_sum, periods); + } + sa->util_sum = decay_load((u64)(sa->util_sum), periods); + } + + /* + * Step 2 + */ + delta %= 1024; + contrib = __accumulate_sum(periods, sa->period_contrib, delta); + sa->period_contrib = delta; + + contrib = cap_scale(contrib, scale_freq); + if (weight) { + sa->load_sum += weight * contrib; + if (cfs_rq) + cfs_rq->runnable_load_sum += weight * contrib; + } + if (running) + sa->util_sum += contrib * scale_cpu; + + return periods; +} + +/* * We can represent the historical contribution to runnable average as the * coefficients of a geometric series. To do this we sub-divide our runnable * history into segments of approximately 1ms (1024us); label the segment that @@ -2852,10 +2931,7 @@ static __always_inline int ___update_load_avg(u64 now, int cpu, struct sched_avg *sa, unsigned long weight, int running, struct cfs_rq *cfs_rq) { - u64 delta, scaled_delta, periods; - u32 contrib; - unsigned int delta_w, scaled_delta_w, decayed = 0; - unsigned long scale_freq, scale_cpu; + u64 delta; delta = now - sa->last_update_time; /* @@ -2876,81 +2952,27 @@ ___update_load_avg(u64 now, int cpu, str return 0; sa->last_update_time = now; - scale_freq = arch_scale_freq_capacity(NULL, cpu); - scale_cpu = arch_scale_cpu_capacity(NULL, cpu); - - /* delta_w is the amount already accumulated against our next period */ - delta_w = sa->period_contrib; - if (delta + delta_w >= 1024) { - decayed = 1; - - /* how much left for next period will start over, we don't know yet */ - sa->period_contrib = 0; - - /* - * Now that we know we're crossing a period boundary, figure - * out how much from delta we need to complete the current - * period and accrue it. - */ - delta_w = 1024 - delta_w; - scaled_delta_w = cap_scale(delta_w, scale_freq); - if (weight) { - sa->load_sum += weight * scaled_delta_w; - if (cfs_rq) { - cfs_rq->runnable_load_sum += - weight * scaled_delta_w; - } - } - if (running) - sa->util_sum += scaled_delta_w * scale_cpu; - - delta -= delta_w; - - /* Figure out how many additional periods this update spans */ - periods = delta / 1024; - delta %= 1024; - - sa->load_sum = decay_load(sa->load_sum, periods + 1); - if (cfs_rq) { - cfs_rq->runnable_load_sum = - decay_load(cfs_rq->runnable_load_sum, periods + 1); - } - sa->util_sum = decay_load((u64)(sa->util_sum), periods + 1); - - /* Efficiently calculate \sum (1..n_period) 1024*y^i */ - contrib = __compute_runnable_contrib(periods); - contrib = cap_scale(contrib, scale_freq); - if (weight) { - sa->load_sum += weight * contrib; - if (cfs_rq) - cfs_rq->runnable_load_sum += weight * contrib; - } - if (running) - sa->util_sum += contrib * scale_cpu; - } - - /* Remainder of delta accrued against u_0` */ - scaled_delta = cap_scale(delta, scale_freq); - if (weight) { - sa->load_sum += weight * scaled_delta; - if (cfs_rq) - cfs_rq->runnable_load_sum += weight * scaled_delta; - } - if (running) - sa->util_sum += scaled_delta * scale_cpu; - - sa->period_contrib += delta; + /* + * Now we know we crossed measurement unit boundaries. The *_avg + * accrues by two steps: + * + * Step 1: accumulate *_sum since last_update_time. If we haven't + * crossed period boundaries, finish. + */ + if (!accumulate_sum(delta, cpu, sa, weight, running, cfs_rq)) + return 0; - if (decayed) { - sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX); - if (cfs_rq) { - cfs_rq->runnable_load_avg = - div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX); - } - sa->util_avg = sa->util_sum / LOAD_AVG_MAX; + /* + * Step 2: update *_avg. + */ + sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX); + if (cfs_rq) { + cfs_rq->runnable_load_avg = + div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX); } + sa->util_avg = sa->util_sum / LOAD_AVG_MAX; - return decayed; + return 1; } static int