Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S932214AbdC2Klq (ORCPT ); Wed, 29 Mar 2017 06:41:46 -0400 Received: from merlin.infradead.org ([205.233.59.134]:33976 "EHLO merlin.infradead.org" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S932147AbdC2Klo (ORCPT ); Wed, 29 Mar 2017 06:41:44 -0400 Date: Wed, 29 Mar 2017 12:41:26 +0200 From: Peter Zijlstra To: Yuyang Du Cc: mingo@kernel.org, linux-kernel@vger.kernel.org, pjt@google.com, bsegall@google.com, morten.rasmussen@arm.com, vincent.guittot@linaro.org, dietmar.eggemann@arm.com, matt@codeblueprint.co.uk, umgwanakikbuti@gmail.com Subject: Re: [RESEND PATCH 2/2] sched/fair: Optimize __update_sched_avg() Message-ID: <20170329104126.lg6ismevfbqywpcj@hirez.programming.kicks-ass.net> References: <1486935863-25251-1-git-send-email-yuyang.du@intel.com> <1486935863-25251-3-git-send-email-yuyang.du@intel.com> <20170328144625.GX3093@worktop> <20170329000442.GC2459@ydu19desktop> MIME-Version: 1.0 Content-Type: text/plain; charset=us-ascii Content-Disposition: inline In-Reply-To: <20170329000442.GC2459@ydu19desktop> User-Agent: NeoMutt/20170113 (1.7.2) Sender: linux-kernel-owner@vger.kernel.org List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Content-Length: 9905 Lines: 344 On Wed, Mar 29, 2017 at 08:04:42AM +0800, Yuyang Du wrote: > Yes, you need to, and let me do it too and learn how you will rewrite > it. I've meanwhile written this. Does that work for you? --- Subject: sched/fair: Optimize ___update_sched_avg() From: Yuyang Du Date: Mon, 13 Feb 2017 05:44:23 +0800 The main PELT function ___update_load_avg(), that implements the accumulation and progression of the geometric average series, is implemented along the following lines for the scenario where the time delta spans all 3 possible sections (see figure below): 1. add the remainder of the last incomplete period 2. decay old sum 3. accumulate new sum in full periods since last_update_time 4. accumulate the current incomplete period 5. update averages Or: d1 d2 d3 ^ ^ ^ | | | |<->|<----------------->|<--->| ... |---x---|------| ... |------|-----x (now) load_sum' = (load_sum + weight * scale * d1) * y^(p+1) + (1,2) p weight * scale * 1024 * \Sum y^n + (3) n=1 weight * sclae * d3 * y^0 (4) load_avg' = load_sum' / LOAD_AVG_MAX (5) Where: d1 - is the delta part completing the remainder of the last incomplete period, d2 - is the delta part spannind complete periods, and d3 - is the delta part starting the current incomplete period. We can simplify the code in two steps; the first step is to separate the first term into new and old parts like: (load_sum + weight * scale * d1) * y^(p+1) = load_sum * y^(p+1) + weight * scale * d1 * y^(p+1) Once we've done that, its easy to see that all new terms carry the common factors: weight * scale If we factor those out, we arrive at the form: load_sum' = load_sum * y^(p+1) + weight * scale * (d1 * y^(p+1) + p 1024 * \Sum y^n + n=1 d3 * y^0) Which results in a simpler, smaller and faster implementation. Cc: mingo@kernel.org Cc: vincent.guittot@linaro.org Cc: dietmar.eggemann@arm.com Cc: matt@codeblueprint.co.uk Cc: pjt@google.com Cc: bsegall@google.com Cc: umgwanakikbuti@gmail.com Cc: morten.rasmussen@arm.com Signed-off-by: Yuyang Du Signed-off-by: Peter Zijlstra (Intel) Link: http://lkml.kernel.org/r/1486935863-25251-3-git-send-email-yuyang.du@intel.com --- kernel/sched/fair.c | 212 ++++++++++++++++++++++++++++------------------------ 1 file changed, 118 insertions(+), 94 deletions(-) --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2767,7 +2767,7 @@ static const u32 __accumulated_sum_N32[] * Approximate: * val * y^n, where y^32 ~= 0.5 (~1 scheduling period) */ -static __always_inline u64 decay_load(u64 val, u64 n) +static u64 decay_load(u64 val, u64 n) { unsigned int local_n; @@ -2795,32 +2795,113 @@ static __always_inline u64 decay_load(u6 return val; } -/* - * For updates fully spanning n periods, the contribution to runnable - * average will be: \Sum 1024*y^n - * - * We can compute this reasonably efficiently by combining: - * y^PERIOD = 1/2 with precomputed \Sum 1024*y^n {for n = LOAD_AVG_MAX_N)) + if (!periods) + return remainder - period_contrib; + + if (unlikely(periods >= LOAD_AVG_MAX_N)) return LOAD_AVG_MAX; - /* Since n < LOAD_AVG_MAX_N, n/LOAD_AVG_PERIOD < 11 */ - contrib = __accumulated_sum_N32[n/LOAD_AVG_PERIOD]; - n %= LOAD_AVG_PERIOD; - contrib = decay_load(contrib, n); - return contrib + runnable_avg_yN_sum[n]; + /* + * c1 = d1 y^(p+1) + */ + c1 = decay_load((u64)(1024 - period_contrib), periods); + + periods -= 1; + /* + * For updates fully spanning n periods, the contribution to runnable + * average will be: + * + * c2 = 1024 \Sum y^n + * + * We can compute this reasonably efficiently by combining: + * + * y^PERIOD = 1/2 with precomputed 1024 \Sum y^n {for: n < PERIOD} + */ + if (likely(periods <= LOAD_AVG_PERIOD)) { + c2 = runnable_avg_yN_sum[periods]; + } else { + c2 = __accumulated_sum_N32[periods/LOAD_AVG_PERIOD]; + periods %= LOAD_AVG_PERIOD; + c2 = decay_load(c2, periods); + c2 += runnable_avg_yN_sum[periods]; + } + + return c1 + c2 + c3; } #define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT) /* + * Accumulate the three separate parts of the sum; d1 the remainder + * of the last (incomplete) period, d2 the span of full periods and d3 + * the remainder of the (incomplete) current period. + * + * d1 d2 d3 + * ^ ^ ^ + * | | | + * |<->|<----------------->|<--->| + * ... |---x---|------| ... |------|-----x (now) + * + * p + * u' = (u + d1) y^(p+1) + 1024 \Sum y^n + d3 y^0 + * n=1 + * + * = u y^(p+1) + (Step 1) + * + * p + * d1 y^(p+1) + 1024 \Sum y^n + d3 y^0 (Step 2) + * n=1 + */ +static __always_inline u32 +accumulate_sum(u64 delta, int cpu, struct sched_avg *sa, + unsigned long weight, int running, struct cfs_rq *cfs_rq) +{ + unsigned long scale_freq, scale_cpu; + u64 periods; + u32 contrib; + + scale_freq = arch_scale_freq_capacity(NULL, cpu); + scale_cpu = arch_scale_cpu_capacity(NULL, cpu); + + delta += sa->period_contrib; + periods = delta / 1024; /* A period is 1024us (~1ms) */ + + /* + * Step 1: decay old *_sum if we crossed period boundaries. + */ + if (periods) { + sa->load_sum = decay_load(sa->load_sum, periods); + if (cfs_rq) { + cfs_rq->runnable_load_sum = + decay_load(cfs_rq->runnable_load_sum, periods); + } + sa->util_sum = decay_load((u64)(sa->util_sum), periods); + } + + /* + * Step 2 + */ + delta %= 1024; + contrib = __accumulate_sum(periods, sa->period_contrib, delta); + sa->period_contrib = delta; + + contrib = cap_scale(contrib, scale_freq); + if (weight) { + sa->load_sum += weight * contrib; + if (cfs_rq) + cfs_rq->runnable_load_sum += weight * contrib; + } + if (running) + sa->util_sum += contrib * scale_cpu; + + return periods; +} + +/* * We can represent the historical contribution to runnable average as the * coefficients of a geometric series. To do this we sub-divide our runnable * history into segments of approximately 1ms (1024us); label the segment that @@ -2852,10 +2933,7 @@ static __always_inline int ___update_load_avg(u64 now, int cpu, struct sched_avg *sa, unsigned long weight, int running, struct cfs_rq *cfs_rq) { - u64 delta, scaled_delta, periods; - u32 contrib; - unsigned int delta_w, scaled_delta_w, decayed = 0; - unsigned long scale_freq, scale_cpu; + u64 delta; delta = now - sa->last_update_time; /* @@ -2876,81 +2954,27 @@ ___update_load_avg(u64 now, int cpu, str return 0; sa->last_update_time = now; - scale_freq = arch_scale_freq_capacity(NULL, cpu); - scale_cpu = arch_scale_cpu_capacity(NULL, cpu); - - /* delta_w is the amount already accumulated against our next period */ - delta_w = sa->period_contrib; - if (delta + delta_w >= 1024) { - decayed = 1; - - /* how much left for next period will start over, we don't know yet */ - sa->period_contrib = 0; - - /* - * Now that we know we're crossing a period boundary, figure - * out how much from delta we need to complete the current - * period and accrue it. - */ - delta_w = 1024 - delta_w; - scaled_delta_w = cap_scale(delta_w, scale_freq); - if (weight) { - sa->load_sum += weight * scaled_delta_w; - if (cfs_rq) { - cfs_rq->runnable_load_sum += - weight * scaled_delta_w; - } - } - if (running) - sa->util_sum += scaled_delta_w * scale_cpu; - - delta -= delta_w; - - /* Figure out how many additional periods this update spans */ - periods = delta / 1024; - delta %= 1024; - - sa->load_sum = decay_load(sa->load_sum, periods + 1); - if (cfs_rq) { - cfs_rq->runnable_load_sum = - decay_load(cfs_rq->runnable_load_sum, periods + 1); - } - sa->util_sum = decay_load((u64)(sa->util_sum), periods + 1); - - /* Efficiently calculate \sum (1..n_period) 1024*y^i */ - contrib = __compute_runnable_contrib(periods); - contrib = cap_scale(contrib, scale_freq); - if (weight) { - sa->load_sum += weight * contrib; - if (cfs_rq) - cfs_rq->runnable_load_sum += weight * contrib; - } - if (running) - sa->util_sum += contrib * scale_cpu; - } - - /* Remainder of delta accrued against u_0` */ - scaled_delta = cap_scale(delta, scale_freq); - if (weight) { - sa->load_sum += weight * scaled_delta; - if (cfs_rq) - cfs_rq->runnable_load_sum += weight * scaled_delta; - } - if (running) - sa->util_sum += scaled_delta * scale_cpu; - - sa->period_contrib += delta; + /* + * Now we know we crossed measurement unit boundaries. The *_avg + * accrues by two steps: + * + * Step 1: accumulate *_sum since last_update_time. If we haven't + * crossed period boundaries, finish. + */ + if (!accumulate_sum(delta, cpu, sa, weight, running, cfs_rq)) + return 0; - if (decayed) { - sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX); - if (cfs_rq) { - cfs_rq->runnable_load_avg = - div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX); - } - sa->util_avg = sa->util_sum / LOAD_AVG_MAX; + /* + * Step 2: update *_avg. + */ + sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX); + if (cfs_rq) { + cfs_rq->runnable_load_avg = + div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX); } + sa->util_avg = sa->util_sum / LOAD_AVG_MAX; - return decayed; + return 1; } static int