Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S933592AbdC3LVr (ORCPT ); Thu, 30 Mar 2017 07:21:47 -0400 Received: from mail-yw0-f176.google.com ([209.85.161.176]:36817 "EHLO mail-yw0-f176.google.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S933192AbdC3LVp (ORCPT ); Thu, 30 Mar 2017 07:21:45 -0400 MIME-Version: 1.0 In-Reply-To: <20170329104126.lg6ismevfbqywpcj@hirez.programming.kicks-ass.net> References: <1486935863-25251-1-git-send-email-yuyang.du@intel.com> <1486935863-25251-3-git-send-email-yuyang.du@intel.com> <20170328144625.GX3093@worktop> <20170329000442.GC2459@ydu19desktop> <20170329104126.lg6ismevfbqywpcj@hirez.programming.kicks-ass.net> From: Paul Turner Date: Thu, 30 Mar 2017 04:21:08 -0700 Message-ID: Subject: Re: [RESEND PATCH 2/2] sched/fair: Optimize __update_sched_avg() To: Peter Zijlstra Cc: Yuyang Du , Ingo Molnar , LKML , Benjamin Segall , Morten Rasmussen , Vincent Guittot , Dietmar Eggemann , Matt Fleming , umgwanakikbuti@gmail.com Content-Type: text/plain; charset=UTF-8 Sender: linux-kernel-owner@vger.kernel.org List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Content-Length: 10119 Lines: 276 > --- a/kernel/sched/fair.c > +++ b/kernel/sched/fair.c > @@ -2767,7 +2767,7 @@ static const u32 __accumulated_sum_N32[] > * Approximate: > * val * y^n, where y^32 ~= 0.5 (~1 scheduling period) > */ > -static __always_inline u64 decay_load(u64 val, u64 n) > +static u64 decay_load(u64 val, u64 n) > { > unsigned int local_n; > > @@ -2795,32 +2795,113 @@ static __always_inline u64 decay_load(u6 > return val; > } > > -/* > - * For updates fully spanning n periods, the contribution to runnable > - * average will be: \Sum 1024*y^n > - * > - * We can compute this reasonably efficiently by combining: > - * y^PERIOD = 1/2 with precomputed \Sum 1024*y^n {for n - */ > -static u32 __compute_runnable_contrib(u64 n) > +static u32 __accumulate_sum(u64 periods, u32 period_contrib, u32 remainder) - The naming here is really ambiguous: "__accumulate_sum" -> "__accumulate_pelt_segments"? - Passing in "remainder" seems irrelevant to the sum accumulation. It would be more clear to handle it from the caller. > > { > - u32 contrib = 0; > + u32 c1, c2, c3 = remainder; /* y^0 == 1 */ > > - if (likely(n <= LOAD_AVG_PERIOD)) > - return runnable_avg_yN_sum[n]; > - else if (unlikely(n >= LOAD_AVG_MAX_N)) > + if (!periods) > + return remainder - period_contrib; This is super confusing. It only works because remainder already had period_contrib aggregated _into_ it. We're literally computing: remainder + period_contrib - period_contrib We should just not call this in the !periods case and handle the remainder below. > + > + if (unlikely(periods >= LOAD_AVG_MAX_N)) > return LOAD_AVG_MAX; > > - /* Since n < LOAD_AVG_MAX_N, n/LOAD_AVG_PERIOD < 11 */ > - contrib = __accumulated_sum_N32[n/LOAD_AVG_PERIOD]; > - n %= LOAD_AVG_PERIOD; > - contrib = decay_load(contrib, n); > - return contrib + runnable_avg_yN_sum[n]; > + /* > + * c1 = d1 y^(p+1) > + */ > + c1 = decay_load((u64)(1024 - period_contrib), periods); > + > + periods -= 1; > + /* > + * For updates fully spanning n periods, the contribution to runnable > + * average will be: > + * > + * c2 = 1024 \Sum y^n > + * > + * We can compute this reasonably efficiently by combining: > + * > + * y^PERIOD = 1/2 with precomputed 1024 \Sum y^n {for: n < PERIOD} > + */ > + if (likely(periods <= LOAD_AVG_PERIOD)) { > + c2 = runnable_avg_yN_sum[periods]; > + } else { > + c2 = __accumulated_sum_N32[periods/LOAD_AVG_PERIOD]; This still wants the comment justifying why we can't overflow. > + periods %= LOAD_AVG_PERIOD; > + c2 = decay_load(c2, periods); > + c2 += runnable_avg_yN_sum[periods]; > + } > + > + return c1 + c2 + c3; > } > > #define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT) > > /* > + * Accumulate the three separate parts of the sum; d1 the remainder > + * of the last (incomplete) period, d2 the span of full periods and d3 > + * the remainder of the (incomplete) current period. > + * > + * d1 d2 d3 > + * ^ ^ ^ > + * | | | > + * |<->|<----------------->|<--->| > + * ... |---x---|------| ... |------|-----x (now) > + * > + * p > + * u' = (u + d1) y^(p+1) + 1024 \Sum y^n + d3 y^0 > + * n=1 > + * > + * = u y^(p+1) + (Step 1) > + * > + * p > + * d1 y^(p+1) + 1024 \Sum y^n + d3 y^0 (Step 2) > + * n=1 > + */ > +static __always_inline u32 > +accumulate_sum(u64 delta, int cpu, struct sched_avg *sa, > + unsigned long weight, int running, struct cfs_rq *cfs_rq) > +{ > + unsigned long scale_freq, scale_cpu; > + u64 periods; > + u32 contrib; > + > + scale_freq = arch_scale_freq_capacity(NULL, cpu); > + scale_cpu = arch_scale_cpu_capacity(NULL, cpu); > + > + delta += sa->period_contrib; > + periods = delta / 1024; /* A period is 1024us (~1ms) */ > + > + if (periods) { > + sa->load_sum = decay_load(sa->load_sum, periods); > + if (cfs_rq) { > + cfs_rq->runnable_load_sum = > + decay_load(cfs_rq->runnable_load_sum, periods); > + } > + sa->util_sum = decay_load((u64)(sa->util_sum), periods); Move step 2 here, handle remainder below. > + } > + > + /* > + * Step 2 > + */ > + delta %= 1024; > + contrib = __accumulate_sum(periods, sa->period_contrib, delta); > + sa->period_contrib = delta; > + > + contrib = cap_scale(contrib, scale_freq); > + if (weight) { > + sa->load_sum += weight * contrib; Is this correct in the iterated periods > LOAD_AVG_MAX_N case? I don't think the decay above is guaranteed to return these to zero. > + if (cfs_rq) > + cfs_rq->runnable_load_sum += weight * contrib; > + } > + if (running) > + sa->util_sum += contrib * scale_cpu; > + > + return periods; > +} > + > +/* > * We can represent the historical contribution to runnable average as the > * coefficients of a geometric series. To do this we sub-divide our runnable > * history into segments of approximately 1ms (1024us); label the segment that > @@ -2852,10 +2933,7 @@ static __always_inline int > ___update_load_avg(u64 now, int cpu, struct sched_avg *sa, > unsigned long weight, int running, struct cfs_rq *cfs_rq) > { > - u64 delta, scaled_delta, periods; > - u32 contrib; > - unsigned int delta_w, scaled_delta_w, decayed = 0; > - unsigned long scale_freq, scale_cpu; > + u64 delta; > > delta = now - sa->last_update_time; > /* > @@ -2876,81 +2954,27 @@ ___update_load_avg(u64 now, int cpu, str > return 0; > sa->last_update_time = now; > > - scale_freq = arch_scale_freq_capacity(NULL, cpu); > - scale_cpu = arch_scale_cpu_capacity(NULL, cpu); > - > - /* delta_w is the amount already accumulated against our next period */ > - delta_w = sa->period_contrib; > - if (delta + delta_w >= 1024) { > - decayed = 1; > - > - /* how much left for next period will start over, we don't know yet */ > - sa->period_contrib = 0; > - > - /* > - * Now that we know we're crossing a period boundary, figure > - * out how much from delta we need to complete the current > - * period and accrue it. > - */ > - delta_w = 1024 - delta_w; > - scaled_delta_w = cap_scale(delta_w, scale_freq); > - if (weight) { > - sa->load_sum += weight * scaled_delta_w; > - if (cfs_rq) { > - cfs_rq->runnable_load_sum += > - weight * scaled_delta_w; > - } > - } > - if (running) > - sa->util_sum += scaled_delta_w * scale_cpu; > - > - delta -= delta_w; > - > - /* Figure out how many additional periods this update spans */ > - periods = delta / 1024; > - delta %= 1024; > - > - sa->load_sum = decay_load(sa->load_sum, periods + 1); > - if (cfs_rq) { > - cfs_rq->runnable_load_sum = > - decay_load(cfs_rq->runnable_load_sum, periods + 1); > - } > - sa->util_sum = decay_load((u64)(sa->util_sum), periods + 1); > - > - /* Efficiently calculate \sum (1..n_period) 1024*y^i */ > - contrib = __compute_runnable_contrib(periods); > - contrib = cap_scale(contrib, scale_freq); > - if (weight) { > - sa->load_sum += weight * contrib; > - if (cfs_rq) > - cfs_rq->runnable_load_sum += weight * contrib; > - } > - if (running) > - sa->util_sum += contrib * scale_cpu; > - } > - > - /* Remainder of delta accrued against u_0` */ > - scaled_delta = cap_scale(delta, scale_freq); > - if (weight) { > - sa->load_sum += weight * scaled_delta; > - if (cfs_rq) > - cfs_rq->runnable_load_sum += weight * scaled_delta; > - } > - if (running) > - sa->util_sum += scaled_delta * scale_cpu; > - > - sa->period_contrib += delta; > + /* > + * Now we know we crossed measurement unit boundaries. The *_avg > + * accrues by two steps: > + * > + * Step 1: accumulate *_sum since last_update_time. If we haven't > + * crossed period boundaries, finish. > + */ > + if (!accumulate_sum(delta, cpu, sa, weight, running, cfs_rq)) > + return 0; > > - if (decayed) { > - sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX); > - if (cfs_rq) { > - cfs_rq->runnable_load_avg = > - div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX); > - } > - sa->util_avg = sa->util_sum / LOAD_AVG_MAX; > + /* > + * Step 2: update *_avg. > + */ > + sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX); > + if (cfs_rq) { > + cfs_rq->runnable_load_avg = > + div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX); > } > + sa->util_avg = sa->util_sum / LOAD_AVG_MAX; > > - return decayed; > + return 1; > } > > static int