2017-03-28 14:46:50

by Peter Zijlstra

Subject: Re: [RESEND PATCH 2/2] sched/fair: Optimize __update_sched_avg()

On Mon, Feb 13, 2017 at 05:44:23AM +0800, Yuyang Du wrote:
> __update_load_avg() has the following steps:
>
> 1. add the remainder of the last incomplete period
> 2. decay old sum
> 3. accumulate new sum in full periods since last_update_time
> 4. accumulate the current incomplete period
> 5. update averages
>
> However, there is no need to separately compute steps 1, 3, and 4.
>
> Illustration:
>
> c1 c3 c4
> ^ ^ ^
> | | |
> |<->|<----------------->|<--->|
> ... |---x---|------| ... |------|-----x (now)
>
> c1, c3, and c4 are the accumulated (meanwhile decayed) contributions
> in timing in steps 1, 3, and 4 respectively.
>
> With them, the accumulated contribution to load_sum, for example, is:
>
> contrib = c1 * weight * freq_scaled;
> contrib += c3 * weight * freq_scaled;
> contrib += c4 * weight * freq_scaled;
>
> Obviously, we can optimize the above and they equate to:
>
> contrib = c1 + c3 + c4;
> contrib *= weight * freq_scaled;
>

So I figured out what it is you're doing, how's this? I still need to
rewrite the Changelog to make this clearer, but I think the code now has
understandable comments.

---


--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2767,7 +2767,7 @@ static const u32 __accumulated_sum_N32[]
* Approximate:
* val * y^n, where y^32 ~= 0.5 (~1 scheduling period)
*/
-static __always_inline u64 decay_load(u64 val, u64 n)
+static u64 decay_load(u64 val, u64 n)
{
unsigned int local_n;

@@ -2795,32 +2795,111 @@ static __always_inline u64 decay_load(u6
return val;
}

-/*
- * For updates fully spanning n periods, the contribution to runnable
- * average will be: \Sum 1024*y^n
- *
- * We can compute this reasonably efficiently by combining:
- * y^PERIOD = 1/2 with precomputed \Sum 1024*y^n {for n <PERIOD}
- */
-static u32 __compute_runnable_contrib(u64 n)
+static u32 __accumulate_sum(u64 periods, u32 period_contrib, u32 remainder)
{
- u32 contrib = 0;
+ u32 contrib;
+
+ if (!periods)
+ return remainder - period_contrib;

- if (likely(n <= LOAD_AVG_PERIOD))
- return runnable_avg_yN_sum[n];
- else if (unlikely(n >= LOAD_AVG_MAX_N))
+ if (unlikely(periods >= LOAD_AVG_MAX_N))
return LOAD_AVG_MAX;

- /* Since n < LOAD_AVG_MAX_N, n/LOAD_AVG_PERIOD < 11 */
- contrib = __accumulated_sum_N32[n/LOAD_AVG_PERIOD];
- n %= LOAD_AVG_PERIOD;
- contrib = decay_load(contrib, n);
- return contrib + runnable_avg_yN_sum[n];
+ /*
+ * c1 y^(p+1) + c3 y^0
+ */
+ remainder += decay_load((u64)(1024 - period_contrib), periods);
+
+ periods -= 1;
+ /*
+ * For updates fully spanning n periods, the contribution to runnable
+ * average will be: 1024 \Sum y^n
+ *
+ * We can compute this reasonably efficiently by combining:
+ *
+ * y^PERIOD = 1/2 with precomputed 1024 \Sum y^n {for: n < PERIOD}
+ */
+ if (likely(periods <= LOAD_AVG_PERIOD)) {
+ contrib = runnable_avg_yN_sum[periods];
+ } else {
+ contrib = __accumulated_sum_N32[periods/LOAD_AVG_PERIOD];
+ periods %= LOAD_AVG_PERIOD;
+ contrib = decay_load(contrib, periods);
+ contrib += runnable_avg_yN_sum[periods];
+ }
+
+ return contrib + remainder;
}

#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)

/*
+ * Accumulate the three separate parts of the sum; c1 the remainder
+ * of the last (incomplete) period, c2 the span of full periods and c3
+ * the remainder of the (incomplete) current period.
+ *
+ *           c1          c2           c3
+ *           ^           ^            ^
+ *           |           |            |
+ *         |<->|<----------------->|<--->|
+ * ... |---x---|------| ... |------|-----x (now)
+ *
+ *                               p
+ * u' = (u + c1) y^(p+1) + 1024 \Sum y^n + c3 y^0
+ *                              n=1
+ *
+ *    = u y^(p+1) +                              (Step 1)
+ *
+ *                          p
+ *      c1 y^(p+1) + 1024 \Sum y^n + c3 y^0      (Step 2)
+ *                         n=1
+ */
+static __always_inline u32
+accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
+ unsigned long weight, int running, struct cfs_rq *cfs_rq)
+{
+ unsigned long scale_freq, scale_cpu;
+ u64 periods;
+ u32 contrib;
+
+ scale_freq = arch_scale_freq_capacity(NULL, cpu);
+ scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
+
+ delta += sa->period_contrib;
+ periods = delta / 1024; /* A period is 1024us (~1ms) */
+
+ /*
+ * Step 1: decay old *_sum if we crossed period boundaries.
+ */
+ if (periods) {
+ sa->load_sum = decay_load(sa->load_sum, periods);
+ if (cfs_rq) {
+ cfs_rq->runnable_load_sum =
+ decay_load(cfs_rq->runnable_load_sum, periods);
+ }
+ sa->util_sum = decay_load((u64)(sa->util_sum), periods);
+ }
+
+ /*
+ * Step 2
+ */
+ delta %= 1024;
+ contrib = __accumulate_sum(periods, sa->period_contrib, delta);
+ sa->period_contrib = delta;
+
+ contrib = cap_scale(contrib, scale_freq);
+ if (weight) {
+ sa->load_sum += weight * contrib;
+ if (cfs_rq)
+ cfs_rq->runnable_load_sum += weight * contrib;
+ }
+ if (running)
+ sa->util_sum += contrib * scale_cpu;
+
+ return periods;
+}
+
+/*
* We can represent the historical contribution to runnable average as the
* coefficients of a geometric series. To do this we sub-divide our runnable
* history into segments of approximately 1ms (1024us); label the segment that
@@ -2852,10 +2931,7 @@ static __always_inline int
___update_load_avg(u64 now, int cpu, struct sched_avg *sa,
unsigned long weight, int running, struct cfs_rq *cfs_rq)
{
- u64 delta, scaled_delta, periods;
- u32 contrib;
- unsigned int delta_w, scaled_delta_w, decayed = 0;
- unsigned long scale_freq, scale_cpu;
+ u64 delta;

delta = now - sa->last_update_time;
/*
@@ -2876,81 +2952,27 @@ ___update_load_avg(u64 now, int cpu, str
return 0;
sa->last_update_time = now;

- scale_freq = arch_scale_freq_capacity(NULL, cpu);
- scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
-
- /* delta_w is the amount already accumulated against our next period */
- delta_w = sa->period_contrib;
- if (delta + delta_w >= 1024) {
- decayed = 1;
-
- /* how much left for next period will start over, we don't know yet */
- sa->period_contrib = 0;
-
- /*
- * Now that we know we're crossing a period boundary, figure
- * out how much from delta we need to complete the current
- * period and accrue it.
- */
- delta_w = 1024 - delta_w;
- scaled_delta_w = cap_scale(delta_w, scale_freq);
- if (weight) {
- sa->load_sum += weight * scaled_delta_w;
- if (cfs_rq) {
- cfs_rq->runnable_load_sum +=
- weight * scaled_delta_w;
- }
- }
- if (running)
- sa->util_sum += scaled_delta_w * scale_cpu;
-
- delta -= delta_w;
-
- /* Figure out how many additional periods this update spans */
- periods = delta / 1024;
- delta %= 1024;
-
- sa->load_sum = decay_load(sa->load_sum, periods + 1);
- if (cfs_rq) {
- cfs_rq->runnable_load_sum =
- decay_load(cfs_rq->runnable_load_sum, periods + 1);
- }
- sa->util_sum = decay_load((u64)(sa->util_sum), periods + 1);
-
- /* Efficiently calculate \sum (1..n_period) 1024*y^i */
- contrib = __compute_runnable_contrib(periods);
- contrib = cap_scale(contrib, scale_freq);
- if (weight) {
- sa->load_sum += weight * contrib;
- if (cfs_rq)
- cfs_rq->runnable_load_sum += weight * contrib;
- }
- if (running)
- sa->util_sum += contrib * scale_cpu;
- }
-
- /* Remainder of delta accrued against u_0` */
- scaled_delta = cap_scale(delta, scale_freq);
- if (weight) {
- sa->load_sum += weight * scaled_delta;
- if (cfs_rq)
- cfs_rq->runnable_load_sum += weight * scaled_delta;
- }
- if (running)
- sa->util_sum += scaled_delta * scale_cpu;
-
- sa->period_contrib += delta;
+ /*
+ * Now we know we crossed measurement unit boundaries. The *_avg
+ * accrues by two steps:
+ *
+ * Step 1: accumulate *_sum since last_update_time. If we haven't
+ * crossed period boundaries, finish.
+ */
+ if (!accumulate_sum(delta, cpu, sa, weight, running, cfs_rq))
+ return 0;

- if (decayed) {
- sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX);
- if (cfs_rq) {
- cfs_rq->runnable_load_avg =
- div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX);
- }
- sa->util_avg = sa->util_sum / LOAD_AVG_MAX;
+ /*
+ * Step 2: update *_avg.
+ */
+ sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX);
+ if (cfs_rq) {
+ cfs_rq->runnable_load_avg =
+ div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX);
}
+ sa->util_avg = sa->util_sum / LOAD_AVG_MAX;

- return decayed;
+ return 1;
}

static int
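
For readers who want to convince themselves that the new Step 1 / Step 2 flow accrues exactly what the old one did, here is a minimal standalone sketch (not kernel code): it uses double precision and pow() in place of the runnable_avg_yN_inv fixed-point tables, and the starting sum, period_contrib and delta values are arbitrary.

#include <stdio.h>
#include <math.h>

#define PERIOD 1024
static const double y = 0.97857206;     /* y^32 ~= 0.5 */

/* Reference: accrue one microsecond at a time, decaying the whole sum
 * by y every time a 1024us period boundary is crossed. */
static double naive(double sum, int period_contrib, int delta)
{
        int pc = period_contrib;

        for (int us = 0; us < delta; us++) {
                sum += 1.0;
                if (++pc == PERIOD) {
                        sum *= y;
                        pc = 0;
                }
        }
        return sum;
}

/* The patch's flow: Step 1 decays the old sum by y^periods, Step 2 adds
 * c1 = d1 y^periods, c2 = 1024 \Sum y^n and c3 = d3. */
static double decomposed(double sum, int period_contrib, int delta)
{
        int periods = (period_contrib + delta) / PERIOD;
        int d3 = (period_contrib + delta) % PERIOD;
        int d1 = PERIOD - period_contrib;
        double c2 = 0.0;

        if (!periods)
                return sum + delta;             /* no period boundary crossed */

        sum *= pow(y, periods);                 /* Step 1 */

        for (int n = 1; n <= periods - 1; n++)  /* the periods-1 full periods */
                c2 += PERIOD * pow(y, n);

        return sum + d1 * pow(y, periods) + c2 + d3;    /* Step 2 */
}

int main(void)
{
        printf("naive      = %f\n", naive(20000.0, 200, 5000));
        printf("decomposed = %f\n", decomposed(20000.0, 200, 5000));
        return 0;
}

Both paths print the same value up to floating point rounding, which is the equivalence the comment in the patch describes.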


2017-03-29 08:11:04

by Yuyang Du

Subject: Re: [RESEND PATCH 2/2] sched/fair: Optimize __update_sched_avg()

Hi Peter,

On Tue, Mar 28, 2017 at 04:46:25PM +0200, Peter Zijlstra wrote:
> On Mon, Feb 13, 2017 at 05:44:23AM +0800, Yuyang Du wrote:
> > __update_load_avg() has the following steps:
> >
> > 1. add the remainder of the last incomplete period
> > 2. decay old sum
> > 3. accumulate new sum in full periods since last_update_time
> > 4. accumulate the current incomplete period
> > 5. update averages
> >
> > However, there is no need to separately compute steps 1, 3, and 4.
> >
> > Illustration:
> >
> > c1 c3 c4
> > ^ ^ ^
> > | | |
> > |<->|<----------------->|<--->|
> > ... |---x---|------| ... |------|-----x (now)
> >
> > c1, c3, and c4 are the accumulated (meanwhile decayed) contributions
> > in timing in steps 1, 3, and 4 respectively.
> >
> > With them, the accumulated contribution to load_sum, for example, is:
> >
> > contrib = c1 * weight * freq_scaled;
> > contrib += c3 * weight * freq_scaled;
> > contrib += c4 * weight * freq_scaled;
> >
> > Obviously, we can optimize the above and they equate to:
> >
> > contrib = c1 + c3 + c4;
> > contrib *= weight * freq_scaled;
> >
>
> So I figured out what it is you're doing, how's this? I still need to
> rewrite the Changelog to make this clearer,

Yes, you need to, and let me do it too and learn how you will rewrite
it.

Subject: [PATCH] sched/fair: Optimize __update_sched_avg()

In __update_load_avg(), the flow of periods and update times is
illustrated as:

          t1          t3           t4
          ^           ^            ^
          |           |            |
        |<->|<----------------->|<--->|
... |---x---|------| ... |------|-----x (now)

in which t1 is the remainder of the last incomplete period, t3
is the full periods since last_update_time, and t4 is the
current incomplete period.

These times altogether make contributions to the load_sum with
the following 5 steps:

step 1: t1 is decayed as c1
step 2: the last sum is decayed
step 3: t3 is accumulated as c3
step 4: t4 is accumulated as c4
step 5: the sum is averaged

These contributions are further scaled with weight and frequency:

contrib = c1 * weight * freq_scaled;
contrib += c3 * weight * freq_scaled;
contrib += c4 * weight * freq_scaled;

Obviously, they equate to:

contrib = c1 + c3 + c4;
contrib *= weight * freq_scaled;

By doing so, we save instructions and clean up the code. With that, c1
must be decayed separately, as opposed to being decayed together with
the old sum in step 2, but the two approaches cost about the same if
you compare a decay_load() call with a round of contrib scaling.

Code size comparison (with allyesconfig):

Before: kernel/sched/built-in.o 1213304
After : kernel/sched/built-in.o 1212536 (-768B)

> but I think the code now has
> understandable comments.

Yes, you made it. It's much better now.

Thanks,
Yuyang

2017-03-29 10:41:46

by Peter Zijlstra

Subject: Re: [RESEND PATCH 2/2] sched/fair: Optimize __update_sched_avg()

On Wed, Mar 29, 2017 at 08:04:42AM +0800, Yuyang Du wrote:
> Yes, you need to, and let me do it too and learn how you will rewrite
> it.

I've meanwhile written this. Does that work for you?

---
Subject: sched/fair: Optimize ___update_sched_avg()
From: Yuyang Du <[email protected]>
Date: Mon, 13 Feb 2017 05:44:23 +0800

The main PELT function ___update_load_avg(), that implements the
accumulation and progression of the geometric average series, is
implemented along the following lines for the scenario where the time
delta spans all 3 possible sections (see figure below):

1. add the remainder of the last incomplete period
2. decay old sum
3. accumulate new sum in full periods since last_update_time
4. accumulate the current incomplete period
5. update averages

Or:

          d1          d2           d3
          ^           ^            ^
          |           |            |
        |<->|<----------------->|<--->|
... |---x---|------| ... |------|-----x (now)


  load_sum' = (load_sum + weight * scale * d1) * y^(p+1) +       (1,2)

                                         p
              weight * scale * 1024 * \Sum y^n +                   (3)
                                        n=1

              weight * sclae * d3 * y^0                            (4)

  load_avg' = load_sum' / LOAD_AVG_MAX                             (5)


Where:

d1 - is the delta part completing the remainder of the last
incomplete period,
d2 - is the delta part spanning complete periods, and
d3 - is the delta part starting the current incomplete period.


We can simplify the code in two steps; the first step is to separate
the first term into new and old parts like:

  (load_sum + weight * scale * d1) * y^(p+1) = load_sum * y^(p+1) +
                                               weight * scale * d1 * y^(p+1)

Once we've done that, it's easy to see that all new terms carry the
common factors:

  weight * scale

If we factor those out, we arrive at the form:

  load_sum' = load_sum * y^(p+1) +

              weight * scale * (d1 * y^(p+1) +

                                         p
                                1024 * \Sum y^n +
                                        n=1

                                d3 * y^0)

Which results in a simpler, smaller and faster implementation.
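
(As a quick numerical illustration of the factoring, not part of the patch: a standalone sketch with a floating-point y replacing the fixed-point tables; the d1, p, d3, weight and scale values are arbitrary.)

#include <stdio.h>
#include <math.h>

static const double y = 0.97857206;     /* y^32 ~= 0.5 */

int main(void)
{
        double d1 = 824.0, d3 = 80.0;   /* partial-period segments, in us */
        int p = 4;                      /* full periods spanned */
        double weight = 1024.0, scale = 0.8;
        double series = 0.0;

        for (int n = 1; n <= p; n++)
                series += 1024.0 * pow(y, n);

        /* Scale each segment separately, as the old code did. */
        double separate = weight * scale * d1 * pow(y, p + 1) +
                          weight * scale * series +
                          weight * scale * d3;

        /* Accumulate the time segments first, scale once. */
        double factored = weight * scale *
                          (d1 * pow(y, p + 1) + series + d3);

        printf("separate = %f, factored = %f\n", separate, factored);
        return 0;
}

The two agree up to rounding; the point is that weight and scale get applied once per update rather than once per segment.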

Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
Signed-off-by: Yuyang Du <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Link: http://lkml.kernel.org/r/[email protected]
---
kernel/sched/fair.c | 212 ++++++++++++++++++++++++++++------------------------
1 file changed, 118 insertions(+), 94 deletions(-)

--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2767,7 +2767,7 @@ static const u32 __accumulated_sum_N32[]
* Approximate:
* val * y^n, where y^32 ~= 0.5 (~1 scheduling period)
*/
-static __always_inline u64 decay_load(u64 val, u64 n)
+static u64 decay_load(u64 val, u64 n)
{
unsigned int local_n;

@@ -2795,32 +2795,113 @@ static __always_inline u64 decay_load(u6
return val;
}

-/*
- * For updates fully spanning n periods, the contribution to runnable
- * average will be: \Sum 1024*y^n
- *
- * We can compute this reasonably efficiently by combining:
- * y^PERIOD = 1/2 with precomputed \Sum 1024*y^n {for n <PERIOD}
- */
-static u32 __compute_runnable_contrib(u64 n)
+static u32 __accumulate_sum(u64 periods, u32 period_contrib, u32 remainder)
{
- u32 contrib = 0;
+ u32 c1, c2, c3 = remainder; /* y^0 == 1 */

- if (likely(n <= LOAD_AVG_PERIOD))
- return runnable_avg_yN_sum[n];
- else if (unlikely(n >= LOAD_AVG_MAX_N))
+ if (!periods)
+ return remainder - period_contrib;
+
+ if (unlikely(periods >= LOAD_AVG_MAX_N))
return LOAD_AVG_MAX;

- /* Since n < LOAD_AVG_MAX_N, n/LOAD_AVG_PERIOD < 11 */
- contrib = __accumulated_sum_N32[n/LOAD_AVG_PERIOD];
- n %= LOAD_AVG_PERIOD;
- contrib = decay_load(contrib, n);
- return contrib + runnable_avg_yN_sum[n];
+ /*
+ * c1 = d1 y^(p+1)
+ */
+ c1 = decay_load((u64)(1024 - period_contrib), periods);
+
+ periods -= 1;
+ /*
+ * For updates fully spanning n periods, the contribution to runnable
+ * average will be:
+ *
+ * c2 = 1024 \Sum y^n
+ *
+ * We can compute this reasonably efficiently by combining:
+ *
+ * y^PERIOD = 1/2 with precomputed 1024 \Sum y^n {for: n < PERIOD}
+ */
+ if (likely(periods <= LOAD_AVG_PERIOD)) {
+ c2 = runnable_avg_yN_sum[periods];
+ } else {
+ c2 = __accumulated_sum_N32[periods/LOAD_AVG_PERIOD];
+ periods %= LOAD_AVG_PERIOD;
+ c2 = decay_load(c2, periods);
+ c2 += runnable_avg_yN_sum[periods];
+ }
+
+ return c1 + c2 + c3;
}

#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)

/*
+ * Accumulate the three separate parts of the sum; d1 the remainder
+ * of the last (incomplete) period, d2 the span of full periods and d3
+ * the remainder of the (incomplete) current period.
+ *
+ *           d1          d2           d3
+ *           ^           ^            ^
+ *           |           |            |
+ *         |<->|<----------------->|<--->|
+ * ... |---x---|------| ... |------|-----x (now)
+ *
+ *                               p
+ * u' = (u + d1) y^(p+1) + 1024 \Sum y^n + d3 y^0
+ *                              n=1
+ *
+ *    = u y^(p+1) +                              (Step 1)
+ *
+ *                          p
+ *      d1 y^(p+1) + 1024 \Sum y^n + d3 y^0      (Step 2)
+ *                         n=1
+ */
+static __always_inline u32
+accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
+ unsigned long weight, int running, struct cfs_rq *cfs_rq)
+{
+ unsigned long scale_freq, scale_cpu;
+ u64 periods;
+ u32 contrib;
+
+ scale_freq = arch_scale_freq_capacity(NULL, cpu);
+ scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
+
+ delta += sa->period_contrib;
+ periods = delta / 1024; /* A period is 1024us (~1ms) */
+
+ /*
+ * Step 1: decay old *_sum if we crossed period boundaries.
+ */
+ if (periods) {
+ sa->load_sum = decay_load(sa->load_sum, periods);
+ if (cfs_rq) {
+ cfs_rq->runnable_load_sum =
+ decay_load(cfs_rq->runnable_load_sum, periods);
+ }
+ sa->util_sum = decay_load((u64)(sa->util_sum), periods);
+ }
+
+ /*
+ * Step 2
+ */
+ delta %= 1024;
+ contrib = __accumulate_sum(periods, sa->period_contrib, delta);
+ sa->period_contrib = delta;
+
+ contrib = cap_scale(contrib, scale_freq);
+ if (weight) {
+ sa->load_sum += weight * contrib;
+ if (cfs_rq)
+ cfs_rq->runnable_load_sum += weight * contrib;
+ }
+ if (running)
+ sa->util_sum += contrib * scale_cpu;
+
+ return periods;
+}
+
+/*
* We can represent the historical contribution to runnable average as the
* coefficients of a geometric series. To do this we sub-divide our runnable
* history into segments of approximately 1ms (1024us); label the segment that
@@ -2852,10 +2933,7 @@ static __always_inline int
___update_load_avg(u64 now, int cpu, struct sched_avg *sa,
unsigned long weight, int running, struct cfs_rq *cfs_rq)
{
- u64 delta, scaled_delta, periods;
- u32 contrib;
- unsigned int delta_w, scaled_delta_w, decayed = 0;
- unsigned long scale_freq, scale_cpu;
+ u64 delta;

delta = now - sa->last_update_time;
/*
@@ -2876,81 +2954,27 @@ ___update_load_avg(u64 now, int cpu, str
return 0;
sa->last_update_time = now;

- scale_freq = arch_scale_freq_capacity(NULL, cpu);
- scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
-
- /* delta_w is the amount already accumulated against our next period */
- delta_w = sa->period_contrib;
- if (delta + delta_w >= 1024) {
- decayed = 1;
-
- /* how much left for next period will start over, we don't know yet */
- sa->period_contrib = 0;
-
- /*
- * Now that we know we're crossing a period boundary, figure
- * out how much from delta we need to complete the current
- * period and accrue it.
- */
- delta_w = 1024 - delta_w;
- scaled_delta_w = cap_scale(delta_w, scale_freq);
- if (weight) {
- sa->load_sum += weight * scaled_delta_w;
- if (cfs_rq) {
- cfs_rq->runnable_load_sum +=
- weight * scaled_delta_w;
- }
- }
- if (running)
- sa->util_sum += scaled_delta_w * scale_cpu;
-
- delta -= delta_w;
-
- /* Figure out how many additional periods this update spans */
- periods = delta / 1024;
- delta %= 1024;
-
- sa->load_sum = decay_load(sa->load_sum, periods + 1);
- if (cfs_rq) {
- cfs_rq->runnable_load_sum =
- decay_load(cfs_rq->runnable_load_sum, periods + 1);
- }
- sa->util_sum = decay_load((u64)(sa->util_sum), periods + 1);
-
- /* Efficiently calculate \sum (1..n_period) 1024*y^i */
- contrib = __compute_runnable_contrib(periods);
- contrib = cap_scale(contrib, scale_freq);
- if (weight) {
- sa->load_sum += weight * contrib;
- if (cfs_rq)
- cfs_rq->runnable_load_sum += weight * contrib;
- }
- if (running)
- sa->util_sum += contrib * scale_cpu;
- }
-
- /* Remainder of delta accrued against u_0` */
- scaled_delta = cap_scale(delta, scale_freq);
- if (weight) {
- sa->load_sum += weight * scaled_delta;
- if (cfs_rq)
- cfs_rq->runnable_load_sum += weight * scaled_delta;
- }
- if (running)
- sa->util_sum += scaled_delta * scale_cpu;
-
- sa->period_contrib += delta;
+ /*
+ * Now we know we crossed measurement unit boundaries. The *_avg
+ * accrues by two steps:
+ *
+ * Step 1: accumulate *_sum since last_update_time. If we haven't
+ * crossed period boundaries, finish.
+ */
+ if (!accumulate_sum(delta, cpu, sa, weight, running, cfs_rq))
+ return 0;

- if (decayed) {
- sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX);
- if (cfs_rq) {
- cfs_rq->runnable_load_avg =
- div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX);
- }
- sa->util_avg = sa->util_sum / LOAD_AVG_MAX;
+ /*
+ * Step 2: update *_avg.
+ */
+ sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX);
+ if (cfs_rq) {
+ cfs_rq->runnable_load_avg =
+ div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX);
}
+ sa->util_avg = sa->util_sum / LOAD_AVG_MAX;

- return decayed;
+ return 1;
}

static int

2017-03-30 02:46:50

by Yuyang Du

Subject: Re: [RESEND PATCH 2/2] sched/fair: Optimize __update_sched_avg()

Hi Peter,

On Wed, Mar 29, 2017 at 12:41:26PM +0200, Peter Zijlstra wrote:
> On Wed, Mar 29, 2017 at 08:04:42AM +0800, Yuyang Du wrote:
> > Yes, you need to, and let me do it too and learn how you will rewrite
> > it.
>
> I've meanwhile written this. Does that work for you?

It works. You sort it out.

I hope I get along with going into detail too ...

Some grammar and typo issues:

> ---
> Subject: sched/fair: Optimize ___update_sched_avg()
> From: Yuyang Du <[email protected]>
> Date: Mon, 13 Feb 2017 05:44:23 +0800
>
> The main PELT function ___update_load_avg(), that implements the
                                               ~~~~
                                               which

> accumulation and progression of the geometric average series, is
> implemented along the following lines for the scenario where the time
> delta spans all 3 possible sections (see figure below):
>
> 1. add the remainder of the last incomplete period
> 2. decay old sum
> 3. accumulate new sum in full periods since last_update_time
> 4. accumulate the current incomplete period
> 5. update averages
>
> Or:
>
> d1 d2 d3
> ^ ^ ^
> | | |
> |<->|<----------------->|<--->|
> ... |---x---|------| ... |------|-----x (now)
>
>
> load_sum' = (load_sum + weight * scale * d1) * y^(p+1) + (1,2)
>
> p
> weight * scale * 1024 * \Sum y^n + (3)
> n=1
>
> weight * sclae * d3 * y^0 (4)
           ~~~~~
           scale

Thanks,
Yuyang

2017-03-30 11:21:47

by Paul Turner

Subject: Re: [RESEND PATCH 2/2] sched/fair: Optimize __update_sched_avg()

> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -2767,7 +2767,7 @@ static const u32 __accumulated_sum_N32[]
> * Approximate:
> * val * y^n, where y^32 ~= 0.5 (~1 scheduling period)
> */
> -static __always_inline u64 decay_load(u64 val, u64 n)
> +static u64 decay_load(u64 val, u64 n)
> {
> unsigned int local_n;
>
> @@ -2795,32 +2795,113 @@ static __always_inline u64 decay_load(u6
> return val;
> }
>
> -/*
> - * For updates fully spanning n periods, the contribution to runnable
> - * average will be: \Sum 1024*y^n
> - *
> - * We can compute this reasonably efficiently by combining:
> - * y^PERIOD = 1/2 with precomputed \Sum 1024*y^n {for n <PERIOD}
> - */
> -static u32 __compute_runnable_contrib(u64 n)
> +static u32 __accumulate_sum(u64 periods, u32 period_contrib, u32 remainder)

- The naming here is really ambiguous:
"__accumulate_sum" -> "__accumulate_pelt_segments"?
- Passing in "remainder" seems irrelevant to the sum accumulation. It would be
more clear to handle it from the caller.

>
> {
> - u32 contrib = 0;
> + u32 c1, c2, c3 = remainder; /* y^0 == 1 */
>
> - if (likely(n <= LOAD_AVG_PERIOD))
> - return runnable_avg_yN_sum[n];
> - else if (unlikely(n >= LOAD_AVG_MAX_N))
> + if (!periods)
> + return remainder - period_contrib;

This is super confusing. It only works because remainder already had
period_contrib aggregated _into_ it. We're literally computing:
remainder + period_contrib - period_contrib

We should just not call this in the !periods case and handle the remainder
below.
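
For reference, a tiny standalone sketch (illustrative values only, not kernel code) of the !periods arithmetic being discussed:

#include <stdio.h>

int main(void)
{
        unsigned int period_contrib = 200;      /* already accrued in this period */
        unsigned int delta = 300;               /* new time; no boundary crossed */
        unsigned int d = delta + period_contrib;        /* 500 */
        unsigned int periods = d / 1024;                /* 0 */
        unsigned int remainder = d % 1024;              /* 500: still has period_contrib in it */

        if (!periods)   /* remainder - period_contrib == the original delta */
                printf("contrib = %u\n", remainder - period_contrib);
        return 0;
}

It prints 300, i.e. exactly the delta that was added in, which is the identity being pointed out here.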

> +
> + if (unlikely(periods >= LOAD_AVG_MAX_N))
> return LOAD_AVG_MAX;
>
> - /* Since n < LOAD_AVG_MAX_N, n/LOAD_AVG_PERIOD < 11 */
> - contrib = __accumulated_sum_N32[n/LOAD_AVG_PERIOD];
> - n %= LOAD_AVG_PERIOD;
> - contrib = decay_load(contrib, n);
> - return contrib + runnable_avg_yN_sum[n];
> + /*
> + * c1 = d1 y^(p+1)
> + */
> + c1 = decay_load((u64)(1024 - period_contrib), periods);
> +
> + periods -= 1;
> + /*
> + * For updates fully spanning n periods, the contribution to runnable
> + * average will be:
> + *
> + * c2 = 1024 \Sum y^n
> + *
> + * We can compute this reasonably efficiently by combining:
> + *
> + * y^PERIOD = 1/2 with precomputed 1024 \Sum y^n {for: n < PERIOD}
> + */
> + if (likely(periods <= LOAD_AVG_PERIOD)) {
> + c2 = runnable_avg_yN_sum[periods];
> + } else {
> + c2 = __accumulated_sum_N32[periods/LOAD_AVG_PERIOD];

This still wants the comment justifying why we can't overflow.

> + periods %= LOAD_AVG_PERIOD;
> + c2 = decay_load(c2, periods);
> + c2 += runnable_avg_yN_sum[periods];
> + }
> +
> + return c1 + c2 + c3;
> }
>
> #define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
>
> /*
> + * Accumulate the three separate parts of the sum; d1 the remainder
> + * of the last (incomplete) period, d2 the span of full periods and d3
> + * the remainder of the (incomplete) current period.
> + *
> + * d1 d2 d3
> + * ^ ^ ^
> + * | | |
> + * |<->|<----------------->|<--->|
> + * ... |---x---|------| ... |------|-----x (now)
> + *
> + * p
> + * u' = (u + d1) y^(p+1) + 1024 \Sum y^n + d3 y^0
> + * n=1
> + *
> + * = u y^(p+1) + (Step 1)
> + *
> + * p
> + * d1 y^(p+1) + 1024 \Sum y^n + d3 y^0 (Step 2)
> + * n=1
> + */
> +static __always_inline u32
> +accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
> + unsigned long weight, int running, struct cfs_rq *cfs_rq)
> +{
> + unsigned long scale_freq, scale_cpu;
> + u64 periods;
> + u32 contrib;
> +
> + scale_freq = arch_scale_freq_capacity(NULL, cpu);
> + scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
> +
> + delta += sa->period_contrib;
> + periods = delta / 1024; /* A period is 1024us (~1ms) */
> +
> + if (periods) {
> + sa->load_sum = decay_load(sa->load_sum, periods);
> + if (cfs_rq) {
> + cfs_rq->runnable_load_sum =
> + decay_load(cfs_rq->runnable_load_sum, periods);
> + }
> + sa->util_sum = decay_load((u64)(sa->util_sum), periods);

Move step 2 here, handle remainder below.

> + }
> +
> + /*
> + * Step 2
> + */
> + delta %= 1024;
> + contrib = __accumulate_sum(periods, sa->period_contrib, delta);
> + sa->period_contrib = delta;
> +
> + contrib = cap_scale(contrib, scale_freq);
> + if (weight) {
> + sa->load_sum += weight * contrib;

Is this correct in the iterated periods > LOAD_AVG_MAX_N case?
I don't think the decay above is guaranteed to return these to zero.

> + if (cfs_rq)
> + cfs_rq->runnable_load_sum += weight * contrib;
> + }
> + if (running)
> + sa->util_sum += contrib * scale_cpu;
> +
> + return periods;
> +}
> +
> +/*
> * We can represent the historical contribution to runnable average as the
> * coefficients of a geometric series. To do this we sub-divide our runnable
> * history into segments of approximately 1ms (1024us); label the segment that
> @@ -2852,10 +2933,7 @@ static __always_inline int
> ___update_load_avg(u64 now, int cpu, struct sched_avg *sa,
> unsigned long weight, int running, struct cfs_rq *cfs_rq)
> {
> - u64 delta, scaled_delta, periods;
> - u32 contrib;
> - unsigned int delta_w, scaled_delta_w, decayed = 0;
> - unsigned long scale_freq, scale_cpu;
> + u64 delta;
>
> delta = now - sa->last_update_time;
> /*
> @@ -2876,81 +2954,27 @@ ___update_load_avg(u64 now, int cpu, str
> return 0;
> sa->last_update_time = now;
>
> - scale_freq = arch_scale_freq_capacity(NULL, cpu);
> - scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
> -
> - /* delta_w is the amount already accumulated against our next period */
> - delta_w = sa->period_contrib;
> - if (delta + delta_w >= 1024) {
> - decayed = 1;
> -
> - /* how much left for next period will start over, we don't know yet */
> - sa->period_contrib = 0;
> -
> - /*
> - * Now that we know we're crossing a period boundary, figure
> - * out how much from delta we need to complete the current
> - * period and accrue it.
> - */
> - delta_w = 1024 - delta_w;
> - scaled_delta_w = cap_scale(delta_w, scale_freq);
> - if (weight) {
> - sa->load_sum += weight * scaled_delta_w;
> - if (cfs_rq) {
> - cfs_rq->runnable_load_sum +=
> - weight * scaled_delta_w;
> - }
> - }
> - if (running)
> - sa->util_sum += scaled_delta_w * scale_cpu;
> -
> - delta -= delta_w;
> -
> - /* Figure out how many additional periods this update spans */
> - periods = delta / 1024;
> - delta %= 1024;
> -
> - sa->load_sum = decay_load(sa->load_sum, periods + 1);
> - if (cfs_rq) {
> - cfs_rq->runnable_load_sum =
> - decay_load(cfs_rq->runnable_load_sum, periods + 1);
> - }
> - sa->util_sum = decay_load((u64)(sa->util_sum), periods + 1);
> -
> - /* Efficiently calculate \sum (1..n_period) 1024*y^i */
> - contrib = __compute_runnable_contrib(periods);
> - contrib = cap_scale(contrib, scale_freq);
> - if (weight) {
> - sa->load_sum += weight * contrib;
> - if (cfs_rq)
> - cfs_rq->runnable_load_sum += weight * contrib;
> - }
> - if (running)
> - sa->util_sum += contrib * scale_cpu;
> - }
> -
> - /* Remainder of delta accrued against u_0` */
> - scaled_delta = cap_scale(delta, scale_freq);
> - if (weight) {
> - sa->load_sum += weight * scaled_delta;
> - if (cfs_rq)
> - cfs_rq->runnable_load_sum += weight * scaled_delta;
> - }
> - if (running)
> - sa->util_sum += scaled_delta * scale_cpu;
> -
> - sa->period_contrib += delta;
> + /*
> + * Now we know we crossed measurement unit boundaries. The *_avg
> + * accrues by two steps:
> + *
> + * Step 1: accumulate *_sum since last_update_time. If we haven't
> + * crossed period boundaries, finish.
> + */
> + if (!accumulate_sum(delta, cpu, sa, weight, running, cfs_rq))
> + return 0;
>
> - if (decayed) {
> - sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX);
> - if (cfs_rq) {
> - cfs_rq->runnable_load_avg =
> - div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX);
> - }
> - sa->util_avg = sa->util_sum / LOAD_AVG_MAX;
> + /*
> + * Step 2: update *_avg.
> + */
> + sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX);
> + if (cfs_rq) {
> + cfs_rq->runnable_load_avg =
> + div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX);
> }
> + sa->util_avg = sa->util_sum / LOAD_AVG_MAX;
>
> - return decayed;
> + return 1;
> }
>
> static int

2017-03-30 12:17:18

by Peter Zijlstra

Subject: Re: [RESEND PATCH 2/2] sched/fair: Optimize __update_sched_avg()

On Thu, Mar 30, 2017 at 04:21:08AM -0700, Paul Turner wrote:
> > --- a/kernel/sched/fair.c
> > +++ b/kernel/sched/fair.c
> > @@ -2767,7 +2767,7 @@ static const u32 __accumulated_sum_N32[]
> > * Approximate:
> > * val * y^n, where y^32 ~= 0.5 (~1 scheduling period)
> > */
> > -static __always_inline u64 decay_load(u64 val, u64 n)
> > +static u64 decay_load(u64 val, u64 n)
> > {
> > unsigned int local_n;
> >
> > @@ -2795,32 +2795,113 @@ static __always_inline u64 decay_load(u6
> > return val;
> > }
> >
> > -/*
> > - * For updates fully spanning n periods, the contribution to runnable
> > - * average will be: \Sum 1024*y^n
> > - *
> > - * We can compute this reasonably efficiently by combining:
> > - * y^PERIOD = 1/2 with precomputed \Sum 1024*y^n {for n <PERIOD}
> > - */
> > -static u32 __compute_runnable_contrib(u64 n)
> > +static u32 __accumulate_sum(u64 periods, u32 period_contrib, u32 remainder)
>
> - The naming here is really ambiguous:
> "__accumulate_sum" -> "__accumulate_pelt_segments"?

OK, I did struggle with that a bit too but failed to improve, I'll change it.

> - Passing in "remainder" seems irrelevant to the sum accumulation. It would be
> more clear to handle it from the caller.

Well, this way we have all 3 delta parts in one function. I'll try it
and see what it looks like though.

> >
> > {
> > - u32 contrib = 0;
> > + u32 c1, c2, c3 = remainder; /* y^0 == 1 */
> >
> > - if (likely(n <= LOAD_AVG_PERIOD))
> > - return runnable_avg_yN_sum[n];
> > - else if (unlikely(n >= LOAD_AVG_MAX_N))
> > + if (!periods)
> > + return remainder - period_contrib;
>
> This is super confusing. It only works because remainder already had
> period_contrib aggregated _into_ it. We're literally computing:
> remainder + period_contrib - period_contrib

Correct; although I didn't find it too confusing. Could be because I'd
been staring at this code for a few hours though.

> We should just not call this in the !periods case and handle the remainder
> below.

I'll change it see what it looks like.

> > +
> > + if (unlikely(periods >= LOAD_AVG_MAX_N))
> > return LOAD_AVG_MAX;
> >
> > - return contrib + runnable_avg_yN_sum[n];
> > + /*
> > + * c1 = d1 y^(p+1)
> > + */
> > + c1 = decay_load((u64)(1024 - period_contrib), periods);
> > +
> > + periods -= 1;
> > + /*
> > + * For updates fully spanning n periods, the contribution to runnable
> > + * average will be:
> > + *
> > + * c2 = 1024 \Sum y^n
> > + *
> > + * We can compute this reasonably efficiently by combining:
> > + *
> > + * y^PERIOD = 1/2 with precomputed 1024 \Sum y^n {for: n < PERIOD}
> > + */
> > + if (likely(periods <= LOAD_AVG_PERIOD)) {
> > + c2 = runnable_avg_yN_sum[periods];
> > + } else {
> > + c2 = __accumulated_sum_N32[periods/LOAD_AVG_PERIOD];
>
> This still wants the comment justifying why we can't overflow.

You mean why periods/LOAD_AVG_PERIOD < ARRAY_SIZE(__accumulated_sum_N32)
? Or something else?

> > + periods %= LOAD_AVG_PERIOD;
> > + c2 = decay_load(c2, periods);
> > + c2 += runnable_avg_yN_sum[periods];
> > + }
> > +
> > + return c1 + c2 + c3;
> > }


> > + /*
> > + * Step 2
> > + */
> > + delta %= 1024;
> > + contrib = __accumulate_sum(periods, sa->period_contrib, delta);
> > + sa->period_contrib = delta;
> > +
> > + contrib = cap_scale(contrib, scale_freq);
> > + if (weight) {
> > + sa->load_sum += weight * contrib;
>
> Is this correct in the iterated periods > LOAD_AVG_MAX_N case?
> I don't think the decay above is guaranteed to return these to zero.

Ah!

Indeed, so decay_load() needs LOAD_AVG_PERIOD * 63 before it truncates
to 0, because every LOAD_AVG_PERIOD we halve the value; lose 1 bit; so
63 of those and we're 0.

But __accumulate_sum() OTOH returns LOAD_AVG_MAX after only
LOAD_AVG_MAX_N, which < LOAD_AVG_PERIOD * 63.

So yes, combined we exceed LOAD_AVG_MAX, which is bad. Let me think what
to do about that.
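
To put numbers on that, here is a simplified standalone sketch (not the kernel's decay_load(): the sub-period factor uses pow() instead of the runnable_avg_yN_inv table):

#include <stdio.h>
#include <math.h>

#define LOAD_AVG_PERIOD 32

static unsigned long long decay_sketch(unsigned long long val, unsigned long long n)
{
        if (n > LOAD_AVG_PERIOD * 63)   /* the kernel truncates here */
                return 0;

        /* y^n = (1/2)^(n/32) * y^(n%32): a shift per full LOAD_AVG_PERIOD */
        val >>= n / LOAD_AVG_PERIOD;
        return (unsigned long long)(val * pow(0.5, (double)(n % LOAD_AVG_PERIOD) / LOAD_AVG_PERIOD));
}

int main(void)
{
        /* after LOAD_AVG_MAX_N = 345 periods: ~27 (26 with the truncation here), not zero */
        printf("%llu\n", decay_sketch(47742, 345));
        /* only after 63 * LOAD_AVG_PERIOD = 2016 periods does it hit zero */
        printf("%llu\n", decay_sketch(47742, 2016));
        return 0;
}

So at the point where __accumulate_sum() already reports LOAD_AVG_MAX (345 periods), a previously saturated sum has only decayed to roughly 27, and adding the two together exceeds LOAD_AVG_MAX.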

> > + if (cfs_rq)
> > + cfs_rq->runnable_load_sum += weight * contrib;
> > + }
> > + if (running)
> > + sa->util_sum += contrib * scale_cpu;
> > +
> > + return periods;
> > +}

2017-03-30 13:47:09

by Peter Zijlstra

Subject: Re: [RESEND PATCH 2/2] sched/fair: Optimize __update_sched_avg()

On Thu, Mar 30, 2017 at 02:16:58PM +0200, Peter Zijlstra wrote:
> On Thu, Mar 30, 2017 at 04:21:08AM -0700, Paul Turner wrote:

> > - The naming here is really ambiguous:
> > "__accumulate_sum" -> "__accumulate_pelt_segments"?
>
> OK, I did struggle with that a bit too but failed to improve, I'll change it.
>
> > - Passing in "remainder" seems irrelevant to the sum accumulation. It would be
> > more clear to handle it from the caller.
>
> Well, this way we have all 3 delta parts in one function. I'll try it
> and see what it looks like though.

> > This is super confusing. It only works because remainder already had
> > period_contrib aggregated _into_ it. We're literally computing:
> > remainder + period_contrib - period_contrib
>
> Correct; although I didn't find it too confusing. Could be because I'd
> been staring at this code for a few hours though.
>
> > We should just not call this in the !periods case and handle the remainder
> > below.
>
> I'll change it see what it looks like.

How's this?

---
kernel/sched/fair.c | 22 ++++++++++------------
1 file changed, 10 insertions(+), 12 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 76f67b3e34d6..10d34498b5fe 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2795,12 +2795,9 @@ static u64 decay_load(u64 val, u64 n)
return val;
}

-static u32 __accumulate_sum(u64 periods, u32 period_contrib, u32 remainder)
+static u32 __accumulate_pelt_segments(u64 periods, u32 d1, u32 d3)
{
- u32 c1, c2, c3 = remainder; /* y^0 == 1 */
-
- if (!periods)
- return remainder - period_contrib;
+ u32 c1, c2, c3 = d3; /* y^0 == 1 */

if (unlikely(periods >= LOAD_AVG_MAX_N))
return LOAD_AVG_MAX;
@@ -2861,8 +2858,8 @@ accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
unsigned long weight, int running, struct cfs_rq *cfs_rq)
{
unsigned long scale_freq, scale_cpu;
+ u32 contrib = delta;
u64 periods;
- u32 contrib;

scale_freq = arch_scale_freq_capacity(NULL, cpu);
scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
@@ -2880,13 +2877,14 @@ accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
decay_load(cfs_rq->runnable_load_sum, periods);
}
sa->util_sum = decay_load((u64)(sa->util_sum), periods);
- }

- /*
- * Step 2
- */
- delta %= 1024;
- contrib = __accumulate_sum(periods, sa->period_contrib, delta);
+ /*
+ * Step 2
+ */
+ delta %= 1024;
+ contrib = __accumulate_pelt_segments(periods,
+ 1024 - sa->period_contrib, delta);
+ }
sa->period_contrib = delta;

contrib = cap_scale(contrib, scale_freq);

2017-03-30 14:14:43

by Peter Zijlstra

Subject: Re: [RESEND PATCH 2/2] sched/fair: Optimize __update_sched_avg()

On Thu, Mar 30, 2017 at 02:16:58PM +0200, Peter Zijlstra wrote:
> On Thu, Mar 30, 2017 at 04:21:08AM -0700, Paul Turner wrote:

> > > +
> > > + if (unlikely(periods >= LOAD_AVG_MAX_N))
> > > return LOAD_AVG_MAX;

> >
> > Is this correct in the iterated periods > LOAD_AVG_MAX_N case?
> > I don't think the decay above is guaranteed to return these to zero.
>
> Ah!
>
> Indeed, so decay_load() needs LOAD_AVG_PERIOD * 63 before it truncates
> to 0, because every LOAD_AVG_PERIOD we half the value; loose 1 bit; so
> 63 of those and we're 0.
>
> But __accumulate_sum() OTOH returns LOAD_AVG_MAX after only
> LOAD_AVG_MAX_N, which < LOAD_AVG_PERIOD * 63.
>
> So yes, combined we exceed LOAD_AVG_MAX, which is bad. Let me think what
> to do about that.


So at the very least it should be decay_load(LOAD_AVG_MAX, 1) (aka
LOAD_AVG_MAX - 1024), but that still doesn't account for the !0
decay_load() of the first segment.

I'm thinking that we can compute the middle segment, by taking the max
value and chopping off the ends, like:


           p
c2 = 1024 \Sum y^n
          n=1

            inf        inf
  = 1024 ( \Sum y^n - \Sum y^n - y^0 )
           n=0        n=p


Which gives something like the below.. Or am I completely off my rocker?
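
A standalone sanity check of the chop-the-ends-off computation (not kernel code; p here counts the full periods, so the discarded tail starts at p+1 as in the patch comment below, and the analytic limit S = 1024/(1-y) stands in for the truncated LOAD_AVG_MAX):

#include <stdio.h>
#include <math.h>

static const double y = 0.97857206;     /* y^32 ~= 0.5 */

int main(void)
{
        double S = 1024.0 / (1.0 - y);  /* 1024 * \Sum_{n=0..inf} y^n */
        int p = 100;                    /* arbitrary number of full periods */
        double direct = 0.0;

        for (int n = 1; n <= p; n++)
                direct += 1024.0 * pow(y, n);

        /* chop off the n=0 head (1024) and the n>p tail (S * y^(p+1)) */
        double closed = S - S * pow(y, p + 1) - 1024.0;

        printf("direct = %f  closed = %f\n", direct, closed);
        return 0;
}

The fixed-point version in the patch below does the same subtraction with LOAD_AVG_MAX and decay_load().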

---
kernel/sched/fair.c | 70 ++++++++++++++---------------------------------------
1 file changed, 18 insertions(+), 52 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 76f67b3e34d6..4f17ec0a378a 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2744,26 +2744,6 @@ static const u32 runnable_avg_yN_inv[] = {
};

/*
- * Precomputed \Sum y^k { 1<=k<=n }. These are floor(true_value) to prevent
- * over-estimates when re-combining.
- */
-static const u32 runnable_avg_yN_sum[] = {
- 0, 1002, 1982, 2941, 3880, 4798, 5697, 6576, 7437, 8279, 9103,
- 9909,10698,11470,12226,12966,13690,14398,15091,15769,16433,17082,
- 17718,18340,18949,19545,20128,20698,21256,21802,22336,22859,23371,
-};
-
-/*
- * Precomputed \Sum y^k { 1<=k<=n, where n%32=0). Values are rolled down to
- * lower integers. See Documentation/scheduler/sched-avg.txt how these
- * were generated:
- */
-static const u32 __accumulated_sum_N32[] = {
- 0, 23371, 35056, 40899, 43820, 45281,
- 46011, 46376, 46559, 46650, 46696, 46719,
-};
-
-/*
* Approximate:
* val * y^n, where y^32 ~= 0.5 (~1 scheduling period)
*/
@@ -2795,40 +2775,25 @@ static u64 decay_load(u64 val, u64 n)
return val;
}

-static u32 __accumulate_sum(u64 periods, u32 period_contrib, u32 remainder)
+static u32 __accumulate_pelt_segments(u64 periods, u32 d1, u32 d3)
{
- u32 c1, c2, c3 = remainder; /* y^0 == 1 */
-
- if (!periods)
- return remainder - period_contrib;
-
- if (unlikely(periods >= LOAD_AVG_MAX_N))
- return LOAD_AVG_MAX;
+ u32 c1, c2, c3 = d3; /* y^0 == 1 */

/*
* c1 = d1 y^(p+1)
*/
- c1 = decay_load((u64)(1024 - period_contrib), periods);
+ c1 = decay_load((u64)d1, periods);

- periods -= 1;
/*
- * For updates fully spanning n periods, the contribution to runnable
- * average will be:
+ *            p
+ * c2 = 1024 \Sum y^n
+ *           n=1
*
- * c2 = 1024 \Sum y^n
- *
- * We can compute this reasonably efficiently by combining:
- *
- * y^PERIOD = 1/2 with precomputed 1024 \Sum y^n {for: n < PERIOD}
+ *              inf        inf
+ *    = 1024 ( \Sum y^n - \Sum y^n - y^0 )
+ *             n=0        n=p+1
*/
- if (likely(periods <= LOAD_AVG_PERIOD)) {
- c2 = runnable_avg_yN_sum[periods];
- } else {
- c2 = __accumulated_sum_N32[periods/LOAD_AVG_PERIOD];
- periods %= LOAD_AVG_PERIOD;
- c2 = decay_load(c2, periods);
- c2 += runnable_avg_yN_sum[periods];
- }
+ c2 = LOAD_AVG_MAX - decay_load(LOAD_AVG_MAX, periods) - 1024;

return c1 + c2 + c3;
}
@@ -2861,8 +2826,8 @@ accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
unsigned long weight, int running, struct cfs_rq *cfs_rq)
{
unsigned long scale_freq, scale_cpu;
+ u32 contrib = delta;
u64 periods;
- u32 contrib;

scale_freq = arch_scale_freq_capacity(NULL, cpu);
scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
@@ -2880,13 +2845,14 @@ accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
decay_load(cfs_rq->runnable_load_sum, periods);
}
sa->util_sum = decay_load((u64)(sa->util_sum), periods);
- }

- /*
- * Step 2
- */
- delta %= 1024;
- contrib = __accumulate_sum(periods, sa->period_contrib, delta);
+ /*
+ * Step 2
+ */
+ delta %= 1024;
+ contrib = __accumulate_pelt_segments(periods,
+ 1024 - sa->period_contrib, delta);
+ }
sa->period_contrib = delta;

contrib = cap_scale(contrib, scale_freq);

2017-03-30 22:03:21

by Paul Turner

Subject: Re: [RESEND PATCH 2/2] sched/fair: Optimize __update_sched_avg()

On Thu, Mar 30, 2017 at 7:14 AM, Peter Zijlstra <[email protected]> wrote:
> On Thu, Mar 30, 2017 at 02:16:58PM +0200, Peter Zijlstra wrote:
>> On Thu, Mar 30, 2017 at 04:21:08AM -0700, Paul Turner wrote:
>
>> > > +
>> > > + if (unlikely(periods >= LOAD_AVG_MAX_N))
>> > > return LOAD_AVG_MAX;
>
>> >
>> > Is this correct in the iterated periods > LOAD_AVG_MAX_N case?
>> > I don't think the decay above is guaranteed to return these to zero.
>>
>> Ah!
>>
>> Indeed, so decay_load() needs LOAD_AVG_PERIOD * 63 before it truncates
>> to 0, because every LOAD_AVG_PERIOD we half the value; loose 1 bit; so
>> 63 of those and we're 0.
>>
>> But __accumulate_sum() OTOH returns LOAD_AVG_MAX after only
>> LOAD_AVG_MAX_N, which < LOAD_AVG_PERIOD * 63.
>>
>> So yes, combined we exceed LOAD_AVG_MAX, which is bad. Let me think what
>> to do about that.
>
>
> So at the very least it should be decay_load(LOAD_AVG_MAX, 1) (aka
> LOAD_AVG_MAX - 1024), but that still doesn't account for the !0
> decay_load() of the first segment.
>
> I'm thinking that we can compute the middle segment, by taking the max
> value and chopping off the ends, like:
>
>
> p
> c2 = 1024 \Sum y^n
> n=1
>
> inf inf
> = 1024 ( \Sum y^n - \Sum y^n - y^0 )
> n=0 n=p
>
>

So this is endemic to what I think is a deeper problem:

The previous rounds of optimization folded weight into load_sum. I
think this introduced a number of correctness problems:

a) the load_avg is no longer independent of weight; a high weight
entity can linger for eons. [63 LOAD_AVG_PERIODS]
b) it breaks the dynamic response of load_avg on a weight change.
While nice is not common, there's a case for which this is really
important: cgroups with a low number of threads running (see the
sketch below). E.g. when we transition from 1->2 threads we
immediately halve the weight, but because of the folding it takes a
very long time to be numerically correct again.
c) It doesn't work with scale_load_down and fractional shares below
SCHED_LOAD_SCALE [we multiply in a zero -> zero rq load]
d) It makes doing stability/clipping above a nightmare.

I think it's actually *costing* us cycles, since we end up multiplying
in the weight every partial [load_sum] update, but we only actually
need to compute it when updating load_avg [which is per period
overflow].
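
To illustrate point (b) with numbers, a minimal standalone sketch (not kernel code, pure floating point, arbitrary values): it compares a sum with the weight folded in against an unweighted sum that is multiplied by the current weight only when forming the average, across the first 32 periods after the weight drops from 2048 to 1024 while the entity keeps running.

#include <stdio.h>

static const double y = 0.97857206;     /* y^32 ~= 0.5 */
#define LOAD_AVG_MAX 47742.0

int main(void)
{
        double folded = 2048.0 * LOAD_AVG_MAX;  /* saturated sum, weight folded in */
        double unweighted = LOAD_AVG_MAX;       /* same history, weight kept outside */
        double weight = 1024.0;                 /* the weight just halved */

        for (int period = 1; period <= 32; period++) {
                /* keep running flat out at the new weight */
                folded = folded * y + weight * 1024.0;
                unweighted = unweighted * y + 1024.0;

                if (period % 8 == 0)
                        printf("period %2d: folded avg %6.0f   unfolded avg %6.0f\n",
                               period, folded / LOAD_AVG_MAX,
                               weight * unweighted / LOAD_AVG_MAX);
        }
        return 0;
}

After 32 periods (about 32ms) the folded average still reads roughly 1536 against a true weight of 1024, while the unfolded one answers 1024 immediately.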

> Which gives something like the below.. Or am I completely off my rocker?
>
> ---
> kernel/sched/fair.c | 70 ++++++++++++++---------------------------------------
> 1 file changed, 18 insertions(+), 52 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 76f67b3e34d6..4f17ec0a378a 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -2744,26 +2744,6 @@ static const u32 runnable_avg_yN_inv[] = {
> };
>
> /*
> - * Precomputed \Sum y^k { 1<=k<=n }. These are floor(true_value) to prevent
> - * over-estimates when re-combining.
> - */
> -static const u32 runnable_avg_yN_sum[] = {
> - 0, 1002, 1982, 2941, 3880, 4798, 5697, 6576, 7437, 8279, 9103,
> - 9909,10698,11470,12226,12966,13690,14398,15091,15769,16433,17082,
> - 17718,18340,18949,19545,20128,20698,21256,21802,22336,22859,23371,
> -};
> -
> -/*
> - * Precomputed \Sum y^k { 1<=k<=n, where n%32=0). Values are rolled down to
> - * lower integers. See Documentation/scheduler/sched-avg.txt how these
> - * were generated:
> - */
> -static const u32 __accumulated_sum_N32[] = {
> - 0, 23371, 35056, 40899, 43820, 45281,
> - 46011, 46376, 46559, 46650, 46696, 46719,
> -};
> -
> -/*
> * Approximate:
> * val * y^n, where y^32 ~= 0.5 (~1 scheduling period)
> */
> @@ -2795,40 +2775,25 @@ static u64 decay_load(u64 val, u64 n)
> return val;
> }
>
> -static u32 __accumulate_sum(u64 periods, u32 period_contrib, u32 remainder)
> +static u32 __accumulate_pelt_segments(u64 periods, u32 d1, u32 d3)
> {
> - u32 c1, c2, c3 = remainder; /* y^0 == 1 */
> -
> - if (!periods)
> - return remainder - period_contrib;
> -
> - if (unlikely(periods >= LOAD_AVG_MAX_N))
> - return LOAD_AVG_MAX;
> + u32 c1, c2, c3 = d3; /* y^0 == 1 */
>
> /*
> * c1 = d1 y^(p+1)
> */
> - c1 = decay_load((u64)(1024 - period_contrib), periods);
> + c1 = decay_load((u64)d1, periods);
>
> - periods -= 1;
> /*
> - * For updates fully spanning n periods, the contribution to runnable
> - * average will be:
> + * p
> + * c2 = 1024 \Sum y^n
> + * n=1
> *
> - * c2 = 1024 \Sum y^n
> - *
> - * We can compute this reasonably efficiently by combining:
> - *
> - * y^PERIOD = 1/2 with precomputed 1024 \Sum y^n {for: n < PERIOD}
> + * inf inf
> + * = 1024 ( \Sum y^n - \Sum y^n - y^0 )
> + * n=0 n=p+1
> */
> - if (likely(periods <= LOAD_AVG_PERIOD)) {
> - c2 = runnable_avg_yN_sum[periods];
> - } else {
> - c2 = __accumulated_sum_N32[periods/LOAD_AVG_PERIOD];
> - periods %= LOAD_AVG_PERIOD;
> - c2 = decay_load(c2, periods);
> - c2 += runnable_avg_yN_sum[periods];
> - }
> + c2 = LOAD_AVG_MAX - decay_load(LOAD_AVG_MAX, periods) - 1024;
>
> return c1 + c2 + c3;
> }
> @@ -2861,8 +2826,8 @@ accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
> unsigned long weight, int running, struct cfs_rq *cfs_rq)
> {
> unsigned long scale_freq, scale_cpu;
> + u32 contrib = delta;
> u64 periods;
> - u32 contrib;
>
> scale_freq = arch_scale_freq_capacity(NULL, cpu);
> scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
> @@ -2880,13 +2845,14 @@ accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
> decay_load(cfs_rq->runnable_load_sum, periods);
> }
> sa->util_sum = decay_load((u64)(sa->util_sum), periods);
> - }
>
> - /*
> - * Step 2
> - */
> - delta %= 1024;
> - contrib = __accumulate_sum(periods, sa->period_contrib, delta);
> + /*
> + * Step 2
> + */
> + delta %= 1024;
> + contrib = __accumulate_pelt_segments(periods,
> + 1024 - sa->period_contrib, delta);
> + }
> sa->period_contrib = delta;
>
> contrib = cap_scale(contrib, scale_freq);

2017-03-31 02:44:50

by Yuyang Du

Subject: Re: [RESEND PATCH 2/2] sched/fair: Optimize __update_sched_avg()

Hi Paul,

On Thu, Mar 30, 2017 at 04:21:08AM -0700, Paul Turner wrote:
> > --- a/kernel/sched/fair.c
> > +++ b/kernel/sched/fair.c
> > @@ -2767,7 +2767,7 @@ static const u32 __accumulated_sum_N32[]
> > * Approximate:
> > * val * y^n, where y^32 ~= 0.5 (~1 scheduling period)
> > */
> > -static __always_inline u64 decay_load(u64 val, u64 n)
> > +static u64 decay_load(u64 val, u64 n)
> > {
> > unsigned int local_n;
> >
> > @@ -2795,32 +2795,113 @@ static __always_inline u64 decay_load(u6
> > return val;
> > }
> >
> > -/*
> > - * For updates fully spanning n periods, the contribution to runnable
> > - * average will be: \Sum 1024*y^n
> > - *
> > - * We can compute this reasonably efficiently by combining:
> > - * y^PERIOD = 1/2 with precomputed \Sum 1024*y^n {for n <PERIOD}
> > - */
> > -static u32 __compute_runnable_contrib(u64 n)
> > +static u32 __accumulate_sum(u64 periods, u32 period_contrib, u32 remainder)
>
> - The naming here is really ambiguous:
> "__accumulate_sum" -> "__accumulate_pelt_segments"?

Yes, sounds better :)

> - Passing in "remainder" seems irrelevant to the sum accumulation. It would be
> more clear to handle it from the caller.

It's relevant, because it "may" be c3. But maybe "remainder" isn't a good name,
as it's actually:

delta %= 1024, in which delta is (now - last_period_boundary).

> >
> > {
> > - u32 contrib = 0;
> > + u32 c1, c2, c3 = remainder; /* y^0 == 1 */
> >
> > - if (likely(n <= LOAD_AVG_PERIOD))
> > - return runnable_avg_yN_sum[n];
> > - else if (unlikely(n >= LOAD_AVG_MAX_N))
> > + if (!periods)
> > + return remainder - period_contrib;
>
> This is super confusing. It only works because remainder already had
> period_contrib aggregated _into_ it. We're literally computing:
> remainder + period_contrib - period_contrib

Sort of, but we're just reusing (delta += period_contrib), which is the simple
way to compute periods. And we hope that what we passed in (delta %= 1024) is
the c3 if (periods). Or we could keep the real "delta" only for the !0 periods
case, which is good too. Yes, making the code as compact as possible produces
confusion, and I have gone too far in that direction.

> We should just not call this in the !periods case and handle the remainder
> below.

It'd be clearer to stick to __accumulate_pelt_segments() being c1 + c2 + c3,
described as step 2.

> > +
> > + if (unlikely(periods >= LOAD_AVG_MAX_N))
> > return LOAD_AVG_MAX;
> >
> > - /* Since n < LOAD_AVG_MAX_N, n/LOAD_AVG_PERIOD < 11 */
> > - contrib = __accumulated_sum_N32[n/LOAD_AVG_PERIOD];
> > - n %= LOAD_AVG_PERIOD;
> > - contrib = decay_load(contrib, n);
> > - return contrib + runnable_avg_yN_sum[n];
> > + /*
> > + * c1 = d1 y^(p+1)
> > + */
> > + c1 = decay_load((u64)(1024 - period_contrib), periods);
> > +
> > + periods -= 1;
> > + /*
> > + * For updates fully spanning n periods, the contribution to runnable
> > + * average will be:
> > + *
> > + * c2 = 1024 \Sum y^n
> > + *
> > + * We can compute this reasonably efficiently by combining:
> > + *
> > + * y^PERIOD = 1/2 with precomputed 1024 \Sum y^n {for: n < PERIOD}
> > + */
> > + if (likely(periods <= LOAD_AVG_PERIOD)) {
> > + c2 = runnable_avg_yN_sum[periods];
> > + } else {
> > + c2 = __accumulated_sum_N32[periods/LOAD_AVG_PERIOD];
>
> This still wants the comment justifying why we can't overflow.

Yes, and it'd be:

/*
* We can't overflow because we would have returned if
* periods >= LOAD_AVG_MAX_N.
*/

> > + periods %= LOAD_AVG_PERIOD;
> > + c2 = decay_load(c2, periods);
> > + c2 += runnable_avg_yN_sum[periods];
> > + }
> > +
> > + return c1 + c2 + c3;
> > }
> >
> > #define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
> >
> > /*
> > + * Accumulate the three separate parts of the sum; d1 the remainder
> > + * of the last (incomplete) period, d2 the span of full periods and d3
> > + * the remainder of the (incomplete) current period.
> > + *
> > + * d1 d2 d3
> > + * ^ ^ ^
> > + * | | |
> > + * |<->|<----------------->|<--->|
> > + * ... |---x---|------| ... |------|-----x (now)
> > + *
> > + * p
> > + * u' = (u + d1) y^(p+1) + 1024 \Sum y^n + d3 y^0
> > + * n=1
> > + *
> > + * = u y^(p+1) + (Step 1)
> > + *
> > + * p
> > + * d1 y^(p+1) + 1024 \Sum y^n + d3 y^0 (Step 2)
> > + * n=1
> > + */
> > +static __always_inline u32
> > +accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
> > + unsigned long weight, int running, struct cfs_rq *cfs_rq)
> > +{
> > + unsigned long scale_freq, scale_cpu;
> > + u64 periods;
> > + u32 contrib;
> > +
> > + scale_freq = arch_scale_freq_capacity(NULL, cpu);
> > + scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
> > +
> > + delta += sa->period_contrib;
> > + periods = delta / 1024; /* A period is 1024us (~1ms) */
> > +
> > + if (periods) {
> > + sa->load_sum = decay_load(sa->load_sum, periods);
> > + if (cfs_rq) {
> > + cfs_rq->runnable_load_sum =
> > + decay_load(cfs_rq->runnable_load_sum, periods);
> > + }
> > + sa->util_sum = decay_load((u64)(sa->util_sum), periods);
>
> Move step 2 here, handle remainder below.

See above.

Thanks,
Yuyang

2017-03-31 02:54:56

by Yuyang Du

Subject: Re: [RESEND PATCH 2/2] sched/fair: Optimize __update_sched_avg()

On Thu, Mar 30, 2017 at 03:46:58PM +0200, Peter Zijlstra wrote:
> On Thu, Mar 30, 2017 at 02:16:58PM +0200, Peter Zijlstra wrote:
> > On Thu, Mar 30, 2017 at 04:21:08AM -0700, Paul Turner wrote:
>
> > > - The naming here is really ambiguous:
> > > "__accumulate_sum" -> "__accumulate_pelt_segments"?
> >
> > OK, I did struggle with that a bit too but failed to improve, I'll change it.
> >
> > > - Passing in "remainder" seems irrelevant to the sum accumulation. It would be
> > > more clear to handle it from the caller.
> >
> > Well, this way we have all 3 delta parts in one function. I'll try it
> > and see what it looks like though.
>
> > > This is super confusing. It only works because remainder already had
> > > period_contrib aggregated _into_ it. We're literally computing:
> > > remainder + period_contrib - period_contrib
> >
> > Correct; although I didn't find it too confusing. Could be because I'd
> > been staring at this code for a few hours though.
> >
> > > We should just not call this in the !periods case and handle the remainder
> > > below.
> >
> > I'll change it see what it looks like.
>
> How's this?

That is good. Only:

> ---
> kernel/sched/fair.c | 22 ++++++++++------------
> 1 file changed, 10 insertions(+), 12 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 76f67b3e34d6..10d34498b5fe 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -2795,12 +2795,9 @@ static u64 decay_load(u64 val, u64 n)
> return val;
> }
>
> -static u32 __accumulate_sum(u64 periods, u32 period_contrib, u32 remainder)
> +static u32 __accumulate_pelt_segments(u64 periods, u32 d1, u32 d3)
> {
> - u32 c1, c2, c3 = remainder; /* y^0 == 1 */
> -
> - if (!periods)
> - return remainder - period_contrib;
> + u32 c1, c2, c3 = d3; /* y^0 == 1 */
>
> if (unlikely(periods >= LOAD_AVG_MAX_N))
> return LOAD_AVG_MAX;
> @@ -2861,8 +2858,8 @@ accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
> unsigned long weight, int running, struct cfs_rq *cfs_rq)
> {
> unsigned long scale_freq, scale_cpu;
> + u32 contrib = delta;

u64 -> u32 may raise some questions, so better to make the cast explicit
to show it is intentional.
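
For illustration, the explicit form being suggested would be something like:

        /* only consumed when !periods, i.e. when delta + period_contrib < 1024 */
        u32 contrib = (u32)delta;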

2017-03-31 03:18:47

by Yuyang Du

Subject: Re: [RESEND PATCH 2/2] sched/fair: Optimize __update_sched_avg()

On Thu, Mar 30, 2017 at 04:14:28PM +0200, Peter Zijlstra wrote:
> On Thu, Mar 30, 2017 at 02:16:58PM +0200, Peter Zijlstra wrote:
> > On Thu, Mar 30, 2017 at 04:21:08AM -0700, Paul Turner wrote:
>
> > > > +
> > > > + if (unlikely(periods >= LOAD_AVG_MAX_N))
> > > > return LOAD_AVG_MAX;
>
> > >
> > > Is this correct in the iterated periods > LOAD_AVG_MAX_N case?
> > > I don't think the decay above is guaranteed to return these to zero.
> >
> > Ah!
> >
> > Indeed, so decay_load() needs LOAD_AVG_PERIOD * 63 before it truncates
> > to 0, because every LOAD_AVG_PERIOD we half the value; loose 1 bit; so
> > 63 of those and we're 0.
> >
> > But __accumulate_sum() OTOH returns LOAD_AVG_MAX after only
> > LOAD_AVG_MAX_N, which < LOAD_AVG_PERIOD * 63.
> >
> > So yes, combined we exceed LOAD_AVG_MAX, which is bad. Let me think what
> > to do about that.
>
>
> So at the very least it should be decay_load(LOAD_AVG_MAX, 1) (aka
> LOAD_AVG_MAX - 1024), but that still doesn't account for the !0
> decay_load() of the first segment.
>
> I'm thinking that we can compute the middle segment, by taking the max
> value and chopping off the ends, like:
>
>
> p
> c2 = 1024 \Sum y^n
> n=1
>
> inf inf
> = 1024 ( \Sum y^n - \Sum y^n - y^0 )
> n=0 n=p

Surprisingly, it kinda works :)

> + c2 = LOAD_AVG_MAX - decay_load(LOAD_AVG_MAX, periods) - 1024;
                        ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
But, I'm not sure this is what you want (just assume p==0).

Thanks,
Yuyang

2017-03-31 03:46:10

by Yuyang Du

Subject: Re: [RESEND PATCH 2/2] sched/fair: Optimize __update_sched_avg()

On Fri, Mar 31, 2017 at 03:13:55AM +0800, Yuyang Du wrote:
> On Thu, Mar 30, 2017 at 04:14:28PM +0200, Peter Zijlstra wrote:
> > On Thu, Mar 30, 2017 at 02:16:58PM +0200, Peter Zijlstra wrote:
> > > On Thu, Mar 30, 2017 at 04:21:08AM -0700, Paul Turner wrote:
> >
> > > > > +
> > > > > + if (unlikely(periods >= LOAD_AVG_MAX_N))
> > > > > return LOAD_AVG_MAX;
> >
> > > >
> > > > Is this correct in the iterated periods > LOAD_AVG_MAX_N case?
> > > > I don't think the decay above is guaranteed to return these to zero.
> > >
> > > Ah!
> > >
> > > Indeed, so decay_load() needs LOAD_AVG_PERIOD * 63 before it truncates
> > > to 0, because every LOAD_AVG_PERIOD we half the value; loose 1 bit; so
> > > 63 of those and we're 0.
> > >
> > > But __accumulate_sum() OTOH returns LOAD_AVG_MAX after only
> > > LOAD_AVG_MAX_N, which < LOAD_AVG_PERIOD * 63.
> > >
> > > So yes, combined we exceed LOAD_AVG_MAX, which is bad. Let me think what
> > > to do about that.
> >
> >
> > So at the very least it should be decay_load(LOAD_AVG_MAX, 1) (aka
> > LOAD_AVG_MAX - 1024), but that still doesn't account for the !0
> > decay_load() of the first segment.
> >
> > I'm thinking that we can compute the middle segment, by taking the max
> > value and chopping off the ends, like:
> >
> >
> >              p
> >   c2 = 1024 \Sum y^n
> >              n=1
> >
> >                inf        inf
> >      = 1024 ( \Sum y^n - \Sum y^n - y^0 )
> >                n=0        n=p
>
> Surprisingly, it kinda works :)
>
> > + c2 = LOAD_AVG_MAX - decay_load(LOAD_AVG_MAX, periods) - 1024;
> ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
> But, I'm not sure this is what you want (just assume p==0).
>

Oh, what I meant is the case when p != 0, i.e. p >= 1.

And thinking about it for a while, it's really what you want, brilliant :)

2017-03-31 07:01:49

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [RESEND PATCH 2/2] sched/fair: Optimize __update_sched_avg()

On Thu, Mar 30, 2017 at 03:02:47PM -0700, Paul Turner wrote:
> On Thu, Mar 30, 2017 at 7:14 AM, Peter Zijlstra <[email protected]> wrote:
> > On Thu, Mar 30, 2017 at 02:16:58PM +0200, Peter Zijlstra wrote:
> >> On Thu, Mar 30, 2017 at 04:21:08AM -0700, Paul Turner wrote:
> >
> >> > > +
> >> > > + if (unlikely(periods >= LOAD_AVG_MAX_N))
> >> > > return LOAD_AVG_MAX;
> >
> >> >
> >> > Is this correct in the iterated periods > LOAD_AVG_MAX_N case?
> >> > I don't think the decay above is guaranteed to return these to zero.
> >>
> >> Ah!
> >>
> >> Indeed, so decay_load() needs LOAD_AVG_PERIOD * 63 before it truncates
> >> to 0, because every LOAD_AVG_PERIOD we half the value; loose 1 bit; so
> >> 63 of those and we're 0.
> >>
> >> But __accumulate_sum() OTOH returns LOAD_AVG_MAX after only
> >> LOAD_AVG_MAX_N, which < LOAD_AVG_PERIOD * 63.
> >>
> >> So yes, combined we exceed LOAD_AVG_MAX, which is bad. Let me think what
> >> to do about that.
> >
> >
> > So at the very least it should be decay_load(LOAD_AVG_MAX, 1) (aka
> > LOAD_AVG_MAX - 1024), but that still doesn't account for the !0
> > decay_load() of the first segment.
> >
> > I'm thinking that we can compute the middle segment, by taking the max
> > value and chopping off the ends, like:
> >
> >
> >              p
> >   c2 = 1024 \Sum y^n
> >              n=1
> >
> >                inf        inf
> >      = 1024 ( \Sum y^n - \Sum y^n - y^0 )
> >                n=0        n=p
> >
> >
>
> So this is endemic to what I think is a deeper problem:

I think not.

47742*e((345/32)*l(.5))
27.12819932019487579284

So even without weight, 345 periods isn't enough to flatten
LOAD_AVG_MAX.
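
The same check as that bc(1) one-liner, as a stand-alone C sketch (user
space, not kernel code):

#include <math.h>
#include <stdio.h>

int main(void)
{
        /* LOAD_AVG_MAX decayed over LOAD_AVG_MAX_N = 345 periods, y^32 = 1/2 */
        double remain = 47742.0 * pow(0.5, 345.0 / 32.0);

        printf("%.20f\n", remain);      /* ~27.128, i.e. nowhere near 0 */
        return 0;
}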

> The previous rounds of optimization folded weight into load_sum. I
> think this introduced a number of correctness problems:

I'll argue with you on correctness; but these might well be problems indeed.

> a) the load_avg is no longer independent of weight; a high weight
> entity can linger for eons. [63 LOAD_AVG_PERIODS]

Certainly longer than before, agreed.

> b) it breaks the dynamic response of load_avg on a weight change.
> While nice is not common, there's a case that this is really important
> for which is cgroups with a low number of threads running. E.g. When we
> transition from 1->2 threads we immediately halve the weight, but
> because of the folding it takes a very large time to be numerically
> correct again.

I think this is a matter of semantics, not correctness. We did have that
weight in the past, so our past average including that isn't incorrect
per se.

Similarly, our vruntime includes increments based on prior weight.

Now; you're arguing that this is undesired, and this might well be.

> c) It doesn't work with scale_load_down and fractional shares below
> SCHED_LOAD_SCALE [we multiply in a zero -> zero rq load]

Yes, I noticed this, and this is indeed undesired.
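
(A tiny user-space illustration of Paul's point c -- the 512 below is
just an assumed effective group-entity weight below SCHED_LOAD_SCALE,
not something computed the way the kernel computes group weights:)

#include <stdio.h>

#define SCHED_FIXEDPOINT_SHIFT  10
#define scale_load_down(w)      ((w) >> SCHED_FIXEDPOINT_SHIFT) /* 64-bit variant */

int main(void)
{
        unsigned long se_weight = 512;  /* assumed group se weight, < SCHED_LOAD_SCALE */
        unsigned long contrib = 1024;   /* one full period worth of running */

        /* what the weight-folded load_sum accumulation multiplies in */
        printf("weight=%lu -> scale_load_down=%lu -> load_sum += %lu\n",
               se_weight, scale_load_down(se_weight),
               scale_load_down(se_weight) * contrib);
        return 0;
}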

> d) It makes doing stability/clipping above a nightmare.

Unsure what exactly you're referring to; the scale_load_down() boundary
at 0? How is this a separate point from c then?

> I think it's actually *costing* us cycles, since we end up multiplying
> in the weight every partial [load_sum] update, but we only actually
> need to compute it when updating load_avg [which is per period
> overflow].

So lets pull it out again -- but I don't think we need to undo all of
yuyang's patches for that. So please, look at the patch I proposed for
the problem you spotted. Lets fix the current state and take it from
there, ok?

2017-03-31 07:14:03

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [RESEND PATCH 2/2] sched/fair: Optimize __update_sched_avg()

On Fri, Mar 31, 2017 at 03:41:18AM +0800, Yuyang Du wrote:

> > >
> > >              p
> > >   c2 = 1024 \Sum y^n
> > >              n=1
> > >
> > >                inf        inf
> > >      = 1024 ( \Sum y^n - \Sum y^n - y^0 )
> > >                n=0        n=p
> >
> > Surprisingly, it kinda works :)
> >
> > > + c2 = LOAD_AVG_MAX - decay_load(LOAD_AVG_MAX, periods) - 1024;
> > ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
> > But, I'm not sure this is what you want (just assume p==0).
> >
>
> Oh, what I meant is the case when p != 0, i.e. p >= 1.

Right, note that we'll never get here with p==0. For p==1 we'll have
c2==0, and for p > 1 we'll have whatever section we wanted.

> And thinking about it for a while, it's really what you want, brilliant :)

Right, because:


  inf                 inf              inf
( \Sum y^n ) y^p  =  \Sum y^(n+p)  =  \Sum y^n
  n=0                 n=0              n=p
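
A quick user-space sanity check of that identity and the resulting
closed form for c2 -- real-valued arithmetic with max = 1024/(1-y)
(~47788), not the kernel's integer code with its rounded LOAD_AVG_MAX
constant:

#include <math.h>
#include <stdio.h>

int main(void)
{
        double y = pow(0.5, 1.0 / 32.0);        /* y^32 = 1/2 */
        double max = 1024.0 / (1.0 - y);        /* 1024 \Sum_{n>=0} y^n */

        for (int periods = 1; periods <= 64; periods *= 2) {
                double direct = 0.0, closed;

                /* c2 the long way: 1024 \Sum_{n=1}^{periods-1} y^n */
                for (int n = 1; n < periods; n++)
                        direct += 1024.0 * pow(y, n);

                /* c2 via the shifted-sum identity: max - max y^periods - 1024 */
                closed = max - max * pow(y, periods) - 1024.0;

                printf("periods=%2d direct=%9.3f closed=%9.3f\n",
                       periods, direct, closed);
        }
        return 0;
}

For periods == 1 both come out 0, matching the note above.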

2017-03-31 09:59:42

by Paul Turner

[permalink] [raw]
Subject: Re: [RESEND PATCH 2/2] sched/fair: Optimize __update_sched_avg()

On Fri, Mar 31, 2017 at 12:01 AM, Peter Zijlstra <[email protected]> wrote:
> On Thu, Mar 30, 2017 at 03:02:47PM -0700, Paul Turner wrote:
>> On Thu, Mar 30, 2017 at 7:14 AM, Peter Zijlstra <[email protected]> wrote:
>> > On Thu, Mar 30, 2017 at 02:16:58PM +0200, Peter Zijlstra wrote:
>> >> On Thu, Mar 30, 2017 at 04:21:08AM -0700, Paul Turner wrote:
>> >
>> >> > > +
>> >> > > + if (unlikely(periods >= LOAD_AVG_MAX_N))
>> >> > > return LOAD_AVG_MAX;
>> >
>> >> >
>> >> > Is this correct in the iterated periods > LOAD_AVG_MAX_N case?
>> >> > I don't think the decay above is guaranteed to return these to zero.
>> >>
>> >> Ah!
>> >>
>> >> Indeed, so decay_load() needs LOAD_AVG_PERIOD * 63 before it truncates
>> >> to 0, because every LOAD_AVG_PERIOD we half the value; loose 1 bit; so
>> >> 63 of those and we're 0.
>> >>
>> >> But __accumulate_sum() OTOH returns LOAD_AVG_MAX after only
>> >> LOAD_AVG_MAX_N, which < LOAD_AVG_PERIOD * 63.
>> >>
>> >> So yes, combined we exceed LOAD_AVG_MAX, which is bad. Let me think what
>> >> to do about that.
>> >
>> >
>> > So at the very least it should be decay_load(LOAD_AVG_MAX, 1) (aka
>> > LOAD_AVG_MAX - 1024), but that still doesn't account for the !0
>> > decay_load() of the first segment.
>> >
>> > I'm thinking that we can compute the middle segment, by taking the max
>> > value and chopping off the ends, like:
>> >
>> >
>> >              p
>> >   c2 = 1024 \Sum y^n
>> >              n=1
>> >
>> >                inf        inf
>> >      = 1024 ( \Sum y^n - \Sum y^n - y^0 )
>> >                n=0        n=p
>> >
>> >
>>
>> So this is endemic to what I think is a deeper problem:
>
> I think not.
>
> 47742*e((345/32)*l(.5))
> 27.12819932019487579284
>
> So even without weight, 345 periods isn't enough to flatten
> LOAD_AVG_MAX.
>
>> The previous rounds of optimization folded weight into load_sum. I
>> think this introduced a number of correctness problems:
>
> I'll argue with you on correctness; but these might well be problems indeed.

Yeah, this side of it will never be numerically perfect. I was just
suggesting that we can more easily clamp to LOAD_AVG_MAX directly if
weight is not accumulated into the sum.

>
>> a) the load_avg is no longer independent of weight; a high weight
>> entity can linger for eons. [63 LOAD_AVG_PERIODS]
>
> Certainly longer than before, agreed.
>
>> b) it breaks the dynamic response of load_avg on a weight change.
>> While nice is not common, there's a case that this is really important
>> for which is cgroups with a low number of threads running. E.g. When we
>> transition from 1->2 threads we immediately halve the weight, but
>> because of the folding it takes a very large time to be numerically
>> correct again.
>
> I think this is a matter of semantics, not correctness. We did have that

Challenge accepted ;)

> weight in the past, so our past average including that isn't incorrect
> per se.

I agree, but it's not correct relative to the numerical
interpretations we actually want to use. We examine these values for
forward-looking decisions, e.g. if we move this thread, how much load
are we moving, as well as for vruntime calculations.

E.g. consider the case above: suppose we have 2 nice-0 threads, one
has been running on 0, one wakes up on 1. The load-balancer will see
a false imbalance between 0 and 1 [as well as potentially other cores],
when they should be ~equal (with some obvious caveats on what the
threads are actually doing).

Further, this weight is specifically bound to cpu 0, meaning that we
will also see inconsistent future behavior, dependent on wake-up
placement.
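
A toy numeric model of that response problem (user-space sketch; the
2048 -> 1024 weight step, the 32-period granularity and the use of the
folded/unfolded load_avg formulas are illustrative assumptions):

#include <math.h>
#include <stdio.h>

#define LOAD_AVG_MAX    47742.0

int main(void)
{
        double y32 = 0.5;                       /* y^32, one LOAD_AVG_PERIOD of decay */
        double seg = 1024.0 * (1.0 - y32) / (1.0 - pow(0.5, 1.0 / 32.0));
                                                /* 1024 \Sum_{n=0}^{31} y^n */
        double folded = 2048.0 * LOAD_AVG_MAX;  /* weight folded into load_sum, w = 2048 */
        double plain  = LOAD_AVG_MAX;           /* weight kept out of load_sum */
        double new_w  = 1024.0;                 /* weight halves: 1 -> 2 runnable threads */

        for (int p = 0; p <= 128; p += 32) {
                printf("+%3d periods: folded avg=%4.0f  unfolded avg=%4.0f\n",
                       p, folded / LOAD_AVG_MAX, new_w * plain / LOAD_AVG_MAX);

                /* run fully loaded for another 32 periods at the new weight */
                folded = folded * y32 + new_w * seg;
                plain  = plain  * y32 + seg;
        }
        return 0;
}

The folded variant is still ~6% high after 128 periods (~128ms), while
the unfolded one tracks the new weight immediately.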

>
> Similarly, our vruntime includes increments based on prior weight.
>
> Now; you're arguing that this is undesired, and this might well be.

Yes, this leads to cross and intra-group fairness issues, taking the
same example as above:
- The thread running on 0 will receive additional time versus the thread on 1
- However, the thread on 1 will be correctly weighted at 50% so
between 0 and 1 we'll see more time in competition with an
equivalently weighted external group than we should.

>
>> c) It doesn't work with scale_load_down and fractional shares below
>> SCHED_LOAD_SCALE [we multiply in a zero -> zero rq load]
>
> Yes, I noticed this, and this is indeed undesired.
>
>> d) It makes doing stability/clipping above a nightmare.
>
> Unsure what exactly you're referring to; the scale_load_down() boundary
> at 0? How is this a separate point from c then?

This is:
a) the clamp to LOAD_AVG_MAX above on overflow [clip]
b) the waggle in weights as the number of runnable group entities
increases/decreases, which means that the sum does not converge nearly
as tightly, even with consistent behavior. [stability]

>
>> I think it's actually *costing* us cycles, since we end up multiplying
>> in the weight every partial [load_sum] update, but we only actually
>> need to compute it when updating load_avg [which is per period
>> overflow].
>
> So lets pull it out again -- but I don't think we need to undo all of
> yuyang's patches for that. So please, look at the patch I proposed for
> the problem you spotted. Lets fix the current state and take it from
> there, ok?

Oh, absolutely. I'm really not proposing re-vulcanizing any
rubber here. Just saying that this particular problem is just one
facet of the weight mixing. 100% agree on fixing this as is and
iterating. I was actually thinking that these changes (which really
nicely simplify the calculation) greatly simplify restoring the
separation there.

2017-03-31 10:56:15

by Paul Turner

[permalink] [raw]
Subject: Re: [RESEND PATCH 2/2] sched/fair: Optimize __update_sched_avg()

On Thu, Mar 30, 2017 at 7:14 AM, Peter Zijlstra <[email protected]> wrote:
> On Thu, Mar 30, 2017 at 02:16:58PM +0200, Peter Zijlstra wrote:
>> On Thu, Mar 30, 2017 at 04:21:08AM -0700, Paul Turner wrote:
>
>> > > +
>> > > + if (unlikely(periods >= LOAD_AVG_MAX_N))
>> > > return LOAD_AVG_MAX;
>
>> >
>> > Is this correct in the iterated periods > LOAD_AVG_MAX_N case?
>> > I don't think the decay above is guaranteed to return these to zero.
>>
>> Ah!
>>
>> Indeed, so decay_load() needs LOAD_AVG_PERIOD * 63 before it truncates
>> to 0, because every LOAD_AVG_PERIOD we half the value; loose 1 bit; so
>> 63 of those and we're 0.
>>
>> But __accumulate_sum() OTOH returns LOAD_AVG_MAX after only
>> LOAD_AVG_MAX_N, which < LOAD_AVG_PERIOD * 63.
>>
>> So yes, combined we exceed LOAD_AVG_MAX, which is bad. Let me think what
>> to do about that.
>
>
> So at the very least it should be decay_load(LOAD_AVG_MAX, 1) (aka
> LOAD_AVG_MAX - 1024), but that still doesn't account for the !0
> decay_load() of the first segment.
>
> I'm thinking that we can compute the middle segment, by taking the max
> value and chopping off the ends, like:
>

>
>              p
>   c2 = 1024 \Sum y^n
>              n=1
>
>                inf        inf
>      = 1024 ( \Sum y^n - \Sum y^n - y^0 )
>                n=0        n=p
>

Very nice!
Minor nit: Second sum needs to be from n=p+1

>
> Which gives something like the below.. Or am I completely off my rocker?
>
> ---
> kernel/sched/fair.c | 70 ++++++++++++++---------------------------------------
> 1 file changed, 18 insertions(+), 52 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 76f67b3e34d6..4f17ec0a378a 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -2744,26 +2744,6 @@ static const u32 runnable_avg_yN_inv[] = {
> };
>
> /*
> - * Precomputed \Sum y^k { 1<=k<=n }. These are floor(true_value) to prevent
> - * over-estimates when re-combining.
> - */
> -static const u32 runnable_avg_yN_sum[] = {
> - 0, 1002, 1982, 2941, 3880, 4798, 5697, 6576, 7437, 8279, 9103,
> - 9909,10698,11470,12226,12966,13690,14398,15091,15769,16433,17082,
> - 17718,18340,18949,19545,20128,20698,21256,21802,22336,22859,23371,
> -};
> -
> -/*
> - * Precomputed \Sum y^k { 1<=k<=n, where n%32=0). Values are rolled down to
> - * lower integers. See Documentation/scheduler/sched-avg.txt how these
> - * were generated:
> - */
> -static const u32 __accumulated_sum_N32[] = {
> - 0, 23371, 35056, 40899, 43820, 45281,
> - 46011, 46376, 46559, 46650, 46696, 46719,
> -};
> -
> -/*
> * Approximate:
> * val * y^n, where y^32 ~= 0.5 (~1 scheduling period)
> */
> @@ -2795,40 +2775,25 @@ static u64 decay_load(u64 val, u64 n)
> return val;
> }
>
> -static u32 __accumulate_sum(u64 periods, u32 period_contrib, u32 remainder)
> +static u32 __accumulate_pelt_segments(u64 periods, u32 d1, u32 d3)
> {
> - u32 c1, c2, c3 = remainder; /* y^0 == 1 */
> -
> - if (!periods)
> - return remainder - period_contrib;
> -
> - if (unlikely(periods >= LOAD_AVG_MAX_N))
> - return LOAD_AVG_MAX;
> + u32 c1, c2, c3 = d3; /* y^0 == 1 */
>
> /*
> * c1 = d1 y^(p+1)
> */
> - c1 = decay_load((u64)(1024 - period_contrib), periods);
> + c1 = decay_load((u64)d1, periods);
>
> - periods -= 1;
> /*
> - * For updates fully spanning n periods, the contribution to runnable
> - * average will be:
> + * p
> + * c2 = 1024 \Sum y^n
> + * n=1
> *
> - * c2 = 1024 \Sum y^n
> - *
> - * We can compute this reasonably efficiently by combining:
> - *
> - * y^PERIOD = 1/2 with precomputed 1024 \Sum y^n {for: n < PERIOD}
> + * inf inf
> + * = 1024 ( \Sum y^n - \Sum y^n - y^0 )
> + * n=0 n=p+1
> */
> - if (likely(periods <= LOAD_AVG_PERIOD)) {
> - c2 = runnable_avg_yN_sum[periods];
> - } else {
> - c2 = __accumulated_sum_N32[periods/LOAD_AVG_PERIOD];
> - periods %= LOAD_AVG_PERIOD;
> - c2 = decay_load(c2, periods);
> - c2 += runnable_avg_yN_sum[periods];
> - }
> + c2 = LOAD_AVG_MAX - decay_load(LOAD_AVG_MAX, periods) - 1024;

decay_load(LOAD_AVG_MAX, periods + 1)

I computed all the values vs the true value that the old/new computations
result in, and it's very close. In absolute terms it's approximately 2x off
the previous computation, e.g. if the old value was -15 (relative to the
true value) then the new computation is -30.

This is definitely more than good enough. If we want more precision,
then the correction factor of:
+clamp(periods, 0, 45)

Makes it almost perfect across the board (and more accurate than the
prior calculation). Usefully it results in almost zero error in the
common cases of 0-25ms.

Want to fold this with the other patch above?

Reviewed-by: Paul Turner <[email protected]>

>
> return c1 + c2 + c3;
> }
> @@ -2861,8 +2826,8 @@ accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
> unsigned long weight, int running, struct cfs_rq *cfs_rq)
> {
> unsigned long scale_freq, scale_cpu;
> + u32 contrib = delta;
> u64 periods;
> - u32 contrib;
>
> scale_freq = arch_scale_freq_capacity(NULL, cpu);
> scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
> @@ -2880,13 +2845,14 @@ accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
> decay_load(cfs_rq->runnable_load_sum, periods);
> }
> sa->util_sum = decay_load((u64)(sa->util_sum), periods);
> - }
>
> - /*
> - * Step 2
> - */
> - delta %= 1024;
> - contrib = __accumulate_sum(periods, sa->period_contrib, delta);
> + /*
> + * Step 2
> + */
> + delta %= 1024;
> + contrib = __accumulate_pelt_segments(periods,
> + 1024 - sa->period_contrib, delta);
> + }
> sa->period_contrib = delta;
>
> contrib = cap_scale(contrib, scale_freq);

2017-03-31 11:24:13

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [RESEND PATCH 2/2] sched/fair: Optimize __update_sched_avg()

On Fri, Mar 31, 2017 at 02:58:57AM -0700, Paul Turner wrote:
> > So lets pull it out again -- but I don't think we need to undo all of
> > yuyang's patches for that. So please, look at the patch I proposed for
> > the problem you spotted. Lets fix the current state and take it from
> > there, ok?
>
> Oh, absolutely. I'm really not proposing re-vulcanizing any
> rubber here. Just saying that this particular problem is just one
> facet of the weight mixing. 100% agree on fixing this as is and
> iterating. I was actually thinking that these changes (which really
> nicely simplify the calculation) greatly simplify restoring the
> separation there.

So on top of the previous patch, I've arrived (without testing or
considering numerical overflow much) at the following patch.

It's known broken for things like update_cfs_rq_load_avg() where doing
this will unconditionally flatten load_sum, which seems undesired. Need
to think more on this.

Also, I've pulled out the cpufreq scaling, which I'm not sure is the
right thing to do. Vincent wants to scale time, which would be
persistent in the history, unlike this now.

But yes, this looks doable and simpler than before.

---

--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -313,7 +313,7 @@ struct load_weight {
*/
struct sched_avg {
u64 last_update_time;
- u64 load_sum;
+ u32 load_sum;
u32 util_sum;
u32 period_contrib;
unsigned long load_avg;
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -747,8 +747,8 @@ void init_entity_runnable_average(struct
* nothing has been attached to the task group yet.
*/
if (entity_is_task(se))
- sa->load_avg = scale_load_down(se->load.weight);
- sa->load_sum = sa->load_avg * LOAD_AVG_MAX;
+ sa->load_avg = se->load.weight;
+ sa->load_sum = LOAD_AVG_MAX;
/*
* At this point, util_avg won't be used in select_task_rq_fair anyway
*/
@@ -2820,15 +2820,11 @@ static u32 __accumulate_pelt_segments(u6
*/
static __always_inline u32
accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
- unsigned long weight, int running, struct cfs_rq *cfs_rq)
+ bool load, bool running, struct cfs_rq *cfs_rq)
{
- unsigned long scale_freq, scale_cpu;
- u32 contrib = (u32)delta; /* p == 0 -> delta < 1024 */
+ u32 contrib = (u32)delta;
u64 periods;

- scale_freq = arch_scale_freq_capacity(NULL, cpu);
- scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
-
delta += sa->period_contrib;
periods = delta / 1024; /* A period is 1024us (~1ms) */

@@ -2852,14 +2848,13 @@ accumulate_sum(u64 delta, int cpu, struc
}
sa->period_contrib = delta;

- contrib = cap_scale(contrib, scale_freq);
- if (weight) {
- sa->load_sum += weight * contrib;
+ if (load) {
+ sa->load_sum += contrib;
if (cfs_rq)
- cfs_rq->runnable_load_sum += weight * contrib;
+ cfs_rq->runnable_load_sum += contrib;
}
if (running)
- sa->util_sum += contrib * scale_cpu;
+ sa->util_sum += contrib;

return periods;
}
@@ -2894,9 +2889,10 @@ accumulate_sum(u64 delta, int cpu, struc
*/
static __always_inline int
___update_load_avg(u64 now, int cpu, struct sched_avg *sa,
- unsigned long weight, int running, struct cfs_rq *cfs_rq)
+ unsigned long weight, bool running, struct cfs_rq *cfs_rq)
{
- u64 delta;
+ unsigned long scale_freq, scale_cpu;
+ u64 delta, load;

delta = now - sa->last_update_time;
/*
@@ -2924,18 +2920,22 @@ ___update_load_avg(u64 now, int cpu, str
* Step 1: accumulate *_sum since last_update_time. If we haven't
* crossed period boundaries, finish.
*/
- if (!accumulate_sum(delta, cpu, sa, weight, running, cfs_rq))
+ if (!accumulate_sum(delta, cpu, sa, !!weight, running, cfs_rq))
return 0;

+ scale_freq = arch_scale_freq_capacity(NULL, cpu);
+ scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
+
/*
* Step 2: update *_avg.
*/
- sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX);
+ load = cap_scale(weight, scale_freq);
+ sa->load_avg = div_u64(load * sa->load_sum, LOAD_AVG_MAX);
if (cfs_rq) {
cfs_rq->runnable_load_avg =
- div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX);
+ div_u64(load * cfs_rq->runnable_load_sum, LOAD_AVG_MAX);
}
- sa->util_avg = sa->util_sum / LOAD_AVG_MAX;
+ sa->util_avg = (cap_scale(scale_cpu, scale_freq) * sa->util_sum) / LOAD_AVG_MAX;

return 1;
}
@@ -2950,7 +2950,7 @@ static int
__update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se)
{
return ___update_load_avg(now, cpu, &se->avg,
- se->on_rq * scale_load_down(se->load.weight),
+ se->on_rq * se->load.weight,
cfs_rq->curr == se, NULL);
}

@@ -2958,8 +2958,7 @@ static int
__update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq)
{
return ___update_load_avg(now, cpu, &cfs_rq->avg,
- scale_load_down(cfs_rq->load.weight),
- cfs_rq->curr != NULL, cfs_rq);
+ cfs_rq->load.weight, cfs_rq->curr != NULL, cfs_rq);
}

/*
@@ -3130,11 +3129,11 @@ update_tg_cfs_load(struct cfs_rq *cfs_rq

/* Set new sched_entity's load */
se->avg.load_avg = load;
- se->avg.load_sum = se->avg.load_avg * LOAD_AVG_MAX;
+ se->avg.load_sum = LOAD_AVG_MAX;

/* Update parent cfs_rq load */
add_positive(&cfs_rq->avg.load_avg, delta);
- cfs_rq->avg.load_sum = cfs_rq->avg.load_avg * LOAD_AVG_MAX;
+ cfs_rq->avg.load_sum = LOAD_AVG_MAX;

/*
* If the sched_entity is already enqueued, we also have to update the
@@ -3143,7 +3142,7 @@ update_tg_cfs_load(struct cfs_rq *cfs_rq
if (se->on_rq) {
/* Update parent cfs_rq runnable_load_avg */
add_positive(&cfs_rq->runnable_load_avg, delta);
- cfs_rq->runnable_load_sum = cfs_rq->runnable_load_avg * LOAD_AVG_MAX;
+ cfs_rq->runnable_load_sum = LOAD_AVG_MAX;
}
}


2017-03-31 11:30:13

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [RESEND PATCH 2/2] sched/fair: Optimize __update_sched_avg()

On Fri, Mar 31, 2017 at 02:58:57AM -0700, Paul Turner wrote:

> I agree, but it's not correct relative to the numerical
> interpretations we actually want to use. We examine these values for
> forward-looking decisions, e.g. if we move this thread, how much load
> are we moving and vruntime calculations.

Ah, good point that. I had only considered numerical 'correctness', not
interpretive.


> Oh, absolutely. I'm really not proposing re-vulcanizing any
> rubber here. Just saying that this particular problem is just one
> facet of the weight mixing. 100% agree on fixing this as is and
> iterating.

So I have the below; please have a look.

---
Subject: sched: Fix corner case in __accumulate_sum()
From: Peter Zijlstra <[email protected]>
Date: Fri Mar 31 10:51:41 CEST 2017

Paul noticed that in the (periods >= LOAD_AVG_MAX_N) case in
__accumulate_sum(), the returned contribution value (LOAD_AVG_MAX) is
incorrect.

This is because at this point, the decay_load() on the old state --
the first step in accumulate_sum() -- will not have resulted in 0, and
will therefore result in a sum larger than the maximum value of our
series. Obviously broken.

Note that:

decay_load(LOAD_AVG_MAX, LOAD_AVG_MAX_N) =

           1   (345 / 32)
   47742 * - ^            = ~27
           2

Not to mention that any further contribution from the d3 segment (our
new period) would also push it over the maximum.

Solve this by noting that we can write our c2 term:

             p
  c2 = 1024 \Sum y^n
             n=1

In terms of our maximum value:

              inf               inf          p
  max = 1024 \Sum y^n = 1024 ( \Sum y^n  +  \Sum y^n + y^0 )
              n=0               n=p+1        n=1

Further note that:

     inf                inf              inf
  ( \Sum y^n ) y^p  =  \Sum y^(n+p)  =  \Sum y^n
     n=0                n=0              n=p

Combined that gives us:

             p
  c2 = 1024 \Sum y^n
             n=1

               inf        inf
     = 1024 ( \Sum y^n - \Sum y^n - y^0 )
               n=0        n=p+1

     = max - (max y^(p+1)) - 1024

Further simplify things by dealing with p=0 early on.

Cc: Yuyang Du <[email protected]>
Fixes: a481db34b9be ("sched/fair: Optimize ___update_sched_avg()")
Reported-by: Paul Turner <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
---
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -727,7 +727,6 @@ static unsigned long task_h_load(struct
*/
#define LOAD_AVG_PERIOD 32
#define LOAD_AVG_MAX 47742 /* maximum possible load avg */
-#define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_AVG_MAX */

/* Give new sched_entity start runnable values to heavy its load in infant time */
void init_entity_runnable_average(struct sched_entity *se)
@@ -2744,26 +2743,6 @@ static const u32 runnable_avg_yN_inv[] =
};

/*
- * Precomputed \Sum y^k { 1<=k<=n }. These are floor(true_value) to prevent
- * over-estimates when re-combining.
- */
-static const u32 runnable_avg_yN_sum[] = {
- 0, 1002, 1982, 2941, 3880, 4798, 5697, 6576, 7437, 8279, 9103,
- 9909,10698,11470,12226,12966,13690,14398,15091,15769,16433,17082,
- 17718,18340,18949,19545,20128,20698,21256,21802,22336,22859,23371,
-};
-
-/*
- * Precomputed \Sum y^k { 1<=k<=n, where n%32=0). Values are rolled down to
- * lower integers. See Documentation/scheduler/sched-avg.txt how these
- * were generated:
- */
-static const u32 __accumulated_sum_N32[] = {
- 0, 23371, 35056, 40899, 43820, 45281,
- 46011, 46376, 46559, 46650, 46696, 46719,
-};
-
-/*
* Approximate:
* val * y^n, where y^32 ~= 0.5 (~1 scheduling period)
*/
@@ -2771,9 +2750,7 @@ static u64 decay_load(u64 val, u64 n)
{
unsigned int local_n;

- if (!n)
- return val;
- else if (unlikely(n > LOAD_AVG_PERIOD * 63))
+ if (unlikely(n > LOAD_AVG_PERIOD * 63))
return 0;

/* after bounds checking we can collapse to 32-bit */
@@ -2795,40 +2772,25 @@ static u64 decay_load(u64 val, u64 n)
return val;
}

-static u32 __accumulate_sum(u64 periods, u32 period_contrib, u32 remainder)
+static u32 __accumulate_pelt_segments(u64 periods, u32 d1, u32 d3)
{
- u32 c1, c2, c3 = remainder; /* y^0 == 1 */
-
- if (!periods)
- return remainder - period_contrib;
-
- if (unlikely(periods >= LOAD_AVG_MAX_N))
- return LOAD_AVG_MAX;
+ u32 c1, c2, c3 = d3; /* y^0 == 1 */

/*
* c1 = d1 y^(p+1)
*/
- c1 = decay_load((u64)(1024 - period_contrib), periods);
+ c1 = decay_load((u64)d1, periods);

- periods -= 1;
/*
- * For updates fully spanning n periods, the contribution to runnable
- * average will be:
- *
- * c2 = 1024 \Sum y^n
+ * p
+ * c2 = 1024 \Sum y^n
+ * n=1
*
- * We can compute this reasonably efficiently by combining:
- *
- * y^PERIOD = 1/2 with precomputed 1024 \Sum y^n {for: n < PERIOD}
+ * inf inf
+ * = 1024 ( \Sum y^n - \Sum y^n - y^0 )
+ * n=0 n=p+1
*/
- if (likely(periods <= LOAD_AVG_PERIOD)) {
- c2 = runnable_avg_yN_sum[periods];
- } else {
- c2 = __accumulated_sum_N32[periods/LOAD_AVG_PERIOD];
- periods %= LOAD_AVG_PERIOD;
- c2 = decay_load(c2, periods);
- c2 += runnable_avg_yN_sum[periods];
- }
+ c2 = LOAD_AVG_MAX - decay_load(LOAD_AVG_MAX, periods) - 1024;

return c1 + c2 + c3;
}
@@ -2861,8 +2823,8 @@ accumulate_sum(u64 delta, int cpu, struc
unsigned long weight, int running, struct cfs_rq *cfs_rq)
{
unsigned long scale_freq, scale_cpu;
+ u32 contrib = (u32)delta; /* p == 0 -> delta < 1024 */
u64 periods;
- u32 contrib;

scale_freq = arch_scale_freq_capacity(NULL, cpu);
scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
@@ -2880,13 +2842,14 @@ accumulate_sum(u64 delta, int cpu, struc
decay_load(cfs_rq->runnable_load_sum, periods);
}
sa->util_sum = decay_load((u64)(sa->util_sum), periods);
- }

- /*
- * Step 2
- */
- delta %= 1024;
- contrib = __accumulate_sum(periods, sa->period_contrib, delta);
+ /*
+ * Step 2
+ */
+ delta %= 1024;
+ contrib = __accumulate_pelt_segments(periods,
+ 1024 - sa->period_contrib, delta);
+ }
sa->period_contrib = delta;

contrib = cap_scale(contrib, scale_freq);
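
For completeness, a small user-space sketch (double precision, not the
kernel's fixed point arithmetic) of what the hunk above changes in the
periods >= LOAD_AVG_MAX_N case; d1 and d3 are arbitrary illustrative
partial-period contributions and the old history is assumed saturated:

#include <math.h>
#include <stdio.h>

#define LOAD_AVG_MAX    47742.0
#define LOAD_AVG_MAX_N  345

static double decay(double val, int periods)
{
        return val * pow(0.5, periods / 32.0);  /* y^periods, y^32 = 1/2 */
}

int main(void)
{
        int periods = LOAD_AVG_MAX_N;           /* >= LOAD_AVG_MAX_N hit the old early return */
        double d1 = 512.0, d3 = 512.0;          /* illustrative partial-period contributions */
        double old_sum = LOAD_AVG_MAX - d1;     /* saturated history, period_contrib = 1024 - d1 */

        /* old: contrib clamped to LOAD_AVG_MAX while the decayed history is not yet 0 */
        double old_total = decay(old_sum, periods) + LOAD_AVG_MAX;

        /* new: c1 + c2 + c3, with c2 = max - max y^p - 1024 */
        double c1 = decay(d1, periods);
        double c2 = LOAD_AVG_MAX - decay(LOAD_AVG_MAX, periods) - 1024.0;
        double new_total = decay(old_sum, periods) + c1 + c2 + d3;

        printf("old: %.1f  new: %.1f  max: %.1f\n", old_total, new_total, LOAD_AVG_MAX);
        return 0;
}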

2017-03-31 11:39:03

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [RESEND PATCH 2/2] sched/fair: Optimize __update_sched_avg()

On Fri, Mar 31, 2017 at 03:55:40AM -0700, Paul Turner wrote:

> > I'm thinking that we can compute the middle segment, by taking the max
> > value and chopping off the ends, like:
> >
>
> >
> >              p
> >   c2 = 1024 \Sum y^n
> >              n=1
> >
> >                inf        inf
> >      = 1024 ( \Sum y^n - \Sum y^n - y^0 )
> >                n=0        n=p
> >
>
> Very nice!
> Minor nit: Second sum needs to be from n=p+1

Correct.

> > +static u32 __accumulate_pelt_segments(u64 periods, u32 d1, u32 d3)
> > {
> > + u32 c1, c2, c3 = d3; /* y^0 == 1 */
> >
> > /*
> > * c1 = d1 y^(p+1)
> > */
> > + c1 = decay_load((u64)d1, periods);
> >
> > /*
> > + * p
> > + * c2 = 1024 \Sum y^n
> > + * n=1
> > *
> > + * inf inf
> > + * = 1024 ( \Sum y^n - \Sum y^n - y^0 )
> > + * n=0 n=p+1
> > */
> > + c2 = LOAD_AVG_MAX - decay_load(LOAD_AVG_MAX, periods) - 1024;
>
> decay_load(LOAD_AVG_MAX, periods + 1)

So here, @periods == p+1, see also c1. Yes, this is confusing [*].

In particular, I think the decay terms for c1 and this should be the
same. We cut off this tail end of the series to replace it with c1 after
all.

[*] hysterically p used to be off by 1, which is where the p+1 came
from, but now periods includes it. I was thinking of doing a patch
correcting all the comments to fully eradicate the whole +1 business.


> I computed all the values vs the true value that the old/new computations
> result in, and it's very close. In absolute terms it's approximately 2x off
> the previous computation, e.g. if the old value was -15 (relative to the
> true value) then the new computation is -30.
>
> This is definitely more than good enough. If we want more precision,
> then the correction factor of:
> +clamp(periods, 0, 45)

Can you do a patch with a coherent comment explaining where that
correction term comes from?

2017-04-10 07:39:15

by Dietmar Eggemann

[permalink] [raw]
Subject: Re: [RESEND PATCH 2/2] sched/fair: Optimize __update_sched_avg()



On 31/03/17 12:23, Peter Zijlstra wrote:
> On Fri, Mar 31, 2017 at 02:58:57AM -0700, Paul Turner wrote:
>>> So lets pull it out again -- but I don't think we need to undo all of
>>> yuyang's patches for that. So please, look at the patch I proposed for
>>> the problem you spotted. Lets fix the current state and take it from
>>> there, ok?
>>
>> Oh, absolutely. I'm really not proposing re-vulcanizing any
>> rubber here. Just saying that this particular problem is just one
>> facet of the weight mixing. 100% agree on fixing this as is and
>> iterating. I was actually thinking that these changes (which really
>> nicely simplify the calculation) greatly simplify restoring the
>> separation there.
>
> So on top of the previous patch, I've arrived (without testing or
> considering numerical overflow much) at the following patch.

I did some testing (performance gov, no big.little) with this, and for a
single task the load_avg values are much higher now on 64-bit because of
the missing scale_load_down() on weight.
Are you planning to get rid of scale_load_down() for weight because of
the correctness problems of the signal mentioned by Paul in this thread?

"... c) It doesn't work with scale_load_down and fractional shares below
SCHED_LOAD_SCALE [we multiply in a zero -> zero rq load] ... "

>
> It's known broken for things like update_cfs_rq_load_avg() where doing
> this will unconditionally flatten load_sum, which seems undesired. Need
> to think more on this.
>
> Also, I've pulled out the cpufreq scaling, which I'm not sure is the
> right thing to do. Vincent wants to scale time, which would be
> persistent in the history, unlike this now.

This patch moves the freq and cpu scaling from accumulate_sum() to ___update_load_avg()?

[...]

> @@ -2820,15 +2820,11 @@ static u32 __accumulate_pelt_segments(u6
> */
> static __always_inline u32
> accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
> - unsigned long weight, int running, struct cfs_rq *cfs_rq)
> + bool load, bool running, struct cfs_rq *cfs_rq)
> {
> - unsigned long scale_freq, scale_cpu;
> - u32 contrib = (u32)delta; /* p == 0 -> delta < 1024 */
> + u32 contrib = (u32)delta;
> u64 periods;
>
> - scale_freq = arch_scale_freq_capacity(NULL, cpu);
> - scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
> -
> delta += sa->period_contrib;
> periods = delta / 1024; /* A period is 1024us (~1ms) */
>
> @@ -2852,14 +2848,13 @@ accumulate_sum(u64 delta, int cpu, struc
> }
> sa->period_contrib = delta;
>
> - contrib = cap_scale(contrib, scale_freq);
> - if (weight) {
> - sa->load_sum += weight * contrib;
> + if (load) {
> + sa->load_sum += contrib;
> if (cfs_rq)
> - cfs_rq->runnable_load_sum += weight * contrib;
> + cfs_rq->runnable_load_sum += contrib;
> }
> if (running)
> - sa->util_sum += contrib * scale_cpu;
> + sa->util_sum += contrib;
>
> return periods;
> }
> @@ -2894,9 +2889,10 @@ accumulate_sum(u64 delta, int cpu, struc
> */
> static __always_inline int
> ___update_load_avg(u64 now, int cpu, struct sched_avg *sa,
> - unsigned long weight, int running, struct cfs_rq *cfs_rq)
> + unsigned long weight, bool running, struct cfs_rq *cfs_rq)
> {
> - u64 delta;
> + unsigned long scale_freq, scale_cpu;
> + u64 delta, load;
>
> delta = now - sa->last_update_time;
> /*
> @@ -2924,18 +2920,22 @@ ___update_load_avg(u64 now, int cpu, str
> * Step 1: accumulate *_sum since last_update_time. If we haven't
> * crossed period boundaries, finish.
> */
> - if (!accumulate_sum(delta, cpu, sa, weight, running, cfs_rq))
> + if (!accumulate_sum(delta, cpu, sa, !!weight, running, cfs_rq))
> return 0;
>
> + scale_freq = arch_scale_freq_capacity(NULL, cpu);
> + scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
> +
> /*
> * Step 2: update *_avg.
> */
> - sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX);
> + load = cap_scale(weight, scale_freq);
> + sa->load_avg = div_u64(load * sa->load_sum, LOAD_AVG_MAX);
> if (cfs_rq) {
> cfs_rq->runnable_load_avg =
> - div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX);
> + div_u64(load * cfs_rq->runnable_load_sum, LOAD_AVG_MAX);
> }
> - sa->util_avg = sa->util_sum / LOAD_AVG_MAX;
> + sa->util_avg = (cap_scale(scale_cpu, scale_freq) * sa->util_sum) / LOAD_AVG_MAX;
>
> return 1;
> }

[...]

2017-04-10 08:40:48

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [RESEND PATCH 2/2] sched/fair: Optimize __update_sched_avg()

On Mon, Apr 10, 2017 at 08:39:10AM +0100, Dietmar Eggemann wrote:

> Are you planning to get rid of scale_load_down() for weight because of
> the correctness problems of the signal mentioned by Paul in this thread?

Yes; it would be good to (again) use all 20 bits of weight for the
load_avg things. It was one of the reasons we went to 20 in the first
place after all.

The patch was a quick draft and known broken, but thanks for having a
look.

2017-04-10 10:47:33

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [RESEND PATCH 2/2] sched/fair: Optimize __update_sched_avg()

On Fri, Mar 31, 2017 at 01:38:55PM +0200, Peter Zijlstra wrote:
> So here, @periods == p+1, see also c1. Yes, this is confusing [*].

> [*] hysterically p used to be off by 1, which is where the p+1 came
> from, but now periods includes it. I was thinking of doing a patch
> correcting all the comments to fully eradicate the whole +1 business.


Something like so, which also makes it obvious that p == 0 is not 'right'.


--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2777,18 +2777,18 @@ static u32 __accumulate_pelt_segments(u6
u32 c1, c2, c3 = d3; /* y^0 == 1 */

/*
- * c1 = d1 y^(p+1)
+ * c1 = d1 y^p
*/
c1 = decay_load((u64)d1, periods);

/*
- * p
+ * p-1
* c2 = 1024 \Sum y^n
* n=1
*
* inf inf
* = 1024 ( \Sum y^n - \Sum y^n - y^0 )
- * n=0 n=p+1
+ * n=0 n=p
*/
c2 = LOAD_AVG_MAX - decay_load(LOAD_AVG_MAX, periods) - 1024;

@@ -2808,15 +2808,15 @@ static u32 __accumulate_pelt_segments(u6
* |<->|<----------------->|<--->|
* ... |---x---|------| ... |------|-----x (now)
*
- * p
- * u' = (u + d1) y^(p+1) + 1024 \Sum y^n + d3 y^0
- * n=1
+ * p-1
+ * u' = (u + d1) y^p + 1024 \Sum y^n + d3 y^0
+ * n=1
*
- * = u y^(p+1) + (Step 1)
+ * = u y^p + (Step 1)
*
- * p
- * d1 y^(p+1) + 1024 \Sum y^n + d3 y^0 (Step 2)
- * n=1
+ * p-1
+ * d1 y^p + 1024 \Sum y^n + d3 y^0 (Step 2)
+ * n=1
*/
static __always_inline u32
accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,


> > I computed all the values vs the true value that the old/new computations
> > result in, and it's very close. In absolute terms it's approximately 2x off
> > the previous computation, e.g. if the old value was -15 (relative to the
> > true value) then the new computation is -30.
> >
> > This is definitely more than good enough. If we want more precision,
> > then the correction factor of:
> > +clamp(periods, 0, 45)
>
> Can you do a patch with a coherent comment explaining where that
> correction term comes from?

ping?