2012-08-23 14:17:12

by Paul Turner

[permalink] [raw]
Subject: [patch 14/16] sched: make __update_entity_runnable_avg() fast

From: Paul Turner <[email protected]>

__update_entity_runnable_avg forms the core of maintaining an entity's runnable
load average. In this function we charge the accumulated run-time since last
update and handle appropriate decay. In some cases, e.g. a waking task, this
time interval may be much larger than our period unit.

Fortunately we can exploit some properties of our series to perform decay for a
blocked update in constant time and account the contribution for a running
update in essentially-constant* time.

[*]: For any running entity they should be performing updates at the tick which
gives us a soft limit of 1 jiffy between updates, and we can compute up to a
32 jiffy update in a single pass.

Signed-off-by: Paul Turner <[email protected]>
Reviewed-by: Ben Segall <[email protected]>
---
kernel/sched/fair.c | 123 +++++++++++++++++++++++++++++++++++++++++----------
1 files changed, 99 insertions(+), 24 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 12e9ae5..b249371 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -879,17 +879,90 @@ static inline void update_cfs_shares(struct cfs_rq *cfs_rq)

#ifdef CONFIG_SMP
/*
+ * We choose a half-life close to 1 scheduling period.
+ * Note: The tables below are dependent on this value.
+ */
+#define LOAD_AVG_PERIOD 32
+#define LOAD_AVG_MAX 47765 /* maximum possible load avg */
+#define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_MAX_AVG */
+
+/* Precomputed fixed inverse multiplies for multiplication by y^n */
+static const u32 runnable_avg_yN_inv[] = {
+ 0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
+ 0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85,
+ 0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581,
+ 0xad583ee9, 0xa9a15ab4, 0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9,
+ 0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 0x8b95c1e3, 0x88980e80,
+ 0x85aac367, 0x82cd8698,
+};
+
+/*
+ * Precomputed \Sum y^k { 1<=k<=n }. These are floor(true_value) to prevent
+ * over-estimates when re-combining.
+ */
+static const u32 runnable_avg_yN_sum[] = {
+ 0, 1002, 1982, 2941, 3880, 4798, 5697, 6576, 7437, 8279, 9103,
+ 9909,10698,11470,12226,12966,13690,14398,15091,15769,16433,17082,
+ 17718,18340,18949,19545,20128,20698,21256,21802,22336,22859,23371,
+};
+
+/*
* Approximate:
* val * y^n, where y^32 ~= 0.5 (~1 scheduling period)
*/
static __always_inline u64 decay_load(u64 val, u64 n)
{
- for (; n && val; n--) {
- val *= 4008;
- val >>= 12;
+ int local_n;
+ if (!n)
+ return val;
+ else if (unlikely(n > LOAD_AVG_PERIOD * 63))
+ return 0;
+
+ /* will be 32 bits if that's desirable */
+ local_n = n;
+
+ /*
+ * As y^PERIOD = 1/2, we can combine
+ * y^n = 1/2^(n/PERIOD) * k^(n%PERIOD)
+ * With a look-up table which covers k^n (n<PERIOD)
+ *
+ * To achieve constant time decay_load.
+ */
+ if (unlikely(local_n >= LOAD_AVG_PERIOD)) {
+ val >>= local_n / LOAD_AVG_PERIOD;
+ n %= LOAD_AVG_PERIOD;
}

- return val;
+ val *= runnable_avg_yN_inv[local_n];
+ return SRR(val, 32);
+}
+
+/*
+ * For updates fully spanning n periods, the contribution to runnable
+ * average will be: \Sum 1024*y^n
+ *
+ * We can compute this reasonably efficiently by combining:
+ * y^PERIOD = 1/2 with precomputed \Sum 1024*y^n {for n <PERIOD}
+ */
+static u32 __compute_runnable_contrib(u64 n)
+{
+ u32 contrib = 0;
+
+ if (likely(n <= LOAD_AVG_PERIOD))
+ return runnable_avg_yN_sum[n];
+ else if (unlikely(n >= LOAD_AVG_MAX_N))
+ return LOAD_AVG_MAX;
+
+ /* Compute \Sum k^n combining precomputed values for k^i, \Sum k^j */
+ do {
+ contrib /= 2; /* y^LOAD_AVG_PERIOD = 1/2 */
+ contrib += runnable_avg_yN_sum[LOAD_AVG_PERIOD];
+
+ n -= LOAD_AVG_PERIOD;
+ } while (n > LOAD_AVG_PERIOD);
+
+ contrib = decay_load(contrib, n);
+ return contrib + runnable_avg_yN_sum[n];
}

/* We can represent the historical contribution to runnable average as the
@@ -923,7 +996,8 @@ static __always_inline int __update_entity_runnable_avg(u64 now,
struct sched_avg *sa,
int runnable)
{
- u64 delta;
+ u64 delta, periods;
+ u32 runnable_contrib;
int delta_w, decayed = 0;

delta = now - sa->last_runnable_update;
@@ -957,25 +1031,26 @@ static __always_inline int __update_entity_runnable_avg(u64 now,
* period and accrue it.
*/
delta_w = 1024 - delta_w;
- BUG_ON(delta_w > delta);
- do {
- if (runnable)
- sa->runnable_avg_sum += delta_w;
- sa->runnable_avg_period += delta_w;
-
- /*
- * Remainder of delta initiates a new period, roll over
- * the previous.
- */
- sa->runnable_avg_sum =
- decay_load(sa->runnable_avg_sum, 1);
- sa->runnable_avg_period =
- decay_load(sa->runnable_avg_period, 1);
-
- delta -= delta_w;
- /* New period is empty */
- delta_w = 1024;
- } while (delta >= 1024);
+ if (runnable)
+ sa->runnable_avg_sum += delta_w;
+ sa->runnable_avg_period += delta_w;
+
+ delta -= delta_w;
+
+ /* Figure out how many additional periods this update spans */
+ periods = delta / 1024;
+ delta %= 1024;
+
+ sa->runnable_avg_sum = decay_load(sa->runnable_avg_sum,
+ periods + 1);
+ sa->runnable_avg_period = decay_load(sa->runnable_avg_period,
+ periods + 1);
+
+ /* Efficiently calculate \sum (1..n_period) 1024*y^i */
+ runnable_contrib = __compute_runnable_contrib(periods);
+ if (runnable)
+ sa->runnable_avg_sum += runnable_contrib;
+ sa->runnable_avg_period += runnable_contrib;
}

/* Remainder of delta accrued against u_0` */


2012-08-24 08:35:18

by Namhyung Kim

[permalink] [raw]
Subject: Re: [patch 14/16] sched: make __update_entity_runnable_avg() fast

On Thu, 23 Aug 2012 07:14:36 -0700, > From: Paul Turner <[email protected]>
>
> __update_entity_runnable_avg forms the core of maintaining an entity's runnable
> load average. In this function we charge the accumulated run-time since last
> update and handle appropriate decay. In some cases, e.g. a waking task, this
> time interval may be much larger than our period unit.
>
> Fortunately we can exploit some properties of our series to perform decay for a
> blocked update in constant time and account the contribution for a running
> update in essentially-constant* time.
>
> [*]: For any running entity they should be performing updates at the tick which
> gives us a soft limit of 1 jiffy between updates, and we can compute up to a
> 32 jiffy update in a single pass.
>
> Signed-off-by: Paul Turner <[email protected]>
> Reviewed-by: Ben Segall <[email protected]>
> ---
> kernel/sched/fair.c | 123 +++++++++++++++++++++++++++++++++++++++++----------
> 1 files changed, 99 insertions(+), 24 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 12e9ae5..b249371 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -879,17 +879,90 @@ static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
>
> #ifdef CONFIG_SMP
> /*
> + * We choose a half-life close to 1 scheduling period.
> + * Note: The tables below are dependent on this value.
> + */
> +#define LOAD_AVG_PERIOD 32
> +#define LOAD_AVG_MAX 47765 /* maximum possible load avg */
> +#define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_MAX_AVG */
> +
> +/* Precomputed fixed inverse multiplies for multiplication by y^n */
> +static const u32 runnable_avg_yN_inv[] = {
> + 0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
> + 0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85,
> + 0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581,
> + 0xad583ee9, 0xa9a15ab4, 0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9,
> + 0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 0x8b95c1e3, 0x88980e80,
> + 0x85aac367, 0x82cd8698,
> +};
> +
> +/*
> + * Precomputed \Sum y^k { 1<=k<=n }. These are floor(true_value) to prevent
> + * over-estimates when re-combining.
> + */
> +static const u32 runnable_avg_yN_sum[] = {
> + 0, 1002, 1982, 2941, 3880, 4798, 5697, 6576, 7437, 8279, 9103,
> + 9909,10698,11470,12226,12966,13690,14398,15091,15769,16433,17082,
> + 17718,18340,18949,19545,20128,20698,21256,21802,22336,22859,23371,
> +};
> +
> +/*
> * Approximate:
> * val * y^n, where y^32 ~= 0.5 (~1 scheduling period)
> */
> static __always_inline u64 decay_load(u64 val, u64 n)
> {
> - for (; n && val; n--) {
> - val *= 4008;
> - val >>= 12;
> + int local_n;
> + if (!n)
> + return val;
> + else if (unlikely(n > LOAD_AVG_PERIOD * 63))
> + return 0;
> +
> + /* will be 32 bits if that's desirable */
> + local_n = n;
> +
> + /*
> + * As y^PERIOD = 1/2, we can combine
> + * y^n = 1/2^(n/PERIOD) * k^(n%PERIOD)
> + * With a look-up table which covers k^n (n<PERIOD)
> + *
> + * To achieve constant time decay_load.
> + */
> + if (unlikely(local_n >= LOAD_AVG_PERIOD)) {
> + val >>= local_n / LOAD_AVG_PERIOD;
> + n %= LOAD_AVG_PERIOD;

s/n/local_n/ ?

Thanks,
Namhyung


> }
>
> - return val;
> + val *= runnable_avg_yN_inv[local_n];
> + return SRR(val, 32);
> +}
> +
> +/*
> + * For updates fully spanning n periods, the contribution to runnable
> + * average will be: \Sum 1024*y^n
> + *
> + * We can compute this reasonably efficiently by combining:
> + * y^PERIOD = 1/2 with precomputed \Sum 1024*y^n {for n <PERIOD}
> + */
> +static u32 __compute_runnable_contrib(u64 n)
> +{
> + u32 contrib = 0;
> +
> + if (likely(n <= LOAD_AVG_PERIOD))
> + return runnable_avg_yN_sum[n];
> + else if (unlikely(n >= LOAD_AVG_MAX_N))
> + return LOAD_AVG_MAX;
> +
> + /* Compute \Sum k^n combining precomputed values for k^i, \Sum k^j */
> + do {
> + contrib /= 2; /* y^LOAD_AVG_PERIOD = 1/2 */
> + contrib += runnable_avg_yN_sum[LOAD_AVG_PERIOD];
> +
> + n -= LOAD_AVG_PERIOD;
> + } while (n > LOAD_AVG_PERIOD);
> +
> + contrib = decay_load(contrib, n);
> + return contrib + runnable_avg_yN_sum[n];
> }
>
> /* We can represent the historical contribution to runnable average as the
> @@ -923,7 +996,8 @@ static __always_inline int __update_entity_runnable_avg(u64 now,
> struct sched_avg *sa,
> int runnable)
> {
> - u64 delta;
> + u64 delta, periods;
> + u32 runnable_contrib;
> int delta_w, decayed = 0;
>
> delta = now - sa->last_runnable_update;
> @@ -957,25 +1031,26 @@ static __always_inline int __update_entity_runnable_avg(u64 now,
> * period and accrue it.
> */
> delta_w = 1024 - delta_w;
> - BUG_ON(delta_w > delta);
> - do {
> - if (runnable)
> - sa->runnable_avg_sum += delta_w;
> - sa->runnable_avg_period += delta_w;
> -
> - /*
> - * Remainder of delta initiates a new period, roll over
> - * the previous.
> - */
> - sa->runnable_avg_sum =
> - decay_load(sa->runnable_avg_sum, 1);
> - sa->runnable_avg_period =
> - decay_load(sa->runnable_avg_period, 1);
> -
> - delta -= delta_w;
> - /* New period is empty */
> - delta_w = 1024;
> - } while (delta >= 1024);
> + if (runnable)
> + sa->runnable_avg_sum += delta_w;
> + sa->runnable_avg_period += delta_w;
> +
> + delta -= delta_w;
> +
> + /* Figure out how many additional periods this update spans */
> + periods = delta / 1024;
> + delta %= 1024;
> +
> + sa->runnable_avg_sum = decay_load(sa->runnable_avg_sum,
> + periods + 1);
> + sa->runnable_avg_period = decay_load(sa->runnable_avg_period,
> + periods + 1);
> +
> + /* Efficiently calculate \sum (1..n_period) 1024*y^i */
> + runnable_contrib = __compute_runnable_contrib(periods);
> + if (runnable)
> + sa->runnable_avg_sum += runnable_contrib;
> + sa->runnable_avg_period += runnable_contrib;
> }
>
> /* Remainder of delta accrued against u_0` */

2012-10-24 09:58:08

by Paul Turner

[permalink] [raw]
Subject: [tip:sched/core] sched: Make __update_entity_runnable_avg() fast

Commit-ID: 5b51f2f80b3b906ce59bd4dce6eca3c7f34cb1b9
Gitweb: http://git.kernel.org/tip/5b51f2f80b3b906ce59bd4dce6eca3c7f34cb1b9
Author: Paul Turner <[email protected]>
AuthorDate: Thu, 4 Oct 2012 13:18:32 +0200
Committer: Ingo Molnar <[email protected]>
CommitDate: Wed, 24 Oct 2012 10:27:30 +0200

sched: Make __update_entity_runnable_avg() fast

__update_entity_runnable_avg forms the core of maintaining an entity's runnable
load average. In this function we charge the accumulated run-time since last
update and handle appropriate decay. In some cases, e.g. a waking task, this
time interval may be much larger than our period unit.

Fortunately we can exploit some properties of our series to perform decay for a
blocked update in constant time and account the contribution for a running
update in essentially-constant* time.

[*]: For any running entity they should be performing updates at the tick which
gives us a soft limit of 1 jiffy between updates, and we can compute up to a
32 jiffy update in a single pass.

C program to generate the magic constants in the arrays:

#include <math.h>
#include <stdio.h>

#define N 32
#define WMULT_SHIFT 32

const long WMULT_CONST = ((1UL << N) - 1);
double y;

long runnable_avg_yN_inv[N];
void calc_mult_inv() {
int i;
double yn = 0;

printf("inverses\n");
for (i = 0; i < N; i++) {
yn = (double)WMULT_CONST * pow(y, i);
runnable_avg_yN_inv[i] = yn;
printf("%2d: 0x%8lx\n", i, runnable_avg_yN_inv[i]);
}
printf("\n");
}

long mult_inv(long c, int n) {
return (c * runnable_avg_yN_inv[n]) >> WMULT_SHIFT;
}

void calc_yn_sum(int n)
{
int i;
double sum = 0, sum_fl = 0, diff = 0;

/*
* We take the floored sum to ensure the sum of partial sums is never
* larger than the actual sum.
*/
printf("sum y^n\n");
printf(" %8s %8s %8s\n", "exact", "floor", "error");
for (i = 1; i <= n; i++) {
sum = (y * sum + y * 1024);
sum_fl = floor(y * sum_fl+ y * 1024);
printf("%2d: %8.0f %8.0f %8.0f\n", i, sum, sum_fl,
sum_fl - sum);
}
printf("\n");
}

void calc_conv(long n) {
long old_n;
int i = -1;

printf("convergence (LOAD_AVG_MAX, LOAD_AVG_MAX_N)\n");
do {
old_n = n;
n = mult_inv(n, 1) + 1024;
i++;
} while (n != old_n);
printf("%d> %ld\n", i - 1, n);
printf("\n");
}

void main() {
y = pow(0.5, 1/(double)N);
calc_mult_inv();
calc_conv(1024);
calc_yn_sum(N);
}

[ Compile with -lm ]
Signed-off-by: Paul Turner <[email protected]>
Reviewed-by: Ben Segall <[email protected]>
Signed-off-by: Peter Zijlstra <[email protected]>
Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Ingo Molnar <[email protected]>
---
kernel/sched/fair.c | 125 +++++++++++++++++++++++++++++++++++++++++----------
1 files changed, 101 insertions(+), 24 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 002a769..6ecf455 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -884,17 +884,92 @@ static inline void update_cfs_shares(struct cfs_rq *cfs_rq)

#ifdef CONFIG_SMP
/*
+ * We choose a half-life close to 1 scheduling period.
+ * Note: The tables below are dependent on this value.
+ */
+#define LOAD_AVG_PERIOD 32
+#define LOAD_AVG_MAX 47742 /* maximum possible load avg */
+#define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_MAX_AVG */
+
+/* Precomputed fixed inverse multiplies for multiplication by y^n */
+static const u32 runnable_avg_yN_inv[] = {
+ 0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
+ 0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85,
+ 0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581,
+ 0xad583ee9, 0xa9a15ab4, 0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9,
+ 0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 0x8b95c1e3, 0x88980e80,
+ 0x85aac367, 0x82cd8698,
+};
+
+/*
+ * Precomputed \Sum y^k { 1<=k<=n }. These are floor(true_value) to prevent
+ * over-estimates when re-combining.
+ */
+static const u32 runnable_avg_yN_sum[] = {
+ 0, 1002, 1982, 2941, 3880, 4798, 5697, 6576, 7437, 8279, 9103,
+ 9909,10698,11470,12226,12966,13690,14398,15091,15769,16433,17082,
+ 17718,18340,18949,19545,20128,20698,21256,21802,22336,22859,23371,
+};
+
+/*
* Approximate:
* val * y^n, where y^32 ~= 0.5 (~1 scheduling period)
*/
static __always_inline u64 decay_load(u64 val, u64 n)
{
- for (; n && val; n--) {
- val *= 4008;
- val >>= 12;
+ unsigned int local_n;
+
+ if (!n)
+ return val;
+ else if (unlikely(n > LOAD_AVG_PERIOD * 63))
+ return 0;
+
+ /* after bounds checking we can collapse to 32-bit */
+ local_n = n;
+
+ /*
+ * As y^PERIOD = 1/2, we can combine
+ * y^n = 1/2^(n/PERIOD) * k^(n%PERIOD)
+ * With a look-up table which covers k^n (n<PERIOD)
+ *
+ * To achieve constant time decay_load.
+ */
+ if (unlikely(local_n >= LOAD_AVG_PERIOD)) {
+ val >>= local_n / LOAD_AVG_PERIOD;
+ local_n %= LOAD_AVG_PERIOD;
}

- return val;
+ val *= runnable_avg_yN_inv[local_n];
+ /* We don't use SRR here since we always want to round down. */
+ return val >> 32;
+}
+
+/*
+ * For updates fully spanning n periods, the contribution to runnable
+ * average will be: \Sum 1024*y^n
+ *
+ * We can compute this reasonably efficiently by combining:
+ * y^PERIOD = 1/2 with precomputed \Sum 1024*y^n {for n <PERIOD}
+ */
+static u32 __compute_runnable_contrib(u64 n)
+{
+ u32 contrib = 0;
+
+ if (likely(n <= LOAD_AVG_PERIOD))
+ return runnable_avg_yN_sum[n];
+ else if (unlikely(n >= LOAD_AVG_MAX_N))
+ return LOAD_AVG_MAX;
+
+ /* Compute \Sum k^n combining precomputed values for k^i, \Sum k^j */
+ do {
+ contrib /= 2; /* y^LOAD_AVG_PERIOD = 1/2 */
+ contrib += runnable_avg_yN_sum[LOAD_AVG_PERIOD];
+
+ n -= LOAD_AVG_PERIOD;
+ } while (n > LOAD_AVG_PERIOD);
+
+ contrib = decay_load(contrib, n);
+ return contrib + runnable_avg_yN_sum[n];
}

/*
@@ -929,7 +1004,8 @@ static __always_inline int __update_entity_runnable_avg(u64 now,
struct sched_avg *sa,
int runnable)
{
- u64 delta;
+ u64 delta, periods;
+ u32 runnable_contrib;
int delta_w, decayed = 0;

delta = now - sa->last_runnable_update;
@@ -963,25 +1039,26 @@ static __always_inline int __update_entity_runnable_avg(u64 now,
* period and accrue it.
*/
delta_w = 1024 - delta_w;
- BUG_ON(delta_w > delta);
- do {
- if (runnable)
- sa->runnable_avg_sum += delta_w;
- sa->runnable_avg_period += delta_w;
-
- /*
- * Remainder of delta initiates a new period, roll over
- * the previous.
- */
- sa->runnable_avg_sum =
- decay_load(sa->runnable_avg_sum, 1);
- sa->runnable_avg_period =
- decay_load(sa->runnable_avg_period, 1);
-
- delta -= delta_w;
- /* New period is empty */
- delta_w = 1024;
- } while (delta >= 1024);
+ if (runnable)
+ sa->runnable_avg_sum += delta_w;
+ sa->runnable_avg_period += delta_w;
+
+ delta -= delta_w;
+
+ /* Figure out how many additional periods this update spans */
+ periods = delta / 1024;
+ delta %= 1024;
+
+ sa->runnable_avg_sum = decay_load(sa->runnable_avg_sum,
+ periods + 1);
+ sa->runnable_avg_period = decay_load(sa->runnable_avg_period,
+ periods + 1);
+
+ /* Efficiently calculate \sum (1..n_period) 1024*y^i */
+ runnable_contrib = __compute_runnable_contrib(periods);
+ if (runnable)
+ sa->runnable_avg_sum += runnable_contrib;
+ sa->runnable_avg_period += runnable_contrib;
}

/* Remainder of delta accrued against u_0` */