2012-08-23 14:16:49

by Paul Turner

[permalink] [raw]
Subject: [patch 01/16] sched: track the runnable average on a per-task entity basis

From: Paul Turner <[email protected]>

Instead of tracking the average of the load parented by a cfs_rq, we can track
entity load directly, with the load for a given cfs_rq then being the sum of
the loads of its children.

To do this we represent the historical contribution to runnable average within each
trailing 1024us of execution as the coefficients of a geometric series.

We can express this for a given task t as:
runnable_sum(t) = \Sum u_i * y^i, runnable_avg_period(t) = \Sum 1024 * y^i
load(t) = weight_t * runnable_sum(t) / runnable_avg_period(t)

Where u_i is the usage in the i'th most recent 1024us period (approximately 1ms)
and y is chosen such that y^k = 1/2. We currently choose k to be 32, which
roughly translates to about one sched period.

Signed-off-by: Paul Turner <[email protected]>
Reviewed-by: Ben Segall <[email protected]>
---
include/linux/sched.h | 13 +++++
kernel/sched/core.c | 5 ++
kernel/sched/debug.c | 4 ++
kernel/sched/fair.c | 128 +++++++++++++++++++++++++++++++++++++++++++++++++
4 files changed, 150 insertions(+), 0 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index f3eebc1..f553da9 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1139,6 +1139,16 @@ struct load_weight {
unsigned long weight, inv_weight;
};

+struct sched_avg {
+ /*
+ * These sums represent an infinite geometric series and so are bound
+ * above by 1024/(1-y). Thus we only need a u32 to store them for all
+ * choices of y < 1-2^(-32)*1024.
+ */
+ u32 runnable_avg_sum, runnable_avg_period;
+ u64 last_runnable_update;
+};
+
#ifdef CONFIG_SCHEDSTATS
struct sched_statistics {
u64 wait_start;
@@ -1199,6 +1209,9 @@ struct sched_entity {
/* rq "owned" by this entity/group: */
struct cfs_rq *my_q;
#endif
+#ifdef CONFIG_SMP
+ struct sched_avg avg;
+#endif
};

struct sched_rt_entity {
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 78d9c96..fcc3cad 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1556,6 +1556,11 @@ static void __sched_fork(struct task_struct *p)
p->se.vruntime = 0;
INIT_LIST_HEAD(&p->se.group_node);

+#ifdef CONFIG_SMP
+ p->se.avg.runnable_avg_period = 0;
+ p->se.avg.runnable_avg_sum = 0;
+#endif
+
#ifdef CONFIG_SCHEDSTATS
memset(&p->se.statistics, 0, sizeof(p->se.statistics));
#endif
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 6f79596..61f7097 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -85,6 +85,10 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
P(se->statistics.wait_count);
#endif
P(se->load.weight);
+#ifdef CONFIG_SMP
+ P(se->avg.runnable_avg_sum);
+ P(se->avg.runnable_avg_period);
+#endif
#undef PN
#undef P
}
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 01d3eda..2c53263 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -971,6 +971,125 @@ static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq)
}
#endif /* CONFIG_FAIR_GROUP_SCHED */

+#ifdef CONFIG_SMP
+/*
+ * Approximate:
+ * val * y^n, where y^32 ~= 0.5 (~1 scheduling period)
+ */
+static __always_inline u64 decay_load(u64 val, u64 n)
+{
+ for (; n && val; n--) {
+ val *= 4008;
+ val >>= 12;
+ }
+
+ return val;
+}
+
+/* We can represent the historical contribution to runnable average as the
+ * coefficients of a geometric series. To do this we sub-divide our runnable
+ * history into segments of approximately 1ms (1024us); label the segment that
+ * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g.
+ *
+ * [<- 1024us ->|<- 1024us ->|<- 1024us ->| ...
+ * p0 p1 p2
+ * (now) (~1ms ago) (~2ms ago)
+ *
+ * Let u_i denote the fraction of p_i that the entity was runnable.
+ *
+ * We then designate the fractions u_i as our co-efficients, yielding the
+ * following representation of historical load:
+ * u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ...
+ *
+ * We choose y based on the width of a reasonable scheduling period, fixing:
+ * y^32 = 0.5
+ *
+ * This means that the contribution to load ~32ms ago (u_32) will be weighted
+ * approximately half as much as the contribution to load within the last ms
+ * (u_0).
+ *
+ * When a period "rolls over" and we have new u_0`, multiplying the previous
+ * sum again by y is sufficient to update:
+ * load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
+ * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
+ */
+static __always_inline int __update_entity_runnable_avg(u64 now,
+ struct sched_avg *sa,
+ int runnable)
+{
+ u64 delta;
+ int delta_w, decayed = 0;
+
+ delta = now - sa->last_runnable_update;
+ /*
+ * This should only happen when time goes backwards, which it
+ * unfortunately does during sched clock init when we swap over to TSC.
+ */
+ if ((s64)delta < 0) {
+ sa->last_runnable_update = now;
+ return 0;
+ }
+
+ /*
+ * Use 1024ns as the unit of measurement since it's a reasonable
+ * approximation of 1us and fast to compute.
+ */
+ delta >>= 10;
+ if (!delta)
+ return 0;
+ sa->last_runnable_update = now;
+
+ /* delta_w is the amount already accumulated against our next period */
+ delta_w = sa->runnable_avg_period % 1024;
+ if (delta + delta_w >= 1024) {
+ /* period roll-over */
+ decayed = 1;
+
+ /*
+ * Now that we know we're crossing a period boundary, figure
+ * out how much from delta we need to complete the current
+ * period and accrue it.
+ */
+ delta_w = 1024 - delta_w;
+ BUG_ON(delta_w > delta);
+ do {
+ if (runnable)
+ sa->runnable_avg_sum += delta_w;
+ sa->runnable_avg_period += delta_w;
+
+ /*
+ * Remainder of delta initiates a new period, roll over
+ * the previous.
+ */
+ sa->runnable_avg_sum =
+ decay_load(sa->runnable_avg_sum, 1);
+ sa->runnable_avg_period =
+ decay_load(sa->runnable_avg_period, 1);
+
+ delta -= delta_w;
+ /* New period is empty */
+ delta_w = 1024;
+ } while (delta >= 1024);
+ }
+
+ /* Remainder of delta accrued against u_0` */
+ if (runnable)
+ sa->runnable_avg_sum += delta;
+ sa->runnable_avg_period += delta;
+
+ return decayed;
+}
+
+/* Update a sched_entity's runnable average */
+static inline void update_entity_load_avg(struct sched_entity *se)
+{
+ __update_entity_runnable_avg(rq_of(cfs_rq_of(se))->clock_task, &se->avg,
+ se->on_rq);
+}
+#else
+static inline void update_entity_load_avg(struct sched_entity *se) {}
+#endif
+
static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
#ifdef CONFIG_SCHEDSTATS
@@ -1097,6 +1216,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
*/
update_curr(cfs_rq);
update_cfs_load(cfs_rq, 0);
+ update_entity_load_avg(se);
account_entity_enqueue(cfs_rq, se);
update_cfs_shares(cfs_rq);

@@ -1171,6 +1291,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
* Update run-time statistics of the 'current'.
*/
update_curr(cfs_rq);
+ update_entity_load_avg(se);

update_stats_dequeue(cfs_rq, se);
if (flags & DEQUEUE_SLEEP) {
@@ -1340,6 +1461,8 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
update_stats_wait_start(cfs_rq, prev);
/* Put 'current' back into the tree. */
__enqueue_entity(cfs_rq, prev);
+ /* in !on_rq case, update occurred at dequeue */
+ update_entity_load_avg(prev);
}
cfs_rq->curr = NULL;
}
@@ -1353,6 +1476,11 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
update_curr(cfs_rq);

/*
+ * Ensure that runnable average is periodically updated.
+ */
+ update_entity_load_avg(curr);
+
+ /*
* Update share accounting for long-running entities.
*/
update_entity_shares_tick(cfs_rq);


2012-08-24 08:26:55

by Namhyung Kim

[permalink] [raw]
Subject: Re: [patch 01/16] sched: track the runnable average on a per-task entity basis

Hi,

Just typos below..

On Thu, 23 Aug 2012 07:14:23 -0700, > From: Paul Turner <[email protected]>
>
> Instead of tracking averaging the load parented by a cfs_rq, we can track
> entity load directly. With the load for a given cfs_rq then being the sum of
> its children.
>
> To do this we represent the historical contribution to runnable average within each
> trailing 1024us of execution as the coefficients of a geometric series.
>
> We can express this for a given task t as:
> runnable_sum(t) = \Sum u_i * y^i, runnable_avg_period(t) = \Sum 1024 * y^i
> load(t) = weight_t * runnable_sum(t) / runnable_avg_period(t)
>
> Where: u_i is the usage in the last i`th 1024us period (approximately 1ms) ~ms
> and y is chosen such that y^k = 1/2. We currently choose k to be 32 which
> roughly translates to about a sched period.
>
> Signed-off-by: Paul Turner <[email protected]>
> Reviewed-by: Ben Segall <[email protected]>
> ---
> include/linux/sched.h | 13 +++++
> kernel/sched/core.c | 5 ++
> kernel/sched/debug.c | 4 ++
> kernel/sched/fair.c | 128 +++++++++++++++++++++++++++++++++++++++++++++++++
> 4 files changed, 150 insertions(+), 0 deletions(-)
>
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index f3eebc1..f553da9 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -1139,6 +1139,16 @@ struct load_weight {
> unsigned long weight, inv_weight;
> };
>
> +struct sched_avg {
> + /*
> + * These sums represent an infinite geometric series and so are bound
> + * above by 1024/(1-y). Thus we only need a u32 to store them for for all
> + * choices of y < 1-2^(-32)*1024.
> + */
> + u32 runnable_avg_sum, runnable_avg_period;
> + u64 last_runnable_update;
> +};
> +
> #ifdef CONFIG_SCHEDSTATS
> struct sched_statistics {
> u64 wait_start;
> @@ -1199,6 +1209,9 @@ struct sched_entity {
> /* rq "owned" by this entity/group: */
> struct cfs_rq *my_q;
> #endif
> +#ifdef CONFIG_SMP
> + struct sched_avg avg;
> +#endif
> };
>
> struct sched_rt_entity {
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 78d9c96..fcc3cad 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -1556,6 +1556,11 @@ static void __sched_fork(struct task_struct *p)
> p->se.vruntime = 0;
> INIT_LIST_HEAD(&p->se.group_node);
>
> +#ifdef CONFIG_SMP
> + p->se.avg.runnable_avg_period = 0;
> + p->se.avg.runnable_avg_sum = 0;
> +#endif
> +
> #ifdef CONFIG_SCHEDSTATS
> memset(&p->se.statistics, 0, sizeof(p->se.statistics));
> #endif
> diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
> index 6f79596..61f7097 100644
> --- a/kernel/sched/debug.c
> +++ b/kernel/sched/debug.c
> @@ -85,6 +85,10 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
> P(se->statistics.wait_count);
> #endif
> P(se->load.weight);
> +#ifdef CONFIG_SMP
> + P(se->avg.runnable_avg_sum);
> + P(se->avg.runnable_avg_period);
> +#endif
> #undef PN
> #undef P
> }
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 01d3eda..2c53263 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -971,6 +971,125 @@ static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq)
> }
> #endif /* CONFIG_FAIR_GROUP_SCHED */
>
> +#ifdef CONFIG_SMP
> +/*
> + * Approximate:
> + * val * y^n, where y^32 ~= 0.5 (~1 scheduling period)
> + */
> +static __always_inline u64 decay_load(u64 val, u64 n)
> +{
> + for (; n && val; n--) {
> + val *= 4008;
> + val >>= 12;
> + }
> +
> + return val;
> +}
> +
> +/* We can represent the historical contribution to runnable average as the
> + * coefficients of a geometric series. To do this we sub-divide our runnable
> + * history into segments of approximately 1ms (1024us); label the segment that
> + * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g.
> + *
> + * [<- 1024us ->|<- 1024us ->|<- 1024us ->| ...
> + * p0 p1 p1

Should it be p2 ?


> + * (now) (~1ms ago) (~2ms ago)
> + *
> + * Let u_i denote the fraction of p_i that the entity was runnable.
> + *
> + * We then designate the fractions u_i as our co-efficients, yielding the
> + * following representation of historical load:
> + * u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ...
> + *
> + * We choose y based on the with of a reasonably scheduling period, fixing:
> + * y^32 = 0.5
> + *
> + * This means that the contribution to load ~32ms ago (u_32) will be weighted
> + * approximately half as much as the contribution to load within the last ms
> + * (u_0).
> + *
> + * When a period "rolls over" and we have new u_0`, multiplying the previous
> + * sum again by y is sufficient to update:
> + * load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
> + * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1]

s/u_{i+1]/u_{i+1}]/

Thanks,
Namhyung


> + */
> +static __always_inline int __update_entity_runnable_avg(u64 now,
> + struct sched_avg *sa,
> + int runnable)
> +{
> + u64 delta;
> + int delta_w, decayed = 0;
> +
> + delta = now - sa->last_runnable_update;
> + /*
> + * This should only happen when time goes backwards, which it
> + * unfortunately does during sched clock init when we swap over to TSC.
> + */
> + if ((s64)delta < 0) {
> + sa->last_runnable_update = now;
> + return 0;
> + }
> +
> + /*
> + * Use 1024ns as the unit of measurement since it's a reasonable
> + * approximation of 1us and fast to compute.
> + */
> + delta >>= 10;
> + if (!delta)
> + return 0;
> + sa->last_runnable_update = now;
> +
> + /* delta_w is the amount already accumulated against our next period */
> + delta_w = sa->runnable_avg_period % 1024;
> + if (delta + delta_w >= 1024) {
> + /* period roll-over */
> + decayed = 1;
> +
> + /*
> + * Now that we know we're crossing a period boundary, figure
> + * out how much from delta we need to complete the current
> + * period and accrue it.
> + */
> + delta_w = 1024 - delta_w;
> + BUG_ON(delta_w > delta);
> + do {
> + if (runnable)
> + sa->runnable_avg_sum += delta_w;
> + sa->runnable_avg_period += delta_w;
> +
> + /*
> + * Remainder of delta initiates a new period, roll over
> + * the previous.
> + */
> + sa->runnable_avg_sum =
> + decay_load(sa->runnable_avg_sum, 1);
> + sa->runnable_avg_period =
> + decay_load(sa->runnable_avg_period, 1);
> +
> + delta -= delta_w;
> + /* New period is empty */
> + delta_w = 1024;
> + } while (delta >= 1024);
> + }
> +
> + /* Remainder of delta accrued against u_0` */
> + if (runnable)
> + sa->runnable_avg_sum += delta;
> + sa->runnable_avg_period += delta;
> +
> + return decayed;
> +}
> +
> +/* Update a sched_entity's runnable average */
> +static inline void update_entity_load_avg(struct sched_entity *se)
> +{
> + __update_entity_runnable_avg(rq_of(cfs_rq_of(se))->clock_task, &se->avg,
> + se->on_rq);
> +}
> +#else
> +static inline void update_entity_load_avg(struct sched_entity *se) {}
> +#endif
> +
> static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
> {
> #ifdef CONFIG_SCHEDSTATS
> @@ -1097,6 +1216,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
> */
> update_curr(cfs_rq);
> update_cfs_load(cfs_rq, 0);
> + update_entity_load_avg(se);
> account_entity_enqueue(cfs_rq, se);
> update_cfs_shares(cfs_rq);
>
> @@ -1171,6 +1291,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
> * Update run-time statistics of the 'current'.
> */
> update_curr(cfs_rq);
> + update_entity_load_avg(se);
>
> update_stats_dequeue(cfs_rq, se);
> if (flags & DEQUEUE_SLEEP) {
> @@ -1340,6 +1461,8 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
> update_stats_wait_start(cfs_rq, prev);
> /* Put 'current' back into the tree. */
> __enqueue_entity(cfs_rq, prev);
> + /* in !on_rq case, update occurred at dequeue */
> + update_entity_load_avg(prev);
> }
> cfs_rq->curr = NULL;
> }
> @@ -1353,6 +1476,11 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
> update_curr(cfs_rq);
>
> /*
> + * Ensure that runnable average is periodically updated.
> + */
> + update_entity_load_avg(curr);
> +
> + /*
> * Update share accounting for long-running entities.
> */
> update_entity_shares_tick(cfs_rq);

2012-08-28 22:13:27

by Paul Turner

[permalink] [raw]
Subject: Re: [patch 01/16] sched: track the runnable average on a per-task entity basis

On Fri, Aug 24, 2012 at 1:20 AM, Namhyung Kim <[email protected]> wrote:
> Hi,
>
> Just typos below..
>

Applied, Thanks.

> On Thu, 23 Aug 2012 07:14:23 -0700, > From: Paul Turner <[email protected]>
>>
>> Instead of tracking averaging the load parented by a cfs_rq, we can track
>> entity load directly. With the load for a given cfs_rq then being the
>> sum of
>> its children.
>>
>> To do this we represent the historical contribution to runnable average
>> within each
>> trailing 1024us of execution as the coefficients of a geometric series.
>>
>> We can express this for a given task t as:
>> runnable_sum(t) = \Sum u_i * y^i, runnable_avg_period(t) = \Sum 1024 *
>> y^i
>> load(t) = weight_t * runnable_sum(t) / runnable_avg_period(t)
>>
>> Where: u_i is the usage in the last i`th 1024us period (approximately
>> 1ms) ~ms
>> and y is chosen such that y^k = 1/2. We currently choose k to be 32
>> which
>> roughly translates to about a sched period.
>>
>> Signed-off-by: Paul Turner <[email protected]>
>> Reviewed-by: Ben Segall <[email protected]>
>> ---
>> include/linux/sched.h | 13 +++++
>> kernel/sched/core.c | 5 ++
>> kernel/sched/debug.c | 4 ++
>> kernel/sched/fair.c | 128
>> +++++++++++++++++++++++++++++++++++++++++++++++++
>> 4 files changed, 150 insertions(+), 0 deletions(-)
>>
>> diff --git a/include/linux/sched.h b/include/linux/sched.h
>> index f3eebc1..f553da9 100644
>> --- a/include/linux/sched.h
>> +++ b/include/linux/sched.h
>> @@ -1139,6 +1139,16 @@ struct load_weight {
>> unsigned long weight, inv_weight;
>> };
>>
>> +struct sched_avg {
>> + /*
>> + * These sums represent an infinite geometric series and so are
>> bound
>> + * above by 1024/(1-y). Thus we only need a u32 to store them for
>> for all
>> + * choices of y < 1-2^(-32)*1024.
>> + */
>> + u32 runnable_avg_sum, runnable_avg_period;
>> + u64 last_runnable_update;
>> +};
>> +
>> #ifdef CONFIG_SCHEDSTATS
>> struct sched_statistics {
>> u64 wait_start;
>> @@ -1199,6 +1209,9 @@ struct sched_entity {
>> /* rq "owned" by this entity/group: */
>> struct cfs_rq *my_q;
>> #endif
>> +#ifdef CONFIG_SMP
>> + struct sched_avg avg;
>> +#endif
>> };
>>
>> struct sched_rt_entity {
>> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
>> index 78d9c96..fcc3cad 100644
>> --- a/kernel/sched/core.c
>> +++ b/kernel/sched/core.c
>> @@ -1556,6 +1556,11 @@ static void __sched_fork(struct task_struct *p)
>> p->se.vruntime = 0;
>> INIT_LIST_HEAD(&p->se.group_node);
>>
>> +#ifdef CONFIG_SMP
>> + p->se.avg.runnable_avg_period = 0;
>> + p->se.avg.runnable_avg_sum = 0;
>> +#endif
>> +
>> #ifdef CONFIG_SCHEDSTATS
>> memset(&p->se.statistics, 0, sizeof(p->se.statistics));
>> #endif
>> diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
>> index 6f79596..61f7097 100644
>> --- a/kernel/sched/debug.c
>> +++ b/kernel/sched/debug.c
>> @@ -85,6 +85,10 @@ static void print_cfs_group_stats(struct seq_file *m,
>> int cpu, struct task_group
>> P(se->statistics.wait_count);
>> #endif
>> P(se->load.weight);
>> +#ifdef CONFIG_SMP
>> + P(se->avg.runnable_avg_sum);
>> + P(se->avg.runnable_avg_period);
>> +#endif
>> #undef PN
>> #undef P
>> }
>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>> index 01d3eda..2c53263 100644
>> --- a/kernel/sched/fair.c
>> +++ b/kernel/sched/fair.c
>> @@ -971,6 +971,125 @@ static inline void update_entity_shares_tick(struct
>> cfs_rq *cfs_rq)
>> }
>> #endif /* CONFIG_FAIR_GROUP_SCHED */
>>
>> +#ifdef CONFIG_SMP
>> +/*
>> + * Approximate:
>> + * val * y^n, where y^32 ~= 0.5 (~1 scheduling period)
>> + */
>> +static __always_inline u64 decay_load(u64 val, u64 n)
>> +{
>> + for (; n && val; n--) {
>> + val *= 4008;
>> + val >>= 12;
>> + }
>> +
>> + return val;
>> +}
>> +
>> +/* We can represent the historical contribution to runnable average as
>> the
>> + * coefficients of a geometric series. To do this we sub-divide our
>> runnable
>> + * history into segments of approximately 1ms (1024us); label the
>> segment that
>> + * occurred N-ms ago p_N, with p_0 corresponding to the current period,
>> e.g.
>> + *
>> + * [<- 1024us ->|<- 1024us ->|<- 1024us ->| ...
>> + * p0 p1 p1
>
> Should it be p2 ?
>
>
>> + * (now) (~1ms ago) (~2ms ago)
>> + *
>> + * Let u_i denote the fraction of p_i that the entity was runnable.
>> + *
>> + * We then designate the fractions u_i as our co-efficients, yielding
>> the
>> + * following representation of historical load:
>> + * u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ...
>> + *
>> + * We choose y based on the with of a reasonably scheduling period,
>> fixing:
>> + * y^32 = 0.5
>> + *
>> + * This means that the contribution to load ~32ms ago (u_32) will be
>> weighted
>> + * approximately half as much as the contribution to load within the
>> last ms
>> + * (u_0).
>> + *
>> + * When a period "rolls over" and we have new u_0`, multiplying the
>> previous
>> + * sum again by y is sufficient to update:
>> + * load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
>> + * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1]
>
> s/u_{i+1]/u_{i+1}]/
>
> Thanks,
> Namhyung
>
>
>> + */
>> +static __always_inline int __update_entity_runnable_avg(u64 now,
>> + struct sched_avg
>> *sa,
>> + int runnable)
>> +{
>> + u64 delta;
>> + int delta_w, decayed = 0;
>> +
>> + delta = now - sa->last_runnable_update;
>> + /*
>> + * This should only happen when time goes backwards, which it
>> + * unfortunately does during sched clock init when we swap over to
>> TSC.
>> + */
>> + if ((s64)delta < 0) {
>> + sa->last_runnable_update = now;
>> + return 0;
>> + }
>> +
>> + /*
>> + * Use 1024ns as the unit of measurement since it's a reasonable
>> + * approximation of 1us and fast to compute.
>> + */
>> + delta >>= 10;
>> + if (!delta)
>> + return 0;
>> + sa->last_runnable_update = now;
>> +
>> + /* delta_w is the amount already accumulated against our next
>> period */
>> + delta_w = sa->runnable_avg_period % 1024;
>> + if (delta + delta_w >= 1024) {
>> + /* period roll-over */
>> + decayed = 1;
>> +
>> + /*
>> + * Now that we know we're crossing a period boundary,
>> figure
>> + * out how much from delta we need to complete the current
>> + * period and accrue it.
>> + */
>> + delta_w = 1024 - delta_w;
>> + BUG_ON(delta_w > delta);
>> + do {
>> + if (runnable)
>> + sa->runnable_avg_sum += delta_w;
>> + sa->runnable_avg_period += delta_w;
>> +
>> + /*
>> + * Remainder of delta initiates a new period, roll
>> over
>> + * the previous.
>> + */
>> + sa->runnable_avg_sum =
>> + decay_load(sa->runnable_avg_sum, 1);
>> + sa->runnable_avg_period =
>> + decay_load(sa->runnable_avg_period, 1);
>> +
>> + delta -= delta_w;
>> + /* New period is empty */
>> + delta_w = 1024;
>> + } while (delta >= 1024);
>> + }
>> +
>> + /* Remainder of delta accrued against u_0` */
>> + if (runnable)
>> + sa->runnable_avg_sum += delta;
>> + sa->runnable_avg_period += delta;
>> +
>> + return decayed;
>> +}
>> +
>> +/* Update a sched_entity's runnable average */
>> +static inline void update_entity_load_avg(struct sched_entity *se)
>> +{
>> + __update_entity_runnable_avg(rq_of(cfs_rq_of(se))->clock_task,
>> &se->avg,
>> + se->on_rq);
>> +}
>> +#else
>> +static inline void update_entity_load_avg(struct sched_entity *se) {}
>> +#endif
>> +
>> static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity
>> *se)
>> {
>> #ifdef CONFIG_SCHEDSTATS
>> @@ -1097,6 +1216,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct
>> sched_entity *se, int flags)
>> */
>> update_curr(cfs_rq);
>> update_cfs_load(cfs_rq, 0);
>> + update_entity_load_avg(se);
>> account_entity_enqueue(cfs_rq, se);
>> update_cfs_shares(cfs_rq);
>>
>> @@ -1171,6 +1291,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct
>> sched_entity *se, int flags)
>> * Update run-time statistics of the 'current'.
>> */
>> update_curr(cfs_rq);
>> + update_entity_load_avg(se);
>>
>> update_stats_dequeue(cfs_rq, se);
>> if (flags & DEQUEUE_SLEEP) {
>> @@ -1340,6 +1461,8 @@ static void put_prev_entity(struct cfs_rq *cfs_rq,
>> struct sched_entity *prev)
>> update_stats_wait_start(cfs_rq, prev);
>> /* Put 'current' back into the tree. */
>> __enqueue_entity(cfs_rq, prev);
>> + /* in !on_rq case, update occurred at dequeue */
>> + update_entity_load_avg(prev);
>> }
>> cfs_rq->curr = NULL;
>> }
>> @@ -1353,6 +1476,11 @@ entity_tick(struct cfs_rq *cfs_rq, struct
>> sched_entity *curr, int queued)
>> update_curr(cfs_rq);
>>
>> /*
>> + * Ensure that runnable average is periodically updated.
>> + */
>> + update_entity_load_avg(curr);
>> +
>> + /*
>> * Update share accounting for long-running entities.
>> */
>> update_entity_shares_tick(cfs_rq);

2012-10-24 09:45:09

by Paul Turner

[permalink] [raw]
Subject: [tip:sched/core] sched: Track the runnable average on a per-task entity basis

Commit-ID: 9d85f21c94f7f7a84d0ba686c58aa6d9da58fdbb
Gitweb: http://git.kernel.org/tip/9d85f21c94f7f7a84d0ba686c58aa6d9da58fdbb
Author: Paul Turner <[email protected]>
AuthorDate: Thu, 4 Oct 2012 13:18:29 +0200
Committer: Ingo Molnar <[email protected]>
CommitDate: Wed, 24 Oct 2012 10:27:18 +0200

sched: Track the runnable average on a per-task entity basis

Instead of tracking the average of the load parented by a cfs_rq, we can track
entity load directly, with the load for a given cfs_rq then being the sum
of the loads of its children.

To do this we represent the historical contribution to runnable average
within each trailing 1024us of execution as the coefficients of a
geometric series.

We can express this for a given task t as:

runnable_sum(t) = \Sum u_i * y^i, runnable_avg_period(t) = \Sum 1024 * y^i
load(t) = weight_t * runnable_sum(t) / runnable_avg_period(t)

Where u_i is the usage in the i'th most recent 1024us period (approximately 1ms)
and y is chosen such that y^k = 1/2. We currently choose k to be 32, which
roughly translates to about one sched period.

Signed-off-by: Paul Turner <[email protected]>
Reviewed-by: Ben Segall <[email protected]>
Signed-off-by: Peter Zijlstra <[email protected]>
Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Ingo Molnar <[email protected]>
---
include/linux/sched.h | 13 +++++
kernel/sched/core.c | 5 ++
kernel/sched/debug.c | 4 ++
kernel/sched/fair.c | 129 +++++++++++++++++++++++++++++++++++++++++++++++++
4 files changed, 151 insertions(+), 0 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 0dd42a0..418fc6d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1095,6 +1095,16 @@ struct load_weight {
unsigned long weight, inv_weight;
};

+struct sched_avg {
+ /*
+ * These sums represent an infinite geometric series and so are bound
+ * above by 1024/(1-y). Thus we only need a u32 to store them for all
+ * choices of y < 1-2^(-32)*1024.
+ */
+ u32 runnable_avg_sum, runnable_avg_period;
+ u64 last_runnable_update;
+};
+
#ifdef CONFIG_SCHEDSTATS
struct sched_statistics {
u64 wait_start;
@@ -1155,6 +1165,9 @@ struct sched_entity {
/* rq "owned" by this entity/group: */
struct cfs_rq *my_q;
#endif
+#ifdef CONFIG_SMP
+ struct sched_avg avg;
+#endif
};

struct sched_rt_entity {
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 2d8927f..fd9d085 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1524,6 +1524,11 @@ static void __sched_fork(struct task_struct *p)
p->se.vruntime = 0;
INIT_LIST_HEAD(&p->se.group_node);

+#ifdef CONFIG_SMP
+ p->se.avg.runnable_avg_period = 0;
+ p->se.avg.runnable_avg_sum = 0;
+#endif
+
#ifdef CONFIG_SCHEDSTATS
memset(&p->se.statistics, 0, sizeof(p->se.statistics));
#endif
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 6f79596..61f7097 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -85,6 +85,10 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
P(se->statistics.wait_count);
#endif
P(se->load.weight);
+#ifdef CONFIG_SMP
+ P(se->avg.runnable_avg_sum);
+ P(se->avg.runnable_avg_period);
+#endif
#undef PN
#undef P
}
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 6b800a1..16d67f9 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -971,6 +971,126 @@ static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq)
}
#endif /* CONFIG_FAIR_GROUP_SCHED */

+#ifdef CONFIG_SMP
+/*
+ * Approximate:
+ * val * y^n, where y^32 ~= 0.5 (~1 scheduling period)
+ */
+static __always_inline u64 decay_load(u64 val, u64 n)
+{
+ for (; n && val; n--) {
+ val *= 4008;
+ val >>= 12;
+ }
+
+ return val;
+}
+
+/*
+ * We can represent the historical contribution to runnable average as the
+ * coefficients of a geometric series. To do this we sub-divide our runnable
+ * history into segments of approximately 1ms (1024us); label the segment that
+ * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g.
+ *
+ * [<- 1024us ->|<- 1024us ->|<- 1024us ->| ...
+ * p0 p1 p2
+ * (now) (~1ms ago) (~2ms ago)
+ *
+ * Let u_i denote the fraction of p_i that the entity was runnable.
+ *
+ * We then designate the fractions u_i as our co-efficients, yielding the
+ * following representation of historical load:
+ * u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ...
+ *
+ * We choose y based on the width of a reasonable scheduling period, fixing:
+ * y^32 = 0.5
+ *
+ * This means that the contribution to load ~32ms ago (u_32) will be weighted
+ * approximately half as much as the contribution to load within the last ms
+ * (u_0).
+ *
+ * When a period "rolls over" and we have new u_0`, multiplying the previous
+ * sum again by y is sufficient to update:
+ * load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
+ * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
+ */
+static __always_inline int __update_entity_runnable_avg(u64 now,
+ struct sched_avg *sa,
+ int runnable)
+{
+ u64 delta;
+ int delta_w, decayed = 0;
+
+ delta = now - sa->last_runnable_update;
+ /*
+ * This should only happen when time goes backwards, which it
+ * unfortunately does during sched clock init when we swap over to TSC.
+ */
+ if ((s64)delta < 0) {
+ sa->last_runnable_update = now;
+ return 0;
+ }
+
+ /*
+ * Use 1024ns as the unit of measurement since it's a reasonable
+ * approximation of 1us and fast to compute.
+ */
+ delta >>= 10;
+ if (!delta)
+ return 0;
+ sa->last_runnable_update = now;
+
+ /* delta_w is the amount already accumulated against our next period */
+ delta_w = sa->runnable_avg_period % 1024;
+ if (delta + delta_w >= 1024) {
+ /* period roll-over */
+ decayed = 1;
+
+ /*
+ * Now that we know we're crossing a period boundary, figure
+ * out how much from delta we need to complete the current
+ * period and accrue it.
+ */
+ delta_w = 1024 - delta_w;
+ BUG_ON(delta_w > delta);
+ do {
+ if (runnable)
+ sa->runnable_avg_sum += delta_w;
+ sa->runnable_avg_period += delta_w;
+
+ /*
+ * Remainder of delta initiates a new period, roll over
+ * the previous.
+ */
+ sa->runnable_avg_sum =
+ decay_load(sa->runnable_avg_sum, 1);
+ sa->runnable_avg_period =
+ decay_load(sa->runnable_avg_period, 1);
+
+ delta -= delta_w;
+ /* New period is empty */
+ delta_w = 1024;
+ } while (delta >= 1024);
+ }
+
+ /* Remainder of delta accrued against u_0` */
+ if (runnable)
+ sa->runnable_avg_sum += delta;
+ sa->runnable_avg_period += delta;
+
+ return decayed;
+}
+
+/* Update a sched_entity's runnable average */
+static inline void update_entity_load_avg(struct sched_entity *se)
+{
+ __update_entity_runnable_avg(rq_of(cfs_rq_of(se))->clock_task, &se->avg,
+ se->on_rq);
+}
+#else
+static inline void update_entity_load_avg(struct sched_entity *se) {}
+#endif
+
static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
#ifdef CONFIG_SCHEDSTATS
@@ -1097,6 +1217,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
*/
update_curr(cfs_rq);
update_cfs_load(cfs_rq, 0);
+ update_entity_load_avg(se);
account_entity_enqueue(cfs_rq, se);
update_cfs_shares(cfs_rq);

@@ -1171,6 +1292,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
* Update run-time statistics of the 'current'.
*/
update_curr(cfs_rq);
+ update_entity_load_avg(se);

update_stats_dequeue(cfs_rq, se);
if (flags & DEQUEUE_SLEEP) {
@@ -1340,6 +1462,8 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
update_stats_wait_start(cfs_rq, prev);
/* Put 'current' back into the tree. */
__enqueue_entity(cfs_rq, prev);
+ /* in !on_rq case, update occurred at dequeue */
+ update_entity_load_avg(prev);
}
cfs_rq->curr = NULL;
}
@@ -1353,6 +1477,11 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
update_curr(cfs_rq);

/*
+ * Ensure that runnable average is periodically updated.
+ */
+ update_entity_load_avg(curr);
+
+ /*
* Update share accounting for long-running entities.
*/
update_entity_shares_tick(cfs_rq);

2012-10-25 03:48:05

by Li Guang

[permalink] [raw]
Subject: Re: [tip:sched/core] sched: Track the runnable average on a per-task entity basis

在 2012-10-24三的 02:43 -0700,tip-bot for Paul Turner写道:
> Commit-ID: 9d85f21c94f7f7a84d0ba686c58aa6d9da58fdbb
> Gitweb: http://git.kernel.org/tip/9d85f21c94f7f7a84d0ba686c58aa6d9da58fdbb
> Author: Paul Turner <[email protected]>
> AuthorDate: Thu, 4 Oct 2012 13:18:29 +0200
> Committer: Ingo Molnar <[email protected]>
> CommitDate: Wed, 24 Oct 2012 10:27:18 +0200
>
> sched: Track the runnable average on a per-task entity basis
>
> Instead of tracking an average of the load parented by a cfs_rq, we can track
> entity load directly, with the load for a given cfs_rq then being the sum
> of its children.
>
> To do this we represent the historical contribution to runnable average
> within each trailing 1024us of execution as the coefficients of a
> geometric series.
>
> We can express this for a given task t as:
>
> runnable_sum(t) = \Sum u_i * y^i, runnable_avg_period(t) = \Sum 1024 * y^i
> load(t) = weight_t * runnable_sum(t) / runnable_avg_period(t)
>
> Where: u_i is the usage in the i`th most recent 1024us period (approximately 1ms)
> and y is chosen such that y^k = 1/2. We currently choose k to be 32 which
> roughly translates to about a sched period.
>
> Signed-off-by: Paul Turner <[email protected]>
> Reviewed-by: Ben Segall <[email protected]>
> Signed-off-by: Peter Zijlstra <[email protected]>
> Link: http://lkml.kernel.org/r/[email protected]
> Signed-off-by: Ingo Molnar <[email protected]>
> ---
> include/linux/sched.h | 13 +++++
> kernel/sched/core.c | 5 ++
> kernel/sched/debug.c | 4 ++
> kernel/sched/fair.c | 129 +++++++++++++++++++++++++++++++++++++++++++++++++
> 4 files changed, 151 insertions(+), 0 deletions(-)
>
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 0dd42a0..418fc6d 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -1095,6 +1095,16 @@ struct load_weight {
> unsigned long weight, inv_weight;
> };
>
> +struct sched_avg {
> + /*
> + * These sums represent an infinite geometric series and so are bound
> + * above by 1024/(1-y). Thus we only need a u32 to store them for all
> + * choices of y < 1-2^(-32)*1024.
> + */
> + u32 runnable_avg_sum, runnable_avg_period;
> + u64 last_runnable_update;
> +};
> +
> #ifdef CONFIG_SCHEDSTATS
> struct sched_statistics {
> u64 wait_start;
> @@ -1155,6 +1165,9 @@ struct sched_entity {
> /* rq "owned" by this entity/group: */
> struct cfs_rq *my_q;
> #endif
> +#ifdef CONFIG_SMP
> + struct sched_avg avg;
> +#endif
> };
>
> struct sched_rt_entity {
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 2d8927f..fd9d085 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -1524,6 +1524,11 @@ static void __sched_fork(struct task_struct *p)
> p->se.vruntime = 0;
> INIT_LIST_HEAD(&p->se.group_node);
>
> +#ifdef CONFIG_SMP
> + p->se.avg.runnable_avg_period = 0;
> + p->se.avg.runnable_avg_sum = 0;
> +#endif
> +
> #ifdef CONFIG_SCHEDSTATS
> memset(&p->se.statistics, 0, sizeof(p->se.statistics));
> #endif
> diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
> index 6f79596..61f7097 100644
> --- a/kernel/sched/debug.c
> +++ b/kernel/sched/debug.c
> @@ -85,6 +85,10 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
> P(se->statistics.wait_count);
> #endif
> P(se->load.weight);
> +#ifdef CONFIG_SMP
> + P(se->avg.runnable_avg_sum);
> + P(se->avg.runnable_avg_period);
> +#endif
> #undef PN
> #undef P
> }
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 6b800a1..16d67f9 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -971,6 +971,126 @@ static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq)
> }
> #endif /* CONFIG_FAIR_GROUP_SCHED */
>
> +#ifdef CONFIG_SMP
> +/*
> + * Approximate:
> + * val * y^n, where y^32 ~= 0.5 (~1 scheduling period)
> + */
> +static __always_inline u64 decay_load(u64 val, u64 n)
> +{
> + for (; n && val; n--) {
> + val *= 4008;
> + val >>= 12;
> + }
> +
> + return val;
> +}
> +
> +/*
> + * We can represent the historical contribution to runnable average as the
> + * coefficients of a geometric series. To do this we sub-divide our runnable
> + * history into segments of approximately 1ms (1024us); label the segment that
> + * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g.
> + *
> + * [<- 1024us ->|<- 1024us ->|<- 1024us ->| ...
> + * p0 p1 p2
> + * (now) (~1ms ago) (~2ms ago)
> + *
> + * Let u_i denote the fraction of p_i that the entity was runnable.
> + *
> + * We then designate the fractions u_i as our co-efficients, yielding the
> + * following representation of historical load:
> + * u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ...
> + *
> + * We choose y based on the width of a reasonable scheduling period, fixing:
> + * y^32 = 0.5
> + *
> + * This means that the contribution to load ~32ms ago (u_32) will be weighted
> + * approximately half as much as the contribution to load within the last ms
> + * (u_0).
> + *
> + * When a period "rolls over" and we have new u_0`, multiplying the previous
> + * sum again by y is sufficient to update:
> + * load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
> + * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
> + */
> +static __always_inline int __update_entity_runnable_avg(u64 now,
> + struct sched_avg *sa,
> + int runnable)
> +{
> + u64 delta;
> + int delta_w, decayed = 0;
> +
> + delta = now - sa->last_runnable_update;
> + /*
> + * This should only happen when time goes backwards, which it
> + * unfortunately does during sched clock init when we swap over to TSC.
> + */
> + if ((s64)delta < 0) {
> + sa->last_runnable_update = now;
> + return 0;
> + }
> +
> + /*
> + * Use 1024ns as the unit of measurement since it's a reasonable
> + * approximation of 1us and fast to compute.
> + */
> + delta >>= 10;
> + if (!delta)
> + return 0;
> + sa->last_runnable_update = now;
> +
> + /* delta_w is the amount already accumulated against our next period */
> + delta_w = sa->runnable_avg_period % 1024;
> + if (delta + delta_w >= 1024) {
> + /* period roll-over */
> + decayed = 1;
> +
> + /*
> + * Now that we know we're crossing a period boundary, figure
> + * out how much from delta we need to complete the current
> + * period and accrue it.
> + */
> + delta_w = 1024 - delta_w;
> + BUG_ON(delta_w > delta);
> + do {
> + if (runnable)
> + sa->runnable_avg_sum += delta_w;
> + sa->runnable_avg_period += delta_w;
> +
> + /*
> + * Remainder of delta initiates a new period, roll over
> + * the previous.
> + */
> + sa->runnable_avg_sum =
> + decay_load(sa->runnable_avg_sum, 1);

Is this u0+u1*y+u2*y^2+u3*y^3 ...,
seems no, this is u0+u1*y+u2*y+u3*y+u4*y ...

> + sa->runnable_avg_period =
> + decay_load(sa->runnable_avg_period, 1);
> +
> + delta -= delta_w;
> + /* New period is empty */
> + delta_w = 1024;
> + } while (delta >= 1024);
> + }
> +
> + /* Remainder of delta accrued against u_0` */
> + if (runnable)
> + sa->runnable_avg_sum += delta;
> + sa->runnable_avg_period += delta;
> +
> + return decayed;
> +}
> +
> +/* Update a sched_entity's runnable average */
> +static inline void update_entity_load_avg(struct sched_entity *se)
> +{
> + __update_entity_runnable_avg(rq_of(cfs_rq_of(se))->clock_task, &se->avg,
> + se->on_rq);
> +}
> +#else
> +static inline void update_entity_load_avg(struct sched_entity *se) {}
> +#endif
> +
> static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
> {
> #ifdef CONFIG_SCHEDSTATS
> @@ -1097,6 +1217,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
> */
> update_curr(cfs_rq);
> update_cfs_load(cfs_rq, 0);
> + update_entity_load_avg(se);
> account_entity_enqueue(cfs_rq, se);
> update_cfs_shares(cfs_rq);
>
> @@ -1171,6 +1292,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
> * Update run-time statistics of the 'current'.
> */
> update_curr(cfs_rq);
> + update_entity_load_avg(se);
>
> update_stats_dequeue(cfs_rq, se);
> if (flags & DEQUEUE_SLEEP) {
> @@ -1340,6 +1462,8 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
> update_stats_wait_start(cfs_rq, prev);
> /* Put 'current' back into the tree. */
> __enqueue_entity(cfs_rq, prev);
> + /* in !on_rq case, update occurred at dequeue */
> + update_entity_load_avg(prev);
> }
> cfs_rq->curr = NULL;
> }
> @@ -1353,6 +1477,11 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
> update_curr(cfs_rq);
>
> /*
> + * Ensure that runnable average is periodically updated.
> + */
> + update_entity_load_avg(curr);
> +
> + /*
> * Update share accounting for long-running entities.
> */
> update_entity_shares_tick(cfs_rq);
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/

--
liguang [email protected]
FNST linux kernel team

2012-10-25 16:58:31

by Benjamin Segall

[permalink] [raw]
Subject: Re: [tip:sched/core] sched: Track the runnable average on a per-task entity basis

li guang <[email protected]> writes:

> 在 2012-10-24三的 02:43 -0700,tip-bot for Paul Turner写道:
>> + do {
>> + if (runnable)
>> + sa->runnable_avg_sum += delta_w;
>> + sa->runnable_avg_period += delta_w;
>> +
>> + /*
>> + * Remainder of delta initiates a new period, roll over
>> + * the previous.
>> + */
>> + sa->runnable_avg_sum =
>> + decay_load(sa->runnable_avg_sum, 1);
>
> Is this u0+u1*y+u2*y^2+u3*y^3 ...,
> seems no, this is u0+u1*y+u2*y+u3*y+u4*y ...
>
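It is cumulative: each pass through the loop decays the entire running sum (which already contains the earlier, previously decayed periods) by y, so it is u0+y*(u1+y*(u2+..., which expands to u0+u1*y+u2*y^2+...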