2017-09-01 13:40:15

by Peter Zijlstra

Subject: [PATCH -v2 04/18] sched/fair: Remove se->load.weight from se->avg.load_sum

Remove the load from the load_sum for sched_entities, basically
turning load_sum into runnable_sum. This prepares for better
reweighting of group entities.

Since we now have different rules for computing load_avg, split
___update_load_avg() into two parts, ___update_load_sum() and
___update_load_avg().

So for se:

___update_load_sum(.weight = 1)
___update_load_avg(.weight = se->load.weight)

and for cfs_rq:

___update_load_sum(.weight = cfs_rq->load.weight)
___update_load_avg(.weight = 1)

Since the primary consumable is load_avg, most things will not be
affected. Only those few sites that initialize/modify load_sum need
attention.

Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
---
kernel/sched/fair.c | 91 ++++++++++++++++++++++++++++++++++++----------------
1 file changed, 64 insertions(+), 27 deletions(-)
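
A minimal userspace sketch of the split (not the kernel code; the helper
names, the fixed divider and the always-runnable numbers below are made up
for illustration): the se variant applies the weight only when the sum is
turned into an average, while the cfs_rq variant already carries the weight
inside its sum and therefore divides by the plain divider.

#include <stdio.h>
#include <stdint.h>

#define LOAD_AVG_MAX	47742	/* maximum value of \Sum 1024*y^n */

/* sched_entity: load_sum is a pure runnable_sum, weight applied at avg time */
static unsigned long se_load_avg(uint64_t runnable_sum, unsigned long weight,
				 uint32_t divider)
{
	return weight * runnable_sum / divider;
}

/* cfs_rq: the weight is already folded into load_sum, so .weight = 1 here */
static unsigned long cfs_rq_load_avg(uint64_t weighted_sum, uint32_t divider)
{
	return weighted_sum / divider;
}

int main(void)
{
	uint32_t divider = LOAD_AVG_MAX;	/* ignoring -1024 + period_contrib */
	uint64_t runnable_sum = LOAD_AVG_MAX;	/* an always-runnable entity */

	/* both end up at the entity's weight (NICE_0: 1024) */
	printf("se     load_avg: %lu\n", se_load_avg(runnable_sum, 1024, divider));
	printf("cfs_rq load_avg: %lu\n", cfs_rq_load_avg(1024 * runnable_sum, divider));
	return 0;
}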

--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -744,7 +744,7 @@ void init_entity_runnable_average(struct
*/
if (entity_is_task(se))
sa->load_avg = scale_load_down(se->load.weight);
- sa->load_sum = sa->load_avg * LOAD_AVG_MAX;
+ sa->load_sum = LOAD_AVG_MAX;
/*
* At this point, util_avg won't be used in select_task_rq_fair anyway
*/
@@ -1967,7 +1967,7 @@ static u64 numa_get_avg_runtime(struct t
delta = runtime - p->last_sum_exec_runtime;
*period = now - p->last_task_numa_placement;
} else {
- delta = p->se.avg.load_sum / p->se.load.weight;
+ delta = p->se.avg.load_sum;
*period = LOAD_AVG_MAX;
}

@@ -2872,8 +2872,8 @@ accumulate_sum(u64 delta, int cpu, struc
* = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
*/
static __always_inline int
-___update_load_avg(u64 now, int cpu, struct sched_avg *sa,
- unsigned long weight, int running, struct cfs_rq *cfs_rq)
+___update_load_sum(u64 now, int cpu, struct sched_avg *sa,
+ unsigned long weight, int running, struct cfs_rq *cfs_rq)
{
u64 delta;

@@ -2907,39 +2907,80 @@ ___update_load_avg(u64 now, int cpu, str
if (!accumulate_sum(delta, cpu, sa, weight, running, cfs_rq))
return 0;

+ return 1;
+}
+
+static __always_inline void
+___update_load_avg(struct sched_avg *sa, unsigned long weight, struct cfs_rq *cfs_rq)
+{
+ u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib;
+
/*
* Step 2: update *_avg.
*/
- sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX - 1024 + sa->period_contrib);
+ sa->load_avg = div_u64(weight * sa->load_sum, divider);
if (cfs_rq) {
cfs_rq->runnable_load_avg =
- div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX - 1024 + sa->period_contrib);
+ div_u64(cfs_rq->runnable_load_sum, divider);
}
- sa->util_avg = sa->util_sum / (LOAD_AVG_MAX - 1024 + sa->period_contrib);
+ sa->util_avg = sa->util_sum / divider;
+}

- return 1;
+/*
+ * XXX we want to get rid of this helper and use the full load resolution.
+ */
+static inline long se_weight(struct sched_entity *se)
+{
+ return scale_load_down(se->load.weight);
}

+/*
+ * sched_entity:
+ *
+ * load_sum := runnable_sum
+ * load_avg = se_weight(se) * runnable_avg
+ *
+ * cfq_rs:
+ *
+ * load_sum = \Sum se_weight(se) * se->avg.load_sum
+ * load_avg = \Sum se->avg.load_avg
+ */
+
static int
__update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se)
{
- return ___update_load_avg(now, cpu, &se->avg, 0, 0, NULL);
+ if (___update_load_sum(now, cpu, &se->avg, 0, 0, NULL)) {
+ ___update_load_avg(&se->avg, se_weight(se), NULL);
+ return 1;
+ }
+
+ return 0;
}

static int
__update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- return ___update_load_avg(now, cpu, &se->avg,
- se->on_rq * scale_load_down(se->load.weight),
- cfs_rq->curr == se, NULL);
+ if (___update_load_sum(now, cpu, &se->avg, !!se->on_rq,
+ cfs_rq->curr == se, NULL)) {
+
+ ___update_load_avg(&se->avg, se_weight(se), NULL);
+ return 1;
+ }
+
+ return 0;
}

static int
__update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq)
{
- return ___update_load_avg(now, cpu, &cfs_rq->avg,
- scale_load_down(cfs_rq->load.weight),
- cfs_rq->curr != NULL, cfs_rq);
+ if (___update_load_sum(now, cpu, &cfs_rq->avg,
+ scale_load_down(cfs_rq->load.weight),
+ cfs_rq->curr != NULL, cfs_rq)) {
+ ___update_load_avg(&cfs_rq->avg, 1, cfs_rq);
+ return 1;
+ }
+
+ return 0;
}

/*
@@ -3110,7 +3151,7 @@ update_tg_cfs_load(struct cfs_rq *cfs_rq

/* Set new sched_entity's load */
se->avg.load_avg = load;
- se->avg.load_sum = se->avg.load_avg * LOAD_AVG_MAX;
+ se->avg.load_sum = LOAD_AVG_MAX;

/* Update parent cfs_rq load */
add_positive(&cfs_rq->avg.load_avg, delta);
@@ -3340,7 +3381,7 @@ static void attach_entity_load_avg(struc
{
se->avg.last_update_time = cfs_rq->avg.last_update_time;
cfs_rq->avg.load_avg += se->avg.load_avg;
- cfs_rq->avg.load_sum += se->avg.load_sum;
+ cfs_rq->avg.load_sum += se_weight(se) * se->avg.load_sum;
cfs_rq->avg.util_avg += se->avg.util_avg;
cfs_rq->avg.util_sum += se->avg.util_sum;
set_tg_cfs_propagate(cfs_rq);
@@ -3360,7 +3401,7 @@ static void detach_entity_load_avg(struc
{

sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
- sub_positive(&cfs_rq->avg.load_sum, se->avg.load_sum);
+ sub_positive(&cfs_rq->avg.load_sum, se_weight(se) * se->avg.load_sum);
sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
set_tg_cfs_propagate(cfs_rq);
@@ -3372,12 +3413,10 @@ static void detach_entity_load_avg(struc
static inline void
enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- struct sched_avg *sa = &se->avg;
-
- cfs_rq->runnable_load_avg += sa->load_avg;
- cfs_rq->runnable_load_sum += sa->load_sum;
+ cfs_rq->runnable_load_avg += se->avg.load_avg;
+ cfs_rq->runnable_load_sum += se_weight(se) * se->avg.load_sum;

- if (!sa->last_update_time) {
+ if (!se->avg.last_update_time) {
attach_entity_load_avg(cfs_rq, se);
update_tg_load_avg(cfs_rq, 0);
}
@@ -3387,10 +3426,8 @@ enqueue_entity_load_avg(struct cfs_rq *c
static inline void
dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- cfs_rq->runnable_load_avg =
- max_t(long, cfs_rq->runnable_load_avg - se->avg.load_avg, 0);
- cfs_rq->runnable_load_sum =
- max_t(s64, cfs_rq->runnable_load_sum - se->avg.load_sum, 0);
+ sub_positive(&cfs_rq->runnable_load_avg, se->avg.load_avg);
+ sub_positive(&cfs_rq->runnable_load_sum, se_weight(se) * se->avg.load_sum);
}

#ifndef CONFIG_64BIT



2017-09-29 15:26:16

by Morten Rasmussen

Subject: Re: [PATCH -v2 04/18] sched/fair: Remove se->load.weight from se->avg.load_sum

On Fri, Sep 01, 2017 at 03:21:03PM +0200, Peter Zijlstra wrote:
> +/*
> + * sched_entity:
> + *
> + * load_sum := runnable_sum
> + * load_avg = se_weight(se) * runnable_avg
> + *
> + * cfq_rs:

I think this should be "cfs_rq" instead.

> + *
> + * load_sum = \Sum se_weight(se) * se->avg.load_sum
> + * load_avg = \Sum se->avg.load_avg
> + */

I find it a bit confusing that load_sum and load_avg have different
definitions, but I guess I will discover why dropping weight from
se->avg.load_sum helps a bit later. We can't do the same for cfs_rq as
it is a \Sum of sums where we add/remove contributions when tasks
migrate.

Have we defined the relation between runnable_sum and runnable_avg in a
comment somewhere already? Otherwise it might be helpful to add. It is
in the code of course :-)

2017-09-29 16:39:28

by Peter Zijlstra

Subject: Re: [PATCH -v2 04/18] sched/fair: Remove se->load.weight from se->avg.load_sum

On Fri, Sep 29, 2017 at 04:26:10PM +0100, Morten Rasmussen wrote:
> On Fri, Sep 01, 2017 at 03:21:03PM +0200, Peter Zijlstra wrote:
> > +/*
> > + * sched_entity:
> > + *
> > + * load_sum := runnable_sum
> > + * load_avg = se_weight(se) * runnable_avg
> > + *
> > + * cfq_rs:
>
> I think this should be "cfs_rq" instead.

Quite.

> > + *
> > + * load_sum = \Sum se_weight(se) * se->avg.load_sum
> > + * load_avg = \Sum se->avg.load_avg
> > + */
>
> I find it a bit confusing that load_sum and load_avg have different
> definitions, but I guess I will discover why dropping weight from
> se->avg.load_sum helps a bit later.

It makes changing the weight cheaper: we don't have to divide the old
weight out and then multiply by the new weight.

Up until this patch set, we mixed the old and new weight in the sum,
with the result that a weight change took a while to propagate.

But we want to change weight instantly. See:

sched/fair: More accurate reweight_entity()
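
As a toy userspace sketch of that difference (not the kernel code; the
helpers and numbers are made up): under the old scheme the weight sits
inside load_sum, so a reweight has to divide the old weight out of the sum
and multiply the new one in; under the new scheme the runnable sum is left
alone and only the avg is recomputed.

#include <stdio.h>
#include <stdint.h>

#define LOAD_AVG_MAX	47742

/* old: weight lives in the sum, so changing it rewrites the sum */
static void reweight_old(uint64_t *weighted_sum, unsigned long *avg,
			 unsigned long old_w, unsigned long new_w)
{
	*weighted_sum = *weighted_sum / old_w * new_w;
	*avg = *weighted_sum / LOAD_AVG_MAX;
}

/* new: the sum is a pure runnable_sum, reweight only touches the avg */
static void reweight_new(uint64_t runnable_sum, unsigned long *avg,
			 unsigned long new_w)
{
	*avg = new_w * runnable_sum / LOAD_AVG_MAX;
}

int main(void)
{
	uint64_t wsum = 1024ULL * LOAD_AVG_MAX;	/* old style, weight 1024 folded in */
	uint64_t rsum = LOAD_AVG_MAX;		/* new style, always runnable */
	unsigned long avg;

	reweight_old(&wsum, &avg, 1024, 2048);
	printf("old scheme avg: %lu\n", avg);	/* 2048, but the sum was rewritten */

	reweight_new(rsum, &avg, 2048);
	printf("new scheme avg: %lu\n", avg);	/* 2048, sum untouched */
	return 0;
}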

> We can't do the same for cfs_rq as it is a \Sum of sums where we
> add/remove contributions when tasks migrate.

Just so. The patch:

sched/fair: Rewrite PELT migration propagation

Has text on this:

+ * Because while for entities historical weight is not important and we
+ * really only care about our future and therefore can consider a pure
+ * runnable sum, runqueues can NOT do this.
+ *
+ * We specifically want runqueues to have a load_avg that includes
+ * historical weights. Those represent the blocked load, the load we expect
+ * to (shortly) return to us. This only works by keeping the weights as
+ * integral part of the sum. We therefore cannot decompose as per (3).
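
Put differently: for a single entity the weight is (piecewise) constant, so
\Sum w*u_i*y^i = w * \Sum u_i*y^i and the weight can be pulled out of the
sum and applied at avg time; a runqueue's sum spans many entities with
different, historical weights, so no such factorization exists and the
weights have to stay inside the cfs_rq's load_sum.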

> Have we defined the relation between runnable_sum and runnable_avg in a
> comment somewhere already? Otherwise it might be helpful to add. It is
> in the code of course :-)

I think this comment here is about it... But who knows. Please keep this
as a note while you read the rest of the series. If you find it's still
insufficiently addressed at the end, I'm sure we can do another patch
with moar comments still ;-)
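
For reference, the relation as it stands in ___update_load_avg() above is:

	runnable_avg = runnable_sum / (LOAD_AVG_MAX - 1024 + period_contrib)

where the divider is the maximum the decayed sum could have reached given
the partially elapsed current period; IOW runnable_avg is the fraction of
time the entity was runnable, and the se's load_avg is that fraction scaled
by se_weight(se).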