2012-08-23 14:16:20

by Paul Turner

Subject: [patch 02/16] sched: maintain per-rq runnable averages

From: Ben Segall <[email protected]>

Since runqueues do not have a corresponding sched_entity we instead embed a
sched_avg structure directly.

Signed-off-by: Ben Segall <[email protected]>
Reviewed-by: Paul Turner <[email protected]>
---
kernel/sched/debug.c | 10 ++++++++--
kernel/sched/fair.c | 18 ++++++++++++++++--
kernel/sched/sched.h | 2 ++
3 files changed, 26 insertions(+), 4 deletions(-)

diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 61f7097..4240abc 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -61,14 +61,20 @@ static unsigned long nsec_low(unsigned long long nsec)
static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group *tg)
{
struct sched_entity *se = tg->se[cpu];
- if (!se)
- return;

#define P(F) \
SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)F)
#define PN(F) \
SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F))

+ if (!se) {
+ struct sched_avg *avg = &cpu_rq(cpu)->avg;
+ P(avg->runnable_avg_sum);
+ P(avg->runnable_avg_period);
+ return;
+ }
+
+
PN(se->exec_start);
PN(se->vruntime);
PN(se->sum_exec_runtime);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 2c53263..6eb2ce2 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1086,8 +1086,14 @@ static inline void update_entity_load_avg(struct sched_entity *se)
__update_entity_runnable_avg(rq_of(cfs_rq_of(se))->clock_task, &se->avg,
se->on_rq);
}
+
+static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
+{
+ __update_entity_runnable_avg(rq->clock_task, &rq->avg, runnable);
+}
#else
static inline void update_entity_load_avg(struct sched_entity *se) {}
+static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {}
#endif

static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -2339,8 +2345,10 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
update_cfs_shares(cfs_rq);
}

- if (!se)
+ if (!se) {
+ update_rq_runnable_avg(rq, rq->nr_running);
inc_nr_running(rq);
+ }
hrtick_update(rq);
}

@@ -2398,8 +2406,10 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
update_cfs_shares(cfs_rq);
}

- if (!se)
+ if (!se) {
dec_nr_running(rq);
+ update_rq_runnable_avg(rq, 1);
+ }
hrtick_update(rq);
}

@@ -4573,6 +4583,8 @@ void idle_balance(int this_cpu, struct rq *this_rq)
if (this_rq->avg_idle < sysctl_sched_migration_cost)
return;

+ update_rq_runnable_avg(this_rq, 1);
+
/*
* Drop the rq->lock, but keep IRQ/preempt disabled.
*/
@@ -5071,6 +5083,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
cfs_rq = cfs_rq_of(se);
entity_tick(cfs_rq, se, queued);
}
+
+ update_rq_runnable_avg(rq, 1);
}

/*
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 804c2e5..eb61c75 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -467,6 +467,8 @@ struct rq {
#ifdef CONFIG_SMP
struct llist_head wake_list;
#endif
+
+ struct sched_avg avg;
};

static inline int cpu_of(struct rq *rq)


2012-10-24 09:46:01

by Benjamin Segall

Subject: [tip:sched/core] sched: Maintain per-rq runnable averages

Commit-ID: 18bf2805d9b30cb823d4919b42cd230f59c7ce1f
Gitweb: http://git.kernel.org/tip/18bf2805d9b30cb823d4919b42cd230f59c7ce1f
Author: Ben Segall <[email protected]>
AuthorDate: Thu, 4 Oct 2012 12:51:20 +0200
Committer: Ingo Molnar <[email protected]>
CommitDate: Wed, 24 Oct 2012 10:27:20 +0200

sched: Maintain per-rq runnable averages

Since runqueues do not have a corresponding sched_entity we instead embed a
sched_avg structure directly.

Signed-off-by: Ben Segall <[email protected]>
Reviewed-by: Paul Turner <[email protected]>
Signed-off-by: Peter Zijlstra <[email protected]>
Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Ingo Molnar <[email protected]>
---
kernel/sched/debug.c | 10 ++++++++--
kernel/sched/fair.c | 18 ++++++++++++++++--
kernel/sched/sched.h | 2 ++
3 files changed, 26 insertions(+), 4 deletions(-)

diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 61f7097..4240abc 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -61,14 +61,20 @@ static unsigned long nsec_low(unsigned long long nsec)
static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group *tg)
{
struct sched_entity *se = tg->se[cpu];
- if (!se)
- return;

#define P(F) \
SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)F)
#define PN(F) \
SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F))

+ if (!se) {
+ struct sched_avg *avg = &cpu_rq(cpu)->avg;
+ P(avg->runnable_avg_sum);
+ P(avg->runnable_avg_period);
+ return;
+ }
+
+
PN(se->exec_start);
PN(se->vruntime);
PN(se->sum_exec_runtime);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 16d67f9..8c5468f 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1087,8 +1087,14 @@ static inline void update_entity_load_avg(struct sched_entity *se)
__update_entity_runnable_avg(rq_of(cfs_rq_of(se))->clock_task, &se->avg,
se->on_rq);
}
+
+static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
+{
+ __update_entity_runnable_avg(rq->clock_task, &rq->avg, runnable);
+}
#else
static inline void update_entity_load_avg(struct sched_entity *se) {}
+static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {}
#endif

static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -2340,8 +2346,10 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
update_cfs_shares(cfs_rq);
}

- if (!se)
+ if (!se) {
+ update_rq_runnable_avg(rq, rq->nr_running);
inc_nr_running(rq);
+ }
hrtick_update(rq);
}

@@ -2399,8 +2407,10 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
update_cfs_shares(cfs_rq);
}

- if (!se)
+ if (!se) {
dec_nr_running(rq);
+ update_rq_runnable_avg(rq, 1);
+ }
hrtick_update(rq);
}

@@ -4586,6 +4596,8 @@ void idle_balance(int this_cpu, struct rq *this_rq)
if (this_rq->avg_idle < sysctl_sched_migration_cost)
return;

+ update_rq_runnable_avg(this_rq, 1);
+
/*
* Drop the rq->lock, but keep IRQ/preempt disabled.
*/
@@ -5083,6 +5095,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
cfs_rq = cfs_rq_of(se);
entity_tick(cfs_rq, se, queued);
}
+
+ update_rq_runnable_avg(rq, 1);
}

/*
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 7a7db09..14b5719 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -467,6 +467,8 @@ struct rq {
#ifdef CONFIG_SMP
struct llist_head wake_list;
#endif
+
+ struct sched_avg avg;
};

static inline int cpu_of(struct rq *rq)

2012-10-28 10:12:29

by Preeti Murthy

Subject: Re: [patch 02/16] sched: maintain per-rq runnable averages

Hi Paul, Ben,

A few queries regarding this patch:

1. What exactly is the significance of introducing a sched_avg structure
for a runqueue? If I have understood correctly, sched_avg keeps track of
how long a task has been active, how long it has been serviced by the
processor, and its lifetime. How does this apply analogously to the
runqueue?

2. Is this the right measure to overwrite rq->load.weight, given that
rq->sched_avg does not seem to take care of task priorities? IOW, what
is the idea behind introducing this metric for the runqueue? Why can't
the run queue load be updated the same way as the cfs_rq load is
updated: cfs_rq->runnable_load_avg and cfs_rq->blocked_load_avg?

3. What is the significance of passing rq->nr_running in
enqueue_task_fair while updating the run queue load? Because
__update_entity_runnable_avg does not treat this argument any
differently if it is > 1.

Thank you

Regards
Preeti U Murthy

On Thu, Aug 23, 2012 at 7:44 PM, <[email protected]> wrote:
> From: Ben Segall <[email protected]>
>
> Since runqueues do not have a corresponding sched_entity we instead embed a
> sched_avg structure directly.
>
> Signed-off-by: Ben Segall <[email protected]>
> Reviewed-by: Paul Turner <[email protected]>
> ---

2012-10-29 17:38:22

by Benjamin Segall

Subject: Re: [patch 02/16] sched: maintain per-rq runnable averages

Preeti Murthy <[email protected]> writes:

> Hi Paul, Ben,
>
> A few queries regarding this patch:
>
> 1. What exactly is the significance of introducing a sched_avg structure
> for a runqueue? If I have understood correctly, sched_avg keeps track of
> how long a task has been active, how long it has been serviced by the
> processor, and its lifetime. How does this apply analogously to the
> runqueue?

Remember that sched_avg's are not just for tasks, they're for any CFS
group entity (sched_entity), for which they track the time runnable and
the time used, which allows the system-wide per-task_group computation
of runnable and usage.

Computing these on the root has no usage in this patchset, but any
extensions of this using hierarchy-based fractional usage or runnable
time would need it, and retrofitting it afterwards would be a pain.
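
To make that bookkeeping concrete, here is a rough, stand-alone sketch of
the decayed accumulation that __update_entity_runnable_avg performs. The
~1ms step and the y^32 = 1/2 half-life match the constants used elsewhere
in this series, but the names and floating-point math below are purely
illustrative; the kernel code works on integer sums and decays period by
period.

#include <math.h>
#include <stdint.h>

/* Illustrative stand-in for struct sched_avg (names are made up). */
struct toy_sched_avg {
	double runnable_avg_sum;	/* decayed time spent runnable */
	double runnable_avg_period;	/* decayed total time observed */
	uint64_t last_update_us;	/* time of last update, in us */
};

/*
 * Approximation of __update_entity_runnable_avg(): decay the existing
 * history so that it halves every 32 * 1024us, then account the elapsed
 * window, crediting runnable_avg_sum only if we were runnable over it.
 */
static void toy_update_runnable_avg(struct toy_sched_avg *sa,
				    uint64_t now_us, int runnable)
{
	uint64_t delta = now_us - sa->last_update_us;
	double decay = pow(0.5, (double)delta / (32.0 * 1024.0));

	sa->runnable_avg_sum *= decay;
	sa->runnable_avg_period *= decay;
	if (runnable)
		sa->runnable_avg_sum += delta;
	sa->runnable_avg_period += delta;
	sa->last_update_us = now_us;
	/* runnable fraction ~= runnable_avg_sum / runnable_avg_period */
}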
>
> 2. Is this the right measure to overwrite rq->load.weight, given that
> rq->sched_avg does not seem to take care of task priorities? IOW, what
> is the idea behind introducing this metric for the runqueue? Why can't
> the run queue load be updated the same way as the cfs_rq load is
> updated: cfs_rq->runnable_load_avg and cfs_rq->blocked_load_avg?

Loadwise you would indeed want the cfs_rq statistics, that is what they
are there for. The sched_avg numbers are only useful in computing the
parent's load (irrelevant on the root), or for extensions using raw
usage/runnable numbers.
>
> 3. What is the significance of passing rq->nr_running in
> enqueue_task_fair while updating the run queue load? Because
> __update_entity_runnable_avg does not treat this argument any
> differently if it is > 1.

That could just as well be rq->nr_running != 0, it would behave the same.
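
In other words, inside __update_entity_runnable_avg the argument is only
ever tested for zero versus non-zero; its magnitude never scales anything.
Reusing the illustrative toy_sched_avg sketch above, the gist is:

/*
 * Illustrative only: any non-zero 'runnable' behaves identically, so
 * passing rq->nr_running or (rq->nr_running != 0) is equivalent.
 */
static void toy_accumulate(struct toy_sched_avg *sa, uint64_t delta_us,
			   int runnable)
{
	if (runnable)
		sa->runnable_avg_sum += delta_us;
	sa->runnable_avg_period += delta_us;
}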

2012-11-07 08:28:57

by Preeti U Murthy

Subject: Re: [patch 02/16] sched: maintain per-rq runnable averages

On 10/29/2012 11:08 PM, Benjamin Segall wrote:
> Preeti Murthy <[email protected]> writes:
>
>> Hi Paul, Ben,
>>
>> A few queries regarding this patch:
>>
>> 1. What exactly is the significance of introducing a sched_avg structure
>> for a runqueue? If I have understood correctly, sched_avg keeps track of
>> how long a task has been active, how long it has been serviced by the
>> processor, and its lifetime. How does this apply analogously to the
>> runqueue?
>
> Remember that sched_avg's are not just for tasks, they're for any CFS
> group entity (sched_entity), for which they track the time runnable and
> the time used, which allows the system-wide per-task_group computation
> of runnable and usage.
>
> Computing these on the root has no usage in this patchset, but any
> extensions of this using hierarchy-based fractional usage or runnable
> time would need it, and retrofitting it afterwards would be a pain.
>>
>> 2. Is this the right measure to overwrite rq->load.weight, given that
>> rq->sched_avg does not seem to take care of task priorities? IOW, what
>> is the idea behind introducing this metric for the runqueue? Why can't
>> the run queue load be updated the same way as the cfs_rq load is
>> updated: cfs_rq->runnable_load_avg and cfs_rq->blocked_load_avg?
>
> Loadwise you would indeed want the cfs_rq statistics, that is what they
> are there for. The sched_avg numbers are only useful in computing the
> parent's load (irrelevant on the root), or for extensions using raw
> usage/runnable numbers.
>>
>> 3. What is the significance of passing rq->nr_running in
>> enqueue_task_fair while updating the run queue load? Because
>> __update_entity_runnable_avg does not treat this argument any
>> differently if it is > 1.
>
> That could just as well be rq->nr_running != 0, it would behave the same.

Hi Ben,
After going through your suggestions, below is a patch with which I wish
to begin my effort to integrate the per-entity-load-tracking metric with
the scheduler. I had posted a patchset earlier
(https://lkml.org/lkml/2012/10/25/162), but due to various drawbacks I am
redoing it along the lines of the suggestions posted in reply to it.
Please do let me know if I am using the metric in the right way. Thanks.

Regards
Preeti U Murthy

------------START OF PATCH--------------------------------------------

Since load balancing requires the runqueue load to track the load of the
sched groups, and hence the sched domains, introduce the cfs_rq-equivalent
metric of current runnable load on the run queue as well.

The idea is something like this:

1. The entire load balancing framework is hinged upon what
weighted_cpuload() has to say about the load of the sched group, which in
turn adds up to the weight of the sched domain and will ultimately be used
to decide whether to load balance and to calculate the imbalance.

2. Currently weighted_cpuload() returns rq->load.weight, but it needs to
use the per-entity-load-tracking metric to reflect the runqueue load. So
it needs to be replaced with rq->runnable_load_avg (a rough sketch of that
eventual change is shown after this list).

3. This is the first step towards integrating the per-entity-load-tracking
metric with the load balancer.
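
For context, a rough sketch of the consumer-side change described in point
2, assuming weighted_cpuload() keeps its current signature. It is not part
of the patch below, which only introduces and maintains the new field:

/*
 * Sketch only: switch the load balancer's per-cpu load from the
 * instantaneous rq->load.weight to the tracked runnable average.
 */
static unsigned long weighted_cpuload(const int cpu)
{
	return cpu_rq(cpu)->runnable_load_avg;
}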

Signed-off-by: Preeti U Murthy
---
kernel/sched/fair.c | 9 ++++++++-
kernel/sched/sched.h | 1 +
2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index a9cdc8f..6c89b28 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1499,8 +1499,11 @@ static inline void update_entity_load_avg(struct sched_entity *se,
if (!update_cfs_rq)
return;

- if (se->on_rq)
+ if (se->on_rq) {
cfs_rq->runnable_load_avg += contrib_delta;
+ if (!parent_entity(se))
+ rq_of(cfs_rq)->runnable_load_avg += contrib_delta;
+ }
else
subtract_blocked_load_contrib(cfs_rq, -contrib_delta);
}
@@ -1579,6 +1582,8 @@ static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
}

cfs_rq->runnable_load_avg += se->avg.load_avg_contrib;
+ if (!parent_entity(se))
+ rq_of(cfs_rq)->runnable_load_avg += se->avg.load_avg_contrib;
/* we force update consideration on load-balancer moves */
update_cfs_rq_blocked_load(cfs_rq, !wakeup);
}
@@ -1597,6 +1602,8 @@ static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
update_cfs_rq_blocked_load(cfs_rq, !sleep);

cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib;
+ if (!parent_entity(se))
+ rq_of(cfs_rq)->runnable_load_avg -= se->avg.load_avg_contrib;
if (sleep) {
cfs_rq->blocked_load_avg += se->avg.load_avg_contrib;
se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index bfd004a..3001d97 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -382,6 +382,7 @@ struct rq {
/* list of leaf cfs_rq on this cpu: */
struct list_head leaf_cfs_rq_list;
#ifdef CONFIG_SMP
+ u64 runnable_load_avg;
unsigned long h_load_throttle;
#endif /* CONFIG_SMP */
#endif /* CONFIG_FAIR_GROUP_SCHED */