LinuxLists.cc - [PATCH 1/3] Added runqueue clock normalized with cpufreq

[permalink] [raw]

Subject: [PATCH 2/3] cpufreq normalized runtime to enforce runtime cycles also at lower frequencies.

This patch do the actual changes to sched deadline v3 to
utilize the normalized runtime clock. Note that the
deadline/periods still use the regular runtime clock.

Change-Id: I75c88676e9e18a71d94d6c4e779b376a7ac0615f

Signed-off-by: Harald Gustafsson <[email protected]>
---
include/linux/sched.h | 6 +++
kernel/sched.c | 2 +
kernel/sched_dl.c | 82 +++++++++++++++++++++++++++++++++++++++++++++---
3 files changed, 84 insertions(+), 6 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 89a158e..167771c 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1301,6 +1301,12 @@ struct sched_dl_entity {
u64 deadline; /* absolute deadline for this instance */
unsigned int flags; /* specifying the scheduler behaviour */

+ /*
+ * CPU frequency normalized start time.
+ * Put it inside DL since only one using it.
+ */
+ u64 exec_start_norm;
+
/*
* Some bool flags:
*
diff --git a/kernel/sched.c b/kernel/sched.c
index 2816371..ddb18d2 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2671,6 +2671,7 @@ static void __sched_fork(struct task_struct *p)
p->dl.dl_deadline = p->dl.deadline = 0;
p->dl.dl_period = 0;
p->dl.flags = 0;
+ p->dl.exec_start_norm = 0;

INIT_LIST_HEAD(&p->rt.run_list);
p->se.on_rq = 0;
@@ -8475,6 +8476,7 @@ void normalize_rt_tasks(void)
continue;

p->se.exec_start = 0;
+ p->dl.exec_start_norm = 0;
#ifdef CONFIG_SCHEDSTATS
p->se.wait_start = 0;
p->se.sleep_start = 0;
diff --git a/kernel/sched_dl.c b/kernel/sched_dl.c
index 5aa5a52..049c001 100644
--- a/kernel/sched_dl.c
+++ b/kernel/sched_dl.c
@@ -333,6 +333,40 @@ static bool dl_entity_overflow(struct sched_dl_entity *dl_se,
}

/*
+ * A cpu freq normalized overflow check, see dl_entity_overflow
+ * function for details. Check against current cpu frequency.
+ * For this to hold, we must check if:
+ * runtime / (norm_factor * (deadline - t)) < dl_runtime / dl_deadline .
+ */
+static bool dl_entity_overflow_norm(struct sched_dl_entity *dl_se,
+ struct sched_dl_entity *pi_se, u64 t,
+ struct rq *rq)
+{
+ u64 left, right;
+
+ /*
+ * left and right are the two sides of the equation above,
+ * after a bit of shuffling to use multiplications instead
+ * of divisions.
+ *
+ * Note that none of the time values involved in the two
+ * multiplications are absolute: dl_deadline and dl_runtime
+ * are the relative deadline and the maximum runtime of each
+ * instance, runtime is the runtime left for the last instance
+ * and (deadline - t), since t is rq->clock, is the time left
+ * to the (absolute) deadline. Therefore, overflowing the u64
+ * type is very unlikely to occur in both cases.
+ * Likewise the runtime multiplied with the norm factor is
+ * for the same reasons unlikely to overflow u64 and since
+ * norm factor is max 1<<32.
+ */
+ left = pi_se->dl_deadline * dl_se->runtime;
+ right = (dl_se->deadline - t) * ((pi_se->dl_runtime * rq->norm_factor) >> 32);
+
+ return dl_time_before(right, left);
+}
+
+/*
* When a -deadline entity is queued back on the runqueue, its runtime and
* deadline might need updating.
*
@@ -358,12 +392,16 @@ static void update_dl_entity(struct sched_dl_entity *dl_se,
}

if (dl_time_before(dl_se->deadline, rq->clock) ||
- dl_entity_overflow(dl_se, pi_se, rq->clock)) {
+ dl_entity_overflow_norm(dl_se, pi_se, rq->clock, rq)) {
dl_se->deadline = rq->clock + pi_se->dl_deadline;
dl_se->runtime = pi_se->dl_runtime;
overflow = 1;
}
#ifdef CONFIG_SCHEDSTATS
+ if(dl_entity_overflow(dl_se, pi_se, rq->clock))
+ overflow |= 2;
+ if(dl_entity_overflow_norm(dl_se, pi_se, rq->clock, rq))
+ overflow |= 4;
trace_sched_stat_updt_dl(dl_task_of(dl_se), rq->clock, overflow);
#endif
}
@@ -549,10 +587,15 @@ int dl_runtime_exceeded(struct rq *rq, struct sched_dl_entity *dl_se)
* executing, then we have already used some of the runtime of
* the next instance. Thus, if we do not account that, we are
* stealing bandwidth from the system at each deadline miss!
+ *
+ * Use normalization of deadline and clock to compensate the
+ * runtime. Here assuming that the whole exceeded runtime is
+ * done with current cpu frequency.
*/
if (dmiss) {
dl_se->runtime = rorun ? dl_se->runtime : 0;
- dl_se->runtime -= rq->clock - dl_se->deadline;
+ dl_se->runtime -= ((rq->clock - dl_se->deadline)
+ * rq->norm_factor) >> 32;
}

return 1;
@@ -576,31 +619,46 @@ static void update_curr_dl(struct rq *rq)
{
struct task_struct *curr = rq->curr;
struct sched_dl_entity *dl_se = &curr->dl;
- u64 delta_exec;
+ u64 delta_exec, delta_exec_norm;

if (!dl_task(curr) || !on_dl_rq(dl_se))
return;

+ /*
+ * Maintaine the unnormalized execution statistics
+ * to keep user space happy.
+ *
+ * Do cpu frequency normalized runtime handling for
+ * the actual DL scheduling to enforce the CPU
+ * max frequency runtime cycles even at lower freq.
+ */
+
delta_exec = rq->clock - curr->se.exec_start;
if (unlikely((s64)delta_exec < 0))
delta_exec = 0;

+ delta_exec_norm = rq->clock_norm - curr->dl.exec_start_norm;
+ if (unlikely((s64)delta_exec_norm < 0))
+ delta_exec_norm = 0;
+
schedstat_set(curr->se.exec_max,
max(curr->se.exec_max, delta_exec));

curr->se.sum_exec_runtime += delta_exec;
schedstat_add(&rq->dl, exec_clock, delta_exec);
account_group_exec_runtime(curr, delta_exec);
- trace_sched_stat_runtime_dl(curr, rq->clock, delta_exec);
+ trace_sched_stat_runtime_dl(curr, rq->clock, delta_exec_norm);

curr->se.exec_start = rq->clock;
+ curr->dl.exec_start_norm = rq->clock_norm;
cpuacct_charge(curr, delta_exec);
cg_cpufreq_charge(curr, delta_exec, curr->se.exec_start);

sched_dl_avg_update(rq, delta_exec);

dl_se->stats.tot_rtime += delta_exec;
- dl_se->runtime -= delta_exec;
+
+ dl_se->runtime -= delta_exec_norm;
if (dl_runtime_exceeded(rq, dl_se)) {
__dequeue_task_dl(rq, curr, 0);
if (likely(start_dl_timer(dl_se, !!curr->pi_top_task)))
@@ -865,10 +923,12 @@ static long wait_interval_dl(struct task_struct *p, struct timespec *rqtp,
* instant. This involves a division (to calculate the reverse of the
* task's bandwidth), but it is worth to notice that it is quite
* unlikely that we get into here very often.
+ * Use normalized overflow check since used for setting the timer.
*/
+
wakeup = timespec_to_ns(rqtp);
if (dl_time_before(wakeup, dl_se->deadline) &&
- !dl_entity_overflow(dl_se, dl_se, wakeup)) {
+ !dl_entity_overflow_norm(dl_se, dl_se, wakeup, rq)) {
u64 ibw = (u64)dl_se->runtime * dl_se->dl_period;

ibw = div_u64(ibw, dl_se->dl_runtime);
@@ -989,6 +1049,13 @@ static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p,
#ifdef CONFIG_SCHED_HRTICK
static void start_hrtick_dl(struct rq *rq, struct task_struct *p)
{
+ /*
+ * Don't use normalized runtime to calculate the
+ * delta, since the clock frequency might increase
+ * and we then misses our needed tick time.
+ * Worst case we will be ticked an extra time.
+ * We also don't need to do a u64 division.
+ */
s64 delta = p->dl.dl_runtime - p->dl.runtime;

if (delta > 10000)
@@ -1037,6 +1104,7 @@ struct task_struct *pick_next_task_dl(struct rq *rq)

p = dl_task_of(dl_se);
p->se.exec_start = rq->clock;
+ p->dl.exec_start_norm = rq->clock_norm;

/* Running task will never be pushed. */
if (p)
@@ -1061,6 +1129,7 @@ static void put_prev_task_dl(struct rq *rq, struct task_struct *p)

update_curr_dl(rq);
p->se.exec_start = 0;
+ p->dl.exec_start_norm = 0;

if (on_dl_rq(&p->dl) && p->dl.nr_cpus_allowed > 1)
enqueue_pushable_dl_task(rq, p);
@@ -1102,6 +1171,7 @@ static void set_curr_task_dl(struct rq *rq)
struct task_struct *p = rq->curr;

p->se.exec_start = rq->clock;
+ p->dl.exec_start_norm = rq->clock_norm;

/* You can't push away the running task */
dequeue_pushable_dl_task(rq, p);
--
1.7.0.4

2010-12-17 13:02:47

[permalink] [raw]

Subject: [PATCH 3/3] sched trace updated with normalized clock info.

Updated the sched deadline v3 traces with the normalized runtime clock.
The delta execution runtime and the last start of execution is also
using the normalized clock.

Change-Id: I6f05a76ad876e8895f3f24940f3ee07f1cb0e8b8

Signed-off-by: Harald Gustafsson <[email protected]>
---
include/trace/events/sched.h | 25 +++++++++++++++++--------
kernel/sched.c | 2 +-
kernel/sched_dl.c | 2 +-
3 files changed, 19 insertions(+), 10 deletions(-)

diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index 3307353..3c766eb 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -379,16 +379,17 @@ TRACE_EVENT(sched_stat_runtime,
*/
TRACE_EVENT(sched_switch_dl,

- TP_PROTO(u64 clock,
+ TP_PROTO(u64 clock, u64 clock_norm,
struct task_struct *prev,
struct task_struct *next),

- TP_ARGS(clock, prev, next),
+ TP_ARGS(clock, clock_norm, prev, next),

TP_STRUCT__entry(
__array( char, prev_comm, TASK_COMM_LEN )
__field( pid_t, prev_pid )
__field( u64, clock )
+ __field( u64, clock_norm )
__field( s64, prev_rt )
__field( u64, prev_dl )
__field( long, prev_state )
@@ -402,6 +403,7 @@ TRACE_EVENT(sched_switch_dl,
memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN);
__entry->prev_pid = prev->pid;
__entry->clock = clock;
+ __entry->clock_norm = clock_norm;
__entry->prev_rt = prev->dl.runtime;
__entry->prev_dl = prev->dl.deadline;
__entry->prev_state = prev->state;
@@ -412,7 +414,7 @@ TRACE_EVENT(sched_switch_dl,
),

TP_printk("prev_comm=%s prev_pid=%d prev_rt=%Ld [ns] prev_dl=%Lu [ns] prev_state=%s ==> "
- "next_comm=%s next_pid=%d next_rt=%Ld [ns] next_dl=%Lu [ns] clock=%Lu [ns]",
+ "next_comm=%s next_pid=%d next_rt=%Ld [ns] next_dl=%Lu [ns] clock=%Lu (%Lu) [ns]",
__entry->prev_comm, __entry->prev_pid, (long long)__entry->prev_rt,
(unsigned long long)__entry->prev_dl, __entry->prev_state ?
__print_flags(__entry->prev_state, "|",
@@ -420,7 +422,8 @@ TRACE_EVENT(sched_switch_dl,
{ 16, "Z" }, { 32, "X" }, { 64, "x" },
{ 128, "W" }) : "R",
__entry->next_comm, __entry->next_pid, (long long)__entry->next_rt,
- (unsigned long long)__entry->next_dl, (unsigned long long)__entry->clock)
+ (unsigned long long)__entry->next_dl, (unsigned long long)__entry->clock,
+ (unsigned long long)__entry->clock_norm)
);

/*
@@ -655,9 +658,9 @@ DEFINE_EVENT(sched_stat_template_dl, sched_stat_updt_dl,
*/
TRACE_EVENT(sched_stat_runtime_dl,

- TP_PROTO(struct task_struct *p, u64 clock, u64 last),
+ TP_PROTO(struct task_struct *p, u64 clock, u64 last, u64 clock_norm),

- TP_ARGS(p, clock, last),
+ TP_ARGS(p, clock, last, clock_norm),

TP_STRUCT__entry(
__array( char, comm, TASK_COMM_LEN )
@@ -667,6 +670,8 @@ TRACE_EVENT(sched_stat_runtime_dl,
__field( s64, rt )
__field( u64, dl )
__field( u64, start )
+ __field( u64, start_norm )
+ __field( u64, clock_norm )
),

TP_fast_assign(
@@ -677,12 +682,16 @@ TRACE_EVENT(sched_stat_runtime_dl,
__entry->rt = p->dl.runtime - last;
__entry->dl = p->dl.deadline;
__entry->start = p->se.exec_start;
+ __entry->start_norm = p->dl.exec_start_norm;
+ __entry->clock_norm = clock_norm;
),

- TP_printk("comm=%s pid=%d clock=%Lu [ns] delta_exec=%Lu [ns] rt=%Ld [ns] dl=%Lu [ns] exec_start=%Lu [ns]",
+ TP_printk("comm=%s pid=%d clock=%Lu (%Lu) [ns] delta_exec=%Lu [ns] rt=%Ld [ns] dl=%Lu [ns] exec_start=%Lu (%Lu) [ns]",
__entry->comm, __entry->pid, (unsigned long long)__entry->clock,
+ (unsigned long long)__entry->clock_norm,
(unsigned long long)__entry->last, (long long)__entry->rt,
- (unsigned long long)__entry->dl, (unsigned long long)__entry->start)
+ (unsigned long long)__entry->dl, (unsigned long long)__entry->start,
+ (unsigned long long)__entry->start_norm)
);

/*
diff --git a/kernel/sched.c b/kernel/sched.c
index ddb18d2..b6b8ccc 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3027,7 +3027,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
prepare_task_switch(rq, prev, next);
trace_sched_switch(rq, prev, next);
if (unlikely(__dl_task(prev) || __dl_task(next)))
- trace_sched_switch_dl(rq->clock, prev, next);
+ trace_sched_switch_dl(rq->clock, rq->clock_norm, prev, next);
mm = next->mm;
oldmm = prev->active_mm;
/*
diff --git a/kernel/sched_dl.c b/kernel/sched_dl.c
index 049c001..b37a905 100644
--- a/kernel/sched_dl.c
+++ b/kernel/sched_dl.c
@@ -647,7 +647,7 @@ static void update_curr_dl(struct rq *rq)
curr->se.sum_exec_runtime += delta_exec;
schedstat_add(&rq->dl, exec_clock, delta_exec);
account_group_exec_runtime(curr, delta_exec);
- trace_sched_stat_runtime_dl(curr, rq->clock, delta_exec_norm);
+ trace_sched_stat_runtime_dl(curr, rq->clock, delta_exec_norm, rq->clock_norm);

curr->se.exec_start = rq->clock;
curr->dl.exec_start_norm = rq->clock_norm;
--
1.7.0.4

2010-12-17 14:30:19

[permalink] [raw]

Subject: Re: [PATCH 1/3] Added runqueue clock normalized with cpufreq

On Fri, 2010-12-17 at 14:02 +0100, Harald Gustafsson wrote:

> This is a request for comments on additions to sched deadline v3 patches.
> Deadline scheduler is the first scheduler (I think) we introduce in Linux that
> specifies the runtime in time and not only as a weight or a relation.
> I have introduced a normalized runtime clock dependent on the CPU frequency.
> This is used, in [PATCH 2/3], to calculate the deadline thread's runtime
> so that approximately the same number of cycles are giving to the thread
> independent of the CPU frequency.
>
> I suggest that this is important for users of hard reservation based schedulers
> that the intended amount of work can be accomplished independent of the CPU frequency.
> The usage of CPU frequency scaling is important on mobile devices and hence
> the combination of deadline scheduler and cpufreq should be solved.

> So before I do this for the linux tip I would welcome a discussion about if this
> is a good idea and also suggestions on how to improve this.

I'm thinking this is going about it totally wrong..

Solving the CPUfreq problem involves writing a SCHED_DEADLINE aware
CPUfreq governor. The governor must know about the constraints placed on
the system by the task-set. You simply cannot lower the frequency when
your system is at u=1.

Once you have a governor that keeps the freq such that: freq/max_freq >=
utilization (which is only sufficient for deadline == period systems),
then you need to frob the SCHED_DEADLINE runtime accounting.

Adding a complete normalized clock to the system like you've done is a
total no-go, it adds overhead even for the !SCHED_DEADLINE case.

The simple solution would be to slow down the runtime accounting of
SCHED_DEADLINE tasks by freq/max_freq. So instead of having:

dl_se->runtime -= delta;

you do something like:

dl_se->runtime -= (freq * delta) / max_freq;

Which auto-magically grows the actual bandwidth, and since the deadlines
are wall-time already it all works out nicely. It also keeps the
overhead inside SCHED_DEADLINE.

2010-12-17 14:33:24

[permalink] [raw]

Subject: Re: [PATCH 1/3] Added runqueue clock normalized with cpufreq

On Fri, 2010-12-17 at 15:29 +0100, Peter Zijlstra wrote:
> On Fri, 2010-12-17 at 14:02 +0100, Harald Gustafsson wrote:
>
> > This is a request for comments on additions to sched deadline v3 patches.
> > Deadline scheduler is the first scheduler (I think) we introduce in Linux that
> > specifies the runtime in time and not only as a weight or a relation.
> > I have introduced a normalized runtime clock dependent on the CPU frequency.
> > This is used, in [PATCH 2/3], to calculate the deadline thread's runtime
> > so that approximately the same number of cycles are giving to the thread
> > independent of the CPU frequency.
> >
> > I suggest that this is important for users of hard reservation based schedulers
> > that the intended amount of work can be accomplished independent of the CPU frequency.
> > The usage of CPU frequency scaling is important on mobile devices and hence
> > the combination of deadline scheduler and cpufreq should be solved.
>
> > So before I do this for the linux tip I would welcome a discussion about if this
> > is a good idea and also suggestions on how to improve this.
>
> I'm thinking this is going about it totally wrong..
>
> Solving the CPUfreq problem involves writing a SCHED_DEADLINE aware
> CPUfreq governor. The governor must know about the constraints placed on
> the system by the task-set. You simply cannot lower the frequency when
> your system is at u=1.
>
> Once you have a governor that keeps the freq such that: freq/max_freq >=
> utilization (which is only sufficient for deadline == period systems),
> then you need to frob the SCHED_DEADLINE runtime accounting.
>
> Adding a complete normalized clock to the system like you've done is a
> total no-go, it adds overhead even for the !SCHED_DEADLINE case.
>
> The simple solution would be to slow down the runtime accounting of
> SCHED_DEADLINE tasks by freq/max_freq. So instead of having:
>
> dl_se->runtime -= delta;
>
> you do something like:
>
> dl_se->runtime -= (freq * delta) / max_freq;
>
> Which auto-magically grows the actual bandwidth, and since the deadlines
> are wall-time already it all works out nicely. It also keeps the
> overhead inside SCHED_DEADLINE.

This is all assuming lowering the frequency is sensible to begin with in
the first place... but that's all part of the CPUfreq governor, it needs
to find a way to lower energy usage while conforming to the system
constraints.

2010-12-17 15:02:44

[permalink] [raw]

Subject: Re: [PATCH 1/3] Added runqueue clock normalized with cpufreq

2010/12/17 Peter Zijlstra <[email protected]>:
>
> I'm thinking this is going about it totally wrong..
>
> Solving the CPUfreq problem involves writing a SCHED_DEADLINE aware
> CPUfreq governor. The governor must know about the constraints placed on
> the system by the task-set. You simply cannot lower the frequency when
> your system is at u=1.
>
> Once you have a governor that keeps the freq such that: freq/max_freq >=
> utilization (which is only sufficient for deadline == period systems),
> then you need to frob the SCHED_DEADLINE runtime accounting.

I agree that this is the other part of the solution, which I have in a separate
ondemand governor, but that code is not ready for public review yet. Since that
code also incorporate other ondemand changes I'm playing with. Such
changes to the ondemand is quite simple it just picks a frequency that at
least supports the total dl bandwidth. It might get tricky for systems which
support individual/clusters frequency for the cores on the system together with
the G-EDF.

> Adding a complete normalized clock to the system like you've done is a
> total no-go, it adds overhead even for the !SCHED_DEADLINE case.

I suspected this, it works as a proof of concept, but not good for mainline.
I will rework this part, if we in general thinks having the dl runtime
accounting
be cpufreq "aware" is a good idea.

> The simple solution would be to slow down the runtime accounting of
> SCHED_DEADLINE tasks by freq/max_freq. So instead of having:
>
> ?dl_se->runtime -= delta;
>
> you do something like:
>
> ?dl_se->runtime -= (freq * delta) / max_freq;
>
> Which auto-magically grows the actual bandwidth, and since the deadlines
> are wall-time already it all works out nicely. It also keeps the
> overhead inside SCHED_DEADLINE.

OK, I can do that. My thought from the beginning was considering that
the reading of the clock was done more often then updating it, but I agree that
it has a negative impact on none dl threads.

/Harald

2010-12-17 15:06:18

[permalink] [raw]

Subject: Re: [PATCH 1/3] Added runqueue clock normalized with cpufreq

2010/12/17 Peter Zijlstra <[email protected]>:
> This is all assuming lowering the frequency is sensible to begin with in
> the first place... but that's all part of the CPUfreq governor, it needs
> to find a way to lower energy usage while conforming to the system
> constraints.

Yes, I and you have already suggested the safe way to not lower it below
the total dl bandwidth. But for softer use cases it might be possible to
e.g. exclude threads with longer periods than cpufreq change periods in the
minimum frequency.

2010-12-17 15:17:17

[permalink] [raw]

Subject: Re: [PATCH 1/3] Added runqueue clock normalized with cpufreq

On Fri, 2010-12-17 at 16:06 +0100, Harald Gustafsson wrote:
> 2010/12/17 Peter Zijlstra <[email protected]>:
> > This is all assuming lowering the frequency is sensible to begin with in
> > the first place... but that's all part of the CPUfreq governor, it needs
> > to find a way to lower energy usage while conforming to the system
> > constraints.
>
> Yes, I and you have already suggested the safe way to not lower it below
> the total dl bandwidth. But for softer use cases it might be possible to
> e.g. exclude threads with longer periods than cpufreq change periods in the
> minimum frequency.

I was more hinting at the fact that CPUfreq is at best a controversial
approach to power savings. I much prefer the whole race-to-idle
approach, its much simpler.

2010-12-17 15:36:50

[permalink] [raw]

Subject: Re: [PATCH 1/3] Added runqueue clock normalized with cpufreq

2010/12/17 Peter Zijlstra <[email protected]>:
> I was more hinting at the fact that CPUfreq is at best a controversial
> approach to power savings. I much prefer the whole race-to-idle
> approach, its much simpler.

That depends to a large degree on architecture, chip technology node
and deployed user space
applications. I don't agree that race-to-idle is a good idea for
some/many combinations at least
for embedded systems. But of course race-to-idle is simpler, but not
necessarily giving the
lowest energy.

/Harald

2010-12-17 15:44:11

by Thomas Gleixner

[permalink] [raw]

Subject: Re: [PATCH 1/3] Added runqueue clock normalized with cpufreq

On Fri, 17 Dec 2010, Peter Zijlstra wrote:
> On Fri, 2010-12-17 at 16:06 +0100, Harald Gustafsson wrote:
> > 2010/12/17 Peter Zijlstra <[email protected]>:
> > > This is all assuming lowering the frequency is sensible to begin with in
> > > the first place... but that's all part of the CPUfreq governor, it needs
> > > to find a way to lower energy usage while conforming to the system
> > > constraints.
> >
> > Yes, I and you have already suggested the safe way to not lower it below
> > the total dl bandwidth. But for softer use cases it might be possible to
> > e.g. exclude threads with longer periods than cpufreq change periods in the
> > minimum frequency.
>
> I was more hinting at the fact that CPUfreq is at best a controversial
> approach to power savings. I much prefer the whole race-to-idle
> approach, its much simpler.

There's that and I have yet to see a proof that running code with
lower frequency and not going idle saves more power than running full
speed and going into low power states for longer time.

Also if you want to have your deadline scheduler aware of cpu
frequency changes, then simply limit the total bandwith based on the
lowest possible frequency and it works always. This whole dynamic
bandwith expansion is more an academic exercise than a practical
necessity.

Thanks,

tglx

2010-12-17 15:54:07

[permalink] [raw]

Subject: Re: [PATCH 1/3] Added runqueue clock normalized with cpufreq

2010/12/17 Thomas Gleixner <[email protected]>:
> Also if you want to have your deadline scheduler aware of cpu
> frequency changes, then simply limit the total bandwith based on the
> lowest possible frequency and it works always. This whole dynamic
> bandwith expansion is more an academic exercise than a practical
> necessity.

This would severely limit the bandwidth available to deadline tasks. Which
then also reduces the use cases that could benefit from using sched deadline.
Also it would imply a over-reservation of the system, e.g. if you need 10%
BW of the total system and the lowest speed is at 20%, you basically need to
set a BW of 50%, to always be guaranteed that you get your 10% when cpufreq
clocks down. If you use cpufreq and want to use sched deadline this has
strong practical implications and is definitely not academic only.

2010-12-17 18:43:01

[permalink] [raw]

Subject: Re: [PATCH 1/3] Added runqueue clock normalized with cpufreq

On Fri, 2010-12-17 at 16:43 +0100, Thomas Gleixner wrote:
> There's that and I have yet to see a proof that running code with
> lower frequency and not going idle saves more power than running full
> speed and going into low power states for longer time.
>
I was expecting a reply like this from right from you! :-P

BTW, I mostly agree that race to idle is better. The point here is that
you might end in a situation where frequency scaling is enabled and/or
a particular frequency is statically selected for whatever reason. In
that case, making the scheduler aware of such could be needed to get the
expected behaviour out of it, independently from the fact it is probably
going to be worse than race-to-idle for power saving purposes... How
much am I wrong?

> Also if you want to have your deadline scheduler aware of cpu
> frequency changes, then simply limit the total bandwith based on the
> lowest possible frequency and it works always.
>
That could be a solution as well, although you're limiting a lot the
bandwidth available for deadline task. But something similar could be
considered...

> This whole dynamic
> bandwith expansion is more an academic exercise than a practical
> necessity.
>
Well, despite the fact that Harald is with Ericsson and as not much to
do with academia. :-D

Regards,
Dario

--
<<This happens because I choose it to happen!>> (Raistlin Majere)
----------------------------------------------------------------------
Dario Faggioli, ReTiS Lab, Scuola Superiore Sant'Anna, Pisa (Italy)

http://retis.sssup.it/people/faggioli -- [email protected]

Attachments:

signature.asc (198.00 B)
This is a digitally signed message part

2010-12-17 18:46:19

[permalink] [raw]

Subject: Re: [PATCH 1/3] Added runqueue clock normalized with cpufreq

On Fri, 2010-12-17 at 16:02 +0100, Harald Gustafsson wrote:
> > Once you have a governor that keeps the freq such that: freq/max_freq >=
> > utilization (which is only sufficient for deadline == period systems),
> > then you need to frob the SCHED_DEADLINE runtime accounting.
>
> I agree that this is the other part of the solution, which I have in a separate
> ondemand governor, but that code is not ready for public review yet. Since that
> code also incorporate other ondemand changes I'm playing with.
>
So, while we're waiting for this to be cooked...

> OK, I can do that. My thought from the beginning was considering that
> the reading of the clock was done more often then updating it, but I agree that
> it has a negative impact on none dl threads.
>
... We can at least integrate this (done in the proper, way as Peter
suggests, i.e., _inside_ SCHED_DEADLINE) in the next release of the
patchset, can't we?

Regards,
Dario

--
<<This happens because I choose it to happen!>> (Raistlin Majere)
----------------------------------------------------------------------
Dario Faggioli, ReTiS Lab, Scuola Superiore Sant'Anna, Pisa (Italy)

http://retis.sssup.it/people/faggioli -- [email protected]

Attachments:

signature.asc (198.00 B)
This is a digitally signed message part

2010-12-17 18:54:13

[permalink] [raw]

Subject: Re: [PATCH 1/3] Added runqueue clock normalized with cpufreq

On Fri, 2010-12-17 at 15:29 +0100, Peter Zijlstra wrote:
> Solving the CPUfreq problem involves writing a SCHED_DEADLINE aware
> CPUfreq governor. The governor must know about the constraints placed on
> the system by the task-set. You simply cannot lower the frequency when
> your system is at u=1.
>
We already did the very same thing (for another EU Project called
FRESCOR), although it was done in an userspace sort of daemon. It was
also able to consider other "high level" parameters like some estimation
of the QoS of each application and of the global QoS of the system.

However, converting the basic mechanism into a CPUfreq governor should
be easily doable... The only problem is finding the time for that! ;-P

> The simple solution would be to slow down the runtime accounting of
> SCHED_DEADLINE tasks by freq/max_freq. So instead of having:
>
> dl_se->runtime -= delta;
>
> you do something like:
>
> dl_se->runtime -= (freq * delta) / max_freq;
>
> Which auto-magically grows the actual bandwidth, and since the deadlines
> are wall-time already it all works out nicely. It also keeps the
> overhead inside SCHED_DEADLINE.
>
And, at least for the meantime, this seems a very very nice solution.
The only thing I don't like is that division which would end up in being
performed at each tick/update_curr_dl(), but we can try to find out a
way to mitigate this, what do you think Harald?

Regards,
Dario

--
<<This happens because I choose it to happen!>> (Raistlin Majere)
----------------------------------------------------------------------
Dario Faggioli, ReTiS Lab, Scuola Superiore Sant'Anna, Pisa (Italy)

http://retis.sssup.it/people/faggioli -- [email protected]

Attachments:

signature.asc (198.00 B)
This is a digitally signed message part

2010-12-17 19:02:13

[permalink] [raw]

Subject: Re: [PATCH 1/3] Added runqueue clock normalized with cpufreq

On Fri, 2010-12-17 at 19:56 +0100, Dario Faggioli wrote:
> On Fri, 2010-12-17 at 15:29 +0100, Peter Zijlstra wrote:
> > Solving the CPUfreq problem involves writing a SCHED_DEADLINE aware
> > CPUfreq governor. The governor must know about the constraints placed on
> > the system by the task-set. You simply cannot lower the frequency when
> > your system is at u=1.
> >
> We already did the very same thing (for another EU Project called
> FRESCOR), although it was done in an userspace sort of daemon. It was
> also able to consider other "high level" parameters like some estimation
> of the QoS of each application and of the global QoS of the system.
>
> However, converting the basic mechanism into a CPUfreq governor should
> be easily doable... The only problem is finding the time for that! ;-P

Ah, I think Harald will solve that for you,.. :)

> > The simple solution would be to slow down the runtime accounting of
> > SCHED_DEADLINE tasks by freq/max_freq. So instead of having:
> >
> > dl_se->runtime -= delta;
> >
> > you do something like:
> >
> > dl_se->runtime -= (freq * delta) / max_freq;
> >
> > Which auto-magically grows the actual bandwidth, and since the deadlines
> > are wall-time already it all works out nicely. It also keeps the
> > overhead inside SCHED_DEADLINE.
> >
> And, at least for the meantime, this seems a very very nice solution.
> The only thing I don't like is that division which would end up in being
> performed at each tick/update_curr_dl(), but we can try to find out a
> way to mitigate this, what do you think Harald?

A simple mult and shift-right should do. You can either pre-compute for
a platform, or compute the inv multiplier in the cpufreq notifier thing.

2010-12-17 19:14:48

[permalink] [raw]

Subject: Re: [PATCH 1/3] Added runqueue clock normalized with cpufreq

On Fri, 2010-12-17 at 19:59 +0100, Peter Zijlstra wrote:
> > However, converting the basic mechanism into a CPUfreq governor should
> > be easily doable... The only problem is finding the time for that! ;-P
>
> Ah, I think Harald will solve that for you,.. :)
>
Yeah, I saw that... Help is a wonderful thing, you know? :-P

> > And, at least for the meantime, this seems a very very nice solution.
> > The only thing I don't like is that division which would end up in being
> > performed at each tick/update_curr_dl(), but we can try to find out a
> > way to mitigate this, what do you think Harald?
>
> A simple mult and shift-right should do. You can either pre-compute for
> a platform, or compute the inv multiplier in the cpufreq notifier thing.
>
Yeah, I was thinking about something like the last solution you just
propose, but we'll consider all of them.

Thanks and Regards,
Dario

--
<<This happens because I choose it to happen!>> (Raistlin Majere)
----------------------------------------------------------------------
Dario Faggioli, ReTiS Lab, Scuola Superiore Sant'Anna, Pisa (Italy)

http://retis.sssup.it/people/faggioli -- [email protected]

Attachments:

signature.asc (198.00 B)
This is a digitally signed message part

2010-12-17 19:27:53

[permalink] [raw]

Subject: Re: [PATCH 1/3] Added runqueue clock normalized with cpufreq

2010/12/17 Dario Faggioli <[email protected]>:
> On Fri, 2010-12-17 at 15:29 +0100, Peter Zijlstra wrote:
>> Solving the CPUfreq problem involves writing a SCHED_DEADLINE aware
>> CPUfreq governor. The governor must know about the constraints placed on
>> the system by the task-set. You simply cannot lower the frequency when
>> your system is at u=1.
>>
> We already did the very same thing (for another EU Project called
> FRESCOR), although it was done in an userspace sort of daemon. It was
> also able to consider other "high level" parameters like some estimation
> of the QoS of each application and of the global QoS of the system.
>
> However, converting the basic mechanism into a CPUfreq governor should
> be easily doable... The only problem is finding the time for that! ;-P

I'm a bit choked before the holidays, but I can fix this in the
beginning of next year.
At the same time as I do a new version of the current patches that takes
in Peter's comments.

>> The simple solution would be to slow down the runtime accounting of
>> SCHED_DEADLINE tasks by freq/max_freq. So instead of having:
>>
>> ? dl_se->runtime -= delta;
>>
>> you do something like:
>>
>> ? dl_se->runtime -= (freq * delta) / max_freq;
>>
>> Which auto-magically grows the actual bandwidth, and since the deadlines
>> are wall-time already it all works out nicely. It also keeps the
>> overhead inside SCHED_DEADLINE.
>>
> And, at least for the meantime, this seems a very very nice solution.
> The only thing I don't like is that division which would end up in being
> performed at each tick/update_curr_dl(), but we can try to find out a
> way to mitigate this, what do you think Harald?

Yes, I will do something like this instead, need to make sure that
everything is consider first though.

2010-12-17 19:31:55

[permalink] [raw]

Subject: Re: [PATCH 1/3] Added runqueue clock normalized with cpufreq

>> We already did the very same thing (for another EU Project called
>> FRESCOR), although it was done in an userspace sort of daemon. It was
>> also able to consider other "high level" parameters like some estimation
>> of the QoS of each application and of the global QoS of the system.
>>
>> However, converting the basic mechanism into a CPUfreq governor should
>> be easily doable... The only problem is finding the time for that! ;-P
>
> Ah, I think Harald will solve that for you,.. :)

Yes, I don't mind doing that. Could you point me to the right part of
the FRESCOR code, Dario?
I will then compare that with what I already have.

2010-12-20 00:11:21

by Tommaso Cucinotta

[permalink] [raw]

Subject: Re: [PATCH 1/3] Added runqueue clock normalized with cpufreq

Il 17/12/2010 20:31, Harald Gustafsson ha scritto:
>>> We already did the very same thing (for another EU Project called
>>> FRESCOR), although it was done in an userspace sort of daemon. It was
>>> also able to consider other "high level" parameters like some estimation
>>> of the QoS of each application and of the global QoS of the system.
>>>
>>> However, converting the basic mechanism into a CPUfreq governor should
>>> be easily doable... The only problem is finding the time for that! ;-P
>> Ah, I think Harald will solve that for you,.. :)
> Yes, I don't mind doing that. Could you point me to the right part of
> the FRESCOR code, Dario?

Hi there,

I'm sorry to join so late this discussion, but the unprecedented 20cm of
snow in Pisa had some non-negligible drawbacks on my return flight from
Perth :-).

Let me try to briefly recap what the outcomes of FRESCOR were, w.r.t.
power management (but usually I'm not that brief :-) ):

1. from a requirements analysis phase, it comes out that it should be
possible to specify the individual runtimes for each possible frequency,
as it is well-known that the way computation times scale to CPU
frequency is application-dependent (and platform-dependent); this
assumes that as a developer I can specify the possible configurations of
my real-time app, then the OS will be free to pick the CPU frequency
that best suites its power management logic (i.e., keeping the minimum
frequency by which I can meet all the deadlines).

Requirements Analysis:

http://www.frescor.org/index.php?mact=Uploads,cntnt01,getfile,0&cntnt01showtemplate=false&cntnt01upload_id=62&cntnt01returnid=54

Proposed API:

http://www.frescor.org/index.php?mact=Uploads,cntnt01,getfile,0&cntnt01showtemplate=false&cntnt01upload_id=105&cntnt01returnid=54

I also attach the API we implemented, however consider it is a mix of
calls for doing both what I wrote above, and building an OS-independent
abstraction layer for dealing with CPU frequency scaling (and not only)
on the heterogeneous OSes we had in FRESCOR;

2. this was also assuming, at an API level, a quite static settings
(typical of hard RT), in which I configure the system and don't change
its frequency too often; for example, implications of power switches on
hard real-time requirements (i.e., time windows in which the CPU is not
operating during the switch, and limits on the max sustainable switching
frequencies by apps and the like) have not been stated through the API;

3. for soft real-time contexts and Linux (consider FRESCOR targeted both
hard RT on RT OSes and soft RT on Linux), we played with a much simpler
trivial linear scaling, which is exactly what has been proposed and
implemented by someone in this thread on top of SCHED_DEADLINE (AFAIU);
however, there's a trick which cannot be neglected, i.e., *change
protocol* (see 5); benchmarks on MPEG-2 decoding times showed that the
linear approximation is not that bad, but the best interpolating ratio
between the computing times in different CPU frequencies do not
perfectly conform to the frequencies ratios; we didn't make any attempt
of extensive evaluation over different workloads so far. See Figure 4.1
in D-AQ2v2:

http://www.frescor.org/index.php?mact=Uploads,cntnt01,getfile,0&cntnt01showtemplate=false&cntnt01upload_id=82&cntnt01returnid=54

4. I would say that, given the tendency to over-provision the runtime
(WCET) for hard real-time contexts, it would not bee too much of a
burden for a hard RT developer to properly over-provision the required
budget in presence of a trivial runtime rescaling policy like in 2.;
however, in order to make everybody happy, it doesn't seem a bad idea to
have something like:
4a) use the fine runtimes specified by the user if they are available;
4b) use the trivially rescaled runtimes if the user only specified a
single runtime, of course it should be clear through the API what is the
frequency the user is referring its runtime to, in such case (e.g.,
maximum one ?)

5. Mode Change Protocol: whenever a frequency switch occurs (e.g.,
dictated by the non-RT workload fluctuations), runtimes cannot simply be
rescaled instantaneously: keeping it short, the simplest thing we can do
is relying on the various CBS servers implemented in the scheduler to
apply the change from the next "runtime recharge", i.e., the next
period. This creates the potential problem that the RT tasks have a
non-negligible transitory for the instances crossing the CPU frequency
switch, in which they do not have enough runtime for their work. Now,
the general "rule of thumb" is straightforward: make room first, then
"pack", i.e., we need to consider 2 distinct cases:

5a) we want to *increase the CPU frequency*; we can immediately
increase the frequency, then the RT applications will have a temporary
over-provisioning of runtime (still tuned for the slower frequency
case), however as soon as we're sure the CPU frequency switch completed,
we can lower the runtimes to the new values;

5b) we want to *decrease the CPU frequency*; unfortunately, here we
need to proceed in the other way round: first, we need to increase the
runtimes of the RT applications to the new values, then, as soon as
we're sure all the scheduling servers made the change (waiting at most
for a time equal to the maximum configured RT period), then we can
actually perform the frequency switch. Of course, before switching the
frequency, there's an assumption: that the new runtimes after the freq
decrease are still schedulable, so the CPU freq switching logic needs to
be aware of the allocated RT reservations.

The protocol in 5. has been implemented completely in user-space as a
modification to the powernowd daemon, in the context of an extended
version of a paper in which we were automagically guessing the whole set
of scheduling parameters for periodic RT applications (EuroSys 2010).
The modified powernowd was considering both the whole RT utilization as
imposed by the RT reservations, and the non-RT utilization as measured
on the CPU. The paper will appear on ACM TECS, but who knows when, so
here u can find it (see Section 7.5 "Power Management"):

http://retis.sssup.it/~tommaso/publications/ACM-TECS-2010.pdf

(last remark: no attempt to deal with multi-cores and their various
power switching capabilities, on this paper . . .)

Last, but not least, the whole point in the above discussion is the
assumption that it is meaningful to have a CPU frequency switching
policy, as opposed to merely CPU idle-ing. Perhaps on old embedded CPUs
this is still the case. Unfortunately, from preliminary measurements
made on a few systems I use every day through a cheap power measurement
device attached on the power cable, I could actually see that for RT
workloads only it is worth to leave the system at the maximum frequency
and exploit the much higher time spent in idle mode(s), except when the
system is completely idle.

If you're interested, I can share the collected data sets.

Bye (and apologies for the length).

T.

--
Tommaso Cucinotta, Computer Engineering PhD, Researcher
ReTiS Lab, Scuola Superiore Sant'Anna, Pisa, Italy
Tel +39 050 882 024, Fax +39 050 882 003
http://retis.sssup.it/people/tommaso

Attachments:

frsh_energy_management.h (11.42 kB)

2010-12-20 09:44:47