rq->cpuload strongly depends on cgroup hierarchy. For example, if hundreds of
tasks are running inside cpu:/test cgroup, the sum of cpuload over all cpus
won't exceed 1024 (by default). That makes the cpuidle menu governor take wrong
decisions, which can negatively affect overall performance.
To cope this, use nr_running last seen in __update_cpu_load() instead of
cpuload for calculating performance multiplier.
---
drivers/cpuidle/governors/menu.c | 17 +++--------------
include/linux/sched.h | 2 +-
kernel/sched/core.c | 7 ++++---
kernel/sched/sched.h | 3 +++
4 files changed, 11 insertions(+), 18 deletions(-)
diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c
index 0633575..2aa2625 100644
--- a/drivers/cpuidle/governors/menu.c
+++ b/drivers/cpuidle/governors/menu.c
@@ -123,17 +123,6 @@ struct menu_device {
};
-#define LOAD_INT(x) ((x) >> FSHIFT)
-#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100)
-
-static int get_loadavg(void)
-{
- unsigned long this = this_cpu_load();
-
-
- return LOAD_INT(this) * 10 + LOAD_FRAC(this) / 10;
-}
-
static inline int which_bucket(unsigned int duration)
{
int bucket = 0;
@@ -170,13 +159,13 @@ static inline int which_bucket(unsigned int duration)
static inline int performance_multiplier(void)
{
int mult = 1;
+ int this_cpu = smp_processor_id();
/* for higher loadavg, we are more reluctant */
-
- mult += 2 * get_loadavg();
+ mult += 10 * nr_active_cpu(this_cpu);
/* for IO wait tasks (per cpu!) we add 5x each */
- mult += 10 * nr_iowait_cpu(smp_processor_id());
+ mult += 10 * nr_iowait_cpu(this_cpu);
return mult;
}
diff --git a/include/linux/sched.h b/include/linux/sched.h
index f45c0b2..fb83d22 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -141,7 +141,7 @@ extern unsigned long nr_running(void);
extern unsigned long nr_uninterruptible(void);
extern unsigned long nr_iowait(void);
extern unsigned long nr_iowait_cpu(int cpu);
-extern unsigned long this_cpu_load(void);
+extern unsigned long nr_active_cpu(int cpu);
extern void calc_global_load(unsigned long ticks);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 39eb601..8cc2011 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2155,10 +2155,10 @@ unsigned long nr_iowait_cpu(int cpu)
return atomic_read(&this->nr_iowait);
}
-unsigned long this_cpu_load(void)
+unsigned long nr_active_cpu(int cpu)
{
- struct rq *this = this_rq();
- return this->cpu_load[0];
+ struct rq *this = cpu_rq(cpu);
+ return this->nr_active;
}
@@ -2494,6 +2494,7 @@ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
this_rq->nr_load_updates++;
/* Update our load: */
+ this_rq->nr_active = this_rq->nr_running;
this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
unsigned long old_load, new_load;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index ba9dccf..2564712 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -379,6 +379,9 @@ struct rq {
struct list_head leaf_rt_rq_list;
#endif
+ /* nr_running last seen in __update_cpu_load() */
+ unsigned long nr_active;
+
/*
* This is part of a global counter where only the total sum
* over all CPUs matters. A task can increase this counter on
--
1.7.1
On Mon, 2012-06-04 at 14:24 +0400, Vladimir Davydov wrote:
> rq->cpuload strongly depends on cgroup hierarchy. For example, if hundreds of
> tasks are running inside cpu:/test cgroup, the sum of cpuload over all cpus
> won't exceed 1024 (by default). That makes the cpuidle menu governor take wrong
> decisions, which can negatively affect overall performance.
>
> To cope this, use nr_running last seen in __update_cpu_load() instead of
> cpuload for calculating performance multiplier.
What is cpuidle trying to do?
On 06/04/2012 02:32 PM, Peter Zijlstra wrote:
> On Mon, 2012-06-04 at 14:24 +0400, Vladimir Davydov wrote:
>> rq->cpuload strongly depends on cgroup hierarchy. For example, if hundreds of
>> tasks are running inside cpu:/test cgroup, the sum of cpuload over all cpus
>> won't exceed 1024 (by default). That makes the cpuidle menu governor take wrong
>> decisions, which can negatively affect overall performance.
>>
>> To cope this, use nr_running last seen in __update_cpu_load() instead of
>> cpuload for calculating performance multiplier.
> What is cpuidle trying to do?
Basing on cpuload, it selects idle state. The less the load, the lowest
the state.
On 6/4/2012 3:32 AM, Peter Zijlstra wrote:
> On Mon, 2012-06-04 at 14:24 +0400, Vladimir Davydov wrote:
>> rq->cpuload strongly depends on cgroup hierarchy. For example, if hundreds of
>> tasks are running inside cpu:/test cgroup, the sum of cpuload over all cpus
>> won't exceed 1024 (by default). That makes the cpuidle menu governor take wrong
>> decisions, which can negatively affect overall performance.
>>
>> To cope this, use nr_running last seen in __update_cpu_load() instead of
>> cpuload for calculating performance multiplier.
>
> What is cpuidle trying to do?
what it is doing is trying to use "cpuload" as proxy for performance
sensitivity. The higher the load, the longer the idle period (predicted)
needs to be, for cpuidle to be willing to tolerate the latency of deeper
C states.
On 6/4/2012 3:24 AM, Vladimir Davydov wrote:
> rq->cpuload strongly depends on cgroup hierarchy. For example, if hundreds of
> tasks are running inside cpu:/test cgroup, the sum of cpuload over all cpus
> won't exceed 1024 (by default). That makes the cpuidle menu governor take wrong
> decisions, which can negatively affect overall performance.
nr_running is the wrong answer... it is instantaneous, not longer term.
cpuidle wants a longer term, per cpu, notion of "busy", to use as proxy
for performance sensitivity.
On Jun 4, 2012, at 5:15 PM, "Arjan van de Ven" <[email protected]> wrote:
> On 6/4/2012 3:24 AM, Vladimir Davydov wrote:
>> rq->cpuload strongly depends on cgroup hierarchy. For example, if hundreds of
>> tasks are running inside cpu:/test cgroup, the sum of cpuload over all cpus
>> won't exceed 1024 (by default). That makes the cpuidle menu governor take wrong
>> decisions, which can negatively affect overall performance.
>
> nr_running is the wrong answer... it is instantaneous, not longer term.
> cpuidle wants a longer term, per cpu, notion of "busy", to use as proxy
> for performance sensitivity.
>
>
I use not the instantaneous nr_running, but nr_running last seen in update_cpu_load() where cpuload[0], which is currently used by cpuidle, is updated.
On Mon, 2012-06-04 at 06:13 -0700, Arjan van de Ven wrote:
> On 6/4/2012 3:32 AM, Peter Zijlstra wrote:
> > On Mon, 2012-06-04 at 14:24 +0400, Vladimir Davydov wrote:
> >> rq->cpuload strongly depends on cgroup hierarchy. For example, if hundreds of
> >> tasks are running inside cpu:/test cgroup, the sum of cpuload over all cpus
> >> won't exceed 1024 (by default). That makes the cpuidle menu governor take wrong
> >> decisions, which can negatively affect overall performance.
> >>
> >> To cope this, use nr_running last seen in __update_cpu_load() instead of
> >> cpuload for calculating performance multiplier.
> >
> > What is cpuidle trying to do?
>
> what it is doing is trying to use "cpuload" as proxy for performance
> sensitivity. The higher the load, the longer the idle period (predicted)
> needs to be, for cpuidle to be willing to tolerate the latency of deeper
> C states.
Well, both are complete crap of course.. load has no relation to busy
what so ever anyway.
But what you're saying is that its trying to correlate latency with
load, higher loaded systems should receive less latency spikes. Why does
this make sense at all?
Or are we simply trying to guestimate the idle period and got confused?
Neither nr_running nor cpu_load have anything to do with how busy we
are.
On 6/4/2012 6:45 AM, Peter Zijlstra wrote:
> On Mon, 2012-06-04 at 06:13 -0700, Arjan van de Ven wrote:
>> On 6/4/2012 3:32 AM, Peter Zijlstra wrote:
>>> On Mon, 2012-06-04 at 14:24 +0400, Vladimir Davydov wrote:
>>>> rq->cpuload strongly depends on cgroup hierarchy. For example, if hundreds of
>>>> tasks are running inside cpu:/test cgroup, the sum of cpuload over all cpus
>>>> won't exceed 1024 (by default). That makes the cpuidle menu governor take wrong
>>>> decisions, which can negatively affect overall performance.
>>>>
>>>> To cope this, use nr_running last seen in __update_cpu_load() instead of
>>>> cpuload for calculating performance multiplier.
>>>
>>> What is cpuidle trying to do?
>>
>> what it is doing is trying to use "cpuload" as proxy for performance
>> sensitivity. The higher the load, the longer the idle period (predicted)
>> needs to be, for cpuidle to be willing to tolerate the latency of deeper
>> C states.
>
> Well, both are complete crap of course.. load has no relation to busy
> what so ever anyway.
it's not about busy, it's about performance sensitive.
it's not a super nice proxy, no argument, but it's one of the few long
term ones we have.
> But what you're saying is that its trying to correlate latency with
> load, higher loaded systems should receive less latency spikes. Why does
> this make sense at all?
higher loaded systems are assumed to be more sensitive to performance
loss, and thus the cpuidle code will be more conservative in adding
performance loss (via the exit latency) on such systems.
>
> Or are we simply trying to guestimate the idle period and got confused?
> Neither nr_running nor cpu_load have anything to do with how busy we
> are.
no this has nothing to do with the guestimation.
On Mon, 2012-06-04 at 06:48 -0700, Arjan van de Ven wrote:
> it's not about busy, it's about performance sensitive.
> it's not a super nice proxy, no argument, but it's one of the few long
> term ones we have.
>
I'm still not seeing how it makes any sense at all. Is there an actual
workload here this matters?
It seems to me your own history of idle guestimation is the best measure
there is for this. If you're shown to be too agressive, grow the window,
otherwise shrink it.
You can make that adjustment as slow as you like to include as much
history as you think you need.
On 6/4/2012 8:08 AM, Peter Zijlstra wrote:
> On Mon, 2012-06-04 at 06:48 -0700, Arjan van de Ven wrote:
>> it's not about busy, it's about performance sensitive.
>> it's not a super nice proxy, no argument, but it's one of the few long
>> term ones we have.
>>
> I'm still not seeing how it makes any sense at all. Is there an actual
> workload here this matters?
yes there are, mostly server ones.
the problem isn't an individual idle, it's that the 100us-200us
latencies add up if you go in and out repeatedly, when the system is in
a situation where it is sensitive to performance (which is not an
instant thing, this is a "over the long run we're busy" thing)...
... they become a real factor.
now, "performance sensitive" is highly elusive and unmeasureable, but
load average is a reasonable approximation... well, the best we have.
(just cpu usage is not, you can have low cpu usage but block on
semaphores or IO all the time, and you're still sensitive to performance)
On Mon, 2012-06-04 at 08:14 -0700, Arjan van de Ven wrote:
> On 6/4/2012 8:08 AM, Peter Zijlstra wrote:
> > On Mon, 2012-06-04 at 06:48 -0700, Arjan van de Ven wrote:
> >> it's not about busy, it's about performance sensitive.
> >> it's not a super nice proxy, no argument, but it's one of the few long
> >> term ones we have.
> >>
> > I'm still not seeing how it makes any sense at all. Is there an actual
> > workload here this matters?
>
> yes there are, mostly server ones.
OK, so pick one that cares, and try creating a heuristic based on wakeup
history or whatever.
> the problem isn't an individual idle, it's that the 100us-200us
> latencies add up if you go in and out repeatedly, when the system is in
> a situation where it is sensitive to performance (which is not an
> instant thing, this is a "over the long run we're busy" thing)...
> ... they become a real factor.
Right, but since you're inflating idle time, the work will be displaced
and will complete later. This should result in your idle time est
shrinking.
I'm just not buying load actually matters or works, if there's lots of
idle time load history should be low, if there's not a lot of idle time,
you're busy (per definition) and again load isn't important.
On 6/4/2012 8:26 AM, Peter Zijlstra wrote:
> On Mon, 2012-06-04 at 08:14 -0700, Arjan van de Ven wrote:
>> On 6/4/2012 8:08 AM, Peter Zijlstra wrote:
>>> On Mon, 2012-06-04 at 06:48 -0700, Arjan van de Ven wrote:
>>>> it's not about busy, it's about performance sensitive.
>>>> it's not a super nice proxy, no argument, but it's one of the few long
>>>> term ones we have.
>>>>
>>> I'm still not seeing how it makes any sense at all. Is there an actual
>>> workload here this matters?
>>
>> yes there are, mostly server ones.
>
> OK, so pick one that cares, and try creating a heuristic based on wakeup
> history or whatever.
sounds easy. is not.
>
>> the problem isn't an individual idle, it's that the 100us-200us
>> latencies add up if you go in and out repeatedly, when the system is in
>> a situation where it is sensitive to performance (which is not an
>> instant thing, this is a "over the long run we're busy" thing)...
>> ... they become a real factor.
>
> Right, but since you're inflating idle time, the work will be displaced
> and will complete later. This should result in your idle time est
> shrinking.
hmm I think you're missing the whole point.
>
> I'm just not buying load actually matters or works, if there's lots of
> idle time load history should be low, if there's not a lot of idle time,
> you're busy (per definition) and again load isn't important.
if there is a lot of idle, load can be low or high; load is more than
just cpu usage. it includes waiting for resources and mutexes etc.
if load is low, you are idle, sure (in that direction it works). If load
is low, the heuristic that is used here will not hinder a deep C state
choice.
if there is not a lot of idle time, sure, load is high. but because idle
time tends to be bursty, we can still be idle for, say, a millisecond
every 10 milliseconds. In this scenario, the load average is used to
ensure that the 200 usecond cost of exiting idle is acceptable.
one other way of doing this would be tracking cumulative accrued latency
as a percentage of cpu busy time... but that's also a pretty
approximative measure.
On Mon, 2012-06-04 at 08:39 -0700, Arjan van de Ven wrote:
> hmm I think you're missing the whole point.
Probably.. as I've still no clue what you're wanting to do.
> > I'm just not buying load actually matters or works, if there's lots of
> > idle time load history should be low, if there's not a lot of idle time,
> > you're busy (per definition) and again load isn't important.
>
> if there is a lot of idle, load can be low or high; load is more than
> just cpu usage. it includes waiting for resources and mutexes etc.
It very much does not. The thing its using: this_cpu_load() returns
rq->cpu_load[0], does not include blocked tasks of any kind.
> if load is low, you are idle, sure (in that direction it works). If load
> is low, the heuristic that is used here will not hinder a deep C state
> choice.
>
> if there is not a lot of idle time, sure, load is high.
False, you can have 0 idle time and still have low load.
> but because idle
> time tends to be bursty, we can still be idle for, say, a millisecond
> every 10 milliseconds. In this scenario, the load average is used to
> ensure that the 200 usecond cost of exiting idle is acceptable.
So what you're saying is that if you have 1ms idle in 10ms, it might not
be a continuous 1ms. And you're using load as a measure of how many
fragments it comes apart in?
That's not making sense.
> one other way of doing this would be tracking cumulative accrued latency
> as a percentage of cpu busy time... but that's also a pretty
> approximative measure.
To what purpose? I'm completely confused now, none of what you say is
making any sense.
>
> False, you can have 0 idle time and still have low load.
1 is not low in this context fwiw.
>
>> but because idle
>> time tends to be bursty, we can still be idle for, say, a millisecond
>> every 10 milliseconds. In this scenario, the load average is used to
>> ensure that the 200 usecond cost of exiting idle is acceptable.
>
> So what you're saying is that if you have 1ms idle in 10ms, it might not
> be a continuous 1ms. And you're using load as a measure of how many
> fragments it comes apart in?
no
what I'm saying is that if you have a workload where you have 10 msec of
work, then 1 msec of idle, then 10 msec of work, 1 msec of idle etc etc,
it is very different from 100 msec of work, 10 msec of idle, 100 msec of
work, even though utilization is the same.
what the logic is trying to do, on a 10 km level, is to limit the damage
of accumulated C state exit time.
(I'll avoid the word "latency" here, since the real time people will
then immediately think this is about controlling latency response, which
it isn't)
Now, if you're very idle for a sustained duration (e.g. low load),
you're assumed not sensitive to a bit of performance cost.
but if you're actually busy (over a longer period, not just "right
now"), you're assumed to be sensitive to the performance cost,
and what the algorithm does is make it less easy to go into the
expensive states.
the closest metric we have right now to "sensitive to performance cost"
that I know of is "load average". If the scheduler has a better metric,
I'd be more than happy to switch the idle selection code over to it...
note that the idle selection code has 3 metrics, this is only one of them:
1. PM_QOS latency tolerance
2. Energy break even
3. Performance tolerance
On Jun 4, 2012, at 8:53 PM, Arjan van de Ven wrote:
>
>>
>> False, you can have 0 idle time and still have low load.
>
> 1 is not low in this context fwiw.
>
>>
>>> but because idle
>>> time tends to be bursty, we can still be idle for, say, a millisecond
>>> every 10 milliseconds. In this scenario, the load average is used to
>>> ensure that the 200 usecond cost of exiting idle is acceptable.
>>
>> So what you're saying is that if you have 1ms idle in 10ms, it might not
>> be a continuous 1ms. And you're using load as a measure of how many
>> fragments it comes apart in?
>
> no
>
> what I'm saying is that if you have a workload where you have 10 msec of
> work, then 1 msec of idle, then 10 msec of work, 1 msec of idle etc etc,
> it is very different from 100 msec of work, 10 msec of idle, 100 msec of
> work, even though utilization is the same.
>
> what the logic is trying to do, on a 10 km level, is to limit the damage
> of accumulated C state exit time.
> (I'll avoid the word "latency" here, since the real time people will
> then immediately think this is about controlling latency response, which
> it isn't)
>
> Now, if you're very idle for a sustained duration (e.g. low load),
> you're assumed not sensitive to a bit of performance cost.
> but if you're actually busy (over a longer period, not just "right
> now"), you're assumed to be sensitive to the performance cost,
> and what the algorithm does is make it less easy to go into the
> expensive states.
>
> the closest metric we have right now to "sensitive to performance cost"
> that I know of is "load average". If the scheduler has a better metric,
> I'd be more than happy to switch the idle selection code over to it...
>
But this_cpu_load(), which is currently used by cpuidle, does not return the "load average". It returns the value of cpuload at some moment in the past (actually, the value is updated in update_cpu_load()). This value is usually used for load balancing.
Moreover, this value does not reflect the real cpu load from the cpuidle pow, because it depends on tasks priority (nice) and, what is worse, with the introduction of cgroups it can be pretty arbitrary. For instance, each group of tasks is accounted just as a single task with standard priority "spreaded" among all cpus, no matter how many tasks are actually running in it.
>
> note that the idle selection code has 3 metrics, this is only one of them:
> 1. PM_QOS latency tolerance
> 2. Energy break even
> 3. Performance tolerance
>
>
On Mon, 2012-06-04 at 09:53 -0700, Arjan van de Ven wrote:
> >
> > False, you can have 0 idle time and still have low load.
>
> 1 is not low in this context fwiw.
I think you're mis-understanding the load number you're using. I suspect
you're expecting something like the load-avg top/uptime provide. You're
very much not using anything similar.
Nor do we compute anything like that, and I want to avoid having to
compute anything like that because its expensive.
> >> but because idle
> >> time tends to be bursty, we can still be idle for, say, a millisecond
> >> every 10 milliseconds. In this scenario, the load average is used to
> >> ensure that the 200 usecond cost of exiting idle is acceptable.
> >
> > So what you're saying is that if you have 1ms idle in 10ms, it might not
> > be a continuous 1ms. And you're using load as a measure of how many
> > fragments it comes apart in?
>
> no
>
> what I'm saying is that if you have a workload where you have 10 msec of
> work, then 1 msec of idle, then 10 msec of work, 1 msec of idle etc etc,
> it is very different from 100 msec of work, 10 msec of idle, 100 msec of
> work, even though utilization is the same.
Sure..
> what the logic is trying to do, on a 10 km level, is to limit the damage
> of accumulated C state exit time.
> (I'll avoid the word "latency" here, since the real time people will
> then immediately think this is about controlling latency response, which
> it isn't)
But why? There's a natural limit to his, say the wakeup costs 0.2ms then
you can only do 5k of those a second. Once you need to actually do some
work as well this comes down.
But its all idle time, you cannot be idle longer than there is a lack of
work. So if you're idle too long (because of long exit latency) your
work shifts and the future idle time reduces, eventually causing a lower
C state to be used.
Also, when you notice you're waking up too soon, you can quickly ramp
down on the C state levels.
> Now, if you're very idle for a sustained duration (e.g. low load),
> you're assumed not sensitive to a bit of performance cost.
> but if you're actually busy (over a longer period, not just "right
> now"), you're assumed to be sensitive to the performance cost,
> and what the algorithm does is make it less easy to go into the
> expensive states.
My brain still sparks and fizzles when I read that.. it just doesn't
compute.
What performance? performance isn't a well defined word.
> the closest metric we have right now to "sensitive to performance cost"
> that I know of is "load average". If the scheduler has a better metric,
> I'd be more than happy to switch the idle selection code over to it...
I can't suggest anything better for something I've still no clue about.
You're completely failing to explain this thing to me.
> note that the idle selection code has 3 metrics, this is only one of them:
> 1. PM_QOS latency tolerance
> 2. Energy break even
> 3. Performance tolerance
That 3rd, I'm completely failing to understand.
On Mon, 2012-06-04 at 21:08 +0400, Vladimir Davydov wrote:
> But this_cpu_load(), which is currently used by cpuidle, does not
> return the "load average". It returns the value of cpuload at some
> moment in the past (actually, the value is updated in
> update_cpu_load()). This value is usually used for load balancing.
>
> Moreover, this value does not reflect the real cpu load from the
> cpuidle pow, because it depends on tasks priority (nice) and, what is
> worse, with the introduction of cgroups it can be pretty arbitrary.
> For instance, each group of tasks is accounted just as a single task
> with standard priority "spreaded" among all cpus, no matter how many
> tasks are actually running in it.
Indeed.
>> what the logic is trying to do, on a 10 km level, is to limit the damage
>> of accumulated C state exit time.
>> (I'll avoid the word "latency" here, since the real time people will
>> then immediately think this is about controlling latency response, which
>> it isn't)
>
> But why? There's a natural limit to his, say the wakeup costs 0.2ms then
> you can only do 5k of those a second. Once you need to actually do some
> work as well this comes down.
but if you only do 100 per second, you still burn 20 msec, which can be
too much if you're performance sensitive.
> But its all idle time, you cannot be idle longer than there is a lack of
> work. So if you're idle too long (because of long exit latency) your
> work shifts and the future idle time reduces, eventually causing a lower
> C state to be used.
>
> Also, when you notice you're waking up too soon, you can quickly ramp
> down on the C state levels.
when wakeups happen too soon, this already happens.
but that's orthogonal to some degree unfortunately.
>
>> Now, if you're very idle for a sustained duration (e.g. low load),
>> you're assumed not sensitive to a bit of performance cost.
>> but if you're actually busy (over a longer period, not just "right
>> now"), you're assumed to be sensitive to the performance cost,
>> and what the algorithm does is make it less easy to go into the
>> expensive states.
>
> My brain still sparks and fizzles when I read that.. it just doesn't
> compute.
>
> What performance? performance isn't a well defined word.
for the sake of argument, lets call it the amount of work the system can
get done. (where 'work' is obviously still vague, but the software
running on the system will know what it is).
can you afford to do no work for those 20 msec every second ?
the answer to that will be "it depends", and estimating that "depends"
is what the load average value is used for.
I will totally accept that the value used right now is not the optimal
or correct value to use (as you said, the "top" type load average is not
available, which would have been much nicer).
On Mon, 2012-06-04 at 10:25 -0700, Arjan van de Ven wrote:
>
> for the sake of argument, lets call it the amount of work the system can
> get done. (where 'work' is obviously still vague, but the software
> running on the system will know what it is).
Screw the software, explain it to me.
> can you afford to do no work for those 20 msec every second ?
> the answer to that will be "it depends", and estimating that "depends"
> is what the load average value is used for.
See this just doesn't make any sense..
If there's work, we're not idle. If there's more work we're idle less.
We're just not going to be idle 20 msec every second if there's more to
do.
And like I said many times now, if you inflate some of the idle periods,
the work shifts (it doesn't become less) and a next idle period will be
smaller -- since we'll only become idle again once all work is done.
( and since its smaller you decrease your idle guestimate, and lower you
C state level )
The absolute only thing any of this makes any difference to is
synchronous stuff, like round-trip latencies. If you have a workload
that doesn't do anything until something else happens and so on. Then,
if you delay each signal a bit the total time will increase and the
amount of stuff done in a given time decreases.
However, if you stuff enough work down that pipe, the total throughput
should still be good -- esp. if you can saturate the thing and all idle
time goes away.
You said that accrued latency per time unit was an approximate measure,
but afaict that's related to what you want, whereas the unix
load-average is completely unrelated to any of this.
Anyway, as it stands you're better off ripping the entire thing out,
since I don't see how it could have worked, given that you're using an
entirely random measure of something.
Also, I'm still not understanding why decreasing your idle-period
guestimation when you're woken up early isn't catching any of this.
On Mon, 2012-06-04 at 09:53 -0700, Arjan van de Ven wrote:
> (I'll avoid the word "latency" here, since the real time people will
> then immediately think this is about controlling latency response, which
> it isn't)
Note that if it is the synchronous thing, it is exactly that.
On 6/4/2012 1:31 PM, Peter Zijlstra wrote:
>
> And like I said many times now, if you inflate some of the idle periods,
> the work shifts (it doesn't become less) and a next idle period will be
> smaller -- since we'll only become idle again once all work is done.
this is what is not really correct.
you can be idle for many reasons, not just because you have no work
left. most common is waiting for a disk IO. the idle period for that
will not get shorter just because the previous one took more time.
On Mon, 2012-06-04 at 13:45 -0700, Arjan van de Ven wrote:
> On 6/4/2012 1:31 PM, Peter Zijlstra wrote:
> >
> > And like I said many times now, if you inflate some of the idle periods,
> > the work shifts (it doesn't become less) and a next idle period will be
> > smaller -- since we'll only become idle again once all work is done.
>
> this is what is not really correct.
>
> you can be idle for many reasons, not just because you have no work
> left. most common is waiting for a disk IO. the idle period for that
> will not get shorter just because the previous one took more time.
Then you're back to synchronous behaviour and your earlier claim that it
is was not about latency response is false.
On Mon, 2012-06-04 at 08:39 -0700, Arjan van de Ven wrote:
> if there is a lot of idle, load can be low or high; load is more than
> just cpu usage. it includes waiting for resources and mutexes etc.
05:40:42 up 39 min, 20 users, load average: 0.63, 0.53, 0.42
root pts/11 05:03 5:05 6:16 0.00s /bin/sh /root/bin/tbench.sh 1 400
Like scheduling something cross core that is close to being synchronous.
-Mike
Hello again
Playing with cpu cgroups, I've found that performance of various workloads running inside cgroups can be significantly improved by increasing sched load resolution and that this improvement has already been committed to the kernel (c8b281161dfa4bb5d5be63fb036ce19347b88c63). Unfortunately, it turned out that the commit triggered power usage regression and as a result was reverted until the root cause of the regression was found (see commit e4c2fb0d5776b58049d2556b456144a4db3fe5a9). And the root cause is still unknown...
I guess that the behavior of the cpuidle menu governor, which is stated in the patch below, can explain the regression. The governor uses this_cpu_load() to estimate idleness of the system. After commit c8b281161dfa4bb5d5be63fb036ce19347b88c63, which increases sched load resolution, this_cpu_load() seems to return large values even for small loads, which would probably lead to the cpuidle governor making wrong decisions due to overestimating the system load.
So, this seems to be another reason to use some different performance multiplier in cpuidle governor.
On Jun 4, 2012, at 2:24 PM, Vladimir Davydov <[email protected]> wrote:
> rq->cpuload strongly depends on cgroup hierarchy. For example, if hundreds of
> tasks are running inside cpu:/test cgroup, the sum of cpuload over all cpus
> won't exceed 1024 (by default). That makes the cpuidle menu governor take wrong
> decisions, which can negatively affect overall performance.
>
> To cope this, use nr_running last seen in __update_cpu_load() instead of
> cpuload for calculating performance multiplier.
> ---
> drivers/cpuidle/governors/menu.c | 17 +++--------------
> include/linux/sched.h | 2 +-
> kernel/sched/core.c | 7 ++++---
> kernel/sched/sched.h | 3 +++
> 4 files changed, 11 insertions(+), 18 deletions(-)
>
> diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c
> index 0633575..2aa2625 100644
> --- a/drivers/cpuidle/governors/menu.c
> +++ b/drivers/cpuidle/governors/menu.c
> @@ -123,17 +123,6 @@ struct menu_device {
> };
>
>
> -#define LOAD_INT(x) ((x) >> FSHIFT)
> -#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100)
> -
> -static int get_loadavg(void)
> -{
> - unsigned long this = this_cpu_load();
> -
> -
> - return LOAD_INT(this) * 10 + LOAD_FRAC(this) / 10;
> -}
> -
> static inline int which_bucket(unsigned int duration)
> {
> int bucket = 0;
> @@ -170,13 +159,13 @@ static inline int which_bucket(unsigned int duration)
> static inline int performance_multiplier(void)
> {
> int mult = 1;
> + int this_cpu = smp_processor_id();
>
> /* for higher loadavg, we are more reluctant */
> -
> - mult += 2 * get_loadavg();
> + mult += 10 * nr_active_cpu(this_cpu);
>
> /* for IO wait tasks (per cpu!) we add 5x each */
> - mult += 10 * nr_iowait_cpu(smp_processor_id());
> + mult += 10 * nr_iowait_cpu(this_cpu);
>
> return mult;
> }
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index f45c0b2..fb83d22 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -141,7 +141,7 @@ extern unsigned long nr_running(void);
> extern unsigned long nr_uninterruptible(void);
> extern unsigned long nr_iowait(void);
> extern unsigned long nr_iowait_cpu(int cpu);
> -extern unsigned long this_cpu_load(void);
> +extern unsigned long nr_active_cpu(int cpu);
>
>
> extern void calc_global_load(unsigned long ticks);
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 39eb601..8cc2011 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -2155,10 +2155,10 @@ unsigned long nr_iowait_cpu(int cpu)
> return atomic_read(&this->nr_iowait);
> }
>
> -unsigned long this_cpu_load(void)
> +unsigned long nr_active_cpu(int cpu)
> {
> - struct rq *this = this_rq();
> - return this->cpu_load[0];
> + struct rq *this = cpu_rq(cpu);
> + return this->nr_active;
> }
>
>
> @@ -2494,6 +2494,7 @@ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
> this_rq->nr_load_updates++;
>
> /* Update our load: */
> + this_rq->nr_active = this_rq->nr_running;
> this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
> for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
> unsigned long old_load, new_load;
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index ba9dccf..2564712 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -379,6 +379,9 @@ struct rq {
> struct list_head leaf_rt_rq_list;
> #endif
>
> + /* nr_running last seen in __update_cpu_load() */
> + unsigned long nr_active;
> +
> /*
> * This is part of a global counter where only the total sum
> * over all CPUs matters. A task can increase this counter on
> --
> 1.7.1
>