The attached patch fixes a fundamental (and long-standing) bug in the
sleep-average estimator which is the root cause of the "contest
process_load" problems reported by Mike Galbraith and Andrew Morton - the
problem that Mike's patch addresses.
The bug is the following: the sleep_time code in activate_task()
over-estimates the true sleep time by 0.5 jiffies on average (0.5 msecs on
recent 2.5 kernels). Furthermore, for highly context-switch-intensive and
CPU-intensive workloads it means a constant 1-jiffy over-estimation. This
upsets the balance between giving and removing ticks and nullifies the
effect of the CPU busy-tick, catapulting the task(s) to highly interactive
status - while in reality they are constantly burning CPU time.
The fix is to round sleep_time down, not up. This slightly under-estimates
the sleep time, but that is not a real problem: any task with a sleep time
in the 1-jiffy range will see timekeeping-granularity artifacts from
various parts of the kernel anyway. We could use rdtsc to estimate the
sleep time, but I think that's unnecessary overhead.
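To illustrate the effect, here's a minimal userspace sketch (not kernel
code - the names merely mirror activate_task() and the numbers are made
up): a task that blocks and wakes within every tick has last_run == N and
is woken with jiffies == N+1, so the old formula credits it a full jiffy
of sleep even though it barely slept at all, while the rounded-down
formula credits it nothing:

#include <stdio.h>

/* old code: jiffies - p->last_run, effectively rounds the sleep time up */
static long old_estimate(unsigned long now, unsigned long last_run)
{
	return (long)(now - last_run);
}

/* fixed code: jiffies - p->last_run - 1, rounds the sleep time down */
static long new_estimate(unsigned long now, unsigned long last_run)
{
	return (long)(now - last_run) - 1;
}

int main(void)
{
	/* the task went to sleep during jiffy 1000 and woke up during
	 * jiffy 1001 - the true sleep time is somewhere below 1 jiffy */
	unsigned long last_run = 1000, now = 1001;

	printf("old estimate: %ld\n", old_estimate(now, last_run));	/* prints 1 */
	printf("new estimate: %ld\n", new_estimate(now, last_run));	/* prints 0 */
	return 0;
}

With the rounded-down value, the sleep_time > 0 check in activate_task()
then skips the bogus sleep-average boost entirely for such sub-jiffy
sleeps.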
The fixups in Mike's scheduler patch (which is in -mm8) basically work
around this bug. The patch below definitely fixes the contest-load
starvation bug, but it remains to be seen what other effects it has on
interactivity. In any case, this bug in the estimator is real, and if
there's any other interactivity problem around then we need to deal with
it on top of this patch.
This bug has basically been in the O(1) scheduler from day 1, so I'm
quite hopeful that a number of interactivity complaints are fixed by this
patch.
Ingo
--- linux/kernel/sched.c.orig
+++ linux/kernel/sched.c
@@ -342,10 +342,10 @@ static inline void __activate_task(task_
*/
static inline int activate_task(task_t *p, runqueue_t *rq)
{
- unsigned long sleep_time = jiffies - p->last_run;
+ long sleep_time = jiffies - p->last_run - 1;
int requeue_waker = 0;
- if (sleep_time) {
+ if (sleep_time > 0) {
int sleep_avg;
/*
The attached patch is on top of the previous -C4 scheduler patch. It fixes
a few whitespace issues and removes the MAX_PRIO setting in the init-task
path, which is unnecessary and might even lead to bugs: MAX_PRIO is
outside the valid priority range, and technically the init thread is not
an idle thread yet at this point.
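(For reference, a rough sketch of why a prio of MAX_PRIO is out of range -
simplified from 2.5 kernel/sched.c, details from memory: the priority
arrays have exactly MAX_PRIO queues, so valid priorities are
0..MAX_PRIO-1, and enqueueing a task with ->prio == MAX_PRIO would index
one past the end of the array.)

struct prio_array {
	int nr_active;
	unsigned long bitmap[BITMAP_SIZE];
	struct list_head queue[MAX_PRIO];	/* valid indices: 0 .. MAX_PRIO-1 */
};

static inline void enqueue_task(struct task_struct *p, prio_array_t *array)
{
	/* ->prio == MAX_PRIO would overrun queue[] and bitmap here */
	list_add_tail(&p->run_list, array->queue + p->prio);
	__set_bit(p->prio, array->bitmap);
	array->nr_active++;
	p->array = array;
}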
Ingo
--- linux/kernel/sched.c.orig
+++ linux/kernel/sched.c
@@ -846,7 +846,7 @@ void sched_balance_exec(void)
}
/*
- * Find the busiest node. All previous node loads contribute with a
+ * Find the busiest node. All previous node loads contribute with a
* geometrically deccaying weight to the load measure:
* load_{t} = load_{t-1}/2 + nr_node_running_{t}
* This way sudden load peaks are flattened out a bit.
@@ -854,7 +854,7 @@ void sched_balance_exec(void)
static int find_busiest_node(int this_node)
{
int i, node = -1, load, this_load, maxload;
-
+
this_load = maxload = (this_rq()->prev_node_load[this_node] >> 1)
+ atomic_read(&node_nr_running[this_node]);
this_rq()->prev_node_load[this_node] = this_load;
@@ -1194,8 +1194,8 @@ void scheduler_tick(int user_ticks, int
runqueue_t *rq = this_rq();
task_t *p = current;
- if (rcu_pending(cpu))
- rcu_check_callbacks(cpu, user_ticks);
+ if (rcu_pending(cpu))
+ rcu_check_callbacks(cpu, user_ticks);
if (p == rq->idle) {
/* note: this timer irq context must be accounted for as well */
@@ -1353,7 +1353,7 @@ switch_tasks:
if (likely(prev != next)) {
rq->nr_switches++;
rq->curr = next;
-
+
prepare_arch_switch(rq, next);
prev = context_switch(rq, prev, next);
barrier();
@@ -1483,7 +1483,7 @@ void __wake_up_sync(wait_queue_head_t *q
}
#endif
-
+
void complete(struct completion *x)
{
unsigned long flags;
@@ -1567,7 +1567,7 @@ long interruptible_sleep_on_timeout(wait
void sleep_on(wait_queue_head_t *q)
{
SLEEP_ON_VAR
-
+
current->state = TASK_UNINTERRUPTIBLE;
SLEEP_ON_HEAD
@@ -1578,7 +1578,7 @@ void sleep_on(wait_queue_head_t *q)
long sleep_on_timeout(wait_queue_head_t *q, long timeout)
{
SLEEP_ON_VAR
-
+
current->state = TASK_UNINTERRUPTIBLE;
SLEEP_ON_HEAD
@@ -2472,12 +2472,12 @@ spinlock_t kernel_flag __cacheline_align
static void kstat_init_cpu(int cpu)
{
- /* Add any initialisation to kstat here */
- /* Useful when cpu offlining logic is added.. */
+ /* Add any initialisation to kstat here */
+ /* Useful when cpu offlining logic is added.. */
}
static int __devinit kstat_cpu_notify(struct notifier_block *self,
- unsigned long action, void *hcpu)
+ unsigned long action, void *hcpu)
{
int cpu = (unsigned long)hcpu;
switch(action) {
@@ -2489,7 +2489,7 @@ static int __devinit kstat_cpu_notify(st
}
return NOTIFY_OK;
}
-
+
static struct notifier_block __devinitdata kstat_nb = {
.notifier_call = kstat_cpu_notify,
.next = NULL,
@@ -2498,7 +2498,7 @@ static struct notifier_block __devinitda
__init static void init_kstat(void) {
kstat_cpu_notify(&kstat_nb, (unsigned long)CPU_UP_PREPARE,
(void *)(long)smp_processor_id());
- register_cpu_notifier(&kstat_nb);
+ register_cpu_notifier(&kstat_nb);
}
void __init sched_init(void)
@@ -2538,7 +2538,6 @@ void __init sched_init(void)
rq->idle = current;
set_task_cpu(current, smp_processor_id());
wake_up_forked_process(current);
- current->prio = MAX_PRIO;
init_timers();
Ingo Molnar <[email protected]> wrote:
>
> The attached patch fixes a fundamental (and long-standing) bug in the
> sleep-average estimator which is the root cause of the "contest
> process_load" problems reported by Mike Galbraith and Andrew Morton - the
> problem that Mike's patch addresses.
> --- linux/kernel/sched.c.orig
> +++ linux/kernel/sched.c
> @@ -342,10 +342,10 @@ static inline void __activate_task(task_
> */
> static inline int activate_task(task_t *p, runqueue_t *rq)
> {
> - unsigned long sleep_time = jiffies - p->last_run;
> + long sleep_time = jiffies - p->last_run - 1;
> int requeue_waker = 0;
>
> - if (sleep_time) {
> + if (sleep_time > 0) {
> int sleep_avg;
>
> /*
>
Would this be an equivalent fix for 2.4.20-ck4?
--- kernel/sched.c 2003-03-20 16:33:07.000000000 -0800
+++ kernel/sched.c.orig 2003-03-20 16:38:51.000000000 -0800
@@ -307,10 +286,10 @@
static inline void activate_task(task_t *p, runqueue_t *rq)
{
- long sleep_time = jiffies - p->sleep_timestamp - 1;
+ unsigned long sleep_time = jiffies - p->sleep_timestamp;
prio_array_t *array = rq->active;
- if (!rt_task(p) && (sleep_time > 0)) {
+ if (!rt_task(p) && sleep_time) {
/*
* This code gives a bonus to interactive tasks. We update
* an 'average sleep time' value here, based on
--
Eric Wong
On Fri, 21 Mar 2003 11:44, Eric Wong wrote:
> Ingo Molnar <[email protected]> wrote:
> > The attached patch fixes a fundamental (and long-standing) bug in the
> > sleep-average estimator which is the root cause of the "contest
> > process_load" problems reported by Mike Galbraith and Andrew Morton - the
> > problem that Mike's patch addresses.
> >
> Would this be an equivalent fix for 2.4.20-ck4?
Yes it would be, but ck4 is less prone to the same problem because it
lacks the other interactivity changes in 2.5. I'm working on putting them
all together for -ck*, but until they are semi-stabilised I won't release
them.
Con
On Thu, 20 Mar 2003, Eric Wong wrote:
> Would this be an equivalent fix for 2.4.20-ck4?
Yes - but the reverse of it.
> - long sleep_time = jiffies - p->sleep_timestamp - 1;
> + unsigned long sleep_time = jiffies - p->sleep_timestamp;
> prio_array_t *array = rq->active;
>
> - if (!rt_task(p) && (sleep_time > 0)) {
> + if (!rt_task(p) && sleep_time) {
Ingo
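(In other words - a sketch, untested: Eric's hunk above has the old and
new lines swapped, since the diff goes from the patched file back to the
original; applied the right way around, the equivalent 2.4.20-ck4 change
would look roughly like this:)

-	unsigned long sleep_time = jiffies - p->sleep_timestamp;
+	long sleep_time = jiffies - p->sleep_timestamp - 1;
 	prio_array_t *array = rq->active;

-	if (!rt_task(p) && sleep_time) {
+	if (!rt_task(p) && (sleep_time > 0)) {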