2003-02-28 09:40:17

by Ingo Molnar

Subject: [patch] "HT scheduler", sched-2.5.63-B3


this is the latest HT scheduler patch, against 2.5.63-BK. Changes in -B3,
relative to the -A3 patch:

- fix bootup hang caused by an SMP init race. [the race only triggered
when not using serial logging - ugh.]

- fix bootup hang on non-HT SMP boxes compiled with HT-scheduling
support; the active balancer's loop did not handle this case correctly.

- fix over-eager active-balancing due to an over-inflated nr_running value
when active-balancing via the migration thread.

This patch has been test-compiled and test-booted on HT, SMP and UP
boxes; everything works.

Ingo
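
The core of the patch below is that sibling CPUs of one physical package
get mapped onto a single runqueue via the rq_idx()/cpu_idx() arrays and
sched_map_runqueue(). A minimal user-space sketch of just that remapping
step (illustrative only; the array names mirror the patch, while NR_CPUS,
the helper names and the main() driver are made up):

/* Sketch of the runqueue remapping done by sched_map_runqueue() below. */
#include <assert.h>
#include <stdio.h>

#define NR_CPUS 4

static int rq_idx[NR_CPUS];     /* runqueue a CPU is attached to        */
static int cpu_idx[NR_CPUS];    /* slot of the CPU within that runqueue */
static int rq_nr_cpus[NR_CPUS]; /* how many CPUs share each runqueue    */

static void init_mapping(void)
{
        int i;

        /* boot-time default: 1:1 mapping between CPUs and runqueues */
        for (i = 0; i < NR_CPUS; i++) {
                rq_idx[i] = i;
                cpu_idx[i] = 0;
                rq_nr_cpus[i] = 1;
        }
}

/* merge cpu2 into cpu1's runqueue - one runqueue per physical package */
static void map_runqueue(int cpu1, int cpu2)
{
        assert(rq_idx[cpu1] == cpu1);   /* cpu1 must still own its runqueue */
        rq_idx[cpu2] = cpu1;
        cpu_idx[cpu2] = rq_nr_cpus[cpu1]++;
}

int main(void)
{
        int i;

        init_mapping();
        map_runqueue(0, 1);     /* CPUs 0/1 are siblings of package 0 */
        map_runqueue(2, 3);     /* CPUs 2/3 are siblings of package 1 */
        for (i = 0; i < NR_CPUS; i++)
                printf("cpu%d -> rq%d, slot %d\n", i, rq_idx[i], cpu_idx[i]);
        return 0;
}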

--- linux/include/linux/sched.h.orig
+++ linux/include/linux/sched.h
@@ -147,6 +147,24 @@ typedef struct task_struct task_t;
extern void sched_init(void);
extern void init_idle(task_t *idle, int cpu);

+/*
+ * Is there a way to do this via Kconfig?
+ */
+#if CONFIG_NR_SIBLINGS_2
+# define CONFIG_NR_SIBLINGS 2
+#elif CONFIG_NR_SIBLINGS_4
+# define CONFIG_NR_SIBLINGS 4
+#else
+# define CONFIG_NR_SIBLINGS 0
+#endif
+
+#if CONFIG_NR_SIBLINGS
+# define CONFIG_SHARE_RUNQUEUE 1
+#else
+# define CONFIG_SHARE_RUNQUEUE 0
+#endif
+extern void sched_map_runqueue(int cpu1, int cpu2);
+
extern void show_state(void);
extern void show_trace(unsigned long *stack);
extern void show_stack(unsigned long *stack);
@@ -328,7 +346,7 @@ struct task_struct {
prio_array_t *array;

unsigned long sleep_avg;
- unsigned long sleep_timestamp;
+ unsigned long last_run;

unsigned long policy;
unsigned long cpus_allowed;
@@ -803,11 +821,6 @@ static inline unsigned int task_cpu(stru
return p->thread_info->cpu;
}

-static inline void set_task_cpu(struct task_struct *p, unsigned int cpu)
-{
- p->thread_info->cpu = cpu;
-}
-
#else

static inline unsigned int task_cpu(struct task_struct *p)
@@ -815,10 +828,6 @@ static inline unsigned int task_cpu(stru
return 0;
}

-static inline void set_task_cpu(struct task_struct *p, unsigned int cpu)
-{
-}
-
#endif /* CONFIG_SMP */

#endif /* __KERNEL__ */
--- linux/include/asm-i386/apic.h.orig
+++ linux/include/asm-i386/apic.h
@@ -98,4 +98,6 @@ extern unsigned int nmi_watchdog;

#endif /* CONFIG_X86_LOCAL_APIC */

+extern int phys_proc_id[NR_CPUS];
+
#endif /* __ASM_APIC_H */
--- linux/arch/i386/kernel/cpu/proc.c.orig
+++ linux/arch/i386/kernel/cpu/proc.c
@@ -1,4 +1,5 @@
#include <linux/smp.h>
+#include <linux/sched.h>
#include <linux/timex.h>
#include <linux/string.h>
#include <asm/semaphore.h>
@@ -101,6 +102,13 @@ static int show_cpuinfo(struct seq_file
fpu_exception ? "yes" : "no",
c->cpuid_level,
c->wp_works_ok ? "yes" : "no");
+#if CONFIG_SHARE_RUNQUEUE
+{
+ extern long __rq_idx[NR_CPUS];
+
+ seq_printf(m, "\nrunqueue\t: %ld\n", __rq_idx[n]);
+}
+#endif

for ( i = 0 ; i < 32*NCAPINTS ; i++ )
if ( test_bit(i, c->x86_capability) &&
--- linux/arch/i386/kernel/smpboot.c.orig
+++ linux/arch/i386/kernel/smpboot.c
@@ -38,6 +38,7 @@
#include <linux/kernel.h>

#include <linux/mm.h>
+#include <linux/sched.h>
#include <linux/kernel_stat.h>
#include <linux/smp_lock.h>
#include <linux/irq.h>
@@ -939,6 +940,16 @@ void *xquad_portio;

int cpu_sibling_map[NR_CPUS] __cacheline_aligned;

+static int test_ht;
+
+static int __init ht_setup(char *str)
+{
+ test_ht = 1;
+ return 1;
+}
+
+__setup("test_ht", ht_setup);
+
static void __init smp_boot_cpus(unsigned int max_cpus)
{
int apicid, cpu, bit;
@@ -1079,7 +1090,20 @@ static void __init smp_boot_cpus(unsigne
Dprintk("Boot done.\n");

/*
- * If Hyper-Threading is avaialble, construct cpu_sibling_map[], so
+ * Here we can be sure that there is an IO-APIC in the system. Let's
+ * go and set it up:
+ */
+ smpboot_setup_io_apic();
+
+ setup_boot_APIC_clock();
+
+ /*
+ * Synchronize the TSC with the AP
+ */
+ if (cpu_has_tsc && cpucount)
+ synchronize_tsc_bp();
+ /*
+ * If Hyper-Threading is available, construct cpu_sibling_map[], so
* that we can tell the sibling CPU efficiently.
*/
if (cpu_has_ht && smp_num_siblings > 1) {
@@ -1088,7 +1112,8 @@ static void __init smp_boot_cpus(unsigne

for (cpu = 0; cpu < NR_CPUS; cpu++) {
int i;
- if (!test_bit(cpu, &cpu_callout_map)) continue;
+ if (!test_bit(cpu, &cpu_callout_map))
+ continue;

for (i = 0; i < NR_CPUS; i++) {
if (i == cpu || !test_bit(i, &cpu_callout_map))
@@ -1104,17 +1129,41 @@ static void __init smp_boot_cpus(unsigne
printk(KERN_WARNING "WARNING: No sibling found for CPU %d.\n", cpu);
}
}
- }
-
- smpboot_setup_io_apic();
-
- setup_boot_APIC_clock();
+#if CONFIG_SHARE_RUNQUEUE
+ /*
+ * At this point APs would have synchronised TSC and
+ * waiting for smp_commenced, with their APIC timer
+ * disabled. So BP can go ahead do some initialization
+ * for Hyper-Threading (if needed).
+ */
+ for (cpu = 0; cpu < NR_CPUS; cpu++) {
+ int i;
+ if (!test_bit(cpu, &cpu_callout_map))
+ continue;
+ for (i = 0; i < NR_CPUS; i++) {
+ if (i <= cpu)
+ continue;
+ if (!test_bit(i, &cpu_callout_map))
+ continue;

- /*
- * Synchronize the TSC with the AP
- */
- if (cpu_has_tsc && cpucount)
- synchronize_tsc_bp();
+ if (phys_proc_id[cpu] != phys_proc_id[i])
+ continue;
+ /*
+ * merge runqueues, resulting in one
+ * runqueue per package:
+ */
+ sched_map_runqueue(cpu, i);
+ break;
+ }
+ }
+#endif
+ }
+#if CONFIG_SHARE_RUNQUEUE
+ if (smp_num_siblings == 1 && test_ht) {
+ printk("Simulating shared runqueues - mapping CPU#1's runqueue to CPU#0's runqueue.\n");
+ sched_map_runqueue(0, 1);
+ }
+#endif
}

/* These are wrappers to interface to the new boot process. Someone
--- linux/arch/i386/Kconfig.orig
+++ linux/arch/i386/Kconfig
@@ -422,6 +422,24 @@ config SMP

If you don't know what to do here, say N.

+choice
+
+ prompt "Hyperthreading Support"
+ depends on SMP
+ default NR_SIBLINGS_0
+
+config NR_SIBLINGS_0
+ bool "off"
+
+config NR_SIBLINGS_2
+ bool "2 siblings"
+
+config NR_SIBLINGS_4
+ bool "4 siblings"
+
+endchoice
+
+
config PREEMPT
bool "Preemptible Kernel"
help
--- linux/kernel/fork.c.orig
+++ linux/kernel/fork.c
@@ -916,7 +916,7 @@ static struct task_struct *copy_process(
*/
p->first_time_slice = 1;
current->time_slice >>= 1;
- p->sleep_timestamp = jiffies;
+ p->last_run = jiffies;
if (!current->time_slice) {
/*
* This case is rare, it happens when the parent has only
--- linux/kernel/sched.c.orig
+++ linux/kernel/sched.c
@@ -67,6 +67,7 @@
#define INTERACTIVE_DELTA 2
#define MAX_SLEEP_AVG (2*HZ)
#define STARVATION_LIMIT (2*HZ)
+#define AGRESSIVE_IDLE_STEAL 1
#define NODE_THRESHOLD 125

/*
@@ -141,6 +142,48 @@ struct prio_array {
};

/*
+ * It's possible for two CPUs to share the same runqueue.
+ * This makes sense if they eg. share caches.
+ *
+ * We take the common 1:1 (SMP, UP) case and optimize it,
+ * the rest goes via remapping: rq_idx(cpu) gives the
+ * runqueue on which a particular cpu is on, cpu_idx(cpu)
+ * gives the rq-specific index of the cpu.
+ *
+ * (Note that the generic scheduler code does not impose any
+ * restrictions on the mappings - there can be 4 CPUs per
+ * runqueue or even assymetric mappings.)
+ */
+#if CONFIG_SHARE_RUNQUEUE
+# define MAX_NR_SIBLINGS CONFIG_NR_SIBLINGS
+ long __rq_idx[NR_CPUS] __cacheline_aligned;
+ static long __cpu_idx[NR_CPUS] __cacheline_aligned;
+# define rq_idx(cpu) (__rq_idx[(cpu)])
+# define cpu_idx(cpu) (__cpu_idx[(cpu)])
+# define for_each_sibling(idx, rq) \
+ for ((idx) = 0; (idx) < (rq)->nr_cpus; (idx)++)
+# define rq_nr_cpus(rq) ((rq)->nr_cpus)
+# define cpu_active_balance(c) (cpu_rq(c)->cpu[0].active_balance)
+#else
+# define MAX_NR_SIBLINGS 1
+# define rq_idx(cpu) (cpu)
+# define cpu_idx(cpu) 0
+# define for_each_sibling(idx, rq) while (0)
+# define cpu_active_balance(c) 0
+# define do_active_balance(rq, cpu) do { } while (0)
+# define rq_nr_cpus(rq) 1
+ static inline void active_load_balance(runqueue_t *rq, int this_cpu) { }
+#endif
+
+typedef struct cpu_s {
+ task_t *curr, *idle;
+ task_t *migration_thread;
+ struct list_head migration_queue;
+ int active_balance;
+ int cpu;
+} cpu_t;
+
+/*
* This is the main, per-CPU runqueue data structure.
*
* Locking rule: those places that want to lock multiple runqueues
@@ -151,7 +194,6 @@ struct runqueue {
spinlock_t lock;
unsigned long nr_running, nr_switches, expired_timestamp,
nr_uninterruptible;
- task_t *curr, *idle;
struct mm_struct *prev_mm;
prio_array_t *active, *expired, arrays[2];
int prev_nr_running[NR_CPUS];
@@ -160,27 +202,39 @@ struct runqueue {
unsigned int nr_balanced;
int prev_node_load[MAX_NUMNODES];
#endif
- task_t *migration_thread;
- struct list_head migration_queue;
+ int nr_cpus;
+ cpu_t cpu[MAX_NR_SIBLINGS];

atomic_t nr_iowait;
} ____cacheline_aligned;

static struct runqueue runqueues[NR_CPUS] __cacheline_aligned;

-#define cpu_rq(cpu) (runqueues + (cpu))
+#define cpu_rq(cpu) (runqueues + (rq_idx(cpu)))
+#define cpu_int(c) ((cpu_rq(c))->cpu + cpu_idx(c))
+#define cpu_curr_ptr(cpu) (cpu_int(cpu)->curr)
+#define cpu_idle_ptr(cpu) (cpu_int(cpu)->idle)
+
#define this_rq() cpu_rq(smp_processor_id())
#define task_rq(p) cpu_rq(task_cpu(p))
-#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
#define rt_task(p) ((p)->prio < MAX_RT_PRIO)

+#define migration_thread(cpu) (cpu_int(cpu)->migration_thread)
+#define migration_queue(cpu) (&cpu_int(cpu)->migration_queue)
+
+#if NR_CPUS > 1
+# define task_allowed(p, cpu) ((p)->cpus_allowed & (1UL << (cpu)))
+#else
+# define task_allowed(p, cpu) 1
+#endif
+
/*
* Default context-switch locking:
*/
#ifndef prepare_arch_switch
# define prepare_arch_switch(rq, next) do { } while(0)
# define finish_arch_switch(rq, next) spin_unlock_irq(&(rq)->lock)
-# define task_running(rq, p) ((rq)->curr == (p))
+# define task_running(p) (cpu_curr_ptr(task_cpu(p)) == (p))
#endif

#ifdef CONFIG_NUMA
@@ -323,16 +377,21 @@ static inline int effective_prio(task_t
* Also update all the scheduling statistics stuff. (sleep average
* calculation, priority modifiers, etc.)
*/
+static inline void __activate_task(task_t *p, runqueue_t *rq)
+{
+ enqueue_task(p, rq->active);
+ nr_running_inc(rq);
+}
+
static inline void activate_task(task_t *p, runqueue_t *rq)
{
- unsigned long sleep_time = jiffies - p->sleep_timestamp;
- prio_array_t *array = rq->active;
+ unsigned long sleep_time = jiffies - p->last_run;

if (!rt_task(p) && sleep_time) {
/*
* This code gives a bonus to interactive tasks. We update
* an 'average sleep time' value here, based on
- * sleep_timestamp. The more time a task spends sleeping,
+ * ->last_run. The more time a task spends sleeping,
* the higher the average gets - and the higher the priority
* boost gets as well.
*/
@@ -341,8 +400,7 @@ static inline void activate_task(task_t
p->sleep_avg = MAX_SLEEP_AVG;
p->prio = effective_prio(p);
}
- enqueue_task(p, array);
- nr_running_inc(rq);
+ __activate_task(p, rq);
}

/*
@@ -383,8 +441,18 @@ static inline void resched_task(task_t *
#endif
}

+static inline void resched_cpu(int cpu)
+{
+ resched_task(cpu_curr_ptr(cpu));
+}
+
#ifdef CONFIG_SMP

+static inline void set_task_cpu(struct task_struct *p, unsigned int cpu)
+{
+ p->thread_info->cpu = cpu;
+}
+
/*
* wait_task_inactive - wait for a thread to unschedule.
*
@@ -402,7 +470,7 @@ void wait_task_inactive(task_t * p)
repeat:
preempt_disable();
rq = task_rq(p);
- if (unlikely(task_running(rq, p))) {
+ if (unlikely(task_running(p))) {
cpu_relax();
/*
* enable/disable preemption just to make this
@@ -413,7 +481,7 @@ repeat:
goto repeat;
}
rq = task_rq_lock(p, &flags);
- if (unlikely(task_running(rq, p))) {
+ if (unlikely(task_running(p))) {
task_rq_unlock(rq, &flags);
preempt_enable();
goto repeat;
@@ -421,6 +489,13 @@ repeat:
task_rq_unlock(rq, &flags);
preempt_enable();
}
+
+#else
+
+static inline void set_task_cpu(struct task_struct *p, unsigned int cpu)
+{
+}
+
#endif

/*
@@ -435,10 +510,39 @@ repeat:
*/
void kick_if_running(task_t * p)
{
- if ((task_running(task_rq(p), p)) && (task_cpu(p) != smp_processor_id()))
+ if ((task_running(p)) && (task_cpu(p) != smp_processor_id()))
resched_task(p);
}

+static void wake_up_cpu(runqueue_t *rq, int cpu, task_t *p)
+{
+ cpu_t *curr_cpu;
+ task_t *curr;
+ int idx;
+
+ if (idle_cpu(cpu))
+ return resched_cpu(cpu);
+
+ for_each_sibling(idx, rq) {
+ curr_cpu = rq->cpu + idx;
+ if (!task_allowed(p, curr_cpu->cpu))
+ continue;
+ if (curr_cpu->idle == curr_cpu->curr)
+ return resched_cpu(curr_cpu->cpu);
+ }
+
+ if (p->prio < cpu_curr_ptr(cpu)->prio)
+ return resched_task(cpu_curr_ptr(cpu));
+
+ for_each_sibling(idx, rq) {
+ curr_cpu = rq->cpu + idx;
+ if (!task_allowed(p, curr_cpu->cpu))
+ continue;
+ curr = curr_cpu->curr;
+ if (p->prio < curr->prio)
+ return resched_task(curr);
+ }
+}
/***
* try_to_wake_up - wake up a thread
* @p: the to-be-woken-up thread
@@ -460,6 +564,7 @@ static int try_to_wake_up(task_t * p, un
long old_state;
runqueue_t *rq;

+ BUG_ON(!p);
repeat_lock_task:
rq = task_rq_lock(p, &flags);
old_state = p->state;
@@ -469,7 +574,7 @@ repeat_lock_task:
* Fast-migrate the task if it's not running or runnable
* currently. Do not violate hard affinity.
*/
- if (unlikely(sync && !task_running(rq, p) &&
+ if (unlikely(sync && !task_running(p) &&
(task_cpu(p) != smp_processor_id()) &&
(p->cpus_allowed & (1UL << smp_processor_id())))) {

@@ -479,10 +584,12 @@ repeat_lock_task:
}
if (old_state == TASK_UNINTERRUPTIBLE)
rq->nr_uninterruptible--;
- activate_task(p, rq);
-
- if (p->prio < rq->curr->prio)
- resched_task(rq->curr);
+ if (sync)
+ __activate_task(p, rq);
+ else {
+ activate_task(p, rq);
+ wake_up_cpu(rq, task_cpu(p), p);
+ }
success = 1;
}
p->state = TASK_RUNNING;
@@ -605,7 +712,7 @@ static inline task_t * context_switch(ru
{
struct mm_struct *mm = next->mm;
struct mm_struct *oldmm = prev->active_mm;
-
+
if (unlikely(!mm)) {
next->active_mm = oldmm;
atomic_inc(&oldmm->mm_count);
@@ -637,8 +744,9 @@ unsigned long nr_running(void)
unsigned long i, sum = 0;

for (i = 0; i < NR_CPUS; i++)
- sum += cpu_rq(i)->nr_running;
-
+ /* Shared runqueues are counted only once. */
+ if (!cpu_idx(i))
+ sum += cpu_rq(i)->nr_running;
return sum;
}

@@ -649,7 +757,9 @@ unsigned long nr_uninterruptible(void)
for (i = 0; i < NR_CPUS; i++) {
if (!cpu_online(i))
continue;
- sum += cpu_rq(i)->nr_uninterruptible;
+ /* Shared runqueues are counted only once. */
+ if (!cpu_idx(i))
+ sum += cpu_rq(i)->nr_uninterruptible;
}
return sum;
}
@@ -831,7 +941,23 @@ static inline unsigned long cpus_to_bala

#endif /* CONFIG_NUMA */

-#if CONFIG_SMP
+/*
+ * One of the idle_cpu_tick() and busy_cpu_tick() functions will
+ * get called every timer tick, on every CPU. Our balancing action
+ * frequency and balancing agressivity depends on whether the CPU is
+ * idle or not.
+ *
+ * busy-rebalance every 250 msecs. idle-rebalance every 1 msec. (or on
+ * systems with HZ=100, every 10 msecs.)
+ */
+#define BUSY_REBALANCE_TICK (HZ/4 ?: 1)
+#define IDLE_REBALANCE_TICK (HZ/1000 ?: 1)
+
+#if !CONFIG_SMP
+
+static inline void load_balance(runqueue_t *rq, int this_cpu, int idle) { }
+
+#else

/*
* double_lock_balance - lock the busiest runqueue
@@ -947,12 +1073,7 @@ static inline void pull_task(runqueue_t
set_task_cpu(p, this_cpu);
nr_running_inc(this_rq);
enqueue_task(p, this_rq->active);
- /*
- * Note that idle threads have a prio of MAX_PRIO, for this test
- * to be always true for them.
- */
- if (p->prio < this_rq->curr->prio)
- set_need_resched();
+ wake_up_cpu(this_rq, this_cpu, p);
}

/*
@@ -963,9 +1084,9 @@ static inline void pull_task(runqueue_t
* We call this with the current runqueue locked,
* irqs disabled.
*/
-static void load_balance(runqueue_t *this_rq, int idle)
+static void load_balance(runqueue_t *this_rq, int this_cpu, int idle)
{
- int imbalance, idx, this_cpu = smp_processor_id();
+ int imbalance, idx;
runqueue_t *busiest;
prio_array_t *array;
struct list_head *head, *curr;
@@ -1013,12 +1134,15 @@ skip_queue:
* 1) running (obviously), or
* 2) cannot be migrated to this CPU due to cpus_allowed, or
* 3) are cache-hot on their current CPU.
+ *
+ * (except if we are in idle mode which is a more agressive
+ * form of rebalancing.)
*/

-#define CAN_MIGRATE_TASK(p,rq,this_cpu) \
- ((jiffies - (p)->sleep_timestamp > cache_decay_ticks) && \
- !task_running(rq, p) && \
- ((p)->cpus_allowed & (1UL << (this_cpu))))
+#define CAN_MIGRATE_TASK(p,rq,cpu) \
+ (((idle && AGRESSIVE_IDLE_STEAL) || \
+ (jiffies - (p)->last_run > cache_decay_ticks)) && \
+ !task_running(p) && task_allowed(p, cpu))

curr = curr->prev;

@@ -1041,31 +1165,144 @@ out:
;
}

+#if CONFIG_SHARE_RUNQUEUE
+static void active_load_balance(runqueue_t *this_rq, int this_cpu)
+{
+ runqueue_t *rq;
+ int i, idx;
+
+ for (i = 0; i < NR_CPUS; i++) {
+ if (!cpu_online(i))
+ continue;
+ rq = cpu_rq(i);
+ if (rq == this_rq)
+ continue;
+ /*
+ * If there's only one CPU mapped to this runqueue
+ * then there can be no SMT imbalance:
+ */
+ if (rq->nr_cpus == 1)
+ continue;
+ /*
+ * Any SMT-specific imbalance?
+ */
+ for_each_sibling(idx, rq)
+ if (rq->cpu[idx].idle == rq->cpu[idx].curr)
+ continue;
+
+ /*
+ * At this point it's sure that we have a SMT
+ * imbalance: this (physical) CPU is idle but
+ * another CPU has two (or more) tasks running.
+ *
+ * We wake up one of the migration threads (it
+ * doesnt matter which one) and let it fix things up:
+ */
+ if (!cpu_active_balance(i)) {
+ cpu_active_balance(i) = 1;
+ spin_unlock(&this_rq->lock);
+ if (rq->cpu[0].migration_thread)
+ wake_up_process(rq->cpu[0].migration_thread);
+ spin_lock(&this_rq->lock);
+ }
+ }
+}
+
+static void do_active_balance(runqueue_t *this_rq, int this_cpu)
+{
+ runqueue_t *rq;
+ int i, idx;
+
+ spin_unlock(&this_rq->lock);
+
+ cpu_active_balance(this_cpu) = 0;
+
+ /*
+ * Is the imbalance still present?
+ */
+ for_each_sibling(idx, this_rq)
+ if (this_rq->cpu[idx].idle == this_rq->cpu[idx].curr)
+ goto out;
+
+ for (i = 0; i < NR_CPUS; i++) {
+ if (!cpu_online(i))
+ continue;
+ rq = cpu_rq(i);
+ if (rq == this_rq)
+ continue;
+
+ /* completely idle CPU? */
+ if (rq->nr_running)
+ continue;
+
+ /*
+ * At this point it's reasonably sure that we have an
+ * imbalance. Since we are the migration thread, try to
+ * balance a thread over to the target queue.
+ */
+ spin_lock(&this_rq->lock);
+ this_rq->nr_running--;
+ spin_unlock(&this_rq->lock);
+
+ spin_lock(&rq->lock);
+ load_balance(rq, i, 1);
+ spin_unlock(&rq->lock);
+
+ spin_lock(&this_rq->lock);
+ this_rq->nr_running++;
+ spin_unlock(&this_rq->lock);
+ goto out;
+ }
+out:
+ spin_lock(&this_rq->lock);
+}
+
/*
- * One of the idle_cpu_tick() and busy_cpu_tick() functions will
- * get called every timer tick, on every CPU. Our balancing action
- * frequency and balancing agressivity depends on whether the CPU is
- * idle or not.
+ * This routine is called to map a CPU into another CPU's runqueue.
*
- * busy-rebalance every 250 msecs. idle-rebalance every 1 msec. (or on
- * systems with HZ=100, every 10 msecs.)
+ * This must be called during bootup with the merged runqueue having
+ * no tasks.
*/
-#define BUSY_REBALANCE_TICK (HZ/4 ?: 1)
-#define IDLE_REBALANCE_TICK (HZ/1000 ?: 1)
+void sched_map_runqueue(int cpu1, int cpu2)
+{
+ runqueue_t *rq1 = cpu_rq(cpu1);
+ runqueue_t *rq2 = cpu_rq(cpu2);
+ int cpu2_idx_orig = cpu_idx(cpu2), cpu2_idx;
+
+ BUG_ON(rq1 == rq2 || rq2->nr_running || rq_idx(cpu1) != cpu1);
+ /*
+ * At this point, we dont have anything in the runqueue yet. So,
+ * there is no need to move processes between the runqueues.
+ * Only, the idle processes should be combined and accessed
+ * properly.
+ */
+ cpu2_idx = rq1->nr_cpus++;
+
+ rq_idx(cpu2) = cpu1;
+ cpu_idx(cpu2) = cpu2_idx;
+ rq1->cpu[cpu2_idx].cpu = cpu2;
+ rq1->cpu[cpu2_idx].idle = rq2->cpu[cpu2_idx_orig].idle;
+ rq1->cpu[cpu2_idx].curr = rq2->cpu[cpu2_idx_orig].curr;
+ INIT_LIST_HEAD(&rq1->cpu[cpu2_idx].migration_queue);

-static inline void idle_tick(runqueue_t *rq)
+ /* just to be safe: */
+ rq2->cpu[cpu2_idx_orig].idle = NULL;
+ rq2->cpu[cpu2_idx_orig].curr = NULL;
+}
+#endif
+#endif
+
+DEFINE_PER_CPU(struct kernel_stat, kstat) = { { 0 } };
+
+static inline void idle_tick(runqueue_t *rq, unsigned long j)
{
- if (jiffies % IDLE_REBALANCE_TICK)
+ if (j % IDLE_REBALANCE_TICK)
return;
spin_lock(&rq->lock);
- load_balance(rq, 1);
+ load_balance(rq, smp_processor_id(), 1);
spin_unlock(&rq->lock);
}

-#endif
-
-DEFINE_PER_CPU(struct kernel_stat, kstat) = { { 0 } };
-
/*
* We place interactive tasks back into the active array, if possible.
*
@@ -1076,9 +1313,9 @@ DEFINE_PER_CPU(struct kernel_stat, kstat
* increasing number of running tasks:
*/
#define EXPIRED_STARVING(rq) \
- ((rq)->expired_timestamp && \
+ (STARVATION_LIMIT && ((rq)->expired_timestamp && \
(jiffies - (rq)->expired_timestamp >= \
- STARVATION_LIMIT * ((rq)->nr_running) + 1))
+ STARVATION_LIMIT * ((rq)->nr_running) + 1)))

/*
* This function gets called by the timer code, with HZ frequency.
@@ -1091,12 +1328,13 @@ void scheduler_tick(int user_ticks, int
{
int cpu = smp_processor_id();
runqueue_t *rq = this_rq();
+ unsigned long j = jiffies;
task_t *p = current;

if (rcu_pending(cpu))
rcu_check_callbacks(cpu, user_ticks);

- if (p == rq->idle) {
+ if (p == cpu_idle_ptr(cpu)) {
/* note: this timer irq context must be accounted for as well */
if (irq_count() - HARDIRQ_OFFSET >= SOFTIRQ_OFFSET)
kstat_cpu(cpu).cpustat.system += sys_ticks;
@@ -1104,9 +1342,7 @@ void scheduler_tick(int user_ticks, int
kstat_cpu(cpu).cpustat.iowait += sys_ticks;
else
kstat_cpu(cpu).cpustat.idle += sys_ticks;
-#if CONFIG_SMP
- idle_tick(rq);
-#endif
+ idle_tick(rq, j);
return;
}
if (TASK_NICE(p) > 0)
@@ -1115,12 +1351,13 @@ void scheduler_tick(int user_ticks, int
kstat_cpu(cpu).cpustat.user += user_ticks;
kstat_cpu(cpu).cpustat.system += sys_ticks;

+ spin_lock(&rq->lock);
/* Task might have expired already, but not scheduled off yet */
if (p->array != rq->active) {
set_tsk_need_resched(p);
+ spin_unlock(&rq->lock);
return;
}
- spin_lock(&rq->lock);
if (unlikely(rt_task(p))) {
/*
* RR tasks need a special form of timeslice management.
@@ -1156,16 +1393,14 @@ void scheduler_tick(int user_ticks, int

if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) {
if (!rq->expired_timestamp)
- rq->expired_timestamp = jiffies;
+ rq->expired_timestamp = j;
enqueue_task(p, rq->expired);
} else
enqueue_task(p, rq->active);
}
out:
-#if CONFIG_SMP
- if (!(jiffies % BUSY_REBALANCE_TICK))
- load_balance(rq, 0);
-#endif
+ if (!(j % BUSY_REBALANCE_TICK))
+ load_balance(rq, smp_processor_id(), 0);
spin_unlock(&rq->lock);
}

@@ -1176,11 +1411,11 @@ void scheduling_functions_start_here(voi
*/
asmlinkage void schedule(void)
{
+ int idx, this_cpu, retry = 0;
+ struct list_head *queue;
task_t *prev, *next;
- runqueue_t *rq;
prio_array_t *array;
- struct list_head *queue;
- int idx;
+ runqueue_t *rq;

/*
* Test if we are atomic. Since do_exit() needs to call into
@@ -1193,15 +1428,15 @@ asmlinkage void schedule(void)
dump_stack();
}
}
-
- check_highmem_ptes();
need_resched:
+ check_highmem_ptes();
+ this_cpu = smp_processor_id();
preempt_disable();
prev = current;
rq = this_rq();

release_kernel_lock(prev);
- prev->sleep_timestamp = jiffies;
+ prev->last_run = jiffies;
spin_lock_irq(&rq->lock);

/*
@@ -1224,12 +1459,14 @@ need_resched:
}
pick_next_task:
if (unlikely(!rq->nr_running)) {
-#if CONFIG_SMP
- load_balance(rq, 1);
+ load_balance(rq, this_cpu, 1);
if (rq->nr_running)
goto pick_next_task;
-#endif
- next = rq->idle;
+ active_load_balance(rq, this_cpu);
+ if (rq->nr_running)
+ goto pick_next_task;
+pick_idle:
+ next = cpu_idle_ptr(this_cpu);
rq->expired_timestamp = 0;
goto switch_tasks;
}
@@ -1245,18 +1482,49 @@ pick_next_task:
rq->expired_timestamp = 0;
}

+new_array:
idx = sched_find_first_bit(array->bitmap);
queue = array->queue + idx;
next = list_entry(queue->next, task_t, run_list);
+ if ((next != prev) && (rq_nr_cpus(rq) > 1)) {
+ struct list_head *tmp = queue->next;
+
+ while ((task_running(next) && (next != prev)) || !task_allowed(next, this_cpu)) {
+ tmp = tmp->next;
+ if (tmp != queue) {
+ next = list_entry(tmp, task_t, run_list);
+ continue;
+ }
+ idx = find_next_bit(array->bitmap, MAX_PRIO, ++idx);
+ if (idx == MAX_PRIO) {
+ if (retry || !rq->expired->nr_active) {
+ goto pick_idle;
+ }
+ /*
+ * To avoid infinite changing of arrays,
+ * when we have only tasks runnable by
+ * sibling.
+ */
+ retry = 1;
+
+ array = rq->expired;
+ goto new_array;
+ }
+ queue = array->queue + idx;
+ tmp = queue->next;
+ next = list_entry(tmp, task_t, run_list);
+ }
+ }

switch_tasks:
prefetch(next);
clear_tsk_need_resched(prev);
- RCU_qsctr(prev->thread_info->cpu)++;
+ RCU_qsctr(task_cpu(prev))++;

if (likely(prev != next)) {
rq->nr_switches++;
- rq->curr = next;
+ cpu_curr_ptr(this_cpu) = next;
+ set_task_cpu(next, this_cpu);

prepare_arch_switch(rq, next);
prev = context_switch(rq, prev, next);
@@ -1522,9 +1790,8 @@ void set_user_nice(task_t *p, long nice)
* If the task is running and lowered its priority,
* or increased its priority then reschedule its CPU:
*/
- if ((NICE_TO_PRIO(nice) < p->static_prio) ||
- task_running(rq, p))
- resched_task(rq->curr);
+ if ((NICE_TO_PRIO(nice) < p->static_prio) || task_running(p))
+ resched_task(cpu_curr_ptr(task_cpu(p)));
}
out_unlock:
task_rq_unlock(rq, &flags);
@@ -1602,7 +1869,7 @@ int task_nice(task_t *p)
*/
int task_curr(task_t *p)
{
- return cpu_curr(task_cpu(p)) == p;
+ return cpu_curr_ptr(task_cpu(p)) == p;
}

/**
@@ -1611,7 +1878,7 @@ int task_curr(task_t *p)
*/
int idle_cpu(int cpu)
{
- return cpu_curr(cpu) == cpu_rq(cpu)->idle;
+ return cpu_curr_ptr(cpu) == cpu_idle_ptr(cpu);
}

/**
@@ -1701,7 +1968,7 @@ static int setscheduler(pid_t pid, int p
else
p->prio = p->static_prio;
if (array)
- activate_task(p, task_rq(p));
+ __activate_task(p, task_rq(p));

out_unlock:
task_rq_unlock(rq, &flags);
@@ -2161,7 +2428,7 @@ void __init init_idle(task_t *idle, int
local_irq_save(flags);
double_rq_lock(idle_rq, rq);

- idle_rq->curr = idle_rq->idle = idle;
+ cpu_curr_ptr(cpu) = cpu_idle_ptr(cpu) = idle;
deactivate_task(idle, rq);
idle->array = NULL;
idle->prio = MAX_PRIO;
@@ -2216,6 +2483,7 @@ void set_cpus_allowed(task_t *p, unsigne
unsigned long flags;
migration_req_t req;
runqueue_t *rq;
+ int cpu;

#if 0 /* FIXME: Grab cpu_lock, return error on this case. --RR */
new_mask &= cpu_online_map;
@@ -2237,31 +2505,31 @@ void set_cpus_allowed(task_t *p, unsigne
* If the task is not on a runqueue (and not running), then
* it is sufficient to simply update the task's cpu field.
*/
- if (!p->array && !task_running(rq, p)) {
+ if (!p->array && !task_running(p)) {
set_task_cpu(p, __ffs(p->cpus_allowed));
task_rq_unlock(rq, &flags);
return;
}
init_completion(&req.done);
req.task = p;
- list_add(&req.list, &rq->migration_queue);
+ cpu = task_cpu(p);
+ list_add(&req.list, migration_queue(cpu));
task_rq_unlock(rq, &flags);
-
- wake_up_process(rq->migration_thread);
+ wake_up_process(migration_thread(cpu));

wait_for_completion(&req.done);
}

/*
- * migration_thread - this is a highprio system thread that performs
+ * migration_task - this is a highprio system thread that performs
* thread migration by 'pulling' threads into the target runqueue.
*/
-static int migration_thread(void * data)
+static int migration_task(void * data)
{
struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
int cpu = (long) data;
runqueue_t *rq;
- int ret;
+ int ret, idx;

daemonize("migration/%d", cpu);
set_fs(KERNEL_DS);
@@ -2275,7 +2543,8 @@ static int migration_thread(void * data)
ret = setscheduler(0, SCHED_FIFO, &param);

rq = this_rq();
- rq->migration_thread = current;
+ migration_thread(cpu) = current;
+ idx = cpu_idx(cpu);

for (;;) {
runqueue_t *rq_src, *rq_dest;
@@ -2286,8 +2555,10 @@ static int migration_thread(void * data)
task_t *p;

spin_lock_irqsave(&rq->lock, flags);
- head = &rq->migration_queue;
- current->state = TASK_INTERRUPTIBLE;
+ if (cpu_active_balance(cpu))
+ do_active_balance(rq, cpu);
+ head = migration_queue(cpu);
+ current->state = TASK_UNINTERRUPTIBLE;
if (list_empty(head)) {
spin_unlock_irqrestore(&rq->lock, flags);
schedule();
@@ -2315,9 +2586,8 @@ repeat:
set_task_cpu(p, cpu_dest);
if (p->array) {
deactivate_task(p, rq_src);
- activate_task(p, rq_dest);
- if (p->prio < rq_dest->curr->prio)
- resched_task(rq_dest->curr);
+ __activate_task(p, rq_dest);
+ wake_up_cpu(rq_dest, cpu_dest, p);
}
}
double_rq_unlock(rq_src, rq_dest);
@@ -2335,12 +2605,13 @@ static int migration_call(struct notifie
unsigned long action,
void *hcpu)
{
+ long cpu = (long) hcpu;
+
switch (action) {
case CPU_ONLINE:
- printk("Starting migration thread for cpu %li\n",
- (long)hcpu);
- kernel_thread(migration_thread, hcpu, CLONE_KERNEL);
- while (!cpu_rq((long)hcpu)->migration_thread)
+ printk("Starting migration thread for cpu %li\n", cpu);
+ kernel_thread(migration_task, hcpu, CLONE_KERNEL);
+ while (!migration_thread(cpu))
yield();
break;
}
@@ -2415,11 +2686,20 @@ void __init sched_init(void)
for (i = 0; i < NR_CPUS; i++) {
prio_array_t *array;

+ /*
+ * Start with a 1:1 mapping between CPUs and runqueues:
+ */
+#if CONFIG_SHARE_RUNQUEUE
+ rq_idx(i) = i;
+ cpu_idx(i) = 0;
+#endif
rq = cpu_rq(i);
rq->active = rq->arrays;
rq->expired = rq->arrays + 1;
spin_lock_init(&rq->lock);
- INIT_LIST_HEAD(&rq->migration_queue);
+ INIT_LIST_HEAD(migration_queue(i));
+ rq->nr_cpus = 1;
+ rq->cpu[cpu_idx(i)].cpu = i;
atomic_set(&rq->nr_iowait, 0);
nr_running_init(rq);

@@ -2437,9 +2717,9 @@ void __init sched_init(void)
* We have to do a little magic to get the first
* thread right in SMP mode.
*/
- rq = this_rq();
- rq->curr = current;
- rq->idle = current;
+ cpu_curr_ptr(smp_processor_id()) = current;
+ cpu_idle_ptr(smp_processor_id()) = current;
+
set_task_cpu(current, smp_processor_id());
wake_up_forked_process(current);

--- linux/init/main.c.orig
+++ linux/init/main.c
@@ -354,7 +354,14 @@ static void __init smp_init(void)

static void rest_init(void)
{
+ /*
+ * We count on the initial thread going ok
+ * Like idlers init is an unlocked kernel thread, which will
+ * make syscalls (and thus be locked).
+ */
+ init_idle(current, smp_processor_id());
kernel_thread(init, NULL, CLONE_KERNEL);
+
unlock_kernel();
cpu_idle();
}
@@ -438,13 +445,6 @@ asmlinkage void __init start_kernel(void
check_bugs();
printk("POSIX conformance testing by UNIFIX\n");

- /*
- * We count on the initial thread going ok
- * Like idlers init is an unlocked kernel thread, which will
- * make syscalls (and thus be locked).
- */
- init_idle(current, smp_processor_id());
-
/* Do the rest non-__init'ed, we're now alive */
rest_init();
}


2003-02-28 21:05:24

by Andrew Morton

Subject: Re: [patch] "HT scheduler", sched-2.5.63-B3

Ingo Molnar <[email protected]> wrote:
>
>
> this is the latest HT scheduler patch, against 2.5.63-BK.

It's looking pretty good Ingo.

- The "tbench starves everything else" problem is fixed.

- The mm_struct leak is fixed

- As far as I can tell, the scheduler update no longer causes the
large "contest io_load" changes, wherein the streaming write made
lots more progress at the expense of the kernel compile.

- The longstanding problem wherein a kernel build makes my X desktop
unusable is 90% fixed - it is still possible to trigger stalls, but
they are less severe, and you actually have to work at it a bit to
make them happen.

Thanks.

2003-02-28 21:12:45

by Robert Love

Subject: Re: [patch] "HT scheduler", sched-2.5.63-B3

On Fri, 2003-02-28 at 16:12, Andrew Morton wrote:

> - The longstanding problem wherein a kernel build makes my X desktop
> unusable is 90% fixed - it is still possible to trigger stalls, but
> they are less severe, and you actually have to work at it a bit to
> make them happen.

That is odd, because I do not see any changes in here that would improve
interactivity. This looks to be pretty much purely the HT stuff.

Andrew, if you drop this patch, your X desktop usability drops?

Ingo, is there something in here that I am missing?

Robert Love

2003-03-01 04:15:34

by Andrew Morton

Subject: Re: [patch] "HT scheduler", sched-2.5.63-B3

Robert Love <[email protected]> wrote:
>
> On Fri, 2003-02-28 at 16:12, Andrew Morton wrote:
>
> > - The longstanding problem wherein a kernel build makes my X desktop
> > unusable is 90% fixed - it is still possible to trigger stalls, but
> > they are less severe, and you actually have to work at it a bit to
> > make them happen.
>
> That is odd, because I do not see any changes in here that would improve
> interactivity. This looks to be pretty much purely the HT stuff.
>
> Andrew, if you drop this patch, your X desktop usability drops?

hm, you're right. It's still really bad. I forgot that I was using distcc.

And I also forgot that tbench starves everything else only on CONFIG_SMP=n.
That problem remains with us as well.


2003-03-06 03:12:21

by Linus Torvalds

Subject: Re: [patch] "HT scheduler", sched-2.5.63-B3


On Fri, 28 Feb 2003, Andrew Morton wrote:
> >
> > Andrew, if you drop this patch, your X desktop usability drops?
>
> hm, you're right. It's still really bad. I forgot that I was using distcc.
>
> And I also forgot that tbench starves everything else only on CONFIG_SMP=n.
> That problem remains with us as well.

Andrew, I always thought that the scheduler interactivity was bogus, since
it didn't give any bonus to processes that _help_ interactive users
(notably the X server, but it could be other things).

To fix that, some people nice up their X servers, which has its own set of
problems.

How about something more like this (yeah, untested, but you get the idea):
the person who wakes up an interactive task gets the interactivity bonus
if the interactive task is already maxed out. I dunno how well this plays
with the X server, but assuming most clients use UNIX domain sockets, the
wake-ups _should_ be synchronous, so it should work well to say "waker
gets bonus".

This should result in:

- if X ends up using all of its time to handle clients, obviously X will
not count as interactive on its own. HOWEVER, if an xterm or something
gets an X event, the fact that the xterm has been idle means that _it_
gets an interactivity boost at wakeup.

- after a few such boosts (or assuming lots of idleness of xterm), the
process that caused the wakeup (ie the X server) will get the
"extraneous interactivity".

This all depends on whether the unix domain socket code runs in bottom
half or process context. If it runs in bottom half context we're screwed;
I haven't checked.

Does this make any difference for you? I don't know what your load test
is, and considering that my regular desktop has 4 fast CPU's I doubt I can
see the effect very clearly anyway ("Awww, poor Linus!")

NOTE! This doesn't help a "chain" of interactive helpers. It could be
extended to that, by just allowing the waker to "steal" interactivity
points from a sleeping process, but then we'd need to start being careful
about fairness and in particular we'd have to disallow this for signal
handling.

Linus

----
===== kernel/sched.c 1.161 vs edited =====
--- 1.161/kernel/sched.c Thu Feb 20 20:33:52 2003
+++ edited/kernel/sched.c Wed Mar 5 19:09:45 2003
@@ -337,8 +337,15 @@
* boost gets as well.
*/
p->sleep_avg += sleep_time;
- if (p->sleep_avg > MAX_SLEEP_AVG)
+ if (p->sleep_avg > MAX_SLEEP_AVG) {
+ int ticks = p->sleep_avg - MAX_SLEEP_AVG + current->sleep_avg;
p->sleep_avg = MAX_SLEEP_AVG;
+ if (ticks > MAX_SLEEP_AVG)
+ ticks = MAX_SLEEP_AVG;
+ if (!in_interrupt())
+ current->sleep_avg = ticks;
+ }
+
p->prio = effective_prio(p);
}
enqueue_task(p, array);
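
To make the arithmetic of the hunk above concrete, here is a small
stand-alone sketch of the same transfer. MAX_SLEEP_AVG of 2*HZ is the
2.5.63 default; HZ, the helper name and the task values in main() are
assumptions for illustration, and the in_interrupt() check is skipped:

/* Sketch of the "waker inherits the overflow boost" arithmetic above. */
#include <stdio.h>

#define HZ              1000            /* assumed for the example */
#define MAX_SLEEP_AVG   (2 * HZ)        /* 2.5.63 default */

struct task {
        unsigned long sleep_avg;
};

/* 'p' is the task being woken, 'waker' stands in for current. */
static void boost_and_credit_waker(struct task *p, struct task *waker,
                                   unsigned long sleep_time)
{
        p->sleep_avg += sleep_time;
        if (p->sleep_avg > MAX_SLEEP_AVG) {
                unsigned long ticks =
                        p->sleep_avg - MAX_SLEEP_AVG + waker->sleep_avg;

                p->sleep_avg = MAX_SLEEP_AVG;
                if (ticks > MAX_SLEEP_AVG)
                        ticks = MAX_SLEEP_AVG;
                waker->sleep_avg = ticks;       /* in_interrupt() check omitted */
        }
}

int main(void)
{
        struct task xterm = { 1900 };   /* mostly-sleeping client, near the cap */
        struct task xserver = { 300 };  /* CPU-starved X server doing the wakeup */

        boost_and_credit_waker(&xterm, &xserver, 500);
        /* xterm is clamped to 2000; the 400-tick surplus is credited to X */
        printf("xterm=%lu xserver=%lu\n", xterm.sleep_avg, xserver.sleep_avg);
        return 0;
}

With these numbers the waker's sleep_avg grows from 300 to 700 - it picks
up exactly the boost its already-maxed-out client could not use, which is
the effect described above.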

2003-03-06 06:48:12

by Ingo Molnar

Subject: [patch] "interactivity changes", sched-2.5.64-A4


Andrew,

> hm, you're right. It's still really bad. I forgot that I was using
> distcc.
>
> And I also forgot that tbench starves everything else only on
> CONFIG_SMP=n. That problem remains with us as well.

well, i took out the interactivity improvements from the 2.5.59-E6
scheduler patch, to keep the pure HT-scheduler bits only, and haven't added
them back since. The two patch-sets (interactivity, and HT scheduler)
interact heavily, so i did not post the patch separately, but here it goes
as-is, against 2.5.64 - does it help your interactivity problems on UP
setups?

The patch includes the following items:

- a new wakeup feature: SMART_WAKE_CHILD (default:1) [this is a reworked
version of the wakeup changes from Andrea's tree.]

- scheduler parameter tunings (CHILD_PENALTY, MAX_TIMESLICE,
STARVATION_LIMIT, MAX_SLEEP_AVG)

- activate_task() splitup to separate the interactivity-active variant
from the task-queue variant => this fixes our statistics in some cases.

- finer-grained preemption: if priority is equal then the task with the
longer pending timeslice wins.

- make it possible to disable sync-wakeups via SYNC_WAKEUPS (default:1)

- a naming cleanup (sleep_timestamp => last_run)

- fix up STARVATION_LIMIT usage so that a value of 0 is recognized as 'no
starvation limit'.

ideally we'd like to have these piece by piece, but could you just give
this patchset a go, and if it makes a difference then we can separate out
the bits that matter, to pinpoint your problems as exactly as possible.

i've test-booted the patch on UP and SMP as well.

Ingo

--- linux/include/linux/sched.h.orig
+++ linux/include/linux/sched.h
@@ -328,7 +328,7 @@ struct task_struct {
prio_array_t *array;

unsigned long sleep_avg;
- unsigned long sleep_timestamp;
+ unsigned long last_run;

unsigned long policy;
unsigned long cpus_allowed;
--- linux/kernel/sched.c.orig
+++ linux/kernel/sched.c
@@ -54,20 +54,21 @@
/*
* These are the 'tuning knobs' of the scheduler:
*
- * Minimum timeslice is 10 msecs, default timeslice is 150 msecs,
- * maximum timeslice is 300 msecs. Timeslices get refilled after
+ * Minimum timeslice is 10 msecs, default timeslice is 100 msecs,
+ * maximum timeslice is 200 msecs. Timeslices get refilled after
* they expire.
*/
#define MIN_TIMESLICE ( 10 * HZ / 1000)
-#define MAX_TIMESLICE (300 * HZ / 1000)
-#define CHILD_PENALTY 95
+#define MAX_TIMESLICE (200 * HZ / 1000)
+#define CHILD_PENALTY 50
#define PARENT_PENALTY 100
#define EXIT_WEIGHT 3
#define PRIO_BONUS_RATIO 25
#define INTERACTIVE_DELTA 2
-#define MAX_SLEEP_AVG (2*HZ)
-#define STARVATION_LIMIT (2*HZ)
-#define NODE_THRESHOLD 125
+#define MAX_SLEEP_AVG (10*HZ)
+#define STARVATION_LIMIT (10*HZ)
+#define SYNC_WAKEUPS 1
+#define SMART_WAKE_CHILD 1

/*
* If a task is 'interactive' then we reinsert it in the active
@@ -323,16 +324,21 @@ static inline int effective_prio(task_t
* Also update all the scheduling statistics stuff. (sleep average
* calculation, priority modifiers, etc.)
*/
+static inline void __activate_task(task_t *p, runqueue_t *rq)
+{
+ enqueue_task(p, rq->active);
+ rq->nr_running++;
+}
+
static inline void activate_task(task_t *p, runqueue_t *rq)
{
- unsigned long sleep_time = jiffies - p->sleep_timestamp;
- prio_array_t *array = rq->active;
+ unsigned long sleep_time = jiffies - p->last_run;

if (!rt_task(p) && sleep_time) {
/*
* This code gives a bonus to interactive tasks. We update
* an 'average sleep time' value here, based on
- * sleep_timestamp. The more time a task spends sleeping,
+ * ->last_run. The more time a task spends sleeping,
* the higher the average gets - and the higher the priority
* boost gets as well.
*/
@@ -341,8 +347,7 @@ static inline void activate_task(task_t
p->sleep_avg = MAX_SLEEP_AVG;
p->prio = effective_prio(p);
}
- enqueue_task(p, array);
- nr_running_inc(rq);
+ __activate_task(p, rq);
}

/*
@@ -460,6 +465,7 @@ static int try_to_wake_up(task_t * p, un
long old_state;
runqueue_t *rq;

+ sync &= SYNC_WAKEUPS;
repeat_lock_task:
rq = task_rq_lock(p, &flags);
old_state = p->state;
@@ -479,12 +485,17 @@ repeat_lock_task:
}
if (old_state == TASK_UNINTERRUPTIBLE)
rq->nr_uninterruptible--;
- activate_task(p, rq);
-
- if (p->prio < rq->curr->prio)
- resched_task(rq->curr);
+ if (sync)
+ __activate_task(p, rq);
+ else {
+ activate_task(p, rq);
+ if (p->prio < rq->curr->prio)
+ resched_task(rq->curr);
+ }
success = 1;
}
+ if (p->state >= TASK_ZOMBIE)
+ BUG();
p->state = TASK_RUNNING;
}
task_rq_unlock(rq, &flags);
@@ -525,8 +536,19 @@ void wake_up_forked_process(task_t * p)
p->prio = effective_prio(p);
}
set_task_cpu(p, smp_processor_id());
- activate_task(p, rq);

+ if (SMART_WAKE_CHILD) {
+ if (unlikely(!current->array))
+ __activate_task(p, rq);
+ else {
+ p->prio = current->prio;
+ list_add_tail(&p->run_list, &current->run_list);
+ p->array = current->array;
+ p->array->nr_active++;
+ rq->nr_running++;
+ }
+ } else
+ activate_task(p, rq);
task_rq_unlock(rq, &flags);
}

@@ -953,6 +975,11 @@ static inline void pull_task(runqueue_t
*/
if (p->prio < this_rq->curr->prio)
set_need_resched();
+ else {
+ if (p->prio == this_rq->curr->prio &&
+ p->time_slice > this_rq->curr->time_slice)
+ set_need_resched();
+ }
}

/*
@@ -1016,7 +1043,7 @@ skip_queue:
*/

#define CAN_MIGRATE_TASK(p,rq,this_cpu) \
- ((jiffies - (p)->sleep_timestamp > cache_decay_ticks) && \
+ ((jiffies - (p)->last_run > cache_decay_ticks) && \
!task_running(rq, p) && \
((p)->cpus_allowed & (1UL << (this_cpu))))

@@ -1076,9 +1103,9 @@ DEFINE_PER_CPU(struct kernel_stat, kstat
* increasing number of running tasks:
*/
#define EXPIRED_STARVING(rq) \
- ((rq)->expired_timestamp && \
+ (STARVATION_LIMIT && ((rq)->expired_timestamp && \
(jiffies - (rq)->expired_timestamp >= \
- STARVATION_LIMIT * ((rq)->nr_running) + 1))
+ STARVATION_LIMIT * ((rq)->nr_running) + 1)))

/*
* This function gets called by the timer code, with HZ frequency.
@@ -1201,7 +1228,7 @@ need_resched:
rq = this_rq();

release_kernel_lock(prev);
- prev->sleep_timestamp = jiffies;
+ prev->last_run = jiffies;
spin_lock_irq(&rq->lock);

/*
@@ -1701,7 +1728,7 @@ static int setscheduler(pid_t pid, int p
else
p->prio = p->static_prio;
if (array)
- activate_task(p, task_rq(p));
+ __activate_task(p, task_rq(p));

out_unlock:
task_rq_unlock(rq, &flags);
--- linux/kernel/fork.c.orig
+++ linux/kernel/fork.c
@@ -916,7 +916,7 @@ static struct task_struct *copy_process(
*/
p->first_time_slice = 1;
current->time_slice >>= 1;
- p->sleep_timestamp = jiffies;
+ p->last_run = jiffies;
if (!current->time_slice) {
/*
* This case is rare, it happens when the parent has only

2003-03-06 07:34:40

by Andrew Morton

Subject: Re: [patch] "HT scheduler", sched-2.5.63-B3

Linus Torvalds <[email protected]> wrote:
>
>
> On Fri, 28 Feb 2003, Andrew Morton wrote:
> > >
> > > Andrew, if you drop this patch, your X desktop usability drops?
> >
> > hm, you're right. It's still really bad. I forgot that I was using distcc.
> >
> > And I also forgot that tbench starves everything else only on CONFIG_SMP=n.
> > That problem remains with us as well.
>
> Andrew, I always thought that the scheduler interactivity was bogus, since
> it didn't give any bonus to processes that _help_ interactive users
> (notably the X server, but it could be other things).

The current interactivity booster heuristic does appear to work very well - I
did an A/B comparison with 2.4.x a while back, and 2.5 is distinctly better.

But it is a heuristic, and it will inevitably make mistakes. The problem
which I am observing is that the cost of those mistakes is very high.

I believe that we should recognise that no heuristic will be 100% accurate,
and that we should seek to minimise the impact of those 0.1%-of-the-time
mistakes which it will make. Perhaps by just dropping the max timeslice??

Let me redescribe the problem:

- Dual 850MHz PIII, 900M of RAM.
- Start a `make -j3 vmlinux', everything in pagecache
- Start using X applications. Moving a window about is the usual trigger.

Everything goes happily for a while, and then blam. Windows get stuck for
0.5-1.0 seconds, the mouse pointer gets so laggy that it is uncontrollable,
etc. The fix is to put your hands in your pockets for 5-10 seconds and wait
for the scheduler to notice that the X server is idle.

Interestingly, this does not happen if the background load is a bunch of
busywaits. It seems to need the fork&exit load of a compilation to trigger.

Robert was able to reproduce this, and noticed that the scheduler had niced
the X server down as far as it would go.

I'm a pretty pathological case, because I was driving two 1600x1200x24
displays with a crufty old AGP voodoo3 and a $2 PCI nvidia. Am now using a
presumably less crufty radeon and the problem persists.

But last time I tested Ingo's interactivity patch things were a lot better.
I shall retest his latest now.

2003-03-06 07:37:26

by Ingo Molnar

Subject: Re: [patch] "HT scheduler", sched-2.5.63-B3


On Wed, 5 Mar 2003, Andrew Morton wrote:

> The current interactivity booster heuristic does appear to work very
> well - I did an A/B comparison with 2.4.x a while back, and 2.5 is
> distinctly better.

> Let me redescribe the problem:
>
> - Dual 850MHz PIII, 900M of RAM.
> - Start a `make -j3 vmlinux', everything in pagecache
> - Start using X applications. Moving a window about is the usual trigger.

please do another thing as well: in addition to applying the -A4 patch,
also renice X to -10.

> But last time I tested Ingo's interactivity patch things were a lot
> better. I shall retest his latest now.

please do.

Ingo

2003-03-06 09:50:37

by Andrew Morton

Subject: Re: [patch] "interactivity changes", sched-2.5.64-A4

Ingo Molnar <[email protected]> wrote:
>
>
> well, i took out the interactivity improvements from the 2.5.59-E6
> scheduler patch, to keep the pure HT-scheduler bits only, and havent added
> them back since. The two patch-sets (interactivity, and HT scheduler)
> interact heavily, so i did not post the patch separately, but here it goes
> as-is, against 2.5.64 - does it help your interactivity problems on UP
> setups?

Ah, this is the patch I thought I was testing last time ;) A bit more careful
this time:

- The tbench-starves-everything-on-uniprocessor problem (well, I'd have to
say it is a bug) is fixed.

When running a compilation against a `tbench 4' one would expect the
compilation to be slowed down by maybe 4x. It seems a little slower than
that, but it is in that ballpark. At least the compilation makes _some_
progress now.

- The large performance shift with `contest io_load' is still there.

This test times how long it takes to compile a kernel in the presence of
massive streaming writeout to the same filesystem.

2.5.64, UP, !preempt, ext3:
Finished compiling kernel: elapsed: 409 user: 107 system: 11
Finished io_load: elapsed: 409 user: 2 system: 37 loads: 16810

Finished compiling kernel: elapsed: 283 user: 107 system: 10
Finished io_load: elapsed: 286 user: 1 system: 17 loads: 7990

2.5.66+sched-2.5.64-A4, UP, !preempt, ext3:
Finished compiling kernel: elapsed: 910 user: 108 system: 12
Finished io_load: elapsed: 912 user: 4 system: 75 loads: 35210

Finished compiling kernel: elapsed: 940 user: 108 system: 12
Finished io_load: elapsed: 940 user: 4 system: 78 loads: 36510

The compilation took twice as long, and the streaming write made much
more progress.

Given that a monster `dd if=/dev/zero' takes only a few percent CPU
anyway, it is quite odd that this is happening.

Regardless of the fairness issue, we want to maximise CPU utilisation in
this workload. The above figures show that the total CPU utilisation has
fallen from

(107+11+4+75) / 409 = 48% CPU
and (107+10+1+17) / 283 = 48% CPU

down to

(108+12+4+75) / 910 = 22% CPU
and (108+12+4+78) / 940 = 21% CPU

which is quite bad.
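
(The "CPU" figures are just the total user+system time of both jobs
divided by elapsed time; a trivial sketch of the calculation, plugging in
the first 2.5.66+A4 run from above:)

/* contest CPU utilisation: (user + system of both jobs) / elapsed */
#include <stdio.h>

static double util(double elapsed, double user_sum, double sys_sum)
{
        return 100.0 * (user_sum + sys_sum) / elapsed;
}

int main(void)
{
        /* first 2.5.66+sched-2.5.64-A4 io_load run from above */
        printf("utilisation: %.0f%%\n", util(910, 108 + 4, 12 + 75));
        return 0;
}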

I cannot explain this - why so much idle time? It seems to happen with
ext2 as well, so it may not be the weird interaction between kjournald, the
CPU scheduler and the IO scheduler which I initially suspected. Poor me.

- On the X server problem: The patch pretty much fixes it up. I have to
work pretty hard to make the UI flip into sucky mode, and it is much less
severe. I'd say it is acceptable.

Renicing X to -10 makes it significantly better. Text under moved
windows gets redrawn promptly. But renicing X is not a very adequate
solution in my experience - I've found that when the email client goes off
to parse a 1000-message folder the scheduler can decide to penalise it, and
the application freezes up for some time.

Overall, yep, good patch and we should run with it. I need to work out what
on earth has happened to the IO load balancing. We only got that working
properly a few days back.


2003-03-06 09:58:01

by Ingo Molnar

Subject: [patch] "interactivity changes", sched-2.5.64-A5


On Thu, 6 Mar 2003, Andrew Morton wrote:

> Overall, yep, good patch and we should run with it. [...]

great - here it is -A5 which is -A4 in a cleaned up form:

- make SMART_WAKE_CHILD and SYNC_WAKEUPS unconditional.
- remove a mis-merge of a TASK_ZOMBIE assert.

(without the last_run rename the patch becomes even smaller, do a
':s/last_run/sleep_timestamp/g' on it in vi.)

Ingo

--- linux/include/linux/sched.h.orig
+++ linux/include/linux/sched.h
@@ -328,7 +328,7 @@ struct task_struct {
prio_array_t *array;

unsigned long sleep_avg;
- unsigned long sleep_timestamp;
+ unsigned long last_run;

unsigned long policy;
unsigned long cpus_allowed;
--- linux/kernel/sched.c.orig
+++ linux/kernel/sched.c
@@ -54,20 +54,19 @@
/*
* These are the 'tuning knobs' of the scheduler:
*
- * Minimum timeslice is 10 msecs, default timeslice is 150 msecs,
- * maximum timeslice is 300 msecs. Timeslices get refilled after
+ * Minimum timeslice is 10 msecs, default timeslice is 100 msecs,
+ * maximum timeslice is 200 msecs. Timeslices get refilled after
* they expire.
*/
#define MIN_TIMESLICE ( 10 * HZ / 1000)
-#define MAX_TIMESLICE (300 * HZ / 1000)
-#define CHILD_PENALTY 95
+#define MAX_TIMESLICE (200 * HZ / 1000)
+#define CHILD_PENALTY 50
#define PARENT_PENALTY 100
#define EXIT_WEIGHT 3
#define PRIO_BONUS_RATIO 25
#define INTERACTIVE_DELTA 2
-#define MAX_SLEEP_AVG (2*HZ)
-#define STARVATION_LIMIT (2*HZ)
-#define NODE_THRESHOLD 125
+#define MAX_SLEEP_AVG (10*HZ)
+#define STARVATION_LIMIT (10*HZ)

/*
* If a task is 'interactive' then we reinsert it in the active
@@ -323,16 +322,21 @@ static inline int effective_prio(task_t
* Also update all the scheduling statistics stuff. (sleep average
* calculation, priority modifiers, etc.)
*/
+static inline void __activate_task(task_t *p, runqueue_t *rq)
+{
+ enqueue_task(p, rq->active);
+ rq->nr_running++;
+}
+
static inline void activate_task(task_t *p, runqueue_t *rq)
{
- unsigned long sleep_time = jiffies - p->sleep_timestamp;
- prio_array_t *array = rq->active;
+ unsigned long sleep_time = jiffies - p->last_run;

if (!rt_task(p) && sleep_time) {
/*
* This code gives a bonus to interactive tasks. We update
* an 'average sleep time' value here, based on
- * sleep_timestamp. The more time a task spends sleeping,
+ * ->last_run. The more time a task spends sleeping,
* the higher the average gets - and the higher the priority
* boost gets as well.
*/
@@ -341,8 +345,7 @@ static inline void activate_task(task_t
p->sleep_avg = MAX_SLEEP_AVG;
p->prio = effective_prio(p);
}
- enqueue_task(p, array);
- nr_running_inc(rq);
+ __activate_task(p, rq);
}

/*
@@ -479,10 +482,13 @@ repeat_lock_task:
}
if (old_state == TASK_UNINTERRUPTIBLE)
rq->nr_uninterruptible--;
- activate_task(p, rq);
-
- if (p->prio < rq->curr->prio)
- resched_task(rq->curr);
+ if (sync)
+ __activate_task(p, rq);
+ else {
+ activate_task(p, rq);
+ if (p->prio < rq->curr->prio)
+ resched_task(rq->curr);
+ }
success = 1;
}
p->state = TASK_RUNNING;
@@ -525,8 +531,16 @@ void wake_up_forked_process(task_t * p)
p->prio = effective_prio(p);
}
set_task_cpu(p, smp_processor_id());
- activate_task(p, rq);

+ if (unlikely(!current->array))
+ __activate_task(p, rq);
+ else {
+ p->prio = current->prio;
+ list_add_tail(&p->run_list, &current->run_list);
+ p->array = current->array;
+ p->array->nr_active++;
+ rq->nr_running++;
+ }
task_rq_unlock(rq, &flags);
}

@@ -953,6 +967,11 @@ static inline void pull_task(runqueue_t
*/
if (p->prio < this_rq->curr->prio)
set_need_resched();
+ else {
+ if (p->prio == this_rq->curr->prio &&
+ p->time_slice > this_rq->curr->time_slice)
+ set_need_resched();
+ }
}

/*
@@ -1016,7 +1035,7 @@ skip_queue:
*/

#define CAN_MIGRATE_TASK(p,rq,this_cpu) \
- ((jiffies - (p)->sleep_timestamp > cache_decay_ticks) && \
+ ((jiffies - (p)->last_run > cache_decay_ticks) && \
!task_running(rq, p) && \
((p)->cpus_allowed & (1UL << (this_cpu))))

@@ -1076,9 +1095,9 @@ DEFINE_PER_CPU(struct kernel_stat, kstat
* increasing number of running tasks:
*/
#define EXPIRED_STARVING(rq) \
- ((rq)->expired_timestamp && \
+ (STARVATION_LIMIT && ((rq)->expired_timestamp && \
(jiffies - (rq)->expired_timestamp >= \
- STARVATION_LIMIT * ((rq)->nr_running) + 1))
+ STARVATION_LIMIT * ((rq)->nr_running) + 1)))

/*
* This function gets called by the timer code, with HZ frequency.
@@ -1201,7 +1220,7 @@ need_resched:
rq = this_rq();

release_kernel_lock(prev);
- prev->sleep_timestamp = jiffies;
+ prev->last_run = jiffies;
spin_lock_irq(&rq->lock);

/*
@@ -1701,7 +1720,7 @@ static int setscheduler(pid_t pid, int p
else
p->prio = p->static_prio;
if (array)
- activate_task(p, task_rq(p));
+ __activate_task(p, task_rq(p));

out_unlock:
task_rq_unlock(rq, &flags);
--- linux/kernel/fork.c.orig
+++ linux/kernel/fork.c
@@ -916,7 +916,7 @@ static struct task_struct *copy_process(
*/
p->first_time_slice = 1;
current->time_slice >>= 1;
- p->sleep_timestamp = jiffies;
+ p->last_run = jiffies;
if (!current->time_slice) {
/*
* This case is rare, it happens when the parent has only

2003-03-06 10:06:10

by Chris Wedgwood

Subject: Re: [patch] "HT scheduler", sched-2.5.63-B3

On Fri, Feb 28, 2003 at 10:50:03AM +0100, Ingo Molnar wrote:

> +#elif CONFIG_NR_SIBLINGS_4
> +# define CONFIG_NR_SIBLINGS 4

There are HT chips which have 4 siblings?


--cw

2003-03-06 10:08:20

by Ingo Molnar

Subject: Re: [patch] "HT scheduler", sched-2.5.63-B3


On Thu, 6 Mar 2003, Chris Wedgwood wrote:

> > +#elif CONFIG_NR_SIBLINGS_4
> > +# define CONFIG_NR_SIBLINGS 4
>
> There are HT chips which have 4 siblings?

not that i'm aware of - this was just there because i'm sure this thing
won't stop at 2 logical CPUs, and i wanted to add a Kconfig structure that
deals with that too. We can comment out the "CONFIG_NR_SIBLINGS 4" line
from the Kconfig, for now.

Ingo

2003-03-06 14:59:37

by Linus Torvalds

Subject: Re: [patch] "HT scheduler", sched-2.5.63-B3


On Thu, 6 Mar 2003, Ingo Molnar wrote:
>
> please do another thing as well: in addition to applying the -A4 patch,
> also renice X to -10.

NO.

This is a BUG IN THE SCHEDULER!

We should notice automatically that X is "interactive", thanks to the
fact that interactive tasks wait for it. That's the whole point of my very
simple patch..

So don't argue about things that are obviously broken. Renicing X is not
the answer, and in fact there have been multiple reports that it makes
XMMS skip sound because it just _hides_ the problem and moves it
elsewhere.

Please test my simple patch that should at least attack the real issue
instead of waffling and hiding it.

Linus

2003-03-06 15:29:30

by Linus Torvalds

Subject: Re: [patch] "HT scheduler", sched-2.5.63-B3


On Wed, 5 Mar 2003, Andrew Morton wrote:
>
> But it is a heuristic, and it will inevitably make mistakes. The problem
> which I am observing is that the cost of those mistakes is very high.

The problem I used to see quite clearly is that X gets turned into a
non-interactive task, because with a complex X desktop X can easily use
5-15% of CPU even when everything is perfectly normal. That's simply
because it ends up having to _do_ a lot: anti-aliased text, shifting
logos, xdaliclock in the corner, blinking cursors, annoying moving
advertising on web-sites, small animations in the task-bar etc.

And the current heuristic SUCKS for this. And it's quite fundamental: the
heuristic is _wrong_. The heuristic breaks down _completely_ when there
are ten CPU hogs that peg the CPU, and now the one process that wants 10%
of the CPU suddenly doesn't sleep any more, since it ends up needing all
the timeslices it could get.

At this point the current heuristic totally breaks down, and doesn't have
any way out. The _only_ part of Ingo's patch that may help is the fact
that he made the maximum timeslice be smaller, but realize that since X
is now considered to be a background task, that doesn't help very much
_either_, since it's going to be scheduled round-robin with those other
ten background tasks, and changing the max timeslice from 300 to 200ms
only means that the pauses will be "only" two seconds instead of being
three seconds.

See?
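
A back-of-the-envelope sketch of the round-robin stall described above;
the inputs (ten hogs, 300ms vs 200ms slices) come from the paragraph, and
everything else is illustrative:

/* Worst-case wait for X once it round-robins behind n CPU hogs that each
 * burn a full timeslice before X runs again. */
#include <stdio.h>

static double worst_wait_sec(int nr_hogs, int timeslice_ms)
{
        return nr_hogs * timeslice_ms / 1000.0;
}

int main(void)
{
        printf("10 hogs, 300ms slices: %.1f s\n", worst_wait_sec(10, 300));
        printf("10 hogs, 200ms slices: %.1f s\n", worst_wait_sec(10, 200));
        return 0;
}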

I told Ingo about it half a year ago, and he ignored it then, saying it
can't be solved. Re-nicing X doesn't really help: it only hides the
problem because it means that X won't be scheduled round-robin with the
background load any more, but because it makes X more important than user
processes it then has other fundamental flaws.

So with "nice -10", when X gets busy, it gets _too_ busy and locks out
other processes - it fixes the interactivity, but it causes latency
problems because X is still considered a _background_ job, so it gets a
200ms timeslice at high priority and guess what? That kind of sucks for
regular programs that want low latency.

> Let me redescribe the problem:
>
> - Dual 850MHz PIII, 900M of RAM.
> - Start a `make -j3 vmlinux', everything in pagecache
> - Start using X applications. Moving a window about is the usual trigger.

Yeah. Guess what? Moving a window makes X suddenly use a lot more CPU (try
turning off acceleration if you _really_ want to see it), so you see the
fundamental problem with less load. Add some load (and some X eye-candy)
and you will see it much easier.

And read the above description of what happens, and realize that it is
_fundamental_.

And guess what my 5-liner patch tries to address? The one that you didn't
bother testing.. The one that actually tries to tackle the problem head-on
instead of hiding it.

> Interestingly, this does not happen if the background load is a bunch of
> busywaits. It seems to need the fork&exit load of a compilation to trigger.

It happened with busy-waits too last time I checked (which is about half a
year ago), it's just that a normal busy-wait tends to _do_ a lot less than
doing a fork()/wait()/busy-child. When the load on the CPU becomes
"constrained" (ie when the CPU idle time goes down to zero), a
fork/wait/busy-child suddenly isn't a load unit of "1" any more,
since the fork/wait/exit actually takes time.

Which means that you may need 6 busy-wait tasks to get the same kind of
real load that you get with just 3 fork/wait/exit things.
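
For concreteness, a minimal user-space sketch of the fork/wait/busy-child
load pattern described above; the spin count is an arbitrary stand-in, and
the loop runs until interrupted:

/* fork/wait/busy-child load generator: each child burns a little CPU and
 * exits, the parent sleeps in waitpid() - so part of the wall-clock time
 * goes into fork()/exit()/wait() overhead rather than pure spinning.
 * Stop with ^C. */
#include <stdlib.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
        for (;;) {
                pid_t pid = fork();

                if (pid == 0) {
                        /* child: burn some CPU, then exit */
                        volatile unsigned long i;

                        for (i = 0; i < 50000000UL; i++)
                                ;
                        _exit(0);
                }
                if (pid < 0)
                        exit(1);                /* fork failed */
                waitpid(pid, NULL, 0);          /* parent sleeps here */
        }
}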

> Robert was able to reproduce this, and noticed that the scheduler had niced
> the X server down as far as it would go.

Right. It's totally starved for CPU, so it never sleeps, so it gets turned
into non-interactive, so it gets round-robined-with-the-background-tasks.

> I'm a pretty pathological case, because I was driving two 1600x1200x24
> displays with a crufty old AGP voodoo3 and a $2 PCI nvidia. Am now using a
> presumably less crufty radeon and the problem persists.

I saw the problem on this 4-CPU machine with my old graphics card, which
was something perfectly modern (ATI Radeon 7000). I can still trigger it
even with my current Radeon 8500, but I need to basically make the build
be "make -j15" ot something like that to trigger it, at which point I'm
not sure it's relevant any more..

And yes, I think my five-liner makes a difference to me, but since I need
to push the machine so far "off" a reasonable load to trigger it, I don't
think my machine is a good test.

A good test is a fairly slow graphics card, with some stuff happening all
the time on the screen (ie xdaliclock or a noisy web-site). And then
seeing what moving a window does when doing a compile.

The window moving itself is not helped by my patch, but my patch _should_
mean that the _users_ of X (which are less starved for CPU than X is,
since they don't actually have to do the heavy lifting) will be giving X
"life points" as X wakes them up.

And that's the whole point of the patch. Any synchronous wake-up that
wakes up an interactive task is a _good_ thing. Not just for the
interactive task itself, but also for the thing that causes it to wake up.
It makes X itself more interactive - it still goes into "background mode"
for short stretches, but it seems to need more pushing, and it seems
to come out of its funk faster.

Note the "seems". This is why I'd like others to test it - since it's very
much a perception issue.

Linus

2003-03-06 15:51:35

by Jeff Garzik

[permalink] [raw]
Subject: Re: [patch] "HT scheduler", sched-2.5.63-B3

Pardon the suggestion of a dumb heuristic, feel free to ignore me:
would it work to run-first processes that have modified their iopl()
level? i.e. "if you access hardware directly, we'll treat you specially
in the scheduler"?

An alternative is to encourage distros to set some sort of flag for
processes like the X server, when it is run. This sounds suspiciously
like the existing "renice X server" hack, but it could be something like
changing the X server from SCHED_OTHER to SCHED_HW_ACCESS instead.

Just throwing out some things, because I care about apps which access
hardware from userspace :)

2003-03-06 16:44:41

by Ingo Molnar

[permalink] [raw]
Subject: Re: [patch] "HT scheduler", sched-2.5.63-B3


On Thu, 6 Mar 2003, Linus Torvalds wrote:

> We should notice automatically that X is "interactive", thanks to the
> fact that interactive tasks wait for it. That's the whole point of my
> very simple patch..

the whole compilation (gcc tasks) will be rated 'interactive' as well,
because an 'interactive' make process and/or shell process is waiting on
it. I tried something like this before, and it didn't work.

> So don't argue about things that are obviously broken. Renicing X is not
> the answer, and in fact there have been multiple reports that it makes
> XMMS skip sound because it just _hides_ the problem and moves it
> elsewhere.

the xine problem has been analyzed quite well (it is analogous to the XMMS
problem): it's not X that makes XMMS skip, it's the other CPU-bound tasks
on the desktop that cause it to skip occasionally. Increasing the
priority of xine to just -1 or -2 solves the skipping problem.

Ingo

2003-03-06 16:42:53

by Linus Torvalds

[permalink] [raw]
Subject: Re: [patch] "HT scheduler", sched-2.5.63-B3


On Thu, 6 Mar 2003, Jeff Garzik wrote:
>
> Pardon the suggestion of a dumb heuristic, feel free to ignore me:
> would it work to run-first processes that have modified their iopl()
> level? i.e. "if you access hardware directly, we'll treat you specially
> in the scheduler"?

See, the thing is, I don't actually believe that X is _special_ in this
regard.

If we have an interactive process that has been waiting nicely for
somebody else (ie it is "interactive" in that it tends to run quickly and
then wait again, without blowing away the cache), then I think that
"somebody else" should be automatically getting some of the extra
attention - just for latency reasons.

It's not an X special-case issue. It's a completely _generic_ "low-latency
service to interactive tasks" issue.

Put another way:

Imagine other servers running on the same machine. Imagine, if you will, a
logging service (or IM server, or whatever) that gets overwhelmed when it
has thousands of interactive clients on an overloaded machine. We know the
clients care about latency (clearly true since they sleep a lot - they
can't be about bandwidth: the logic only triggers if the clients have lots
of interactivity bonus points), so if we notice that this server is waking
up a lot of these latency-critical clients, doesn't it make sense to make
the _server_ as latency-critical too?

See? My small patch hits _exactly_ that case. It misses a lot of
opportunities (it really only works with synchronous wakeups, so in
practice it probably ends up working mainly for things like UNIX domain
sockets and regular pipes), but that's an implementation issue, not a
"special case" thing.

Linus

2003-03-06 16:54:44

by Linus Torvalds

[permalink] [raw]
Subject: Re: [patch] "HT scheduler", sched-2.5.63-B3


On Thu, 6 Mar 2003, Ingo Molnar wrote:
>
> the whole compilation (gcc tasks) will be rated 'interactive' as well,
> because an 'interactive' make process and/or shell process is waiting on
> it.

No. The make that is waiting for it will be woken up _once_ - when the
thing dies. Marking it interactive at that point is absolutely fine.

> I tried something like this before, and it didnt work.

You can't have tried it very hard.

In fact, you haven't apparently tried it hard enough to even bother giving
my patch a look, much less apply it and try it out.

> the xine problem has been analyzed quite well (it is analogous to the XMMS
> problem): it's not X that makes XMMS skip, it's the other CPU-bound tasks
> on the desktop that cause it to skip occasionally. Increasing the
> priority of xine to just -1 or -2 solves the skipping problem.

Are you _crazy_?

Normal users can't "just increase the priority". You have to be root to do
so. And I already told you why it's only hiding the problem.
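
As a concrete reference for that point, a minimal user-space check with the
standard setpriority() call; run as a normal user it fails, run as root it
succeeds (the exact errno can vary):

/* Try to raise our own priority: negative nice values need privilege. */
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/resource.h>
#include <sys/time.h>

int main(void)
{
        if (setpriority(PRIO_PROCESS, 0, -2) < 0) {
                fprintf(stderr, "setpriority(-2) failed: %s\n",
                        strerror(errno));
                return 1;
        }
        printf("priority raised to nice -2\n");
        return 0;
}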

In short, you're taking a very NT'ish approach - make certain programs run
in the "foreground", and give them a static boost because they are
magically more important. And you're ignoring the fact that the heuristics
we have now are clearly fundamentally broken in certain circumstances.

I've pointed out the circumstances, I've told you why it happens and when
it happens, and you've not actually even answered that part. You've only
gone "it's not a problem, you can fix it up by renicing every time you
find a problem".

Get your head out of the sand, and stop this "nice" blathering.

Linus

2003-03-06 17:03:20

by Ingo Molnar

[permalink] [raw]
Subject: Re: [patch] "HT scheduler", sched-2.5.63-B3


On Wed, 5 Mar 2003, Linus Torvalds wrote:

> p->sleep_avg += sleep_time;
> - if (p->sleep_avg > MAX_SLEEP_AVG)
> + if (p->sleep_avg > MAX_SLEEP_AVG) {
> + int ticks = p->sleep_avg - MAX_SLEEP_AVG + current->sleep_avg;
> p->sleep_avg = MAX_SLEEP_AVG;
> + if (ticks > MAX_SLEEP_AVG)
> + ticks = MAX_SLEEP_AVG;
> + if (!in_interrupt())
> + current->sleep_avg = ticks;
> + }
> +
> p->prio = effective_prio(p);

interesting approach, but it has one problem which so far i tried to
avoid: it makes it too easy for a process to gain a bonus. Until now
pretty much the only way to get an interactive bonus was to actually
sleep. Another rule was that interactivity is kept constant, ie. the only
'source' of interactivity was passing time, not some artificial activity
performed by any process. Even the timeslice passing across fork()/exit()
is controlled carefully to maintain the total sum of timeslices.

With the above code it's enough to keep a single helper thread around
which blocks on a pipe, to create an almost endless source of
interactivity bonus. And it does not even have to be 'deliberate' - there's
tons of code that just waits for a CPU-bound task to finish (eg. 'make'
waiting for gcc to finish), and such processes/threads often have a maximum
boost already, in which case the interactivity boost is not justified.

Anyway, Andrew, could you give Linus' patch a go as well?

Ingo


2003-03-06 17:06:56

by Ingo Molnar

[permalink] [raw]
Subject: Re: [patch] "HT scheduler", sched-2.5.63-B3


On Thu, 6 Mar 2003, Jeff Garzik wrote:

> Pardon the suggestion of a dumb heuristic, feel free to ignore me:
> would it work to run-first processes that have modified their iopl()
> level? i.e. "if you access hardware directly, we'll treat you specially
> in the scheduler"?

we did this in the Red Hat kernel, just as a quick hack to boost X without
having to modify X (bleh). While it worked very well in practice, it's a
really bad idea from an interface design POV, because it singles out X
mostly based on a random characteristic of X, but leaves out some other
tasks that are 'suffering' from not being recognized as interactive, but
still having a user watching them (and expecting keypresses to react
fast): games, xine, XMMS, etc.

> An alternative is to encourage distros to set some sort of flag for
> processes like the X server, when it is run. This sounds suspiciously
> like the existing "renice X server" hack, but it could be something like
> changing the X server from SCHED_OTHER to SCHED_HW_ACCESS instead.

yes, an ELF flag might work, or my suggestion to allow applications to
increase their priority (up until a certain degree).

Ingo

2003-03-06 17:01:52

by Ingo Molnar

[permalink] [raw]
Subject: Re: [patch] "HT scheduler", sched-2.5.63-B3


On Thu, 6 Mar 2003, Linus Torvalds wrote:

> See, the thing is, I don't actually believe that X is _special_ in this
> regard.

X is special. Especially in Andrew's wild-window-dragging experiment X is
a pure CPU-bound task that just eats CPU cycles no end. The _only_ thing
that makes it special is that there's a human looking at the output of the
X client. This is information that is simply not available to the kernel.

Windows solves this problem by giving the application that owns 'the
foreground window' a special boost - they have the window manager in the
kernel after all. I do not like this approach; i could very well depend on
the latency of an application that is not directly connected to the
foreground task.

the only thing that is more or less a good sign of humans being involved
is delay. It's not always the case, but _most_ of the time, when a human
is involved, there's lots of sleep time. The 2.4 scheduler kind of
rewarded tasks based on this, and the 2.5 scheduler does it more
prominently.

now which are those tasks that are CPU-bound but still have a human eye
looking at them most of the time? The two main ones are games and video
playback. Games tend to be CPU hogs for natural reasons, and video
playback is something that is CPU-intensive but non-smooth output is still
easily noticed by humans.

note that there isn't all that much problem with any but these two
categories.

and for those two categories, we have to give up and just help the kernel
a bit. We have to tell the kernel that there's a human looking at the
output of these applications. One plan that i think might work would be to
enable users to raise priority of applications by 2 or 3 priority levels,
which the kernel would allow. If the interface only allows such priority
setting for the current process, and if it's not inherited across fork(),
then at least initially, it would be harder to abuse this mechanism for a
'boost all of my processes' tool in .bashrc.

another experiment (a really bad hack) was to do the boost in iopl() -
recognizing the fact that if an application uses iopl(), it's privileged
and it's special. This is of course unacceptable, but it isolated X and a
couple of other 'important' apps. It doesn't solve the 'xine problem'.

Ingo

2003-03-06 17:10:29

by Alan

[permalink] [raw]
Subject: Re: [patch] "HT scheduler", sched-2.5.63-B3

On Thu, 2003-03-06 at 16:01, Jeff Garzik wrote:
> Pardon the suggestion of a dumb heuristic, feel free to ignore me:
> would it work to run-first processes that have modified their iopl()
> level? i.e. "if you access hardware directly, we'll treat you specially
> in the scheduler"?

Not all X servers do that. X is not special in any way. It's just a
daemon. It has the same timing properties as many other daemons doing
time-critical operations for many clients.


2003-03-06 17:14:37

by Ingo Molnar

[permalink] [raw]
Subject: Re: [patch] "HT scheduler", sched-2.5.63-B3


another thing. What really happens in the 'recompile job' thing is not
that X gets non-interactive. Most of the time it _is_ interactive. The
problem is that the gcc processes and make processes themselves, which do
use up a considerable portion of CPU time (generate a load of 4-5 with
make -j2), get rated as interactive as well. Shells, gnome-terminal, X
itself will often be preempted by these gcc and make processes. Making X
more interactive does not help in this case.

so my patchset attempts to tune things in the direction of making the
scheduler recognize the compile job as truly CPU-bound. It was a bad
interaction of child-inherits-priority plus child-wakeup priorities.

My suggestion to increase X's priority was just an unrelated suggestion -
if Andrew is dragging windows around then X does get 100% CPU-bound, and
the only good way to get out of that situation is to tell the system that
it's rightfully monopolizing the CPU, and that the user has no problem
with that.

I also had feedback from people who reniced X in the _other_ direction,
because they did not want their rare administration work done under X to
impact the general performance of the server.

Ingo

2003-03-06 17:23:09

by Ingo Molnar

[permalink] [raw]
Subject: Re: [patch] "HT scheduler", sched-2.5.63-B3


On Thu, 6 Mar 2003, Linus Torvalds wrote:

> > interesting approach,
>
> Ahh.. After five emails on the subject, you finally took a look at the
> patch? Good.

no, it was the first email i wrote, it was in my postponed queue.

Ingo

2003-03-06 17:22:08

by Linus Torvalds

[permalink] [raw]
Subject: Re: [patch] "HT scheduler", sched-2.5.63-B3


On Thu, 6 Mar 2003, Ingo Molnar wrote:
>
> interesting approach,

Ahh.. After five emails on the subject, you finally took a look at the
patch? Good.

> but it has one problem which so far i tried to
> avoid: it makes it too easy for a process to gain a bonus.

Yes and no. It actually follows your rules:

> Until now
> pretty much the only way to get an interactive bonus was to actually
> sleep.

And that is still true. _Somebody_ has to sleep, and the bonus is still
entirely based on how long that somebody slept.


> Another rule was that interactivity is kept constant, ie. the only
> 'source' of interactivity was passing time, not some artificial activity
> performed by any process.

And this is also still true. Again, the only _source_ of interactivity is
the fact that the woken-up process slept for a noticeable time. There is
no "artificial source" there.

Do the math: the patch uses the exact same source as the current algorithm
does - it only changes the _cutoff_.

The current algorithm just throws away "interactivity bonus" points when
the sleeper has gotten to some maximum value.

The new one doesn't throw them away when it hits the maximum - it gives
them to the person who woke it up.

> Even the timeslice passing across fork()/exit()
> is controlled carefully to maintain the total sum of timeslices.

But my patch DID NOT CHANGE THE TOTAL SUM! Read the patch again.

See:

+ int ticks = p->sleep_avg - MAX_SLEEP_AVG + current->sleep_avg;
p->sleep_avg = MAX_SLEEP_AVG;

Note how this breaks down:

"p->sleep_avg - MAX_SLEEP_AVG"

this is the part we're "throwing away", because the sleeper had already
accumulated enough interactivity points.

" + current->sleep_avg"

this is the part that the waker _already_had_.

if (ticks > MAX_SLEEP_AVG)
ticks = MAX_SLEEP_AVG;

This just says that the waker, too, will be limited by the "maximum
interactivity" thing.

if (!in_interrupt())
current->sleep_avg = ticks;

and this part says "we only give the potential boost to a _synchronous_
waker" (ie a network packet that comes in and wakes somebody up in
bottom half processing will _not_ cause an interactivity boost to a random
process).

See? The patch maintains the rule that "interactivity" only gets created
by sleeping. The only thing it really does is to change how we
_distribute_ the interactivity we get. It gives some of the interactivity
to the waker.

> With the above code it's enough to keep a single helper thread around
> which blocks on a pipe, to create an almost endless source of
> interactivity bonus.

No. You do not get to "create" interactivity. All the CPU cycles you win
from interactivity you must have lost by having one of your processes wait
for it.

Sure - it changes the balance of who gets to be considered interactive.
But that's the whole _point_ of it.

> And it does not even have to be 'deliberate' - there's
> tons of code that just waits for a CPU-bound task to finish (eg. 'make'
> waiting for gcc to finish), and such processes/threads often have a maximum
> boost already, in which case the interactivity boost is not justified.

And if they already have the maximum boost, they won't get any more. Look
at the five lines of actual code added.

Also, note how your "waiting for gcc to finish" is still not true. Sure,
that "make" will be considered interactive, but it's not going to help the
waker (gcc) any, since it will be interactive waiting for gcc to _die_.

I can come up with better (ie "real") examples of where this changes
behaviour:

- "cc1" (slow) writes to a pipe to "as" (fast)

"as" is fast, so as ends up waiting most of the time. Thus it ends up
being marked interactive.

When cc1 wakes up as, assuming as has been marked "maximally
interactive", cc1 will get an interactivity boost too.

Is this "wrong"? Maybe, if you see it from a pure "interactivity"
standpoint. But another way of seeing it is to say that it automatically
tries to _balance_ this kind of pipeline - since "cc1" is much slower and
actually _wants_ the interactivity that "as" is clearly not ever going to
actually get any real advantage from, it is actually likely to be
perfectly fine to give "cc1" a priority.

Also, note that if cc1 is _really_ slow (which it is) and really doesn't
wake up as very often, you won't see this effect, since the
"anti-interactive" code will balance things out.

In short, it's all about balancing. There are things that are
"pro-interactive" (real sleeping), and there are things that are
"anti-interactive" (using up your timeslice). The question is how you
spread out the bonus points (or the negative points).

The current scheduler doesn't spread them out at all. I think that's a
bug, since pipelines of multiple processes are actually perfectly common,
and X is only one example of this.

And my patch may spread it out _too_ much. Maybe we shouldn't give _all_
of the left-over interactivity to the waker. Maybe we should give just
half of it away..

See?

Linus
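
To see the redistribution in numbers, here is a toy user-space model of the
hunk broken down above; MAX_SLEEP_AVG here is a stand-in for the HZ-based
value (2*HZ in the stock tree, 10*HZ in the combo patch later in the thread):

/* Toy user-space model (not kernel code) of the quoted five-line change:
 * the sleeper's overflow beyond MAX_SLEEP_AVG goes to the waker instead of
 * being thrown away. */
#include <stdio.h>

#define MAX_SLEEP_AVG 2000      /* stand-in for 2*HZ with HZ=1000 */

static void wake(long *sleeper_avg, long *waker_avg, long sleep_time)
{
        *sleeper_avg += sleep_time;
        if (*sleeper_avg > MAX_SLEEP_AVG) {
                long ticks = *sleeper_avg - MAX_SLEEP_AVG + *waker_avg;

                *sleeper_avg = MAX_SLEEP_AVG;
                if (ticks > MAX_SLEEP_AVG)
                        ticks = MAX_SLEEP_AVG;
                *waker_avg = ticks;     /* the "!in_interrupt()" case */
        }
}

int main(void)
{
        long sleeper = 1900, waker = 300;

        wake(&sleeper, &waker, 500);    /* sleeper overflows by 400 */
        printf("sleeper=%ld waker=%ld\n", sleeper, waker);
        /* prints: sleeper=2000 waker=700 - the 400 left-over ticks went to
         * the waker; the old code would simply have discarded them. */
        return 0;
}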

2003-03-06 17:27:37

by Linus Torvalds

[permalink] [raw]
Subject: Re: [patch] "HT scheduler", sched-2.5.63-B3


On 6 Mar 2003, Alan Cox wrote:
>
> Not all X servers do that. X is not special in any way. It's just a
> daemon. It has the same timing properties as many other daemons doing
> time-critical operations for many clients.

I really think that this is the most important observation about the
behaviour. X really isn't doing anything that other daemons don't do. It
so happens that what X is doing is a lot more complex than most daemons
do, so it's fairly easy to trigger X into using a fair amount of CPU time,
but that only makes the breakdown case so much easier to see.

In all honesty, though, it's also obviously the case that the breakdown
case in the case of X is _really_ visible in a very very concrete way,
since X is the _only_ thing really "visible" unless you have Direct
Rendering clients. So even if the problem were to happen with another
daemon, it probably wouldn't stand out quite as badly. So in that sense X
ends up having some special characteristics - "visibility".

Linus

2003-03-06 17:33:52

by Linus Torvalds

[permalink] [raw]
Subject: Re: [patch] "HT scheduler", sched-2.5.63-B3


On Thu, 6 Mar 2003, Ingo Molnar wrote:
>
> another thing. What really happens in the 'recompile job' thing is not
> that X gets non-interactive. Most of the time it _is_ interactive.

This is not what I've seen.

When X is interactive, and is competing against other interactive jobs,
you don't get multi-second slowdowns. X still gets 10% of the CPU, it just
gets it in smaller chunks.

The multi-second "freezes" are the thing that bothered me, and those were
definitely due to the fact that X was competing as a _non_interactive
member against other non-interactive members, causing it to still get 10%
of the CPU, but only every few seconds. So you'd get a very bursty
behaviour with very visible pauses.

It's ok to slow X down. Nobody in their right mind would expect X to track
the mouse 100% when scrolling and the machine load is 15+. I certainly
don't.

But having X just _pause_ for several seconds gets to me. I can't easily
make it happen any more thanks to having ridiculous hardware, and I think
X itself has gotten better thanks to more optimizations in both clients
and X itself (ie if the CPU requirements of X go down from 5% to 3%, it
gets a _lot_ harder to trigger).

But it was definitely there. 3-5 second _pauses_. Not slowdowns.

Linus

2003-03-06 17:34:26

by Alan

[permalink] [raw]
Subject: Re: [patch] "HT scheduler", sched-2.5.63-B3

On Thu, 2003-03-06 at 17:11, Ingo Molnar wrote:
> X is special. Especially in Andrew's wild-window-dragging experiment X is
> a pure CPU-bound task that just eats CPU cycles no end. The _only_ thing
> that makes it special is that there's a human looking at the output of the
> X client. This is information that is simply not available to the kernel.

Just like a streaming video server
Just like a 3D game using DRI

X isn't special at all. Research OS people have done stuff like time transfers,
but I've not seen that in a production OS. In that situation an interactive
task blocking on a server hands some of its interactiveness over to whoever
services the request.

That's not a trivial Linux addition, however.

2003-03-06 17:44:48

by Ingo Molnar

[permalink] [raw]
Subject: Re: [patch] "HT scheduler", sched-2.5.63-B3


okay, here's a patch that merges Linus' "priority boost backmerging" and
my CPU-hog-detection-improvement patches. Andrew, just in case you were
about to test various scheduler-patch combinations on your box, this might
be something worth trying.

Ingo

--- linux/include/linux/sched.h.orig
+++ linux/include/linux/sched.h
@@ -328,7 +328,7 @@ struct task_struct {
prio_array_t *array;

unsigned long sleep_avg;
- unsigned long sleep_timestamp;
+ unsigned long last_run;

unsigned long policy;
unsigned long cpus_allowed;
--- linux/kernel/sched.c.orig
+++ linux/kernel/sched.c
@@ -54,20 +54,19 @@
/*
* These are the 'tuning knobs' of the scheduler:
*
- * Minimum timeslice is 10 msecs, default timeslice is 150 msecs,
- * maximum timeslice is 300 msecs. Timeslices get refilled after
+ * Minimum timeslice is 10 msecs, default timeslice is 100 msecs,
+ * maximum timeslice is 200 msecs. Timeslices get refilled after
* they expire.
*/
#define MIN_TIMESLICE ( 10 * HZ / 1000)
-#define MAX_TIMESLICE (300 * HZ / 1000)
-#define CHILD_PENALTY 95
+#define MAX_TIMESLICE (200 * HZ / 1000)
+#define CHILD_PENALTY 50
#define PARENT_PENALTY 100
#define EXIT_WEIGHT 3
#define PRIO_BONUS_RATIO 25
#define INTERACTIVE_DELTA 2
-#define MAX_SLEEP_AVG (2*HZ)
-#define STARVATION_LIMIT (2*HZ)
-#define NODE_THRESHOLD 125
+#define MAX_SLEEP_AVG (10*HZ)
+#define STARVATION_LIMIT (10*HZ)

/*
* If a task is 'interactive' then we reinsert it in the active
@@ -323,26 +322,36 @@ static inline int effective_prio(task_t
* Also update all the scheduling statistics stuff. (sleep average
* calculation, priority modifiers, etc.)
*/
+static inline void __activate_task(task_t *p, runqueue_t *rq)
+{
+ enqueue_task(p, rq->active);
+ rq->nr_running++;
+}
+
static inline void activate_task(task_t *p, runqueue_t *rq)
{
- unsigned long sleep_time = jiffies - p->sleep_timestamp;
- prio_array_t *array = rq->active;
+ unsigned long sleep_time = jiffies - p->last_run;

if (!rt_task(p) && sleep_time) {
/*
* This code gives a bonus to interactive tasks. We update
* an 'average sleep time' value here, based on
- * sleep_timestamp. The more time a task spends sleeping,
+ * ->last_run. The more time a task spends sleeping,
* the higher the average gets - and the higher the priority
* boost gets as well.
*/
p->sleep_avg += sleep_time;
- if (p->sleep_avg > MAX_SLEEP_AVG)
+ if (p->sleep_avg > MAX_SLEEP_AVG) {
+ int ticks = p->sleep_avg - MAX_SLEEP_AVG + current->sleep_avg;
p->sleep_avg = MAX_SLEEP_AVG;
- p->prio = effective_prio(p);
+ if (ticks > MAX_SLEEP_AVG)
+ ticks = MAX_SLEEP_AVG;
+ if (!in_interrupt())
+ current->sleep_avg = ticks;
+ }
+ p->prio = effective_prio(p);
}
- enqueue_task(p, array);
- nr_running_inc(rq);
+ __activate_task(p, rq);
}

/*
@@ -479,10 +488,13 @@ repeat_lock_task:
}
if (old_state == TASK_UNINTERRUPTIBLE)
rq->nr_uninterruptible--;
- activate_task(p, rq);
-
- if (p->prio < rq->curr->prio)
- resched_task(rq->curr);
+ if (sync)
+ __activate_task(p, rq);
+ else {
+ activate_task(p, rq);
+ if (p->prio < rq->curr->prio)
+ resched_task(rq->curr);
+ }
success = 1;
}
p->state = TASK_RUNNING;
@@ -525,8 +537,16 @@ void wake_up_forked_process(task_t * p)
p->prio = effective_prio(p);
}
set_task_cpu(p, smp_processor_id());
- activate_task(p, rq);

+ if (unlikely(!current->array))
+ __activate_task(p, rq);
+ else {
+ p->prio = current->prio;
+ list_add_tail(&p->run_list, &current->run_list);
+ p->array = current->array;
+ p->array->nr_active++;
+ rq->nr_running++;
+ }
task_rq_unlock(rq, &flags);
}

@@ -953,6 +973,11 @@ static inline void pull_task(runqueue_t
*/
if (p->prio < this_rq->curr->prio)
set_need_resched();
+ else {
+ if (p->prio == this_rq->curr->prio &&
+ p->time_slice > this_rq->curr->time_slice)
+ set_need_resched();
+ }
}

/*
@@ -1016,7 +1041,7 @@ skip_queue:
*/

#define CAN_MIGRATE_TASK(p,rq,this_cpu) \
- ((jiffies - (p)->sleep_timestamp > cache_decay_ticks) && \
+ ((jiffies - (p)->last_run > cache_decay_ticks) && \
!task_running(rq, p) && \
((p)->cpus_allowed & (1UL << (this_cpu))))

@@ -1076,9 +1101,9 @@ DEFINE_PER_CPU(struct kernel_stat, kstat
* increasing number of running tasks:
*/
#define EXPIRED_STARVING(rq) \
- ((rq)->expired_timestamp && \
+ (STARVATION_LIMIT && ((rq)->expired_timestamp && \
(jiffies - (rq)->expired_timestamp >= \
- STARVATION_LIMIT * ((rq)->nr_running) + 1))
+ STARVATION_LIMIT * ((rq)->nr_running) + 1)))

/*
* This function gets called by the timer code, with HZ frequency.
@@ -1201,7 +1226,7 @@ need_resched:
rq = this_rq();

release_kernel_lock(prev);
- prev->sleep_timestamp = jiffies;
+ prev->last_run = jiffies;
spin_lock_irq(&rq->lock);

/*
@@ -1701,7 +1726,7 @@ static int setscheduler(pid_t pid, int p
else
p->prio = p->static_prio;
if (array)
- activate_task(p, task_rq(p));
+ __activate_task(p, task_rq(p));

out_unlock:
task_rq_unlock(rq, &flags);
--- linux/kernel/fork.c.orig
+++ linux/kernel/fork.c
@@ -916,7 +917,7 @@ static struct task_struct *copy_process(
*/
p->first_time_slice = 1;
current->time_slice >>= 1;
- p->sleep_timestamp = jiffies;
+ p->last_run = jiffies;
if (!current->time_slice) {
/*
* This case is rare, it happens when the parent has only

2003-03-06 17:42:35

by jvlists

[permalink] [raw]
Subject: Re: [patch] "HT scheduler", sched-2.5.63-B3


[email protected] (Ingo Molnar) writes:

> X is special. Especially in Andrew's wild-window-dragging experiment X is
> a pure CPU-bound task that just eats CPU cycles no end. The _only_ thing
> that makes it special is that there's a human looking at the output of the
> X client. This is information that is simply not available to the kernel.

Isn't it special because the window manager and other interactive
tasks are blocked waiting for it? Presumably during a wild-window drag
there will be loads of block-wakeup sequences between X and the window
manager. If every such event gave a little boost to the X server that
would mean the X server would get loads of interactivity brownie
points.

This may be a stupid idea (feel free to ignore, it may even be what
the kernel already does), but if you had a separate input server and
display server instead of one X server you could follow the trail all
the way from the real user.

/dev/input/mice is special because we know it interfaces to the user.
The X input server gains points for waiting on /dev/input/mice
interactive X program gains points for waiting on X input server
X output server is special because it blocks waiting on the
interactive X program (and vice versa!)

The hard part would be detecting interactivity for stuff waiting on IP
sockets.

If this information was available to user programs then the X display
server could even prioritize rendering a new character in bash over
displaying the fishbowl animation running in the background.

Jan

P.S. IMVHO the xine problem is completely different as it has nothing to do
with interactivity but with the fact that it is soft
real-time, i.e. you need to distinguish xine from, say, a gimp filter or
a 3D renderer with incremental live updates of the scene it is
creating.

2003-03-06 17:45:04

by John Levon

[permalink] [raw]
Subject: Re: [patch] "HT scheduler", sched-2.5.63-B3

On Thu, Mar 06, 2003 at 09:42:03AM -0800, Linus Torvalds wrote:

> But it was definitely there. 3-5 second _pauses_. Not slowdowns.

It's still there. Red Hat 8.0, 2.5.63. The thing can pause for 15+
seconds (and during this time madplay quite happily trundled on playing
an mp3). Workload was KDE, gcc, nothing exciting...

john

2003-03-06 17:39:56

by Ingo Molnar

[permalink] [raw]
Subject: Re: [patch] "HT scheduler", sched-2.5.63-B3


On Thu, 6 Mar 2003, Linus Torvalds wrote:

> this is the part we're "throwing away", because the sleeper had already
> accumulated enough interactivity points.
>
> " + current->sleep_avg"
>
> this is the part that the waker _already_had_.
>
> if (ticks > MAX_SLEEP_AVG)
> ticks = MAX_SLEEP_AVG;
>
> This just says that the waker, too, will be limited by the "maximum
> interactivity" thing.

ok. I misread this part. It's actually a 'super boost' for interactive
tasks and tasks related to them.

> if (!in_interrupt())
> current->sleep_avg = ticks;

this (making a difference in characteristics based on in_interrupt()) was
something i tried before, but it got killed due to generic wackiness :-)
Also, it's not really justified to isolate a process just because it got
woken up by a hw event.

> and this part says "we only give the potential boost to a _synchronous_
> waker" (ie a network packet that comes in and wakes somebody up in
> bottom half processing will _not_ cause an interactivity boost to a
> random process).

how about a keyboard interrupt?

> See? The patch maintains the rule that "interactivity" only gets created
> by sleeping. The only thing it really does is to change how we
> _distribute_ the interactivity we get. It gives some of the
> interactivity to the waker.

yes - i misunderstood this property of it - and this removed most of my
objections.

> Also, note how your "waiting for gcc to finish" is still not true. Sure,
> that "make" will be considered interactive, but it's not going to help
> the waker (gcc) any, since it will be interactive waiting for gcc to
> _die_.

there's also another phenomenon in the 'make -j5 hell': gas getting
boosted due to it waiting on the gcc pipeline. Now gcc will be 'back
boosted'. But we might be lucky and get away with it - testing will show.

> - "cc1" (slow) writes to a pipe to "as" (fast)
>
> "as" is fast, so as ends up waiting most of the time. Thus it ends up
> being marked interactive.
>
> When cc1 wakes up as, assuming as has been marked "maximally
> interactive", cc1 will get an interactivity boost too.
>
> Is this "wrong"? Maybe, if you see it from a pure "interactivity"
> standpoint. But another way of seeing it is to say that it automatically
> tries to _balance_ this kind of pipeline - since "cc1" is much slower
> and actually _wants_ the interactivity that "as" is clearly not ever
> going to actually get any real advantage from, it is actually likely to
> be perfectly fine to give "cc1" a priority.

okay.

> In short, it's all about balancing. There are things that are
> "pro-interactive" (real sleeping), and there are things that are
> "anti-interactive" (using up your timeslice). The question is how you
> spread out the bonus points (or the negative points).

yes.

> The current scheduler doesn't spread them out at all. I think that's a
> bug, since pipelines of multiple processes are actually perfectly
> common, and X is only one example of this.

i have tried multiple schemes before to spread out interactivity, none
worked so far - but i have not tried any 'back-boosting' towards a CPU-hog
before, so it's an interesting experiment. If you look at the
child-timeslice thing, that is a common vector for interactivity to spread.

> And my patch may spread it out _too_ much. Maybe we shouldn't give _all_
> of the left-over interactivity to the waker. Maybe we should give just
> half of it away..

yes, not spreading out could also make it possible to give it back via
multiple wakeup links, interactivity will 'diffuse' along wakeups.

Ingo

2003-03-06 17:48:52

by Ingo Molnar

[permalink] [raw]
Subject: Re: [patch] "HT scheduler", sched-2.5.63-B3


On 6 Mar 2003, Alan Cox wrote:

> On Thu, 2003-03-06 at 17:11, Ingo Molnar wrote:
> > X is special. Especially in Andrew's wild-window-dragging experiment X is
> > a pure CPU-bound task that just eats CPU cycles no end. The _only_ thing
> > that makes it special is that there's a human looking at the output of the
> > X client. This is information that is simply not available to the kernel.
>
> Just like a streaming video server
> Just like a 3D game using DRI
>
> X isnt special at all. [...]

i'm convinced we are talking about the same thing. My argument was that X
is special _only_ because humans are looking at it. Ie. it's not special
at all to the kernel.

Ingo

2003-03-06 17:52:07

by Linus Torvalds

[permalink] [raw]
Subject: Re: [patch] "HT scheduler", sched-2.5.63-B3


On Thu, 6 Mar 2003, Ingo Molnar wrote:
>
> > And my patch may spread it out _too_ much. Maybe we shouldn't give _all_
> > of the left-over interactivity to the waker. Maybe we should give just
> > half of it away..
>
> yes, not spreading out could also make it possible to give it back via
> multiple wakeup links, interactivity will 'diffuse' along wakeups.

Yes, I think this could work well. What we could do is basically just
always spread the interactivity evenly between the waker and the sleeper,
instead of my current "give the sleeper as much as we can, and if
anything is left over give some to the waker" approach.

So my current patch is extreme in that it tries to use up as much as
possible of the "interactivity bonus points", but it is _not_ extreme in
the sense that it doesn't inherit interactivity both ways.

So my patch will _not_ try to balance a series of three or more processes,
where only one is interactive. Because it will trigger only for the
process _directly_ connected to the interactive one.

If we spread it out instead by always trying to balance the interactivity,
it would spread out from the interactive one along any synchronous wakeup
chain. Which is more extreme than I was willing to try with a first
example..

Linus
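
A toy user-space model of that even split, for comparison with the model of
the posted hunk earlier in the thread (how, or whether, to cap the halves at
MAX_SLEEP_AVG is an open choice; this sketch caps them):

/* Toy user-space model (not kernel code) of the even split discussed above:
 * the sleeper's and the waker's sleep_avg are pooled and divided, rather
 * than only the sleeper's overflow being handed over. */
#include <stdio.h>

#define MAX_SLEEP_AVG 2000      /* stand-in; the real value is HZ-based */

static void wake_split(long *sleeper_avg, long *waker_avg, long sleep_time)
{
        long half;

        *sleeper_avg += sleep_time;
        half = (*sleeper_avg + *waker_avg) / 2;
        if (half > MAX_SLEEP_AVG)
                half = MAX_SLEEP_AVG;
        *sleeper_avg = half;
        *waker_avg = half;
}

int main(void)
{
        long sleeper = 1900, waker = 300;

        wake_split(&sleeper, &waker, 500);
        printf("sleeper=%ld waker=%ld\n", sleeper, waker);
        /* prints: sleeper=1350 waker=1350 - compare with the posted patch,
         * which would have left the sleeper at 2000 and the waker at 700. */
        return 0;
}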

2003-03-06 17:47:11

by Linus Torvalds

[permalink] [raw]
Subject: Re: [patch] "HT scheduler", sched-2.5.63-B3


On 6 Mar 2003, Alan Cox wrote:
>
> X isnt special at all. Research OS people have done stuff like time transfers
> but I've not seen that in a production OS. In that situation an interactive
> task blocking on a server hands on some of its interactiveness to whoever
> services the request.

This is what I really wanted to do - give the interactivity away at
blocking time. However, there's usually no sane way to do that, since we
don't know a-priori who we are blocking _for_. We could try to transfer it
in the unix domain socket sleep case to whoever is waiting at the other
end, but that's actually quite complex and ends up having each sleep entry
somehow inform the subsystem it is sleeping on that it wants to give
interactivity to the other end.

The thing about wakeups is that it's an "illogical" place to give the
bonus ("it's too late to give the server a bonus _now_, I wanted it when I
went to sleep"), but it's the _trivial_ place to give it.

It also (I'm convinced) is actually in the end exactly equivalent to
giving the bonus at sleep time - in the steady state picture. In the
single-transfer case it is wrong, but the single transfer case doesn't
matter - the interactivity bonus isn't an "immediate shot of caffeine into
the bloodstream" anyway. The interactivity bonus is supposed to help over
time, so the only thing that matters is really the steady state.

But the proof is in the pudding. Does this actually make things appear
"nicer" to people? Or is it just another wanking session?

Linus

2003-03-06 17:54:21

by Ingo Molnar

[permalink] [raw]
Subject: Re: [patch] "HT scheduler", sched-2.5.63-B3


On Thu, 6 Mar 2003, Linus Torvalds wrote:

> But the proof is in the pudding. Does this actually make things appear
> "nicer" to people? Or is it just another wanking session?

yes, it would be interesting to see Andrew's experiments redone for the
following combinations:

- your patch
- my patch
- both patches

in fact my patch was tested and it mostly solved the problem for Andrew,
but i'm now convinced that the combination of patches will be the real
solution for this class of problems - as that will attack _both_ ends:
CPU hogs are recognized better, and interactivity detection and
distribution are improved.

but neither patch solves all problems. The typical DRI game does nothing
but call the DRI ioctl() and burn CPU time. So i'm convinced we
need one more way to help the kernel identify tasks that are important to
people - increasing the priority of those tasks programmatically is
certainly a workable solution, but there might be other solutions.

Ingo

2003-03-06 17:59:16

by Linus Torvalds

[permalink] [raw]
Subject: Re: [patch] "HT scheduler", sched-2.5.63-B3


On Thu, 6 Mar 2003, John Levon wrote:
> On Thu, Mar 06, 2003 at 09:42:03AM -0800, Linus Torvalds wrote:
>
> > But it was definitely there. 3-5 second _pauses_. Not slowdowns.
>
> It's still there. Red Hat 8.0, 2.5.63. The thing can pause for 15+
> seconds (and during this time madplay quite happily trundled on playing
> an mp3). Workload was KDE, gcc, nothing exciting...

Oh, well. I didn't actually even verify that UNIX domain sockets will
cause synchronous wakeups, so the patch may literally be doing nothing at
all. You can try that theory out by just removing the test for
"in_interrupt()".

It will also almost certainly be totally ineffective if you use TCP
sockets for the same reason. So if your DISPLAY variable is set to
"localhost:0" instead of ":0.0", you'd definitely have to remove the
"in_interrupt()" test.

But it may also just be that the theory is just broken.

Linus

2003-03-06 18:00:50

by John Levon

[permalink] [raw]
Subject: Re: [patch] "HT scheduler", sched-2.5.63-B3

On Thu, Mar 06, 2003 at 10:07:17AM -0800, Linus Torvalds wrote:

> > > But it was definitely there. 3-5 second _pauses_. Not slowdowns.
> >
> > It's still there. Red Hat 8.0, 2.5.63. The thing can pause for 15+
>
> Oh, well. I didn't actually even verify that UNIX domain sockets will
> cause synchronous wakeups, so the patch may literally be doing nothing at
> all. You can try that theory out by just removing the test for
> "in_interrupt()".

Sorry, I think I misunderstood what you meant in your comment above. I
haven't actually tried these patches yet, worry not.

regards,
john

2003-03-06 17:54:23

by Linus Torvalds

[permalink] [raw]
Subject: Re: [patch] "HT scheduler", sched-2.5.63-B3


On Thu, 6 Mar 2003, Ingo Molnar wrote:
>
> okay, here's a patch that merges Linus' "priority boost backmerging" and
> my CPU-hog-detection-improvement patches.

I was actually going to suggest making the CPU-hog detector _worse_, to
see what the extreme behaviour of the "boost balancing" is. One of the
things I would hope for is that the interactivity balancing would act as a
damper on the CPU hog detector, and make it less important to get that one
right in the first place.

Linus

2003-03-06 18:02:08

by Ingo Molnar

[permalink] [raw]
Subject: Re: [patch] "HT scheduler", sched-2.5.63-B3


On Thu, 6 Mar 2003, Linus Torvalds wrote:

> > okay, here's a patch that merges Linus' "priority boost backmerging" and
> > my CPU-hog-detection-improvement patches.
>
> I was actually going to suggest making the CPU-hog detector _worse_, to
> see what the extreme behaviour of the "boost balancing" is. One of the
> things I would hope for is that the interactivity balancing would act as
> a damper on the CPU hog detector, and make it less important to get that
> one right in the first place.

this does not appear to happen, at least with the current incarnation:
make -j5 jobs of the kernel tree (on UP) with your patch alone are
hovering at a considerably higher dynamic priority than they are with my
patch. But interactivity itself is good as far as i can tell, in both
cases - but i think Andrew would be a better tester in this regard.

plus my patch doesn't really change the core CPU-hog detector; what it does
is change the way children/parents inherit priorities, plus change the
order in which children get their first timeslice. The end effect is that
the 'CPU-hog' make job is better recognized as such - but X itself will not
be recognized in any different way.

Ingo

2003-03-06 18:07:49

by Ingo Molnar

[permalink] [raw]
Subject: Re: [patch] "HT scheduler", sched-2.5.63-B3


in any case, i'm quite comfortable about your patch now, because it just
fits into the thinking that is behind the current boosting concept. So
it's not a random tweak added here or there (like the iopl() boost), it's
an extension of the core interactivity concept, and as such it cannot have
bad effects, unless the core concept is faulty. (which i don't think it
is.)

Ingo

2003-03-06 18:08:20

by Linus Torvalds

[permalink] [raw]
Subject: Re: [patch] "HT scheduler", sched-2.5.63-B3


On Thu, 6 Mar 2003, John Levon wrote:
>
> Sorry, I think I misunderstood what you meant in your comment above. I
> haven't actually tried these patches yet, worry not.

Oh, ok. There is still hope. And as apparently you can see the problem
easily, maybe you can be one of the poor guinea-pigs for the patches
floating around.

(Damn! 15 second lockups? That's just not much fun any more.)

Linus

2003-03-06 18:05:44

by Ingo Molnar

[permalink] [raw]
Subject: Re: [patch] "HT scheduler", sched-2.5.63-B3


On Thu, 6 Mar 2003, Linus Torvalds wrote:

> > It's still there. Red Hat 8.0, 2.5.63. The thing can pause for 15+
> > seconds (and during this time madplay quite happily trundled on playing
> > an mp3). Workload was KDE, gcc, nothing exciting...
>
> Oh, well. I didn't actually even verify that UNIX domain sockets will
> cause synchronous wakeups, so the patch may literally be doing nothing
> at all. You can try that theory out by just removing the test for
> "in_interrupt()".

you are not referring to the 'synchronous wakeups' as used by fs/pipe.c,
right? in_interrupt() separates interrupt-context wakeups (asynchronous
wakeups) from process-context wakeups - which can also be called
synchronous, in a way.

so i think your current patch should cover unix domain sockets just as
well, they certainly don't use IRQ-context wakeups.

Ingo

2003-03-06 18:13:29

by Martin J. Bligh

[permalink] [raw]
Subject: Re: [patch] "HT scheduler", sched-2.5.63-B3

>> But the proof is in the pudding. Does this actually make things appear
>> "nicer" to people? Or is it just another wanking session?
>
> yes, it would be interesting to see Andrew's experiments redone for the
> following combinations:
>
> - your patch
> - my patch
> - both patches
>
> in fact my patch was tested and it mostly solved the problem for Andrew,
> but i'm now convinced that the combination of patches will be the real
> solution for this class of problems - as that will attack _both_ ends,
> both CPU hogs are recognized better, and interactivity detection
> interactivity-distribution is improved.

It would be nice if we had a "batch-job-able" simulation of this situation,
to do more accurate testing with ... can anyone think of an easy way to
do such a thing? "waggle a few windows around on X and see if it feels
better to you or not" is kind of hard to do accurate testing with.
Of course, the simulation has to be accurate too, or it gets stupid ;-)

M.

2003-03-06 18:12:37

by Linus Torvalds

[permalink] [raw]
Subject: Re: [patch] "HT scheduler", sched-2.5.63-B3


On Thu, 6 Mar 2003, Ingo Molnar wrote:
> > Oh, well. I didn't actually even verify that UNIX domain sockets will
> > cause synchronous wakeups, so the patch may literally be doing nothing
> > at all. You can try that theory out by just removing the test for
> > "in_interrupt()".
>
> you are not referring to the 'synchronous wakeups' as used by fs/pipe.c,
> right?

No, sorry. Bad choice of words.

The traditional "synchronous wakeups" as used by fs/pipe.c is a hint to
the scheduler that the waker will go to sleep.

And no, that's not the hint I'm using at all. I'm only interested in
"process-synchronous", since if the wakeup isn't process-synchronous then
"current" doesn't make much sense to me.

> so i think your current patch should cover unix domain sockets just as
> well, they certain dont use IRQ-context wakeups.

Note that "in_interrupt()" will also trigger for callers that call from
bh-atomic regions as well as actual BH handlers. Which is correct - they
are both "interrupt contexts" as far as most users should be concerned.

The unix domain case may well be bh-atomic, I haven't looked at the code.
I'm pretty much certain that the TCP case _will_ be BH-atomic, even for
loopback.

David?

Linus

2003-03-06 18:18:47

by Ingo Molnar

[permalink] [raw]
Subject: Re: [patch] "HT scheduler", sched-2.5.63-B3


On Thu, 6 Mar 2003 [email protected] wrote:

> P.S. IMVHO the xine problem is completely different as it has nothing to do
> with interactivity but with the fact that it is soft real-time, i.e. you
> need to distinguish xine from, say, a gimp filter or a 3D renderer with
> incremental live updates of the scene it is creating.

it is the same category of problem: xine and X are both applications
which, if lagged, are noticed by users. xine can be a perfectly fine CPU
hog when playing back DVDs. It can also be a mostly interactive task when
just playing back music. For xine it's not just the audio skipping that
gets noticed, it's also the video-playback jerkiness that can be noticed
by users.

Ingo

2003-03-06 18:18:06

by Linus Torvalds

[permalink] [raw]
Subject: Re: [patch] "HT scheduler", sched-2.5.63-B3


On Thu, 6 Mar 2003, Martin J. Bligh wrote:
>
> It would be nice if we had a "batch-job-able" simulation of this situation,
> to do more accurate testing with ... can anyone think of an easy way to
> do such a thing? "waggle a few windows around on X and see if it feels
> better to you or not" is kind of hard to do accurate testing with.
> Of course, the simulation has to be accurate too, or it gets stupid ;-)

It should be possible to use the same approach that the other latency
testers use (ie contest&co), by just generating some specific background
load (say, different mixtures of X clients and plain compute-bound things
like kernel compiles).

Ie measure directly exactly what we're interested in: the latency of some
general X request, under different loads.

Linus
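
A minimal sketch of that kind of direct measurement: timing X server round
trips with plain Xlib while a background load runs (link with -lX11; the
iteration count is arbitrary):

/* Minimal latency probe: time XSync() round trips to the X server.
 * Run it while a compile or other background load is going. */
#include <stdio.h>
#include <sys/time.h>
#include <X11/Xlib.h>

static double now_ms(void)
{
        struct timeval tv;

        gettimeofday(&tv, NULL);
        return tv.tv_sec * 1000.0 + tv.tv_usec / 1000.0;
}

int main(void)
{
        Display *dpy = XOpenDisplay(NULL);
        int i;

        if (!dpy) {
                fprintf(stderr, "cannot open display\n");
                return 1;
        }
        for (i = 0; i < 100; i++) {
                double t0 = now_ms();

                XSync(dpy, False);      /* full request/reply round trip */
                printf("round trip %d: %.1f ms\n", i, now_ms() - t0);
        }
        XCloseDisplay(dpy);
        return 0;
}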

2003-03-06 18:22:12

by Dimitrie O. Paun

[permalink] [raw]
Subject: Re: [patch] "HT scheduler", sched-2.5.63-B3

On Thu, 6 Mar 2003, Ingo Molnar wrote:

> yes, an ELF flag might work, or my suggestion to allow applications to
> increase their priority (up until a certain degree).

An ELF flag might be better, as it's declarative -- it allows the kernel
to implement 'interactivity' in various ways, so we can keep tweaking it.
Priority might prove to be a bit different than interactivity, so we
better not overload the two just yet.

--
Dimi.

2003-03-06 18:19:48

by David Miller

[permalink] [raw]
Subject: Re: [patch] "HT scheduler", sched-2.5.63-B3

From: Linus Torvalds <[email protected]>
Date: Thu, 6 Mar 2003 10:20:42 -0800 (PST)

Note that "in_interrupt()" will also trigger for callers that call from
bh-atomic regions as well as actual BH handlers. Which is correct - they
are both "interrupt contexts" as far as most users should be concerned.

The unix domain case may well be bh-atomic, I haven't looked at the code.
I'm pretty much certain that the TCP case _will_ be BH-atomic, even for
loopback.

David?

Unix sockets use non-BH locks, there are no software interrupts
involved in AF_UNIX processing so no need to protect against them.

The actual wakeup comes from the socket callbacks, we use the
default data_ready() which is:

void sock_def_readable(struct sock *sk, int len)
{
        read_lock(&sk->callback_lock);
        if (sk->sleep && waitqueue_active(sk->sleep))
                wake_up_interruptible(sk->sleep);
        sk_wake_async(sk,1,POLL_IN);
        read_unlock(&sk->callback_lock);
}

And for write wakeups Unix uses its own, which is:

static void unix_write_space(struct sock *sk)
{
        read_lock(&sk->callback_lock);
        if (unix_writable(sk)) {
                if (sk->sleep && waitqueue_active(sk->sleep))
                        wake_up_interruptible(sk->sleep);
                sk_wake_async(sk, 2, POLL_OUT);
        }
        read_unlock(&sk->callback_lock);
}

So, to reiterate, no BH locking is used by AF_UNIX.

2003-03-06 20:35:42

by Andrew Morton

[permalink] [raw]
Subject: Re: [patch] "HT scheduler", sched-2.5.63-B3

Linus Torvalds <[email protected]> wrote:
>
>
> ===== kernel/sched.c 1.161 vs edited =====
> --- 1.161/kernel/sched.c Thu Feb 20 20:33:52 2003
> +++ edited/kernel/sched.c Wed Mar 5 19:09:45 2003
> @@ -337,8 +337,15 @@
> * boost gets as well.
> */
> p->sleep_avg += sleep_time;
> - if (p->sleep_avg > MAX_SLEEP_AVG)
> + if (p->sleep_avg > MAX_SLEEP_AVG) {
> + int ticks = p->sleep_avg - MAX_SLEEP_AVG + current->sleep_avg;
> p->sleep_avg = MAX_SLEEP_AVG;
> + if (ticks > MAX_SLEEP_AVG)
> + ticks = MAX_SLEEP_AVG;
> + if (!in_interrupt())
> + current->sleep_avg = ticks;
> + }
> +
> p->prio = effective_prio(p);
> }
> enqueue_task(p, array);

This improves the X interactivity tremendously. I went back to 2.5.64 base
just to verify, and the difference was very noticeable.

The test involved doing the big kernel compile while moving large xterm,
mozilla and sylpheed windows about. With this patch the mouse cursor was
sometimes a little jerky (0.1 seconds, perhaps) and mozilla redraws were
maybe 0.5 seconds laggy.

So. A big thumbs up on that one. It appears to be approximately as
successful as sched-2.5.64-a5.

Ingo's combo patch is better still - that is sched-2.5.64-a5 and your patch
combined (yes?). The slight mouse jerkiness is gone and even when doing
really silly things I cannot make it misbehave at all. I'd handwavingly
describe both your patch and sched-2.5.64-a5 as 80% solutions, and the combo
95%.


So I'm a happy camper, and will be using Ingo's combo patch. But I do not
use XMMS and xine and things like that - they may be running like crap with
these patches. I do not know, and I do not have a base to compare against
even if I could work out how to get them going.


I did spend a bit of time a while back trying to come up with some
combination of tests which would exhibit this problem, and which would allow
it to be measured and tweaked. It was unsuccessful - the peculiar
combination of a compilation and X seems to bring it out.


2003-03-06 21:53:00

by Martin Waitz

[permalink] [raw]
Subject: Re: [patch] "HT scheduler", sched-2.5.63-B3

hi :)

On Wed, Mar 05, 2003 at 07:20:34PM -0800, Linus Torvalds wrote:
> How about something more like this (yeah, untested, but you get the idea):
> the person who wakes up an interactive task gets the interactivity bonus
> if the interactive task is already maxed out. I dunno how well this plays
> with the X server, but assuming most clients use UNIX domain sockets, the
> wake-ups _should_ be synchronous, so it should work well to say "waker
> gets bonus".

i used a similar method to correctly account resource usage
(cpu,energy,..) of processes in my diploma thesis:
work done by a server (e.g. X) is accounted to the current client,
giving more resources to the server
http://admingilde.org/~martin/papers/

implementation is working but far from being mergeable...




RE: the patch, i think using sleep_avg is a wrong metric from the
beginning.

in addition, timeslices should be shortest for high priority processes
(depending on dynamic prio, not static)

but these are of course just simple statements and i don't have
a patch that makes a really good scheduler :(

--
CU, / Friedrich-Alexander University Erlangen, Germany
Martin Waitz // [Tali on IRCnet] [tali.home.pages.de] _________
______________/// - - - - - - - - - - - - - - - - - - - - ///
this is a manually generated mail; it contains //
typos and is also valid without capital letters. /
-
Those who would give up essential liberty to purchase a little
temporary safety deserve neither liberty nor safety.
Benjamin Franklin (1706 - 1790)



2003-03-06 21:56:59

by Robert Love

[permalink] [raw]
Subject: Re: [patch] "HT scheduler", sched-2.5.63-B3

On Thu, 2003-03-06 at 17:03, Martin Waitz wrote:

> RE: the patch, i think using sleep_avg is a wrong metric from the
> beginning.

Eh? It is as close to a heuristic of interactivity as I can think of.

> in addition, timeslices should be shortest for high priority processes
> (depending on dynamic prio, not static)

No, they should be longer. In some cases they should be nearly
infinitely long (which is sort of what we do with the reinsertion into
the active array for highly interactive tasks). We want interactive
tasks to always be able to run.

You may think they need shorter timeslices because they are I/O-bound.
Keep in mind they need not _use_ all their timeslice in one go, and
having a large timeslice ensures they have timeslice available when they
wake up and need to run.

Robert Love

2003-03-06 21:53:47

by Bill Davidsen

[permalink] [raw]
Subject: Re: [patch] "HT scheduler", sched-2.5.63-B3

On Thu, 6 Mar 2003, Jeff Garzik wrote:

> Pardon the suggestion of a dumb heuristic, feel free to ignore me:
> would it work to run-first processes that have modified their iopl()
> level? i.e. "if you access hardware directly, we'll treat you specially
> in the scheduler"?
>
> An alternative is to encourage distros to set some sort of flag for
> processes like the X server, when it is run. This sounds suspiciously
> like the existing "renice X server" hack, but it could be something like
> changing the X server from SCHED_OTHER to SCHED_HW_ACCESS instead.
>
> Just throwing out some things, because I care about apps which access
> hardware from userspace :)

Well, close. But any low-latency access, even if somewhat buffered,
is likely to be user visible if unresponsive. Clearly that means video
memory like X and DRI, and sound, and mouse. If you define this generally
it would include serial as well, probably not a bad thing.

The proposal to backfeed priority from interactive processes to the processes
waking them sounds useful, though perhaps it should be limited a bit.
Someone (Ingo? Linus?) proposed limiting this to a fraction of the
priority of the interactive process, but it might be more useful to
simply limit how much could be added at one time, perhaps as a fraction of
the delta between the max and the current interactive bonus, which would have
the waker increase asymptotically toward the max. Hysteresis is nice.
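
Purely as an illustration of that asymptotic approach (the names and the
1/4 fraction below are invented for this sketch, they are not anything in
the actual scheduler):

/* illustration only - invented names, not actual scheduler code */
#define MAX_BONUS	10	/* hypothetical ceiling on the interactive bonus */

static int waker_boost(int cur_bonus)
{
	/*
	 * Give the waker only a fraction of the remaining headroom, so
	 * repeated wakeups approach MAX_BONUS asymptotically instead
	 * of jumping straight to the maximum.
	 */
	return (MAX_BONUS - cur_bonus) / 4;
}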

I do agree that it would be better to identify processes using physical
devices, far better than trying to identify some subset because it's
obvious.

I think this would include the parallel port, although other than PLIP I
can't think of a case where it matters much.

--
bill davidsen <[email protected]>
CTO, TMR Associates, Inc
Doing interesting things with little computers since 1979.

2003-03-06 22:07:25

by Robert Love

[permalink] [raw]
Subject: Re: [patch] "HT scheduler", sched-2.5.63-B3

On Thu, 2003-03-06 at 15:42, Andrew Morton wrote:

> So I'm a happy camper, and will be using Ingo's combo patch. But I do not
> use XMMS and xine and things like that - they may be running like crap with
> these patches. I do not know, and I do not have a base to compare against
> even if I could work out how to get them going.

Linus,

This is great for me, too. I played around with some mp3 playing and
did the akpm-window-wiggle test. It is definitely the smoothest.

I think we definitely need Ingo's tweaked scheduler parameters - I have
been running a similar set of values myself for some time. But your
patch seems to make the difference.

This is the most subjective stuff on the planet, but here is a rough
ranking of interactivity performance in the bad cases, on a scale of 1
(worst) to 5 (best):

linus-patch + tweaked-parameters: 5
linus-patch: 4
tweaked-parameters + reniced X: 3.5
tweaked-parameters: 2.5
stock: 1

Sorry, I did not test Ingo's full patch. It is basically the tweaked
parameters plus the sync wakeup, which looks correct.

In the average case, the O(1) scheduler does fine without any changes.
The heuristic works. It is just the worst cases where we need
help, and from the above I think we have that.

Robert Love

2003-03-06 22:13:27

by Martin J. Bligh

[permalink] [raw]
Subject: Re: [patch] "HT scheduler", sched-2.5.63-B3

>> yes, an ELF flag might work, or my suggestion to allow applications to
>> increase their priority (up to a certain degree).
>
> An ELF flag might be better, as it's declarative -- it allows the kernel
> to implement 'interactivity' in various ways, so we can keep tweaking it.
> Priority might prove to be a bit different from interactivity, so we
> had better not overload the two just yet.

Difficult to see how this would work. For instance, is bash interactive
or a batch job?

M.

2003-03-06 22:20:24

by Eric Northup

[permalink] [raw]
Subject: Re: [patch] "HT scheduler", sched-2.5.63-B3

On Thursday 06 March 2003 12:35 pm, Linus Torvalds wrote:
> On 6 Mar 2003, Alan Cox wrote:
> > Not all X servers do that. X is not special in any way. Its just a
> > daemon. It has the same timing properties as many other daemons doing
> > time critical operations for many clients
>
> I really think that this is the most important observation about the
> behaviour. X really isn't doing anything that other deamons don't do. It
> so happens that what X is doing is a lot more complex than most deamons
> do, so it's fairly easy to trigger X into using a fair amount of CPU time,
> but that only makes the breakdown case so much easier to see.
>
> In all honesty, though, it's also obviously the case that the breakdown
> case in the case of X is _really_ visible in a very very concrete way,
> since X is the _only_ thing really "visible" unless you have Direct
> Rendering clients. So even if the problem were to happen with another
> daemon, it probably wouldn't stand out quite as badly. So in that sense X
> ends up having some special characteristics - "visibility".

Yes. The special thing about X (and XMMS, XINE, et al) is that the
interactivity heuristic fails in a user-noticeable way. The heuristic must
also be guessing incorrectly in other situations, but nobody seems to have
noticed when it did (or, at least, complained about it).

Another noteworthy attribute of these programs is that they _know_ they will
be used interactively. And, if there were a way for them to convey that
information to the kernel (not by using 'nice'), I bet all of the above
projects would take advantage of that mechanism.

At first I strongly disapproved of the kernel's timeslice adjustment by
interactivity estimation; policy belongs in userland, I thought. But the
algorithm is actually quite effective -- it strikes a balance between the
responsiveness of a high HZ for real-time-ish applications and avoiding
cache-thrashing in the places where it would hurt. This policy works well
most of the time, so the kernel includes it as a sensible default. But X
demonstrates that sometimes we want to set a different policy. After all,
the heuristic will never become perfect, and its worst-case behavior is
pretty bad. (If your computers are too fast to notice, try turning off
caching with /proc/mtrr. Not a great idea on an important system, though...)

The question is: what interface would be used? A new syscall seems excessive,
but something in /proc (/proc/xyz/preferred_timeslice ?) is non-optimal,
because it cannot be used from within a chroot. I am not qualified to suggest
the Proper mechanism, but I strongly believe there should be one.

--Eric

2003-03-06 22:25:08

by Martin Waitz

[permalink] [raw]
Subject: Re: [patch] "HT scheduler", sched-2.5.63-B3

On Thu, Mar 06, 2003 at 05:07:37PM -0500, Robert Love wrote:
> On Thu, 2003-03-06 at 17:03, Martin Waitz wrote:
>
> > RE: the patch, i think using sleep_avg is a wrong metric from the
> > beginning.
> Eh? It is as close to a heuristic of interactivity as I can think of.

processes tend to max out at one extreme or the other

processes that get stalled by a huge overall load of the machine
are forced to sleep, too; yet they are not interactive.

priority should be based on things the processes did, not on what
they haven't done.

> > in addition, timeslices should be shortest for high priority processes
> > (depending on dynamic prio, not static)
>
> No, they should be longer. In some cases they should be nearly
> infinitely long (which is sort of what we do with the reinsertion into
> the active array for highly interactive tasks). We want interactive
> tasks to always be able to run.

but for interactive tasks, latency is all-important
if the task can't provide a result in a short timeslice it already
failed to provide low latency and should not be considered interactive
any more.

when the dynamic prio of the currently running process is high,
then only interactive processes should run.
but these processes should be rescheduled fast, to
provide low-latency equally to all interactive processes

when the dynamic prio of the currently running process is low
then no interactive process runs and the scheduler can resort
to a more batch-job mode: increase timeslices to reduce
scheduling overhead.

> You may think they need shorter timeslice because they are I/O-bound.
> Keep in mind they need not _use_ all their timeslice in one go, and
> having a large timeslice ensures they have timeslice available when they
> wake up and need to run.
but the time slice should not be large enough to stall other
processes, which is extremely important for interactivity

--
CU, / Friedrich-Alexander University Erlangen, Germany
Martin Waitz // [Tali on IRCnet] [tali.home.pages.de]
(this is a manually generated mail; it contains typos and is
valid even without capital letters.)
-
Those who are willing to give up fundamental freedoms to gain
short-term security deserve neither freedom nor security.
Benjamin Franklin (1706 - 1790)



2003-03-06 22:47:51

by Dimitrie O. Paun

[permalink] [raw]
Subject: Re: [patch] "HT scheduler", sched-2.5.63-B3

On Thu, 6 Mar 2003, Martin J. Bligh wrote:

> Difficult to see how this would work. For instance, is bash interactive
> or a batch job?

Right, being able to control this interactivity knob programmatically
seems like a useful thing. That way, the window manager can boost the
interactivity of the foreground window for example. It does seem that
figuring out that something is interactive in the scheduler is tough,
there is just not enough information, whereas a higher layer may know
this for a fact. I guess this reduces my argument to just keeping the
interactivity setting separate from priority.

--
Dimi.

2003-03-06 22:45:24

by Robert Love

[permalink] [raw]
Subject: Re: [patch] "HT scheduler", sched-2.5.63-B3

On Thu, 2003-03-06 at 17:35, Martin Waitz wrote:

> processes tend to max out at one extreme or the other
>
> processes that get stalled by a huge overall load of the machine
> are forced to sleep, too; yet they are not interactive.

Not running != Sleeping

> priority should be based on things the processes did, not on what
> they haven't done.

It _is_ based on something the process has done: sleeping in the kernel

> but for interactive tasks, latency is all-important
> if the task can't provide a result in a short timeslice it already
> failed to provide low latency and should not be considered interactive
> any more.

Ahh, but you are confusing timeslice with the time a process runs for.

This is the flaw I pointed out in my initial email :)

A process may have a 100ms timeslice, but only run for 1ms at a time
(100x before recalculating its quanta). This is the intention of giving
large timeslices to interactive tasks: not that they need all 100ms at
_once_ but they may need some of it soon, and when they need it they
really NEED it, so make sure they have enough timeslice such that there
is plenty available when they wake up (since the latency is important,
as you said).

> but the time slice should not be large enough to stall other
> processes, which is extremely important for interactivity

Once a process stalls other processes (i.e. by running a long time i.e.
by being a CPU hog) then it loses its interactive bonus.

The heuristic works. No one doubts that, modulo the corner cases we are
working on fixing. I suggest you read the code.

Robert Love

2003-03-06 22:54:56

by Robert Love

[permalink] [raw]
Subject: Re: [patch] "HT scheduler", sched-2.5.63-B3

On Thu, 2003-03-06 at 17:31, Dimitrie O. Paun wrote:

> Right, being able to control this interactivity knob programmatically
> seems like a useful thing. That way, the window manager can boost the
> interactivity of the foreground window for example. It does seem that
> figuring out that something is interactive in the scheduler is tough,
> there is just not enough information, whereas a higher layer may know
> this for a fact. I guess this reduces my argument to just keeping the
> interactivity setting separate from priority.

No no no. Martin's point shows exactly that nothing but the kernel can
ever know whether a task is I/O or CPU bound. What is bash? Is it
interactive (when you are typing into it) or CPU bound (when it's running
a script or doing other junk)?

Only the kernel knows exactly the sleep patterns of tasks, which is
essentially whether or not a task is interactive.

Finally, the window manager idea is bad. The foreground window may
have dependencies elsewhere, and giving it a boost only partially solves
the problem.

I think with Linus's patch, the problem is solved, because we boost both
I/O-bound tasks and tasks that help I/O bound tasks.

Robert Love

2003-03-06 23:07:58

by Robert Love

[permalink] [raw]
Subject: Re: [patch] "HT scheduler", sched-2.5.63-B3

On Thu, 2003-03-06 at 12:52, [email protected] wrote:

> P.S. IMVHO the xine problem is completely different, as it has nothing to do
> with interactivity but with the fact that it is soft
> real-time. i.e. you need to distinguish xine from say a gimp filter or
> a 3D renderer with incremental live updates of the scene it is
> creating.

Xine is the same as X or vi or xmms, for this problem.

They are all, in fact, soft real-time issues where the latency limit we
want is whatever is the threshold of human perception. This limit may
be more or less for DVD playback vs. typing vs. music playback... but
ultimately there is some latency threshold you want maintained. If Xine
is starved too long, your video skips. If vi is starved too long, you
can perceive a lag in your keystrokes showing up. It is all the same.

Maybe we do not care about the latency of the gimp filter. Those are
indeed different. But the interactivity of other things - say, gimp's
menus responding to your clicks - is all the same sort of thing.

Robert Love

2003-03-06 23:25:22

by Robert Love

[permalink] [raw]
Subject: Re: [patch] "HT scheduler", sched-2.5.63-B3

On Thu, 2003-03-06 at 18:27, Martin Waitz wrote:

> schedule() does prev->sleep_timestamp = jiffies; just before
> deactivating prev.
> so i guess inactivity is counted towards sleep_avg, too

That is just the initial value. See activate_task(), which actually sets
the sleep_avg value. If the task is never woken up, sleep_timestamp is
never used and sleep_avg is not touched.

sleep_avg is the important value.

sleep_timestamp is misnamed; it's really just the jiffies value at
which the task last ran. Ingo renamed it "last_run" in his latest
patch.
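
For reference, the relevant (pre-patch) part of activate_task() looks
roughly like this, trimmed down from kernel/sched.c:

static inline void activate_task(task_t *p, runqueue_t *rq)
{
	/* how long the task was off the runqueue, in jiffies */
	unsigned long sleep_time = jiffies - p->sleep_timestamp;
	prio_array_t *array = rq->active;

	if (!rt_task(p) && sleep_time) {
		/* the longer it was away, the bigger the bonus... */
		p->sleep_avg += sleep_time;
		if (p->sleep_avg > MAX_SLEEP_AVG)
			p->sleep_avg = MAX_SLEEP_AVG;
		/* ...and the better the dynamic priority */
		p->prio = effective_prio(p);
	}
	enqueue_task(p, array);
	nr_running_inc(rq);
}

So sleep_timestamp/last_run only matters once the task is actually woken up
and re-activated.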

> but most of the time, not _one_ process is woken up, but several at once
>
> if it happens that the first one that gets to run is cpu-bound,
> then all the other interactive processes have to wait a long time, even
> if they would only need 1ms to finish their work.

Interactive tasks also have a higher dynamic priority. So they will be
the ones to run.
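
(The dynamic priority comes from effective_prio(), which is roughly the
following - paraphrased from kernel/sched.c, details trimmed:)

static inline int effective_prio(task_t *p)
{
	int bonus, prio;

	/* map sleep_avg into a bonus of roughly -5..+5 priority levels */
	bonus = MAX_USER_PRIO*PRIO_BONUS_RATIO*p->sleep_avg/MAX_SLEEP_AVG/100 -
		MAX_USER_PRIO*PRIO_BONUS_RATIO/100/2;

	prio = p->static_prio - bonus;
	if (prio < MAX_RT_PRIO)
		prio = MAX_RT_PRIO;
	if (prio > MAX_PRIO-1)
		prio = MAX_PRIO-1;
	return prio;
}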

> scheduling overhead was successfully brought down to a minimum
> thanks to the great work of a lot of people.
> i think we should use that work to improve latency by reducing
> the available timeslice for interactive processes.
>
> if the process is still considered interactive after the time slice has run
> out, nothing is lost; it simply gets another one.
>
> but the kernel should get the chance to frequently reschedule
> when interactivity is needed.

I understand your point, but we get that now without using super small
timeslices.

Giving interactive tasks a larger timeslice ensures they can always run
when they need to. It also lets us set an upper bound and not have to
recalculate timeslices constantly.

If an interactive task _does_ use all its timeslice, then it is no
longer interactive and that will be noted and it will lose its bonus.
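
The expiry path in scheduler_tick() is roughly the following (paraphrased
from kernel/sched.c; exact helper names may differ slightly in 2.5.63):

	if (!--p->time_slice) {
		dequeue_task(p, rq->active);
		set_tsk_need_resched(p);
		/* recompute dynamic priority, hand out a fresh timeslice */
		p->prio = effective_prio(p);
		p->time_slice = task_timeslice(p);

		if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) {
			if (!rq->expired_timestamp)
				rq->expired_timestamp = jiffies;
			enqueue_task(p, rq->expired);
		} else
			/* still interactive: straight back onto the active array */
			enqueue_task(p, rq->active);
	}

A task that still passes the TASK_INTERACTIVE() test goes straight back onto
the active array with a new slice; one that does not (or one that would
starve the expired array for too long) is moved to the expired array and has
to wait for the array switch.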

Robert Love

2003-03-06 23:17:30

by Martin Waitz

[permalink] [raw]
Subject: Re: [patch] "HT scheduler", sched-2.5.63-B3

hi :)

On Thu, Mar 06, 2003 at 05:56:06PM -0500, Robert Love wrote:
> On Thu, 2003-03-06 at 17:35, Martin Waitz wrote:
>
> > processes tend to max out at one extreme or the other
> >
> > processes that get stalled by a huge overall load of the machine
> > are forced to sleep, too; yet they are not interactive.
>
> Not running != Sleeping

schedule() does prev->sleep_timestamp = jiffies; just before
deactivating prev.
so i guess inactivity is counted towards sleep_avg, too

> A process may have a 100ms timeslice, but only run for 1ms at a time
> (100x before recalculating its quanta).
but it _can_ use the 100ms at a time if it is cpu-bound
what's so bad about recalculating the quantum?

> This is the intention of giving large timeslices to interactive tasks:
> not that they need all 100ms at _once_ but they may need some of it
> soon, and when they need it they really NEED it, so make sure they
> have enough timeslice such that there is plenty available when they
> wake up (since the latency is important, as you said).
but most of the time, not _one_ process is woken up, but several at once

if it happens that the first one that gets to run is cpu-bound,
then all the other interactive processes have to wait a long time, even
if they would only need 1ms to finish their work.

scheduling overhead was successfully brought down to a minimum
thanks to the great work of a lot of people.
i think we should use that work to improve latency by reducing
the available timeslice for interactive processes.

if the process is still considered interactive after the time slice has run
out, nothing is lost; it simply gets another one.

but the kernel should get the chance to frequently reschedule
when interactivity is needed.

> Once a process stalls other processes (i.e. by running a long time i.e.
> by being a CPU hog) then it loses its interactive bonus.
but it takes too long. 100ms is noticeable

> I suggest you read the code.
i've read it ;)

--
CU, / Friedrich-Alexander University Erlangen, Germany
Martin Waitz // [Tali on IRCnet] [tali.home.pages.de]
(this is a manually generated mail; it contains typos and is
valid even without capital letters.)
-
Those who are willing to give up fundamental freedoms to gain
short-term security deserve neither freedom nor security.
Benjamin Franklin (1706 - 1790)



2003-03-06 23:40:37

by Martin J. Bligh

[permalink] [raw]
Subject: Re: [patch] "HT scheduler", sched-2.5.63-B3

> At first I strongly dissaproved of the kernel's timeslice adjustment by
> interactivity estimation; policy belongs in userland, I thought. But the

That seems to be one of the most dramatically overused, misguided statements
bandied about in Linux at the moment, with respect to just about every
subsystem, and it's really starting to get annoying.

Yes, you should be able to frig with policy decisions from userspace, if
you really want to, BUT the kernel should do something pretty sane by
default without user intervention. It's exactly the same mistake apps
keep making, and why half of them are unusable.

M.

2003-03-06 23:32:31

by Martin J. Bligh

[permalink] [raw]
Subject: Re: [patch] "HT scheduler", sched-2.5.63-B3

>> Right, being able to control this interactivity knob programmatically
>> seems like a useful thing. That way, the window manager can boost the
>> interactivity of the foreground window for example. It does seem that
>> figuring out that something is interactive in the scheduler is tough,
>> there is just not enough information, whereas a higher layer may know
>> this for a fact. I guess this reduces my argument to just keeping the
>> interactivity setting separate from priority.
>
> No no no. Martin's point shows exactly that nothing but the kernel can
> ever know whether a task is I/O or CPU bound. What is bash? Is it
> interactive (when you are typing into it) or CPU bound (when its running
> a script or doing other junk)?
>
> Only the kernel knows exactly the sleep patterns of tasks, which is
> essentially whether or not a task is interactive.

Exactly ... all this tweaking and setting up of every app individually is bad.
It should "just frigging work" ;-) We seem to be pretty close to that
at the moment - 2.5 feels *so* much better than 2.4 already (2.4 degenerates
into a total slug overnight, presumably when things like man page reindexes
thrash the page cache).

The fact that the Debian renice of the X server actually breaks things is
probably good news ... we're actually paying real attention to the nice
value ;-)

M.

2003-03-06 23:55:47

by Linus Torvalds

[permalink] [raw]
Subject: Re: [patch] "HT scheduler", sched-2.5.63-B3


On Thu, 6 Mar 2003, Martin J. Bligh wrote:
>
> Yes, you should be able to frig with policy decisions from userspace, if
> you really want to BUT the kernel should do something pretty sane by
> default without user intervention.

Amen, brother!

One thing that I've often disliked about "tuning parameters" is that many
people take them to be valid excuses for bad behaviour under some load.
For example, when some benchmark does badly, or something stutters, you'll
always find somebody pointing out that it can be fixed by doing a

echo 55 > /proc/sys/mm/dirty_ratio

or similar. And the fact that you can tune the thing for certain loads
somehow makes the bug ok..

IT AIN'T SO! Tuning constants are wonderfully useful for developers tuning
their guesses a bit, and to give more information about what knobs make a
difference.

In other words, it's ok for a developer to say: "Ok, what happens if you
do the above echo?" to figure out whether maybe the problem is due to
excessive spurgel-production in the VM frobnicator, but it is NOT OK to
say "you can tweak it for your use, so don't complain about the bad
behaviour".

This is the same thing as "'nice -10' the X-server". Something is wrong,
and we couldn't fix it, so here's the band-aid to avoid that problem for
that particular case. It's acceptable as a band-aid, but if you don't
realize that it's indicative of a problem, then you're just kidding
yourself.

Can we do perfectly all the time? No. Sometimes best performance _will_
require tuning for the load. But it should be seen as a failure of the
generic algorithm, not as a success.

Btw, "success" is often "being good enough". We shouldn't _suck_, but you
should always remember the old "perfect is the enemy of good" thing.
Trying to get perfect behaviour under some circumstances often means that
you suck under others, and then the right answer is usually not to try to
be so damn perfect, but "just being good".

Linus

2003-03-06 23:50:40

by Daniel Phillips

[permalink] [raw]
Subject: Re: [patch] "HT scheduler", sched-2.5.63-B3

On Thu 06 Mar 03 18:55, Linus Torvalds wrote:
> The thing about wakeups is that it's an "illogical" place to give the
> bonus ("it's too late to give the server a bonus _now_, I wanted it when I
> went to sleep"), but it's the _trivial_ place to give it.
>
> It also (I'm convinced) is actually in the end exactly equivalent to
> giving the bonus at sleep time - in the steady state picture.

And anyway, on the first sleep there was no chance to measure the sleep_time,
so there was no bonus to give.

Regards,

Daniel

2003-03-07 00:28:31

by Dimitrie O. Paun

[permalink] [raw]
Subject: Re: [patch] "HT scheduler", sched-2.5.63-B3

On March 6, 2003 07:02 pm, Linus Torvalds wrote:
> Btw, "success" is often "being good enough". We shouldn't _suck_, but you
> should always remember the old "perfect is the enemy of good" thing.
> Trying to get perfect behaviour under some circumstances often means that
> you suck under others, and then the right answer is usually not to try to
> be so damn perfect, but "just being good".

Very much so! I cannot agree more with you that the lack of a good default
policy is stupid -- this is why X was in such a bad state for years!
I could write pages on this subject, but I'm glad other people share my
ideas on this matter :)

That being said, I have nothing against an automatic algorithm. If we can
get that one to do what we want, there's obviously no need for knobs. All
I'm saying is that *if* there are cases where we need help, we should not
have to tweak the priority of a process, but create a different knob.
Priority is a different beast from interactivity. For example, it's clear
that good interactive behaviour _requires_ information only available in
the kernel. Which makes the interactivity knob (if required at all) more
of a hint than a set-in-stone thing like priority.

But I guess all this is academic. If the automatic stuff just works,
we're better off anyway.

--
Dimi.

2003-03-07 03:11:06

by Horst H. von Brand

[permalink] [raw]
Subject: Re: [patch] "HT scheduler", sched-2.5.63-B3

Linus Torvalds <[email protected]> said:
> On Thu, 6 Mar 2003, Ingo Molnar wrote:
> > the whole compilation (gcc tasks) will be rated 'interactive' as well,
> > because an 'interactive' make process and/or shell process is waiting on
> > it.
>
> No. The make that is waiting for it will be woken up _once_ - when the
> thing dies. Marking it interactive at that point is absolutely fine.

It is woken up each time one Makefile line has been processed, to call the
next one.
--
Dr. Horst H. von Brand User #22616 counter.li.org
Departamento de Informatica Fono: +56 32 654431
Universidad Tecnica Federico Santa Maria +56 32 654239
Casilla 110-V, Valparaiso, Chile Fax: +56 32 797513

2003-03-07 05:42:20

by Ingo Molnar

[permalink] [raw]
Subject: Re: [patch] "HT scheduler", sched-2.5.63-B3


On Thu, 6 Mar 2003, Andrew Morton wrote:

> This improves the X interactivity tremendously. I went back to 2.5.64
> base just to verify, and the difference was very noticeable.
>
> The test involved doing the big kernel compile while moving large xterm,
> mozilla and sylpheed windows about. With this patch the mouse cursor
> was sometimes a little jerky (0.1 seconds, perhaps) and mozilla redraws
> were maybe 0.5 seconds laggy.
>
> So. A big thumbs up on that one. It appears to be approximately as
> successful as sched-2.5.64-a5.
>
> Ingo's combo patch is better still - that is sched-2.5.64-a5 and your
> patch combined (yes?). The slight mouse jerkiness is gone and even when
> doing really silly things I cannot make it misbehave at all. I'd
> handwavingly describe both your patch and sched-2.5.64-a5 as 80%
> solutions, and the combo 95%.

great! I.e. both approaches go in the same (good) direction, and the fact
that the combined effect is even better than either patch alone (although
it's hard to improve something that is already 'good') shows that there's no
conflict between the two concepts.

> So I'm a happy camper, and will be using Ingo's combo patch. But I do
> not use XMMS and xine and things like that - they may be running like
> crap with these patches. I do not know, and I do not have a base to
> compare against even if I could work out how to get them going.

these always used to be problematic to a certain degree, so they are
next on the list.

Ingo

2003-03-07 05:47:42

by Ingo Molnar

[permalink] [raw]
Subject: Re: [patch] "HT scheduler", sched-2.5.63-B3


On Thu, 6 Mar 2003, Linus Torvalds wrote:

> The multi-second "freezes" are the thing that bothered me, and those
> were definitely due to the fact that X was competing as a
> _non_interactive member against other non-interactive members, causing
> it to still get 10% of the CPU, but only every few seconds. So you'd get
> a very bursty behaviour with very visible pauses.

this is only part of what happens in the 'X freeze' case. A CPU hog
competing against another CPU hog can never result in a multi-second
freeze, unless the system is hopelessly overloaded.

What happens is that the CPU-hog estimation is off for compile jobs, and
that _they_ end up being partly interactive, and starve X, which does
happen to fall into the CPU-hog category briefly. The 10-15 second
timeout is the starvation limit kicking in.
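
(the limit in question is the EXPIRED_STARVING() check in kernel/sched.c;
with the stock STARVATION_LIMIT of 2*HZ it is roughly:)

#define EXPIRED_STARVING(rq) \
		((rq)->expired_timestamp && \
		(jiffies - (rq)->expired_timestamp >= \
			STARVATION_LIMIT * ((rq)->nr_running) + 1))

ie. with 2 seconds per runnable task, a handful of runnable tasks works out
to just about the 10-15 seconds range.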

so the correct approach is both to make X more interactive (your patch),
_and_ to make the compilation jobs less interactive (my patch). This
explains why Andrew saw roughly similar interactivity with your patch and
my patch applied separately, but the best result when the combo patch was
applied. Agreed?

Ingo

2003-03-07 05:52:07

by Shawn

[permalink] [raw]
Subject: Re: [patch] "HT scheduler", sched-2.5.63-B3

On Thu, 2003-03-06 at 23:57, Ingo Molnar wrote:
> so the correct approach is both to make X more interactive (your patch),
> _and_ to make the compilation jobs less interactive (my patch). This
> explains why Andrew saw roughly similar interactivity with your patch and
> my patch applied separately, but the best result when the combo patch was
> applied. Agreed?

Even better, can you point me to where this "combo" patch can be found?

2003-03-07 05:48:27

by Shawn

[permalink] [raw]
Subject: Re: [patch] "HT scheduler", sched-2.5.63-B3

On Thu, 2003-03-06 at 23:52, Ingo Molnar wrote:
> On Thu, 6 Mar 2003, Andrew Morton wrote:
>
> > This improves the X interactivity tremendously. I went back to 2.5.64
> > base just to verify, and the difference was very noticeable.
> >
> > The test involved doing the big kernel compile while moving large xterm,
> > mozilla and sylpheed windows about. With this patch the mouse cursor
> > was sometimes a little jerky (0.1 seconds, perhaps) and mozilla redraws
> > were maybe 0.5 seconds laggy.
> >
> > So. A big thumbs up on that one. It appears to be approximately as
> > successful as sched-2.5.64-a5.

I'm sorry, I'm an idiot. Where's sched-2.5.64-a5 available for download?
Latest I found was sched-2.5.63-B3 inside Andrew's patchset.

I'm kinda drooling over this thread.

2003-03-07 05:51:15

by Ingo Molnar

[permalink] [raw]
Subject: Re: [patch] "HT scheduler", sched-2.5.63-B3


On 6 Mar 2003, Shawn wrote:

> > > So. A big thumbs up on that one. It appears to be approximately as
> > > successful as sched-2.5.64-a5.
>
> I'm sorry, I'm an idiot. Where's sched-2.5.64-a5 available for download?
> Latest I found was sched-2.5.63-B3 inside Andrew's patchset.

-A5 was the last update of my patch, which I sent before the 'combo' patch;
it was attached to the end of the mail, with the subject line:

Subject: [patch] "interactivity changes", sched-2.5.64-A5

Ingo

2003-03-07 05:53:50

by Ingo Molnar

[permalink] [raw]
Subject: Re: [patch] "HT scheduler", sched-2.5.63-B3


On 6 Mar 2003, Shawn wrote:

> > so the correct approach is both to make X more interactive (your patch),
> > _and_ to make the compilation jobs less interactive (my patch). This
> > explains why Andrew saw roughly similar interactivity with your patch and
> > my patch applied separately, but the best result when the combo patch was
> > applied. Agreed?
>
> Even better, can you point me to where this "combo" patch can be found?

in the same thread, a few mails later, sent by me. But better yet, try the
latest BK kernel, it has the 'combo' patch merged.

Ingo

2003-03-07 05:57:18

by Andrew Morton

[permalink] [raw]
Subject: Re: [patch] "HT scheduler", sched-2.5.63-B3

Shawn <[email protected]> wrote:
>
> On Thu, 2003-03-06 at 23:52, Ingo Molnar wrote:
> > On Thu, 6 Mar 2003, Andrew Morton wrote:
> >
> > > This improves the X interactivity tremendously. I went back to 2.5.64
> > > base just to verify, and the difference was very noticeable.
> > >
> > > The test involved doing the big kernel compile while moving large xterm,
> > > mozilla and sylpheed windows about. With this patch the mouse cursor
> > > was sometimes a little jerky (0.1 seconds, perhaps) and mozilla redraws
> > > were maybe 0.5 seconds laggy.
> > >
> > > So. A big thumbs up on that one. It appears to be approximately as
> > > successful as sched-2.5.64-a5.
>
> I'm sorry, I'm an idiot. Where's sched-2.5.64-a5 available for download?
> Latest I found was sched-2.5.63-B3 inside Andrew's patchset.

It all got merged.

http://www.kernel.org/pub/linux/kernel/v2.5/testing/cset/cset-1.1068.1.17-to-1.1107.txt.gz

2003-03-07 06:05:38

by Ingo Molnar

[permalink] [raw]
Subject: Re: [patch] "HT scheduler", sched-2.5.63-B3


On Thu, 6 Mar 2003, Andrew Morton wrote:

> So I'm a happy camper, and will be using Ingo's combo patch. But I do
> not use XMMS and xine and things like that - they may be running like
> crap with these patches. [...]

one thing the -A5 patch does is decrease the default timeslice
value from 150 msecs to 100 msecs. While increasing the timeslice seemed
like a good idea originally (and even with -A5 it's still higher than in
2.4), 150 msecs was too much: it ended up causing just that little bit
of extra latency to audio apps that made them more likely to skip. So audio
apps should perform a bit better now. I suspect certain gaming-related
problems are related to the overly long timeslice as well.

one of the problems with XMMS is that it's often not just an audio app -
it has plugins with fancy graphics, which take up much more CPU time than
the audio decoding/feeding itself, and it's not as threaded as eg. xine.
It was _this_ interaction that was the root of many XMMS complaints - not
the fact that XMMS does audio.

i don't think the xine situation will change with these patches: xine
simply uses up 90% of CPU time soft-playing a DVD from the DVD player, on
a 500 MHz x86 CPU, and that makes it qualify as a CPU hog no matter what.
Furthermore, it uses the Xv extension, which makes it communicate much less
with X itself. But xine's problems are not due to interactive tasks; xine
is hurting due to other CPU hogs (stuff that triggers regularly on
desktops) taking away timeslices. I believe we should still enable
application programmers to give certain apps _some_ minor priority boost,
so that other CPU hogs cannot starve xine. The fact that xine was playing
back perfectly with a +2 boost shows that this could be quite a powerful
tool.

But indeed this all needs to be re-checked with the latest scheduler.

Ingo

2003-03-07 06:35:30

by Aaron Lehmann

[permalink] [raw]
Subject: Re: [patch] "HT scheduler", sched-2.5.63-B3

On Thu, Mar 06, 2003 at 09:42:03AM -0800, Linus Torvalds wrote:
> But it was definitely there. 3-5 second _pauses_. Not slowdowns.

I can second this. Using Linux 2.5.5x, untarring a file while
compiling could cause X to freeze for several seconds at a time.
I haven't seen this problem recently, though I do experience my share
of XMMS skips.

2003-03-07 06:40:43

by Ingo Molnar

[permalink] [raw]
Subject: Re: [patch] "HT scheduler", sched-2.5.63-B3


On Thu, 6 Mar 2003, Aaron Lehmann wrote:

> > But it was definitely there. 3-5 second _pauses_. Not slowdowns.
>
> I can second this. Using Linux 2.5.5x, untarring a file while
> compiling could cause X to freeze for several seconds at a time.
> I haven't seen this problem recently, [...]

i believe this is rather due to IO scheduling / VM throttling. Andrew
added some nice improvements lately, so this should really not happen with
2.5.64 kernels.

> [...] though I do experience my share of XMMS skips.

okay, could you please test BK-curr, or 2.5.64+combo-patch? Do the skips
still persist? Did they get worse perhaps? I guess it might take a few
days of music listening while doing normal desktop activity, to get a good
feel of it though.

Ingo

2003-03-07 06:49:40

by Aaron Lehmann

[permalink] [raw]
Subject: Re: [patch] "HT scheduler", sched-2.5.63-B3

On Fri, Mar 07, 2003 at 07:50:59AM +0100, Ingo Molnar wrote:
> > [...] though I do experience my share of XMMS skips.
>
> okay, could you please test BK-curr, or 2.5.64+combo-patch? Do the skips
> still persist? Did they get worse perhaps? I guess it might take a few
> days of music listening while doing normal desktop activity, to get a good
> feel of it though.

I was able to reproduce them by selecting text in Mathematica (ugh,
not a very helpful example). The skips were shorter and about three
times as hard to trigger as on 2.5.63.

2003-03-07 06:49:54

by Andrew Morton

[permalink] [raw]
Subject: Re: [patch] "HT scheduler", sched-2.5.63-B3

Aaron Lehmann <[email protected]> wrote:
>
> On Thu, Mar 06, 2003 at 09:42:03AM -0800, Linus Torvalds wrote:
> > But it was definitely there. 3-5 second _pauses_. Not slowdowns.
>
> I can second this. Using Linux 2.5.5x, untarring a file while
> compiling could cause X to freeze for several seconds at a time.

This may be a VM/block/IO scheduler thing. If the X server is trying to page
some text in and the disk is busy writing some stuff out then yes, the X
server can freeze for some time.

Of course, a ton of work has gone into this problem, and more work continues
to be done.

If your machine has a small amount of memory, or is heavily overcommitted
then it is more likely to happen.

Using the CFQ or AS IO schedulers will help a bit. And decreasing
/proc/sys/vm/swappiness may help too.

I'd be interested in feedback on the latter. The default policy of 60% seems
to be about right for the 128M to 256M desktops, but I am running 85% on
larger memory machines so that a bit more stuff gets paged out.


2003-03-07 07:25:59

by Ingo Molnar

[permalink] [raw]
Subject: Re: [patch] "HT scheduler", sched-2.5.63-B3


On Thu, 6 Mar 2003, Aaron Lehmann wrote:

> > okay, could you please test BK-curr, or 2.5.64+combo-patch? Do the skips
> > still persist? Did they get worse perhaps? I guess it might take a few
> > days of music listening while doing normal desktop activity, to get a good
> > feel of it though.
>
> I was able to reproduce them by selecting text in Mathematica (ugh, not
> a very helpful example). The skips were shorter and about three times as
> hard to trigger as on 2.5.63.

okay, just as a data point, could you try to renice the player
process/thread to -2? Does it make the skipping harder to trigger?
How about -5, or -10?

Ingo

2003-03-07 07:35:21

by Ingo Molnar

[permalink] [raw]
Subject: Re: [patch] "HT scheduler", sched-2.5.63-B3


On Fri, 7 Mar 2003, Mike Galbraith wrote:

> Weeeeell, FWIW my box (p3-500/128mb/rage128) disagrees.

UP box, right?

> I can still barely control the box when a make -j5 bzImage is running
> under X/KDE in one terminal and a vmstat (SCHED_RR) in another. I'm not
> swapping, though a bit of idle junk does page out. IOW, I think I'm
> seeing serious cpu starvation.

which precise scheduler are you using, BK-curr? And to see whether the
problem is related to scheduling at all, could you try to renice X to -10?
[this is not a solution, this is just a test to see whether the problem is
scheduling related.]

is there any way we could exclude/isolate the VM as the source of
interactivity problems? Eg. any chance to get more RAM into that box?

Ingo

2003-03-07 07:30:24

by Mike Galbraith

[permalink] [raw]
Subject: Re: [patch] "HT scheduler", sched-2.5.63-B3

At 12:42 PM 3/6/2003 -0800, Andrew Morton wrote:
>Ingo's combo patch is better still - that is sched-2.5.64-a5 and your patch
>combined (yes?). The slight mouse jerkiness is gone and even when doing
>really silly things I cannot make it misbehave at all. I'd handwavingly
>describe both your patch and sched-2.5.64-a5 as 80% solutions, and the combo
>95%.

Weeeeell, FWIW my box (p3-500/128mb/rage128) disagrees.

I can still barely control the box when a make -j5 bzImage is running under
X/KDE in one terminal and a vmstat (SCHED_RR) in another. I'm not swapping,
though a bit of idle junk does page out. IOW, I think I'm seeing serious
cpu starvation.

For reference, I used to be able to easily control the same box under X
while swapping moderately at make -j15 (during page aging vm days), and
with a heavily twiddled VM at even -j30 (_way_ overloaded with X oinkers
running). At -j5, gui/icky was quite usable.

With the make -j30 bzImage, heavy load which does _hit_ VM saturation but
not constantly, I see what I can only describe as starvation. vmstat
output for this load is definitely abby-normal, and there's no way
/proc/vmstat should look like the attached. It takes forever to build
enough vm pressure that swap is touched (bad bad bad), and the volume
reached is way low. My hungry little cc1's are not sharing resources the
way they should be (methinks).

With earlier versions of Ingo's patch, I was only able to get memory
pressure up to where it should be by drastically reducing timeslice (at the
expense of turning throughput to crud). Take this with a giant economy
sized grain of salt, but what I _think_ (no, I haven't instrumented it) is
happening is that chunks of the total job are being rescheduled in front of
the rest of the job such that they alloc/use/exit before the other guys can
alloc. (geez that's a nebulous WAG;)

-Mike (please torch me if I'm being stupid;)


Attachments:
data.txt (1.25 kB)

2003-03-07 07:54:26

by Mike Galbraith

[permalink] [raw]
Subject: Re: [patch] "HT scheduler", sched-2.5.63-B3

At 08:45 AM 3/7/2003 +0100, Ingo Molnar wrote:

>On Fri, 7 Mar 2003, Mike Galbraith wrote:
>
> > Weeeeell, FWIW my box (p3-500/128mb/rage128) disagrees.
>
>UP box, right?

Yes, p3/500 w. 128mb ram and rage128.

> > I can still barely control the box when a make -j5 bzImage is running
> > under X/KDE in one terminal and a vmstat (SCHED_RR) in another. I'm not
> > swapping, though a bit of idle junk does page out. IOW, I think I'm
> > seeing serious cpu starvation.
>
>which precise scheduler are you using, BK-curr? And to see whether the
>problem is related to scheduling at all, could you try to renice X to -10?
>[this is not a solution, this is just a test to see the problem is
>scheduling related.]

I plugged your combo patch into virgin .64.

>is there any way we could exclude/isolate the VM as the source of
>interactivity problems? Eg. any chance to get more RAM into that box?

I can (could with your earlier patches anyway) eliminate the X stalls by
setting X junk to SCHED_FIFO. I don't have ram to plug in, but I'm as
certain as I can be without actually doing so that it's not ram shortage.

Best would be for other testers to run some tests. With the make -j30
weirdness, I _suspect_ that other oddities (hmm... multi-client db load...
query service time) will show.

-Mike

2003-03-07 07:59:31

by Andrew Morton

[permalink] [raw]
Subject: Re: [patch] "HT scheduler", sched-2.5.63-B3

Mike Galbraith <[email protected]> wrote:
>
> ...
> Best would be for other testers to run some tests. With the make -j30
> weirdness, I _suspect_ that other oddities (hmm... multi-client db load...
> query service time) will show.
>

Yes, this is the second surprise interaction between the CPU scheduler
and the IO system.

Perhaps. In your case it seems that you're simply unable to generate
the amount of concurrency which you used to. Which would make it purely
a CPU scheduler thing.

It is not necessarily a bad thing (unless your total runtime has increased?)
but we need to understand what has happened.

What filesystem are you using?

2003-03-07 08:04:54

by Xavier Bestel

[permalink] [raw]
Subject: Re: [patch] "HT scheduler", sched-2.5.63-B3

On Thu 06/03/2003 at 19:27, Ingo Molnar wrote:
> On Thu, 6 Mar 2003 [email protected] wrote:
>
> > P.S. IMVHO the xine problem is completely different, as it has nothing to do
> > with interactivity but with the fact that it is soft real-time. i.e. you
> > need to distinguish xine from say a gimp filter or a 3D renderer with
> > incremental live updates of the scene it is creating.
>
> it is the same category of problems: xine and X are both applications,
> which, if lagged, are noticed by users.

Actually I don't think so:
- X and games need hard interactivity: they have to compute fast a
response to an input.
- xine can precompute a few frames if it wants (I dunno if it does) and
just needs a precise timing to display them and sync the audio buffers.
It could do with CPU slices if X does the right 'display this frame at
this time' thing.

Xav

2003-03-07 08:05:16

by Ingo Molnar

[permalink] [raw]
Subject: Re: [patch] "HT scheduler", sched-2.5.63-B3


On Fri, 7 Mar 2003, Mike Galbraith wrote:

> I can (could with your earlier patches anyway) eliminate the X stalls by
> setting X junk to SCHED_FIFO. I don't have ram to plug in, but I'm as
> certain as I can be without actually doing so that it's not ram
> shortage.

okay. Can you eliminate the X stalls by setting X's priority to -10 or so
(or to SCHED_FIFO - although SCHED_FIFO is much more dangerous)? And how does
interactivity under the same load look with vanilla .64, compared to
.64+combo? The first step is to ensure that the new changes did not
actually hurt interactivity.

Ingo

2003-03-07 08:10:49

by Mike Galbraith

[permalink] [raw]
Subject: Re: [patch] "HT scheduler", sched-2.5.63-B3

At 12:10 AM 3/7/2003 -0800, Andrew Morton wrote:
>Mike Galbraith <[email protected]> wrote:
> >
> > ...
> > Best would be for other testers to run some tests. With the make -j30
> > weirdness, I _suspect_ that other oddities (hmm... multi-client db load...
> > query service time) will show.
> >
>
>Yes, this is the second surprise interaction between the CPU scheduler
>and the IO system.
>
>Perhaps. In your case it seems that you're simply unable to generate
>the amount of concurrency which you used to. Which would make it purely
>a CPU scheduler thing.

Yes, of this I'm sure.

>It is not necessarily a bad thing (unless your total runtime has increased?)
>but we need to understand what has happened.

Total system throughput is fine, and as expected, the only difference is
the modest overhead of paging heftily or lightly and/or not at all. The
realtime throughput difference between kernels is ~10 seconds... very
definitely a concurrency issue imvho. And I agree 100% that this is not
_necessarily_ a bad thing. The time it takes for _any_ pressure to build
looks decidedly bad though. With the combo patch it does look better than
earlier patches... sudden bursts of paging do not occur; it's
smoother once it starts acting something resembling normal.

>What filesystem are you using?

The build is running in a small ext2 partition.

-Mike

2003-03-07 08:18:57

by Mike Galbraith

[permalink] [raw]
Subject: Re: [patch] "HT scheduler", sched-2.5.63-B3

At 09:15 AM 3/7/2003 +0100, Ingo Molnar wrote:

>On Fri, 7 Mar 2003, Mike Galbraith wrote:
>
> > I can (could with your earlier patches anyway) eliminate the X stalls by
> > setting X junk to SCHED_FIFO. I don't have ram to plug in, but I'm as
> > certain as I can be without actually doing so that it's not ram
> > shortage.
>
>okay. Can you eliminate the X stalls with setting X priority to -10 or so
>(or SCHED_FIFO - although SCHED_FIFO is much more dangerous). And how does
>interactivity under the same load look like with vanilla .64, as compared
>to .64+combo? First step is to ensure that the new changes did not
>actually hurt interactivity.

Oh, it's definitely _much_ better than .virgin. I didn't mean to imply
that damage was done ;-) With .virgin, I have a _heck_ of a time grabbing
the build window to kill it. With combo, rodent jerkiness makes it hard to
find/grab, and multisecond stalls are still there, but much improved.

-Mike

2003-03-07 08:21:12

by Ingo Molnar

[permalink] [raw]
Subject: Re: [patch] "HT scheduler", sched-2.5.63-B3


On Fri, 7 Mar 2003, Mike Galbraith wrote:

> Oh, it's definitely _much_ better than .virgin. I didn't mean to imply
> that damage was done ;-) With .virgin, I have a _heck_ of a time
> grabbing the build window to kill it. With combo, rodent jerkiness
> makes it hard to find/grab, and multisecond stalls are still there, but
> much improved.

ok! [breathing lighter].

could you still try the renice -10 thing with the combo patch, to see how
much of this thing is X actually not getting enough timeslices, vs. other
effects (such as X applications not getting enough timeslices)?

Ingo

2003-03-07 09:19:50

by Ingo Molnar

[permalink] [raw]
Subject: Re: [patch] "HT scheduler", sched-2.5.63-B3


ok, forget the -B0 patch, it's broken - we cannot change the current
task's priority without unqueueing it first. -B1 will have the right code.

Ingo

2003-03-07 09:23:41

by Mike Galbraith

[permalink] [raw]
Subject: Re: [patch] "HT scheduler", sched-2.5.63-B3

At 10:03 AM 3/7/2003 +0100, Ingo Molnar wrote:

>here's sched-2.5.64-B0:

KABOOM! Major explosions on boot. First time I booted, I got an instant
reboot. Second time, I got a couple of double faults and then explosion
during fsck (uhoh;). PID is 1684368482 :-/

Time to reboot into .virgin for (hopefully) a couple of fretful
moments. (rebooting into .virgin after first explosion would have been a
bit smarter;)

-Mike

2003-03-07 09:53:03

by Ingo Molnar

[permalink] [raw]
Subject: [patch] "interactivity changes", sched-2.5.64-B2


i've attached the -B2 patch (against vanilla 2.5.64), which, with its
default settings, should be equivalent to what -B0 was supposed to be.
(ie. the only change is that the priority backboost is immediately
propagated into current's priority.)

-B2 also has all scheduler tunables exported into /proc/sys/kernel/, for
testing/development purposes only. The following values are now tunable:
MIN_TIMESLICE, MAX_TIMESLICE, CHILD_PENALTY, PARENT_PENALTY, EXIT_WEIGHT,
PRIO_BONUS_RATIO, INTERACTIVE_DELTA, WAKER_BONUS_RATIO,
MAX_SLEEP_AVG, STARVATION_LIMIT, NODE_THRESHOLD.

NOTE: you really have to know which value means what; some combinations
might result in a broken scheduler & cause crashes. E.g. don't even attempt
to set a min timeslice larger than the max timeslice, and don't set either
of them to 0.

NOTE2: when setting these tunables, always make sure to reach an 'idle'
scheduling state first. While it's safe to change these values on the fly,
it makes little sense to e.g. set them while a kernel compile is ongoing -
the make processes might have accumulated dynamic priority already. The
recommended way to set these tunables is to stop everything in the system
(X can stay around, it should just be mostly idle) and wait 10 seconds (or
whatever timeout you use for MAX_SLEEP_AVG), so that all the sleep averages
get back to an 'idle' state. Then start whatever workload you use to test
interactivity, from scratch.

i've also implemented a new tunable, WAKER_BONUS_RATIO, which implements
Linus' idea of sharing the boost between waker and wakee in a more
immediate way. It defaults to 0 currently, but you might want to try to
set it to 50%, or 25%, or even 75%.

The -B2 patch has the following changes over -A6 (BK-curr):

- fix a (now-)bug in kernel/softirq.c: it did a wakeup outside of any
atomic region, which falsely identified random processes as doing a
non-atomic wakeup, and which caused random priority boosts to be
distributed.

- reset the initial idle thread's priority back to MAX_PRIO after doing
the wakeup_forked_process() - correct preemption relies on this.

- introduce WAKER_BONUS_RATIO

- update current->prio immediately after a backboost.

- clean up effective_prio() & sleep_avg calculations so that there are
fewer RT-task special cases. This, besides resulting in a much cleaner
WAKER_BONUS_RATIO code, also has the advantage of the sleep_avg being
maintained even for RT tasks - this could be advantageous for tasks that
briefly enter/exit RT mode.

Ingo

--- linux/include/linux/sched.h.orig
+++ linux/include/linux/sched.h
@@ -328,7 +328,7 @@ struct task_struct {
prio_array_t *array;

unsigned long sleep_avg;
- unsigned long sleep_timestamp;
+ unsigned long last_run;

unsigned long policy;
unsigned long cpus_allowed;
--- linux/kernel/sched.c.orig
+++ linux/kernel/sched.c
@@ -54,20 +54,21 @@
/*
* These are the 'tuning knobs' of the scheduler:
*
- * Minimum timeslice is 10 msecs, default timeslice is 150 msecs,
- * maximum timeslice is 300 msecs. Timeslices get refilled after
+ * Minimum timeslice is 10 msecs, default timeslice is 100 msecs,
+ * maximum timeslice is 200 msecs. Timeslices get refilled after
* they expire.
*/
-#define MIN_TIMESLICE ( 10 * HZ / 1000)
-#define MAX_TIMESLICE (300 * HZ / 1000)
-#define CHILD_PENALTY 95
-#define PARENT_PENALTY 100
-#define EXIT_WEIGHT 3
-#define PRIO_BONUS_RATIO 25
-#define INTERACTIVE_DELTA 2
-#define MAX_SLEEP_AVG (2*HZ)
-#define STARVATION_LIMIT (2*HZ)
-#define NODE_THRESHOLD 125
+unsigned int MIN_TIMESLICE = ( 10 * HZ / 1000);
+unsigned int MAX_TIMESLICE = (200 * HZ / 1000);
+unsigned int CHILD_PENALTY = 50;
+unsigned int PARENT_PENALTY = 100;
+unsigned int EXIT_WEIGHT = 3;
+unsigned int PRIO_BONUS_RATIO = 25;
+unsigned int INTERACTIVE_DELTA = 2;
+unsigned int WAKER_BONUS_RATIO = 0;
+unsigned int MAX_SLEEP_AVG = (10*HZ);
+unsigned int STARVATION_LIMIT = (10*HZ);
+unsigned int NODE_THRESHOLD = 125;

/*
* If a task is 'interactive' then we reinsert it in the active
@@ -302,10 +303,13 @@ static inline void enqueue_task(struct t
*
* Both properties are important to certain workloads.
*/
-static inline int effective_prio(task_t *p)
+static int effective_prio(task_t *p)
{
int bonus, prio;

+ if (rt_task(p))
+ return p->prio;
+
bonus = MAX_USER_PRIO*PRIO_BONUS_RATIO*p->sleep_avg/MAX_SLEEP_AVG/100 -
MAX_USER_PRIO*PRIO_BONUS_RATIO/100/2;

@@ -323,26 +327,61 @@ static inline int effective_prio(task_t
* Also update all the scheduling statistics stuff. (sleep average
* calculation, priority modifiers, etc.)
*/
+static inline void __activate_task(task_t *p, runqueue_t *rq)
+{
+ enqueue_task(p, rq->active);
+ rq->nr_running++;
+}
+
static inline void activate_task(task_t *p, runqueue_t *rq)
{
- unsigned long sleep_time = jiffies - p->sleep_timestamp;
- prio_array_t *array = rq->active;
+ unsigned long sleep_time = jiffies - p->last_run;

- if (!rt_task(p) && sleep_time) {
+ if (sleep_time) {
+ int waker_bonus = 0;
/*
- * This code gives a bonus to interactive tasks. We update
- * an 'average sleep time' value here, based on
- * sleep_timestamp. The more time a task spends sleeping,
- * the higher the average gets - and the higher the priority
- * boost gets as well.
+ * This code gives a bonus to interactive tasks. For
+ * asynchronous wakeups all the bonus goes to the woken up
+ * task. For non-atomic-context wakeups, the bonus is
+ * shared between the waker and the woken up task. Via
+ * this we recognize the waker as being related to the
+ * 'interactiveness' of the woken up task.
+ *
+ * The boost works by updating the 'average sleep time'
+ * value here, based on ->last_run. The more time a task
+ * spends sleeping, the higher the average gets - and the
+ * higher the priority boost gets as well.
*/
- p->sleep_avg += sleep_time;
- if (p->sleep_avg > MAX_SLEEP_AVG)
+ if (!in_interrupt())
+ waker_bonus = sleep_time * WAKER_BONUS_RATIO / 100;
+
+ p->sleep_avg += sleep_time - waker_bonus;
+
+ /*
+ * 'Overflow' bonus ticks go to the waker as well, so the
+ * ticks are not lost. This has the effect of further
+ * boosting tasks that are related to maximum-interactive
+ * tasks.
+ */
+ if (p->sleep_avg > MAX_SLEEP_AVG) {
+ waker_bonus += p->sleep_avg - MAX_SLEEP_AVG;
p->sleep_avg = MAX_SLEEP_AVG;
- p->prio = effective_prio(p);
+ }
+ p->prio = effective_prio(p);
+
+ if (!in_interrupt()) {
+ prio_array_t *array = current->array;
+ BUG_ON(!array);
+
+ current->sleep_avg += waker_bonus;
+ if (current->sleep_avg > MAX_SLEEP_AVG)
+ current->sleep_avg = MAX_SLEEP_AVG;
+ dequeue_task(current, array);
+ current->prio = effective_prio(current);
+ enqueue_task(current, array);
+ }
}
- enqueue_task(p, array);
- nr_running_inc(rq);
+ __activate_task(p, rq);
}

/*
@@ -479,10 +518,13 @@ repeat_lock_task:
}
if (old_state == TASK_UNINTERRUPTIBLE)
rq->nr_uninterruptible--;
- activate_task(p, rq);
-
- if (p->prio < rq->curr->prio)
- resched_task(rq->curr);
+ if (sync)
+ __activate_task(p, rq);
+ else {
+ activate_task(p, rq);
+ if (p->prio < rq->curr->prio)
+ resched_task(rq->curr);
+ }
success = 1;
}
p->state = TASK_RUNNING;
@@ -514,19 +556,25 @@ void wake_up_forked_process(task_t * p)
runqueue_t *rq = task_rq_lock(current, &flags);

p->state = TASK_RUNNING;
- if (!rt_task(p)) {
- /*
- * We decrease the sleep average of forking parents
- * and children as well, to keep max-interactive tasks
- * from forking tasks that are max-interactive.
- */
- current->sleep_avg = current->sleep_avg * PARENT_PENALTY / 100;
- p->sleep_avg = p->sleep_avg * CHILD_PENALTY / 100;
- p->prio = effective_prio(p);
- }
+ /*
+ * We decrease the sleep average of forking parents
+ * and children as well, to keep max-interactive tasks
+ * from forking tasks that are max-interactive.
+ */
+ current->sleep_avg = current->sleep_avg * PARENT_PENALTY / 100;
+ p->sleep_avg = p->sleep_avg * CHILD_PENALTY / 100;
+ p->prio = effective_prio(p);
set_task_cpu(p, smp_processor_id());
- activate_task(p, rq);

+ if (unlikely(!current->array))
+ __activate_task(p, rq);
+ else {
+ p->prio = current->prio;
+ list_add_tail(&p->run_list, &current->run_list);
+ p->array = current->array;
+ p->array->nr_active++;
+ rq->nr_running++;
+ }
task_rq_unlock(rq, &flags);
}

@@ -953,6 +1001,11 @@ static inline void pull_task(runqueue_t
*/
if (p->prio < this_rq->curr->prio)
set_need_resched();
+ else {
+ if (p->prio == this_rq->curr->prio &&
+ p->time_slice > this_rq->curr->time_slice)
+ set_need_resched();
+ }
}

/*
@@ -1016,7 +1069,7 @@ skip_queue:
*/

#define CAN_MIGRATE_TASK(p,rq,this_cpu) \
- ((jiffies - (p)->sleep_timestamp > cache_decay_ticks) && \
+ ((jiffies - (p)->last_run > cache_decay_ticks) && \
!task_running(rq, p) && \
((p)->cpus_allowed & (1UL << (this_cpu))))

@@ -1076,9 +1129,9 @@ DEFINE_PER_CPU(struct kernel_stat, kstat
* increasing number of running tasks:
*/
#define EXPIRED_STARVING(rq) \
- ((rq)->expired_timestamp && \
+ (STARVATION_LIMIT && ((rq)->expired_timestamp && \
(jiffies - (rq)->expired_timestamp >= \
- STARVATION_LIMIT * ((rq)->nr_running) + 1))
+ STARVATION_LIMIT * ((rq)->nr_running) + 1)))

/*
* This function gets called by the timer code, with HZ frequency.
@@ -1121,6 +1174,16 @@ void scheduler_tick(int user_ticks, int
return;
}
spin_lock(&rq->lock);
+ /*
+ * The task was running during this tick - update the
+ * time slice counter and the sleep average. Note: we
+ * do not update a thread's priority until it either
+ * goes to sleep or uses up its timeslice. This makes
+ * it possible for interactive tasks to use up their
+ * timeslices at their highest priority levels.
+ */
+ if (p->sleep_avg)
+ p->sleep_avg--;
if (unlikely(rt_task(p))) {
/*
* RR tasks need a special form of timeslice management.
@@ -1137,16 +1200,6 @@ void scheduler_tick(int user_ticks, int
}
goto out;
}
- /*
- * The task was running during this tick - update the
- * time slice counter and the sleep average. Note: we
- * do not update a thread's priority until it either
- * goes to sleep or uses up its timeslice. This makes
- * it possible for interactive tasks to use up their
- * timeslices at their highest priority levels.
- */
- if (p->sleep_avg)
- p->sleep_avg--;
if (!--p->time_slice) {
dequeue_task(p, rq->active);
set_tsk_need_resched(p);
@@ -1201,7 +1254,7 @@ need_resched:
rq = this_rq();

release_kernel_lock(prev);
- prev->sleep_timestamp = jiffies;
+ prev->last_run = jiffies;
spin_lock_irq(&rq->lock);

/*
@@ -1701,7 +1754,7 @@ static int setscheduler(pid_t pid, int p
else
p->prio = p->static_prio;
if (array)
- activate_task(p, task_rq(p));
+ __activate_task(p, task_rq(p));

out_unlock:
task_rq_unlock(rq, &flags);
@@ -2442,6 +2495,7 @@ void __init sched_init(void)
rq->idle = current;
set_task_cpu(current, smp_processor_id());
wake_up_forked_process(current);
+ current->prio = MAX_PRIO;

init_timers();

--- linux/kernel/fork.c.orig
+++ linux/kernel/fork.c
@@ -916,7 +916,7 @@ static struct task_struct *copy_process(
*/
p->first_time_slice = 1;
current->time_slice >>= 1;
- p->sleep_timestamp = jiffies;
+ p->last_run = jiffies;
if (!current->time_slice) {
/*
* This case is rare, it happens when the parent has only
--- linux/kernel/sysctl.c.orig
+++ linux/kernel/sysctl.c
@@ -43,6 +43,7 @@

/* External variables not in a header file. */
extern int panic_timeout;
+extern unsigned int MIN_TIMESLICE, MAX_TIMESLICE, CHILD_PENALTY, PARENT_PENALTY, EXIT_WEIGHT, PRIO_BONUS_RATIO, INTERACTIVE_DELTA, WAKER_BONUS_RATIO, MAX_SLEEP_AVG, STARVATION_LIMIT, NODE_THRESHOLD;
extern int C_A_D;
extern int sysctl_overcommit_memory;
extern int sysctl_overcommit_ratio;
@@ -172,6 +173,28 @@ static ctl_table kern_table[] = {
0644, NULL, &proc_doutsstring, &sysctl_string},
{KERN_PANIC, "panic", &panic_timeout, sizeof(int),
0644, NULL, &proc_dointvec},
+ {KERN_PANIC, "MIN_TIMESLICE", &MIN_TIMESLICE, sizeof(int),
+ 0644, NULL, &proc_dointvec},
+ {KERN_PANIC, "MAX_TIMESLICE", &MAX_TIMESLICE, sizeof(int),
+ 0644, NULL, &proc_dointvec},
+ {KERN_PANIC, "CHILD_PENALTY", &CHILD_PENALTY, sizeof(int),
+ 0644, NULL, &proc_dointvec},
+ {KERN_PANIC, "PARENT_PENALTY", &PARENT_PENALTY, sizeof(int),
+ 0644, NULL, &proc_dointvec},
+ {KERN_PANIC, "EXIT_WEIGHT", &EXIT_WEIGHT, sizeof(int),
+ 0644, NULL, &proc_dointvec},
+ {KERN_PANIC, "PRIO_BONUS_RATIO", &PRIO_BONUS_RATIO, sizeof(int),
+ 0644, NULL, &proc_dointvec},
+ {KERN_PANIC, "INTERACTIVE_DELTA", &INTERACTIVE_DELTA, sizeof(int),
+ 0644, NULL, &proc_dointvec},
+ {KERN_PANIC, "WAKER_BONUS_RATIO", &WAKER_BONUS_RATIO, sizeof(int),
+ 0644, NULL, &proc_dointvec},
+ {KERN_PANIC, "MAX_SLEEP_AVG", &MAX_SLEEP_AVG, sizeof(int),
+ 0644, NULL, &proc_dointvec},
+ {KERN_PANIC, "STARVATION_LIMIT", &STARVATION_LIMIT, sizeof(int),
+ 0644, NULL, &proc_dointvec},
+ {KERN_PANIC, "NODE_THRESHOLD", &NODE_THRESHOLD, sizeof(int),
+ 0644, NULL, &proc_dointvec},
{KERN_CORE_USES_PID, "core_uses_pid", &core_uses_pid, sizeof(int),
0644, NULL, &proc_dointvec},
{KERN_CORE_PATTERN, "core_pattern", core_pattern, 64,
--- linux/kernel/softirq.c.orig
+++ linux/kernel/softirq.c
@@ -92,10 +92,9 @@ restart:
mask &= ~pending;
goto restart;
}
- __local_bh_enable();
-
if (pending)
wakeup_softirqd(cpu);
+ __local_bh_enable();
}

local_irq_restore(flags);

2003-03-07 10:07:20

by Helge Hafting

[permalink] [raw]
Subject: Re: [patch] "HT scheduler", sched-2.5.63-B3

Ingo Molnar wrote:
> I believe we should still enable
> application programmers to give certain apps _some_ minor priority boost,
> so that other CPU hogs cannot starve xine.

But we don't really need further kernel support for that, do we?
I know a user currently cannot raise priority, but the user can
run all his normal apps at slightly lower priority, except for xine.
And the admin/distributor can set everything up for using the slightly
lower priority by default. Well, perhaps all this involves so
much use of "nice" that kernel support is a good idea anyway...

Helge Hafting

2003-03-07 10:08:16

by Mike Galbraith

[permalink] [raw]
Subject: Re: [patch] "interactivity changes", sched-2.5.64-B2

At 11:03 AM 3/7/2003 +0100, Ingo Molnar wrote:

>i've attached the -B2 patch (against vanilla 2.5.64).....

(I'll test and report back in an hour or so. RL honey-do's are SCHED_RR...
if a guy knows what's good for him;)

2003-03-07 10:17:02

by Ingo Molnar

[permalink] [raw]
Subject: Re: [patch] "HT scheduler", sched-2.5.63-B3


On 6 Mar 2003, Robert Love wrote:

> > in addition, timeslices should be shortest for high priority processes
> > (depending on dynamic prio, not static)
>
> No, they should be longer. In some cases they should be nearly
> infinitely long (which is sort of what we do with the reinsertion into
> the active array for highly interactive tasks). [...]

yes, and in fact tasks with 'infinite timeslice' do exist: RT tasks with
SCHED_FIFO/SCHED_RR.
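
for reference, roughly what the RT special-case in scheduler_tick() does
(sketched from the 2.5-era code; the slice-refill macro name may differ):
SCHED_FIFO tasks never have their timeslice decremented at all, SCHED_RR
tasks get a fresh slice and go to the back of their priority level:

        if (unlikely(rt_task(p))) {
                /*
                 * RR tasks need a special form of timeslice management.
                 * FIFO tasks have no timeslices.
                 */
                if ((p->policy == SCHED_RR) && !--p->time_slice) {
                        p->time_slice = TASK_TIMESLICE(p);
                        p->first_time_slice = 0;
                        set_tsk_need_resched(p);

                        /* requeue at the end of its priority level: */
                        dequeue_task(p, rq->active);
                        enqueue_task(p, rq->active);
                }
                goto out;
        }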

Ingo

2003-03-07 10:29:27

by Ingo Molnar

[permalink] [raw]
Subject: Re: [patch] "HT scheduler", sched-2.5.63-B3


On 6 Mar 2003, Robert Love wrote:

> > but the kernel should get the chance to frequently reschedule
> > when interactivity is needed.
>
> I understand your point, but we get that now without using super small
> timeslices.

well, the -A6 patch does reduce the timeslice length; it's certainly one
tool to improve "interactivity" of CPU-bound tasks. At the moment we
cannot reduce it arbitrarily though, because right now the way to
implement different CPU-sharing priorities is via the timeslice
calculator. With 100 msec default timeslices, the biggest difference one
can get is a 10 msec timeslice for a nice +19 task, and a 100 msec
timeslice for a nice 0 task. I.e. a nice +19 CPU-bound task uses up ~10% of
CPU time while a nice 0 task uses up ~90% of CPU time, if both are running
at once - roughly.
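
as a rough user-space illustration of that arithmetic (the 10/200 msec
bounds and the linear nice-to-slice mapping below are assumptions picked
to reproduce the numbers above, not the kernel's exact formula):

#include <stdio.h>

/* assumed bounds: nice +19 bottoms out at 10 ms, nice -20 tops out at 200 ms */
#define MIN_TIMESLICE_MS         10
#define MAX_TIMESLICE_MS        200
#define NICE_TO_PRIO(nice)      (120 + (nice))  /* static_prio: 100..139 */

static int timeslice_ms(int nice)
{
        int prio = NICE_TO_PRIO(nice);

        /* scale linearly across the 40 nice levels */
        return MIN_TIMESLICE_MS +
                (MAX_TIMESLICE_MS - MIN_TIMESLICE_MS) * (139 - prio) / 39;
}

int main(void)
{
        int t0 = timeslice_ms(0), t19 = timeslice_ms(19);

        /* two CPU hogs on one CPU: each runs for its full slice in turn */
        printf("nice  0: %3d ms/slice (~%d%% of the CPU)\n",
               t0, 100 * t0 / (t0 + t19));
        printf("nice 19: %3d ms/slice (~%d%% of the CPU)\n",
               t19, 100 * t19 / (t0 + t19));
        return 0;
}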

i have planned to add a new interface to solve this generic problem,
basically a syscall that enables the setting of the default timeslice
length - which gives one more degree of freedom to applications. Games
could set this to a super-low value to get really good 'keypress latency'
even in the presence of other CPU hogs. Compilation jobs would use the
default long timeslices, to maximize caching benefits. And we could
lengthen the timeslice of nice +19 tasks as well. The complexity here is
to decouple the timeslice length from CPU-sharing priorities. I've got
this on my list ...
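
(today the timeslice is only readable from user space, via
sched_rr_get_interval() - there is no set-side counterpart yet, which is
exactly the missing degree of freedom. A minimal query, for reference;
what it reports for non-RT tasks has varied between kernel versions:)

#include <sched.h>
#include <stdio.h>
#include <time.h>

int main(void)
{
        struct timespec ts;

        /* pid 0 == the calling process */
        if (sched_rr_get_interval(0, &ts) == 0)
                printf("timeslice: %.1f ms\n",
                       ts.tv_sec * 1000.0 + ts.tv_nsec / 1e6);
        else
                perror("sched_rr_get_interval");
        return 0;
}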

Ingo

2003-03-07 10:56:56

by Ingo Molnar

[permalink] [raw]
Subject: Re: [patch] "HT scheduler", sched-2.5.63-B3


On Fri, 7 Mar 2003, Helge Hafting wrote:

> But we don't really need further kernel support for that, do we? I know
> a user currently cannot raise priority, but the user can run all his
> normal apps at slightly lower priority, except for xine. And the
> admin/distributor can set everything up for using the slightly lower
> priority by default. Well, perhaps all this involves so much use of
> "nice" that kernel support is a good idea anyway...

true, but i think we could help by doing this automatically. This will
also make it easier to tune things as part of the scheduler proper.

Ingo

2003-03-07 12:39:23

by Mike Galbraith

[permalink] [raw]
Subject: Re: [patch] "interactivity changes", sched-2.5.64-B2

At 11:23 AM 3/7/2003 +0100, Mike Galbraith wrote:
>At 11:03 AM 3/7/2003 +0100, Ingo Molnar wrote:
>
>>i've attached the -B2 patch (against vanilla 2.5.64).....
>
>(I'll test and report back in an hour or so...

B2 appears to fix my interactivity woes w. X + make -j5 bzImage. I can now
waggle windows to my heart's content, browse around and click on RFC's and
such and they pop up with nice response times. Spiffy :) /Me thinks
desktop users will like this patch a bunch.

-Mike

(I'd really like to see a multi-client db load time-to-query comparison
though. That make -j30 weirdness might not be a bad thing, but it makes me
nervous [worry wart])


2003-03-07 13:12:23

by Mike Galbraith

[permalink] [raw]
Subject: Re: [patch] "interactivity changes", sched-2.5.64-B2

At 01:54 PM 3/7/2003 +0100, Mike Galbraith wrote:
>Spiffy :) /Me thinks desktop users will like this patch a bunch.

(I can even play asteroids [fly little rocket ship around, shoot at and ram
space rocks] with make -j25 bzImage and some swapping [sucks when you hit
heavy swap of course, but quite playable as long as swap is light])

2003-03-07 14:18:29

by jlnance

[permalink] [raw]
Subject: Re: [patch] "HT scheduler", sched-2.5.63-B3

On Fri, Mar 07, 2003 at 06:57:58AM +0100, Ingo Molnar wrote:

> so the correct approach is both to make X more interactive (your patch),
> _and_ to make the compilation jobs less interactive (my patch). This is
> what explains why Andrew saw roughly similar interactivity with your and my
> patches applied separately, but the best result was when the combo patch was
> applied. Agreed?

Ingo,
 Forgive me if this is what your patch does; I have only looked at
Linus's. But I don't think this is what you are doing.
 Linus's patch gives a boost to non-interactive processes that wake
up interactive ones. It seems that we could solve the make problem by
applying the same idea in the reverse direction. Let's unboost interactive
processes that are woken up by non-interactive ones. So if make gets
woken up by gcc (which is hopefully non-interactive), make does not
get "credit" for having been asleep.
I am trying to think if this would break anything. The only thing
I can think of is that if the X server gets marked as non-interactive,
then it is going to cause its clients to get marked as non-interactive
as well when it sends them data. But this may be a good thing. If
X is marked as non-interactive it means that it is busy, so this may
have the effect of preventing the X clients from sending it a lot of
extra work while it's already busy.
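
Roughly, in terms of the activate_task() from the patch earlier in the
thread, the idea might look something like this (a sketch only - the
function name and the exact scaling are illustrative, not from any
posted patch):

static inline void activate_task_scaled(task_t *p, runqueue_t *rq)
{
        unsigned long sleep_time = jiffies - p->last_run;

        if (sleep_time && !in_interrupt()) {
                /* 0..100: how 'interactive' the waker currently looks */
                unsigned long waker_interactivity =
                        current->sleep_avg * 100 / MAX_SLEEP_AVG;

                /* non-interactive wakers hand out proportionally less credit */
                sleep_time = sleep_time * waker_interactivity / 100;
        }

        p->sleep_avg += sleep_time;
        if (p->sleep_avg > MAX_SLEEP_AVG)
                p->sleep_avg = MAX_SLEEP_AVG;
        p->prio = effective_prio(p);
        __activate_task(p, rq);
}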

Thanks,

Jim

2003-03-07 14:33:03

by Ingo Molnar

[permalink] [raw]
Subject: Re: [patch] "interactivity changes", sched-2.5.64-B2


On Fri, 7 Mar 2003, Mike Galbraith wrote:

> >Spiffy :) /Me thinks desktop users will like this patch a bunch.
>
> (I can even play asteroids [fly little rocket ship around, shoot at and
> ram space rocks] with make -j25 bzImage and some swapping [sucks when
> you hit heavy swap of course, but quite playable as long as swap is
> light])

cool! Could you please also play a bit with various WAKER_BONUS_RATIO
values? If the default '0' value is the best-performing variant then i'll
nuke it from the patch altogether.

Ingo


2003-03-07 14:34:46

by Ingo Molnar

[permalink] [raw]
Subject: Re: [patch] "interactivity changes", sched-2.5.64-B2


btw., could you please revert the kernel/softirq.c change, and re-test
-j25 interactivity with that patch? That way we'll know exactly which
component of -B2 caused the improvement on your box.

Ingo

2003-03-07 15:29:44

by Mike Galbraith

[permalink] [raw]
Subject: Re: [patch] "interactivity changes", sched-2.5.64-B2

At 03:45 PM 3/7/2003 +0100, Ingo Molnar wrote:

>btw., could you please revert the kernel/softirq.c change, and re-test
>-j25 interactivity with that patch? That way we'll know exactly which
>component of -B2 caused the improvement on your box.

Done. If either the softirq.c change or changing WAKER_BONUS_RATIO value
(25:50:75) make any difference at all with what I'm doing, it's too close
for me to tell.

-Mike

2003-03-07 18:39:56

by Ingo Molnar

[permalink] [raw]
Subject: Re: [patch] "interactivity changes", sched-2.5.64-B2


On Fri, 7 Mar 2003, Mike Galbraith wrote:

> Done. If either the softirq.c change or changing WAKER_BONUS_RATIO
> value (25:50:75) make any difference at all with what I'm doing, it's
> too close for me to tell.

hm. Could you then please just re-test the -A6 patch with the same base
tree again? I'd like to make sure that it _is_ the -A6 => -B0 delta that
causes this considerable level of improvement for you.

Ingo

2003-03-07 18:47:10

by Linus Torvalds

[permalink] [raw]
Subject: Re: [patch] "interactivity changes", sched-2.5.64-B2


On Fri, 7 Mar 2003, Ingo Molnar wrote:
>
> hm. Could you then please just re-test the -A6 patch with the same base
> tree again? I'd like to make sure that it _is_ the -A6 => -B0 delta that
> causes this considerable level of improvement for you.

Side note: current BK has the -B2 thing without the non-core changes (ie
no WAKER_BONUS_RATIO, and no sysctl stuff, and some reorg of the code due
to the dropping of the WAKER_BONUS_RATIO).

Ingo, you might want to double-check that you agree that my reorg is
equivalent.

Linus

2003-03-07 19:02:10

by Linus Torvalds

[permalink] [raw]
Subject: Re: [patch] "interactivity changes", sched-2.5.64-B2


Ingo,
I already merged and pushed out the core changes in -B2, but now that I'm
looking more closely at it I'm almost certain that it's buggy.

In particular, you do this:

..
dequeue_task(current, array);
current->prio = effective_prio(current);
enqueue_task(current, array);
..

without actually holding the rq lock on the array!

Yes, we hold _a_ rq_lock - we hold the rq lock for the array that "p" is
on, but that's not necessarily the same as this_rq().

So you can end up with the priority bitmap and the runqueue being
corrupted by another wakeup on another CPU that wakes up something that is
on the runqueue for the current CPU. At which point the machine will be
basically dead (runqueue corruption is not likely to be a survivable
event).

Or am I missing something?

Linus

2003-03-07 19:03:39

by Ingo Molnar

[permalink] [raw]
Subject: Re: [patch] "interactivity changes", sched-2.5.64-B2


you are right - i'm working on a fix.

Ingo

2003-03-07 19:21:56

by Ingo Molnar

[permalink] [raw]
Subject: Re: [patch] "interactivity changes", sched-2.5.64-B2


the attached patch (against your tree) fixes the SMP locking bug. The
patch also includes some other stuff i already added to my tree by that
time:

- only update the priority and do a requeueing if the sleep average has
changed. (this does not happen for pure CPU hogs or pure interactive
tasks, so no need to requeue/recalc-prio in that case.) [All the
necessary values are available at that point already, so gcc should
have an easy job making this branch really cheap.]

- do not do a full task activation in the migration-thread path - that is
supposed to be near-atomic anyway.

- fix up comments

i solved the SMP locking bug by moving the requeueing outside of
try_to_wake_up(). It does not matter that the priority update is not
done atomically now, since the current process won't do anything in between.
(well, it could get preempted in a preemptible kernel, but even that won't
do any harm.)

did a quick testcompile & testboot on an SMP box, appears to work.

Ingo

--- linux/kernel/sched.c.orig
+++ linux/kernel/sched.c
@@ -321,10 +321,7 @@ static int effective_prio(task_t *p)
}

/*
- * activate_task - move a task to the runqueue.
-
- * Also update all the scheduling statistics stuff. (sleep average
- * calculation, priority modifiers, etc.)
+ * __activate_task - move a task to the runqueue.
*/
static inline void __activate_task(task_t *p, runqueue_t *rq)
{
@@ -332,9 +329,16 @@ static inline void __activate_task(task_
nr_running_inc(rq);
}

-static inline void activate_task(task_t *p, runqueue_t *rq)
+/*
+ * activate_task - move a task to the runqueue and do priority recalculation
+ *
+ * Update all the scheduling statistics stuff. (sleep average
+ * calculation, priority modifiers, etc.)
+ */
+static inline int activate_task(task_t *p, runqueue_t *rq)
{
unsigned long sleep_time = jiffies - p->last_run;
+ int requeue_waker = 0;

if (sleep_time) {
int sleep_avg;
@@ -357,23 +361,25 @@ static inline void activate_task(task_t
*/
if (sleep_avg > MAX_SLEEP_AVG) {
if (!in_interrupt()) {
- prio_array_t *array = current->array;
- BUG_ON(!array);
sleep_avg += current->sleep_avg - MAX_SLEEP_AVG;
if (sleep_avg > MAX_SLEEP_AVG)
sleep_avg = MAX_SLEEP_AVG;

- current->sleep_avg = sleep_avg;
- dequeue_task(current, array);
- current->prio = effective_prio(current);
- enqueue_task(current, array);
+ if (current->sleep_avg != sleep_avg) {
+ current->sleep_avg = sleep_avg;
+ requeue_waker = 1;
+ }
}
sleep_avg = MAX_SLEEP_AVG;
}
- p->sleep_avg = sleep_avg;
- p->prio = effective_prio(p);
+ if (p->sleep_avg != sleep_avg) {
+ p->sleep_avg = sleep_avg;
+ p->prio = effective_prio(p);
+ }
}
__activate_task(p, rq);
+
+ return requeue_waker;
}

/*
@@ -486,8 +492,8 @@ void kick_if_running(task_t * p)
*/
static int try_to_wake_up(task_t * p, unsigned int state, int sync)
{
+ int success = 0, requeue_waker = 0;
unsigned long flags;
- int success = 0;
long old_state;
runqueue_t *rq;

@@ -513,7 +519,7 @@ repeat_lock_task:
if (sync)
__activate_task(p, rq);
else {
- activate_task(p, rq);
+ requeue_waker = activate_task(p, rq);
if (p->prio < rq->curr->prio)
resched_task(rq->curr);
}
@@ -523,6 +529,21 @@ repeat_lock_task:
}
task_rq_unlock(rq, &flags);

+ /*
+ * We have to do this outside the other spinlock, the two
+ * runqueues might be different:
+ */
+ if (requeue_waker) {
+ prio_array_t *array;
+
+ rq = task_rq_lock(current, &flags);
+ array = current->array;
+ dequeue_task(current, array);
+ current->prio = effective_prio(current);
+ enqueue_task(current, array);
+ task_rq_unlock(rq, &flags);
+ }
+
return success;
}

@@ -2360,7 +2381,7 @@ repeat:
set_task_cpu(p, cpu_dest);
if (p->array) {
deactivate_task(p, rq_src);
- activate_task(p, rq_dest);
+ __activate_task(p, rq_dest);
if (p->prio < rq_dest->curr->prio)
resched_task(rq_dest->curr);
}

2003-03-07 20:29:17

by Ingo Molnar

[permalink] [raw]
Subject: Re: [patch] "interactivity changes", sched-2.5.64-B2


On 7 Mar 2003, Martin Josefsson wrote:

> Some negative things:

if you have time/interest, could you re-test the negative things with X
reniced to -10, to further isolate the problem? Another thing to try is to
renice xmms to -10 (or RT priority). Which one makes the larger
difference?

Ingo

2003-03-07 20:24:12

by Martin Josefsson

[permalink] [raw]
Subject: Re: [patch] "interactivity changes", sched-2.5.64-B2

On Fri, 2003-03-07 at 15:43, Ingo Molnar wrote:
> On Fri, 7 Mar 2003, Mike Galbraith wrote:
>
> > >Spiffy :) /Me thinks desktop users will like this patch a bunch.
> >
> > (I can even play asteroids [fly little rocket ship around, shoot at and
> > ram space rocks] with make -j25 bzImage and some swapping [sucks when
> > you hit heavy swap of course, but quite playable as long as swap is
> > light])
>
> cool! Could you please also play a bit with various WAKER_BONUS_RATIO
> values? If the default '0' value is the best-performing variant then i'll
> nuke it from the patch altogether.

Hi,

I've tested 2.5.64 + sched-2.5.64-B2 on a UP PIII 700 with 704MB RAM;
here are my results...

General impression is that interactivity is good but it seems there are a
few cases that get worse, sometimes a lot worse.

First the things that got better...

Xmms doesn't skip when switching tabs in galeon anymore during general
desktop usage (more on that below).
Earlier I had to change some scheduler settings to get it to stop
skipping.

And it was possible to watch a movie using mplayer while running a make
-j10 on a kernel tree; the playback wasn't 100% perfect but I'd say it
was ~90%, and the jerkiness I saw wasn't very irritating. I tried the
window wiggle-test while the compile and movie were playing and it was
smooth and only added a little more jerkiness to the playback.


Some negative things:

Xmms could skip during the compile/movie playback, most notably one
rather long skip right after a song change but sometimes in the middle
of a song as well; those skips were quite long, 2-3 seconds.

Another xmms-related problem is that even without any load on the
machine the mouse pointer gets jerky for 0.1-0.5 seconds right after
having used the mouse to search forward or backwards in a song. It gets
jerky at the moment you release the mouse button.
Basically no change in the jerkiness even with a make -j10 and movie
playing.

The biggest problem is the fact that my window manager gets unresponsive
for ~30 seconds when I perform something that works fine with vanilla
2.4 and 2.5. I have two keybindings that execute 'aumix -v+3' and
'aumix -v-3'. And if I hold one of those buttons down for, say, 2 seconds,
then at first it seems like nothing is happening, but then the volume
starts to increase/decrease slowly and the window manager (sawfish) is
unresponsive for ~30 seconds. With 2.4 or vanilla 2.5 the change in
volume is instant and I've never noticed that sawfish got unresponsive.
Sure, holding the button down for 2 seconds at a repeat rate of 30/s will
cause a lot of aumix's to be executed, but 30 seconds for 60 aumix's is
quite a long time. A few times it has been enough to just press one of
those keys a few times to get the exact same effect. It's as if sawfish
goes into a weird loop. All this might be a bug in sawfish...
This behaviour is the same with or without the compile/movie running.

Another problem is that the composer-window in evolution gets
unresponsive for 1-2 seconds if you hold down a key, and when it gets
responsive again it gets all the keypresses...

I've tried changing WAKER_BONUS_RATIO but I didn't see any difference.
At least it didn't help with the Xmms mouse jerkiness or the
sawfish_goes_out_for_coffee_and_a_cigarette problem.

There, those were my impressions of 2 hours' usage of the patch.

--
/Martin

Never argue with an idiot. They drag you down to their level, then beat you with experience.

2003-03-07 20:55:01

by David Lang

[permalink] [raw]
Subject: Re: [patch] "interactivity changes", sched-2.5.64-B2

sounds like the code in X that detects a key being held down is running
into problems. any chance it's doing a busy loop or something silly like
that that's just not running enough? (probably not, but since you have
problems in a couple of applications that happen when you hold a key down I
would look there rather than at the scheduling code itself)

David Lang


2003-03-07 20:55:46

by Martin Josefsson

[permalink] [raw]
Subject: Re: [patch] "interactivity changes", sched-2.5.64-B2

On Fri, 2003-03-07 at 21:39, Ingo Molnar wrote:
> On 7 Mar 2003, Martin Josefsson wrote:
>
> > Some negative things:
>
> if you have time/interest, could you re-test the negative things with X
> reniced to -10, to further isolate the problem? Another thing to try is to
> renice xmms to -10 (or RT priority). Which one makes the larger
> difference?

X niced to -10 makes the xmms mouse jerkiness disappear, but it can
still skip right after a song change. If I renice xmms to -10 as well I
get the mouse jerkiness again; if I renice it to -5 it seems to behave
quite well, no mouse jerkiness, and so far I haven't heard any skips, but
it can take some time until a skip occurs.

With X reniced to -10 it feels like the wiggle-a-window test gets a bit
more sluggish while a 'make -j10' and a movie are playing.

And sawfish still takes 30 second naps when executing a bunch of
aumix's.

--
/Martin

Never argue with an idiot. They drag you down to their level, then beat you with experience.

2003-03-07 21:03:11

by Martin Josefsson

[permalink] [raw]
Subject: Re: [patch] "interactivity changes", sched-2.5.64-B2

On Fri, 2003-03-07 at 22:03, David Lang wrote:
> sounds like the code in X that detects a key being held down is running
> into problems. any chance it's doing a busy loop or something silly like
> that that's just not running enough? (probably not, but since you have
> problems in a couple applications that happen when you hold a key down I
> would look there rather then at the scheduling code itself)

Wouldn't surprise me if it's an X problem... I can't say that I feel
like going digging into X sources...

I can get the same problem in sawfish if I press the key a few times
quite rapidly as well, without any background load at all. This problem
has never occurred before on 2.4 or 2.5, with or without load. It could
be that the scheduler changes expose some bug in X, but I'm not really
sure how to start investigating...

--
/Martin

Never argue with an idiot. They drag you down to their level, then beat you with experience.

2003-03-07 21:37:03

by Martin Josefsson

[permalink] [raw]
Subject: Re: [patch] "interactivity changes", sched-2.5.64-B2

On Fri, 2003-03-07 at 22:05, Martin Josefsson wrote:

> And sawfish still takes 30 second naps when executing a bunch of
> aumix's.

For information: this is 2.5.64 + sched-2.5.64-B2, no preempt or other
patches.

I changed the command that sawfish executes to only append the current
date to a file and then I made it take a nap again.

I got a file containing 57 lines, one for each time the command was
executed by sawfish.

It took 41 seconds to execute the command 57 times and the machine was
very idle during that time.

According to top, sawfish's priority is between 15 and 20 the whole time.
And as I said the machine is very idle during the test.

The command was a very small bashscript:

#!/bin/bash

date >> /tmp/result


So no aumix is executed in the script; I didn't want another error factor in
the test.

I've reproduced the problem three times and this is what I got:

executions time
---------- ----
57 41 seconds
52 39 seconds
28 25 seconds


results from the third run

Fri Mar 7 22:31:00 CET 2003
Fri Mar 7 22:31:02 CET 2003
Fri Mar 7 22:31:03 CET 2003
Fri Mar 7 22:31:04 CET 2003
Fri Mar 7 22:31:05 CET 2003
Fri Mar 7 22:31:06 CET 2003
Fri Mar 7 22:31:07 CET 2003
Fri Mar 7 22:31:07 CET 2003
Fri Mar 7 22:31:07 CET 2003
Fri Mar 7 22:31:09 CET 2003
Fri Mar 7 22:31:10 CET 2003
Fri Mar 7 22:31:11 CET 2003
Fri Mar 7 22:31:12 CET 2003
Fri Mar 7 22:31:14 CET 2003
Fri Mar 7 22:31:14 CET 2003
Fri Mar 7 22:31:14 CET 2003
Fri Mar 7 22:31:15 CET 2003
Fri Mar 7 22:31:16 CET 2003
Fri Mar 7 22:31:17 CET 2003
Fri Mar 7 22:31:18 CET 2003
Fri Mar 7 22:31:19 CET 2003
Fri Mar 7 22:31:19 CET 2003
Fri Mar 7 22:31:19 CET 2003
Fri Mar 7 22:31:20 CET 2003
Fri Mar 7 22:31:22 CET 2003
Fri Mar 7 22:31:23 CET 2003
Fri Mar 7 22:31:24 CET 2003
Fri Mar 7 22:31:25 CET 2003

Do you have any clues as to what might be happening?

--
/Martin

Never argue with an idiot. They drag you down to their level, then beat you with experience.

2003-03-07 22:53:04

by Helge Hafting

[permalink] [raw]
Subject: Re: [patch] "interactivity changes", sched-2.5.64-B2

Martin Josefsson wrote:

> Wouldn't surprise me if it's an X problem... I can't say that I feel
> like going digging into X sources...
>
> I can get the same problem in sawfish if I press the key a few times
> quite rapidly as well, without any background load at all. This problem
> has never occured before on 2.4 or 2.5, with or without load. It could
> be that the scheduler changes exposes some bug in X but I'm not really
> sure how to start investigating...

You could perhaps rule out sawfish bugs by testing this with
another window manager?

Helge Hafting


2003-03-08 18:18:11

by Aaron Lehmann

[permalink] [raw]
Subject: Re: [patch] "HT scheduler", sched-2.5.63-B3

On Fri, Mar 07, 2003 at 08:36:12AM +0100, Ingo Molnar wrote:
> > I was able to reproduce them by selecting text in Mathematica (ugh, not
> > a very helpful example). The skips were shorter and about three times as
> > hard to trigger as on 2.5.63.
>
> okay, just as a data point, could you try to renice the player
> process/thread to -2? Does it make the skipping harder to trigger?
> How about -5, or -10?

Renicing all xmms threads to -2 makes the problem impossible to
trigger in this way. I'm sorry for having such a stupid testcase;
Mathematica is proprietary software, and judging by the way it makes
XMMS skip, it's probably doing something stupid. XMMS also has a "Use
realtime priority" option, which makes it do:

sparam.sched_priority = sched_get_priority_max(SCHED_RR);
sched_setscheduler(0, SCHED_RR, &sparam);

Obviously, this requires root privileges; however, it also prevents
the skipping. I wonder if setting CAP_SYS_NICE on xmms and hacking it
to ignore geteuid() would be safe.
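
For reference, the snippet in a self-contained form (needs root, or
CAP_SYS_NICE, for sched_setscheduler() to succeed):

#include <sched.h>
#include <stdio.h>

int main(void)
{
        struct sched_param sparam;

        sparam.sched_priority = sched_get_priority_max(SCHED_RR);
        if (sched_setscheduler(0, SCHED_RR, &sparam) < 0) {
                perror("sched_setscheduler");
                return 1;
        }
        printf("now SCHED_RR at priority %d\n", sparam.sched_priority);
        return 0;
}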

2003-03-11 01:16:15

by Perez-Gonzalez, Inaky

[permalink] [raw]
Subject: RE: [patch] "HT scheduler", sched-2.5.63-B3

> On Fri, 7 Mar 2003, Perez-Gonzalez, Inaky wrote:
>
> > Question: does this apply also to tasks that are current on another CPU
> > other than the local?
> >
> > Or, can I change the prio of a task that is current on another CPU
> > directly and then set_tsk_need_resched()? Or I also have to unqueue it
> > first?
>
> any task must be unqueued before changing its priority - the runqueues are
> indexed by the priority field. The special case in wakeup is that 'p' is
> not on any runqueues (hence the wakeup ...), so we have a cheap
> opportunity there to fix up priority. But it's not a problem for an
> already running process either - it needs to be requeued.

Thanks, I was suspecting that [when something makes full sense, it is
easy to suspect it :)].

How friendly would you be to a hook into setscheduler()? I have this
same problem with real-time futexes, especially wrt effective priority
in priority-inheritance & prio-protection. Basically, I need to know
right away when a waiting task's priority has been changed so I can
properly reposition it in the waiting list.

As well, whenever I change the priority of an active/running task, it needs
to activate the hooks (for my proof-of-concept I did that manually; however,
it is limited and non-extensible), and I am looking into unfolding
setscheduler() into setscheduler() and do_setscheduler(), the latter taking
the task struct, policy and priority to set, so that it can be called
from inside the kernel. setscheduler() would only do the user-space
handling.
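
A sketch of the split I have in mind (the helper names follow the
surrounding 2.5 scheduler code; the validation and permission checks the
syscall does today are left out, and the exact prio computation is
illustrative):

static int do_setscheduler(task_t *p, int policy, int rt_priority)
{
        unsigned long flags;
        prio_array_t *array;
        runqueue_t *rq;

        rq = task_rq_lock(p, &flags);

        array = p->array;
        if (array)
                deactivate_task(p, rq);

        p->policy = policy;
        p->rt_priority = rt_priority;
        if (policy == SCHED_FIFO || policy == SCHED_RR)
                p->prio = MAX_RT_PRIO - 1 - rt_priority;
        else
                p->prio = p->static_prio;

        if (array) {
                __activate_task(p, rq);
                if (p->prio < rq->curr->prio)
                        resched_task(rq->curr);
        }
        task_rq_unlock(rq, &flags);
        return 0;
}

setscheduler() itself would keep the pid lookup, copy_from_user() of the
sched_param and the permission checks, and then just call this.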

Would that be ok with you?

Iñaky Pérez-González -- Not speaking for Intel -- all opinions are my own
(and my fault)