2003-09-05 17:57:50

by Nick Piggin

Subject: [PATCH] Nick's scheduler policy v12

linux-2.6-npiggin/fs/proc/array.c | 7
linux-2.6-npiggin/include/linux/sched.h | 10
linux-2.6-npiggin/kernel/fork.c | 30 -
linux-2.6-npiggin/kernel/sched.c | 542 +++++++++++++++++++-------------
4 files changed, 342 insertions(+), 247 deletions(-)

diff -puN fs/proc/array.c~rollup fs/proc/array.c
--- linux-2.6/fs/proc/array.c~rollup 2003-09-06 03:44:21.000000000 +1000
+++ linux-2.6-npiggin/fs/proc/array.c 2003-09-06 03:44:41.000000000 +1000
@@ -154,13 +154,18 @@ static inline char * task_state(struct t
read_lock(&tasklist_lock);
buffer += sprintf(buffer,
"State:\t%s\n"
+ "sleep_avg:\t%lu\n"
+ "sleep_time:\t%lu\n"
+ "total_time:\t%lu\n"
"Tgid:\t%d\n"
"Pid:\t%d\n"
"PPid:\t%d\n"
"TracerPid:\t%d\n"
"Uid:\t%d\t%d\t%d\t%d\n"
"Gid:\t%d\t%d\t%d\t%d\n",
- get_task_state(p), p->tgid,
+ get_task_state(p),
+ p->sleep_avg, p->sleep_time, p->total_time,
+ p->tgid,
p->pid, p->pid ? p->real_parent->pid : 0,
p->pid && p->ptrace ? p->parent->pid : 0,
p->uid, p->euid, p->suid, p->fsuid,
diff -puN include/linux/sched.h~rollup include/linux/sched.h
--- linux-2.6/include/linux/sched.h~rollup 2003-09-06 03:44:26.000000000 +1000
+++ linux-2.6-npiggin/include/linux/sched.h 2003-09-06 03:44:41.000000000 +1000
@@ -280,7 +280,7 @@ struct signal_struct {
#define MAX_USER_RT_PRIO 100
#define MAX_RT_PRIO MAX_USER_RT_PRIO

-#define MAX_PRIO (MAX_RT_PRIO + 40)
+#define MAX_PRIO (MAX_RT_PRIO + 60)

/*
* Some day this will be a full-fledged user tracking system..
@@ -339,12 +339,17 @@ struct task_struct {
struct list_head run_list;
prio_array_t *array;

+ /* Scheduler variables follow. kernel/sched.c */
+ unsigned long array_sequence;
+ unsigned long timestamp;
+
+ unsigned long total_time, sleep_time;
unsigned long sleep_avg;
- unsigned long last_run;

unsigned long policy;
cpumask_t cpus_allowed;
unsigned int time_slice, first_time_slice;
+ unsigned int used_slice;

struct list_head tasks;
struct list_head ptrace_children;
@@ -552,6 +557,7 @@ extern int FASTCALL(wake_up_state(struct
extern int FASTCALL(wake_up_process(struct task_struct * tsk));
extern int FASTCALL(wake_up_process_kick(struct task_struct * tsk));
extern void FASTCALL(wake_up_forked_process(struct task_struct * tsk));
+extern void FASTCALL(sched_fork(task_t * p));
extern void FASTCALL(sched_exit(task_t * p));

asmlinkage long sys_wait4(pid_t pid,unsigned int * stat_addr, int options, struct rusage * ru);
diff -puN kernel/fork.c~rollup kernel/fork.c
--- linux-2.6/kernel/fork.c~rollup 2003-09-06 03:44:31.000000000 +1000
+++ linux-2.6-npiggin/kernel/fork.c 2003-09-06 03:44:41.000000000 +1000
@@ -911,33 +911,9 @@ struct task_struct *copy_process(unsigne
p->exit_signal = (clone_flags & CLONE_THREAD) ? -1 : (clone_flags & CSIGNAL);
p->pdeath_signal = 0;

- /*
- * Share the timeslice between parent and child, thus the
- * total amount of pending timeslices in the system doesn't change,
- * resulting in more scheduling fairness.
- */
- local_irq_disable();
- p->time_slice = (current->time_slice + 1) >> 1;
- /*
- * The remainder of the first timeslice might be recovered by
- * the parent if the child exits early enough.
- */
- p->first_time_slice = 1;
- current->time_slice >>= 1;
- p->last_run = jiffies;
- if (!current->time_slice) {
- /*
- * This case is rare, it happens when the parent has only
- * a single jiffy left from its timeslice. Taking the
- * runqueue lock is not a problem.
- */
- current->time_slice = 1;
- preempt_disable();
- scheduler_tick(0, 0);
- local_irq_enable();
- preempt_enable();
- } else
- local_irq_enable();
+ /* Perform scheduler related accounting */
+ sched_fork(p);
+
/*
* Ok, add it to the run-queues and make it
* visible to the rest of the system.
diff -puN kernel/sched.c~rollup kernel/sched.c
--- linux-2.6/kernel/sched.c~rollup 2003-09-06 03:44:35.000000000 +1000
+++ linux-2.6-npiggin/kernel/sched.c 2003-09-06 03:44:41.000000000 +1000
@@ -60,85 +60,56 @@
#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))

/*
- * These are the 'tuning knobs' of the scheduler:
- *
- * Minimum timeslice is 10 msecs, default timeslice is 100 msecs,
- * maximum timeslice is 200 msecs. Timeslices get refilled after
- * they expire.
- */
-#define MIN_TIMESLICE ( 10 * HZ / 1000)
-#define MAX_TIMESLICE (200 * HZ / 1000)
-#define CHILD_PENALTY 50
-#define PARENT_PENALTY 100
-#define EXIT_WEIGHT 3
-#define PRIO_BONUS_RATIO 25
-#define INTERACTIVE_DELTA 2
-#define MAX_SLEEP_AVG (10*HZ)
-#define STARVATION_LIMIT (10*HZ)
-#define NODE_THRESHOLD 125
+ * MIN_TIMESLICE is the timeslice that a minimum priority process gets if there
+ * is a maximum priority process runnable. MAX_TIMESLICE is derived from the
+ * formula in task_timeslice. It cannot be changed here. It is the timeslice
+ * that the maximum priority process will get. Larger timeslices are attainable
+ * by low priority processes, however.
+ */
+#define MIN_TIMESLICE ((1 * HZ / 1000) ? 1 * HZ / 1000 : 1)
+#define MAX_TIMESLICE (60 * MIN_TIMESLICE) /* do not change */
+
+/* Maximum amount of history that will be used to calculate priority */
+#define MAX_SLEEP (HZ)

/*
- * If a task is 'interactive' then we reinsert it in the active
- * array after it has expired its current timeslice. (it will not
- * continue to run immediately, it will still roundrobin with
- * other interactive tasks.)
- *
- * This part scales the interactivity limit depending on niceness.
- *
- * We scale it linearly, offset by the INTERACTIVE_DELTA delta.
- * Here are a few examples of different nice levels:
- *
- * TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0]
- * TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0]
- * TASK_INTERACTIVE( 0): [1,1,1,1,0,0,0,0,0,0,0]
- * TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0]
- * TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0]
- *
- * (the X axis represents the possible -5 ... 0 ... +5 dynamic
- * priority range a task can explore, a value of '1' means the
- * task is rated interactive.)
- *
- * Ie. nice +19 tasks can never get 'interactive' enough to be
- * reinserted into the active array. And only heavily CPU-hog nice -20
- * tasks will be expired. Default nice 0 tasks are somewhere between,
- * it takes some effort for them to get interactive, but it's not
- * too hard.
+ * Maximum effect that 1 block of activity (run/sleep/etc) can have. This
+ * helps discard freak events (eg. SIGSTOP).
*/
+#define MAX_SLEEP_AFFECT (MAX_SLEEP/4)
+#define MAX_RUN_AFFECT (MAX_SLEEP/4)
+#define MAX_WAIT_AFFECT (MAX_RUN_AFFECT/4)

-#define SCALE(v1,v1_max,v2_max) \
- (v1) * (v2_max) / (v1_max)
+/*
+ * The amount of history can be decreased (on fork for example). This puts a
+ * lower bound on it.
+ */
+#define MIN_HISTORY (MAX_SLEEP/2)

-#define DELTA(p) \
- (SCALE(TASK_NICE(p), 40, MAX_USER_PRIO*PRIO_BONUS_RATIO/100) + \
- INTERACTIVE_DELTA)
+/*
+ * SLEEP_FACTOR is a fixed point factor used to scale history tracking things.
+ * In particular: total_time, sleep_time, sleep_avg.
+ */
+#define SLEEP_FACTOR (1024)

-#define TASK_INTERACTIVE(p) \
- ((p)->prio <= (p)->static_prio - DELTA(p))
+#define NODE_THRESHOLD 125

/*
- * BASE_TIMESLICE scales user-nice values [ -20 ... 19 ]
- * to time slice values.
- *
- * The higher a thread's priority, the bigger timeslices
- * it gets during one round of execution. But even the lowest
- * priority thread gets MIN_TIMESLICE worth of execution time.
- *
- * task_timeslice() is the interface that is used by the scheduler.
+ * The scheduler classifies a process as performing one of the following
+ * activities
*/
+#define STIME_SLEEP 1 /* Sleeping */
+#define STIME_RUN 2 /* Using CPU */
+#define STIME_WAIT 3 /* Waiting for CPU */

-#define BASE_TIMESLICE(p) (MIN_TIMESLICE + \
- ((MAX_TIMESLICE - MIN_TIMESLICE) * (MAX_PRIO-1-(p)->static_prio)/(MAX_USER_PRIO - 1)))
-
-static inline unsigned int task_timeslice(task_t *p)
-{
- return BASE_TIMESLICE(p);
-}
+#define TASK_PREEMPTS_CURR(p, rq) \
+ ( (p)->prio < (rq)->curr->prio )

/*
* These are the runqueue data structures:
*/

-#define BITMAP_SIZE ((((MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long))
+#define BITMAP_SIZE ((((MAX_PRIO+7)/8)+sizeof(long)-1)/sizeof(long))

typedef struct runqueue runqueue_t;

@@ -157,7 +128,8 @@ struct prio_array {
*/
struct runqueue {
spinlock_t lock;
- unsigned long nr_running, nr_switches, expired_timestamp,
+ unsigned long array_sequence;
+ unsigned long nr_running, nr_switches,
nr_uninterruptible;
task_t *curr, *idle;
struct mm_struct *prev_mm;
@@ -299,34 +271,102 @@ static inline void enqueue_task(struct t
}

/*
- * effective_prio - return the priority that is based on the static
- * priority but is modified by bonuses/penalties.
- *
- * We scale the actual sleep average [0 .... MAX_SLEEP_AVG]
- * into the -5 ... 0 ... +5 bonus/penalty range.
- *
- * We use 25% of the full 0...39 priority range so that:
- *
- * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs.
- * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks.
+ * add_task_time updates a task @p after @time of doing the specified @type
+ * of activity. See STIME_*. This is used for priority calculation.
+ */
+static void add_task_time(task_t *p, unsigned long time, unsigned long type)
+{
+ unsigned long r;
+ unsigned long max_affect;
+
+ if (time == 0)
+ return;
+
+ if (type == STIME_SLEEP)
+ max_affect = MAX_SLEEP_AFFECT;
+ else if (type == STIME_RUN)
+ max_affect = MAX_RUN_AFFECT;
+ else
+ max_affect = MAX_WAIT_AFFECT;
+
+ if (time > max_affect)
+ time = max_affect;
+
+ r = MAX_SLEEP - time;
+ p->total_time = (r*p->total_time + MAX_SLEEP/2) / MAX_SLEEP;
+ p->sleep_time = (r*p->sleep_time + MAX_SLEEP/2) / MAX_SLEEP;
+
+ if (type != STIME_WAIT) {
+ p->total_time += SLEEP_FACTOR * time;
+ if (type == STIME_SLEEP)
+ p->sleep_time += SLEEP_FACTOR * time;
+
+ p->sleep_avg = (SLEEP_FACTOR * p->sleep_time) / p->total_time;
+ }
+
+ if (p->total_time < SLEEP_FACTOR * MIN_HISTORY) {
+ p->total_time = SLEEP_FACTOR * MIN_HISTORY;
+ p->sleep_time = p->total_time * p->sleep_avg / SLEEP_FACTOR;
+ }
+}
+
+/*
+ * The higher a thread's priority, the bigger timeslices
+ * it gets during one round of execution. But even the lowest
+ * priority thread gets MIN_TIMESLICE worth of execution time.
*
- * Both properties are important to certain workloads.
+ * Timeslices are scaled, so if only low priority processes are running,
+ * they will all get long timeslices.
+ */
+static unsigned int task_timeslice(task_t *p, runqueue_t *rq)
+{
+ int idx, delta;
+ unsigned int base, timeslice;
+
+ if (rt_task(p))
+ return MAX_TIMESLICE;
+
+ idx = min(find_next_bit(rq->active->bitmap, MAX_PRIO, MAX_RT_PRIO),
+ find_next_bit(rq->expired->bitmap, MAX_PRIO, MAX_RT_PRIO));
+ idx = min(idx, p->prio);
+ delta = p->prio - idx;
+
+ /*
+ * This is a bit subtle. The first line establishes a timeslice based
+ * on how far this task is from being the highest priority runnable.
+ * The second line scales this result so low priority tasks will get
+ * big timeslices if higher priority ones are not running.
+ */
+ base = MIN_TIMESLICE * (MAX_USER_PRIO + 1) / (delta + 2);
+ timeslice = base * (USER_PRIO(idx) + 8) / 24;
+
+ if (timeslice <= 0)
+ timeslice = 1;
+
+ return timeslice;
+}
+
+/*
+ * task_priority: calculates a task's priority based on previous running
+ * history (see add_task_time). The priority is just a simple linear function
+ * based on sleep_avg and static_prio.
*/
-static int effective_prio(task_t *p)
+static unsigned long task_priority(task_t *p)
{
int bonus, prio;

if (rt_task(p))
return p->prio;

- bonus = MAX_USER_PRIO*PRIO_BONUS_RATIO*p->sleep_avg/MAX_SLEEP_AVG/100 -
- MAX_USER_PRIO*PRIO_BONUS_RATIO/100/2;
+ bonus = ((MAX_USER_PRIO / 3) * p->sleep_avg + (SLEEP_FACTOR / 2)) / SLEEP_FACTOR;
+ prio = USER_PRIO(p->static_prio) + (MAX_USER_PRIO / 3);

- prio = p->static_prio - bonus;
+ prio = MAX_RT_PRIO + prio - bonus;
if (prio < MAX_RT_PRIO)
prio = MAX_RT_PRIO;
if (prio > MAX_PRIO-1)
prio = MAX_PRIO-1;
+
return prio;
}

@@ -347,34 +387,21 @@ static inline void __activate_task(task_
*/
static inline void activate_task(task_t *p, runqueue_t *rq)
{
- long sleep_time = jiffies - p->last_run - 1;
+ unsigned long now = jiffies;
+ unsigned long sleep = now - p->timestamp;
+ p->timestamp = now;

- if (sleep_time > 0) {
- int sleep_avg;
+ add_task_time(p, sleep, STIME_SLEEP);

- /*
- * This code gives a bonus to interactive tasks.
- *
- * The boost works by updating the 'average sleep time'
- * value here, based on ->last_run. The more time a task
- * spends sleeping, the higher the average gets - and the
- * higher the priority boost gets as well.
- */
- sleep_avg = p->sleep_avg + sleep_time;
+ p->prio = task_priority(p);
+
+ /*
+ * If we have slept through an active/expired array switch, restart
+ * our timeslice too.
+ */
+ if (rq->array_sequence != p->array_sequence)
+ p->used_slice = 0;

- /*
- * 'Overflow' bonus ticks go to the waker as well, so the
- * ticks are not lost. This has the effect of further
- * boosting tasks that are related to maximum-interactive
- * tasks.
- */
- if (sleep_avg > MAX_SLEEP_AVG)
- sleep_avg = MAX_SLEEP_AVG;
- if (p->sleep_avg != sleep_avg) {
- p->sleep_avg = sleep_avg;
- p->prio = effective_prio(p);
- }
- }
__activate_task(p, rq);
}

@@ -383,6 +410,7 @@ static inline void activate_task(task_t
*/
static inline void deactivate_task(struct task_struct *p, runqueue_t *rq)
{
+ p->array_sequence = rq->array_sequence;
nr_running_dec(rq);
if (p->state == TASK_UNINTERRUPTIBLE)
rq->nr_uninterruptible++;
@@ -426,7 +454,7 @@ static inline void resched_task(task_t *
* be called with interrupts off, or it may introduce deadlock with
* smp_call_function() if an IPI is sent by the same process we are
* waiting to become inactive.
*/
void wait_task_inactive(task_t * p)
{
unsigned long flags;
@@ -497,11 +525,9 @@ repeat_lock_task:
}
if (old_state == TASK_UNINTERRUPTIBLE)
rq->nr_uninterruptible--;
- if (sync)
- __activate_task(p, rq);
- else {
- activate_task(p, rq);
- if (p->prio < rq->curr->prio)
+ activate_task(p, rq);
+ if (!sync) {
+ if (TASK_PREEMPTS_CURR(p, rq))
resched_task(rq->curr);
}
success = 1;
@@ -534,36 +560,90 @@ int wake_up_state(task_t *p, unsigned in
}

/*
+ * Perform scheduler related accounting for a newly forked process @p.
+ * @p is forked by current.
+ */
+void sched_fork(task_t *p)
+{
+ unsigned long ts;
+ unsigned long flags;
+ runqueue_t *rq;
+
+ /*
+ * Share the timeslice between parent and child, thus the
+ * total amount of pending timeslices in the system doesn't change,
+ * resulting in more scheduling fairness.
+ */
+ local_irq_disable();
+ p->timestamp = jiffies;
+ rq = task_rq_lock(current, &flags);
+ ts = task_timeslice(current, rq);
+ task_rq_unlock(rq, &flags);
+
+ /*
+ * Share half our timeslice with the child.
+ */
+ p->used_slice = current->used_slice + (ts - current->used_slice) / 2;
+ current->used_slice += (ts - current->used_slice + 1) / 2;
+
+ /*
+ * The remainder of the first timeslice might be recovered by
+ * the parent if the child exits early enough.
+ */
+ p->first_time_slice = 1;
+ if (current->used_slice >= ts) {
+ /*
+ * This case is rare, it happens when the parent has only
+ * a single jiffy left from its timeslice. Taking the
+ * runqueue lock is not a problem.
+ */
+ current->used_slice = ts - 1;
+ preempt_disable();
+ scheduler_tick(0, 0);
+ local_irq_enable();
+ preempt_enable();
+ } else
+ local_irq_enable();
+}
+
+/*
* wake_up_forked_process - wake up a freshly forked process.
*
* This function will do some initial scheduler statistics housekeeping
* that must be done for every newly created process.
*/
-void wake_up_forked_process(task_t * p)
+void wake_up_forked_process(task_t *p)
{
unsigned long flags;
runqueue_t *rq = task_rq_lock(current, &flags);

p->state = TASK_RUNNING;
- /*
- * We decrease the sleep average of forking parents
- * and children as well, to keep max-interactive tasks
- * from forking tasks that are max-interactive.
- */
- current->sleep_avg = current->sleep_avg * PARENT_PENALTY / 100;
- p->sleep_avg = p->sleep_avg * CHILD_PENALTY / 100;
- p->prio = effective_prio(p);
+
set_task_cpu(p, smp_processor_id());

- if (unlikely(!current->array))
- __activate_task(p, rq);
- else {
- p->prio = current->prio;
- list_add_tail(&p->run_list, &current->run_list);
- p->array = current->array;
- p->array->nr_active++;
- nr_running_inc(rq);
+ /*
+ * Get only 1/4 of the parent's history. Limited by MIN_HISTORY.
+ */
+ p->total_time = current->total_time / 4;
+ p->sleep_time = current->sleep_time / 4;
+ p->sleep_avg = current->sleep_avg;
+
+ if (p->total_time < SLEEP_FACTOR * MIN_HISTORY) {
+ p->total_time = SLEEP_FACTOR * MIN_HISTORY;
+ p->sleep_time = p->total_time * p->sleep_avg / SLEEP_FACTOR;
}
+
+ /*
+ * Lose 1/4 sleep_time for forking.
+ */
+ current->sleep_time = 3 * current->sleep_time / 4;
+ if (current->total_time != 0)
+ current->sleep_avg = (SLEEP_FACTOR * current->sleep_time)
+ / current->total_time;
+
+ p->prio = task_priority(p);
+ __activate_task(p, rq);
+
task_rq_unlock(rq, &flags);
}

@@ -581,19 +661,25 @@ void sched_exit(task_t * p)
unsigned long flags;

local_irq_save(flags);
+
+ /* Regain the unused timeslice given to @p by its parent */
if (p->first_time_slice) {
- p->parent->time_slice += p->time_slice;
- if (unlikely(p->parent->time_slice > MAX_TIMESLICE))
- p->parent->time_slice = MAX_TIMESLICE;
+ unsigned long flags;
+ runqueue_t *rq;
+ rq = task_rq_lock(p, &flags);
+ p->parent->used_slice -= task_timeslice(p, rq) - p->used_slice;
+ task_rq_unlock(rq, &flags);
+ }
+
+ /* Apply some penalty to @p's parent if @p used a lot of CPU */
+ if (p->sleep_avg < p->parent->sleep_avg) {
+ add_task_time(p->parent,
+ MAX_SLEEP * (p->parent->sleep_avg - p->sleep_avg)
+ / SLEEP_FACTOR / 2,
+ STIME_RUN);
}
+
local_irq_restore(flags);
- /*
- * If the child was a (relative-) CPU hog then decrease
- * the sleep_avg of the parent as well.
- */
- if (p->sleep_avg < p->parent->sleep_avg)
- p->parent->sleep_avg = (p->parent->sleep_avg * EXIT_WEIGHT +
- p->sleep_avg) / (EXIT_WEIGHT + 1);
}

/**
@@ -959,10 +1045,10 @@ static inline runqueue_t *find_busiest_q
if (likely(!busiest))
goto out;

- *imbalance = (max_load - nr_running) / 2;
+ *imbalance = max_load - nr_running;

/* It needs an at least ~25% imbalance to trigger balancing. */
- if (!idle && (*imbalance < (max_load + 3)/4)) {
+ if (!idle && ((*imbalance)*4 < max_load)) {
busiest = NULL;
goto out;
}
@@ -972,7 +1058,7 @@ static inline runqueue_t *find_busiest_q
* Make sure nothing changed since we checked the
* runqueue length.
*/
- if (busiest->nr_running <= nr_running + 1) {
+ if (busiest->nr_running <= nr_running) {
spin_unlock(&busiest->lock);
busiest = NULL;
}
@@ -995,13 +1081,40 @@ static inline void pull_task(runqueue_t
* Note that idle threads have a prio of MAX_PRIO, for this test
* to be always true for them.
*/
- if (p->prio < this_rq->curr->prio)
+ if (TASK_PREEMPTS_CURR(p, this_rq))
set_need_resched();
- else {
- if (p->prio == this_rq->curr->prio &&
- p->time_slice > this_rq->curr->time_slice)
- set_need_resched();
- }
+}
+
+/*
+ * can_migrate_task
+ * May task @p from runqueue @rq be migrated to @this_cpu?
+ * @idle: Is this_cpu idle
+ * Returns: 1 if @p may be migrated, 0 otherwise.
+ */
+static inline int
+can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu, int idle)
+{
+ unsigned long delta;
+
+ /*
+ * We do not migrate tasks that are:
+ * 1) running (obviously), or
+ * 2) cannot be migrated to this CPU due to cpus_allowed, or
+ * 3) are cache-hot on their current CPU.
+ */
+
+ if (task_running(rq, p))
+ return 0;
+
+ if (!cpu_isset(this_cpu, p->cpus_allowed))
+ return 0;
+
+ /* Aggressive migration if we're idle */
+ delta = jiffies - p->timestamp;
+ if (!idle && (delta <= cache_decay_ticks))
+ return 0;
+
+ return 1;
}

/*
@@ -1025,6 +1138,12 @@ static void load_balance(runqueue_t *thi
goto out;

/*
+ * We only want to steal a number of tasks equal to 1/2 the imbalance,
+ * otherwise we'll just shift the imbalance to the new queue:
+ */
+ imbalance /= 2;
+
+ /*
* We first consider expired tasks. Those will likely not be
* executed in the near future, and they are most likely to
* be cache-cold, thus switching CPUs has the least effect
@@ -1056,27 +1175,17 @@ skip_bitmap:
skip_queue:
tmp = list_entry(curr, task_t, run_list);

- /*
- * We do not migrate tasks that are:
- * 1) running (obviously), or
- * 2) cannot be migrated to this CPU due to cpus_allowed, or
- * 3) are cache-hot on their current CPU.
- */
-
-#define CAN_MIGRATE_TASK(p,rq,this_cpu) \
- ((!idle || (jiffies - (p)->last_run > cache_decay_ticks)) && \
- !task_running(rq, p) && \
- cpu_isset(this_cpu, (p)->cpus_allowed))
-
curr = curr->prev;

- if (!CAN_MIGRATE_TASK(tmp, busiest, this_cpu)) {
+ if (!can_migrate_task(tmp, busiest, this_cpu, idle)) {
if (curr != head)
goto skip_queue;
idx++;
goto skip_bitmap;
}
pull_task(busiest, array, tmp, this_rq, this_cpu);
+
+ /* Only migrate 1 task if we're idle */
if (!idle && --imbalance) {
if (curr != head)
goto skip_queue;
@@ -1171,20 +1280,6 @@ DEFINE_PER_CPU(struct kernel_stat, kstat
EXPORT_PER_CPU_SYMBOL(kstat);

/*
- * We place interactive tasks back into the active array, if possible.
- *
- * To guarantee that this does not starve expired tasks we ignore the
- * interactivity of a task if the first expired task had to wait more
- * than a 'reasonable' amount of time. This deadline timeout is
- * load-dependent, as the frequency of array switched decreases with
- * increasing number of running tasks:
- */
-#define EXPIRED_STARVING(rq) \
- (STARVATION_LIMIT && ((rq)->expired_timestamp && \
- (jiffies - (rq)->expired_timestamp >= \
- STARVATION_LIMIT * ((rq)->nr_running) + 1)))
-
-/*
* This function gets called by the timer code, with HZ frequency.
* We call it with interrupts disabled.
*
@@ -1201,17 +1296,11 @@ void scheduler_tick(int user_ticks, int
if (rcu_pending(cpu))
rcu_check_callbacks(cpu, user_ticks);

- /* note: this timer irq context must be accounted for as well */
- if (hardirq_count() - HARDIRQ_OFFSET) {
- cpustat->irq += sys_ticks;
- sys_ticks = 0;
- } else if (softirq_count()) {
- cpustat->softirq += sys_ticks;
- sys_ticks = 0;
- }
-
if (p == rq->idle) {
- if (atomic_read(&rq->nr_iowait) > 0)
+ /* note: this timer irq context must be accounted for as well */
+ if (irq_count() - HARDIRQ_OFFSET >= SOFTIRQ_OFFSET)
+ cpustat->system += sys_ticks;
+ else if (atomic_read(&rq->nr_iowait) > 0)
cpustat->iowait += sys_ticks;
else
cpustat->idle += sys_ticks;
@@ -1232,43 +1321,39 @@ void scheduler_tick(int user_ticks, int
spin_lock(&rq->lock);
/*
* The task was running during this tick - update the
- * time slice counter and the sleep average. Note: we
- * do not update a thread's priority until it either
- * goes to sleep or uses up its timeslice. This makes
- * it possible for interactive tasks to use up their
- * timeslices at their highest priority levels.
+ * time slice counter. Note: we do not update a thread's
+ * priority until it either goes to sleep or uses up its
+ * timeslice.
*/
- if (p->sleep_avg)
- p->sleep_avg--;
if (unlikely(rt_task(p))) {
/*
* RR tasks need a special form of timeslice management.
* FIFO tasks have no timeslices.
*/
- if ((p->policy == SCHED_RR) && !--p->time_slice) {
- p->time_slice = task_timeslice(p);
- p->first_time_slice = 0;
- set_tsk_need_resched(p);
-
- /* put it at the end of the queue: */
- dequeue_task(p, rq->active);
- enqueue_task(p, rq->active);
+ if (p->policy == SCHED_RR) {
+ p->used_slice++;
+ if (p->used_slice >= task_timeslice(p, rq)) {
+ p->used_slice = 0;
+ p->first_time_slice = 0;
+ set_tsk_need_resched(p);
+
+ /* put it at the end of the queue: */
+ dequeue_task(p, rq->active);
+ enqueue_task(p, rq->active);
+ }
}
goto out_unlock;
}
- if (!--p->time_slice) {
+
+ p->used_slice++;
+ if (p->used_slice >= task_timeslice(p, rq)) {
dequeue_task(p, rq->active);
set_tsk_need_resched(p);
- p->prio = effective_prio(p);
- p->time_slice = task_timeslice(p);
+ p->prio = task_priority(p);
+ p->used_slice = 0;
p->first_time_slice = 0;

- if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) {
- if (!rq->expired_timestamp)
- rq->expired_timestamp = jiffies;
- enqueue_task(p, rq->expired);
- } else
- enqueue_task(p, rq->active);
+ enqueue_task(p, rq->expired);
}
out_unlock:
spin_unlock(&rq->lock);
@@ -1287,6 +1372,8 @@ asmlinkage void schedule(void)
runqueue_t *rq;
prio_array_t *array;
struct list_head *queue;
+ unsigned long now;
+ unsigned long run_time;
int idx;

/*
@@ -1307,7 +1394,11 @@ need_resched:
rq = this_rq();

release_kernel_lock(prev);
- prev->last_run = jiffies;
+ now = jiffies;
+ run_time = now - prev->timestamp;
+
+ add_task_time(prev, run_time, STIME_RUN);
+
spin_lock_irq(&rq->lock);

/*
@@ -1336,7 +1427,6 @@ pick_next_task:
goto pick_next_task;
#endif
next = rq->idle;
- rq->expired_timestamp = 0;
goto switch_tasks;
}

@@ -1345,10 +1435,10 @@ pick_next_task:
/*
* Switch the active and expired arrays.
*/
+ rq->array_sequence++;
rq->active = rq->expired;
rq->expired = array;
array = rq->active;
- rq->expired_timestamp = 0;
}

idx = sched_find_first_bit(array->bitmap);
@@ -1360,7 +1450,10 @@ switch_tasks:
clear_tsk_need_resched(prev);
RCU_qsctr(task_cpu(prev))++;

+ prev->timestamp = now;
if (likely(prev != next)) {
+ add_task_time(next, now - next->timestamp, STIME_WAIT);
+ next->timestamp = now;
rq->nr_switches++;
rq->curr = next;

@@ -1600,6 +1693,7 @@ void set_user_nice(task_t *p, long nice)
unsigned long flags;
prio_array_t *array;
runqueue_t *rq;
+ int old_prio, new_prio, delta;

if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
return;
@@ -1608,6 +1702,12 @@ void set_user_nice(task_t *p, long nice)
* the task might be in the middle of scheduling on another CPU.
*/
rq = task_rq_lock(p, &flags);
+ /*
+ * The RT priorities are set via setscheduler(), but we still
+ * allow the 'normal' nice value to be set - but as expected
+ * it won't have any effect on scheduling until the task
+ * becomes SCHED_NORMAL:
+ */
if (rt_task(p)) {
p->static_prio = NICE_TO_PRIO(nice);
goto out_unlock;
@@ -1615,16 +1715,20 @@ void set_user_nice(task_t *p, long nice)
array = p->array;
if (array)
dequeue_task(p, array);
+
+ old_prio = p->prio;
+ new_prio = NICE_TO_PRIO(nice);
+ delta = new_prio - old_prio;
p->static_prio = NICE_TO_PRIO(nice);
- p->prio = NICE_TO_PRIO(nice);
+ p->prio += delta;
+
if (array) {
enqueue_task(p, array);
/*
- * If the task is running and lowered its priority,
- * or increased its priority then reschedule its CPU:
+ * If the task increased its priority or is running and
+ * lowered its priority, then reschedule its CPU:
*/
- if ((NICE_TO_PRIO(nice) < p->static_prio) ||
- task_running(rq, p))
+ if (delta < 0 || (delta > 0 && task_running(rq, p)))
resched_task(rq->curr);
}
out_unlock:
@@ -2139,6 +2243,8 @@ asmlinkage long sys_sched_rr_get_interva
int retval = -EINVAL;
struct timespec t;
task_t *p;
+ unsigned long flags;
+ runqueue_t *rq;

if (pid < 0)
goto out_nounlock;
@@ -2153,8 +2259,10 @@ asmlinkage long sys_sched_rr_get_interva
if (retval)
goto out_unlock;

+ rq = task_rq_lock(p, &flags);
jiffies_to_timespec(p->policy & SCHED_FIFO ?
- 0 : task_timeslice(p), &t);
+ 0 : task_timeslice(p, rq), &t);
+ task_rq_unlock(rq, &flags);
read_unlock(&tasklist_lock);
retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
out_nounlock:

_
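
For readers following the fixed-point arithmetic, here is a minimal
standalone userspace sketch (not part of the patch) of the history
decay in add_task_time() above. It assumes HZ=1000 (so MAX_SLEEP=1000)
and leaves out the STIME_WAIT case and the MIN_HISTORY floor:

#include <stdio.h>

#define MAX_SLEEP		1000	/* HZ */
#define SLEEP_FACTOR		1024
#define MAX_SLEEP_AFFECT	(MAX_SLEEP/4)

static unsigned long total_time, sleep_time, sleep_avg;

static void add_time(unsigned long time, int is_sleep)
{
	unsigned long r;

	if (time == 0)
		return;
	if (time > MAX_SLEEP_AFFECT)
		time = MAX_SLEEP_AFFECT;	/* cap freak events */

	/* age the old history by (MAX_SLEEP - time)/MAX_SLEEP */
	r = MAX_SLEEP - time;
	total_time = (r*total_time + MAX_SLEEP/2) / MAX_SLEEP;
	sleep_time = (r*sleep_time + MAX_SLEEP/2) / MAX_SLEEP;

	/* account the new block of activity */
	total_time += SLEEP_FACTOR * time;
	if (is_sleep)
		sleep_time += SLEEP_FACTOR * time;

	sleep_avg = SLEEP_FACTOR * sleep_time / total_time;
}

int main(void)
{
	int i;

	for (i = 0; i < 10; i++) {
		add_time(90, 1);	/* sleep 90ms */
		add_time(10, 0);	/* run 10ms */
		printf("round %d: sleep_avg = %lu/1024\n", i, sleep_avg);
	}
	return 0;
}

A task that sleeps 90ms and runs 10ms per round converges to a
sleep_avg of roughly 920/1024, i.e. a 90% sleeper. Fed into
task_priority() above, that works out to a bonus of
(20*920 + 512)/1024 = 18 of a possible 20 priority levels.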


Attachments:
rollup.patch (27.55 kB)

2003-09-05 19:05:16

by Martin J. Bligh

Subject: Re: [PATCH] Nick's scheduler policy v12

> Backboost is gone so X really should be at -10 or even higher.

Wasn't that causing half the problems originally? Boosting X seemed
to starve xmms et al. Or do the interactivity changes fix xmms
somehow, but not X itself? Explicitly fiddling with tasks' priorities
seems flawed to me.

M.

2003-09-05 20:22:37

by Mike Fedyk

Subject: Re: [PATCH] Nick's scheduler policy v12

On Fri, Sep 05, 2003 at 11:54:04AM -0700, Martin J. Bligh wrote:
> > Backboost is gone so X really should be at -10 or even higher.
>
> Wasn't that causing half the problems originally? Boosting X seemed
> to starve xmms et al. Or do the interactivity changes fix xmms
> somehow, but not X itself? Explicitly fiddling with tasks' priorities
> seems flawed to me.

Wasn't it the larger timeslices with lower nice values in stock and Con's
patches that made X with nice -10 a bad idea?

2003-09-05 20:31:19

by Martin J. Bligh

Subject: Re: [PATCH] Nick's scheduler policy v12

> On Fri, Sep 05, 2003 at 11:54:04AM -0700, Martin J. Bligh wrote:
>> > Backboost is gone so X really should be at -10 or even higher.
>>
>> Wasn't that causing half the problems originally? Boosting X seemed
>> to starve xmms et al. Or do the interactivity changes fix xmms
>> somehow, but not X itself? Explicitly fiddling with tasks' priorities
>> seems flawed to me.
>
> Wasn't it the larger timeslices with lower nice values in stock and Con's
> patches that made X with nice -10 a bad idea?

Debian renices X by default to -10 ... I fixed all my desktop interactivity
problems around 2.5.63 timeframe by just turning that off. That was way
before Con's patches.

There may be some more details around this, and I'd love to hear them,
but I fundmantally believe that explitit fiddling with particular
processes because we believe they're somehow magic is wrong (and so
does Linus, from previous discussions).

M.

2003-09-05 20:39:09

by Mike Fedyk

Subject: Re: [PATCH] Nick's scheduler policy v12

On Fri, Sep 05, 2003 at 01:19:24PM -0700, Martin J. Bligh wrote:
> > On Fri, Sep 05, 2003 at 11:54:04AM -0700, Martin J. Bligh wrote:
> >> > Backboost is gone so X really should be at -10 or even higher.
> >>
> >> Wasn't that causing half the problems originally? Boosting X seemed
> >> to starve xmms et al. Or do the interactivity changes fix xmms
> >> somehow, but not X itself? Explicitly fiddling with tasks' priorities
> >> seems flawed to me.
> >
> > Wasn't it the larger timeslices with lower nice values in stock and Con's
> > patches that made X with nice -10 a bad idea?
>
> Debian renices X by default to -10 ... I fixed all my desktop interactivity
> problems around 2.5.63 timeframe by just turning that off. That was way
> before Con's patches.

Exactly. Because the larger time slices for lower nice values came from
O(1), not Con.

>
> There may be some more details around this, and I'd love to hear them,
> but I fundamentally believe that explicit fiddling with particular
> processes because we believe they're somehow magic is wrong (and so
> does Linus, from previous discussions).
>

Linus added a patch to 2.5.65 or so that was supposed to allow nice 0 on X
without any detriment.

2003-09-05 20:58:30

by Robert Love

Subject: Re: [PATCH] Nick's scheduler policy v12

On Fri, 2003-09-05 at 16:39, Mike Fedyk wrote:

> Exactly. Because the larger time slices for lower nice values came from
> O(1), not Con.

The larger timeslices may not help, but one reason why renicing X hurts
multimedia is that it gives a preference to the GUI over the multimedia
thread(s).

Look at it this way. Assume renicing X does not _help_ whatever the
problem is (simply because the problem, in this case, is not stemming
from X). Then giving X the higher priority and larger timeslice only
adversely affects the problem.

So, since the multimedia thread in (say) xmms is really unrelated to X
(its a separate thread and not doing any Xlib calls), it just hurts it.

> Linus added a patch to 2.5.65 or so that was supposed to allow nice 0 on X
> without any detriment.

That is the backboost thing. It was ripped out, due to regressions in
SETI and elsewhere.

Nick added a similar thing back in, but he just removed it. Its a shame
it cannot be made to work, because I like the idea. I talked with Nick
and OLS about the ability of a forward+back boost combination to solve
our interactivity issues. I really wish they panned out.

Robert Love


2003-09-06 01:19:02

by Nick Piggin

Subject: Re: [PATCH] Nick's scheduler policy v12



Martin J. Bligh wrote:

>>On Fri, Sep 05, 2003 at 11:54:04AM -0700, Martin J. Bligh wrote:
>>
>>>>Backboost is gone so X really should be at -10 or even higher.
>>>>
>>>Wasn't that causing half the problems originally? Boosting X seemed
>>>to starve xmms et al. Or do the interactivity changes fix xmms
>>>somehow, but not X itself? Explicitly fiddling with tasks' priorities
>>>seems flawed to me.
>>>
>>Wasn't it the larger timeslices with lower nice values in stock and Con's
>>patches that made X with nice -10 a bad idea?
>>
>
>Debian renices X by default to -10 ... I fixed all my desktop interactivity
>problems around 2.5.63 timeframe by just turning that off. That was way
>before Con's patches.
>

Yep, as Mike mentioned, renicing X causes it to get bigger
timeslices with the stock scheduler. If you had 2 nice -20 processes,
they would each get a timeslice of 200ms, so you're harming their
latency.
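
(For concreteness, plugging into the stock BASE_TIMESLICE formula that
the patch above removes, with MIN_TIMESLICE=10ms, MAX_TIMESLICE=200ms,
MAX_PRIO=140 and MAX_USER_PRIO=40: nice -20 maps to static_prio 100
and gets 10 + 190*(139-100)/39 = 200ms per slice, while nice 0 maps to
static_prio 120 and gets 10 + 190*(139-120)/39 = 102ms.)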

>
>There may be some more details around this, and I'd love to hear them,
>but I fundamentally believe that explicit fiddling with particular
>processes because we believe they're somehow magic is wrong (and so
>does Linus, from previous discussions).
>

Well it would be nice if someone could find out how to do it, but I
think that if we want X to be able to get 80% CPU when 2 other CPU hogs
are running, you have to renice it.


2003-09-06 01:32:24

by Nick Piggin

Subject: Re: [PATCH] Nick's scheduler policy v12



Robert Love wrote:

>On Fri, 2003-09-05 at 16:39, Mike Fedyk wrote:
>
>
>>Exactly. Because the larger time slices for lower nice values came from
>>O(1), not Con.
>>
>
>The larger timeslices may not help, but one reason why renicing X hurts
>multimedia is that it gives a preference to the GUI over the multimedia
>thread(s).
>
>Look at it this way. Assume renicing X does not _help_ whatever the
>problem is (simply because the problem, in this case, is not stemming
>from X). Then giving X the higher priority and larger timeslice only
>adversely affects the problem.
>
>So, since the multimedia thread in (say) xmms is really unrelated to X
>(its a separate thread and not doing any Xlib calls), it just hurts it.
>

Hi Robert,
Yeah you are right. Backboost is sort of an implicit renice though,
except it doesn't always go where you want it or when you want :(

I have found that good scheduling latency is enough to ensure
xmms skips are difficult to produce.


2003-09-06 03:37:25

by Martin J. Bligh

Subject: Re: [PATCH] Nick's scheduler policy v12

> Yep, as Mike mentioned, renicing X causes it to get bigger
> timeslices with the stock scheduler. If you had 2 nice -20 processes,
> they would each get a timeslice of 200ms, so you're harming their
> latency.

Well, if I can be naive for a second (and I'll fully admit I don't
understand the implications of this), there are two things here -
either give it more of a timeslice (bandwidth increase), or make it
more interactive (latency increase). Those two seem to be separable,
but we don't bother. Seems better to pass a more subtle hint to the
scheduler that this is interactive - nice seems like a very large
brick between the eyes.

>> There may be some more details around this, and I'd love to hear them,
>> but I fundamentally believe that explicit fiddling with particular
>> processes because we believe they're somehow magic is wrong (and so
>> does Linus, from previous discussions).
>
> Well it would be nice if someone could find out how to do it, but I
> think that if we want X to be able to get 80% CPU when 2 other CPU hogs
> are running, you have to renice it.

OK. So you renice it ... then your two cpu jobs exit, and you kick off
xmms. Every time you waggle a window, X will steal the cpu back from
xmms, and it'll stall, surely? That's what seemed to happen before.
I don't see how you can fix anything by doing static priority alterations
(eg nice), because the workload changes.

I'm probably missing something ... feel free to slap me ;-)

M.

2003-09-06 06:20:41

by Jörn Engel

Subject: Re: [PATCH] Nick's scheduler policy v12

On Fri, 5 September 2003 20:36:32 -0700, Martin J. Bligh wrote:
>
> Well, if I can be naive for a second (and I'll fully admit I don't
> understand the implications of this), there are two things here -
> either give it more of a timeslice (bandwidth increase), or make it
> more interactive (latency increase). Those two seem to be separable,
> but we don't bother. Seems better to pass a more subtle hint to the
> scheduler that this is interactive - nice seems like a very large
> brick between the eyes.

Just as naïve as you, Martin, but your idea is at least incomplete.
We have to make some processes behave differently, right, but we also
have to detect, which ones they are. Is the user unhappy with gcc
being a bit jerky? No, so gcc is not a problem. X, xmms, xine,
mplayer, quake and a thousand more make the user unhappy, but how do
we detect them and only them? And which ones do we favor if several
are competing?

One way is to let the user tell you, through nice or anything else,
but the user doesn't want to, so that is suboptimal.

Another way is to integrate the GUI into the kernel, so you know which
window is on top, etc. But we surely don't want that.

A third way may be to accept hints from userspace, so that X can tell
us which window is on top. Maybe as a last resort.

What remains is to try to detect the "interactive" processes through
their behaviour and I am not sure if it is possible to always get this
right in the general case.

Detecting the right processes is the real problem, Martin. If you can
do that, the rest is quite simple. And so far, we are quite bad at
it. :(

> I'm probably missing something ... feel free to slap me ;-)

*slap*
Maybe you didn't offer me to, but it still felt good. :)

Jörn

--
Fools ignore complexity. Pragmatists suffer it.
Some can avoid it. Geniuses remove it.
-- Perlis's Programming Proverb #58, SIGPLAN Notices, Sept. 1982

2003-09-06 06:38:19

by Nick Piggin

Subject: Re: [PATCH] Nick's scheduler policy v12



Martin J. Bligh wrote:

>>Yep, as Mike mentioned, renicing X causes it to get bigger
>>timeslices with the stock scheduler. If you had 2 nice -20 processes,
>>they would each get a timeslice of 200ms, so you're harming their
>>latency.
>>
>
>Well, if I can be naive for a second (and I'll fully admit I don't
>understand the implications of this), there are two things here -
>either give it more of a timeslice (bandwidth increase), or make it
>more interactive (latency increase). Those two seem to be separable,
>but we don't bother. Seems better to pass a more subtle hint to the
>scheduler that this is interactive - nice seems like a very large
>brick between the eyes.
>

I think the two will always be related. One means giving a higher
dynamic priority, the other a bigger timeslice. So you want
say gcc to have a 100ms timeslice with lowest scheduling prio,
but X to have a 20ms slice and a very high scheduling priority.

Unfortunately, the way the scheduler currently works, X might
use all its timeslice, then have to wait 100ms for gcc to finish
its. The way I do it is give a small timeslice to high prio tasks,
and lower priority tasks get progressively less.

When _only_ low priority tasks are running, they'll all get long
timeslices.
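
As a rough standalone sketch of that scaling (not the kernel function
itself: it assumes HZ=1000, so MIN_TIMESLICE is a single 1ms tick, and
it works directly in user priorities 0..59 instead of the kernel's
MAX_RT_PRIO-offset values):

#include <stdio.h>

#define MIN_TIMESLICE	1	/* one 1ms tick at HZ=1000 */
#define MAX_USER_PRIO	60

/* 'best' is the best (numerically lowest) runnable user priority */
static unsigned int timeslice(int prio, int best)
{
	int idx = best < prio ? best : prio;
	int delta = prio - idx;
	unsigned int base, ts;

	base = MIN_TIMESLICE * (MAX_USER_PRIO + 1) / (delta + 2);
	ts = base * (idx + 8) / 24;
	return ts ? ts : 1;
}

int main(void)
{
	printf("%ums\n", timeslice(0, 0));	/* high prio alone: 10ms */
	printf("%ums\n", timeslice(40, 0));	/* low prio vs high: 1ms */
	printf("%ums\n", timeslice(40, 40));	/* low prio only: 60ms */
	return 0;
}

So a high priority task runs often but briefly, a low priority task
competing with it gets the 1ms floor, and when only low priority tasks
remain they stretch out toward the 60ms maximum.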

>
>>>There may be some more details around this, and I'd love to hear them,
>>>but I fundamentally believe that explicit fiddling with particular
>>>processes because we believe they're somehow magic is wrong (and so
>>>does Linus, from previous discussions).
>>>
>>Well it would be nice if someone could find out how to do it, but I
>>think that if we want X to be able to get 80% CPU when 2 other CPU hogs
>>are running, you have to renice it.
>>
>
>OK. So you renice it ... then your two cpu jobs exit, and you kick off
>xmms. Every time you waggle a window, X will steal the cpu back from
>xmms, and it'll stall, surely? That's what seemed to happen before.
>I don't see how you can fix anything by doing static priority alterations
>(eg nice), because the workload changes.
>
>I'm probably missing something ... feel free to slap me ;-)
>

OK well just as a rough idea of how mine works: worst case for
xmms is that X is at its highest dynamic priority (and reniced).
xmms will be at its highest dynamic prio, or maybe one or two
below that.

X will get to run for maybe 30ms first, then xmms is allowed 6ms.
That is still 15% CPU. And X soon comes down in priority if it
continues to use a lot of CPU.


2003-09-06 06:55:55

by Nick Piggin

Subject: Re: [PATCH] Nick's scheduler policy v12



Nick Piggin wrote:

>
>
> Martin J. Bligh wrote:
>
>>
>> OK. So you renice it ... then your two cpu jobs exit, and you kick off
>> xmms. Every time you waggle a window, X will steal the cpu back from
>> xmms, and it'll stall, surely? That's what seemed to happen before.
>> I don't see how you can fix anything by doing static priority
>> alterations
>> (eg nice), because the workload changes.
>>
>> I'm probably missing something ... feel free to slap me ;-)
>>
>
> OK well just as a rough idea of how mine works: worst case for
> xmms is that X is at its highest dynamic priority (and reniced).
> xmms will be at its highest dynamic prio, or maybe one or two
> below that.
>
> X will get to run for maybe 30ms first, then xmms is allowed 6ms.
> That is still 15% CPU. And X soon comes down in priority if it
> continues to use a lot of CPU.
>

Backboost is not very different from renicing. It is just an implicit
and much less controlled way of allowing unfairness. And that is not
very different from the interactivity stuff either.


2003-09-06 07:59:05

by Martin Schlemmer

Subject: Re: [PATCH] Nick's scheduler policy v12

On Sat, 2003-09-06 at 05:36, Martin J. Bligh wrote:

> > Well it would be nice if someone could find out how to do it, but I
> > think that if we want X to be able to get 80% CPU when 2 other CPU hogs
> > are running, you have to renice it.
>
> OK. So you renice it ... then your two cpu jobs exit, and you kick off
> xmms. Every time you waggle a window, X will steal the cpu back from
> xmms, and it'll stall, surely? That's what seemed to happen before.
> I don't see how you can fix anything by doing static priority alterations
> (eg nice), because the workload changes.
>

Not here with version 10 of Nick's patch, and X reniced to -10.
I have not had a chance to test v12, but will do tonight when I
get home.


Cheers,

--
Martin Schlemmer


2003-09-06 15:14:38

by Martin J. Bligh

Subject: Re: [PATCH] Nick's scheduler policy v12

> I think the two will always be related. One means giving a higher
> dynamic priority, the other a bigger timeslice. So you want
> say gcc to have a 100ms timeslice with lowest scheduling prio,
> but X to have a 20ms slice and a very high scheduling priority.

Right.

> Unfortunately, the way the scheduler currently works, X might
> use all its timeslice, then have to wait 100ms for gcc to finish
> its. The way I do it is give a small timeslice to high prio tasks,
> and lower priority tasks get progressively less.

If the interactive task uses all its timeslice, then it's not really
very interactive, it's chewing quite a bit of CPU ... presumably in
the common case, these things don't finish their timeslices. I thought
we preempted the running task when a higher prio one woke up, so this
should still work, right?

So it would seem to make sense to boost the prio of an interactive task
*without* increasing the size of its timeslice.

> When _only_ low priority tasks are running, they'll all get long
> timeslices.

That at least makes sense. AFAIK at least the early versions of Con's
stuff made cpu bound jobs' timeslices short even if there were no
interactive jobs. I don't like that (or more relevantly, the benchmarks
don't either ;-)).

> OK well just as a rough idea of how mine works: worst case for
> xmms is that X is at its highest dynamic priority (and reniced).
> xmms will be at its highest dynamic prio, or maybe one or two
> below that.
>
> X will get to run for maybe 30ms first, then xmms is allowed 6ms.
> That is still 15% CPU. And X soon comes down in priority if it
> continues to use a lot of CPU.

If it works in practice, it works, I guess. I just don't see why X
is super special ... are we going to have to renice *all* interactive
tasks in order to get things to work properly?

M.

2003-09-06 15:47:20

by Ed Sweetman

Subject: Re: [PATCH] Nick's scheduler policy v12

Martin J. Bligh wrote:
>>I think the two will always be related. One means giving a higher
>>dynamic priority, the other a bigger timeslice. So you want
>>say gcc to have a 100ms timeslice with lowest scheduling prio,
>>but X to have a 20ms slice and a very high scheduling priority.
>
>
> Right.
>
>
>>Unfortunately, the way the scheduler currently works, X might
>>use all its timeslice, then have to wait 100ms for gcc to finish
>>its. The way I do it is give a small timeslice to high prio tasks,
>>and lower priority tasks get progressively less.
>
>
> If the interactive task uses all its timeslice, then it's not really
> very interactive, it's chewing quite a bit of CPU ... presumably in
> the common case, these things don't finish their timeslices. I thought
> we preempted the running task when a higher prio one woke up, so this
> should still work, right?
>
> So it would seem to make sense to boost the prio of an interactive task
> *without* increasing the size of its timeslice.
>
>
>>When _only_ low priority tasks are running, they'll all get long
>>timeslices.
>
>
> That at least makes sense. AFAIK at least the early versions of Con's
> stuff made cpu bound jobs' timeslices short even if there were no
> interactive jobs. I don't like that (or more relevantly, the benchmarks
> don't either ;-)).
>
>
>>OK well just as a rough idea of how mine works: worst case for
>>xmms is that X is at its highest dynamic priority (and reniced).
>>xmms will be at its highest dynamic prio, or maybe one or two
>>below that.
>>
>>X will get to run for maybe 30ms first, then xmms is allowed 6ms.
>>That is still 15% CPU. And X soon comes down in priority if it
>>continues to use a lot of CPU.
>
>
> If it works in practice, it works, I guess. I just don't see why X
> is super special ... are we going to have to renice *all* interactive
> tasks in order to get things to work properly?
>
> M.


If you don't see why X is super special, then why is xmms even a part of
the discussion? All of this basing scheduling performance on a bloated
wannabe winamp makes as much sense as gauging car performance using a
van. If this was a purely scheduling problem, then why do other
players like alsaplayer and such not suck as bad as xmms when under the
exact same priority and all? At least use something without a frontend
so that you can limit the possibility that the programmers did something
stupid like making decoding dependent on some update to the GUI. xmms was
coded first and foremost to look and work like winamp. Streamlined,
even low latency, performance was not a base goal. And outside of these
two programs, X and some audio player, how are things going to be
affected? Forget having to renice certain processes: if I have a video
player that, say, outputs using X, will the video thread get lowered below
certain other processes because the audio thread is getting a higher
dynamic priority and the video thread uses a lot of cpu (maybe I don't
have hw accel)? What about video players that don't use threads? They
require both a lot of cpu and audio playing performance to stay in sync;
will they be dropped below the priority of other apps?

It's early for me so maybe I'm overreacting. I just don't see the logic
in using a program like xmms to gauge audio playback performance since
its main goal is to be like winamp, not to efficiently play back audio, and
so can be introducing all kinds of performance hits not found in other
players.


2003-09-07 02:35:54

by Martin J. Bligh

Subject: Re: [PATCH] Nick's scheduler policy v12

> If you don't see why X is super special, then why is xmms even a part of
> the discussion?

Because it's an example of why X *isn't* super special - it's just another
interactive app. If we have to explicitly boost every interactive app,
we're screwed.

> All of this basing scheduling performance on a bloated wannabe winamp
> makes as much sense as gauging car performance using a van. If this
> was a purely scheduling problem, then why do other players like
> alsaplayer and such not suck as bad as xmms when under the exact same
> priority and all? At least use something without a frontend so that
> you can limit the possibility that the programmers did something stupid
> like making decoding dependent on some update to the GUI.
> xmms was coded first and foremost to look and work like winamp.
> Streamlined, even low latency, performance was not a base goal.

The reality is that people use xmms, and whilst it may not be the greatest
program known to man, I don't believe it's *that* fundamentally screwed up
that it should skip under normal desktop loads. *Especially* if it worked
fine under 2.4 ;-)

Some people have observed that ALSA drivers seem worse than the older
ones ... I haven't done enough comparisons to be sure that's true, but
maybe they have less buffering or something. More buffering in the kernel
side might get rid of the sound skips.

> And outside of these two programs X and some audio player, how are
> things going to be effected? Forget having to renice certain
> processes, if i have a video player that say, outputs using X will
> the video thread get lowered below certain other processes because
> the audio thread is getting a higher dynamic priority and the video
> thread uses a lot of cpu (maybe i dont have hw accel). What about
> video players that dont use theads, they require both a lot of cpu
> and audio playing performance to stay in sync, will it be dropped
> below the priority of other apps?

Don't think that's really so easy to solve, unless you do the hardware
boost thing. The original focus seemed to be giving a prio boost to
tasks that weren't cpu hogs.

> It's early for me so maybe i'm overreacting. I just dont see the logic
> in using a program like xmms to guage audio playback performance since
> it's main goal is to be like winamp, not efficiently playback audio
> and so can be introducing all kinds of performance hits not found in
> other players.

People should gauge performance by how well we do on the apps they want
to run. If people want to run xmms, they'll be interested in how well
it works. My main objection to benchmarks like this (and window
wiggle tests) is that they're pretty much subjective, and hard to
measure.

Personally, I just use xmms because I've found nothing better, but
the UI does suck. Mostly that's because I'm too idle to look hard (or
I have better things to do) and it's good enough. Here is _not_ the
place for people to tell me what I should use instead ;-)

M.

2003-09-07 03:28:04

by Valdis Klētnieks

Subject: Re: [PATCH] Nick's scheduler policy v12

On Sat, 06 Sep 2003 19:34:58 PDT, "Martin J. Bligh" said:

> Some people have observed that ALSA drivers seem worse than the older
> ones ... I haven't done enough comparisons to be sure that's true, but
> maybe they have less buffering or something. More buffering in the kernel
> side might get rid of the sound skips.

At the expense of additional latency. It really weirds you out when the volume,
tone, and mute buttons have a full second lag ;)

> People should gauge performance by how well we do on the apps they want
> to run. If people want to run xmms, they'll be interested in how well
> it works. My main objection to benchmarks like this (and window
> wiggle tests) is that they're pretty much subjective, and hard to
> measure.

I have to admit on my laptop, I've failed to see much subjective improvement
since O7 or so, as VM and IO issues have predominated since then. The
scheduler is basically powerless and the VM manager only slightly more so when
the *proper* fix is probably stick another 256M in. ;)

Either that, or put the XFree86 4.3.0 server on a severe diet - 'ps aux' is reporting
that as I type, mine has VSZ=384852 and RSS=64724. Throw in a mozilla
with VSZ=122900 and RSS=47316, and you're just waiting for things to
degrade into thrashing.

At least for my laptop, any future performance wins are going to come from
elsewhere than the CPU scheduler - I'll pick reiser4, VM manager, and I/O
manager as top contenders, with "being able to recompile the CPU hogs with xlc"
as a long-shot.. ;)

> Personally, I just use xmms because I've found nothing better, but
> the UI does suck. Mostly that's because I'm too idle to look hard (or
> I have better things to do) and it's good enough.

Amen to that. I point xmms at a bunch of .ogg's, it plays them, and it does so
well enough that I've not gotten peeved enough to go find a replacement.



2003-09-07 04:42:24

by Nick Piggin

Subject: Re: [PATCH] Nick's scheduler policy v12



Martin J. Bligh wrote:

>>All of this basing scheduling performance on a bloated wannabe winamp
>>makes as much sense as gauging car performance using a van. If this
>>was a purely scheduling problem, then why do other players like
>>alsaplayer and such not suck as bad as xmms when under the exact same
>>priority and all? At least use something without a frontend so that
>>you can limit the possibility that the programmers did something stupid
>>like making decoding dependent on some update to the GUI.
>>xmms was coded first and foremost to look and work like winamp.
>>Streamlined, even low latency, performance was not a base goal.
>>
>
>The reality is that people use xmms, and whilst it may not be the greatest
>program known to man, I don't believe it's *that* fundamentally screwed up
>that it should skip under normal desktop loads. *Especially* if it worked
>fine under 2.4 ;-)
>

I agree with Martin here. xmms may not be the smartest music player,
but it's really sad if it skips on a P4 or Athlon.


2003-09-07 04:37:52

by Nick Piggin

Subject: Re: [PATCH] Nick's scheduler policy v12



Martin J. Bligh wrote:

>>I think the two will always be related. One means giving a higher
>>dynamic priority, the other a bigger timeslice. So you want
>>say gcc to have a 100ms timeslice with lowest scheduling prio,
>>but X to have a 20ms slice and a very high scheduling priority.
>>
>
>Right.
>
>
>>Unfortunately, the way the scheduler currently works, X might
>>use all its timeslice, then have to wait 100ms for gcc to finish
>>its. The way I do it is give a small timeslice to high prio tasks,
>>and lower priority tasks get progressively less.
>>
>
>If the interactive task uses all its timeslice, then it's not really
>very interactive, it's chewing quite a bit of CPU ... presumably in
>the common case, these things don't finish their timeslices. I thought
>we preempted the running task when a higher prio one woke up, so this
>should still work, right?
>

No, you are _very_ right about that.

>
>So it would seem to make sense to boost the prio of an interactive task
>*without* increasing the size of its timeslice.
>

Well, what I do is boost their priority and make the timeslices of
non-interactive apps smaller. And sometimes they do need small bursts
of using a lot of cpu.

>
>>When _only_ low priority tasks are running, they'll all get long
>>timeslices.
>>
>
>That at least makes sense. AFAIK at least the early versions of Con's
>stuff made cpu bound jobs' timeslices short even if there were no
>interactive jobs. I don't like that (or more relevantly, the benchmarks
>don't either ;-)).
>
>
>>OK well just as a rough idea of how mine works: worst case for
>>xmms is that X is at its highest dynamic priority (and reniced).
>>xmms will be at its highest dynamic prio, or maybe one or two
>>below that.
>>
>>X will get to run for maybe 30ms first, then xmms is allowed 6ms.
>>That is still 15% CPU. And X soon comes down in priority if it
>>continues to use a lot of CPU.
>>
>
>If it works in practice, it works, I guess. I just don't see why X
>is super special ... are we going to have to renice *all* interactive
>tasks in order to get things to work properly?
>

The reason X is special is that it uses a lot of CPU, and it can be
continually using a lot of CPU, but it is "interactive" - it probably
requires the lowest scheduling latency of any interactive process
because it runs the mouse, screen, keyboard etc, things that obviously
can't make use of much buffering, if any.

If you want X to be treated as any other process, that's fine, use
renice 0. It will be given low priorities when using lots of CPU
though.