2003-08-24 12:36:55

by Nick Piggin

Subject: [PATCH] Nick's scheduler policy

--- archs/linux-2.6/include/linux/sched.h 2003-08-24 18:58:59.000000000 +1000
+++ linux-2.6/include/linux/sched.h 2003-08-24 20:29:37.000000000 +1000
@@ -281,7 +281,9 @@ struct signal_struct {
#define MAX_RT_PRIO MAX_USER_RT_PRIO

#define MAX_PRIO (MAX_RT_PRIO + 40)
-
+
+#define rt_task(p) ((p)->prio < MAX_RT_PRIO)
+
/*
* Some day this will be a full-fledged user tracking system..
*/
@@ -339,12 +341,16 @@ struct task_struct {
struct list_head run_list;
prio_array_t *array;

+ unsigned long array_sequence;
+ unsigned long timestamp;
+
+ unsigned long total_time, sleep_time;
unsigned long sleep_avg;
- unsigned long last_run;

unsigned long policy;
cpumask_t cpus_allowed;
unsigned int time_slice, first_time_slice;
+ unsigned int used_slice;

struct list_head tasks;
struct list_head ptrace_children;
@@ -552,6 +558,7 @@ extern int FASTCALL(wake_up_state(struct
extern int FASTCALL(wake_up_process(struct task_struct * tsk));
extern int FASTCALL(wake_up_process_kick(struct task_struct * tsk));
extern void FASTCALL(wake_up_forked_process(struct task_struct * tsk));
+extern void FASTCALL(sched_fork(task_t * p));
extern void FASTCALL(sched_exit(task_t * p));

asmlinkage long sys_wait4(pid_t pid,unsigned int * stat_addr, int options, struct rusage * ru);
--- archs/linux-2.6/kernel/fork.c 2003-08-24 18:58:59.000000000 +1000
+++ linux-2.6/kernel/fork.c 2003-08-24 20:32:15.000000000 +1000
@@ -911,33 +911,9 @@ struct task_struct *copy_process(unsigne
p->exit_signal = (clone_flags & CLONE_THREAD) ? -1 : (clone_flags & CSIGNAL);
p->pdeath_signal = 0;

- /*
- * Share the timeslice between parent and child, thus the
- * total amount of pending timeslices in the system doesn't change,
- * resulting in more scheduling fairness.
- */
- local_irq_disable();
- p->time_slice = (current->time_slice + 1) >> 1;
- /*
- * The remainder of the first timeslice might be recovered by
- * the parent if the child exits early enough.
- */
- p->first_time_slice = 1;
- current->time_slice >>= 1;
- p->last_run = jiffies;
- if (!current->time_slice) {
- /*
- * This case is rare, it happens when the parent has only
- * a single jiffy left from its timeslice. Taking the
- * runqueue lock is not a problem.
- */
- current->time_slice = 1;
- preempt_disable();
- scheduler_tick(0, 0);
- local_irq_enable();
- preempt_enable();
- } else
- local_irq_enable();
+ /* Perform scheduler related accounting */
+ sched_fork(p);
+
/*
* Ok, add it to the run-queues and make it
* visible to the rest of the system.
--- archs/linux-2.6/kernel/sched.c 2003-08-24 18:58:59.000000000 +1000
+++ linux-2.6/kernel/sched.c 2003-08-24 22:15:30.000000000 +1000
@@ -66,73 +66,17 @@
* maximum timeslice is 200 msecs. Timeslices get refilled after
* they expire.
*/
-#define MIN_TIMESLICE ( 10 * HZ / 1000)
-#define MAX_TIMESLICE (200 * HZ / 1000)
-#define CHILD_PENALTY 50
-#define PARENT_PENALTY 100
-#define EXIT_WEIGHT 3
-#define PRIO_BONUS_RATIO 25
-#define INTERACTIVE_DELTA 2
-#define MAX_SLEEP_AVG (10*HZ)
-#define STARVATION_LIMIT (10*HZ)
-#define NODE_THRESHOLD 125
-
-/*
- * If a task is 'interactive' then we reinsert it in the active
- * array after it has expired its current timeslice. (it will not
- * continue to run immediately, it will still roundrobin with
- * other interactive tasks.)
- *
- * This part scales the interactivity limit depending on niceness.
- *
- * We scale it linearly, offset by the INTERACTIVE_DELTA delta.
- * Here are a few examples of different nice levels:
- *
- * TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0]
- * TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0]
- * TASK_INTERACTIVE( 0): [1,1,1,1,0,0,0,0,0,0,0]
- * TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0]
- * TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0]
- *
- * (the X axis represents the possible -5 ... 0 ... +5 dynamic
- * priority range a task can explore, a value of '1' means the
- * task is rated interactive.)
- *
- * Ie. nice +19 tasks can never get 'interactive' enough to be
- * reinserted into the active array. And only heavily CPU-hog nice -20
- * tasks will be expired. Default nice 0 tasks are somewhere between,
- * it takes some effort for them to get interactive, but it's not
- * too hard.
- */
-
-#define SCALE(v1,v1_max,v2_max) \
- (v1) * (v2_max) / (v1_max)
+#define MIN_TIMESLICE ((1 * HZ / 1000) ? 1 * HZ / 1000 : 1)
+#define MAX_TIMESLICE (20 * MIN_TIMESLICE) /* This cannot be changed */

-#define DELTA(p) \
- (SCALE(TASK_NICE(p), 40, MAX_USER_PRIO*PRIO_BONUS_RATIO/100) + \
- INTERACTIVE_DELTA)
+#define MAX_SLEEP (HZ)

-#define TASK_INTERACTIVE(p) \
- ((p)->prio <= (p)->static_prio - DELTA(p))
-
-/*
- * BASE_TIMESLICE scales user-nice values [ -20 ... 19 ]
- * to time slice values.
- *
- * The higher a thread's priority, the bigger timeslices
- * it gets during one round of execution. But even the lowest
- * priority thread gets MIN_TIMESLICE worth of execution time.
- *
- * task_timeslice() is the interface that is used by the scheduler.
- */
-
-#define BASE_TIMESLICE(p) (MIN_TIMESLICE + \
- ((MAX_TIMESLICE - MIN_TIMESLICE) * (MAX_PRIO-1-(p)->static_prio)/(MAX_USER_PRIO - 1)))
+#define NODE_THRESHOLD 125

-static inline unsigned int task_timeslice(task_t *p)
-{
- return BASE_TIMESLICE(p);
-}
+#define TASK_PREEMPTS_CURR(p, rq) \
+ ( (p)->prio < (rq)->curr->prio \
+ || ((p)->prio == (rq)->curr->prio \
+ && (p)->static_prio < (rq)->curr->static_prio) )

/*
* These are the runqueue data structures:
@@ -157,7 +101,8 @@ struct prio_array {
*/
struct runqueue {
spinlock_t lock;
- unsigned long nr_running, nr_switches, expired_timestamp,
+ unsigned long array_sequence;
+ unsigned long nr_running, nr_switches,
nr_uninterruptible;
task_t *curr, *idle;
struct mm_struct *prev_mm;
@@ -179,7 +124,6 @@ static DEFINE_PER_CPU(struct runqueue, r
#define this_rq() (&__get_cpu_var(runqueues))
#define task_rq(p) cpu_rq(task_cpu(p))
#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
-#define rt_task(p) ((p)->prio < MAX_RT_PRIO)

/*
* Default context-switch locking:
@@ -298,35 +242,79 @@ static inline void enqueue_task(struct t
p->array = array;
}

+static void add_task_time(task_t *p, unsigned long time, int sleep)
+{
+ if (time == 0)
+ return;
+
+ if (time > MAX_SLEEP) {
+ time = MAX_SLEEP;
+ p->total_time = 0;
+ p->sleep_time = 0;
+ } else {
+ unsigned long r;
+
+ r = MAX_SLEEP - time;
+ p->total_time = (r*p->total_time + MAX_SLEEP/2) / MAX_SLEEP;
+ p->sleep_time = (r*p->sleep_time + MAX_SLEEP/2) / MAX_SLEEP;
+ }
+
+ p->total_time += 1000 * time;
+ if (sleep)
+ p->sleep_time += 1000 * time;
+
+ p->sleep_avg = (1000 * p->sleep_time) / p->total_time;
+}
+
/*
- * effective_prio - return the priority that is based on the static
- * priority but is modified by bonuses/penalties.
- *
- * We scale the actual sleep average [0 .... MAX_SLEEP_AVG]
- * into the -5 ... 0 ... +5 bonus/penalty range.
- *
- * We use 25% of the full 0...39 priority range so that:
- *
- * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs.
- * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks.
- *
- * Both properties are important to certain workloads.
+ * The higher a thread's priority, the bigger timeslices
+ * it gets during one round of execution. But even the lowest
+ * priority thread gets MIN_TIMESLICE worth of execution time.
*/
-static int effective_prio(task_t *p)
+static unsigned int task_timeslice(task_t *p, runqueue_t *rq)
+{
+ int idx, delta;
+ unsigned int base, timeslice;
+
+ if (rt_task(p))
+ return MAX_TIMESLICE;
+
+#if 0
+ unsigned int timeslice = MIN_TIMESLICE +
+ ( (MAX_USER_PRIO - USER_PRIO(p->prio))
+ * (MAX_TIMESLICE - MIN_TIMESLICE) )
+ / MAX_USER_PRIO;
+#endif
+ idx = min(find_next_bit(rq->active->bitmap, MAX_PRIO, MAX_RT_PRIO),
+ find_next_bit(rq->expired->bitmap, MAX_PRIO, MAX_RT_PRIO));
+ idx = min(idx, p->prio);
+ delta = p->prio - idx;
+
+ base = MIN_TIMESLICE * MAX_USER_PRIO / (delta + 1);
+ timeslice = base * (USER_PRIO(idx) + 4) / 4;
+
+ if (timeslice <= 0)
+ timeslice = 1;
+
+ return timeslice;
+}
+
+static unsigned long task_priority(task_t *p)
{
int bonus, prio;

if (rt_task(p))
return p->prio;

- bonus = MAX_USER_PRIO*PRIO_BONUS_RATIO*p->sleep_avg/MAX_SLEEP_AVG/100 -
- MAX_USER_PRIO*PRIO_BONUS_RATIO/100/2;
+ bonus = (MAX_USER_PRIO * p->sleep_avg) / 1000 / 2;
+ prio = USER_PRIO(p->static_prio) / 2 + (MAX_USER_PRIO / 2);

- prio = p->static_prio - bonus;
+ prio = MAX_RT_PRIO + prio - bonus;
if (prio < MAX_RT_PRIO)
prio = MAX_RT_PRIO;
if (prio > MAX_PRIO-1)
prio = MAX_PRIO-1;
+
return prio;
}

@@ -347,34 +335,38 @@ static inline void __activate_task(task_
*/
static inline void activate_task(task_t *p, runqueue_t *rq)
{
- long sleep_time = jiffies - p->last_run - 1;
+ unsigned long sleep = jiffies - p->timestamp;
+ p->timestamp = jiffies;

- if (sleep_time > 0) {
- int sleep_avg;
+ if (sleep > MAX_SLEEP)
+ sleep = MAX_SLEEP;

- /*
- * This code gives a bonus to interactive tasks.
- *
- * The boost works by updating the 'average sleep time'
- * value here, based on ->last_run. The more time a task
- * spends sleeping, the higher the average gets - and the
- * higher the priority boost gets as well.
- */
- sleep_avg = p->sleep_avg + sleep_time;
+ if (!in_interrupt() && current->mm) {
+ unsigned long boost = sleep / 2;
+ add_task_time(current, boost, 1);
+ add_task_time(p, sleep - boost, 1);
+ } else {
+ add_task_time(p, sleep, 1);
+
+ if (in_interrupt())
+ add_task_time(p, sleep / 2, 1);
+ }

- /*
- * 'Overflow' bonus ticks go to the waker as well, so the
- * ticks are not lost. This has the effect of further
- * boosting tasks that are related to maximum-interactive
- * tasks.
- */
- if (sleep_avg > MAX_SLEEP_AVG)
- sleep_avg = MAX_SLEEP_AVG;
- if (p->sleep_avg != sleep_avg) {
- p->sleep_avg = sleep_avg;
- p->prio = effective_prio(p);
- }
+ p->prio = task_priority(p);
+
+ if (rq->array_sequence != p->array_sequence) {
+ p->used_slice = 0;
}
+
+ if (!in_interrupt() && current->mm) {
+ unsigned long steal;
+
+ steal = min((unsigned int)sleep / 2,
+ (task_timeslice(p, rq) - p->used_slice) / 2);
+ p->used_slice += steal;
+ current->used_slice -= steal;
+ }
+
__activate_task(p, rq);
}

@@ -383,6 +375,7 @@ static inline void activate_task(task_t
*/
static inline void deactivate_task(struct task_struct *p, runqueue_t *rq)
{
+ p->array_sequence = rq->array_sequence;
nr_running_dec(rq);
if (p->state == TASK_UNINTERRUPTIBLE)
rq->nr_uninterruptible++;
@@ -426,7 +419,7 @@ static inline void resched_task(task_t *
* be called with interrupts off, or it may introduce deadlock with
* smp_call_function() if an IPI is sent by the same process we are
* waiting to become inactive.
- */
+ n*/
void wait_task_inactive(task_t * p)
{
unsigned long flags;
@@ -497,11 +490,9 @@ repeat_lock_task:
}
if (old_state == TASK_UNINTERRUPTIBLE)
rq->nr_uninterruptible--;
- if (sync)
- __activate_task(p, rq);
- else {
- activate_task(p, rq);
- if (p->prio < rq->curr->prio)
+ activate_task(p, rq);
+ if (!sync) {
+ if (TASK_PREEMPTS_CURR(p, rq))
resched_task(rq->curr);
}
success = 1;
@@ -534,36 +525,74 @@ int wake_up_state(task_t *p, unsigned in
}

/*
+ * Perform scheduler related accounting for a newly forked process @p.
+ * @p is forked by current.
+ */
+void sched_fork(task_t *p)
+{
+ unsigned long ts;
+ unsigned long flags;
+ runqueue_t *rq;
+
+ /*
+ * Share the timeslice between parent and child, thus the
+ * total amount of pending timeslices in the system doesn't change,
+ * resulting in more scheduling fairness.
+ */
+ local_irq_disable();
+ p->timestamp = jiffies;
+ rq = task_rq_lock(current, &flags);
+ ts = task_timeslice(current, rq);
+ task_rq_unlock(rq, &flags);
+ p->used_slice = current->used_slice + (ts - current->used_slice+1) / 2;
+ current->used_slice += (ts - current->used_slice) / 2;
+ /*
+ * The remainder of the first timeslice might be recovered by
+ * the parent if the child exits early enough.
+ */
+ p->first_time_slice = 1;
+ if (current->used_slice >= ts) {
+ /*
+ * This case is rare, it happens when the parent has only
+ * a single jiffy left from its timeslice. Taking the
+ * runqueue lock is not a problem.
+ */
+ current->used_slice = ts + 1;
+ preempt_disable();
+ scheduler_tick(0, 0);
+ local_irq_enable();
+ preempt_enable();
+ } else
+ local_irq_enable();
+}
+
+/*
* wake_up_forked_process - wake up a freshly forked process.
*
* This function will do some initial scheduler statistics housekeeping
* that must be done for every newly created process.
*/
-void wake_up_forked_process(task_t * p)
+void wake_up_forked_process(task_t *p)
{
unsigned long flags;
runqueue_t *rq = task_rq_lock(current, &flags);

p->state = TASK_RUNNING;
- /*
- * We decrease the sleep average of forking parents
- * and children as well, to keep max-interactive tasks
- * from forking tasks that are max-interactive.
- */
- current->sleep_avg = current->sleep_avg * PARENT_PENALTY / 100;
- p->sleep_avg = p->sleep_avg * CHILD_PENALTY / 100;
- p->prio = effective_prio(p);
+
set_task_cpu(p, smp_processor_id());

- if (unlikely(!current->array))
- __activate_task(p, rq);
- else {
- p->prio = current->prio;
- list_add_tail(&p->run_list, &current->run_list);
- p->array = current->array;
- p->array->nr_active++;
- nr_running_inc(rq);
- }
+ p->sleep_time = current->sleep_time / 40;
+ p->total_time = current->total_time / 40;
+ p->sleep_avg = current->sleep_avg;
+
+ current->sleep_time = 2 * (current->sleep_time) / 3;
+ if (current->total_time != 0)
+ current->sleep_avg = (1000 * current->sleep_time)
+ / current->total_time;
+
+ p->prio = task_priority(p);
+ __activate_task(p, rq);
+
task_rq_unlock(rq, &flags);
}

@@ -582,18 +611,13 @@ void sched_exit(task_t * p)

local_irq_save(flags);
if (p->first_time_slice) {
- p->parent->time_slice += p->time_slice;
- if (unlikely(p->parent->time_slice > MAX_TIMESLICE))
- p->parent->time_slice = MAX_TIMESLICE;
+ unsigned long flags;
+ runqueue_t *rq;
+ rq = task_rq_lock(p, &flags);
+ p->parent->used_slice -= task_timeslice(p, rq) - p->used_slice;
+ task_rq_unlock(rq, &flags);
}
local_irq_restore(flags);
- /*
- * If the child was a (relative-) CPU hog then decrease
- * the sleep_avg of the parent as well.
- */
- if (p->sleep_avg < p->parent->sleep_avg)
- p->parent->sleep_avg = (p->parent->sleep_avg * EXIT_WEIGHT +
- p->sleep_avg) / (EXIT_WEIGHT + 1);
}

/**
@@ -995,13 +1019,29 @@ static inline void pull_task(runqueue_t
* Note that idle threads have a prio of MAX_PRIO, for this test
* to be always true for them.
*/
- if (p->prio < this_rq->curr->prio)
+ if (TASK_PREEMPTS_CURR(p, this_rq))
set_need_resched();
- else {
- if (p->prio == this_rq->curr->prio &&
- p->time_slice > this_rq->curr->time_slice)
- set_need_resched();
- }
+}
+
+/*
+ * comment me
+ */
+
+static inline int
+can_migrate_task(task_t *tsk, runqueue_t *rq, int this_cpu, int idle)
+{
+ unsigned long delta;
+
+ if (task_running(rq, tsk))
+ return 0;
+ if (!cpu_isset(this_cpu, tsk->cpus_allowed))
+ return 0;
+
+ delta = jiffies - tsk->timestamp;
+ if (idle && (delta <= cache_decay_ticks))
+ return 0;
+
+ return 1;
}

/*
@@ -1063,14 +1103,9 @@ skip_queue:
* 3) are cache-hot on their current CPU.
*/

-#define CAN_MIGRATE_TASK(p,rq,this_cpu) \
- ((!idle || (jiffies - (p)->last_run > cache_decay_ticks)) && \
- !task_running(rq, p) && \
- cpu_isset(this_cpu, (p)->cpus_allowed))
-
curr = curr->prev;

- if (!CAN_MIGRATE_TASK(tmp, busiest, this_cpu)) {
+ if (!can_migrate_task(tmp, busiest, this_cpu, idle)) {
if (curr != head)
goto skip_queue;
idx++;
@@ -1171,20 +1206,6 @@ DEFINE_PER_CPU(struct kernel_stat, kstat
EXPORT_PER_CPU_SYMBOL(kstat);

/*
- * We place interactive tasks back into the active array, if possible.
- *
- * To guarantee that this does not starve expired tasks we ignore the
- * interactivity of a task if the first expired task had to wait more
- * than a 'reasonable' amount of time. This deadline timeout is
- * load-dependent, as the frequency of array switched decreases with
- * increasing number of running tasks:
- */
-#define EXPIRED_STARVING(rq) \
- (STARVATION_LIMIT && ((rq)->expired_timestamp && \
- (jiffies - (rq)->expired_timestamp >= \
- STARVATION_LIMIT * ((rq)->nr_running) + 1)))
-
-/*
* This function gets called by the timer code, with HZ frequency.
* We call it with interrupts disabled.
*
@@ -1201,17 +1222,11 @@ void scheduler_tick(int user_ticks, int
if (rcu_pending(cpu))
rcu_check_callbacks(cpu, user_ticks);

- /* note: this timer irq context must be accounted for as well */
- if (hardirq_count() - HARDIRQ_OFFSET) {
- cpustat->irq += sys_ticks;
- sys_ticks = 0;
- } else if (softirq_count()) {
- cpustat->softirq += sys_ticks;
- sys_ticks = 0;
- }
-
if (p == rq->idle) {
- if (atomic_read(&rq->nr_iowait) > 0)
+ /* note: this timer irq context must be accounted for as well */
+ if (irq_count() - HARDIRQ_OFFSET >= SOFTIRQ_OFFSET)
+ cpustat->system += sys_ticks;
+ else if (atomic_read(&rq->nr_iowait) > 0)
cpustat->iowait += sys_ticks;
else
cpustat->idle += sys_ticks;
@@ -1232,43 +1247,39 @@ void scheduler_tick(int user_ticks, int
spin_lock(&rq->lock);
/*
* The task was running during this tick - update the
- * time slice counter and the sleep average. Note: we
- * do not update a thread's priority until it either
- * goes to sleep or uses up its timeslice. This makes
- * it possible for interactive tasks to use up their
- * timeslices at their highest priority levels.
+ * time slice counter. Note: we do not update a thread's
+ * priority until it either goes to sleep or uses up its
+ * timeslice.
*/
- if (p->sleep_avg)
- p->sleep_avg--;
if (unlikely(rt_task(p))) {
/*
* RR tasks need a special form of timeslice management.
* FIFO tasks have no timeslices.
*/
- if ((p->policy == SCHED_RR) && !--p->time_slice) {
- p->time_slice = task_timeslice(p);
- p->first_time_slice = 0;
- set_tsk_need_resched(p);
-
- /* put it at the end of the queue: */
- dequeue_task(p, rq->active);
- enqueue_task(p, rq->active);
+ if (p->policy == SCHED_RR) {
+ p->used_slice++;
+ if (p->used_slice >= task_timeslice(p, rq)) {
+ p->used_slice = 0;
+ p->first_time_slice = 0;
+ set_tsk_need_resched(p);
+
+ /* put it at the end of the queue: */
+ dequeue_task(p, rq->active);
+ enqueue_task(p, rq->active);
+ }
}
goto out_unlock;
}
- if (!--p->time_slice) {
+
+ p->used_slice++;
+ if (p->used_slice >= task_timeslice(p, rq)) {
dequeue_task(p, rq->active);
set_tsk_need_resched(p);
- p->prio = effective_prio(p);
- p->time_slice = task_timeslice(p);
+ p->prio = task_priority(p);
+ p->used_slice = 0;
p->first_time_slice = 0;

- if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) {
- if (!rq->expired_timestamp)
- rq->expired_timestamp = jiffies;
- enqueue_task(p, rq->expired);
- } else
- enqueue_task(p, rq->active);
+ enqueue_task(p, rq->expired);
}
out_unlock:
spin_unlock(&rq->lock);
@@ -1287,6 +1298,8 @@ asmlinkage void schedule(void)
runqueue_t *rq;
prio_array_t *array;
struct list_head *queue;
+ unsigned long now;
+ unsigned long run_time;
int idx;

/*
@@ -1307,7 +1320,11 @@ need_resched:
rq = this_rq();

release_kernel_lock(prev);
- prev->last_run = jiffies;
+ now = jiffies;
+ run_time = now - prev->timestamp;
+
+ add_task_time(prev, run_time, 0);
+
spin_lock_irq(&rq->lock);

/*
@@ -1336,7 +1353,6 @@ pick_next_task:
goto pick_next_task;
#endif
next = rq->idle;
- rq->expired_timestamp = 0;
goto switch_tasks;
}

@@ -1345,10 +1361,10 @@ pick_next_task:
/*
* Switch the active and expired arrays.
*/
+ rq->array_sequence++;
rq->active = rq->expired;
rq->expired = array;
array = rq->active;
- rq->expired_timestamp = 0;
}

idx = sched_find_first_bit(array->bitmap);
@@ -1360,7 +1376,9 @@ switch_tasks:
clear_tsk_need_resched(prev);
RCU_qsctr(task_cpu(prev))++;

+ prev->timestamp = now;
if (likely(prev != next)) {
+ next->timestamp = now;
rq->nr_switches++;
rq->curr = next;

@@ -2139,6 +2168,8 @@ asmlinkage long sys_sched_rr_get_interva
int retval = -EINVAL;
struct timespec t;
task_t *p;
+ unsigned long flags;
+ runqueue_t *rq;

if (pid < 0)
goto out_nounlock;
@@ -2153,8 +2184,10 @@ asmlinkage long sys_sched_rr_get_interva
if (retval)
goto out_unlock;

+ rq = task_rq_lock(p, &flags);
jiffies_to_timespec(p->policy & SCHED_FIFO ?
- 0 : task_timeslice(p), &t);
+ 0 : task_timeslice(p, rq), &t);
+ task_rq_unlock(rq, &flags);
read_unlock(&tasklist_lock);
retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
out_nounlock:


Attachments:
sched-policy-6 (20.50 kB)

2003-08-24 14:29:20

by Felipe Alfaro Solana

Subject: Re: [PATCH] Nick's scheduler policy

On Sun, 2003-08-24 at 14:35, Nick Piggin wrote:
> Hi,
> Patch against 2.6.0-test4. It fixes a lot of problems here vs
> previous versions. There aren't really any open issues for me, so
> testers would be welcome.
>
> The big change is more dynamic timeslices, which allows "interactive"
> tasks to get very small timeslices while more compute intensive loads
> can be given bigger timeslices than usual. This works properly with
> nice (niced processes will tend to get bigger timeslices).
>
> I think I have cured test-starve too.

I still haven't found any starvation cases, but forking time when the
system is under heavy load has increased considerably with respect to
vanilla or Con's O18.1int:

1. On a Konsole session, run "while true; do a=2; done"
2. Now, try opening a new Konsole session and you'll see it takes
approximately twice the time it takes when the system is under no load.

Also, renicing X to -20 helps X interactivity, while with Con's patches,
renicing X to -20 makes it feel worse.

2003-08-24 16:55:40

by Martin J. Bligh

Subject: Re: [PATCH] Nick's scheduler policy

Seems to do badly with CPU intensive tasks:

Kernbench: (make -j vmlinux, maximal tasks)
                     Elapsed    System      User       CPU
2.6.0-test3            46.00    115.49    571.94   1494.25
2.6.0-test4-nick       49.37    131.31    611.15   1500.75

Oddly, schedule itself is significantly cheaper, but you seem
to end up with much more expense elsewhere. Thrashing tasks between
CPUs, maybe (esp given the increased user time)? I'll do a proper
baseline against test4, but I don't expect it to be any different, really.

diffprofile {2.6.0-test3,2.6.0-test4-nick}/kernbench/0/profile
12314 7.4% total
3843 16.3% page_remove_rmap
1657 20.8% __d_lookup
1322 9.4% do_anonymous_page
1143 21.5% __copy_to_user_ll
1034 55.3% atomic_dec_and_lock
683 48.4% free_hot_cold_page
669 393.5% filp_close
553 147.5% .text.lock.file_table
484 479.2% file_ra_state_init
409 24.3% buffered_rmqueue
391 24.9% kmem_cache_free
362 11.3% zap_pte_range
304 16.5% path_lookup
247 31.5% pte_alloc_one
237 68.3% pgd_ctor
229 19.0% file_move
224 16.7% link_path_walk
220 57.7% file_kill
188 7.9% do_no_page
164 24.6% __wake_up
162 4.4% find_get_page
146 24.4% generic_file_open
141 87.6% .text.lock.dcache
139 11.5% release_pages
131 51.4% vfs_read
127 40.2% dnotify_parent
...
-144 -4.2% __copy_from_user_ll
-149 -2.3% page_add_rmap
-352 -25.1% schedule
-3469 -6.8% default_idle

2003-08-25 03:00:47

by Nick Piggin

Subject: Re: [PATCH] Nick's scheduler policy



Martin J. Bligh wrote:

>Seems to do badly with CPU intensive tasks:
>
>Kernbench: (make -j vmlinux, maximal tasks)
> Elapsed System User CPU
> 2.6.0-test3 46.00 115.49 571.94 1494.25
> 2.6.0-test4-nick 49.37 131.31 611.15 1500.75
>

Thanks Martin.

>
>Oddly, schedule itself is significantly cheaper, but you seem
>to end up with much more expense elsewhere. Thrashing tasks between
>CPUs, maybe (esp given the increased user time)? I'll do a proper
>baseline against test4, but I don't expect it to be any different, really.
>

Yeah, I'd say most if not all of that would be my fault though. What happens
is that the lowest priority process will get a 1ms timeslice if there
is a highest priority process on the same runqueue, though it will
get, I think, 275ms if there are only other low priority processes
there.

A kernbench probably has enough IO to keep priorities up a bit and
keep timeslices short. The timeslice stuff could probably still use
a bit of tuning. On the other hand, nice -20 processes should get
big timeslices, while other schedulers give them small timeslices.
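
For reference, the scaling Nick describes is the new task_timeslice() logic
in the patch above. Below is a minimal userspace sketch of that formula (not
part of the patch mail; it covers only the SCHED_NORMAL path, assumes HZ=1000
and the stock 2.6 priority layout with MAX_RT_PRIO=100 and MAX_PRIO=140, and
replaces the runqueue bitmap scan with a "best queued priority" parameter):

#include <stdio.h>

#define HZ		1000
#define MAX_RT_PRIO	100
#define MAX_PRIO	(MAX_RT_PRIO + 40)
#define MAX_USER_PRIO	(MAX_PRIO - MAX_RT_PRIO)
#define USER_PRIO(p)	((p) - MAX_RT_PRIO)
#define MIN_TIMESLICE	((1 * HZ / 1000) ? 1 * HZ / 1000 : 1)

/* prio: the task's dynamic priority; best: best (lowest) queued priority */
static unsigned int timeslice(int prio, int best)
{
	int idx = best < prio ? best : prio;
	int delta = prio - idx;		/* distance from the best priority */
	unsigned int base, ts;

	base = MIN_TIMESLICE * MAX_USER_PRIO / (delta + 1);
	ts = base * (USER_PRIO(idx) + 4) / 4;	/* the v7 patch divides by 8 */
	return ts ? ts : 1;
}

int main(void)
{
	/* nice +19 task alone vs. sharing the runqueue with a nice -20 task */
	printf("alone:  %u ticks\n", timeslice(MAX_PRIO - 1, MAX_PRIO - 1));
	printf("shared: %u ticks\n", timeslice(MAX_PRIO - 1, MAX_RT_PRIO));
	return 0;
}

With the v6 divisor this prints hundreds of ticks for the bottom-priority
task when nothing higher is queued, and a single tick as soon as a
top-priority task shares the runqueue, which is the behaviour described
above.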


2003-08-25 03:06:05

by Nick Piggin

Subject: Re: [PATCH] Nick's scheduler policy



Felipe Alfaro Solana wrote:

>On Sun, 2003-08-24 at 14:35, Nick Piggin wrote:
>
>>Hi,
>>Patch against 2.6.0-test4. It fixes a lot of problems here vs
>>previous versions. There aren't really any open issues for me, so
>>testers would be welcome.
>>
>>The big change is more dynamic timeslices, which allows "interactive"
>>tasks to get very small timeslices while more compute intensive loads
>>can be given bigger timeslices than usual. This works properly with
>>nice (niced processes will tend to get bigger timeslices).
>>
>>I think I have cured test-starve too.
>>
>
>I still haven't found any starvation cases, but forking time when the
>system is under heavy load has increased considerably with respect to
>vanilla or Con's O18.1int:
>
>1. On a Konsole session, run "while true; do a=2; done"
>2. Now, try opening a new Konsole session and you'll see it takes
>approximately twice the time it takes when the system is under no load.
>

Yeah, it probably penalises parents and children too much on fork, and
doesn't penalise parents of exiting cpu hogs enough. I have noticed
this too.

>
>Also, renicing X to -20 helps X interactivity, while with Con's patches,
>renicing X to -20 makes it feel worse.
>

renicing IMO is a lot more sane in my patches, although others might
disagree. In Con's patches, when you make X -20, it gets huge timeslices.
In my version, it will get lots of smaller timeslices.

Thanks again for testing.

Nick

2003-08-25 03:27:42

by Randy.Dunlap

Subject: Re: [PATCH] Nick's scheduler policy

> Hi,
> Patch against 2.6.0-test4. It fixes a lot of problems here vs
> previous versions. There aren't really any open issues for me, so
> testers would be welcome.
>
...
>
> On the other hand, I expect the best cases and maybe most usual cases would
> be better on Con's... and Con might have since done some work in the latency
> area.

Has anyone developed a (run-time) scheduler [policy] selector, via
sysctl or sysfs, so that different kernel builds aren't required?

I know that I have heard discussions of this previously.

~Randy



2003-08-25 03:37:11

by Nick Piggin

Subject: Re: [PATCH] Nick's scheduler policy



Randy.Dunlap wrote:

>>Hi,
>>Patch against 2.6.0-test4. It fixes a lot of problems here vs
>>previous versions. There aren't really any open issues for me, so
>>testers would be welcome.
>>
>>
>...
>
>>On the other hand, I expect the best cases and maybe most usual cases would
>>be better on Con's... and Con might have since done some work in the latency
>>area.
>>
>
>Has anyone developed a (run-time) scheduler [policy] selector, via
>sysctl or sysfs, so that different kernel builds aren't required?
>
>I know that I have heard discussions of this previously.
>

Not that I know of. This would probably require an extra layer of
indirection in the standard form of Linux's struct of pointers to
functions, with your standard schedule functions as wrappers.
I think it would be highly unlikely that this would get into a
standard kernel, but might make a nice testing tool...

In fact this might end up being incompatible with architectures
like SPARC... but I'm sure someone could make it work if they really
wanted to.

I think the present boot-time selector (selecting different kernels
at boot) will have to suffice for now :P
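
(For illustration only: the kind of indirection being described would look
something like the sketch below. All of these names are invented; nothing
like this exists in the 2.6 tree. It just shows a per-policy struct of
function pointers, with the standard entry points reduced to wrappers that
call through whichever policy is currently selected.)

#include <stdio.h>

struct task;	/* stand-ins for task_struct and runqueue */
struct rq;

struct sched_policy_ops {
	const char *name;
	void (*tick)(struct task *p, struct rq *rq);
	struct task *(*pick_next)(struct rq *rq);
};

static void nick_tick(struct task *p, struct rq *rq) { }
static struct task *nick_pick_next(struct rq *rq) { return NULL; }

static struct sched_policy_ops nick_policy = {
	.name		= "nick",
	.tick		= nick_tick,
	.pick_next	= nick_pick_next,
};

/* Selected at run time, e.g. from a sysctl or sysfs handler. */
static struct sched_policy_ops *cur_policy = &nick_policy;

/* The existing scheduler entry points become thin wrappers: */
static void policy_scheduler_tick(struct task *p, struct rq *rq)
{
	cur_policy->tick(p, rq);
}

int main(void)
{
	printf("active policy: %s\n", cur_policy->name);
	policy_scheduler_tick(NULL, NULL);
	return 0;
}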


2003-08-25 10:41:55

by Nick Piggin

Subject: [PATCH] Nick's scheduler policy v7

--- linux-2.6/include/linux/sched.h.orig 2003-08-25 20:30:17.000000000 +1000
+++ linux-2.6/include/linux/sched.h 2003-08-25 20:30:24.000000000 +1000
@@ -281,7 +281,9 @@ struct signal_struct {
#define MAX_RT_PRIO MAX_USER_RT_PRIO

#define MAX_PRIO (MAX_RT_PRIO + 40)
-
+
+#define rt_task(p) ((p)->prio < MAX_RT_PRIO)
+
/*
* Some day this will be a full-fledged user tracking system..
*/
@@ -339,12 +341,16 @@ struct task_struct {
struct list_head run_list;
prio_array_t *array;

+ unsigned long array_sequence;
+ unsigned long timestamp;
+
+ unsigned long total_time, sleep_time;
unsigned long sleep_avg;
- unsigned long last_run;

unsigned long policy;
cpumask_t cpus_allowed;
unsigned int time_slice, first_time_slice;
+ unsigned int used_slice;

struct list_head tasks;
struct list_head ptrace_children;
@@ -552,6 +558,7 @@ extern int FASTCALL(wake_up_state(struct
extern int FASTCALL(wake_up_process(struct task_struct * tsk));
extern int FASTCALL(wake_up_process_kick(struct task_struct * tsk));
extern void FASTCALL(wake_up_forked_process(struct task_struct * tsk));
+extern void FASTCALL(sched_fork(task_t * p));
extern void FASTCALL(sched_exit(task_t * p));

asmlinkage long sys_wait4(pid_t pid,unsigned int * stat_addr, int options, struct rusage * ru);
--- linux-2.6/kernel/fork.c.orig 2003-08-25 20:30:13.000000000 +1000
+++ linux-2.6/kernel/fork.c 2003-08-25 20:30:24.000000000 +1000
@@ -911,33 +911,9 @@ struct task_struct *copy_process(unsigne
p->exit_signal = (clone_flags & CLONE_THREAD) ? -1 : (clone_flags & CSIGNAL);
p->pdeath_signal = 0;

- /*
- * Share the timeslice between parent and child, thus the
- * total amount of pending timeslices in the system doesn't change,
- * resulting in more scheduling fairness.
- */
- local_irq_disable();
- p->time_slice = (current->time_slice + 1) >> 1;
- /*
- * The remainder of the first timeslice might be recovered by
- * the parent if the child exits early enough.
- */
- p->first_time_slice = 1;
- current->time_slice >>= 1;
- p->last_run = jiffies;
- if (!current->time_slice) {
- /*
- * This case is rare, it happens when the parent has only
- * a single jiffy left from its timeslice. Taking the
- * runqueue lock is not a problem.
- */
- current->time_slice = 1;
- preempt_disable();
- scheduler_tick(0, 0);
- local_irq_enable();
- preempt_enable();
- } else
- local_irq_enable();
+ /* Perform scheduler related accounting */
+ sched_fork(p);
+
/*
* Ok, add it to the run-queues and make it
* visible to the rest of the system.
--- linux-2.6/kernel/sched.c.orig 2003-08-25 20:30:10.000000000 +1000
+++ linux-2.6/kernel/sched.c 2003-08-25 20:30:24.000000000 +1000
@@ -66,73 +66,17 @@
* maximum timeslice is 200 msecs. Timeslices get refilled after
* they expire.
*/
-#define MIN_TIMESLICE ( 10 * HZ / 1000)
-#define MAX_TIMESLICE (200 * HZ / 1000)
-#define CHILD_PENALTY 50
-#define PARENT_PENALTY 100
-#define EXIT_WEIGHT 3
-#define PRIO_BONUS_RATIO 25
-#define INTERACTIVE_DELTA 2
-#define MAX_SLEEP_AVG (10*HZ)
-#define STARVATION_LIMIT (10*HZ)
-#define NODE_THRESHOLD 125
-
-/*
- * If a task is 'interactive' then we reinsert it in the active
- * array after it has expired its current timeslice. (it will not
- * continue to run immediately, it will still roundrobin with
- * other interactive tasks.)
- *
- * This part scales the interactivity limit depending on niceness.
- *
- * We scale it linearly, offset by the INTERACTIVE_DELTA delta.
- * Here are a few examples of different nice levels:
- *
- * TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0]
- * TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0]
- * TASK_INTERACTIVE( 0): [1,1,1,1,0,0,0,0,0,0,0]
- * TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0]
- * TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0]
- *
- * (the X axis represents the possible -5 ... 0 ... +5 dynamic
- * priority range a task can explore, a value of '1' means the
- * task is rated interactive.)
- *
- * Ie. nice +19 tasks can never get 'interactive' enough to be
- * reinserted into the active array. And only heavily CPU-hog nice -20
- * tasks will be expired. Default nice 0 tasks are somewhere between,
- * it takes some effort for them to get interactive, but it's not
- * too hard.
- */
-
-#define SCALE(v1,v1_max,v2_max) \
- (v1) * (v2_max) / (v1_max)
-
-#define DELTA(p) \
- (SCALE(TASK_NICE(p), 40, MAX_USER_PRIO*PRIO_BONUS_RATIO/100) + \
- INTERACTIVE_DELTA)
-
-#define TASK_INTERACTIVE(p) \
- ((p)->prio <= (p)->static_prio - DELTA(p))
+#define MIN_TIMESLICE ((1 * HZ / 1000) ? 1 * HZ / 1000 : 1)
+#define MAX_TIMESLICE (20 * MIN_TIMESLICE) /* This cannot be changed */

-/*
- * BASE_TIMESLICE scales user-nice values [ -20 ... 19 ]
- * to time slice values.
- *
- * The higher a thread's priority, the bigger timeslices
- * it gets during one round of execution. But even the lowest
- * priority thread gets MIN_TIMESLICE worth of execution time.
- *
- * task_timeslice() is the interface that is used by the scheduler.
- */
+#define MAX_SLEEP (HZ)

-#define BASE_TIMESLICE(p) (MIN_TIMESLICE + \
- ((MAX_TIMESLICE - MIN_TIMESLICE) * (MAX_PRIO-1-(p)->static_prio)/(MAX_USER_PRIO - 1)))
+#define NODE_THRESHOLD 125

-static inline unsigned int task_timeslice(task_t *p)
-{
- return BASE_TIMESLICE(p);
-}
+#define TASK_PREEMPTS_CURR(p, rq) \
+ ( (p)->prio < (rq)->curr->prio \
+ || ((p)->prio == (rq)->curr->prio \
+ && (p)->static_prio < (rq)->curr->static_prio) )

/*
* These are the runqueue data structures:
@@ -157,7 +101,8 @@ struct prio_array {
*/
struct runqueue {
spinlock_t lock;
- unsigned long nr_running, nr_switches, expired_timestamp,
+ unsigned long array_sequence;
+ unsigned long nr_running, nr_switches,
nr_uninterruptible;
task_t *curr, *idle;
struct mm_struct *prev_mm;
@@ -179,7 +124,6 @@ static DEFINE_PER_CPU(struct runqueue, r
#define this_rq() (&__get_cpu_var(runqueues))
#define task_rq(p) cpu_rq(task_cpu(p))
#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
-#define rt_task(p) ((p)->prio < MAX_RT_PRIO)

/*
* Default context-switch locking:
@@ -298,35 +242,73 @@ static inline void enqueue_task(struct t
p->array = array;
}

+static void add_task_time(task_t *p, unsigned long time, int sleep)
+{
+ if (time == 0)
+ return;
+
+ if (time > MAX_SLEEP) {
+ time = MAX_SLEEP;
+ p->total_time = 0;
+ p->sleep_time = 0;
+ } else {
+ unsigned long r;
+
+ r = MAX_SLEEP - time;
+ p->total_time = (r*p->total_time + MAX_SLEEP/2) / MAX_SLEEP;
+ p->sleep_time = (r*p->sleep_time + MAX_SLEEP/2) / MAX_SLEEP;
+ }
+
+ p->total_time += 1000 * time;
+ if (sleep)
+ p->sleep_time += 1000 * time;
+
+ p->sleep_avg = (1000 * p->sleep_time) / p->total_time;
+}
+
/*
- * effective_prio - return the priority that is based on the static
- * priority but is modified by bonuses/penalties.
- *
- * We scale the actual sleep average [0 .... MAX_SLEEP_AVG]
- * into the -5 ... 0 ... +5 bonus/penalty range.
- *
- * We use 25% of the full 0...39 priority range so that:
- *
- * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs.
- * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks.
- *
- * Both properties are important to certain workloads.
+ * The higher a thread's priority, the bigger timeslices
+ * it gets during one round of execution. But even the lowest
+ * priority thread gets MIN_TIMESLICE worth of execution time.
*/
-static int effective_prio(task_t *p)
+static unsigned int task_timeslice(task_t *p, runqueue_t *rq)
+{
+ int idx, delta;
+ unsigned int base, timeslice;
+
+ if (rt_task(p))
+ return MAX_TIMESLICE;
+
+ idx = min(find_next_bit(rq->active->bitmap, MAX_PRIO, MAX_RT_PRIO),
+ find_next_bit(rq->expired->bitmap, MAX_PRIO, MAX_RT_PRIO));
+ idx = min(idx, p->prio);
+ delta = p->prio - idx;
+
+ base = MIN_TIMESLICE * MAX_USER_PRIO / (delta + 1);
+ timeslice = base * (USER_PRIO(idx) + 4) / 8;
+
+ if (timeslice <= 0)
+ timeslice = 1;
+
+ return timeslice;
+}
+
+static unsigned long task_priority(task_t *p)
{
int bonus, prio;

if (rt_task(p))
return p->prio;

- bonus = MAX_USER_PRIO*PRIO_BONUS_RATIO*p->sleep_avg/MAX_SLEEP_AVG/100 -
- MAX_USER_PRIO*PRIO_BONUS_RATIO/100/2;
+ bonus = (MAX_USER_PRIO * p->sleep_avg) / 1000 / 2;
+ prio = USER_PRIO(p->static_prio) / 2 + (MAX_USER_PRIO / 2);

- prio = p->static_prio - bonus;
+ prio = MAX_RT_PRIO + prio - bonus;
if (prio < MAX_RT_PRIO)
prio = MAX_RT_PRIO;
if (prio > MAX_PRIO-1)
prio = MAX_PRIO-1;
+
return prio;
}

@@ -347,34 +329,38 @@ static inline void __activate_task(task_
*/
static inline void activate_task(task_t *p, runqueue_t *rq)
{
- long sleep_time = jiffies - p->last_run - 1;
+ unsigned long sleep = jiffies - p->timestamp;
+ p->timestamp = jiffies;

- if (sleep_time > 0) {
- int sleep_avg;
+ if (sleep > MAX_SLEEP)
+ sleep = MAX_SLEEP;

- /*
- * This code gives a bonus to interactive tasks.
- *
- * The boost works by updating the 'average sleep time'
- * value here, based on ->last_run. The more time a task
- * spends sleeping, the higher the average gets - and the
- * higher the priority boost gets as well.
- */
- sleep_avg = p->sleep_avg + sleep_time;
+ if (!in_interrupt() && current->mm) {
+ unsigned long boost = sleep / 2;
+ add_task_time(current, boost, 1);
+ add_task_time(p, sleep - boost, 1);
+ } else {
+ add_task_time(p, sleep, 1);
+
+ if (in_interrupt())
+ add_task_time(p, sleep / 2, 1);
+ }

- /*
- * 'Overflow' bonus ticks go to the waker as well, so the
- * ticks are not lost. This has the effect of further
- * boosting tasks that are related to maximum-interactive
- * tasks.
- */
- if (sleep_avg > MAX_SLEEP_AVG)
- sleep_avg = MAX_SLEEP_AVG;
- if (p->sleep_avg != sleep_avg) {
- p->sleep_avg = sleep_avg;
- p->prio = effective_prio(p);
- }
+ p->prio = task_priority(p);
+
+ if (rq->array_sequence != p->array_sequence) {
+ p->used_slice = 0;
}
+
+ if (!in_interrupt() && current->mm) {
+ unsigned long steal;
+
+ steal = min((unsigned int)sleep / 2,
+ (task_timeslice(p, rq) - p->used_slice) / 2);
+ p->used_slice += steal;
+ current->used_slice -= steal;
+ }
+
__activate_task(p, rq);
}

@@ -383,6 +369,7 @@ static inline void activate_task(task_t
*/
static inline void deactivate_task(struct task_struct *p, runqueue_t *rq)
{
+ p->array_sequence = rq->array_sequence;
nr_running_dec(rq);
if (p->state == TASK_UNINTERRUPTIBLE)
rq->nr_uninterruptible++;
@@ -426,7 +413,7 @@ static inline void resched_task(task_t *
* be called with interrupts off, or it may introduce deadlock with
* smp_call_function() if an IPI is sent by the same process we are
* waiting to become inactive.
- */
+ n*/
void wait_task_inactive(task_t * p)
{
unsigned long flags;
@@ -497,11 +484,9 @@ repeat_lock_task:
}
if (old_state == TASK_UNINTERRUPTIBLE)
rq->nr_uninterruptible--;
- if (sync)
- __activate_task(p, rq);
- else {
- activate_task(p, rq);
- if (p->prio < rq->curr->prio)
+ activate_task(p, rq);
+ if (!sync) {
+ if (TASK_PREEMPTS_CURR(p, rq))
resched_task(rq->curr);
}
success = 1;
@@ -534,36 +519,74 @@ int wake_up_state(task_t *p, unsigned in
}

/*
+ * Perform scheduler related accounting for a newly forked process @p.
+ * @p is forked by current.
+ */
+void sched_fork(task_t *p)
+{
+ unsigned long ts;
+ unsigned long flags;
+ runqueue_t *rq;
+
+ /*
+ * Share the timeslice between parent and child, thus the
+ * total amount of pending timeslices in the system doesn't change,
+ * resulting in more scheduling fairness.
+ */
+ local_irq_disable();
+ p->timestamp = jiffies;
+ rq = task_rq_lock(current, &flags);
+ ts = task_timeslice(current, rq);
+ task_rq_unlock(rq, &flags);
+ p->used_slice = current->used_slice + (ts - current->used_slice) / 2;
+ current->used_slice += (ts - current->used_slice + 1) / 2;
+ /*
+ * The remainder of the first timeslice might be recovered by
+ * the parent if the child exits early enough.
+ */
+ p->first_time_slice = 1;
+ if (current->used_slice >= ts) {
+ /*
+ * This case is rare, it happens when the parent has only
+ * a single jiffy left from its timeslice. Taking the
+ * runqueue lock is not a problem.
+ */
+ current->used_slice = ts - 1;
+ preempt_disable();
+ scheduler_tick(0, 0);
+ local_irq_enable();
+ preempt_enable();
+ } else
+ local_irq_enable();
+}
+
+/*
* wake_up_forked_process - wake up a freshly forked process.
*
* This function will do some initial scheduler statistics housekeeping
* that must be done for every newly created process.
*/
-void wake_up_forked_process(task_t * p)
+void wake_up_forked_process(task_t *p)
{
unsigned long flags;
runqueue_t *rq = task_rq_lock(current, &flags);

p->state = TASK_RUNNING;
- /*
- * We decrease the sleep average of forking parents
- * and children as well, to keep max-interactive tasks
- * from forking tasks that are max-interactive.
- */
- current->sleep_avg = current->sleep_avg * PARENT_PENALTY / 100;
- p->sleep_avg = p->sleep_avg * CHILD_PENALTY / 100;
- p->prio = effective_prio(p);
+
set_task_cpu(p, smp_processor_id());

- if (unlikely(!current->array))
- __activate_task(p, rq);
- else {
- p->prio = current->prio;
- list_add_tail(&p->run_list, &current->run_list);
- p->array = current->array;
- p->array->nr_active++;
- nr_running_inc(rq);
- }
+ p->sleep_time = current->sleep_time / 10;
+ p->total_time = current->total_time / 20;
+ p->sleep_avg = current->sleep_avg * 2;
+
+ current->sleep_time = 3 * (current->sleep_time) / 4;
+ if (current->total_time != 0)
+ current->sleep_avg = (1000 * current->sleep_time)
+ / current->total_time;
+
+ p->prio = task_priority(p);
+ __activate_task(p, rq);
+
task_rq_unlock(rq, &flags);
}

@@ -582,18 +605,17 @@ void sched_exit(task_t * p)

local_irq_save(flags);
if (p->first_time_slice) {
- p->parent->time_slice += p->time_slice;
- if (unlikely(p->parent->time_slice > MAX_TIMESLICE))
- p->parent->time_slice = MAX_TIMESLICE;
+ unsigned long flags;
+ runqueue_t *rq;
+ rq = task_rq_lock(p, &flags);
+ p->parent->used_slice -= task_timeslice(p, rq) - p->used_slice;
+ task_rq_unlock(rq, &flags);
}
- local_irq_restore(flags);
- /*
- * If the child was a (relative-) CPU hog then decrease
- * the sleep_avg of the parent as well.
- */
+
if (p->sleep_avg < p->parent->sleep_avg)
- p->parent->sleep_avg = (p->parent->sleep_avg * EXIT_WEIGHT +
- p->sleep_avg) / (EXIT_WEIGHT + 1);
+ add_task_time(p->parent, (p->parent->sleep_avg - p->sleep_avg)/2, 0);
+
+ local_irq_restore(flags);
}

/**
@@ -995,13 +1017,29 @@ static inline void pull_task(runqueue_t
* Note that idle threads have a prio of MAX_PRIO, for this test
* to be always true for them.
*/
- if (p->prio < this_rq->curr->prio)
+ if (TASK_PREEMPTS_CURR(p, this_rq))
set_need_resched();
- else {
- if (p->prio == this_rq->curr->prio &&
- p->time_slice > this_rq->curr->time_slice)
- set_need_resched();
- }
+}
+
+/*
+ * comment me
+ */
+
+static inline int
+can_migrate_task(task_t *tsk, runqueue_t *rq, int this_cpu, int idle)
+{
+ unsigned long delta;
+
+ if (task_running(rq, tsk))
+ return 0;
+ if (!cpu_isset(this_cpu, tsk->cpus_allowed))
+ return 0;
+
+ delta = jiffies - tsk->timestamp;
+ if (idle && (delta <= cache_decay_ticks))
+ return 0;
+
+ return 1;
}

/*
@@ -1063,14 +1101,9 @@ skip_queue:
* 3) are cache-hot on their current CPU.
*/

-#define CAN_MIGRATE_TASK(p,rq,this_cpu) \
- ((!idle || (jiffies - (p)->last_run > cache_decay_ticks)) && \
- !task_running(rq, p) && \
- cpu_isset(this_cpu, (p)->cpus_allowed))
-
curr = curr->prev;

- if (!CAN_MIGRATE_TASK(tmp, busiest, this_cpu)) {
+ if (!can_migrate_task(tmp, busiest, this_cpu, idle)) {
if (curr != head)
goto skip_queue;
idx++;
@@ -1171,20 +1204,6 @@ DEFINE_PER_CPU(struct kernel_stat, kstat
EXPORT_PER_CPU_SYMBOL(kstat);

/*
- * We place interactive tasks back into the active array, if possible.
- *
- * To guarantee that this does not starve expired tasks we ignore the
- * interactivity of a task if the first expired task had to wait more
- * than a 'reasonable' amount of time. This deadline timeout is
- * load-dependent, as the frequency of array switched decreases with
- * increasing number of running tasks:
- */
-#define EXPIRED_STARVING(rq) \
- (STARVATION_LIMIT && ((rq)->expired_timestamp && \
- (jiffies - (rq)->expired_timestamp >= \
- STARVATION_LIMIT * ((rq)->nr_running) + 1)))
-
-/*
* This function gets called by the timer code, with HZ frequency.
* We call it with interrupts disabled.
*
@@ -1201,17 +1220,11 @@ void scheduler_tick(int user_ticks, int
if (rcu_pending(cpu))
rcu_check_callbacks(cpu, user_ticks);

- /* note: this timer irq context must be accounted for as well */
- if (hardirq_count() - HARDIRQ_OFFSET) {
- cpustat->irq += sys_ticks;
- sys_ticks = 0;
- } else if (softirq_count()) {
- cpustat->softirq += sys_ticks;
- sys_ticks = 0;
- }
-
if (p == rq->idle) {
- if (atomic_read(&rq->nr_iowait) > 0)
+ /* note: this timer irq context must be accounted for as well */
+ if (irq_count() - HARDIRQ_OFFSET >= SOFTIRQ_OFFSET)
+ cpustat->system += sys_ticks;
+ else if (atomic_read(&rq->nr_iowait) > 0)
cpustat->iowait += sys_ticks;
else
cpustat->idle += sys_ticks;
@@ -1232,43 +1245,39 @@ void scheduler_tick(int user_ticks, int
spin_lock(&rq->lock);
/*
* The task was running during this tick - update the
- * time slice counter and the sleep average. Note: we
- * do not update a thread's priority until it either
- * goes to sleep or uses up its timeslice. This makes
- * it possible for interactive tasks to use up their
- * timeslices at their highest priority levels.
+ * time slice counter. Note: we do not update a thread's
+ * priority until it either goes to sleep or uses up its
+ * timeslice.
*/
- if (p->sleep_avg)
- p->sleep_avg--;
if (unlikely(rt_task(p))) {
/*
* RR tasks need a special form of timeslice management.
* FIFO tasks have no timeslices.
*/
- if ((p->policy == SCHED_RR) && !--p->time_slice) {
- p->time_slice = task_timeslice(p);
- p->first_time_slice = 0;
- set_tsk_need_resched(p);
-
- /* put it at the end of the queue: */
- dequeue_task(p, rq->active);
- enqueue_task(p, rq->active);
+ if (p->policy == SCHED_RR) {
+ p->used_slice++;
+ if (p->used_slice >= task_timeslice(p, rq)) {
+ p->used_slice = 0;
+ p->first_time_slice = 0;
+ set_tsk_need_resched(p);
+
+ /* put it at the end of the queue: */
+ dequeue_task(p, rq->active);
+ enqueue_task(p, rq->active);
+ }
}
goto out_unlock;
}
- if (!--p->time_slice) {
+
+ p->used_slice++;
+ if (p->used_slice >= task_timeslice(p, rq)) {
dequeue_task(p, rq->active);
set_tsk_need_resched(p);
- p->prio = effective_prio(p);
- p->time_slice = task_timeslice(p);
+ p->prio = task_priority(p);
+ p->used_slice = 0;
p->first_time_slice = 0;

- if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) {
- if (!rq->expired_timestamp)
- rq->expired_timestamp = jiffies;
- enqueue_task(p, rq->expired);
- } else
- enqueue_task(p, rq->active);
+ enqueue_task(p, rq->expired);
}
out_unlock:
spin_unlock(&rq->lock);
@@ -1287,6 +1296,8 @@ asmlinkage void schedule(void)
runqueue_t *rq;
prio_array_t *array;
struct list_head *queue;
+ unsigned long now;
+ unsigned long run_time;
int idx;

/*
@@ -1307,7 +1318,11 @@ need_resched:
rq = this_rq();

release_kernel_lock(prev);
- prev->last_run = jiffies;
+ now = jiffies;
+ run_time = now - prev->timestamp;
+
+ add_task_time(prev, run_time, 0);
+
spin_lock_irq(&rq->lock);

/*
@@ -1336,7 +1351,6 @@ pick_next_task:
goto pick_next_task;
#endif
next = rq->idle;
- rq->expired_timestamp = 0;
goto switch_tasks;
}

@@ -1345,10 +1359,10 @@ pick_next_task:
/*
* Switch the active and expired arrays.
*/
+ rq->array_sequence++;
rq->active = rq->expired;
rq->expired = array;
array = rq->active;
- rq->expired_timestamp = 0;
}

idx = sched_find_first_bit(array->bitmap);
@@ -1360,7 +1374,9 @@ switch_tasks:
clear_tsk_need_resched(prev);
RCU_qsctr(task_cpu(prev))++;

+ prev->timestamp = now;
if (likely(prev != next)) {
+ next->timestamp = now;
rq->nr_switches++;
rq->curr = next;

@@ -1600,6 +1616,7 @@ void set_user_nice(task_t *p, long nice)
unsigned long flags;
prio_array_t *array;
runqueue_t *rq;
+ int old_prio, new_prio, delta;

if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
return;
@@ -1608,6 +1625,12 @@ void set_user_nice(task_t *p, long nice)
* the task might be in the middle of scheduling on another CPU.
*/
rq = task_rq_lock(p, &flags);
+ /*
+ * The RT priorities are set via setscheduler(), but we still
+ * allow the 'normal' nice value to be set - but as expected
+ * it wont have any effect on scheduling until the task is
+ * not SCHED_NORMAL:
+ */
if (rt_task(p)) {
p->static_prio = NICE_TO_PRIO(nice);
goto out_unlock;
@@ -1615,16 +1638,20 @@ void set_user_nice(task_t *p, long nice)
array = p->array;
if (array)
dequeue_task(p, array);
+
+ old_prio = p->prio;
+ new_prio = NICE_TO_PRIO(nice);
+ delta = new_prio - old_prio;
p->static_prio = NICE_TO_PRIO(nice);
- p->prio = NICE_TO_PRIO(nice);
+ p->prio += delta;
+
if (array) {
enqueue_task(p, array);
/*
- * If the task is running and lowered its priority,
- * or increased its priority then reschedule its CPU:
+ * If the task increased its priority or is running and
+ * lowered its priority, then reschedule its CPU:
*/
- if ((NICE_TO_PRIO(nice) < p->static_prio) ||
- task_running(rq, p))
+ if (delta < 0 || (delta > 0 && task_running(rq, p)))
resched_task(rq->curr);
}
out_unlock:
@@ -2139,6 +2166,8 @@ asmlinkage long sys_sched_rr_get_interva
int retval = -EINVAL;
struct timespec t;
task_t *p;
+ unsigned long flags;
+ runqueue_t *rq;

if (pid < 0)
goto out_nounlock;
@@ -2153,8 +2182,10 @@ asmlinkage long sys_sched_rr_get_interva
if (retval)
goto out_unlock;

+ rq = task_rq_lock(p, &flags);
jiffies_to_timespec(p->policy & SCHED_FIFO ?
- 0 : task_timeslice(p), &t);
+ 0 : task_timeslice(p, rq), &t);
+ task_rq_unlock(rq, &flags);
read_unlock(&tasklist_lock);
retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
out_nounlock:


Attachments:
sched-policy-7a (21.88 kB)

2003-08-25 11:03:35

by Felipe Alfaro Solana

Subject: Re: [PATCH] Nick's scheduler policy v7

On Mon, 2003-08-25 at 12:41, Nick Piggin wrote:

[...]
> Please test. I'm not getting enough feedback!

I'm testing, I'm testing ;-)
Just give me a few hours to play with it...

2003-08-25 14:36:15

by Måns Rullgård

Subject: Re: [PATCH] Nick's scheduler policy v7

Nick Piggin <[email protected]> writes:

> This one has a few changes. Children now get a priority boost
> on fork, and parents retain more priority after forking a child,
> however exiting CPU hogs will now penalise parents a bit.
>
> Timeslice scaling was tweaked a bit. Oh and remember raising X's
> priority should _help_ interactivity with this patch, and IMO is
> not an unreasonable thing to be doing.
>
> Please test. I'm not getting enough feedback!

OK, if you test my software.

Seriously, though, it seems OK at first glance. I can't reproduce the
XEmacs problems I had with Con's recent versions. I'll have to run it
for a while and see what it seems like.

--
Måns Rullgård
[email protected]

2003-08-25 22:39:15

by Bill Davidsen

Subject: Re: [PATCH] Nick's scheduler policy

On Sun, 24 Aug 2003, Felipe Alfaro Solana wrote:

> On Sun, 2003-08-24 at 14:35, Nick Piggin wrote:
> > Hi,
> > Patch against 2.6.0-test4. It fixes a lot of problems here vs
> > previous versions. There aren't really any open issues for me, so
> > testers would be welcome.
> >
> > The big change is more dynamic timeslices, which allows "interactive"
> > tasks to get very small timeslices while more compute intensive loads
> > can be given bigger timeslices than usual. This works properly with
> > nice (niced processes will tend to get bigger timeslices).
> >
> > I think I have cured test-starve too.
>
> I still haven't found any starvation cases, but forking time when the
> system is under heavy load has increased considerably with respect to
> vanilla or Con's O18.1int:

Not having starvation cases may be worth a little overhead. I am more
concerned about avoiding "jackpot cases" than throughput, at least on a
desktop.

--
bill davidsen <[email protected]>
CTO, TMR Associates, Inc
Doing interesting things with little computers since 1979.

2003-08-26 03:16:46

by Mike Fedyk

Subject: Re: [PATCH] Nick's scheduler policy

On Mon, Aug 25, 2003 at 01:36:25PM +1000, Nick Piggin wrote:
>
>
> Randy.Dunlap wrote:
> >Has anyone developed a (run-time) scheduler [policy] selector, via
> >sysctl or sysfs, so that different kernel builds aren't required?
> >
> >I know that I have heard discussions of this previously.
> >
>
> Not that I know of. This would probably require an extra layer of
...
> In fact this might end up being incompatible with architectures
> like SPARC... but I'm sure someone could make it work if they really
> wanted to.
>

Why wouldn't it work on sparc?

2003-08-26 03:24:49

by Martin J. Bligh

Subject: Re: [PATCH] Nick's scheduler policy v7

> I didn't miss 5 revisions, I'll just stick to using my internal
> numbering for releases.
>
> This one has a few changes. Children now get a priority boost
> on fork, and parents retain more priority after forking a child,
> however exiting CPU hogs will now penalise parents a bit.
>
> Timeslice scaling was tweaked a bit. Oh and remember raising X's
> priority should _help_ interactivity with this patch, and IMO is
> not an unreasonable thing to be doing.
>
> Please test. I'm not getting enough feedback!

Well, it's actually a bit faster than either mainline or your previous
rev whilst running SDET:

SDET 128 (see disclaimer)
                      Throughput   Std. Dev
2.6.0-test4               100.0%       0.3%
2.6.0-test4-nick          102.9%       0.3%
2.6.0-test4-nick7a        105.1%       0.5%

But kernbench is significantly slower. The increase in sys time has
dropped from last time, but user time is up.

Kernbench: (make -j vmlinux, maximal tasks)
                     Elapsed    System      User       CPU
2.6.0-test4            45.87    116.92    571.10   1499.00
2.6.0-test4-nick       49.37    131.31    611.15   1500.75
2.6.0-test4-nick7a     49.48    125.95    617.71   1502.00

diffprofile {2.6.0-test4,2.6.0-test4-nick7a}/kernbench/0/profile

13989 8.6% total
4402 9.6% default_idle
3385 14.4% page_remove_rmap
1093 13.8% __d_lookup
702 5.0% do_anonymous_page
613 11.5% __copy_to_user_ll
613 32.9% atomic_dec_and_lock
565 40.9% free_hot_cold_page
322 18.7% buffered_rmqueue
296 9.4% zap_pte_range
282 75.6% .text.lock.file_table
185 11.4% kmem_cache_free
183 9.8% path_lookup
164 12.2% link_path_walk
154 12.7% release_pages
152 43.1% pgd_ctor
127 33.0% file_kill
126 15.8% pte_alloc_one
123 10.4% file_move
107 75.4% .text.lock.dcache
...
-59 -9.5% copy_process
-94 -22.5% release_task
-146 -2.3% page_add_rmap
-352 -24.7% schedule
-1026 -29.4% __copy_from_user_ll

Not sure why you're beating up on rmap so much more from a scheduler
change.

diffprofile {2.6.0-test4,2.6.0-test4-nick7a}/sdetbench/128/profile
513 19.1% .text.lock.filemap
246 2.6% find_get_page
150 4.4% copy_mm
86 46.0% try_to_wake_up
82 24.1% kunmap_high
76 0.0% sched_fork
74 0.5% copy_page_range
67 1.1% do_no_page
54 17.3% __pagevec_lru_add_active
53 10.2% radix_tree_lookup
51 7.4% __wake_up
...
-101 -8.6% __block_prepare_write
-105 -65.2% release_blocks
-108 -4.7% link_path_walk
-112 -15.2% mmgrab
-116 -5.5% buffered_rmqueue
-118 -5.9% path_release
-119 -2.9% do_wp_page
-125 -3.9% pte_alloc_one
-125 -15.6% proc_pid_status
-127 -5.5% free_hot_cold_page
-132 -10.2% exit_notify
-138 -11.7% __read_lock_failed
-146 -9.7% number
-154 -28.5% proc_check_root
-155 -20.9% proc_root_link
-176 -10.3% d_alloc
-179 -13.6% task_mem
-186 -9.8% .text.lock.dcache
-186 -7.6% proc_pid_stat
-193 -11.1% ext2_new_inode
-230 -4.0% kmem_cache_free
-239 -8.2% .text.lock.dec_and_lock
-244 -11.5% schedule
-250 -38.3% __blk_queue_bounce
-257 -15.0% current_kernel_time
-307 -17.6% release_task
-327 -1.8% zap_pte_range
-338 -7.7% clear_page_tables
-384 -20.7% lookup_mnt
-406 -26.5% __find_get_block
-412 -18.5% follow_mount
-565 -9.7% path_lookup
-729 -11.6% atomic_dec_and_lock
-865 -46.1% grab_block
-1185 -10.5% __d_lookup
-2145 -0.5% default_idle
-2786 -7.0% page_add_rmap
-12702 -14.2% page_remove_rmap
-29467 -3.8% total

Again, rmap and dlookup. Very odd. Some sort of locality thing, I guess.

M.

2003-08-26 04:04:51

by Nick Piggin

Subject: Re: [PATCH] Nick's scheduler policy v7

Martin J. Bligh wrote:

>>I didn't miss 5 revisions, I'll just stick to using my internal
>>numbering for releases.
>>
>>This one has a few changes. Children now get a priority boost
>>on fork, and parents retain more priority after forking a child,
>>however exiting CPU hogs will now penalise parents a bit.
>>
>>Timeslice scaling was tweaked a bit. Oh and remember raising X's
>>priority should _help_ interactivity with this patch, and IMO is
>>not an unreasonable thing to be doing.
>>
>>Please test. I'm not getting enough feedback!
>>
>
>Well, it's actually a bit faster than either mainline or your previous
>rev whilst running SDET:
>
>SDET 128 (see disclaimer)
> Throughput Std. Dev
> 2.6.0-test4 100.0% 0.3%
> 2.6.0-test4-nick 102.9% 0.3%
> 2.6.0-test4-nick7a 105.1% 0.5%
>
>But kernbench is significantly slower. The increase in sys time has
>dropped from last time, but user time is up.
>
>Kernbench: (make -j vmlinux, maximal tasks)
> Elapsed System User CPU
> 2.6.0-test4 45.87 116.92 571.10 1499.00
> 2.6.0-test4-nick 49.37 131.31 611.15 1500.75
> 2.6.0-test4-nick7a 49.48 125.95 617.71 1502.00
>

Thanks Martin. OK, so the drop in kernbench is quite likely to be what
I thought - elevated priorities (caused by e.g. make waiting for children)
causing timeslices to shrink. As long as it's not a fundamental problem,
it should be possible to tweak this back.

Yeah, I guess the random kernel and user times are probably due to cache.



2003-08-26 09:45:24

by Yaroslav Rastrigin

Subject: Re: [PATCH] Nick's scheduler policy v7

Hi !
>
> This one has a few changes. Children now get a priority boost
> on fork, and parents retain more priority after forking a child,
> however exiting CPU hogs will now penalise parents a bit.
>
> Timeslice scaling was tweaked a bit. Oh and remember raising X's
> priority should _help_ interactivity with this patch, and IMO is
> not an unreasonable thing to be doing.
>
> Please test. I'm not getting enough feedback!
And here's my report (almost no numbers, everything is purely subjective).
I haven't tested Con's O18.1 yet, so my comparison is against -test4 vanilla.
First (and most subjective) opinion - great. Under casual load (Opera
rendering a page and using its motifwrapper to handle flash (it's a CPU hog
alone) + occasional compilation + XMMS using ALSA) everything feels very
smooth _and_ responsive, no XMMS jerks, windows are moving nicely - X is
definitely not starved, ps ax in xterm displays its output almost instantly,
app startup time is also very pleasantly low - all in all great.
Unusual load (make -j 4 bzImage + aforementioned activity) - I was able to
notice 1 jerk in XMMS (wasn't able to reproduce, so this is acceptable),
application startup time is slightly worse, but nonetheless usable, and once
started, all apps are quite smooth, but X is definitely starved, and this has
a great impact on the WM - window movement is jerky and feels bad. However,
renice -20 <X pid> cures this behavior completely, without any noticeable
penalty on other apps - music is playing nicely, page rendering is still
clean and nice.

Overall - very nice.
I'll stick to your scheduling policy for a while.

System specs:

IBM ThinkPad T21, PIII-800MHz, 256MB.
Linux 2.6.0-test4, APM, ALSA, anticipatory IO scheduling
XFree 4.3.99.9

P.S. make clean && make -j 4 bzImage completed while I was writing this
letter, so I'm assuming throughput is also OK for me.

--
With all the best, yarick at relex dot ru.

2003-08-27 09:24:15

by Mike Galbraith

Subject: Re: [PATCH] Nick's scheduler policy v7

At 08:41 PM 8/25/2003 +1000, Nick Piggin wrote:
>Hi,

Greetings,

>I didn't miss 5 revisions, I'll just stick to using my internal
>numbering for releases.
>
>This one has a few changes. Children now get a priority boost
>on fork, and parents retain more priority after forking a child,
>however exiting CPU hogs will now penalise parents a bit.
>
>Timeslice scaling was tweaked a bit. Oh and remember raising X's
>priority should _help_ interactivity with this patch, and IMO is
>not an unreasonable thing to be doing.
>
>Please test. I'm not getting enough feedback!

Heavy parallel make throughput is still down a little over 10%, but X
choppiness is markedly improved. Test-starve is now working. One thing
that I noticed is that irman takes quite a bit longer to complete than with
stock. I've attached the results of that, plus some contest numbers. My
local variant of contest has a couple of differences to stock: dbench
doesn't run so many instances, and list_load is just a small tree being
md5summed. There are two additional loads as well. ab_load is an ancient
apache bench jabbering with an also ancient apache (boring static page via
localhost)... you can guess what irman2_load is :)

-Mike


Attachments:
xx (5.15 kB)