Given how frequently schedule() is called, there is room to improve it.
In the task-priority recalculation step, it first dequeues the task from
one priority queue, calls recalc_task_prio(), then enqueues the task into
another priority queue. However, statistics show that only around 0.5% of
recalculations change the task's priority (see below). Since the remaining
99.5% of recalculations do not change the priority, it is reasonable to use
requeue_task() and avoid the overhead of a dequeue and an enqueue on the
same priority queue.
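In other words, in the common case the task only needs to be moved to the
tail of the queue it is already on. A condensed sketch of the change
(schedstat counters omitted; the actual diff is below):

	/* current code in schedule(): always a full dequeue + enqueue */
	array = next->array;
	dequeue_task(next, array);
	recalc_task_prio(next, next->timestamp + delta);
	enqueue_task(next, array);

	/* proposed: only switch queues when the priority actually changed */
	array = next->array;
	prio = next->prio;
	recalc_task_prio(next, next->timestamp + delta);
	if (unlikely(prio != next->prio))
		change_queue_task(next, array, prio);	/* ~0.5% of the time */
	else
		requeue_task(next, array);		/* ~99.5% of the time */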
The patch below implements this idea. Note that a new helper function,
change_queue_task(), combines dequeue() and enqueue() to save one function
call of overhead. Two statistics fields, sched_prio_changed and
sched_prio_unchanged, are added to provide data on the priority
recalculations.
Thanks,
chen
[email protected]
/*===== Statistics ===== */
The statistics were collected on an Intel x86 machine with two 1.8 GHz
Xeon processors, hyperthreading enabled.
        prio_changed  prio_unchanged  sched_cnt
CPU0    109           22743           59123
CPU1    120           23733           60407
CPU2     73           29981           86153
CPU3     96           22050           53094
/*===== Patch <linux-2.6.11.10> kernel/sched.c =====*/
--- linux-2.6.11.10.orig/kernel/sched.c 2005-05-16 10:51:53.000000000 -0700
+++ linux-2.6.11.10/kernel/sched.c 2005-05-18 22:31:32.000000000 -0700
@@ -249,6 +249,8 @@
unsigned long sched_noswitch;
unsigned long sched_switch;
unsigned long sched_cnt;
+ unsigned long sched_prio_changed;
+ unsigned long sched_prio_unchanged;
unsigned long sched_goidle;
/* pull_task() stats */
@@ -347,12 +349,20 @@
/* runqueue-specific stats */
seq_printf(seq,
- "cpu%d %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu "
- "%lu %lu %lu %lu %lu %lu %lu %lu %lu %lu",
+ "cpu%d \n\tyld: both(%lu) act(%lu) both(%lu) cnt(%lu) "
+ "\n\tsched: noswitch(%lu) switch(%lu) "
+ "\n\t\t cnt(%lu) prio_changed(%lu) prio_unchanged(%lu)"
+ "\n\t\t goidle(%lu) "
+ "\n\talb: cnt(%lu) gained(%lu) lost(%lu) failed(%lu) "
+ "\n\tttwu:cnt(%lu) moved(%lu) attempts(%lu) "
+ "\n\twunt: cnt(%lu) moved(%lu) "
+ "\n\tsmt: cnt(%lu) \n\tsbe: cnt(%lu) "
+ "\n\trq_sched_info: cpu_time(%lu) run_delay(%lu) pcnt(%lu)",
cpu, rq->yld_both_empty,
rq->yld_act_empty, rq->yld_exp_empty,
rq->yld_cnt, rq->sched_noswitch,
- rq->sched_switch, rq->sched_cnt, rq->sched_goidle,
+ rq->sched_switch, rq->sched_cnt, rq->sched_prio_changed,
+ rq->sched_prio_unchanged, rq->sched_goidle,
rq->alb_cnt, rq->alb_gained, rq->alb_lost,
rq->alb_failed,
rq->ttwu_cnt, rq->ttwu_moved, rq->ttwu_attempts,
@@ -374,14 +384,14 @@
seq_printf(seq, "domain%d %s", dcnt++, mask_str);
for (itype = SCHED_IDLE; itype < MAX_IDLE_TYPES;
itype++) {
- seq_printf(seq, " %lu %lu %lu %lu %lu",
+ seq_printf(seq, "lb: cnt(%lu) failed(%lu) imbl(%lu) nobzq(%lu) %lu",
sd->lb_cnt[itype],
sd->lb_failed[itype],
sd->lb_imbalance[itype],
sd->lb_nobusyq[itype],
sd->lb_nobusyg[itype]);
}
- seq_printf(seq, " %lu %lu %lu %lu\n",
+ seq_printf(seq, "sbe: pushed(%lu) attempts(%lu) %lu %lu\n",
sd->sbe_pushed, sd->sbe_attempts,
sd->ttwu_wake_affine, sd->ttwu_wake_balance);
}
@@ -580,6 +590,18 @@
p->array = array;
}
+static void change_queue_task(struct task_struct *p, prio_array_t *array,
+ int old_prio)
+{
+ list_del(&p->run_list);
+ if (list_empty(array->queue + old_prio))
+ __clear_bit(old_prio, array->bitmap);
+
+ sched_info_queued(p);
+ list_add_tail(&p->run_list, array->queue + p->prio);
+ __set_bit(p->prio, array->bitmap);
+ p->array = array;
+}
/*
* Put task to the end of the run list without the overhead of dequeue
* followed by enqueue.
@@ -2668,7 +2690,7 @@
struct list_head *queue;
unsigned long long now;
unsigned long run_time;
- int cpu, idx;
+ int cpu, idx, prio;
/*
* Test if we are atomic. Since do_exit() needs to call into
@@ -2787,9 +2809,19 @@
delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128;
array = next->array;
- dequeue_task(next, array);
+ prio = next->prio;
recalc_task_prio(next, next->timestamp + delta);
- enqueue_task(next, array);
+
+ if (unlikely(prio != next->prio))
+ {
+ change_queue_task(next, array, prio);
+ schedstat_inc(rq, sched_prio_changed);
+ }
+ else
+ {
+ requeue_task(next, array);
+ schedstat_inc(rq, sched_prio_unchanged);
+ }
}
next->activated = 0;
switch_tasks:
chen Shang wrote:
>Given the frequency of schedule() calling, there is a margin to
>improve it. In step of recalculate task priority, it first dequeues
>from one priority queue, calls recalc_task_prio(), then enqueue the
>task into another priority queue. However, statistics shows only
>around 0.5% of recalculation changed task priority (see below). While
>rest of 99.5% of recalculation do not change priority, it is
>reasonably to use requeue_task() to avoid overhead of dequeue and
>enqueue on the same priority queue.
>
>The patch is implemented with above idea. Note, a new help function,
>change_queue_task(), to combine dequeue() and enqueue() to reduce one
>function call overhead. Two statistics fields, sched_prio_changed and
>sched_prio_unchanged, are added to provide statistic data on priority
>recalculation.
>
>Thanks,
>
>chen
>[email protected]
>
Hi Chen,
With the added branch and the extra icache footprint, it isn't clear
that this would be a win.
Also, you didn't say where your statistics came from (what workload).
So you really need to start by demonstrating some increase on some workload.
Also, minor comments on the patch: please work against mm kernels, please
follow kernel coding style, and don't change the schedstat output format in
the same patch (makes it easier for those with schedstat parsing tools).
>
>+static void change_queue_task(struct task_struct *p, prio_array_t *array,
>+ int old_prio)
>+{
>+ list_del(&p->run_list);
>+ if (list_empty(array->queue + old_prio))
>+ __clear_bit(old_prio, array->bitmap);
>+
>+ sched_info_queued(p);
>+ list_add_tail(&p->run_list, array->queue + p->prio);
>+ __set_bit(p->prio, array->bitmap);
>+ p->array = array;
>+}
> /*
> * Put task to the end of the run list without the overhead of dequeue
> * followed by enqueue.
>@@ -2668,7 +2690,7 @@
> struct list_head *queue;
> unsigned long long now;
> unsigned long run_time;
>- int cpu, idx;
>+ int cpu, idx, prio;
>
> /*
> * Test if we are atomic. Since do_exit() needs to call into
>@@ -2787,9 +2809,19 @@
> delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128;
>
> array = next->array;
>- dequeue_task(next, array);
>+ prio = next->prio;
> recalc_task_prio(next, next->timestamp + delta);
>- enqueue_task(next, array);
>+
>+ if (unlikely(prio != next->prio))
>+ {
>+ change_queue_task(next, array, prio);
>+ schedstat_inc(rq, sched_prio_changed);
>+ }
>+ else
>+ {
>+ requeue_task(next, array);
>+ schedstat_inc(rq, sched_prio_unchanged);
>+ }
> }
> next->activated = 0;
> switch_tasks:
> Hi Chen,
> With the added branch and the extra icache footprint, it isn't clear
> that this would be a win.
>
> Also, you didn't say where your statistics came from (what workload).
>
> So you really need to start by demonstrating some increase on some workload.
>
> Also, minor comments on the patch: please work against mm kernels,
> please follow
> kernel coding style, and don't change schedstat output format in the
> same patch
> (makes it easier for those with schedstat parsing tools).
>
Hi Nick,
Thank you very much for your comments. This is my first attempt at kernel
hacking. I will keep the number of changed lines as small as possible. As
for the statistics, they are just counts, i.e. the total number of priority
recalculations vs. the number of recalculations that actually changed the
priority.
Thanks again,
-chen
On Thu, 2005-05-19 at 21:17 -0700, chen Shang wrote:
> > Hi Chen,
> > With the added branch and the extra icache footprint, it isn't clear
> > that this would be a win.
> >
> > Also, you didn't say where your statistics came from (what workload).
> >
> > So you really need to start by demonstrating some increase on some workload.
> >
> > Also, minor comments on the patch: please work against mm kernels,
> > please follow
> > kernel coding style, and don't change schedstat output format in the
> > same patch
> > (makes it easier for those with schedstat parsing tools).
> >
> Hi Nick,
>
> Thank you very much for your comments. This is the first time of my
> kernel hacking. I will reduce the lines of changes as much as
> possible. As regard to the statistics, there are just count, ie, the
> total number of priority-recalculations vs. the number of priority
> changed from the former recalculation.
A kernel profile (check list archives for oprofile) would easily
demonstrate any performance gain. On my system the residency of
schedule() is around 1% so this will be easy to spot.
Lee
chen Shang wrote:
>>Hi Chen,
>>With the added branch and the extra icache footprint, it isn't clear
>>that this would be a win.
>>
>>Also, you didn't say where your statistics came from (what workload).
>>
>>So you really need to start by demonstrating some increase on some workload.
>>
>>Also, minor comments on the patch: please work against mm kernels,
>>please follow
>>kernel coding style, and don't change schedstat output format in the
>>same patch
>>(makes it easier for those with schedstat parsing tools).
>>
>>
>Hi Nick,
>
>Thank you very much for your comments. This is the first time of my
>kernel hacking. I will reduce the lines of changes as much as
>possible. As regard to the statistics, there are just count, ie, the
>total number of priority-recalculations vs. the number of priority
>changed from the former recalculation.
>
>
The only problem there is that changing the format will break
at least my schedstats parser.
Seeing as it is not some functional change to the scheduler,
your patch should not change schedstats. If you are *also*
interested in the priority changed vs unchanged statistic, then
you can do a patch for that separately (i.e. it doesn't depend
on your optimisation in question).
Thanks,
Nick
I minimized my patch and rebased it against 2.6.12-rc4 this time, see below.
The new schedstat fields were for testing purposes only, so I removed them
completely from the patch. Theoretically, requeue_task() is always cheaper
than dequeue_task() followed by enqueue_task(), so if 99% of priority
recalculations trigger requeue_task(), it should save time.
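For a sense of the relative cost, here is roughly what the two paths do
(paraphrased from sched.c, so treat the exact bodies as a sketch):

	/*
	 * Cheap path (taken ~99% of the time with the patch):
	 *	requeue_task(next, array);
	 *		-> list_move_tail(&next->run_list, array->queue + next->prio);
	 *
	 * Current path (always taken today):
	 *	dequeue_task(next, array);
	 *		-> list_del(&next->run_list);
	 *		   if (list_empty(array->queue + next->prio))
	 *			__clear_bit(next->prio, array->bitmap);
	 *	enqueue_task(next, array);
	 *		-> sched_info_queued(next);
	 *		   list_add_tail(&next->run_list, array->queue + next->prio);
	 *		   __set_bit(next->prio, array->bitmap);
	 *		   next->array = array;
	 *
	 * Two out-of-line calls, bitmap updates and the schedstats hook,
	 * all redundant when next->prio did not change.
	 */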
In addition, my workload was a kernel build, which took around 30 minutes
at around 30% CPU usage on 2x2 processors (dual processors with HT enabled).
Here are the statistics:
CPU0: priority_changed (669 times), priority_unchanged(335,138 times)
CPU1: priority_changed (784 times), priority_unchanged(342,419 times)
CPU2: priority_changed (782 times), priority_unchanged(283,494 times)
CPU3: priority_changed (872 times), priority_unchanged(365,865 times)
Thanks,
-chen
/*=====Patch=====*/
--- linux-2.6.12-rc4.orig/kernel/sched.c 2005-05-19 14:57:55.000000000 -0700
+++ linux-2.6.12-rc4/kernel/sched.c 2005-05-19 23:47:22.000000000 -0700
@@ -2613,7 +2613,7 @@
struct list_head *queue;
unsigned long long now;
unsigned long run_time;
- int cpu, idx;
+ int cpu, idx, prio;
/*
* Test if we are atomic. Since do_exit() needs to call into
@@ -2735,9 +2735,17 @@
delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128;
array = next->array;
- dequeue_task(next, array);
+ prio = next->prio;
+
recalc_task_prio(next, next->timestamp + delta);
- enqueue_task(next, array);
+
+ if (unlikely(prio != next->prio))
+ {
+ dequeue_task(next, array);
+ enqueue_task(next, array);
+ }
+ else
+ requeue_task(next, array);
}
next->activated = 0;
switch_tasks:
chen Shang wrote:
>I minimized my patch and against to 2.6.12-rc4 this time, see below.
>
>The new schedstat fields are for the test propose only, so I removed
>them completedly from patch. Theoritically, requeue_task() is always
>cheaper than dequeue_task() followed by enqueue_task(). So, if 99% of
>priority recalculation trigger requeue_task(), it will save.
>
>In addition, my load is to build the kernel, which took around 30
>minutes with around 30% CPU usage on 2x2 processors (duel processors
>with HT enable).
>Here is the statistics:
>
>CPU0: priority_changed (669 times), priority_unchanged(335,138 times)
>CPU1: priority_changed (784 times), priority_unchanged(342,419 times)
>CPU2: priority_changed (782 times), priority_unchanged(283,494 times)
>CPU3: priority_changed (872 times), priority_unchanged(365,865 times)
>
>
OK, that gives you good grounds to look at the patch, but a _performance_
improvement is what is needed to get it included.
>Thanks,
>-chen
>
>
>/*=====Patch=====*/
>--- linux-2.6.12-rc4.orig/kernel/sched.c 2005-05-19 14:57:55.000000000 -0700
>+++ linux-2.6.12-rc4/kernel/sched.c 2005-05-19 23:47:22.000000000 -0700
>@@ -2613,7 +2613,7 @@
> struct list_head *queue;
> unsigned long long now;
> unsigned long run_time;
>- int cpu, idx;
>+ int cpu, idx, prio;
>
> /*
> * Test if we are atomic. Since do_exit() needs to call into
>@@ -2735,9 +2735,17 @@
> delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128;
>
> array = next->array;
>- dequeue_task(next, array);
>+ prio = next->prio;
>+
> recalc_task_prio(next, next->timestamp + delta);
>- enqueue_task(next, array);
>+
>+ if (unlikely(prio != next->prio))
>+ {
>+ dequeue_task(next, array);
>+ enqueue_task(next, array);
>+ }
>+ else
>+ requeue_task(next, array);
>
Coding style says
if (unlikely()) {
...
} else
...
On Fri, 20 May 2005 17:21, Nick Piggin wrote:
> chen Shang wrote:
> >I minimized my patch and against to 2.6.12-rc4 this time, see below.
> >
> >The new schedstat fields are for the test propose only, so I removed
> >them completedly from patch. Theoritically, requeue_task() is always
> >cheaper than dequeue_task() followed by enqueue_task(). So, if 99% of
> >priority recalculation trigger requeue_task(), it will save.
> >
> >In addition, my load is to build the kernel, which took around 30
> >minutes with around 30% CPU usage on 2x2 processors (duel processors
> >with HT enable).
> >Here is the statistics:
> >
> >CPU0: priority_changed (669 times), priority_unchanged(335,138 times)
> >CPU1: priority_changed (784 times), priority_unchanged(342,419 times)
> >CPU2: priority_changed (782 times), priority_unchanged(283,494 times)
> >CPU3: priority_changed (872 times), priority_unchanged(365,865 times)
>
> OK that gives you a good grounds to look at the patch, but _performance_
> improvement is what is needed to get it included.
If you end up using requeue_task() in the fast path and it is hit frequently
with your code, you'll need to make requeue_task() inline as well. Currently
it is hit only via sched_yield() and once every 10 scheduler ticks, which is
why it is not inline. The performance hit will be demonstrable if it is hit
in every schedule().
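Concretely, that would just be something like:

	-static void requeue_task(struct task_struct *p, prio_array_t *array)
	+static inline void requeue_task(struct task_struct *p, prio_array_t *array)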
Cheers,
Con
* chen Shang <[email protected]> wrote:
> I minimized my patch and against to 2.6.12-rc4 this time, see below.
looks good - i've done some small style/whitespace cleanups and renamed
prio to old_prio, patch against -rc4 below.
Ingo
From: Chen Shang <[email protected]>
Signed-off-by: Ingo Molnar <[email protected]>
--- linux/kernel/sched.c.orig
+++ linux/kernel/sched.c
@@ -2613,7 +2613,7 @@ asmlinkage void __sched schedule(void)
struct list_head *queue;
unsigned long long now;
unsigned long run_time;
- int cpu, idx;
+ int cpu, idx, old_prio;
/*
* Test if we are atomic. Since do_exit() needs to call into
@@ -2735,9 +2735,15 @@ go_idle:
delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128;
array = next->array;
- dequeue_task(next, array);
+ old_prio = next->prio;
+
recalc_task_prio(next, next->timestamp + delta);
- enqueue_task(next, array);
+
+ if (unlikely(old_prio != next->prio)) {
+ dequeue_task(next, array);
+ enqueue_task(next, array);
+ } else
+ requeue_task(next, array);
}
next->activated = 0;
switch_tasks:
On Fri, 20 May 2005 19:49, Ingo Molnar wrote:
> * chen Shang <[email protected]> wrote:
> > I minimized my patch and against to 2.6.12-rc4 this time, see below.
>
> looks good - i've done some small style/whitespace cleanups and renamed
> prio to old_prio, patch against -rc4 below.
We should inline requeue_task as well.
Con
----
* Con Kolivas <[email protected]> wrote:
> On Fri, 20 May 2005 19:49, Ingo Molnar wrote:
> > * chen Shang <[email protected]> wrote:
> > > I minimized my patch and against to 2.6.12-rc4 this time, see below.
> >
> > looks good - i've done some small style/whitespace cleanups and renamed
> > prio to old_prio, patch against -rc4 below.
>
> We should inline requeue_task as well.
yeah.
Acked-by: Ingo Molnar <[email protected]>
Ingo
Here are the statistics supporting making requeue_task() inline, together
with the sched_cnt counter over the same period. In brief, every 10 calls to
schedule() trigger recalc_task_prio() a little more than 4 times; the
arithmetic is spelled out after the numbers.
CPU0: priority_changed (669 times), priority_unchanged(335,138 times),
schedule_cnt(787,085 times)
CPU1: priority_changed (784 times), priority_unchanged(342,419 times),
schedule_cnt(784,873 times)
CPU2: priority_changed (782 times), priority_unchanged(283,494 times),
schedule_cnt(681,917 times)
CPU3: priority_changed (872 times), priority_unchanged(365,865 times),
schedule_cnt(809,624 times)
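Taking CPU0 as a worked example:

	(669 + 335,138) / 787,085 ~= 0.43

i.e. about 4.3 recalculations per 10 calls to schedule(); CPU1-CPU3 work out
to roughly 0.44, 0.42 and 0.45.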
On 5/20/05, Con Kolivas <[email protected]> wrote:
> On Fri, 20 May 2005 17:21, Nick Piggin wrote:
> > chen Shang wrote:
> > >I minimized my patch and against to 2.6.12-rc4 this time, see below.
> > >
> > >The new schedstat fields are for the test propose only, so I removed
> > >them completedly from patch. Theoritically, requeue_task() is always
> > >cheaper than dequeue_task() followed by enqueue_task(). So, if 99% of
> > >priority recalculation trigger requeue_task(), it will save.
> > >
> > >In addition, my load is to build the kernel, which took around 30
> > >minutes with around 30% CPU usage on 2x2 processors (duel processors
> > >with HT enable).
> > >Here is the statistics:
> > >
> > >CPU0: priority_changed (669 times), priority_unchanged(335,138 times)
> > >CPU1: priority_changed (784 times), priority_unchanged(342,419 times)
> > >CPU2: priority_changed (782 times), priority_unchanged(283,494 times)
> > >CPU3: priority_changed (872 times), priority_unchanged(365,865 times)
> >
> > OK that gives you a good grounds to look at the patch, but _performance_
> > improvement is what is needed to get it included.
>
> If you end up using requeue_task() in the fast path and it is hit frequently
> with your code you'll need to modify requeue_task to be inline as well.
> Currently it is hit only via sched_yield and once every 10 scheduler ticks
> which is why it is not inline. The performance hit will be demonstrable if it
> is hit in every schedule()
>
> Cheers,
> Con
>
>
>
/*===== ISSUE ====*/
My second version of the patch has a defect.
+ if (unlikely(old_prio != next->prio)) {
+ dequeue_task(next, array); --> ### dequeue should be against
old_prio, NOT next->prio ###
+ enqueue_task(next, array);
+ }
Unfortunately, dequeue_task() does not take a third parameter that would
let us pass the old priority. Personally, I feel it is better to add an
extra function, as in my first version of the patch, to combine the dequeue
and the enqueue. Reasons:
1) adding a third parameter to dequeue_task() would require changes at its
other call sites;
2) for scheduler functions, performance is the first consideration. Notice
that neither dequeue_task() nor enqueue_task() is inline; combining the two
into one function saves one function call of overhead.
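To make the ordering problem concrete (the relevant lines of dequeue_task()
are paraphrased here, so treat this as a sketch rather than the exact kernel
code):

	old_prio = next->prio;			/* say 120 */
	recalc_task_prio(next, next->timestamp + delta);
						/* side effect: next->prio is now, say, 118 */
	dequeue_task(next, array);
	/*
	 * dequeue_task() keys everything off p->prio, i.e. the NEW value:
	 *	list_del(&p->run_list);
	 *	if (list_empty(array->queue + p->prio))
	 *		__clear_bit(p->prio, array->bitmap);
	 * It tests/clears the bit for queue 118 (harmless, since
	 * enqueue_task() sets it again), but never clears the bit for
	 * queue 120, which may now be empty, leaving a stale bit in
	 * array->bitmap for sched_find_first_bit() to trip over.
	 */
	enqueue_task(next, array);		/* adds to queue 118, sets bit 118 */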
/* ===== NEW PATCH ===== */
The new patch (see below) adds a new function, change_queue_task(), which
dequeues the task from the old_prio queue and enqueues it on the
next->prio queue.
The patch also inlines requeue_task().
The patch has been tested with 2.6.11.10 and looks good. For some reason,
2.6.12-rc4 is still not stable on my machine (Fedora 3).
/* ===== [PATCH 2.6.11.?] kernel/sched.c =====*/
--- linux-2.6.12-rc4.orig/kernel/sched.c 2005-05-06 22:20:31.000000000 -0700
+++ linux12/kernel/sched.c 2005-05-21 16:19:11.000000000 -0700
@@ -556,11 +556,23 @@
p->array = array;
}
+static void change_queue_task(struct task_struct *p, prio_array_t *array,
+		int old_prio)
+{
+ list_del(&p->run_list);
+ if (list_empty(array->queue + old_prio))
+ __clear_bit(old_prio, array->bitmap);
+
+ sched_info_queued(p);
+ list_add_tail(&p->run_list, array->queue + p->prio);
+ __set_bit(p->prio, array->bitmap);
+ p->array = array;
+}
+
/*
* Put task to the end of the run list without the overhead of dequeue
* followed by enqueue.
*/
-static void requeue_task(struct task_struct *p, prio_array_t *array)
+static inline void requeue_task(struct task_struct *p, prio_array_t *array)
{
list_move_tail(&p->run_list, array->queue + p->prio);
}
@@ -2613,7 +2625,7 @@
struct list_head *queue;
unsigned long long now;
unsigned long run_time;
- int cpu, idx;
+ int cpu, idx, old_prio;
/*
* Test if we are atomic. Since do_exit() needs to call into
@@ -2735,9 +2747,14 @@
delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128;
array = next->array;
- dequeue_task(next, array);
+ old_prio = next->prio;
+
recalc_task_prio(next, next->timestamp + delta);
- enqueue_task(next, array);
+
+ if (unlikely(old_prio != next->prio))
+ change_queue_task(next, array, old_prio);
+ else
+ requeue_task(next, array);
}
next->activated = 0;
switch_tasks:
/* ===== PATCH END ===== */
Thanks,
-chen
On 5/20/05, Ingo Molnar <[email protected]> wrote:
>
> * Con Kolivas <[email protected]> wrote:
>
> > On Fri, 20 May 2005 19:49, Ingo Molnar wrote:
> > > * chen Shang <[email protected]> wrote:
> > > > I minimized my patch and against to 2.6.12-rc4 this time, see below.
> > >
> > > looks good - i've done some small style/whitespace cleanups and renamed
> > > prio to old_prio, patch against -rc4 below.
> >
> > We should inline requeue_task as well.
>
> yeah.
>
> Acked-by: Ingo Molnar <[email protected]>
>
> Ingo
>
* Chen Shang <[email protected]> wrote:
> /*===== ISSUE ====*/
> My second version of patch has a defect.
>
> + if (unlikely(old_prio != next->prio)) {
> + dequeue_task(next, array); --> ### dequeue should against
> old_prio, NOT next->prio ###
> + enqueue_task(next, array);
> + }
indeed...
> unforunately, dequeue_task does not accept the third parameter to make
> adjustment. Personally, I feel it's good to add extra function as my
> first version of patch to combine dequeue and enqueue together.
> Reasons as following:
> 1) adding the third parameter to dequeue_task() would cause other
> places' code change;
> 2) for schedule functions, performance is the first consideration.
> Notice both dequeue_task() and enqueue_task() are NOT inline.
> Combining those two in one saves one function call overhead;
the real problem comes from recalc_task_prio() having a side-effect,
which makes requeueing of tasks harder. The solution is to return the
prio from recalc_task_prio() - see the tested patch below. Agreed?
Ingo
--
micro-optimize task requeueing in schedule() & clean up
recalc_task_prio().
From: Chen Shang <[email protected]>
Signed-off-by: Ingo Molnar <[email protected]>
--- linux/kernel/sched.c.orig
+++ linux/kernel/sched.c
@@ -675,7 +675,7 @@ static inline void __activate_idle_task(
rq->nr_running++;
}
-static void recalc_task_prio(task_t *p, unsigned long long now)
+static int recalc_task_prio(task_t *p, unsigned long long now)
{
/* Caller must always ensure 'now >= p->timestamp' */
unsigned long long __sleep_time = now - p->timestamp;
@@ -734,7 +734,7 @@ static void recalc_task_prio(task_t *p,
}
}
- p->prio = effective_prio(p);
+ return effective_prio(p);
}
/*
@@ -757,7 +757,7 @@ static void activate_task(task_t *p, run
}
#endif
- recalc_task_prio(p, now);
+ p->prio = recalc_task_prio(p, now);
/*
* This checks to make sure it's not an uninterruptible task
@@ -2751,7 +2751,7 @@ asmlinkage void __sched schedule(void)
struct list_head *queue;
unsigned long long now;
unsigned long run_time;
- int cpu, idx;
+ int cpu, idx, new_prio;
/*
* Test if we are atomic. Since do_exit() needs to call into
@@ -2873,9 +2873,14 @@ go_idle:
delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128;
array = next->array;
- dequeue_task(next, array);
- recalc_task_prio(next, next->timestamp + delta);
- enqueue_task(next, array);
+ new_prio = recalc_task_prio(next, next->timestamp + delta);
+
+ if (unlikely(next->prio != new_prio)) {
+ dequeue_task(next, array);
+ next->prio = new_prio;
+ enqueue_task(next, array);
+ } else
+ requeue_task(next, array);
}
next->activated = 0;
switch_tasks:
Much better. In that case, another place, activate_task(), needs to be
adjusted as well.
-chen
/* ============================ */
/*
@@ -704,7 +704,7 @@
}
#endif
- recalc_task_prio(p, now);
+ p->prio = recalc_task_prio(p, now);
/* ============================ */
On 5/23/05, Ingo Molnar <[email protected]> wrote:
>
> the real problem comes from recalc_task_prio() having a side-effect,
> which makes requeueing of tasks harder. The solution is to return the
> prio from recalc_task_prio() - see the tested patch below. Agreed?
>
> Ingo
>
> --
>
> micro-optimize task requeueing in schedule() & clean up
> recalc_task_prio().
>
> --- linux/kernel/sched.c.orig
> +++ linux/kernel/sched.c
> @@ -675,7 +675,7 @@ static inline void __activate_idle_task(
> rq->nr_running++;
> }
>
> -static void recalc_task_prio(task_t *p, unsigned long long now)
> +static int recalc_task_prio(task_t *p, unsigned long long now)
> {
> /* Caller must always ensure 'now >= p->timestamp' */
> unsigned long long __sleep_time = now - p->timestamp;
> @@ -734,7 +734,7 @@ static void recalc_task_prio(task_t *p,
> }
> }
>
> - p->prio = effective_prio(p);
> + return effective_prio(p);
> }
>
> /*
> @@ -757,7 +757,7 @@ static void activate_task(task_t *p, run
> }
> #endif
>
> - recalc_task_prio(p, now);
> + p->prio = recalc_task_prio(p, now);
>
> /*
> * This checks to make sure it's not an uninterruptible task
> @@ -2751,7 +2751,7 @@ asmlinkage void __sched schedule(void)
> struct list_head *queue;
> unsigned long long now;
> unsigned long run_time;
> - int cpu, idx;
> + int cpu, idx, new_prio;
>
> /*
> * Test if we are atomic. Since do_exit() needs to call into
> @@ -2873,9 +2873,14 @@ go_idle:
> delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128;
>
> array = next->array;
> - dequeue_task(next, array);
> - recalc_task_prio(next, next->timestamp + delta);
> - enqueue_task(next, array);
> + new_prio = recalc_task_prio(next, next->timestamp + delta);
> +
> + if (unlikely(next->prio != new_prio)) {
> + dequeue_task(next, array);
> + next->prio = new_prio;
> + enqueue_task(next, array);
> + } else
> + requeue_task(next, array);
> }
> next->activated = 0;
> switch_tasks:
>