When a runqueue runs out of RT tasks, it may still have non-RT tasks
left, or no tasks at all (idle). Currently, RT balancing treats the
two cases equally and manipulates only cpupri.pri_to_cpu[CPUPRI_NORMAL],
which may cause problems.
For instance, on a 4-cpu system, non-RT task1 is running on cpu0, RT
task2 is running on cpu3, and cpu1/cpu2 are both idle. When RT task3
(usually CPU-intensive) is woken up or created on cpu3, it will be
placed on cpu0 (see find_lowest_rq()), starving task1 until CFS load
balancing moves task1 to another cpu, or, even worse, indefinitely if
task1 is bound to cpu0. So it would be reasonable to put task3 on cpu1
or cpu2, which are idle (even though doing so may break the
energy-saving idle state).
This patch tackles the problem by updating pri_to_cpu[CPUPRI_IDLE] in
cpupri as the cpu enters and leaves the idle task, so that when pushing
or placing RT tasks through find_lowest_rq(), an idle cpu is tried
first as the target.
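For reference, find_lowest_rq() does its search through cpupri_find(),
which scans the pri_to_cpu[] vectors from the lowest index upwards, so
a cpu published in the CPUPRI_IDLE vector is preferred over any
CPUPRI_NORMAL cpu. The fragment below is only a simplified sketch of
that search (the real code in kernel/sched/cpupri.c adds the
double-read/memory-ordering details; convert_prio() is cpupri.c's
internal helper mapping a task prio to a cpupri level, and the name
cpupri_find_sketch is made up here):

static int cpupri_find_sketch(struct cpupri *cp, struct task_struct *p,
                              struct cpumask *lowest_mask)
{
        int idx, task_pri = convert_prio(p->prio);

        /* Lower index == lower cpu priority; CPUPRI_IDLE is scanned first. */
        for (idx = 0; idx < task_pri; idx++) {
                struct cpupri_vec *vec = &cp->pri_to_cpu[idx];

                /* No cpu currently sits at this priority level. */
                if (!atomic_read(&vec->count))
                        continue;

                if (lowest_mask) {
                        cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask);
                        if (cpumask_any(lowest_mask) >= nr_cpu_ids)
                                continue;
                }
                return 1;
        }
        return 0;
}

With the CPUPRI_IDLE vector populated by this patch, an idle cpu lands
in lowest_mask before any cpu that merely runs a non-RT task.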
Signed-off-by: pang.xunlei <[email protected]>
---
kernel/sched/idle_task.c | 21 +++++++++++++++++++++
1 file changed, 21 insertions(+)
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index 67ad4e7..3dc372e 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -26,6 +26,15 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl
static struct task_struct *
pick_next_task_idle(struct rq *rq, struct task_struct *prev)
{
+#ifdef CONFIG_SMP
+ struct cpupri *cp = &rq->rd->cpupri;
+ int currpri = cp->cpu_to_pri[rq->cpu];
+
+ BUG_ON(currpri != CPUPRI_NORMAL);
+ /* Set CPUPRI_IDLE bitmap for this cpu */
+ cpupri_set(cp, rq->cpu, MAX_PRIO);
+#endif
+
put_prev_task(rq, prev);

schedstat_inc(rq, sched_goidle);
@@ -47,6 +56,18 @@ dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags)

static void put_prev_task_idle(struct rq *rq, struct task_struct *prev)
{
+#ifdef CONFIG_SMP
+ struct cpupri *cp = &rq->rd->cpupri;
+ int currpri = cp->cpu_to_pri[rq->cpu];
+
+ /*
+ * Set CPUPRI_NORMAL bitmap for this cpu when exiting from idle.
+ * RT tasks may already have been queued, so this check is needed.
+ */
+ if (currpri == CPUPRI_IDLE)
+ cpupri_set(cp, rq->cpu, MAX_RT_PRIO);
+#endif
+
idle_exit_fair(rq);
rq_last_tick_reset(rq);
}
--
1.7.9.5
Actually, cpupri_set() and cpupri_init() can never be used without
CONFIG_SMP, so the dummy !CONFIG_SMP definitions can be removed.
Signed-off-by: pang.xunlei <[email protected]>
---
kernel/sched/cpupri.h | 3 ---
1 file changed, 3 deletions(-)
diff --git a/kernel/sched/cpupri.h b/kernel/sched/cpupri.h
index 6b03334..63cbb9c 100644
--- a/kernel/sched/cpupri.h
+++ b/kernel/sched/cpupri.h
@@ -26,9 +26,6 @@ int cpupri_find(struct cpupri *cp,
void cpupri_set(struct cpupri *cp, int cpu, int pri);
int cpupri_init(struct cpupri *cp);
void cpupri_cleanup(struct cpupri *cp);
-#else
-#define cpupri_set(cp, cpu, pri) do { } while (0)
-#define cpupri_init() do { } while (0)
#endif
#endif /* _LINUX_CPUPRI_H */
--
1.7.9.5
When selecting the cpu for a waking RT task, if curr is a non-RT task
that is bound to this cpu only, give find_lowest_rq() a chance to pick
a different cpu for the RT task (definitely an idle cpu, if one exists)
so that curr is not starved.
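With the CPUPRI_IDLE tracking from the earlier patch in place, the
check in select_task_rq_rt() ends up roughly as in the sketch below (a
simplified fragment, not the full function): a pinned non-RT curr also
sends us to find_lowest_rq(), which hands back an idle cpu whenever one
is marked in cpupri.

        /* Sketch of the resulting check in select_task_rq_rt(). */
        if (curr && unlikely(curr->nr_cpus_allowed < 2 ||
                             curr->prio <= p->prio)) {
                /*
                 * curr is pinned to this cpu, or has higher/equal
                 * priority; look for a better runqueue for @p.
                 */
                int target = find_lowest_rq(p);

                if (target != -1)
                        cpu = target;
        }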
Signed-off-by: pang.xunlei <[email protected]>
---
kernel/sched/rt.c | 10 +++++++---
1 file changed, 7 insertions(+), 3 deletions(-)
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index d024e6c..89202ab 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1319,6 +1319,11 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
* runqueue. Otherwise simply start this RT task
* on its current runqueue.
*
+ * If the current task on @p's runqueue is a non-RT task
+ * bound to this runqueue only, try to wake this RT task
+ * up on a different runqueue; we will definitely find an
+ * idle cpu if there is one.
+ *
* We want to avoid overloading runqueues. If the woken
* task is a higher priority, then it will stay on this CPU
* and the lower prio task should be moved to another CPU.
@@ -1335,9 +1340,8 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
* This test is optimistic, if we get it wrong the load-balancer
* will have to sort it out.
*/
- if (curr && unlikely(rt_task(curr)) &&
- (curr->nr_cpus_allowed < 2 ||
- curr->prio <= p->prio)) {
+ if (curr && unlikely(curr->nr_cpus_allowed < 2 ||
+ curr->prio <= p->prio)) {
int target = find_lowest_rq(p);

if (target != -1)
--
1.7.9.5
On Mon, Nov 03, 2014 at 06:30:18PM +0800, pang.xunlei wrote:
> kernel/sched/idle_task.c | 21 +++++++++++++++++++++
> 1 file changed, 21 insertions(+)
>
> diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
> index 67ad4e7..3dc372e 100644
> --- a/kernel/sched/idle_task.c
> +++ b/kernel/sched/idle_task.c
> @@ -26,6 +26,15 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl
> static struct task_struct *
> pick_next_task_idle(struct rq *rq, struct task_struct *prev)
> {
> +#ifdef CONFIG_SMP
> + struct cpupri *cp = &rq->rd->cpupri;
> + int currpri = cp->cpu_to_pri[rq->cpu];
> +
> + BUG_ON(currpri != CPUPRI_NORMAL);
> + /* Set CPUPRI_IDLE bitmap for this cpu */
> + cpupri_set(cp, rq->cpu, MAX_PRIO);
> +#endif
> +
This should really be idle_enter_rt() and implemented in
kernel/sched/rt.c.
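For illustration, such helpers could look roughly like the sketch
below, living in kernel/sched/rt.c with declarations next to
idle_enter_fair()/idle_exit_fair() in kernel/sched/sched.h. The names
come from the review comment and the bodies just repackage the hunks
above; this is a sketch, not existing kernel code:

#ifdef CONFIG_SMP
void idle_enter_rt(struct rq *this_rq)
{
        struct cpupri *cp = &this_rq->rd->cpupri;

        BUG_ON(cp->cpu_to_pri[this_rq->cpu] != CPUPRI_NORMAL);
        /* Publish this cpu in the CPUPRI_IDLE vector. */
        cpupri_set(cp, this_rq->cpu, MAX_PRIO);
}

void idle_exit_rt(struct rq *this_rq)
{
        struct cpupri *cp = &this_rq->rd->cpupri;

        /* RT tasks may already have been queued, hence the check. */
        if (cp->cpu_to_pri[this_rq->cpu] == CPUPRI_IDLE)
                cpupri_set(cp, this_rq->cpu, MAX_RT_PRIO);
}
#endif

pick_next_task_idle() and put_prev_task_idle() would then simply call
idle_enter_rt(rq) and idle_exit_rt(rq).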
> put_prev_task(rq, prev);
>
> schedstat_inc(rq, sched_goidle);
> @@ -47,6 +56,18 @@ dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags)
>
> static void put_prev_task_idle(struct rq *rq, struct task_struct *prev)
> {
> +#ifdef CONFIG_SMP
> + struct cpupri *cp = &rq->rd->cpupri;
> + int currpri = cp->cpu_to_pri[rq->cpu];
> +
> + /*
> + * Set CPUPRI_NORMAL bitmap for this cpu when exiting from idle.
> + * RT tasks may already have been queued, so this check is needed.
> + */
> + if (currpri == CPUPRI_IDLE)
> + cpupri_set(cp, rq->cpu, MAX_RT_PRIO);
> +#endif
idle_exit_rt() and the same.
> idle_exit_fair(rq);
> rq_last_tick_reset(rq);
> }
Also, try and keep the deadline bits in sync with the rt semantics.