When a runqueue runs out of RT tasks, it may be running non-RT tasks
or no tasks at all (idle). Currently, RT balancing treats the two cases
the same and manipulates only cpupri.pri_to_cpu[CPUPRI_NORMAL], which
can cause problems.
For instance, on a 4-cpu system, non-RT task1 is running on cpu0, RT
task2 is running on cpu3, and cpu1/cpu2 are both idle. When RT task3
(usually CPU-intensive) is woken up or created on cpu3, it will be
placed on cpu0 (see find_lowest_rq()), starving task1 until the CFS
load balancer moves task1 to another cpu, or indefinitely if task1 is
bound to cpu0. So it would be reasonable to place task3 on cpu1 or
cpu2, which are idle (even though doing so may break the energy-saving
idle state).
This patch tackles the problem by updating cpupri's
pri_to_cpu[CPUPRI_IDLE] as the idle task is picked and put, so that
when pushing RT tasks through find_lowest_rq(), an idle cpu is tried
first as the target.
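To illustrate the idea outside the kernel, here is a minimal
stand-alone user-space sketch of a cpupri-like map with the extra IDLE
level; the names (toy_cpupri_set(), toy_find_lowest(), PRI_*) are
invented for illustration and are not the kernel API:

/*
 * Stand-alone user-space sketch (toy names, not the kernel code) of a
 * cpupri-like map with an extra IDLE level below NORMAL.
 */
#include <stdio.h>

#define NR_CPUS    4
#define PRI_IDLE   0	/* cpu is running the idle task */
#define PRI_NORMAL 1	/* cpu is running a CFS task */
#define PRI_RT     2	/* cpu is running an RT task */
#define NR_PRI     3

static unsigned int pri_to_cpu[NR_PRI];	/* one bit per cpu, per level */
static int cpu_to_pri[NR_CPUS];

static void toy_cpupri_set(int cpu, int pri)
{
	pri_to_cpu[cpu_to_pri[cpu]] &= ~(1u << cpu);
	cpu_to_pri[cpu] = pri;
	pri_to_cpu[pri] |= 1u << cpu;
}

/* Scan levels from the lowest up, as find_lowest_rq() does via cpupri_find(). */
static int toy_find_lowest(void)
{
	int pri, cpu;

	for (pri = 0; pri < NR_PRI; pri++)
		for (cpu = 0; cpu < NR_CPUS; cpu++)
			if (pri_to_cpu[pri] & (1u << cpu))
				return cpu;
	return -1;
}

int main(void)
{
	int cpu;

	for (cpu = 0; cpu < NR_CPUS; cpu++)
		toy_cpupri_set(cpu, PRI_NORMAL);

	/* The scenario above: cpu0 runs CFS, cpu3 runs RT, cpu1/cpu2 idle. */
	toy_cpupri_set(3, PRI_RT);
	toy_cpupri_set(1, PRI_IDLE);
	toy_cpupri_set(2, PRI_IDLE);

	/* With the IDLE level tracked, the push target is cpu1, not cpu0. */
	printf("push target: cpu%d\n", toy_find_lowest());
	return 0;
}

Without the IDLE level, cpu0/cpu1/cpu2 would all sit in the NORMAL
bitmap and the scan could just as well pick the busy cpu0.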
Signed-off-by: pang.xunlei <[email protected]>
---
kernel/sched/idle_task.c | 3 +++
kernel/sched/rt.c | 21 +++++++++++++++++++++
kernel/sched/sched.h | 6 ++++++
3 files changed, 30 insertions(+)
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index 67ad4e7..e053347 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -26,6 +26,8 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl
static struct task_struct *
pick_next_task_idle(struct rq *rq, struct task_struct *prev)
{
+ idle_enter_rt(rq);
+
put_prev_task(rq, prev);
schedstat_inc(rq, sched_goidle);
@@ -47,6 +49,7 @@ dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags)
static void put_prev_task_idle(struct rq *rq, struct task_struct *prev)
{
+ idle_exit_rt(rq);
idle_exit_fair(rq);
rq_last_tick_reset(rq);
}
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index d024e6c..da6922e 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -992,6 +992,27 @@ enqueue_top_rt_rq(struct rt_rq *rt_rq)
#if defined CONFIG_SMP
+/* Set CPUPRI_IDLE bitmap for this cpu when entering idle. */
+void idle_enter_rt(struct rq *this_rq)
+{
+ struct cpupri *cp = &this_rq->rd->cpupri;
+ int currpri = cp->cpu_to_pri[this_rq->cpu];
+
+ BUG_ON(currpri != CPUPRI_NORMAL);
+ cpupri_set(cp, this_rq->cpu, MAX_PRIO);
+}
+
+/* Set CPUPRI_NORMAL bitmap for this cpu when exiting from idle. */
+void idle_exit_rt(struct rq *this_rq)
+{
+ struct cpupri *cp = &this_rq->rd->cpupri;
+ int currpri = cp->cpu_to_pri[this_rq->cpu];
+
+ /* RT tasks may have been queued in the meantime, so this check is needed. */
+ if (currpri == CPUPRI_IDLE)
+ cpupri_set(cp, this_rq->cpu, MAX_RT_PRIO);
+}
+
static void
inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
{
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 24156c84..cc603fa 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1162,11 +1162,17 @@ extern void update_group_capacity(struct sched_domain *sd, int cpu);
extern void trigger_load_balance(struct rq *rq);
+extern void idle_enter_rt(struct rq *this_rq);
+extern void idle_exit_rt(struct rq *this_rq);
+
extern void idle_enter_fair(struct rq *this_rq);
extern void idle_exit_fair(struct rq *this_rq);
#else
+static inline void idle_enter_rt(struct rq *rq) { }
+static inline void idle_exit_rt(struct rq *rq) { }
+
static inline void idle_enter_fair(struct rq *rq) { }
static inline void idle_exit_fair(struct rq *rq) { }
--
2.1.0
cpupri_set() and cpupri_init() are never used without CONFIG_SMP, so
the !CONFIG_SMP stubs can be removed.
Signed-off-by: pang.xunlei <[email protected]>
---
kernel/sched/cpupri.h | 3 ---
1 file changed, 3 deletions(-)
diff --git a/kernel/sched/cpupri.h b/kernel/sched/cpupri.h
index 6b03334..63cbb9c 100644
--- a/kernel/sched/cpupri.h
+++ b/kernel/sched/cpupri.h
@@ -26,9 +26,6 @@ int cpupri_find(struct cpupri *cp,
void cpupri_set(struct cpupri *cp, int cpu, int pri);
int cpupri_init(struct cpupri *cp);
void cpupri_cleanup(struct cpupri *cp);
-#else
-#define cpupri_set(cp, cpu, pri) do { } while (0)
-#define cpupri_init() do { } while (0)
#endif
#endif /* _LINUX_CPUPRI_H */
--
2.1.0
Add handling for sched_domains that do not have the SD_WAKE_AFFINE flag
set when selecting a cpu; users may clear this flag via proc.
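For illustration, a minimal stand-alone user-space sketch of the
fallback (struct toy_domain, toy_find_cpu() and first_cpu() are
invented stand-ins, not the kernel code):

/*
 * Stand-alone user-space sketch (toy names, not the kernel code) of the
 * fallback: a match in a domain without SD_WAKE_AFFINE is only remembered
 * as a cache-hot candidate and used when no affine domain yields a cpu.
 */
#include <stdio.h>

#define NR_CPUS 8

struct toy_domain {
	int wake_affine;	/* models SD_WAKE_AFFINE */
	unsigned int span;	/* cpus covered, one bit per cpu */
};

static int first_cpu(unsigned int mask)
{
	int cpu;

	for (cpu = 0; cpu < NR_CPUS; cpu++)
		if (mask & (1u << cpu))
			return cpu;
	return NR_CPUS;		/* like nr_cpu_ids: no cpu found */
}

static int toy_find_cpu(unsigned int lowest_mask,
			const struct toy_domain *sd, int nr_domains)
{
	int cachehot_cpu = NR_CPUS;
	int i, cpu;

	for (i = 0; i < nr_domains; i++) {
		cpu = first_cpu(lowest_mask & sd[i].span);

		if (sd[i].wake_affine) {
			if (cpu < NR_CPUS)
				return cpu;
		} else if (cachehot_cpu >= NR_CPUS) {
			/* remember the lowest-level, most cache-hot match */
			cachehot_cpu = cpu;
		}
	}
	return cachehot_cpu < NR_CPUS ? cachehot_cpu : -1;
}

int main(void)
{
	/* SD_WAKE_AFFINE cleared on both levels via proc: previously the
	 * domain walk found nothing; now the lowest-level match (cpu2) wins. */
	struct toy_domain domains[] = {
		{ .wake_affine = 0, .span = 0x0f },	/* cpus 0-3 */
		{ .wake_affine = 0, .span = 0xff },	/* cpus 0-7 */
	};
	unsigned int lowest_mask = (1u << 2) | (1u << 6);

	printf("selected: cpu%d\n", toy_find_cpu(lowest_mask, domains, 2));
	return 0;
}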
Signed-off-by: pang.xunlei <[email protected]>
---
kernel/sched/rt.c | 18 ++++++++++++++----
1 file changed, 14 insertions(+), 4 deletions(-)
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index da6922e..49164f1 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1549,6 +1549,7 @@ static int find_lowest_rq(struct task_struct *task)
struct cpumask *lowest_mask = this_cpu_cpumask_var_ptr(local_cpu_mask);
int this_cpu = smp_processor_id();
int cpu = task_cpu(task);
+ int cachehot_cpu = nr_cpu_ids;
/* Make sure the mask is initialized first */
if (unlikely(!lowest_mask))
@@ -1581,7 +1582,7 @@ static int find_lowest_rq(struct task_struct *task)
rcu_read_lock();
for_each_domain(cpu, sd) {
if (sd->flags & SD_WAKE_AFFINE) {
- int best_cpu;
+ int wakeaffine_cpu;
/*
* "this_cpu" is cheaper to preempt than a
@@ -1593,16 +1594,25 @@ static int find_lowest_rq(struct task_struct *task)
return this_cpu;
}
- best_cpu = cpumask_first_and(lowest_mask,
+ wakeaffine_cpu = cpumask_first_and(lowest_mask,
sched_domain_span(sd));
- if (best_cpu < nr_cpu_ids) {
+ if (wakeaffine_cpu < nr_cpu_ids) {
rcu_read_unlock();
- return best_cpu;
+ return wakeaffine_cpu;
}
+ } else {
+ /* affine domain outweighs lower level non-affine domain? */
+ if (cachehot_cpu >= nr_cpu_ids)
+ cachehot_cpu = cpumask_first_and(lowest_mask,
+ sched_domain_span(sd));
}
}
rcu_read_unlock();
+ /* most likely cache-hot */
+ if (cachehot_cpu < nr_cpu_ids)
+ return cachehot_cpu;
+
/*
* And finally, if there were no matches within the domains
* just give the caller *something* to work with from the compatible
--
2.1.0
cpudl_set() and cpudl_init() are never used without CONFIG_SMP, so
the !CONFIG_SMP stubs can be removed.
Signed-off-by: pang.xunlei <[email protected]>
---
kernel/sched/cpudeadline.h | 3 ---
1 file changed, 3 deletions(-)
diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h
index 0c9636e..dfdf594 100644
--- a/kernel/sched/cpudeadline.h
+++ b/kernel/sched/cpudeadline.h
@@ -25,9 +25,6 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p,
void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid);
int cpudl_init(struct cpudl *cp);
void cpudl_cleanup(struct cpudl *cp);
-#else
-#define cpudl_set(cp, cpu, dl) do { } while (0)
-#define cpudl_init() do { } while (0)
#endif /* CONFIG_SMP */
#endif /* _LINUX_CPUDL_H */
--
2.1.0
When a runqueue runs out of DL tasks, it may be running RT tasks,
non-RT tasks, or nothing at all (idle). It would be better to push the
DL task to an idle cpu, or failing that to a cpu without RT tasks, if
there is one.
Add idle_enter_dl()/idle_exit_dl() to track the idle case.
Add rt_enter_dl()/rt_exit_dl() to track the non-RT case.
This follows the same approach as the RT handling in the former patch.
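For illustration, a minimal stand-alone user-space sketch of the
preference order the new masks give cpudl_find(); struct toy_cpudl and
toy_cpudl_find() are invented stand-ins, not the kernel code:

/*
 * Stand-alone user-space sketch (toy names, not the kernel code) of the
 * preference order: idle cpus first, then cpus free of RT tasks, then
 * cpus free of DL tasks.
 */
#include <stdio.h>

#define NR_CPUS 4

struct toy_cpudl {
	unsigned int idle_cpus;		/* running the idle task */
	unsigned int freert_cpus;	/* not running an RT task */
	unsigned int freedl_cpus;	/* not running a DL task */
};

static int first_cpu(unsigned int mask)
{
	int cpu;

	for (cpu = 0; cpu < NR_CPUS; cpu++)
		if (mask & (1u << cpu))
			return cpu;
	return -1;
}

/* later_mask models the task's allowed and active cpus. */
static int toy_cpudl_find(const struct toy_cpudl *cp, unsigned int later_mask)
{
	if (later_mask & cp->idle_cpus)
		return first_cpu(later_mask & cp->idle_cpus);
	if (later_mask & cp->freert_cpus)
		return first_cpu(later_mask & cp->freert_cpus);
	if (later_mask & cp->freedl_cpus)
		return first_cpu(later_mask & cp->freedl_cpus);
	return -1;	/* fall back to the latest-deadline cpu in the heap */
}

int main(void)
{
	/* cpu0 runs CFS, cpu1 runs RT, cpu2 is idle, cpu3 runs DL. */
	struct toy_cpudl cp = {
		.idle_cpus   = 1u << 2,
		.freert_cpus = (1u << 0) | (1u << 2),
		.freedl_cpus = (1u << 0) | (1u << 1) | (1u << 2),
	};

	/* The idle cpu2 is preferred over the CFS-only cpu0. */
	printf("push target: cpu%d\n", toy_cpudl_find(&cp, 0xf));
	return 0;
}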
Signed-off-by: pang.xunlei <[email protected]>
---
kernel/sched/cpudeadline.c | 80 +++++++++++++++++++++++++++++++++++++---------
kernel/sched/cpudeadline.h | 13 ++++++--
kernel/sched/deadline.c | 32 ++++++++++++++++---
kernel/sched/idle_task.c | 2 ++
kernel/sched/rt.c | 7 ++++
kernel/sched/sched.h | 11 +++++++
6 files changed, 123 insertions(+), 22 deletions(-)
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
index 72a3da3..8254310 100644
--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@@ -98,7 +98,7 @@ static inline int cpudl_maximum(struct cpudl *cp)
* @cp: the cpudl max-heap context
* @p: the task
* @later_mask: a mask used to filter cpus, also used to fill
- * in with the selected CPUs if set_flag is set. Not NULL.
+ * back in with the selected CPUs if set_flag is set. Not NULL.
* @set_flag: a flag to determine if should set the later_mask.
*
* Returns: (int)bool - CPUs were found
@@ -110,7 +110,15 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p,
const struct sched_dl_entity *dl_se = &p->dl;
- if (cpumask_and(&tmp_mask, later_mask, cp->free_cpus)) {
+ if (cpumask_and(&tmp_mask, later_mask, cp->idle_cpus)) {
+ if (set_flag)
+ cpumask_copy(later_mask, &tmp_mask);
+ return 1;
+ } else if (cpumask_and(&tmp_mask, later_mask, cp->freert_cpus)) {
+ if (set_flag)
+ cpumask_copy(later_mask, &tmp_mask);
+ return 1;
+ } else if (cpumask_and(&tmp_mask, later_mask, cp->freedl_cpus)) {
if (set_flag)
cpumask_copy(later_mask, &tmp_mask);
return 1;
@@ -127,21 +135,47 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p,
* @cp: the cpudl max-heap context
* @cpu: the target cpu
* @dl: the new earliest deadline for this cpu
- *
+ * @set_flags: CPUDL_SET_XXX, CPUDL_CLEAR_XXX
* Notes: assumes cpu_rq(cpu)->lock is locked
*
* Returns: (void)
*/
-void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid)
+void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int set_flags)
{
int old_idx, new_cpu;
unsigned long flags;
WARN_ON(!cpu_present(cpu));
+ /* We can do this percpu operation without spinlock */
+ switch (set_flags) {
+ case CPUDL_SET_IDLE:
+ cpumask_set_cpu(cpu, cp->idle_cpus);
+ /* sync for cpudl_find() */
+ smp_rmb();
+ return;
+ case CPUDL_CLEAR_IDLE:
+ cpumask_clear_cpu(cpu, cp->idle_cpus);
+ /* sync for cpudl_find() */
+ smp_rmb();
+ return;
+ case CPUDL_SET_FREERT:
+ cpumask_set_cpu(cpu, cp->freert_cpus);
+ /* sync for cpudl_find() */
+ smp_rmb();
+ return;
+ case CPUDL_CLEAR_FREERT:
+ cpumask_clear_cpu(cpu, cp->freert_cpus);
+ /* sync for cpudl_find() */
+ smp_rmb();
+ return;
+ default:
+ break;
+ }
+
raw_spin_lock_irqsave(&cp->lock, flags);
old_idx = cp->elements[cpu].idx;
- if (!is_valid) {
+ if (set_flags == CPUDL_SET_FREEDL) {
/* remove item */
if (old_idx == IDX_INVALID) {
/*
@@ -163,8 +197,8 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid)
cpudl_exchange(cp, old_idx, parent(old_idx));
old_idx = parent(old_idx);
}
- cpumask_set_cpu(cpu, cp->free_cpus);
- cpudl_heapify(cp, old_idx);
+ cpumask_set_cpu(cpu, cp->freedl_cpus);
+ cpudl_heapify(cp, old_idx);
goto out;
}
@@ -175,7 +209,7 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid)
cp->elements[cp->size - 1].cpu = cpu;
cp->elements[cpu].idx = cp->size - 1;
cpudl_change_key(cp, cp->size - 1, dl);
- cpumask_clear_cpu(cpu, cp->free_cpus);
+ cpumask_clear_cpu(cpu, cp->freedl_cpus);
} else {
cpudl_change_key(cp, old_idx, dl);
}
@@ -200,19 +234,33 @@ int cpudl_init(struct cpudl *cp)
sizeof(struct cpudl_item),
GFP_KERNEL);
if (!cp->elements)
- return -ENOMEM;
+ goto out;
+
+ if (!alloc_cpumask_var(&cp->freedl_cpus, GFP_KERNEL))
+ goto free_elements;
+
+ if (!zalloc_cpumask_var(&cp->freert_cpus, GFP_KERNEL))
+ goto free_freedl_cpus;
+
+ if (!zalloc_cpumask_var(&cp->idle_cpus, GFP_KERNEL))
+ goto free_freert_cpus;
- if (!alloc_cpumask_var(&cp->free_cpus, GFP_KERNEL)) {
- kfree(cp->elements);
- return -ENOMEM;
- }
for_each_possible_cpu(i)
cp->elements[i].idx = IDX_INVALID;
- cpumask_setall(cp->free_cpus);
+ cpumask_setall(cp->freedl_cpus);
return 0;
+
+free_freert_cpus:
+ kfree(cp->freert_cpus);
+free_freedl_cpus:
+ kfree(cp->freedl_cpus);
+free_elements:
+ kfree(cp->elements);
+out:
+ return -ENOMEM;
}
/*
@@ -221,6 +269,8 @@ int cpudl_init(struct cpudl *cp)
*/
void cpudl_cleanup(struct cpudl *cp)
{
- free_cpumask_var(cp->free_cpus);
+ free_cpumask_var(cp->freedl_cpus);
+ free_cpumask_var(cp->freert_cpus);
+ free_cpumask_var(cp->idle_cpus);
kfree(cp->elements);
}
diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h
index dfdf594..20ebfffe 100644
--- a/kernel/sched/cpudeadline.h
+++ b/kernel/sched/cpudeadline.h
@@ -5,6 +5,13 @@
#define IDX_INVALID -1
+#define CPUDL_SET_DL 1 /* set deadline value, clear freedl_cpus */
+#define CPUDL_SET_FREEDL 2 /* set freedl_cpus */
+#define CPUDL_SET_FREERT 3 /* set freert_cpus */
+#define CPUDL_CLEAR_FREERT 4 /* clear freert_cpus */
+#define CPUDL_SET_IDLE 5 /* set idle_cpus */
+#define CPUDL_CLEAR_IDLE 6 /* clear idle_cpus */
+
struct cpudl_item {
u64 dl;
int cpu;
@@ -14,7 +21,9 @@ struct cpudl_item {
struct cpudl {
raw_spinlock_t lock;
int size;
- cpumask_var_t free_cpus;
+ cpumask_var_t idle_cpus;
+ cpumask_var_t freert_cpus;
+ cpumask_var_t freedl_cpus;
struct cpudl_item *elements;
};
@@ -22,7 +31,7 @@ struct cpudl {
#ifdef CONFIG_SMP
int cpudl_find(struct cpudl *cp, struct task_struct *p,
struct cpumask *later_mask, int set_flag);
-void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid);
+void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int set_flags);
int cpudl_init(struct cpudl *cp);
void cpudl_cleanup(struct cpudl *cp);
#endif /* CONFIG_SMP */
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index ddb6185..dc021a1 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -665,6 +665,26 @@ static void update_curr_dl(struct rq *rq)
#ifdef CONFIG_SMP
+void idle_enter_dl(struct rq *this_rq)
+{
+ cpudl_set(&this_rq->rd->cpudl, this_rq->cpu, 0, CPUDL_SET_IDLE);
+}
+
+void idle_exit_dl(struct rq *this_rq)
+{
+ cpudl_set(&this_rq->rd->cpudl, this_rq->cpu, 0, CPUDL_CLEAR_IDLE);
+}
+
+void rt_enter_dl(struct rq *this_rq)
+{
+ cpudl_set(&this_rq->rd->cpudl, this_rq->cpu, 0, CPUDL_CLEAR_FREERT);
+}
+
+void rt_exit_dl(struct rq *this_rq)
+{
+ cpudl_set(&this_rq->rd->cpudl, this_rq->cpu, 0, CPUDL_SET_FREERT);
+}
+
static struct task_struct *pick_next_earliest_dl_task(struct rq *rq, int cpu);
static inline u64 next_deadline(struct rq *rq)
@@ -691,7 +711,7 @@ static void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline)
*/
dl_rq->earliest_dl.next = dl_rq->earliest_dl.curr;
dl_rq->earliest_dl.curr = deadline;
- cpudl_set(&rq->rd->cpudl, rq->cpu, deadline, 1);
+ cpudl_set(&rq->rd->cpudl, rq->cpu, deadline, CPUDL_SET_DL);
} else if (dl_rq->earliest_dl.next == 0 ||
dl_time_before(deadline, dl_rq->earliest_dl.next)) {
/*
@@ -715,7 +735,7 @@ static void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline)
if (!dl_rq->dl_nr_running) {
dl_rq->earliest_dl.curr = 0;
dl_rq->earliest_dl.next = 0;
- cpudl_set(&rq->rd->cpudl, rq->cpu, 0, 0);
+ cpudl_set(&rq->rd->cpudl, rq->cpu, 0, CPUDL_SET_FREEDL);
} else {
struct rb_node *leftmost = dl_rq->rb_leftmost;
struct sched_dl_entity *entry;
@@ -723,7 +743,8 @@ static void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline)
entry = rb_entry(leftmost, struct sched_dl_entity, rb_node);
dl_rq->earliest_dl.curr = entry->deadline;
dl_rq->earliest_dl.next = next_deadline(rq);
- cpudl_set(&rq->rd->cpudl, rq->cpu, entry->deadline, 1);
+ cpudl_set(&rq->rd->cpudl, rq->cpu,
+ entry->deadline, CPUDL_SET_DL);
}
}
@@ -1560,7 +1581,8 @@ static void rq_online_dl(struct rq *rq)
dl_set_overload(rq);
if (rq->dl.dl_nr_running > 0)
- cpudl_set(&rq->rd->cpudl, rq->cpu, rq->dl.earliest_dl.curr, 1);
+ cpudl_set(&rq->rd->cpudl, rq->cpu,
+ rq->dl.earliest_dl.curr, CPUDL_SET_DL);
}
/* Assumes rq->lock is held */
@@ -1569,7 +1591,7 @@ static void rq_offline_dl(struct rq *rq)
if (rq->dl.overloaded)
dl_clear_overload(rq);
- cpudl_set(&rq->rd->cpudl, rq->cpu, 0, 0);
+ cpudl_set(&rq->rd->cpudl, rq->cpu, 0, CPUDL_SET_FREEDL);
}
void init_sched_dl_class(void)
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index e053347..7838e56 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -26,6 +26,7 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl
static struct task_struct *
pick_next_task_idle(struct rq *rq, struct task_struct *prev)
{
+ idle_enter_dl(rq);
idle_enter_rt(rq);
put_prev_task(rq, prev);
@@ -49,6 +50,7 @@ dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags)
static void put_prev_task_idle(struct rq *rq, struct task_struct *prev)
{
+ idle_exit_dl(rq);
idle_exit_rt(rq);
idle_exit_fair(rq);
rq_last_tick_reset(rq);
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 49164f1..ee49b94 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1484,6 +1484,9 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev)
if (!rt_rq->rt_queued)
return NULL;
+ if (prev->sched_class != &rt_sched_class)
+ rt_enter_dl(rq);
+
put_prev_task(rq, prev);
p = _pick_next_task_rt(rq);
@@ -1498,6 +1501,10 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev)
static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
{
+ /* Ignore preemption by the stop class; dl preemption doesn't matter here */
+ if (rq->curr->sched_class != &rt_sched_class)
+ rt_exit_dl(rq);
+
update_curr_rt(rq);
/*
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index cc603fa..b76dfef 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1162,6 +1162,12 @@ extern void update_group_capacity(struct sched_domain *sd, int cpu);
extern void trigger_load_balance(struct rq *rq);
+extern void rt_enter_dl(struct rq *this_rq);
+extern void rt_exit_dl(struct rq *this_rq);
+
+extern void idle_enter_dl(struct rq *this_rq);
+extern void idle_exit_dl(struct rq *this_rq);
+
extern void idle_enter_rt(struct rq *this_rq);
extern void idle_exit_rt(struct rq *this_rq);
@@ -1169,6 +1175,11 @@ extern void idle_enter_fair(struct rq *this_rq);
extern void idle_exit_fair(struct rq *this_rq);
#else
+static inline void rt_enter_dl(struct rq *rq) { }
+static inline void rt_exit_dl(struct rq *rq) { }
+
+static inline void idle_enter_dl(struct rq *rq) { }
+static inline void idle_exit_dl(struct rq *rq) { }
static inline void idle_enter_rt(struct rq *rq) { }
static inline void idle_exit_rt(struct rq *rq) { }
--
2.1.0
cpudl_find() has some problems:
1) In check_preempt_equal_dl() it is called with a NULL later_mask, so
cpudl_find() doesn't check cpudl.free_cpus at all.
2) The system isn't always overloaded with DL tasks; when it is, every
cpu may be running a DL task, and because we only return the first
maximum-deadline cpu, the cpu returned may not be the best one (is
there a need to iterate over cpus with the same deadline value to find
more candidates?).
So it seems reasonable to change the return value of cpudl_find() to a
bool, since the cpu it picks isn't always the best one anyway; a better
choice can be made in find_later_rq() via the sched_domain topology.
3) In the "else if" branch, cpus_allowed is tested again.
This patch syncs the logic with a former patch by Juri Lelli, which
solves problem 1) naturally, and modifies cpudl_find() and all its call
sites to address these problems.
The former patch by Juri Lelli is:
"sched/deadline: Fix inter- exclusive cpusets migrations"
Signed-off-by: pang.xunlei <[email protected]>
---
kernel/sched/cpudeadline.c | 29 ++++++++++++++---------------
kernel/sched/cpudeadline.h | 2 +-
kernel/sched/deadline.c | 45 ++++++++++++++++++++++++++-------------------
3 files changed, 41 insertions(+), 35 deletions(-)
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
index 539ca3c..72a3da3 100644
--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@@ -97,30 +97,29 @@ static inline int cpudl_maximum(struct cpudl *cp)
* cpudl_find - find the best (later-dl) CPU in the system
* @cp: the cpudl max-heap context
* @p: the task
- * @later_mask: a mask to fill in with the selected CPUs (or NULL)
+ * @later_mask: a mask used to filter cpus, also used to fill
+ * in with the selected CPUs if set_flag is set. Not NULL.
+ * @set_flag: a flag to determine if should set the later_mask.
*
- * Returns: int - best CPU (heap maximum if suitable)
+ * Returns: (int)bool - CPUs were found
*/
int cpudl_find(struct cpudl *cp, struct task_struct *p,
- struct cpumask *later_mask)
+ struct cpumask *later_mask, int set_flag)
{
- int best_cpu = -1;
+ struct cpumask tmp_mask;
const struct sched_dl_entity *dl_se = &p->dl;
- if (later_mask && cpumask_and(later_mask, later_mask, cp->free_cpus)) {
- best_cpu = cpumask_any(later_mask);
- goto out;
- } else if (cpumask_test_cpu(cpudl_maximum(cp), &p->cpus_allowed) &&
+
+ if (cpumask_and(&tmp_mask, later_mask, cp->free_cpus)) {
+ if (set_flag)
+ cpumask_copy(later_mask, &tmp_mask);
+ return 1;
+ } else if (cpumask_and(later_mask, later_mask, cpumask_of(cpudl_maximum(cp))) &&
dl_time_before(dl_se->deadline, cp->elements[0].dl)) {
- best_cpu = cpudl_maximum(cp);
- if (later_mask)
- cpumask_set_cpu(best_cpu, later_mask);
+ return 1;
}
-out:
- WARN_ON(best_cpu != -1 && !cpu_present(best_cpu));
-
- return best_cpu;
+ return 0;
}
/*
diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h
index 538c979..0c9636e 100644
--- a/kernel/sched/cpudeadline.h
+++ b/kernel/sched/cpudeadline.h
@@ -21,7 +21,7 @@ struct cpudl {
#ifdef CONFIG_SMP
int cpudl_find(struct cpudl *cp, struct task_struct *p,
- struct cpumask *later_mask);
+ struct cpumask *later_mask, int set_flag);
void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid);
int cpudl_init(struct cpudl *cp);
void cpudl_cleanup(struct cpudl *cp);
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 256e577..42edfcd 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -946,23 +946,34 @@ out:
return cpu;
}
+static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask_dl);
+
static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p)
{
+ struct cpumask *later_mask = this_cpu_cpumask_var_ptr(local_cpu_mask_dl);
+
/*
* Current can't be migrated, useless to reschedule,
* let's hope p can move out.
*/
- if (rq->curr->nr_cpus_allowed == 1 ||
- cpudl_find(&rq->rd->cpudl, rq->curr, NULL) == -1)
+ if (rq->curr->nr_cpus_allowed == 1)
+ return;
+
+ cpumask_and(later_mask, rq->rd->span, cpu_active_mask);
+ cpumask_and(later_mask, later_mask, &rq->curr->cpus_allowed);
+ if (!cpudl_find(&rq->rd->cpudl, rq->curr, later_mask, 0))
return;
/*
* p is migratable, so let's not schedule it and
* see if it is pushed or pulled somewhere else.
*/
- if (p->nr_cpus_allowed != 1 &&
- cpudl_find(&rq->rd->cpudl, p, NULL) != -1)
- return;
+ if (p->nr_cpus_allowed != 1) {
+ cpumask_and(later_mask, rq->rd->span, cpu_active_mask);
+ cpumask_and(later_mask, later_mask, &p->cpus_allowed);
+ if (cpudl_find(&rq->rd->cpudl, p, later_mask, 0))
+ return;
+ }
resched_curr(rq);
}
@@ -1148,14 +1159,12 @@ next_node:
return NULL;
}
-static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask_dl);
-
static int find_later_rq(struct task_struct *task)
{
struct sched_domain *sd;
struct cpumask *later_mask = this_cpu_cpumask_var_ptr(local_cpu_mask_dl);
int this_cpu = smp_processor_id();
- int best_cpu, cpu = task_cpu(task);
+ int cpu = task_cpu(task);
/* Make sure the mask is initialized first */
if (unlikely(!later_mask))
@@ -1168,14 +1177,14 @@ static int find_later_rq(struct task_struct *task)
* We have to consider system topology and task affinity
* first, then we can look for a suitable cpu.
*/
- cpumask_copy(later_mask, task_rq(task)->rd->span);
- cpumask_and(later_mask, later_mask, cpu_active_mask);
+ cpumask_and(later_mask, task_rq(task)->rd->span, cpu_active_mask);
cpumask_and(later_mask, later_mask, &task->cpus_allowed);
- best_cpu = cpudl_find(&task_rq(task)->rd->cpudl,
- task, later_mask);
- if (best_cpu == -1)
+ if (!cpudl_find(&task_rq(task)->rd->cpudl, task, later_mask, 1))
return -1;
+ if (cpumask_weight(later_mask) == 1)
+ return cpumask_any(later_mask);
+
/*
* If we are here, some target has been found,
* the most suitable of which is cached in best_cpu.
@@ -1200,6 +1209,7 @@ static int find_later_rq(struct task_struct *task)
rcu_read_lock();
for_each_domain(cpu, sd) {
+ int best_cpu;
if (sd->flags & SD_WAKE_AFFINE) {
/*
@@ -1212,12 +1222,9 @@ static int find_later_rq(struct task_struct *task)
return this_cpu;
}
- /*
- * Last chance: if best_cpu is valid and is
- * in the mask, that becomes our choice.
- */
- if (best_cpu < nr_cpu_ids &&
- cpumask_test_cpu(best_cpu, sched_domain_span(sd))) {
+ best_cpu = cpumask_first_and(later_mask,
+ sched_domain_span(sd));
+ if (best_cpu < nr_cpu_ids) {
rcu_read_unlock();
return best_cpu;
}
--
2.1.0
Add handling for sched_domains that do not have the SD_WAKE_AFFINE flag
set when selecting a cpu; users may clear this flag via proc.
Signed-off-by: pang.xunlei <[email protected]>
---
kernel/sched/deadline.c | 19 ++++++++++++++-----
1 file changed, 14 insertions(+), 5 deletions(-)
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 42edfcd..ddb6185 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1165,6 +1165,7 @@ static int find_later_rq(struct task_struct *task)
struct cpumask *later_mask = this_cpu_cpumask_var_ptr(local_cpu_mask_dl);
int this_cpu = smp_processor_id();
int cpu = task_cpu(task);
+ int cachehot_cpu = nr_cpu_ids;
/* Make sure the mask is initialized first */
if (unlikely(!later_mask))
@@ -1209,7 +1210,7 @@ static int find_later_rq(struct task_struct *task)
rcu_read_lock();
for_each_domain(cpu, sd) {
- int best_cpu;
+ int wakeaffine_cpu;
if (sd->flags & SD_WAKE_AFFINE) {
/*
@@ -1222,16 +1223,24 @@ static int find_later_rq(struct task_struct *task)
return this_cpu;
}
- best_cpu = cpumask_first_and(later_mask,
- sched_domain_span(sd));
- if (best_cpu < nr_cpu_ids) {
+ wakeaffine_cpu = cpumask_first_and(later_mask,
+ sched_domain_span(sd));
+ if (wakeaffine_cpu < nr_cpu_ids) {
rcu_read_unlock();
- return best_cpu;
+ return wakeaffine_cpu;
}
+ } else {
+ if (cachehot_cpu >= nr_cpu_ids)
+ cachehot_cpu = cpumask_first_and(later_mask,
+ sched_domain_span(sd));
}
}
rcu_read_unlock();
+ /* Most likely cache-hot */
+ if (cachehot_cpu < nr_cpu_ids)
+ return cachehot_cpu;
+
/*
* At this point, all our guesses failed, we just return
* 'something', and let the caller sort the things out.
--
2.1.0