Here are three patches that correct scale usage in both
fix_small_imbalance() and update_sg_lb_stats(), and add a comment
describing when fix_small_imbalance() would cause a load change.
V2: fix scale usage for update_sg_lb_stats()
V3: fix the scale problem in comparing sds->busiest_load_per_task
    and sds->avg_load when there is an imbalanced group inside the domain
Lei Wen (3):
sched: reduce calculation effort in fix_small_imbalance
sched: scale the busy and this queue's per-task load before compare
sched: scale cpu load for judgment of group imbalance
kernel/sched/fair.c | 64 +++++++++++++++++++++++++++++++--------------------
1 file changed, 39 insertions(+), 25 deletions(-)
--
1.7.10.4
Since max_load and this_load are values that have already been scaled,
it is not reasonable to take the minimum of a scaled and a non-scaled
value, as in the example below:

	min(sds->busiest_load_per_task, sds->max_load);
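
For reference, max_load and this_load already live in the power-scaled
domain; a raw per-task load has to be converted the same way before such
a comparison makes sense. A rough sketch (with load and group standing
for the value and scheduling group in question):

	/* convert a raw load into the power-scaled domain */
	scaled_load = (load * SCHED_POWER_SCALE) / group->sgp->power;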
Also add a comment describing under what condition there would be a
cpu power gain in moving the load.
Signed-off-by: Lei Wen <[email protected]>
---
kernel/sched/fair.c | 38 +++++++++++++++++++++++++-------------
1 file changed, 25 insertions(+), 13 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 28052fa..6173095 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4692,10 +4692,14 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
{
unsigned long tmp, pwr_now = 0, pwr_move = 0;
unsigned int imbn = 2;
- unsigned long scaled_busy_load_per_task;
if (sds->this_nr_running) {
sds->this_load_per_task /= sds->this_nr_running;
+
+ /* Scale this_load_per_task so it does not depend on local cpu power */
+ sds->this_load_per_task <<= SCHED_POWER_SHIFT;
+ sds->this_load_per_task /= sds->this->sgp->power;
+
if (sds->busiest_load_per_task >
sds->this_load_per_task)
imbn = 1;
@@ -4704,12 +4708,8 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
cpu_avg_load_per_task(env->dst_cpu);
}
- scaled_busy_load_per_task = sds->busiest_load_per_task
- * SCHED_POWER_SCALE;
- scaled_busy_load_per_task /= sds->busiest->sgp->power;
-
- if (sds->max_load - sds->this_load + scaled_busy_load_per_task >=
- (scaled_busy_load_per_task * imbn)) {
+ if (sds->max_load - sds->this_load + sds->busiest_load_per_task >=
+ (sds->busiest_load_per_task * imbn)) {
env->imbalance = sds->busiest_load_per_task;
return;
}
@@ -4727,22 +4727,29 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
pwr_now /= SCHED_POWER_SCALE;
/* Amount of load we'd subtract */
- if (sds->max_load > scaled_busy_load_per_task) {
+ if (sds->max_load > sds->busiest_load_per_task) {
pwr_move += sds->busiest->sgp->power *
min(sds->busiest_load_per_task,
- sds->max_load - scaled_busy_load_per_task);
- tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) /
- sds->this->sgp->power;
+ sds->max_load - sds->busiest_load_per_task);
+ tmp = sds->busiest_load_per_task;
} else
- tmp = (sds->max_load * sds->busiest->sgp->power) /
- sds->this->sgp->power;
+ tmp = sds->max_load;
+ /* Scale from the busiest queue's cpu power to this queue's */
+ tmp = (tmp * sds->busiest->sgp->power) /
+ sds->this->sgp->power;
/* Amount of load we'd add */
pwr_move += sds->this->sgp->power *
min(sds->this_load_per_task, sds->this_load + tmp);
pwr_move /= SCHED_POWER_SCALE;
/* Move if we gain throughput */
+ /*
+ * The only way the condition below can be true is when
+ * sds->max_load is larger than sds->busiest_load_per_task, while
+ * sds->busiest_load_per_task is larger than sds->this_load plus
+ * the scaled sds->busiest_load_per_task moved into this queue.
+ */
if (pwr_move > pwr_now)
env->imbalance = sds->busiest_load_per_task;
}
@@ -4758,6 +4765,11 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
unsigned long max_pull, load_above_capacity = ~0UL;
sds->busiest_load_per_task /= sds->busiest_nr_running;
+
+ /* Scale busiest_load_per_task so it does not depend on local cpu power */
+ sds->busiest_load_per_task <<= SCHED_POWER_SHIFT;
+ sds->busiest_load_per_task /= sds->busiest->sgp->power;
+
if (sds->group_imb) {
sds->busiest_load_per_task =
min(sds->busiest_load_per_task, sds->avg_load);
--
1.7.10.4
Actually, all of the computations below can be replaced by
scaled_busy_load_per_task:

	(sds->busiest_load_per_task * SCHED_POWER_SCALE) /
		sds->busiest->sgp->power;
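
For reference, fix_small_imbalance() already computes this value once
earlier in the function, roughly as:

	scaled_busy_load_per_task = sds->busiest_load_per_task
					* SCHED_POWER_SCALE;
	scaled_busy_load_per_task /= sds->busiest->sgp->power;

so the hunks below simply reuse that local instead of recomputing the
expression.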
Signed-off-by: Lei Wen <[email protected]>
---
kernel/sched/fair.c | 19 ++++++++-----------
1 file changed, 8 insertions(+), 11 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c61a614..28052fa 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4727,20 +4727,17 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
pwr_now /= SCHED_POWER_SCALE;
/* Amount of load we'd subtract */
- tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) /
- sds->busiest->sgp->power;
- if (sds->max_load > tmp)
+ if (sds->max_load > scaled_busy_load_per_task) {
pwr_move += sds->busiest->sgp->power *
- min(sds->busiest_load_per_task, sds->max_load - tmp);
-
- /* Amount of load we'd add */
- if (sds->max_load * sds->busiest->sgp->power <
- sds->busiest_load_per_task * SCHED_POWER_SCALE)
- tmp = (sds->max_load * sds->busiest->sgp->power) /
- sds->this->sgp->power;
- else
+ min(sds->busiest_load_per_task,
+ sds->max_load - scaled_busy_load_per_task);
tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) /
sds->this->sgp->power;
+ } else
+ tmp = (sds->max_load * sds->busiest->sgp->power) /
+ sds->this->sgp->power;
+
+ /* Amount of load we'd add */
pwr_move += sds->this->sgp->power *
min(sds->this_load_per_task, sds->this_load + tmp);
pwr_move /= SCHED_POWER_SCALE;
--
1.7.10.4
We cannot directly compare loads taken from two cpus, since the cpu
power of the two cpus may differ greatly. Suppose we have the following
two cpus:
CPU A:
	No real-time work, and there are 3 tasks, with rq->load.weight
being 512.
CPU B:
	Has real-time work that takes 3/4 of the cpu power, leaving CFS
with only 1/4, i.e. 1024/4 = 256 cpu power. Its CFS runqueue holds only
one task, with a weight of 128.
Since the CFS tasks on each cpu consume half of the cpu power available
to CFS there, this case should be considered balanced.
But the original judgment:

	if ((max_cpu_load - min_cpu_load) >= avg_load_per_task &&
	    (max_nr_running - min_nr_running) > 1)

evaluates (512 - 128) >= ((512 + 128) / 4) here and wrongly concludes
that there is an imbalance.
Scale the load instead, to avoid such cases.
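
With the loads scaled by cpu power, the same example works out as
(assuming CPU A keeps the full 1024 of cpu power for CFS, while CPU B's
CFS share is 256):

	CPU A: 512 * 1024 / 1024 = 512
	CPU B: 128 * 1024 /  256 = 512

max_cpu_load - min_cpu_load then becomes 0, and no imbalance is
reported.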
Signed-off-by: Lei Wen <[email protected]>
---
kernel/sched/fair.c | 19 ++++++++++++-------
1 file changed, 12 insertions(+), 7 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 6173095..12826f9 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4434,7 +4434,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
int local_group, int *balance, struct sg_lb_stats *sgs)
{
unsigned long nr_running, max_nr_running, min_nr_running;
- unsigned long load, max_cpu_load, min_cpu_load;
+ unsigned long scaled_load, load, max_cpu_load, min_cpu_load;
unsigned int balance_cpu = -1, first_idle_cpu = 0;
unsigned long avg_load_per_task = 0;
int i;
@@ -4464,10 +4464,12 @@ static inline void update_sg_lb_stats(struct lb_env *env,
load = target_load(i, load_idx);
} else {
load = source_load(i, load_idx);
- if (load > max_cpu_load)
- max_cpu_load = load;
- if (min_cpu_load > load)
- min_cpu_load = load;
+ scaled_load = load * SCHED_POWER_SCALE
+ / cpu_rq(i)->cpu_power;
+ if (scaled_load > max_cpu_load)
+ max_cpu_load = scaled_load;
+ if (min_cpu_load > scaled_load)
+ min_cpu_load = scaled_load;
if (nr_running > max_nr_running)
max_nr_running = nr_running;
@@ -4511,8 +4513,11 @@ static inline void update_sg_lb_stats(struct lb_env *env,
* normalized nr_running number somewhere that negates
* the hierarchy?
*/
- if (sgs->sum_nr_running)
- avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
+ if (sgs->sum_nr_running) {
+ avg_load_per_task = sgs->sum_weighted_load * SCHED_POWER_SCALE
+ / group->sgp->power;
+ avg_load_per_task /= sgs->sum_nr_running;
+ }
if ((max_cpu_load - min_cpu_load) >= avg_load_per_task &&
(max_nr_running - min_nr_running) > 1)
--
1.7.10.4