Here is a patch set which corrects scale usage in both fix_small_imbalance
and update_sg_lb_stats, and adds a comment on when fix_small_imbalance
would cause a load change.
V2: fix scale usage for update_sg_lb_stats
V3: fix the scale problem when comparing sds->busiest_load_per_task
    with sds->avg_load when there is an unbalanced group inside the domain
V4: env->imbalance should be set with the non-scaled value; fix accordingly.
    Fix one ping-pong adjustment for a special case.
Lei Wen (4):
sched: reduce calculation effort in fix_small_imbalance
sched: scale the busy and this queue's per-task load before compare
sched: scale cpu load for judgment of group imbalance
sched: adjust fix_small_imbalance moving task condition
kernel/sched/fair.c | 85 ++++++++++++++++++++++++++++++++-------------------
1 file changed, 54 insertions(+), 31 deletions(-)
--
1.7.10.4
We cannot directly compare loads from two cpus, since the cpu power
of the two cpus may vary largely.
Suppose we have the following two kinds of cpus.
CPU A:
	No real time work, and there are 3 tasks, with rq->load.weight
being 512.
CPU B:
	Has real time work, which takes 3/4 of the cpu power, so CFS
only gets 1/4, that is 1024/4 = 256 cpu power. On its CFS runqueue,
there is only one task with weight 128.
Since on both cpus the CFS load takes half of the cpu power available
to CFS, such a case should be considered balanced.
But the original judgment:
	if ((max_cpu_load - min_cpu_load) >= avg_load_per_task &&
	    (max_nr_running - min_nr_running) > 1)
evaluates (512 - 128) >= ((512 + 128) / 4) as true, and leads to an
imbalance conclusion...
Make the load scaled, as in the sketch below, to avoid such a case.
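Below is a minimal standalone sketch of the arithmetic (the weights,
cpu powers and the SCHED_POWER_SCALE value of 1024 are taken from the
example above; it is an illustration only, not kernel code):

	#include <stdio.h>

	#define SCHED_POWER_SCALE 1024UL

	int main(void)
	{
		/* CPU A: 3 CFS tasks, rq->load.weight 512, full cpu power 1024 */
		unsigned long load_a = 512, power_a = 1024;
		/* CPU B: 1 CFS task of weight 128, CFS owns only 256 cpu power */
		unsigned long load_b = 128, power_b = 256;

		unsigned long scaled_a = load_a * SCHED_POWER_SCALE / power_a; /* 512 */
		unsigned long scaled_b = load_b * SCHED_POWER_SCALE / power_b; /* 512 */

		/* raw: 512 - 128 = 384 >= (512 + 128) / 4 = 160 -> "imbalanced"
		 * scaled: 512 - 512 = 0 -> balanced */
		printf("raw diff %lu, scaled diff %lu\n",
		       load_a - load_b, scaled_a - scaled_b);
		return 0;
	}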
Signed-off-by: Lei Wen <[email protected]>
---
kernel/sched/fair.c | 19 ++++++++++++-------
1 file changed, 12 insertions(+), 7 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index fd9cbee..c478022 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4434,7 +4434,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
int local_group, int *balance, struct sg_lb_stats *sgs)
{
unsigned long nr_running, max_nr_running, min_nr_running;
- unsigned long load, max_cpu_load, min_cpu_load;
+ unsigned long scaled_load, load, max_cpu_load, min_cpu_load;
unsigned int balance_cpu = -1, first_idle_cpu = 0;
unsigned long avg_load_per_task = 0;
int i;
@@ -4464,10 +4464,12 @@ static inline void update_sg_lb_stats(struct lb_env *env,
load = target_load(i, load_idx);
} else {
load = source_load(i, load_idx);
- if (load > max_cpu_load)
- max_cpu_load = load;
- if (min_cpu_load > load)
- min_cpu_load = load;
+ scaled_load = load * SCHED_POWER_SCALE
+ / cpu_rq(i)->cpu_power;
+ if (scaled_load > max_cpu_load)
+ max_cpu_load = scaled_load;
+ if (min_cpu_load > scaled_load)
+ min_cpu_load = scaled_load;
if (nr_running > max_nr_running)
max_nr_running = nr_running;
@@ -4511,8 +4513,11 @@ static inline void update_sg_lb_stats(struct lb_env *env,
* normalized nr_running number somewhere that negates
* the hierarchy?
*/
- if (sgs->sum_nr_running)
- avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
+ if (sgs->sum_nr_running) {
+ avg_load_per_task = sgs->sum_weighted_load * SCHED_POWER_SCALE
+ / group->sgp->power;
+ avg_load_per_task /= sgs->sum_nr_running;
+ }
if ((max_cpu_load - min_cpu_load) >= avg_load_per_task &&
(max_nr_running - min_nr_running) > 1)
--
1.7.10.4
Suppose we have the following example: in the current domain there are
two cpus, CPU A with 4 tasks and CPU B with 5 tasks. All tasks on the
two runqueues have the same priority.
Let's say the load of CPU A is 4096 and that of CPU B is 5120, so
avg_load is 4608. Suppose sgp->power == SCHED_POWER_SCALE.
Then the imbalance value is 512, less than busiest_load_per_task,
which is 1024, so the logic enters the fix_small_imbalance adjustment.
Since busiest_load_per_task == this_load_per_task, imbn is 2.
Thus the statement below is true, and load balance moves one task
from CPU B to CPU A:
	if (sds->max_load - sds->this_load + scaled_busy_load_per_task >=
			(scaled_busy_load_per_task * imbn)) {
When CPU B runs load balance later, it moves one task back again,
and this never ends...
Kill the >= and make it >, to stop this loop.
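A minimal standalone sketch of that condition with the numbers above
(assuming sgp->power == SCHED_POWER_SCALE so the raw and scaled per-task
loads coincide; illustration only, not kernel code):

	#include <stdio.h>

	int main(void)
	{
		/* CPU B (busiest): 5 tasks, load 5120; CPU A (this): 4 tasks, load 4096 */
		unsigned long max_load = 5120, this_load = 4096;
		unsigned long scaled_busy_load_per_task = 5120 / 5;	/* 1024 */
		unsigned int imbn = 2;	/* per-task loads of both queues are equal */

		unsigned long lhs = max_load - this_load + scaled_busy_load_per_task;
		unsigned long rhs = scaled_busy_load_per_task * imbn;

		/* lhs == rhs == 2048: with ">=" one task is moved and the two
		 * cpus merely swap roles, so the next balance moves it back
		 * (ping-pong); with ">" nothing is moved. */
		printf("lhs %lu rhs %lu, '>=' moves: %d, '>' moves: %d\n",
		       lhs, rhs, lhs >= rhs, lhs > rhs);
		return 0;
	}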
Signed-off-by: Lei Wen <[email protected]>
---
kernel/sched/fair.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c478022..3be7844 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4717,7 +4717,7 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds,
<< SCHED_POWER_SHIFT;
scaled_this_load_per_task /= sds->this->sgp->power;
- if (sds->max_load - sds->this_load + scaled_busy_load_per_task >=
+ if (sds->max_load - sds->this_load + scaled_busy_load_per_task >
(scaled_busy_load_per_task * imbn)) {
env->imbalance = sds->busiest_load_per_task;
return;
--
1.7.10.4
Actually all the items below can be replaced by scaled_busy_load_per_task:
	(sds->busiest_load_per_task * SCHED_POWER_SCALE) /
		sds->busiest->sgp->power;
Signed-off-by: Lei Wen <[email protected]>
---
kernel/sched/fair.c | 19 ++++++++-----------
1 file changed, 8 insertions(+), 11 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c61a614..28052fa 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4727,20 +4727,17 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
pwr_now /= SCHED_POWER_SCALE;
/* Amount of load we'd subtract */
- tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) /
- sds->busiest->sgp->power;
- if (sds->max_load > tmp)
+ if (sds->max_load > scaled_busy_load_per_task) {
pwr_move += sds->busiest->sgp->power *
- min(sds->busiest_load_per_task, sds->max_load - tmp);
-
- /* Amount of load we'd add */
- if (sds->max_load * sds->busiest->sgp->power <
- sds->busiest_load_per_task * SCHED_POWER_SCALE)
- tmp = (sds->max_load * sds->busiest->sgp->power) /
- sds->this->sgp->power;
- else
+ min(sds->busiest_load_per_task,
+ sds->max_load - scaled_busy_load_per_task);
tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) /
sds->this->sgp->power;
+ } else
+ tmp = (sds->max_load * sds->busiest->sgp->power) /
+ sds->this->sgp->power;
+
+ /* Amount of load we'd add */
pwr_move += sds->this->sgp->power *
min(sds->this_load_per_task, sds->this_load + tmp);
pwr_move /= SCHED_POWER_SCALE;
--
1.7.10.4
Since max_load and this_load are values that are already scaled, it is
not reasonable to take the minimum of a scaled and a non-scaled value,
as in the example below:
	min(sds->busiest_load_per_task, sds->max_load);
Also add a comment on under what condition there would be a cpu power
gain in moving the load.
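A minimal standalone sketch of why mixing the two units is misleading
(illustrative numbers, assuming the busiest cpu leaves CFS a quarter of
SCHED_POWER_SCALE as cpu power; not kernel code):

	#include <stdio.h>

	#define SCHED_POWER_SCALE 1024UL
	#define min(a, b) ((a) < (b) ? (a) : (b))

	int main(void)
	{
		/* busiest queue: one task of weight 128 on a cpu whose CFS
		 * power is 256, so its scaled load (max_load) is
		 * 128 * 1024 / 256 = 512 */
		unsigned long busiest_load_per_task = 128;	/* raw, not scaled */
		unsigned long busiest_power = 256;
		unsigned long max_load = 512;			/* already scaled */

		unsigned long scaled_busy_load_per_task =
			busiest_load_per_task * SCHED_POWER_SCALE / busiest_power;

		/* mixed units pick 128 although, expressed in the same scaled
		 * unit, both operands are 512 */
		printf("mixed min %lu, scaled min %lu\n",
		       min(busiest_load_per_task, max_load),
		       min(scaled_busy_load_per_task, max_load));
		return 0;
	}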
Signed-off-by: Lei Wen <[email protected]>
---
kernel/sched/fair.c | 55 +++++++++++++++++++++++++++++++++++----------------
1 file changed, 38 insertions(+), 17 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 28052fa..fd9cbee 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4686,16 +4686,19 @@ static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
* load balancing.
* @env: The load balancing environment.
* @sds: Statistics of the sched_domain whose imbalance is to be calculated.
+ * @scaled_busy_load_per_task: pre-calculated average per-task load of the busiest queue
*/
static inline
-void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
+void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds,
+ unsigned long scaled_busy_load_per_task)
{
unsigned long tmp, pwr_now = 0, pwr_move = 0;
+ unsigned long scaled_this_load_per_task;
unsigned int imbn = 2;
- unsigned long scaled_busy_load_per_task;
if (sds->this_nr_running) {
sds->this_load_per_task /= sds->this_nr_running;
+
if (sds->busiest_load_per_task >
sds->this_load_per_task)
imbn = 1;
@@ -4704,9 +4707,10 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
cpu_avg_load_per_task(env->dst_cpu);
}
- scaled_busy_load_per_task = sds->busiest_load_per_task
- * SCHED_POWER_SCALE;
- scaled_busy_load_per_task /= sds->busiest->sgp->power;
+ /* Scale this_load_per_task so it is independent of this group's cpu power */
+ scaled_this_load_per_task = sds->this_load_per_task
+ << SCHED_POWER_SHIFT;
+ scaled_this_load_per_task /= sds->this->sgp->power;
if (sds->max_load - sds->this_load + scaled_busy_load_per_task >=
(scaled_busy_load_per_task * imbn)) {
@@ -4721,28 +4725,35 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
*/
pwr_now += sds->busiest->sgp->power *
- min(sds->busiest_load_per_task, sds->max_load);
+ min(scaled_busy_load_per_task, sds->max_load);
pwr_now += sds->this->sgp->power *
- min(sds->this_load_per_task, sds->this_load);
+ min(scaled_this_load_per_task, sds->this_load);
pwr_now /= SCHED_POWER_SCALE;
/* Amount of load we'd subtract */
if (sds->max_load > scaled_busy_load_per_task) {
pwr_move += sds->busiest->sgp->power *
- min(sds->busiest_load_per_task,
+ min(scaled_busy_load_per_task,
sds->max_load - scaled_busy_load_per_task);
- tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) /
- sds->this->sgp->power;
+ tmp = scaled_busy_load_per_task;
} else
- tmp = (sds->max_load * sds->busiest->sgp->power) /
- sds->this->sgp->power;
+ tmp = sds->max_load;
+ /* Scale to this queue from busiest queue */
+ tmp = (tmp * sds->busiest->sgp->power) /
+ sds->this->sgp->power;
/* Amount of load we'd add */
pwr_move += sds->this->sgp->power *
- min(sds->this_load_per_task, sds->this_load + tmp);
+ min(scaled_this_load_per_task, sds->this_load + tmp);
pwr_move /= SCHED_POWER_SCALE;
/* Move if we gain throughput */
+ /*
+ * The only possibility for the statement below to be true is:
+ * sds->max_load is larger than sds->busiest_load_per_task, while
+ * sds->busiest_load_per_task is larger than sds->this_load plus
+ * the scaled sds->busiest_load_per_task moved into this queue.
+ */
if (pwr_move > pwr_now)
env->imbalance = sds->busiest_load_per_task;
}
@@ -4756,11 +4767,21 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
{
unsigned long max_pull, load_above_capacity = ~0UL;
+ unsigned long scaled_busy_load_per_task;
sds->busiest_load_per_task /= sds->busiest_nr_running;
+
+ /* Scale busiest_load_per_task so it is independent of the busiest group's cpu power */
+ scaled_busy_load_per_task = sds->busiest_load_per_task
+ << SCHED_POWER_SHIFT;
+ scaled_busy_load_per_task /= sds->busiest->sgp->power;
+
if (sds->group_imb) {
- sds->busiest_load_per_task =
- min(sds->busiest_load_per_task, sds->avg_load);
+ scaled_busy_load_per_task =
+ min(scaled_busy_load_per_task, sds->avg_load);
+ sds->busiest_load_per_task = scaled_busy_load_per_task
+ * sds->busiest->sgp->power;
+ sds->busiest_load_per_task >>= SCHED_POWER_SHIFT;
}
/*
@@ -4770,7 +4791,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
*/
if (sds->max_load < sds->avg_load) {
env->imbalance = 0;
- return fix_small_imbalance(env, sds);
+ return fix_small_imbalance(env, sds, scaled_busy_load_per_task);
}
if (!sds->group_imb) {
@@ -4809,7 +4830,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
* moved
*/
if (env->imbalance < sds->busiest_load_per_task)
- return fix_small_imbalance(env, sds);
+ return fix_small_imbalance(env, sds, scaled_busy_load_per_task);
}
--
1.7.10.4