This patchset applies on top of Peter's sched/esting branch minus the last 2 commit:
56eb46798b33 ("sched: Clean up nohz enter/exit")
v5:
- fix wrong test condition in the newly idle case
- fix some typos
v4:
- fixed some compilation issues raised by kbuild
- update comments
- reorder condition tests for calling _nohz_idle_balance in newly_idle case
in order to make it more readable
v3:
- add memory barrier
- add comments
v2:
- minor naming updates
Vincent Guittot (3):
sched: Stop nohz stats when decayed
sched: reduce the periodic update duration
sched: update blocked load when newly idle
kernel/sched/fair.c | 453 ++++++++++++++++++++++++++++++++++-----------------
kernel/sched/sched.h | 1 +
2 files changed, 305 insertions(+), 149 deletions(-)
--
2.7.4
When NEWLY_IDLE load balance is not triggered, we might need to update the
blocked load anyway. We can kick an ilb so an idle CPU will take care of
updating blocked load or we can try to update them locally before entering
idle. In the latter case, we reuse part of the nohz_idle_balance.
Signed-off-by: Vincent Guittot <[email protected]>
---
kernel/sched/fair.c | 324 +++++++++++++++++++++++++++++++---------------------
1 file changed, 193 insertions(+), 131 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 9183fee..4db5de76 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8832,120 +8832,6 @@ update_next_balance(struct sched_domain *sd, unsigned long *next_balance)
}
/*
- * idle_balance is called by schedule() if this_cpu is about to become
- * idle. Attempts to pull tasks from other CPUs.
- */
-static int idle_balance(struct rq *this_rq, struct rq_flags *rf)
-{
- unsigned long next_balance = jiffies + HZ;
- int this_cpu = this_rq->cpu;
- struct sched_domain *sd;
- int pulled_task = 0;
- u64 curr_cost = 0;
-
- /*
- * We must set idle_stamp _before_ calling idle_balance(), such that we
- * measure the duration of idle_balance() as idle time.
- */
- this_rq->idle_stamp = rq_clock(this_rq);
-
- /*
- * Do not pull tasks towards !active CPUs...
- */
- if (!cpu_active(this_cpu))
- return 0;
-
- /*
- * This is OK, because current is on_cpu, which avoids it being picked
- * for load-balance and preemption/IRQs are still disabled avoiding
- * further scheduler activity on it and we're being very careful to
- * re-start the picking loop.
- */
- rq_unpin_lock(this_rq, rf);
-
- if (this_rq->avg_idle < sysctl_sched_migration_cost ||
- !this_rq->rd->overload) {
- rcu_read_lock();
- sd = rcu_dereference_check_sched_domain(this_rq->sd);
- if (sd)
- update_next_balance(sd, &next_balance);
- rcu_read_unlock();
-
- goto out;
- }
-
- raw_spin_unlock(&this_rq->lock);
-
- update_blocked_averages(this_cpu);
- rcu_read_lock();
- for_each_domain(this_cpu, sd) {
- int continue_balancing = 1;
- u64 t0, domain_cost;
-
- if (!(sd->flags & SD_LOAD_BALANCE))
- continue;
-
- if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) {
- update_next_balance(sd, &next_balance);
- break;
- }
-
- if (sd->flags & SD_BALANCE_NEWIDLE) {
- t0 = sched_clock_cpu(this_cpu);
-
- pulled_task = load_balance(this_cpu, this_rq,
- sd, CPU_NEWLY_IDLE,
- &continue_balancing);
-
- domain_cost = sched_clock_cpu(this_cpu) - t0;
- if (domain_cost > sd->max_newidle_lb_cost)
- sd->max_newidle_lb_cost = domain_cost;
-
- curr_cost += domain_cost;
- }
-
- update_next_balance(sd, &next_balance);
-
- /*
- * Stop searching for tasks to pull if there are
- * now runnable tasks on this rq.
- */
- if (pulled_task || this_rq->nr_running > 0)
- break;
- }
- rcu_read_unlock();
-
- raw_spin_lock(&this_rq->lock);
-
- if (curr_cost > this_rq->max_idle_balance_cost)
- this_rq->max_idle_balance_cost = curr_cost;
-
- /*
- * While browsing the domains, we released the rq lock, a task could
- * have been enqueued in the meantime. Since we're not going idle,
- * pretend we pulled a task.
- */
- if (this_rq->cfs.h_nr_running && !pulled_task)
- pulled_task = 1;
-
-out:
- /* Move the next balance forward */
- if (time_after(this_rq->next_balance, next_balance))
- this_rq->next_balance = next_balance;
-
- /* Is there a task of a high priority class? */
- if (this_rq->nr_running != this_rq->cfs.h_nr_running)
- pulled_task = -1;
-
- if (pulled_task)
- this_rq->idle_stamp = 0;
-
- rq_repin_lock(this_rq, rf);
-
- return pulled_task;
-}
-
-/*
* active_load_balance_cpu_stop is run by cpu stopper. It pushes
* running tasks off the busiest CPU onto idle CPUs. It requires at
* least 1 task to be running on each physical CPU where possible, and
@@ -9413,10 +9299,14 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
#ifdef CONFIG_NO_HZ_COMMON
/*
- * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
- * rebalancing for all the cpus for whom scheduler ticks are stopped.
+ * Internal function that runs load balance for all idle cpus. The load balance
+ * can be a simple update of blocked load or a complete load balance with
+ * tasks movement depending of flags.
+ * The function returns false if the loop has stopped before running
+ * through all idle CPUs.
*/
-static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
+static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
+ enum cpu_idle_type idle)
{
/* Earliest time when we have to do rebalance again */
unsigned long now = jiffies;
@@ -9424,20 +9314,10 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
bool has_blocked_load = false;
int update_next_balance = 0;
int this_cpu = this_rq->cpu;
- unsigned int flags;
int balance_cpu;
+ int ret = false;
struct rq *rq;
- if (!(atomic_read(nohz_flags(this_cpu)) & NOHZ_KICK_MASK))
- return false;
-
- if (idle != CPU_IDLE) {
- atomic_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
- return false;
- }
-
- flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
-
SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK);
/*
@@ -9482,10 +9362,10 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
if (time_after_eq(jiffies, rq->next_balance)) {
struct rq_flags rf;
- rq_lock_irq(rq, &rf);
+ rq_lock_irqsave(rq, &rf);
update_rq_clock(rq);
cpu_load_update_idle(rq);
- rq_unlock_irq(rq, &rf);
+ rq_unlock_irqrestore(rq, &rf);
if (flags & NOHZ_BALANCE_KICK)
rebalance_domains(rq, CPU_IDLE);
@@ -9497,13 +9377,21 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
}
}
- update_blocked_averages(this_cpu);
+ /* Newly idle CPU doesn't need an update */
+ if (idle != CPU_NEWLY_IDLE) {
+ update_blocked_averages(this_cpu);
+ has_blocked_load |= this_rq->has_blocked_load;
+ }
+
if (flags & NOHZ_BALANCE_KICK)
rebalance_domains(this_rq, CPU_IDLE);
WRITE_ONCE(nohz.next_blocked,
now + msecs_to_jiffies(LOAD_AVG_PERIOD));
+ /* The full idle balance loop has been done */
+ ret = true;
+
abort:
/* There is still blocked load, enable periodic update */
if (has_blocked_load)
@@ -9517,6 +9405,35 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
if (likely(update_next_balance))
nohz.next_balance = next_balance;
+ return ret;
+}
+
+/*
+ * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
+ * rebalancing for all the cpus for whom scheduler ticks are stopped.
+ */
+static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
+{
+ int this_cpu = this_rq->cpu;
+ unsigned int flags;
+
+ if (!(atomic_read(nohz_flags(this_cpu)) & NOHZ_KICK_MASK))
+ return false;
+
+ if (idle != CPU_IDLE) {
+ atomic_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
+ return false;
+ }
+
+ /*
+ * barrier, pairs with nohz_balance_enter_idle(), ensures ...
+ */
+ flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
+ if (!(flags & NOHZ_KICK_MASK))
+ return false;
+
+ _nohz_idle_balance(this_rq, flags, idle);
+
return true;
}
#else
@@ -9527,6 +9444,151 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
#endif
/*
+ * idle_balance is called by schedule() if this_cpu is about to become
+ * idle. Attempts to pull tasks from other CPUs.
+ */
+static int idle_balance(struct rq *this_rq, struct rq_flags *rf)
+{
+ unsigned long next_balance = jiffies + HZ;
+ int this_cpu = this_rq->cpu;
+ struct sched_domain *sd;
+ int pulled_task = 0;
+ u64 curr_cost = 0;
+
+ /*
+ * We must set idle_stamp _before_ calling idle_balance(), such that we
+ * measure the duration of idle_balance() as idle time.
+ */
+ this_rq->idle_stamp = rq_clock(this_rq);
+
+ /*
+ * Do not pull tasks towards !active CPUs...
+ */
+ if (!cpu_active(this_cpu))
+ return 0;
+
+ /*
+ * This is OK, because current is on_cpu, which avoids it being picked
+ * for load-balance and preemption/IRQs are still disabled avoiding
+ * further scheduler activity on it and we're being very careful to
+ * re-start the picking loop.
+ */
+ rq_unpin_lock(this_rq, rf);
+
+ if (this_rq->avg_idle < sysctl_sched_migration_cost ||
+ !this_rq->rd->overload) {
+#ifdef CONFIG_NO_HZ_COMMON
+ unsigned long has_blocked = READ_ONCE(nohz.has_blocked);
+ unsigned long next_blocked = READ_ONCE(nohz.next_blocked);
+#endif
+ rcu_read_lock();
+ sd = rcu_dereference_check_sched_domain(this_rq->sd);
+ if (sd)
+ update_next_balance(sd, &next_balance);
+ rcu_read_unlock();
+
+#ifdef CONFIG_NO_HZ_COMMON
+ /*
+ * This CPU doesn't want to be disturbed by scheduler
+ * housekeeping
+ */
+ if (!housekeeping_cpu(this_cpu, HK_FLAG_SCHED))
+ goto out;
+
+ /* Will wake up very soon. No time for doing anything else*/
+ if (this_rq->avg_idle < sysctl_sched_migration_cost)
+ goto out;
+
+ /* Don't need to update blocked load of idle CPUs*/
+ if (!has_blocked || time_before(jiffies, next_blocked))
+ goto out;
+
+ raw_spin_unlock(&this_rq->lock);
+ /*
+ * This CPU is going to be idle and blocked load of idle CPUs
+ * need to be updated. Run the ilb locally as it is a good
+ * candidate for ilb instead of waking up another idle CPU.
+ * Kick an normal ilb if we failed to do the update.
+ */
+ if (!_nohz_idle_balance(this_rq, NOHZ_STATS_KICK, CPU_NEWLY_IDLE))
+ kick_ilb(NOHZ_STATS_KICK);
+ raw_spin_lock(&this_rq->lock);
+#endif
+ goto out;
+ }
+
+ raw_spin_unlock(&this_rq->lock);
+
+ update_blocked_averages(this_cpu);
+ rcu_read_lock();
+ for_each_domain(this_cpu, sd) {
+ int continue_balancing = 1;
+ u64 t0, domain_cost;
+
+ if (!(sd->flags & SD_LOAD_BALANCE))
+ continue;
+
+ if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) {
+ update_next_balance(sd, &next_balance);
+ break;
+ }
+
+ if (sd->flags & SD_BALANCE_NEWIDLE) {
+ t0 = sched_clock_cpu(this_cpu);
+
+ pulled_task = load_balance(this_cpu, this_rq,
+ sd, CPU_NEWLY_IDLE,
+ &continue_balancing);
+
+ domain_cost = sched_clock_cpu(this_cpu) - t0;
+ if (domain_cost > sd->max_newidle_lb_cost)
+ sd->max_newidle_lb_cost = domain_cost;
+
+ curr_cost += domain_cost;
+ }
+
+ update_next_balance(sd, &next_balance);
+
+ /*
+ * Stop searching for tasks to pull if there are
+ * now runnable tasks on this rq.
+ */
+ if (pulled_task || this_rq->nr_running > 0)
+ break;
+ }
+ rcu_read_unlock();
+
+ raw_spin_lock(&this_rq->lock);
+
+ if (curr_cost > this_rq->max_idle_balance_cost)
+ this_rq->max_idle_balance_cost = curr_cost;
+
+ /*
+ * While browsing the domains, we released the rq lock, a task could
+ * have been enqueued in the meantime. Since we're not going idle,
+ * pretend we pulled a task.
+ */
+ if (this_rq->cfs.h_nr_running && !pulled_task)
+ pulled_task = 1;
+
+out:
+ /* Move the next balance forward */
+ if (time_after(this_rq->next_balance, next_balance))
+ this_rq->next_balance = next_balance;
+
+ /* Is there a task of a high priority class? */
+ if (this_rq->nr_running != this_rq->cfs.h_nr_running)
+ pulled_task = -1;
+
+ if (pulled_task)
+ this_rq->idle_stamp = 0;
+
+ rq_repin_lock(this_rq, rf);
+
+ return pulled_task;
+}
+
+/*
* run_rebalance_domains is triggered when needed from the scheduler tick.
* Also triggered for nohz idle balancing (with nohz_balancing_kick set).
*/
--
2.7.4
Instead of using the cfs_rq_is_decayed() which monitors all *_avg
and *_sum, we create a cfs_rq_has_blocked() which only takes care of
util_avg and load_avg. We are only interested by these 2 values which are
decaying faster than the *_sum so we can stop the periodic update earlier.
Signed-off-by: Vincent Guittot <[email protected]>
---
kernel/sched/fair.c | 21 +++++++++++++++++----
1 file changed, 17 insertions(+), 4 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 5a6835e..9183fee 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7337,6 +7337,19 @@ static void attach_tasks(struct lb_env *env)
rq_unlock(env->dst_rq, &rf);
}
+static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq)
+{
+ if (cfs_rq->avg.load_avg)
+ return true;
+
+ if (cfs_rq->avg.util_avg)
+ return true;
+
+ return false;
+}
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+
static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
{
if (cfs_rq->load.weight)
@@ -7354,8 +7367,6 @@ static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
return true;
}
-#ifdef CONFIG_FAIR_GROUP_SCHED
-
static void update_blocked_averages(int cpu)
{
struct rq *rq = cpu_rq(cpu);
@@ -7391,7 +7402,9 @@ static void update_blocked_averages(int cpu)
*/
if (cfs_rq_is_decayed(cfs_rq))
list_del_leaf_cfs_rq(cfs_rq);
- else
+
+ /* Don't need periodic decay once load/util_avg are null */
+ if (cfs_rq_has_blocked(cfs_rq))
done = false;
}
@@ -7461,7 +7474,7 @@ static inline void update_blocked_averages(int cpu)
update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq);
#ifdef CONFIG_NO_HZ_COMMON
rq->last_blocked_load_update_tick = jiffies;
- if (cfs_rq_is_decayed(cfs_rq))
+ if (!cfs_rq_has_blocked(cfs_rq))
rq->has_blocked_load = 0;
#endif
rq_unlock_irqrestore(rq, &rf);
--
2.7.4
Stopped the periodic update of blocked load when all idle CPUs have fully
decayed. We introduce a new nohz.has_blocked that reflect if some idle
CPUs has blocked load that have to be periodiccally updated. nohz.has_blocked
is set everytime that a Idle CPU can have blocked load and it is then clear
when no more blocked load has been detected during an update. We don't need
atomic operation but only to make cure of the right ordering when updating
nohz.idle_cpus_mask and nohz.has_blocked.
Suggested-by: Peter Zijlstra (Intel) <[email protected]>
Signed-off-by: Vincent Guittot <[email protected]>
---
kernel/sched/fair.c | 122 ++++++++++++++++++++++++++++++++++++++++++---------
kernel/sched/sched.h | 1 +
2 files changed, 102 insertions(+), 21 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7af1fa9..5a6835e 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5383,8 +5383,9 @@ decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
static struct {
cpumask_var_t idle_cpus_mask;
atomic_t nr_cpus;
+ int has_blocked; /* Idle CPUS has blocked load */
unsigned long next_balance; /* in jiffy units */
- unsigned long next_stats;
+ unsigned long next_blocked; /* Next update of blocked load in jiffies */
} nohz ____cacheline_aligned;
#endif /* CONFIG_NO_HZ_COMMON */
@@ -6951,6 +6952,7 @@ enum fbq_type { regular, remote, all };
#define LBF_DST_PINNED 0x04
#define LBF_SOME_PINNED 0x08
#define LBF_NOHZ_STATS 0x10
+#define LBF_NOHZ_AGAIN 0x20
struct lb_env {
struct sched_domain *sd;
@@ -7335,8 +7337,6 @@ static void attach_tasks(struct lb_env *env)
rq_unlock(env->dst_rq, &rf);
}
-#ifdef CONFIG_FAIR_GROUP_SCHED
-
static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
{
if (cfs_rq->load.weight)
@@ -7354,11 +7354,14 @@ static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
return true;
}
+#ifdef CONFIG_FAIR_GROUP_SCHED
+
static void update_blocked_averages(int cpu)
{
struct rq *rq = cpu_rq(cpu);
struct cfs_rq *cfs_rq, *pos;
struct rq_flags rf;
+ bool done = true;
rq_lock_irqsave(rq, &rf);
update_rq_clock(rq);
@@ -7388,10 +7391,14 @@ static void update_blocked_averages(int cpu)
*/
if (cfs_rq_is_decayed(cfs_rq))
list_del_leaf_cfs_rq(cfs_rq);
+ else
+ done = false;
}
#ifdef CONFIG_NO_HZ_COMMON
rq->last_blocked_load_update_tick = jiffies;
+ if (done)
+ rq->has_blocked_load = 0;
#endif
rq_unlock_irqrestore(rq, &rf);
}
@@ -7454,6 +7461,8 @@ static inline void update_blocked_averages(int cpu)
update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq);
#ifdef CONFIG_NO_HZ_COMMON
rq->last_blocked_load_update_tick = jiffies;
+ if (cfs_rq_is_decayed(cfs_rq))
+ rq->has_blocked_load = 0;
#endif
rq_unlock_irqrestore(rq, &rf);
}
@@ -7789,18 +7798,25 @@ group_type group_classify(struct sched_group *group,
return group_other;
}
-static void update_nohz_stats(struct rq *rq)
+static bool update_nohz_stats(struct rq *rq)
{
#ifdef CONFIG_NO_HZ_COMMON
unsigned int cpu = rq->cpu;
+ if (!rq->has_blocked_load)
+ return false;
+
if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
- return;
+ return false;
if (!time_after(jiffies, rq->last_blocked_load_update_tick))
- return;
+ return true;
update_blocked_averages(cpu);
+
+ return rq->has_blocked_load;
+#else
+ return false;
#endif
}
@@ -7826,8 +7842,8 @@ static inline void update_sg_lb_stats(struct lb_env *env,
for_each_cpu_and(i, sched_group_span(group), env->cpus) {
struct rq *rq = cpu_rq(i);
- if (env->flags & LBF_NOHZ_STATS)
- update_nohz_stats(rq);
+ if ((env->flags & LBF_NOHZ_STATS) && update_nohz_stats(rq))
+ env->flags |= LBF_NOHZ_AGAIN;
/* Bias balancing toward cpus of our domain */
if (local_group)
@@ -7979,18 +7995,17 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
struct sg_lb_stats *local = &sds->local_stat;
struct sg_lb_stats tmp_sgs;
int load_idx, prefer_sibling = 0;
+#ifdef CONFIG_NO_HZ_COMMON
+ int has_blocked = READ_ONCE(nohz.has_blocked);
+#endif
bool overload = false;
if (child && child->flags & SD_PREFER_SIBLING)
prefer_sibling = 1;
#ifdef CONFIG_NO_HZ_COMMON
- if (env->idle == CPU_NEWLY_IDLE) {
+ if (env->idle == CPU_NEWLY_IDLE && has_blocked)
env->flags |= LBF_NOHZ_STATS;
-
- if (cpumask_subset(nohz.idle_cpus_mask, sched_domain_span(env->sd)))
- nohz.next_stats = jiffies + msecs_to_jiffies(LOAD_AVG_PERIOD);
- }
#endif
load_idx = get_sd_load_idx(env->sd, env->idle);
@@ -8046,6 +8061,15 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
sg = sg->next;
} while (sg != env->sd->groups);
+#ifdef CONFIG_NO_HZ_COMMON
+ if ((env->flags & LBF_NOHZ_AGAIN) &&
+ cpumask_subset(nohz.idle_cpus_mask, sched_domain_span(env->sd))) {
+
+ WRITE_ONCE(nohz.next_blocked,
+ jiffies + msecs_to_jiffies(LOAD_AVG_PERIOD));
+ }
+#endif
+
if (env->sd->flags & SD_NUMA)
env->fbq_type = fbq_classify_group(&sds->busiest_stat);
@@ -9069,6 +9093,8 @@ static void nohz_balancer_kick(struct rq *rq)
struct sched_domain *sd;
int nr_busy, i, cpu = rq->cpu;
unsigned int flags = 0;
+ unsigned long has_blocked = READ_ONCE(nohz.has_blocked);
+ unsigned long next_blocked = READ_ONCE(nohz.next_blocked);
if (unlikely(rq->idle_balance))
return;
@@ -9086,7 +9112,7 @@ static void nohz_balancer_kick(struct rq *rq)
if (likely(!atomic_read(&nohz.nr_cpus)))
return;
- if (time_after(now, nohz.next_stats))
+ if (time_after(now, next_blocked) && has_blocked)
flags = NOHZ_STATS_KICK;
if (time_before(now, nohz.next_balance))
@@ -9207,13 +9233,26 @@ void nohz_balance_enter_idle(int cpu)
if (!housekeeping_cpu(cpu, HK_FLAG_SCHED))
return;
+ /*
+ * Can be set safely without rq->lock held
+ * If a clear happens, it will have evaluated last additions because
+ * rq->lock is held during the check and the clear
+ */
+ rq->has_blocked_load = 1;
+
+ /*
+ * The tick is still stopped but load could have been added in the
+ * meantime. We set the nohz.has_blocked flag to trig a check of the
+ * *_avg. The CPU is already part of nohz.idle_cpus_mask so the clear
+ * of nohz.has_blocked can only happen after checking the new load
+ */
if (rq->nohz_tick_stopped)
- return;
+ goto out;
/*
* If we're a completely isolated CPU, we don't play.
*/
- if (on_null_domain(cpu_rq(cpu)))
+ if (on_null_domain(rq))
return;
rq->nohz_tick_stopped = 1;
@@ -9221,7 +9260,21 @@ void nohz_balance_enter_idle(int cpu)
cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
atomic_inc(&nohz.nr_cpus);
+ /*
+ * Ensures that if nohz_idle_balance() fails to observe our
+ * @idle_cpus_mask store, it must observe the @has_blocked
+ * store.
+ */
+ smp_mb__after_atomic();
+
set_cpu_sd_state_idle(cpu);
+
+out:
+ /*
+ * Each time a cpu enter idle, we assume that it has blocked load and
+ * enable the periodic update of the load of idle cpus
+ */
+ WRITE_ONCE(nohz.has_blocked, 1);
}
#else
static inline void nohz_balancer_kick(struct rq *rq) { }
@@ -9355,7 +9408,7 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
/* Earliest time when we have to do rebalance again */
unsigned long now = jiffies;
unsigned long next_balance = now + 60*HZ;
- unsigned long next_stats = now + msecs_to_jiffies(LOAD_AVG_PERIOD);
+ bool has_blocked_load = false;
int update_next_balance = 0;
int this_cpu = this_rq->cpu;
unsigned int flags;
@@ -9374,6 +9427,22 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK);
+ /*
+ * We assume there will be no idle load after this update and clear
+ * the has_blocked flag. If a cpu enters idle in the mean time, it will
+ * set the has_blocked flag and trig another update of idle load.
+ * Because a cpu that becomes idle, is added to idle_cpus_mask before
+ * setting the flag, we are sure to not clear the state and not
+ * check the load of an idle cpu.
+ */
+ WRITE_ONCE(nohz.has_blocked, 0);
+
+ /*
+ * Ensures that if we miss the CPU, we must see the has_blocked
+ * store from nohz_balance_enter_idle().
+ */
+ smp_mb();
+
for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
continue;
@@ -9383,11 +9452,16 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
* work being done for other cpus. Next load
* balancing owner will pick it up.
*/
- if (need_resched())
- break;
+ if (need_resched()) {
+ has_blocked_load = true;
+ goto abort;
+ }
rq = cpu_rq(balance_cpu);
+ update_blocked_averages(rq->cpu);
+ has_blocked_load |= rq->has_blocked_load;
+
/*
* If time for next balance is due,
* do the balance.
@@ -9400,7 +9474,6 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
cpu_load_update_idle(rq);
rq_unlock_irq(rq, &rf);
- update_blocked_averages(rq->cpu);
if (flags & NOHZ_BALANCE_KICK)
rebalance_domains(rq, CPU_IDLE);
}
@@ -9415,7 +9488,13 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
if (flags & NOHZ_BALANCE_KICK)
rebalance_domains(this_rq, CPU_IDLE);
- nohz.next_stats = next_stats;
+ WRITE_ONCE(nohz.next_blocked,
+ now + msecs_to_jiffies(LOAD_AVG_PERIOD));
+
+abort:
+ /* There is still blocked load, enable periodic update */
+ if (has_blocked_load)
+ WRITE_ONCE(nohz.has_blocked, 1);
/*
* next_balance will be updated only when there is a need.
@@ -10046,6 +10125,7 @@ __init void init_sched_fair_class(void)
#ifdef CONFIG_NO_HZ_COMMON
nohz.next_balance = jiffies;
+ nohz.next_blocked = jiffies;
zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
#endif
#endif /* SMP */
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index e200045..ad9b929 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -723,6 +723,7 @@ struct rq {
#ifdef CONFIG_SMP
unsigned long last_load_update_tick;
unsigned long last_blocked_load_update_tick;
+ unsigned int has_blocked_load;
#endif /* CONFIG_SMP */
unsigned int nohz_tick_stopped;
atomic_t nohz_flags;
--
2.7.4
On Thu, Feb 15, 2018 at 03:22:47PM +0000, Patrick Bellasi wrote:
> > > static struct {
> > > cpumask_var_t idle_cpus_mask;
> > > atomic_t nr_cpus;
> > > + int has_blocked; /* Idle CPUS has blocked load */
>
> Why not "bool"?
>
Because then he gets me yelling that sizeof(_Bool) is undefined and the
whole structure layout goes out the window :-)
Never use bool in a composite type.
On 15-Feb 17:50, Peter Zijlstra wrote:
> On Thu, Feb 15, 2018 at 03:22:47PM +0000, Patrick Bellasi wrote:
> > > > static struct {
> > > > cpumask_var_t idle_cpus_mask;
> > > > atomic_t nr_cpus;
> > > > + int has_blocked; /* Idle CPUS has blocked load */
> >
> > Why not "bool"?
> >
> Because then he gets me yelling that sizeof(_Bool) is undefined and the
> whole structure layout goes out the window :-)
Now I know...
> Never use bool in a composite type.
... got a note ;-)
--
#include <best/regards.h>
Patrick Bellasi
Hi Patrick,
On 15 February 2018 at 16:22, Patrick Bellasi <[email protected]> wrote:
> Hi Vincent,
> here are some possible small code improvement suggestions from my side.
>
> Cheers Patrick
>
> On 14-Feb 16:26, Vincent Guittot wrote:
>> > Stopped the periodic update of blocked load when all idle CPUs have fully
>> > decayed. We introduce a new nohz.has_blocked that reflect if some idle
>> > CPUs has blocked load that have to be periodiccally updated. nohz.has_blocked
>> > is set everytime that a Idle CPU can have blocked load and it is then clear
>> > when no more blocked load has been detected during an update. We don't need
>> > atomic operation but only to make cure of the right ordering when updating
>> > nohz.idle_cpus_mask and nohz.has_blocked.
>> >
>> > Suggested-by: Peter Zijlstra (Intel) <[email protected]>
>> > Signed-off-by: Vincent Guittot <[email protected]>
>> > ---
>> > kernel/sched/fair.c | 122 ++++++++++++++++++++++++++++++++++++++++++---------
>> > kernel/sched/sched.h | 1 +
>> > 2 files changed, 102 insertions(+), 21 deletions(-)
>> >
>> > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>> > index 7af1fa9..5a6835e 100644
>> > --- a/kernel/sched/fair.c
>> > +++ b/kernel/sched/fair.c
>> > @@ -5383,8 +5383,9 @@ decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
>> > static struct {
>> > cpumask_var_t idle_cpus_mask;
>> > atomic_t nr_cpus;
>> > + int has_blocked; /* Idle CPUS has blocked load */
>
> Why not "bool"?
>
> What about "has_blocked_load", which makes it more clear the meaning
> and is also more consistent with the RQ flags "aggregated" by this
> one... as well as some other local variables you already name like
> that?
>
>> unsigned long next_balance; /* in jiffy units */
>> - unsigned long next_stats;
>> + unsigned long next_blocked; /* Next update of blocked load in jiffies */
>> } nohz ____cacheline_aligned;
>>
>> #endif /* CONFIG_NO_HZ_COMMON */
>> @@ -6951,6 +6952,7 @@ enum fbq_type { regular, remote, all };
>> #define LBF_DST_PINNED 0x04
>> #define LBF_SOME_PINNED 0x08
>> #define LBF_NOHZ_STATS 0x10
>> +#define LBF_NOHZ_AGAIN 0x20
>>
>> struct lb_env {
>> struct sched_domain *sd;
>> @@ -7335,8 +7337,6 @@ static void attach_tasks(struct lb_env *env)
>> rq_unlock(env->dst_rq, &rf);
>> }
>>
>> -#ifdef CONFIG_FAIR_GROUP_SCHED
>> -
>> static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
>> {
>> if (cfs_rq->load.weight)
>> @@ -7354,11 +7354,14 @@ static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
>> return true;
>> }
>>
>> +#ifdef CONFIG_FAIR_GROUP_SCHED
>> +
>> static void update_blocked_averages(int cpu)
>> {
>> struct rq *rq = cpu_rq(cpu);
>> struct cfs_rq *cfs_rq, *pos;
>> struct rq_flags rf;
>> + bool done = true;
>>
>> rq_lock_irqsave(rq, &rf);
>> update_rq_clock(rq);
>> @@ -7388,10 +7391,14 @@ static void update_blocked_averages(int cpu)
>> */
>> if (cfs_rq_is_decayed(cfs_rq))
>> list_del_leaf_cfs_rq(cfs_rq);
>> + else
>> + done = false;
>> }
>>
>> #ifdef CONFIG_NO_HZ_COMMON
>> rq->last_blocked_load_update_tick = jiffies;
>> + if (done)
>> + rq->has_blocked_load = 0;
>> #endif
>> rq_unlock_irqrestore(rq, &rf);
>> }
>> @@ -7454,6 +7461,8 @@ static inline void update_blocked_averages(int cpu)
>> update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq);
>> #ifdef CONFIG_NO_HZ_COMMON
>> rq->last_blocked_load_update_tick = jiffies;
>> + if (cfs_rq_is_decayed(cfs_rq))
>> + rq->has_blocked_load = 0;
>> #endif
>> rq_unlock_irqrestore(rq, &rf);
>> }
>> @@ -7789,18 +7798,25 @@ group_type group_classify(struct sched_group *group,
>> return group_other;
>> }
>>
>> -static void update_nohz_stats(struct rq *rq)
>> +static bool update_nohz_stats(struct rq *rq)
>> {
>> #ifdef CONFIG_NO_HZ_COMMON
>> unsigned int cpu = rq->cpu;
>>
>> + if (!rq->has_blocked_load)
>> + return false;
>> +
>> if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
>> - return;
>> + return false;
>>
>> if (!time_after(jiffies, rq->last_blocked_load_update_tick))
>> - return;
>> + return true;
>>
>> update_blocked_averages(cpu);
>> +
>> + return rq->has_blocked_load;
>> +#else
>> + return false;
>> #endif
>> }
>>
>> @@ -7826,8 +7842,8 @@ static inline void update_sg_lb_stats(struct lb_env *env,
>> for_each_cpu_and(i, sched_group_span(group), env->cpus) {
>> struct rq *rq = cpu_rq(i);
>>
>> - if (env->flags & LBF_NOHZ_STATS)
>> - update_nohz_stats(rq);
>> + if ((env->flags & LBF_NOHZ_STATS) && update_nohz_stats(rq))
>> + env->flags |= LBF_NOHZ_AGAIN;
>>
>> /* Bias balancing toward cpus of our domain */
>> if (local_group)
>> @@ -7979,18 +7995,17 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
>> struct sg_lb_stats *local = &sds->local_stat;
>> struct sg_lb_stats tmp_sgs;
>> int load_idx, prefer_sibling = 0;
>> +#ifdef CONFIG_NO_HZ_COMMON
>> + int has_blocked = READ_ONCE(nohz.has_blocked);
>> +#endif
>> bool overload = false;
>>
>> if (child && child->flags & SD_PREFER_SIBLING)
>> prefer_sibling = 1;
>>
>> #ifdef CONFIG_NO_HZ_COMMON
>> - if (env->idle == CPU_NEWLY_IDLE) {
>> + if (env->idle == CPU_NEWLY_IDLE && has_blocked)
>
> Why using a local "has_blocked" here?
>
> Is not better to inline directly the READ_ONCE, we avoid the above
> idfed and also (maybe?) save the load if the previous checks should fail?
I agree that saving ifdef could be interesting although it make the
condition less readable but that's just an opinion
About saving the load, I expect the compiler to optimize that
correctly and i'm not sure that inlining the READ_ONCE will prevent
any reordering and will really save the load
>
>> env->flags |= LBF_NOHZ_STATS;
>> -
>> - if (cpumask_subset(nohz.idle_cpus_mask, sched_domain_span(env->sd)))
>> - nohz.next_stats = jiffies + msecs_to_jiffies(LOAD_AVG_PERIOD);
>> - }
>> #endif
>>
>> load_idx = get_sd_load_idx(env->sd, env->idle);
>> @@ -8046,6 +8061,15 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
>> sg = sg->next;
>> } while (sg != env->sd->groups);
>>
>> +#ifdef CONFIG_NO_HZ_COMMON
>> + if ((env->flags & LBF_NOHZ_AGAIN) &&
>> + cpumask_subset(nohz.idle_cpus_mask, sched_domain_span(env->sd))) {
>> +
>> + WRITE_ONCE(nohz.next_blocked,
>> + jiffies + msecs_to_jiffies(LOAD_AVG_PERIOD));
>> + }
>> +#endif
>> +
>
> Moreover, the above two ifdef maybe can be better moved into compile
> time inline functions to keep update_sg_lb_stats more streamlined,
> e.g. something like:
>
> ---8<---
> #ifdef CONFIG_NO_HZ_COMMON
> static inline void nohz_update_next_stats(struct lb_env *env)
> {
> if (env->idle != CPU_NEWLY_IDLE || READ_ONCE(nohz.has_blocked))
> return;
> eenv->flags |= LBF_NOHZ_STATS;
> etc...
> }
IMHO, Having inline function for 2 lines of code doesn't make it more
readable as we then have to looks for the content of the function to
understand what it does
>
> static inline void nohz_update_next_blocked(struct lb_env *env)
> {
> if (!(env->flags & LBF_NOHZ_AGAIN)
> return
> if (!cpumask_subset(nohz.idle_cpus_mask,
> sched_domain_span(env->sd)))
> return
> WRITE_ONCE();
ditto
> }
> #else
> #define nohz_update_next_stats(env) do {} while(false);
> #define nohz_update_blocked(env) do {} while(false);
> #endif
> ---8<---
>
>> if (env->sd->flags & SD_NUMA)
>> env->fbq_type = fbq_classify_group(&sds->busiest_stat);
>>
>> @@ -9069,6 +9093,8 @@ static void nohz_balancer_kick(struct rq *rq)
>> struct sched_domain *sd;
>> int nr_busy, i, cpu = rq->cpu;
>> unsigned int flags = 0;
>> + unsigned long has_blocked = READ_ONCE(nohz.has_blocked);
>> + unsigned long next_blocked = READ_ONCE(nohz.next_blocked);
>>
>> if (unlikely(rq->idle_balance))
>> return;
>> @@ -9086,7 +9112,7 @@ static void nohz_balancer_kick(struct rq *rq)
>> if (likely(!atomic_read(&nohz.nr_cpus)))
>> return;
>>
>> - if (time_after(now, nohz.next_stats))
>> + if (time_after(now, next_blocked) && has_blocked)
>
> Why not using READ_ONCE right here, these are the only two usages in
> this function, isn't it?
It' s mainly a matter of keeping the condition on 1 line to make it
more readable.
> This would allow also to skip (some of) the loads if some of the
> checks should fails. Ins't it?
As mentioned previously I expect the compiler to re order all data
access to the most optimized way
>
>> flags = NOHZ_STATS_KICK;
>>
>> if (time_before(now, nohz.next_balance))
>> @@ -9207,13 +9233,26 @@ void nohz_balance_enter_idle(int cpu)
>> if (!housekeeping_cpu(cpu, HK_FLAG_SCHED))
>> return;
>>
>> + /*
>> + * Can be set safely without rq->lock held
>> + * If a clear happens, it will have evaluated last additions because
>> + * rq->lock is held during the check and the clear
>> + */
>> + rq->has_blocked_load = 1;
>> +
>> + /*
>> + * The tick is still stopped but load could have been added in the
>> + * meantime. We set the nohz.has_blocked flag to trig a check of the
>> + * *_avg. The CPU is already part of nohz.idle_cpus_mask so the clear
>> + * of nohz.has_blocked can only happen after checking the new load
>> + */
>> if (rq->nohz_tick_stopped)
>> - return;
>> + goto out;
>>
>> /*
>> * If we're a completely isolated CPU, we don't play.
>> */
>> - if (on_null_domain(cpu_rq(cpu)))
>> + if (on_null_domain(rq))
>> return;
>>
>> rq->nohz_tick_stopped = 1;
>> @@ -9221,7 +9260,21 @@ void nohz_balance_enter_idle(int cpu)
>> cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
>> atomic_inc(&nohz.nr_cpus);
>>
>> + /*
>> + * Ensures that if nohz_idle_balance() fails to observe our
>> + * @idle_cpus_mask store, it must observe the @has_blocked
>> + * store.
>> + */
>> + smp_mb__after_atomic();
>> +
>> set_cpu_sd_state_idle(cpu);
>> +
>> +out:
>> + /*
>> + * Each time a cpu enter idle, we assume that it has blocked load and
>> + * enable the periodic update of the load of idle cpus
>> + */
>> + WRITE_ONCE(nohz.has_blocked, 1);
>> }
>> #else
>> static inline void nohz_balancer_kick(struct rq *rq) { }
>> @@ -9355,7 +9408,7 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
>> /* Earliest time when we have to do rebalance again */
>> unsigned long now = jiffies;
>> unsigned long next_balance = now + 60*HZ;
>> - unsigned long next_stats = now + msecs_to_jiffies(LOAD_AVG_PERIOD);
>> + bool has_blocked_load = false;
>> int update_next_balance = 0;
>> int this_cpu = this_rq->cpu;
>> unsigned int flags;
>> @@ -9374,6 +9427,22 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
>>
>> SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK);
>>
>> + /*
>> + * We assume there will be no idle load after this update and clear
>> + * the has_blocked flag. If a cpu enters idle in the mean time, it will
>> + * set the has_blocked flag and trig another update of idle load.
>> + * Because a cpu that becomes idle, is added to idle_cpus_mask before
>> + * setting the flag, we are sure to not clear the state and not
>> + * check the load of an idle cpu.
>> + */
>> + WRITE_ONCE(nohz.has_blocked, 0);
>> +
>> + /*
>> + * Ensures that if we miss the CPU, we must see the has_blocked
>> + * store from nohz_balance_enter_idle().
>> + */
>> + smp_mb();
>> +
>> for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
>> if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
>> continue;
>> @@ -9383,11 +9452,16 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
>> * work being done for other cpus. Next load
>> * balancing owner will pick it up.
>> */
>> - if (need_resched())
>> - break;
>> + if (need_resched()) {
>> + has_blocked_load = true;
>> + goto abort;
>> + }
>>
>> rq = cpu_rq(balance_cpu);
>>
>> + update_blocked_averages(rq->cpu);
>> + has_blocked_load |= rq->has_blocked_load;
>> +
>> /*
>> * If time for next balance is due,
>> * do the balance.
>> @@ -9400,7 +9474,6 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
>> cpu_load_update_idle(rq);
>> rq_unlock_irq(rq, &rf);
>>
>> - update_blocked_averages(rq->cpu);
>> if (flags & NOHZ_BALANCE_KICK)
>> rebalance_domains(rq, CPU_IDLE);
>> }
>> @@ -9415,7 +9488,13 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
>> if (flags & NOHZ_BALANCE_KICK)
>> rebalance_domains(this_rq, CPU_IDLE);
>>
>> - nohz.next_stats = next_stats;
>> + WRITE_ONCE(nohz.next_blocked,
>> + now + msecs_to_jiffies(LOAD_AVG_PERIOD));
>
> Is it not slightly more clear to still use a local? I.e.
>
> next_stats = now + msecs_to_jiffies(LOAD_AVG_PERIOD);
next_stats doesn't exist anymore with this patch
> WRITE_ONCE(nohz.next_blocked, next_stats);
>
>> +
>> +abort:
>> + /* There is still blocked load, enable periodic update */
>> + if (has_blocked_load)
>> + WRITE_ONCE(nohz.has_blocked, 1);
>>
>> /*
>> * next_balance will be updated only when there is a need.
>> @@ -10046,6 +10125,7 @@ __init void init_sched_fair_class(void)
>>
>> #ifdef CONFIG_NO_HZ_COMMON
>> nohz.next_balance = jiffies;
>> + nohz.next_blocked = jiffies;
>> zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
>> #endif
>> #endif /* SMP */
>> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
>> index e200045..ad9b929 100644
>> --- a/kernel/sched/sched.h
>> +++ b/kernel/sched/sched.h
>> @@ -723,6 +723,7 @@ struct rq {
>> #ifdef CONFIG_SMP
>> unsigned long last_load_update_tick;
>> unsigned long last_blocked_load_update_tick;
>> + unsigned int has_blocked_load;
>> #endif /* CONFIG_SMP */
>> unsigned int nohz_tick_stopped;
>> atomic_t nohz_flags;
>> > --
>> > 2.7.4
>> >
>
> --
> #include <best/regards.h>
>
> Patrick Bellasi
Hi Vincent,
here are some possible small code improvement suggestions from my side.
Cheers Patrick
On 14-Feb 16:26, Vincent Guittot wrote:
> > Stopped the periodic update of blocked load when all idle CPUs have fully
> > decayed. We introduce a new nohz.has_blocked that reflect if some idle
> > CPUs has blocked load that have to be periodiccally updated. nohz.has_blocked
> > is set everytime that a Idle CPU can have blocked load and it is then clear
> > when no more blocked load has been detected during an update. We don't need
> > atomic operation but only to make cure of the right ordering when updating
> > nohz.idle_cpus_mask and nohz.has_blocked.
> >
> > Suggested-by: Peter Zijlstra (Intel) <[email protected]>
> > Signed-off-by: Vincent Guittot <[email protected]>
> > ---
> > kernel/sched/fair.c | 122 ++++++++++++++++++++++++++++++++++++++++++---------
> > kernel/sched/sched.h | 1 +
> > 2 files changed, 102 insertions(+), 21 deletions(-)
> >
> > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > index 7af1fa9..5a6835e 100644
> > --- a/kernel/sched/fair.c
> > +++ b/kernel/sched/fair.c
> > @@ -5383,8 +5383,9 @@ decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
> > static struct {
> > cpumask_var_t idle_cpus_mask;
> > atomic_t nr_cpus;
> > + int has_blocked; /* Idle CPUS has blocked load */
Why not "bool"?
What about "has_blocked_load", which makes it more clear the meaning
and is also more consistent with the RQ flags "aggregated" by this
one... as well as some other local variables you already name like
that?
> unsigned long next_balance; /* in jiffy units */
> - unsigned long next_stats;
> + unsigned long next_blocked; /* Next update of blocked load in jiffies */
> } nohz ____cacheline_aligned;
>
> #endif /* CONFIG_NO_HZ_COMMON */
> @@ -6951,6 +6952,7 @@ enum fbq_type { regular, remote, all };
> #define LBF_DST_PINNED 0x04
> #define LBF_SOME_PINNED 0x08
> #define LBF_NOHZ_STATS 0x10
> +#define LBF_NOHZ_AGAIN 0x20
>
> struct lb_env {
> struct sched_domain *sd;
> @@ -7335,8 +7337,6 @@ static void attach_tasks(struct lb_env *env)
> rq_unlock(env->dst_rq, &rf);
> }
>
> -#ifdef CONFIG_FAIR_GROUP_SCHED
> -
> static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
> {
> if (cfs_rq->load.weight)
> @@ -7354,11 +7354,14 @@ static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
> return true;
> }
>
> +#ifdef CONFIG_FAIR_GROUP_SCHED
> +
> static void update_blocked_averages(int cpu)
> {
> struct rq *rq = cpu_rq(cpu);
> struct cfs_rq *cfs_rq, *pos;
> struct rq_flags rf;
> + bool done = true;
>
> rq_lock_irqsave(rq, &rf);
> update_rq_clock(rq);
> @@ -7388,10 +7391,14 @@ static void update_blocked_averages(int cpu)
> */
> if (cfs_rq_is_decayed(cfs_rq))
> list_del_leaf_cfs_rq(cfs_rq);
> + else
> + done = false;
> }
>
> #ifdef CONFIG_NO_HZ_COMMON
> rq->last_blocked_load_update_tick = jiffies;
> + if (done)
> + rq->has_blocked_load = 0;
> #endif
> rq_unlock_irqrestore(rq, &rf);
> }
> @@ -7454,6 +7461,8 @@ static inline void update_blocked_averages(int cpu)
> update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq);
> #ifdef CONFIG_NO_HZ_COMMON
> rq->last_blocked_load_update_tick = jiffies;
> + if (cfs_rq_is_decayed(cfs_rq))
> + rq->has_blocked_load = 0;
> #endif
> rq_unlock_irqrestore(rq, &rf);
> }
> @@ -7789,18 +7798,25 @@ group_type group_classify(struct sched_group *group,
> return group_other;
> }
>
> -static void update_nohz_stats(struct rq *rq)
> +static bool update_nohz_stats(struct rq *rq)
> {
> #ifdef CONFIG_NO_HZ_COMMON
> unsigned int cpu = rq->cpu;
>
> + if (!rq->has_blocked_load)
> + return false;
> +
> if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
> - return;
> + return false;
>
> if (!time_after(jiffies, rq->last_blocked_load_update_tick))
> - return;
> + return true;
>
> update_blocked_averages(cpu);
> +
> + return rq->has_blocked_load;
> +#else
> + return false;
> #endif
> }
>
> @@ -7826,8 +7842,8 @@ static inline void update_sg_lb_stats(struct lb_env *env,
> for_each_cpu_and(i, sched_group_span(group), env->cpus) {
> struct rq *rq = cpu_rq(i);
>
> - if (env->flags & LBF_NOHZ_STATS)
> - update_nohz_stats(rq);
> + if ((env->flags & LBF_NOHZ_STATS) && update_nohz_stats(rq))
> + env->flags |= LBF_NOHZ_AGAIN;
>
> /* Bias balancing toward cpus of our domain */
> if (local_group)
> @@ -7979,18 +7995,17 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
> struct sg_lb_stats *local = &sds->local_stat;
> struct sg_lb_stats tmp_sgs;
> int load_idx, prefer_sibling = 0;
> +#ifdef CONFIG_NO_HZ_COMMON
> + int has_blocked = READ_ONCE(nohz.has_blocked);
> +#endif
> bool overload = false;
>
> if (child && child->flags & SD_PREFER_SIBLING)
> prefer_sibling = 1;
>
> #ifdef CONFIG_NO_HZ_COMMON
> - if (env->idle == CPU_NEWLY_IDLE) {
> + if (env->idle == CPU_NEWLY_IDLE && has_blocked)
Why using a local "has_blocked" here?
Is not better to inline directly the READ_ONCE, we avoid the above
idfed and also (maybe?) save the load if the previous checks should fail?
> env->flags |= LBF_NOHZ_STATS;
> -
> - if (cpumask_subset(nohz.idle_cpus_mask, sched_domain_span(env->sd)))
> - nohz.next_stats = jiffies + msecs_to_jiffies(LOAD_AVG_PERIOD);
> - }
> #endif
>
> load_idx = get_sd_load_idx(env->sd, env->idle);
> @@ -8046,6 +8061,15 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
> sg = sg->next;
> } while (sg != env->sd->groups);
>
> +#ifdef CONFIG_NO_HZ_COMMON
> + if ((env->flags & LBF_NOHZ_AGAIN) &&
> + cpumask_subset(nohz.idle_cpus_mask, sched_domain_span(env->sd))) {
> +
> + WRITE_ONCE(nohz.next_blocked,
> + jiffies + msecs_to_jiffies(LOAD_AVG_PERIOD));
> + }
> +#endif
> +
Moreover, the above two ifdef maybe can be better moved into compile
time inline functions to keep update_sg_lb_stats more streamlined,
e.g. something like:
---8<---
#ifdef CONFIG_NO_HZ_COMMON
static inline void nohz_update_next_stats(struct lb_env *env)
{
if (env->idle != CPU_NEWLY_IDLE || READ_ONCE(nohz.has_blocked))
return;
eenv->flags |= LBF_NOHZ_STATS;
etc...
}
static inline void nohz_update_next_blocked(struct lb_env *env)
{
if (!(env->flags & LBF_NOHZ_AGAIN)
return
if (!cpumask_subset(nohz.idle_cpus_mask,
sched_domain_span(env->sd)))
return
WRITE_ONCE();
}
#else
#define nohz_update_next_stats(env) do {} while(false);
#define nohz_update_blocked(env) do {} while(false);
#endif
---8<---
> if (env->sd->flags & SD_NUMA)
> env->fbq_type = fbq_classify_group(&sds->busiest_stat);
>
> @@ -9069,6 +9093,8 @@ static void nohz_balancer_kick(struct rq *rq)
> struct sched_domain *sd;
> int nr_busy, i, cpu = rq->cpu;
> unsigned int flags = 0;
> + unsigned long has_blocked = READ_ONCE(nohz.has_blocked);
> + unsigned long next_blocked = READ_ONCE(nohz.next_blocked);
>
> if (unlikely(rq->idle_balance))
> return;
> @@ -9086,7 +9112,7 @@ static void nohz_balancer_kick(struct rq *rq)
> if (likely(!atomic_read(&nohz.nr_cpus)))
> return;
>
> - if (time_after(now, nohz.next_stats))
> + if (time_after(now, next_blocked) && has_blocked)
Why not using READ_ONCE right here, these are the only two usages in
this function, isn't it?
This would allow also to skip (some of) the loads if some of the
checks should fails. Ins't it?
> flags = NOHZ_STATS_KICK;
>
> if (time_before(now, nohz.next_balance))
> @@ -9207,13 +9233,26 @@ void nohz_balance_enter_idle(int cpu)
> if (!housekeeping_cpu(cpu, HK_FLAG_SCHED))
> return;
>
> + /*
> + * Can be set safely without rq->lock held
> + * If a clear happens, it will have evaluated last additions because
> + * rq->lock is held during the check and the clear
> + */
> + rq->has_blocked_load = 1;
> +
> + /*
> + * The tick is still stopped but load could have been added in the
> + * meantime. We set the nohz.has_blocked flag to trig a check of the
> + * *_avg. The CPU is already part of nohz.idle_cpus_mask so the clear
> + * of nohz.has_blocked can only happen after checking the new load
> + */
> if (rq->nohz_tick_stopped)
> - return;
> + goto out;
>
> /*
> * If we're a completely isolated CPU, we don't play.
> */
> - if (on_null_domain(cpu_rq(cpu)))
> + if (on_null_domain(rq))
> return;
>
> rq->nohz_tick_stopped = 1;
> @@ -9221,7 +9260,21 @@ void nohz_balance_enter_idle(int cpu)
> cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
> atomic_inc(&nohz.nr_cpus);
>
> + /*
> + * Ensures that if nohz_idle_balance() fails to observe our
> + * @idle_cpus_mask store, it must observe the @has_blocked
> + * store.
> + */
> + smp_mb__after_atomic();
> +
> set_cpu_sd_state_idle(cpu);
> +
> +out:
> + /*
> + * Each time a cpu enter idle, we assume that it has blocked load and
> + * enable the periodic update of the load of idle cpus
> + */
> + WRITE_ONCE(nohz.has_blocked, 1);
> }
> #else
> static inline void nohz_balancer_kick(struct rq *rq) { }
> @@ -9355,7 +9408,7 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
> /* Earliest time when we have to do rebalance again */
> unsigned long now = jiffies;
> unsigned long next_balance = now + 60*HZ;
> - unsigned long next_stats = now + msecs_to_jiffies(LOAD_AVG_PERIOD);
> + bool has_blocked_load = false;
> int update_next_balance = 0;
> int this_cpu = this_rq->cpu;
> unsigned int flags;
> @@ -9374,6 +9427,22 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
>
> SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK);
>
> + /*
> + * We assume there will be no idle load after this update and clear
> + * the has_blocked flag. If a cpu enters idle in the mean time, it will
> + * set the has_blocked flag and trig another update of idle load.
> + * Because a cpu that becomes idle, is added to idle_cpus_mask before
> + * setting the flag, we are sure to not clear the state and not
> + * check the load of an idle cpu.
> + */
> + WRITE_ONCE(nohz.has_blocked, 0);
> +
> + /*
> + * Ensures that if we miss the CPU, we must see the has_blocked
> + * store from nohz_balance_enter_idle().
> + */
> + smp_mb();
> +
> for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
> if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
> continue;
> @@ -9383,11 +9452,16 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
> * work being done for other cpus. Next load
> * balancing owner will pick it up.
> */
> - if (need_resched())
> - break;
> + if (need_resched()) {
> + has_blocked_load = true;
> + goto abort;
> + }
>
> rq = cpu_rq(balance_cpu);
>
> + update_blocked_averages(rq->cpu);
> + has_blocked_load |= rq->has_blocked_load;
> +
> /*
> * If time for next balance is due,
> * do the balance.
> @@ -9400,7 +9474,6 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
> cpu_load_update_idle(rq);
> rq_unlock_irq(rq, &rf);
>
> - update_blocked_averages(rq->cpu);
> if (flags & NOHZ_BALANCE_KICK)
> rebalance_domains(rq, CPU_IDLE);
> }
> @@ -9415,7 +9488,13 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
> if (flags & NOHZ_BALANCE_KICK)
> rebalance_domains(this_rq, CPU_IDLE);
>
> - nohz.next_stats = next_stats;
> + WRITE_ONCE(nohz.next_blocked,
> + now + msecs_to_jiffies(LOAD_AVG_PERIOD));
Is it not slightly more clear to still use a local? I.e.
next_stats = now + msecs_to_jiffies(LOAD_AVG_PERIOD);
WRITE_ONCE(nohz.next_blocked, next_stats);
> +
> +abort:
> + /* There is still blocked load, enable periodic update */
> + if (has_blocked_load)
> + WRITE_ONCE(nohz.has_blocked, 1);
>
> /*
> * next_balance will be updated only when there is a need.
> @@ -10046,6 +10125,7 @@ __init void init_sched_fair_class(void)
>
> #ifdef CONFIG_NO_HZ_COMMON
> nohz.next_balance = jiffies;
> + nohz.next_blocked = jiffies;
> zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
> #endif
> #endif /* SMP */
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index e200045..ad9b929 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -723,6 +723,7 @@ struct rq {
> #ifdef CONFIG_SMP
> unsigned long last_load_update_tick;
> unsigned long last_blocked_load_update_tick;
> + unsigned int has_blocked_load;
> #endif /* CONFIG_SMP */
> unsigned int nohz_tick_stopped;
> atomic_t nohz_flags;
> > --
> > 2.7.4
> >
--
#include <best/regards.h>
Patrick Bellasi
On 02/14/2018 03:26 PM, Vincent Guittot wrote:
> Stopped the periodic update of blocked load when all idle CPUs have fully
> decayed. We introduce a new nohz.has_blocked that reflect if some idle
> CPUs has blocked load that have to be periodiccally updated. nohz.has_blocked
> is set everytime that a Idle CPU can have blocked load and it is then clear
> when no more blocked load has been detected during an update. We don't need
> atomic operation but only to make cure of the right ordering when updating
> nohz.idle_cpus_mask and nohz.has_blocked.
>
> Suggested-by: Peter Zijlstra (Intel) <[email protected]>
> Signed-off-by: Vincent Guittot <[email protected]>
> ---
> kernel/sched/fair.c | 122 ++++++++++++++++++++++++++++++++++++++++++---------
> kernel/sched/sched.h | 1 +
> 2 files changed, 102 insertions(+), 21 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 7af1fa9..5a6835e 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
>
> [...]
>> @@ -9374,6 +9427,22 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
>
> SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK);
>
> + /*
> + * We assume there will be no idle load after this update and clear
> + * the has_blocked flag. If a cpu enters idle in the mean time, it will
> + * set the has_blocked flag and trig another update of idle load.
> + * Because a cpu that becomes idle, is added to idle_cpus_mask before
> + * setting the flag, we are sure to not clear the state and not
> + * check the load of an idle cpu.
> + */
> + WRITE_ONCE(nohz.has_blocked, 0);
> +
> + /*
> + * Ensures that if we miss the CPU, we must see the has_blocked
> + * store from nohz_balance_enter_idle().
> + */
> + smp_mb();
> +
> for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
> if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
> continue;
> @@ -9383,11 +9452,16 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
> * work being done for other cpus. Next load
> * balancing owner will pick it up.
> */
> - if (need_resched())
> - break;
> + if (need_resched()) {
> + has_blocked_load = true;
> + goto abort;
> + }
>
> rq = cpu_rq(balance_cpu);
>
I'd say it's safe to do the following here. The flag is raised in
nohz_balance_enter_idle() before the smp_mb(), so we won't skip a CPU
that just got added to nohz.idle_cpus_mask.
/*
* This cpu doesn't have any remaining blocked load, skip it.
* It's sane to do this because this flag is raised in
* nohz_balance_enter_idle()
*/
if ((flags & NOHZ_KICK_MASK) == NOHZ_STATS_KICK &&
!rq->has_blocked_load)
continue;
> + update_blocked_averages(rq->cpu);
> + has_blocked_load |= rq->has_blocked_load;
> +
> /*
> * If time for next balance is due,
> * do the balance.
> @@ -9400,7 +9474,6 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
> cpu_load_update_idle(rq);
> rq_unlock_irq(rq, &rf);
>
> - update_blocked_averages(rq->cpu);
> if (flags & NOHZ_BALANCE_KICK)
> rebalance_domains(rq, CPU_IDLE);
> }
> @@ -9415,7 +9488,13 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
> if (flags & NOHZ_BALANCE_KICK)
> rebalance_domains(this_rq, CPU_IDLE);
>
> - nohz.next_stats = next_stats;
> + WRITE_ONCE(nohz.next_blocked,
> + now + msecs_to_jiffies(LOAD_AVG_PERIOD));
> +
> +abort:
> + /* There is still blocked load, enable periodic update */
> + if (has_blocked_load)
> + WRITE_ONCE(nohz.has_blocked, 1);
>
> /*
> * next_balance will be updated only when there is a need.
On 02/14/2018 03:26 PM, Vincent Guittot wrote:
> Stopped the periodic update of blocked load when all idle CPUs have fully
> decayed. We introduce a new nohz.has_blocked that reflect if some idle
> CPUs has blocked load that have to be periodiccally updated. nohz.has_blocked
> is set everytime that a Idle CPU can have blocked load and it is then clear
> when no more blocked load has been detected during an update. We don't need
> atomic operation but only to make cure of the right ordering when updating
> nohz.idle_cpus_mask and nohz.has_blocked.
>
> Suggested-by: Peter Zijlstra (Intel) <[email protected]>
> Signed-off-by: Vincent Guittot <[email protected]>
> ---
> kernel/sched/fair.c | 122 ++++++++++++++++++++++++++++++++++++++++++---------
> kernel/sched/sched.h | 1 +
> 2 files changed, 102 insertions(+), 21 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 7af1fa9..5a6835e 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
>
> [...]
>
> -static void update_nohz_stats(struct rq *rq)
> +static bool update_nohz_stats(struct rq *rq)
> {
> #ifdef CONFIG_NO_HZ_COMMON
> unsigned int cpu = rq->cpu;
>
> + if (!rq->has_blocked_load)
> + return false;
> +
> if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
> - return;
> + return false;
>
> if (!time_after(jiffies, rq->last_blocked_load_update_tick))
> - return;
> + return true;
>
> update_blocked_averages(cpu);
> +
> + return rq->has_blocked_load;
> +#else
> + return false;
> #endif
> }
>
(Wrongly thought that this bit was in a different patch, comment should have
been squashed in previous reply...)
I've been thinking about this, and it's a messy one if we want to
skip CPUs in idle_balance() / clear the nohz.has_blocked_flag.
AFAICT, the rq->has_blocked_load flag can be wrongly cleared: if we're
calling update_nohz_stats() for CPU A, but CPU A got out/in of
idle really quickly in that same timeframe, I'm not sure you can guarantee
the clearing of rq->has_blocked_load done in update_blocked_averages() will
always end up in memory before the setting of the flag in
nohz_balance_enter_idle().
I was going to say we don't have this problem in _nohz_idle_balance() but
actually I think we do. We have the checking of nohz.idle_cpus_mask after the
smp_mb(), which makes sure the clear of nohz.has_blocked will never
overwrite the set in nohz_balance_enter_idle(), but it doesn't
guarantee the same for the rq flag. So we can have nohz CPUs with blocked
load but with rq->has_blocked_load set to false. Which isn't a problem now
but it is if we want to use the flag to skip CPUs.
Am I correct or am I going crazy ? There's a comment about this in
nohz_balance_enter_idle() but I'm confused now:
/*
* Can be set safely without rq->lock held
* If a clear happens, it will have evaluated last additions because
* rq->lock is held during the check and the clear
*/
rq->has_blocked_load = 1;
On 16 February 2018 at 13:13, Valentin Schneider
<[email protected]> wrote:
> On 02/14/2018 03:26 PM, Vincent Guittot wrote:
>> Stopped the periodic update of blocked load when all idle CPUs have fully
>> decayed. We introduce a new nohz.has_blocked that reflect if some idle
>> CPUs has blocked load that have to be periodiccally updated. nohz.has_blocked
>> is set everytime that a Idle CPU can have blocked load and it is then clear
>> when no more blocked load has been detected during an update. We don't need
>> atomic operation but only to make cure of the right ordering when updating
>> nohz.idle_cpus_mask and nohz.has_blocked.
>>
>> Suggested-by: Peter Zijlstra (Intel) <[email protected]>
>> Signed-off-by: Vincent Guittot <[email protected]>
>> ---
>> kernel/sched/fair.c | 122 ++++++++++++++++++++++++++++++++++++++++++---------
>> kernel/sched/sched.h | 1 +
>> 2 files changed, 102 insertions(+), 21 deletions(-)
>>
>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>> index 7af1fa9..5a6835e 100644
>> --- a/kernel/sched/fair.c
>> +++ b/kernel/sched/fair.c
>>
>> [...]
>>> @@ -9374,6 +9427,22 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
>>
>> SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK);
>>
>> + /*
>> + * We assume there will be no idle load after this update and clear
>> + * the has_blocked flag. If a cpu enters idle in the mean time, it will
>> + * set the has_blocked flag and trig another update of idle load.
>> + * Because a cpu that becomes idle, is added to idle_cpus_mask before
>> + * setting the flag, we are sure to not clear the state and not
>> + * check the load of an idle cpu.
>> + */
>> + WRITE_ONCE(nohz.has_blocked, 0);
>> +
>> + /*
>> + * Ensures that if we miss the CPU, we must see the has_blocked
>> + * store from nohz_balance_enter_idle().
>> + */
>> + smp_mb();
>> +
>> for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
>> if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
>> continue;
>> @@ -9383,11 +9452,16 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
>> * work being done for other cpus. Next load
>> * balancing owner will pick it up.
>> */
>> - if (need_resched())
>> - break;
>> + if (need_resched()) {
>> + has_blocked_load = true;
>> + goto abort;
>> + }
>>
>> rq = cpu_rq(balance_cpu);
>>
>
> I'd say it's safe to do the following here. The flag is raised in
> nohz_balance_enter_idle() before the smp_mb(), so we won't skip a CPU
> that just got added to nohz.idle_cpus_mask.
rq->has_blocked_load will be set before the barrier only if
nohz_tick_stopped is not already set,
Otherwise, we skip cpumask update and the barrier in nohz_balance_enter_idle
>
> /*
> * This cpu doesn't have any remaining blocked load, skip it.
> * It's sane to do this because this flag is raised in
> * nohz_balance_enter_idle()
> */
> if ((flags & NOHZ_KICK_MASK) == NOHZ_STATS_KICK &&
> !rq->has_blocked_load)
> continue;
>
>> + update_blocked_averages(rq->cpu);
>> + has_blocked_load |= rq->has_blocked_load;
>> +
>> /*
>> * If time for next balance is due,
>> * do the balance.
>> @@ -9400,7 +9474,6 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
>> cpu_load_update_idle(rq);
>> rq_unlock_irq(rq, &rf);
>>
>> - update_blocked_averages(rq->cpu);
>> if (flags & NOHZ_BALANCE_KICK)
>> rebalance_domains(rq, CPU_IDLE);
>> }
>> @@ -9415,7 +9488,13 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
>> if (flags & NOHZ_BALANCE_KICK)
>> rebalance_domains(this_rq, CPU_IDLE);
>>
>> - nohz.next_stats = next_stats;
>> + WRITE_ONCE(nohz.next_blocked,
>> + now + msecs_to_jiffies(LOAD_AVG_PERIOD));
>> +
>> +abort:
>> + /* There is still blocked load, enable periodic update */
>> + if (has_blocked_load)
>> + WRITE_ONCE(nohz.has_blocked, 1);
>>
>> /*
>> * next_balance will be updated only when there is a need.
On 16 February 2018 at 13:53, Valentin Schneider
<[email protected]> wrote:
> On 02/14/2018 03:26 PM, Vincent Guittot wrote:
>> Stopped the periodic update of blocked load when all idle CPUs have fully
>> decayed. We introduce a new nohz.has_blocked that reflect if some idle
>> CPUs has blocked load that have to be periodiccally updated. nohz.has_blocked
>> is set everytime that a Idle CPU can have blocked load and it is then clear
>> when no more blocked load has been detected during an update. We don't need
>> atomic operation but only to make cure of the right ordering when updating
>> nohz.idle_cpus_mask and nohz.has_blocked.
>>
>> Suggested-by: Peter Zijlstra (Intel) <[email protected]>
>> Signed-off-by: Vincent Guittot <[email protected]>
>> ---
>> kernel/sched/fair.c | 122 ++++++++++++++++++++++++++++++++++++++++++---------
>> kernel/sched/sched.h | 1 +
>> 2 files changed, 102 insertions(+), 21 deletions(-)
>>
>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>> index 7af1fa9..5a6835e 100644
>> --- a/kernel/sched/fair.c
>> +++ b/kernel/sched/fair.c
>>
>> [...]
>>
>> -static void update_nohz_stats(struct rq *rq)
>> +static bool update_nohz_stats(struct rq *rq)
>> {
>> #ifdef CONFIG_NO_HZ_COMMON
>> unsigned int cpu = rq->cpu;
>>
>> + if (!rq->has_blocked_load)
>> + return false;
>> +
>> if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
>> - return;
>> + return false;
>>
>> if (!time_after(jiffies, rq->last_blocked_load_update_tick))
>> - return;
>> + return true;
>>
>> update_blocked_averages(cpu);
>> +
>> + return rq->has_blocked_load;
>> +#else
>> + return false;
>> #endif
>> }
>>
>
> (Wrongly thought that this bit was in a different patch, comment should have
> been squashed in previous reply...)
>
> I've been thinking about this, and it's a messy one if we want to
> skip CPUs in idle_balance() / clear the nohz.has_blocked_flag.
>
> AFAICT, the rq->has_blocked_load flag can be wrongly cleared: if we're
> calling update_nohz_stats() for CPU A, but CPU A got out/in of
> idle really quickly in that same timeframe, I'm not sure you can guarantee
> the clearing of rq->has_blocked_load done in update_blocked_averages() will
> always end up in memory before the setting of the flag in
> nohz_balance_enter_idle().
Not sure it's a problem in this case because the clear done in
update_blocked_averages() only happens if there is no load on the rq
and new load can't be added in the mean time
>
> I was going to say we don't have this problem in _nohz_idle_balance() but
> actually I think we do. We have the checking of nohz.idle_cpus_mask after the
> smp_mb(), which makes sure the clear of nohz.has_blocked will never
> overwrite the set in nohz_balance_enter_idle(), but it doesn't
> guarantee the same for the rq flag. So we can have nohz CPUs with blocked
> load but with rq->has_blocked_load set to false. Which isn't a problem now
I don't think that we can have this case. Or at least I can't find a
sequence leading to this state.
Have you got a particular sequence in mind ?
> but it is if we want to use the flag to skip CPUs.
>
> Am I correct or am I going crazy ? There's a comment about this in
> nohz_balance_enter_idle() but I'm confused now:
>
> /*
> * Can be set safely without rq->lock held
> * If a clear happens, it will have evaluated last additions because
> * rq->lock is held during the check and the clear
> */
> rq->has_blocked_load = 1;
On 02/16/2018 05:02 PM, Vincent Guittot wrote:
> On 16 February 2018 at 13:53, Valentin Schneider
> <[email protected]> wrote:
>> On 02/14/2018 03:26 PM, Vincent Guittot wrote:
>>> Stopped the periodic update of blocked load when all idle CPUs have fully
>>> decayed. We introduce a new nohz.has_blocked that reflect if some idle
>>> CPUs has blocked load that have to be periodiccally updated. nohz.has_blocked
>>> is set everytime that a Idle CPU can have blocked load and it is then clear
>>> when no more blocked load has been detected during an update. We don't need
>>> atomic operation but only to make cure of the right ordering when updating
>>> nohz.idle_cpus_mask and nohz.has_blocked.
>>>
>>> Suggested-by: Peter Zijlstra (Intel) <[email protected]>
>>> Signed-off-by: Vincent Guittot <[email protected]>
>>> ---
>>> kernel/sched/fair.c | 122 ++++++++++++++++++++++++++++++++++++++++++---------
>>> kernel/sched/sched.h | 1 +
>>> 2 files changed, 102 insertions(+), 21 deletions(-)
>>>
>>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>>> index 7af1fa9..5a6835e 100644
>>> --- a/kernel/sched/fair.c
>>> +++ b/kernel/sched/fair.c
>>>
>>> [...]
>>>
>>> -static void update_nohz_stats(struct rq *rq)
>>> +static bool update_nohz_stats(struct rq *rq)
>>> {
>>> #ifdef CONFIG_NO_HZ_COMMON
>>> unsigned int cpu = rq->cpu;
>>>
>>> + if (!rq->has_blocked_load)
>>> + return false;
>>> +
>>> if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
>>> - return;
>>> + return false;
>>>
>>> if (!time_after(jiffies, rq->last_blocked_load_update_tick))
>>> - return;
>>> + return true;
>>>
>>> update_blocked_averages(cpu);
>>> +
>>> + return rq->has_blocked_load;
>>> +#else
>>> + return false;
>>> #endif
>>> }
>>>
>>
>> (Wrongly thought that this bit was in a different patch, comment should have
>> been squashed in previous reply...)
>>
>> I've been thinking about this, and it's a messy one if we want to
>> skip CPUs in idle_balance() / clear the nohz.has_blocked_flag.
>>
>> AFAICT, the rq->has_blocked_load flag can be wrongly cleared: if we're
>> calling update_nohz_stats() for CPU A, but CPU A got out/in of
>> idle really quickly in that same timeframe, I'm not sure you can guarantee
>> the clearing of rq->has_blocked_load done in update_blocked_averages() will
>> always end up in memory before the setting of the flag in
>> nohz_balance_enter_idle().
>
> Not sure it's a problem in this case because the clear done in
> update_blocked_averages() only happens if there is no load on the rq
> and new load can't be added in the mean time
>
You're right, and that's why there's that comment:
>> /*
>> * Can be set safely without rq->lock held
>> * If a clear happens, it will have evaluated last additions because
>> * rq->lock is held during the check and the clear
>> */
>> rq->has_blocked_load = 1;
Even though it's clearly written there my brain wouldn't process the fact
that the flag is cleared with the rq lock held. So yeah, we can't wrongly
clear rq->has_blocked_load. The only mishap that can happen is that it is
re-raised even though we just went though an update_nohz_stats(), which would
lead to a useless stats update in the future, but that's not as bad.
On Wed, Feb 14, 2018 at 04:26:46PM +0100, Vincent Guittot wrote:
> When NEWLY_IDLE load balance is not triggered, we might need to update the
> blocked load anyway. We can kick an ilb so an idle CPU will take care of
> updating blocked load or we can try to update them locally before entering
> idle. In the latter case, we reuse part of the nohz_idle_balance.
So I still don't like this, but then I couldn't come up with anything I
did like either. Munged the patch a bit, I pulled out the code movement
into a separate patch and otherwise reduced #ifdeffery a bit.
---
Subject: sched: update blocked load when newly idle
From: Vincent Guittot <[email protected]>
Date: Wed, 14 Feb 2018 16:26:46 +0100
When NEWLY_IDLE load balance is not triggered, we might need to update the
blocked load anyway. We can kick an ilb so an idle CPU will take care of
updating blocked load or we can try to update them locally before entering
idle. In the latter case, we reuse part of the nohz_idle_balance.
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
Signed-off-by: Vincent Guittot <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Link: http://lkml.kernel.org/r/[email protected]
---
kernel/sched/fair.c | 105 +++++++++++++++++++++++++++++++++++++++++++---------
1 file changed, 87 insertions(+), 18 deletions(-)
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -9394,10 +9394,14 @@ void nohz_balance_enter_idle(int cpu)
}
/*
- * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
- * rebalancing for all the cpus for whom scheduler ticks are stopped.
+ * Internal function that runs load balance for all idle cpus. The load balance
+ * can be a simple update of blocked load or a complete load balance with
+ * tasks movement depending of flags.
+ * The function returns false if the loop has stopped before running
+ * through all idle CPUs.
*/
-static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
+static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
+ enum cpu_idle_type idle)
{
/* Earliest time when we have to do rebalance again */
unsigned long now = jiffies;
@@ -9405,20 +9409,10 @@ static bool nohz_idle_balance(struct rq
bool has_blocked_load = false;
int update_next_balance = 0;
int this_cpu = this_rq->cpu;
- unsigned int flags;
int balance_cpu;
+ int ret = false;
struct rq *rq;
- if (!(atomic_read(nohz_flags(this_cpu)) & NOHZ_KICK_MASK))
- return false;
-
- if (idle != CPU_IDLE) {
- atomic_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
- return false;
- }
-
- flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
-
SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK);
/*
@@ -9462,10 +9456,10 @@ static bool nohz_idle_balance(struct rq
if (time_after_eq(jiffies, rq->next_balance)) {
struct rq_flags rf;
- rq_lock_irq(rq, &rf);
+ rq_lock_irqsave(rq, &rf);
update_rq_clock(rq);
cpu_load_update_idle(rq);
- rq_unlock_irq(rq, &rf);
+ rq_unlock_irqrestore(rq, &rf);
if (flags & NOHZ_BALANCE_KICK)
rebalance_domains(rq, CPU_IDLE);
@@ -9477,13 +9471,21 @@ static bool nohz_idle_balance(struct rq
}
}
- update_blocked_averages(this_cpu);
+ /* Newly idle CPU doesn't need an update */
+ if (idle != CPU_NEWLY_IDLE) {
+ update_blocked_averages(this_cpu);
+ has_blocked_load |= this_rq->has_blocked_load;
+ }
+
if (flags & NOHZ_BALANCE_KICK)
rebalance_domains(this_rq, CPU_IDLE);
WRITE_ONCE(nohz.next_blocked,
now + msecs_to_jiffies(LOAD_AVG_PERIOD));
+ /* The full idle balance loop has been done */
+ ret = true;
+
abort:
/* There is still blocked load, enable periodic update */
if (has_blocked_load)
@@ -9497,15 +9499,79 @@ static bool nohz_idle_balance(struct rq
if (likely(update_next_balance))
nohz.next_balance = next_balance;
+ return ret;
+}
+
+/*
+ * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
+ * rebalancing for all the cpus for whom scheduler ticks are stopped.
+ */
+static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
+{
+ int this_cpu = this_rq->cpu;
+ unsigned int flags;
+
+ if (!(atomic_read(nohz_flags(this_cpu)) & NOHZ_KICK_MASK))
+ return false;
+
+ if (idle != CPU_IDLE) {
+ atomic_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
+ return false;
+ }
+
+ /*
+ * barrier, pairs with nohz_balance_enter_idle(), ensures ...
+ */
+ flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
+ if (!(flags & NOHZ_KICK_MASK))
+ return false;
+
+ _nohz_idle_balance(this_rq, flags, idle);
+
return true;
}
+
+static void nohz_newidle_balance(struct rq *this_rq)
+{
+ int this_cpu = this_rq->cpu;
+
+ /*
+ * This CPU doesn't want to be disturbed by scheduler
+ * housekeeping
+ */
+ if (!housekeeping_cpu(this_cpu, HK_FLAG_SCHED))
+ return;
+
+ /* Will wake up very soon. No time for doing anything else*/
+ if (this_rq->avg_idle < sysctl_sched_migration_cost)
+ return;
+
+ /* Don't need to update blocked load of idle CPUs*/
+ if (!READ_ONCE(nohz.has_blocked) ||
+ time_before(jiffies, READ_ONCE(nohz.next_blocked)))
+ return;
+
+ raw_spin_unlock(&this_rq->lock);
+ /*
+ * This CPU is going to be idle and blocked load of idle CPUs
+ * need to be updated. Run the ilb locally as it is a good
+ * candidate for ilb instead of waking up another idle CPU.
+ * Kick an normal ilb if we failed to do the update.
+ */
+ if (!_nohz_idle_balance(this_rq, NOHZ_STATS_KICK, CPU_NEWLY_IDLE))
+ kick_ilb(NOHZ_STATS_KICK);
+ raw_spin_lock(&this_rq->lock);
+}
+
#else /* !CONFIG_NO_HZ_COMMON */
static inline void nohz_balancer_kick(struct rq *rq) { }
-static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
+static inline bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
{
return false;
}
+
+static inline void nohz_newidle_balance(struct rq *this_rq) { }
#endif /* CONFIG_NO_HZ_COMMON */
/*
@@ -9542,12 +9608,15 @@ static int idle_balance(struct rq *this_
if (this_rq->avg_idle < sysctl_sched_migration_cost ||
!this_rq->rd->overload) {
+
rcu_read_lock();
sd = rcu_dereference_check_sched_domain(this_rq->sd);
if (sd)
update_next_balance(sd, &next_balance);
rcu_read_unlock();
+ nohz_newidle_balance(this_rq);
+
goto out;
}
On 02/16/2018 01:44 PM, Vincent Guittot wrote:
> On 16 February 2018 at 13:13, Valentin Schneider
> <[email protected]> wrote:
>> On 02/14/2018 03:26 PM, Vincent Guittot wrote:
>>> Stopped the periodic update of blocked load when all idle CPUs have fully
>>> decayed. We introduce a new nohz.has_blocked that reflect if some idle
>>> CPUs has blocked load that have to be periodiccally updated. nohz.has_blocked
>>> is set everytime that a Idle CPU can have blocked load and it is then clear
>>> when no more blocked load has been detected during an update. We don't need
>>> atomic operation but only to make cure of the right ordering when updating
>>> nohz.idle_cpus_mask and nohz.has_blocked.
>>>
>>> Suggested-by: Peter Zijlstra (Intel) <[email protected]>
>>> Signed-off-by: Vincent Guittot <[email protected]>
>>> ---
>>> kernel/sched/fair.c | 122 ++++++++++++++++++++++++++++++++++++++++++---------
>>> kernel/sched/sched.h | 1 +
>>> 2 files changed, 102 insertions(+), 21 deletions(-)
>>>
>>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>>> index 7af1fa9..5a6835e 100644
>>> --- a/kernel/sched/fair.c
>>> +++ b/kernel/sched/fair.c
>>>
>>> [...]
>>> @@ -9383,11 +9452,16 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
>>> * work being done for other cpus. Next load
>>> * balancing owner will pick it up.
>>> */
>>> - if (need_resched())
>>> - break;
>>> + if (need_resched()) {
>>> + has_blocked_load = true;
>>> + goto abort;
>>> + }
>>>
>>> rq = cpu_rq(balance_cpu);
>>>
>>
>> I'd say it's safe to do the following here. The flag is raised in
>> nohz_balance_enter_idle() before the smp_mb(), so we won't skip a CPU
>> that just got added to nohz.idle_cpus_mask.
>
> rq->has_blocked_load will be set before the barrier only if
> nohz_tick_stopped is not already set,
> Otherwise, we skip cpumask update and the barrier in nohz_balance_enter_idle
>
Right, forgot about that bit. I think it's still fine because:
- nohz_balance_enter_idle() can't be called before the last running task is
dequeued, which requires the rq lock.
- update_blocked_averages takes the rq lock and clears rq->has_blocked_load
with the lock held.
So even though we could have some very unlikely scenario where a CPU quickly
goes out/in of idle after nohz.idle_cpus_mask has been read, the blocked load
itself is safe so rq->has_blocked_load will end up being set correctly.
(Took me a while to see it that way)
BTW, with the current set on Peter's sched/testing, update_nohz_stats()
is called here, which doesn't do the update if !rq->has_blocked_load
(Although that check is done without lock/barrier, so maybe we could not see
a CPU that just went idle ?)
I have one more question on that bit:
has_blocked_load |= update_nohz_stats(rq, true);
/*
* If time for next balance is due,
* do the balance.
*/
if (time_after_eq(jiffies, rq->next_balance)) {
struct rq_flags rf;
rq_lock_irqsave(rq, &rf);
update_rq_clock(rq);
cpu_load_update_idle(rq);
rq_unlock_irqrestore(rq, &rf);
if (flags & NOHZ_BALANCE_KICK)
rebalance_domains(rq, CPU_IDLE);
}
if (time_after(next_balance, rq->next_balance)) {
next_balance = rq->next_balance;
update_next_balance = 1;
}
Now that I think about it, shouldn't we always have a 'continue' after
the blocked load update if (flags & NOHZ_KICK_MASK) == NOHZ_STATS_KICK ?
AFAICT we don't want to push the next_balance forward, only the next_blocked.
That would also take care of not doing the load balance.
>>
>> /*
>> * This cpu doesn't have any remaining blocked load, skip it.
>> * It's sane to do this because this flag is raised in
>> * nohz_balance_enter_idle()
>> */
>> if ((flags & NOHZ_KICK_MASK) == NOHZ_STATS_KICK &&
>> !rq->has_blocked_load)
>> continue;
>>
>>> + update_blocked_averages(rq->cpu);
>>> + has_blocked_load |= rq->has_blocked_load;
>>> +
On 02/21/2018 01:13 PM, Valentin Schneider wrote:
> On 02/16/2018 01:44 PM, Vincent Guittot wrote:
>> On 16 February 2018 at 13:13, Valentin Schneider
>> <[email protected]> wrote:
>>> On 02/14/2018 03:26 PM, Vincent Guittot wrote:
> BTW, with the current set on Peter's sched/testing, update_nohz_stats()
> is called here, which doesn't do the update if !rq->has_blocked_load
> (Although that check is done without lock/barrier, so maybe we could not see
> a CPU that just went idle ?)
Ignore that, that's another case of me being overly paranoid after reading
too much of Documentation/memory-barriers.txt
On 21 February 2018 at 14:13, Valentin Schneider
<[email protected]> wrote:
> On 02/16/2018 01:44 PM, Vincent Guittot wrote:
>> On 16 February 2018 at 13:13, Valentin Schneider
>> <[email protected]> wrote:
>>> On 02/14/2018 03:26 PM, Vincent Guittot wrote:
>>>> Stopped the periodic update of blocked load when all idle CPUs have fully
>>>> decayed. We introduce a new nohz.has_blocked that reflect if some idle
>>>> CPUs has blocked load that have to be periodiccally updated. nohz.has_blocked
>>>> is set everytime that a Idle CPU can have blocked load and it is then clear
>>>> when no more blocked load has been detected during an update. We don't need
>>>> atomic operation but only to make cure of the right ordering when updating
>>>> nohz.idle_cpus_mask and nohz.has_blocked.
>>>>
>>>> Suggested-by: Peter Zijlstra (Intel) <[email protected]>
>>>> Signed-off-by: Vincent Guittot <[email protected]>
>>>> ---
>>>> kernel/sched/fair.c | 122 ++++++++++++++++++++++++++++++++++++++++++---------
>>>> kernel/sched/sched.h | 1 +
>>>> 2 files changed, 102 insertions(+), 21 deletions(-)
>>>>
>>>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>>>> index 7af1fa9..5a6835e 100644
>>>> --- a/kernel/sched/fair.c
>>>> +++ b/kernel/sched/fair.c
>>>>
>>>> [...]
>>>> @@ -9383,11 +9452,16 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
>>>> * work being done for other cpus. Next load
>>>> * balancing owner will pick it up.
>>>> */
>>>> - if (need_resched())
>>>> - break;
>>>> + if (need_resched()) {
>>>> + has_blocked_load = true;
>>>> + goto abort;
>>>> + }
>>>>
>>>> rq = cpu_rq(balance_cpu);
>>>>
>>>
>>> I'd say it's safe to do the following here. The flag is raised in
>>> nohz_balance_enter_idle() before the smp_mb(), so we won't skip a CPU
>>> that just got added to nohz.idle_cpus_mask.
>>
>> rq->has_blocked_load will be set before the barrier only if
>> nohz_tick_stopped is not already set,
>> Otherwise, we skip cpumask update and the barrier in nohz_balance_enter_idle
>>
>
> Right, forgot about that bit. I think it's still fine because:
> - nohz_balance_enter_idle() can't be called before the last running task is
> dequeued, which requires the rq lock.
> - update_blocked_averages takes the rq lock and clears rq->has_blocked_load
> with the lock held.
>
> So even though we could have some very unlikely scenario where a CPU quickly
> goes out/in of idle after nohz.idle_cpus_mask has been read, the blocked load
> itself is safe so rq->has_blocked_load will end up being set correctly.
> (Took me a while to see it that way)
>
>
> BTW, with the current set on Peter's sched/testing, update_nohz_stats()
> is called here, which doesn't do the update if !rq->has_blocked_load
> (Although that check is done without lock/barrier, so maybe we could not see
> a CPU that just went idle ?)
>
> I have one more question on that bit:
>
>
> has_blocked_load |= update_nohz_stats(rq, true);
>
> /*
> * If time for next balance is due,
> * do the balance.
> */
> if (time_after_eq(jiffies, rq->next_balance)) {
> struct rq_flags rf;
>
> rq_lock_irqsave(rq, &rf);
> update_rq_clock(rq);
> cpu_load_update_idle(rq);
> rq_unlock_irqrestore(rq, &rf);
>
> if (flags & NOHZ_BALANCE_KICK)
> rebalance_domains(rq, CPU_IDLE);
> }
>
> if (time_after(next_balance, rq->next_balance)) {
> next_balance = rq->next_balance;
> update_next_balance = 1;
> }
>
>
> Now that I think about it, shouldn't we always have a 'continue' after
> the blocked load update if (flags & NOHZ_KICK_MASK) == NOHZ_STATS_KICK ?
> AFAICT we don't want to push the next_balance forward, only the next_blocked.
But we don't push next_balance forward. It just get the shortest
next_balance and update nohz.next_balance exactly like what is done in
full idle load balance
> That would also take care of not doing the load balance.
>>>
>>> /*
>>> * This cpu doesn't have any remaining blocked load, skip it.
>>> * It's sane to do this because this flag is raised in
>>> * nohz_balance_enter_idle()
>>> */
>>> if ((flags & NOHZ_KICK_MASK) == NOHZ_STATS_KICK &&
>>> !rq->has_blocked_load)
>>> continue;
Then, it's worth keeping the call to cpu_load_update_idle(rq); which
update the cpu_load[] array which is still used at some level
>>>
>>>> + update_blocked_averages(rq->cpu);
>>>> + has_blocked_load |= rq->has_blocked_load;
>>>> +
On 02/22/2018 08:37 AM, Vincent Guittot wrote:
> On 21 February 2018 at 14:13, Valentin Schneider
> <[email protected]> wrote:
>> On 02/16/2018 01:44 PM, Vincent Guittot wrote:
>>> On 16 February 2018 at 13:13, Valentin Schneider
>>> <[email protected]> wrote:
>>>> On 02/14/2018 03:26 PM, Vincent Guittot wrote:
>>>>> Stopped the periodic update of blocked load when all idle CPUs have fully
>>>>> decayed. We introduce a new nohz.has_blocked that reflect if some idle
>>>>> CPUs has blocked load that have to be periodiccally updated. nohz.has_blocked
>>>>> is set everytime that a Idle CPU can have blocked load and it is then clear
>>>>> when no more blocked load has been detected during an update. We don't need
>>>>> atomic operation but only to make cure of the right ordering when updating
>>>>> nohz.idle_cpus_mask and nohz.has_blocked.
>>>>>
>>>>> Suggested-by: Peter Zijlstra (Intel) <[email protected]>
>>>>> Signed-off-by: Vincent Guittot <[email protected]>
>>>>> ---
>>>>> kernel/sched/fair.c | 122 ++++++++++++++++++++++++++++++++++++++++++---------
>>>>> kernel/sched/sched.h | 1 +
>>>>> 2 files changed, 102 insertions(+), 21 deletions(-)
>>>>>
>>>>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>>>>> index 7af1fa9..5a6835e 100644
>>>>> --- a/kernel/sched/fair.c
>>>>> +++ b/kernel/sched/fair.c
>>>>>
>>>>> [...]
>>
>> I have one more question on that bit:
>>
>>
>> has_blocked_load |= update_nohz_stats(rq, true);
>>
>> /*
>> * If time for next balance is due,
>> * do the balance.
>> */
>> if (time_after_eq(jiffies, rq->next_balance)) {
>> struct rq_flags rf;
>>
>> rq_lock_irqsave(rq, &rf);
>> update_rq_clock(rq);
>> cpu_load_update_idle(rq);
>> rq_unlock_irqrestore(rq, &rf);
>>
>> if (flags & NOHZ_BALANCE_KICK)
>> rebalance_domains(rq, CPU_IDLE);
>> }
>>
>> if (time_after(next_balance, rq->next_balance)) {
>> next_balance = rq->next_balance;
>> update_next_balance = 1;
>> }
>>
>>
>> Now that I think about it, shouldn't we always have a 'continue' after
>> the blocked load update if (flags & NOHZ_KICK_MASK) == NOHZ_STATS_KICK ?
>> AFAICT we don't want to push the next_balance forward, only the next_blocked.
>
> But we don't push next_balance forward. It just get the shortest
> next_balance and update nohz.next_balance exactly like what is done in
> full idle load balance
>
Sorry, that was a poor choice of words - I probably should've just gone with
"update". What I meant by that is that if we have
(flags & NOHZ_KICK_MASK) == NOHZ_STATS_KICK
then we're not going to do the load balance.
Then, in this case, I thought that we should not be going through any
condition that uses nohz.next_balance (since we're not doing any balancing).
Arguably *updating* nohz.next_balance still makes sense in this scenario.
In short, my comment was mostly about "cleanly" separating stats update vs
load balance.
>> That would also take care of not doing the load balance.
>>>>
>>>> /*
>>>> * This cpu doesn't have any remaining blocked load, skip it.
>>>> * It's sane to do this because this flag is raised in
>>>> * nohz_balance_enter_idle()
>>>> */
>>>> if ((flags & NOHZ_KICK_MASK) == NOHZ_STATS_KICK &&
>>>> !rq->has_blocked_load)
>>>> continue;
>
> Then, it's worth keeping the call to cpu_load_update_idle(rq); which
> update the cpu_load[] array which is still used at some level
>
Is that something we would want to have in update_nohz_stats() to also
cover the idle_balance -> load_balance update scenario ?
From a quick glance I would've said it shouldn't be needed since the CPU doing
the updates wouldn't have been nohz previously, but we're currently calling
it when going through nohz_newidle_balance() so I might have gotten that wrong.
>>>>
>>>>> + update_blocked_averages(rq->cpu);
>>>>> + has_blocked_load |= rq->has_blocked_load;
>>>>> +
Commit-ID: 31e77c93e432dec79c7d90b888bbfc3652592741
Gitweb: https://git.kernel.org/tip/31e77c93e432dec79c7d90b888bbfc3652592741
Author: Vincent Guittot <[email protected]>
AuthorDate: Wed, 14 Feb 2018 16:26:46 +0100
Committer: Ingo Molnar <[email protected]>
CommitDate: Fri, 9 Mar 2018 07:59:28 +0100
sched/fair: Update blocked load when newly idle
When NEWLY_IDLE load balance is not triggered, we might need to update the
blocked load anyway. We can kick an ilb so an idle CPU will take care of
updating blocked load or we can try to update them locally before entering
idle. In the latter case, we reuse part of the nohz_idle_balance.
Signed-off-by: Vincent Guittot <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Cc: Linus Torvalds <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Ingo Molnar <[email protected]>
---
kernel/sched/fair.c | 105 +++++++++++++++++++++++++++++++++++++++++++---------
1 file changed, 87 insertions(+), 18 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 058badcfa94b..3582117e1580 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -9375,10 +9375,14 @@ out:
}
/*
- * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
- * rebalancing for all the CPUs for whom scheduler ticks are stopped.
+ * Internal function that runs load balance for all idle cpus. The load balance
+ * can be a simple update of blocked load or a complete load balance with
+ * tasks movement depending of flags.
+ * The function returns false if the loop has stopped before running
+ * through all idle CPUs.
*/
-static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
+static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
+ enum cpu_idle_type idle)
{
/* Earliest time when we have to do rebalance again */
unsigned long now = jiffies;
@@ -9386,20 +9390,10 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
bool has_blocked_load = false;
int update_next_balance = 0;
int this_cpu = this_rq->cpu;
- unsigned int flags;
int balance_cpu;
+ int ret = false;
struct rq *rq;
- if (!(atomic_read(nohz_flags(this_cpu)) & NOHZ_KICK_MASK))
- return false;
-
- if (idle != CPU_IDLE) {
- atomic_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
- return false;
- }
-
- flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
-
SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK);
/*
@@ -9443,10 +9437,10 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
if (time_after_eq(jiffies, rq->next_balance)) {
struct rq_flags rf;
- rq_lock_irq(rq, &rf);
+ rq_lock_irqsave(rq, &rf);
update_rq_clock(rq);
cpu_load_update_idle(rq);
- rq_unlock_irq(rq, &rf);
+ rq_unlock_irqrestore(rq, &rf);
if (flags & NOHZ_BALANCE_KICK)
rebalance_domains(rq, CPU_IDLE);
@@ -9458,13 +9452,21 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
}
}
- update_blocked_averages(this_cpu);
+ /* Newly idle CPU doesn't need an update */
+ if (idle != CPU_NEWLY_IDLE) {
+ update_blocked_averages(this_cpu);
+ has_blocked_load |= this_rq->has_blocked_load;
+ }
+
if (flags & NOHZ_BALANCE_KICK)
rebalance_domains(this_rq, CPU_IDLE);
WRITE_ONCE(nohz.next_blocked,
now + msecs_to_jiffies(LOAD_AVG_PERIOD));
+ /* The full idle balance loop has been done */
+ ret = true;
+
abort:
/* There is still blocked load, enable periodic update */
if (has_blocked_load)
@@ -9478,15 +9480,79 @@ abort:
if (likely(update_next_balance))
nohz.next_balance = next_balance;
+ return ret;
+}
+
+/*
+ * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
+ * rebalancing for all the cpus for whom scheduler ticks are stopped.
+ */
+static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
+{
+ int this_cpu = this_rq->cpu;
+ unsigned int flags;
+
+ if (!(atomic_read(nohz_flags(this_cpu)) & NOHZ_KICK_MASK))
+ return false;
+
+ if (idle != CPU_IDLE) {
+ atomic_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
+ return false;
+ }
+
+ /*
+ * barrier, pairs with nohz_balance_enter_idle(), ensures ...
+ */
+ flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
+ if (!(flags & NOHZ_KICK_MASK))
+ return false;
+
+ _nohz_idle_balance(this_rq, flags, idle);
+
return true;
}
+
+static void nohz_newidle_balance(struct rq *this_rq)
+{
+ int this_cpu = this_rq->cpu;
+
+ /*
+ * This CPU doesn't want to be disturbed by scheduler
+ * housekeeping
+ */
+ if (!housekeeping_cpu(this_cpu, HK_FLAG_SCHED))
+ return;
+
+ /* Will wake up very soon. No time for doing anything else*/
+ if (this_rq->avg_idle < sysctl_sched_migration_cost)
+ return;
+
+ /* Don't need to update blocked load of idle CPUs*/
+ if (!READ_ONCE(nohz.has_blocked) ||
+ time_before(jiffies, READ_ONCE(nohz.next_blocked)))
+ return;
+
+ raw_spin_unlock(&this_rq->lock);
+ /*
+ * This CPU is going to be idle and blocked load of idle CPUs
+ * need to be updated. Run the ilb locally as it is a good
+ * candidate for ilb instead of waking up another idle CPU.
+ * Kick an normal ilb if we failed to do the update.
+ */
+ if (!_nohz_idle_balance(this_rq, NOHZ_STATS_KICK, CPU_NEWLY_IDLE))
+ kick_ilb(NOHZ_STATS_KICK);
+ raw_spin_lock(&this_rq->lock);
+}
+
#else /* !CONFIG_NO_HZ_COMMON */
static inline void nohz_balancer_kick(struct rq *rq) { }
-static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
+static inline bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
{
return false;
}
+
+static inline void nohz_newidle_balance(struct rq *this_rq) { }
#endif /* CONFIG_NO_HZ_COMMON */
/*
@@ -9523,12 +9589,15 @@ static int idle_balance(struct rq *this_rq, struct rq_flags *rf)
if (this_rq->avg_idle < sysctl_sched_migration_cost ||
!this_rq->rd->overload) {
+
rcu_read_lock();
sd = rcu_dereference_check_sched_domain(this_rq->sd);
if (sd)
update_next_balance(sd, &next_balance);
rcu_read_unlock();
+ nohz_newidle_balance(this_rq);
+
goto out;
}