by Vincent Guittot

[permalink] [raw]

Subject: Re: [PATCH 3/3] sched: update blocked load when newly idle

On 6 February 2018 at 15:32, Valentin Schneider
<[email protected]> wrote:
> Hi Vincent,
>
> On 02/06/2018 08:32 AM, Vincent Guittot wrote:
>> When NEWLY_IDLE load balance is not triggered, we might need to update the
>> blocked load anyway. We can kick an ilb so an idle CPU will take care of
>> updating blocked load or we can try to update them locally before entering
>> idle. In the latter case, we reuse part of the nohz_idle_balance.
>>
>> Signed-off-by: Vincent Guittot <[email protected]>
>> ---
>> kernel/sched/fair.c | 102 ++++++++++++++++++++++++++++++++++++++++++----------
>> 1 file changed, 84 insertions(+), 18 deletions(-)
>>
>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>> index 6998528..256defe 100644
>> --- a/kernel/sched/fair.c
>> +++ b/kernel/sched/fair.c
>> @@ -8829,6 +8829,9 @@ update_next_balance(struct sched_domain *sd, unsigned long *next_balance)
>> *next_balance = next;
>> }
>>
>> +static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags, enum cpu_idle_type idle);
>> +static void kick_ilb(unsigned int flags);
>> +
>> /*
>> * idle_balance is called by schedule() if this_cpu is about to become
>> * idle. Attempts to pull tasks from other CPUs.
>> @@ -8863,12 +8866,26 @@ static int idle_balance(struct rq *this_rq, struct rq_flags *rf)
>>
>> if (this_rq->avg_idle < sysctl_sched_migration_cost ||
>> !this_rq->rd->overload) {
>> + unsigned long has_blocked = READ_ONCE(nohz.has_blocked);
>> + unsigned long next = READ_ONCE(nohz.next_blocked);
>
> Ditto on 'next' - there's next_balance referenced in here so it'd be nice to make clear which is which.
>
>> +
>> rcu_read_lock();
>> sd = rcu_dereference_check_sched_domain(this_rq->sd);
>> if (sd)
>> update_next_balance(sd, &next_balance);
>> rcu_read_unlock();
>>
>> + /*
>> + * Update blocked idle load if it has not been done for a
>> + * while. Try to do it locally before entering idle but kick a
>> + * ilb if it takes too much time and/or might delay next local
>> + * wake up
>> + */
>> + if (has_blocked && time_after_eq(jiffies, next) &&
>> + (this_rq->avg_idle < sysctl_sched_migration_cost ||
>> + !_nohz_idle_balance(this_rq, NOHZ_STATS_KICK, CPU_NEWLY_IDLE)))
>
> "this_rq->avg_idle < sysctl_sched_migration_cost" is used twice now, how about storing it in an "idle_too_short" variable ?

In fact it's already the 3rd time
Why do you want it to be stored in an "idle_too_short" variable ?

>
>> + kick_ilb(NOHZ_STATS_KICK);
>> +
>> goto out;
>> }
>>
>> @@ -9393,30 +9410,24 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
>>
>> #ifdef CONFIG_NO_HZ_COMMON
>> /*
>> - * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
>> - * rebalancing for all the cpus for whom scheduler ticks are stopped.
>> + * Internal function that runs load balance for all idle cpus. The load balance
>> + * can be a simple update of blocked load or a complete load balance with
>> + * tasks movement depending of flags.
>> + * For newly idle mode, we abort the loop if it takes too much time and return
>> + * false to notify that the loop has not be completed and a ilb should be kick.
>> */
>> -static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
>> +static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags, enum cpu_idle_type idle)
>> {
>> /* Earliest time when we have to do rebalance again */
>> unsigned long now = jiffies;
>> unsigned long next_balance = now + 60*HZ;
>> - unsigned long next_stats = now + msecs_to_jiffies(LOAD_AVG_PERIOD);
>> + bool has_blocked_load = false;
>> int update_next_balance = 0;
>> int this_cpu = this_rq->cpu;
>> - unsigned int flags;
>> int balance_cpu;
>> + int ret = false;
>> struct rq *rq;
>> -
>> - if (!(atomic_read(nohz_flags(this_cpu)) & NOHZ_KICK_MASK))
>> - return false;
>> -
>> - if (idle != CPU_IDLE) {
>> - atomic_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
>> - return false;
>> - }
>> -
>> - flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
>> + u64 curr_cost = 0;
>>
>> SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK);
>>
>> @@ -9431,6 +9442,10 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
>> WRITE_ONCE(nohz.has_blocked, 0);
>>
>> for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
>> + u64 t0, domain_cost;
>> +
>> + t0 = sched_clock_cpu(this_cpu);
>> +
>> if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
>> continue;
>>
>> @@ -9444,6 +9459,16 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
>> goto abort;
>> }
>>
>> + /*
>> + * If the update is done while CPU becomes idle, we abort
>> + * the update when its cost is higher than the average idle
>> + * time in orde to not delay a possible wake up.
>> + */
>> + if (idle == CPU_NEWLY_IDLE && this_rq->avg_idle < curr_cost) {
>> + has_blocked_load = true;
>> + goto abort;
>> + }
>> +
>> rq = cpu_rq(balance_cpu);
>>
>> update_blocked_averages(rq->cpu);
>> @@ -9456,10 +9481,10 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
>> if (time_after_eq(jiffies, rq->next_balance)) {
>> struct rq_flags rf;
>>
>> - rq_lock_irq(rq, &rf);
>> + rq_lock_irqsave(rq, &rf);
>> update_rq_clock(rq);
>> cpu_load_update_idle(rq);
>> - rq_unlock_irq(rq, &rf);
>> + rq_unlock_irqrestore(rq, &rf);
>>
>> if (flags & NOHZ_BALANCE_KICK)
>> rebalance_domains(rq, CPU_IDLE);
>> @@ -9469,15 +9494,27 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
>> next_balance = rq->next_balance;
>> update_next_balance = 1;
>> }
>> +
>> + domain_cost = sched_clock_cpu(this_cpu) - t0;
>> + curr_cost += domain_cost;
>> +
>> + }
>> +
>> + /* Newly idle CPU doesn't need an update */
>> + if (idle != CPU_NEWLY_IDLE) {
>> + update_blocked_averages(this_cpu);
>> + has_blocked_load |= this_rq->has_blocked_load;
>> }
>>
>> - update_blocked_averages(this_cpu);
>> if (flags & NOHZ_BALANCE_KICK)
>> rebalance_domains(this_rq, CPU_IDLE);
>>
>> WRITE_ONCE(nohz.next_blocked,
>> now + msecs_to_jiffies(LOAD_AVG_PERIOD));
>>
>> + /* The full idle balance loop has been done */
>> + ret = true;
>> +
>> abort:
>> /* There is still blocked load, enable periodic update */
>> if (has_blocked_load)
>> @@ -9491,6 +9528,35 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
>> if (likely(update_next_balance))
>> nohz.next_balance = next_balance;
>>
>> + return ret;
>> +}
>> +
>> +/*
>> + * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
>> + * rebalancing for all the cpus for whom scheduler ticks are stopped.
>> + */
>> +static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
>> +{
>> + int this_cpu = this_rq->cpu;
>> + unsigned int flags;
>> +
>> + if (!(atomic_read(nohz_flags(this_cpu)) & NOHZ_KICK_MASK))
>> + return false;
>> +
>> + if (idle != CPU_IDLE) {
>> + atomic_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
>> + return false;
>> + }
>> +
>> + /*
>> + * barrier, pairs with nohz_balance_enter_idle(), ensures ...
>> + */
>> + flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
>> + if (!(flags & NOHZ_KICK_MASK))
>> + return false;
>> +
>> + _nohz_idle_balance(this_rq, flags, idle);
>> +
>> return true;
>> }
>> #else
>>

2018-02-06 16:34:54

by Valentin Schneider

[permalink] [raw]

Subject: Re: [PATCH 3/3] sched: update blocked load when newly idle

On 02/06/2018 04:17 PM, Vincent Guittot wrote:
> On 6 February 2018 at 15:32, Valentin Schneider
> <[email protected]> wrote:
>> Hi Vincent,
>>
>> On 02/06/2018 08:32 AM, Vincent Guittot wrote:
>>> When NEWLY_IDLE load balance is not triggered, we might need to update the
>>> blocked load anyway. We can kick an ilb so an idle CPU will take care of
>>> updating blocked load or we can try to update them locally before entering
>>> idle. In the latter case, we reuse part of the nohz_idle_balance.
>>>
>>> Signed-off-by: Vincent Guittot <[email protected]>
>>> ---
>>> kernel/sched/fair.c | 102 ++++++++++++++++++++++++++++++++++++++++++----------
>>> 1 file changed, 84 insertions(+), 18 deletions(-)
>>>
>>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>>> index 6998528..256defe 100644
>>> --- a/kernel/sched/fair.c
>>> +++ b/kernel/sched/fair.c
>>> @@ -8829,6 +8829,9 @@ update_next_balance(struct sched_domain *sd, unsigned long *next_balance)
>>> *next_balance = next;
>>> }
>>>
>>> +static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags, enum cpu_idle_type idle);
>>> +static void kick_ilb(unsigned int flags);
>>> +
>>> /*
>>> * idle_balance is called by schedule() if this_cpu is about to become
>>> * idle. Attempts to pull tasks from other CPUs.
>>> @@ -8863,12 +8866,26 @@ static int idle_balance(struct rq *this_rq, struct rq_flags *rf)
>>>
>>> if (this_rq->avg_idle < sysctl_sched_migration_cost ||
>>> !this_rq->rd->overload) {
>>> + unsigned long has_blocked = READ_ONCE(nohz.has_blocked);
>>> + unsigned long next = READ_ONCE(nohz.next_blocked);
>>
>> Ditto on 'next' - there's next_balance referenced in here so it'd be nice to make clear which is which.
>>
>>> +
>>> rcu_read_lock();
>>> sd = rcu_dereference_check_sched_domain(this_rq->sd);
>>> if (sd)
>>> update_next_balance(sd, &next_balance);
>>> rcu_read_unlock();
>>>
>>> + /*
>>> + * Update blocked idle load if it has not been done for a
>>> + * while. Try to do it locally before entering idle but kick a
>>> + * ilb if it takes too much time and/or might delay next local
>>> + * wake up
>>> + */
>>> + if (has_blocked && time_after_eq(jiffies, next) &&
>>> + (this_rq->avg_idle < sysctl_sched_migration_cost ||
>>> + !_nohz_idle_balance(this_rq, NOHZ_STATS_KICK, CPU_NEWLY_IDLE)))
>>
>> "this_rq->avg_idle < sysctl_sched_migration_cost" is used twice now, how about storing it in an "idle_too_short" variable ?
>
> In fact it's already the 3rd time
> Why do you want it to be stored in an "idle_too_short" variable ?

I meant that locally in idle_balance() to not write the same thing twice. TBH that's me being nitpicky (and liking explicit variables).

>
>>
>>> + kick_ilb(NOHZ_STATS_KICK);
>>> +
>>> goto out;
>>> }
>>>
>>> @@ -9393,30 +9410,24 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
>>>
>>> #ifdef CONFIG_NO_HZ_COMMON
>>> /*
>>> - * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
>>> - * rebalancing for all the cpus for whom scheduler ticks are stopped.
>>> + * Internal function that runs load balance for all idle cpus. The load balance
>>> + * can be a simple update of blocked load or a complete load balance with
>>> + * tasks movement depending of flags.
>>> + * For newly idle mode, we abort the loop if it takes too much time and return
>>> + * false to notify that the loop has not be completed and a ilb should be kick.
>>> */
>>> -static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
>>> +static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags, enum cpu_idle_type idle)
>>> {
>>> /* Earliest time when we have to do rebalance again */
>>> unsigned long now = jiffies;
>>> unsigned long next_balance = now + 60*HZ;
>>> - unsigned long next_stats = now + msecs_to_jiffies(LOAD_AVG_PERIOD);
>>> + bool has_blocked_load = false;
>>> int update_next_balance = 0;
>>> int this_cpu = this_rq->cpu;
>>> - unsigned int flags;
>>> int balance_cpu;
>>> + int ret = false;
>>> struct rq *rq;
>>> -
>>> - if (!(atomic_read(nohz_flags(this_cpu)) & NOHZ_KICK_MASK))
>>> - return false;
>>> -
>>> - if (idle != CPU_IDLE) {
>>> - atomic_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
>>> - return false;
>>> - }
>>> -
>>> - flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
>>> + u64 curr_cost = 0;
>>>
>>> SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK);
>>>
>>> @@ -9431,6 +9442,10 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
>>> WRITE_ONCE(nohz.has_blocked, 0);
>>>
>>> for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
>>> + u64 t0, domain_cost;
>>> +
>>> + t0 = sched_clock_cpu(this_cpu);
>>> +
>>> if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
>>> continue;
>>>
>>> @@ -9444,6 +9459,16 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
>>> goto abort;
>>> }
>>>
>>> + /*
>>> + * If the update is done while CPU becomes idle, we abort
>>> + * the update when its cost is higher than the average idle
>>> + * time in orde to not delay a possible wake up.
>>> + */
>>> + if (idle == CPU_NEWLY_IDLE && this_rq->avg_idle < curr_cost) {
>>> + has_blocked_load = true;
>>> + goto abort;
>>> + }
>>> +
>>> rq = cpu_rq(balance_cpu);
>>>
>>> update_blocked_averages(rq->cpu);
>>> @@ -9456,10 +9481,10 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
>>> if (time_after_eq(jiffies, rq->next_balance)) {
>>> struct rq_flags rf;
>>>
>>> - rq_lock_irq(rq, &rf);
>>> + rq_lock_irqsave(rq, &rf);
>>> update_rq_clock(rq);
>>> cpu_load_update_idle(rq);
>>> - rq_unlock_irq(rq, &rf);
>>> + rq_unlock_irqrestore(rq, &rf);
>>>
>>> if (flags & NOHZ_BALANCE_KICK)
>>> rebalance_domains(rq, CPU_IDLE);
>>> @@ -9469,15 +9494,27 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
>>> next_balance = rq->next_balance;
>>> update_next_balance = 1;
>>> }
>>> +
>>> + domain_cost = sched_clock_cpu(this_cpu) - t0;
>>> + curr_cost += domain_cost;
>>> +
>>> + }
>>> +
>>> + /* Newly idle CPU doesn't need an update */
>>> + if (idle != CPU_NEWLY_IDLE) {
>>> + update_blocked_averages(this_cpu);
>>> + has_blocked_load |= this_rq->has_blocked_load;
>>> }
>>>
>>> - update_blocked_averages(this_cpu);
>>> if (flags & NOHZ_BALANCE_KICK)
>>> rebalance_domains(this_rq, CPU_IDLE);
>>>
>>> WRITE_ONCE(nohz.next_blocked,
>>> now + msecs_to_jiffies(LOAD_AVG_PERIOD));
>>>
>>> + /* The full idle balance loop has been done */
>>> + ret = true;
>>> +
>>> abort:
>>> /* There is still blocked load, enable periodic update */
>>> if (has_blocked_load)
>>> @@ -9491,6 +9528,35 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
>>> if (likely(update_next_balance))
>>> nohz.next_balance = next_balance;
>>>
>>> + return ret;
>>> +}
>>> +
>>> +/*
>>> + * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
>>> + * rebalancing for all the cpus for whom scheduler ticks are stopped.
>>> + */
>>> +static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
>>> +{
>>> + int this_cpu = this_rq->cpu;
>>> + unsigned int flags;
>>> +
>>> + if (!(atomic_read(nohz_flags(this_cpu)) & NOHZ_KICK_MASK))
>>> + return false;
>>> +
>>> + if (idle != CPU_IDLE) {
>>> + atomic_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
>>> + return false;
>>> + }
>>> +
>>> + /*
>>> + * barrier, pairs with nohz_balance_enter_idle(), ensures ...
>>> + */
>>> + flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
>>> + if (!(flags & NOHZ_KICK_MASK))
>>> + return false;
>>> +
>>> + _nohz_idle_balance(this_rq, flags, idle);
>>> +
>>> return true;
>>> }
>>> #else
>>>