2021-04-28 23:29:23

by Crystal Wood

[permalink] [raw]
Subject: [PATCH v2 1/3] sched/fair: Call newidle_balance() from balance_callback on PREEMPT_RT

This is required in order to be able to enable interrupts in the next
patch. This is limited to PREEMPT_RT to avoid adding potentially
measurable overhead to the non-RT case (requiring a double switch when
pulling a task onto a newly idle cpu).

update_misfit_status() is factored out for the PREEMPT_RT case, to ensure
that the misfit status is kept consistent before dropping the lock.

Signed-off-by: Scott Wood <[email protected]>
---
v2: Use a balance callback, and limit to PREEMPT_RT

kernel/sched/fair.c | 34 ++++++++++++++++++++++++++++++----
1 file changed, 30 insertions(+), 4 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 794c2cb945f8..ff369c38a5b5 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5660,6 +5660,9 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)

#ifdef CONFIG_SMP

+static const bool newidle_balance_in_callback = IS_ENABLED(CONFIG_PREEMPT_RT);
+static DEFINE_PER_CPU(struct callback_head, rebalance_head);
+
/* Working cpumask for: load_balance, load_balance_newidle. */
DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
DEFINE_PER_CPU(cpumask_var_t, select_idle_mask);
@@ -10549,7 +10552,7 @@ static inline void nohz_newidle_balance(struct rq *this_rq) { }
* 0 - failed, no new tasks
* > 0 - success, new (fair) tasks present
*/
-static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
+static int do_newidle_balance(struct rq *this_rq, struct rq_flags *rf)
{
unsigned long next_balance = jiffies + HZ;
int this_cpu = this_rq->cpu;
@@ -10557,7 +10560,9 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
int pulled_task = 0;
u64 curr_cost = 0;

- update_misfit_status(NULL, this_rq);
+ if (!newidle_balance_in_callback)
+ update_misfit_status(NULL, this_rq);
+
/*
* We must set idle_stamp _before_ calling idle_balance(), such that we
* measure the duration of idle_balance() as idle time.
@@ -10576,7 +10581,8 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
* further scheduler activity on it and we're being very careful to
* re-start the picking loop.
*/
- rq_unpin_lock(this_rq, rf);
+ if (!newidle_balance_in_callback)
+ rq_unpin_lock(this_rq, rf);

if (this_rq->avg_idle < sysctl_sched_migration_cost ||
!READ_ONCE(this_rq->rd->overload)) {
@@ -10655,11 +10661,31 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
if (pulled_task)
this_rq->idle_stamp = 0;

- rq_repin_lock(this_rq, rf);
+ if (!newidle_balance_in_callback)
+ rq_repin_lock(this_rq, rf);

return pulled_task;
}

+static void newidle_balance_cb(struct rq *this_rq)
+{
+ update_rq_clock(this_rq);
+ do_newidle_balance(this_rq, NULL);
+}
+
+static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
+{
+ if (newidle_balance_in_callback) {
+ update_misfit_status(NULL, this_rq);
+ queue_balance_callback(this_rq,
+ &per_cpu(rebalance_head, this_rq->cpu),
+ newidle_balance_cb);
+ return 0;
+ }
+
+ return do_newidle_balance(this_rq, rf);
+}
+
/*
* run_rebalance_domains is triggered when needed from the scheduler tick.
* Also triggered for nohz idle balancing (with nohz_balancing_kick set).
--
2.27.0


2021-05-05 12:20:32

by Vincent Guittot

[permalink] [raw]
Subject: Re: [PATCH v2 1/3] sched/fair: Call newidle_balance() from balance_callback on PREEMPT_RT

On Thu, 29 Apr 2021 at 01:28, Scott Wood <[email protected]> wrote:
>
> This is required in order to be able to enable interrupts in the next
> patch. This is limited to PREEMPT_RT to avoid adding potentially
> measurable overhead to the non-RT case (requiring a double switch when
> pulling a task onto a newly idle cpu).

IIUC, only the newidle_balance is a problem and not the idle load
balance that runs softirq. In this case, why not skipping
newidle_balance entirely in case of preempt_rt and kick an idle load
balance instead as you switch to idle thread context anyway


>
> update_misfit_status() is factored out for the PREEMPT_RT case, to ensure
> that the misfit status is kept consistent before dropping the lock.
>
> Signed-off-by: Scott Wood <[email protected]>
> ---
> v2: Use a balance callback, and limit to PREEMPT_RT
>
> kernel/sched/fair.c | 34 ++++++++++++++++++++++++++++++----
> 1 file changed, 30 insertions(+), 4 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 794c2cb945f8..ff369c38a5b5 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -5660,6 +5660,9 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
>
> #ifdef CONFIG_SMP
>
> +static const bool newidle_balance_in_callback = IS_ENABLED(CONFIG_PREEMPT_RT);
> +static DEFINE_PER_CPU(struct callback_head, rebalance_head);
> +
> /* Working cpumask for: load_balance, load_balance_newidle. */
> DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
> DEFINE_PER_CPU(cpumask_var_t, select_idle_mask);
> @@ -10549,7 +10552,7 @@ static inline void nohz_newidle_balance(struct rq *this_rq) { }
> * 0 - failed, no new tasks
> * > 0 - success, new (fair) tasks present
> */
> -static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
> +static int do_newidle_balance(struct rq *this_rq, struct rq_flags *rf)
> {
> unsigned long next_balance = jiffies + HZ;
> int this_cpu = this_rq->cpu;
> @@ -10557,7 +10560,9 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
> int pulled_task = 0;
> u64 curr_cost = 0;
>
> - update_misfit_status(NULL, this_rq);
> + if (!newidle_balance_in_callback)
> + update_misfit_status(NULL, this_rq);
> +
> /*
> * We must set idle_stamp _before_ calling idle_balance(), such that we
> * measure the duration of idle_balance() as idle time.
> @@ -10576,7 +10581,8 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
> * further scheduler activity on it and we're being very careful to
> * re-start the picking loop.
> */
> - rq_unpin_lock(this_rq, rf);
> + if (!newidle_balance_in_callback)
> + rq_unpin_lock(this_rq, rf);
>
> if (this_rq->avg_idle < sysctl_sched_migration_cost ||
> !READ_ONCE(this_rq->rd->overload)) {
> @@ -10655,11 +10661,31 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
> if (pulled_task)
> this_rq->idle_stamp = 0;
>
> - rq_repin_lock(this_rq, rf);
> + if (!newidle_balance_in_callback)
> + rq_repin_lock(this_rq, rf);
>
> return pulled_task;
> }
>
> +static void newidle_balance_cb(struct rq *this_rq)
> +{
> + update_rq_clock(this_rq);
> + do_newidle_balance(this_rq, NULL);
> +}
> +
> +static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
> +{
> + if (newidle_balance_in_callback) {
> + update_misfit_status(NULL, this_rq);
> + queue_balance_callback(this_rq,
> + &per_cpu(rebalance_head, this_rq->cpu),
> + newidle_balance_cb);
> + return 0;
> + }
> +
> + return do_newidle_balance(this_rq, rf);
> +}
> +
> /*
> * run_rebalance_domains is triggered when needed from the scheduler tick.
> * Also triggered for nohz idle balancing (with nohz_balancing_kick set).
> --
> 2.27.0
>

2021-05-07 17:29:16

by Valentin Schneider

[permalink] [raw]
Subject: Re: [PATCH v2 1/3] sched/fair: Call newidle_balance() from balance_callback on PREEMPT_RT

On 05/05/21 14:13, Vincent Guittot wrote:
> On Thu, 29 Apr 2021 at 01:28, Scott Wood <[email protected]> wrote:
>>
>> This is required in order to be able to enable interrupts in the next
>> patch. This is limited to PREEMPT_RT to avoid adding potentially
>> measurable overhead to the non-RT case (requiring a double switch when
>> pulling a task onto a newly idle cpu).
>
> IIUC, only the newidle_balance is a problem and not the idle load
> balance that runs softirq. In this case, why not skipping
> newidle_balance entirely in case of preempt_rt and kick an idle load
> balance instead as you switch to idle thread context anyway
>

So if I follow you, that would be along the lines of having PREEMPT_RT turn
newidle_balance() into:

rq->idle_balance = CPU_IDLE;
rq->next_balance = jiffies;
trigger_load_balance(rq);

which I'm thinking isn't too crazy.