Reply-To: xlpang@redhat.com
Subject: Re: [PATCH] sched/deadline: No need to check NULL later_mask
References: <1459509198-26543-1-git-send-email-xlpang@redhat.com>
 <20160401112919.GP3430@twins.programming.kicks-ass.net>
 <56FE66A5.9050004@redhat.com>
 <20160401162136.GM3448@twins.programming.kicks-ass.net>
To: Peter Zijlstra, xlpang@redhat.com
Cc: linux-kernel@vger.kernel.org, Juri Lelli, Ingo Molnar, Steven Rostedt
From: Xunlei Pang <xlpang@redhat.com>
Message-ID: <56FF9B84.2030307@redhat.com>
Date: Sat, 2 Apr 2016 18:14:28 +0800
In-Reply-To: <20160401162136.GM3448@twins.programming.kicks-ass.net>

On 2016/04/02 at 00:21, Peter Zijlstra wrote:
> On Fri, Apr 01, 2016 at 08:16:37PM +0800, Xunlei Pang wrote:
>> On 2016/04/01 at 19:29, Peter Zijlstra wrote:
>>> On Fri, Apr 01, 2016 at 07:13:18PM +0800, Xunlei Pang wrote:
>>>> Unlike rt tasks, we (should) have no deadline tasks in
>>>> booting phase before the mask is allocated, so we can
>>>> safely remove the check.
>>> Why? And have the kernel explode once it grows an early deadline task?
>>>
>>> Is there _any_ benefit to this?
>> This is a performance-critical path, it'd be better to avoid such a check.
> And the changelog didn't say.. and if its an optimization you should at
> least attempt numbers or instruction counts or whatnot.
>
>> I think in the early boot stage before sched_init_smp(), it's weird to
>> use a deadline task, relying on rt tasks should be enough for us.
> You never know.
>
> Something like the below should completely avoid the problem though.
>
> It uses __initdata storage when coming up and switched to allocated data
> before we bring up smp.
>
> A similar thing could be done to RT..
>
> In fact, both could share the mask, its a temporary mask anyhow, and I'm
> pretty sure there's more such cpumasks lying about that we can merge.
>
> Completely untested...
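(Restating that idea as a tiny standalone sketch before getting to the patch
itself, with hypothetical names and plain user-space C in place of the real
per-CPU and cpumask APIs: every slot starts out pointing at static boot-time
storage and is switched to allocated storage before other CPUs come up, so
the hot path never has to check for NULL.)

#include <stdio.h>
#include <stdlib.h>

#define NR_CPUS 4        /* hypothetical CPU count */

/* Boot-time storage, standing in for the __initdata struct cpumask. */
static unsigned long boot_mask;

/* Per-CPU "cpumask_var_t": initially points at the boot-time storage. */
static unsigned long *cpu_mask[NR_CPUS] = {
        &boot_mask, &boot_mask, &boot_mask, &boot_mask
};

/* Standing in for the allocation moved into sched_init(): switch to
 * allocated storage before anything else can run and observe it. */
static void init_masks(void)
{
        int cpu;

        for (cpu = 0; cpu < NR_CPUS; cpu++)
                cpu_mask[cpu] = calloc(1, sizeof(unsigned long));
}

/* Hot-path lookup, standing in for find_later_rq(): no NULL check is
 * needed, the pointer is never NULL, only "boot" or "allocated". */
static unsigned long *get_mask(int cpu)
{
        return cpu_mask[cpu];
}

int main(void)
{
        *get_mask(0) = 1;        /* early use lands in boot_mask */
        init_masks();
        *get_mask(0) = 2;        /* later use lands in the allocated storage */
        printf("boot_mask=%lu, cpu0 mask=%lu\n", boot_mask, *get_mask(0));
        return 0;
}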
>
> ---
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 11594230ef4d..acdc291577a0 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -7243,8 +7244,6 @@ void __init sched_init_smp(void)
>  	sched_init_granularity();
>  	free_cpumask_var(non_isolated_cpus);
>  
> -	init_sched_rt_class();
> -	init_sched_dl_class();
>  }
>  #else
>  void __init sched_init_smp(void)
> @@ -7444,6 +7443,9 @@ void __init sched_init(void)
>  	zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
>  	idle_thread_set_boot_cpu();
>  	set_cpu_rq_start_time();
> +
> +	init_sched_rt_class();
> +	init_sched_dl_class();
>  #endif
>  	init_sched_fair_class();
>  
> diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
> index affd97ec9f65..24d7dbede99e 100644
> --- a/kernel/sched/deadline.c
> +++ b/kernel/sched/deadline.c
> @@ -1273,7 +1273,8 @@ static struct task_struct *pick_earliest_pushable_dl_task(struct rq *rq, int cpu)
>  	return NULL;
>  }
>  
> -static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask_dl);
> +static __initdata struct cpumask __local_mask;
> +static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask_dl) = &__local_mask;
>  
>  static int find_later_rq(struct task_struct *task)
>  {
> @@ -1282,10 +1283,6 @@ static int find_later_rq(struct task_struct *task)
>  	int this_cpu = smp_processor_id();
>  	int best_cpu, cpu = task_cpu(task);
>  
> -	/* Make sure the mask is initialized first */
> -	if (unlikely(!later_mask))
> -		return -1;
> -
>  	if (task->nr_cpus_allowed == 1)
>  		return -1;
>  

Your proposal is very nice! At the sched_init() stage we only have the one
(to-be-"idle") task and irqs are disabled, so no scheduling can happen, and
cpu_possible_mask has already been initialized, so it's safe to simply move
the allocations there.

Also, how about rt and deadline sharing one percpu mask? Only one of them
can use the mask at a time, since the operations are always done under some
spin_lock_irqsave().

I made a new patch below (lightly tested by running tens of rt and dl tasks
for a while); are you fine with it?
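(To spell out that sharing argument with another standalone sketch, again
with hypothetical names and plain C rather than the real scheduler symbols:
each caller fills the per-CPU scratch mask and consumes it before returning,
inside one non-preemptible, irq-disabled section, so the rt and dl paths can
never see each other's data in it. The real patch follows right below.)

#include <stdio.h>

#define NR_CPUS 4        /* hypothetical CPU count */

/* One scratch mask per CPU, standing in for the shared sched_pp_mask. */
static unsigned long scratch_mask[NR_CPUS];

/* "rt" user: fills the scratch mask and consumes it before returning. */
static int find_lowest_cpu(int this_cpu, unsigned long allowed)
{
        scratch_mask[this_cpu] = allowed & 0x3;  /* pretend cpupri_find() result */
        return scratch_mask[this_cpu] ? __builtin_ctzl(scratch_mask[this_cpu]) : -1;
}

/* "dl" user: reuses the very same storage; nothing from the rt call is
 * expected to survive in it, so the two never conflict. */
static int find_later_cpu(int this_cpu, unsigned long allowed)
{
        scratch_mask[this_cpu] = allowed & 0xc;  /* pretend cpudl_find() result */
        return scratch_mask[this_cpu] ? __builtin_ctzl(scratch_mask[this_cpu]) : -1;
}

int main(void)
{
        printf("rt pick: cpu%d\n", find_lowest_cpu(0, 0xf));   /* cpu0 */
        printf("dl pick: cpu%d\n", find_later_cpu(0, 0xf));    /* cpu2 */
        return 0;
}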
---
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 0b21e7a..94ae69a 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -93,6 +93,11 @@ DEFINE_MUTEX(sched_domains_mutex);
 
 DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
 
+#ifdef CONFIG_SMP
+/* Used by Push/Pull scheduling shared by rt and deadline */
+DEFINE_PER_CPU_ALIGNED(cpumask_var_t, sched_pp_mask);
+#endif
+
 static void update_rq_clock_task(struct rq *rq, s64 delta);
 
 void update_rq_clock(struct rq *rq)
@@ -7172,9 +7177,6 @@ void __init sched_init_smp(void)
 		BUG();
 	sched_init_granularity();
 	free_cpumask_var(non_isolated_cpus);
-
-	init_sched_rt_class();
-	init_sched_dl_class();
 }
 #else
 void __init sched_init_smp(void)
@@ -7374,6 +7376,11 @@ void __init sched_init(void)
 	zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
 	idle_thread_set_boot_cpu();
 	set_cpu_rq_start_time();
+
+	for_each_possible_cpu(i) {
+		zalloc_cpumask_var_node(&per_cpu(sched_pp_mask, i),
+					GFP_KERNEL, cpu_to_node(i));
+	}
 #endif
 	init_sched_fair_class();
 
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index aeca716..a3557f8 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1288,19 +1288,13 @@ next_node:
 	return NULL;
 }
 
-static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask_dl);
-
 static int find_later_rq(struct task_struct *task)
 {
 	struct sched_domain *sd;
-	struct cpumask *later_mask = this_cpu_cpumask_var_ptr(local_cpu_mask_dl);
+	struct cpumask *later_mask;
 	int this_cpu = smp_processor_id();
 	int best_cpu, cpu = task_cpu(task);
 
-	/* Make sure the mask is initialized first */
-	if (unlikely(!later_mask))
-		return -1;
-
 	if (task->nr_cpus_allowed == 1)
 		return -1;
 
@@ -1308,6 +1302,7 @@ static int find_later_rq(struct task_struct *task)
 	 * We have to consider system topology and task affinity
 	 * first, then we can look for a suitable cpu.
 	 */
+	later_mask = this_cpu_cpumask_var_ptr(sched_pp_mask);
 	best_cpu = cpudl_find(&task_rq(task)->rd->cpudl,
 			task, later_mask);
 	if (best_cpu == -1)
@@ -1694,15 +1689,6 @@ static void rq_offline_dl(struct rq *rq)
 	cpudl_clear_freecpu(&rq->rd->cpudl, rq->cpu);
 }
 
-void __init init_sched_dl_class(void)
-{
-	unsigned int i;
-
-	for_each_possible_cpu(i)
-		zalloc_cpumask_var_node(&per_cpu(local_cpu_mask_dl, i),
-					GFP_KERNEL, cpu_to_node(i));
-}
-
 #endif /* CONFIG_SMP */
 
 static void switched_from_dl(struct rq *rq, struct task_struct *p)
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 5624713..3491809 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1612,22 +1612,17 @@ static struct task_struct *pick_highest_pushable_task(struct rq *rq, int cpu)
 	return NULL;
 }
 
-static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask);
-
 static int find_lowest_rq(struct task_struct *task)
 {
 	struct sched_domain *sd;
-	struct cpumask *lowest_mask = this_cpu_cpumask_var_ptr(local_cpu_mask);
+	struct cpumask *lowest_mask;
 	int this_cpu = smp_processor_id();
 	int cpu = task_cpu(task);
 
-	/* Make sure the mask is initialized first */
-	if (unlikely(!lowest_mask))
-		return -1;
-
 	if (task->nr_cpus_allowed == 1)
 		return -1; /* No other targets possible */
 
+	lowest_mask = this_cpu_cpumask_var_ptr(sched_pp_mask);
 	if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask))
 		return -1; /* No targets found */
 
@@ -2164,16 +2159,6 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p)
 
 	queue_pull_task(rq);
 }
-
-void __init init_sched_rt_class(void)
-{
-	unsigned int i;
-
-	for_each_possible_cpu(i) {
-		zalloc_cpumask_var_node(&per_cpu(local_cpu_mask, i),
-					GFP_KERNEL, cpu_to_node(i));
-	}
-}
 #endif /* CONFIG_SMP */
 
 /*
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index e6d4a3f..b892954 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -715,6 +715,10 @@ static inline int cpu_of(struct rq *rq)
 
 DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
 
+#ifdef CONFIG_SMP
+DECLARE_PER_CPU_ALIGNED(cpumask_var_t, sched_pp_mask);
+#endif
+
 #define cpu_rq(cpu)		(&per_cpu(runqueues, (cpu)))
 #define this_rq()		this_cpu_ptr(&runqueues)
 #define task_rq(p)		cpu_rq(task_cpu(p))
@@ -1296,8 +1300,6 @@ extern void sysrq_sched_debug_show(void);
 extern void sched_init_granularity(void);
 extern void update_max_interval(void);
 
-extern void init_sched_dl_class(void);
-extern void init_sched_rt_class(void);
 extern void init_sched_fair_class(void);
 
 extern void resched_curr(struct rq *rq);
-- 
1.8.3.1