Date: Thu, 23 Jul 2015 18:55:06 +0200
From: Frederic Weisbecker
To: LKML
Cc: Peter Zijlstra, Thomas Gleixner, Preeti U Murthy, Christoph Lameter,
	Ingo Molnar, Viresh Kumar, Rik van Riel
Subject: Re: [PATCH 07/10] sched: Migrate sched to use new tick dependency mask model
Message-ID: <20150723165504.GB1973@lerouge>
In-Reply-To: <1437669735-8786-8-git-send-email-fweisbec@gmail.com>

On Thu, Jul 23, 2015 at 06:42:12PM +0200, Frederic Weisbecker wrote:
> Instead of providing asynchronous checks for the nohz subsystem to verify
> sched tick dependency, migrate sched to the new mask.
> 
> The easiest approach is to recycle the current asynchronous tick dependency
> check, which verifies the class of the current task and its requirements
> for periodic preemption checks.
> 
> We need to evaluate this tick dependency in three places:
> 
> 1) Task enqueue: one or more tasks have been enqueued, so we must check
>    whether they compete with the current task.
> 
> 2) Task dequeue: a possibly competing task has been dequeued, so we clear
>    the tick dependency if needed.
> 
> 3) schedule(): we might be switching to a task of another scheduler
>    class. Each class has its own preemption rules, so we must re-evaluate
>    them.
> 
> This doesn't change much compared to the previous layout, except that
> 3) has to be done with the rq locked, to avoid the mask change racing
> with a remote enqueue.
> 
> We could get away with 3) by checking the highest prio task of the
> runqueue instead of its current task.
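As a side note, the mask model boils down to something like the following
standalone sketch (not kernel code: C11 atomics stand in for the kernel's
atomic bitops, and these helper names merely mirror the ones the series
introduces):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define TICK_SCHED_BIT	0	/* one bit per tick dependency source */
#define NR_CPUS		4

static atomic_uint tick_dependency[NR_CPUS];	/* per-CPU dependency mask */

static void tick_set_dependency_cpu(int bit, int cpu)
{
	/* A freshly set bit means the target must restart its tick. */
	unsigned int old = atomic_fetch_or(&tick_dependency[cpu], 1u << bit);

	if (!(old & (1u << bit)))
		printf("kick CPU %d out of nohz mode\n", cpu);	/* an IPI in the kernel */
}

static void tick_clear_dependency_cpu(int bit, int cpu)
{
	atomic_fetch_and(&tick_dependency[cpu], ~(1u << bit));
}

static bool tick_can_stop(int cpu)
{
	return atomic_load(&tick_dependency[cpu]) == 0;
}

int main(void)
{
	tick_set_dependency_cpu(TICK_SCHED_BIT, 1);	/* 1) enqueue: a competitor showed up */
	printf("CPU 1 can stop tick: %d\n", tick_can_stop(1));
	tick_clear_dependency_cpu(TICK_SCHED_BIT, 1);	/* 2) dequeue: competitor left */
	printf("CPU 1 can stop tick: %d\n", tick_can_stop(1));
	return 0;
}

One bit per subsystem means the scheduler can set and clear its own
dependency without disturbing bits owned by the other tick users (perf,
posix timers, etc.).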
> 
> Cc: Christoph Lameter
> Cc: Ingo Molnar
> Cc: Peter Zijlstra
> Cc: Preeti U Murthy
> Cc: Rik van Riel
> Cc: Thomas Gleixner
> Cc: Viresh Kumar
> Signed-off-by: Frederic Weisbecker
> ---
>  include/linux/sched.h    |  3 ---
>  kernel/sched/core.c      | 12 ++++++-----
>  kernel/sched/sched.h     | 56 +++++++++++++++++++++++++++++++++++-------------
>  kernel/time/tick-sched.c |  5 -----
>  4 files changed, 48 insertions(+), 28 deletions(-)
> 
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index ae21f15..88c99a2 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -2296,10 +2296,7 @@ static inline void wake_up_nohz_cpu(int cpu) { }
>  #endif
>  
>  #ifdef CONFIG_NO_HZ_FULL
> -extern bool sched_can_stop_tick(void);
>  extern u64 scheduler_tick_max_deferment(void);
> -#else
> -static inline bool sched_can_stop_tick(void) { return false; }
>  #endif
>  
>  #ifdef CONFIG_SCHED_AUTOGROUP
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 4d34035..6c3db36 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -714,21 +714,22 @@ static inline bool got_nohz_idle_kick(void)
>  #endif /* CONFIG_NO_HZ_COMMON */
>  
>  #ifdef CONFIG_NO_HZ_FULL
> -bool sched_can_stop_tick(void)
> +bool sched_can_stop_tick(struct rq *rq)
>  {
> +	struct task_struct *curr = rq->curr;
>  	/*
>  	 * FIFO realtime policy runs the highest priority task. Other runnable
>  	 * tasks are of a lower priority. The scheduler tick does nothing.
>  	 */
> -	if (current->policy == SCHED_FIFO)
> +	if (curr->policy == SCHED_FIFO)
>  		return true;
>  
>  	/*
>  	 * Round-robin realtime tasks time slice with other tasks at the same
>  	 * realtime priority. Is this task the only one at this priority?
>  	 */
> -	if (current->policy == SCHED_RR) {
> -		struct sched_rt_entity *rt_se = &current->rt;
> +	if (curr->policy == SCHED_RR) {
> +		struct sched_rt_entity *rt_se = &curr->rt;
>  
>  		return rt_se->run_list.prev == rt_se->run_list.next;
>  	}
> @@ -738,7 +739,7 @@ bool sched_can_stop_tick(void)
>  	 * nr_running update is assumed to be visible
>  	 * after IPI is sent from wakers.
>  	 */
> -	if (this_rq()->nr_running > 1)
> +	if (rq->nr_running > 1)
>  		return false;
>  
>  	return true;
> @@ -2489,6 +2490,7 @@ static struct rq *finish_task_switch(struct task_struct *prev)
>  		put_task_struct(prev);
>  	}
>  
> +	sched_update_tick_dependency(rq);

A small mistake here that I forgot to fix: this call is supposed to sit
between perf_event_task_sched_in() and finish_lock_switch(). It must run
while the rq is locked, otherwise a remote CPU may enqueue or dequeue
concurrently and mess up the dependency mask.
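The race is easier to see on a toy model (again not kernel code: the
toy_* names are made up and a pthread mutex stands in for the rq lock):

#include <pthread.h>
#include <stdbool.h>

struct toy_rq {
	pthread_mutex_t lock;		/* stands in for rq->lock */
	unsigned int nr_running;
	bool tick_dependency;		/* stands in for the sched tick bit */
};

/* Caller must hold rq->lock, like sched_update_tick_dependency(). */
static void toy_update_tick_dependency(struct toy_rq *rq)
{
	rq->tick_dependency = rq->nr_running > 1;
}

/* A remote enqueue takes the lock, so it serializes with the update. */
static void toy_enqueue_task(struct toy_rq *rq)
{
	pthread_mutex_lock(&rq->lock);
	rq->nr_running++;
	toy_update_tick_dependency(rq);
	pthread_mutex_unlock(&rq->lock);
}

/*
 * Buggy placement: once the lock is dropped, this thread can read a
 * stale nr_running == 1, a remote toy_enqueue_task() can bump it to 2
 * and set the dependency, and the stale write below then clears it
 * again, leaving the tick wrongly stopped with two runnable tasks.
 */
static void toy_finish_task_switch_buggy(struct toy_rq *rq)
{
	pthread_mutex_unlock(&rq->lock);	/* finish_lock_switch() */
	toy_update_tick_dependency(rq);		/* too late: unserialized */
}

int main(void)
{
	struct toy_rq rq = {
		.lock = PTHREAD_MUTEX_INITIALIZER,
		.nr_running = 1,
	};

	toy_enqueue_task(&rq);	/* 1 -> 2 runnable: dependency set */
	return rq.tick_dependency ? 0 : 1;
}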
Here is the correct diff:

diff --git a/include/linux/sched.h b/include/linux/sched.h
index ae21f15..88c99a2 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2296,10 +2296,7 @@ static inline void wake_up_nohz_cpu(int cpu) { }
 #endif
 
 #ifdef CONFIG_NO_HZ_FULL
-extern bool sched_can_stop_tick(void);
 extern u64 scheduler_tick_max_deferment(void);
-#else
-static inline bool sched_can_stop_tick(void) { return false; }
 #endif
 
 #ifdef CONFIG_SCHED_AUTOGROUP
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 4d34035..58b16d3 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -714,21 +714,22 @@ static inline bool got_nohz_idle_kick(void)
 #endif /* CONFIG_NO_HZ_COMMON */
 
 #ifdef CONFIG_NO_HZ_FULL
-bool sched_can_stop_tick(void)
+bool sched_can_stop_tick(struct rq *rq)
 {
+	struct task_struct *curr = rq->curr;
 	/*
 	 * FIFO realtime policy runs the highest priority task. Other runnable
 	 * tasks are of a lower priority. The scheduler tick does nothing.
 	 */
-	if (current->policy == SCHED_FIFO)
+	if (curr->policy == SCHED_FIFO)
 		return true;
 
 	/*
 	 * Round-robin realtime tasks time slice with other tasks at the same
 	 * realtime priority. Is this task the only one at this priority?
 	 */
-	if (current->policy == SCHED_RR) {
-		struct sched_rt_entity *rt_se = &current->rt;
+	if (curr->policy == SCHED_RR) {
+		struct sched_rt_entity *rt_se = &curr->rt;
 
 		return rt_se->run_list.prev == rt_se->run_list.next;
 	}
@@ -738,7 +739,7 @@ bool sched_can_stop_tick(void)
 	 * nr_running update is assumed to be visible
 	 * after IPI is sent from wakers.
 	 */
-	if (this_rq()->nr_running > 1)
+	if (rq->nr_running > 1)
 		return false;
 
 	return true;
@@ -2471,6 +2472,7 @@ static struct rq *finish_task_switch(struct task_struct *prev)
 	vtime_task_switch(prev);
 	finish_arch_switch(prev);
 	perf_event_task_sched_in(prev, current);
+	sched_update_tick_dependency(rq);
 	finish_lock_switch(rq, prev);
 	finish_arch_post_lock_switch();
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 84d4879..5037acf 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1321,6 +1321,38 @@ unsigned long to_ratio(u64 period, u64 runtime);
 
 extern void init_task_runnable_average(struct task_struct *p);
 
+#ifdef CONFIG_NO_HZ_FULL
+extern bool sched_can_stop_tick(struct rq *rq);
+
+/*
+ * Tick is needed if more than one task runs on a CPU.
+ * Send the target an IPI to kick it out of nohz mode.
+ *
+ * We assume that IPI implies full memory barrier and the
+ * new value of rq->nr_running is visible on reception
+ * from the target.
+ */
+static inline void sched_update_tick_dependency(struct rq *rq)
+{
+	int cpu;
+
+	if (!tick_nohz_full_enabled())
+		return;
+
+	cpu = cpu_of(rq);
+
+	if (!tick_nohz_full_cpu(rq->cpu))
+		return;
+
+	if (sched_can_stop_tick(rq))
+		tick_nohz_clear_tick_dependency_cpu(TICK_SCHED_BIT, cpu);
+	else
+		tick_nohz_set_tick_dependency_cpu(TICK_SCHED_BIT, cpu);
+}
+#else
+static inline void sched_update_tick_dependency(struct rq *rq) { }
+#endif
+
 static inline void add_nr_running(struct rq *rq, unsigned count)
 {
 	unsigned prev_nr = rq->nr_running;
@@ -1332,26 +1364,20 @@ static inline void add_nr_running(struct rq *rq, unsigned count)
 		if (!rq->rd->overload)
 			rq->rd->overload = true;
 #endif
-
-#ifdef CONFIG_NO_HZ_FULL
-		if (tick_nohz_full_cpu(rq->cpu)) {
-			/*
-			 * Tick is needed if more than one task runs on a CPU.
-			 * Send the target an IPI to kick it out of nohz mode.
-			 *
-			 * We assume that IPI implies full memory barrier and the
-			 * new value of rq->nr_running is visible on reception
-			 * from the target.
-			 */
-			tick_nohz_full_kick_cpu(rq->cpu);
-		}
-#endif
+		/* Check if new task(s) need periodic preemption check */
+		sched_update_tick_dependency(rq);
 	}
 }
 
 static inline void sub_nr_running(struct rq *rq, unsigned count)
 {
-	rq->nr_running -= count;
+	unsigned prev_nr = rq->nr_running;
+
+	rq->nr_running = prev_nr - count;
+	if (prev_nr > 1) {
+		/* Check if we still need preemption */
+		sched_update_tick_dependency(rq);
+	}
 }
 
 static inline void rq_last_tick_reset(struct rq *rq)
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index fbe4736..e6447bd 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -192,11 +192,6 @@ static bool can_stop_full_tick(struct tick_sched *ts)
 		return false;
 	}
 
-	if (!sched_can_stop_tick()) {
-		trace_tick_stop(0, "more than 1 task in runqueue\n");
-		return false;
-	}
-
 	if (!posix_cpu_timers_can_stop_tick(current)) {
 		trace_tick_stop(0, "posix timers running\n");
 		return false;
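For the record, the add_nr_running()/sub_nr_running() bookkeeping above
reduces to the following standalone model (not kernel code, the model_*
names are illustrative): the dependency only needs re-evaluating when
nr_running crosses the 1 <-> 2 boundary, since a CPU running 0 or 1 task
never needs the sched tick bit.

#include <assert.h>
#include <stdbool.h>

struct model_rq {
	unsigned int nr_running;
	bool tick_dependency;
};

static void model_update_tick_dependency(struct model_rq *rq)
{
	rq->tick_dependency = rq->nr_running > 1;
}

static void model_add_nr_running(struct model_rq *rq, unsigned int count)
{
	unsigned int prev_nr = rq->nr_running;

	rq->nr_running = prev_nr + count;
	if (prev_nr < 2 && rq->nr_running >= 2)
		model_update_tick_dependency(rq);	/* new competitor(s) arrived */
}

static void model_sub_nr_running(struct model_rq *rq, unsigned int count)
{
	unsigned int prev_nr = rq->nr_running;

	rq->nr_running = prev_nr - count;
	if (prev_nr > 1)
		model_update_tick_dependency(rq);	/* a competitor may have left */
}

int main(void)
{
	struct model_rq rq = { .nr_running = 1, .tick_dependency = false };

	model_add_nr_running(&rq, 1);	/* 1 -> 2: tick becomes needed */
	assert(rq.tick_dependency);
	model_sub_nr_running(&rq, 1);	/* 2 -> 1: tick can stop again */
	assert(!rq.tick_dependency);
	return 0;
}

Dequeueing back down to 0 or 1 runnable task clears the bit, so the CPU
can re-enter nohz mode without any further kick.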