Subject: Re: [PATCH] sched/deadline/rtmutex: Fix a PI crash for deadline tasks
From: Xunlei Pang
Reply-To: xlpang@redhat.com
To: Peter Zijlstra, xlpang@redhat.com
Cc: linux-kernel@vger.kernel.org, Juri Lelli, Ingo Molnar, Steven Rostedt,
 Thomas Gleixner
Date: Tue, 5 Apr 2016 16:38:12 +0800
Message-ID: <57037974.1020002@redhat.com>
In-Reply-To: <20160401215143.GB2906@worktop>
References: <1459508418-25577-1-git-send-email-xlpang@redhat.com>
 <20160401113827.GQ3430@twins.programming.kicks-ass.net>
 <56FE685E.6080001@redhat.com>
 <19912883-8AB1-4DFD-A0E1-F23057785243@infradead.org>
 <56FE78E0.5060504@redhat.com>
 <20160401215143.GB2906@worktop>

On 2016/04/02 at 05:51, Peter Zijlstra wrote:
> On Fri, Apr 01, 2016 at 09:34:24PM +0800, Xunlei Pang wrote:
>
>>>> I checked the code, currently only deadline accesses the
>>>> pi_waiters/pi_waiters_leftmost
>>>> without pi_lock held via rt_mutex_get_top_task(), other cases all have
>>>> pi_lock held.
>> Any better ideas is welcome.
> Something like the below _might_ work; but its late and I haven't looked
> at the PI code in a while. This basically caches a pointer to the top
> waiter task in the running task_struct, under pi_lock and rq->lock, and
> therefore we can use it with only rq->lock held.
>
> Since the task is blocked, and cannot unblock without taking itself from
> the block chain -- which would cause rt_mutex_setprio() to set another
> top waiter task, the lifetime rules should be good.

In rt_mutex_slowunlock(), we release pi_lock and wait_lock first, then wake
up the top waiter, and only then call rt_mutex_adjust_prio(). So there is a
small window between waking the top waiter and rt_mutex_adjust_prio() in
which no lock is held and irqs are enabled, and that can cause problems.

For example, before rt_mutex_adjust_prio() gets to update the cached pointer,
current may be preempted and the woken top waiter may exit. When current runs
again, it can go through enqueue_task_dl() before it reaches
rt_mutex_adjust_prio() (which is where the cached pointer would be updated),
so it will access a stale cached pointer. (A small sketch of this ordering is
further below, just before the quoted patch.)

Regards,
Xunlei

> Having this top waiter pointer around might also be useful to further
> implement bandwidth inheritance or such, but I've not thought about that
> too much.
>
> Lots of code deleted because we move the entire task->prio mapping muck
> into the scheduler, because we now pass a pi_task, not a prio.
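To make the window described above more concrete, the current unlock path
looks roughly like this (a simplified sketch from memory of
rt_mutex_slowunlock() and its caller; the exact signatures, the wake_q
handling and the deboost flag are my reconstruction, not part of the patch
below):

	/* Sketch of the unlock slow path, simplified from memory: */
	bool deboost = rt_mutex_slowunlock(lock, &wake_q);	/* picks top waiter,
								   drops pi_lock and wait_lock */

	wake_up_q(&wake_q);			/* top waiter runs now; it may even exit */

	/*
	 * <-- window: no pi_lock/wait_lock held, irqs enabled; current can be
	 * preempted here and be re-enqueued via enqueue_task_dl() while the
	 * cached pi_task pointer is still stale.
	 */

	if (deboost)
		rt_mutex_adjust_prio(current);	/* cached pointer only updated here */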
>
> ---
>  include/linux/sched.h    |  1 +
>  include/linux/sched/rt.h | 20 +-------------------
>  kernel/locking/rtmutex.c | 45 +++++----------------------------------------
>  kernel/sched/core.c      | 34 +++++++++++++++++++++++-----------
>  kernel/sched/deadline.c  |  2 +-
>  5 files changed, 31 insertions(+), 71 deletions(-)
>
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 52c4847b05e2..30169f38cb24 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -1615,6 +1615,7 @@ struct task_struct {
>
>  	/* Protection of the PI data structures: */
>  	raw_spinlock_t pi_lock;
> +	struct task_struct *pi_task;
>
>  	struct wake_q_node wake_q;
>
> diff --git a/include/linux/sched/rt.h b/include/linux/sched/rt.h
> index a30b172df6e1..0a8f071b16e3 100644
> --- a/include/linux/sched/rt.h
> +++ b/include/linux/sched/rt.h
> @@ -16,31 +16,13 @@ static inline int rt_task(struct task_struct *p)
>  }
>
>  #ifdef CONFIG_RT_MUTEXES
> -extern int rt_mutex_getprio(struct task_struct *p);
> -extern void rt_mutex_setprio(struct task_struct *p, int prio);
> -extern int rt_mutex_get_effective_prio(struct task_struct *task, int newprio);
> -extern struct task_struct *rt_mutex_get_top_task(struct task_struct *task);
> +extern void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task);
>  extern void rt_mutex_adjust_pi(struct task_struct *p);
>  static inline bool tsk_is_pi_blocked(struct task_struct *tsk)
>  {
>  	return tsk->pi_blocked_on != NULL;
>  }
>  #else
> -static inline int rt_mutex_getprio(struct task_struct *p)
> -{
> -	return p->normal_prio;
> -}
> -
> -static inline int rt_mutex_get_effective_prio(struct task_struct *task,
> -					      int newprio)
> -{
> -	return newprio;
> -}
> -
> -static inline struct task_struct *rt_mutex_get_top_task(struct task_struct *task)
> -{
> -	return NULL;
> -}
>  # define rt_mutex_adjust_pi(p)		do { } while (0)
>  static inline bool tsk_is_pi_blocked(struct task_struct *tsk)
>  {
> diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
> index 3e746607abe5..13b6b5922d3c 100644
> --- a/kernel/locking/rtmutex.c
> +++ b/kernel/locking/rtmutex.c
> @@ -257,53 +257,18 @@ rt_mutex_dequeue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter)
>  }
>
>  /*
> - * Calculate task priority from the waiter tree priority
> - *
> - * Return task->normal_prio when the waiter tree is empty or when
> - * the waiter is not allowed to do priority boosting
> - */
> -int rt_mutex_getprio(struct task_struct *task)
> -{
> -	if (likely(!task_has_pi_waiters(task)))
> -		return task->normal_prio;
> -
> -	return min(task_top_pi_waiter(task)->prio,
> -		   task->normal_prio);
> -}
> -
> -struct task_struct *rt_mutex_get_top_task(struct task_struct *task)
> -{
> -	if (likely(!task_has_pi_waiters(task)))
> -		return NULL;
> -
> -	return task_top_pi_waiter(task)->task;
> -}
> -
> -/*
> - * Called by sched_setscheduler() to get the priority which will be
> - * effective after the change.
> - */
> -int rt_mutex_get_effective_prio(struct task_struct *task, int newprio)
> -{
> -	if (!task_has_pi_waiters(task))
> -		return newprio;
> -
> -	if (task_top_pi_waiter(task)->task->prio <= newprio)
> -		return task_top_pi_waiter(task)->task->prio;
> -	return newprio;
> -}
> -
> -/*
>   * Adjust the priority of a task, after its pi_waiters got modified.
>   *
>   * This can be both boosting and unboosting. task->pi_lock must be held.
>   */
>  static void __rt_mutex_adjust_prio(struct task_struct *task)
>  {
> -	int prio = rt_mutex_getprio(task);
> +	struct task_struct *pi_task = task;
> +
> +	if (unlikely(task_has_pi_waiters(task)))
> +		pi_task = task_top_pi_waiter(task)->task;
>
> -	if (task->prio != prio || dl_prio(prio))
> -		rt_mutex_setprio(task, prio);
> +	rt_mutex_setprio(task, pi_task);
>  }
>
>  /*
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 8b489fcac37b..7d7e3a0eaeb0 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -3392,7 +3392,7 @@ EXPORT_SYMBOL(default_wake_function);
>  /*
>   * rt_mutex_setprio - set the current priority of a task
>   * @p: task
> - * @prio: prio value (kernel-internal form)
> + * @pi_task: top waiter, donating state
>   *
>   * This function changes the 'effective' priority of a task. It does
>   * not touch ->normal_prio like __setscheduler().
> @@ -3400,13 +3400,20 @@ EXPORT_SYMBOL(default_wake_function);
>   * Used by the rt_mutex code to implement priority inheritance
>   * logic. Call site only calls if the priority of the task changed.
>   */
> -void rt_mutex_setprio(struct task_struct *p, int prio)
> +void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
>  {
> -	int oldprio, queued, running, queue_flag = DEQUEUE_SAVE | DEQUEUE_MOVE;
> -	struct rq *rq;
> +	int prio, oldprio, queued, running, queue_flag = DEQUEUE_SAVE | DEQUEUE_MOVE;
>  	const struct sched_class *prev_class;
> +	struct rq *rq;
>
> -	BUG_ON(prio > MAX_PRIO);
> +	/*
> +	 * For FIFO/RR we simply donate prio; for DL things are
> +	 * more interesting.
> +	 */
> +	/* XXX used to be waiter->prio, not waiter->task->prio */
> +	prio = min(pi_task->prio, p->normal_prio);
> +	if (p->prio == prio && !dl_prio(prio))
> +		return;
>
>  	rq = __task_rq_lock(p);
>
> @@ -3442,6 +3449,10 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
>  	if (running)
>  		put_prev_task(rq, p);
>
> +	if (pi_task == p)
> +		pi_task = NULL;
> +	p->pi_task = pi_task;
> +
>  	/*
>  	 * Boosting condition are:
>  	 * 1. -rt task is running and holds mutex A
> @@ -3452,7 +3463,6 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
>  	 *      running task
>  	 */
>  	if (dl_prio(prio)) {
> -		struct task_struct *pi_task = rt_mutex_get_top_task(p);
>  		if (!dl_prio(p->normal_prio) ||
>  		    (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) {
>  			p->dl.dl_boosted = 1;
> @@ -3727,10 +3737,9 @@ static void __setscheduler(struct rq *rq, struct task_struct *p,
>  	 * Keep a potential priority boosting if called from
>  	 * sched_setscheduler().
>  	 */
> -	if (keep_boost)
> -		p->prio = rt_mutex_get_effective_prio(p, normal_prio(p));
> -	else
> -		p->prio = normal_prio(p);
> +	p->prio = normal_prio(p);
> +	if (keep_boost && p->pi_task)
> +		p->prio = min(p->prio, p->pi_task->prio);
>
>  	if (dl_prio(p->prio))
>  		p->sched_class = &dl_sched_class;
> @@ -4017,7 +4026,10 @@ static int __sched_setscheduler(struct task_struct *p,
>  		 * the runqueue. This will be done when the task deboost
>  		 * itself.
>  		 */
> -		new_effective_prio = rt_mutex_get_effective_prio(p, newprio);
> +		new_effective_prio = newprio;
> +		if (p->pi_task)
> +			new_effective_prio = min(new_effective_prio, p->pi_task->prio);
> +
>  		if (new_effective_prio == oldprio)
>  			queue_flags &= ~DEQUEUE_MOVE;
>  	}
> diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
> index affd97ec9f65..6c5aa4612eb6 100644
> --- a/kernel/sched/deadline.c
> +++ b/kernel/sched/deadline.c
> @@ -932,7 +932,7 @@ static void dequeue_dl_entity(struct sched_dl_entity *dl_se)
>
>  static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
>  {
> -	struct task_struct *pi_task = rt_mutex_get_top_task(p);
> +	struct task_struct *pi_task = p->pi_task;
>  	struct sched_dl_entity *pi_se = &p->dl;
>
>  	/*