From: Xunlei Pang
To: linux-kernel@vger.kernel.org
Cc: Peter Zijlstra, Thomas Gleixner, Juri Lelli, Ingo Molnar, Steven Rostedt, Xunlei Pang
Subject: [PATCH v3 5/6] sched/deadline/rtmutex: Fix unprotected PI access in enqueue_task_dl()
Date: Thu, 14 Apr 2016 19:37:06 +0800
Message-Id: <1460633827-345-6-git-send-email-xlpang@redhat.com>
In-Reply-To: <1460633827-345-1-git-send-email-xlpang@redhat.com>
References: <1460633827-345-1-git-send-email-xlpang@redhat.com>

We access @pi_task's data in enqueue_task_dl() without holding any
lock; checking the "dl_prio(pi_task->normal_prio)" condition alone is
not enough, because "dl_period" and "dl_runtime" of "pi_task->dl" can
change concurrently. For example, if pi_task changes to a !deadline
class, its dl_runtime is cleared to zero, and we then hit an endless
loop in replenish_dl_entity() below:

	while (dl_se->runtime <= 0) {
		dl_se->deadline += pi_se->dl_period;
		dl_se->runtime += pi_se->dl_runtime;
	}

or hit "BUG_ON(pi_se->dl_runtime <= 0)" earlier. Without holding any
lock on the top waiter, there is no such guarantee.

To solve this, we add some members to "rt_mutex_waiter" and have
enqueue_task_dl() access this structure instead of the task_struct.
Specifically added:

struct rt_mutex_waiter {
	... ...
	int prio;
+	/* Updated under waiter's pi_lock and rt_mutex lock */
+	u64 dl_runtime, dl_period;
+	/*
+	 * Copied directly from above.
+	 * Updated under owner's pi_lock, rq lock, and rt_mutex lock.
+	 */
+	u64 dl_runtime_copy, dl_period_copy;
};

We must update "dl_runtime_copy" and "dl_period_copy" under rt_mutex
lock, because they are copied from rt_mutex_waiter's "dl_runtime" and
"dl_period", which are protected by that same rt_mutex lock. We update
the copies in rt_mutex_update_copy(), introduced previously, as it is
called by rt_mutex_setprio() with the owner's pi_lock, rq lock, and
rt_mutex lock held.

"dl_runtime_copy" and "dl_period_copy" are thus updated under the
owner's pi_lock, rq lock, and rt_mutex lock; in addition, the waiter
is definitely blocked on the rtmutex, so they can be safely accessed
by enqueue_task_dl(), which holds the rq lock.

Note that since we now return an rt_mutex_waiter to enqueue_task_dl(),
we add a new "struct sched_dl_entity_fake" to masquerade as a real
sched_dl_entity; this is fine as long as its "dl_runtime" and
"dl_period" members stay in the same order as their counterparts in
sched_dl_entity. Also adjust the location of "dl_period" in
sched_dl_entity to make sched_dl_entity_fake smaller.

Signed-off-by: Xunlei Pang
---
 include/linux/rtmutex.h        |  7 +++++++
 include/linux/sched.h          |  2 +-
 include/linux/sched/deadline.h | 20 ++++++++++++++++++++
 kernel/locking/rtmutex.c       | 23 +++++++++++++++++++++++
 kernel/sched/deadline.c        | 10 +++++++---
 5 files changed, 58 insertions(+), 4 deletions(-)

diff --git a/include/linux/rtmutex.h b/include/linux/rtmutex.h
index f9bf40a..56e2aaf 100644
--- a/include/linux/rtmutex.h
+++ b/include/linux/rtmutex.h
@@ -58,6 +58,13 @@ struct rt_mutex_waiter {
 	struct rt_mutex *deadlock_lock;
 #endif
 	int prio;
+	/* Updated under waiter's pi_lock and rt_mutex lock */
+	u64 dl_runtime, dl_period;
+	/*
+	 * Copied directly from above.
+	 * Updated under owner's pi_lock, rq lock, and rt_mutex lock.
+	 */
+	u64 dl_runtime_copy, dl_period_copy;
 };
 
 struct hrtimer_sleeper;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 8ad3522..960465c 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1323,8 +1323,8 @@ struct sched_dl_entity {
 	 * the next sched_setattr().
 	 */
 	u64 dl_runtime;		/* maximum runtime for each instance	*/
-	u64 dl_deadline;	/* relative deadline of each instance	*/
 	u64 dl_period;		/* separation of two instances (period) */
+	u64 dl_deadline;	/* relative deadline of each instance	*/
 	u64 dl_bw;		/* dl_runtime / dl_deadline		*/
 
 	/*
diff --git a/include/linux/sched/deadline.h b/include/linux/sched/deadline.h
index e8304d4..ca5bae5 100644
--- a/include/linux/sched/deadline.h
+++ b/include/linux/sched/deadline.h
@@ -2,6 +2,16 @@
 #define _SCHED_DEADLINE_H
 
 /*
+ * Used by enqueue_task_dl() for PI cases to disguise sched_dl_entity,
+ * thus must be the same order as the counterparts in sched_dl_entity.
+ */
+struct sched_dl_entity_fake {
+	struct rb_node rb_node;
+	u64 dl_runtime;
+	u64 dl_period;
+};
+
+/*
  * SCHED_DEADLINE tasks has negative priorities, reflecting
  * the fact that any of them has higher prio than RT and
  * NORMAL/BATCH tasks.
@@ -28,4 +38,14 @@ static inline bool dl_time_before(u64 a, u64 b)
 
 extern void rt_mutex_update_copy(struct task_struct *p);
 
+#ifdef CONFIG_RT_MUTEXES
+extern struct rt_mutex_waiter *rt_mutex_get_top_waiter(struct task_struct *p);
+#else
+static inline
+struct rt_mutex_waiter *rt_mutex_get_top_waiter(struct task_struct *p)
+{
+	return NULL;
+}
+#endif
+
 #endif /* _SCHED_DEADLINE_H */
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 00c6560..4d14eee 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -280,6 +280,15 @@ struct task_struct *rt_mutex_get_top_task(struct task_struct *task)
 		       struct rt_mutex_waiter, pi_tree_entry)->task;
 }
 
+struct rt_mutex_waiter *rt_mutex_get_top_waiter(struct task_struct *task)
+{
+	if (!task->pi_waiters_leftmost_copy)
+		return NULL;
+
+	return rb_entry(task->pi_waiters_leftmost_copy,
+			struct rt_mutex_waiter, pi_tree_entry);
+}
+
 /*
  * Called by sched_setscheduler() to get the priority which will be
  * effective after the change.
@@ -299,7 +308,17 @@ int rt_mutex_get_effective_prio(struct task_struct *task, int newprio)
  */
 void rt_mutex_update_copy(struct task_struct *p)
 {
+	struct rt_mutex_waiter *top_waiter;
+
+	/* We must always update it, even if NULL */
 	p->pi_waiters_leftmost_copy = p->pi_waiters_leftmost;
+
+	if (!task_has_pi_waiters(p))
+		return;
+
+	top_waiter = task_top_pi_waiter(p);
+	top_waiter->dl_runtime_copy = top_waiter->dl_runtime;
+	top_waiter->dl_period_copy = top_waiter->dl_period;
 }
 
 /*
@@ -632,6 +651,8 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
 	/* [7] Requeue the waiter in the lock waiter tree. */
 	rt_mutex_dequeue(lock, waiter);
 	waiter->prio = task->prio;
+	waiter->dl_runtime = dl_policy(task->policy) ? task->dl.dl_runtime : 0;
+	waiter->dl_period = dl_policy(task->policy) ? task->dl.dl_period : 0;
 	rt_mutex_enqueue(lock, waiter);
 
 	/* [8] Release the task */
@@ -902,6 +923,8 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
 	waiter->task = task;
 	waiter->lock = lock;
 	waiter->prio = task->prio;
+	waiter->dl_runtime = dl_policy(task->policy) ? task->dl.dl_runtime : 0;
+	waiter->dl_period = dl_policy(task->policy) ? task->dl.dl_period : 0;
 
 	/* Get the top priority waiter on the lock */
 	if (rt_mutex_has_waiters(lock))
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index affd97e..224aa64 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -932,8 +932,9 @@ static void dequeue_dl_entity(struct sched_dl_entity *dl_se)
 
 static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
 {
-	struct task_struct *pi_task = rt_mutex_get_top_task(p);
+	struct rt_mutex_waiter *top_waiter = rt_mutex_get_top_waiter(p);
 	struct sched_dl_entity *pi_se = &p->dl;
+	struct sched_dl_entity_fake pi_se_fake;
 
 	/*
 	 * Use the scheduling parameters of the top pi-waiter
@@ -941,8 +942,11 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
 	 * smaller than our one... OTW we keep our runtime and
 	 * deadline.
 	 */
-	if (pi_task && p->dl.dl_boosted && dl_prio(pi_task->normal_prio)) {
-		pi_se = &pi_task->dl;
+	if (top_waiter && p->dl.dl_boosted && top_waiter->dl_runtime_copy) {
+		BUG_ON(top_waiter->dl_period_copy == 0);
+		pi_se_fake.dl_runtime = top_waiter->dl_runtime_copy;
+		pi_se_fake.dl_period = top_waiter->dl_period_copy;
+		pi_se = (struct sched_dl_entity *)&pi_se_fake;
 	} else if (!dl_prio(p->normal_prio)) {
 		/*
 		 * Special case in which we have a !SCHED_DEADLINE task
-- 
1.8.3.1