A crash happened while I was playing with deadline PI rtmutex.
BUG: unable to handle kernel NULL pointer dereference at 0000000000000018
IP: [<ffffffff810eeb8f>] rt_mutex_get_top_task+0x1f/0x30
PGD 232a75067 PUD 230947067 PMD 0
Oops: 0000 [#1] SMP
CPU: 1 PID: 10994 Comm: a.out Not tainted
Call Trace:
[<ffffffff810cf8aa>] ? enqueue_task_dl+0x2a/0x320
[<ffffffff810b658c>] enqueue_task+0x2c/0x80
[<ffffffff810ba763>] activate_task+0x23/0x30
[<ffffffff810d0ab5>] pull_dl_task+0x1d5/0x260
[<ffffffff810d0be6>] pre_schedule_dl+0x16/0x20
[<ffffffff8164e783>] __schedule+0xd3/0x900
[<ffffffff8164efd9>] schedule+0x29/0x70
[<ffffffff8165035b>] __rt_mutex_slowlock+0x4b/0xc0
[<ffffffff81650501>] rt_mutex_slowlock+0xd1/0x190
[<ffffffff810eeb33>] rt_mutex_timed_lock+0x53/0x60
[<ffffffff810ecbfc>] futex_lock_pi.isra.18+0x28c/0x390
[<ffffffff810cfa15>] ? enqueue_task_dl+0x195/0x320
[<ffffffff810d0bac>] ? prio_changed_dl+0x6c/0x90
[<ffffffff810ed8b0>] do_futex+0x190/0x5b0
[<ffffffff810edd50>] SyS_futex+0x80/0x180
[<ffffffff8165a089>] system_call_fastpath+0x16/0x1b
RIP [<ffffffff810eeb8f>] rt_mutex_get_top_task+0x1f/0x30
This is because rt_mutex_enqueue_pi() and rt_mutex_dequeue_pi()
are only protected by pi_lock when operating on pi waiters, while
rt_mutex_get_top_task() will access them with rq lock held but
not holding pi_lock.
In order to tackle it, we introduce a new pointer "pi_top_task"
in task_struct, and update it to be the top waiter task (this waiter
is updated under pi_lock) in rt_mutex_setprio() which is under
both pi_lock and rq lock, then ensure all its accessors are under
rq lock (or pi_lock); this can safely fix the crash.
This patch originated from "Peter Zijlstra", with several
tweaks and improvements by me.
Tested-by: Xunlei Pang <[email protected]>
Originally-From: Peter Zijlstra (Intel) <[email protected]>
Signed-off-by: Xunlei Pang <[email protected]>
---
Without the patch, kernel crashes in seconds, after the patch
it can survive overnight.
include/linux/init_task.h | 3 +-
include/linux/sched.h | 1 +
include/linux/sched/deadline.h | 12 +++++++
include/linux/sched/rt.h | 21 ++----------
kernel/fork.c | 1 +
kernel/futex.c | 5 ++-
kernel/locking/rtmutex.c | 71 +++++++++++++++--------------------------
kernel/locking/rtmutex_common.h | 1 +
kernel/sched/core.c | 39 +++++++++++++++-------
kernel/sched/deadline.c | 2 +-
10 files changed, 75 insertions(+), 81 deletions(-)
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index f2cb8d4..de834f3 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -162,7 +162,8 @@ extern struct task_group root_task_group;
#ifdef CONFIG_RT_MUTEXES
# define INIT_RT_MUTEXES(tsk) \
.pi_waiters = RB_ROOT, \
- .pi_waiters_leftmost = NULL,
+ .pi_waiters_leftmost = NULL, \
+ .pi_top_task = NULL,
#else
# define INIT_RT_MUTEXES(tsk)
#endif
diff --git a/include/linux/sched.h b/include/linux/sched.h
index c617ea1..b4d9347 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1621,6 +1621,7 @@ struct task_struct {
/* PI waiters blocked on a rt_mutex held by this task */
struct rb_root pi_waiters;
struct rb_node *pi_waiters_leftmost;
+ struct task_struct *pi_top_task;
/* Deadlock detection and priority inheritance handling */
struct rt_mutex_waiter *pi_blocked_on;
#endif
diff --git a/include/linux/sched/deadline.h b/include/linux/sched/deadline.h
index 9089a2a..9f46729 100644
--- a/include/linux/sched/deadline.h
+++ b/include/linux/sched/deadline.h
@@ -26,4 +26,16 @@ static inline bool dl_time_before(u64 a, u64 b)
return (s64)(a - b) < 0;
}
+#ifdef CONFIG_RT_MUTEXES
+static inline struct task_struct *get_pi_top_task(struct task_struct *p)
+{
+ return p->pi_top_task;
+}
+#else
+static inline struct task_struct *get_pi_top_task(struct task_struct *p)
+{
+ return NULL;
+}
+#endif
+
#endif /* _SCHED_DEADLINE_H */
diff --git a/include/linux/sched/rt.h b/include/linux/sched/rt.h
index a30b172..4e35e94 100644
--- a/include/linux/sched/rt.h
+++ b/include/linux/sched/rt.h
@@ -16,31 +16,14 @@ static inline int rt_task(struct task_struct *p)
}
#ifdef CONFIG_RT_MUTEXES
-extern int rt_mutex_getprio(struct task_struct *p);
-extern void rt_mutex_setprio(struct task_struct *p, int prio);
-extern int rt_mutex_get_effective_prio(struct task_struct *task, int newprio);
-extern struct task_struct *rt_mutex_get_top_task(struct task_struct *task);
+extern void rt_mutex_setprio(struct task_struct *p,
+ struct task_struct *pi_top_task);
extern void rt_mutex_adjust_pi(struct task_struct *p);
static inline bool tsk_is_pi_blocked(struct task_struct *tsk)
{
return tsk->pi_blocked_on != NULL;
}
#else
-static inline int rt_mutex_getprio(struct task_struct *p)
-{
- return p->normal_prio;
-}
-
-static inline int rt_mutex_get_effective_prio(struct task_struct *task,
- int newprio)
-{
- return newprio;
-}
-
-static inline struct task_struct *rt_mutex_get_top_task(struct task_struct *task)
-{
- return NULL;
-}
# define rt_mutex_adjust_pi(p) do { } while (0)
static inline bool tsk_is_pi_blocked(struct task_struct *tsk)
{
diff --git a/kernel/fork.c b/kernel/fork.c
index 45a9048..3ad84c9 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1209,6 +1209,7 @@ static void rt_mutex_init_task(struct task_struct *p)
p->pi_waiters = RB_ROOT;
p->pi_waiters_leftmost = NULL;
p->pi_blocked_on = NULL;
+ p->pi_top_task = NULL;
#endif
}
diff --git a/kernel/futex.c b/kernel/futex.c
index a5d2e74..e73145b 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1326,9 +1326,8 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this,
* scheduled away before the wake up can take place.
*/
spin_unlock(&hb->lock);
- wake_up_q(&wake_q);
- if (deboost)
- rt_mutex_adjust_prio(current);
+
+ rt_mutex_postunlock(&wake_q, deboost);
return 0;
}
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 5e4294c..6c3a806 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -257,53 +257,18 @@ rt_mutex_dequeue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter)
}
/*
- * Calculate task priority from the waiter tree priority
- *
- * Return task->normal_prio when the waiter tree is empty or when
- * the waiter is not allowed to do priority boosting
- */
-int rt_mutex_getprio(struct task_struct *task)
-{
- if (likely(!task_has_pi_waiters(task)))
- return task->normal_prio;
-
- return min(task_top_pi_waiter(task)->prio,
- task->normal_prio);
-}
-
-struct task_struct *rt_mutex_get_top_task(struct task_struct *task)
-{
- if (likely(!task_has_pi_waiters(task)))
- return NULL;
-
- return task_top_pi_waiter(task)->task;
-}
-
-/*
- * Called by sched_setscheduler() to get the priority which will be
- * effective after the change.
- */
-int rt_mutex_get_effective_prio(struct task_struct *task, int newprio)
-{
- if (!task_has_pi_waiters(task))
- return newprio;
-
- if (task_top_pi_waiter(task)->task->prio <= newprio)
- return task_top_pi_waiter(task)->task->prio;
- return newprio;
-}
-
-/*
* Adjust the priority of a task, after its pi_waiters got modified.
*
* This can be both boosting and unboosting. task->pi_lock must be held.
*/
static void __rt_mutex_adjust_prio(struct task_struct *task)
{
- int prio = rt_mutex_getprio(task);
+ struct task_struct *pi_top_task = task;
- if (task->prio != prio || dl_prio(prio))
- rt_mutex_setprio(task, prio);
+ if (unlikely(task_has_pi_waiters(task)))
+ pi_top_task = task_top_pi_waiter(task)->task;
+
+ rt_mutex_setprio(task, pi_top_task);
}
/*
@@ -1403,14 +1368,30 @@ rt_mutex_fastunlock(struct rt_mutex *lock,
} else {
bool deboost = slowfn(lock, &wake_q);
- wake_up_q(&wake_q);
-
- /* Undo pi boosting if necessary: */
- if (deboost)
- rt_mutex_adjust_prio(current);
+ rt_mutex_postunlock(&wake_q, deboost);
}
}
+/*
+ * Undo pi boosting (if necessary) and wake top waiter.
+ */
+void rt_mutex_postunlock(struct wake_q_head *wake_q, bool deboost)
+{
+ /*
+ * We should deboost before waking the high-prio task such that
+ * we don't run two tasks with the 'same' state. This however
+ * can lead to prio-inversion if we would get preempted after
+ * the deboost but before waking our high-prio task, hence the
+ * preempt_disable.
+ */
+ preempt_disable();
+ if (deboost)
+ rt_mutex_adjust_prio(current);
+
+ wake_up_q(wake_q);
+ preempt_enable();
+}
+
/**
* rt_mutex_lock - lock a rt_mutex
*
diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h
index 4f5f83c..93b0924 100644
--- a/kernel/locking/rtmutex_common.h
+++ b/kernel/locking/rtmutex_common.h
@@ -111,6 +111,7 @@ extern int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
extern int rt_mutex_timed_futex_lock(struct rt_mutex *l, struct hrtimer_sleeper *to);
extern bool rt_mutex_futex_unlock(struct rt_mutex *lock,
struct wake_q_head *wqh);
+extern void rt_mutex_postunlock(struct wake_q_head *wake_q, bool deboost);
extern void rt_mutex_adjust_prio(struct task_struct *task);
#ifdef CONFIG_DEBUG_RT_MUTEXES
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 0b21e7a..a533566 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3374,7 +3374,7 @@ EXPORT_SYMBOL(default_wake_function);
/*
* rt_mutex_setprio - set the current priority of a task
* @p: task
- * @prio: prio value (kernel-internal form)
+ * @pi_top_task: top waiter, donating state
*
* This function changes the 'effective' priority of a task. It does
* not touch ->normal_prio like __setscheduler().
@@ -3382,13 +3382,21 @@ EXPORT_SYMBOL(default_wake_function);
* Used by the rt_mutex code to implement priority inheritance
* logic. Call site only calls if the priority of the task changed.
*/
-void rt_mutex_setprio(struct task_struct *p, int prio)
+void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_top_task)
{
- int oldprio, queued, running, queue_flag = DEQUEUE_SAVE | DEQUEUE_MOVE;
- struct rq *rq;
+ int prio, oldprio, queued, running;
+ int queue_flag = DEQUEUE_SAVE | DEQUEUE_MOVE;
const struct sched_class *prev_class;
+ struct rq *rq;
- BUG_ON(prio > MAX_PRIO);
+ /*
+ * For FIFO/RR we simply donate prio; for DL things are
+ * more interesting.
+ */
+ /* XXX used to be waiter->prio, not waiter->task->prio */
+ prio = min(pi_top_task->prio, p->normal_prio);
+ if (p->prio == prio && !dl_prio(prio))
+ return;
rq = __task_rq_lock(p);
@@ -3424,6 +3432,10 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
if (running)
put_prev_task(rq, p);
+ if (pi_top_task == p)
+ pi_top_task = NULL;
+ p->pi_top_task = pi_top_task;
+
/*
* Boosting condition are:
* 1. -rt task is running and holds mutex A
@@ -3434,9 +3446,9 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
* running task
*/
if (dl_prio(prio)) {
- struct task_struct *pi_task = rt_mutex_get_top_task(p);
if (!dl_prio(p->normal_prio) ||
- (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) {
+ (pi_top_task &&
+ dl_entity_preempt(&pi_top_task->dl, &p->dl))) {
p->dl.dl_boosted = 1;
queue_flag |= ENQUEUE_REPLENISH;
} else
@@ -3709,10 +3721,9 @@ static void __setscheduler(struct rq *rq, struct task_struct *p,
* Keep a potential priority boosting if called from
* sched_setscheduler().
*/
- if (keep_boost)
- p->prio = rt_mutex_get_effective_prio(p, normal_prio(p));
- else
- p->prio = normal_prio(p);
+ p->prio = normal_prio(p);
+ if (keep_boost && get_pi_top_task(p))
+ p->prio = min(p->prio, get_pi_top_task(p)->prio);
if (dl_prio(p->prio))
p->sched_class = &dl_sched_class;
@@ -3999,7 +4010,11 @@ change:
* the runqueue. This will be done when the task deboost
* itself.
*/
- new_effective_prio = rt_mutex_get_effective_prio(p, newprio);
+ new_effective_prio = newprio;
+ if (get_pi_top_task(p))
+ new_effective_prio =
+ min(new_effective_prio, get_pi_top_task(p)->prio);
+
if (new_effective_prio == oldprio)
queue_flags &= ~DEQUEUE_MOVE;
}
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index c7a036f..e564c88 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -928,7 +928,7 @@ static void dequeue_dl_entity(struct sched_dl_entity *dl_se)
static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
{
- struct task_struct *pi_task = rt_mutex_get_top_task(p);
+ struct task_struct *pi_task = get_pi_top_task(p);
struct sched_dl_entity *pi_se = &p->dl;
/*
--
1.8.3.1
Current code uses pi_waiters_leftmost to record the leftmost waiter,
but actually it can be obtained directly from task_struct::pi_waiters
using rb_first(). The performance penalty introduced by rb_first()
should be fine, because normally there aren't that many rtmutexes
chained together for one task.
We don't remove rt_mutex::pi_waiters_leftmost, as it is quite possible
for many tasks to share one rtmutex.
Thus, hereby remove pi_waiters_leftmost from task_struct.
Signed-off-by: Xunlei Pang <[email protected]>
---
include/linux/init_task.h | 1 -
include/linux/sched.h | 1 -
kernel/fork.c | 1 -
kernel/locking/rtmutex.c | 13 ++-----------
kernel/locking/rtmutex_common.h | 2 +-
5 files changed, 3 insertions(+), 15 deletions(-)
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index de834f3..a967dbf 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -162,7 +162,6 @@ extern struct task_group root_task_group;
#ifdef CONFIG_RT_MUTEXES
# define INIT_RT_MUTEXES(tsk) \
.pi_waiters = RB_ROOT, \
- .pi_waiters_leftmost = NULL, \
.pi_top_task = NULL,
#else
# define INIT_RT_MUTEXES(tsk)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index b4d9347..6477e22 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1620,7 +1620,6 @@ struct task_struct {
#ifdef CONFIG_RT_MUTEXES
/* PI waiters blocked on a rt_mutex held by this task */
struct rb_root pi_waiters;
- struct rb_node *pi_waiters_leftmost;
struct task_struct *pi_top_task;
/* Deadlock detection and priority inheritance handling */
struct rt_mutex_waiter *pi_blocked_on;
diff --git a/kernel/fork.c b/kernel/fork.c
index 3ad84c9..fc6e5d8 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1207,7 +1207,6 @@ static void rt_mutex_init_task(struct task_struct *p)
raw_spin_lock_init(&p->pi_lock);
#ifdef CONFIG_RT_MUTEXES
p->pi_waiters = RB_ROOT;
- p->pi_waiters_leftmost = NULL;
p->pi_blocked_on = NULL;
p->pi_top_task = NULL;
#endif
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 6c3a806..b8e9300 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -223,22 +223,16 @@ rt_mutex_enqueue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter)
struct rb_node **link = &task->pi_waiters.rb_node;
struct rb_node *parent = NULL;
struct rt_mutex_waiter *entry;
- int leftmost = 1;
while (*link) {
parent = *link;
entry = rb_entry(parent, struct rt_mutex_waiter, pi_tree_entry);
- if (rt_mutex_waiter_less(waiter, entry)) {
+ if (rt_mutex_waiter_less(waiter, entry))
link = &parent->rb_left;
- } else {
+ else
link = &parent->rb_right;
- leftmost = 0;
- }
}
- if (leftmost)
- task->pi_waiters_leftmost = &waiter->pi_tree_entry;
-
rb_link_node(&waiter->pi_tree_entry, parent, link);
rb_insert_color(&waiter->pi_tree_entry, &task->pi_waiters);
}
@@ -249,9 +243,6 @@ rt_mutex_dequeue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter)
if (RB_EMPTY_NODE(&waiter->pi_tree_entry))
return;
- if (task->pi_waiters_leftmost == &waiter->pi_tree_entry)
- task->pi_waiters_leftmost = rb_next(&waiter->pi_tree_entry);
-
rb_erase(&waiter->pi_tree_entry, &task->pi_waiters);
RB_CLEAR_NODE(&waiter->pi_tree_entry);
}
diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h
index 93b0924..ccfa34a 100644
--- a/kernel/locking/rtmutex_common.h
+++ b/kernel/locking/rtmutex_common.h
@@ -63,7 +63,7 @@ static inline int task_has_pi_waiters(struct task_struct *p)
static inline struct rt_mutex_waiter *
task_top_pi_waiter(struct task_struct *p)
{
- return rb_entry(p->pi_waiters_leftmost, struct rt_mutex_waiter,
+ return rb_entry(rb_first(&p->pi_waiters), struct rt_mutex_waiter,
pi_tree_entry);
}
--
1.8.3.1
On Wed, Apr 06, 2016 at 08:59:15PM +0800, Xunlei Pang wrote:
> A crash happened while I'm playing with deadline PI rtmutex.
>
> BUG: unable to handle kernel NULL pointer dereference at 0000000000000018
> IP: [<ffffffff810eeb8f>] rt_mutex_get_top_task+0x1f/0x30
> PGD 232a75067 PUD 230947067 PMD 0
> Oops: 0000 [#1] SMP
> CPU: 1 PID: 10994 Comm: a.out Not tainted
>
> Call Trace:
> [<ffffffff810cf8aa>] ? enqueue_task_dl+0x2a/0x320
> [<ffffffff810b658c>] enqueue_task+0x2c/0x80
> [<ffffffff810ba763>] activate_task+0x23/0x30
> [<ffffffff810d0ab5>] pull_dl_task+0x1d5/0x260
> [<ffffffff810d0be6>] pre_schedule_dl+0x16/0x20
> [<ffffffff8164e783>] __schedule+0xd3/0x900
> [<ffffffff8164efd9>] schedule+0x29/0x70
> [<ffffffff8165035b>] __rt_mutex_slowlock+0x4b/0xc0
> [<ffffffff81650501>] rt_mutex_slowlock+0xd1/0x190
> [<ffffffff810eeb33>] rt_mutex_timed_lock+0x53/0x60
> [<ffffffff810ecbfc>] futex_lock_pi.isra.18+0x28c/0x390
> [<ffffffff810cfa15>] ? enqueue_task_dl+0x195/0x320
> [<ffffffff810d0bac>] ? prio_changed_dl+0x6c/0x90
> [<ffffffff810ed8b0>] do_futex+0x190/0x5b0
> [<ffffffff810edd50>] SyS_futex+0x80/0x180
> [<ffffffff8165a089>] system_call_fastpath+0x16/0x1b
> RIP [<ffffffff810eeb8f>] rt_mutex_get_top_task+0x1f/0x30
>
> This is because rt_mutex_enqueue_pi() and rt_mutex_dequeue_pi()
> are only protected by pi_lock when operating pi waiters, while
> rt_mutex_get_top_task() will access them with rq lock held but
> not holding pi_lock.
>
> In order to tackle it, we introduce a new pointer "pi_top_task"
> in task_struct, and update it to be the top waiter task(this waiter
> is updated under pi_lock) in rt_mutex_setprio() which is under
> both pi_lock and rq lock, then ensure all its accessers be under
> rq lock (or pi_lock), this can safely fix the crash.
>
> This patch is originated from "Peter Zijlstra", with several
> tweaks and improvements by me.
I would suggest doing the rt_mutex_postunlock() thing as a separate
patch, it has some merit outside of these changes and reduces the total
amount of complexity in this patch.
Also, I would very much like Thomas to ack this patch before I take it,
but since it's conference season this might take a little while. Esp. the
change marked with XXX is something that I'm not sure about.
On Wed, Apr 06, 2016 at 08:59:16PM +0800, Xunlei Pang wrote:
> Current code use pi_waiters_leftmost to record the leftmost waiter,
> but actually it can be get directly from task_struct::pi_waiters
> using rb_first(). The performance penalty introduced by rb_first()
> should be fine, because normally there aren't that many rtmutexes
> chained together for one task.
>
> We don't remove rt_mutex:pi_waiters_leftmost, as it is quite possible
> for many tasks sharing one rtmutex.
>
> Thus, hereby remove pi_waiters_leftmost from task_struct.
Again, I would like Thomas to chime in. This isn't about the usual case, but
very much about worst-case performance -- it's RT code after all.
On 2016/04/07 at 02:14, Peter Zijlstra wrote:
> On Wed, Apr 06, 2016 at 08:59:15PM +0800, Xunlei Pang wrote:
>> A crash happened while I'm playing with deadline PI rtmutex.
>>
>> BUG: unable to handle kernel NULL pointer dereference at 0000000000000018
>> IP: [<ffffffff810eeb8f>] rt_mutex_get_top_task+0x1f/0x30
>> PGD 232a75067 PUD 230947067 PMD 0
>> Oops: 0000 [#1] SMP
>> CPU: 1 PID: 10994 Comm: a.out Not tainted
>>
>> Call Trace:
>> [<ffffffff810cf8aa>] ? enqueue_task_dl+0x2a/0x320
>> [<ffffffff810b658c>] enqueue_task+0x2c/0x80
>> [<ffffffff810ba763>] activate_task+0x23/0x30
>> [<ffffffff810d0ab5>] pull_dl_task+0x1d5/0x260
>> [<ffffffff810d0be6>] pre_schedule_dl+0x16/0x20
>> [<ffffffff8164e783>] __schedule+0xd3/0x900
>> [<ffffffff8164efd9>] schedule+0x29/0x70
>> [<ffffffff8165035b>] __rt_mutex_slowlock+0x4b/0xc0
>> [<ffffffff81650501>] rt_mutex_slowlock+0xd1/0x190
>> [<ffffffff810eeb33>] rt_mutex_timed_lock+0x53/0x60
>> [<ffffffff810ecbfc>] futex_lock_pi.isra.18+0x28c/0x390
>> [<ffffffff810cfa15>] ? enqueue_task_dl+0x195/0x320
>> [<ffffffff810d0bac>] ? prio_changed_dl+0x6c/0x90
>> [<ffffffff810ed8b0>] do_futex+0x190/0x5b0
>> [<ffffffff810edd50>] SyS_futex+0x80/0x180
>> [<ffffffff8165a089>] system_call_fastpath+0x16/0x1b
>> RIP [<ffffffff810eeb8f>] rt_mutex_get_top_task+0x1f/0x30
>>
>> This is because rt_mutex_enqueue_pi() and rt_mutex_dequeue_pi()
>> are only protected by pi_lock when operating pi waiters, while
>> rt_mutex_get_top_task() will access them with rq lock held but
>> not holding pi_lock.
>>
>> In order to tackle it, we introduce a new pointer "pi_top_task"
>> in task_struct, and update it to be the top waiter task(this waiter
>> is updated under pi_lock) in rt_mutex_setprio() which is under
>> both pi_lock and rq lock, then ensure all its accessers be under
>> rq lock (or pi_lock), this can safely fix the crash.
>>
>> This patch is originated from "Peter Zijlstra", with several
>> tweaks and improvements by me.
> I would suggest doing the rt_mutex_postunlock() thing as a separate
> patch, it has some merit outside of these changes and reduces the total
> amount of complexity in this patch.
I think the code change is necessary, as it avoids the invalid task_struct
access issue introduced by PATCH1.
Do you mean just making the code refactoring using rt_mutex_postunlock()
a separate patch? Or am I missing something?
>
> Also, I would very much like Thomas to ack this patch before I take it,
> but since its conference season this might take a little while. Esp. the
> change marked with XXX is something that I'm not sure about.
I'm ok with this change, waiting for Thomas.
Regards,
Xunlei
On Fri, Apr 08, 2016 at 04:04:07PM +0800, Xunlei Pang wrote:
> On 2016/04/07 at 02:14, Peter Zijlstra wrote:
> > I would suggest doing the rt_mutex_postunlock() thing as a separate
> > patch, it has some merit outside of these changes and reduces the total
> > amount of complexity in this patch.
>
> I think the code change is necessary , as it avoids the invalid task_struct
> access issue introduced by PATCH1.
>
> Do you mean just making the code refactor using rt_mutex_postunlock()
> as a separate patch? or do I miss something?
This, a separate patch that comes before this one.
But no need to send that until you've received word from Thomas.