2006-10-01 11:37:28

by Esben Nielsen

[permalink] [raw]
Subject: [patch 1/5] Fix timeout bug in rtmutex in 2.6.18-rt

This patch adds a new interface to the scheduler: A task can be scheduled in
LIFO order for a while instead of the usual FIFO order. This is more or less
equivalent to raising its priority by 1/2.

This property is now needed by the rtmutexes to solve the last few issues, but
I can imagine it can be useful for other subsystems too.

include/linux/init_task.h | 1
include/linux/sched.h | 62 ++++++++++++++++++++++++++++++++++++++++++++++
kernel/sched.c | 29 +++++++++++++++++----
3 files changed, 87 insertions(+), 5 deletions(-)

Index: linux-2.6.18-rt/include/linux/init_task.h
===================================================================
--- linux-2.6.18-rt.orig/include/linux/init_task.h
+++ linux-2.6.18-rt/include/linux/init_task.h
@@ -91,6 +91,7 @@ extern struct group_info init_groups;
.prio = MAX_PRIO-20, \
.static_prio = MAX_PRIO-20, \
.normal_prio = MAX_PRIO-20, \
+ .sched_lifo = 0, \
.policy = SCHED_NORMAL, \
.cpus_allowed = CPU_MASK_ALL, \
.mm = NULL, \
Index: linux-2.6.18-rt/include/linux/sched.h
===================================================================
--- linux-2.6.18-rt.orig/include/linux/sched.h
+++ linux-2.6.18-rt/include/linux/sched.h
@@ -928,6 +928,7 @@ struct task_struct {
int prio, static_prio, normal_prio;
struct list_head run_list;
struct prio_array *array;
+ int sched_lifo;

unsigned short ioprio;
unsigned int btrace_seq;
@@ -1329,6 +1330,67 @@ extern struct task_struct *curr_task(int
extern void set_curr_task(int cpu, struct task_struct *p);
extern void set_thread_priority(struct task_struct *p, int prio);

+/*
+ * sched_lifo: A task can be in sched-lifo mode and be sure to be scheduled before
+ * any other task with the same or lower priority - except for later arriving
+ * tasks with the sched_lifo property set.
+ *
+ * It is supposed to work similar to irq-flags:
+ *
+ * int old_sched_lifo;
+ * ...
+ * old_sched_lifo = enter_sched_lifo();
+ * ...
+ * leave_sched_lifo(old_sched_lifo);
+ *
+ * The purpose is that the sched-lifo sections can be easily nested.
+ *
+ * With the get/enter/leave_sched_lifo_other() the lifo status on another task
+ * can be manipulated. The status is neither atomic, nor protected by any lock.
+ * Therefore it is up to the user of those function to ensure that the
+ * operations are properly serialized. The easiest will be not to use
+ * *_sched_lifo_other() functions.
+ */
+
+static inline int get_sched_lifo_other(struct task_struct *task)
+{
+ return task->sched_lifo;
+}
+
+static inline int get_sched_lifo(void)
+{
+ return get_sched_lifo_other(current);
+}
+
+static inline int enter_sched_lifo_other(struct task_struct *task)
+{
+ int old = task->sched_lifo;
+ task->sched_lifo = 1;
+ return old;
+}
+
+static inline int enter_sched_lifo(void)
+{
+ return enter_sched_lifo_other(current);
+}
+
+
+static inline void leave_sched_lifo_other(struct task_struct *task,
+ int old_value)
+{
+ task->sched_lifo = old_value;
+ /*
+ * if sched_lifo == 0 should we move to the tail of the runqueue ?
+ * what if we never slept while in sched_lifo ?
+ */
+}
+
+static inline void leave_sched_lifo(int old_value)
+{
+ leave_sched_lifo_other(current, old_value);
+}
+
+
void yield(void);
void __yield(void);

Index: linux-2.6.18-rt/kernel/sched.c
===================================================================
--- linux-2.6.18-rt.orig/kernel/sched.c
+++ linux-2.6.18-rt/kernel/sched.c
@@ -164,8 +164,8 @@
(JIFFIES_TO_NS(MAX_SLEEP_AVG * \
(MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1))

-#define TASK_PREEMPTS_CURR(p, rq) \
- ((p)->prio < (rq)->curr->prio)
+#define TASK_PREEMPTS(p,q) task_preempts(p,q)
+#define TASK_PREEMPTS_CURR(p, rq) TASK_PREEMPTS(p,(rq)->curr)

/*
* task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ]
@@ -707,6 +707,17 @@ sched_info_switch(struct task_struct *pr
#define sched_info_switch(t, next) do { } while (0)
#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */

+int task_preempts(struct task_struct *p, struct task_struct *q)
+{
+ if (p->prio < q->prio)
+ return 1;
+
+ if (p->prio > q->prio)
+ return 0;
+
+ return p->sched_lifo;
+}
+
#if defined(CONFIG_PREEMPT_RT) && defined(CONFIG_SMP)
static __cacheline_aligned_in_smp atomic_t rt_overload;
#endif
@@ -770,7 +781,12 @@ static void enqueue_task(struct task_str
{
WARN_ON_ONCE(p->flags & PF_DEAD);
sched_info_queued(p);
- list_add_tail(&p->run_list, array->queue + p->prio);
+
+ if (p->sched_lifo)
+ list_add(&p->run_list, array->queue + p->prio);
+ else
+ list_add_tail(&p->run_list, array->queue + p->prio);
+
__set_bit(p->prio, array->bitmap);
array->nr_active++;
p->array = array;
@@ -783,7 +799,10 @@ static void enqueue_task(struct task_str
*/
static void requeue_task(struct task_struct *p, struct prio_array *array)
{
- list_move_tail(&p->run_list, array->queue + p->prio);
+ if (p->sched_lifo)
+ list_move(&p->run_list, array->queue + p->prio);
+ else
+ list_move_tail(&p->run_list, array->queue + p->prio);
}

static inline void
@@ -1463,7 +1482,7 @@ static void balance_rt_tasks(struct rq *
* Do we have an RT task that preempts
* the to-be-scheduled task?
*/
- if (p && (p->prio < next->prio)) {
+ if (p && TASK_PREEMPTS(p,next)) {
WARN_ON(p == src_rq->curr);
WARN_ON(!p->array);
schedstat_inc(this_rq, rto_pulled);

--


2006-11-10 14:49:52

by Ingo Molnar

[permalink] [raw]
Subject: Re: [patch 1/5] Fix timeout bug in rtmutex in 2.6.18-rt


* Esben Nielsen <[email protected]> wrote:

> include/linux/init_task.h | 1
> include/linux/sched.h | 62
> kernel/sched.c | 29 +++++++++++++++++----

what kernel tree is this supposed to be against? Neither vanilla nor
-rt7 2.6.18 works:

Hunk #1 FAILED at 91.
1 out of 1 hunk FAILED -- rejects in file include/linux/init_task.h
patching file include/linux/sched.h
Hunk #1 FAILED at 928.
Hunk #2 FAILED at 1330.
2 out of 2 hunks FAILED -- rejects in file include/linux/sched.h
patching file kernel/sched.c
Hunk #1 succeeded at 157 with fuzz 2 (offset -7 lines).
Hunk #2 FAILED at 700.
Hunk #3 FAILED at 774.
Hunk #4 FAILED at 792.
Hunk #5 FAILED at 1475.
4 out of 5 hunks FAILED -- rejects in file kernel/sched.c

Ingo

2006-11-10 16:18:00

by Ingo Molnar

[permalink] [raw]
Subject: Re: [patch 1/5] Fix timeout bug in rtmutex in 2.6.18-rt


* Ingo Molnar <[email protected]> wrote:

>
> * Esben Nielsen <[email protected]> wrote:
>
> > include/linux/init_task.h | 1
> > include/linux/sched.h | 62
> > kernel/sched.c | 29 +++++++++++++++++----
>
> what kernel tree is this supposed to be against? Neither vanilla nor
> -rt7 2.6.18 works:

ah, whitespace damage ... every line in the patch has an extra space
character in front of it.

Ingo

2006-11-10 18:03:55

by Ingo Molnar

[permalink] [raw]
Subject: Re: [patch 1/5] Fix timeout bug in rtmutex in 2.6.18-rt


on SMP i had to remove this assert:

Index: linux/kernel/rtmutex.c
===================================================================
--- linux.orig/kernel/rtmutex.c
+++ linux/kernel/rtmutex.c
@@ -823,7 +823,7 @@ rt_spin_lock_slowunlock(struct rt_mutex
}
wakeup_next_waiter(lock, 1);
spin_unlock_irqrestore(&lock->wait_lock, flags);
- BUG_ON(current->boosting_prio != MAX_PRIO);
+// BUG_ON(current->boosting_prio != MAX_PRIO);
/* Undo pi boosting.when necessary */
rt_mutex_adjust_prio(current);
}

because it triggered almost immediately after bootup.

Ingo

2006-11-13 09:45:25

by Esben Nielsen

[permalink] [raw]
Subject: Re: [patch 1/5] Fix timeout bug in rtmutex in 2.6.18-rt



On Fri, 10 Nov 2006, Ingo Molnar wrote:

>
> * Esben Nielsen <[email protected]> wrote:
>
>> include/linux/init_task.h | 1
>> include/linux/sched.h | 62
>> kernel/sched.c | 29 +++++++++++++++++----
>
> what kernel tree is this supposed to be against? Neither vanilla nor
> -rt7 2.6.18 works:

Sorry, I missed the exact number. I believe it was 2.6.18-rt5 or
2.6.18-rt6. I know it wasn't -rt7 because Thomas changed the locking..

Esben

>
> Hunk #1 FAILED at 91.
> 1 out of 1 hunk FAILED -- rejects in file include/linux/init_task.h
> patching file include/linux/sched.h
> Hunk #1 FAILED at 928.
> Hunk #2 FAILED at 1330.
> 2 out of 2 hunks FAILED -- rejects in file include/linux/sched.h
> patching file kernel/sched.c
> Hunk #1 succeeded at 157 with fuzz 2 (offset -7 lines).
> Hunk #2 FAILED at 700.
> Hunk #3 FAILED at 774.
> Hunk #4 FAILED at 792.
> Hunk #5 FAILED at 1475.
> 4 out of 5 hunks FAILED -- rejects in file kernel/sched.c
>
> Ingo
>

2006-11-13 10:41:05

by Esben Nielsen

[permalink] [raw]
Subject: Re: [patch 1/5] Fix timeout bug in rtmutex in 2.6.18-rt

On Fri, 10 Nov 2006, Ingo Molnar wrote:

>
> on SMP i had to remove this assert:
>
> Index: linux/kernel/rtmutex.c
> ===================================================================
> --- linux.orig/kernel/rtmutex.c
> +++ linux/kernel/rtmutex.c
> @@ -823,7 +823,7 @@ rt_spin_lock_slowunlock(struct rt_mutex
> }
> wakeup_next_waiter(lock, 1);
> spin_unlock_irqrestore(&lock->wait_lock, flags);
> - BUG_ON(current->boosting_prio != MAX_PRIO);
> +// BUG_ON(current->boosting_prio != MAX_PRIO);
> /* Undo pi boosting.when necessary */
> rt_mutex_adjust_prio(current);
> }
>
> because it triggered almost immediately after bootup.
>


Hmm, boosting_prio should be MAX_PRIO at that point. boosting_prio should
only be different from MAX_PRIO while a task is trying to lock a mutex -
i.e. while it is in a *_slowlock() function.

On which kernel have you applied the patch?


I am looking at 2.6.18-rt6 and trying to see if there is a race condition.
I can't see it that easily (or I would probably have seen it already :-).
task->boosting_prio is not set when task->pi_lock is not taken. And then
it is only set to something else than MAX_PRIO if task->pi_blocked_on is
set.


> Ingo
>

Esben