2000-12-27 00:55:00

by Andrew Morton

[permalink] [raw]
Subject: [prepatch] 2.4 waitqueues

It's been quiet around here lately...


This is a rework of the 2.4 wakeup code based on the discussions Andrea
and I had last week. There were two basic problems:

- If two tasks are on a waitqueue in exclusive mode and one gets
woken, it will put itself back into TASK_[UN]INTERRUPTIBLE state for a
few instructions and can soak up another wakeup which should have
gone to the other task.

- If a task is on two waitqueues and two CPUs simultaneously run a
wakeup, one on each waitqueue, they can both try to wake the same
task which may be a lost wakeup, depending upon how the waitqueue
users are coded.

The first problem is the most serious. The second is kinda baroque...

The approach taken by this patch is the one which Andrea is proposing
for 2.2: if a task was already on the runqueue, continue scanning for
another exclusive task to wake up.

It ended up getting complicated because of the "find a process affine
to this CPU" thing. Plus I did go slightly berserk, but I believe the
result is pretty good.

- wake_up_process() now returns a success value if it managed to move
something to the runqueue.

Tidied up the code here a bit as well.

wake_up_process_synchronous() is no more.

- Got rid of all the debugging ifdefs - these have been folded into
wait.h

- Removed all the USE_RW_WAIT_QUEUE_SPINLOCK code and just used
spinlocks.

The read_lock option was pretty questionable anyway. It hasn't had
the widespread testing and, umm, the kernel is using wq_write_lock
*everywhere* anyway, so enabling USE_RW_WAIT_QUEUE_SPINLOCK wouldn't
change anything, except for using a more expensive spinlock!

So it's gone.

- Introduces a kernel-wide macro `SMP_KERNEL'. This is designed to
be used as a `compiled ifdef' in place of `#ifdef CONFIG_SMP'. There
are a few examples in __wake_up_common().

People shouldn't go wild with this, because gcc's dead code
elimination isn't perfect. But it's nice for little things.

- This patch's _use_ of SMP_KERNEL in __wake_up_common is fairly
significant. There was quite a lot of code in that function which
was an unnecessary burden for UP systems. All gone now.

- This patch shrinks sched.o by 100 bytes (SMP) and 300 bytes (UP).
Note that try_to_wake_up() is now only expanded in a single place
in __wake_up_common(). It has a large footprint.

- I have finally had enough of printk() deadlocking or going
infinitely mutually recursive on me so printk()'s wake_up(log_wait)
call has been moved into a tq_timer callback.

- SLEEP_ON_VAR, SLEEP_ON_HEAD and SLEEP_ON_TAIL have been changed. I
see no valid reason why these functions were, effectively, doing
this:

spin_lock_irqsave(lock, flags);
spin_unlock(lock);
schedule();
spin_lock(lock);
spin_unlock_irqrestore(lock, flags);

What's the point in saving the interrupt status in `flags'? If the
caller _wants_ interrupt status preserved then the caller is buggy,
because schedule() enables interrupts. 2.2 does the same thing.

So this has been changed to:

spin_lock_irq(lock);
spin_unlock(lock);
schedule();
spin_lock(lock);
spin_unlock_irq(lock);

Or did I miss something?



--- linux-2.4.0-test13pre4-ac2/include/linux/sched.h Fri Dec 22 16:00:26 2000
+++ linux-akpm/include/linux/sched.h Wed Dec 27 01:17:06 2000
@@ -545,7 +545,7 @@
extern void FASTCALL(interruptible_sleep_on(wait_queue_head_t *q));
extern long FASTCALL(interruptible_sleep_on_timeout(wait_queue_head_t *q,
signed long timeout));
-extern void FASTCALL(wake_up_process(struct task_struct * tsk));
+extern int FASTCALL(wake_up_process(struct task_struct * tsk));

#define wake_up(x) __wake_up((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,WQ_FLAG_EXCLUSIVE)
#define wake_up_all(x) __wake_up((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,0)
--- linux-2.4.0-test13pre4-ac2/include/linux/wait.h Tue Nov 21 20:11:21 2000
+++ linux-akpm/include/linux/wait.h Wed Dec 27 01:13:54 2000
@@ -19,30 +19,10 @@
#include <asm/processor.h>

/*
- * Temporary debugging help until all code is converted to the new
- * waitqueue usage.
+ * Debug control. Slow but useful.
*/
#define WAITQUEUE_DEBUG 0

-#if WAITQUEUE_DEBUG
-extern int printk(const char *fmt, ...);
-#define WQ_BUG() do { \
- printk("wq bug, forcing oops.\n"); \
- BUG(); \
-} while (0)
-
-#define CHECK_MAGIC(x) if (x != (long)&(x)) \
- { printk("bad magic %lx (should be %lx), ", (long)x, (long)&(x)); WQ_BUG(); }
-
-#define CHECK_MAGIC_WQHEAD(x) do { \
- if (x->__magic != (long)&(x->__magic)) { \
- printk("bad magic %lx (should be %lx, creator %lx), ", \
- x->__magic, (long)&(x->__magic), x->__creator); \
- WQ_BUG(); \
- } \
-} while (0)
-#endif
-
struct __wait_queue {
unsigned int flags;
#define WQ_FLAG_EXCLUSIVE 0x01
@@ -55,42 +35,8 @@
};
typedef struct __wait_queue wait_queue_t;

-/*
- * 'dual' spinlock architecture. Can be switched between spinlock_t and
- * rwlock_t locks via changing this define. Since waitqueues are quite
- * decoupled in the new architecture, lightweight 'simple' spinlocks give
- * us slightly better latencies and smaller waitqueue structure size.
- */
-#define USE_RW_WAIT_QUEUE_SPINLOCK 0
-
-#if USE_RW_WAIT_QUEUE_SPINLOCK
-# define wq_lock_t rwlock_t
-# define WAITQUEUE_RW_LOCK_UNLOCKED RW_LOCK_UNLOCKED
-
-# define wq_read_lock read_lock
-# define wq_read_lock_irqsave read_lock_irqsave
-# define wq_read_unlock_irqrestore read_unlock_irqrestore
-# define wq_read_unlock read_unlock
-# define wq_write_lock_irq write_lock_irq
-# define wq_write_lock_irqsave write_lock_irqsave
-# define wq_write_unlock_irqrestore write_unlock_irqrestore
-# define wq_write_unlock write_unlock
-#else
-# define wq_lock_t spinlock_t
-# define WAITQUEUE_RW_LOCK_UNLOCKED SPIN_LOCK_UNLOCKED
-
-# define wq_read_lock spin_lock
-# define wq_read_lock_irqsave spin_lock_irqsave
-# define wq_read_unlock spin_unlock
-# define wq_read_unlock_irqrestore spin_unlock_irqrestore
-# define wq_write_lock_irq spin_lock_irq
-# define wq_write_lock_irqsave spin_lock_irqsave
-# define wq_write_unlock_irqrestore spin_unlock_irqrestore
-# define wq_write_unlock spin_unlock
-#endif
-
struct __wait_queue_head {
- wq_lock_t lock;
+ spinlock_t lock;
struct list_head task_list;
#if WAITQUEUE_DEBUG
long __magic;
@@ -99,35 +45,85 @@
};
typedef struct __wait_queue_head wait_queue_head_t;

+
+/*
+ * Debugging macros. We eschew `do { } while (0)' because gcc can generate
+ * spurious .aligns.
+ */
#if WAITQUEUE_DEBUG
-# define __WAITQUEUE_DEBUG_INIT(name) \
- , (long)&(name).__magic, 0
-# define __WAITQUEUE_HEAD_DEBUG_INIT(name) \
- , (long)&(name).__magic, (long)&(name).__magic
+#define WQ_BUG() BUG()
+#define CHECK_MAGIC(x) \
+ do { \
+ if ((x) != (long)&(x)) { \
+ printk("bad magic %lx (should be %lx), ", \
+ (long)x, (long)&(x)); \
+ WQ_BUG(); \
+ } \
+ } while (0)
+#define CHECK_MAGIC_WQHEAD(x) \
+ do { \
+ if ((x)->__magic != (long)&((x)->__magic)) { \
+ printk("bad magic %lx (should be %lx, creator %lx), ", \
+ (x)->__magic, (long)&((x)->__magic), (x)->__creator); \
+ WQ_BUG(); \
+ } \
+ } while (0)
+#define WQ_CHECK_LIST_HEAD(list) \
+ do { \
+ if (!list->next || !list->prev) \
+ WQ_BUG(); \
+ } while(0)
+#define WQ_NOTE_WAKER(tsk) \
+ do { \
+ tsk->__waker = (long)__builtin_return_address(0); \
+ } while (0)
+#else
+#define WQ_BUG()
+#define CHECK_MAGIC(x)
+#define CHECK_MAGIC_WQHEAD(x)
+#define WQ_CHECK_LIST_HEAD(list)
+#define WQ_NOTE_WAKER(tsk)
+#endif
+
+/*
+ * Macros for declaration and initialisaton of the datatypes
+ */
+
+#if WAITQUEUE_DEBUG
+# define __WAITQUEUE_DEBUG_INIT(name) (long)&(name).__magic, 0
+# define __WAITQUEUE_HEAD_DEBUG_INIT(name) (long)&(name).__magic, (long)&(name).__magic
#else
# define __WAITQUEUE_DEBUG_INIT(name)
# define __WAITQUEUE_HEAD_DEBUG_INIT(name)
#endif

-#define __WAITQUEUE_INITIALIZER(name,task) \
- { 0x0, task, { NULL, NULL } __WAITQUEUE_DEBUG_INIT(name)}
-#define DECLARE_WAITQUEUE(name,task) \
- wait_queue_t name = __WAITQUEUE_INITIALIZER(name,task)
-
-#define __WAIT_QUEUE_HEAD_INITIALIZER(name) \
-{ WAITQUEUE_RW_LOCK_UNLOCKED, { &(name).task_list, &(name).task_list } \
- __WAITQUEUE_HEAD_DEBUG_INIT(name)}
+#define __WAITQUEUE_INITIALIZER(name, tsk) { \
+ task: tsk, \
+ task_list: { NULL, NULL }, \
+ __WAITQUEUE_DEBUG_INIT(name)}
+
+#define DECLARE_WAITQUEUE(name, tsk) \
+ wait_queue_t name = __WAITQUEUE_INITIALIZER(name, tsk)
+
+#define __WAIT_QUEUE_HEAD_INITIALIZER(name) { \
+ lock: SPIN_LOCK_UNLOCKED, \
+ task_list: { &(name).task_list, &(name).task_list }, \
+ __WAITQUEUE_HEAD_DEBUG_INIT(name)}

#define DECLARE_WAIT_QUEUE_HEAD(name) \
wait_queue_head_t name = __WAIT_QUEUE_HEAD_INITIALIZER(name)

+/*
+ * Inline functions
+ */
+
static inline void init_waitqueue_head(wait_queue_head_t *q)
{
#if WAITQUEUE_DEBUG
if (!q)
WQ_BUG();
#endif
- q->lock = WAITQUEUE_RW_LOCK_UNLOCKED;
+ q->lock = SPIN_LOCK_UNLOCKED;
INIT_LIST_HEAD(&q->task_list);
#if WAITQUEUE_DEBUG
q->__magic = (long)&q->__magic;
@@ -135,8 +131,7 @@
#endif
}

-static inline void init_waitqueue_entry(wait_queue_t *q,
- struct task_struct *p)
+static inline void init_waitqueue_entry(wait_queue_t *q, struct task_struct *p)
{
#if WAITQUEUE_DEBUG
if (!q || !p)
--- linux-2.4.0-test13pre4-ac2/include/linux/smp.h Sat Sep 9 16:19:30 2000
+++ linux-akpm/include/linux/smp.h Wed Dec 27 01:13:54 2000
@@ -88,4 +88,15 @@
#define cpu_online_map 1

#endif
+
+/*
+ * SMP_KERNEL may be used in very simple `if' statements in place
+ * of `#ifdef CONFIG_SMP'
+ */
+#ifdef CONFIG_SMP
+#define SMP_KERNEL 1
+#else
+#define SMP_KERNEL 0
+#endif
+
#endif
--- linux-2.4.0-test13pre4-ac2/kernel/sched.c Tue Dec 12 19:24:23 2000
+++ linux-akpm/kernel/sched.c Wed Dec 27 01:52:08 2000
@@ -326,9 +326,10 @@
* "current->state = TASK_RUNNING" to mark yourself runnable
* without the overhead of this.
*/
-inline void wake_up_process(struct task_struct * p)
+static inline int try_to_wake_up(struct task_struct * p, int synchronous)
{
unsigned long flags;
+ int success = 0;

/*
* We want the common case fall through straight, thus the goto.
@@ -338,25 +339,17 @@
if (task_on_runqueue(p))
goto out;
add_to_runqueue(p);
- reschedule_idle(p);
+ if (!synchronous)
+ reschedule_idle(p);
+ success = 1;
out:
spin_unlock_irqrestore(&runqueue_lock, flags);
+ return success;
}

-static inline void wake_up_process_synchronous(struct task_struct * p)
+inline int wake_up_process(struct task_struct * p)
{
- unsigned long flags;
-
- /*
- * We want the common case fall through straight, thus the goto.
- */
- spin_lock_irqsave(&runqueue_lock, flags);
- p->state = TASK_RUNNING;
- if (task_on_runqueue(p))
- goto out;
- add_to_runqueue(p);
-out:
- spin_unlock_irqrestore(&runqueue_lock, flags);
+ return try_to_wake_up(p, 0);
}

static void process_timeout(unsigned long __data)
@@ -689,76 +682,78 @@
return;
}

+/*
+ * The core wakeup function. Non-exclusive wakeups just wake everything up.
+ * If it's an exclusive wakeup then we wake all the non-exclusive tasks
+ * and one exclusive task.
+ * If called from interrupt context we wake the least-recently queued exclusive task
+ * which wants to run on the current CPU.
+ * If not called from interrupt context we simply wake the least-recently queued
+ * exclusive task.
+ * There are circumstances in which we can try to wake a task which has already
+ * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns zero
+ * in this (rare) case, and we handle it by rescanning the exclusive tasks and
+ * trying to wake *someone*.
+ */
static inline void __wake_up_common (wait_queue_head_t *q, unsigned int mode,
unsigned int wq_mode, const int sync)
{
- struct list_head *tmp, *head;
- struct task_struct *p, *best_exclusive;
+ struct list_head *curr_sleeper, *head, *first_nonaffine_exclusive;
+ struct task_struct *p;
unsigned long flags;
- int best_cpu, irq;
+ int best_cpu, do_affine;

if (!q)
goto out;

- best_cpu = smp_processor_id();
- irq = in_interrupt();
- best_exclusive = NULL;
- wq_write_lock_irqsave(&q->lock, flags);
-
-#if WAITQUEUE_DEBUG
+ if (SMP_KERNEL) {
+ best_cpu = smp_processor_id();
+ do_affine = in_interrupt();
+ first_nonaffine_exclusive = NULL;
+ }
+ spin_lock_irqsave(&q->lock, flags);
CHECK_MAGIC_WQHEAD(q);
-#endif
-
head = &q->task_list;
-#if WAITQUEUE_DEBUG
- if (!head->next || !head->prev)
- WQ_BUG();
-#endif
- tmp = head->next;
- while (tmp != head) {
- unsigned int state;
- wait_queue_t *curr = list_entry(tmp, wait_queue_t, task_list);
-
- tmp = tmp->next;
+ WQ_CHECK_LIST_HEAD(head);
+ curr_sleeper = head->next;
+retry:
+ while (curr_sleeper != head) {
+ wait_queue_t *curr = list_entry(curr_sleeper, wait_queue_t, task_list);

-#if WAITQUEUE_DEBUG
CHECK_MAGIC(curr->__magic);
-#endif
p = curr->task;
- state = p->state;
- if (state & mode) {
-#if WAITQUEUE_DEBUG
- curr->__waker = (long)__builtin_return_address(0);
-#endif
- /*
- * If waking up from an interrupt context then
- * prefer processes which are affine to this
- * CPU.
- */
- if (irq && (curr->flags & wq_mode & WQ_FLAG_EXCLUSIVE)) {
- if (!best_exclusive)
- best_exclusive = p;
- if (p->processor == best_cpu) {
- best_exclusive = p;
- break;
- }
+ if (p->state & mode) {
+ WQ_NOTE_WAKER(curr);
+
+ if (SMP_KERNEL && do_affine && p->processor != best_cpu &&
+ (curr->flags & wq_mode & WQ_FLAG_EXCLUSIVE)) {
+ if (first_nonaffine_exclusive == NULL)
+ first_nonaffine_exclusive = curr_sleeper;
} else {
- if (sync)
- wake_up_process_synchronous(p);
- else
- wake_up_process(p);
- if (curr->flags & wq_mode & WQ_FLAG_EXCLUSIVE)
- break;
+ if (try_to_wake_up(p, sync)) {
+ if (curr->flags & wq_mode & WQ_FLAG_EXCLUSIVE)
+ goto woke_one;
+ }
}
}
+ curr_sleeper = curr_sleeper->next;
}
- if (best_exclusive) {
- if (sync)
- wake_up_process_synchronous(best_exclusive);
- else
- wake_up_process(best_exclusive);
+
+ if (SMP_KERNEL && first_nonaffine_exclusive) {
+ /*
+ * If we get here, there were exclusive sleepers on the queue, but we didn't
+ * wake any up. We've already tried to wake all the sleepers who are affine
+ * to this CPU and we failed. So we now try _all_ the exclusive sleepers.
+ * We start with the least-recently-queued non-affine task. It's almost certainly
+ * not on the runqueue, so we'll terminate the above loop on the first pass.
+ */
+ do_affine = 0;
+ curr_sleeper = first_nonaffine_exclusive;
+ first_nonaffine_exclusive = NULL;
+ goto retry;
}
- wq_write_unlock_irqrestore(&q->lock, flags);
+woke_one:
+ spin_unlock_irqrestore(&q->lock, flags);
out:
return;
}
@@ -774,19 +769,18 @@
}

#define SLEEP_ON_VAR \
- unsigned long flags; \
wait_queue_t wait; \
init_waitqueue_entry(&wait, current);

#define SLEEP_ON_HEAD \
- wq_write_lock_irqsave(&q->lock,flags); \
+ spin_lock_irq(&q->lock); \
__add_wait_queue(q, &wait); \
- wq_write_unlock(&q->lock);
+ spin_unlock(&q->lock);

#define SLEEP_ON_TAIL \
- wq_write_lock_irq(&q->lock); \
+ spin_lock_irq(&q->lock); \
__remove_wait_queue(q, &wait); \
- wq_write_unlock_irqrestore(&q->lock,flags);
+ spin_unlock_irq(&q->lock);

void interruptible_sleep_on(wait_queue_head_t *q)
{
--- linux-2.4.0-test13pre4-ac2/kernel/printk.c Sat Dec 23 17:24:20 2000
+++ linux-akpm/kernel/printk.c Wed Dec 27 01:13:54 2000
@@ -19,6 +19,7 @@
#include <linux/smp_lock.h>
#include <linux/console.h>
#include <linux/init.h>
+#include <linux/tqueue.h>

#include <asm/uaccess.h>

@@ -251,6 +252,16 @@
return do_syslog(type, buf, len);
}

+/*
+ * We can get deadlocks or infinite recursion calling wake_up() from within
+ * printk(), so we use a tq_timer callback. But don't put a printk() in
+ * queue_task()!
+ */
+static void wake_log_wait(void *dummy)
+{
+ wake_up_interruptible(&log_wait);
+}
+
asmlinkage int printk(const char *fmt, ...)
{
va_list args;
@@ -259,6 +270,9 @@
int line_feed;
static signed char msg_level = -1;
long flags;
+ static struct tq_struct log_wait_waker = {
+ routine: wake_log_wait,
+ };

spin_lock_irqsave(&console_lock, flags);
va_start(args, fmt);
@@ -308,7 +322,7 @@
msg_level = -1;
}
spin_unlock_irqrestore(&console_lock, flags);
- wake_up_interruptible(&log_wait);
+ queue_task(&log_wait_waker, &tq_timer);
return i;
}

--- linux-2.4.0-test13pre4-ac2/kernel/fork.c Fri Dec 22 16:00:26 2000
+++ linux-akpm/kernel/fork.c Wed Dec 27 00:37:49 2000
@@ -38,29 +38,29 @@
{
unsigned long flags;

- wq_write_lock_irqsave(&q->lock, flags);
+ spin_lock_irqsave(&q->lock, flags);
wait->flags = 0;
__add_wait_queue(q, wait);
- wq_write_unlock_irqrestore(&q->lock, flags);
+ spin_unlock_irqrestore(&q->lock, flags);
}

void add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t * wait)
{
unsigned long flags;

- wq_write_lock_irqsave(&q->lock, flags);
+ spin_lock_irqsave(&q->lock, flags);
wait->flags = WQ_FLAG_EXCLUSIVE;
__add_wait_queue_tail(q, wait);
- wq_write_unlock_irqrestore(&q->lock, flags);
+ spin_unlock_irqrestore(&q->lock, flags);
}

void remove_wait_queue(wait_queue_head_t *q, wait_queue_t * wait)
{
unsigned long flags;

- wq_write_lock_irqsave(&q->lock, flags);
+ spin_lock_irqsave(&q->lock, flags);
__remove_wait_queue(q, wait);
- wq_write_unlock_irqrestore(&q->lock, flags);
+ spin_unlock_irqrestore(&q->lock, flags);
}

void __init fork_init(unsigned long mempages)


2000-12-27 03:07:18

by Andrea Arcangeli

[permalink] [raw]
Subject: Re: [prepatch] 2.4 waitqueues

On Wed, Dec 27, 2000 at 11:29:06AM +1100, Andrew Morton wrote:
> - Got rid of all the debugging ifdefs - these have been folded into
> wait.h

Why? Such debugging code is just disabled so it doesn't get compiled in, but if
somebody wants he can enable it changing the #define in the sources to catch
missing initialization of the waitqueue. I disagree in removing it.

> - Removed all the USE_RW_WAIT_QUEUE_SPINLOCK code and just used
> spinlocks.
>
> The read_lock option was pretty questionable anyway. It hasn't had
> the widespread testing and, umm, the kernel is using wq_write_lock
> *everywhere* anyway, so enabling USE_RW_WAIT_QUEUE_SPINLOCK wouldn't
> change anything, except for using a more expensive spinlock!
>
> So it's gone.

It's true that wake_up_process doesn't scale, but O(N) scans of the waitqueue
could be imposed by the smp-irq-wakeup logic if there's no CPU affine task
registered, so it makes sense at least in theory. Of course right now it can
only hurt because as you say wake_up is using the write_lock too. But I'd
prefer not to drop it since it become safe (no wake-one missed event) once we
fix all the wake-one races checking the wake_up_process retval, so I'd suggest
to use the wq_read_lock in browse of the waitqueue instead of removing
wq_*_lock.

> - Introduces a kernel-wide macro `SMP_KERNEL'. This is designed to
> be used as a `compiled ifdef' in place of `#ifdef CONFIG_SMP'. There
> are a few examples in __wake_up_common().
>
> People shouldn't go wild with this, because gcc's dead code
> elimination isn't perfect. But it's nice for little things.

I think the main issue is that SMP_KERNEL isn't complete and I'm not sure if it
really helps readability to increase the ways to write smp-compile-time code.
For example in your example you probably get complaints about uninitialized
variables in the UP compile, then I think it's nicer not to rely on gcc for the
reasons you said. The fact the thing is little is not much relevant if gcc gets
it wrong (however I believe gcc will get it right) because the little thing
could be in a fast path like in the example usage in wake_up().

> - This patch's _use_ of SMP_KERNEL in __wake_up_common is fairly
> significant. There was quite a lot of code in that function which
> was an unnecessary burden for UP systems. All gone now.

The same could be achieved with the usual #ifdef CONFIG_SMP of course ;).

> - I have finally had enough of printk() deadlocking or going
> infinitely mutually recursive on me so printk()'s wake_up(log_wait)
> call has been moved into a tq_timer callback.

That's only overhead for mainstream kernel. Nobody is allowed to call printk
from wake_up(). For debugging wake_up() the above hack is fine of course (but
that is stuff for a debugging tree not for the mainstream one). Or at least
it should be put inside an #ifdef DEBUG or something like that.

> - SLEEP_ON_VAR, SLEEP_ON_HEAD and SLEEP_ON_TAIL have been changed. I
> see no valid reason why these functions were, effectively, doing
> this:
>
> spin_lock_irqsave(lock, flags);
> spin_unlock(lock);
> schedule();
> spin_lock(lock);
> spin_unlock_irqrestore(lock, flags);
>
> What's the point in saving the interrupt status in `flags'? If the

Because old drivers could be doing ugly stuff like this:

cli()
for (;;) {
if (!resource_available)
sleep_on(&wait_for_resource)
else
break
}
...
sti()

If you don't save and restore flags the second sleep_on could be entered with
irq enabled and the wakeup from irq could happen before registering in the
waitqueue causing a lost wakeup.

Maybe no driver is doing the above, but I think there's no point to
microoptimize sleep_on by breaking horrible [but currently safe] usages like
the above one at runtime, because if the driver cares about performance it
wasn't using sleep_on in the first place ;). The right way to optimize the
code is to remove sleep_on_*, which wouldn't risk breaking stuff silently at
runtime. I believe it's a 2.5.x item (I really don't recommend to drop sleep_on
right now for 2.4.x).

The patch is mostly ok IMHO, but I'd prefer if you could rework it so that
it only fixes the race by checking the wake_up_process* retval plus the UP
cleanups in wake_up using the legacy #ifdef CONFIG_SMP way.

Thanks!

Andrea

2000-12-27 03:23:02

by Andrew Morton

[permalink] [raw]
Subject: Re: [prepatch] 2.4 waitqueues

Andrea Arcangeli wrote:
>
> On Wed, Dec 27, 2000 at 11:29:06AM +1100, Andrew Morton wrote:
> > - Got rid of all the debugging ifdefs - these have been folded into
> > wait.h
>
> Why? Such debugging code is just disabled so it doesn't get compiled in, but if
> somebody wants he can enable it changing the #define in the sources to catch
> missing initialization of the waitqueue. I disagree in removing it.

Oh, it's all still there, but it's now all in the header file:

#ifdef DEBUG
#define foo() printk(stuff)
#else
#define foo()
#endif

> > - Removed all the USE_RW_WAIT_QUEUE_SPINLOCK code and just used
> > spinlocks.
> >
> > The read_lock option was pretty questionable anyway. It hasn't had
> > the widespread testing and, umm, the kernel is using wq_write_lock
> > *everywhere* anyway, so enabling USE_RW_WAIT_QUEUE_SPINLOCK wouldn't
> > change anything, except for using a more expensive spinlock!
> >
> > So it's gone.
>
> It's true that wake_up_process doesn't scale, but O(N) scans of the waitqueue
> could be imposed by the smp-irq-wakeup logic if there's no CPU affine task
> registered, so it makes sense at least in theory. Of course right now it can
> only hurt because as you say wake_up is using the write_lock too. But I'd
> prefer not to drop it since it become safe (no wake-one missed event) once we
> fix all the wake-one races checking the wake_up_process retval, so I'd suggest
> to use the wq_read_lock in browse of the waitqueue instead of removing
> wq_*_lock.

OK.

> > - Introduces a kernel-wide macro `SMP_KERNEL'. This is designed to
> > be used as a `compiled ifdef' in place of `#ifdef CONFIG_SMP'. There
> > are a few examples in __wake_up_common().
> >
> > People shouldn't go wild with this, because gcc's dead code
> > elimination isn't perfect. But it's nice for little things.
>
> I think the main issue is that SMP_KERNEL isn't complete and I'm not sure if it
> really helps readability to increase the ways to write smp-compile-time code.
> For example in your example you probably get complaints about uninitialized
> variables in the UP compile, then I think it's nicer not to rely on gcc for the
> reasons you said. The fact the thing is little is not much relevant if gcc gets
> it wrong (however I believe gcc will get it right) because the little thing
> could be in a fast path like in the example usage in wake_up().

No, gcc doesn't give any warnings about uninitialised vars. It _could_
give warnings about unused ones, but doesn't.

> > - This patch's _use_ of SMP_KERNEL in __wake_up_common is fairly
> > significant. There was quite a lot of code in that function which
> > was an unnecessary burden for UP systems. All gone now.
>
> The same could be achieved with the usual #ifdef CONFIG_SMP of course ;).
>
> > - I have finally had enough of printk() deadlocking or going
> > infinitely mutually recursive on me so printk()'s wake_up(log_wait)
> > call has been moved into a tq_timer callback.
>
> That's only overhead for mainstream kernel. Nobody is allowed to call printk
> from wake_up(). For debugging wake_up() the above hack is fine of course (but
> that is stuff for a debugging tree not for the mainstream one). Or at least
> it should be put inside an #ifdef DEBUG or something like that.

It's not just open-coded printks - it's oopses. If you take an oops or a
BUG or a panic within wake_up() or _anywhere_ with the runqueue_lock held,
the machine deadlocks and you don't get any diagnostics. This is bad.

We really do need to get that wake_up() out of printk(). tq_timer may
not be the best way. Suggestions welcome.

> > - SLEEP_ON_VAR, SLEEP_ON_HEAD and SLEEP_ON_TAIL have been changed. I
> > see no valid reason why these functions were, effectively, doing
> > this:
> >
> > spin_lock_irqsave(lock, flags);
> > spin_unlock(lock);
> > schedule();
> > spin_lock(lock);
> > spin_unlock_irqrestore(lock, flags);
> >
> > What's the point in saving the interrupt status in `flags'? If the
>
> Because old drivers could be doing ugly stuff like this:
>
> cli()
> for (;;) {
> if (!resource_available)
> sleep_on(&wait_for_resource)
> else
> break
> }
> ...
> sti()
>
> If you don't save and restore flags the second sleep_on could be entered with
> irq enabled and the wakeup from irq could happen before registering in the
> waitqueue causing a lost wakeup.
>
> Maybe no driver is doing the above, but I think there's no point to
> microoptimize sleep_on by breaking horrible [but currently safe] usages like
> the above one at runtime, because if the driver cares about performance it
> wasn't using sleep_on in the first place ;). The right way to optimize the
> code is to remove sleep_on_*, which wouldn't risk breaking stuff silently at
> runtime. I believe it's a 2.5.x item (I really don't recommend to drop sleep_on
> right now for 2.4.x).

OK, I'll put it back. Thanks.

2000-12-27 04:10:57

by Andrea Arcangeli

[permalink] [raw]
Subject: Re: [prepatch] 2.4 waitqueues

On Wed, Dec 27, 2000 at 01:57:12PM +1100, Andrew Morton wrote:
> Oh, it's all still there, but it's now all in the header file:
>
> #ifdef DEBUG
> #define foo() printk(stuff)
> #else
> #define foo()
> #endif

I intentionally didn't focus on that part of your patch because I understood
from the description that the debugging stuff was gone, and I wasn't going to
agree with that part, regardless of the implementation, if that was really the case ;).

Now reading such part of the patch it looks a fine cleanup of course.

> No, gcc doesn't give any warnings about uninitialised vars. It _could_
> give warnings about unused ones, but doesn't.

So then you're wasting stack in UP compile.

Not a big deal but still I'd prefer the CONFIG_SMP #ifdef though, it looks even
more obvious that it's a compile check and at least in your usage I cannot see
a relevant readability advantage. And my own feeling is not having to rely on
more things to produce the wanted asm when there's no relevant advantage, but
if Linus likes it I won't object further.

> It's not just open-coded printks - it's oopses. If you take an oops or a
> BUG or a panic within wake_up() or _anywhere_ with the runqueue_lock held,
> the machine deadlocks and you don't get any diagnostics. This is bad.
> We really do need to get that wake_up() out of printk(). tq_timer may
> not be the best way. Suggestions welcome.

For kernel autodiagnostics we need to trust something. This something is
everything that gets into the oops path. wake_up is one of those things. You
want to replace wake_up with queue_task, why shouldn't queue_task break instead
of wake_up? You're replacing a thing with another thing that can of course
break too if there's a bug (hardware or software). But they're so core things
that we need to trust anyway and we need to get them right from sources without
relying on any testing anyways. So I simply don't see any advantage of using
queue_task other than making things slower and more complex. Debugging hardware
bugs isn't a good point either. Of course I see why you were doing that during
developement, and that's fine for local debugging ;)

For the runqueue_lock right way to not having to trust schedule as well is to
add it to the bust_spinlocks list.

Andrea

2000-12-27 12:11:10

by Andrew Morton

[permalink] [raw]
Subject: Re: [prepatch] 2.4 waitqueues

Andrea Arcangeli wrote:
>
>
> Not a big deal but still I'd prefer the CONFIG_SMP #ifdef though, it looks even
> more obvious that it's a compile check and at least in your usage I cannot see
> a relevant readability advantage. And my own feeling is not having to rely on
> more things to produce the wanted asm when there's no relevant advantage, but
> if Linus likes it I won't object further.

Oh sob, what have you done? My beautiful patch is full of ifdefs!

> > It's not just open-coded printks - it's oopses. If you take an oops or a
> > BUG or a panic within wake_up() or _anywhere_ with the runqueue_lock held,
> > the machine deadlocks and you don't get any diagnostics. This is bad.
> > We really do need to get that wake_up() out of printk(). tq_timer may
> > not be the best way. Suggestions welcome.
>
> For kernel autodiagnostics we need to trust something. This something is
> everything that gets into the oops path. wake_up is one of those things. You
> want to replace wake_up with queue_task, why shouldn't queue_task break instead
> of wake_up? You're replacing a thing with another thing that can of course
> break too if there's a bug (hardware or software).

umm.. This use of queue_task almost _can't_ fail. That's the point.

When kumon@fujitsu's 8-way was taking an oops in schedule() it took
several days of mucking about to get the runqueue_lock deadlock out
of the way. In fact we never got a decent backtrace because of it.

> But they're so core things
> that we need to trust anyway and we need to get them right from sources without
> relying on any testing anyways. So I simply don't see any advantage of using
> queue_task other than making things slower and more complex.

It's actually faster if you're doing more than one printk
per jiffy.

> For the runqueue_lock right way to not having to trust schedule as well is to
> add it to the bust_spinlocks list.

Yes. Several weeks ago I did put up a patch which was designed to avoid
all the remaining oops-deadlocks. Amongst other things it did this:

if (!oops_in_progress)
wake_up_interruptible(&log_wait);

I'll resuscitate that patch. Using this approach we can still get infinite
recursion with WAITQUEUE_DEBUG enabled, but I guess we can live with that.

Anyway, here's the revised patch. Unless Linus wants SMP_KERNEL, I think
we're done with this.



--- linux-2.4.0-test13pre4-ac2/include/linux/sched.h Fri Dec 22 16:00:26 2000
+++ linux-akpm/include/linux/sched.h Wed Dec 27 22:03:16 2000
@@ -545,7 +545,7 @@
extern void FASTCALL(interruptible_sleep_on(wait_queue_head_t *q));
extern long FASTCALL(interruptible_sleep_on_timeout(wait_queue_head_t *q,
signed long timeout));
-extern void FASTCALL(wake_up_process(struct task_struct * tsk));
+extern int FASTCALL(wake_up_process(struct task_struct * tsk));

#define wake_up(x) __wake_up((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,WQ_FLAG_EXCLUSIVE)
#define wake_up_all(x) __wake_up((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,0)
--- linux-2.4.0-test13pre4-ac2/include/linux/wait.h Tue Nov 21 20:11:21 2000
+++ linux-akpm/include/linux/wait.h Wed Dec 27 22:30:50 2000
@@ -19,30 +19,10 @@
#include <asm/processor.h>

/*
- * Temporary debugging help until all code is converted to the new
- * waitqueue usage.
+ * Debug control. Slow but useful.
*/
#define WAITQUEUE_DEBUG 0

-#if WAITQUEUE_DEBUG
-extern int printk(const char *fmt, ...);
-#define WQ_BUG() do { \
- printk("wq bug, forcing oops.\n"); \
- BUG(); \
-} while (0)
-
-#define CHECK_MAGIC(x) if (x != (long)&(x)) \
- { printk("bad magic %lx (should be %lx), ", (long)x, (long)&(x)); WQ_BUG(); }
-
-#define CHECK_MAGIC_WQHEAD(x) do { \
- if (x->__magic != (long)&(x->__magic)) { \
- printk("bad magic %lx (should be %lx, creator %lx), ", \
- x->__magic, (long)&(x->__magic), x->__creator); \
- WQ_BUG(); \
- } \
-} while (0)
-#endif
-
struct __wait_queue {
unsigned int flags;
#define WQ_FLAG_EXCLUSIVE 0x01
@@ -99,24 +79,70 @@
};
typedef struct __wait_queue_head wait_queue_head_t;

+
+/*
+ * Debugging macros. We eschew `do { } while (0)' because gcc can generate
+ * spurious .aligns.
+ */
+#if WAITQUEUE_DEBUG
+#define WQ_BUG() BUG()
+#define CHECK_MAGIC(x) \
+ do { \
+ if ((x) != (long)&(x)) { \
+ printk("bad magic %lx (should be %lx), ", \
+ (long)x, (long)&(x)); \
+ WQ_BUG(); \
+ } \
+ } while (0)
+#define CHECK_MAGIC_WQHEAD(x) \
+ do { \
+ if ((x)->__magic != (long)&((x)->__magic)) { \
+ printk("bad magic %lx (should be %lx, creator %lx), ", \
+ (x)->__magic, (long)&((x)->__magic), (x)->__creator); \
+ WQ_BUG(); \
+ } \
+ } while (0)
+#define WQ_CHECK_LIST_HEAD(list) \
+ do { \
+ if (!list->next || !list->prev) \
+ WQ_BUG(); \
+ } while(0)
+#define WQ_NOTE_WAKER(tsk) \
+ do { \
+ tsk->__waker = (long)__builtin_return_address(0); \
+ } while (0)
+#else
+#define WQ_BUG()
+#define CHECK_MAGIC(x)
+#define CHECK_MAGIC_WQHEAD(x)
+#define WQ_CHECK_LIST_HEAD(list)
+#define WQ_NOTE_WAKER(tsk)
+#endif
+
+/*
+ * Macros for declaration and initialisaton of the datatypes
+ */
+
#if WAITQUEUE_DEBUG
-# define __WAITQUEUE_DEBUG_INIT(name) \
- , (long)&(name).__magic, 0
-# define __WAITQUEUE_HEAD_DEBUG_INIT(name) \
- , (long)&(name).__magic, (long)&(name).__magic
+# define __WAITQUEUE_DEBUG_INIT(name) (long)&(name).__magic, 0
+# define __WAITQUEUE_HEAD_DEBUG_INIT(name) (long)&(name).__magic, (long)&(name).__magic
#else
# define __WAITQUEUE_DEBUG_INIT(name)
# define __WAITQUEUE_HEAD_DEBUG_INIT(name)
#endif

-#define __WAITQUEUE_INITIALIZER(name,task) \
- { 0x0, task, { NULL, NULL } __WAITQUEUE_DEBUG_INIT(name)}
-#define DECLARE_WAITQUEUE(name,task) \
- wait_queue_t name = __WAITQUEUE_INITIALIZER(name,task)
-
-#define __WAIT_QUEUE_HEAD_INITIALIZER(name) \
-{ WAITQUEUE_RW_LOCK_UNLOCKED, { &(name).task_list, &(name).task_list } \
- __WAITQUEUE_HEAD_DEBUG_INIT(name)}
+#define __WAITQUEUE_INITIALIZER(name, tsk) { \
+ task: tsk, \
+ task_list: { NULL, NULL }, \
+ __WAITQUEUE_DEBUG_INIT(name)}
+
+#define DECLARE_WAITQUEUE(name, tsk) \
+ wait_queue_t name = __WAITQUEUE_INITIALIZER(name, tsk)
+
+#define __WAIT_QUEUE_HEAD_INITIALIZER(name) { \
+ lock: WAITQUEUE_RW_LOCK_UNLOCKED, \
+ task_list: { &(name).task_list, &(name).task_list }, \
+ __WAITQUEUE_HEAD_DEBUG_INIT(name)}

#define DECLARE_WAIT_QUEUE_HEAD(name) \
wait_queue_head_t name = __WAIT_QUEUE_HEAD_INITIALIZER(name)
@@ -135,8 +161,7 @@
#endif
}

-static inline void init_waitqueue_entry(wait_queue_t *q,
- struct task_struct *p)
+static inline void init_waitqueue_entry(wait_queue_t *q, struct task_struct *p)
{
#if WAITQUEUE_DEBUG
if (!q || !p)
--- linux-2.4.0-test13pre4-ac2/kernel/sched.c Tue Dec 12 19:24:23 2000
+++ linux-akpm/kernel/sched.c Wed Dec 27 16:33:13 2000
@@ -326,9 +326,10 @@
* "current->state = TASK_RUNNING" to mark yourself runnable
* without the overhead of this.
*/
-inline void wake_up_process(struct task_struct * p)
+static inline int try_to_wake_up(struct task_struct * p, int synchronous)
{
unsigned long flags;
+ int success = 0;

/*
* We want the common case fall through straight, thus the goto.
@@ -338,25 +339,17 @@
if (task_on_runqueue(p))
goto out;
add_to_runqueue(p);
- reschedule_idle(p);
+ if (!synchronous)
+ reschedule_idle(p);
+ success = 1;
out:
spin_unlock_irqrestore(&runqueue_lock, flags);
+ return success;
}

-static inline void wake_up_process_synchronous(struct task_struct * p)
+inline int wake_up_process(struct task_struct * p)
{
- unsigned long flags;
-
- /*
- * We want the common case fall through straight, thus the goto.
- */
- spin_lock_irqsave(&runqueue_lock, flags);
- p->state = TASK_RUNNING;
- if (task_on_runqueue(p))
- goto out;
- add_to_runqueue(p);
-out:
- spin_unlock_irqrestore(&runqueue_lock, flags);
+ return try_to_wake_up(p, 0);
}

static void process_timeout(unsigned long __data)
@@ -689,76 +682,88 @@
return;
}

+/*
+ * The core wakeup function. Non-exclusive wakeups just wake everything up.
+ * If it's an exclusive wakeup then we wake all the non-exclusive tasks
+ * and one exclusive task.
+ * If called from interrupt context we wake the least-recently queued exclusive task
+ * which wants to run on the current CPU.
+ * If not called from interrupt context we simply wake the least-recently queued
+ * exclusive task.
+ * There are circumstances in which we can try to wake a task which has already
+ * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns zero
+ * in this (rare) case, and we handle it by rescanning the exclusive tasks and
+ * trying to wake *someone*.
+ */
static inline void __wake_up_common (wait_queue_head_t *q, unsigned int mode,
unsigned int wq_mode, const int sync)
{
- struct list_head *tmp, *head;
- struct task_struct *p, *best_exclusive;
+ struct list_head *curr_sleeper, *head;
+ struct task_struct *p;
unsigned long flags;
- int best_cpu, irq;
-
+#ifdef CONFIG_SMP
+ struct list_head *first_nonaffine_exclusive;
+ int best_cpu, do_affine;
+#endif
if (!q)
goto out;

+#ifdef CONFIG_SMP
best_cpu = smp_processor_id();
- irq = in_interrupt();
- best_exclusive = NULL;
- wq_write_lock_irqsave(&q->lock, flags);
-
-#if WAITQUEUE_DEBUG
- CHECK_MAGIC_WQHEAD(q);
+ do_affine = in_interrupt();
+ first_nonaffine_exclusive = NULL;
#endif
-
+ wq_read_lock_irqsave(&q->lock, flags);
+ CHECK_MAGIC_WQHEAD(q);
head = &q->task_list;
-#if WAITQUEUE_DEBUG
- if (!head->next || !head->prev)
- WQ_BUG();
+ WQ_CHECK_LIST_HEAD(head);
+ curr_sleeper = head->next;
+#ifdef CONFIG_SMP
+retry:
#endif
- tmp = head->next;
- while (tmp != head) {
- unsigned int state;
- wait_queue_t *curr = list_entry(tmp, wait_queue_t, task_list);
+ while (curr_sleeper != head) {
+ wait_queue_t *curr = list_entry(curr_sleeper, wait_queue_t, task_list);

- tmp = tmp->next;
-
-#if WAITQUEUE_DEBUG
CHECK_MAGIC(curr->__magic);
-#endif
p = curr->task;
- state = p->state;
- if (state & mode) {
-#if WAITQUEUE_DEBUG
- curr->__waker = (long)__builtin_return_address(0);
+ if (p->state & mode) {
+ WQ_NOTE_WAKER(curr);
+
+#ifdef CONFIG_SMP
+ if (do_affine && p->processor != best_cpu &&
+ (curr->flags & wq_mode & WQ_FLAG_EXCLUSIVE)) {
+ if (first_nonaffine_exclusive == NULL)
+ first_nonaffine_exclusive = curr_sleeper;
+ }
+ else
#endif
- /*
- * If waking up from an interrupt context then
- * prefer processes which are affine to this
- * CPU.
- */
- if (irq && (curr->flags & wq_mode & WQ_FLAG_EXCLUSIVE)) {
- if (!best_exclusive)
- best_exclusive = p;
- if (p->processor == best_cpu) {
- best_exclusive = p;
- break;
+ {
+ if (try_to_wake_up(p, sync)) {
+ if (curr->flags & wq_mode & WQ_FLAG_EXCLUSIVE)
+ goto woke_one;
}
- } else {
- if (sync)
- wake_up_process_synchronous(p);
- else
- wake_up_process(p);
- if (curr->flags & wq_mode & WQ_FLAG_EXCLUSIVE)
- break;
}
}
+ curr_sleeper = curr_sleeper->next;
}
- if (best_exclusive) {
- if (sync)
- wake_up_process_synchronous(best_exclusive);
- else
- wake_up_process(best_exclusive);
+
+#ifdef CONFIG_SMP
+ if (first_nonaffine_exclusive) {
+ /*
+ * If we get here, there were exclusive sleepers on the queue, but we didn't
+ * wake any up. We've already tried to wake all the sleepers who are affine
+ * to this CPU and we failed. So we now try _all_ the exclusive sleepers.
+ * We start with the least-recently-queued non-affine task. It's almost certainly
+ * not on the runqueue, so we'll terminate the above loop on the first pass.
+ */
+ do_affine = 0;
+ curr_sleeper = first_nonaffine_exclusive;
+ first_nonaffine_exclusive = NULL;
+ goto retry;
}
- wq_write_unlock_irqrestore(&q->lock, flags);
+#endif
+woke_one:
+ wq_read_unlock_irqrestore(&q->lock, flags);
out:
return;
}

2000-12-27 17:07:13

by Andrea Arcangeli

[permalink] [raw]
Subject: Re: [prepatch] 2.4 waitqueues

On Wed, Dec 27, 2000 at 10:45:29PM +1100, Andrew Morton wrote:
> Andrea Arcangeli wrote:
> >
> >
> > Not a big deal but still I'd prefer the CONFIG_SMP #ifdef though, it looks even
> > more obvious that it's a compile check and at least in your usage I cannot see
> > a relevant readability advantage. And my own feeling is not having to rely on
> > more things to produce the wanted asm when there's no relevant advantage, but
> > if Linus likes it I won't object further.
>
> Oh sob, what have you done? My beautiful patch is full of ifdefs!

I was talking only about the CONFIG_SMP thing here. BTW, for other things you
could use the compiler too (instead of the preprocessor) if you really
wanted to.

The new way does:

if (SMP_KERNEL && ...)
xxx

instead of:

#ifdef CONFIG_SMP
if (...)
xxx
#endif

I don't see a big advantage in the first version (and it may waste stack). I
could see advantages if the SMP_KERNEL would be in the middle of a very complex
check, but the above usage wasn't the case.

> umm.. This use of queue_task almost _can't_ fail. That's the point.

It can if the hardware fails. If hardware doesn't fail, also wake_up almost
can't fail in the same way of queue_task.

> When kumon@fujitsu's 8-way was taking an oops in schedule() it took
> several days of mucking about to get the runqueue_lock deadlock out
> of the way. In fact we never got a decent backtrace because of it.

The runqueue_lock is completely unrelated to doing the wakeup in timer_bh.
runqueue_lock simply needs to be added to the bust_spinlocks() list so
we can see oops from schedule() in SMP.

> It's actually faster if you're doing more than one printk
> per jiffy.

Of course you can skip some wakeup with multiple printk at high frequency but
that has also the side effect that it will overflow the log buffer more easily
and that's not good either.

> > For the runqueue_lock right way to not having to trust schedule as well is to
> > add it to the bust_spinlocks list.
>
> Yes. Several weeks ago I did put up a patch which was designed to avoid
> all the remaining oops-deadlocks. Amongst other things it did this:
>
> if (!oops_in_progress)
> wake_up_interruptible(&log_wait);
>
> I'll resuscitate that patch. Using this approach we can still get infinite
> recursion with WAITQUEUE_DEBUG enabled, but I guess we can live with that.

As said I think if wake_up can oops, queue_task can oops as well, so I don't
see a real advantage in the !oops_in_progress.

> Anyway, here's the revised patch. Unless Linus wants SMP_KERNEL, I think
> we're done with this.

Thanks, I'll check it in some more time but so far it looks very good after a
short look ;)).

Andrea

2000-12-27 22:03:03

by George Anzinger

[permalink] [raw]
Subject: Re: [prepatch] 2.4 waitqueues

Andrew Morton wrote:
>
> It's been quiet around here lately...
>
> This is a rework of the 2.4 wakeup code based on the discussions Andrea
> and I had last week. There were two basic problems:
>
> - If two tasks are on a waitqueue in exclusive mode and one gets
> woken, it will put itself back into TASK_[UN]INTERRUPTIBLE state for a
> few instructions and can soak up another wakeup which should have
> gone to the other task.
>
> - If a task is on two waitqueues and two CPUs simultaneously run a
> wakeup, one on each waitqueue, they can both try to wake the same
> task which may be a lost wakeup, depending upon how the waitqueue
> users are coded.
>
> The first problem is the most serious. The second is kinda baroque...
>
> The approach taken by this patch is the one which Andrea is proposing
> for 2.2: if a task was already on the runqueue, continue scanning for
> another exclusive task to wake up.
>
> It ended up getting complicated because of the "find a process affine
> to this CPU" thing. Plus I did go slightly berzerk, but I believe the
> result is pretty good.
>
> - wake_up_process() now returns a success value if it managed to move
> something to the runqueue.
>
> Tidied up the code here a bit as well.
>
> wake_up_process_synchronous() is no more.
>
> - Got rid of all the debugging ifdefs - these have been folded into
> wait.h
>
> - Removed all the USE_RW_WAIT_QUEUE_SPINLOCK code and just used
> spinlocks.
>
> The read_lock option was pretty questionable anyway. It hasn't had
> the widespread testing and, umm, the kernel is using wq_write_lock
> *everywhere* anyway, so enabling USE_RW_WAIT_QUEUE_SPINLOCK wouldn't
> change anything, except for using a more expensive spinlock!
>
> So it's gone.
>
> - Introduces a kernel-wide macro `SMP_KERNEL'. This is designed to
> be used as a `compiled ifdef' in place of `#ifdef CONFIG_SMP'. There
> are a few examples in __wake_up_common().
>
> People shouldn't go wild with this, because gcc's dead code
> elimination isn't perfect. But it's nice for little things.
>
> - This patch's _use_ of SMP_KERNEL in __wake_up_common is fairly
> significant. There was quite a lot of code in that function which
> was an unnecessary burden for UP systems. All gone now.
>
> - This patch shrinks sched.o by 100 bytes (SMP) and 300 bytes (UP).
> Note that try_to_wake_up() is now only expanded in a single place
> in __wake_up_common(). It has a large footprint.
>
> - I have finally had enough of printk() deadlocking or going
> infinitely mutually recursive on me so printk()'s wake_up(log_wait)
> call has been moved into a tq_timer callback.
>
> - SLEEP_ON_VAR, SLEEP_ON_HEAD and SLEEP_ON_TAIL have been changed. I
> see no valid reason why these functions were, effectively, doing
> this:
>
> spin_lock_irqsave(lock, flags);
> spin_unlock(lock);
> schedule();
> spin_lock(lock);
> spin_unlock_irqrestore(lock, flags);
>
> What's the point in saving the interrupt status in `flags'? If the
> caller _wants_ interrupt status preserved then the caller is buggy,
> because schedule() enables interrupts. 2.2 does the same thing.
>
> So this has been changed to:
>
> spin_lock_irq(lock);
> spin_unlock(lock);
> schedule();
> spin_lock(lock);
> spin_unlock_irq(lock);
>
> Or did I miss something?
>

Um, well, here is a consideration. For preemption work it is desirable
to not burden the xxx_lock_irq(y) code with preemption locks, as the irq
effectively does this for far less cost. The problem is code like this
that does not pair the xxx_lock_irq(y) with a xxx_unlock_irq(y). The
worst offenders are bits of code that do something like:

spin_lock_irq(y);
:
sti();
:
spin_unlock(y);

(Yes, this does happen!)

I suspect that most of this has good reason behind it, but for the
preemptive effort it would be nice to introduce a set of macros like:

xxxx_unlock_noirq() which would acknowledge that the lock used irq but
the unlock is not. And for the above case:

xxx_lock_noirq() which would acknowledge that the irq "lock" is already
held.

Oh, and my reading of sched.c seems to indicate that interrupts are on
at schedule() return (or at least, related to the task being switch out
and not the new task) so the final spin_unlock_irq(lock) above should
not be changing the interrupt state. (This argument is lost in the
driver issue that Andrea points out, of course.)

Or did I miss something?

George

2000-12-28 19:17:39

by Linus Torvalds

[permalink] [raw]
Subject: Re: [prepatch] 2.4 waitqueues



On Wed, 27 Dec 2000, Andrew Morton wrote:
>
> - Introduces a kernel-wide macro `SMP_KERNEL'. This is designed to
> be used as a `compiled ifdef' in place of `#ifdef CONFIG_SMP'. There
> are a few examples in __wake_up_common().

Please don't do this, it screws up the config option autodetection in
"make checkconfig", and also while gcc is reasonably good at deleting dead
code, gcc has absolutely no clue at all about deleting dead variables, and
will leave the stack slots around giving bigger stack usage and worse
cache behaviour.

(The gcc stack slot problem is a generic gcc problem - your approach just
tends to make it worse).

If you want to do this locally somewhere, that's ok, but keep it local.

> - SLEEP_ON_VAR, SLEEP_ON_HEAD and SLEEP_ON_TAIL have been changed. I
> see no valid reason why these functions were, effectively, doing
> this:
>
> spin_lock_irqsave(lock, flags);
> spin_unlock(lock);
> schedule();
> spin_lock(lock);
> spin_unlock_irqrestore(lock, flags);
>
> What's the point in saving the interrupt status in `flags'? If the
> caller _wants_ interrupt status preserved then the caller is buggy,
> because schedule() enables interrupts. 2.2 does the same thing.
>
> So this has been changed to:
>
> spin_lock_irq(lock);
> spin_unlock(lock);
> schedule();
> spin_lock(lock);
> spin_unlock_irq(lock);
>
> Or did I miss something?

I'm a bit nervous about changing the old compatibility cruft, but the
above is probably ok.

Anyway, I'd like you to get rid of the global SMP_KERNEL thing (turning it
into a local one if you want to for this case), _and_ I'd like to see this
patch with the wait-queue spinlock _outside_ the __common_wakeup() path.

Why? Those semaphores will eventually want to re-use the wait-queue
spinlock as a per-semaphore spinlock, and they would need to call
__common_wakeup() with the spinlock held to do so. So let's get the
infrastructure in place.

Linus


2000-12-30 13:08:38

by Andrew Morton

[permalink] [raw]
Subject: Re: [prepatch] 2.4 waitqueues

Linus Torvalds wrote:
>
> On Wed, 27 Dec 2000, Andrew Morton wrote:
> >
> > - Introduces a kernel-wide macro `SMP_KERNEL'. This is designed to
> > be used as a `compiled ifdef' in place of `#ifdef CONFIG_SMP'. There
> > are a few examples in __wake_up_common().
>
> Please don't do this,

OK.

> > So this has been changed to:
> >
> > spin_lock_irq(lock);
> > spin_unlock(lock);
> > schedule();
> > spin_lock(lock);
> > spin_unlock_irq(lock);
> >
> > Or did I miss something?
>
> I'm a bit nervous about changing the old compatibility cruft, but the
> above is probably ok.

Andrea had the following reason to not make this change:

" Because old drivers could be doing ugly stuff like this:
"
" cli()
" for (;;) {
" if (!resource_available)
" sleep_on(&wait_for_resource)
" else
" break
" }
" ...
" sti()
"
" If you don't save and restore flags the second sleep_on could be entered with
" irq enabled and the wakeup from irq could happen before registering in the
" waitqueue causing a lost wakeup.
"

If anyone _is_ doing this, then they're scheduling with the global_irq_lock
held, which doesn't sound good. Anyway, it's probably best not to fiddle with
this stuff today.

> Anyway, I'd like you to get rid of the global SMP_KERNEL thing (turning it
> into a local one if you want to for this case), _and_ I'd like to see this
> patch with the wait-queue spinlock _outside_ the __common_wakeup() path.
>
> Why? Those semaphores will eventually want to re-use the wait-queue
> spinlock as a per-semaphore spinlock, and they would need to call
> __common_wakeup() with the spinlock held to do so. So let's get the
> infrastructure in place
>

OK, did that.

__wake_up() now looks like this:

void __wake_up(wait_queue_head_t *q, unsigned int mode, unsigned int wq_mode)
{
if (q) {
unsigned long flags;
wq_read_lock_irqsave(&q->lock, flags);
__wake_up_common(q, mode, wq_mode, 0);
wq_read_unlock_irqrestore(&q->lock, flags);
}
}

If we want to use the waitqueue lock to replace semaphore_lock
then I'd suggest we change __wake_up to look like:

void __wake_up(wait_queue_head_t *q, unsigned int mode, unsigned int wq_mode)
{
if (q) {
unsigned long flags;
if (!(wq_mode & WQ_FLAG_UNLOCKED))
wq_read_lock_irqsave(&q->lock, flags);
__wake_up_common(q, mode, wq_mode, 0);
if (!(wq_mode & WQ_FLAG_UNLOCKED))
wq_read_unlock_irqrestore(&q->lock, flags);
}
}

So we don't have to expand another instantiation of __wake_up_common. I always
find this tradeoff hard to make up my mind about...

The USE_RW_WAIT_QUEUE_SPINLOCK option is still there, so we _could_
turn on rwlocks for waitqueues in the future. I have tested the
rwlock option and it appears to work. It wasn't safe with the
old (current) __wake_up() implementation, but I think it is safe with
this patch.

rwlocks are a little more scalable here, and will probably allow us to leave
local interrupts enabled while running __wake_up().

BUT! If we want to throw away semaphore_lock (good move) and then
enable rwlocks in the waitqueues, that means that __down() will
have to do a write_lock of the per-waitqueue lock, so it can
call __add_wait_queue_exclusive() and __remove_wait_queue_exclusive().
We'd need to do this anyway, so the waitqueue locking protects the
semaphore's internals.

We should also remove the extra wake_up() in __down(), which
is going to hurt someone's brain :)


Here's the patch, tested on x86 UP and SMP. The only substantive change
since last time is hoisting the `wq_mode & WQ_FLAG_EXCLUSIVE' outside the
browsing loop and then commenting it out!


--- linux-2.4.0-test13-pre6/include/linux/sched.h Sat Dec 30 22:19:58 2000
+++ linux-akpm/include/linux/sched.h Sat Dec 30 22:54:21 2000
@@ -545,7 +545,7 @@
extern void FASTCALL(interruptible_sleep_on(wait_queue_head_t *q));
extern long FASTCALL(interruptible_sleep_on_timeout(wait_queue_head_t *q,
signed long timeout));
-extern void FASTCALL(wake_up_process(struct task_struct * tsk));
+extern int FASTCALL(wake_up_process(struct task_struct * tsk));

#define wake_up(x) __wake_up((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,WQ_FLAG_EXCLUSIVE)
#define wake_up_all(x) __wake_up((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,0)
--- linux-2.4.0-test13-pre6/include/linux/wait.h Sat Dec 30 22:19:58 2000
+++ linux-akpm/include/linux/wait.h Sat Dec 30 22:54:21 2000
@@ -19,30 +19,10 @@
#include <asm/processor.h>

/*
- * Temporary debugging help until all code is converted to the new
- * waitqueue usage.
+ * Debug control. Slow but useful.
*/
#define WAITQUEUE_DEBUG 0

-#if WAITQUEUE_DEBUG
-extern int printk(const char *fmt, ...);
-#define WQ_BUG() do { \
- printk("wq bug, forcing oops.\n"); \
- BUG(); \
-} while (0)
-
-#define CHECK_MAGIC(x) if (x != (long)&(x)) \
- { printk("bad magic %lx (should be %lx), ", (long)x, (long)&(x)); WQ_BUG(); }
-
-#define CHECK_MAGIC_WQHEAD(x) do { \
- if (x->__magic != (long)&(x->__magic)) { \
- printk("bad magic %lx (should be %lx, creator %lx), ", \
- x->__magic, (long)&(x->__magic), x->__creator); \
- WQ_BUG(); \
- } \
-} while (0)
-#endif
-
struct __wait_queue {
unsigned int flags;
#define WQ_FLAG_EXCLUSIVE 0x01
@@ -99,24 +79,70 @@
};
typedef struct __wait_queue_head wait_queue_head_t;

+
+/*
+ * Debugging macros. We eschew `do { } while (0)' because gcc can generate
+ * spurious .aligns.
+ */
+#if WAITQUEUE_DEBUG
+#define WQ_BUG() BUG()
+#define CHECK_MAGIC(x) \
+ do { \
+ if ((x) != (long)&(x)) { \
+ printk("bad magic %lx (should be %lx), ", \
+ (long)x, (long)&(x)); \
+ WQ_BUG(); \
+ } \
+ } while (0)
+#define CHECK_MAGIC_WQHEAD(x) \
+ do { \
+ if ((x)->__magic != (long)&((x)->__magic)) { \
+ printk("bad magic %lx (should be %lx, creator %lx), ", \
+ (x)->__magic, (long)&((x)->__magic), (x)->__creator); \
+ WQ_BUG(); \
+ } \
+ } while (0)
+#define WQ_CHECK_LIST_HEAD(list) \
+ do { \
+ if (!list->next || !list->prev) \
+ WQ_BUG(); \
+ } while(0)
+#define WQ_NOTE_WAKER(tsk) \
+ do { \
+ tsk->__waker = (long)__builtin_return_address(0); \
+ } while (0)
+#else
+#define WQ_BUG()
+#define CHECK_MAGIC(x)
+#define CHECK_MAGIC_WQHEAD(x)
+#define WQ_CHECK_LIST_HEAD(list)
+#define WQ_NOTE_WAKER(tsk)
+#endif
+
+/*
+ * Macros for declaration and initialisaton of the datatypes
+ */
+
#if WAITQUEUE_DEBUG
-# define __WAITQUEUE_DEBUG_INIT(name) \
- , (long)&(name).__magic, 0
-# define __WAITQUEUE_HEAD_DEBUG_INIT(name) \
- , (long)&(name).__magic, (long)&(name).__magic
+# define __WAITQUEUE_DEBUG_INIT(name) (long)&(name).__magic, 0
+# define __WAITQUEUE_HEAD_DEBUG_INIT(name) (long)&(name).__magic, (long)&(name).__magic
#else
# define __WAITQUEUE_DEBUG_INIT(name)
# define __WAITQUEUE_HEAD_DEBUG_INIT(name)
#endif

-#define __WAITQUEUE_INITIALIZER(name,task) \
- { 0x0, task, { NULL, NULL } __WAITQUEUE_DEBUG_INIT(name)}
-#define DECLARE_WAITQUEUE(name,task) \
- wait_queue_t name = __WAITQUEUE_INITIALIZER(name,task)
-
-#define __WAIT_QUEUE_HEAD_INITIALIZER(name) \
-{ WAITQUEUE_RW_LOCK_UNLOCKED, { &(name).task_list, &(name).task_list } \
- __WAITQUEUE_HEAD_DEBUG_INIT(name)}
+#define __WAITQUEUE_INITIALIZER(name, tsk) { \
+ task: tsk, \
+ task_list: { NULL, NULL }, \
+ __WAITQUEUE_DEBUG_INIT(name)}
+
+#define DECLARE_WAITQUEUE(name, tsk) \
+ wait_queue_t name = __WAITQUEUE_INITIALIZER(name, tsk)
+
+#define __WAIT_QUEUE_HEAD_INITIALIZER(name) { \
+ lock: WAITQUEUE_RW_LOCK_UNLOCKED, \
+ task_list: { &(name).task_list, &(name).task_list }, \
+ __WAITQUEUE_HEAD_DEBUG_INIT(name)}

#define DECLARE_WAIT_QUEUE_HEAD(name) \
wait_queue_head_t name = __WAIT_QUEUE_HEAD_INITIALIZER(name)
@@ -135,8 +161,7 @@
#endif
}

-static inline void init_waitqueue_entry(wait_queue_t *q,
- struct task_struct *p)
+static inline void init_waitqueue_entry(wait_queue_t *q, struct task_struct *p)
{
#if WAITQUEUE_DEBUG
if (!q || !p)
--- linux-2.4.0-test13-pre6/kernel/sched.c Sat Dec 30 22:19:58 2000
+++ linux-akpm/kernel/sched.c Sat Dec 30 23:08:51 2000
@@ -326,9 +326,10 @@
* "current->state = TASK_RUNNING" to mark yourself runnable
* without the overhead of this.
*/
-inline void wake_up_process(struct task_struct * p)
+static inline int try_to_wake_up(struct task_struct * p, int synchronous)
{
unsigned long flags;
+ int success = 0;

/*
* We want the common case fall through straight, thus the goto.
@@ -338,25 +339,17 @@
if (task_on_runqueue(p))
goto out;
add_to_runqueue(p);
- reschedule_idle(p);
+ if (!synchronous)
+ reschedule_idle(p);
+ success = 1;
out:
spin_unlock_irqrestore(&runqueue_lock, flags);
+ return success;
}

-static inline void wake_up_process_synchronous(struct task_struct * p)
+inline int wake_up_process(struct task_struct * p)
{
- unsigned long flags;
-
- /*
- * We want the common case fall through straight, thus the goto.
- */
- spin_lock_irqsave(&runqueue_lock, flags);
- p->state = TASK_RUNNING;
- if (task_on_runqueue(p))
- goto out;
- add_to_runqueue(p);
-out:
- spin_unlock_irqrestore(&runqueue_lock, flags);
+ return try_to_wake_up(p, 0);
}

static void process_timeout(unsigned long __data)
@@ -689,88 +682,101 @@
return;
}

+/*
+ * The core wakeup function. Non-exclusive wakeups just wake everything up.
+ * If it's an exclusive wakeup then we wake all the non-exclusive tasks
+ * and one exclusive task.
+ * If called from interrupt context we wake the least-recently queued exclusive task
+ * which wants to run on the current CPU.
+ * If not called from interrupt context we simply wake the least-recently queued
+ * exclusive task.
+ * There are circumstances in which we can try to wake a task which has already
+ * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns zero
+ * in this (rare) case, and we handle it by rescanning the exclusive tasks and
+ * trying to wake *someone*.
+ */
static inline void __wake_up_common (wait_queue_head_t *q, unsigned int mode,
unsigned int wq_mode, const int sync)
{
- struct list_head *tmp, *head;
- struct task_struct *p, *best_exclusive;
- unsigned long flags;
- int best_cpu, irq;
-
- if (!q)
- goto out;
-
- best_cpu = smp_processor_id();
- irq = in_interrupt();
- best_exclusive = NULL;
- wq_write_lock_irqsave(&q->lock, flags);
-
-#if WAITQUEUE_DEBUG
- CHECK_MAGIC_WQHEAD(q);
+ struct list_head *curr_sleeper, *head;
+ struct task_struct *p;
+#ifdef CONFIG_SMP
+ struct list_head *first_nonaffine_exclusive = NULL;
+ int best_cpu = smp_processor_id();
+ int do_affine = in_interrupt();
#endif

+ CHECK_MAGIC_WQHEAD(q);
head = &q->task_list;
-#if WAITQUEUE_DEBUG
- if (!head->next || !head->prev)
- WQ_BUG();
+ WQ_CHECK_LIST_HEAD(head);
+ curr_sleeper = head->next;
+ /* Enable the below stmt if/when other bits are defined in wait_queue_head.flags */
+/* wq_mode &= WQ_FLAG_EXCLUSIVE; */
+#ifdef CONFIG_SMP
+retry:
#endif
- tmp = head->next;
- while (tmp != head) {
- unsigned int state;
- wait_queue_t *curr = list_entry(tmp, wait_queue_t, task_list);
-
- tmp = tmp->next;
+ while (curr_sleeper != head) {
+ wait_queue_t *curr = list_entry(curr_sleeper, wait_queue_t, task_list);

-#if WAITQUEUE_DEBUG
CHECK_MAGIC(curr->__magic);
-#endif
p = curr->task;
- state = p->state;
- if (state & mode) {
-#if WAITQUEUE_DEBUG
- curr->__waker = (long)__builtin_return_address(0);
+ if (p->state & mode) {
+ WQ_NOTE_WAKER(curr);
+
+#ifdef CONFIG_SMP
+ if (do_affine && p->processor != best_cpu && (curr->flags & wq_mode)) {
+ if (first_nonaffine_exclusive == NULL)
+ first_nonaffine_exclusive = curr_sleeper;
+ }
+ else
#endif
- /*
- * If waking up from an interrupt context then
- * prefer processes which are affine to this
- * CPU.
- */
- if (irq && (curr->flags & wq_mode & WQ_FLAG_EXCLUSIVE)) {
- if (!best_exclusive)
- best_exclusive = p;
- if (p->processor == best_cpu) {
- best_exclusive = p;
- break;
+ {
+ if (try_to_wake_up(p, sync)) {
+ if (curr->flags & wq_mode)
+ goto out;
}
- } else {
- if (sync)
- wake_up_process_synchronous(p);
- else
- wake_up_process(p);
- if (curr->flags & wq_mode & WQ_FLAG_EXCLUSIVE)
- break;
}
}
+ curr_sleeper = curr_sleeper->next;
}
- if (best_exclusive) {
- if (sync)
- wake_up_process_synchronous(best_exclusive);
- else
- wake_up_process(best_exclusive);
+
+#ifdef CONFIG_SMP
+ if (first_nonaffine_exclusive) {
+ /*
+ * If we get here, there were exclusive sleepers on the queue, but we didn't
+ * wake any up. We've already tried to wake all the sleepers who are affine
+ * to this CPU and we failed. So we now try _all_ the exclusive sleepers.
+ * We start with the least-recently-queued non-affine task. It's almost certainly
+ * not on the runqueue, so we'll terminate the above loop on the first pass.
+ */
+ do_affine = 0;
+ curr_sleeper = first_nonaffine_exclusive;
+ first_nonaffine_exclusive = NULL;
+ goto retry;
}
- wq_write_unlock_irqrestore(&q->lock, flags);
+#endif
out:
return;
}

void __wake_up(wait_queue_head_t *q, unsigned int mode, unsigned int wq_mode)
{
- __wake_up_common(q, mode, wq_mode, 0);
+ if (q) {
+ unsigned long flags;
+ wq_read_lock_irqsave(&q->lock, flags);
+ __wake_up_common(q, mode, wq_mode, 0);
+ wq_read_unlock_irqrestore(&q->lock, flags);
+ }
}

void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, unsigned int wq_mode)
{
- __wake_up_common(q, mode, wq_mode, 1);
+ if (q) {
+ unsigned long flags;
+ wq_read_lock_irqsave(&q->lock, flags);
+ __wake_up_common(q, mode, wq_mode, 1);
+ wq_read_unlock_irqrestore(&q->lock, flags);
+ }
}

#define SLEEP_ON_VAR \