2006-12-17 22:34:40

by Oleg Nesterov

Subject: [PATCH, RFC] reimplement flush_workqueue()

Remove ->remove_sequence, ->insert_sequence, and ->work_done from
struct cpu_workqueue_struct. To implement flush_workqueue() we can
queue a barrier work on each CPU and wait for its completion.

We don't need to worry about a CPU going down while we are sleeping
on the completion. take_over_work() will move this work to another
CPU, and the handler will wake us up eventually.

NOTE: I removed the 'int cpu' parameter; flush_workqueue() now locks/unlocks
workqueue_mutex unconditionally. It may be restored, but I don't think it
makes much sense: we take the mutex for a very short time, and the code
becomes simpler.

Signed-off-by: Oleg Nesterov <[email protected]>

workqueue.c | 87 ++++++++++++++++++++----------------------------------------
1 files changed, 29 insertions(+), 58 deletions(-)

--- mm-6.20-rc1/kernel/workqueue.c~1_flush_q 2006-12-18 00:56:58.000000000 +0300
+++ mm-6.20-rc1/kernel/workqueue.c 2006-12-18 01:13:22.000000000 +0300
@@ -36,23 +36,13 @@
/*
* The per-CPU workqueue (if single thread, we always use the first
* possible cpu).
- *
- * The sequence counters are for flush_scheduled_work(). It wants to wait
- * until all currently-scheduled works are completed, but it doesn't
- * want to be livelocked by new, incoming ones. So it waits until
- * remove_sequence is >= the insert_sequence which pertained when
- * flush_scheduled_work() was called.
*/
struct cpu_workqueue_struct {

spinlock_t lock;

- long remove_sequence; /* Least-recently added (next to run) */
- long insert_sequence; /* Next to add */
-
struct list_head worklist;
wait_queue_head_t more_work;
- wait_queue_head_t work_done;

struct workqueue_struct *wq;
struct task_struct *thread;
@@ -138,8 +128,6 @@ static int __run_work(struct cpu_workque
f(work);

spin_lock_irqsave(&cwq->lock, flags);
- cwq->remove_sequence++;
- wake_up(&cwq->work_done);
ret = 1;
}
spin_unlock_irqrestore(&cwq->lock, flags);
@@ -187,7 +175,6 @@ static void __queue_work(struct cpu_work
spin_lock_irqsave(&cwq->lock, flags);
set_wq_data(work, cwq);
list_add_tail(&work->entry, &cwq->worklist);
- cwq->insert_sequence++;
wake_up(&cwq->more_work);
spin_unlock_irqrestore(&cwq->lock, flags);
}
@@ -338,8 +325,6 @@ static void run_workqueue(struct cpu_wor
}

spin_lock_irqsave(&cwq->lock, flags);
- cwq->remove_sequence++;
- wake_up(&cwq->work_done);
}
cwq->run_depth--;
spin_unlock_irqrestore(&cwq->lock, flags);
@@ -394,45 +379,39 @@ static int worker_thread(void *__cwq)
return 0;
}

-/*
- * If cpu == -1 it's a single-threaded workqueue and the caller does not hold
- * workqueue_mutex
- */
-static void flush_cpu_workqueue(struct cpu_workqueue_struct *cwq, int cpu)
+struct wq_barrier {
+ struct work_struct work;
+ struct completion done;
+};
+
+static void wq_barrier_func(struct work_struct *work)
+{
+ struct wq_barrier *barr = container_of(work, struct wq_barrier, work);
+ complete(&barr->done);
+}
+
+static void flush_cpu_workqueue(struct cpu_workqueue_struct *cwq)
{
if (cwq->thread == current) {
/*
* Probably keventd trying to flush its own queue. So simply run
* it by hand rather than deadlocking.
*/
- if (cpu != -1)
- mutex_unlock(&workqueue_mutex);
+ mutex_unlock(&workqueue_mutex);
run_workqueue(cwq);
- if (cpu != -1)
- mutex_lock(&workqueue_mutex);
+ mutex_lock(&workqueue_mutex);
} else {
- DEFINE_WAIT(wait);
- long sequence_needed;
+ struct wq_barrier barr = {
+ .work = __WORK_INITIALIZER(barr.work, wq_barrier_func),
+ .done = COMPLETION_INITIALIZER_ONSTACK(barr.done),
+ };

- spin_lock_irq(&cwq->lock);
- sequence_needed = cwq->insert_sequence;
+ __set_bit(WORK_STRUCT_PENDING, &barr.work.management);
+ __queue_work(cwq, &barr.work);

- while (sequence_needed - cwq->remove_sequence > 0) {
- prepare_to_wait(&cwq->work_done, &wait,
- TASK_UNINTERRUPTIBLE);
- spin_unlock_irq(&cwq->lock);
- if (cpu != -1)
- mutex_unlock(&workqueue_mutex);
- schedule();
- if (cpu != -1) {
- mutex_lock(&workqueue_mutex);
- if (!cpu_online(cpu))
- return; /* oops, CPU unplugged */
- }
- spin_lock_irq(&cwq->lock);
- }
- finish_wait(&cwq->work_done, &wait);
- spin_unlock_irq(&cwq->lock);
+ mutex_unlock(&workqueue_mutex);
+ wait_for_completion(&barr.done);
+ mutex_lock(&workqueue_mutex);
}
}

@@ -443,30 +422,25 @@ static void flush_cpu_workqueue(struct c
* Forces execution of the workqueue and blocks until its completion.
* This is typically used in driver shutdown handlers.
*
- * This function will sample each workqueue's current insert_sequence number and
- * will sleep until the head sequence is greater than or equal to that. This
- * means that we sleep until all works which were queued on entry have been
- * handled, but we are not livelocked by new incoming ones.
+ * We sleep until all works which were queued on entry have been handled,
+ * but we are not livelocked by new incoming ones.
*
* This function used to run the workqueues itself. Now we just wait for the
* helper threads to do it.
*/
void fastcall flush_workqueue(struct workqueue_struct *wq)
{
- might_sleep();
-
+ mutex_lock(&workqueue_mutex);
if (is_single_threaded(wq)) {
/* Always use first cpu's area. */
- flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, singlethread_cpu),
- -1);
+ flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, singlethread_cpu));
} else {
int cpu;

- mutex_lock(&workqueue_mutex);
for_each_online_cpu(cpu)
- flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, cpu), cpu);
- mutex_unlock(&workqueue_mutex);
+ flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, cpu));
}
+ mutex_unlock(&workqueue_mutex);
}
EXPORT_SYMBOL_GPL(flush_workqueue);

@@ -479,12 +453,9 @@ static struct task_struct *create_workqu
spin_lock_init(&cwq->lock);
cwq->wq = wq;
cwq->thread = NULL;
- cwq->insert_sequence = 0;
- cwq->remove_sequence = 0;
cwq->freezeable = freezeable;
INIT_LIST_HEAD(&cwq->worklist);
init_waitqueue_head(&cwq->more_work);
- init_waitqueue_head(&cwq->work_done);

if (is_single_threaded(wq))
p = kthread_create(worker_thread, cwq, "%s", wq->name);


2006-12-18 03:10:15

by Linus Torvalds

Subject: Re: [PATCH, RFC] reimplement flush_workqueue()



On Mon, 18 Dec 2006, Oleg Nesterov wrote:
>
> Remove ->remove_sequence, ->insert_sequence, and ->work_done from
> struct cpu_workqueue_struct. To implement flush_workqueue() we can
> queue a barrier work on each CPU and wait for its completion.

Looks fine to me. It's after -rc1 so I won't apply it, but it looks like a
nice cleanup.

Linus

2006-12-19 00:27:36

by Andrew Morton

Subject: Re: [PATCH, RFC] reimplement flush_workqueue()

On Mon, 18 Dec 2006 01:34:16 +0300
Oleg Nesterov <[email protected]> wrote:

> Remove ->remove_sequence, ->insert_sequence, and ->work_done from
> struct cpu_workqueue_struct. To implement flush_workqueue() we can
> queue a barrier work on each CPU and wait for its completion.

Seems sensible. I seem to recall considering doing it that way when I
initially implemented flush_workqueue(), but I don't recall why I didn't do
this. hmm.

> We don't need to worry about a CPU going down while we are sleeping
> on the completion. take_over_work() will move this work to another
> CPU, and the handler will wake us up eventually.
>
> NOTE: I removed the 'int cpu' parameter; flush_workqueue() now locks/unlocks
> workqueue_mutex unconditionally. It may be restored, but I don't think it
> makes much sense: we take the mutex for a very short time, and the code
> becomes simpler.
>

Taking workqueue_mutex unconditionally in flush_workqueue() means
that we'll deadlock if a single-threaded workqueue callback handler calls
flush_workqueue().

It's an idiotic thing to do, but I think I spotted a site last week which
does this. scsi? Not sure..

2006-12-19 00:43:32

by Oleg Nesterov

Subject: Re: [PATCH, RFC] reimplement flush_workqueue()

On 12/18, Andrew Morton wrote:
>
> On Mon, 18 Dec 2006 01:34:16 +0300
> Oleg Nesterov <[email protected]> wrote:
>
> > NOTE: I removed the 'int cpu' parameter; flush_workqueue() now locks/unlocks
> > workqueue_mutex unconditionally. It may be restored, but I don't think it
> > makes much sense: we take the mutex for a very short time, and the code
> > becomes simpler.
> >
>
> Taking workqueue_mutex unconditionally in flush_workqueue() means
> that we'll deadlock if a single-threaded workqueue callback handler calls
> flush_workqueue().

Well. But flush_workqueue() drops workqueue_mutex before going to sleep?

flush_workqueue(single_threaded_wq);
...
mutex_lock(&workqueue_mutex);
...
mutex_unlock(&workqueue_mutex);
wait_for_completion();
handler runs,
calls flush_workqueue(),
workqueue_mutex is free

> It's an idiotic thing to do, but I think I spotted a site last week which
> does this. scsi? Not sure..

Ok, it is time to sleep. I'll look tomorrow and re-send if flush_cpu_workqueue()
really needs a "bool workqueue_mutex_is_locked" parameter.

Oleg.

2006-12-19 01:00:53

by Andrew Morton

Subject: Re: [PATCH, RFC] reimplement flush_workqueue()

On Tue, 19 Dec 2006 03:43:19 +0300
Oleg Nesterov <[email protected]> wrote:

> On 12/18, Andrew Morton wrote:
> >
> > On Mon, 18 Dec 2006 01:34:16 +0300
> > Oleg Nesterov <[email protected]> wrote:
> >
> > > NOTE: I removed the 'int cpu' parameter; flush_workqueue() now locks/unlocks
> > > workqueue_mutex unconditionally. It may be restored, but I don't think it
> > > makes much sense: we take the mutex for a very short time, and the code
> > > becomes simpler.
> > >
> >
> > Taking workqueue_mutex unconditionally in flush_workqueue() means
> > that we'll deadlock if a single-threaded workqueue callback handler calls
> > flush_workqueue().
>
> Well. But flush_workqueue() drops workqueue_mutex before going to sleep?
>
> flush_workqueue(single_threaded_wq);
> ...
> mutex_lock(&workqueue_mutex);
> ...
> mutex_unlock(&workqueue_mutex);
> wait_for_completion();
> handler runs,
> calls flush_workqueue(),
> workqueue_mutex is free

Oh. OK. In that case we can switch to preempt_disable() for the
cpu-hotplug holdoff. Sometime.

> > It's an idiotic thing to do, but I think I spotted a site last week which
> > does this. scsi? Not sure..
>
> Ok, it is time to sleep. I'll look tomorrow and re-send if flush_cpu_workqueue()
> really needs a "bool workqueue_mutex_is_locked" parameter.

Hopefully not.

2007-01-04 11:33:15

by Srivatsa Vaddagiri

Subject: Re: [PATCH, RFC] reimplement flush_workqueue()

On Tue, Dec 19, 2006 at 03:43:19AM +0300, Oleg Nesterov wrote:
> > Taking workqueue_mutex unconditionally in flush_workqueue() means
> > that we'll deadlock if a single-threaded workqueue callback handler calls
> > flush_workqueue().
>
> Well. But flush_workqueue() drops workqueue_mutex before going to sleep?

... and acquires it again after being woken from sleep. That can be a problem,
which will lead to the deadlock described here:

http://lkml.org/lkml/2006/12/7/374

In brief:

keventd thread                          hotplug thread
--------------                          --------------

run_workqueue()
   |
work_fn()
   |
flush_workqueue()
   |
flush_cpu_workqueue
   |                                    cpu_down()
mutex_unlock(wq_mutex);                    |
(above opens window for hotplug)        mutex_lock(wq_mutex);
   |                                    /* bring down cpu */
wait_for_completion();                  notifier(CPU_DEAD, ..)
   |                                      workqueue_cpu_callback
   |                                        cleanup_workqueue_thread
   |                                          kthread_stop()
   |
   |
mutex_lock(wq_mutex); <- Can deadlock


The kthread_stop() will wait for the keventd() thread to exit, but keventd()
is blocked on mutex_lock(wq_mutex), leading to a deadlock.

>
> flush_workqueue(single_threaded_wq);
> ...
> mutex_lock(&workqueue_mutex);
> ...
> mutex_unlock(&workqueue_mutex);
> wait_for_completion();
> handler runs,
> calls flush_workqueue(),
> workqueue_mutex is free

--
Regards,
vatsa

2007-01-04 12:02:26

by Srivatsa Vaddagiri

Subject: Re: [PATCH, RFC] reimplement flush_workqueue()

On Mon, Dec 18, 2006 at 01:34:16AM +0300, Oleg Nesterov wrote:
> void fastcall flush_workqueue(struct workqueue_struct *wq)
> {
> - might_sleep();
> -
> + mutex_lock(&workqueue_mutex);
> if (is_single_threaded(wq)) {
> /* Always use first cpu's area. */
> - flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, singlethread_cpu),
> - -1);
> + flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, singlethread_cpu));
> } else {
> int cpu;
>
> - mutex_lock(&workqueue_mutex);
> for_each_online_cpu(cpu)


Can compiler optimizations lead to cpu_online_map being cached in a register
while running this loop? AFAICS cpu_online_map is not declared to be
volatile. If it can be cached, then we have the danger of invoking
flush_cpu_workqueue() on a dead cpu (because flush_cpu_workqueue drops
workqueue_mutex, cpu hp events can change cpu_online_map while we are in
flush_cpu_workqueue).

> - flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, cpu), cpu);
> - mutex_unlock(&workqueue_mutex);
> + flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, cpu));


--
Regards,
vatsa

2007-01-04 14:28:56

by Oleg Nesterov

Subject: Re: [PATCH, RFC] reimplement flush_workqueue()

On 01/04, Srivatsa Vaddagiri wrote:
>
> On Tue, Dec 19, 2006 at 03:43:19AM +0300, Oleg Nesterov wrote:
> > > Taking workqueue_mutex unconditionally in flush_workqueue() means
> > > that we'll deadlock if a single-threaded workqueue callback handler calls
> > > flush_workqueue().
> >
> > Well. But flush_workqueue() drops workqueue_mutex before going to sleep?
>
> ... and acquires it again after being woken from sleep. That can be a problem,
> which will lead to the deadlock described here:
>
> http://lkml.org/lkml/2006/12/7/374
>
> In brief:
>
> keventd thread                          hotplug thread
> --------------                          --------------
>
> run_workqueue()
>    |
> work_fn()
>    |
> flush_workqueue()
>    |
> flush_cpu_workqueue
>    |                                    cpu_down()
> mutex_unlock(wq_mutex);                    |
> (above opens window for hotplug)        mutex_lock(wq_mutex);
>    |                                    /* bring down cpu */
> wait_for_completion();                  notifier(CPU_DEAD, ..)
>    |                                      workqueue_cpu_callback
>    |                                        cleanup_workqueue_thread
>    |                                          kthread_stop()
>    |
>    |
> mutex_lock(wq_mutex); <- Can deadlock
>
>
> The kthread_stop() will wait for the keventd() thread to exit, but keventd()
> is blocked on mutex_lock(wq_mutex), leading to a deadlock.

Thanks, I need to think about this.

However I am not sure I fully understand the problem.

First, this deadlock was not introduced by recent changes (including "single
threaded flush_workqueue() takes workqueue_mutex too"), yes?

Also, it seems to me we have a much simpler deadlock scenario.

events/0 runs run_workqueue(), and work->func() sleeps or is preempted. CPU 0
dies, and the keventd thread migrates to another CPU. CPU_DEAD calls kthread_stop()
under workqueue_mutex and waits until the keventd thread exits. Now, if this work
(or another work pending on cwq->worklist) takes workqueue_mutex (for example,
does flush_workqueue) we have a deadlock.
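
In timeline form (a sketch of the scenario just described, using the same
notation as the diagram above):

events/0 thread                         hotplug thread
---------------                         --------------

run_workqueue()
   |
work->func() sleeps / is preempted
   |                                    cpu_down(0)
   |                                    mutex_lock(wq_mutex);
   |                                    /* CPU 0 dies, events/0 migrates */
   |                                    notifier(CPU_DEAD, ..)
   |                                      kthread_stop() <- waits for events/0
work->func() resumes,
calls flush_workqueue()
   |
mutex_lock(wq_mutex); <- deadlock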

No?

Oleg.

2007-01-04 14:38:04

by Oleg Nesterov

Subject: Re: [PATCH, RFC] reimplement flush_workqueue()

On 01/04, Srivatsa Vaddagiri wrote:
>
> On Mon, Dec 18, 2006 at 01:34:16AM +0300, Oleg Nesterov wrote:
> > void fastcall flush_workqueue(struct workqueue_struct *wq)
> > {
> > - might_sleep();
> > -
> > + mutex_lock(&workqueue_mutex);
> > if (is_single_threaded(wq)) {
> > /* Always use first cpu's area. */
> > - flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, singlethread_cpu),
> > - -1);
> > + flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, singlethread_cpu));
> > } else {
> > int cpu;
> >
> > - mutex_lock(&workqueue_mutex);
> > for_each_online_cpu(cpu)
>
>
> Can compiler optimizations lead to cpu_online_map being cached in a register
> while running this loop? AFAICS cpu_online_map is not declared to be
> volatile.

But it is not const either,

> If it can be cached,

I believe this would be a compiler bug. Let's take a simpler example,

	while (!condition)
		schedule();

What if the compiler cached the value of the global 'condition'? schedule()
is an external function call, so the compiler must assume it may modify
'condition' and has to re-read it on each iteration.

> then we have the danger of invoking
> flush_cpu_workqueue() on a dead cpu (because flush_cpu_workqueue drops
> workqueue_mutex, cpu hp events can change cpu_online_map while we are in
> flush_cpu_workqueue).

Oleg.

2007-01-04 15:57:12

by Srivatsa Vaddagiri

Subject: Re: [PATCH, RFC] reimplement flush_workqueue()

On Thu, Jan 04, 2007 at 05:29:36PM +0300, Oleg Nesterov wrote:
> Thanks, I need to think about this.
>
> However I am not sure I fully understand the problem.
>
> First, this deadlock was not introduced by recent changes (including "single
> threaded flush_workqueue() takes workqueue_mutex too"), yes?

AFAIK this deadlock originated from Andrew's patch here:

http://lkml.org/lkml/2006/12/7/231

(Yes, your patches didn't introduce this. I was just reiterating my
earlier point that the workqueue code is broken of late wrt cpu hotplug).

> Also, it seems to me we have a much simpler deadlock scenario.
>
> events/0 runs run_workqueue(), and work->func() sleeps or is preempted. CPU 0
> dies, and the keventd thread migrates to another CPU. CPU_DEAD calls kthread_stop()
> under workqueue_mutex and waits until the keventd thread exits. Now, if this work
> (or another work pending on cwq->worklist) takes workqueue_mutex (for example,
> does flush_workqueue) we have a deadlock.
>
> No?

Yes, the above scenario will also cause a deadlock.

I suppose one could avoid the deadlock by having a 'workqueue_mutex_held'
flag and avoid taking the mutex under some conditions, but IMHO a
neater solution is to provide a cpu-hotplug lock which works under
all these corner cases. One such proposal was made here:

http://lkml.org/lkml/2006/10/26/65

--
Regards,
vatsa

2007-01-04 16:30:51

by Oleg Nesterov

Subject: Re: [PATCH, RFC] reimplement flush_workqueue()

On 01/04, Srivatsa Vaddagiri wrote:
>
> On Thu, Jan 04, 2007 at 05:29:36PM +0300, Oleg Nesterov wrote:
> > Thanks, I need to think about this.
> >
> > However I am not sure I fully understand the problem.
> >
> > First, this deadlock was not introduced by recent changes (including "single
> > threaded flush_workqueue() takes workqueue_mutex too"), yes?
>
> AFAIK this deadlock originated from Andrew's patch here:
>
> http://lkml.org/lkml/2006/12/7/231

I don't think so. The core problem is not that we are doing unlock/sleep/lock
with this patch. The thing is: work->func() can't take wq_mutex (and thus use
flush_work/workqueue) because it is possible that CPU_DEAD holds this mutex
and waits for us to complete(kthread_stop_info). I believe this bug is old.

> (Yes, your patches didn't introduce this. I was just reiterating my
> earlier point that the workqueue code is broken of late wrt cpu hotplug).
>
> > Also, it seems to me we have a much simpler deadlock scenario.
> >
> > events/0 runs run_workqueue(), and work->func() sleeps or is preempted. CPU 0
> > dies, and the keventd thread migrates to another CPU. CPU_DEAD calls kthread_stop()
> > under workqueue_mutex and waits until the keventd thread exits. Now, if this work
> > (or another work pending on cwq->worklist) takes workqueue_mutex (for example,
> > does flush_workqueue) we have a deadlock.
> >
> > No?
>
> Yes, the above scenario will also cause a deadlock.

Ok, thanks for the acknowledgement.

> I supposed one could avoid the deadlock by having a 'workqueue_mutex_held'
> flag and avoid taking the mutex set under some conditions,

I am thinking about the same thing right now. Probably we can do something like this:


int xxx_lock(void)
{
	for (;;) {
		if (mutex_trylock(&wq_mutex))
			return 1;

		/* the owner of wq_mutex sleeps, we can proceed */
		if (kthread_should_stop())
			return 0;

		cpu_relax();
	}
}

void xxx_unlock(int yesno)
{
	if (yesno)
		mutex_unlock(&wq_mutex);
}

and then do

	locked = xxx_lock();
	...
	xxx_unlock(locked);

in flush_xxx() instead of plain lock/unlock.

Yes, ugly. I'll try to do something else over the weekend.

> but IMHO a
> neater solution is to provide a cpu-hotplug lock which works under
> all these corner cases. One such proposal was made here:
>
> http://lkml.org/lkml/2006/10/26/65

I'll take a look later, thanks.

Oleg.

2007-01-04 16:57:58

by Srivatsa Vaddagiri

Subject: Re: [PATCH, RFC] reimplement flush_workqueue()

On Thu, Jan 04, 2007 at 07:31:39PM +0300, Oleg Nesterov wrote:
> > AFAIK this deadlock originated from Andrew's patch here:
> >
> > http://lkml.org/lkml/2006/12/7/231
>
> I don't think so. The core problem is not that we are doing unlock/sleep/lock
> with this patch. The thing is: work->func() can't take wq_mutex (and thus use
> flush_work/workqueue) because it is possible that CPU_DEAD holds this mutex
> and waits for us to complete(kthread_stop_info). I believe this bug is old.

Yes, it looks like this bug is quite old. Thanks for correcting me.

--
Regards,
vatsa

2007-01-04 17:21:20

by Andrew Morton

Subject: Re: [PATCH, RFC] reimplement flush_workqueue()

On Thu, 4 Jan 2007 17:29:36 +0300
Oleg Nesterov <[email protected]> wrote:

> > In brief:
> >
> > keventd thread                          hotplug thread
> > --------------                          --------------
> >
> > run_workqueue()
> >    |
> > work_fn()
> >    |
> > flush_workqueue()
> >    |
> > flush_cpu_workqueue
> >    |                                    cpu_down()
> > mutex_unlock(wq_mutex);                    |
> > (above opens window for hotplug)        mutex_lock(wq_mutex);
> >    |                                    /* bring down cpu */
> > wait_for_completion();                  notifier(CPU_DEAD, ..)
> >    |                                      workqueue_cpu_callback
> >    |                                        cleanup_workqueue_thread
> >    |                                          kthread_stop()
> >    |
> >    |
> > mutex_lock(wq_mutex); <- Can deadlock
> >
> >
> > The kthread_stop() will wait for the keventd() thread to exit, but keventd()
> > is blocked on mutex_lock(wq_mutex), leading to a deadlock.

This?


--- a/kernel/workqueue.c~flush_workqueue-use-preempt_disable-to-hold-off-cpu-hotplug
+++ a/kernel/workqueue.c
@@ -419,18 +419,22 @@ static void flush_cpu_workqueue(struct c
* Probably keventd trying to flush its own queue. So simply run
* it by hand rather than deadlocking.
*/
- mutex_unlock(&workqueue_mutex);
+ preempt_enable();
+ /*
+ * We can still touch *cwq here because we are keventd, and
+ * hot-unplug will be waiting us to exit.
+ */
run_workqueue(cwq);
- mutex_lock(&workqueue_mutex);
+ preempt_disable();
} else {
struct wq_barrier barr;

init_wq_barrier(&barr);
__queue_work(cwq, &barr.work);

- mutex_unlock(&workqueue_mutex);
+ preempt_enable(); /* Can no longer touch *cwq */
wait_for_completion(&barr.done);
- mutex_lock(&workqueue_mutex);
+ preempt_disable();
}
}

@@ -449,7 +453,7 @@ static void flush_cpu_workqueue(struct c
*/
void fastcall flush_workqueue(struct workqueue_struct *wq)
{
- mutex_lock(&workqueue_mutex);
+ preempt_disable(); /* CPU hotplug */
if (is_single_threaded(wq)) {
/* Always use first cpu's area. */
flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, singlethread_cpu));
@@ -459,7 +463,7 @@ void fastcall flush_workqueue(struct wor
for_each_online_cpu(cpu)
flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, cpu));
}
- mutex_unlock(&workqueue_mutex);
+ preempt_enable();
}
EXPORT_SYMBOL_GPL(flush_workqueue);

_

2007-01-04 18:08:36

by Oleg Nesterov

Subject: Re: [PATCH, RFC] reimplement flush_workqueue()

On 01/04, Andrew Morton wrote:
>
> On Thu, 4 Jan 2007 17:29:36 +0300
> Oleg Nesterov <[email protected]> wrote:
>
> > > In brief:
> > >
> > > keventd thread                          hotplug thread
> > > --------------                          --------------
> > >
> > > run_workqueue()
> > >    |
> > > work_fn()
> > >    |
> > > flush_workqueue()
> > >    |
> > > flush_cpu_workqueue
> > >    |                                    cpu_down()
> > > mutex_unlock(wq_mutex);                    |
> > > (above opens window for hotplug)        mutex_lock(wq_mutex);
> > >    |                                    /* bring down cpu */
> > > wait_for_completion();                  notifier(CPU_DEAD, ..)
> > >    |                                      workqueue_cpu_callback
> > >    |                                        cleanup_workqueue_thread
> > >    |                                          kthread_stop()
> > >    |
> > >    |
> > > mutex_lock(wq_mutex); <- Can deadlock
> > >
> > >
> > > The kthread_stop() will wait for the keventd() thread to exit, but keventd()
> > > is blocked on mutex_lock(wq_mutex), leading to a deadlock.
>
> This?
>
>
> --- a/kernel/workqueue.c~flush_workqueue-use-preempt_disable-to-hold-off-cpu-hotplug
> +++ a/kernel/workqueue.c
> @@ -419,18 +419,22 @@ static void flush_cpu_workqueue(struct c
> * Probably keventd trying to flush its own queue. So simply run
> * it by hand rather than deadlocking.
> */
> - mutex_unlock(&workqueue_mutex);
> + preempt_enable();

Ah, (looking at _cpu_down()->stop_machine()), so preempt_disable() not only "pins"
the current CPU, it blocks cpu_down(), yes ???

I guess this should work then. I'll try to re-check this code over the weekend.

Oleg.

2007-01-04 18:32:04

by Andrew Morton

Subject: Re: [PATCH, RFC] reimplement flush_workqueue()

On Thu, 4 Jan 2007 21:09:01 +0300
Oleg Nesterov <[email protected]> wrote:

> > --- a/kernel/workqueue.c~flush_workqueue-use-preempt_disable-to-hold-off-cpu-hotplug
> > +++ a/kernel/workqueue.c
> > @@ -419,18 +419,22 @@ static void flush_cpu_workqueue(struct c
> > * Probably keventd trying to flush its own queue. So simply run
> > * it by hand rather than deadlocking.
> > */
> > - mutex_unlock(&workqueue_mutex);
> > + preempt_enable();
>
> Ah, (looking at _cpu_down()->stop_machine()), so preempt_disable() not only "pins"
> the current CPU, it blocks cpu_down(), yes ???

yep.

But before we do much more of this we should have a wrapper. Umm

static inline void block_cpu_hotplug(void)
{
	preempt_disable();
}

and use that, so people can see why it's being used.

I spose I'll do that and convert this patch to use it.
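
For symmetry there would presumably be a matching unblock wrapper, so callers
can pair them; a minimal sketch of the idea (the unblock name is assumed, none
of this exists yet):

	static inline void unblock_cpu_hotplug(void)
	{
		preempt_enable();
	}

	/* usage: cpu_online_map is stable between the pair */
	block_cpu_hotplug();
	for_each_online_cpu(cpu)
		flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, cpu));
	unblock_cpu_hotplug();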

2007-01-05 08:56:49

by Srivatsa Vaddagiri

Subject: Re: [PATCH, RFC] reimplement flush_workqueue()

On Thu, Jan 04, 2007 at 09:18:50AM -0800, Andrew Morton wrote:
> This?

This can still lead to the problem spotted by Oleg here:

http://lkml.org/lkml/2006/12/30/37

and you would need a patch similar to the one he posted there.

> void fastcall flush_workqueue(struct workqueue_struct *wq)
> {
> - mutex_lock(&workqueue_mutex);
> + preempt_disable(); /* CPU hotplug */
> if (is_single_threaded(wq)) {
> /* Always use first cpu's area. */
> flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, singlethread_cpu));
> @@ -459,7 +463,7 @@ void fastcall flush_workqueue(struct wor
> for_each_online_cpu(cpu)
> flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, cpu));
> }
> - mutex_unlock(&workqueue_mutex);
> + preempt_enable();
> }
> EXPORT_SYMBOL_GPL(flush_workqueue);

--
Regards,
vatsa

2007-01-05 09:03:52

by Srivatsa Vaddagiri

Subject: Re: [PATCH, RFC] reimplement flush_workqueue()

On Thu, Jan 04, 2007 at 10:31:07AM -0800, Andrew Morton wrote:
> But before we do much more of this we should have a wrapper. Umm
>
> static inline void block_cpu_hotplug(void)
> {
> 	preempt_disable();
> }

Nack.

This will only block cpu_down, not cpu_up, and hence is a misnomer. I would be
wary of totally ignoring cpu_up events when writing hotplug-safe code.

--
Regards,
vatsa

2007-01-05 12:42:08

by Oleg Nesterov

Subject: Re: [PATCH, RFC] reimplement flush_workqueue()

On 01/05, Srivatsa Vaddagiri wrote:
>
> On Thu, Jan 04, 2007 at 09:18:50AM -0800, Andrew Morton wrote:
> > This?
>
> This can still lead to the problem spotted by Oleg here:
>
> http://lkml.org/lkml/2006/12/30/37
>
> and you would need a patch similar to the one he posted there.

preempt_disable() can't prevent cpu_up, but flush_workqueue() doesn't care
_unless_ cpu_down also happened in the meantime (and hence a fresh CPU may have
pending work_structs which were moved from a dead CPU).

So you are right, we still need the patch above, but I think we don't have
new problems with preempt_disable().

I might have missed your point though.

Oleg.

2007-01-05 14:06:40

by Oleg Nesterov

Subject: Re: [PATCH, RFC] reimplement flush_workqueue()

On 01/05, Srivatsa Vaddagiri wrote:
>
> On Thu, Jan 04, 2007 at 10:31:07AM -0800, Andrew Morton wrote:
> > But before we do much more of this we should have a wrapper. Umm
> >
> > static inline void block_cpu_hotplug(void)
> > {
> > 	preempt_disable();
> > }
>
> Nack.
>
> This will only block cpu_down, not cpu_up, and hence is a misnomer. I would be
> wary of totally ignoring cpu_up events when writing hotplug-safe code.

How about block_cpu_down()?

These cpu-hotplug races delayed the last workqueue patch I have in my queue.
flush_workqueue() misses an important optimization: we don't need to insert
a barrier and have an extra wake_up + wait_for_completion when cwq has no
pending works. But we need ->current_work (introduced in the next patch) to
implement this correctly.

I'll re-send the patch below later, when we finish with the bug you pointed
out, but it would be nice if you can take a look now.

Oleg.

--- mm-6.20-rc2/kernel/workqueue.c~4_speedup 2006-12-30 18:09:07.000000000 +0300
+++ mm-6.20-rc2/kernel/workqueue.c 2007-01-05 16:32:45.000000000 +0300
@@ -405,12 +405,15 @@ static void wq_barrier_func(struct work_
complete(&barr->done);
}

-static inline void init_wq_barrier(struct wq_barrier *barr)
+static void insert_wq_barrier(struct cpu_workqueue_struct *cwq,
+ struct wq_barrier *barr, int tail)
{
INIT_WORK(&barr->work, wq_barrier_func);
__set_bit(WORK_STRUCT_PENDING, work_data_bits(&barr->work));

init_completion(&barr->done);
+
+ insert_work(cwq, &barr->work, tail);
}

static void flush_cpu_workqueue(struct cpu_workqueue_struct *cwq)
@@ -425,13 +428,20 @@ static void flush_cpu_workqueue(struct c
mutex_lock(&workqueue_mutex);
} else {
struct wq_barrier barr;
+ int active = 0;

- init_wq_barrier(&barr);
- __queue_work(cwq, &barr.work);
+ spin_lock_irq(&cwq->lock);
+ if (!list_empty(&cwq->worklist) || cwq->current_work != NULL) {
+ insert_wq_barrier(cwq, &barr, 1);
+ active = 1;
+ }
+ spin_unlock_irq(&cwq->lock);

- mutex_unlock(&workqueue_mutex);
- wait_for_completion(&barr.done);
- mutex_lock(&workqueue_mutex);
+ if (active) {
+ mutex_unlock(&workqueue_mutex);
+ wait_for_completion(&barr.done);
+ mutex_lock(&workqueue_mutex);
+ }
}
}

@@ -478,8 +488,7 @@ static void wait_on_work(struct cpu_work

spin_lock_irq(&cwq->lock);
if (unlikely(cwq->current_work == work)) {
- init_wq_barrier(&barr);
- insert_work(cwq, &barr.work, 0);
+ insert_wq_barrier(cwq, &barr, 0);
running = 1;
}
spin_unlock_irq(&cwq->lock);

2007-01-06 15:10:16

by Oleg Nesterov

Subject: [PATCH] fix-flush_workqueue-vs-cpu_dead-race-update

(Srivatsa, Gautham, could you please verify my thinking?)

On top of fix-flush_workqueue-vs-cpu_dead-race.patch

hotplug_sequence is incremented under "case CPU_DEAD:". This was ok
before flush_workqueue() was changed to use preempt_disable() instead
of workqueue_mutex. However, preempt_disable() can't guarantee that there
is no CPU_DEAD event in progress (it is possible that flush_workqueue()
runs after STOPMACHINE_EXIT), so flush_workqueue() can miss a CPU_DEAD
event.

Increment hotplug_sequence earlier, under CPU_DOWN_PREPARE. We can't
miss the event: the task running flush_workqueue() will be re-scheduled
at least once before the CPU actually disappears from cpu_online_map.

We may have a false positive but this is very unlikely and only means
we will have one unneeded "goto again".
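
For context, the base patch (not shown here) has flush_workqueue() sample the
counter and retry; roughly (a sketch, reconstructed from the migrate_sequence
version that appears later in this thread):

	again:
		sequence = hotplug_sequence;

		for_each_online_cpu(cpu)
			flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, cpu));

		if (unlikely(sequence != hotplug_sequence))
			goto again;	/* a hotplug event raced with us, re-flush */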

Note: this patch depends on
handle-cpu_lock_acquire-and-cpu_lock_release-in-workqueue_cpu_callback
but only "textually".

Signed-off-by: Oleg Nesterov <[email protected]>

--- mm-6.20-rc3/kernel/workqueue.c~1_down 2007-01-06 16:15:59.000000000 +0300
+++ mm-6.20-rc3/kernel/workqueue.c 2007-01-06 17:52:10.000000000 +0300
@@ -886,12 +886,15 @@ static int __devinit workqueue_cpu_callb
}
break;

+ case CPU_DOWN_PREPARE:
+ hotplug_sequence++;
+ break;
+
case CPU_DEAD:
list_for_each_entry(wq, &workqueues, list)
cleanup_workqueue_thread(wq, hotcpu);
list_for_each_entry(wq, &workqueues, list)
take_over_work(wq, hotcpu);
- hotplug_sequence++;
break;

case CPU_LOCK_RELEASE:

2007-01-06 15:12:05

by Srivatsa Vaddagiri

Subject: Re: [PATCH, RFC] reimplement flush_workqueue()

On Fri, Jan 05, 2007 at 03:42:46PM +0300, Oleg Nesterov wrote:
> preempt_disable() can't prevent cpu_up, but flush_workqueue() doesn't care
> _unless_ cpu_down also happened in the meantime (and hence a fresh CPU may have
> pending work_structs which were moved from a dead CPU).

Yes, that was what I had in mind.

> So you are right, we still need the patch above, but I think we don't have
> new problems with preempt_disable().

Right, preempt_disable() hasn't added any new problem. It's just
revealing the same problem as before, by opening up a window for cpu
hotplug events to happen in the middle of flush_workqueue().

Ideally I would have liked a lock_cpu_hotplug() equivalent which blocks
all cpu hotplug events during the entire flush_workqueue(). In its
absence, I guess we just need to deal with all these ugly races ..

In summary, I think we need to go ahead with the preempt_disable() patch
in flush_workqueue() from Andrew and the race fix you posted here:

http://lkml.org/lkml/2006/12/30/37

--
Regards,
vatsa

2007-01-06 15:25:37

by Srivatsa Vaddagiri

Subject: Re: [PATCH, RFC] reimplement flush_workqueue()

On Fri, Jan 05, 2007 at 05:07:17PM +0300, Oleg Nesterov wrote:
> How about block_cpu_down()?

Maybe ..not sure

If we do introduce such a function, we may need to convert several
preempt_disable() calls that are already there (with the intent of blocking
cpu_down) to block_cpu_down() ..

> These cpu-hotplug races delayed the last workqueue patch I have in my queue.
> flush_workqueue() misses an important optimization: we don't need to insert
> a barrier and have an extra wake_up + wait_for_completion when cwq has no
> pending works. But we need ->current_work (introduced in the next patch) to
> implement this correctly.
>
> I'll re-send the patch below later, when we finish with the bug you pointed
> out, but it would be nice if you can take a look now.

The patch seems fine to me, though I don't see any cpu-hotplug-related
problem being exposed/solved in this patch.

--
Regards,
vatsa

2007-01-06 15:45:16

by Srivatsa Vaddagiri

Subject: Re: [PATCH] fix-flush_workqueue-vs-cpu_dead-race-update

On Sat, Jan 06, 2007 at 06:10:36PM +0300, Oleg Nesterov wrote:
> Increment hotplug_sequence earlier, under CPU_DOWN_PREPARE. We can't
> miss the event: the task running flush_workqueue() will be re-scheduled
> at least once before the CPU actually disappears from cpu_online_map.

Eww ..what happens if flush_workqueue() starts after CPU_DOWN_PREPARE?

CPU_DOWN_PREPARE(8)
hotplug_sequence++ = 10

flush_workqueue()
sequence = 10
flush cpus 1 ....7

CPU_DEAD(8)
take_over_work(8->1)

return not flushing dead cpu8 (=BUG)


??

--
Regards,
vatsa

2007-01-06 16:29:50

by Oleg Nesterov

Subject: Re: [PATCH] fix-flush_workqueue-vs-cpu_dead-race-update

On 01/06, Srivatsa Vaddagiri wrote:
>
> On Sat, Jan 06, 2007 at 06:10:36PM +0300, Oleg Nesterov wrote:
> > Increment hotplug_sequence earlier, under CPU_DOWN_PREPARE. We can't
> > miss the event: the task running flush_workqueue() will be re-scheduled
> > at least once before the CPU actually disappears from cpu_online_map.
>
> Eww ..what happens if flush_workqueue() starts after CPU_DOWN_PREPARE?
^^^^^
Stupid me. Thanks.

> CPU_DOWN_PREPARE(8)
> hotplug_sequence++ = 10
>
> flush_workqueue()
> sequence = 10
> flush cpus 1 ....7
>
> CPU_DEAD(8)
> take_over_work(8->1)
>
> return not flushing dead cpu8 (=BUG)

I'll try to do something else tomorrow. Do you see a simple solution?

The current usage of workqueue_mutex (I mean in the stable kernel) is broken
and deadlockable. We really need to change it.

Oleg.

2007-01-06 16:39:06

by Srivatsa Vaddagiri

Subject: Re: [PATCH] fix-flush_workqueue-vs-cpu_dead-race-update

On Sat, Jan 06, 2007 at 07:30:35PM +0300, Oleg Nesterov wrote:
> Stupid me. Thanks.
>
> I'll try to do something else tomorrow. Do you see a simple solution?

Sigh.. I don't see a simple solution, unless we have something like
lock_cpu_hotplug() ..

Andrew,
This workqueue problem has exposed a classic example of how
tough/miserable it can be to write hotplug safe code w/o something like
lock_cpu_hotplug() ..Are you still inclined towards banning it? :)

FYI, the lock_cpu_hotplug() rewrite proposed by Gautham at
http://lkml.org/lkml/2006/10/26/65 may still need refinement to avoid
all the kinds of deadlocks we have unearthed with the workqueue example. I
can review that design with Gautham if there is some interest to
revive lock_cpu_hotplug() ..

> The current usage of workqueue_mutex (I mean in the stable kernel) is broken
> and deadlockable. We really need to change it.

Yep ..

--
Regards,
vatsa

2007-01-06 17:33:20

by Oleg Nesterov

Subject: Re: [PATCH] fix-flush_workqueue-vs-cpu_dead-race-update

On 01/06, Srivatsa Vaddagiri wrote:
>
> On Sat, Jan 06, 2007 at 07:30:35PM +0300, Oleg Nesterov wrote:
> > Stupid me. Thanks.
> >
> > I'll try to do something else tomorrow. Do you see a simple solution?
>
> Sigh.. I don't see a simple solution, unless we have something like
> lock_cpu_hotplug() ..

I suspect this can't help either.

The problem is that flush_workqueue() may be called while a cpu hotplug event
is in progress and CPU_DEAD waits for kthread_stop(), so we have the same
deadlock if work->func() does flush_workqueue(). This means that Andrew's
change to use preempt_disable() is good and needed anyway.

I am starting to believe we need some more intrusive changes in workqueue.c.

Oleg.

2007-01-06 19:12:10

by Andrew Morton

Subject: Re: [PATCH] fix-flush_workqueue-vs-cpu_dead-race-update

On Sat, 6 Jan 2007 22:08:51 +0530
Srivatsa Vaddagiri <[email protected]> wrote:

> This workqueue problem has exposed a classic example of how
> tough/miserable it can be to write hotplug safe code w/o something like
> lock_cpu_hotplug() ..Are you still inclined towards banning it? :)

I don't ban stuff - I just advocate ;)

I would still prefer that we not try to invent a new magical lock,
but yes, the current approach is looking troublesome.

> FYI, the lock_cpu_hotplug() rewrite proposed by Gautham at
> http://lkml.org/lkml/2006/10/26/65 may still need refinement to avoid
> all the kinds of deadlocks we have unearthed with the workqueue example. I
> can review that design with Gautham if there is some interest to
> revive lock_cpu_hotplug() ..

Has anyone thought seriously about using the process freezer in the
cpu-down/cpu-up paths? That way we don't need to lock anything anywhere?

2007-01-06 19:17:22

by Ingo Molnar

Subject: Re: [PATCH] fix-flush_workqueue-vs-cpu_dead-race-update


* Andrew Morton <[email protected]> wrote:

> > FYI, the lock_cpu_hotplug() rewrite proposed by Gautham at
> > http://lkml.org/lkml/2006/10/26/65 may still need refinement to
> > avoid all the kinds of deadlocks we have unearthed with the workqueue
> > example. I can review that design with Gautham if there is some
> > interest to revive lock_cpu_hotplug() ..
>
> Has anyone thought seriously about using the process freezer in the
> cpu-down/cpu-up paths? That way we don't need to lock anything
> anywhere?

yes, yes, yes - let's please do that! The process freezer is already used
for suspend, for hibernate, and recently for kprobes - so its performance
and robustness are being relied on and verified from multiple angles. I
can see no reason why it couldn't be made really fast even on large
boxes, if the need arises. (but even the current one is fast enough for
any human-driven CPU hotplug stuff)

Ingo

2007-01-07 10:43:49

by Srivatsa Vaddagiri

Subject: Re: [PATCH] fix-flush_workqueue-vs-cpu_dead-race-update

On Sat, Jan 06, 2007 at 08:34:16PM +0300, Oleg Nesterov wrote:
> I suspect this can't help either.
>
> The problem is that flush_workqueue() may be called while a cpu hotplug event
> is in progress and CPU_DEAD waits for kthread_stop(), so we have the same
> deadlock if work->func() does flush_workqueue(). This means that Andrew's
> change to use preempt_disable() is good and needed anyway.

Well ..a lock_cpu_hotplug() in run_workqueue() and support for recursive
calls to lock_cpu_hotplug() by the same thread will avoid the problem
you mention. This will need changes to task_struct to track the
recursion depth. Alternatively, this can be supported w/o changes to
task_struct by 'biasing' readers over writers, as I believe Gautham's
patches [1] do.

1. http://lkml.org/lkml/2006/10/26/65

--
Regards,
vatsa



2007-01-07 11:00:22

by Srivatsa Vaddagiri

Subject: Re: [PATCH] fix-flush_workqueue-vs-cpu_dead-race-update

On Sat, Jan 06, 2007 at 11:11:17AM -0800, Andrew Morton wrote:
> Has anyone thought seriously about using the process freezer in the
> cpu-down/cpu-up paths? That way we don't need to lock anything anywhere?

How would this provide stable access to cpu_online_map in functions
that need to block while accessing it (as flush_workqueue requires)?

--
Regards,
vatsa

2007-01-07 12:55:12

by Oleg Nesterov

Subject: Re: [PATCH] fix-flush_workqueue-vs-cpu_dead-race-update

On 01/07, Srivatsa Vaddagiri wrote:
>
> On Sat, Jan 06, 2007 at 08:34:16PM +0300, Oleg Nesterov wrote:
> > I suspect this can't help either.
> >
> > The problem is that flush_workqueue() may be called while a cpu hotplug event
> > is in progress and CPU_DEAD waits for kthread_stop(), so we have the same
> > deadlock if work->func() does flush_workqueue(). This means that Andrew's
> > change to use preempt_disable() is good and needed anyway.
>
> Well ..a lock_cpu_hotplug() in run_workqueue() and support for recursive
> calls to lock_cpu_hotplug() by the same thread will avoid the problem
> you mention.

Srivatsa, I'm completely new to cpu-hotplug, so please correct me if I'm
wrong (in fact I _hope_ I am wrong) but as I see it, the hotplug/workqueue
interaction is broken by design; it can't be fixed by just changing the locking.

Once again. CPU dies, CPU_DEAD calls kthread_stop() and sleeps until
cwq->thread exits. To do so, this thread must at least complete the
currently running work->func().

work->func() calls flush_workqueue(WQ), which does lock_cpu_hotplug() or
_whatever_. Now the question: does it block?

if YES:
	This is what the stable tree does - deadlock.

if NOT:
	This is what we have with Andrew's s/mutex_lock/preempt_disable/
	patch - race or deadlock, we have a choice.

	Suppose that WQ has pending works on that dead CPU. Note that
	at this point this CPU is not present in cpu_online_map.
	This means that (without other changes) we have lost.

	- flush_workqueue(WQ) can't return until CPU_DEAD transfers
	  these works to some other CPU on the cpu_online_map.

	- CPU_DEAD can't do take_over_work() until flush_workqueue()
	  returns.

Andrew, Ingo, this also means that the freezer can't solve this particular
problem either (if I am right).

Thoughts?

Oleg.


2007-01-07 14:21:50

by Oleg Nesterov

Subject: Re: [PATCH] fix-flush_workqueue-vs-cpu_dead-race-update

On 01/07, Oleg Nesterov wrote:
>
> Thoughts?

How about:

CPU_DEAD does nothing. After __cpu_disable() cwq->thread runs on
all CPUs and becomes idle when it flushes cwq->worklist: nobody
will add work_struct on that list.

CPU_UP:
	if (!cwq->thread)
		create_workqueue_thread();
	else
		set_cpus_allowed(newcpu);

flush_workqueue:
	for_each_possible_cpu()		/* NOT online! */
		if (cwq->thread)
			flush_cpu_workqueue()

Oleg.

2007-01-07 14:41:38

by Oleg Nesterov

Subject: Re: [PATCH] fix-flush_workqueue-vs-cpu_dead-race-update

On 01/07, Oleg Nesterov wrote:
>
> How about:
>
> CPU_DEAD does nothing. After __cpu_disable() cwq->thread runs on
> all CPUs and becomes idle when it flushes cwq->worklist: nobody
> will add work_struct on that list.

Also, we can add a cpu_workqueue_struct->should_exit_after_flush flag, but
we have to be careful to serialize with destroy_workqueue/CPU_UP.

> CPU_UP:
> 	if (!cwq->thread)
> 		create_workqueue_thread();
> 	else
> 		set_cpus_allowed(newcpu);
>
> flush_workqueue:
> 	for_each_possible_cpu()		/* NOT online! */
> 		if (cwq->thread)
> 			flush_cpu_workqueue()

2007-01-07 16:23:06

by Srivatsa Vaddagiri

Subject: Re: [PATCH] fix-flush_workqueue-vs-cpu_dead-race-update

On Sun, Jan 07, 2007 at 03:56:03PM +0300, Oleg Nesterov wrote:
> Srivatsa, I'm completely new to cpu-hotplug, so please correct me if I'm
> wrong (in fact I _hope_ I am wrong) but as I see it, the hotplug/workqueue
> interaction is broken by design; it can't be fixed by just changing the locking.
>
> Once again. CPU dies, CPU_DEAD calls kthread_stop() and sleeps until
> cwq->thread exits. To do so, this thread must at least complete the
> currently running work->func().

If run_workqueue() takes lock_cpu_hotplug() successfully, then we shouldn't
even reach this point, as it will block writers (cpu_down/up) until it
completes.


run_workqueue()
---------------

try_again:
	rc = lock_cpu_hotplug_interruptible();

	if (rc && kthread_should_stop())
		return;

	if (rc != 0)
		goto try_again;

	/* cpu_down/up shouldn't happen now until we call unlock_cpu_hotplug */
	while (!list_empty(..))
		work->func();

	unlock_cpu_hotplug();


If work->func() calls something (say flush_workqueue()) which requires a
lock_cpu_hotplug() again, there are two ways to support it:

Method 1: Add a field, hotplug_lock_held, in task_struct

If current->hotplug_lock_held > 1, then lock_cpu_hotplug()
merely increments it and returns success. Its counterpart,
unlock_cpu_hotplug(), will decrement the count.

Easiest to implement. However, an additional field is required in
each task_struct, which may not be attractive for some.
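
A minimal sketch of Method 1, using the hotplug_lock_held counter described
above (cpu_hotplug_mutex is an assumed name; the writer side, cpu_down/up,
would take it directly):

	void lock_cpu_hotplug(void)
	{
		/* already held by this task: just record the recursion */
		if (current->hotplug_lock_held++ > 0)
			return;
		mutex_lock(&cpu_hotplug_mutex);
	}

	void unlock_cpu_hotplug(void)
	{
		/* still held at an outer nesting level */
		if (--current->hotplug_lock_held > 0)
			return;
		mutex_unlock(&cpu_hotplug_mutex);
	}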

Method 2: Bias readers over writers:

This method will support recursive calls to lock_cpu_hotplug()
by the same thread, w/o requiring a field in task_struct. To
accomplish this, readers are biased over writers, i.e.


reader1_lock(); <- success

writer1_lock(); <- blocks on reader1


reader2_lock(); <- success

A fair lock would have blocked reader2_lock() until
writer1_lock()/writer1_unlock() is complete, but since we are required to
support recursion w/o maintaining a task_struct field, we let reader2_lock()
succeed, even though it could be from a different thread.

> Andrew, Ingo, this also means that the freezer can't solve this particular
> problem either (if I am right).

The freezer won't give stable access to cpu_online_map either, as is typically
required in functions like flush_workqueue.

--
Regards,
vatsa

2007-01-07 16:43:59

by Srivatsa Vaddagiri

Subject: Re: [PATCH] fix-flush_workqueue-vs-cpu_dead-race-update

On Sun, Jan 07, 2007 at 05:22:46PM +0300, Oleg Nesterov wrote:
> On 01/07, Oleg Nesterov wrote:
> >
> > Thoughts?
>
> How about:
>
> CPU_DEAD does nothing. After __cpu_disable() cwq->thread runs on
> all CPUs and becomes idle when it flushes cwq->worklist: nobody
^^^

all except dead cpus that is.

> will add work_struct on that list.

If CPU_DEAD does nothing, then the dead cpu's workqueue list may be
non-empty. How will it be flushed, given that no thread can run on the
dead cpu?

We could consider CPU_DEAD moving over the work at least (and not killing
worker threads either). In that case, cwq->thread can flush its work;
however, it now requires serialization among worker threads, since more
than one worker thread can now be servicing the same CPU's workqueue
list (this defeats the very purpose of maintaining per-cpu threads to
avoid synchronization between them).

Finally, I am concerned about the (un)friendliness of this programming
model, where programmers are restricted to not having stable access to
cpu_online_map at all -and- are also required to code in non-obvious
terms. Granted that writing hotplug-safe code is non-trivial, but the
absence of "safe access to online_map" will make it more complicated.

--
Regards,
vatsa

2007-01-07 17:02:10

by Srivatsa Vaddagiri

Subject: Re: [PATCH] fix-flush_workqueue-vs-cpu_dead-race-update

On Sun, Jan 07, 2007 at 10:13:44PM +0530, Srivatsa Vaddagiri wrote:
> If CPU_DEAD does nothing, then the dead cpu's workqueue list may be
> non-empty. How will it be flushed, given that no thread can run on the
> dead cpu?
>
> We could consider CPU_DEAD moving over the work at least (and not killing
> worker threads either). In that case, cwq->thread can flush its work;
> however, it now requires serialization among worker threads, since more
> than one worker thread can now be servicing the same CPU's workqueue
> list (this defeats the very purpose of maintaining per-cpu threads to
> avoid synchronization between them).

I guess you could have cwq->thread flush only its cpu's workqueue by
running on another cpu, which will avoid the need to synchronize
between worker threads. I am not 100% sure if that breaks the workqueue
model in any way (since we could have two worker threads running on the
same CPU, but servicing different queues). Hopefully it doesn't.

However the concern expressed below remains ..

> Finally, I am concerned about the (un)friendliness of this programming
> model, where programmers are restricted to not having stable access to
> cpu_online_map at all -and- are also required to code in non-obvious
> terms. Granted that writing hotplug-safe code is non-trivial, but the
> absence of "safe access to online_map" will make it more complicated.

--
Regards,
vatsa

2007-01-07 17:08:56

by Oleg Nesterov

Subject: Re: [PATCH] fix-flush_workqueue-vs-cpu_dead-race-update

On 01/07, Srivatsa Vaddagiri wrote:
>
> On Sun, Jan 07, 2007 at 03:56:03PM +0300, Oleg Nesterov wrote:
> > Srivatsa, I'm completely new to cpu-hotplug, so please correct me if I'm
> > wrong (in fact I _hope_ I am wrong) but as I see it, the hotplug/workqueue
> > interaction is broken by design; it can't be fixed by just changing the locking.
> >
> > Once again. CPU dies, CPU_DEAD calls kthread_stop() and sleeps until
> > cwq->thread exits. To do so, this thread must at least complete the
> > currently running work->func().
>
> If run_workqueue() takes lock_cpu_hotplug() successfully, then we shouldn't
> even reach this point, as it will block writers (cpu_down/up) until it
> completes.
>
>
> run_workqueue()
> ---------------
>
> try_again:
> 	rc = lock_cpu_hotplug_interruptible();
>
> 	if (rc && kthread_should_stop())
> 		return;
>
> 	if (rc != 0)
> 		goto try_again;
>
> 	/* cpu_down/up shouldn't happen now until we call unlock_cpu_hotplug */
> 	while (!list_empty(..))
> 		work->func();

This means that every work->func() which may sleep delays cpu_down/up
unpredictably, which is not good. What about a work->func() which sleeps and
then re-queues itself? I guess we can solve this, but this is what I meant
by "other changes".

Also, lock_cpu_hotplug() should be per-cpu, otherwise we have a livelock.

Not that I am against lock_cpu_hotplug (I can't judge), but its usage in
run_workqueue looks like a complication to me. I may be wrong. But the main
problem is that we don't have it :)

Oleg.

2007-01-07 17:17:29

by Oleg Nesterov

Subject: Re: [PATCH] fix-flush_workqueue-vs-cpu_dead-race-update

On 01/07, Srivatsa Vaddagiri wrote:
>
> On Sun, Jan 07, 2007 at 05:22:46PM +0300, Oleg Nesterov wrote:
> > On 01/07, Oleg Nesterov wrote:
> > >
> > > Thoughts?
> >
> > How about:
> >
> > CPU_DEAD does nothing. After __cpu_disable() cwq->thread runs on
> > all CPUs and becomes idle when it flushes cwq->worklist: nobody
> ^^^
>
> all except dead cpus that is.

yes, of course.

>
> > will add work_struct on that list.
>
> If CPU_DEAD does nothing, then the dead cpu's workqueue list may be
> non-empty. How will it be flushed, given that no thread can run on the
> dead cpu?

But cwq->thread is not bound to the dead CPU at this point, it was already
migrated (like all other threads which had that CPU in ->cpus_allowed).

> Finally, I am concerned about the (un)friendliness of this programming
> model, where programmers are restricted to not having stable access to
> cpu_online_map at all -and- are also required to code in non-obvious
> terms. Granted that writing hotplug-safe code is non-trivial, but the
> absence of "safe access to online_map" will make it more complicated.

I guess you misunderstood me; I meant CPU_DEAD does nothing only in
workqueue.c:workqueue_cpu_callback().

Oleg.

2007-01-07 17:32:35

by Oleg Nesterov

Subject: Re: [PATCH] fix-flush_workqueue-vs-cpu_dead-race-update

On 01/07, Srivatsa Vaddagiri wrote:
>
> On Sun, Jan 07, 2007 at 10:13:44PM +0530, Srivatsa Vaddagiri wrote:
>
> I guess you could have cwq->thread flush only its cpu's workqueue by
> running on another cpu,

yes, this is what I meant,

> which will avoid the need to synchronize
> between worker threads. I am not 100% sure if that breaks the workqueue
> model in any way (since we could have two worker threads running on the
> same CPU, but servicing different queues). Hopefully it doesn't.

We are already doing this on CPU_DEAD->kthread_stop().

> However the concern expressed below remains ..
>
> > Finally, I am concerned about the (un)friendliness of this programming
> > model, where programmers are restricted to not having stable access to
> > cpu_online_map at all -and- are also required to code in non-obvious
> > terms. Granted that writing hotplug-safe code is non-trivial, but the
> > absence of "safe access to online_map" will make it more complicated.

please see the previous message.

Srivatsa, I don't claim my idea is the best. Actually I still hope somebody
else will suggest something better and simpler :)

Oleg.

2007-01-07 20:01:27

by Andrew Morton

Subject: Re: [PATCH] fix-flush_workqueue-vs-cpu_dead-race-update

On Sun, 7 Jan 2007 16:30:13 +0530
Srivatsa Vaddagiri <[email protected]> wrote:

> On Sat, Jan 06, 2007 at 11:11:17AM -0800, Andrew Morton wrote:
> > Has anyone thought seriously about using the process freezer in the
> > cpu-down/cpu-up paths? That way we don't need to lock anything anywhere?
>
> How would this provide stable access to cpu_online_map in functions
> that need to block while accessing it (as flush_workqueue requires)?

If a thread simply blocks, that will not permit a cpu plug/unplug to proceed.

The thread has to explicitly call try_to_freeze(). CPU plug/unplug will
not occur (and cpu_online_map will not change) until every process in the
machine has called try_to_freeze().
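
For reference, the freeze point in a kernel thread's main loop is just an
explicit call; a sketch of the pattern (not any particular thread):

	while (!kthread_should_stop()) {
		try_to_freeze();	/* parks here while a freeze is in progress */
		/* ... process work ... */
	}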

So the problem which you're referring to will only occur if a workqueue
callback function calls try_to_freeze(), which would be mad.



Plus flush_workqueue() is on the way out. We're slowly edging towards a
working cancel_work() which will only block if the work which you're trying
to cancel is presently running. With that, pretty much all the
flush_workqueue() calls go away, and all these accidental rarely-occurring
deadlocks go away too.

2007-01-07 21:01:57

by Oleg Nesterov

Subject: [PATCH] flush_cpu_workqueue: don't flush an empty ->worklist

Now that we have ->current_work, we can avoid adding a barrier and waiting for
its completion when cwq's queue is empty.

Note: this change is also useful if we change flush_workqueue() to also check
the dead CPUs.

Signed-off-by: Oleg Nesterov <[email protected]>

--- mm-6.20-rc3/kernel/workqueue.c~1_opt 2007-01-07 23:15:50.000000000 +0300
+++ mm-6.20-rc3/kernel/workqueue.c 2007-01-07 23:26:45.000000000 +0300
@@ -405,12 +405,15 @@ static void wq_barrier_func(struct work_
complete(&barr->done);
}

-static inline void init_wq_barrier(struct wq_barrier *barr)
+static void insert_wq_barrier(struct cpu_workqueue_struct *cwq,
+ struct wq_barrier *barr, int tail)
{
INIT_WORK(&barr->work, wq_barrier_func);
__set_bit(WORK_STRUCT_PENDING, work_data_bits(&barr->work));

init_completion(&barr->done);
+
+ insert_work(cwq, &barr->work, tail);
}

static void flush_cpu_workqueue(struct cpu_workqueue_struct *cwq)
@@ -429,13 +432,20 @@ static void flush_cpu_workqueue(struct c
preempt_disable();
} else {
struct wq_barrier barr;
+ int active = 0;

- init_wq_barrier(&barr);
- __queue_work(cwq, &barr.work);
+ spin_lock_irq(&cwq->lock);
+ if (!list_empty(&cwq->worklist) || cwq->current_work != NULL) {
+ insert_wq_barrier(cwq, &barr, 1);
+ active = 1;
+ }
+ spin_unlock_irq(&cwq->lock);

- preempt_enable(); /* Can no longer touch *cwq */
- wait_for_completion(&barr.done);
- preempt_disable();
+ if (active) {
+ preempt_enable();
+ wait_for_completion(&barr.done);
+ preempt_disable();
+ }
}
}

@@ -482,8 +492,7 @@ static void wait_on_work(struct cpu_work

spin_lock_irq(&cwq->lock);
if (unlikely(cwq->current_work == work)) {
- init_wq_barrier(&barr);
- insert_work(cwq, &barr.work, 0);
+ insert_wq_barrier(cwq, &barr, 0);
running = 1;
}
spin_unlock_irq(&cwq->lock);

2007-01-07 21:51:18

by Oleg Nesterov

Subject: Re: [PATCH] fix-flush_workqueue-vs-cpu_dead-race-update

On 01/07, Andrew Morton wrote:
>
> Plus flush_workqueue() is on the way out. We're slowly edging towards a
> working cancel_work() which will only block if the work which you're trying
> to cancel is presently running. With that, pretty much all the
> flush_workqueue() calls go away, and all these accidental rarely-occurring
> deadlocks go away too.

So. If we can forget about the race we have - fine. Otherwise, how about the
patch below? It is untested and needs review. I can't suggest anything
simpler right now.

Change flush_workqueue() to use for_each_possible_cpu(). This means that
flush_cpu_workqueue() may hit CPU which is already dead. However in that
case

if (!list_empty(&cwq->worklist) || cwq->current_work != NULL)

means that CPU_DEAD is in progress; it will do kthread_stop() + take_over_work(),
so we can proceed and insert a barrier. We hold cwq->lock, so we are safe.

This patch replaces fix-flush_workqueue-vs-cpu_dead-race.patch which was
broken by switching to preempt_disable (now we don't need locking at all).
Note that migrate_sequence (was hotplug_sequence) is incremented under
cwq->lock. Since flush_workqueue does lock/unlock of cwq->lock on all CPUs,
it must see the new value if take_over_work() happened before we checked
this cwq, and this is the case we should worry about: otherwise we added
a barrier.

Srivatsa?

--- mm-6.20-rc3/kernel/workqueue.c~2_race 2007-01-08 00:07:07.000000000 +0300
+++ mm-6.20-rc3/kernel/workqueue.c 2007-01-08 00:28:55.000000000 +0300
@@ -65,6 +65,7 @@ struct workqueue_struct {

/* All the per-cpu workqueues on the system, for hotplug cpu to add/remove
threads to each one as cpus come/go. */
+static long migrate_sequence __read_mostly;
static DEFINE_MUTEX(workqueue_mutex);
static LIST_HEAD(workqueues);

@@ -422,13 +423,7 @@ static void flush_cpu_workqueue(struct c
* Probably keventd trying to flush its own queue. So simply run
* it by hand rather than deadlocking.
*/
- preempt_enable();
- /*
- * We can still touch *cwq here because we are keventd, and
- * hot-unplug will be waiting us to exit.
- */
run_workqueue(cwq);
- preempt_disable();
} else {
struct wq_barrier barr;
int active = 0;
@@ -441,9 +436,7 @@ static void flush_cpu_workqueue(struct c
spin_unlock_irq(&cwq->lock);

if (active) {
- preempt_enable();
wait_for_completion(&barr.done);
- preempt_disable();
}
}
}
@@ -463,17 +456,21 @@ static void flush_cpu_workqueue(struct c
*/
void fastcall flush_workqueue(struct workqueue_struct *wq)
{
- preempt_disable(); /* CPU hotplug */
if (is_single_threaded(wq)) {
/* Always use first cpu's area. */
flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, singlethread_cpu));
} else {
+ long sequence;
int cpu;
+again:
+ sequence = migrate_sequence;

- for_each_online_cpu(cpu)
+ for_each_possible_cpu(cpu)
flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, cpu));
+
+ if (unlikely(sequence != migrate_sequence))
+ goto again;
}
- preempt_enable();
}
EXPORT_SYMBOL_GPL(flush_workqueue);

@@ -545,18 +542,22 @@ out:
}
EXPORT_SYMBOL_GPL(flush_work);

-static struct task_struct *create_workqueue_thread(struct workqueue_struct *wq,
- int cpu, int freezeable)
+static void init_cpu_workqueue(struct workqueue_struct *wq,
+ struct cpu_workqueue_struct *cwq, int freezeable)
{
- struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu);
- struct task_struct *p;
-
spin_lock_init(&cwq->lock);
cwq->wq = wq;
cwq->thread = NULL;
cwq->freezeable = freezeable;
INIT_LIST_HEAD(&cwq->worklist);
init_waitqueue_head(&cwq->more_work);
+}
+
+static struct task_struct *create_workqueue_thread(struct workqueue_struct *wq,
+ int cpu)
+{
+ struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu);
+ struct task_struct *p;

if (is_single_threaded(wq))
p = kthread_create(worker_thread, cwq, "%s", wq->name);
@@ -589,15 +590,20 @@ struct workqueue_struct *__create_workqu
mutex_lock(&workqueue_mutex);
if (singlethread) {
INIT_LIST_HEAD(&wq->list);
- p = create_workqueue_thread(wq, singlethread_cpu, freezeable);
+ init_cpu_workqueue(wq, per_cpu_ptr(wq->cpu_wq, singlethread_cpu),
+ freezeable);
+ p = create_workqueue_thread(wq, singlethread_cpu);
if (!p)
destroy = 1;
else
wake_up_process(p);
} else {
list_add(&wq->list, &workqueues);
+ for_each_possible_cpu(cpu)
+ init_cpu_workqueue(wq, per_cpu_ptr(wq->cpu_wq, cpu),
+ freezeable);
for_each_online_cpu(cpu) {
- p = create_workqueue_thread(wq, cpu, freezeable);
+ p = create_workqueue_thread(wq, cpu);
if (p) {
kthread_bind(p, cpu);
wake_up_process(p);
@@ -833,6 +839,7 @@ static void take_over_work(struct workqu

spin_lock_irq(&cwq->lock);
list_replace_init(&cwq->worklist, &list);
+ migrate_sequence++;

while (!list_empty(&list)) {
printk("Taking work for %s\n", wq->name);
@@ -859,7 +866,7 @@ static int __devinit workqueue_cpu_callb
case CPU_UP_PREPARE:
/* Create a new workqueue thread for it. */
list_for_each_entry(wq, &workqueues, list) {
- if (!create_workqueue_thread(wq, hotcpu, 0)) {
+ if (!create_workqueue_thread(wq, hotcpu)) {
printk("workqueue for %i failed\n", hotcpu);
return NOTIFY_BAD;
}

2007-01-08 15:22:24

by Srivatsa Vaddagiri

[permalink] [raw]
Subject: Re: [PATCH] fix-flush_workqueue-vs-cpu_dead-race-update

On Mon, Jan 08, 2007 at 12:51:03AM +0300, Oleg Nesterov wrote:
> Change flush_workqueue() to use for_each_possible_cpu(). This means that
> flush_cpu_workqueue() may hit CPU which is already dead. However in that
> case
>
> if (!list_empty(&cwq->worklist) || cwq->current_work != NULL)
>
> means that CPU_DEAD is in progress; it will do kthread_stop() + take_over_work(),
> so we can proceed and insert a barrier. We hold cwq->lock, so we are safe.
>
> This patch replaces fix-flush_workqueue-vs-cpu_dead-race.patch which was
> broken by switching to preempt_disable (now we don't need locking at all).
> Note that migrate_sequence (was hotplug_sequence) is incremented under
> cwq->lock. Since flush_workqueue does lock/unlock of cwq->lock on all CPUs,
> it must see the new value if take_over_work() happened before we checked
> this cwq, and this is the case we should worry about: otherwise we added
> a barrier.
>
> Srivatsa?

This is head-spinning :)

Spotted at least these problems:

1. run_workqueue()->work.func()->flush_work()->mutex_lock(workqueue_mutex)
deadlocks if we are blocked in cleanup_workqueue_thread()->kthread_stop()
for the same worker thread to exit.

Looks possible in practice to me.

2.

CPU_DEAD->cleanup_workqueue_thread->(cwq->thread = NULL)->kthread_stop() ..
^^^^^^^^^^^^^^^^^^^^
|___ Problematic

Now while we are blocked here, if a work->func() calls
flush_workqueue->flush_cpu_workqueue, we clearly can't identify that the
event thread is trying to flush its own queue (the cwq->thread == current
test fails) and hence we will deadlock.

A lock_cpu_hotplug(), or any other ability to block concurrent hotplug
operations from happening, in run_workqueue would have avoided both the above
races.

Alternatively, for the second race, I guess we can avoid setting
cwq->thread = NULL in cleanup_workqueue_thread() till the thread has exited,
but I am not sure if that opens up any other race. The first race seems
harder to fix ..

I wonder if spin (spinroot.com) or some other formal model checker can make
this job of spotting races easier for us ..

--
Regards,
vatsa

2007-01-08 15:37:36

by Srivatsa Vaddagiri

[permalink] [raw]
Subject: Re: [PATCH] fix-flush_workqueue-vs-cpu_dead-race-update

On Sun, Jan 07, 2007 at 11:59:57AM -0800, Andrew Morton wrote:
> > How would this provide a stable access to cpu_online_map in functions
> > that need to block while accessing it (as flush_workqueue requires)?
>
> If a thread simply blocks, that will not permit a cpu plug/unplug to proceed.
>
> The thread had to explicitly call try_to_freeze(). CPU plug/unplug will
> not occur (and cpu_online_map will not change) until every process in the
> machine has called try_to_freeze()).

Maybe my misunderstanding of the code, but:

Looking at the code, it seems to me that try_to_freeze() will very likely be
called from the signal-delivery path (get_signal_to_deliver()). Isn't
that correct? If so, consider a thread as below:


Thread1 :
for_each_online_cpu() { online_map = 0x1111 at this point }
do_some_thing();
kmalloc(); <- blocks


Can't Thread1 be frozen in the above blocking state without voluntarily
calling try_to_freeze? If so, online_map can change when it returns
from kmalloc() ..

> So the problem which you're referring to will only occur if a workqueue
> callback function calls try_to_freeze(), which would be mad.
>
>
> Plus flush_workqueue() is on the way out. We're slowly edging towards a
> working cancel_work() which will only block if the work which you're trying
> to cancel is presently running. With that, pretty much all the
> flush_workqueue() calls go away, and all these accidental rarely-occurring
> deadlocks go away too.

Fundamentally, I think it is important to give the ability to block
concurrent hotplug operations from happening ..

--
Regards,
vatsa

2007-01-08 15:55:59

by Oleg Nesterov

[permalink] [raw]
Subject: Re: [PATCH] fix-flush_workqueue-vs-cpu_dead-race-update

On 01/08, Srivatsa Vaddagiri wrote:
>
> On Mon, Jan 08, 2007 at 12:51:03AM +0300, Oleg Nesterov wrote:
> > Change flush_workqueue() to use for_each_possible_cpu(). This means that
> > flush_cpu_workqueue() may hit CPU which is already dead. However in that
> > case
> >
> > if (!list_empty(&cwq->worklist) || cwq->current_work != NULL)
> >
> > means that CPU_DEAD is in progress; it will do kthread_stop() + take_over_work(),
> > so we can proceed and insert a barrier. We hold cwq->lock, so we are safe.
> >
> > This patch replaces fix-flush_workqueue-vs-cpu_dead-race.patch which was
> > broken by switching to preempt_disable (now we don't need locking at all).
> > Note that migrate_sequence (was hotplug_sequence) is incremented under
> > cwq->lock. Since flush_workqueue does lock/unlock of cwq->lock on all CPUs,
> > it must see the new value if take_over_work() happened before we checked
> > this cwq, and this is the case we should worry about: otherwise we added
> > a barrier.
> >
> > Srivatsa?
>
> This is head-spinning :)

Thank you for review!

> Spotted at least these problems:
>
> 1. run_workqueue()->work.func()->flush_work()->mutex_lock(workqueue_mutex)
> deadlocks if we are blocked in cleanup_workqueue_thread()->kthread_stop()
> for the same worker thread to exit.
>
> Looks possible in practice to me.

Yes, this is the same (old) problem as we have/had with flush_workqueue().
We can convert flush_work() to use preempt_disable (this is not straightforward,
but easy), or forbid calling flush_work() from work.func().

This patch doesn't touch this problem.

> 2.
>
> CPU_DEAD->cleanup_workqueue_thread->(cwq->thread = NULL)->kthread_stop() ..
> ^^^^^^^^^^^^^^^^^^^^
> |___ Problematic

Hmm... This should not be possible? cwq->thread != NULL on CPU_DEAD event.
Even if it were NULL, we don't call kthread_stop() in that case.

> Now while we are blocked here, if a work->func() calls
> flush_workqueue->flush_cpu_workqueue, we clearly can't identify that the
> event thread is trying to flush its own queue (the cwq->thread == current
> test fails) and hence we will deadlock.

Could you clarify? I believe the cwq->thread == current test always works; we
never "substitute" cwq->thread.

> A lock_cpu_hotplug(), or any other ability to block concurrent hotplug
> operations from happening, in run_workqueue would have avoided both the above
> races.

I still don't think this is a good idea. We would also need an
is_cpu_down_waits_for_lock_cpu_hotplug()

helper; otherwise we have a deadlock if work->func() sleeps and re-queues itself.

> Alternatively, for the second race, I guess we can avoid setting
> cwq->thread = NULL in cleanup_workqueue_thread() till the thread has exited,

Yes, http://marc.theaimsgroup.com/?l=linux-kernel&m=116818097927685, I believe
we can do this later. This way workqueue will have almost zero interaction
with cpu-hotplug, and cpu UP/DOWN event won't be delayed by sleeping work.func().
take_over_work() can go away, this also allows us to simplify things.

Thanks!

Oleg.

2007-01-08 16:31:58

by Srivatsa Vaddagiri

[permalink] [raw]
Subject: Re: [PATCH] fix-flush_workqueue-vs-cpu_dead-race-update

On Mon, Jan 08, 2007 at 06:56:38PM +0300, Oleg Nesterov wrote:
> > Spotted at least these problems:
> >
> > 1. run_workqueue()->work.func()->flush_work()->mutex_lock(workqueue_mutex)
> > deadlocks if we are blocked in cleanup_workqueue_thread()->kthread_stop()
> > for the same worker thread to exit.
> >
> > Looks possible in practice to me.
>
> Yes, this is the same (old) problem as we have/had with flush_workqueue().
> We can convert flush_work() to use preempt_disable (this is not straightforward,
> but easy), or forbid calling flush_work() from work.func().

I think I noticed other problems with avoiding workqueue_mutex in
flush_work() .. I don't recall the exact problem.

> > 2.
> >
> > CPU_DEAD->cleanup_workqueue_thread->(cwq->thread = NULL)->kthread_stop() ..
> > ^^^^^^^^^^^^^^^^^^^^
> > |___ Problematic
>
> Hmm... This should not be possible? cwq->thread != NULL on CPU_DEAD event.

sure, cwq->thread != NULL at CPU_DEAD event. However
cleanup_workqueue_thread() will set it to NULL and block in
kthread_stop(), waiting for the kthread to finish run_workqueue and
exit. If one of the work functions being run by run_workqueue() calls
flush_workqueue()->flush_cpu_workqueue() now, flush_cpu_workqueue() can fail to
recognize that "keventd is trying to flush its own queue" which can
cause deadlocks.

> > Now while we are blocked here, if a work->func() calls
> > flush_workqueue->flush_cpu_workqueue, we clearly can't identify that the
> > event thread is trying to flush its own queue (the cwq->thread == current
> > test fails) and hence we will deadlock.
>
> Could you clarify? I believe the cwq->thread == current test always works; we
> never "substitute" cwq->thread.

The test fails in the window described above.

> > A lock_cpu_hotplug(), or any other ability to block concurrent hotplug
> > operations from happening, in run_workqueue would have avoided both the above
> > races.
>
> I still don't think this is a good idea. We also need
> is_cpu_down_waits_for_lock_cpu_hotplug()
>
> helper, otherwise we have a deadlock if work->func() sleeps and re-queues itself.

Can you elaborate this a bit?

> > Alternatively, for the second race, I guess we can avoid setting
> > cwq->thread = NULL in cleanup_workqueue_thread() till the thread has exited,
>
> Yes, http://marc.theaimsgroup.com/?l=linux-kernel&m=116818097927685, I believe
> we can do this later. This way workqueue will have almost zero interaction
> with cpu-hotplug, and cpu UP/DOWN event won't be delayed by sleeping work.func().
> take_over_work() can go away, this also allows us to simplify things.

I agree it minimizes the interactions. Maybe worth attempting. However I
suspect it may not be as simple as it appears :)


--
Regards,
vatsa

2007-01-08 17:08:04

by Oleg Nesterov

[permalink] [raw]
Subject: Re: [PATCH] fix-flush_workqueue-vs-cpu_dead-race-update

On 01/08, Srivatsa Vaddagiri wrote:
>
> On Mon, Jan 08, 2007 at 06:56:38PM +0300, Oleg Nesterov wrote:
> > > 2.
> > >
> > > CPU_DEAD->cleanup_workqueue_thread->(cwq->thread = NULL)->kthread_stop() ..
> > > ^^^^^^^^^^^^^^^^^^^^
> > > |___ Problematic
> >
> > Hmm... This should not be possible? cwq->thread != NULL on CPU_DEAD event.
>
> sure, cwq->thread != NULL at CPU_DEAD event. However
> cleanup_workqueue_thread() will set it to NULL and block in
> kthread_stop(), waiting for the kthread to finish run_workqueue and
> exit.

Ah, missed your point, thanks. Yet another old problem which was not introduced
by recent changes. And yet another indication we should avoid kthread_stop()
on CPU_DEAD event :) I believe this is easy to fix, but I need to think more.

> > > A lock_cpu_hotplug(), or any other ability to block concurrent hotplug
> > > operations from happening, in run_workqueue would have avoided both the above
> > > races.
> >
> > I still don't think this is a good idea. We also need
> > is_cpu_down_waits_for_lock_cpu_hotplug()
> >
> > helper, otherwise we have a deadlock if work->func() sleeps and re-queues itself.
>
> Can you elaborate this a bit?

If work->func() re-queues itself, run_workqueue() never returns because
->worklist is never empty. This means we should somehow check and detect
that cpu-hotplug is blocked because we hold lock_cpu_hotplug(). In that
case run_workqueue() should return and drop the lock. This will complicate
worker_thread/run_workqueue further.

run_workqueue:

        while (!list_empty(&cwq->worklist)) {
                ...
                // We hold lock_cpu_hotplug(), cpu event can't make
                // progress.
                ...
        }
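
For example, a self-rearming work function (hypothetical sketch; my_wq and
do_something() are made up) keeps ->worklist non-empty forever:

        static void rearming_func(struct work_struct *work)
        {
                do_something();                 /* hypothetical */
                queue_work(my_wq, work);        /* re-queues itself */
        }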

> > Yes, http://marc.theaimsgroup.com/?l=linux-kernel&m=116818097927685, I believe
> > we can do this later. This way workqueue will have almost zero interaction
> > with cpu-hotplug, and cpu UP/DOWN event won't be delayed by sleeping work.func().
> > take_over_work() can go away, this also allows us to simplify things.
>
> I agree it minimizes the interactions. Maybe worth attempting. However I
> suspect it may not be as simple as it appears :)

Yes, that is why this patch only does the first step: flush_workqueue() checks
the dead CPUs as well; this change is minimal.

Do you see any problems this patch adds?

Oleg.

2007-01-08 18:37:37

by Pallipadi, Venkatesh

[permalink] [raw]
Subject: RE: [PATCH] fix-flush_workqueue-vs-cpu_dead-race-update



>-----Original Message-----
>From: [email protected]
>[mailto:[email protected]] On Behalf Of Oleg Nesterov
>Sent: Monday, January 08, 2007 9:07 AM
>To: Srivatsa Vaddagiri
>Cc: Andrew Morton; David Howells; Christoph Hellwig; Ingo
>Molnar; Linus Torvalds; [email protected]; Gautham shenoy
>Subject: Re: [PATCH] fix-flush_workqueue-vs-cpu_dead-race-update
>
>On 01/08, Srivatsa Vaddagiri wrote:
>>
>> On Mon, Jan 08, 2007 at 06:56:38PM +0300, Oleg Nesterov wrote:
>> > > 2.
>> > >
>> > > CPU_DEAD->cleanup_workqueue_thread->(cwq->thread = NULL)->kthread_stop() ..
>> > > ^^^^^^^^^^^^^^^^^^^^
>> > > |___ Problematic
>> >
>> > Hmm... This should not be possible? cwq->thread != NULL on CPU_DEAD event.
>>
>> sure, cwq->thread != NULL at CPU_DEAD event. However
>> cleanup_workqueue_thread() will set it to NULL and block in
>> kthread_stop(), waiting for the kthread to finish run_workqueue and
>> exit.
>
>Ah, missed your point, thanks. Yet another old problem which was not
>introduced by recent changes. And yet another indication we should avoid
>kthread_stop() on CPU_DEAD event :) I believe this is easy to fix, but I
>need to think more.

The current code in the workqueue-hotplug path is full of races. I stumbled
upon at least a couple of different deadlock situations being discussed
here, with the ondemand governor using a workqueue and trying to flush
during cpu hot-remove.

Specifically, a three-way deadlock involving kthread_stop() called with
workqueue_mutex held, and the work itself blocked on some other mutex held
by another task trying to flush the workqueue.

One other approach I was thinking about was to do all the hard work in
the workqueue CPU_DOWN_PREPARE callback rather than in CPU_DEAD.
We can call cleanup_workqueue_thread and take_over_work in DOWN_PREPARE.
With that, I don't think we need to hold the workqueue_mutex across
these two callbacks, and we eliminate the deadlocks related to
flush_workqueue.
Do you think this approach would simplify things around here?

Thanks,
Venki

2007-01-08 23:55:47

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH] flush_cpu_workqueue: don't flush an empty ->worklist


Just a sad little note that I've utterly lost the plot on the workqueue
changes.

schedule_on_each_cpu-use-preempt_disable.patch
reimplement-flush_workqueue.patch
implement-flush_work.patch
implement-flush_work-sanity.patch
implement-flush_work_keventd.patch
flush_workqueue-use-preempt_disable-to-hold-off-cpu-hotplug.patch
flush_cpu_workqueue-dont-flush-an-empty-worklist.patch
aio-use-flush_work.patch
kblockd-use-flush_work.patch
relayfs-use-flush_keventd_work.patch
tg3-use-flush_keventd_work.patch
e1000-use-flush_keventd_work.patch
libata-use-flush_work.patch
phy-use-flush_work.patch
#bridge-avoid-using-noautorel-workqueues.patch
#
extend-notifier_call_chain-to-count-nr_calls-made.patch
extend-notifier_call_chain-to-count-nr_calls-made-fixes.patch
extend-notifier_call_chain-to-count-nr_calls-made-fixes-2.patch
define-and-use-new-eventscpu_lock_acquire-and-cpu_lock_release.patch
define-and-use-new-eventscpu_lock_acquire-and-cpu_lock_release-fix.patch
eliminate-lock_cpu_hotplug-in-kernel-schedc.patch
eliminate-lock_cpu_hotplug-in-kernel-schedc-fix.patch
handle-cpu_lock_acquire-and-cpu_lock_release-in-workqueue_cpu_callback.patch
fix-flush_workqueue-vs-cpu_dead-race.patch

I don't know which of these are good, bad or indifferent.

Furthermore I don't know which of these need to be tossed overboard if/when
we get around to using the task freezer for CPU hotplug synchronisation.
Hopefully, a lot of them. I don't really understand why we're continuing
to struggle with the existing approach before that question is settled.

2007-01-09 01:12:30

by Srivatsa Vaddagiri

[permalink] [raw]
Subject: Re: [PATCH] fix-flush_workqueue-vs-cpu_dead-race-update

On Mon, Jan 08, 2007 at 10:37:25AM -0800, Pallipadi, Venkatesh wrote:
> One other approach I was thinking about was to do all the hard work in
> the workqueue CPU_DOWN_PREPARE callback rather than in CPU_DEAD.

Between DOWN_PREPARE and DEAD, more work can get added to the cpu's
workqueue. So DOWN_PREPARE is a bit too early for take_over_work ..

> We can call cleanup_workqueue_thread and take_over_work in DOWN_PREPARE.
> With that, I don't think we need to hold the workqueue_mutex across
> these two callbacks, and we eliminate the deadlocks related to
> flush_workqueue.
> Do you think this approach would simplify things around here?

--
Regards,
vatsa

2007-01-09 04:39:24

by Srivatsa Vaddagiri

[permalink] [raw]
Subject: Re: [PATCH] fix-flush_workqueue-vs-cpu_dead-race-update

On Mon, Jan 08, 2007 at 08:06:35PM +0300, Oleg Nesterov wrote:
> Ah, missed your point, thanks. Yet another old problem which was not introduced
> by recent changes. And yet another indication we should avoid kthread_stop()
> on CPU_DEAD event :) I believe this is easy to fix, but I need to think more.

I think the problem is not just with CPU_DEAD. Anyone who calls
cleanup_workqueue_thread (say destroy_workqueue?) will see this race.

Do you see any problems if cleanup_workqueue_thread is changed as:

cleanup_workqueue_thread()
{
        kthread_stop(p);
        spin_lock(&cwq->lock);
        cwq->thread = NULL;
        spin_unlock(&cwq->lock);
}


> run_workqueue:
>
> while (!list_empty(&cwq->worklist)) {
> ...
> // We hold lock_cpu_hotplug(), cpu event can't make
> // progress.
> ...
> }

Ok ..yes a cpu_event_waits_for_lock() helper will help here.

> > I agree it minimizes the interactions. Maybe worth attempting. However I
> > suspect it may not be as simple as it appears :)
>
> Yes, that is why this patch only does the first step: flush_workqueue() checks
> the dead CPUs as well, this change is minimal.
>
> Do you see any problems this patch adds?

I don't see any as of now. I suspect we will know better when we implement
the patch to eliminate CPU_DEAD handling in workqueue.c.

--
Regards,
vatsa

2007-01-09 05:04:27

by Srivatsa Vaddagiri

[permalink] [raw]
Subject: Re: [PATCH] flush_cpu_workqueue: don't flush an empty ->worklist

On Mon, Jan 08, 2007 at 03:54:28PM -0800, Andrew Morton wrote:
> Furthermore I don't know which of these need to be tossed overboard if/when
> we get around to using the task freezer for CPU hotplug synchronisation.
> Hopefully, a lot of them. I don't really understand why we're continuing
> to struggle with the existing approach before that question is settled.

Good point!

Fundamentally, I think we need to answer this question:

"Do we provide *some* mechanism to block concurrent hotplug operations
from happening? By hotplug operations I mean both changes to the bitmap
and execution of all callbacks in CPU_DEAD/ONLINE etc"

If NO, then IMHO we will be forever fixing races

If YES, then what is that mechanism? freeze_processes()? or a magical
lock?

freeze_processes() can't be that mechanism, if my understanding of it is
correct - see http://lkml.org/lkml/2007/1/8/149 and
http://marc.theaimsgroup.com/?l=linux-kernel&m=116817460726058.

I would be happy to be corrected if the above impression of
freeze_processes() is wrong ..

--
Regards,
vatsa

2007-01-09 05:30:11

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH] flush_cpu_workqueue: don't flush an empty ->worklist

On Tue, 9 Jan 2007 10:34:17 +0530
Srivatsa Vaddagiri <[email protected]> wrote:

> On Mon, Jan 08, 2007 at 03:54:28PM -0800, Andrew Morton wrote:
> > Furthermore I don't know which of these need to be tossed overboard if/when
> > we get around to using the task freezer for CPU hotplug synchronisation.
> > Hopefully, a lot of them. I don't really understand why we're continuing
> > to struggle with the existing approach before that question is settled.
>
> Good point!
>
> Fundamentally, I think we need to answer this question:
>
> "Do we provide *some* mechanism to block concurrent hotplug operations
> from happening? By hotplug operations I mean both changes to the bitmap
> and execution of all callbacks in CPU_DEAD/ONLINE etc"
>
> If NO, then IMHO we will be forever fixing races
>
> If YES, then what is that mechanism? freeze_processes()? or a magical
> lock?
>
> freeze_processes() can't be that mechanism, if my understanding of it is
> correct - see http://lkml.org/lkml/2007/1/8/149

That's not correct. freeze_processes() will freeze *all* processes. All
of them are forced to enter refrigerator(). With the mysterious exception
of some I/O-related kernel threads, which might need some thought.

> and
> http://marc.theaimsgroup.com/?l=linux-kernel&m=116817460726058.

Am not sure how that's related.

> I would be happy to be corrected if the above impression of
> freeze_processes() is wrong ..

It could be that the freezer needs a bit of work for this application.
Obviously we're not interested in the handling of disk I/O, so we'd really
like to do a simple
try_to_freeze_tasks(FREEZER_USER_SPACE|FREEZER_KERNEL_THREADS), but the
code isn't set up to do that (it should be). The other non-swsusp callers
probably want this change as well. But that's all a minor matter.

2007-01-09 07:01:15

by Ingo Molnar

[permalink] [raw]
Subject: Re: [PATCH] flush_cpu_workqueue: don't flush an empty ->worklist


* Andrew Morton <[email protected]> wrote:

> > I would be happy to be corrected if the above impression of
> > freeze_processes() is corrected ..
>
> It could be that the freezer needs a bit of work for this application.
> Obviously we're not interested in the handling of disk I/O, so we'd
> really like to do a simple
> try_to_freeze_tasks(FREEZER_USER_SPACE|FREEZER_KERNEL_THREADS), but
> the code isn't set up to do that (it should be). The other non-swsusp
> callers probably want this change as well. But that's all a minor
> matter.

yes. The freezer does the fundamentally right thing: it stops all tasks
in user-space or waits for them to return back to user-space to stop
them there, or if it's a pure kernel-space task it waits until that
kernel-space task voluntarily stops.

Once the system is in such a state, and all processing has been stopped,
all of the kernel's data structures are completely 'unused', and we can:

- patch the kernel freely (kprobes)

- save+stop the kernel (sw-suspend)

- remove a CPU (CPU hotplug and suspend)

- (We could also use this mechanism to live-migrate the kernel to
another system btw., possibly useful for containers)

- (We could also use this mechanism to create a live snapshot of a
running kernel, together with an LVM snapshot of filesystem state,
for possible restore point later on.)

It is a very powerful mechanism that has really nice properties - we
should work on this one shared infrastructure instead of adding zillions
of per-subsystem CPU hotplug locks.

Ingo

2007-01-09 09:33:22

by Srivatsa Vaddagiri

[permalink] [raw]
Subject: Re: [PATCH] flush_cpu_workqueue: don't flush an empty ->worklist

On Mon, Jan 08, 2007 at 09:26:56PM -0800, Andrew Morton wrote:
> That's not correct. freeze_processes() will freeze *all* processes.

I am not arguing whether all processes will be frozen. However my question was
on the freeze point. Let me ask the question with an example:

rtasd thread (arch/powerpc/platforms/pseries/rtasd.c) executes this simple
loop:


static int rtasd(void *unused)
{
        i = first_cpu(cpu_online_map);

        while (1) {
                set_cpus_allowed(current, cpumask_of_cpu(i)); /* can block */

                /* we should now be running on cpu i */

                do_something_on_a_cpu(i);

                /* sleep for some time */

                i = next_cpu(i, cpu_online_map);
        }
}

This thread makes absolutely -no- calls to try_to_freeze() in its lifetime.

1. Does this mean that the thread can't be frozen? (let's say that the
thread's PF_NOFREEZE is not set)

AFAICS it can still be frozen by sending it a signal and have the signal
delivery code call try_to_freeze() ..

2. If the thread can be frozen at any arbitrary point of its execution, then I
don't see what prevents cpu_online_map from changing under the feet of the
rtasd thread.

In other words, we would have failed to provide the ability to *block*
hotplug operations from happening concurrently.

> All of them are forced to enter refrigerator().
^^^^^^

*forced*, yes ..that's the point of concern ..


Warm regards,
vatsa

2007-01-09 09:49:19

by Ingo Molnar

[permalink] [raw]
Subject: Re: [PATCH] flush_cpu_workqueue: don't flush an empty ->worklist


* Srivatsa Vaddagiri <[email protected]> wrote:

> This thread makes absolutely -no- calls to try_to_freeze() in its
> lifetime.

that's a plain bug. suspend will fail for example. A kernel thread
either needs to mark itself PF_NOFREEZE (which means it can be ignored
and zapped with impunity), or needs to call into try_to_freeze() with a
sane frequency.

Ingo

2007-01-09 09:53:17

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH] flush_cpu_workqueue: don't flush an empty ->worklist

On Tue, 9 Jan 2007 15:03:02 +0530
Srivatsa Vaddagiri <[email protected]> wrote:

> On Mon, Jan 08, 2007 at 09:26:56PM -0800, Andrew Morton wrote:
> > That's not correct. freeze_processes() will freeze *all* processes.
>
> I am not arguing whether all processes will be frozen. However my question was
> on the freeze point. Let me ask the question with an example:
>
> rtasd thread (arch/powerpc/platforms/pseries/rtasd.c) executes this simple
> loop:
>
>
> static int rtasd(void *unused)
> {
>
> i = first_cpu(cpu_online_map);
>
> while (1) {
>
> set_cpus_allowed(current, cpumask_of_cpu(i)); /* can block */
>
> /* we should now be running on cpu i */
>
> do_something_on_a_cpu(i);
>
> /* sleep for some time */
>
> i = next_cpu(i, cpu_online_map);
> }
>
> }
>
> This thread makes absolutely -no- calls to try_to_freeze() in its lifetime.

Looks like a bug to me. powerpc does appear to try to support the freezer.

> 1. Does this mean that the thread can't be frozen? (let's say that the
> thread's PF_NOFREEZE is not set)

yup. I'd expect the freeze_processes() call would fail if this thread is
running.

> AFAICS it can still be frozen by sending it a signal and have the signal
> delivery code call try_to_freeze() ..

kernel threads don't take signals in the same manner as userspace. A
kernel thread needs to explicitly poll, via

if (signal_pending(current))
        do_something()

rtasd doesn't do that, and using signals in-kernel is considered lame.

> 2. If the thread can be frozen at any arbitrary point of its execution, then I
> don't see what prevents cpu_online_map from changing under the feet of the
> rtasd thread.

It cannot.

2007-01-09 10:09:39

by Srivatsa Vaddagiri

[permalink] [raw]
Subject: Re: [PATCH] flush_cpu_workqueue: don't flush an empty ->worklist

On Tue, Jan 09, 2007 at 01:51:52AM -0800, Andrew Morton wrote:
> > This thread makes absolutely -no- calls to try_to_freeze() in its lifetime.
>
> Looks like a bug to me. powerpc does appear to try to support the freezer.
>
> > 1. Does this mean that the thread can't be frozen? (let's say that the
> > thread's PF_NOFREEZE is not set)
>
> yup. I'd expect the freeze_processes() call would fail if this thread is
> running.

ok.

>
> > AFAICS it can still be frozen by sending it a signal and have the signal
> > delivery code call try_to_freeze() ..
>
> kernel threads don't take signals in the same manner as userspace. A
> kernel thread needs to explicitly poll, via
>
> if (signal_pending(current))
> do_something()

Thanks for the education! I feel much better about the use of process
freezer now ..

> > 2. If the thread can be frozen at any arbitrary point of its execution, then I
> > don't see what prevents cpu_online_map from changing under the feet of the
> > rtasd thread.
>
> It cannot.

Excellent ..

I just hope the latency of freeze_processes() is tolerable ..

--
Regards,
vatsa

2007-01-09 10:20:19

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH] flush_cpu_workqueue: don't flush an empty ->worklist

On Tue, 9 Jan 2007 15:39:26 +0530
Srivatsa Vaddagiri <[email protected]> wrote:

> I just hope the latency of freeze_processes() is tolerable ..

It'll be roughly proportional to the number of processes, I guess: if we
have 100,000 processes (or threads) doing sleep(1000000) then yeah, it'll
take some time to wake them all up and capture them in refrigerator().

But I suspect a suitable fix for any problems which arise there is to
implement gang-offlining and onlining, rather than the present
one-cpu-at-a-time. That'd be pretty simple to do: we already have sysfs
interfaces which take a cpumask.

2007-01-09 14:39:10

by Oleg Nesterov

[permalink] [raw]
Subject: Re: [PATCH] fix-flush_workqueue-vs-cpu_dead-race-update

On 01/09, Srivatsa Vaddagiri wrote:
>
> On Mon, Jan 08, 2007 at 08:06:35PM +0300, Oleg Nesterov wrote:
> > Ah, missed your point, thanks. Yet another old problem which was not introduced
> > by recent changes. And yet another indication we should avoid kthread_stop()
> > on CPU_DEAD event :) I believe this is easy to fix, but I need to think more.
>
> I think the problem is not just with CPU_DEAD. Anyone who calls
> cleanup_workqueue_thread (say destroy_workqueue?) will see this race.

destroy_workqueue() first does flush_workqueue(), so it should be ok.

Anyway I agree with you, we shouldn't clear cwq->thread until it exits.

> Do you see any problems if cleanup_workqueue_thread is changed as:
>
> cleanup_workqueue_thread()
> {
>         kthread_stop(p);
>         spin_lock(&cwq->lock);
>         cwq->thread = NULL;
>         spin_unlock(&cwq->lock);
> }

I think the same. In fact I suspect we don't even need the spin_lock, but I
didn't have time to reread the code since our discussion.

Oleg.

2007-01-09 15:08:46

by Oleg Nesterov

[permalink] [raw]
Subject: Re: [PATCH] flush_cpu_workqueue: don't flush an empty ->worklist

On 01/08, Andrew Morton wrote:
>
> That's not correct. freeze_processes() will freeze *all* processes. All
> of them are forced to enter refrigerator(). With the mysterious exception
> of some I/O-related kernel threads, which might need some thought.

Yes, and we can remove a dead CPU safely, this part is ok.

> > and
> > http://marc.theaimsgroup.com/?l=linux-kernel&m=116817460726058.
>
> Am not sure how that's related.

but at some point we should thaw processes, including cwq->thread which
should die. So we are doing things like take_over_work() and this is the
source of races, because the dead CPU is not on cpu_online_map.

flush_workqueue() doesn't use any locks now. If we use the freezer to
implement cpu-hotplug, nothing will change; we still have races.

It looks so obvious to me that I'm starting to suspect I missed something
very simple :(

Oleg.

2007-01-09 15:59:52

by Srivatsa Vaddagiri

[permalink] [raw]
Subject: Re: [PATCH] flush_cpu_workqueue: don't flush an empty ->worklist

On Tue, Jan 09, 2007 at 06:07:55PM +0300, Oleg Nesterov wrote:
> but at some point we should thaw processes, including cwq->thread which
> should die.

I am presuming we will thaw processes after all CPU_DEAD handlers have
run.

> So we are doing things like take_over_work() and this is the
> source of races, because the dead CPU is not on cpu_online_map.
>
> flush_workqueue() doesn't use any locks now. If we use the freezer to
> implement cpu-hotplug, nothing will change; we still have races.

We have races -if- CPU_DEAD handling can run concurrently with an ongoing
flush_workqueue. From my recent understanding of the process freezer, this
is not possible. In other words, flush_workqueue() can keep its old
implementation, as below, without any races:

some_thread:

for_each_online_cpu(i)
        flush_cpu_workqueue(i);

As long as this loop is running, cpu_down/up will not proceed. This means
cpu_online_map is stable even if flush_cpu_workqueue blocks ..

Once this loop is complete and all threads have called try_to_freeze,
cpu_down will proceed to change the bitmap and run the CPU_DEAD handlers
of everyone. I am presuming we will thaw processes only after all
CPU_DEAD/ONLINE handlers have run (I don't know if that is a problem).
In that case do you still see races? Yes, this would require some
changes in worker_thread to check for kthread_should_stop() after
try_to_freeze returns ...
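
Something along these lines, perhaps (a sketch; it reuses the existing
->freezeable flag):

        while (!kthread_should_stop()) {
                if (cwq->freezeable)
                        try_to_freeze();
                /* re-check: we may have been thawed for a cpu event */
                if (kthread_should_stop())
                        break;
                ...
        }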


--
Regards,
vatsa

2007-01-09 16:38:55

by Oleg Nesterov

[permalink] [raw]
Subject: Re: [PATCH] flush_cpu_workqueue: don't flush an empty ->worklist

On 01/09, Srivatsa Vaddagiri wrote:
>
> On Tue, Jan 09, 2007 at 06:07:55PM +0300, Oleg Nesterov wrote:
> > but at some point we should thaw processes, including cwq->thread which
> > should die.
>
> I am presuming we will thaw processes after all CPU_DEAD handlers have
> run.
>
> > So we are doing things like take_over_work() and this is the
> > source of races, because the dead CPU is not on cpu_online_map.
> >
> > flush_workqueue() doesn't use any locks now. If we use the freezer to
> > implement cpu-hotplug, nothing will change; we still have races.
>
> We have races -if- CPU_DEAD handling can run concurrently with an ongoing
> flush_workqueue. From my recent understanding of the process freezer, this
> is not possible. In other words, flush_workqueue() can keep its old
> implementation, as below, without any races:
>
> some_thread:
>
> for_each_online_cpu(i)
> flush_cpu_workqueue(i);
>
> As long as this loop is running, cpu_down/up will not proceed. This means
> cpu_online_map is stable even if flush_cpu_workqueue blocks ..
>
> Once this loop is complete and all threads have called try_to_freeze,
> cpu_down will proceed to change the bitmap and run the CPU_DEAD handlers
> of everyone. I am presuming we will thaw processes only after all
> CPU_DEAD/ONLINE handlers have run.

We can't do this. We should thaw cwq->thread (which was bound to the
dead CPU) to complete CPU_DEAD event. So we still need some changes.

Oleg.

2007-01-09 16:46:23

by Srivatsa Vaddagiri

[permalink] [raw]
Subject: Re: [PATCH] flush_cpu_workqueue: don't flush an empty ->worklist

On Tue, Jan 09, 2007 at 07:38:15PM +0300, Oleg Nesterov wrote:
> We can't do this. We should thaw cwq->thread (which was bound to the
> dead CPU) to complete CPU_DEAD event. So we still need some changes.

I noticed that, but I presumed kthread_stop() would post a wakeup which
would bring it out of the frozen state. Looking at refrigerator(), I realize
that is not possible.

So CPU_DEAD should do a thaw_process on the kthread before doing a
kthread_stop?

--
Regards,
vatsa

2007-01-09 16:57:29

by Oleg Nesterov

[permalink] [raw]
Subject: Re: [PATCH] flush_cpu_workqueue: don't flush an empty ->worklist

On 01/09, Srivatsa Vaddagiri wrote:
>
> On Tue, Jan 09, 2007 at 07:38:15PM +0300, Oleg Nesterov wrote:
> > We can't do this. We should thaw cwq->thread (which was bound to the
> > dead CPU) to complete CPU_DEAD event. So we still need some changes.
>
> I noticed that, but I presumed kthread_stop() will post a wakeup which
> will bring it out of frozen state. Looking at refrigerator(), I realize
> that is not possible.
>
> So CPU_DEAD should do a thaw_process on the kthread before doing a
> kthread_stop?

Probably we can do this, or that. In any case we need changes; that
was my point.

And the best change, I believe, is to _remove_ CPU_DEAD handling from
workqueue.c as I suggested before. This kthread_stop() is not a good
idea per se; it calls wake_up_process(), but we should in fact use
wake_up(&cwq->more_work). Yes, work->func() should be ready for
false wakeups, but still.

But for now I hope the last "draft" patch is enough. I'll continue
on next weekend.

Oleg.

2007-01-14 23:55:06

by Oleg Nesterov

[permalink] [raw]
Subject: Re: [PATCH] flush_cpu_workqueue: don't flush an empty ->worklist

How about the pseudo-code below?

workqueue_mutex is only used to protect "struct list_head workqueues";
all workqueue operations can run in parallel with the cpu-hotplug callback path.
take_over_work(), migrate_sequence, CPU_LOCK_ACQUIRE/RELEASE go away.

I'd like to make a couple of cleanups (and fix schedule_on_each_cpu) before
sending the patch, but if somebody doesn't like this intrusive change, he
can nack it right now.

Oleg.

struct cpu_workqueue_struct {
        ...
        int should_stop;
        ...
};

// also used by flush_work/flush_workqueue
static cpumask_t cpu_populated_map __read_mostly;

/*
* NOTE: the caller must not touch *cwq if this func returns true
*/
static inline int cwq_should_stop(struct cpu_workqueue_struct *cwq)
{
        int should_stop = cwq->should_stop;

        if (unlikely(should_stop)) {
                spin_lock_irq(&cwq->lock);
                should_stop = cwq->should_stop && list_empty(&cwq->worklist);
                if (should_stop)
                        cwq->thread = NULL;
                spin_unlock_irq(&cwq->lock);
        }

        return should_stop;
}

static int worker_thread(void *cwq)
{
        while (!cwq_should_stop(cwq)) {
                ...
                run_workqueue();
                ...
        }
}

static int create_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu)
{
        struct task_struct *p;

        spin_lock_irq(&cwq->lock);
        cwq->should_stop = 0;
        p = cwq->thread;
        spin_unlock_irq(&cwq->lock);

        if (!p) {
                struct workqueue_struct *wq = cwq->wq;
                const char *fmt = is_single_threaded(wq) ? "%s" : "%s/%d";

                p = kthread_create(worker_thread, cwq, fmt, wq->name, cpu);
                /*
                 * Nobody can add the work_struct to this cwq,
                 *      if (caller is __create_workqueue)
                 *              nobody should see this wq
                 *      else // caller is CPU_UP_PREPARE
                 *              cpu is not on cpu_online_map
                 * so we can abort safely.
                 */
                if (IS_ERR(p))
                        return PTR_ERR(p);

                if (!is_single_threaded(wq))
                        kthread_bind(p, cpu);
                /*
                 * Cancels affinity if the caller is CPU_UP_PREPARE.
                 * Needs a cleanup, but OK.
                 */
                wake_up_process(p);
                cwq->thread = p;
        }

        return 0;
}

struct workqueue_struct *__create_workqueue(const char *name,
                                            int singlethread, int freezeable)
{
        struct workqueue_struct *wq;
        struct cpu_workqueue_struct *cwq;
        int err = 0, cpu;

        wq = kzalloc(sizeof(*wq), GFP_KERNEL);
        if (!wq)
                return NULL;

        wq->cpu_wq = alloc_percpu(struct cpu_workqueue_struct);
        if (!wq->cpu_wq) {
                kfree(wq);
                return NULL;
        }

        wq->name = name;
        wq->freezeable = freezeable;

        if (singlethread) {
                INIT_LIST_HEAD(&wq->list);
                cwq = init_cpu_workqueue(wq, singlethread_cpu);
                err = create_workqueue_thread(cwq, singlethread_cpu);
        } else {
                mutex_lock(&workqueue_mutex);
                list_add(&wq->list, &workqueues);

                for_each_possible_cpu(cpu) {
                        cwq = init_cpu_workqueue(wq, cpu);
                        if (err || !cpu_isset(cpu, cpu_populated_map))
                                continue;
                        err = create_workqueue_thread(cwq, cpu);
                }
                mutex_unlock(&workqueue_mutex);
        }

        if (err) {
                destroy_workqueue(wq);
                wq = NULL;
        }
        return wq;
}

static void cleanup_workqueue_thread(struct workqueue_struct *wq, int cpu)
{
        struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu);
        struct wq_barrier barr;
        int alive = 0;

        spin_lock_irq(&cwq->lock);
        if (cwq->thread != NULL) {
                insert_wq_barrier(cwq, &barr, 1);
                cwq->should_stop = 1;
                alive = 1;
        }
        spin_unlock_irq(&cwq->lock);

        if (alive) {
                wait_for_completion(&barr.done);

                while (unlikely(cwq->thread != NULL))
                        cpu_relax();
                /*
                 * Wait until cwq->thread unlocks cwq->lock,
                 * it won't touch *cwq after that.
                 */
                smp_rmb();
                spin_unlock_wait(&cwq->lock);
        }
}

void destroy_workqueue(struct workqueue_struct *wq)
{
        if (is_single_threaded(wq))
                cleanup_workqueue_thread(wq, singlethread_cpu);
        else {
                int cpu;

                mutex_lock(&workqueue_mutex);
                list_del(&wq->list);
                mutex_unlock(&workqueue_mutex);

                for_each_cpu_mask(cpu, cpu_populated_map)
                        cleanup_workqueue_thread(wq, cpu);
        }

        free_percpu(wq->cpu_wq);
        kfree(wq);
}

static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
                                            unsigned long action,
                                            void *hcpu)
{
        struct workqueue_struct *wq;
        struct cpu_workqueue_struct *cwq;
        unsigned int cpu = (unsigned long)hcpu;
        int ret = NOTIFY_OK;

        mutex_lock(&workqueue_mutex);
        if (action == CPU_UP_PREPARE)
                cpu_set(cpu, cpu_populated_map);

        list_for_each_entry(wq, &workqueues, list) {
                cwq = per_cpu_ptr(wq->cpu_wq, cpu);

                switch (action) {
                case CPU_UP_PREPARE:
                        if (create_workqueue_thread(cwq, cpu))
                                ret = NOTIFY_BAD;
                        break;

                case CPU_ONLINE:
                        set_cpus_allowed(cwq->thread, cpumask_of_cpu(cpu));
                        break;

                case CPU_UP_CANCELED:
                case CPU_DEAD:
                        cwq->should_stop = 1;
                        wake_up(&cwq->more_work);
                        break;
                }

                if (ret != NOTIFY_OK) {
                        printk(KERN_ERR "workqueue for %i failed\n", cpu);
                        break;
                }
        }
        mutex_unlock(&workqueue_mutex);

        return ret;
}

void init_workqueues(void)
{
        ...
        cpu_populated_map = cpu_online_map;
        ...
}

2007-01-15 04:33:15

by Srivatsa Vaddagiri

[permalink] [raw]
Subject: Re: [PATCH] flush_cpu_workqueue: don't flush an empty ->worklist

On Mon, Jan 15, 2007 at 02:54:10AM +0300, Oleg Nesterov wrote:
> How about the pseudo-code below?

Some quick comments:

- singlethread_cpu needs to be hotplug safe (broken currently)

- Any reason why cpu_populated_map is not modified on CPU_DEAD?

- I feel more comfortable if workqueue_cpu_callback were to take
workqueue_mutex in LOCK_ACQ and release it in LOCK_RELEASE
notifications. This will provide stable access to cpu_populated_map
to functions like __create_workqueue.

Finally, I wonder if these changes will be unnecessary if we move to
process freezer based hotplug locking ...

--
Regards,
vatsa

2007-01-15 12:54:51

by Oleg Nesterov

[permalink] [raw]
Subject: Re: [PATCH] flush_cpu_workqueue: don't flush an empty ->worklist

On 01/15, Srivatsa Vaddagiri wrote:
>
> On Mon, Jan 15, 2007 at 02:54:10AM +0300, Oleg Nesterov wrote:
> > How about the pseudo-code below?
>
> Some quick comments:
>
> - singlethread_cpu needs to be hotplug safe (broken currently)

Why? Could you explain?

> - Any reason why cpu_populated_map is not modified on CPU_DEAD?

Because CPU_DEAD/CPU_UP_CANCELED doesn't wait for cwq->thread to exit.
cpu_populated_map never shrinks, it only grows on CPU_UP_PREPARE.

We can change this, but it needs some more code, and I am not sure
we need it. Note that a "false" bit in cpu_populated_map only means
that flush_work/flush_workqueue/destroy_workqueue will do lock/unlock
of cwq->lock, nothing more.

> - I feel more comfortable if workqueue_cpu_callback were to take
> workqueue_mutex in LOCK_ACQ and release it in LOCK_RELEASE
> notifications.

The whole purpose of this change is to avoid this!

Gautham R Shenoy wrote:
>
> Oh! I was refering to the other set of workqueue deadlock woes. The
> ones caused when subsystems (like cpufreq) try to create/destroy
> workqueue from the cpuhotplug callback path.
>
> Creation/destruction of workqueue requires us to take workqueue_mutex,
> which would have already been taken during CPU_LOCK_ACQUIRE.

With this change, workqueue_mutex is only taken to protect the workqueues
list; why should we hold it across (say) the CPU_UP_PREPARE->CPU_ONLINE path?

> This will provide stable access to cpu_populated_map
> to functions like __create_workqueue.

I think this is not needed.

> Finally, I wonder if these changes will be unnecessary if we move to
> process freezer based hotplug locking ...

This change is not strictly necessary, but imho it makes the code better
and shrinks .text by 379 bytes.

But I believe that freezer will change nothing for workqueue. We still
need take_over_work(), and hacks like migrate_sequence. And no, CPU_DEAD
can't just thaw cwq->thread which was bound to the dead CPU to complete
kthread_stop(), we should thaw all processes.

Oleg.

2007-01-15 13:08:56

by Oleg Nesterov

[permalink] [raw]
Subject: Re: [PATCH] flush_cpu_workqueue: don't flush an empty ->worklist

On 01/15, Oleg Nesterov wrote:
>
> cpu_populated_map never shrinks, it only grows on CPU_UP_PREPARE.

__create_workqueue() should not use it. Needs a fix.

Oleg.

2007-01-15 16:19:48

by Srivatsa Vaddagiri

[permalink] [raw]
Subject: Re: [PATCH] flush_cpu_workqueue: don't flush an empty ->worklist

On Mon, Jan 15, 2007 at 03:54:01PM +0300, Oleg Nesterov wrote:
> > - singlethread_cpu needs to be hotplug safe (broken currently)
>
> Why? Could you explain?

What if 'singlethread_cpu' dies?

> > - Any reason why cpu_populated_map is not modified on CPU_DEAD?
>
> Because CPU_DEAD/CPU_UP_CANCELED doesn't wait for cwq->thread to exit.
> cpu_populated_map never shrinks, it only grows on CPU_UP_PREPARE.
>
> We can change this, but it needs some more code, and I am not sure
> we need it. Note that a "false" bit in cpu_populated_map only means
> that flush_work/flush_workqueue/destroy_workqueue will do lock/unlock
> of cwq->lock, nothing more.

What about __create_workqueue/schedule_on_each_cpu?

> > - I feel more comfortable if workqueue_cpu_callback were to take
> > workqueue_mutex in LOCK_ACQ and release it in LOCK_RELEASE
> > notifications.
>
> The whole purpose of this change is to avoid this!

I guess it depends on how __create_workqueue/schedule_on_each_cpu is
modified (whether we take/release lock upon LOCK_ACQ/LOCK_RELEASE)

> > Finally, I wonder if these changes will be unnecessary if we move to
> > process freezer based hotplug locking ...
>
> This change is not strictly necessary, but imho it makes the code better
> and shrinks .text by 379 bytes.
>
> But I believe that freezer will change nothing for workqueue. We still
> need take_over_work(), and hacks like migrate_sequence. And no, CPU_DEAD
> can't just thaw cwq->thread which was bound to the dead CPU to complete
> kthread_stop(), we should thaw all processes.

What about stopping that thread in CPU_DOWN_PREPARE (before freezing
processes)? I understand that it may add to the latency, but compared to
the overall latency of process freezer, I suspect it may not be much.

--
Regards,
vatsa

2007-01-15 16:54:38

by Oleg Nesterov

[permalink] [raw]
Subject: Re: [PATCH] flush_cpu_workqueue: don't flush an empty ->worklist

On 01/15, Srivatsa Vaddagiri wrote:
>
> On Mon, Jan 15, 2007 at 03:54:01PM +0300, Oleg Nesterov wrote:
> > > - singlethread_cpu needs to be hotplug safe (broken currently)
> >
> > Why? Could you explain?
>
> What if 'singlethread_cpu' dies?

Still can't understand you. Probably you missed what singlethread_cpu is.

singlethread_cpu is just a "random" bit from cpu_possible_map. A
single-threaded workqueue is not bound to any cpu. We only need it to be
sure that percpu_data.ptrs[singlethread_cpu] is populated by
__percpu_alloc_mask().

> > > - Any reason why cpu_populated_map is not modified on CPU_DEAD?
> >
> > Because CPU_DEAD/CPU_UP_CANCELED doesn't wait for cwq->thread to exit.
> > cpu_populated_map never shrinks, it only grows on CPU_UP_PREPARE.
> >
> > We can change this, but it needs some more code, and I am not sure
> > we need it. Note that a "false" bit in cpu_populated_map only means
> > that flush_work/flush_workqueue/destroy_workqueue will do lock/unlock
> > of cwq->lock, nothing more.
>
> What about __create_workqueue/schedule_on_each_cpu?

As I said already, __create_workqueue() needs a fix; schedule_on_each_cpu()
is already broken and should be fixed as well.

> > > - I feel more comfortable if workqueue_cpu_callback were to take
> > > workqueue_mutex in LOCK_ACQ and release it in LOCK_RELEASE
> > > notifications.
> >
> > The whole purpose of this change is to avoid this!
>
> I guess it depends on how __create_workqueue/schedule_on_each_cpu is
> modified (whether we take/release lock upon LOCK_ACQ/LOCK_RELEASE)

Sorry, can't understand this...

> > > Finally, I wonder if these changes will be unnecessary if we move to
> > > process freezer based hotplug locking ...
> >
> > This change is not strictly necessary, but imho it makes the code better
> > and shrinks .text by 379 bytes.
> >
> > But I believe that freezer will change nothing for workqueue. We still
> > need take_over_work(), and hacks like migrate_sequence. And no, CPU_DEAD
> > can't just thaw cwq->thread which was bound to the dead CPU to complete
> > kthread_stop(), we should thaw all processes.
>
> What about stopping that thread in CPU_DOWN_PREPARE (before freezing
> processes)? I understand that it may add to the latency, but compared to
> the overall latency of process freezer, I suspect it may not be much.

Srivatsa, why do you think this would be better?

It adds to the complexity! What do you mean by "stopping that thread"?
Kill it? - this is wrong. Make it TASK_STOPPED? - very nontrivial, and I
can't see how this helps.

Oleg.

2007-01-16 05:26:11

by Srivatsa Vaddagiri

[permalink] [raw]
Subject: Re: [PATCH] flush_cpu_workqueue: don't flush an empty ->worklist

On Mon, Jan 15, 2007 at 07:55:16PM +0300, Oleg Nesterov wrote:
> > What if 'singlethread_cpu' dies?
>
> Still can't understand you. Probably you missed what singlethread_cpu is.

oops yes .. I had mistakenly thought that create_workqueue_thread() would
bind the worker thread to singlethread_cpu for a single-threaded workqueue.
So it isn't a problem.

> > What about __create_workqueue/schedule_on_each_cpu?
>
> As I said already __create_workqueue() needs a fix, schedule_on_each_cpu()
> is already broken, and should be fixed as well.

__create_workqueue() creates worker threads for all online CPUs
currently. Accessing the online_map could be racy unless we
serialize the access with hotplug events (through a mutex like
workqueue_mutex held between LOCK_ACQ/LOCK_RELEASE messages, or the
process freezer) OR take special measures as was done in
flush_workqueue(). How were you considering dealing with that raciness?

> > > The whole purpose of this change is to avoid this!
> >
> > I guess it depends on how __create_workqueue/schedule_on_each_cpu is
> > modified (whether we take/release lock upon LOCK_ACQ/LOCK_RELEASE)
>
> Sorry, can't understand this...

I meant to say that depending on how we modify
__create_workqueue/schedule_on_each_cpu to avoid racy access to
online_map, we can debate whether workqueue_mutex needs to be held
between LOCK_ACQ/LOCK_RELEASE messages in the callback.

> > What about stopping that thread in CPU_DOWN_PREPARE (before freezing
> > processes)? I understand that it may add to the latency, but compared to
> > the overall latency of process freezer, I suspect it may not be much.
>
> Srivatsa, why do you think this would be better?
>
> It adds to the complexity! What do you mean by "stopping that thread"?
> Kill it? - this is wrong.

I meant issuing kthread_stop() in DOWN_PREPARE so that the worker
thread exits on its own (well before the CPU is actually brought down).

Do you see any problems with that?

Even if there are problems with it, how about something like the below:

workqueue_cpu_callback()
{
        CPU_DEAD:
                /* threads are still frozen at this point */
                take_over_work();
                kthread_mark_stop(worker_thread);
                break;

        CPU_CLEAN_THREADS:
                /* all threads resumed by now */
                kthread_stop(worker_thread); /* task_struct ref required? */
                break;
}

kthread_mark_stop() will mark somewhere in task_struct that the thread
should exit when it comes out of the refrigerator.

worker_thread()
{
        while (!kthread_should_stop()) {
                if (cwq->freezeable)
                        try_to_freeze();

                if (kthread_marked_stop(current))
                        break;

                ...
        }
}

The advantage I see above is that, when take_over_work() is running, we won't
race with functions like flush_workqueue() (threads are still frozen at that
point) and hence we avoid hacks like migrate_sequence. This will also
let functions like flush_workqueue() easily access cpu_online_map as
below -without- any special locking/hacks (which I consider a great
benefit for programmers).

flush_workqueue()
{
        for_each_online_cpu(i)
                flush_cpu_workqueue(i);
}

Do you see any problems with this latter approach?

--
Regards,
vatsa

2007-01-16 13:28:34

by Oleg Nesterov

[permalink] [raw]
Subject: Re: [PATCH] flush_cpu_workqueue: don't flush an empty ->worklist

On 01/16, Srivatsa Vaddagiri wrote:
>
> On Mon, Jan 15, 2007 at 07:55:16PM +0300, Oleg Nesterov wrote:
> >
> > As I said already __create_workqueue() needs a fix, schedule_on_each_cpu()
> > is already broken, and should be fixed as well.
>
> __create_workqueue() creates worker threads for all online CPUs
> currently. Accessing the online_map could be racy unless we
> serialize the access with hotplug events (through a mutex like
> workqueue_mutex held between LOCK_ACQ/LOCK_RELEASE messages, or the
> process freezer) OR take special measures as was done in
> flush_workqueue(). How were you considering dealing with that raciness?

This is easy to fix in multiple ways. For example (this is not the best
approach), we can add a cpu_creation_map: like cpu_populated_map, but the
CPU's bit is cleared on CPU_DEAD/CPU_UP_CANCELED.

I'll continue when I have time.

> > > What about stopping that thread in CPU_DOWN_PREPARE (before freezing
> > > processes)? I understand that it may add to the latency, but compared to
> > > the overall latency of the process freezer, I suspect it may not be much.
>
> I meant issuing kthread_stop() in DOWN_PREPARE so that the worker
> thread exits on its own (well before the CPU is actually brought down).

Deadlock if a work_struct re-queues itself.
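
For example, a work like this keeps ->worklist from ever draining, so
the worker never exits and kthread_stop() never returns (illustration
only, "my_wq" is a made-up queue):

	static void rearm_func(struct work_struct *work)
	{
		/* puts itself right back on ->worklist */
		queue_work(my_wq, work);
	}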

> Even if there are problems with it, how about something like the below:
>
> workqueue_cpu_callback()
> {
> 	CPU_DEAD:
> 		/* threads are still frozen at this point */
> 		take_over_work();

No, we can't move a currently executing work. This will break flush_workxxx().

> 		kthread_mark_stop(worker_thread);
> 		break;
>
> 	CPU_CLEAN_THREADS:
> 		/* all threads resumed by now */
> 		kthread_stop(worker_thread); /* task_struct ref required? */

Yes, it is required.

> kthread_mark_stop() will mark somewhere in the task_struct that the
> thread should exit when it comes out of the refrigerator.
>
> worker_thread()
> {
> 	while (!kthread_should_stop()) {
> 		if (cwq->freezeable)
> 			try_to_freeze();
>
> 		if (kthread_marked_stop(current))
> 			break;

Racy. Because of the kthread_stop() above we should clear cwq->thread
somehow. But we can't do this: the workqueue may already be destroyed.

> The advantage I see above is that, when take_over_work() is running, we
> won't race with functions like flush_workqueue() (threads are still
> frozen at that point), and hence we avoid hacks like migrate_sequence.

See above.

Please note that the code I posted does something like
kthread_mark_stop(), but it operates on the cwq, not on the
task_struct; this makes a big difference. It doesn't need
take_over_work() at all, and it doesn't need additional complications.
Instead, it shrinks both the source and the compiled code.

Note that I am not against using the freezer for cpu-hotplug. My point
is that it can't help much with the current workqueue.c, and it will be
completely transparent with the change I suggest.

Oleg.

2007-01-17 06:17:15

by Srivatsa Vaddagiri

[permalink] [raw]
Subject: Re: [PATCH] flush_cpu_workqueue: don't flush an empty ->worklist

On Tue, Jan 16, 2007 at 04:27:25PM +0300, Oleg Nesterov wrote:
> > I meant issuing kthread_stop() in DOWN_PREPARE so that the worker
> > thread exits on its own (well before the CPU is actually brought down).
>
> Deadlock if a work_struct re-queues itself.

Are you referring to the problem you described here?

http://lkml.org/lkml/2007/1/8/173

If so, then can't it easily be prevented by having run_workqueue()
check for kthread_should_stop() in its while loop?
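
Something like this (a sketch):

	/* in run_workqueue()'s drain loop */
	while (!list_empty(&cwq->worklist)) {
		if (kthread_should_stop())
			break;	/* leave the remaining works queued */
		...
	}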

> > Even if there are problems with it, how about something like the below:
> >
> > workqueue_cpu_callback()
> > {
> > 	CPU_DEAD:
> > 		/* threads are still frozen at this point */
> > 		take_over_work();
>
> No, we can't move a currently executing work. This will break flush_workxxx().

What do you mean by a "currently" executing work? A worker thread
executing some work on the CPU? That is not possible, because all
threads are frozen at this point. There can't be any ongoing
flush_workxxx() either, because of this, which should avoid breaking
flush_workxxx().

> > 		if (kthread_marked_stop(current))
> > 			break;
>
> Racy. Because of the kthread_stop() above we should clear cwq->thread
> somehow. But we can't do this: the workqueue may already be destroyed.

We will surely take the workqueue_mutex in CPU_CLEAN_THREADS (when it
traverses the workqueues list); shouldn't that avoid this race?
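
Roughly (a sketch of what I have in mind):

	case CPU_CLEAN_THREADS:
		mutex_lock(&workqueue_mutex);
		list_for_each_entry(wq, &workqueues, list)
			kthread_stop(per_cpu_ptr(wq->cpu_wq, cpu)->thread);
		mutex_unlock(&workqueue_mutex);
		break;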

> Please note that the code I posted does something like
> kthread_mark_stop(), but it operates on the cwq, not on the
> task_struct; this makes a big difference.

OK, sure... putting the flag in the cwq makes sense. Other subsystems
can also follow a similar trick for stopping threads (like ksoftirqd).

> It doesn't need take_over_work() at all, and it doesn't need additional
> complications. Instead, it shrinks both the source and the compiled code.

I guess either way, changes are required.

1st method, what you are suggesting:

	- Needs separate bitmap(s): cpu_populated_map, and possibly another
	  for create_workqueue()?
	- flush_workqueue() traverses a separate bitmap,
	  cpu_populated_map (separate from the online map), while
	  create_workqueue() traverses the other bitmap.

2nd method:

	- Avoids the need to maintain separate bitmaps (uses the
	  existing cpu_online_map). All functions can safely use
	  the online map w/o any races. Personally, this is why I like
	  this approach.
	- Needs changes in worker_thread to exit right after it comes
	  out of the refrigerator.

I haven't made any changes as per the 2nd method to see the resulting
code size, so I can't comment on code sizes.

Another point is that once we create code as in the 1st method, which
maintains separate bitmaps, it will easily get replicated (over time)
to other subsystems. Is that a good thing?

--
Regards,
vatsa

2007-01-17 15:48:44

by Oleg Nesterov

[permalink] [raw]
Subject: Re: [PATCH] flush_cpu_workqueue: don't flush an empty ->worklist

On 01/17, Srivatsa Vaddagiri wrote:
>
> On Tue, Jan 16, 2007 at 04:27:25PM +0300, Oleg Nesterov wrote:
> > > I meant issuing kthread_stop() in DOWN_PREPARE so that the worker
> > > thread exits on its own (well before the CPU is actually brought down).
> >
> > Deadlock if a work_struct re-queues itself.
>
> Are you referring to the problem you described here?
>
> http://lkml.org/lkml/2007/1/8/173
>
> If so, then can't it easily be prevented by having run_workqueue()
> check for kthread_should_stop() in its while loop?

flush_workqueue() also calls run_workqueue(), so such a check would
also let a flush return with works still pending.

> > > workqueue_cpu_callback()
> > > {
> > > 	CPU_DEAD:
> > > 		/* threads are still frozen at this point */
> > > 		take_over_work();
> >
> > No, we can't move a currently executing work. This will break flush_workxxx().
>
> What do you mean by a "currently" executing work? A worker thread
> executing some work on the CPU? That is not possible, because all
> threads are frozen at this point. There can't be any ongoing
> flush_workxxx() either, because of this, which should avoid breaking
> flush_workxxx().

work->func() sleeps/is frozen. We can't move the rest of the pending
jobs before it completes; that would break flush_workxxx(). And no,
this is not because we use barriers now.

> 1st method, what you are suggesting:
>
> 	- Needs separate bitmap(s): cpu_populated_map, and possibly another
> 	  for create_workqueue()?
> 	- flush_workqueue() traverses a separate bitmap,
> 	  cpu_populated_map (separate from the online map), while
> 	  create_workqueue() traverses the other bitmap.

Yes, we need the additional bitmap. This is an optimization; we could
just use cpu_possible_map. create_workqueue() can use cpu_online_map +
an "int new_cpu".

Yes, this is a complication. But it is still much simpler (IMHO) than
what we have now, and IMHO better.

> 2nd method:
>
> 	- Avoids the need to maintain separate bitmaps (uses the
> 	  existing cpu_online_map). All functions can safely use
> 	  the online map w/o any races. Personally, this is why I like
> 	  this approach.
> 	- Needs changes in worker_thread to exit right after it comes
> 	  out of the refrigerator.
>
> I haven't made any changes as per the 2nd method to see the resulting
> code size, so I can't comment on code sizes.

Yes, yes, yes, let's see the code first! :) Then we can compare.
Right now:
	- cpu-hotplug doesn't use the freezer yet
	- all the ideas about using it to improve workqueue.c were wrong

> Another point is that once we create code as in the 1st method, which
> maintains separate bitmaps, it will easily get replicated (over time)
> to other subsystems. Is that a good thing?

Honestly, I can't understand your point. Why would it get replicated?
Because another subsystem will need cpu_populated_map too? Then we can
remove the "static" and move cpu_populated_map to kernel/cpu.c.

Btw, I agree it is good to have a sleeping lock to protect cpu_online_map.
But it should be separate from workqueue_mutex, and it is not needed for
the create/destroy/flush functions.

Oleg.

2007-01-17 16:12:24

by Srivatsa Vaddagiri

[permalink] [raw]
Subject: Re: [PATCH] flush_cpu_workqueue: don't flush an empty ->worklist

On Wed, Jan 17, 2007 at 06:47:16PM +0300, Oleg Nesterov wrote:
> > What do you mean by a "currently" executing work? A worker thread
> > executing some work on the CPU? That is not possible, because all
> > threads are frozen at this point. There can't be any ongoing
> > flush_workxxx() either, because of this, which should avoid breaking
> > flush_workxxx().
>
> work->func() sleeps/is frozen.

Didn't Andrew call that (work->func() calling try_to_freeze()) madness?

http://lkml.org/lkml/2007/01/07/166

Does that happen in practice?

--
Regards,
vatsa

2007-01-17 16:25:12

by Srivatsa Vaddagiri

[permalink] [raw]
Subject: Re: [PATCH] flush_cpu_workqueue: don't flush an empty ->worklist

On Wed, Jan 17, 2007 at 06:47:16PM +0300, Oleg Nesterov wrote:
> Btw, I agree it is good to have a sleeping lock to protect cpu_online_map.
> But it should be separate from workqueue_mutex, and it is not needed for
> create/destroy/flush funcs.

Which is what lock_cpu_hotplug() attempted to provide, but which has
received a lot of flak in recent days. I guess if someone implements
process-freezer-based locking of cpu_online_map, we will know better
how suited the latter is for cpu hotplug.

--
Regards,
vatsa

2007-01-17 17:03:27

by Oleg Nesterov

[permalink] [raw]
Subject: Re: [PATCH] flush_cpu_workqueue: don't flush an empty ->worklist

On 01/17, Srivatsa Vaddagiri wrote:
>
> On Wed, Jan 17, 2007 at 06:47:16PM +0300, Oleg Nesterov wrote:
> > > What do you mean by a "currently" executing work? A worker thread
> > > executing some work on the CPU? That is not possible, because all
> > > threads are frozen at this point. There can't be any ongoing
> > > flush_workxxx() either, because of this, which should avoid breaking
> > > flush_workxxx().
> >
> > work->func() sleeps/is frozen.
>
> Didn't Andrew call that (work->func() calling try_to_freeze()) madness?
>
> http://lkml.org/lkml/2007/01/07/166

This means we should move try_to_freeze() into run_workqueue(), or we
should forbid auto-requeued works.
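
Roughly (a sketch; the point is only that the freeze point moves out of
work->func()):

	static void run_workqueue(struct cpu_workqueue_struct *cwq)
	{
		...
		while (!list_empty(&cwq->worklist)) {
			...
			f(work);

			/* freeze between works, never inside work->func() */
			if (cwq->wq->freezeable)
				try_to_freeze();
			...
		}
		...
	}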

I don't have time to do anything till the weekend, but please see a
"final" version below.

- Do you see any bugs?

- Do you agree it is better than what we have now?

If/when we use the freezer/lock_cpu_hotplug(), we can probably simplify
things further.

Note: schedule_on_each_cpu() remains broken.

Oleg.

struct cpu_workqueue_struct {
	...
	int should_stop;
	...
};

static cpumask_t cpu_populated_map __read_mostly; /* also used in flush_work... */
static int embryonic_cpu __read_mostly = -1;

/*
 * NOTE: the caller must not touch *cwq if this func returns true
 */
static inline int cwq_should_stop(struct cpu_workqueue_struct *cwq)
{
	int should_stop = cwq->should_stop;

	if (unlikely(should_stop)) {
		spin_lock_irq(&cwq->lock);
		should_stop = cwq->should_stop && list_empty(&cwq->worklist);
		if (should_stop)
			cwq->thread = NULL;
		spin_unlock_irq(&cwq->lock);
	}

	return should_stop;
}

static int worker_thread(void *cwq)
{
	...
	while (!cwq_should_stop(cwq)) {
		if (cwq->wq->freezeable)
			try_to_freeze();

		prepare_to_wait(&cwq->more_work, &wait, TASK_INTERRUPTIBLE);
		if (!cwq->should_stop && list_empty(&cwq->worklist))
			schedule();
		finish_wait(&cwq->more_work, &wait);

		run_workqueue(cwq);
	}
	return 0;
}

static int create_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu)
{
	struct task_struct *p;

	spin_lock_irq(&cwq->lock);
	cwq->should_stop = 0;
	p = cwq->thread;
	spin_unlock_irq(&cwq->lock);

	if (!p) {
		struct workqueue_struct *wq = cwq->wq;
		const char *fmt = is_single_threaded(wq) ? "%s" : "%s/%d";

		p = kthread_create(worker_thread, cwq, fmt, wq->name, cpu);
		/*
		 * Nobody can add the work_struct to this cwq,
		 *	if (caller is __create_workqueue)
		 *		nobody should see this wq
		 *	else // caller is CPU_UP_PREPARE
		 *		cpu is not on cpu_online_map
		 * so we can abort safely.
		 */
		if (IS_ERR(p))
			return PTR_ERR(p);

		if (!is_single_threaded(wq))
			kthread_bind(p, cpu);
		/*
		 * Cancels affinity if the caller is CPU_UP_PREPARE.
		 * Needs a cleanup, but OK.
		 */
		wake_up_process(p);
		cwq->thread = p;
	}

	return 0;
}

struct workqueue_struct *__create_workqueue(const char *name,
					    int singlethread, int freezeable)
{
	struct workqueue_struct *wq;
	struct cpu_workqueue_struct *cwq;
	int err = 0, cpu;

	wq = kzalloc(sizeof(*wq), GFP_KERNEL);
	if (!wq)
		return NULL;

	wq->cpu_wq = alloc_percpu(struct cpu_workqueue_struct);
	if (!wq->cpu_wq) {
		kfree(wq);
		return NULL;
	}

	wq->name = name;
	wq->freezeable = freezeable;

	if (singlethread) {
		INIT_LIST_HEAD(&wq->list);
		cwq = init_cpu_workqueue(wq, singlethread_cpu);
		err = create_workqueue_thread(cwq, singlethread_cpu);
	} else {
		mutex_lock(&workqueue_mutex);
		list_add(&wq->list, &workqueues);

		for_each_possible_cpu(cpu) {
			cwq = init_cpu_workqueue(wq, cpu);
			if (err || !(cpu_online(cpu) || cpu == embryonic_cpu))
				continue;
			err = create_workqueue_thread(cwq, cpu);
		}
		mutex_unlock(&workqueue_mutex);
	}

	if (err) {
		destroy_workqueue(wq);
		wq = NULL;
	}
	return wq;
}

static void cleanup_workqueue_thread(struct workqueue_struct *wq, int cpu)
{
	struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu);
	struct wq_barrier barr;
	int alive = 0;

	spin_lock_irq(&cwq->lock);
	if (cwq->thread != NULL) {
		/* queue the barrier and set should_stop under the same lock */
		insert_wq_barrier(cwq, &barr, 1);
		cwq->should_stop = 1;
		alive = 1;
	}
	spin_unlock_irq(&cwq->lock);

	if (alive) {
		wait_for_completion(&barr.done);

		while (unlikely(cwq->thread != NULL))
			cpu_relax();
		/*
		 * Wait until cwq->thread unlocks cwq->lock,
		 * it won't touch *cwq after that.
		 */
		smp_rmb();
		spin_unlock_wait(&cwq->lock);
	}
}

void destroy_workqueue(struct workqueue_struct *wq)
{
	if (is_single_threaded(wq))
		cleanup_workqueue_thread(wq, singlethread_cpu);
	else {
		int cpu;

		mutex_lock(&workqueue_mutex);
		list_del(&wq->list);
		mutex_unlock(&workqueue_mutex);

		for_each_cpu_mask(cpu, cpu_populated_map)
			cleanup_workqueue_thread(wq, cpu);
	}

	free_percpu(wq->cpu_wq);
	kfree(wq);
}

static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
					    unsigned long action,
					    void *hcpu)
{
	struct workqueue_struct *wq;
	struct cpu_workqueue_struct *cwq;
	unsigned int cpu = (unsigned long)hcpu;
	int ret = NOTIFY_OK;

	mutex_lock(&workqueue_mutex);
	embryonic_cpu = -1;
	if (action == CPU_UP_PREPARE) {
		cpu_set(cpu, cpu_populated_map);
		embryonic_cpu = cpu;
	}

	list_for_each_entry(wq, &workqueues, list) {
		cwq = per_cpu_ptr(wq->cpu_wq, cpu);

		switch (action) {
		case CPU_UP_PREPARE:
			if (create_workqueue_thread(cwq, cpu))
				ret = NOTIFY_BAD;
			break;

		case CPU_ONLINE:
			set_cpus_allowed(cwq->thread, cpumask_of_cpu(cpu));
			break;

		case CPU_UP_CANCELED:
		case CPU_DEAD:
			cwq->should_stop = 1;
			wake_up(&cwq->more_work);
			break;
		}

		if (ret != NOTIFY_OK) {
			printk(KERN_ERR "workqueue for %i failed\n", cpu);
			break;
		}
	}
	mutex_unlock(&workqueue_mutex);

	return ret;
}

void init_workqueues(void)
{
	...
	cpu_populated_map = cpu_online_map;
	...
}