Thanks to Jarek Poplawski for the ideas and for spotting the bug in the
initial draft patch.
cancel_rearming_delayed_work() currently has many limitations, because it
requires that dwork always re-arms itself via queue_delayed_work(). So it
hangs forever if dwork doesn't do this, or if cancel_rearming_delayed_work()/
cancel_delayed_work() was already called. It uses flush_workqueue() in a loop,
so it can't be used if the workqueue was frozen, and it is potentially live-
lockable on a busy system if the delay is small.
With this patch cancel_rearming_delayed_work() doesn't make any assumptions
about dwork: it can re-arm itself via queue_delayed_work() or queue_work(),
or do nothing.
As a "side effect", cancel_work_sync() was changed to handle re-arming works
as well.
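For illustration only - a minimal sketch with made-up names, not part of
the patch - a self-rearming delayed work can now be torn down with a single
call and no extra flush:

	static void my_work_fn(struct work_struct *work);
	static DECLARE_DELAYED_WORK(my_dwork, my_work_fn);	/* hypothetical */

	static void my_work_fn(struct work_struct *work)
	{
		/* ... do the periodic job ... */
		schedule_delayed_work(&my_dwork, HZ);		/* re-arm */
	}

	/* teardown: stops both the timer and the work, no extra
	 * cancel_work_sync()/flush_workqueue() is needed afterwards */
	cancel_rearming_delayed_work(&my_dwork);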
Disadvantages:
- this patch adds wmb() to insert_work().
- it slows down the fast path (when del_timer() succeeds on entry) of
cancel_rearming_delayed_work(), because wait_on_work() is called
unconditionally. In that case, compared to the old version, we do
"unneeded" lock/unlock for each online CPU.
On the other hand, this means we don't need to use cancel_work_sync()
after cancel_rearming_delayed_work().
- complicates the code (.text grows by 130 bytes).
Signed-off-by: Oleg Nesterov <[email protected]>
--- OLD/kernel/workqueue.c~1_CRDW 2007-05-02 23:29:07.000000000 +0400
+++ OLD/kernel/workqueue.c 2007-05-03 22:42:29.000000000 +0400
@@ -120,6 +120,11 @@ static void insert_work(struct cpu_workq
struct work_struct *work, int tail)
{
set_wq_data(work, cwq);
+ /*
+ * Ensure that we get the right work->data if we see the
+ * result of list_add() below, see try_to_grab_pending().
+ */
+ smp_wmb();
if (tail)
list_add_tail(&work->entry, &cwq->worklist);
else
@@ -381,7 +386,46 @@ void fastcall flush_workqueue(struct wor
}
EXPORT_SYMBOL_GPL(flush_workqueue);
-static void wait_on_work(struct cpu_workqueue_struct *cwq,
+/*
+ * Upon a successful return, the caller "owns" WORK_STRUCT_PENDING bit,
+ * so this work can't be re-armed in any way.
+ */
+static int try_to_grab_pending(struct work_struct *work)
+{
+ struct cpu_workqueue_struct *cwq;
+ int ret = 0;
+
+ if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work)))
+ return 1;
+
+ /*
+ * The queueing is in progress, or it is already queued. Try to
+ * steal it from ->worklist without clearing WORK_STRUCT_PENDING.
+ */
+
+ cwq = get_wq_data(work);
+ if (!cwq)
+ return ret;
+
+ spin_lock_irq(&cwq->lock);
+ if (!list_empty(&work->entry)) {
+ /*
+ * This work is queued, but perhaps we locked the wrong cwq.
+ * In that case we must see the new value after rmb(), see
+ * insert_work()->wmb().
+ */
+ smp_rmb();
+ if (cwq == get_wq_data(work)) {
+ list_del_init(&work->entry);
+ ret = 1;
+ }
+ }
+ spin_unlock_irq(&cwq->lock);
+
+ return ret;
+}
+
+static void wait_on_cpu_work(struct cpu_workqueue_struct *cwq,
struct work_struct *work)
{
struct wq_barrier barr;
@@ -398,20 +442,7 @@ static void wait_on_work(struct cpu_work
wait_for_completion(&barr.done);
}
-/**
- * cancel_work_sync - block until a work_struct's callback has terminated
- * @work: the work which is to be flushed
- *
- * cancel_work_sync() will attempt to cancel the work if it is queued. If the
- * work's callback appears to be running, cancel_work_sync() will block until
- * it has completed.
- *
- * cancel_work_sync() is designed to be used when the caller is tearing down
- * data structures which the callback function operates upon. It is expected
- * that, prior to calling cancel_work_sync(), the caller has arranged for the
- * work to not be requeued.
- */
-void cancel_work_sync(struct work_struct *work)
+static void wait_on_work(struct work_struct *work)
{
struct cpu_workqueue_struct *cwq;
struct workqueue_struct *wq;
@@ -421,29 +452,59 @@ void cancel_work_sync(struct work_struct
might_sleep();
cwq = get_wq_data(work);
- /* Was it ever queued ? */
if (!cwq)
return;
- /*
- * This work can't be re-queued, no need to re-check that
- * get_wq_data() is still the same when we take cwq->lock.
- */
- spin_lock_irq(&cwq->lock);
- list_del_init(&work->entry);
- work_clear_pending(work);
- spin_unlock_irq(&cwq->lock);
-
wq = cwq->wq;
cpu_map = wq_cpu_map(wq);
for_each_cpu_mask(cpu, *cpu_map)
- wait_on_work(per_cpu_ptr(wq->cpu_wq, cpu), work);
+ wait_on_cpu_work(per_cpu_ptr(wq->cpu_wq, cpu), work);
+}
+
+/**
+ * cancel_work_sync - block until a work_struct's callback has terminated
+ * @work: the work which is to be flushed
+ *
+ * cancel_work_sync() will cancel the work if it is queued. If the work's
+ * callback appears to be running, cancel_work_sync() will block until it
+ * has completed.
+ *
+ * It is possible to use this function if the work re-queues itself. It can
+ * cancel the work even if it migrates to another workqueue, however in that
+ * case it only guarantees that work->func() has completed on the last queued
+ * workqueue.
+ *
+ * The caller must ensure that workqueue_struct on which this work was last
+ * queued can't be destroyed before this function returns.
+ */
+void cancel_work_sync(struct work_struct *work)
+{
+ while (!try_to_grab_pending(work))
+ ;
+ wait_on_work(work);
+ work_clear_pending(work);
}
EXPORT_SYMBOL_GPL(cancel_work_sync);
+/**
+ * cancel_rearming_delayed_work - reliably kill off a delayed work.
+ * @dwork: the delayed work struct
+ *
+ * It is possible to use this function if dwork rearms itself via queue_work()
+ * or queue_delayed_work(). See also the comment for cancel_work_sync().
+ */
+void cancel_rearming_delayed_work(struct delayed_work *dwork)
+{
+ while (!del_timer(&dwork->timer) &&
+ !try_to_grab_pending(&dwork->work))
+ ;
+ wait_on_work(&dwork->work);
+ work_clear_pending(&dwork->work);
+}
+EXPORT_SYMBOL(cancel_rearming_delayed_work);
-static struct workqueue_struct *keventd_wq;
+static struct workqueue_struct *keventd_wq __read_mostly;
/**
* schedule_work - put work task in global workqueue
@@ -530,28 +591,6 @@ void flush_scheduled_work(void)
EXPORT_SYMBOL(flush_scheduled_work);
/**
- * cancel_rearming_delayed_work - kill off a delayed work whose handler rearms the delayed work.
- * @dwork: the delayed work struct
- *
- * Note that the work callback function may still be running on return from
- * cancel_delayed_work(). Run flush_workqueue() or cancel_work_sync() to wait
- * on it.
- */
-void cancel_rearming_delayed_work(struct delayed_work *dwork)
-{
- struct cpu_workqueue_struct *cwq = get_wq_data(&dwork->work);
-
- /* Was it ever queued ? */
- if (cwq != NULL) {
- struct workqueue_struct *wq = cwq->wq;
-
- while (!cancel_delayed_work(dwork))
- flush_workqueue(wq);
- }
-}
-EXPORT_SYMBOL(cancel_rearming_delayed_work);
-
-/**
* execute_in_process_context - reliably execute the routine with user context
* @fn: the function to execute
* @ew: guaranteed storage for the execute work structure (must
On Fri, 4 May 2007 00:42:26 +0400
Oleg Nesterov <[email protected]> wrote:
> Thanks to Jarek Poplawski for the ideas and for spotting the bug in the
> initial draft patch.
>
> cancel_rearming_delayed_work() currently has many limitations, because it
> requires that dwork always re-arms itself via queue_delayed_work(). So it
> hangs forever if dwork doesn't do this, or cancel_rearming_delayed_work/
> cancel_delayed_work was already called. It uses flush_workqueue() in a loop,
> so it can't be used if workqueue was freezed, and it is potentially live-
> lockable on busy system if delay is small.
>
> With this patch cancel_rearming_delayed_work() doesn't make any assumptions
> about dwork, it can re-arm itself via queue_delayed_work(), or queue_work(),
> or do nothing.
>
> As a "side effect", cancel_work_sync() was changed to handle re-arming works
> as well.
>
> Disadvantages:
>
> - this patch adds wmb() to insert_work().
>
> - slowdowns the fast path (when del_timer() succeeds on entry) of
> cancel_rearming_delayed_work(), because wait_on_work() is called
> unconditionally. In that case, compared to the old version, we are
> doing "unneeded" lock/unlock for each online CPU.
>
> On the other hand, this means we don't need to use cancel_work_sync()
> after cancel_rearming_delayed_work().
>
> - complicates the code (.text grows by 130 bytes).
>
hm, this is getting complex.
> + while (!try_to_grab_pending(work))
> + ;
The patch adds a couple of spinloops. Normally we put a cpu_relax() into
such loops. It can make a very large difference under some circumstances.
> + while (!del_timer(&dwork->timer) &&
> + !try_to_grab_pending(&dwork->work))
> + ;
On 05/03, Andrew Morton wrote:
>
> On Fri, 4 May 2007 00:42:26 +0400
> Oleg Nesterov <[email protected]> wrote:
>
> > Disadvantages:
> >
> > - this patch adds wmb() to insert_work().
> >
> > - slowdowns the fast path (when del_timer() succeeds on entry) of
> > cancel_rearming_delayed_work(), because wait_on_work() is called
> > unconditionally. In that case, compared to the old version, we are
> > doing "unneeded" lock/unlock for each online CPU.
> >
> > On the other hand, this means we don't need to use cancel_work_sync()
> > after cancel_rearming_delayed_work().
> >
> > - complicates the code (.text grows by 130 bytes).
> >
>
> hm, this is getting complex.
Yes, and I can't say I like this patch very much.
First, I am not really sure it is terribly useful. Yes, cancel_rearming_delayed_work
sucks, but did we have any problem in practice? The most annoying problem is that it
can't cancel @dwork which doesn't re-arm itself unconditionally. But this is not so
common, and ata_port_flush_task() shows an example of how to do this. However, it also
shows that this is not so trivial, and work->func() should participate.
Also, we can solve this problem in a simpler way. For example, we can shift
"timer->function = delayed_work_timer_fn" from queue_delayed_work() to INIT_DELAYED_WORK().
Then, roughly,
cancel_rearming_delayed_work(dwork)
{
	dwork->timer.function = do_nothing_timer_fn;
	del_timer_sync(&dwork->timer);
	wait_on_work(&dwork->work);
	dwork->timer.function = delayed_work_timer_fn;
	del_timer(&dwork->timer);
	work_clear_pending(&dwork->work);
}
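Here do_nothing_timer_fn is not an existing helper; in this sketch it would
presumably just be an empty stub, e.g.

	static void do_nothing_timer_fn(unsigned long data)
	{
		/* intentionally empty: swallow the timer firing */
	}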
But this is so hackish, and doesn't work if work->func() uses queue_work() or
queue_delayed_work(delay = 0) to re-arm itself. Perhaps we can forbid this,
and make a simpler patch.
> > + while (!try_to_grab_pending(work))
> > + ;
>
> The patch adds a couple of spinloops. Normally we put a cpu_relax() into
> such loops. It can make a very large difference under some circumstances.
Ah, yes. I'll send a fix along with a little comments update.
Oleg.
on top of
make-cancel_rearming_delayed_work-reliable-spelling.patch
Add cpu_relax() into the spinloops, and a comment update.
Small note. The new implementation has another downside. Suppose that a rearming
work->func() gets preempted after setting WORK_STRUCT_PENDING but before
add_timer/__queue_work. In that case cancel_rearming_delayed_work() will burn
CPU in a busy-wait loop. Fortunately this can happen only with CONFIG_PREEMPT,
and we spin with preemption enabled.
We can avoid this,
void cancel_rearming_delayed_work(struct delayed_work *dwork)
{
int retry;
do {
retry = !del_timer(&dwork->timer) &&
!try_to_grab_pending(&dwork->work);
wait_on_work(&dwork->work);
} while (retry);
work_clear_pending(&dwork->work);
}
but I don't think this is worth fixing.
Signed-off-by: Oleg Nesterov <[email protected]>
--- OLD/kernel/workqueue.c~2_RELAX 2007-05-05 23:17:48.000000000 +0400
+++ OLD/kernel/workqueue.c 2007-05-05 23:59:59.000000000 +0400
@@ -475,13 +475,16 @@ static void wait_on_work(struct work_str
* case it only guarantees that work->func() has completed on the last queued
* workqueue.
*
+ * cancel_work_sync(&delayed_work->work) should be used only if ->timer is not
+ * pending, otherwise it goes into a busy-wait loop until the timer expires.
+ *
* The caller must ensure that workqueue_struct on which this work was last
* queued can't be destroyed before this function returns.
*/
void cancel_work_sync(struct work_struct *work)
{
while (!try_to_grab_pending(work))
- ;
+ cpu_relax();
wait_on_work(work);
work_clear_pending(work);
}
@@ -498,7 +501,7 @@ void cancel_rearming_delayed_work(struct
{
while (!del_timer(&dwork->timer) &&
!try_to_grab_pending(&dwork->work))
- ;
+ cpu_relax();
wait_on_work(&dwork->work);
work_clear_pending(&dwork->work);
}
On Sun, May 06, 2007 at 01:32:13AM +0400, Oleg Nesterov wrote:
> on top of
> make-cancel_rearming_delayed_work-reliable-spelling.patch
>
> Add cpu-relax() into spinloops, and a comments update.
>
> Small note. The new implementation has another downside. Suppose that rearming
> work->func() gets a preemtion after setting WORK_STRUCT_PENDING but before
> add_timer/__queue_work. In that case cancel_rearming_delayed_work() will burn
> CPU in a busy-wait loop. Fortunately this can happen only with CONFIG_PREEMPT
> and we spin with preemption enabled.
>
> We can avoid this,
>
> void cancel_rearming_delayed_work(struct delayed_work *dwork)
> {
> int retry;
>
> do {
> retry = !del_timer(&dwork->timer) &&
> !try_to_grab_pending(&dwork->work);
> wait_on_work(&dwork->work);
> } while (retry);
>
> work_clear_pending(&dwork->work);
> }
>
> but I don't think this is worth fixing.
I think so.
There are a lot of new things in the final version of this
patch. I guess there was no such problem in the previous
version.
I can also see you have new doubts about usefulness, which
I cannot understand:
- even if there are some slowdowns, where does it matter?
- the "old" method uses only one method of cancelling, i.e.
del_timer, not trying to stop requeuing or to remove from
the queue; it seems to be effective only with long delayed
timers, and its real problems are probably mostly invisible.
BTW, I'm still not convinced all additions are needed:
the "old" cancel_rearming_ doesn't care about checking
or waiting on anything after del_timer positive.
Regards,
Jarek P.
PS: I'll try to check this all in the evening and will
write tomorrow, if found something interesting.
On 05/07, Jarek Poplawski wrote:
>
> There is a lot of new things in the final version of this
> patch. I guess, there was no such problem in the previous
> version.
No, this is basically the same patch + re-check-cwq-after-lock,
the latter is mostly needed to prevent racing with CPU-hotplug.
> I can also see you have new doubts about usefulness, which
> I cannot understand:
> - even if there are some slowdowns, where does it matter?
> - the "old" method uses only one method of cancelling, i.e.
> del_timer, not trying to stop requeuing or to remove from
> the queue; it seems to be effective only with long delayed
> timers, and its real problems are probably mostly invisible.
The slowdown is small, changelog mentions it just to be "fair".
I am not happy with the complication this patch adds, mostly
I hate this smp_wmb() in insert_work(). I have an idea how to
remove it later, but this needs another patch not related to
workqueue.c.
> BTW, I'm still not convinced all additions are needed:
> the "old" cancel_rearming_ doesn't care about checking
> or waiting on anything after del_timer positive.
It would be very strange to do wait_on_work() only in case
when del_timer() failed. This way we still need to do
cancel_work_sync() after cancel_rearming_delayed_work(),
but only when del_timer() failed, ugly. Note also that
wait_on_work() does not sleep if work->func() is not running.
Also, consider this callback:
void work_handler(struct work_struct *w)
{
struct delayed_work *dw = container_of(...);
queue_delayed_work(dw, delay);
// <------------- cancel_rearming_delayed_work()
cancel_delayed_work(dw);
queue_delayed_work(dw, another_delay);
}
Yes, this is strange and ugly. But correct! The current version
(before this patch) can't cancel this delayed_work. The new
implementation works correctly. So I think it is far better to
do wait_on_work() unconditionally.
> PS: I'll try to check this all in the evening and will
> write tomorrow, if found something interesting.
Yes, please!
Oleg.
On Mon, May 07, 2007 at 02:34:20PM +0400, Oleg Nesterov wrote:
> On 05/07, Jarek Poplawski wrote:
> >
> > There is a lot of new things in the final version of this
> > patch. I guess, there was no such problem in the previous
> > version.
>
> No, this is basically the same patch + re-check-cwq-after-lock,
> the latter is mostly needed to prevent racing with CPU-hotplug.
>
> > I can also see you have new doubts about usefulness, which
> > I cannot understand:
> > - even if there are some slowdowns, where does it matter?
> > - the "old" method uses only one method of cancelling, i.e.
> > del_timer, not trying to stop requeuing or to remove from
> > the queue; it seems to be effective only with long delayed
> > timers, and its real problems are probably mostly invisible.
>
> The slowdown is small, changelog mentions it just to be "fair".
>
> I am not happy with the complication this patch adds, mostly
> I hate this smb_wmb() in insert_work(). I have an idea how to
> remove it later, but this needs another patch not related to
> workqueue.c.
>
> > BTW, I'm still not convinced all additions are needed:
> > the "old" cancel_rearming_ doesn't care about checking
> > or waiting on anything after del_timer positive.
>
> It would be very strange to do wait_on_work() only in case
> when del_timer() failed. This way we still need to do
> cancel_work_sync() after cancel_rearming_delayed_work(),
> but only when del_timer() failed, ugly. Note also that
> wait_on_work() does not sleep if work->func() is not running.
>
> Also, consider this callback:
>
> void work_handler(struct work_struct *w)
> {
> struct delayed_work dw = container_of(...);
>
> queue_delayed_work(dw, delay);
>
> // <------------- cancel_rearming_delayed_work()
>
> cancel_delayed_work(dw);
> queue_delayed_work(dw, another_delay);
> }
>
> Yes, this is strange and ugly. But correct! The current version
I guess pseudo code below is not that strange, but real usecase:
probe()
{
INIT_DELAYED_WORK(...);
/* we're not issuing queue_delayed_work() in probe(), work will
* be started by interrupt */
return;
}
remove()
{
/* a hang will happen here if there was no queue_delayed_work()
* call (e.g. there were no interrupts, which start the rearming
* work) */
cancel_rearming_delayed_work();
}
Your patch will fix it, right?
> Oleg.
Good luck,
--
Anton Vorontsov
email: [email protected]
backup email: [email protected]
irc://irc.freenode.org/bd2
On 05/07, Anton Vorontsov wrote:
>
> I guess pseudo code below is not that strange, but real usecase:
>
> probe()
> {
> INIT_DELAYED_WORK(...);
> /* we're not issuing queue_delayed_work() in probe(), work will
> * be started by interrupt */
> return;
> }
>
> remove()
> {
> /* hang will happen here if there was no queue_delyed_work()
> * call (like there was no interrupts, which starts rearming
> * work */
> cancel_rearming_delayed_work();
> }
>
>
> Your patch will fix it, right?
Yes, the new implementation should work correctly.
However, this particular case was already fixed earlier,
workqueue-make-cancel_rearming_delayed_workqueue-work-on-idle-dwork.patch
http://marc.info/?l=linux-mm-commits&m=117081275901499
Note that INIT_DELAYED_WORK() sets work->data = 0, and
cancel_rearming_delayed_work() does the "was it ever queued?"
check.
Still, before this patch, cancel_rearming_delayed_work() hangs
if this work was used before, but is not active and re-arming now.
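For illustration (a made-up sequence, not taken from any driver), that is
a case like this:

	/* queued once in the past; work->func() ran and did NOT re-arm */
	schedule_delayed_work(&dwork, HZ);
	...
	/* old code: del_timer() keeps failing because no timer is pending
	 * and nothing re-queues the work, so the flush_workqueue() loop
	 * never terminates */
	cancel_rearming_delayed_work(&dwork);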
Oleg.
Hi,
Below are some of my suggestions:
On Fri, May 04, 2007 at 12:42:26AM +0400, Oleg Nesterov wrote:
...
> --- OLD/kernel/workqueue.c~1_CRDW 2007-05-02 23:29:07.000000000 +0400
> +++ OLD/kernel/workqueue.c 2007-05-03 22:42:29.000000000 +0400
> @@ -120,6 +120,11 @@ static void insert_work(struct cpu_workq
> struct work_struct *work, int tail)
> {
> set_wq_data(work, cwq);
> + /*
> + * Ensure that we get the right work->data if we see the
> + * result of list_add() below, see try_to_grab_pending().
> + */
> + smp_wmb();
> if (tail)
> list_add_tail(&work->entry, &cwq->worklist);
> else
> @@ -381,7 +386,46 @@ void fastcall flush_workqueue(struct wor
> }
> EXPORT_SYMBOL_GPL(flush_workqueue);
>
> -static void wait_on_work(struct cpu_workqueue_struct *cwq,
> +/*
> + * Upon a successfull return, the caller "owns" WORK_STRUCT_PENDING bit,
> + * so this work can't be re-armed in any way.
> + */
> +static int try_to_grab_pending(struct work_struct *work)
> +{
> + struct cpu_workqueue_struct *cwq;
> + int ret = 0;
> +
> + if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work)))
> + return 1;
The previous version used this check to run del_timer, and I think it
was a good idea. So, maybe, try something like this:
- run del_timer() once before the loop (in cancel_rearming_ only),
- add a parameter to try_to_grab_pending, e.g. "rearming",
- add here something like this:
	else if (rearming && del_timer(&work->timer))
		return 1;
> +
> + /*
> + * The queueing is in progress, or it is already queued. Try to
> + * steal it from ->worklist without clearing WORK_STRUCT_PENDING.
> + */
> +
> + cwq = get_wq_data(work);
> + if (!cwq)
> + return ret;
Probably you meant:
return 1;
BTW, probably there is some special reason it's in the loop now,
so maybe you should add a comment, why the old way (at the beginning
of a cancel_ function) is not enough.
I think adding the first check:
if (!list_empty(&work->entry)) {
without the lock is usable here.
> +
> + spin_lock_irq(&cwq->lock);
> + if (!list_empty(&work->entry)) {
> + /*
> + * This work is queued, but perhaps we locked the wrong cwq.
> + * In that case we must see the new value after rmb(), see
> + * insert_work()->wmb().
> + */
> + smp_rmb();
> + if (cwq == get_wq_data(work)) {
> + list_del_init(&work->entry);
> + ret = 1;
> + }
> + }
> + spin_unlock_irq(&cwq->lock);
> +
> + return ret;
> +}
> +
...
> +void cancel_work_sync(struct work_struct *work)
> +{
> + while (!try_to_grab_pending(work))
So this could be:
while (!try_to_grab_pending(work, 0))
> + ;
> + wait_on_work(work);
> + work_clear_pending(work);
> }
> EXPORT_SYMBOL_GPL(cancel_work_sync);
>
> +/**
> + * cancel_rearming_delayed_work - reliably kill off a delayed work.
> + * @dwork: the delayed work struct
> + *
> + * It is possible to use this function if dwork rearms itself via queue_work()
> + * or queue_delayed_work(). See also the comment for cancel_work_sync().
> + */
> +void cancel_rearming_delayed_work(struct delayed_work *dwork)
> +{
> + while (!del_timer(&dwork->timer) &&
> + !try_to_grab_pending(&dwork->work))
...and this like here:
if (!del_timer(&dwork->timer))
	while (!try_to_grab_pending(&dwork->work, 1))
> + ;
> + wait_on_work(&dwork->work);
> + work_clear_pending(&dwork->work);
> +}
> +EXPORT_SYMBOL(cancel_rearming_delayed_work);
I didn't have as much time as needed, so I'll try to look
at this yet.
Cheers,
Jarek P.
On Mon, May 07, 2007 at 02:34:20PM +0400, Oleg Nesterov wrote:
> On 05/07, Jarek Poplawski wrote:
...
> I am not happy with the complication this patch adds, mostly
> I hate this smb_wmb() in insert_work(). I have an idea how to
> remove it later, but this needs another patch not related to
> workqueue.c.
Probably I don't feel those barriers enough, but IMHO,
there is something wrong, if they are needed here, with
all this made per cpu.
>
> > BTW, I'm still not convinced all additions are needed:
> > the "old" cancel_rearming_ doesn't care about checking
> > or waiting on anything after del_timer positive.
>
> It would be very strange to do wait_on_work() only in case
> when del_timer() failed. This way we still need to do
> cancel_work_sync() after cancel_rearming_delayed_work(),
> but only when del_timer() failed, ugly. Note also that
> wait_on_work() does not sleep if work->func() is not running.
>
> Also, consider this callback:
>
> void work_handler(struct work_struct *w)
> {
> struct delayed_work dw = container_of(...);
>
> queue_delayed_work(dw, delay);
>
> // <------------- cancel_rearming_delayed_work()
>
> cancel_delayed_work(dw);
> queue_delayed_work(dw, another_delay);
> }
>
> Yes, this is strange and ugly. But correct! The current version
> (before this patch) can't cancel this delayed_work. The new
> implementation works correctly. So I think it is far better to
> do wait_on_work() unconditionally.
I think there are possible many curious, but correct things yet,
e.g. handler, which fires timer or another work, which rearms
this work... But maybe it would be reasonable to try to separate
new things which are really necessary, so IMHO, (practical)
elimination of endless or excessive looping. If there is needed
some more functionality, maybe there should be second version
added e.g. cancel_rearming_delayed_work_sync(). Without this
you risk, people would really think the old version was better,
if you knew what you were doing. And the worst thing possible:
they'll start to use their own, lighter solutions.
I think, there is no reason, such function should need more
than one loop of one cpu workqueue to cancel any "normal" work
(or maybe two - if locking is spared), and it's really hard
to understand, why anybody should wait all those "not mine",
so probably slow and buggy works in a queue, do their part of
needless locking/sleeping/looping and let "my" excellent
driver to be reloaded... And I think you are doing this now
with some reserves (kind of "with one hand") - because the
_PENDING flag is overloaded here and one flag still free.
I really cannot see how sparing one, two or even three flag
checks during a loop could be worth even one run_workqueue
loop more without them.
Yesterday I've written something probably against my intention:
so, if there is any CPU burning place, which could be fixed,
it's definitely worth fixing. I'm not sure what exactly place
did you mean - if spinlocking in wait_on_work - maybe it's
a sign this place isn't optimal too: it seems to me, this all
inserting/waiting for completion could be done under existing
locks e.g. in run_workqueue (maybe with some flag?).
Jarek P.
On 05/08, Jarek Poplawski wrote:
>
> On Mon, May 07, 2007 at 02:34:20PM +0400, Oleg Nesterov wrote:
> > On 05/07, Jarek Poplawski wrote:
> ...
> > I am not happy with the complication this patch adds, mostly
> > I hate this smb_wmb() in insert_work(). I have an idea how to
> > remove it later, but this needs another patch not related to
> > workqueue.c.
>
> Probably I don't feel those barriers enough, but IMHO,
> there is something wrong
As I said, I hate this barrier (and new complications) too.
We can make the things a bit better:
- introduce smp_mb__before_spinlock() // noop on x86 (a sketch follows below)
- then, after some minimal code changes, we have
set_wq_data();
smp_mb__before_spinlock();
spin_lock_irqsave(&cwq->lock);
list_add();
spin_unlock_irqrestore();
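A minimal sketch of such a helper - it does not exist in the tree at this
point, and the generic default below is only a guess; an arch like x86 could
define it as a no-op:

	#ifndef smp_mb__before_spinlock
	/* order prior stores against stores done inside the locked region */
	#define smp_mb__before_spinlock()	smp_wmb()
	#endif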
> if they are needed here, with all this made per cpu.
CPU-hotplug can change CPU.
> >
> > > BTW, I'm still not convinced all additions are needed:
> > > the "old" cancel_rearming_ doesn't care about checking
> > > or waiting on anything after del_timer positive.
> >
> > It would be very strange to do wait_on_work() only in case
> > when del_timer() failed. This way we still need to do
> > cancel_work_sync() after cancel_rearming_delayed_work(),
> > but only when del_timer() failed, ugly. Note also that
> > wait_on_work() does not sleep if work->func() is not running.
> >
> > Also, consider this callback:
> >
> > void work_handler(struct work_struct *w)
> > {
> > struct delayed_work dw = container_of(...);
> >
> > queue_delayed_work(dw, delay);
> >
> > // <------------- cancel_rearming_delayed_work()
> >
> > cancel_delayed_work(dw);
> > queue_delayed_work(dw, another_delay);
> > }
> >
> > Yes, this is strange and ugly. But correct! The current version
> > (before this patch) can't cancel this delayed_work. The new
> > implementation works correctly. So I think it is far better to
> > do wait_on_work() unconditionally.
>
> I think there are possible many curious, but correct things yet,
> e.g. handler, which fires timer or another work, which rearms
> this work... But maybe it would be resonable to try to separate
> new things which are really necessary, so IMHO, (practical)
> elimination of endless or excessive looping. If there is needed
> some more functionality, maybe there should be second version
> added e.g. cancel_rearming_delayed_work_sync(). Without this
> you risk, people would really think the old version was better,
> if you knew what you were doing. And the worst thing possible:
> they'll start to use their own, lighter solutions.
My goal is just to rename cancel_rearming_delayed_work() to
cancel_delayed_work_sync(), because now it doesn't require that @dwork
re-arms itself, and it doesn't require cancel_work_sync() afterwards.
cancel_rearming_delayed_work() will be obsolete, just like
cancel_rearming_delayed_workqueue().
I am strongly against adding many different variants of cancel_xxx().
With this patch the API is simple
- cancel_delayed_work() stops the timer
- cancel_rearming_delayed_work() stops the timer and work,
doesn't need cancel_work_sync/flush_workqueue
> I think, there is no reason, such function should need more
> than one loop of one cpu workqueue to cancel any "normal" work
> (or maybe two - if locking is spared), and it's really hard
> to understand, why anybody should wait all those "not mine",
> so probably slow and buggy works in a queue, do their part of
> needless locking/sleeping/looping and let "my" excellent
> driver to be reloaded... And I think you are doing this now
> with some reserves (kind of "with one hand") - because the
> _PENDING flag is overloaded here and one flag still free.
> I really cannot see how sparing one, two or even three flag
> checks during a loop could be worth even one run_workqueue
> loop more without them.
When work->func() re-arms itself, both queue_work() and queue_delayed_work()
may change CPU if CPU_DOWN_xxx happens. Note that this is possible
even if we use freezer for cpu-hotplug, because the timer migrates to
another CPU anyway.
> Yesterday I've written something probably against my intention:
> so, if there is any CPU burning place, which could be fixed,
> it's definitely worth fixing.
Perhaps yes, but see below.
> I'm not sure what exactly place
> did you mean - if spinlocking in wait_on_work - maybe it's
> a sign this place isn't optimal too: it seems to me, this all
> inserting/waiting for completion could be done under existing
> locks e.g. in run_workqueue (maybe with some flag?).
Sorry, can't understand this. Inserting uses existing lock, namely
cwq->lock.
Just in case, I think wait_on_cpu_work() can check ->current_work without
cwq->lock, but I am not sure yet. We can remove unneeded "new |= " from
set_wq_data(), we can do a couple of other optimizations. However, there
are already _a lot_ of workqueue changes in -mm tree. We can do this later.
Right now my only concern is correctness.
Oleg.
On 05/08, Jarek Poplawski wrote:
>
> On Fri, May 04, 2007 at 12:42:26AM +0400, Oleg Nesterov wrote:
> ...
> > +static int try_to_grab_pending(struct work_struct *work)
> > +{
> > + struct cpu_workqueue_struct *cwq;
> > + int ret = 0;
> > +
> > + if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work)))
> > + return 1;
>
> Previous version used this check to run del_timer, and I think, it
> was good idea. So, maybe, try something like this:
>
> - run once del_timer before the loop (in cancel_rearming_ only),
hmm, cancel_rearming_ does del_timer() before try_to_grab_pending().
> - add a parmeter to try_to_grab_pending, e.g. "rearming",
> - add here something like this:
>
> else if (rearming && del_timer(&work->timer)
> return 1;
I thought about adding such a parameter, and I don't like this. This is
a matter of taste, of course, but _imho_ this uglifies the code.
In any case, unless we do completely different patch, the sequence should be
del_timer() - a pending timer is the most common case
test_and_set_bit(WORK_STRUCT_PENDING) - the work is idle
try-to-steal-the-queued-work
This is what we are doing now.
> > +
> > + /*
> > + * The queueing is in progress, or it is already queued. Try to
> > + * steal it from ->worklist without clearing WORK_STRUCT_PENDING.
> > + */
> > +
> > + cwq = get_wq_data(work);
> > + if (!cwq)
> > + return ret;
>
> Probably you meant:
> return 1;
No, we should return 0. This can happen if the queueing of the freshly-
initialized @dwork is in progress.
NOTE: right now try_to_grab_pending() is called from cancel_xxx() only, so
this can't happen (it would be meaningless to do cancel_xxx if somebody else
can queue this work or start the timer), but I'd like try_to_grab_pending()
to be as generic as possible.
So, we should either return 0, or add BUG_ON(!cwq).
> BTW, probably there is some special reason it's in the loop now,
> so maybe you should add a comment, why the old way (at the beginning
> of a cancel_ function) is not enough.
see above. Note that wait_on_work() checks cwq == NULL as well. This is
because I do not want to tie try_to_grab_pending() and wait_on_work(),
they should be generic, and could be used independently.
> I think adding the first check:
>
> if (!list_empty(&work->entry)) {
>
> without the lock is usable here.
We should optimize the common case, and most probably this work is
already queued, so I don't think it is worth enlarging .text.
> > +void cancel_work_sync(struct work_struct *work)
> > +{
> > + while (!try_to_grab_pending(work))
>
> So this could be:
> while (!try_to_grab_pending(work, 0))
>
> > + ;
> > + wait_on_work(work);
> > + work_clear_pending(work);
> > }
> > ...
> > +void cancel_rearming_delayed_work(struct delayed_work *dwork)
> > +{
> > + while (!del_timer(&dwork->timer) &&
> > + !try_to_grab_pending(&dwork->work))
>
> ...and this like here:
>
> if (!del_timer(&dwork->timer)
> while (!try_to_grab_pending(&dwork->work, 1))
>
>
> > + ;
> > + wait_on_work(&dwork->work);
> > + work_clear_pending(&dwork->work);
> > +}
As I said, I personally don't like this, mostly because this complicates
the code a bit more.
> I didn't have as much time as needed, so I'll try to look
> at this yet.
Yes, please, and thank you very much for review!
Oleg.
On Tue, May 08, 2007 at 04:02:21PM +0400, Oleg Nesterov wrote:
> On 05/08, Jarek Poplawski wrote:
> >
> > On Mon, May 07, 2007 at 02:34:20PM +0400, Oleg Nesterov wrote:
> > > On 05/07, Jarek Poplawski wrote:
...
> I am strongly against adding many different variants of cancel_xxx().
> With this patch the API is simple
>
> - cancel_delayed_work() stops the timer
>
> - cancel_rearming_delayed_work() stops the timer and work,
> doesn't need cancel_work_sync/flush_workqueue
But most often there is a simple rearming (but not necessarily
always) work, for which the first is not enough and the second is
overkill (especially with the system/workqueues loaded).
> When work->func() re-arms itself, both queue_work() and queue_delayed_work()
> may change CPU if CPU_DOWN_xxx happens. Note that this is possible
> even if we use freezer for cpu-hotplug, because the timer migrates to
> another CPU anyway.
Thanks for the explanation - I'll try to think about this.
> > I'm not sure what exactly place
> > did you mean - if spinlocking in wait_on_work - maybe it's
> > a sign this place isn't optimal too: it seems to me, this all
> > inserting/waiting for completion could be done under existing
> > locks e.g. in run_workqueue (maybe with some flag?).
>
> Sorry, can't understand this. Inserting uses existing lock, namely
> cwq->lock.
I meant held locks - maybe e.g. after seeing some flag set (or some
other global/per_cpu/atomic/whatever variable) in a processed work,
insert_wq_barrier could be done in an already locked place. Or maybe
the whole idea of these completions should be rethought, for
something lighter (i.e. lockless)?
>
> Just in case, I think wait_on_cpu_work() can check ->current_work without
> cwq->lock, but I am not sure yet. We can remove unneeded "new |= " from
> set_wq_data(), we can do a couple of other optimizations. However, there
> are already _a lot_ of workqueue changes in -mm tree. We can do this later.
> Right now my only concern is correctness.
I agree 100% - it's -mm, the old way is working, so why hurry?
Jarek P.
On Tue, May 08, 2007 at 04:31:02PM +0400, Oleg Nesterov wrote:
> On 05/08, Jarek Poplawski wrote:
> >
> > On Fri, May 04, 2007 at 12:42:26AM +0400, Oleg Nesterov wrote:
> > ...
> > > +static int try_to_grab_pending(struct work_struct *work)
> > > +{
> > > + struct cpu_workqueue_struct *cwq;
> > > + int ret = 0;
> > > +
> > > + if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work)))
> > > + return 1;
> >
> > Previous version used this check to run del_timer, and I think, it
> > was good idea. So, maybe, try something like this:
> >
> > - run once del_timer before the loop (in cancel_rearming_ only),
>
> hmm, cancel_rearming_ does del_timer() before try_to_grab_pending().
>
> > - add a parmeter to try_to_grab_pending, e.g. "rearming",
> > - add here something like this:
> >
> > else if (rearming && del_timer(&work->timer)
> > return 1;
>
> I thought about adding such a parameter, and I don't like this. This is
> a matter of taste, of course, but _imho_ this uglifies the code.
>
> In any case, unless we do completely different patch, the sequence should be
>
> del_timer() - a pending timer is the most common case
>
> test_and_set_bit(WORK_STRUCT_PENDING) - the work is idle
>
> try-to-steal-the-queued-work
>
> This is what we are doing now.
I simply don't like to call del_timer(), where not needed, but maybe
it's not so expensive and we can afford it...
>
> > > +
> > > + /*
> > > + * The queueing is in progress, or it is already queued. Try to
> > > + * steal it from ->worklist without clearing WORK_STRUCT_PENDING.
> > > + */
> > > +
> > > + cwq = get_wq_data(work);
> > > + if (!cwq)
> > > + return ret;
> >
> > Probably you meant:
> > return 1;
>
> No, we should return 0. This can happen if the queueing of the freshly-
> initialized @dwork is in progress.
>
> NOTE: right now try_to_grab_pending() is called from cancel_xxx() only, so
> this can't happen (it would be meaningless to do cancel_xxx if somebody else
> can queue this work or start the timer), but I'd like try_to_grab_pending()
> to be as generic as possible.
>
> So, we should either return 0, or add BUG_ON(!cwq).
...And you prefer endless loop. Seems brave!
...
> Yes, please, and thank you very much for review!
You welcome & my pleasure,
Bye,
Jarek P.
On 05/08, Jarek Poplawski wrote:
>
> On Tue, May 08, 2007 at 04:31:02PM +0400, Oleg Nesterov wrote:
> >
> > I thought about adding such a parameter, and I don't like this. This is
> > a matter of taste, of course, but _imho_ this uglifies the code.
> >
> > In any case, unless we do completely different patch, the sequence should be
> >
> > del_timer() - a pending timer is the most common case
> >
> > test_and_set_bit(WORK_STRUCT_PENDING) - the work is idle
> >
> > try-to-steal-the-queued-work
> >
> > This is what we are doing now.
>
> I simply don't like to call del_timer(), where not needed, but maybe
> it's not so expensive and we can afford it...
But this is the most common case, that was my point.
Look, we can add
if (!get_wq_data(work))
/* it was never queued */
return;
at the very start of cancel_xxx() functions. This will optimize out
del_timer + try_to_grab_pending() _sometimes_. Should we do this?
I don't think so. I think it is better to save a couple of bytes
from i-cache, but not to try to optimize the unlikely case.
> >
> > > > +
> > > > + /*
> > > > + * The queueing is in progress, or it is already queued. Try to
> > > > + * steal it from ->worklist without clearing WORK_STRUCT_PENDING.
> > > > + */
> > > > +
> > > > + cwq = get_wq_data(work);
> > > > + if (!cwq)
> > > > + return ret;
> > >
> > > Probably you meant:
> > > return 1;
> >
> > No, we should return 0. This can happen if the queueing of the freshly-
> > initialized @dwork is in progress.
> >
> > NOTE: right now try_to_grab_pending() is called from cancel_xxx() only, so
> > this can't happen (it would be meaningless to do cancel_xxx if somebody else
> > can queue this work or start the timer), but I'd like try_to_grab_pending()
> > to be as generic as possible.
> >
> > So, we should either return 0, or add BUG_ON(!cwq).
>
> ...And you prefer endless loop. Seems brave!
No, no, the loop won't be endless. Why do you think so? We spin until the timer
is started, or until the work is queued.
Oleg.
On Tue, May 08, 2007 at 03:56:48PM +0200, Jarek Poplawski wrote:
> > So, we should either return 0, or add BUG_ON(!cwq).
>
> ...And you prefer endless loop. Seems brave!
Sorry! (Maybe you're not so brave?) Seems clear _PENDING should
save us here - I hope.
Bye, bye,
Jarek P.
On Tue, May 08, 2007 at 06:05:17PM +0400, Oleg Nesterov wrote:
> On 05/08, Jarek Poplawski wrote:
> >
> > On Tue, May 08, 2007 at 04:31:02PM +0400, Oleg Nesterov wrote:
> > >
> > > I thought about adding such a parameter, and I don't like this. This is
> > > a matter of taste, of course, but _imho_ this uglifies the code.
> > >
> > > In any case, unless we do completely different patch, the sequence should be
> > >
> > > del_timer() - a pending timer is the most common case
> > >
> > > test_and_set_bit(WORK_STRUCT_PENDING) - the work is idle
> > >
> > > try-to-steal-the-queued-work
> > >
> > > This is what we are doing now.
> >
> > I simply don't like to call del_timer(), where not needed, but maybe
> > it's not so expensive and we can afford it...
>
> But this is the most common case, that was my point.
And I agree it should start...
>
> Look, we can add
>
> if (!get_wq_data(work))
> /* it was never queued */
> return;
>
> at the very start of cancel_xxx() functions. This will optimize out
> del_timer + try_to_grab_pending() _sometimes_. Should we do this?
> I don't think so. I think it is better to save a couple of bytes
> from i-cache, but not to try to optimize the unlikely case.
But most of our test is already done in test_and_set_bit.
I think it's more about taste, so let's forget...
> > > So, we should either return 0, or add BUG_ON(!cwq).
> >
> > ...And you prefer endless loop. Seems brave!
>
> No, no, the loop won't be endless. Why do you think so? We spin until the timer
> is started, or until the work is queued.
Probably you are right - I was afraid of some inactive
work with no timer to come.
So, I think this all looks fine, and maybe one more needless ack
won't do any harm here:
Acked-by: Jarek Poplawski <[email protected]>
Hello, Oleg.
Oleg Nesterov wrote:
> + /*
> + * Ensure that we get the right work->data if we see the
> + * result of list_add() below, see try_to_grab_pending().
> + */
> + smp_wmb();
I don't think we need this
> + if (!list_empty(&work->entry)) {
> + /*
> + * This work is queued, but perhaps we locked the wrong cwq.
> + * In that case we must see the new value after rmb(), see
> + * insert_work()->wmb().
> + */
> + smp_rmb();
> + if (cwq == get_wq_data(work)) {
> + list_del_init(&work->entry);
> + ret = 1;
> + }
and this. After grabbing cwq lock, compare it to get_wq_data() first,
if they are the same, both are using the same lock so there's no
reason for the barriers. If they are different, it doesn't matter
anyway. We need to retry the locking.
retry:
cwq = get_wq_data(work);
if (!cwq)
return ret;
spin_lock_irq(&cwq->lock);
if (unlikely(cwq != get_wq_data(work))) {
/* oops wrong cwq */
spin_unlock_irq(&cwq->lock);
goto retry; /* or just return 0; */
}
if (!list_empty(&work->entry)) {
list_del_init(&work->entry);
ret = 1;
}
spin_unlock_irq(&cwq->lock);
Although grabbing cwq->lock doesn't mean anything to other cwqs, it
does guarantee work->wq_data can't be changed to or from it, so the
cwq equality test is guaranteed to work.
> + * It is possible to use this function if the work re-queues itself. It can
> + * cancel the work even if it migrates to another workqueue, however in that
> + * case it only garantees that work->func() has completed on the last queued
> + * workqueue.
We first prevent requeueing from happening; then, wait for each cwq to
finish the work, so I think we're guaranteed that they're finished on
all cpus. Otherwise, the 'sync' part isn't too useful as it means all
rearming tasks might be running on completion of cancel_work_sync().
Thanks a lot for fixing this.
--
tejun
On 05/11, Tejun Heo wrote:
>
> Oleg Nesterov wrote:
> > + /*
> > + * Ensure that we get the right work->data if we see the
> > + * result of list_add() below, see try_to_grab_pending().
> > + */
> > + smp_wmb();
>
> I don't think we need this
>
> > + if (!list_empty(&work->entry)) {
> > + /*
> > + * This work is queued, but perhaps we locked the wrong cwq.
> > + * In that case we must see the new value after rmb(), see
> > + * insert_work()->wmb().
> > + */
> > + smp_rmb();
> > + if (cwq == get_wq_data(work)) {
> > + list_del_init(&work->entry);
> > + ret = 1;
> > + }
>
> and this. After grabbing cwq lock, compare it to get_wq_data() first,
> if they are the same, both are using the same lock so there's no
> reason for the barriers. If they are different, it doesn't matter
> anyway. We need to retry the locking.
I think this is not right. The problem is that work->data (its
WORK_STRUCT_WQ_DATA_MASK part, cwq) could be changed _without_ holding
the old cwq->lock.
Suppose that CPU_0 does queue_delayed_work(dwork). We start the timer,
work->data points to cwq_0. CPU_DEAD comes, the timer migrates to
CPU_1, but work->data was not changed yet.
> retry:
> cwq = get_sw_data(work);
> if (!cwq)
> return ret;
>
> spin_lock_irq(&cwq->lock);
> if (unlikely(cwq != get_wq_data(work))) {
> /* oops wrong cwq */
> spin_unlock_irq(&cwq->lock);
> goto retry; /* or just return 0; */
> }
dwork->timer fires, delayed_work_timer_fn() changes work->data to cwq_1
and queues this work on cwq_1.
> if (!list_empty(&work->entry)) {
> list_del_init(&work->entry);
Oops! we are holding cwq_0->lock, but modify cwq_1->worklist.
Actually, we have the same problem with a plain queue_work() if _cpu_down()
comes at "right" time.
However, I agree, this smp_wmb() in insert_work() should die. We can
introduce "smp_mb__before_spinlock()" (no-op on x86 at least) to kill it.
> > + * It is possible to use this function if the work re-queues itself. It can
> > + * cancel the work even if it migrates to another workqueue, however in that
> > + * case it only garantees that work->func() has completed on the last queued
> > + * workqueue.
>
> We first prevent requeueing from happening; then, wait for each cwq to
> finish the work, so I think we're guaranteed that they're finished on
> all cpus. Otherwise, the 'sync' part isn't too useful as it means all
> rearming tasks might be running on completion of cancel_work_sync().
Yes, sure, you are right. What I meant was:
struct workqueue_struct *WQ1, *WQ2;
void work_func(struct work_struct *self)
{
// migrate on another workqueue
queue_work(WQ2, self);
do_something();
}
queue_work(WQ1, work);
Now, cancel_work_sync() can't guarantee that this work has finished
its execution on WQ1.
Thanks for looking at this!
Oleg.
Oleg Nesterov wrote:
>> and this. After grabbing cwq lock, compare it to get_wq_data() first,
>> if they are the same, both are using the same lock so there's no
>> reason for the barriers. If they are different, it doesn't matter
>> anyway. We need to retry the locking.
>
> I think this is not right. The problem is that work->data (its
> WORK_STRUCT_WQ_DATA_MASK part, cwq) could be changed _without_ holding
> the old cwq->lock.
>
> Suppose that CPU_0 does queue_delayed_work(dwork). We start the timer,
> work->data points to cwq_0. CPU_DEAD comes, the timer migrates to
> CPU_1, but work->data was not changed yet.
>
>> retry:
>> cwq = get_sw_data(work);
>> if (!cwq)
>> return ret;
>>
>> spin_lock_irq(&cwq->lock);
>> if (unlikely(cwq != get_wq_data(work))) {
>> /* oops wrong cwq */
>> spin_unlock_irq(&cwq->lock);
>> goto retry; /* or just return 0; */
>> }
>
> dwork->timer fires, delayed_work_timer_fn() changes work->data to cwq_1
> and queues this work on cwq_1.
>
>> if (!list_empty(&work->entry)) {
>> list_del_init(&work->entry);
>
> Oops! we are holding cwq_0->lock, but modify cwq_1->worklist.
>
> Actually, we have the same problem with a plain queue_work() if _cpu_down()
> comes at "right" time.
>
> However, I agree, this smp_wmb() in insert_work() should die. We can
> introduce "smp_mb__before_spinlock()" (no-op on x86 at least) to kill it.
Yeah, right, we allow cwq pointer to change without holding the lock.
Although I still think that is where we should fix the problem. Taking
down CPU is a cold cold path. We can afford a lot of overhead there.
IMHO, if we can do that, it would be far better than memory barrier
dance which tends to be difficult to understand and thus prove/maintain
correctness. I'll think about it more.
>>> + * It is possible to use this function if the work re-queues itself. It can
>>> + * cancel the work even if it migrates to another workqueue, however in that
>>> + * case it only garantees that work->func() has completed on the last queued
>>> + * workqueue.
>> We first prevent requeueing from happening; then, wait for each cwq to
>> finish the work, so I think we're guaranteed that they're finished on
>> all cpus. Otherwise, the 'sync' part isn't too useful as it means all
>> rearming tasks might be running on completion of cancel_work_sync().
>
> Yes, sure, you are right. What I meant was:
>
> struct workqueue_struct *WQ1, *WQ1;
>
> void work_func(struct work_struct *self)
> {
> // migrate on another workqueue
> queue_work(WQ2, self);
>
> do_something();
> }
>
> queue_work(WQ1, work);
>
> Now, cancel_work_sync() can't guarantee that this work has finished
> its execution on WQ1.
Right again, I misunderstood that the migration was between cwqs.
Things definitely won't work if it's between different wqs.
Thanks.
--
tejun
On 05/11, Tejun Heo wrote:
>
> Oleg Nesterov wrote:
> >
> > However, I agree, this smp_wmb() in insert_work() should die. We can
> > introduce "smp_mb__before_spinlock()" (no-op on x86 at least) to kill it.
>
> Yeah, right, we allow cwq pointer to change without holding the lock.
> Although I still think that is where we should fix the problem. Taking
> down CPU is a cold cold path. We can afford a lot of overhead there.
> IMHO, if we can do that, it would be far better than memory barrier
> dance which tends to be difficult to understand and thus prove/maintain
> correctness. I'll think about it more.
Yes I hate this barrier too. That is why changelog explicitly mentions it.
With some trivial code modifications we can move set_wq_data() from insert_work()
to __queue_work(), then
void set_wq_data(work, cwq)
{
struct cpu_workqueue_struct *old = get_wq_data(work);
if (likely(cwq == old))
return;
if (old)
spin_lock(old->lock);
atomic_long_set(&work->data, ...);
if (old)
spin_unlock(old->lock);
}
I can't say I like this very much, though. I'd prefer to use smp_mb__before_spinlock().
Probably we can do something else.
But first I'd like to kill cwq_should_stop(). (Gautham, Srivatsa, you were
right, but I was blind, deaf, and stupid).
Oleg.
Oleg Nesterov wrote:
> On 05/11, Tejun Heo wrote:
>> Oleg Nesterov wrote:
>>> However, I agree, this smp_wmb() in insert_work() should die. We can
>>> introduce "smp_mb__before_spinlock()" (no-op on x86 at least) to kill it.
>> Yeah, right, we allow cwq pointer to change without holding the lock.
>> Although I still think that is where we should fix the problem. Taking
>> down CPU is a cold cold path. We can afford a lot of overhead there.
>> IMHO, if we can do that, it would be far better than memory barrier
>> dance which tends to be difficult to understand and thus prove/maintain
>> correctness. I'll think about it more.
>
> Yes I hate this barrier too. That is why changelog explicitly mentions it.
>
> With some trivial code modifications we can move set_wq_data() from insert_work()
> to __queue_work(), then
>
> void set_wq_data(work, cwq)
> {
> struct cpu_workqueue_struct *old = get_wq_data(work);
>
> if (likely(cwq == old))
> return;
>
> if (old)
> spin_lock(old->lock);
>
> atomic_long_set(&work->data, ...);
>
> if (old)
> spin_lock(old->lock);
> }
>
> I can't say I like this very much, though. I'd prefer use smp_mb__before_spinlock().
> Probably we can do something else.
Eeek... I don't like the above either. I've been thinking about the
barriers a bit more and what makes this different from a simple locked
enter/leave model. As our pointer can change without locking,
work->entry, which is always manipulated locked, is used as a means to
validate the pointer, and we need a barrier there because the updates to
work->entry and work->wq_data aren't atomic - a new validity test result
can be read together with the old pointer. Clever && cryptic, I have to
say. :-)
Fortunately, we have one bit left in the flags and can use it to mark
pointer validity instead of list_empty() test. flags and wq_data live
in the same atomic_long and thus can be updated together atomically.
* insert_work() sets VALID bit and the cwq pointer using one
atomic_long_set().
* queue_delayed_work_on() sets the cwq pointer but not the VALID bit.
* run_workqueue() clears the cwq pointer and VALID bit while holding
lock before executing the work.
* try_to_grab_pending() checks VALID && pointers equal after grabbing
cwq->lock.
What do you think? Is there any hole?
Thanks.
--
tejun
On 05/12, Tejun Heo wrote:
>
> Oleg Nesterov wrote:
> >
> > Yes I hate this barrier too. That is why changelog explicitly mentions it.
> > Probably we can do something else.
>
> Fortunately, we have one bit left in the flags and can use it to mark
> pointer validity instead of list_empty() test. flags and wq_data live
> in the same atomic_long and thus can be updated together atomically.
Heh. I thought about another bit-in-pointer too. I can't explain this,
but I personally hate these bits even more than barriers.
I must admit, your suggestion is much more clean compared to what I
had in mind.
> * insert_work() sets VALID bit and the cwq pointer using one
> atomic_long_set().
OK, but not very suitable, we need to copy-and-paste set_wq_data(),
because it is also used by queue_delayed_work(). Not a big problem.
> * queue_delayed_work_on() sets the cwq pointer but not the VALID bit.
... and queue_delayed_work(), of course.
> * run_workqueue() clears the cwq pointer and VALID bit while holding
> lock before executing the work.
We should not clear cwq. I guess you meant "does list_del() and clears
VALID bit".
> * try_to_grab_pending() checks VALID && pointers equal after grabbing
> cwq->lock.
We don't even need to check the pointers. VALID bit is always changed
under cwq->lock. So, if we see VALID under cwq->lock, we have a right
pointer.
> What do you think? Is there any hole?
I'd be happy to say I found many nasty unfixable holes in your idea,
but I can't see any :) Something like the patch below, I guess.
Ugh. OK, the barrier is evil. As I said, we can convert smp_wmb() to
smp_mb__before_spinlock() (which we don't currently have, but which we
imho need regardless of workqueue.c), but anyway it is not easy to
understand the code with barriers.
OTOH, the new VALID bit is _only_ needed to implement the special cancel_xxx
functions. And unlike wmb/smp_mb__before_spinlock it is not hidden in
insert_work(), but spreads over the queue/unqueue path. And. The
bit-in-pointer is always a hack, imho. And. It is a bit difficult to
understand why we need a VALID bit when we have !list_empty(), so
_perhaps_ a barrier is not much worse.
I am looking at the patch below, and I can't understand whether this is
a good change or not. I am biased, of course. Tejun, if you say "I strongly
believe this is better", I'll send the patch.
Anybody else has an opinion?
Oleg.
--- t/kernel/workqueue.c~ 2007-05-13 22:32:45.000000000 +0400
+++ t/kernel/workqueue.c 2007-05-13 22:37:42.000000000 +0400
@@ -120,11 +120,7 @@ static void insert_work(struct cpu_workq
struct work_struct *work, int tail)
{
set_wq_data(work, cwq);
- /*
- * Ensure that we get the right work->data if we see the
- * result of list_add() below, see try_to_grab_pending().
- */
- smp_wmb();
+ __set_bit(WORK_STRUCT_QUEUED, work_data_bits(work));
if (tail)
list_add_tail(&work->entry, &cwq->worklist);
else
@@ -231,6 +227,12 @@ int queue_delayed_work_on(int cpu, struc
}
EXPORT_SYMBOL_GPL(queue_delayed_work_on);
+static inline void dequeue_work(struct work_struct *work)
+{
+ __clear_bit(WORK_STRUCT_QUEUED, work_data_bits(work));
+ list_del_init(&work->entry);
+}
+
static void run_workqueue(struct cpu_workqueue_struct *cwq)
{
spin_lock_irq(&cwq->lock);
@@ -246,8 +248,8 @@ static void run_workqueue(struct cpu_wor
struct work_struct, entry);
work_func_t f = work->func;
+ dequeue_work(work);
cwq->current_work = work;
- list_del_init(cwq->worklist.next);
spin_unlock_irq(&cwq->lock);
BUG_ON(get_wq_data(work) != cwq);
@@ -410,17 +412,9 @@ static int try_to_grab_pending(struct wo
return ret;
spin_lock_irq(&cwq->lock);
- if (!list_empty(&work->entry)) {
- /*
- * This work is queued, but perhaps we locked the wrong cwq.
- * In that case we must see the new value after rmb(), see
- * insert_work()->wmb().
- */
- smp_rmb();
- if (cwq == get_wq_data(work)) {
- list_del_init(&work->entry);
- ret = 1;
- }
+ if (test_bit(WORK_STRUCT_QUEUED, work_data_bits(work))) {
+ dequeue_work(work);
+ ret = 1;
}
spin_unlock_irq(&cwq->lock);
Hello,
Oleg Nesterov wrote:
> On 05/12, Tejun Heo wrote:
>> Oleg Nesterov wrote:
>>> Yes I hate this barrier too. That is why changelog explicitly mentions it.
>>> Probably we can do something else.
>> Fortunately, we have one bit left in the flags and can use it to mark
>> pointer validity instead of list_empty() test. flags and wq_data live
>> in the same atomic_long and thus can be updated together atomically.
>
> Heh. I thought about another bit-in-pointer too. I can't explain this,
> but I personally hate these bits even more than barriers.
I'm the other way around but it's like saying "I like donkey poo better
than horse poo". Let's accept that we have different tastes in poos.
> I must admit, your suggestion is much more clean compared to what I
> had in mind.
Gee... what did you have in mind? Twisted one. :-P
>> * insert_work() sets VALID bit and the cwq pointer using one
>> atomic_long_set().
>
> OK, but not very suitable, we need to copy-and-paste set_wq_data(),
> because it is also used by queue_delayed_work(). Not a big problem.
Yeap, we'll need to either adjust set/get_wq_data() interface a bit or
open code it. More on this later.
>> * queue_delayed_work_on() sets the cwq pointer but not the VALID bit.
>
> ... and queue_delayed_work(), of course.
Yeap.
>> * run_workqueue() clears the cwq pointer and VALID bit while holding
>> lock before executing the work.
>
> We should not clear cwq. I guess you meant "does list_del() and clears
> VALID bit".
Yeap, I've used the term 'clearing' as more of a logical term - making
the pointer invalid, but is there any reason why we can't clear the
pointer itself?
>> * try_to_grab_pending() checks VALID && pointers equal after grabbing
>> cwq->lock.
>
> We don't even need to check the pointers. VALID bit is always changed
> under cwq->lock. So, if we see VALID under cwq->lock, we have a right
> pointer.
But there are multiple cwq->lock's. VALID can be set with one of other
cwq->lock's locked.
>> What do you think? Is there any hole?
>
> I'd be happy to say I found many nasty unfixable holes in your idea,
> but I can't see any :) Something like the patch below, I guess.
>
> Ugh. OK, the barrier is evil. As I said, we can convert smp_wmb() to
> smp_mb__before_spinlock() (which we don't currently have, but which we
> imho need regardless of workqueue.c), but anyway it is not easy to
> understand the code with barriers.
I didn't really get the smp_mb__before_spinlock() thing. How are you
planning to use it? spinlock() is already a read barrier. What do you
gain by making it also a write barrier?
> OTOH, the new VALID bit is _only_ needed to implement a special cancel_xxx
> functions. And unlike wmb/smp_mb__before_spinlock it is not hidden in
> insert_work(), but spreads over the queue/unqueue path. And. the
> bit-in-pointer is always a hack, imho. And. It is a bit difficult to
> understand why do we need a VALID bit when we have !list_empty(), so
> _perhaps_ a barrier is not much worse.
>
> I am looking at the patch below, and I can't understand whether this is
> a good change or not. I am biased, of course. Tejun, if you say "I strongly
> believe this is better", I'll send the patch.
We're deep into per-cpu synchronization neverland and one can only trust
one's own Alice. I can't say my Alice is better than yours but I'll try
to present why I like mine.
I like the VALID bit thing because we can make it look and act similar
to the relatively regular per-cpu locked enter/leave model people can
understand without too much difficulty. If we change get/set_wq_data() such
that they contain the VALID bit thing inside them, the code can be
explained as a simple (really?) locked enter/leave model. That's why I
mentioned clearing the cwq pointer while locked. The closer we stay to
the known model, the easier to understand and explain.
Maybe the bit shouldn't be called VALID but IGNORE, CACHE_ONLY or
somesuch. The only reason why we can't do real enter/leave model is the
cwq caching for delayed works, so it might make more sense to mark the
pointer as invalid while being used for caching rather than the other
way around.
Thanks for working on this neverland thing. :-)
--
tejun
Hi Tejun,
Sorry, I can't give a "complete" reply today (will do tomorrow),
just one note right now...
On 05/13, Tejun Heo wrote:
>
> Oleg Nesterov wrote:
> >> * try_to_grab_pending() checks VALID && pointers equal after grabbing
> >> cwq->lock.
> >
> > We don't even need to check the pointers. VALID bit is always changed
> > under cwq->lock. So, if we see VALID under cwq->lock, we have a right
> > pointer.
>
> But there are multiple cwq->lock's. VALID can be set with one of other
> cwq->lock's locked.
Oops. Thanks for correcting me. _This_ was a reason for the stupid barrier!
Oleg.
On 05/13, Tejun Heo wrote:
>
> Oleg Nesterov wrote:
> > Heh. I thought about another bit-in-pointer too. I can't explain this,
> > but I personally hate these bits even more than barriers.
>
> I'm the other way around but it's like saying "I like donkey poo better
> than horse poo". Let's accept that we have different tastes in poos.
Yes sure :)
> >> * run_workqueue() clears the cwq pointer and VALID bit while holding
> >> lock before executing the work.
> >
> > We should not clear cwq. I guess you meant "does list_del() and clears
> > VALID bit".
>
> Yeap, I've used the term 'clearing' as more of a logical term - making
> the pointer invalid, but is there any reason why we can't clear the
> pointer itself?
Because this breaks cancel_work_sync(work), it needs get_wq_data(work) for
wait_on_work(work).
> >> * try_to_grab_pending() checks VALID && pointers equal after grabbing
> >> cwq->lock.
> >
> > We don't even need to check the pointers. VALID bit is always changed
> > under cwq->lock. So, if we see VALID under cwq->lock, we have a right
> > pointer.
>
> But there are multiple cwq->lock's. VALID can be set with one of other
> cwq->lock's locked.
Yes, you are right, thanks.
So, try_to_grab_pending() should check "VALID && pointers equal" atomically.
We can't do "if (VALID && cwq == get_wq_data(work))". We should do something
like this
(((long)cwq) | VALID | PENDING) == atomic_long_read(&work->data)
Yes? I need to think more about this.
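In code, the grab would then look roughly like this (only a sketch; the
VALID_MASK/PENDING_MASK names are invented here):

	spin_lock_irq(&cwq->lock);
	/*
	 * cwq pointer, VALID and PENDING all live in work->data, so one
	 * atomic_long_read() checks "right cwq && VALID && PENDING"
	 * together; if anything changed, leave the work alone.
	 */
	if (atomic_long_read(&work->data) ==
			((unsigned long)cwq | VALID_MASK | PENDING_MASK)) {
		list_del_init(&work->entry);
		ret = 1;
	}
	spin_unlock_irq(&cwq->lock);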
> I didn't really get the smp_mb__before_spinlock() thing. How are you
> planning to use it? spinlock() is already a read barrier. What do you
> gain by making it also a write barrier?
As I said, we can shift set_wq_data() from insert_work() to __queue_work(),
static void insert_work(struct cpu_workqueue_struct *cwq,
struct work_struct *work, int tail)
{
if (tail)
list_add_tail(&work->entry, &cwq->worklist);
else
list_add(&work->entry, &cwq->worklist);
wake_up(&cwq->more_work);
}
static void __queue_work(struct cpu_workqueue_struct *cwq,
struct work_struct *work)
{
unsigned long flags;
set_wq_data(work, cwq);
smp_mb__before_spinlock();
spin_lock_irqsave(&cwq->lock, flags);
insert_work(cwq, work, 1);
spin_unlock_irqrestore(&cwq->lock, flags);
}
this needs very minor changes.
BTW, in _theory_, spinlock() is not a read barrier, yes?
From Documentation/memory-barriers.txt
Memory operations that occur before a LOCK operation may appear to happen
after it completes.
Oleg.
Oleg Nesterov wrote:
>> Yeap, I've used the term 'clearing' as more of a logical term - making
>> the pointer invalid, but is there any reason why we can't clear the
>> pointer itself?
>
> Because this breaks cancel_work_sync(work), it needs get_wq_data(work) for
> wait_on_work(work).
Right. That's the other user of cached work->wq_data pointer.
> So, try_to_grab_pending() should check "VALID && pointers equal" atomically.
> We can't do "if (VALID && cwq == get_wq_data(work))". We should do something
> like this
>
> (((long)cwq) | VALID | PENDING) == atomic_long_read(&work->data)
>
> Yes? I need to think more about this.
I don't think the test for PENDING should be atomic too. cwq pointer
and VALID is one package. PENDING lives its own life as an atomic bit
switch.
>> I didn't really get the smp_mb__before_spinlock() thing. How are you
>> planning to use it? spinlock() is already a read barrier. What do you
>> gain by making it also a write barrier?
>
> As I said, we can shift set_wq_data() from insert_work() to __queue_work(),
OIC.
> this needs very minor changes.
>
> BTW, in _theory_, spinlock() is not a read barrier, yes?
It actually is.
> From Documentation/memory-barriers.txt
>
> Memory operations that occur before a LOCK operation may appear to happen
> after it completes.
Which means that spin_lock() isn't a write barrier. lock is read
barrier, unlock is write barrier. Otherwise, locking doesn't make much
sense. If we're going the barrier way, I think we're better off with
explicit smp_wmb(). It's only barrier() on x86/64.
Thanks.
--
tejun
On Tue, May 15, 2007 at 10:26:41AM +0200, Tejun Heo wrote:
> Oleg Nesterov wrote:
...
> > So, try_to_grab_pending() should check "VALID && pointers equal" atomically.
> > We can't do "if (VALID && cwq == get_wq_data(work))". We should do something
> > like this
> >
> > (((long)cwq) | VALID | PENDING) == atomic_long_read(&work->data)
> >
> > Yes? I need to think more about this.
>
> I don't think the test for PENDING should be atomic too. cwq pointer
> and VALID is one package. PENDING lives its own life as an atomic bit
> switch.
Hi,
I've overheard somebody talking about my favorite 2-nd bit!
Probably I miss many points (your discussion isn't the clearest),
but I wonder if this bit couldn't be used otherwise: try_to_grab_pending()
sets the bit - others know a cancel is pending, so don't disturb:
e.g. insert_work() doesn't queue (at least after the work's cpu change,
which seems to be the main problem here). Probably there is
no reason to test this bit in all places - only in the most
problematic ones; so, in insert_work() maybe only after checking
that the cpu was changed. If this way is possible, we could avoid
setting the VALID bit when not needed (no cancel pending). Maybe
we could also think about some form of cooperation - e.g. clearing
this or another bit to ack that the work was caught - of course
this last thing could be too much, so not necessarily now.
Regards,
Jarek P.
On 05/15, Tejun Heo wrote:
>
> Oleg Nesterov wrote:
>
> > So, try_to_grab_pending() should check "VALID && pointers equal" atomically.
> > We can't do "if (VALID && cwq == get_wq_data(work))". We should do something
> > like this
> >
> > (((long)cwq) | VALID | PENDING) == atomic_long_read(&work->data)
> >
> > Yes? I need to think more about this.
>
> I don't think the test for PENDING should be atomic too. cwq pointer
> and VALID is one package. PENDING lives its own life as an atomic bit
> switch.
Yes sure, it should not be atomic. But (VALID && !PENDING) == BUG, so we
can't just "kill" PENDING form the check above.
> > BTW, in _theory_, spinlock() is not a read barrier, yes?
>
> It actually is.
>
> > From Documentation/memory-barriers.txt
> >
> > Memory operations that occur before a LOCK operation may appear to happen
> > after it completes.
>
> Which means that spin_lock() isn't a write barrier.
yes, it is not very clear which "Memory operations" memory-barriers.txt
describes.
> lock is read
> barrier, unlock is write barrier.
(To avoid a possible confusion: I am not arguing, I am trying to understand,
and please also note "in _theory_" above)
Is it so? Shouldn't we document this if it is true?
> Otherwise, locking doesn't make much
> sense.
Why? Could you please give a code example we have which relies on this?
> If we're going the barrier way, I think we're better off with
> explicit smp_wmb(). It's only barrier() on x86/64.
Yes. But note that we don't have any reason to do set_wq_data() under
cwq->lock (this is also true for wake_up(->more_work) btw), so it makes
sense to do this change anyway. And "wmb + spin_lock" looks a bit strange,
I _suspect_ spin_lock() means a full barrier on most platforms.
Could you also look at
http://marc.info/?t=116275561700001&r=1
and, in particular,
http://marc.info/?l=linux-kernel&m=116281136122456
Thanks!
Oleg.
On 05/15, Jarek Poplawski wrote:
>
> I've overheard somebody talking about my favorite 2-nd bit!
> Probably I miss many points (your discussion isn't the clearest),
> but I wonder if this bit couldn't be used otherwise: try_to_grab_pending()
> sets the bit - others know a cancel is pending, so don't disturb:
> e.g. insert_work() doesn't queue (at least after the work's cpu change,
> which seems to be the main problem here). Probably there is
> no reason to test this bit in all places - only in the most
> problematic ones; so, in insert_work() maybe only after checking
> that the cpu was changed. If this way is possible, we could avoid
> setting the VALID bit when not needed (no cancel pending). Maybe
> we could also think about some form of cooperation - e.g. clearing
> this or another bit to ack that the work was caught - of course
> this last thing could be too much, so not necessarily now.
We already discussed this... Surely, we can do this. I believe
this will complicate (and _imho_ uglify) the code too much.
Maybe I am wrong. Could you please provide a detailed description?
Oleg.
On Wed, May 16, 2007 at 02:08:12AM +0400, Oleg Nesterov wrote:
> On 05/15, Jarek Poplawski wrote:
> >
> > I've overheard somebody talking about my favorite 2-nd bit!
...
> We already discussed this... Surely, we can do this. I believe
> this will complicate (and _imho_ uglify) the code too much.
Yes, but now I see you are about to change your mind about
this bit...
> Maybe I am wrong. Could you please provide a detailed description?
I'll try to look more at your solution - maybe it's "not
so bad". I've probably exaggerated yesterday about this
overhead of setting the VALID bit.
Cheers,
Jarek P.
Hello, Oleg.
Oleg Nesterov wrote:
>>> Yes? I need to think more about this.
>> I don't think the test for PENDING should be atomic too. cwq pointer
>> and VALID is one package. PENDING lives its own life as an atomic bit
>> switch.
>
> Yes sure, it should not be atomic. But (VALID && !PENDING) == BUG, so we
> can't just "kill" PENDING form the check above.
We can also mask it out, which is about the same as what we're currently
doing. :-)
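I.e., roughly (again just a sketch, with the same invented mask names as
before):

	if ((atomic_long_read(&work->data) & ~PENDING_MASK) ==
			((unsigned long)cwq | VALID_MASK)) {
		list_del_init(&work->entry);
		ret = 1;
	}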
>>> Memory operations that occur before a LOCK operation may appear to happen
>>> after it completes.
>> Which means that spin_lock() isn't a write barrier.
>
> yes, it is not very clear which "Memory operations" memory-barriers.txt
> describes.
>
>> lock is read
>> barrier, unlock is write barrier.
>
> (To avoid a possible confusion: I am not arguing, I am trying to understand,
> and please also note "in _theory_" above)
>
> Is it so? Shouldn't we document this if it is true?
>
>> Otherwise, locking doesn't make much
>> sense.
>
> Why? Could you please give a code example we have which relies on this?
Let's say there's a shared data structure protected by a spinlock and
two threads are accessing it.
1. thr1 locks spin
2. thr1 updates data structure
3. thr1 unlocks spin
4. thr2 locks spin
5. thr2 accesses data structure
6. thr2 unlocks spin
If spin_unlock is not a write barrier and spin_lock is not a read
barrier, nothing guarantees memory accesses from step#5 will see the
changes made in step#2. Memory fetch can occur during updates in step#2
or even before that.
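In code, the same example (a generic sketch, nothing workqueue-specific;
do_something() is just a placeholder):

	static DEFINE_SPINLOCK(lock);
	static int shared;

	/* thr1 */
	spin_lock(&lock);		/* step 1 */
	shared = 1;			/* step 2 */
	spin_unlock(&lock);		/* step 3: step 2 must not leak below */

	/* thr2 */
	spin_lock(&lock);		/* step 4: step 5 must not leak above */
	if (shared)			/* step 5: must see thr1's update */
		do_something();
	spin_unlock(&lock);		/* step 6 */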
In _theory_, if you had, say, a partial/scoped memory barrier which acts
as a barrier only for a certain range of memory accesses - in this case
the accesses made while holding the spinlock - then those wouldn't have
to be real memory barriers, but AFAIK no such thing exists. I don't think
it's practical to worry about that at this point. It's way too
theoretical. But, yes, it sure needs documentation.
>> If we're going the barrier way, I think we're better off with
>> explicit smp_wmb(). It's only barrier() on x86/64.
>
> Yes. But note that we don't have any reason to do set_wq_data() under
> cwq->lock (this is also true for wake_up(->more_work) btw), so it makes
> sense to do this change anyway. And "wmb + spin_lock" looks a bit strange,
> I _suspect_ spin_lock() means a full barrier on most platforms.
>
> Could you also look at
> http://marc.info/?t=116275561700001&r=1
>
> and, in particular,
> http://marc.info/?l=linux-kernel&m=116281136122456
This is because spin_lock() isn't a write barrier, right? I totally
agree with you there.
Thanks.
--
tejun
Hello Tejun,
On 05/16, Tejun Heo wrote:
>
> >> lock is read barrier, unlock is write barrier.
>
> Let's say there's a shared data structure protected by a spinlock and
> two threads are accessing it.
>
> 1. thr1 locks spin
> 2. thr1 updates data structure
> 3. thr1 unlocks spin
> 4. thr2 locks spin
> 5. thr2 accesses data structure
> 6. thr2 unlocks spin
>
> If spin_unlock is not a write barrier and spin_lock is not a read
> barrier, nothing guarantees memory accesses from step#5 will see the
> changes made in step#2. Memory fetch can occur during updates in step#2
> or even before that.
Ah, but this is something different. Both lock/unlock are full barriers,
but they protect only one direction. A memory op must not leak out of the
critical section, but it may leak in.
A = B; // 1
lock(); // 2
C = D; // 3
this can be re-ordered to
lock(); // 2
C = D; // 3
A = B; // 1
but 2 and 3 must not be re-ordered.
To be sure, I contacted Paul E. McKenney privately, and his reply is
> No. See for example IA64 in file include/asm-ia64/spinlock.h,
> line 34 for spin_lock() and line 92 for spin_unlock(). The
> spin_lock() case uses a ,acq completer, which will allow preceding
> reads to be reordered into the critical section. The spin_unlock()
> uses the ,rel completer, which will allow subsequent writes to be
> reordered into the critical section. The locking primitives are
> guaranteed to keep accesses bound within the critical section, but
> are free to let outside accesses be reordered into the critical
> section.
>
> Download the Itanium Volume 2 manual:
>
> http://developer.intel.com/design/itanium/manuals/245318.htm
>
> Table 2.3 on page 2:489 (physical page 509) shows an example of how
> the rel and acq completers work.
> > Could you also look at
> > http://marc.info/?t=116275561700001&r=1
> >
> > and, in particular,
> > http://marc.info/?l=linux-kernel&m=116281136122456
>
> This is because spin_lock() isn't a write barrier, right? I totally
> agree with you there.
Yes, but in fact I think wake_up() needs a full mb() semantics (which we
don't have _in theory_), because try_to_wake_up() first checks task->state
and does nothing if it is TASK_RUNNING.
That is why I think that smp_mb__before_spinlock() may be useful not only
for workqueue.c
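The pattern I have in mind is the classic one (only a sketch, not actual
workqueue code; COND and wq stand for the sleeper's condition and waitqueue):

	/* sleeper */
	set_current_state(TASK_INTERRUPTIBLE);	/* implies a barrier */
	if (!COND)
		schedule();

	/* waker */
	COND = 1;
	/*
	 * We need the store to COND to be ordered before try_to_wake_up()'s
	 * read of task->state, i.e. a full mb(). The spin_lock() inside
	 * wake_up() is only an acquire; smp_mb__before_spinlock() would
	 * give us that.
	 */
	wake_up(&wq);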
Oleg.
Hello, Oleg.
Oleg Nesterov wrote:
> Hello Tejun,
>
> On 05/16, Tejun Heo wrote:
>>>> lock is read barrier, unlock is write barrier.
>> Let's say there's a shared data structure protected by a spinlock and
>> two threads are accessing it.
>>
>> 1. thr1 locks spin
>> 2. thr1 updates data structure
>> 3. thr1 unlocks spin
>> 4. thr2 locks spin
>> 5. thr2 accesses data structure
>> 6. thr2 unlocks spin
>>
>> If spin_unlock is not a write barrier and spin_lock is not a read
>> barrier, nothing guarantees memory accesses from step#5 will see the
>> changes made in step#2. Memory fetch can occur during updates in step#2
>> or even before that.
>
> Ah, but this is something different. Both lock/unlock are full barriers,
> but they protect only one direction. A memory op must not leak out of the
> critical section, but it may leak in.
>
> A = B; // 1
> lock(); // 2
> C = D; // 3
>
> this can be re-ordered to
>
> lock(); // 2
> C = D; // 3
> A = B; // 1
>
> but 2 and 3 must not be re-ordered.
OIC. Right, barriers with directionality would do that.
> To be sure, I contacted Paul E. McKenney privately, and his reply is
>
> > No. See for example IA64 in file include/asm-ia64/spinlock.h,
> > line 34 for spin_lock() and line 92 for spin_unlock(). The
> > spin_lock() case uses a ,acq completer, which will allow preceding
> > reads to be reordered into the critical section. The spin_unlock()
> > uses the ,rel completer, which will allow subsequent writes to be
> > reordered into the critical section. The locking primitives are
> > guaranteed to keep accesses bound within the critical section, but
> > are free to let outside accesses be reordered into the critical
> > section.
> >
> > Download the Itanium Volume 2 manual:
> >
> > http://developer.intel.com/design/itanium/manuals/245318.htm
> >
> > Table 2.3 on page 2:489 (physical page 509) shows an example of how
> > the rel and acq completers work.
And, there actually is such a beast. Thanks for the enlightenment.
Care to document these?
>>> Could you also look at
>>> http://marc.info/?t=116275561700001&r=1
>>>
>>> and, in particular,
>>> http://marc.info/?l=linux-kernel&m=116281136122456
>> This is because spin_lock() isn't a write barrier, right? I totally
>> agree with you there.
>
> Yes, but in fact I think wake_up() needs a full mb() semantics (which we
> don't have _in theory_), because try_to_wake_up() first checks task->state
> and does nothing if it is TASK_RUNNING.
>
> That is why I think that smp_mb__before_spinlock() may be useful not only
> for workqueue.c
Yeap, I agree.
--
tejun
On Wed, May 16, 2007 at 10:52:03PM +0400, Oleg Nesterov wrote:
...
> Ah, but this is something different. Both lock/unlock are full barriers,
> but they protect only one direction. A memory op must not leak out of the
> critical section, but it may leak in.
>
> A = B; // 1
> lock(); // 2
> C = D; // 3
>
> this can be re-ordered to
>
> lock(); // 2
> C = D; // 3
> A = B; // 1
>
> but 2 and 3 must not be re-ordered.
>
> To be sure, I contacted Paul E. McKenney privately, and his reply is
>
> > No. See for example IA64 in file include/asm-ia64/spinlock.h,
> > line 34 for spin_lock() and line 92 for spin_unlock(). The
> > spin_lock() case uses a ,acq completer, which will allow preceding
> > reads to be reordered into the critical section. The spin_unlock()
> > uses the ,rel completer, which will allow subsequent writes to be
> > reordered into the critical section. The locking primitives are
> > guaranteed to keep accesses bound within the critical section, but
> > are free to let outside accesses be reordered into the critical
> > section.
> >
> > Download the Itanium Volume 2 manual:
> >
> > http://developer.intel.com/design/itanium/manuals/245318.htm
> >
> > Table 2.3 on page 2:489 (physical page 509) shows an example of how
> > the rel and acq completers work.
Hi,
I managed to find some time to look at this all once more,
and here is my current opinion:
1. I think the above-mentioned lock vs. barrier behavior is
in compliance with memory-barriers.txt, and BTW I'd like to
thank and congratulate David Howells for the very good style
(and beautiful pictures).
2. IMHO the current solution with smp barriers is very good:
these barriers are really needed and they should be enough.
Oleg repeats all the time he hates barriers, but I think
it's the wrong approach - they should be seen as something
natural for programming modern processors and trying to
change the code only to avoid them is wrong (unless it
could be done with some minor changes).
3. The alternative solution without barriers, based on the
idea of Tejun Heo and presented in the patch proposal from
2007-05-13, could probably be a little faster (for some
processors), after some changes:
- I think, instead of separate __set_bit and __clear_bit, the
WORK_STRUCT_QUEUED bit should be changed during set_wq_data
(an additional parameter or a new variant, e.g. set_wq_data_queued),
and similarly something like work_clear_pending_queued -
so, no additional operations in place of these barriers
(see the sketch below).
- changing this if condition in try_to_grab_pending.
But there is a question - is it worth occupying the one (last) bit
only for an optimisation (and not always at that)?
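E.g., something like this (only a sketch; set_wq_data_queued() and
work_clear_pending_queued() are the names proposed above, not existing
helpers, and the callers are assumed to hold cwq->lock where needed):

	static inline void set_wq_data_queued(struct work_struct *work,
					      struct cpu_workqueue_struct *cwq)
	{
		/* cwq pointer, PENDING and the new QUEUED flag in one store */
		atomic_long_set(&work->data, (unsigned long)cwq |
				(1UL << WORK_STRUCT_PENDING) |
				(1UL << WORK_STRUCT_QUEUED));
	}

	static inline void work_clear_pending_queued(struct work_struct *work)
	{
		/* keep the cached cwq pointer, drop both flags */
		atomic_long_set(&work->data, atomic_long_read(&work->data) &
				~((1UL << WORK_STRUCT_PENDING) |
				  (1UL << WORK_STRUCT_QUEUED)));
	}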
So, maybe it would be better to think about solving those
other problems in workqueue, mentioned by Oleg somewhere?
Regards,
Jarek P.
On Fri, May 18, 2007 at 09:35:17AM +0200, Jarek Poplawski wrote:
> On Wed, May 16, 2007 at 10:52:03PM +0400, Oleg Nesterov wrote:
...
> 3. The alternative solution without barriers, based on the
> idea of Tejun Heo and presented in the patch proposal from
> 2007-05-13, could probably be a little faster (for some
> processors), after some changes:
I mean "a little faster" than current version. Of course
the second change is needed for other, discussed reasons.
Jarek P.
Hello,
Jarek Poplawski wrote:
> 2. IMHO the current solution with smp barriers is very good:
> these barriers are really needed and they should be enough.
> Oleg repeats all the time he hates barriers, but I think
> it's the wrong approach - they should be seen as something
> natural for programming modern processors and trying to
> change the code only to avoid them is wrong (unless it
> could be done with some minor changes).
>
> 3. The alternative solution without barriers, based on the
> idea of Tejun Heo and presented in the patch proposal from
> 2007-05-13, could probably be a little faster (for some
> processors), after some changes:
>
> - I think, instead of separate __set_bit and __clear_bit,
> WORK_STRUCT_QUEUED bit should be changed during set_wq_data
> (additional parameter or new variant eg: set_wq_data_queued),
> and similarly something like work_clear_pending_queued -
> so, without additional operations instead of these barriers.
>
> - changing this if condition in try_to_grab_pending.
>
> But there is a question - is it worth occupying the one (last) bit
> only for an optimisation (and not always at that)?
I wasn't really aiming for performance optimization. I agree that we
have to live with barriers but it's also true that they and other
synchronization constructs can be difficult to understand and thus to
verify, so IMHO, when it comes to synchronization constructs, it's best to
stick to or make things look similar to more established ways, so that
people can categorize the usage and understand it more easily.
Locked enter/leave model, where each cpu has its own lock and whatever
entity enters and leaves a cpu while holding the respective per-cpu
lock, guarantees that while holding a per-cpu lock, no one enters or
leaves the cpu. It's conceptually simple and basically the semantic
that we're all trying to achieve here.
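Schematically (only a sketch of the model, using the cwq names from this
thread, not a patch):

	/* 'enter' a cpu: only while holding that cpu's lock */
	spin_lock_irq(&cwq->lock);
	list_add_tail(&work->entry, &cwq->worklist);
	spin_unlock_irq(&cwq->lock);

	/* 'leave' the cpu: likewise */
	spin_lock_irq(&cwq->lock);
	list_del_init(&work->entry);
	spin_unlock_irq(&cwq->lock);

	/*
	 * So whoever holds a given cpu's lock knows that nothing can enter
	 * or leave that cpu meanwhile - no extra barriers in the callers.
	 */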
I think things can be made more similar to the locked enter/leave model
using the extra bit by putting manipulation and testing of the bit into
accessor functions, so readability was my primary concern, not
performance. But, then again, we can probably make the barrier() model
readable enough too with proper accessor functions and plenty of
comments above them.
--
tejun
On Fri, May 18, 2007 at 03:33:49PM +0200, Tejun Heo wrote:
...
> I wasn't really aiming for performance optimization. I agree that we
> have to live with barriers but it's also true that they and other
> synchronization constructs can be difficult to understand and thus to
> verify, so IMHO, when it comes to synchronization constructs, it's best to
> stick to or make things look similar to more established ways, so that
> people can categorize the usage and understand it more easily.
Sorry, I'm not convinced by this...
If we want to stick to more understandable, established ways,
why don't we use a global lock (not per cpu but per work or
per workqueue) for insert_work(), which would be really the
simplest construct here. Probably because some compromise is
needed. On the other hand many locks could be replaced with
elaborate constructs with atomic operations like this. But
for some reasons this usually isn't done.
IMHO it's better to get used to barriers, because, as seen
in this thread, even after finding a better solution, the
problem of execution ordering didn't disappear. Another
"strange" question is: if these barriers are so tough, why
Oleg managed to do them seemingly right from the very first
time? But even if he had made some error here, I think it
should be easier to check for this one problem only, than to
risk the "stability" of the whole algorithm by changing more.
Of course now, after the time was spent anyway, this new
"old-way" is more attractive.
>
> Locked enter/leave model, where each cpu has its own lock and whatever
> entity enters and leaves a cpu while holding the respective per-cpu
> lock, guarantees that while holding a per-cpu lock, no one enters or
> leaves the cpu. It's conceptually simple and basically the semantic
> that we're all trying to achieve here.
I prefer simplicity over performance too. But there is a
question of price... These modern things like RCU or barriers
are unavoidable, and probably we should get used to thinking that
ALPHA is right and 'normal' here - not x86 (which limits
compilers and programmers from making the most of all those
new caches).
>
> I think things can be made more similar to the locked enter/leave model
> using the extra bit by putting manipulation and testing of the bit into
> accessor functions, so readability was my primary concern, not
> performance. But, then again, we can probably make the barrier() model
> readable enough too with proper accessor functions and plenty of
> comments above them.
Yes, the good thing is Oleg can choose between two good
solutions here! (I don't envy him...)
Thanks for your response, sorry for my delay &
Best regards,
Jarek P.
Hello,
Jarek Poplawski wrote:
> If we want to stick to more understandable, established ways,
> why don't we use a global lock (not per cpu but per work or
> per workqueue) for insert_work(), which would be really the
> simplest construct here. Probably because some compromise is
> needed. On the other hand many locks could be replaced with
> elaborate constructs with atomic operations like this. But
> for some reasons this usually isn't done.
>
> IMHO it's better to get used to barriers, because, as seen
> in this thread, even after finding a better solution, the
> problem of execution ordering didn't disappear. Another
> "strange" question is: if these barriers are so tough, why
> Oleg managed to do them seemingly right from the very first
> time? But even if he had made some error here, I think it
> should be easier to check for this one problem only, than to
> risk the "stability" of the whole algorithm by changing more.
> Of course now, after the time was spent anyway, this new
> "old-way" is more attractive.
Well, it might be just because I'm used to what I call 'locked
enter/leave' model. If people don't find that easier to understand, no
objection from me and kudos to Oleg for getting it right from the beginning.
>> I think things can be made more similar to the locked enter/leave model
>> using the extra bit by putting manipulation and testing of the bit into
>> accessor functions, so readability was my primary concern, not
>> performance. But, then again, we can probably make the barrier() model
>> readable enough too with proper accessor functions and plenty of
>> comments above them.
>
> Yes, the good thing is Oleg can choose between two good
> solutions here! (I don't envy him...)
As long as it's documented sufficiently, both are good. It's your
decision, Oleg. :-)
--
tejun
On Mon, May 21, 2007 at 10:59:46AM +0200, Tejun Heo wrote:
...
> Well, it might be just because I'm used to what I call 'locked
> enter/leave' model. If people don't find that easier to understand, no
> objection from me and kudos to Oleg for getting it right from the beginning.
I think most people here, myself included, are used to the same.
At first I didn't even consider whether Oleg's feelings about these
barriers were right - it was obvious to me that any barriers (!) should
be avoided.
But while reading David's manual I started to doubt: there had to
be some reasons these CPUs were designed in such a nasty way. And probably
it wasn't about punishing the programmers. So, maybe sometimes
good, old habits aren't the best advisors...
Cheers,
Jarek P.