From: Daniel Wagner <[email protected]>
Hi,
As Peter correctly pointed out in [1], a simple conversion from
wait to swait in completion.c won't work. I played around a bit and
came up with this rather ugly idea.
So in case complete_all() is called in hard irq context, we just wake
up one waiter and let that one call swake_up_all(). For this I needed
to somehow transfer this information from complete_all() to
wait_for_completion(). The only working idea I found was to introduce
a new flag in struct completion (see the sketch below and the full
patch that follows). Ideas on how to overcome this problem are highly
appreciated.
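In short, the idea looks like this (just the gist; the full patch is
below):

	/* complete_all(): from hard irq context, wake only one
	 * waiter and mark the completion: */
	if (in_irq()) {
		x->flags = COMPLETION_DEFER;
		swake_up(&x->wait);
	} else {
		swake_up_all(&x->wait);
	}

	/* ...and the woken waiter finishes the job in
	 * __wait_for_common(): */
	if (x->flags & COMPLETION_DEFER) {
		x->flags = 0;
		swake_up_all(&x->wait);
	}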
I also did some performance measurements with the test program below.
The test creates a trigger thread and a bunch of waiter threads. The
trigger thread calls complete_all() either from thread context or from
hard irq context, and the time needed for 1000 iterations is measured.
This was done on an idle IvyBridge machine with 64 logical cores
(E5-4610).
waiter_nr: number of waiter threads
irqwork:   0 = complete_all() from thread context,
           1 = complete_all() from irq_work()
           wait       swait      swait
waiter_nr  64         64         64
irqwork    0          1          0
count      66.000000  86.000000  81.000000
mean       0.378318   0.315221   0.344642
std        0.018468   0.007115   0.021708
min        0.344000   0.291000   0.294000
25%        0.364000   0.312000   0.336000
50%        0.382500   0.316500   0.341000
75%        0.395000   0.320000   0.355000
max        0.407000   0.329000   0.403000
cheers,
daniel
completion-test.c:
#include <linux/module.h>
#include <linux/wait.h>
#include <linux/swait.h>
#include <linux/kthread.h>
#include <linux/threads.h>
#include <linux/slab.h>
#include <linux/delay.h>
#include <linux/completion.h>
#include <linux/irq_work.h>
#include <linux/time.h>
static unsigned int waiter_nr = 5;
static bool irqwork = true;
static unsigned int counter = 1000;
module_param(waiter_nr, uint,
S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP);
MODULE_PARM_DESC(waiter_nr, "Number of waiter threads");
module_param(irqwork, bool,
S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP);
MODULE_PARM_DESC(irqwork, "irqwork");
module_param(counter, uint,
S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP);
MODULE_PARM_DESC(counter, "counter");
struct completion_test {
	/*
	 * We need two completions to avoid a race with reinit of the
	 * completion.
	 */
struct completion sync_stage1;
struct completion sync_stage2;
wait_queue_head_t wq_stage1;
wait_queue_head_t wq_stage2;
atomic_t cnt_stage1;
atomic_t cnt_stage2;
struct irq_work irq_work;
};
static struct completion_test test_data;
static struct task_struct **waiter_tasks;
static struct task_struct *trigger_task;
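/* Runs in hard irq context via irq_work and calls complete_all() there. */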
static void trigger_irq(struct irq_work *arg)
{
struct completion_test *ct =
container_of(arg, struct completion_test, irq_work);
complete_all(&ct->sync_stage1);
}
static int waiter(void *arg)
{
struct completion_test *ct = arg;
for (;;) {
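		/*
		 * Two-stage handshake with the trigger thread: report in
		 * via cnt_stage1/wq_stage1 and block on sync_stage1 until
		 * the trigger's complete_all(); then repeat for stage 2
		 * while the trigger reinitializes stage 1.
		 */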
atomic_inc(&ct->cnt_stage1);
wake_up(&ct->wq_stage1);
wait_for_completion_interruptible(&ct->sync_stage1);
if (kthread_should_stop())
break;
atomic_inc(&ct->cnt_stage2);
wake_up(&ct->wq_stage2);
wait_for_completion_interruptible(&ct->sync_stage2);
if (kthread_should_stop())
break;
}
return 0;
}
static int trigger(void *arg)
{
struct completion_test *ct = arg;
struct timespec ts_start, ts;
unsigned long cnt;
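	/* Measure and print the wall time of every 'counter' rounds. */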
cnt = counter;
ts_start = current_kernel_time();
for (;;) {
cnt--;
if (cnt == 0) {
ts = timespec_sub(current_kernel_time(), ts_start);
printk("%ld.%.9ld\n", ts.tv_sec, ts.tv_nsec);
cnt = counter;
ts_start = current_kernel_time();
}
wait_event_interruptible(ct->wq_stage1,
		atomic_read(&ct->cnt_stage1) >= waiter_nr);
if (kthread_should_stop()) {
complete_all(&ct->sync_stage1);
break;
}
atomic_set(&ct->cnt_stage2, 0);
reinit_completion(&ct->sync_stage2);
if (irqwork)
irq_work_queue(&ct->irq_work);
else
complete_all(&ct->sync_stage1);
wait_event_interruptible(ct->wq_stage2,
		atomic_read(&ct->cnt_stage2) >= waiter_nr);
if (kthread_should_stop()) {
complete_all(&ct->sync_stage2);
break;
}
reinit_completion(&ct->sync_stage1);
atomic_set(&ct->cnt_stage1, 0);
complete_all(&ct->sync_stage2);
}
return 0;
}
static void __exit completion_test_module_cleanup(void)
{
unsigned int i;
if (trigger_task)
kthread_stop(trigger_task);
if (waiter_tasks) {
for (i = 0; i < waiter_nr; i++) {
if (waiter_tasks[i] && !IS_ERR(waiter_tasks[i]))
kthread_stop(waiter_tasks[i]);
}
kfree(waiter_tasks);
}
}
static int __init completion_test_module_init(void)
{
struct completion_test *ct = &test_data;
unsigned int i;
int err;
init_completion(&ct->sync_stage1);
init_completion(&ct->sync_stage2);
init_waitqueue_head(&ct->wq_stage1);
init_waitqueue_head(&ct->wq_stage2);
atomic_set(&ct->cnt_stage1, 0);
atomic_set(&ct->cnt_stage2, 0);
init_irq_work(&ct->irq_work, trigger_irq);
waiter_tasks = kcalloc(waiter_nr, sizeof(waiter_tasks[0]), GFP_KERNEL);
if (!waiter_tasks) {
printk("out of memory\n");
err = -ENOMEM;
goto unwind;
}
for (i = 0; i < waiter_nr; i++) {
waiter_tasks[i] = kthread_run(waiter, ct, "waiter/%u", i);
if (IS_ERR(waiter_tasks[i])) {
err = PTR_ERR(waiter_tasks[i]);
goto unwind;
}
}
trigger_task = kthread_run(trigger, ct, "trigger");
if (IS_ERR(trigger_task)) {
err = PTR_ERR(trigger_task);
/* don't kthread_stop() an ERR_PTR in cleanup */
trigger_task = NULL;
goto unwind;
}
return 0;
unwind:
completion_test_module_cleanup();
return err;
}
module_init(completion_test_module_init);
module_exit(completion_test_module_cleanup);
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR("Daniel Wagner");
MODULE_DESCRIPTION("completion test");
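For completeness, I load the module like this (module name as built
from the file above; parameters as described at the top):

	# insmod completion-test.ko waiter_nr=64 irqwork=1 counter=1000

The time for each batch of 'counter' iterations then shows up in dmesg.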
[1] http://thread.gmane.org/gmane.linux.kernel/2034867/focus=2034873
Daniel Wagner (1):
sched/completion: convert completions to use simple wait queues
include/linux/completion.h | 14 ++++++++++----
kernel/sched/completion.c | 41 +++++++++++++++++++++++++----------------
2 files changed, 35 insertions(+), 20 deletions(-)
--
2.5.0
From: Daniel Wagner <[email protected]>
Completions have no long-lasting callbacks and therefore do not need
the complex waitqueue variant. Using simple waitqueues reduces
contention on the waitqueue lock.
This was carried forward from v3.10-rt, with some RT-specific chunks
dropped, and updated to align with the names chosen for the simple
waitqueue support.
[wagi: Added flag to defer swake_up_all() from irq context]
Signed-off-by: Daniel Wagner <[email protected]>
---
include/linux/completion.h | 14 ++++++++++----
kernel/sched/completion.c | 41 +++++++++++++++++++++++++----------------
2 files changed, 35 insertions(+), 20 deletions(-)
diff --git a/include/linux/completion.h b/include/linux/completion.h
index 5d5aaae..a6b2e07 100644
--- a/include/linux/completion.h
+++ b/include/linux/completion.h
@@ -8,7 +8,7 @@
* See kernel/sched/completion.c for details.
*/
-#include <linux/wait.h>
+#include <linux/swait.h>
/*
* struct completion - structure used to maintain state for a "completion"
@@ -22,13 +22,17 @@
* reinit_completion(), and macros DECLARE_COMPLETION(),
* DECLARE_COMPLETION_ONSTACK().
*/
+
+#define COMPLETION_DEFER (1 << 0)
+
struct completion {
+ unsigned int flags;
unsigned int done;
- wait_queue_head_t wait;
+ struct swait_queue_head wait;
};
#define COMPLETION_INITIALIZER(work) \
- { 0, __WAIT_QUEUE_HEAD_INITIALIZER((work).wait) }
+ { 0, 0, __SWAIT_QUEUE_HEAD_INITIALIZER((work).wait) }
#define COMPLETION_INITIALIZER_ONSTACK(work) \
({ init_completion(&work); work; })
@@ -72,8 +76,9 @@ struct completion {
*/
static inline void init_completion(struct completion *x)
{
+ x->flags = 0;
x->done = 0;
- init_waitqueue_head(&x->wait);
+ init_swait_queue_head(&x->wait);
}
/**
@@ -85,6 +90,7 @@ static inline void init_completion(struct completion *x)
*/
static inline void reinit_completion(struct completion *x)
{
+ x->flags = 0;
x->done = 0;
}
diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c
index 8d0f35d..95b08a9 100644
--- a/kernel/sched/completion.c
+++ b/kernel/sched/completion.c
@@ -30,10 +30,10 @@ void complete(struct completion *x)
{
unsigned long flags;
- spin_lock_irqsave(&x->wait.lock, flags);
+ raw_spin_lock_irqsave(&x->wait.lock, flags);
x->done++;
- __wake_up_locked(&x->wait, TASK_NORMAL, 1);
- spin_unlock_irqrestore(&x->wait.lock, flags);
+ swake_up_locked(&x->wait);
+ raw_spin_unlock_irqrestore(&x->wait.lock, flags);
}
EXPORT_SYMBOL(complete);
@@ -50,10 +50,15 @@ void complete_all(struct completion *x)
{
unsigned long flags;
- spin_lock_irqsave(&x->wait.lock, flags);
+ raw_spin_lock_irqsave(&x->wait.lock, flags);
x->done += UINT_MAX/2;
- __wake_up_locked(&x->wait, TASK_NORMAL, 0);
- spin_unlock_irqrestore(&x->wait.lock, flags);
+ raw_spin_unlock_irqrestore(&x->wait.lock, flags);
+ if (in_irq()) {
+ x->flags = COMPLETION_DEFER;
+ swake_up(&x->wait);
+ } else {
+ swake_up_all(&x->wait);
+ }
}
EXPORT_SYMBOL(complete_all);
@@ -62,20 +67,20 @@ do_wait_for_common(struct completion *x,
long (*action)(long), long timeout, int state)
{
if (!x->done) {
- DECLARE_WAITQUEUE(wait, current);
+ DECLARE_SWAITQUEUE(wait);
- __add_wait_queue_tail_exclusive(&x->wait, &wait);
+ __prepare_to_swait(&x->wait, &wait);
do {
if (signal_pending_state(state, current)) {
timeout = -ERESTARTSYS;
break;
}
__set_current_state(state);
- spin_unlock_irq(&x->wait.lock);
+ raw_spin_unlock_irq(&x->wait.lock);
timeout = action(timeout);
- spin_lock_irq(&x->wait.lock);
+ raw_spin_lock_irq(&x->wait.lock);
} while (!x->done && timeout);
- __remove_wait_queue(&x->wait, &wait);
+ __finish_swait(&x->wait, &wait);
if (!x->done)
return timeout;
}
@@ -89,9 +94,13 @@ __wait_for_common(struct completion *x,
{
might_sleep();
- spin_lock_irq(&x->wait.lock);
+ raw_spin_lock_irq(&x->wait.lock);
timeout = do_wait_for_common(x, action, timeout, state);
- spin_unlock_irq(&x->wait.lock);
+ raw_spin_unlock_irq(&x->wait.lock);
+ if (x->flags & COMPLETION_DEFER) {
+ x->flags = 0;
+ swake_up_all(&x->wait);
+ }
return timeout;
}
@@ -277,12 +286,12 @@ bool try_wait_for_completion(struct completion *x)
if (!READ_ONCE(x->done))
return 0;
- spin_lock_irqsave(&x->wait.lock, flags);
+ raw_spin_lock_irqsave(&x->wait.lock, flags);
if (!x->done)
ret = 0;
else
x->done--;
- spin_unlock_irqrestore(&x->wait.lock, flags);
+ raw_spin_unlock_irqrestore(&x->wait.lock, flags);
return ret;
}
EXPORT_SYMBOL(try_wait_for_completion);
@@ -311,7 +320,7 @@ bool completion_done(struct completion *x)
* after it's acquired the lock.
*/
smp_rmb();
- spin_unlock_wait(&x->wait.lock);
+ raw_spin_unlock_wait(&x->wait.lock);
return true;
}
EXPORT_SYMBOL(completion_done);
--
2.5.0
* Daniel Wagner | 2016-03-08 16:59:13 [+0100]:
>Hi,
Hi,
>As Peter correctly pointed out in [1], a simple conversion from
>wait to swait in completion.c won't work. I played around a bit and
>came up with this rather ugly idea.
Besides all the things I mentioned privately, here is what I have
currently in -RT:
+void swake_up_all_locked(struct swait_queue_head *q)
+{
+ struct swait_queue *curr;
+ int wakes = 0;
+
+ while (!list_empty(&q->task_list)) {
+
+ curr = list_first_entry(&q->task_list, typeof(*curr),
+ task_list);
+ wake_up_process(curr->task);
+ list_del_init(&curr->task_list);
+ wakes++;
+ }
+ WARN_ON(wakes > 2);
+}
+EXPORT_SYMBOL(swake_up_all_locked);
The remaining part is what you have. The only user so far is complete()
and currently I only see complete_all() with zero or one waiter.
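Presumably this gets wired into complete_all() roughly like this (a
sketch based on the swait conversion in this thread, not the actual
-RT patch):

	void complete_all(struct completion *x)
	{
		unsigned long flags;

		raw_spin_lock_irqsave(&x->wait.lock, flags);
		x->done += UINT_MAX/2;
		/* wake every waiter while still holding the lock */
		swake_up_all_locked(&x->wait);
		raw_spin_unlock_irqrestore(&x->wait.lock, flags);
	}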
If none of my boxes die overnight, I intend to release this tomorrow
in -RT and see if someone else triggers the limit.
However, I don't think your DEFER flag solution is all that bad. I
also have block-mq in -RT using swait, and it performs wakeups with
irqs off. That is not -RT specific but mainline. So we might need
something to make it work properly. But if we defer the wakeup, they
might come at us and complain about the latency…
Sebastian
On 03/08/2016 06:52 PM, Sebastian Andrzej Siewior wrote:
> However, I don't think your DEFER flag solution is all that bad. I
> also have block-mq in -RT using swait, and it performs wakeups with
> irqs off. That is not -RT specific but mainline. So we might need
> something to make it work properly. But if we defer the wakeup, they
> might come at us and complain about the latency…
I intend to extend the test code to measure the latency diff as well,
just to get a feeling for how bad it is. At least with a lot of
waiters on the completion, the current measurements indicate an
improvement. I guess things look quite different with a real workload.
cheers,
daniel
On Tue, Mar 08, 2016 at 06:52:06PM +0100, Sebastian Andrzej Siewior wrote:
> [...]
> However, I don't think your DEFER flag solution is all that bad. I
> also have block-mq in -RT using swait, and it performs wakeups with
> irqs off. That is not -RT specific but mainline. So we might need
> something to make it work properly. But if we defer the wakeup, they
> might come at us and complain about the latency…
Is it really just about latency? Does this deferral not lead to an
inversion in the case where the single woken task isn't the highest
priority waiter on the completion (and doesn't run due to a
middle-priority thing spinning)?
In order for this to work, it seems like the chosen waiter would need to
inherit the highest priority of all waiters (which AFAICT isn't
happening).
Josh
* Josh Cartwright | 2016-03-08 12:26:56 [-0600]:
>Is it really just about latency? Does this deferral not lead to an
>inversion in the case where the single woken task isn't the highest
>priority waiter on the completion (and doesn't run due to a
>middle-priority thing spinning)?
This would be the case, yes, and not only with deferral. Say you have
two waiters: the first one is MID-prio and the second is HI-prio.
Currently, after the wakeup of the MID-prio waiter you get preempted.
Waking all of them at once would put the second waiter on the CPU
first. Same thing without the deferral flag.
>In order for this to work, it seems like the chosen waiter would need to
>inherit the highest priority of all waiters (which AFAICT isn't
>happening).
sorting the waiters by priority? This will be fun. This is only done for
the rtmutex waiters.
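If one really wanted that, a list-based sketch could look like this
(hypothetical helper, not the actual swait API):

	/*
	 * Keep the list sorted by task->prio (lower value means higher
	 * priority) so that a single swake_up() always picks the
	 * highest-priority waiter.
	 */
	static void swait_add_prio(struct swait_queue_head *q,
				   struct swait_queue *wait)
	{
		struct swait_queue *curr;

		list_for_each_entry(curr, &q->task_list, task_list) {
			if (wait->task->prio < curr->task->prio) {
				/* insert before the first lower-prio waiter */
				list_add_tail(&wait->task_list,
					      &curr->task_list);
				return;
			}
		}
		list_add_tail(&wait->task_list, &q->task_list);
	}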
Sebastian
On Wed, 9 Mar 2016 13:24:23 +0100
Sebastian Andrzej Siewior <[email protected]> wrote:
> [...]
> >In order for this to work, it seems like the chosen waiter would need to
> >inherit the highest priority of all waiters (which AFAICT isn't
> >happening).
>
> sorting the waiters by priority? This will be fun. This is only done for
> the rtmutex waiters.
>
Hmm, perhaps we should use an rbtree to sort simple waiters by
priority :-)
Probably won't make them simple anymore.
-- Steve
On 03/28/2016 08:57 PM, Steven Rostedt wrote:
> [...]
> Hmm, perhaps we should use an rbtree to sort simple waiters by
> priority :-)
>
> Probably won't make them simple anymore.
I am going through the users of complete_all(). So far I couldn't
identify any user with more than one waiter. That doesn't mean such
users don't exist (I may have overlooked some).
The -rt version of this patch contains a WARN_ON() in complete_all()
if there is more than one waiter. During resume some warnings are
printed. Let's see if more reports show up.
If the common use case is a single waiter, and we only see several
waiters in slow paths, I would like to avoid adding fairness and
increasing the complexity.
cheers,
daniel