From: Andrei Vagin <[email protected]>
Add the complete_on_current_cpu() and wake_up_poll_on_current_cpu() helpers
to wake up tasks on the current CPU.

These two helpers are useful when a task needs to make a synchronous
context switch to another task. In this context, synchronous means the
waker wakes up the target task and goes to sleep right after that.

One example of such a workload is seccomp user notify. This mechanism
allows a supervisor process to handle system calls on behalf of a target
process. While the supervisor is handling an intercepted system call, the
target process is blocked in the kernel, waiting for a response to come
back.

On-CPU context switches are much faster than regular ones.
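To illustrate the intended handoff, here is a hypothetical sketch; the
structure and identifiers below (struct notif, reply_ready, and so on) are
illustrative and not part of this patch:

#include <linux/completion.h>
#include <linux/poll.h>
#include <linux/wait.h>

/* Illustrative request/reply pair shared by the two tasks. */
struct notif {
	struct wait_queue_head	wq;		/* the supervisor polls this */
	struct completion	reply_ready;	/* the target blocks on this */
	long			reply;
};

/*
 * Target side: publish a request, wake the supervisor on the current
 * CPU, then immediately go to sleep waiting for the reply.
 */
static int target_wait_for_reply(struct notif *n)
{
	wake_up_poll_on_current_cpu(&n->wq, EPOLLIN);
	return wait_for_completion_interruptible(&n->reply_ready);
}

/*
 * Supervisor side: post the reply and wake the blocked target on the
 * CPU the supervisor is running on, because the supervisor goes back to
 * sleep right after this, waiting for the next request.
 */
static void supervisor_send_reply(struct notif *n, long reply)
{
	n->reply = reply;
	complete_on_current_cpu(&n->reply_ready);
}

Since each side goes to sleep right after waking the other, the wakee can
simply run where the waker just ran: the switch stays on one CPU with a
warm cache instead of kicking a remote CPU and migrating the task there.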
Signed-off-by: Andrei Vagin <[email protected]>
---
include/linux/completion.h | 1 +
include/linux/swait.h | 1 +
include/linux/wait.h | 3 +++
kernel/sched/completion.c | 12 ++++++++++++
kernel/sched/core.c | 2 +-
kernel/sched/swait.c | 11 +++++++++++
kernel/sched/wait.c | 5 +++++
7 files changed, 34 insertions(+), 1 deletion(-)
diff --git a/include/linux/completion.h b/include/linux/completion.h
index 62b32b19e0a8..fb2915676574 100644
--- a/include/linux/completion.h
+++ b/include/linux/completion.h
@@ -116,6 +116,7 @@ extern bool try_wait_for_completion(struct completion *x);
extern bool completion_done(struct completion *x);
extern void complete(struct completion *);
+extern void complete_on_current_cpu(struct completion *x);
extern void complete_all(struct completion *);
#endif
diff --git a/include/linux/swait.h b/include/linux/swait.h
index 6a8c22b8c2a5..1f27b254adf5 100644
--- a/include/linux/swait.h
+++ b/include/linux/swait.h
@@ -147,6 +147,7 @@ static inline bool swq_has_sleeper(struct swait_queue_head *wq)
extern void swake_up_one(struct swait_queue_head *q);
extern void swake_up_all(struct swait_queue_head *q);
extern void swake_up_locked(struct swait_queue_head *q);
+extern void swake_up_locked_on_current_cpu(struct swait_queue_head *q);
extern void prepare_to_swait_exclusive(struct swait_queue_head *q, struct swait_queue *wait, int state);
extern long prepare_to_swait_event(struct swait_queue_head *q, struct swait_queue *wait, int state);
diff --git a/include/linux/wait.h b/include/linux/wait.h
index a0307b516b09..5ec7739400f4 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -210,6 +210,7 @@ __remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq
}
int __wake_up(struct wait_queue_head *wq_head, unsigned int mode, int nr, void *key);
+void __wake_up_on_current_cpu(struct wait_queue_head *wq_head, unsigned int mode, void *key);
void __wake_up_locked_key(struct wait_queue_head *wq_head, unsigned int mode, void *key);
void __wake_up_locked_key_bookmark(struct wait_queue_head *wq_head,
unsigned int mode, void *key, wait_queue_entry_t *bookmark);
@@ -237,6 +238,8 @@ void __wake_up_pollfree(struct wait_queue_head *wq_head);
#define key_to_poll(m) ((__force __poll_t)(uintptr_t)(void *)(m))
#define wake_up_poll(x, m) \
__wake_up(x, TASK_NORMAL, 1, poll_to_key(m))
+#define wake_up_poll_on_current_cpu(x, m) \
+ __wake_up_on_current_cpu(x, TASK_NORMAL, poll_to_key(m))
#define wake_up_locked_poll(x, m) \
__wake_up_locked_key((x), TASK_NORMAL, poll_to_key(m))
#define wake_up_interruptible_poll(x, m) \
diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c
index d57a5c1c1cd9..a1931a79c05a 100644
--- a/kernel/sched/completion.c
+++ b/kernel/sched/completion.c
@@ -38,6 +38,18 @@ void complete(struct completion *x)
}
EXPORT_SYMBOL(complete);
+void complete_on_current_cpu(struct completion *x)
+{
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&x->wait.lock, flags);
+
+ if (x->done != UINT_MAX)
+ x->done++;
+ swake_up_locked_on_current_cpu(&x->wait);
+ raw_spin_unlock_irqrestore(&x->wait.lock, flags);
+}
+
/**
* complete_all: - signals all threads waiting on this completion
* @x: holds the state of this particular completion
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 6478e819eb99..c81866821139 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6874,7 +6874,7 @@ asmlinkage __visible void __sched preempt_schedule_irq(void)
int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags,
void *key)
{
- WARN_ON_ONCE(IS_ENABLED(CONFIG_SCHED_DEBUG) && wake_flags & ~WF_SYNC);
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_SCHED_DEBUG) && wake_flags & ~(WF_SYNC|WF_CURRENT_CPU));
return try_to_wake_up(curr->private, mode, wake_flags);
}
EXPORT_SYMBOL(default_wake_function);
diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c
index 76b9b796e695..9ebe23868942 100644
--- a/kernel/sched/swait.c
+++ b/kernel/sched/swait.c
@@ -31,6 +31,17 @@ void swake_up_locked(struct swait_queue_head *q)
}
EXPORT_SYMBOL(swake_up_locked);
+void swake_up_locked_on_current_cpu(struct swait_queue_head *q)
+{
+ struct swait_queue *curr;
+
+ if (list_empty(&q->task_list))
+ return;
+
+ curr = list_first_entry(&q->task_list, typeof(*curr), task_list);
+ try_to_wake_up(curr->task, TASK_NORMAL, WF_CURRENT_CPU);
+ list_del_init(&curr->task_list);
+}
/*
* Wake up all waiters. This is an interface which is solely exposed for
* completions and not for general usage.
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 133b74730738..47803a0b8d5d 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -161,6 +161,11 @@ int __wake_up(struct wait_queue_head *wq_head, unsigned int mode,
}
EXPORT_SYMBOL(__wake_up);
+void __wake_up_on_current_cpu(struct wait_queue_head *wq_head, unsigned int mode, void *key)
+{
+ __wake_up_common_lock(wq_head, mode, 1, WF_CURRENT_CPU, key);
+}
+
/*
* Same as __wake_up but called with the spinlock in wait_queue_head_t held.
*/
--
2.39.0.314.g84b9a713c41-goog
On Tue, Jan 10, 2023 at 01:30:08PM -0800, Andrei Vagin wrote:
> diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c
> index d57a5c1c1cd9..a1931a79c05a 100644
> --- a/kernel/sched/completion.c
> +++ b/kernel/sched/completion.c
> @@ -38,6 +38,18 @@ void complete(struct completion *x)
> }
> EXPORT_SYMBOL(complete);
>
> +void complete_on_current_cpu(struct completion *x)
> +{
> + unsigned long flags;
> +
> + raw_spin_lock_irqsave(&x->wait.lock, flags);
> +
> + if (x->done != UINT_MAX)
> + x->done++;
> + swake_up_locked_on_current_cpu(&x->wait);
> + raw_spin_unlock_irqrestore(&x->wait.lock, flags);
> +}
> +
> /**
> * complete_all: - signals all threads waiting on this completion
> * @x: holds the state of this particular completion
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 6478e819eb99..c81866821139 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -6874,7 +6874,7 @@ asmlinkage __visible void __sched preempt_schedule_irq(void)
> int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags,
> void *key)
> {
> - WARN_ON_ONCE(IS_ENABLED(CONFIG_SCHED_DEBUG) && wake_flags & ~WF_SYNC);
> + WARN_ON_ONCE(IS_ENABLED(CONFIG_SCHED_DEBUG) && wake_flags & ~(WF_SYNC|WF_CURRENT_CPU));
> return try_to_wake_up(curr->private, mode, wake_flags);
> }
> EXPORT_SYMBOL(default_wake_function);
> diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c
> index 76b9b796e695..9ebe23868942 100644
> --- a/kernel/sched/swait.c
> +++ b/kernel/sched/swait.c
> @@ -31,6 +31,17 @@ void swake_up_locked(struct swait_queue_head *q)
> }
> EXPORT_SYMBOL(swake_up_locked);
>
> +void swake_up_locked_on_current_cpu(struct swait_queue_head *q)
> +{
> + struct swait_queue *curr;
> +
> + if (list_empty(&q->task_list))
> + return;
> +
> + curr = list_first_entry(&q->task_list, typeof(*curr), task_list);
> + try_to_wake_up(curr->task, TASK_NORMAL, WF_CURRENT_CPU);
> + list_del_init(&curr->task_list);
> +}
> /*
> * Wake up all waiters. This is an interface which is solely exposed for
> * completions and not for general usage.
> diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
> index 133b74730738..47803a0b8d5d 100644
> --- a/kernel/sched/wait.c
> +++ b/kernel/sched/wait.c
> @@ -161,6 +161,11 @@ int __wake_up(struct wait_queue_head *wq_head, unsigned int mode,
> }
> EXPORT_SYMBOL(__wake_up);
>
> +void __wake_up_on_current_cpu(struct wait_queue_head *wq_head, unsigned int mode, void *key)
> +{
> + __wake_up_common_lock(wq_head, mode, 1, WF_CURRENT_CPU, key);
> +}
> +
> /*
> * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
> */
So I hate this patch; it adds a whole ton of duplication and litter for
no real reason afaict. For instance, you could've just added a
wake_flags argument to swake_up_locked(); there aren't *that* many users
-- unlike complete().

And for complete(), instead of fully duplicating the function (always a
bad idea), you could've made complete_flags() and implemented complete()
using that, or something.

Anyway, let me go (finally) have a look at Chen's patch, since that
might render all this moot.
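For reference, a minimal sketch of the shape being suggested here; the
name complete_with_flags() and the exact signatures are illustrative, not
code from this series:

/* kernel/sched/swait.c: one implementation, parameterized by wake flags */
void swake_up_locked(struct swait_queue_head *q, int wake_flags)
{
	struct swait_queue *curr;

	if (list_empty(&q->task_list))
		return;

	curr = list_first_entry(&q->task_list, typeof(*curr), task_list);
	try_to_wake_up(curr->task, TASK_NORMAL, wake_flags);
	list_del_init(&curr->task_list);
}

/* kernel/sched/completion.c: complete() becomes a thin wrapper */
static void complete_with_flags(struct completion *x, int wake_flags)
{
	unsigned long flags;

	raw_spin_lock_irqsave(&x->wait.lock, flags);

	if (x->done != UINT_MAX)
		x->done++;
	swake_up_locked(&x->wait, wake_flags);
	raw_spin_unlock_irqrestore(&x->wait.lock, flags);
}

void complete(struct completion *x)
{
	complete_with_flags(x, 0);
}

void complete_on_current_cpu(struct completion *x)
{
	complete_with_flags(x, WF_CURRENT_CPU);
}

With this shape, the wait-queue walk and the x->done bookkeeping exist
exactly once, and the current-CPU variants differ only in the flags they
pass down.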
On Mon, Jan 16, 2023 at 1:59 AM Peter Zijlstra <[email protected]> wrote:
> So I hate this patch; it adds a whole ton of duplication and litter for
> no real reason afaict. For instance, you could've just added a
> wake_flags argument to swake_up_locked(); there aren't *that* many users
> -- unlike complete().
>
> And for complete(), instead of fully duplicating the function (always a
> bad idea), you could've made complete_flags() and implemented complete()
> using that, or something.
>
> Anyway, let me go (finally) have a look at Chen's patch, since that
> might render all this moot.
If it is the only concern, it is easy to fix. I think I decided to do it
this way because swake_up_locked is in include/linux/swait.h, but wakeup
flags are in kernel/sched/sched.h. I agree that it is better to avoid
this code duplication.
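For readers outside kernel/sched/, the layering in question looks roughly
like this (a sketch only; the flag values are indicative):

/* include/linux/swait.h -- a kernel-wide header, visible to all callers */
extern void swake_up_locked(struct swait_queue_head *q, int wake_flags);

/* kernel/sched/sched.h -- scheduler-private header where WF_* live */
#define WF_SYNC		0x10	/* indicative value */
#define WF_CURRENT_CPU	0x40	/* new flag; indicative value */

So passing a WF_* flag to swake_up_locked() from code outside
kernel/sched/ would mean either exposing these definitions in a more
public header or keeping per-flag wrappers inside the scheduler, which is
why the posted patch added separate wrappers instead of a flags argument.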
Regarding Chen's proposed patch, unfortunately, it does not solve our
problem. It works only in narrow conditions. One of the major problems
is that it does nothing if there are idle cores. The advantage of my
changes is that they work predictably, and they provide benefits
regardless of other workloads running on the same host.
Thanks,
Andrei
On Wed, Jan 18, 2023 at 10:45:33PM -0800, Andrei Vagin wrote:
> If it is the only concern, it is easy to fix. I think I decided to do it
> this way because swake_up_locked is in include/linux/swait.h, but wakeup
> flags are in kernel/sched/sched.h. I agree that it is better to avoid
> this code duplication.
Thanks.
> Regarding Chen's proposed patch, unfortunately, it does not solve our
> problem. It works only in narrow conditions. One of the major problems
> is that it does nothing if there are idle cores. The advantage of my
> changes is that they work predictably, and they provide benefits
> regardless of other workloads running on the same host.
Oh well.. carry on then. Can't always have two birds thing I suppose :-)