This introduces a per-filter flag (SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV)
that makes it so that when notifications are received by the supervisor
the notifying process will transition to wait killable semantics. Although
wait killable isn't a set of semantics formally exposed to userspace,
the concept is searchable. If the notifying process is signaled prior
to the notification being received by the userspace agent, it will
be handled as normal.
One quirk about how this is handled is that the notifying process
only switches to TASK_KILLABLE if it receives a wakeup from either
an addfd or a signal. This is to avoid an unnecessary wakeup of
the notifying task.
Signed-off-by: Sargun Dhillon <[email protected]>
---
.../userspace-api/seccomp_filter.rst | 8 ++++
include/linux/seccomp.h | 3 +-
include/uapi/linux/seccomp.h | 2 +
kernel/seccomp.c | 42 ++++++++++++++++++-
4 files changed, 52 insertions(+), 3 deletions(-)
diff --git a/Documentation/userspace-api/seccomp_filter.rst b/Documentation/userspace-api/seccomp_filter.rst
index 539e9d4a4860..204cf5ba511a 100644
--- a/Documentation/userspace-api/seccomp_filter.rst
+++ b/Documentation/userspace-api/seccomp_filter.rst
@@ -271,6 +271,14 @@ notifying process it will be replaced. The supervisor can also add an FD, and
respond atomically by using the ``SECCOMP_ADDFD_FLAG_SEND`` flag and the return
value will be the injected file descriptor number.
+The notifying process can be preempted, resulting in the notification being
+aborted. This can be problematic when trying to take actions on behalf of the
+notifying process that are long-running and typically retryable (mounting a
+filesystem). Alternatively, at filter installation time, the
+``SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV`` flag can be set. This flag makes it
+such that when a user notification is received by the supervisor, the notifying
+process will ignore non-fatal signals until the response is sent.
+
It is worth noting that ``struct seccomp_data`` contains the values of register
arguments to the syscall, but does not contain pointers to memory. The task's
memory is accessible to suitably privileged traces via ``ptrace()`` or
diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
index 0c564e5d40ff..d31d76be4982 100644
--- a/include/linux/seccomp.h
+++ b/include/linux/seccomp.h
@@ -8,7 +8,8 @@
SECCOMP_FILTER_FLAG_LOG | \
SECCOMP_FILTER_FLAG_SPEC_ALLOW | \
SECCOMP_FILTER_FLAG_NEW_LISTENER | \
- SECCOMP_FILTER_FLAG_TSYNC_ESRCH)
+ SECCOMP_FILTER_FLAG_TSYNC_ESRCH | \
+ SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV)
/* sizeof() the first published struct seccomp_notif_addfd */
#define SECCOMP_NOTIFY_ADDFD_SIZE_VER0 24
diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h
index 78074254ab98..0fdc6ef02b94 100644
--- a/include/uapi/linux/seccomp.h
+++ b/include/uapi/linux/seccomp.h
@@ -23,6 +23,8 @@
#define SECCOMP_FILTER_FLAG_SPEC_ALLOW (1UL << 2)
#define SECCOMP_FILTER_FLAG_NEW_LISTENER (1UL << 3)
#define SECCOMP_FILTER_FLAG_TSYNC_ESRCH (1UL << 4)
+/* Received notifications wait in killable state (only respond to fatal signals) */
+#define SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV (1UL << 5)
/*
* All BPF programs must return a 32-bit value.
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index db10e73d06e0..9291b0843cb2 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -201,6 +201,8 @@ static inline void seccomp_cache_prepare(struct seccomp_filter *sfilter)
* the filter can be freed.
* @cache: cache of arch/syscall mappings to actions
* @log: true if all actions except for SECCOMP_RET_ALLOW should be logged
+ * @wait_killable_recv: Put notifying process in killable state once the
+ * notification is received by the userspace listener.
* @prev: points to a previously installed, or inherited, filter
* @prog: the BPF program to evaluate
* @notif: the struct that holds all notification related information
@@ -221,6 +223,7 @@ struct seccomp_filter {
refcount_t refs;
refcount_t users;
bool log;
+ bool wait_killable_recv;
struct action_cache cache;
struct seccomp_filter *prev;
struct bpf_prog *prog;
@@ -894,6 +897,10 @@ static long seccomp_attach_filter(unsigned int flags,
if (flags & SECCOMP_FILTER_FLAG_LOG)
filter->log = true;
+ /* Set wait killable flag, if present. */
+ if (flags & SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV)
+ filter->wait_killable_recv = true;
+
/*
* If there is an existing filter, make it the prev and don't drop its
* task reference.
@@ -1081,6 +1088,12 @@ static void seccomp_handle_addfd(struct seccomp_kaddfd *addfd, struct seccomp_kn
complete(&addfd->completion);
}
+static bool should_sleep_killable(struct seccomp_filter *match,
+ struct seccomp_knotif *n)
+{
+ return match->wait_killable_recv && n->state == SECCOMP_NOTIFY_SENT;
+}
+
static int seccomp_do_user_notification(int this_syscall,
struct seccomp_filter *match,
const struct seccomp_data *sd)
@@ -1111,11 +1124,25 @@ static int seccomp_do_user_notification(int this_syscall,
* This is where we wait for a reply from userspace.
*/
do {
+ bool wait_killable = should_sleep_killable(match, &n);
+
mutex_unlock(&match->notify_lock);
- err = wait_for_completion_interruptible(&n.ready);
+ if (wait_killable)
+ err = wait_for_completion_killable(&n.ready);
+ else
+ err = wait_for_completion_interruptible(&n.ready);
mutex_lock(&match->notify_lock);
- if (err != 0)
+
+ if (err != 0) {
+ /*
+ * Check to see if the notification got picked up and
+ * whether we should switch to wait killable.
+ */
+ if (!wait_killable && should_sleep_killable(match, &n))
+ continue;
+
goto interrupted;
+ }
addfd = list_first_entry_or_null(&n.addfd,
struct seccomp_kaddfd, list);
@@ -1485,6 +1512,9 @@ static long seccomp_notify_recv(struct seccomp_filter *filter,
mutex_lock(&filter->notify_lock);
knotif = find_notification(filter, unotif.id);
if (knotif) {
+ /* Reset the process to make sure it's not stuck */
+ if (should_sleep_killable(filter, knotif))
+ complete(&knotif->ready);
knotif->state = SECCOMP_NOTIFY_INIT;
up(&filter->notif->request);
}
@@ -1830,6 +1860,14 @@ static long seccomp_set_mode_filter(unsigned int flags,
((flags & SECCOMP_FILTER_FLAG_TSYNC_ESRCH) == 0))
return -EINVAL;
+ /*
+ * The SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV flag doesn't make sense
+ * without the SECCOMP_FILTER_FLAG_NEW_LISTENER flag.
+ */
+ if ((flags & SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV) &&
+ ((flags & SECCOMP_FILTER_FLAG_NEW_LISTENER) == 0))
+ return -EINVAL;
+
/* Prepare the new filter before holding any locks. */
prepared = seccomp_prepare_user_filter(filter);
if (IS_ERR(prepared))
--
2.25.1
On Thu, Apr 28, 2022 at 07:31:12PM -0700, Sargun Dhillon wrote:
> This introduces a per-filter flag (SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV)
> that makes it so that when notifications are received by the supervisor
> the notifying process will transition to wait killable semantics. Although
> wait killable isn't a set of semantics formally exposed to userspace,
> the concept is searchable. If the notifying process is signaled prior
> to the notification being received by the userspace agent, it will
> be handled as normal.
>
> One quirk about how this is handled is that the notifying process
> only switches to TASK_KILLABLE if it receives a wakeup from either
> an addfd or a signal. This is to avoid an unnecessary wakeup of
> the notifying task.
>
> Signed-off-by: Sargun Dhillon <[email protected]>
> ---
> .../userspace-api/seccomp_filter.rst | 8 ++++
> include/linux/seccomp.h | 3 +-
> include/uapi/linux/seccomp.h | 2 +
> kernel/seccomp.c | 42 ++++++++++++++++++-
> 4 files changed, 52 insertions(+), 3 deletions(-)
>
> diff --git a/Documentation/userspace-api/seccomp_filter.rst b/Documentation/userspace-api/seccomp_filter.rst
> index 539e9d4a4860..204cf5ba511a 100644
> --- a/Documentation/userspace-api/seccomp_filter.rst
> +++ b/Documentation/userspace-api/seccomp_filter.rst
> @@ -271,6 +271,14 @@ notifying process it will be replaced. The supervisor can also add an FD, and
> respond atomically by using the ``SECCOMP_ADDFD_FLAG_SEND`` flag and the return
> value will be the injected file descriptor number.
>
> +The notifying process can be preempted, resulting in the notification being
> +aborted. This can be problematic when trying to take actions on behalf of the
> +notifying process that are long-running and typically retryable (mounting a
> +filesytem). Alternatively, the at filter installation time, the
typo: "the at" -> "at"
> +``SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV`` flag can be set. This flag makes it
> +such that when a user notification is received by the supervisor, the notifying
> +process will ignore non-fatal signals until the response is sent.
> +
> It is worth noting that ``struct seccomp_data`` contains the values of register
> arguments to the syscall, but does not contain pointers to memory. The task's
> memory is accessible to suitably privileged traces via ``ptrace()`` or
> diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
> index 0c564e5d40ff..d31d76be4982 100644
> --- a/include/linux/seccomp.h
> +++ b/include/linux/seccomp.h
> @@ -8,7 +8,8 @@
> SECCOMP_FILTER_FLAG_LOG | \
> SECCOMP_FILTER_FLAG_SPEC_ALLOW | \
> SECCOMP_FILTER_FLAG_NEW_LISTENER | \
> - SECCOMP_FILTER_FLAG_TSYNC_ESRCH)
> + SECCOMP_FILTER_FLAG_TSYNC_ESRCH | \
> + SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV)
>
> /* sizeof() the first published struct seccomp_notif_addfd */
> #define SECCOMP_NOTIFY_ADDFD_SIZE_VER0 24
> diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h
> index 78074254ab98..0fdc6ef02b94 100644
> --- a/include/uapi/linux/seccomp.h
> +++ b/include/uapi/linux/seccomp.h
> @@ -23,6 +23,8 @@
> #define SECCOMP_FILTER_FLAG_SPEC_ALLOW (1UL << 2)
> #define SECCOMP_FILTER_FLAG_NEW_LISTENER (1UL << 3)
> #define SECCOMP_FILTER_FLAG_TSYNC_ESRCH (1UL << 4)
> +/* Received notifications wait in killable state (only respond to fatal signals) */
> +#define SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV (1UL << 5)
>
> /*
> * All BPF programs must return a 32-bit value.
> diff --git a/kernel/seccomp.c b/kernel/seccomp.c
> index db10e73d06e0..9291b0843cb2 100644
> --- a/kernel/seccomp.c
> +++ b/kernel/seccomp.c
> @@ -201,6 +201,8 @@ static inline void seccomp_cache_prepare(struct seccomp_filter *sfilter)
> * the filter can be freed.
> * @cache: cache of arch/syscall mappings to actions
> * @log: true if all actions except for SECCOMP_RET_ALLOW should be logged
> + * @wait_killable_recv: Put notifying process in killable state once the
> + * notification is received by the userspace listener.
> * @prev: points to a previously installed, or inherited, filter
> * @prog: the BPF program to evaluate
> * @notif: the struct that holds all notification related information
> @@ -221,6 +223,7 @@ struct seccomp_filter {
> refcount_t refs;
> refcount_t users;
> bool log;
> + bool wait_killable_recv;
> struct action_cache cache;
> struct seccomp_filter *prev;
> struct bpf_prog *prog;
> @@ -894,6 +897,10 @@ static long seccomp_attach_filter(unsigned int flags,
> if (flags & SECCOMP_FILTER_FLAG_LOG)
> filter->log = true;
>
> + /* Set wait killable flag, if present. */
> + if (flags & SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV)
> + filter->wait_killable_recv = true;
> +
> /*
> * If there is an existing filter, make it the prev and don't drop its
> * task reference.
> @@ -1081,6 +1088,12 @@ static void seccomp_handle_addfd(struct seccomp_kaddfd *addfd, struct seccomp_kn
> complete(&addfd->completion);
> }
>
> +static bool should_sleep_killable(struct seccomp_filter *match,
> + struct seccomp_knotif *n)
> +{
> + return match->wait_killable_recv && n->state == SECCOMP_NOTIFY_SENT;
> +}
> +
> static int seccomp_do_user_notification(int this_syscall,
> struct seccomp_filter *match,
> const struct seccomp_data *sd)
> @@ -1111,11 +1124,25 @@ static int seccomp_do_user_notification(int this_syscall,
> * This is where we wait for a reply from userspace.
> */
> do {
> + bool wait_killable = should_sleep_killable(match, &n);
> +
> mutex_unlock(&match->notify_lock);
> - err = wait_for_completion_interruptible(&n.ready);
> + if (wait_killable)
> + err = wait_for_completion_killable(&n.ready);
> + else
> + err = wait_for_completion_interruptible(&n.ready);
> mutex_lock(&match->notify_lock);
> - if (err != 0)
> +
> + if (err != 0) {
> + /*
> + * Check to see if the notifcation got picked up and
> + * whether we should switch to wait killable.
> + */
> + if (!wait_killable && should_sleep_killable(match, &n))
> + continue;
> +
> goto interrupted;
> + }
>
> addfd = list_first_entry_or_null(&n.addfd,
> struct seccomp_kaddfd, list);
> @@ -1485,6 +1512,9 @@ static long seccomp_notify_recv(struct seccomp_filter *filter,
> mutex_lock(&filter->notify_lock);
> knotif = find_notification(filter, unotif.id);
> if (knotif) {
> + /* Reset the process to make sure it's not stuck */
> + if (should_sleep_killable(filter, knotif))
> + complete(&knotif->ready);
> knotif->state = SECCOMP_NOTIFY_INIT;
> up(&filter->notif->request);
> }
> @@ -1830,6 +1860,14 @@ static long seccomp_set_mode_filter(unsigned int flags,
> ((flags & SECCOMP_FILTER_FLAG_TSYNC_ESRCH) == 0))
> return -EINVAL;
>
> + /*
> + * The SECCOMP_FILTER_FLAG_WAIT_KILLABLE_SENT flag doesn't make sense
> + * without the SECCOMP_FILTER_FLAG_NEW_LISTENER flag.
> + */
> + if ((flags & SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV) &&
> + ((flags & SECCOMP_FILTER_FLAG_NEW_LISTENER) == 0))
> + return -EINVAL;
> +
> /* Prepare the new filter before holding any locks. */
> prepared = seccomp_prepare_user_filter(filter);
> if (IS_ERR(prepared))
Otherwise, looks good. Thanks for bringing this back up!
--
Kees Cook
On Fri, Apr 29, 2022 at 05:14:37PM +0000, Sargun Dhillon wrote:
> On Fri, Apr 29, 2022 at 11:42:15AM +0200, Rodrigo Campos wrote:
> > On Fri, Apr 29, 2022 at 4:32 AM Sargun Dhillon <[email protected]> wrote:
> > > the concept is searchable. If the notifying process is signaled prior
> > > to the notification being received by the userspace agent, it will
> > > be handled as normal.
> >
> > Why is that? Why not always handle in the same way (if wait killable
> > is set, wait like that)
> >
>
> The goal is to avoid two things:
> 1. Unncessary work - Often times, we see workloads that implement techniques
> like hedging (Also known as request racing[1]). In fact, RFC3484
> (destination address selection) gets implemented where the DNS library
> will connect to many backend addresses and whichever one comes back first
> "wins".
> 2. Side effects - We don't want a situation where a syscall is in progress
> that is non-trivial to rollback (mount), and from user space's perspective
> this syscall never completed.
>
> Blocking before the syscall even starts is excessive. When we looked at this
> we found that with runtimes like Golang, they can get into a bad situation
> if they have many (1000s) of threads that are in the middle of a syscall
> because all of them need to elide prior to GC. In this case the runtime
> prioritizes the liveness of GC vs. the syscalls.
>
> That being said, there may be some syscalls in a filter that need the suggested
> behaviour. I can imagine introducing a new flag
> (say SECCOMP_FILTER_FLAG_WAIT_KILLABLE) that applies to all states.
> Alternatively, in one implementation, I put the behaviour in the data
> field of the return from the BPF filter.
I'd add something like the above to the commit log, just to have it
around.
--
Kees Cook
On Fri, Apr 29, 2022 at 4:32 AM Sargun Dhillon <[email protected]> wrote:
> diff --git a/Documentation/userspace-api/seccomp_filter.rst b/Documentation/userspace-api/seccomp_filter.rst
> index 539e9d4a4860..204cf5ba511a 100644
> --- a/Documentation/userspace-api/seccomp_filter.rst
> +++ b/Documentation/userspace-api/seccomp_filter.rst
> @@ -271,6 +271,14 @@ notifying process it will be replaced. The supervisor can also add an FD, and
> respond atomically by using the ``SECCOMP_ADDFD_FLAG_SEND`` flag and the return
> value will be the injected file descriptor number.
>
> +The notifying process can be preempted, resulting in the notification being
> +aborted. This can be problematic when trying to take actions on behalf of the
> +notifying process that are long-running and typically retryable (mounting a
> +filesytem). Alternatively, the at filter installation time, the
> +``SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV`` flag can be set. This flag makes it
> +such that when a user notification is received by the supervisor, the notifying
> +process will ignore non-fatal signals until the response is sent.
Maybe:
This flag ignores non-fatal signals that arrive after the supervisor
received the notification
I mean, I want to make it clear that if a signal arrives before the
notification was received by the supervisor, then it will be
interrupted anyways.
> diff --git a/kernel/seccomp.c b/kernel/seccomp.c
> index db10e73d06e0..9291b0843cb2 100644
> --- a/kernel/seccomp.c
> +++ b/kernel/seccomp.c
> @@ -1485,6 +1512,9 @@ static long seccomp_notify_recv(struct seccomp_filter *filter,
> mutex_lock(&filter->notify_lock);
> knotif = find_notification(filter, unotif.id);
> if (knotif) {
> + /* Reset the process to make sure it's not stuck */
> + if (should_sleep_killable(filter, knotif))
> + complete(&knotif->ready);
> knotif->state = SECCOMP_NOTIFY_INIT;
> up(&filter->notif->request);
(I couldn't git-am this locally, so maybe I'm injecting this at the
wrong parts mentally when looking at the other code for more context.
Sorry if that is the case :))
Why do we need to complete() only in this error path? As far as I can
see this is on the error path where the copy to userspace failed and
we want to reset this notification.
I think that is wrong, we want to wake up the other side not just on
the error path, but on the non-error path (in fact, do we want to do
this on the error path? It seems like a no-op, but don't see any
reason to do it).
We _need_ to call complete() in the non error path here so the other
side wakes up and switches to a killable wait. As we are not doing
this (for the non error path), this will basically not achieve a
wait_killable() at all.
I think this was probably an oversight adapting the patch from last
year. Is it possible? Because it seems that in the previous version we
sent last year[1] (if you can link them next time it will be way
simpler :)) you had a new ioctl() and the call to complete() was
handled there, in seccomp_notify_set_wait_killable(). Now, as this is
part of the filter (and as I said last year, I think this way looks
better) that call to complete() was completely forgotten.
Is it possible that this is not really working as intended, then? Am I
missing something?
Best,
Rodrigo
[1]: https://lore.kernel.org/all/[email protected]/
On Fri, Apr 29, 2022 at 8:20 PM Kees Cook <[email protected]> wrote:
> On Fri, Apr 29, 2022 at 05:14:37PM +0000, Sargun Dhillon wrote:
> > On Fri, Apr 29, 2022 at 11:42:15AM +0200, Rodrigo Campos wrote:
> > > On Fri, Apr 29, 2022 at 4:32 AM Sargun Dhillon <[email protected]> wrote:
> > > > the concept is searchable. If the notifying process is signaled prior
> > > > to the notification being received by the userspace agent, it will
> > > > be handled as normal.
> > >
> > > Why is that? Why not always handle in the same way (if wait killable
> > > is set, wait like that)
> > >
> >
> > The goal is to avoid two things:
> > 1. Unncessary work - Often times, we see workloads that implement techniques
> > like hedging (Also known as request racing[1]). In fact, RFC3484
> > (destination address selection) gets implemented where the DNS library
> > will connect to many backend addresses and whichever one comes back first
> > "wins".
> > 2. Side effects - We don't want a situation where a syscall is in progress
> > that is non-trivial to rollback (mount), and from user space's perspective
> > this syscall never completed.
> >
> > Blocking before the syscall even starts is excessive. When we looked at this
> > we found that with runtimes like Golang, they can get into a bad situation
> > if they have many (1000s) of threads that are in the middle of a syscall
> > because all of them need to elide prior to GC. In this case the runtime
> > prioritizes the liveness of GC vs. the syscalls.
> >
> > That being said, there may be some syscalls in a filter that need the suggested
> > behaviour. I can imagine introducing a new flag
> > (say SECCOMP_FILTER_FLAG_WAIT_KILLABLE) that applies to all states.
> > Alternatively, in one implementation, I put the behaviour in the data
> > field of the return from the BPF filter.
Makes sense, if we need to, we can implement that in the future too.
> I'd add something like the above to the commit log, just to have it
> around.
Yes, please. It was not obvious to me.
On Mon, May 02, 2022 at 04:15:07PM +0200, Rodrigo Campos wrote:
> On Fri, Apr 29, 2022 at 4:32 AM Sargun Dhillon <[email protected]> wrote:
> > diff --git a/Documentation/userspace-api/seccomp_filter.rst b/Documentation/userspace-api/seccomp_filter.rst
> > index 539e9d4a4860..204cf5ba511a 100644
> > --- a/Documentation/userspace-api/seccomp_filter.rst
> > +++ b/Documentation/userspace-api/seccomp_filter.rst
> > @@ -271,6 +271,14 @@ notifying process it will be replaced. The supervisor can also add an FD, and
> > respond atomically by using the ``SECCOMP_ADDFD_FLAG_SEND`` flag and the return
> > value will be the injected file descriptor number.
> >
> > +The notifying process can be preempted, resulting in the notification being
> > +aborted. This can be problematic when trying to take actions on behalf of the
> > +notifying process that are long-running and typically retryable (mounting a
> > +filesytem). Alternatively, the at filter installation time, the
> > +``SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV`` flag can be set. This flag makes it
> > +such that when a user notification is received by the supervisor, the notifying
> > +process will ignore non-fatal signals until the response is sent.
>
> Maybe:
>
> This flags ignores non-fatal signals that arrive after the supervisor
> received the notification
>
> I mean, I want to make it clear that if a signal arrives before the
> notification was received by the supervisor, then it will be
> interrupted anyways.
>
>
Added: Signals that are sent prior to the notification being received by
userspace are handled normally.
> > diff --git a/kernel/seccomp.c b/kernel/seccomp.c
> > index db10e73d06e0..9291b0843cb2 100644
> > --- a/kernel/seccomp.c
> > +++ b/kernel/seccomp.c
> > @@ -1485,6 +1512,9 @@ static long seccomp_notify_recv(struct seccomp_filter *filter,
> > mutex_lock(&filter->notify_lock);
> > knotif = find_notification(filter, unotif.id);
> > if (knotif) {
> > + /* Reset the process to make sure it's not stuck */
> > + if (should_sleep_killable(filter, knotif))
> > + complete(&knotif->ready);
> > knotif->state = SECCOMP_NOTIFY_INIT;
> > up(&filter->notif->request);
>
> (I couldn't git-am this locally, so maybe I'm injecting this at the
> wrong parts mentally when looking at the other code for more context.
> Sorry if that is the case :))
>
> Why do we need to complete() only in this error path? As far as I can
> see this is on the error path where the copy to userspace failed and
> we want to reset this notification.
This error path acts as follows
(Say, S: Supervisor, P: Notifying Process, U: User)
P: 2 <-- Pid
P: getppid() // Generated notification
P: Waiting in wait_interruptible state
S: Calls receive notification, and the codepath gets up to the point
where it's copying the notification to userspace
U: kill -SIGURG 2 // Send a non-fatal signal to the notifying process
P: Waiting in the wait_killable state
S: Kernel fails to copy notification into user memory, and resets
the notification and returns an error
If we do not have the reset, P will never return to wait interruptible.
This is the only code path that a notification can go init -> sent -> init.
>
> I think that is wrong, we want to wake up the other side not just on
> the error path, but on the non-error path (in fact, do we want to do
> this on the error path? It seems like a no-op, but don't see any
> reason to do it).
>
It's unnecessary. Why do it? It just means we wake up a process without reason.
Wakeups are expensive.
> We _need_ to call complete() in the non error path here so the other
> side wakes up and switches to a killable wait. As we are not doing
> this (for the non error path), this will basically not achieve a
> wait_killable() at all.
>
No, because here, we check if we were waiting interruptible, and
then we switch to wait_killable:
/*
* Check to see if the notifcation got picked up and
* whether we should switch to wait killable.
*/
if (!wait_killable && should_sleep_killable(match, &n))
continue;
This could probably be:
if (fatal_signal_pending(current))
break;
if (!wait_killable && should_sleep_killable(match, &n))
continue;
But, that check for fatal_signal_pending seems to be unnecessary (or something we'll get
for free in the next iteration).
> I think this was probably an oversight adapting the patch from last
> year. Is it possble? Because it seems that in the previous version we
> sent last year[1] (if you can link them next time it will be way
> simpler :)) you had a new ioctl() and the call to complete() was
> handled there, in seccomp_notify_set_wait_killable(). Now, as this is
> part of the filter (and as I said last year, I think this way looks
> better) that call to complete() was completely forgotten.
>
> Is it possible that this is not really working as intended, then? Am I
> missing something?
>
>
> Best,
> Rodrigo
>
>
> [1]: https://lore.kernel.org/all/[email protected]/
On Mon, May 2, 2022 at 6:04 PM Sargun Dhillon <[email protected]> wrote:
>
> On Mon, May 02, 2022 at 04:15:07PM +0200, Rodrigo Campos wrote:
> > (I couldn't git-am this locally, so maybe I'm injecting this at the
> > wrong parts mentally when looking at the other code for more context.
> > Sorry if that is the case :))
> >
> > Why do we need to complete() only in this error path? As far as I can
> > see this is on the error path where the copy to userspace failed and
> > we want to reset this notification.
> This error path acts as follows
> (Say, S: Supervisor, P: Notifying Process, U: User)
>
> P: 2 <-- Pid
> P: getppid() // Generated notification
> P: Waiting in wait_interruptible state
> S: Calls receive notification, and the codepath gets up to the poin
> where it's copying the notification to userspace
> U: kill -SIGURG 2 // Send a kill signal to the notifying process
> P: Waiting in the wait_killable state
> S: Kernel fails to copy notification into user memory, and resets
> the notification and returns an error
>
> If we do not have the reset, the P will never return to wait interruptible.
Ohhh, because we want the wait to be interruptible again! Right, I
forgot we should reset to that state again, until the notification is
indeed handled.
What if we say something along those lines in the comment, then? Like:
// Make the other side go back to wait interruptible, the notification
is not SENT.
Something like that would at least help me in the future :)
> > We _need_ to call complete() in the non error path here so the other
> > side wakes up and switches to a killable wait. As we are not doing
> > this (for the non error path), this will basically not achieve a
> > wait_killable() at all.
> >
> No, because here, we check if we were waiting interruptible, and
> then we switch to wait_killable:
Ohhh, right right right. This is lazily changing to wait killable only
when something already wakes up the process. Sorry, I overlooked that.
Best,
Rodrigo