From: Andrei Vagin <[email protected]>
seccomp_unotify allows more privileged processes to perform actions on
behalf of less privileged processes.
In many cases, the workflow is fully synchronous. It means a target
process triggers a system call and passes control to a supervisor
process that handles the system call and returns control back to the
target process. In this context, "synchronous" means that only one
process is running while the other one is waiting.
The new WF_CURRENT_CPU flag advises the scheduler to move the wakee to
the current CPU. For such synchronous workflows, it makes context
switches a few times faster.
Right now, each interaction takes 12µs. With this patch, it takes about
3µs.
v2: clean up the first patch and add the test.
v3: update commit messages and a few fixes suggested by Kees Cook.
Cc: Andy Lutomirski <[email protected]>
Cc: Christian Brauner <[email protected]>
Cc: Dietmar Eggemann <[email protected]>
Cc: Kees Cook <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: Juri Lelli <[email protected]>
Cc: Peter Oskolkov <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Tycho Andersen <[email protected]>
Cc: Will Drewry <[email protected]>
Cc: Vincent Guittot <[email protected]>
Andrei Vagin (4):
seccomp: don't use semaphore and wait_queue together
sched: add a few helpers to wake up tasks on the current cpu
seccomp: add the synchronous mode for seccomp_unotify
selftest/seccomp: add a new test for the sync mode of
seccomp_user_notify
Peter Oskolkov (1):
sched: add WF_CURRENT_CPU and externise ttwu
include/linux/completion.h | 1 +
include/linux/swait.h | 1 +
include/linux/wait.h | 3 +
include/uapi/linux/seccomp.h | 4 +
kernel/sched/completion.c | 12 +++
kernel/sched/core.c | 5 +-
kernel/sched/fair.c | 4 +
kernel/sched/sched.h | 13 +--
kernel/sched/swait.c | 11 +++
kernel/sched/wait.c | 5 ++
kernel/seccomp.c | 72 +++++++++++++--
tools/testing/selftests/seccomp/seccomp_bpf.c | 88 +++++++++++++++++++
12 files changed, 204 insertions(+), 15 deletions(-)
--
2.37.2
From: Andrei Vagin <[email protected]>
seccomp_unotify allows more privileged processes to perform actions on
behalf of less privileged processes.
In many cases, the workflow is fully synchronous. It means a target
process triggers a system call and passes control to a supervisor
process that handles the system call and returns control to the target
process. In this context, "synchronous" means that only one process is
running while the other one is waiting.
There is the WF_CURRENT_CPU flag that is used to advise the scheduler to
move the wakee to the current CPU. For such synchronous workflows, it
makes context switches a few times faster.
Right now, each interaction takes 12µs. With this patch, it takes about
3µs.
This change introduces the SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP flag, which
is used to enable the sync mode.
Signed-off-by: Andrei Vagin <[email protected]>
---
include/uapi/linux/seccomp.h | 4 ++++
kernel/seccomp.c | 31 +++++++++++++++++++++++++++++--
2 files changed, 33 insertions(+), 2 deletions(-)
diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h
index 0fdc6ef02b94..dbfc9b37fcae 100644
--- a/include/uapi/linux/seccomp.h
+++ b/include/uapi/linux/seccomp.h
@@ -115,6 +115,8 @@ struct seccomp_notif_resp {
__u32 flags;
};
+#define SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP (1UL << 0)
+
/* valid flags for seccomp_notif_addfd */
#define SECCOMP_ADDFD_FLAG_SETFD (1UL << 0) /* Specify remote fd */
#define SECCOMP_ADDFD_FLAG_SEND (1UL << 1) /* Addfd and return it, atomically */
@@ -150,4 +152,6 @@ struct seccomp_notif_addfd {
#define SECCOMP_IOCTL_NOTIF_ADDFD SECCOMP_IOW(3, \
struct seccomp_notif_addfd)
+#define SECCOMP_IOCTL_NOTIF_SET_FLAGS SECCOMP_IOW(4, __u64)
+
#endif /* _UAPI_LINUX_SECCOMP_H */
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 876022e9c88c..0a62d44f4898 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -143,9 +143,12 @@ struct seccomp_kaddfd {
* filter->notify_lock.
* @next_id: The id of the next request.
* @notifications: A list of struct seccomp_knotif elements.
+ * @flags: A set of SECCOMP_USER_NOTIF_FD_* flags.
*/
+
struct notification {
atomic_t requests;
+ u32 flags;
u64 next_id;
struct list_head notifications;
};
@@ -1117,7 +1120,10 @@ static int seccomp_do_user_notification(int this_syscall,
INIT_LIST_HEAD(&n.addfd);
atomic_add(1, &match->notif->requests);
- wake_up_poll(&match->wqh, EPOLLIN | EPOLLRDNORM);
+ if (match->notif->flags & SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP)
+ wake_up_poll_on_current_cpu(&match->wqh, EPOLLIN | EPOLLRDNORM);
+ else
+ wake_up_poll(&match->wqh, EPOLLIN | EPOLLRDNORM);
/*
* This is where we wait for a reply from userspace.
@@ -1593,7 +1599,10 @@ static long seccomp_notify_send(struct seccomp_filter *filter,
knotif->error = resp.error;
knotif->val = resp.val;
knotif->flags = resp.flags;
- complete(&knotif->ready);
+ if (filter->notif->flags & SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP)
+ complete_on_current_cpu(&knotif->ready);
+ else
+ complete(&knotif->ready);
out:
mutex_unlock(&filter->notify_lock);
return ret;
@@ -1623,6 +1632,22 @@ static long seccomp_notify_id_valid(struct seccomp_filter *filter,
return ret;
}
+static long seccomp_notify_set_flags(struct seccomp_filter *filter,
+ unsigned long flags)
+{
+ long ret;
+
+ if (flags & ~SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP)
+ return -EINVAL;
+
+ ret = mutex_lock_interruptible(&filter->notify_lock);
+ if (ret < 0)
+ return ret;
+ filter->notif->flags = flags;
+ mutex_unlock(&filter->notify_lock);
+ return 0;
+}
+
static long seccomp_notify_addfd(struct seccomp_filter *filter,
struct seccomp_notif_addfd __user *uaddfd,
unsigned int size)
@@ -1752,6 +1777,8 @@ static long seccomp_notify_ioctl(struct file *file, unsigned int cmd,
case SECCOMP_IOCTL_NOTIF_ID_VALID_WRONG_DIR:
case SECCOMP_IOCTL_NOTIF_ID_VALID:
return seccomp_notify_id_valid(filter, buf);
+ case SECCOMP_IOCTL_NOTIF_SET_FLAGS:
+ return seccomp_notify_set_flags(filter, arg);
}
/* Extensible Argument ioctls */
--
2.38.1.493.g58b659f92b-goog
From: Andrei Vagin <[email protected]>
Test output:
RUN global.user_notification_sync ...
seccomp_bpf.c:4279:user_notification_sync:basic: 8655 nsec/syscall
seccomp_bpf.c:4279:user_notification_sync:sync: 2919 nsec/syscall
OK global.user_notification_sync
Signed-off-by: Andrei Vagin <[email protected]>
---
tools/testing/selftests/seccomp/seccomp_bpf.c | 88 +++++++++++++++++++
1 file changed, 88 insertions(+)
diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c
index 4ae6c8991307..605c120ba2c2 100644
--- a/tools/testing/selftests/seccomp/seccomp_bpf.c
+++ b/tools/testing/selftests/seccomp/seccomp_bpf.c
@@ -4241,6 +4241,94 @@ TEST(user_notification_addfd_rlimit)
close(memfd);
}
+/* USER_NOTIF_BENCH_TIMEOUT is 100 milliseconds. */
+#define USER_NOTIF_BENCH_TIMEOUT 100000000ULL
+#define NSECS_PER_SEC 1000000000ULL
+
+#ifndef SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP
+#define SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP (1UL << 0)
+#define SECCOMP_IOCTL_NOTIF_SET_FLAGS SECCOMP_IOW(4, __u64)
+#endif
+
+static uint64_t user_notification_sync_loop(struct __test_metadata *_metadata,
+ char *test_name, int listener)
+{
+ struct timespec ts;
+ uint64_t start, end, nr;
+ struct seccomp_notif req = {};
+ struct seccomp_notif_resp resp = {};
+
+ clock_gettime(CLOCK_MONOTONIC, &ts);
+ start = ts.tv_nsec + ts.tv_sec * NSECS_PER_SEC;
+ for (end = start, nr = 0; end - start < USER_NOTIF_BENCH_TIMEOUT; nr++) {
+ memset(&req, 0, sizeof(req));
+ req.pid = 0;
+ ASSERT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
+
+ ASSERT_EQ(req.data.nr, __NR_getppid);
+
+ resp.id = req.id;
+ resp.error = 0;
+ resp.val = USER_NOTIF_MAGIC;
+ resp.flags = 0;
+ ASSERT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);
+
+ clock_gettime(CLOCK_MONOTONIC, &ts);
+ end = ts.tv_nsec + ts.tv_sec * NSECS_PER_SEC;
+ }
+ TH_LOG("%s:\t%lld nsec/syscall", test_name, USER_NOTIF_BENCH_TIMEOUT / nr);
+ return nr;
+}
+
+TEST(user_notification_sync)
+{
+ pid_t pid;
+ long ret;
+ int status, listener;
+ unsigned long calls, sync_calls;
+
+ ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
+ ASSERT_EQ(0, ret) {
+ TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
+ }
+
+ listener = user_notif_syscall(__NR_getppid,
+ SECCOMP_FILTER_FLAG_NEW_LISTENER);
+ ASSERT_GE(listener, 0);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ while (1) {
+ ret = syscall(__NR_getppid);
+ if (ret == USER_NOTIF_MAGIC)
+ continue;
+ break;
+ }
+ _exit(1);
+ }
+
+ calls = user_notification_sync_loop(_metadata, "basic", listener);
+
+ /* Try to set invalid flags. */
+ EXPECT_SYSCALL_RETURN(-EINVAL,
+ ioctl(listener, SECCOMP_IOCTL_NOTIF_SET_FLAGS, 0xffffffff, 0));
+
+ ASSERT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SET_FLAGS,
+ SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP, 0), 0);
+
+ sync_calls = user_notification_sync_loop(_metadata, "sync", listener);
+
+ EXPECT_GT(sync_calls, calls);
+
+ kill(pid, SIGKILL);
+ ASSERT_EQ(waitpid(pid, &status, 0), pid);
+ ASSERT_EQ(true, WIFSIGNALED(status));
+ ASSERT_EQ(SIGKILL, WTERMSIG(status));
+}
+
+
/* Make sure PTRACE_O_SUSPEND_SECCOMP requires CAP_SYS_ADMIN. */
FIXTURE(O_SUSPEND_SECCOMP) {
pid_t pid;
--
2.38.1.493.g58b659f92b-goog
On Thu, Nov 10, 2022 at 11:31:49PM -0800, Andrei Vagin wrote:
> From: Andrei Vagin <[email protected]>
>
> seccomp_unotify allows more privileged processes do actions on behalf
> of less privileged processes.
>
> In many cases, the workflow is fully synchronous. It means a target
> process triggers a system call and passes controls to a supervisor
> process that handles the system call and returns controls back to the
> target process. In this context, "synchronous" means that only one
> process is running and another one is waiting.
>
> The new WF_CURRENT_CPU flag advises the scheduler to move the wakee to
> the current CPU. For such synchronous workflows, it makes context
> switches a few times faster.
>
> Right now, each interaction takes 12µs. With this patch, it takes about
> 3µs.
>
> v2: clean up the first patch and add the test.
> v3: update commit messages and a few fixes suggested by Kees Cook.
Thanks for the update! If I can get Acks from the sched folks, I think
this looks good to take.
-Kees
>
> Cc: Andy Lutomirski <[email protected]>
> Cc: Christian Brauner <[email protected]>
> Cc: Dietmar Eggemann <[email protected]>
> Cc: Kees Cook <[email protected]>
> Cc: Ingo Molnar <[email protected]>
> Cc: Juri Lelli <[email protected]>
> Cc: Peter Oskolkov <[email protected]>
> Cc: Peter Zijlstra <[email protected]>
> Cc: Tycho Andersen <[email protected]>
> Cc: Will Drewry <[email protected]>
> Cc: Vincent Guittot <[email protected]>
>
> Andrei Vagin (4):
> seccomp: don't use semaphore and wait_queue together
> sched: add a few helpers to wake up tasks on the current cpu
> seccomp: add the synchronous mode for seccomp_unotify
> selftest/seccomp: add a new test for the sync mode of
> seccomp_user_notify
>
> Peter Oskolkov (1):
> sched: add WF_CURRENT_CPU and externise ttwu
>
> include/linux/completion.h | 1 +
> include/linux/swait.h | 1 +
> include/linux/wait.h | 3 +
> include/uapi/linux/seccomp.h | 4 +
> kernel/sched/completion.c | 12 +++
> kernel/sched/core.c | 5 +-
> kernel/sched/fair.c | 4 +
> kernel/sched/sched.h | 13 +--
> kernel/sched/swait.c | 11 +++
> kernel/sched/wait.c | 5 ++
> kernel/seccomp.c | 72 +++++++++++++--
> tools/testing/selftests/seccomp/seccomp_bpf.c | 88 +++++++++++++++++++
> 12 files changed, 204 insertions(+), 15 deletions(-)
>
> --
> 2.37.2
>
--
Kees Cook
On Fri, Nov 18, 2022 at 2:38 PM Kees Cook <[email protected]> wrote:
>
> On Thu, Nov 10, 2022 at 11:31:49PM -0800, Andrei Vagin wrote:
> > From: Andrei Vagin <[email protected]>
> >
> > seccomp_unotify allows more privileged processes do actions on behalf
> > of less privileged processes.
> >
> > In many cases, the workflow is fully synchronous. It means a target
> > process triggers a system call and passes controls to a supervisor
> > process that handles the system call and returns controls back to the
> > target process. In this context, "synchronous" means that only one
> > process is running and another one is waiting.
> >
> > The new WF_CURRENT_CPU flag advises the scheduler to move the wakee to
> > the current CPU. For such synchronous workflows, it makes context
> > switches a few times faster.
> >
> > Right now, each interaction takes 12µs. With this patch, it takes about
> > 3µs.
> >
> > v2: clean up the first patch and add the test.
> > v3: update commit messages and a few fixes suggested by Kees Cook.
>
> Thanks for the update! If I can get Acks from the sched folks, I think
> this looks good to take.
Peter, Ingo, could you take a look at this series?
Thanks,
Andrei
On Mon, Nov 21, 2022 at 11:52 PM Andrei Vagin <[email protected]> wrote:
>
> On Fri, Nov 18, 2022 at 2:38 PM Kees Cook <[email protected]> wrote:
> >
> > On Thu, Nov 10, 2022 at 11:31:49PM -0800, Andrei Vagin wrote:
> > > From: Andrei Vagin <[email protected]>
> > >
> > > seccomp_unotify allows more privileged processes do actions on behalf
> > > of less privileged processes.
> > >
> > > In many cases, the workflow is fully synchronous. It means a target
> > > process triggers a system call and passes controls to a supervisor
> > > process that handles the system call and returns controls back to the
> > > target process. In this context, "synchronous" means that only one
> > > process is running and another one is waiting.
> > >
> > > The new WF_CURRENT_CPU flag advises the scheduler to move the wakee to
> > > the current CPU. For such synchronous workflows, it makes context
> > > switches a few times faster.
> > >
> > > Right now, each interaction takes 12µs. With this patch, it takes about
> > > 3µs.
> > >
> > > v2: clean up the first patch and add the test.
> > > v3: update commit messages and a few fixes suggested by Kees Cook.
> >
> > Thanks for the update! If I can get Acks from the sched folks, I think
> > this looks good to take.
>
> Peter, Ingo, could you take a look at this series?
Friendly ping
>
> Thanks,
> Andrei