[re-send with smaller CC list]
This adds the ability for threads to request seccomp filter
synchronization across their thread group (at filter attach time).
For example, for Chrome to make sure graphic driver threads are fully
confined after seccomp filters have been attached.
To support this, locking on seccomp changes is introduced, along with
refactoring of no_new_privs. Races with thread creation/death are handled
via tasklist_lock.
This includes a new syscall (instead of adding a new prctl option),
as suggested by Andy Lutomirski and Michael Kerrisk.
Thanks!
-Kees
v6:
- switch from seccomp-specific lock to thread-group lock to gain atomicity
- implement seccomp syscall across all architectures with seccomp filter
- clean up sparse warnings around locking
v5:
- move includes around (drysdale)
- drop set_nnp return value (luto)
- use smp_load_acquire/store_release (luto)
- merge nnp changes to seccomp always, fewer ifdef (luto)
v4:
- cleaned up locking further, as noticed by David Drysdale
v3:
- added SECCOMP_EXT_ACT_FILTER for new filter install options
v2:
- reworked to avoid clone races
Wires up the new seccomp syscall.
Signed-off-by: Kees Cook <[email protected]>
---
arch/arm/include/uapi/asm/unistd.h | 1 +
arch/arm/kernel/calls.S | 1 +
2 files changed, 2 insertions(+)
diff --git a/arch/arm/include/uapi/asm/unistd.h b/arch/arm/include/uapi/asm/unistd.h
index ba94446c72d9..e21b4a069701 100644
--- a/arch/arm/include/uapi/asm/unistd.h
+++ b/arch/arm/include/uapi/asm/unistd.h
@@ -409,6 +409,7 @@
#define __NR_sched_setattr (__NR_SYSCALL_BASE+380)
#define __NR_sched_getattr (__NR_SYSCALL_BASE+381)
#define __NR_renameat2 (__NR_SYSCALL_BASE+382)
+#define __NR_seccomp (__NR_SYSCALL_BASE+383)
/*
* This may need to be greater than __NR_last_syscall+1 in order to
diff --git a/arch/arm/kernel/calls.S b/arch/arm/kernel/calls.S
index 8f51bdcdacbb..bea85f97f363 100644
--- a/arch/arm/kernel/calls.S
+++ b/arch/arm/kernel/calls.S
@@ -392,6 +392,7 @@
/* 380 */ CALL(sys_sched_setattr)
CALL(sys_sched_getattr)
CALL(sys_renameat2)
+ CALL(sys_seccomp)
#ifndef syscalls_counted
.equ syscalls_padding, ((NR_syscalls + 3) & ~3) - NR_syscalls
#define syscalls_counted
--
1.7.9.5
This adds the new "seccomp" syscall with both an "operation" and "flags"
parameter for future expansion. The third argument is a pointer value,
used with the SECCOMP_SET_MODE_FILTER operation. Currently, flags must
be 0. This is functionally equivalent to prctl(PR_SET_SECCOMP, ...).
Signed-off-by: Kees Cook <[email protected]>
Cc: [email protected]
---
arch/x86/syscalls/syscall_32.tbl | 1 +
arch/x86/syscalls/syscall_64.tbl | 1 +
include/linux/syscalls.h | 2 ++
include/uapi/asm-generic/unistd.h | 4 ++-
include/uapi/linux/seccomp.h | 4 +++
kernel/seccomp.c | 63 ++++++++++++++++++++++++++++++++-----
kernel/sys_ni.c | 3 ++
7 files changed, 69 insertions(+), 9 deletions(-)
diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/syscalls/syscall_32.tbl
index d6b867921612..7527eac24122 100644
--- a/arch/x86/syscalls/syscall_32.tbl
+++ b/arch/x86/syscalls/syscall_32.tbl
@@ -360,3 +360,4 @@
351 i386 sched_setattr sys_sched_setattr
352 i386 sched_getattr sys_sched_getattr
353 i386 renameat2 sys_renameat2
+354 i386 seccomp sys_seccomp
diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl
index ec255a1646d2..16272a6c12b7 100644
--- a/arch/x86/syscalls/syscall_64.tbl
+++ b/arch/x86/syscalls/syscall_64.tbl
@@ -323,6 +323,7 @@
314 common sched_setattr sys_sched_setattr
315 common sched_getattr sys_sched_getattr
316 common renameat2 sys_renameat2
+317 common seccomp sys_seccomp
#
# x32-specific system call numbers start at 512 to avoid cache impact
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index b0881a0ed322..1713977ee26f 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -866,4 +866,6 @@ asmlinkage long sys_process_vm_writev(pid_t pid,
asmlinkage long sys_kcmp(pid_t pid1, pid_t pid2, int type,
unsigned long idx1, unsigned long idx2);
asmlinkage long sys_finit_module(int fd, const char __user *uargs, int flags);
+asmlinkage long sys_seccomp(unsigned int op, unsigned int flags,
+ const char __user *uargs);
#endif
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index 333640608087..65acbf0e2867 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -699,9 +699,11 @@ __SYSCALL(__NR_sched_setattr, sys_sched_setattr)
__SYSCALL(__NR_sched_getattr, sys_sched_getattr)
#define __NR_renameat2 276
__SYSCALL(__NR_renameat2, sys_renameat2)
+#define __NR_seccomp 277
+__SYSCALL(__NR_seccomp, sys_seccomp)
#undef __NR_syscalls
-#define __NR_syscalls 277
+#define __NR_syscalls 278
/*
* All syscalls below here should go away really,
diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h
index ac2dc9f72973..b258878ba754 100644
--- a/include/uapi/linux/seccomp.h
+++ b/include/uapi/linux/seccomp.h
@@ -10,6 +10,10 @@
#define SECCOMP_MODE_STRICT 1 /* uses hard-coded filter. */
#define SECCOMP_MODE_FILTER 2 /* uses user-supplied filter. */
+/* Valid operations for seccomp syscall. */
+#define SECCOMP_SET_MODE_STRICT 0
+#define SECCOMP_SET_MODE_FILTER 1
+
/*
* All BPF programs must return a 32-bit value.
* The bottom 16-bits are for optional return data.
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 39d32c2904fc..c0cafa9e84af 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -19,6 +19,7 @@
#include <linux/sched.h>
#include <linux/seccomp.h>
#include <linux/slab.h>
+#include <linux/syscalls.h>
/* #define SECCOMP_DEBUG 1 */
@@ -301,8 +302,8 @@ free_prog:
*
* Returns filter on success and ERR_PTR otherwise.
*/
-static
-struct seccomp_filter *seccomp_prepare_user_filter(char __user *user_filter)
+static struct seccomp_filter *
+seccomp_prepare_user_filter(const char __user *user_filter)
{
struct sock_fprog fprog;
struct seccomp_filter *filter = ERR_PTR(-EFAULT);
@@ -325,19 +326,25 @@ out:
/**
* seccomp_attach_filter: validate and attach filter
+ * @flags: flags to change filter behavior
* @filter: seccomp filter to add to the current process
*
* Caller must be holding current->sighand->siglock lock.
*
* Returns 0 on success, -ve on error.
*/
-static long seccomp_attach_filter(struct seccomp_filter *filter)
+static long seccomp_attach_filter(unsigned int flags,
+ struct seccomp_filter *filter)
{
unsigned long total_insns;
struct seccomp_filter *walker;
BUG_ON(!spin_is_locked(¤t->sighand->siglock));
+ /* Validate flags. */
+ if (flags != 0)
+ return -EINVAL;
+
/* Validate resulting filter length. */
total_insns = filter->len;
for (walker = current->seccomp.filter; walker; walker = filter->prev)
@@ -541,6 +548,7 @@ out:
#ifdef CONFIG_SECCOMP_FILTER
/**
* seccomp_set_mode_filter: internal function for setting seccomp filter
+ * @flags: flags to change filter behavior
* @filter: struct sock_fprog containing filter
*
* This function may be called repeatedly to install additional filters.
@@ -551,7 +559,8 @@ out:
*
* Returns 0 on success or -EINVAL on failure.
*/
-static long seccomp_set_mode_filter(char __user *filter)
+static long seccomp_set_mode_filter(unsigned int flags,
+ const char __user *filter)
{
const unsigned long seccomp_mode = SECCOMP_MODE_FILTER;
struct seccomp_filter *prepared = NULL;
@@ -569,7 +578,7 @@ static long seccomp_set_mode_filter(char __user *filter)
if (!seccomp_check_mode(current, seccomp_mode))
goto out;
- ret = seccomp_attach_filter(prepared);
+ ret = seccomp_attach_filter(flags, prepared);
if (ret)
goto out;
/* Do not free the successfully attached filter. */
@@ -583,12 +592,35 @@ out_free:
return ret;
}
#else
-static inline long seccomp_set_mode_filter(char __user *filter)
+static inline long seccomp_set_mode_filter(unsigned int flags,
+ const char __user *filter)
{
return -EINVAL;
}
#endif
+/* Common entry point for both prctl and syscall. */
+static long do_seccomp(unsigned int op, unsigned int flags,
+ const char __user *uargs)
+{
+ switch (op) {
+ case SECCOMP_SET_MODE_STRICT:
+ if (flags != 0 || uargs != NULL)
+ return -EINVAL;
+ return seccomp_set_mode_strict();
+ case SECCOMP_SET_MODE_FILTER:
+ return seccomp_set_mode_filter(flags, uargs);
+ default:
+ return -EINVAL;
+ }
+}
+
+SYSCALL_DEFINE3(seccomp, unsigned int, op, unsigned int, flags,
+ const char __user *, uargs)
+{
+ return do_seccomp(op, flags, uargs);
+}
+
/**
* prctl_set_seccomp: configures current->seccomp.mode
* @seccomp_mode: requested mode to use
@@ -598,12 +630,27 @@ static inline long seccomp_set_mode_filter(char __user *filter)
*/
long prctl_set_seccomp(unsigned long seccomp_mode, char __user *filter)
{
+ unsigned int op;
+ char __user *uargs;
+
switch (seccomp_mode) {
case SECCOMP_MODE_STRICT:
- return seccomp_set_mode_strict();
+ op = SECCOMP_SET_MODE_STRICT;
+ /*
+ * Setting strict mode through prctl always ignored filter,
+ * so make sure it is always NULL here to pass the internal
+ * check in do_seccomp().
+ */
+ uargs = NULL;
+ break;
case SECCOMP_MODE_FILTER:
- return seccomp_set_mode_filter(filter);
+ op = SECCOMP_SET_MODE_FILTER;
+ uargs = filter;
+ break;
default:
return -EINVAL;
}
+
+ /* prctl interface doesn't have flags, so they are always zero. */
+ return do_seccomp(op, 0, uargs);
}
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 36441b51b5df..2904a2105914 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -213,3 +213,6 @@ cond_syscall(compat_sys_open_by_handle_at);
/* compare kernel pointers */
cond_syscall(sys_kcmp);
+
+/* operate on Secure Computing state */
+cond_syscall(sys_seccomp);
--
1.7.9.5
Normally, task_struct.seccomp.filter is only ever read or modified by
the task that owns it (current). This property aids in fast access
during system call filtering as read access is lockless.
Updating the pointer from another task, however, opens up race
conditions. To allow cross-thread filter pointer updates, writes to
the seccomp fields are now protected by the sighand spinlock (which
is unique to the thread group). Read access remains lockless because
pointer updates themselves are atomic. However, writes (or cloning)
often entail additional checking (like maximum instruction counts)
which require locking to perform safely.
In the case of cloning threads, the child is invisible to the system
until it enters the task list. To make sure a child can't be cloned from
a thread and left in a prior state, seccomp duplication is additionally
moved under the tasklist_lock. Then parent and child are certain have
the same seccomp state when they exit the lock.
Based on patches by Will Drewry and David Drysdale.
Signed-off-by: Kees Cook <[email protected]>
---
include/linux/seccomp.h | 6 +++---
kernel/fork.c | 40 ++++++++++++++++++++++++++++++++++++----
kernel/seccomp.c | 22 ++++++++++++++++------
3 files changed, 55 insertions(+), 13 deletions(-)
diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
index 4054b0994071..9ff98b4bfe2e 100644
--- a/include/linux/seccomp.h
+++ b/include/linux/seccomp.h
@@ -14,11 +14,11 @@ struct seccomp_filter;
*
* @mode: indicates one of the valid values above for controlled
* system calls available to a process.
- * @filter: The metadata and ruleset for determining what system calls
- * are allowed for a task.
+ * @filter: must always point to a valid seccomp-filter or NULL as it is
+ * accessed without locking during system call entry.
*
* @filter must only be accessed from the context of current as there
- * is no locking.
+ * is no read locking.
*/
struct seccomp {
int mode;
diff --git a/kernel/fork.c b/kernel/fork.c
index d2799d1fc952..6b2a9add1079 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -315,6 +315,15 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
goto free_ti;
tsk->stack = ti;
+#ifdef CONFIG_SECCOMP
+ /*
+ * We must handle setting up seccomp filters once we're under
+ * the tasklist_lock in case orig has changed between now and
+ * then. Until then, filter must be NULL to avoid messing up
+ * the usage counts on the error path calling free_task.
+ */
+ tsk->seccomp.filter = NULL;
+#endif
setup_thread_stack(tsk, orig);
clear_user_return_notifier(tsk);
@@ -1081,6 +1090,23 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
return 0;
}
+static void copy_seccomp(struct task_struct *p)
+{
+#ifdef CONFIG_SECCOMP
+ /*
+ * Must be called with sighand->lock held. Child lock not needed
+ * since it is not yet in tasklist.
+ */
+ BUG_ON(!spin_is_locked(¤t->sighand->siglock));
+
+ get_seccomp_filter(current);
+ p->seccomp = current->seccomp;
+
+ if (p->seccomp.mode != SECCOMP_MODE_DISABLED)
+ set_tsk_thread_flag(p, TIF_SECCOMP);
+#endif
+}
+
SYSCALL_DEFINE1(set_tid_address, int __user *, tidptr)
{
current->clear_child_tid = tidptr;
@@ -1142,6 +1168,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
{
int retval;
struct task_struct *p;
+ unsigned long irqflags;
if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
return ERR_PTR(-EINVAL);
@@ -1196,7 +1223,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
goto fork_out;
ftrace_graph_init_task(p);
- get_seccomp_filter(p);
rt_mutex_init_task(p);
@@ -1434,7 +1460,13 @@ static struct task_struct *copy_process(unsigned long clone_flags,
p->parent_exec_id = current->self_exec_id;
}
- spin_lock(¤t->sighand->siglock);
+ spin_lock_irqsave(¤t->sighand->siglock, irqflags);
+
+ /*
+ * Copy seccomp details explicitly here, in case they were changed
+ * before holding tasklist_lock.
+ */
+ copy_seccomp(p);
/*
* Process group and session signals need to be delivered to just the
@@ -1446,7 +1478,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
*/
recalc_sigpending();
if (signal_pending(current)) {
- spin_unlock(¤t->sighand->siglock);
+ spin_unlock_irqrestore(¤t->sighand->siglock, irqflags);
write_unlock_irq(&tasklist_lock);
retval = -ERESTARTNOINTR;
goto bad_fork_free_pid;
@@ -1486,7 +1518,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
}
total_forks++;
- spin_unlock(¤t->sighand->siglock);
+ spin_unlock_irqrestore(¤t->sighand->siglock, irqflags);
write_unlock_irq(&tasklist_lock);
proc_fork_connector(p);
cgroup_post_fork(p);
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 7a9257ddd69c..33655302b658 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -174,12 +174,12 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)
*/
static u32 seccomp_run_filters(int syscall)
{
- struct seccomp_filter *f;
+ struct seccomp_filter *f = smp_load_acquire(¤t->seccomp.filter);
struct seccomp_data sd;
u32 ret = SECCOMP_RET_ALLOW;
/* Ensure unexpected behavior doesn't result in failing open. */
- if (WARN_ON(current->seccomp.filter == NULL))
+ if (WARN_ON(f == NULL))
return SECCOMP_RET_KILL;
populate_seccomp_data(&sd);
@@ -188,7 +188,7 @@ static u32 seccomp_run_filters(int syscall)
* All filters in the list are evaluated and the lowest BPF return
* value always takes priority (ignoring the DATA).
*/
- for (f = current->seccomp.filter; f; f = f->prev) {
+ for (; f; f = smp_load_acquire(&f->prev)) {
u32 cur_ret = sk_run_filter_int_seccomp(&sd, f->insnsi);
if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION))
ret = cur_ret;
@@ -305,6 +305,8 @@ out:
* seccomp_attach_filter: validate and attach filter
* @filter: seccomp filter to add to the current process
*
+ * Caller must be holding current->sighand->siglock lock.
+ *
* Returns 0 on success, -ve on error.
*/
static long seccomp_attach_filter(struct seccomp_filter *filter)
@@ -312,6 +314,8 @@ static long seccomp_attach_filter(struct seccomp_filter *filter)
unsigned long total_insns;
struct seccomp_filter *walker;
+ BUG_ON(!spin_is_locked(¤t->sighand->siglock));
+
/* Validate resulting filter length. */
total_insns = filter->len;
for (walker = current->seccomp.filter; walker; walker = filter->prev)
@@ -324,7 +328,7 @@ static long seccomp_attach_filter(struct seccomp_filter *filter)
* task reference.
*/
filter->prev = current->seccomp.filter;
- current->seccomp.filter = filter;
+ smp_store_release(¤t->seccomp.filter, filter);
return 0;
}
@@ -332,7 +336,7 @@ static long seccomp_attach_filter(struct seccomp_filter *filter)
/* get_seccomp_filter - increments the reference count of the filter on @tsk */
void get_seccomp_filter(struct task_struct *tsk)
{
- struct seccomp_filter *orig = tsk->seccomp.filter;
+ struct seccomp_filter *orig = smp_load_acquire(&tsk->seccomp.filter);
if (!orig)
return;
/* Reference count is bounded by the number of total processes. */
@@ -346,7 +350,7 @@ void put_seccomp_filter(struct task_struct *tsk)
/* Clean up single-reference branches iteratively. */
while (orig && atomic_dec_and_test(&orig->usage)) {
struct seccomp_filter *freeme = orig;
- orig = orig->prev;
+ orig = smp_load_acquire(&orig->prev);
kfree(freeme);
}
}
@@ -498,6 +502,7 @@ long prctl_get_seccomp(void)
static long seccomp_set_mode(unsigned long seccomp_mode, char __user *filter)
{
struct seccomp_filter *prepared = NULL;
+ unsigned long irqflags;
long ret = -EINVAL;
#ifdef CONFIG_SECCOMP_FILTER
@@ -509,6 +514,9 @@ static long seccomp_set_mode(unsigned long seccomp_mode, char __user *filter)
}
#endif
+ if (unlikely(!lock_task_sighand(current, &irqflags)))
+ goto out_free;
+
if (current->seccomp.mode &&
current->seccomp.mode != seccomp_mode)
goto out;
@@ -536,6 +544,8 @@ static long seccomp_set_mode(unsigned long seccomp_mode, char __user *filter)
current->seccomp.mode = seccomp_mode;
set_thread_flag(TIF_SECCOMP);
out:
+ unlock_task_sighand(current, &irqflags);
+out_free:
kfree(prepared);
return ret;
}
--
1.7.9.5
Applying restrictive seccomp filter programs to large or diverse
codebases often requires handling threads which may be started early in
the process lifetime (e.g., by code that is linked in). While it is
possible to apply permissive programs prior to process start up, it is
difficult to further restrict the kernel ABI to those threads after that
point.
This change adds a new seccomp syscall flag to SECCOMP_SET_MODE_FILTER for
synchronizing thread group seccomp filters at filter installation time.
When calling seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_TSYNC,
filter) an attempt will be made to synchronize all threads in current's
threadgroup to its new seccomp filter program. This is possible iff all
threads are using a filter that is an ancestor to the filter current is
attempting to synchronize to. NULL filters (where the task is running as
SECCOMP_MODE_NONE) are also treated as ancestors allowing threads to be
transitioned into SECCOMP_MODE_FILTER. If prctrl(PR_SET_NO_NEW_PRIVS,
...) has been set on the calling thread, no_new_privs will be set for
all synchronized threads too. On success, 0 is returned. On failure,
the pid of one of the failing threads will be returned and no filters
will have been applied.
The race conditions are against another thread requesting TSYNC, another
thread performing a clone, and another thread changing its filter. The
sighand lock is sufficient for these cases, though the clone case is
assisted by the tasklist_lock so that new threads must have a duplicate
of its parent seccomp state when it appears on the tasklist.
Based on patches by Will Drewry.
Suggested-by: Julien Tinnes <[email protected]>
Signed-off-by: Kees Cook <[email protected]>
---
arch/Kconfig | 1 +
include/uapi/linux/seccomp.h | 4 ++
kernel/seccomp.c | 135 +++++++++++++++++++++++++++++++++++++++++-
3 files changed, 138 insertions(+), 2 deletions(-)
diff --git a/arch/Kconfig b/arch/Kconfig
index 97ff872c7acc..0eae9df35b88 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -321,6 +321,7 @@ config HAVE_ARCH_SECCOMP_FILTER
- secure_computing is called from a ptrace_event()-safe context
- secure_computing return value is checked and a return value of -1
results in the system call being skipped immediately.
+ - seccomp syscall wired up
config SECCOMP_FILTER
def_bool y
diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h
index b258878ba754..3e651f757b48 100644
--- a/include/uapi/linux/seccomp.h
+++ b/include/uapi/linux/seccomp.h
@@ -14,6 +14,10 @@
#define SECCOMP_SET_MODE_STRICT 0
#define SECCOMP_SET_MODE_FILTER 1
+/* Valid flags for SECCOMP_SET_MODE_FILTER */
+#define SECCOMP_FILTER_FLAG_TSYNC 1
+#define SECCOMP_FILTER_FLAG_MASK ~(SECCOMP_FILTER_FLAG_TSYNC)
+
/*
* All BPF programs must return a 32-bit value.
* The bottom 16-bits are for optional return data.
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index c0cafa9e84af..d03d470ca36b 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -26,6 +26,7 @@
#ifdef CONFIG_SECCOMP_FILTER
#include <asm/syscall.h>
#include <linux/filter.h>
+#include <linux/pid.h>
#include <linux/ptrace.h>
#include <linux/security.h>
#include <linux/tracehook.h>
@@ -219,6 +220,107 @@ static inline void seccomp_assign_mode(struct task_struct *task,
}
#ifdef CONFIG_SECCOMP_FILTER
+/* Returns 1 if the candidate is an ancestor. */
+static int is_ancestor(struct seccomp_filter *candidate,
+ struct seccomp_filter *child)
+{
+ /* NULL is the root ancestor. */
+ if (candidate == NULL)
+ return 1;
+ for (; child; child = child->prev)
+ if (child == candidate)
+ return 1;
+ return 0;
+}
+
+/**
+ * seccomp_can_sync_threads: checks if all threads can be synchronized
+ *
+ * Expects both tasklist_lock and current->sighand->siglock to be held.
+ *
+ * Returns 0 on success, -ve on error, or the pid of a thread which was
+ * either not in the correct seccomp mode or it did not have an ancestral
+ * seccomp filter.
+ */
+static pid_t seccomp_can_sync_threads(void)
+{
+ struct task_struct *thread, *caller;
+
+ BUG_ON(write_can_lock(&tasklist_lock));
+ BUG_ON(!spin_is_locked(¤t->sighand->siglock));
+
+ if (current->seccomp.mode != SECCOMP_MODE_FILTER)
+ return -EACCES;
+
+ /* Validate all threads being eligible for synchronization. */
+ thread = caller = current;
+ for_each_thread(caller, thread) {
+ pid_t failed;
+
+ if (thread->seccomp.mode == SECCOMP_MODE_DISABLED ||
+ (thread->seccomp.mode == SECCOMP_MODE_FILTER &&
+ is_ancestor(thread->seccomp.filter,
+ caller->seccomp.filter)))
+ continue;
+
+ /* Return the first thread that cannot be synchronized. */
+ failed = task_pid_vnr(thread);
+ /* If the pid cannot be resolved, then return -ESRCH */
+ if (failed == 0)
+ failed = -ESRCH;
+ return failed;
+ }
+
+ return 0;
+}
+
+/**
+ * seccomp_sync_threads: sets all threads to use current's filter
+ *
+ * Expects both tasklist_lock and current->sighand->siglock to be held,
+ * and for seccomp_can_sync_threads() to have returned success already
+ * without dropping the locks.
+ *
+ */
+static void seccomp_sync_threads(void)
+{
+ struct task_struct *thread, *caller;
+
+ BUG_ON(write_can_lock(&tasklist_lock));
+ BUG_ON(!spin_is_locked(¤t->sighand->siglock));
+
+ /* Synchronize all threads. */
+ thread = caller = current;
+ for_each_thread(caller, thread) {
+ /* Get a task reference for the new leaf node. */
+ get_seccomp_filter(caller);
+ /*
+ * Drop the task reference to the shared ancestor since
+ * current's path will hold a reference. (This also
+ * allows a put before the assignment.)
+ */
+ put_seccomp_filter(thread);
+ thread->seccomp.filter = caller->seccomp.filter;
+ /* Opt the other thread into seccomp if needed.
+ * As threads are considered to be trust-realm
+ * equivalent (see ptrace_may_access), it is safe to
+ * allow one thread to transition the other.
+ */
+ if (thread->seccomp.mode == SECCOMP_MODE_DISABLED) {
+ /*
+ * Don't let an unprivileged task work around
+ * the no_new_privs restriction by creating
+ * a thread that sets it up, enters seccomp,
+ * then dies.
+ */
+ if (task_no_new_privs(caller))
+ task_set_no_new_privs(thread);
+
+ seccomp_assign_mode(thread, SECCOMP_MODE_FILTER);
+ }
+ }
+}
+
/**
* seccomp_prepare_filter: Prepares a seccomp filter for use.
* @fprog: BPF program to install
@@ -342,7 +444,7 @@ static long seccomp_attach_filter(unsigned int flags,
BUG_ON(!spin_is_locked(¤t->sighand->siglock));
/* Validate flags. */
- if (flags != 0)
+ if (flags & SECCOMP_FILTER_FLAG_MASK)
return -EINVAL;
/* Validate resulting filter length. */
@@ -352,6 +454,15 @@ static long seccomp_attach_filter(unsigned int flags,
if (total_insns > MAX_INSNS_PER_PATH)
return -ENOMEM;
+ /* If thread sync has been requested, check that it is possible. */
+ if (flags & SECCOMP_FILTER_FLAG_TSYNC) {
+ int ret;
+
+ ret = seccomp_can_sync_threads();
+ if (ret)
+ return ret;
+ }
+
/*
* If there is an existing filter, make it the prev and don't drop its
* task reference.
@@ -359,6 +470,10 @@ static long seccomp_attach_filter(unsigned int flags,
filter->prev = current->seccomp.filter;
smp_store_release(¤t->seccomp.filter, filter);
+ /* Now that the new filter is in place, synchronize to all threads. */
+ if (flags & SECCOMP_FILTER_FLAG_TSYNC)
+ seccomp_sync_threads();
+
return 0;
}
@@ -564,7 +679,7 @@ static long seccomp_set_mode_filter(unsigned int flags,
{
const unsigned long seccomp_mode = SECCOMP_MODE_FILTER;
struct seccomp_filter *prepared = NULL;
- unsigned long irqflags;
+ unsigned long irqflags, taskflags;
long ret = -EINVAL;
/* Prepare the new filter outside of any locking. */
@@ -572,6 +687,17 @@ static long seccomp_set_mode_filter(unsigned int flags,
if (IS_ERR(prepared))
return PTR_ERR(prepared);
+ /*
+ * If we're doing thread sync, we must hold tasklist_lock
+ * to make sure seccomp filter changes are stable on threads
+ * entering or leaving the task list. And we must take it
+ * before the sighand lock to avoid deadlocking.
+ */
+ if (flags & SECCOMP_FILTER_FLAG_TSYNC)
+ write_lock_irqsave(&tasklist_lock, taskflags);
+ else
+ __acquire(&tasklist_lock); /* keep sparse happy */
+
if (unlikely(!lock_task_sighand(current, &irqflags)))
goto out_free;
@@ -588,6 +714,11 @@ static long seccomp_set_mode_filter(unsigned int flags,
out:
unlock_task_sighand(current, &irqflags);
out_free:
+ if (flags & SECCOMP_FILTER_FLAG_TSYNC)
+ write_unlock_irqrestore(&tasklist_lock, taskflags);
+ else
+ __release(&tasklist_lock); /* keep sparse happy */
+
kfree(prepared);
return ret;
}
--
1.7.9.5
Since seccomp transitions between threads requires updates to the
no_new_privs flag to be atomic, changes must be atomic. This moves the nnp
flag into the seccomp field as a separate unsigned long for atomic access.
Signed-off-by: Kees Cook <[email protected]>
Acked-by: Andy Lutomirski <[email protected]>
---
fs/exec.c | 4 ++--
include/linux/sched.h | 13 ++++++++++---
include/linux/seccomp.h | 8 +++++++-
kernel/seccomp.c | 2 +-
kernel/sys.c | 4 ++--
security/apparmor/domain.c | 4 ++--
6 files changed, 24 insertions(+), 11 deletions(-)
diff --git a/fs/exec.c b/fs/exec.c
index 238b7aa26f68..614fcb993739 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1233,7 +1233,7 @@ static void check_unsafe_exec(struct linux_binprm *bprm)
* This isn't strictly necessary, but it makes it harder for LSMs to
* mess up.
*/
- if (current->no_new_privs)
+ if (task_no_new_privs(current))
bprm->unsafe |= LSM_UNSAFE_NO_NEW_PRIVS;
t = p;
@@ -1271,7 +1271,7 @@ int prepare_binprm(struct linux_binprm *bprm)
bprm->cred->egid = current_egid();
if (!(bprm->file->f_path.mnt->mnt_flags & MNT_NOSUID) &&
- !current->no_new_privs &&
+ !task_no_new_privs(current) &&
kuid_has_mapping(bprm->cred->user_ns, inode->i_uid) &&
kgid_has_mapping(bprm->cred->user_ns, inode->i_gid)) {
/* Set-uid? */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index ea74596014a2..50b41affb7b1 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1307,9 +1307,6 @@ struct task_struct {
* execve */
unsigned in_iowait:1;
- /* task may not gain privileges */
- unsigned no_new_privs:1;
-
/* Revert to default priority/policy when forking */
unsigned sched_reset_on_fork:1;
unsigned sched_contributes_to_load:1;
@@ -2525,6 +2522,16 @@ static inline void task_unlock(struct task_struct *p)
spin_unlock(&p->alloc_lock);
}
+static inline bool task_no_new_privs(struct task_struct *p)
+{
+ return test_bit(SECCOMP_FLAG_NO_NEW_PRIVS, &p->seccomp.flags);
+}
+
+static inline void task_set_no_new_privs(struct task_struct *p)
+{
+ set_bit(SECCOMP_FLAG_NO_NEW_PRIVS, &p->seccomp.flags);
+}
+
extern struct sighand_struct *__lock_task_sighand(struct task_struct *tsk,
unsigned long *flags);
diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
index 9ff98b4bfe2e..6a5e2d0ec912 100644
--- a/include/linux/seccomp.h
+++ b/include/linux/seccomp.h
@@ -3,6 +3,8 @@
#include <uapi/linux/seccomp.h>
+#define SECCOMP_FLAG_NO_NEW_PRIVS 0 /* task may not gain privs */
+
#ifdef CONFIG_SECCOMP
#include <linux/thread_info.h>
@@ -16,6 +18,7 @@ struct seccomp_filter;
* system calls available to a process.
* @filter: must always point to a valid seccomp-filter or NULL as it is
* accessed without locking during system call entry.
+ * @flags: flags under task->sighand->siglock lock
*
* @filter must only be accessed from the context of current as there
* is no read locking.
@@ -23,6 +26,7 @@ struct seccomp_filter;
struct seccomp {
int mode;
struct seccomp_filter *filter;
+ unsigned long flags;
};
extern int __secure_computing(int);
@@ -51,7 +55,9 @@ static inline int seccomp_mode(struct seccomp *s)
#include <linux/errno.h>
-struct seccomp { };
+struct seccomp {
+ unsigned long flags;
+};
struct seccomp_filter { };
static inline int secure_computing(int this_syscall) { return 0; }
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 33655302b658..7ec99b99e400 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -219,7 +219,7 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog)
* This avoids scenarios where unprivileged tasks can affect the
* behavior of privileged children.
*/
- if (!current->no_new_privs &&
+ if (!task_no_new_privs(current) &&
security_capable_noaudit(current_cred(), current_user_ns(),
CAP_SYS_ADMIN) != 0)
return ERR_PTR(-EACCES);
diff --git a/kernel/sys.c b/kernel/sys.c
index 66a751ebf9d9..ce8129192a26 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1990,12 +1990,12 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
if (arg2 != 1 || arg3 || arg4 || arg5)
return -EINVAL;
- current->no_new_privs = 1;
+ task_set_no_new_privs(current);
break;
case PR_GET_NO_NEW_PRIVS:
if (arg2 || arg3 || arg4 || arg5)
return -EINVAL;
- return current->no_new_privs ? 1 : 0;
+ return task_no_new_privs(current) ? 1 : 0;
case PR_GET_THP_DISABLE:
if (arg2 || arg3 || arg4 || arg5)
return -EINVAL;
diff --git a/security/apparmor/domain.c b/security/apparmor/domain.c
index 452567d3a08e..d97cba3e3849 100644
--- a/security/apparmor/domain.c
+++ b/security/apparmor/domain.c
@@ -621,7 +621,7 @@ int aa_change_hat(const char *hats[], int count, u64 token, bool permtest)
* There is no exception for unconfined as change_hat is not
* available.
*/
- if (current->no_new_privs)
+ if (task_no_new_privs(current))
return -EPERM;
/* released below */
@@ -776,7 +776,7 @@ int aa_change_profile(const char *ns_name, const char *hname, bool onexec,
* no_new_privs is set because this aways results in a reduction
* of permissions.
*/
- if (current->no_new_privs && !unconfined(profile)) {
+ if (task_no_new_privs(current) && !unconfined(profile)) {
put_cred(cred);
return -EPERM;
}
--
1.7.9.5
In preparation for having other callers of the seccomp mode setting
logic, split the prctl entry point away from the core logic that performs
seccomp mode setting.
Signed-off-by: Kees Cook <[email protected]>
---
kernel/seccomp.c | 16 ++++++++++++++--
1 file changed, 14 insertions(+), 2 deletions(-)
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index f6d76bebe69f..552b972b8f83 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -465,7 +465,7 @@ long prctl_get_seccomp(void)
}
/**
- * prctl_set_seccomp: configures current->seccomp.mode
+ * seccomp_set_mode: internal function for setting seccomp mode
* @seccomp_mode: requested mode to use
* @filter: optional struct sock_fprog for use with SECCOMP_MODE_FILTER
*
@@ -478,7 +478,7 @@ long prctl_get_seccomp(void)
*
* Returns 0 on success or -EINVAL on failure.
*/
-long prctl_set_seccomp(unsigned long seccomp_mode, char __user *filter)
+static long seccomp_set_mode(unsigned long seccomp_mode, char __user *filter)
{
long ret = -EINVAL;
@@ -509,3 +509,15 @@ long prctl_set_seccomp(unsigned long seccomp_mode, char __user *filter)
out:
return ret;
}
+
+/**
+ * prctl_set_seccomp: configures current->seccomp.mode
+ * @seccomp_mode: requested mode to use
+ * @filter: optional struct sock_fprog for use with SECCOMP_MODE_FILTER
+ *
+ * Returns 0 on success or -EINVAL on failure.
+ */
+long prctl_set_seccomp(unsigned long seccomp_mode, char __user *filter)
+{
+ return seccomp_set_mode(seccomp_mode, filter);
+}
--
1.7.9.5
In preparation for adding seccomp locking, move filter creation away
from where it is checked and applied. This will allow for locking where
no memory allocation is happening. The validation, filter attachment,
and seccomp mode setting can all happen under the future locks.
Signed-off-by: Kees Cook <[email protected]>
---
kernel/seccomp.c | 86 ++++++++++++++++++++++++++++++++++++------------------
1 file changed, 58 insertions(+), 28 deletions(-)
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 552b972b8f83..7a9257ddd69c 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -18,6 +18,7 @@
#include <linux/compat.h>
#include <linux/sched.h>
#include <linux/seccomp.h>
+#include <linux/slab.h>
/* #define SECCOMP_DEBUG 1 */
@@ -26,7 +27,6 @@
#include <linux/filter.h>
#include <linux/ptrace.h>
#include <linux/security.h>
-#include <linux/slab.h>
#include <linux/tracehook.h>
#include <linux/uaccess.h>
@@ -197,27 +197,21 @@ static u32 seccomp_run_filters(int syscall)
}
/**
- * seccomp_attach_filter: Attaches a seccomp filter to current.
+ * seccomp_prepare_filter: Prepares a seccomp filter for use.
* @fprog: BPF program to install
*
- * Returns 0 on success or an errno on failure.
+ * Returns filter on success or an ERR_PTR on failure.
*/
-static long seccomp_attach_filter(struct sock_fprog *fprog)
+static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog)
{
struct seccomp_filter *filter;
unsigned long fp_size = fprog->len * sizeof(struct sock_filter);
- unsigned long total_insns = fprog->len;
struct sock_filter *fp;
int new_len;
long ret;
if (fprog->len == 0 || fprog->len > BPF_MAXINSNS)
- return -EINVAL;
-
- for (filter = current->seccomp.filter; filter; filter = filter->prev)
- total_insns += filter->len + 4; /* include a 4 instr penalty */
- if (total_insns > MAX_INSNS_PER_PATH)
- return -ENOMEM;
+ return ERR_PTR(-EINVAL);
/*
* Installing a seccomp filter requires that the task has
@@ -228,11 +222,11 @@ static long seccomp_attach_filter(struct sock_fprog *fprog)
if (!current->no_new_privs &&
security_capable_noaudit(current_cred(), current_user_ns(),
CAP_SYS_ADMIN) != 0)
- return -EACCES;
+ return ERR_PTR(-EACCES);
fp = kzalloc(fp_size, GFP_KERNEL|__GFP_NOWARN);
if (!fp)
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
/* Copy the instructions from fprog. */
ret = -EFAULT;
@@ -270,31 +264,26 @@ static long seccomp_attach_filter(struct sock_fprog *fprog)
atomic_set(&filter->usage, 1);
filter->len = new_len;
- /*
- * If there is an existing filter, make it the prev and don't drop its
- * task reference.
- */
- filter->prev = current->seccomp.filter;
- current->seccomp.filter = filter;
- return 0;
+ return filter;
free_filter:
kfree(filter);
free_prog:
kfree(fp);
- return ret;
+ return ERR_PTR(ret);
}
/**
- * seccomp_attach_user_filter - attaches a user-supplied sock_fprog
+ * seccomp_prepare_user_filter - prepares a user-supplied sock_fprog
* @user_filter: pointer to the user data containing a sock_fprog.
*
- * Returns 0 on success and non-zero otherwise.
+ * Returns filter on success and ERR_PTR otherwise.
*/
-static long seccomp_attach_user_filter(char __user *user_filter)
+static
+struct seccomp_filter *seccomp_prepare_user_filter(char __user *user_filter)
{
struct sock_fprog fprog;
- long ret = -EFAULT;
+ struct seccomp_filter *filter = ERR_PTR(-EFAULT);
#ifdef CONFIG_COMPAT
if (is_compat_task()) {
@@ -307,9 +296,37 @@ static long seccomp_attach_user_filter(char __user *user_filter)
#endif
if (copy_from_user(&fprog, user_filter, sizeof(fprog)))
goto out;
- ret = seccomp_attach_filter(&fprog);
+ filter = seccomp_prepare_filter(&fprog);
out:
- return ret;
+ return filter;
+}
+
+/**
+ * seccomp_attach_filter: validate and attach filter
+ * @filter: seccomp filter to add to the current process
+ *
+ * Returns 0 on success, -ve on error.
+ */
+static long seccomp_attach_filter(struct seccomp_filter *filter)
+{
+ unsigned long total_insns;
+ struct seccomp_filter *walker;
+
+ /* Validate resulting filter length. */
+ total_insns = filter->len;
+ for (walker = current->seccomp.filter; walker; walker = filter->prev)
+ total_insns += walker->len + 4; /* include a 4 instr penalty */
+ if (total_insns > MAX_INSNS_PER_PATH)
+ return -ENOMEM;
+
+ /*
+ * If there is an existing filter, make it the prev and don't drop its
+ * task reference.
+ */
+ filter->prev = current->seccomp.filter;
+ current->seccomp.filter = filter;
+
+ return 0;
}
/* get_seccomp_filter - increments the reference count of the filter on @tsk */
@@ -480,8 +497,18 @@ long prctl_get_seccomp(void)
*/
static long seccomp_set_mode(unsigned long seccomp_mode, char __user *filter)
{
+ struct seccomp_filter *prepared = NULL;
long ret = -EINVAL;
+#ifdef CONFIG_SECCOMP_FILTER
+ /* Prepare the new filter outside of the seccomp lock. */
+ if (seccomp_mode == SECCOMP_MODE_FILTER) {
+ prepared = seccomp_prepare_user_filter(filter);
+ if (IS_ERR(prepared))
+ return PTR_ERR(prepared);
+ }
+#endif
+
if (current->seccomp.mode &&
current->seccomp.mode != seccomp_mode)
goto out;
@@ -495,9 +522,11 @@ static long seccomp_set_mode(unsigned long seccomp_mode, char __user *filter)
break;
#ifdef CONFIG_SECCOMP_FILTER
case SECCOMP_MODE_FILTER:
- ret = seccomp_attach_user_filter(filter);
+ ret = seccomp_attach_filter(prepared);
if (ret)
goto out;
+ /* Do not free the successfully attached filter. */
+ prepared = NULL;
break;
#endif
default:
@@ -507,6 +536,7 @@ static long seccomp_set_mode(unsigned long seccomp_mode, char __user *filter)
current->seccomp.mode = seccomp_mode;
set_thread_flag(TIF_SECCOMP);
out:
+ kfree(prepared);
return ret;
}
--
1.7.9.5
Extracts the common check/assign logic, and separates the two mode
setting paths to make things more readable with fewer #ifdefs within
function bodies.
Signed-off-by: Kees Cook <[email protected]>
---
kernel/seccomp.c | 124 +++++++++++++++++++++++++++++++++++++-----------------
1 file changed, 85 insertions(+), 39 deletions(-)
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 7ec99b99e400..39d32c2904fc 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -195,7 +195,29 @@ static u32 seccomp_run_filters(int syscall)
}
return ret;
}
+#endif /* CONFIG_SECCOMP_FILTER */
+static inline bool seccomp_check_mode(struct task_struct *task,
+ unsigned long seccomp_mode)
+{
+ BUG_ON(!spin_is_locked(&task->sighand->siglock));
+
+ if (task->seccomp.mode && task->seccomp.mode != seccomp_mode)
+ return false;
+
+ return true;
+}
+
+static inline void seccomp_assign_mode(struct task_struct *task,
+ unsigned long seccomp_mode)
+{
+ BUG_ON(!spin_is_locked(&task->sighand->siglock));
+
+ task->seccomp.mode = seccomp_mode;
+ set_tsk_thread_flag(task, TIF_SECCOMP);
+}
+
+#ifdef CONFIG_SECCOMP_FILTER
/**
* seccomp_prepare_filter: Prepares a seccomp filter for use.
* @fprog: BPF program to install
@@ -486,69 +508,86 @@ long prctl_get_seccomp(void)
}
/**
- * seccomp_set_mode: internal function for setting seccomp mode
- * @seccomp_mode: requested mode to use
- * @filter: optional struct sock_fprog for use with SECCOMP_MODE_FILTER
+ * seccomp_set_mode_strict: internal function for setting strict seccomp
*
- * This function may be called repeatedly with a @seccomp_mode of
- * SECCOMP_MODE_FILTER to install additional filters. Every filter
- * successfully installed will be evaluated (in reverse order) for each system
- * call the task makes.
+ * Once current->seccomp.mode is non-zero, it may not be changed.
+ *
+ * Returns 0 on success or -EINVAL on failure.
+ */
+static long seccomp_set_mode_strict(void)
+{
+ const unsigned long seccomp_mode = SECCOMP_MODE_STRICT;
+ unsigned long irqflags;
+ int ret = -EINVAL;
+
+ if (unlikely(!lock_task_sighand(current, &irqflags)))
+ return -EINVAL;
+
+ if (!seccomp_check_mode(current, seccomp_mode))
+ goto out;
+
+#ifdef TIF_NOTSC
+ disable_TSC();
+#endif
+ seccomp_assign_mode(current, seccomp_mode);
+ ret = 0;
+
+out:
+ unlock_task_sighand(current, &irqflags);
+
+ return ret;
+}
+
+#ifdef CONFIG_SECCOMP_FILTER
+/**
+ * seccomp_set_mode_filter: internal function for setting seccomp filter
+ * @filter: struct sock_fprog containing filter
+ *
+ * This function may be called repeatedly to install additional filters.
+ * Every filter successfully installed will be evaluated (in reverse order)
+ * for each system call the task makes.
*
* Once current->seccomp.mode is non-zero, it may not be changed.
*
* Returns 0 on success or -EINVAL on failure.
*/
-static long seccomp_set_mode(unsigned long seccomp_mode, char __user *filter)
+static long seccomp_set_mode_filter(char __user *filter)
{
+ const unsigned long seccomp_mode = SECCOMP_MODE_FILTER;
struct seccomp_filter *prepared = NULL;
unsigned long irqflags;
long ret = -EINVAL;
-#ifdef CONFIG_SECCOMP_FILTER
- /* Prepare the new filter outside of the seccomp lock. */
- if (seccomp_mode == SECCOMP_MODE_FILTER) {
- prepared = seccomp_prepare_user_filter(filter);
- if (IS_ERR(prepared))
- return PTR_ERR(prepared);
- }
-#endif
+ /* Prepare the new filter outside of any locking. */
+ prepared = seccomp_prepare_user_filter(filter);
+ if (IS_ERR(prepared))
+ return PTR_ERR(prepared);
if (unlikely(!lock_task_sighand(current, &irqflags)))
goto out_free;
- if (current->seccomp.mode &&
- current->seccomp.mode != seccomp_mode)
+ if (!seccomp_check_mode(current, seccomp_mode))
goto out;
- switch (seccomp_mode) {
- case SECCOMP_MODE_STRICT:
- ret = 0;
-#ifdef TIF_NOTSC
- disable_TSC();
-#endif
- break;
-#ifdef CONFIG_SECCOMP_FILTER
- case SECCOMP_MODE_FILTER:
- ret = seccomp_attach_filter(prepared);
- if (ret)
- goto out;
- /* Do not free the successfully attached filter. */
- prepared = NULL;
- break;
-#endif
- default:
+ ret = seccomp_attach_filter(prepared);
+ if (ret)
goto out;
- }
+ /* Do not free the successfully attached filter. */
+ prepared = NULL;
- current->seccomp.mode = seccomp_mode;
- set_thread_flag(TIF_SECCOMP);
+ seccomp_assign_mode(current, seccomp_mode);
out:
unlock_task_sighand(current, &irqflags);
out_free:
kfree(prepared);
return ret;
}
+#else
+static inline long seccomp_set_mode_filter(char __user *filter)
+{
+ return -EINVAL;
+}
+#endif
/**
* prctl_set_seccomp: configures current->seccomp.mode
@@ -559,5 +598,12 @@ out_free:
*/
long prctl_set_seccomp(unsigned long seccomp_mode, char __user *filter)
{
- return seccomp_set_mode(seccomp_mode, filter);
+ switch (seccomp_mode) {
+ case SECCOMP_MODE_STRICT:
+ return seccomp_set_mode_strict();
+ case SECCOMP_MODE_FILTER:
+ return seccomp_set_mode_filter(filter);
+ default:
+ return -EINVAL;
+ }
}
--
1.7.9.5
Wires up the new seccomp syscall.
Signed-off-by: Kees Cook <[email protected]>
---
arch/mips/include/uapi/asm/unistd.h | 15 +++++++++------
arch/mips/kernel/scall32-o32.S | 1 +
arch/mips/kernel/scall64-64.S | 1 +
arch/mips/kernel/scall64-n32.S | 1 +
arch/mips/kernel/scall64-o32.S | 1 +
5 files changed, 13 insertions(+), 6 deletions(-)
diff --git a/arch/mips/include/uapi/asm/unistd.h b/arch/mips/include/uapi/asm/unistd.h
index 5805414777e0..9bc13eaf9d67 100644
--- a/arch/mips/include/uapi/asm/unistd.h
+++ b/arch/mips/include/uapi/asm/unistd.h
@@ -372,16 +372,17 @@
#define __NR_sched_setattr (__NR_Linux + 349)
#define __NR_sched_getattr (__NR_Linux + 350)
#define __NR_renameat2 (__NR_Linux + 351)
+#define __NR_seccomp (__NR_Linux + 352)
/*
* Offset of the last Linux o32 flavoured syscall
*/
-#define __NR_Linux_syscalls 351
+#define __NR_Linux_syscalls 352
#endif /* _MIPS_SIM == _MIPS_SIM_ABI32 */
#define __NR_O32_Linux 4000
-#define __NR_O32_Linux_syscalls 351
+#define __NR_O32_Linux_syscalls 352
#if _MIPS_SIM == _MIPS_SIM_ABI64
@@ -701,16 +702,17 @@
#define __NR_sched_setattr (__NR_Linux + 309)
#define __NR_sched_getattr (__NR_Linux + 310)
#define __NR_renameat2 (__NR_Linux + 311)
+#define __NR_seccomp (__NR_Linux + 312)
/*
* Offset of the last Linux 64-bit flavoured syscall
*/
-#define __NR_Linux_syscalls 311
+#define __NR_Linux_syscalls 312
#endif /* _MIPS_SIM == _MIPS_SIM_ABI64 */
#define __NR_64_Linux 5000
-#define __NR_64_Linux_syscalls 311
+#define __NR_64_Linux_syscalls 312
#if _MIPS_SIM == _MIPS_SIM_NABI32
@@ -1034,15 +1036,16 @@
#define __NR_sched_setattr (__NR_Linux + 313)
#define __NR_sched_getattr (__NR_Linux + 314)
#define __NR_renameat2 (__NR_Linux + 315)
+#define __NR_seccomp (__NR_Linux + 316)
/*
* Offset of the last N32 flavoured syscall
*/
-#define __NR_Linux_syscalls 315
+#define __NR_Linux_syscalls 316
#endif /* _MIPS_SIM == _MIPS_SIM_NABI32 */
#define __NR_N32_Linux 6000
-#define __NR_N32_Linux_syscalls 315
+#define __NR_N32_Linux_syscalls 316
#endif /* _UAPI_ASM_UNISTD_H */
diff --git a/arch/mips/kernel/scall32-o32.S b/arch/mips/kernel/scall32-o32.S
index 3245474f19d5..ab02d14f1b5c 100644
--- a/arch/mips/kernel/scall32-o32.S
+++ b/arch/mips/kernel/scall32-o32.S
@@ -578,3 +578,4 @@ EXPORT(sys_call_table)
PTR sys_sched_setattr
PTR sys_sched_getattr /* 4350 */
PTR sys_renameat2
+ PTR sys_seccomp
diff --git a/arch/mips/kernel/scall64-64.S b/arch/mips/kernel/scall64-64.S
index be2fedd4ae33..010dccf128ec 100644
--- a/arch/mips/kernel/scall64-64.S
+++ b/arch/mips/kernel/scall64-64.S
@@ -431,4 +431,5 @@ EXPORT(sys_call_table)
PTR sys_sched_setattr
PTR sys_sched_getattr /* 5310 */
PTR sys_renameat2
+ PTR sys_seccomp
.size sys_call_table,.-sys_call_table
diff --git a/arch/mips/kernel/scall64-n32.S b/arch/mips/kernel/scall64-n32.S
index c1dbcda4b816..c3b3b6525df5 100644
--- a/arch/mips/kernel/scall64-n32.S
+++ b/arch/mips/kernel/scall64-n32.S
@@ -424,4 +424,5 @@ EXPORT(sysn32_call_table)
PTR sys_sched_setattr
PTR sys_sched_getattr
PTR sys_renameat2 /* 6315 */
+ PTR sys_seccomp
.size sysn32_call_table,.-sysn32_call_table
diff --git a/arch/mips/kernel/scall64-o32.S b/arch/mips/kernel/scall64-o32.S
index f1343ccd7ed7..bb1550b1f501 100644
--- a/arch/mips/kernel/scall64-o32.S
+++ b/arch/mips/kernel/scall64-o32.S
@@ -557,4 +557,5 @@ EXPORT(sys32_call_table)
PTR sys_sched_setattr
PTR sys_sched_getattr /* 4350 */
PTR sys_renameat2
+ PTR sys_seccomp
.size sys32_call_table,.-sys32_call_table
--
1.7.9.5
On Tue, Jun 10, 2014 at 8:25 PM, Kees Cook <[email protected]> wrote:
> This adds the new "seccomp" syscall with both an "operation" and "flags"
> parameter for future expansion. The third argument is a pointer value,
> used with the SECCOMP_SET_MODE_FILTER operation. Currently, flags must
> be 0. This is functionally equivalent to prctl(PR_SET_SECCOMP, ...).
Question for the linux-abi people:
What's the preferred way to do this these days? This syscall is a
general purpose "adjust the seccomp state" thing. The alternative
would be a specific new syscall to add a filter with a flags argument.
--Andy
On Tue, Jun 10, 2014 at 8:25 PM, Kees Cook <[email protected]> wrote:
> This adds the new "seccomp" syscall with both an "operation" and "flags"
> parameter for future expansion. The third argument is a pointer value,
> used with the SECCOMP_SET_MODE_FILTER operation. Currently, flags must
> be 0. This is functionally equivalent to prctl(PR_SET_SECCOMP, ...).
>
> Signed-off-by: Kees Cook <[email protected]>
> Cc: [email protected]
> ---
> arch/x86/syscalls/syscall_32.tbl | 1 +
> arch/x86/syscalls/syscall_64.tbl | 1 +
> include/linux/syscalls.h | 2 ++
> include/uapi/asm-generic/unistd.h | 4 ++-
> include/uapi/linux/seccomp.h | 4 +++
> kernel/seccomp.c | 63 ++++++++++++++++++++++++++++++++-----
> kernel/sys_ni.c | 3 ++
> 7 files changed, 69 insertions(+), 9 deletions(-)
>
> diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/syscalls/syscall_32.tbl
> index d6b867921612..7527eac24122 100644
> --- a/arch/x86/syscalls/syscall_32.tbl
> +++ b/arch/x86/syscalls/syscall_32.tbl
> @@ -360,3 +360,4 @@
> 351 i386 sched_setattr sys_sched_setattr
> 352 i386 sched_getattr sys_sched_getattr
> 353 i386 renameat2 sys_renameat2
> +354 i386 seccomp sys_seccomp
> diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl
> index ec255a1646d2..16272a6c12b7 100644
> --- a/arch/x86/syscalls/syscall_64.tbl
> +++ b/arch/x86/syscalls/syscall_64.tbl
> @@ -323,6 +323,7 @@
> 314 common sched_setattr sys_sched_setattr
> 315 common sched_getattr sys_sched_getattr
> 316 common renameat2 sys_renameat2
> +317 common seccomp sys_seccomp
>
> #
> # x32-specific system call numbers start at 512 to avoid cache impact
> diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
> index b0881a0ed322..1713977ee26f 100644
> --- a/include/linux/syscalls.h
> +++ b/include/linux/syscalls.h
> @@ -866,4 +866,6 @@ asmlinkage long sys_process_vm_writev(pid_t pid,
> asmlinkage long sys_kcmp(pid_t pid1, pid_t pid2, int type,
> unsigned long idx1, unsigned long idx2);
> asmlinkage long sys_finit_module(int fd, const char __user *uargs, int flags);
> +asmlinkage long sys_seccomp(unsigned int op, unsigned int flags,
> + const char __user *uargs);
It looks odd to add 'flags' argument to syscall that is not even used.
It don't think it will be extensible this way.
'uargs' is used only in 2nd command as well and it's not 'char __user *'
but rather 'struct sock_fprog __user *'
I think it makes more sense to define only first argument as 'int op' and the
rest as variable length array.
Something like:
long sys_seccomp(unsigned int op, struct nlattr *attrs, int len);
then different commands can interpret 'attrs' differently.
if op == mode_strict, then attrs == NULL, len == 0
if op == mode_filter, then attrs->nla_type == seccomp_bpf_filter
and nla_data(attrs) is 'struct sock_fprog'
If we decide to add new types of filters or new commands, the syscall prototype
won't need to change. New commands can be added preserving backward
compatibility.
The basic TLV concept has been around forever in netlink world. imo makes
sense to use it with new syscalls. Passing 'struct xxx' into syscalls
is the thing
of the past. TLV style is more extensible. Fields of structures can become
optional in the future, new fields added, etc.
'struct nlattr' brings the same benefits to kernel api as protobuf did
to user land.
> #endif
> diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
> index 333640608087..65acbf0e2867 100644
> --- a/include/uapi/asm-generic/unistd.h
> +++ b/include/uapi/asm-generic/unistd.h
> @@ -699,9 +699,11 @@ __SYSCALL(__NR_sched_setattr, sys_sched_setattr)
> __SYSCALL(__NR_sched_getattr, sys_sched_getattr)
> #define __NR_renameat2 276
> __SYSCALL(__NR_renameat2, sys_renameat2)
> +#define __NR_seccomp 277
> +__SYSCALL(__NR_seccomp, sys_seccomp)
>
> #undef __NR_syscalls
> -#define __NR_syscalls 277
> +#define __NR_syscalls 278
>
> /*
> * All syscalls below here should go away really,
> diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h
> index ac2dc9f72973..b258878ba754 100644
> --- a/include/uapi/linux/seccomp.h
> +++ b/include/uapi/linux/seccomp.h
> @@ -10,6 +10,10 @@
> #define SECCOMP_MODE_STRICT 1 /* uses hard-coded filter. */
> #define SECCOMP_MODE_FILTER 2 /* uses user-supplied filter. */
>
> +/* Valid operations for seccomp syscall. */
> +#define SECCOMP_SET_MODE_STRICT 0
> +#define SECCOMP_SET_MODE_FILTER 1
> +
> /*
> * All BPF programs must return a 32-bit value.
> * The bottom 16-bits are for optional return data.
> diff --git a/kernel/seccomp.c b/kernel/seccomp.c
> index 39d32c2904fc..c0cafa9e84af 100644
> --- a/kernel/seccomp.c
> +++ b/kernel/seccomp.c
> @@ -19,6 +19,7 @@
> #include <linux/sched.h>
> #include <linux/seccomp.h>
> #include <linux/slab.h>
> +#include <linux/syscalls.h>
>
> /* #define SECCOMP_DEBUG 1 */
>
> @@ -301,8 +302,8 @@ free_prog:
> *
> * Returns filter on success and ERR_PTR otherwise.
> */
> -static
> -struct seccomp_filter *seccomp_prepare_user_filter(char __user *user_filter)
> +static struct seccomp_filter *
> +seccomp_prepare_user_filter(const char __user *user_filter)
> {
> struct sock_fprog fprog;
> struct seccomp_filter *filter = ERR_PTR(-EFAULT);
> @@ -325,19 +326,25 @@ out:
>
> /**
> * seccomp_attach_filter: validate and attach filter
> + * @flags: flags to change filter behavior
> * @filter: seccomp filter to add to the current process
> *
> * Caller must be holding current->sighand->siglock lock.
> *
> * Returns 0 on success, -ve on error.
> */
> -static long seccomp_attach_filter(struct seccomp_filter *filter)
> +static long seccomp_attach_filter(unsigned int flags,
> + struct seccomp_filter *filter)
> {
> unsigned long total_insns;
> struct seccomp_filter *walker;
>
> BUG_ON(!spin_is_locked(¤t->sighand->siglock));
>
> + /* Validate flags. */
> + if (flags != 0)
> + return -EINVAL;
> +
> /* Validate resulting filter length. */
> total_insns = filter->len;
> for (walker = current->seccomp.filter; walker; walker = filter->prev)
> @@ -541,6 +548,7 @@ out:
> #ifdef CONFIG_SECCOMP_FILTER
> /**
> * seccomp_set_mode_filter: internal function for setting seccomp filter
> + * @flags: flags to change filter behavior
> * @filter: struct sock_fprog containing filter
> *
> * This function may be called repeatedly to install additional filters.
> @@ -551,7 +559,8 @@ out:
> *
> * Returns 0 on success or -EINVAL on failure.
> */
> -static long seccomp_set_mode_filter(char __user *filter)
> +static long seccomp_set_mode_filter(unsigned int flags,
> + const char __user *filter)
> {
> const unsigned long seccomp_mode = SECCOMP_MODE_FILTER;
> struct seccomp_filter *prepared = NULL;
> @@ -569,7 +578,7 @@ static long seccomp_set_mode_filter(char __user *filter)
> if (!seccomp_check_mode(current, seccomp_mode))
> goto out;
>
> - ret = seccomp_attach_filter(prepared);
> + ret = seccomp_attach_filter(flags, prepared);
> if (ret)
> goto out;
> /* Do not free the successfully attached filter. */
> @@ -583,12 +592,35 @@ out_free:
> return ret;
> }
> #else
> -static inline long seccomp_set_mode_filter(char __user *filter)
> +static inline long seccomp_set_mode_filter(unsigned int flags,
> + const char __user *filter)
> {
> return -EINVAL;
> }
> #endif
>
> +/* Common entry point for both prctl and syscall. */
> +static long do_seccomp(unsigned int op, unsigned int flags,
> + const char __user *uargs)
> +{
> + switch (op) {
> + case SECCOMP_SET_MODE_STRICT:
> + if (flags != 0 || uargs != NULL)
> + return -EINVAL;
> + return seccomp_set_mode_strict();
> + case SECCOMP_SET_MODE_FILTER:
> + return seccomp_set_mode_filter(flags, uargs);
> + default:
> + return -EINVAL;
> + }
> +}
> +
> +SYSCALL_DEFINE3(seccomp, unsigned int, op, unsigned int, flags,
> + const char __user *, uargs)
> +{
> + return do_seccomp(op, flags, uargs);
> +}
> +
> /**
> * prctl_set_seccomp: configures current->seccomp.mode
> * @seccomp_mode: requested mode to use
> @@ -598,12 +630,27 @@ static inline long seccomp_set_mode_filter(char __user *filter)
> */
> long prctl_set_seccomp(unsigned long seccomp_mode, char __user *filter)
> {
> + unsigned int op;
> + char __user *uargs;
> +
> switch (seccomp_mode) {
> case SECCOMP_MODE_STRICT:
> - return seccomp_set_mode_strict();
> + op = SECCOMP_SET_MODE_STRICT;
> + /*
> + * Setting strict mode through prctl always ignored filter,
> + * so make sure it is always NULL here to pass the internal
> + * check in do_seccomp().
> + */
> + uargs = NULL;
> + break;
> case SECCOMP_MODE_FILTER:
> - return seccomp_set_mode_filter(filter);
> + op = SECCOMP_SET_MODE_FILTER;
> + uargs = filter;
> + break;
> default:
> return -EINVAL;
> }
> +
> + /* prctl interface doesn't have flags, so they are always zero. */
> + return do_seccomp(op, 0, uargs);
> }
> diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
> index 36441b51b5df..2904a2105914 100644
> --- a/kernel/sys_ni.c
> +++ b/kernel/sys_ni.c
> @@ -213,3 +213,6 @@ cond_syscall(compat_sys_open_by_handle_at);
>
> /* compare kernel pointers */
> cond_syscall(sys_kcmp);
> +
> +/* operate on Secure Computing state */
> +cond_syscall(sys_seccomp);
> --
> 1.7.9.5
>
On Fri, Jun 13, 2014 at 2:22 PM, Alexei Starovoitov <[email protected]> wrote:
> On Tue, Jun 10, 2014 at 8:25 PM, Kees Cook <[email protected]> wrote:
>> This adds the new "seccomp" syscall with both an "operation" and "flags"
>> parameter for future expansion. The third argument is a pointer value,
>> used with the SECCOMP_SET_MODE_FILTER operation. Currently, flags must
>> be 0. This is functionally equivalent to prctl(PR_SET_SECCOMP, ...).
>>
>> Signed-off-by: Kees Cook <[email protected]>
>> Cc: [email protected]
>> ---
>> arch/x86/syscalls/syscall_32.tbl | 1 +
>> arch/x86/syscalls/syscall_64.tbl | 1 +
>> include/linux/syscalls.h | 2 ++
>> include/uapi/asm-generic/unistd.h | 4 ++-
>> include/uapi/linux/seccomp.h | 4 +++
>> kernel/seccomp.c | 63 ++++++++++++++++++++++++++++++++-----
>> kernel/sys_ni.c | 3 ++
>> 7 files changed, 69 insertions(+), 9 deletions(-)
>>
>> diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/syscalls/syscall_32.tbl
>> index d6b867921612..7527eac24122 100644
>> --- a/arch/x86/syscalls/syscall_32.tbl
>> +++ b/arch/x86/syscalls/syscall_32.tbl
>> @@ -360,3 +360,4 @@
>> 351 i386 sched_setattr sys_sched_setattr
>> 352 i386 sched_getattr sys_sched_getattr
>> 353 i386 renameat2 sys_renameat2
>> +354 i386 seccomp sys_seccomp
>> diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl
>> index ec255a1646d2..16272a6c12b7 100644
>> --- a/arch/x86/syscalls/syscall_64.tbl
>> +++ b/arch/x86/syscalls/syscall_64.tbl
>> @@ -323,6 +323,7 @@
>> 314 common sched_setattr sys_sched_setattr
>> 315 common sched_getattr sys_sched_getattr
>> 316 common renameat2 sys_renameat2
>> +317 common seccomp sys_seccomp
>>
>> #
>> # x32-specific system call numbers start at 512 to avoid cache impact
>> diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
>> index b0881a0ed322..1713977ee26f 100644
>> --- a/include/linux/syscalls.h
>> +++ b/include/linux/syscalls.h
>> @@ -866,4 +866,6 @@ asmlinkage long sys_process_vm_writev(pid_t pid,
>> asmlinkage long sys_kcmp(pid_t pid1, pid_t pid2, int type,
>> unsigned long idx1, unsigned long idx2);
>> asmlinkage long sys_finit_module(int fd, const char __user *uargs, int flags);
>> +asmlinkage long sys_seccomp(unsigned int op, unsigned int flags,
>> + const char __user *uargs);
>
> It looks odd to add 'flags' argument to syscall that is not even used.
> It don't think it will be extensible this way.
> 'uargs' is used only in 2nd command as well and it's not 'char __user *'
> but rather 'struct sock_fprog __user *'
> I think it makes more sense to define only first argument as 'int op' and the
> rest as variable length array.
> Something like:
> long sys_seccomp(unsigned int op, struct nlattr *attrs, int len);
> then different commands can interpret 'attrs' differently.
> if op == mode_strict, then attrs == NULL, len == 0
> if op == mode_filter, then attrs->nla_type == seccomp_bpf_filter
> and nla_data(attrs) is 'struct sock_fprog'
Eww. If the operation doesn't imply the type, then I think we've
totally screwed up.
> If we decide to add new types of filters or new commands, the syscall prototype
> won't need to change. New commands can be added preserving backward
> compatibility.
> The basic TLV concept has been around forever in netlink world. imo makes
> sense to use it with new syscalls. Passing 'struct xxx' into syscalls
> is the thing
> of the past. TLV style is more extensible. Fields of structures can become
> optional in the future, new fields added, etc.
> 'struct nlattr' brings the same benefits to kernel api as protobuf did
> to user land.
I see no reason to bring nl_attr into this.
Admittedly, I've never dealt with nl_attr, but everything
netlink-related I've even been involved in has involved some sort of
API atrocity.
--Andy
On Fri, Jun 13, 2014 at 2:25 PM, Andy Lutomirski <[email protected]> wrote:
> On Fri, Jun 13, 2014 at 2:22 PM, Alexei Starovoitov <[email protected]> wrote:
>> On Tue, Jun 10, 2014 at 8:25 PM, Kees Cook <[email protected]> wrote:
>>> This adds the new "seccomp" syscall with both an "operation" and "flags"
>>> parameter for future expansion. The third argument is a pointer value,
>>> used with the SECCOMP_SET_MODE_FILTER operation. Currently, flags must
>>> be 0. This is functionally equivalent to prctl(PR_SET_SECCOMP, ...).
>>>
>>> Signed-off-by: Kees Cook <[email protected]>
>>> Cc: [email protected]
>>> ---
>>> arch/x86/syscalls/syscall_32.tbl | 1 +
>>> arch/x86/syscalls/syscall_64.tbl | 1 +
>>> include/linux/syscalls.h | 2 ++
>>> include/uapi/asm-generic/unistd.h | 4 ++-
>>> include/uapi/linux/seccomp.h | 4 +++
>>> kernel/seccomp.c | 63 ++++++++++++++++++++++++++++++++-----
>>> kernel/sys_ni.c | 3 ++
>>> 7 files changed, 69 insertions(+), 9 deletions(-)
>>>
>>> diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/syscalls/syscall_32.tbl
>>> index d6b867921612..7527eac24122 100644
>>> --- a/arch/x86/syscalls/syscall_32.tbl
>>> +++ b/arch/x86/syscalls/syscall_32.tbl
>>> @@ -360,3 +360,4 @@
>>> 351 i386 sched_setattr sys_sched_setattr
>>> 352 i386 sched_getattr sys_sched_getattr
>>> 353 i386 renameat2 sys_renameat2
>>> +354 i386 seccomp sys_seccomp
>>> diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl
>>> index ec255a1646d2..16272a6c12b7 100644
>>> --- a/arch/x86/syscalls/syscall_64.tbl
>>> +++ b/arch/x86/syscalls/syscall_64.tbl
>>> @@ -323,6 +323,7 @@
>>> 314 common sched_setattr sys_sched_setattr
>>> 315 common sched_getattr sys_sched_getattr
>>> 316 common renameat2 sys_renameat2
>>> +317 common seccomp sys_seccomp
>>>
>>> #
>>> # x32-specific system call numbers start at 512 to avoid cache impact
>>> diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
>>> index b0881a0ed322..1713977ee26f 100644
>>> --- a/include/linux/syscalls.h
>>> +++ b/include/linux/syscalls.h
>>> @@ -866,4 +866,6 @@ asmlinkage long sys_process_vm_writev(pid_t pid,
>>> asmlinkage long sys_kcmp(pid_t pid1, pid_t pid2, int type,
>>> unsigned long idx1, unsigned long idx2);
>>> asmlinkage long sys_finit_module(int fd, const char __user *uargs, int flags);
>>> +asmlinkage long sys_seccomp(unsigned int op, unsigned int flags,
>>> + const char __user *uargs);
>>
>> It looks odd to add 'flags' argument to syscall that is not even used.
>> It don't think it will be extensible this way.
>> 'uargs' is used only in 2nd command as well and it's not 'char __user *'
>> but rather 'struct sock_fprog __user *'
>> I think it makes more sense to define only first argument as 'int op' and the
>> rest as variable length array.
>> Something like:
>> long sys_seccomp(unsigned int op, struct nlattr *attrs, int len);
>> then different commands can interpret 'attrs' differently.
>> if op == mode_strict, then attrs == NULL, len == 0
>> if op == mode_filter, then attrs->nla_type == seccomp_bpf_filter
>> and nla_data(attrs) is 'struct sock_fprog'
>
> Eww. If the operation doesn't imply the type, then I think we've
> totally screwed up.
>
>> If we decide to add new types of filters or new commands, the syscall prototype
>> won't need to change. New commands can be added preserving backward
>> compatibility.
>> The basic TLV concept has been around forever in netlink world. imo makes
>> sense to use it with new syscalls. Passing 'struct xxx' into syscalls
>> is the thing
>> of the past. TLV style is more extensible. Fields of structures can become
>> optional in the future, new fields added, etc.
>> 'struct nlattr' brings the same benefits to kernel api as protobuf did
>> to user land.
>
> I see no reason to bring nl_attr into this.
>
> Admittedly, I've never dealt with nl_attr, but everything
> netlink-related I've even been involved in has involved some sort of
> API atrocity.
netlink has a lot of legacy and there is genetlink which is not pretty
either because of extra socket creation, binding, dealing with packet
loss issues, but the key concept of variable length encoding is sound.
Right now seccomp has two commands and they already don't fit
into single syscall neatly. Are you saying there should be two syscalls
here? What about another seccomp related command? Another syscall?
imo all seccomp related commands needs to be mux/demux-ed under
one syscall. What is the way to mux/demux potentially very different
commands under one syscall? I cannot think of anything better than
TLV style. 'struct nlattr' is what we have today and I think it works fine.
I'm not suggesting to bring the whole netlink into the picture, but rather
TLV style of encoding different arguments for different commands.
On Fri, Jun 13, 2014 at 2:37 PM, Alexei Starovoitov <[email protected]> wrote:
> On Fri, Jun 13, 2014 at 2:25 PM, Andy Lutomirski <[email protected]> wrote:
>> On Fri, Jun 13, 2014 at 2:22 PM, Alexei Starovoitov <[email protected]> wrote:
>>> On Tue, Jun 10, 2014 at 8:25 PM, Kees Cook <[email protected]> wrote:
>>>> This adds the new "seccomp" syscall with both an "operation" and "flags"
>>>> parameter for future expansion. The third argument is a pointer value,
>>>> used with the SECCOMP_SET_MODE_FILTER operation. Currently, flags must
>>>> be 0. This is functionally equivalent to prctl(PR_SET_SECCOMP, ...).
>>>>
>>>> Signed-off-by: Kees Cook <[email protected]>
>>>> Cc: [email protected]
>>>> ---
>>>> arch/x86/syscalls/syscall_32.tbl | 1 +
>>>> arch/x86/syscalls/syscall_64.tbl | 1 +
>>>> include/linux/syscalls.h | 2 ++
>>>> include/uapi/asm-generic/unistd.h | 4 ++-
>>>> include/uapi/linux/seccomp.h | 4 +++
>>>> kernel/seccomp.c | 63 ++++++++++++++++++++++++++++++++-----
>>>> kernel/sys_ni.c | 3 ++
>>>> 7 files changed, 69 insertions(+), 9 deletions(-)
>>>>
>>>> diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/syscalls/syscall_32.tbl
>>>> index d6b867921612..7527eac24122 100644
>>>> --- a/arch/x86/syscalls/syscall_32.tbl
>>>> +++ b/arch/x86/syscalls/syscall_32.tbl
>>>> @@ -360,3 +360,4 @@
>>>> 351 i386 sched_setattr sys_sched_setattr
>>>> 352 i386 sched_getattr sys_sched_getattr
>>>> 353 i386 renameat2 sys_renameat2
>>>> +354 i386 seccomp sys_seccomp
>>>> diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl
>>>> index ec255a1646d2..16272a6c12b7 100644
>>>> --- a/arch/x86/syscalls/syscall_64.tbl
>>>> +++ b/arch/x86/syscalls/syscall_64.tbl
>>>> @@ -323,6 +323,7 @@
>>>> 314 common sched_setattr sys_sched_setattr
>>>> 315 common sched_getattr sys_sched_getattr
>>>> 316 common renameat2 sys_renameat2
>>>> +317 common seccomp sys_seccomp
>>>>
>>>> #
>>>> # x32-specific system call numbers start at 512 to avoid cache impact
>>>> diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
>>>> index b0881a0ed322..1713977ee26f 100644
>>>> --- a/include/linux/syscalls.h
>>>> +++ b/include/linux/syscalls.h
>>>> @@ -866,4 +866,6 @@ asmlinkage long sys_process_vm_writev(pid_t pid,
>>>> asmlinkage long sys_kcmp(pid_t pid1, pid_t pid2, int type,
>>>> unsigned long idx1, unsigned long idx2);
>>>> asmlinkage long sys_finit_module(int fd, const char __user *uargs, int flags);
>>>> +asmlinkage long sys_seccomp(unsigned int op, unsigned int flags,
>>>> + const char __user *uargs);
>>>
>>> It looks odd to add 'flags' argument to syscall that is not even used.
>>> It don't think it will be extensible this way.
>>> 'uargs' is used only in 2nd command as well and it's not 'char __user *'
>>> but rather 'struct sock_fprog __user *'
>>> I think it makes more sense to define only first argument as 'int op' and the
>>> rest as variable length array.
>>> Something like:
>>> long sys_seccomp(unsigned int op, struct nlattr *attrs, int len);
>>> then different commands can interpret 'attrs' differently.
>>> if op == mode_strict, then attrs == NULL, len == 0
>>> if op == mode_filter, then attrs->nla_type == seccomp_bpf_filter
>>> and nla_data(attrs) is 'struct sock_fprog'
>>
>> Eww. If the operation doesn't imply the type, then I think we've
>> totally screwed up.
>>
>>> If we decide to add new types of filters or new commands, the syscall prototype
>>> won't need to change. New commands can be added preserving backward
>>> compatibility.
>>> The basic TLV concept has been around forever in netlink world. imo makes
>>> sense to use it with new syscalls. Passing 'struct xxx' into syscalls
>>> is the thing
>>> of the past. TLV style is more extensible. Fields of structures can become
>>> optional in the future, new fields added, etc.
>>> 'struct nlattr' brings the same benefits to kernel api as protobuf did
>>> to user land.
>>
>> I see no reason to bring nl_attr into this.
>>
>> Admittedly, I've never dealt with nl_attr, but everything
>> netlink-related I've even been involved in has involved some sort of
>> API atrocity.
>
> netlink has a lot of legacy and there is genetlink which is not pretty
> either because of extra socket creation, binding, dealing with packet
> loss issues, but the key concept of variable length encoding is sound.
> Right now seccomp has two commands and they already don't fit
> into single syscall neatly. Are you saying there should be two syscalls
> here? What about another seccomp related command? Another syscall?
> imo all seccomp related commands needs to be mux/demux-ed under
> one syscall. What is the way to mux/demux potentially very different
> commands under one syscall? I cannot think of anything better than
> TLV style. 'struct nlattr' is what we have today and I think it works fine.
> I'm not suggesting to bring the whole netlink into the picture, but rather
> TLV style of encoding different arguments for different commands.
I'm unconvinced. These are simple commands, and I think the interface
should be simple. Syscalls are cheap.
As an example, the interface could be:
int seccomp_add_filter(const struct sock_fprog *filter, unsigned int flags);
The "tsync" operation would be seccomp_add_filter(NULL,
SECCOMP_ADD_FILTER_TSYNC) -- it's equivalent to adding an
always-accept filter and syncing threads.
But, frankly, this kind of stuff should probably be "do operation X".
IIUC nl_attr is more like "do something, with these tags and values",
which results in oddities like whatever should happen of more than one
tag is set.
--Andy
--
Andy Lutomirski
AMA Capital Management, LLC
On Fri, Jun 13, 2014 at 2:22 PM, Alexei Starovoitov <[email protected]> wrote:
> On Tue, Jun 10, 2014 at 8:25 PM, Kees Cook <[email protected]> wrote:
>> This adds the new "seccomp" syscall with both an "operation" and "flags"
>> parameter for future expansion. The third argument is a pointer value,
>> used with the SECCOMP_SET_MODE_FILTER operation. Currently, flags must
>> be 0. This is functionally equivalent to prctl(PR_SET_SECCOMP, ...).
>>
>> Signed-off-by: Kees Cook <[email protected]>
>> Cc: [email protected]
>> ---
>> diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
>> index b0881a0ed322..1713977ee26f 100644
>> --- a/include/linux/syscalls.h
>> +++ b/include/linux/syscalls.h
>> @@ -866,4 +866,6 @@ asmlinkage long sys_process_vm_writev(pid_t pid,
>> asmlinkage long sys_kcmp(pid_t pid1, pid_t pid2, int type,
>> unsigned long idx1, unsigned long idx2);
>> asmlinkage long sys_finit_module(int fd, const char __user *uargs, int flags);
>> +asmlinkage long sys_seccomp(unsigned int op, unsigned int flags,
>> + const char __user *uargs);
>
> It looks odd to add 'flags' argument to syscall that is not even used.
FWIW, "flags" is given use in the next patch to support the tsync option.
-Kees
--
Kees Cook
Chrome OS Security
On Fri, Jun 13, 2014 at 2:42 PM, Andy Lutomirski <[email protected]> wrote:
> On Fri, Jun 13, 2014 at 2:37 PM, Alexei Starovoitov <[email protected]> wrote:
>> On Fri, Jun 13, 2014 at 2:25 PM, Andy Lutomirski <[email protected]> wrote:
>>> On Fri, Jun 13, 2014 at 2:22 PM, Alexei Starovoitov <[email protected]> wrote:
>>>> On Tue, Jun 10, 2014 at 8:25 PM, Kees Cook <[email protected]> wrote:
>>>>> This adds the new "seccomp" syscall with both an "operation" and "flags"
>>>>> parameter for future expansion. The third argument is a pointer value,
>>>>> used with the SECCOMP_SET_MODE_FILTER operation. Currently, flags must
>>>>> be 0. This is functionally equivalent to prctl(PR_SET_SECCOMP, ...).
>>>>>
>>>>> Signed-off-by: Kees Cook <[email protected]>
>>>>> Cc: [email protected]
>>>>> ---
>>>>> arch/x86/syscalls/syscall_32.tbl | 1 +
>>>>> arch/x86/syscalls/syscall_64.tbl | 1 +
>>>>> include/linux/syscalls.h | 2 ++
>>>>> include/uapi/asm-generic/unistd.h | 4 ++-
>>>>> include/uapi/linux/seccomp.h | 4 +++
>>>>> kernel/seccomp.c | 63 ++++++++++++++++++++++++++++++++-----
>>>>> kernel/sys_ni.c | 3 ++
>>>>> 7 files changed, 69 insertions(+), 9 deletions(-)
>>>>>
>>>>> diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/syscalls/syscall_32.tbl
>>>>> index d6b867921612..7527eac24122 100644
>>>>> --- a/arch/x86/syscalls/syscall_32.tbl
>>>>> +++ b/arch/x86/syscalls/syscall_32.tbl
>>>>> @@ -360,3 +360,4 @@
>>>>> 351 i386 sched_setattr sys_sched_setattr
>>>>> 352 i386 sched_getattr sys_sched_getattr
>>>>> 353 i386 renameat2 sys_renameat2
>>>>> +354 i386 seccomp sys_seccomp
>>>>> diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl
>>>>> index ec255a1646d2..16272a6c12b7 100644
>>>>> --- a/arch/x86/syscalls/syscall_64.tbl
>>>>> +++ b/arch/x86/syscalls/syscall_64.tbl
>>>>> @@ -323,6 +323,7 @@
>>>>> 314 common sched_setattr sys_sched_setattr
>>>>> 315 common sched_getattr sys_sched_getattr
>>>>> 316 common renameat2 sys_renameat2
>>>>> +317 common seccomp sys_seccomp
>>>>>
>>>>> #
>>>>> # x32-specific system call numbers start at 512 to avoid cache impact
>>>>> diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
>>>>> index b0881a0ed322..1713977ee26f 100644
>>>>> --- a/include/linux/syscalls.h
>>>>> +++ b/include/linux/syscalls.h
>>>>> @@ -866,4 +866,6 @@ asmlinkage long sys_process_vm_writev(pid_t pid,
>>>>> asmlinkage long sys_kcmp(pid_t pid1, pid_t pid2, int type,
>>>>> unsigned long idx1, unsigned long idx2);
>>>>> asmlinkage long sys_finit_module(int fd, const char __user *uargs, int flags);
>>>>> +asmlinkage long sys_seccomp(unsigned int op, unsigned int flags,
>>>>> + const char __user *uargs);
>>>>
>>>> It looks odd to add 'flags' argument to syscall that is not even used.
>>>> It don't think it will be extensible this way.
>>>> 'uargs' is used only in 2nd command as well and it's not 'char __user *'
>>>> but rather 'struct sock_fprog __user *'
>>>> I think it makes more sense to define only first argument as 'int op' and the
>>>> rest as variable length array.
>>>> Something like:
>>>> long sys_seccomp(unsigned int op, struct nlattr *attrs, int len);
>>>> then different commands can interpret 'attrs' differently.
>>>> if op == mode_strict, then attrs == NULL, len == 0
>>>> if op == mode_filter, then attrs->nla_type == seccomp_bpf_filter
>>>> and nla_data(attrs) is 'struct sock_fprog'
>>>
>>> Eww. If the operation doesn't imply the type, then I think we've
>>> totally screwed up.
>>>
>>>> If we decide to add new types of filters or new commands, the syscall prototype
>>>> won't need to change. New commands can be added preserving backward
>>>> compatibility.
>>>> The basic TLV concept has been around forever in netlink world. imo makes
>>>> sense to use it with new syscalls. Passing 'struct xxx' into syscalls
>>>> is the thing
>>>> of the past. TLV style is more extensible. Fields of structures can become
>>>> optional in the future, new fields added, etc.
>>>> 'struct nlattr' brings the same benefits to kernel api as protobuf did
>>>> to user land.
>>>
>>> I see no reason to bring nl_attr into this.
>>>
>>> Admittedly, I've never dealt with nl_attr, but everything
>>> netlink-related I've even been involved in has involved some sort of
>>> API atrocity.
>>
>> netlink has a lot of legacy and there is genetlink which is not pretty
>> either because of extra socket creation, binding, dealing with packet
>> loss issues, but the key concept of variable length encoding is sound.
>> Right now seccomp has two commands and they already don't fit
>> into single syscall neatly. Are you saying there should be two syscalls
>> here? What about another seccomp related command? Another syscall?
>> imo all seccomp related commands needs to be mux/demux-ed under
>> one syscall. What is the way to mux/demux potentially very different
>> commands under one syscall? I cannot think of anything better than
>> TLV style. 'struct nlattr' is what we have today and I think it works fine.
>> I'm not suggesting to bring the whole netlink into the picture, but rather
>> TLV style of encoding different arguments for different commands.
>
> I'm unconvinced. These are simple commands, and I think the interface
> should be simple. Syscalls are cheap.
>
> As an example, the interface could be:
>
> int seccomp_add_filter(const struct sock_fprog *filter, unsigned int flags);
>
> The "tsync" operation would be seccomp_add_filter(NULL,
> SECCOMP_ADD_FILTER_TSYNC) -- it's equivalent to adding an
> always-accept filter and syncing threads.
I think you convinced me that tsync should be part of adding a filter
(since now there are no failure side-effects), so this specific
example I would expect EFAULT from. But ...
>
> But, frankly, this kind of stuff should probably be "do operation X".
> IIUC nl_attr is more like "do something, with these tags and values",
> which results in oddities like whatever should happen of more than one
> tag is set.
I have no objection to eliminating seccomp-strict from the syscall,
and just making this the "add seccomp filter" syscall. My only
hesitation would be that if we need something besides adding a filter
in the future, we'd be back to extending this awkwardly or adding
another syscall. That's why I went with the "operation" argument. I'm
not opposed to passing attrs and len, but seccomp_add_filter does feel
cleaner.
-Kees
--
Kees Cook
Chrome OS Security
On Fri, Jun 13, 2014 at 2:42 PM, Andy Lutomirski <[email protected]> wrote:
> On Fri, Jun 13, 2014 at 2:37 PM, Alexei Starovoitov <[email protected]> wrote:
>> On Fri, Jun 13, 2014 at 2:25 PM, Andy Lutomirski <[email protected]> wrote:
>>> On Fri, Jun 13, 2014 at 2:22 PM, Alexei Starovoitov <[email protected]> wrote:
>>>> On Tue, Jun 10, 2014 at 8:25 PM, Kees Cook <[email protected]> wrote:
>>>>> This adds the new "seccomp" syscall with both an "operation" and "flags"
>>>>> parameter for future expansion. The third argument is a pointer value,
>>>>> used with the SECCOMP_SET_MODE_FILTER operation. Currently, flags must
>>>>> be 0. This is functionally equivalent to prctl(PR_SET_SECCOMP, ...).
>>>>>
>>>>> Signed-off-by: Kees Cook <[email protected]>
>>>>> Cc: [email protected]
>>>>> ---
>>>>> arch/x86/syscalls/syscall_32.tbl | 1 +
>>>>> arch/x86/syscalls/syscall_64.tbl | 1 +
>>>>> include/linux/syscalls.h | 2 ++
>>>>> include/uapi/asm-generic/unistd.h | 4 ++-
>>>>> include/uapi/linux/seccomp.h | 4 +++
>>>>> kernel/seccomp.c | 63 ++++++++++++++++++++++++++++++++-----
>>>>> kernel/sys_ni.c | 3 ++
>>>>> 7 files changed, 69 insertions(+), 9 deletions(-)
>>>>>
>>>>> diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/syscalls/syscall_32.tbl
>>>>> index d6b867921612..7527eac24122 100644
>>>>> --- a/arch/x86/syscalls/syscall_32.tbl
>>>>> +++ b/arch/x86/syscalls/syscall_32.tbl
>>>>> @@ -360,3 +360,4 @@
>>>>> 351 i386 sched_setattr sys_sched_setattr
>>>>> 352 i386 sched_getattr sys_sched_getattr
>>>>> 353 i386 renameat2 sys_renameat2
>>>>> +354 i386 seccomp sys_seccomp
>>>>> diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl
>>>>> index ec255a1646d2..16272a6c12b7 100644
>>>>> --- a/arch/x86/syscalls/syscall_64.tbl
>>>>> +++ b/arch/x86/syscalls/syscall_64.tbl
>>>>> @@ -323,6 +323,7 @@
>>>>> 314 common sched_setattr sys_sched_setattr
>>>>> 315 common sched_getattr sys_sched_getattr
>>>>> 316 common renameat2 sys_renameat2
>>>>> +317 common seccomp sys_seccomp
>>>>>
>>>>> #
>>>>> # x32-specific system call numbers start at 512 to avoid cache impact
>>>>> diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
>>>>> index b0881a0ed322..1713977ee26f 100644
>>>>> --- a/include/linux/syscalls.h
>>>>> +++ b/include/linux/syscalls.h
>>>>> @@ -866,4 +866,6 @@ asmlinkage long sys_process_vm_writev(pid_t pid,
>>>>> asmlinkage long sys_kcmp(pid_t pid1, pid_t pid2, int type,
>>>>> unsigned long idx1, unsigned long idx2);
>>>>> asmlinkage long sys_finit_module(int fd, const char __user *uargs, int flags);
>>>>> +asmlinkage long sys_seccomp(unsigned int op, unsigned int flags,
>>>>> + const char __user *uargs);
>>>>
>>>> It looks odd to add 'flags' argument to syscall that is not even used.
>>>> It don't think it will be extensible this way.
>>>> 'uargs' is used only in 2nd command as well and it's not 'char __user *'
>>>> but rather 'struct sock_fprog __user *'
>>>> I think it makes more sense to define only first argument as 'int op' and the
>>>> rest as variable length array.
>>>> Something like:
>>>> long sys_seccomp(unsigned int op, struct nlattr *attrs, int len);
>>>> then different commands can interpret 'attrs' differently.
>>>> if op == mode_strict, then attrs == NULL, len == 0
>>>> if op == mode_filter, then attrs->nla_type == seccomp_bpf_filter
>>>> and nla_data(attrs) is 'struct sock_fprog'
>>>
>>> Eww. If the operation doesn't imply the type, then I think we've
>>> totally screwed up.
>>>
>>>> If we decide to add new types of filters or new commands, the syscall prototype
>>>> won't need to change. New commands can be added preserving backward
>>>> compatibility.
>>>> The basic TLV concept has been around forever in netlink world. imo makes
>>>> sense to use it with new syscalls. Passing 'struct xxx' into syscalls
>>>> is the thing
>>>> of the past. TLV style is more extensible. Fields of structures can become
>>>> optional in the future, new fields added, etc.
>>>> 'struct nlattr' brings the same benefits to kernel api as protobuf did
>>>> to user land.
>>>
>>> I see no reason to bring nl_attr into this.
>>>
>>> Admittedly, I've never dealt with nl_attr, but everything
>>> netlink-related I've even been involved in has involved some sort of
>>> API atrocity.
>>
>> netlink has a lot of legacy and there is genetlink which is not pretty
>> either because of extra socket creation, binding, dealing with packet
>> loss issues, but the key concept of variable length encoding is sound.
>> Right now seccomp has two commands and they already don't fit
>> into single syscall neatly. Are you saying there should be two syscalls
>> here? What about another seccomp related command? Another syscall?
>> imo all seccomp related commands needs to be mux/demux-ed under
>> one syscall. What is the way to mux/demux potentially very different
>> commands under one syscall? I cannot think of anything better than
>> TLV style. 'struct nlattr' is what we have today and I think it works fine.
>> I'm not suggesting to bring the whole netlink into the picture, but rather
>> TLV style of encoding different arguments for different commands.
>
> I'm unconvinced. These are simple commands, and I think the interface
> should be simple. Syscalls are cheap.
>
> As an example, the interface could be:
>
> int seccomp_add_filter(const struct sock_fprog *filter, unsigned int flags);
>
> The "tsync" operation would be seccomp_add_filter(NULL,
> SECCOMP_ADD_FILTER_TSYNC) -- it's equivalent to adding an
> always-accept filter and syncing threads.
>
> But, frankly, this kind of stuff should probably be "do operation X".
> IIUC nl_attr is more like "do something, with these tags and values",
> which results in oddities like whatever should happen of more than one
> tag is set.
TLV is a price of extensibility vs simplicity.
Say we have a syscall_foo(struct XX __user *x); that takes
struct XX {
int flag;
int var1;
};
now we want to add another variable to the structure that will be used
only when certain flag is set. You cannot do it easily without
breaking old user binaries, since new structure will be:
struct XX2 {
int flag;
int var1;
int var2;
};
if we do copy_from_user(,, sizeof(struct XX2)) it will brake old programs.
Potentially we can do it the ugly way:
copy_from_user(,, sizeof(struct XX)), then check flag and do another
copy_from_user(,, sizeof(struct XX2)) just to fetch extra argument.
but then another day passes and yet another new flag means that both
var1 and var2 are unused and it needs 'u64 var3' instead.
Now kernel looks very ugly by doing multiple copy_from_user() of different
structures.
It would be much cleaner with nlattr:
syscall_foo(struct nlattr *attrs, int len);
kernel fetches 'len' bytes onces and then picks var[123] fields from nlattr
array. In other words nlattr array is a way to represent flexible structure
where fields can come and go in the future.
This was a simple example. Consider the case where var1 and var2
are arrays of things.
imo the old:
struct sock_fprog {
unsigned short len; /* Number of filter blocks */
struct sock_filter __user *filter;
};
is the example of inflexible user interface.
It could have been single 'struct nlattr'
nlattr->nla_len == length of filter program in bytes
nlattr->nla_type == ARRAY_OF_SOCK_FILTER constant
nla_data(nlattr) - variable length array of 'struct sock_filter' bpf
instructions.
Right now I'd like to extend it for the work I'm doing around eBPF, but
I cannot, since it's rigid. If it was TLV, I could have easily added new flags,
new types, new sections to bpf programs.
For user space it would have been just as easy to populate such
'struct nlattr' as to populate 'struct sock_fprog'.
Hi Kees,
On Wed, Jun 11, 2014 at 5:25 AM, Kees Cook <[email protected]> wrote:
> This adds the new "seccomp" syscall with both an "operation" and "flags"
> parameter for future expansion. The third argument is a pointer value,
> used with the SECCOMP_SET_MODE_FILTER operation. Currently, flags must
> be 0. This is functionally equivalent to prctl(PR_SET_SECCOMP, ...).
I assume there'll be another iteration of these patches. With that
next iteration, could you write a man page (or at least free text
structured like a man page) that comprehensively documents the
user-space API?
Thanks,
Michael
> Signed-off-by: Kees Cook <[email protected]>
> Cc: [email protected]
> ---
> arch/x86/syscalls/syscall_32.tbl | 1 +
> arch/x86/syscalls/syscall_64.tbl | 1 +
> include/linux/syscalls.h | 2 ++
> include/uapi/asm-generic/unistd.h | 4 ++-
> include/uapi/linux/seccomp.h | 4 +++
> kernel/seccomp.c | 63 ++++++++++++++++++++++++++++++++-----
> kernel/sys_ni.c | 3 ++
> 7 files changed, 69 insertions(+), 9 deletions(-)
>
> diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/syscalls/syscall_32.tbl
> index d6b867921612..7527eac24122 100644
> --- a/arch/x86/syscalls/syscall_32.tbl
> +++ b/arch/x86/syscalls/syscall_32.tbl
> @@ -360,3 +360,4 @@
> 351 i386 sched_setattr sys_sched_setattr
> 352 i386 sched_getattr sys_sched_getattr
> 353 i386 renameat2 sys_renameat2
> +354 i386 seccomp sys_seccomp
> diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl
> index ec255a1646d2..16272a6c12b7 100644
> --- a/arch/x86/syscalls/syscall_64.tbl
> +++ b/arch/x86/syscalls/syscall_64.tbl
> @@ -323,6 +323,7 @@
> 314 common sched_setattr sys_sched_setattr
> 315 common sched_getattr sys_sched_getattr
> 316 common renameat2 sys_renameat2
> +317 common seccomp sys_seccomp
>
> #
> # x32-specific system call numbers start at 512 to avoid cache impact
> diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
> index b0881a0ed322..1713977ee26f 100644
> --- a/include/linux/syscalls.h
> +++ b/include/linux/syscalls.h
> @@ -866,4 +866,6 @@ asmlinkage long sys_process_vm_writev(pid_t pid,
> asmlinkage long sys_kcmp(pid_t pid1, pid_t pid2, int type,
> unsigned long idx1, unsigned long idx2);
> asmlinkage long sys_finit_module(int fd, const char __user *uargs, int flags);
> +asmlinkage long sys_seccomp(unsigned int op, unsigned int flags,
> + const char __user *uargs);
> #endif
> diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
> index 333640608087..65acbf0e2867 100644
> --- a/include/uapi/asm-generic/unistd.h
> +++ b/include/uapi/asm-generic/unistd.h
> @@ -699,9 +699,11 @@ __SYSCALL(__NR_sched_setattr, sys_sched_setattr)
> __SYSCALL(__NR_sched_getattr, sys_sched_getattr)
> #define __NR_renameat2 276
> __SYSCALL(__NR_renameat2, sys_renameat2)
> +#define __NR_seccomp 277
> +__SYSCALL(__NR_seccomp, sys_seccomp)
>
> #undef __NR_syscalls
> -#define __NR_syscalls 277
> +#define __NR_syscalls 278
>
> /*
> * All syscalls below here should go away really,
> diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h
> index ac2dc9f72973..b258878ba754 100644
> --- a/include/uapi/linux/seccomp.h
> +++ b/include/uapi/linux/seccomp.h
> @@ -10,6 +10,10 @@
> #define SECCOMP_MODE_STRICT 1 /* uses hard-coded filter. */
> #define SECCOMP_MODE_FILTER 2 /* uses user-supplied filter. */
>
> +/* Valid operations for seccomp syscall. */
> +#define SECCOMP_SET_MODE_STRICT 0
> +#define SECCOMP_SET_MODE_FILTER 1
> +
> /*
> * All BPF programs must return a 32-bit value.
> * The bottom 16-bits are for optional return data.
> diff --git a/kernel/seccomp.c b/kernel/seccomp.c
> index 39d32c2904fc..c0cafa9e84af 100644
> --- a/kernel/seccomp.c
> +++ b/kernel/seccomp.c
> @@ -19,6 +19,7 @@
> #include <linux/sched.h>
> #include <linux/seccomp.h>
> #include <linux/slab.h>
> +#include <linux/syscalls.h>
>
> /* #define SECCOMP_DEBUG 1 */
>
> @@ -301,8 +302,8 @@ free_prog:
> *
> * Returns filter on success and ERR_PTR otherwise.
> */
> -static
> -struct seccomp_filter *seccomp_prepare_user_filter(char __user *user_filter)
> +static struct seccomp_filter *
> +seccomp_prepare_user_filter(const char __user *user_filter)
> {
> struct sock_fprog fprog;
> struct seccomp_filter *filter = ERR_PTR(-EFAULT);
> @@ -325,19 +326,25 @@ out:
>
> /**
> * seccomp_attach_filter: validate and attach filter
> + * @flags: flags to change filter behavior
> * @filter: seccomp filter to add to the current process
> *
> * Caller must be holding current->sighand->siglock lock.
> *
> * Returns 0 on success, -ve on error.
> */
> -static long seccomp_attach_filter(struct seccomp_filter *filter)
> +static long seccomp_attach_filter(unsigned int flags,
> + struct seccomp_filter *filter)
> {
> unsigned long total_insns;
> struct seccomp_filter *walker;
>
> BUG_ON(!spin_is_locked(¤t->sighand->siglock));
>
> + /* Validate flags. */
> + if (flags != 0)
> + return -EINVAL;
> +
> /* Validate resulting filter length. */
> total_insns = filter->len;
> for (walker = current->seccomp.filter; walker; walker = filter->prev)
> @@ -541,6 +548,7 @@ out:
> #ifdef CONFIG_SECCOMP_FILTER
> /**
> * seccomp_set_mode_filter: internal function for setting seccomp filter
> + * @flags: flags to change filter behavior
> * @filter: struct sock_fprog containing filter
> *
> * This function may be called repeatedly to install additional filters.
> @@ -551,7 +559,8 @@ out:
> *
> * Returns 0 on success or -EINVAL on failure.
> */
> -static long seccomp_set_mode_filter(char __user *filter)
> +static long seccomp_set_mode_filter(unsigned int flags,
> + const char __user *filter)
> {
> const unsigned long seccomp_mode = SECCOMP_MODE_FILTER;
> struct seccomp_filter *prepared = NULL;
> @@ -569,7 +578,7 @@ static long seccomp_set_mode_filter(char __user *filter)
> if (!seccomp_check_mode(current, seccomp_mode))
> goto out;
>
> - ret = seccomp_attach_filter(prepared);
> + ret = seccomp_attach_filter(flags, prepared);
> if (ret)
> goto out;
> /* Do not free the successfully attached filter. */
> @@ -583,12 +592,35 @@ out_free:
> return ret;
> }
> #else
> -static inline long seccomp_set_mode_filter(char __user *filter)
> +static inline long seccomp_set_mode_filter(unsigned int flags,
> + const char __user *filter)
> {
> return -EINVAL;
> }
> #endif
>
> +/* Common entry point for both prctl and syscall. */
> +static long do_seccomp(unsigned int op, unsigned int flags,
> + const char __user *uargs)
> +{
> + switch (op) {
> + case SECCOMP_SET_MODE_STRICT:
> + if (flags != 0 || uargs != NULL)
> + return -EINVAL;
> + return seccomp_set_mode_strict();
> + case SECCOMP_SET_MODE_FILTER:
> + return seccomp_set_mode_filter(flags, uargs);
> + default:
> + return -EINVAL;
> + }
> +}
> +
> +SYSCALL_DEFINE3(seccomp, unsigned int, op, unsigned int, flags,
> + const char __user *, uargs)
> +{
> + return do_seccomp(op, flags, uargs);
> +}
> +
> /**
> * prctl_set_seccomp: configures current->seccomp.mode
> * @seccomp_mode: requested mode to use
> @@ -598,12 +630,27 @@ static inline long seccomp_set_mode_filter(char __user *filter)
> */
> long prctl_set_seccomp(unsigned long seccomp_mode, char __user *filter)
> {
> + unsigned int op;
> + char __user *uargs;
> +
> switch (seccomp_mode) {
> case SECCOMP_MODE_STRICT:
> - return seccomp_set_mode_strict();
> + op = SECCOMP_SET_MODE_STRICT;
> + /*
> + * Setting strict mode through prctl always ignored filter,
> + * so make sure it is always NULL here to pass the internal
> + * check in do_seccomp().
> + */
> + uargs = NULL;
> + break;
> case SECCOMP_MODE_FILTER:
> - return seccomp_set_mode_filter(filter);
> + op = SECCOMP_SET_MODE_FILTER;
> + uargs = filter;
> + break;
> default:
> return -EINVAL;
> }
> +
> + /* prctl interface doesn't have flags, so they are always zero. */
> + return do_seccomp(op, 0, uargs);
> }
> diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
> index 36441b51b5df..2904a2105914 100644
> --- a/kernel/sys_ni.c
> +++ b/kernel/sys_ni.c
> @@ -213,3 +213,6 @@ cond_syscall(compat_sys_open_by_handle_at);
>
> /* compare kernel pointers */
> cond_syscall(sys_kcmp);
> +
> +/* operate on Secure Computing state */
> +cond_syscall(sys_seccomp);
> --
> 1.7.9.5
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-api" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
--
Michael Kerrisk
Linux man-pages maintainer; http://www.kernel.org/doc/man-pages/
Linux/UNIX System Programming Training: http://man7.org/training/