Normally, task_struck.seccomp.filter is only ever read or modified by
the task that owns it (current). This property aids in fast access
during system call filtering as read access is lockless.
Updating the pointer from another task, however, opens up race
conditions. To allow cross-task filter pointer updates, writes to
the pointer are now protected by the task_lock. Read access remains
lockless because pointer updates themselves are atomic. However,
writes often entail additional checking (like maximum instruction
counts) which require locking to perform safely.
Signed-off-by: Will Drewry <[email protected]>
---
include/linux/seccomp.h | 4 ++--
kernel/seccomp.c | 22 +++++++++++++---------
2 files changed, 15 insertions(+), 11 deletions(-)
diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
index 6f19cfd..85c0895 100644
--- a/include/linux/seccomp.h
+++ b/include/linux/seccomp.h
@@ -17,8 +17,8 @@ struct seccomp_filter;
* @filter: The metadata and ruleset for determining what system calls
* are allowed for a task.
*
- * @filter must only be accessed from the context of current as there
- * is no locking.
+ * @filter must always point to a valid seccomp_filter or NULL as it is
+ * accessed without locking during system call entry.
*/
struct seccomp {
int mode;
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index b7a1004..71512e4 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -201,18 +201,18 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)
*/
static u32 seccomp_run_filters(int syscall)
{
- struct seccomp_filter *f;
+ struct seccomp_filter *f = ACCESS_ONCE(current->seccomp.filter);
u32 ret = SECCOMP_RET_ALLOW;
/* Ensure unexpected behavior doesn't result in failing open. */
- if (WARN_ON(current->seccomp.filter == NULL))
+ if (WARN_ON(f == NULL))
return SECCOMP_RET_KILL;
/*
* All filters in the list are evaluated and the lowest BPF return
* value always takes priority (ignoring the DATA).
*/
- for (f = current->seccomp.filter; f; f = f->prev) {
+ for (; f; f = ACCESS_ONCE(f->prev)) {
u32 cur_ret = sk_run_filter(NULL, f->insns);
if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION))
ret = cur_ret;
@@ -228,7 +228,7 @@ static u32 seccomp_run_filters(int syscall)
*/
static long seccomp_attach_filter(struct sock_fprog *fprog)
{
- struct seccomp_filter *filter;
+ struct seccomp_filter *filter, *f;
unsigned long fp_size = fprog->len * sizeof(struct sock_filter);
unsigned long total_insns = fprog->len;
long ret;
@@ -236,11 +236,6 @@ static long seccomp_attach_filter(struct sock_fprog *fprog)
if (fprog->len == 0 || fprog->len > BPF_MAXINSNS)
return -EINVAL;
- for (filter = current->seccomp.filter; filter; filter = filter->prev)
- total_insns += filter->len + 4; /* include a 4 instr penalty */
- if (total_insns > MAX_INSNS_PER_PATH)
- return -ENOMEM;
-
/*
* Installing a seccomp filter requires that the task have
* CAP_SYS_ADMIN in its namespace or be running with no_new_privs.
@@ -260,6 +255,13 @@ static long seccomp_attach_filter(struct sock_fprog *fprog)
atomic_set(&filter->usage, 1);
filter->len = fprog->len;
+ task_lock(current); /* protect current->seccomp.filter from updates */
+ for (f = current->seccomp.filter; f; f = f->prev)
+ total_insns += f->len + 4; /* include a 4 instr penalty */
+ ret = -ENOMEM;
+ if (total_insns > MAX_INSNS_PER_PATH)
+ goto fail;
+
/* Copy the instructions from fprog. */
ret = -EFAULT;
if (copy_from_user(filter->insns, fprog->filter, fp_size))
@@ -281,8 +283,10 @@ static long seccomp_attach_filter(struct sock_fprog *fprog)
*/
filter->prev = current->seccomp.filter;
current->seccomp.filter = filter;
+ task_unlock(current);
return 0;
fail:
+ task_unlock(current);
kfree(filter);
return ret;
}
--
1.7.9.5
Applying restrictive seccomp filter programs to large or diverse
codebases often requires handling threads which may be started early in
the process lifetime (e.g., by code that is linked in). While it is
possible to apply permissive programs prior to process start up, it is
difficult to further restrict the kernel ABI to those threads after that
point.
This change adds a new seccomp "extension" for synchronizing thread
group seccomp filters and a prctl() for accessing that functionality.
The need for the added prctl() is due to the lack of reserved arguments
in PR_SET_SECCOMP.
When prctl(PR_SECCOMP_EXT, SECCOMP_EXT_ACT_TSYNC, 0, 0) is called, it
will attempt to synchronize all threads in current's threadgroup to its
seccomp filter program. This is possible iff all threads are using a
filter that is an ancestor to the filter current is attempting to
synchronize to. NULL filters (where the task is running as
SECCOMP_MODE_NONE) are also treated as ancestors allowing threads to be
transitioned into SECCOMP_MODE_FILTER. On success, 0 is returned. On
failure, the pid of one of the failing threads will be returned.
Suggested-by: Julien Tinnes <[email protected]>
Signed-off-by: Will Drewry <[email protected]>
---
include/linux/seccomp.h | 7 +++
include/uapi/linux/prctl.h | 6 ++
include/uapi/linux/seccomp.h | 6 ++
kernel/seccomp.c | 128 ++++++++++++++++++++++++++++++++++++++++++
kernel/sys.c | 3 +
5 files changed, 150 insertions(+)
diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
index 85c0895..3163db6 100644
--- a/include/linux/seccomp.h
+++ b/include/linux/seccomp.h
@@ -77,6 +77,8 @@ static inline int seccomp_mode(struct seccomp *s)
extern void put_seccomp_filter(struct task_struct *tsk);
extern void get_seccomp_filter(struct task_struct *tsk);
extern u32 seccomp_bpf_load(int off);
+extern long prctl_seccomp_ext(unsigned long, unsigned long,
+ unsigned long, unsigned long);
#else /* CONFIG_SECCOMP_FILTER */
static inline void put_seccomp_filter(struct task_struct *tsk)
{
@@ -86,5 +88,10 @@ static inline void get_seccomp_filter(struct task_struct *tsk)
{
return;
}
+static inline long prctl_seccomp_ext(unsigned long arg2, unsigned long arg3,
+ unsigned long arg4, unsigned long arg5)
+{
+ return -EINVAL;
+}
#endif /* CONFIG_SECCOMP_FILTER */
#endif /* _LINUX_SECCOMP_H */
diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
index 289760f..5dcd5d3 100644
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -149,4 +149,10 @@
#define PR_GET_TID_ADDRESS 40
+/*
+ * Access seccomp extensions
+ * See Documentation/prctl/seccomp_filter.txt for more details.
+ */
+#define PR_SECCOMP_EXT 41
+
#endif /* _LINUX_PRCTL_H */
diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h
index ac2dc9f..49b5279 100644
--- a/include/uapi/linux/seccomp.h
+++ b/include/uapi/linux/seccomp.h
@@ -10,6 +10,12 @@
#define SECCOMP_MODE_STRICT 1 /* uses hard-coded filter. */
#define SECCOMP_MODE_FILTER 2 /* uses user-supplied filter. */
+/* Valid extension types as arg2 for prctl(PR_SECCOMP_EXT) */
+#define SECCOMP_EXT_ACT 1
+
+/* Valid extension actions as arg3 to prctl(PR_SECCOMP_EXT, SECCOMP_EXT_ACT) */
+#define SECCOMP_EXT_ACT_TSYNC 1 /* attempt to synchronize thread filters */
+
/*
* All BPF programs must return a 32-bit value.
* The bottom 16-bits are for optional return data.
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 71512e4..8a0de7b 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -24,6 +24,7 @@
#ifdef CONFIG_SECCOMP_FILTER
#include <asm/syscall.h>
#include <linux/filter.h>
+#include <linux/pid.h>
#include <linux/ptrace.h>
#include <linux/security.h>
#include <linux/slab.h>
@@ -220,6 +221,108 @@ static u32 seccomp_run_filters(int syscall)
return ret;
}
+/* Returns 1 if the candidate is an ancestor. */
+static int is_ancestor(struct seccomp_filter *candidate,
+ struct seccomp_filter *child)
+{
+ /* NULL is the root ancestor. */
+ if (candidate == NULL)
+ return 1;
+ for (; child; child = child->prev)
+ if (child == candidate)
+ return 1;
+ return 0;
+}
+
+/**
+ * seccomp_sync_threads: sets all threads to use current's filter
+ *
+ * Returns 0 on success or the pid of a thread which was either not
+ * in the correct seccomp mode or it did not have an ancestral
+ * seccomp filter. current must be in seccomp.mode=2 already.
+ */
+static pid_t seccomp_sync_threads(void)
+{
+ struct task_struct *thread, *caller;
+ pid_t failed = 0;
+ thread = caller = current;
+
+ read_lock(&tasklist_lock);
+ if (thread_group_empty(caller))
+ goto done;
+ while_each_thread(caller, thread) {
+ task_lock(thread);
+ /*
+ * All threads must not be in SECCOMP_MODE_STRICT to
+ * be eligible for synchronization.
+ */
+ if ((thread->seccomp.mode == SECCOMP_MODE_DISABLED ||
+ thread->seccomp.mode == SECCOMP_MODE_FILTER) &&
+ is_ancestor(thread->seccomp.filter,
+ caller->seccomp.filter)) {
+ /* Get a task reference for the new leaf node. */
+ get_seccomp_filter(caller);
+ /*
+ * Drop the task reference to the shared ancestor since
+ * current's path will hold a reference. (This also
+ * allows a put before the assignment.)
+ */
+ put_seccomp_filter(thread);
+ thread->seccomp.filter = caller->seccomp.filter;
+ /* Opt the other thread into seccomp if needed.
+ * As threads are considered to be trust-realm
+ * equivalent (see ptrace_may_access), it is safe to
+ * allow one thread to transition the other.
+ */
+ if (thread->seccomp.mode == SECCOMP_MODE_DISABLED) {
+ thread->seccomp.mode = SECCOMP_MODE_FILTER;
+ /*
+ * Don't let an unprivileged task work around
+ * the no_new_privs restriction by creating
+ * a thread that sets it up, enters seccomp,
+ * then dies.
+ */
+ if (caller->no_new_privs)
+ thread->no_new_privs = 1;
+ set_tsk_thread_flag(thread, TIF_SECCOMP);
+ }
+ } else {
+ /* Keep the last sibling that failed to return. */
+ struct pid *pid = get_task_pid(thread, PIDTYPE_PID);
+ failed = pid_vnr(pid);
+ put_pid(pid);
+ /* If the pid cannot be resolved, then return -ESRCH */
+ if (failed == 0)
+ failed = -ESRCH;
+ }
+ task_unlock(thread);
+ }
+done:
+ read_unlock(&tasklist_lock);
+ return failed;
+}
+
+/**
+ * seccomp_extended_action: performs the specific action
+ * @action: the enum of the action to perform.
+ *
+ * Returns 0 on success. On failure, it returns -EINVAL
+ * on a invalid action or -EACCES if the seccomp mode is
+ * invalid.
+ */
+static long seccomp_extended_action(int action)
+{
+ switch (action) {
+ case SECCOMP_EXT_ACT_TSYNC:
+ if (current->seccomp.mode != SECCOMP_MODE_FILTER)
+ return -EACCES;
+ return seccomp_sync_threads();
+ default:
+ break;
+ }
+ return -EINVAL;
+}
+
/**
* seccomp_attach_filter: Attaches a seccomp filter to current.
* @fprog: BPF program to install
@@ -471,6 +574,31 @@ long prctl_get_seccomp(void)
}
/**
+ * prctl_seccomp_ext: exposed extension behaviors for seccomp
+ * @cmd: the type of extension being called
+ * @arg[123]: the arguments for the extension
+ * (at present, arg2 and arg3 must be 0)
+ *
+ * Returns >= 0 on success and < 0 on failure.
+ * Invalid arguments return -EINVAL.
+ * Improper seccomp mode will result in -EACCES.
+ *
+ * SECCOMP_EXT_TYPE_ACT, SECCOMP_EXT_ACT_TSYNC will return 0 on success
+ * or the last thread pid that it cannot synchronize.
+ */
+long prctl_seccomp_ext(unsigned long type, unsigned long arg1,
+ unsigned long arg2, unsigned long arg3)
+{
+ if (type != SECCOMP_EXT_ACT)
+ return -EINVAL;
+ /* arg2 and arg3 are currently unused. */
+ if (arg2 || arg3)
+ return -EINVAL;
+ /* For action extensions, arg1 is the identifier. */
+ return seccomp_extended_action(arg1);
+}
+
+/**
* prctl_set_seccomp: configures current->seccomp.mode
* @seccomp_mode: requested mode to use
* @filter: optional struct sock_fprog for use with SECCOMP_MODE_FILTER
diff --git a/kernel/sys.c b/kernel/sys.c
index c723113..73d599a 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1919,6 +1919,9 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
case PR_SET_SECCOMP:
error = prctl_set_seccomp(arg2, (char __user *)arg3);
break;
+ case PR_SECCOMP_EXT:
+ error = prctl_seccomp_ext(arg2, arg3, arg4, arg5);
+ break;
case PR_GET_TSC:
error = GET_TSC_CTL(arg2);
break;
--
1.7.9.5
(missed this on the first run)
Add an entry for the PR_SECCOMP_EXT entry point and the
only existing consumer, SECCOMP_EXT_ACT_TSYNC.
Signed-off-by: Will Drewry <[email protected]>
---
Documentation/prctl/seccomp_filter.txt | 28 +++++++++++++++++++++++++++-
1 file changed, 27 insertions(+), 1 deletion(-)
diff --git a/Documentation/prctl/seccomp_filter.txt b/Documentation/prctl/seccomp_filter.txt
index 1e469ef..b296701 100644
--- a/Documentation/prctl/seccomp_filter.txt
+++ b/Documentation/prctl/seccomp_filter.txt
@@ -166,10 +166,36 @@ The samples/seccomp/ directory contains both an x86-specific example
and a more generic example of a higher level macro interface for BPF
program generation.
+Extensions
+----------
+
+SECCOMP_MODE_FILTER supports an additional entry point for accessing
+extended behavior through prctl(PR_SECCOMP_EXT). Only one extension
+exists today:
+
+SECCOMP_EXT_ACT_TSYNC:
+ If the calling task is running under SECCOMP_MODE_FILTER, it
+ may call prctl() to synchronize the seccomp filter of its
+ threads. As seccomp behavior is per-task, any thread under
+ SECCOMP_MODE_STRICT will be unaffected, as will any thread
+ under SECCOMP_MODE_FILTER that does not have a filter that is
+ in the filter tree ancestry for the caller. Any threads that
+ are in SECCOMP_MODE_NONE will be transitioned to
+ SECCOMP_MODE_FILTER if possible.
+
+ Usage:
+ prctl(PR_SECCOMP_EXT, SECCOMP_EXT_ACT_TSYNC, 0, 0);
+
+ If any threads cannot be transitioned, the call will return one
+ of the process ids. All other threads will have been transitioned.
+ A return value of 0 indicates success. On a negative return value,
+ the errno will be populated appropriately:
+ EINVAL indicates invalid arguments.
+ EACCES indicates invalid seccomp mode.
Adding architecture support
------------------------
+---------------------------
See arch/Kconfig for the authoritative requirements. In general, if an
architecture supports both ptrace_event and seccomp, it will be able to
--
1.7.9.5
On 01/13/2014 12:30 PM, Will Drewry wrote:
> Applying restrictive seccomp filter programs to large or diverse
> codebases often requires handling threads which may be started early in
> the process lifetime (e.g., by code that is linked in). While it is
> possible to apply permissive programs prior to process start up, it is
> difficult to further restrict the kernel ABI to those threads after that
> point.
>
> This change adds a new seccomp "extension" for synchronizing thread
> group seccomp filters and a prctl() for accessing that functionality.
> The need for the added prctl() is due to the lack of reserved arguments
> in PR_SET_SECCOMP.
>
> When prctl(PR_SECCOMP_EXT, SECCOMP_EXT_ACT_TSYNC, 0, 0) is called, it
> will attempt to synchronize all threads in current's threadgroup to its
> seccomp filter program. This is possible iff all threads are using a
> filter that is an ancestor to the filter current is attempting to
> synchronize to. NULL filters (where the task is running as
> SECCOMP_MODE_NONE) are also treated as ancestors allowing threads to be
> transitioned into SECCOMP_MODE_FILTER. On success, 0 is returned. On
> failure, the pid of one of the failing threads will be returned.
>
> Suggested-by: Julien Tinnes <[email protected]>
> Signed-off-by: Will Drewry <[email protected]>
> ---
> include/linux/seccomp.h | 7 +++
> include/uapi/linux/prctl.h | 6 ++
> include/uapi/linux/seccomp.h | 6 ++
> kernel/seccomp.c | 128 ++++++++++++++++++++++++++++++++++++++++++
> kernel/sys.c | 3 +
> 5 files changed, 150 insertions(+)
>
> diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
> index 85c0895..3163db6 100644
> --- a/include/linux/seccomp.h
> +++ b/include/linux/seccomp.h
> @@ -77,6 +77,8 @@ static inline int seccomp_mode(struct seccomp *s)
> extern void put_seccomp_filter(struct task_struct *tsk);
> extern void get_seccomp_filter(struct task_struct *tsk);
> extern u32 seccomp_bpf_load(int off);
> +extern long prctl_seccomp_ext(unsigned long, unsigned long,
> + unsigned long, unsigned long);
> #else /* CONFIG_SECCOMP_FILTER */
> static inline void put_seccomp_filter(struct task_struct *tsk)
> {
> @@ -86,5 +88,10 @@ static inline void get_seccomp_filter(struct task_struct *tsk)
> {
> return;
> }
> +static inline long prctl_seccomp_ext(unsigned long arg2, unsigned long arg3,
> + unsigned long arg4, unsigned long arg5)
> +{
> + return -EINVAL;
> +}
> #endif /* CONFIG_SECCOMP_FILTER */
> #endif /* _LINUX_SECCOMP_H */
> diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
> index 289760f..5dcd5d3 100644
> --- a/include/uapi/linux/prctl.h
> +++ b/include/uapi/linux/prctl.h
> @@ -149,4 +149,10 @@
>
> #define PR_GET_TID_ADDRESS 40
>
> +/*
> + * Access seccomp extensions
> + * See Documentation/prctl/seccomp_filter.txt for more details.
> + */
> +#define PR_SECCOMP_EXT 41
> +
> #endif /* _LINUX_PRCTL_H */
> diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h
> index ac2dc9f..49b5279 100644
> --- a/include/uapi/linux/seccomp.h
> +++ b/include/uapi/linux/seccomp.h
> @@ -10,6 +10,12 @@
> #define SECCOMP_MODE_STRICT 1 /* uses hard-coded filter. */
> #define SECCOMP_MODE_FILTER 2 /* uses user-supplied filter. */
>
> +/* Valid extension types as arg2 for prctl(PR_SECCOMP_EXT) */
> +#define SECCOMP_EXT_ACT 1
> +
> +/* Valid extension actions as arg3 to prctl(PR_SECCOMP_EXT, SECCOMP_EXT_ACT) */
> +#define SECCOMP_EXT_ACT_TSYNC 1 /* attempt to synchronize thread filters */
> +
> /*
> * All BPF programs must return a 32-bit value.
> * The bottom 16-bits are for optional return data.
> diff --git a/kernel/seccomp.c b/kernel/seccomp.c
> index 71512e4..8a0de7b 100644
> --- a/kernel/seccomp.c
> +++ b/kernel/seccomp.c
> @@ -24,6 +24,7 @@
> #ifdef CONFIG_SECCOMP_FILTER
> #include <asm/syscall.h>
> #include <linux/filter.h>
> +#include <linux/pid.h>
> #include <linux/ptrace.h>
> #include <linux/security.h>
> #include <linux/slab.h>
> @@ -220,6 +221,108 @@ static u32 seccomp_run_filters(int syscall)
> return ret;
> }
>
> +/* Returns 1 if the candidate is an ancestor. */
> +static int is_ancestor(struct seccomp_filter *candidate,
> + struct seccomp_filter *child)
> +{
> + /* NULL is the root ancestor. */
> + if (candidate == NULL)
> + return 1;
> + for (; child; child = child->prev)
> + if (child == candidate)
> + return 1;
> + return 0;
> +}
> +
> +/**
> + * seccomp_sync_threads: sets all threads to use current's filter
> + *
> + * Returns 0 on success or the pid of a thread which was either not
> + * in the correct seccomp mode or it did not have an ancestral
> + * seccomp filter. current must be in seccomp.mode=2 already.
> + */
> +static pid_t seccomp_sync_threads(void)
> +{
> + struct task_struct *thread, *caller;
> + pid_t failed = 0;
> + thread = caller = current;
> +
> + read_lock(&tasklist_lock);
> + if (thread_group_empty(caller))
> + goto done;
> + while_each_thread(caller, thread) {
> + task_lock(thread);
> + /*
> + * All threads must not be in SECCOMP_MODE_STRICT to
> + * be eligible for synchronization.
> + */
> + if ((thread->seccomp.mode == SECCOMP_MODE_DISABLED ||
> + thread->seccomp.mode == SECCOMP_MODE_FILTER) &&
> + is_ancestor(thread->seccomp.filter,
> + caller->seccomp.filter)) {
> + /* Get a task reference for the new leaf node. */
> + get_seccomp_filter(caller);
> + /*
> + * Drop the task reference to the shared ancestor since
> + * current's path will hold a reference. (This also
> + * allows a put before the assignment.)
> + */
> + put_seccomp_filter(thread);
> + thread->seccomp.filter = caller->seccomp.filter;
> + /* Opt the other thread into seccomp if needed.
> + * As threads are considered to be trust-realm
> + * equivalent (see ptrace_may_access), it is safe to
> + * allow one thread to transition the other.
> + */
> + if (thread->seccomp.mode == SECCOMP_MODE_DISABLED) {
> + thread->seccomp.mode = SECCOMP_MODE_FILTER;
> + /*
> + * Don't let an unprivileged task work around
> + * the no_new_privs restriction by creating
> + * a thread that sets it up, enters seccomp,
> + * then dies.
> + */
> + if (caller->no_new_privs)
> + thread->no_new_privs = 1;
> + set_tsk_thread_flag(thread, TIF_SECCOMP);
no_new_privs is a bitfield, and some of the other bits in there look
like things that might not want to be read and written back from another
thread.
Would it be too annoying to require that the other threads already have
no_new_privs set?
--Andy
On Mon, Jan 13, 2014 at 5:36 PM, Andy Lutomirski <[email protected]> wrote:
> On 01/13/2014 12:30 PM, Will Drewry wrote:
>> Applying restrictive seccomp filter programs to large or diverse
>> codebases often requires handling threads which may be started early in
>> the process lifetime (e.g., by code that is linked in). While it is
>> possible to apply permissive programs prior to process start up, it is
>> difficult to further restrict the kernel ABI to those threads after that
>> point.
>>
>> This change adds a new seccomp "extension" for synchronizing thread
>> group seccomp filters and a prctl() for accessing that functionality.
>> The need for the added prctl() is due to the lack of reserved arguments
>> in PR_SET_SECCOMP.
>>
>> When prctl(PR_SECCOMP_EXT, SECCOMP_EXT_ACT_TSYNC, 0, 0) is called, it
>> will attempt to synchronize all threads in current's threadgroup to its
>> seccomp filter program. This is possible iff all threads are using a
>> filter that is an ancestor to the filter current is attempting to
>> synchronize to. NULL filters (where the task is running as
>> SECCOMP_MODE_NONE) are also treated as ancestors allowing threads to be
>> transitioned into SECCOMP_MODE_FILTER. On success, 0 is returned. On
>> failure, the pid of one of the failing threads will be returned.
>>
>> Suggested-by: Julien Tinnes <[email protected]>
>> Signed-off-by: Will Drewry <[email protected]>
>> ---
>> include/linux/seccomp.h | 7 +++
>> include/uapi/linux/prctl.h | 6 ++
>> include/uapi/linux/seccomp.h | 6 ++
>> kernel/seccomp.c | 128 ++++++++++++++++++++++++++++++++++++++++++
>> kernel/sys.c | 3 +
>> 5 files changed, 150 insertions(+)
>>
>> diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
>> index 85c0895..3163db6 100644
>> --- a/include/linux/seccomp.h
>> +++ b/include/linux/seccomp.h
>> @@ -77,6 +77,8 @@ static inline int seccomp_mode(struct seccomp *s)
>> extern void put_seccomp_filter(struct task_struct *tsk);
>> extern void get_seccomp_filter(struct task_struct *tsk);
>> extern u32 seccomp_bpf_load(int off);
>> +extern long prctl_seccomp_ext(unsigned long, unsigned long,
>> + unsigned long, unsigned long);
>> #else /* CONFIG_SECCOMP_FILTER */
>> static inline void put_seccomp_filter(struct task_struct *tsk)
>> {
>> @@ -86,5 +88,10 @@ static inline void get_seccomp_filter(struct task_struct *tsk)
>> {
>> return;
>> }
>> +static inline long prctl_seccomp_ext(unsigned long arg2, unsigned long arg3,
>> + unsigned long arg4, unsigned long arg5)
>> +{
>> + return -EINVAL;
>> +}
>> #endif /* CONFIG_SECCOMP_FILTER */
>> #endif /* _LINUX_SECCOMP_H */
>> diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
>> index 289760f..5dcd5d3 100644
>> --- a/include/uapi/linux/prctl.h
>> +++ b/include/uapi/linux/prctl.h
>> @@ -149,4 +149,10 @@
>>
>> #define PR_GET_TID_ADDRESS 40
>>
>> +/*
>> + * Access seccomp extensions
>> + * See Documentation/prctl/seccomp_filter.txt for more details.
>> + */
>> +#define PR_SECCOMP_EXT 41
>> +
>> #endif /* _LINUX_PRCTL_H */
>> diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h
>> index ac2dc9f..49b5279 100644
>> --- a/include/uapi/linux/seccomp.h
>> +++ b/include/uapi/linux/seccomp.h
>> @@ -10,6 +10,12 @@
>> #define SECCOMP_MODE_STRICT 1 /* uses hard-coded filter. */
>> #define SECCOMP_MODE_FILTER 2 /* uses user-supplied filter. */
>>
>> +/* Valid extension types as arg2 for prctl(PR_SECCOMP_EXT) */
>> +#define SECCOMP_EXT_ACT 1
>> +
>> +/* Valid extension actions as arg3 to prctl(PR_SECCOMP_EXT, SECCOMP_EXT_ACT) */
>> +#define SECCOMP_EXT_ACT_TSYNC 1 /* attempt to synchronize thread filters */
>> +
>> /*
>> * All BPF programs must return a 32-bit value.
>> * The bottom 16-bits are for optional return data.
>> diff --git a/kernel/seccomp.c b/kernel/seccomp.c
>> index 71512e4..8a0de7b 100644
>> --- a/kernel/seccomp.c
>> +++ b/kernel/seccomp.c
>> @@ -24,6 +24,7 @@
>> #ifdef CONFIG_SECCOMP_FILTER
>> #include <asm/syscall.h>
>> #include <linux/filter.h>
>> +#include <linux/pid.h>
>> #include <linux/ptrace.h>
>> #include <linux/security.h>
>> #include <linux/slab.h>
>> @@ -220,6 +221,108 @@ static u32 seccomp_run_filters(int syscall)
>> return ret;
>> }
>>
>> +/* Returns 1 if the candidate is an ancestor. */
>> +static int is_ancestor(struct seccomp_filter *candidate,
>> + struct seccomp_filter *child)
>> +{
>> + /* NULL is the root ancestor. */
>> + if (candidate == NULL)
>> + return 1;
>> + for (; child; child = child->prev)
>> + if (child == candidate)
>> + return 1;
>> + return 0;
>> +}
>> +
>> +/**
>> + * seccomp_sync_threads: sets all threads to use current's filter
>> + *
>> + * Returns 0 on success or the pid of a thread which was either not
>> + * in the correct seccomp mode or it did not have an ancestral
>> + * seccomp filter. current must be in seccomp.mode=2 already.
>> + */
>> +static pid_t seccomp_sync_threads(void)
>> +{
>> + struct task_struct *thread, *caller;
>> + pid_t failed = 0;
>> + thread = caller = current;
>> +
>> + read_lock(&tasklist_lock);
>> + if (thread_group_empty(caller))
>> + goto done;
>> + while_each_thread(caller, thread) {
>> + task_lock(thread);
>> + /*
>> + * All threads must not be in SECCOMP_MODE_STRICT to
>> + * be eligible for synchronization.
>> + */
>> + if ((thread->seccomp.mode == SECCOMP_MODE_DISABLED ||
>> + thread->seccomp.mode == SECCOMP_MODE_FILTER) &&
>> + is_ancestor(thread->seccomp.filter,
>> + caller->seccomp.filter)) {
>> + /* Get a task reference for the new leaf node. */
>> + get_seccomp_filter(caller);
>> + /*
>> + * Drop the task reference to the shared ancestor since
>> + * current's path will hold a reference. (This also
>> + * allows a put before the assignment.)
>> + */
>> + put_seccomp_filter(thread);
>> + thread->seccomp.filter = caller->seccomp.filter;
>> + /* Opt the other thread into seccomp if needed.
>> + * As threads are considered to be trust-realm
>> + * equivalent (see ptrace_may_access), it is safe to
>> + * allow one thread to transition the other.
>> + */
>> + if (thread->seccomp.mode == SECCOMP_MODE_DISABLED) {
>> + thread->seccomp.mode = SECCOMP_MODE_FILTER;
>> + /*
>> + * Don't let an unprivileged task work around
>> + * the no_new_privs restriction by creating
>> + * a thread that sets it up, enters seccomp,
>> + * then dies.
>> + */
>> + if (caller->no_new_privs)
>> + thread->no_new_privs = 1;
>> + set_tsk_thread_flag(thread, TIF_SECCOMP);
>
> no_new_privs is a bitfield, and some of the other bits in there look
> like things that might not want to be read and written back from another
> thread.
Ah :/ Good catch!
> Would it be too annoying to require that the other threads already have
> no_new_privs set?
Hrm, it's pretty painful in the edge cases where you don't control the
process initialization which might setup threads you need to ensnare.
Would it be crazy to do something like below in sched.h?
- unsigned no_new_privs:1;
+ unsigned no_new_privs;
It feels like a big hammer though, but it also seems weird to wrap those
bitfields with task_lock. Any suggestions are welcome! I'll think about
this a bit more and see if there is a good way to do this transition
safely and cheaply.
thanks!
On 01/13, Will Drewry wrote:
>
> When prctl(PR_SECCOMP_EXT, SECCOMP_EXT_ACT_TSYNC, 0, 0) is called, it
> will attempt to synchronize all threads in current's threadgroup to its
> seccomp filter program.
TBH, I do not understand what this patch actually does ;) I'll try to
read it later. Still a couple of nits.
> +static pid_t seccomp_sync_threads(void)
> +{
> + struct task_struct *thread, *caller;
> + pid_t failed = 0;
> + thread = caller = current;
> +
> + read_lock(&tasklist_lock);
> + if (thread_group_empty(caller))
> + goto done;
You can check thread_group_empty() before tasklist_lock, otherwise
this fast-path before while_each_thread() makes no sense.
> + while_each_thread(caller, thread) {
> + task_lock(thread);
Could you remind what task_lock() protects wrt seccomp?
> + } else {
> + /* Keep the last sibling that failed to return. */
> + struct pid *pid = get_task_pid(thread, PIDTYPE_PID);
> + failed = pid_vnr(pid);
> + put_pid(pid);
> + /* If the pid cannot be resolved, then return -ESRCH */
> + if (failed == 0)
> + failed = -ESRCH;
You can just do
failed = task_pid_vnr(thread);
"failed == 0" is not possible either way (we are doing while_each_thread
under tasklist, the task can't do detach_pid).
Oleg.
On 01/13, Will Drewry wrote:
>
> +static pid_t seccomp_sync_threads(void)
> +{
> + struct task_struct *thread, *caller;
> + pid_t failed = 0;
> + thread = caller = current;
> +
> + read_lock(&tasklist_lock);
> + if (thread_group_empty(caller))
> + goto done;
> + while_each_thread(caller, thread) {
> + task_lock(thread);
perhaps we take task_lock() to serialize with another caller of
seccomp_sync_threads()...
If yes, then perhaps you can use ->siglock instead of tasklist_lock
and do not use task_lock(). It would be even better to rely on rcu,
but:
> + get_seccomp_filter(caller);
> + /*
> + * Drop the task reference to the shared ancestor since
> + * current's path will hold a reference. (This also
> + * allows a put before the assignment.)
> + */
> + put_seccomp_filter(thread);
> + thread->seccomp.filter = caller->seccomp.filter;
As I said, I do not understand this patch yet, but this looks suspicious.
Why we can't race with this thread doing clone(CLONE_THREAD) ? We do
not the the new thread yet, but its ->seccomp can be already copied
by copy_process(), no?
Oleg.
On 01/14, Oleg Nesterov wrote:
>
> > + get_seccomp_filter(caller);
> > + /*
> > + * Drop the task reference to the shared ancestor since
> > + * current's path will hold a reference. (This also
> > + * allows a put before the assignment.)
> > + */
> > + put_seccomp_filter(thread);
> > + thread->seccomp.filter = caller->seccomp.filter;
>
> As I said, I do not understand this patch yet, but this looks suspicious.
>
> Why we can't race with this thread doing clone(CLONE_THREAD) ? We do
> not the the new thread yet, but its ->seccomp can be already copied
> by copy_process(), no?
And it seems that this can obviously race with seccomp_attach_filter()
called by this "thread".
Oleg.
On 01/14, Oleg Nesterov wrote:
>
> On 01/14, Oleg Nesterov wrote:
> >
> > > + get_seccomp_filter(caller);
> > > + /*
> > > + * Drop the task reference to the shared ancestor since
> > > + * current's path will hold a reference. (This also
> > > + * allows a put before the assignment.)
> > > + */
> > > + put_seccomp_filter(thread);
> > > + thread->seccomp.filter = caller->seccomp.filter;
> >
> > As I said, I do not understand this patch yet, but this looks suspicious.
> >
> > Why we can't race with this thread doing clone(CLONE_THREAD) ? We do
> > not the the new thread yet, but its ->seccomp can be already copied
> > by copy_process(), no?
>
> And it seems that this can obviously race with seccomp_attach_filter()
> called by this "thread".
Heh. I just noticed that this patch is not first in series, and I wasn't
cc'ed. I found this one on marc.info,
http://marc.info/?l=linux-kernel&m=138964557211277
this explains task_lock(). But this can't fix the race with copy_process,
and the patch itself doesn't look right... if nothing else, we can't do
copy_from_user() under task_lock().
Oleg.
On Tue, Jan 14, 2014 at 10:59 AM, Will Drewry <[email protected]> wrote:
> On Mon, Jan 13, 2014 at 5:36 PM, Andy Lutomirski <[email protected]> wrote:
>> On 01/13/2014 12:30 PM, Will Drewry wrote:
>>> Applying restrictive seccomp filter programs to large or diverse
>>> codebases often requires handling threads which may be started early in
>>> the process lifetime (e.g., by code that is linked in). While it is
>>> possible to apply permissive programs prior to process start up, it is
>>> difficult to further restrict the kernel ABI to those threads after that
>>> point.
>>>
>>> This change adds a new seccomp "extension" for synchronizing thread
>>> group seccomp filters and a prctl() for accessing that functionality.
>>> The need for the added prctl() is due to the lack of reserved arguments
>>> in PR_SET_SECCOMP.
>>>
>>> When prctl(PR_SECCOMP_EXT, SECCOMP_EXT_ACT_TSYNC, 0, 0) is called, it
>>> will attempt to synchronize all threads in current's threadgroup to its
>>> seccomp filter program. This is possible iff all threads are using a
>>> filter that is an ancestor to the filter current is attempting to
>>> synchronize to. NULL filters (where the task is running as
>>> SECCOMP_MODE_NONE) are also treated as ancestors allowing threads to be
>>> transitioned into SECCOMP_MODE_FILTER. On success, 0 is returned. On
>>> failure, the pid of one of the failing threads will be returned.
>>>
>>> Suggested-by: Julien Tinnes <[email protected]>
>>> Signed-off-by: Will Drewry <[email protected]>
>>> ---
>>> include/linux/seccomp.h | 7 +++
>>> include/uapi/linux/prctl.h | 6 ++
>>> include/uapi/linux/seccomp.h | 6 ++
>>> kernel/seccomp.c | 128 ++++++++++++++++++++++++++++++++++++++++++
>>> kernel/sys.c | 3 +
>>> 5 files changed, 150 insertions(+)
>>>
>>> diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
>>> index 85c0895..3163db6 100644
>>> --- a/include/linux/seccomp.h
>>> +++ b/include/linux/seccomp.h
>>> @@ -77,6 +77,8 @@ static inline int seccomp_mode(struct seccomp *s)
>>> extern void put_seccomp_filter(struct task_struct *tsk);
>>> extern void get_seccomp_filter(struct task_struct *tsk);
>>> extern u32 seccomp_bpf_load(int off);
>>> +extern long prctl_seccomp_ext(unsigned long, unsigned long,
>>> + unsigned long, unsigned long);
>>> #else /* CONFIG_SECCOMP_FILTER */
>>> static inline void put_seccomp_filter(struct task_struct *tsk)
>>> {
>>> @@ -86,5 +88,10 @@ static inline void get_seccomp_filter(struct task_struct *tsk)
>>> {
>>> return;
>>> }
>>> +static inline long prctl_seccomp_ext(unsigned long arg2, unsigned long arg3,
>>> + unsigned long arg4, unsigned long arg5)
>>> +{
>>> + return -EINVAL;
>>> +}
>>> #endif /* CONFIG_SECCOMP_FILTER */
>>> #endif /* _LINUX_SECCOMP_H */
>>> diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
>>> index 289760f..5dcd5d3 100644
>>> --- a/include/uapi/linux/prctl.h
>>> +++ b/include/uapi/linux/prctl.h
>>> @@ -149,4 +149,10 @@
>>>
>>> #define PR_GET_TID_ADDRESS 40
>>>
>>> +/*
>>> + * Access seccomp extensions
>>> + * See Documentation/prctl/seccomp_filter.txt for more details.
>>> + */
>>> +#define PR_SECCOMP_EXT 41
>>> +
>>> #endif /* _LINUX_PRCTL_H */
>>> diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h
>>> index ac2dc9f..49b5279 100644
>>> --- a/include/uapi/linux/seccomp.h
>>> +++ b/include/uapi/linux/seccomp.h
>>> @@ -10,6 +10,12 @@
>>> #define SECCOMP_MODE_STRICT 1 /* uses hard-coded filter. */
>>> #define SECCOMP_MODE_FILTER 2 /* uses user-supplied filter. */
>>>
>>> +/* Valid extension types as arg2 for prctl(PR_SECCOMP_EXT) */
>>> +#define SECCOMP_EXT_ACT 1
>>> +
>>> +/* Valid extension actions as arg3 to prctl(PR_SECCOMP_EXT, SECCOMP_EXT_ACT) */
>>> +#define SECCOMP_EXT_ACT_TSYNC 1 /* attempt to synchronize thread filters */
>>> +
>>> /*
>>> * All BPF programs must return a 32-bit value.
>>> * The bottom 16-bits are for optional return data.
>>> diff --git a/kernel/seccomp.c b/kernel/seccomp.c
>>> index 71512e4..8a0de7b 100644
>>> --- a/kernel/seccomp.c
>>> +++ b/kernel/seccomp.c
>>> @@ -24,6 +24,7 @@
>>> #ifdef CONFIG_SECCOMP_FILTER
>>> #include <asm/syscall.h>
>>> #include <linux/filter.h>
>>> +#include <linux/pid.h>
>>> #include <linux/ptrace.h>
>>> #include <linux/security.h>
>>> #include <linux/slab.h>
>>> @@ -220,6 +221,108 @@ static u32 seccomp_run_filters(int syscall)
>>> return ret;
>>> }
>>>
>>> +/* Returns 1 if the candidate is an ancestor. */
>>> +static int is_ancestor(struct seccomp_filter *candidate,
>>> + struct seccomp_filter *child)
>>> +{
>>> + /* NULL is the root ancestor. */
>>> + if (candidate == NULL)
>>> + return 1;
>>> + for (; child; child = child->prev)
>>> + if (child == candidate)
>>> + return 1;
>>> + return 0;
>>> +}
>>> +
>>> +/**
>>> + * seccomp_sync_threads: sets all threads to use current's filter
>>> + *
>>> + * Returns 0 on success or the pid of a thread which was either not
>>> + * in the correct seccomp mode or it did not have an ancestral
>>> + * seccomp filter. current must be in seccomp.mode=2 already.
>>> + */
>>> +static pid_t seccomp_sync_threads(void)
>>> +{
>>> + struct task_struct *thread, *caller;
>>> + pid_t failed = 0;
>>> + thread = caller = current;
>>> +
>>> + read_lock(&tasklist_lock);
>>> + if (thread_group_empty(caller))
>>> + goto done;
>>> + while_each_thread(caller, thread) {
>>> + task_lock(thread);
>>> + /*
>>> + * All threads must not be in SECCOMP_MODE_STRICT to
>>> + * be eligible for synchronization.
>>> + */
>>> + if ((thread->seccomp.mode == SECCOMP_MODE_DISABLED ||
>>> + thread->seccomp.mode == SECCOMP_MODE_FILTER) &&
>>> + is_ancestor(thread->seccomp.filter,
>>> + caller->seccomp.filter)) {
>>> + /* Get a task reference for the new leaf node. */
>>> + get_seccomp_filter(caller);
>>> + /*
>>> + * Drop the task reference to the shared ancestor since
>>> + * current's path will hold a reference. (This also
>>> + * allows a put before the assignment.)
>>> + */
>>> + put_seccomp_filter(thread);
>>> + thread->seccomp.filter = caller->seccomp.filter;
>>> + /* Opt the other thread into seccomp if needed.
>>> + * As threads are considered to be trust-realm
>>> + * equivalent (see ptrace_may_access), it is safe to
>>> + * allow one thread to transition the other.
>>> + */
>>> + if (thread->seccomp.mode == SECCOMP_MODE_DISABLED) {
>>> + thread->seccomp.mode = SECCOMP_MODE_FILTER;
>>> + /*
>>> + * Don't let an unprivileged task work around
>>> + * the no_new_privs restriction by creating
>>> + * a thread that sets it up, enters seccomp,
>>> + * then dies.
>>> + */
>>> + if (caller->no_new_privs)
>>> + thread->no_new_privs = 1;
>>> + set_tsk_thread_flag(thread, TIF_SECCOMP);
>>
>> no_new_privs is a bitfield, and some of the other bits in there look
>> like things that might not want to be read and written back from another
>> thread.
>
> Ah :/ Good catch!
>
>> Would it be too annoying to require that the other threads already have
>> no_new_privs set?
>
> Hrm, it's pretty painful in the edge cases where you don't control the
> process initialization which might setup threads you need to ensnare.
>
> Would it be crazy to do something like below in sched.h?
> - unsigned no_new_privs:1;
> + unsigned no_new_privs;
set_bit, etc. would also work. (Although there isn't a 32-bit set_bit
AFAIK, or at least there isn't one that works on 64-bit BE archs.)
Also, is 'unsigned' actually safe for this purpose, on all supported
archs/compilers? I'm pretty sure it's okay by C++11 rules, but those
don't apply here. Maybe some day the kernel will move to C11 and life
will be good.
>
> It feels like a big hammer though, but it also seems weird to wrap those
> bitfields with task_lock. Any suggestions are welcome! I'll think about
> this a bit more and see if there is a good way to do this transition
> safely and cheaply.
Hmm. I bet you could move no_new_privs somewhere else in task_lock
where there's a bit free. It could also go in 'struct creds', but I
think that's even worse from your perspective.
Here's another dumb idea: Add an accessor task_no_new_privs(struct
task_struct *) and move no_new_privs into struct seccomp (i.e. make it
a bit in the seccomp mode). It kind of sucks on !CONFIG_SECCOMP, but
it's free if CONFIG_SECCOMP.
P.S. Have you seen the Linux Capsicum port? It fiddles with seccomp
mode, too, and I suspect it needs a fair amount of work, but I really
like the idea.
--Andy
On Tue, Jan 14, 2014 at 2:13 PM, Oleg Nesterov <[email protected]> wrote:
> On 01/14, Oleg Nesterov wrote:
>>
>> On 01/14, Oleg Nesterov wrote:
>> >
>> > > + get_seccomp_filter(caller);
>> > > + /*
>> > > + * Drop the task reference to the shared ancestor since
>> > > + * current's path will hold a reference. (This also
>> > > + * allows a put before the assignment.)
>> > > + */
>> > > + put_seccomp_filter(thread);
>> > > + thread->seccomp.filter = caller->seccomp.filter;
>> >
>> > As I said, I do not understand this patch yet, but this looks suspicious.
>> >
>> > Why we can't race with this thread doing clone(CLONE_THREAD) ? We do
>> > not the the new thread yet, but its ->seccomp can be already copied
>> > by copy_process(), no?
Ah - I thought the tasklist_lock would catch that, but of course that
happens before
the tasklist_lock is needed.
>>
>> And it seems that this can obviously race with seccomp_attach_filter()
>> called by this "thread".
And... I was hoping the task_lock would cover any attach cases, but
missing the copy_process() is a problem.
>
> Heh. I just noticed that this patch is not first in series, and I wasn't
> cc'ed. I found this one on marc.info,
Sorry! I shouldn't have relied on cc-cmd, I usually mess it up.
>
> http://marc.info/?l=linux-kernel&m=138964557211277
>
> this explains task_lock(). But this can't fix the race with copy_process,
> and the patch itself doesn't look right... if nothing else, we can't do
> copy_from_user() under task_lock().
Thanks -- I'll take a more critical look!
On Tue, Jan 14, 2014 at 2:24 PM, Andy Lutomirski <[email protected]> wrote:
> On Tue, Jan 14, 2014 at 10:59 AM, Will Drewry <[email protected]> wrote:
>> On Mon, Jan 13, 2014 at 5:36 PM, Andy Lutomirski <[email protected]> wrote:
>>> On 01/13/2014 12:30 PM, Will Drewry wrote:
>>>> Applying restrictive seccomp filter programs to large or diverse
>>>> codebases often requires handling threads which may be started early in
>>>> the process lifetime (e.g., by code that is linked in). While it is
>>>> possible to apply permissive programs prior to process start up, it is
>>>> difficult to further restrict the kernel ABI to those threads after that
>>>> point.
>>>>
>>>> This change adds a new seccomp "extension" for synchronizing thread
>>>> group seccomp filters and a prctl() for accessing that functionality.
>>>> The need for the added prctl() is due to the lack of reserved arguments
>>>> in PR_SET_SECCOMP.
>>>>
>>>> When prctl(PR_SECCOMP_EXT, SECCOMP_EXT_ACT_TSYNC, 0, 0) is called, it
>>>> will attempt to synchronize all threads in current's threadgroup to its
>>>> seccomp filter program. This is possible iff all threads are using a
>>>> filter that is an ancestor to the filter current is attempting to
>>>> synchronize to. NULL filters (where the task is running as
>>>> SECCOMP_MODE_NONE) are also treated as ancestors allowing threads to be
>>>> transitioned into SECCOMP_MODE_FILTER. On success, 0 is returned. On
>>>> failure, the pid of one of the failing threads will be returned.
>>>>
>>>> Suggested-by: Julien Tinnes <[email protected]>
>>>> Signed-off-by: Will Drewry <[email protected]>
>>>> ---
>>>> include/linux/seccomp.h | 7 +++
>>>> include/uapi/linux/prctl.h | 6 ++
>>>> include/uapi/linux/seccomp.h | 6 ++
>>>> kernel/seccomp.c | 128 ++++++++++++++++++++++++++++++++++++++++++
>>>> kernel/sys.c | 3 +
>>>> 5 files changed, 150 insertions(+)
>>>>
>>>> diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
>>>> index 85c0895..3163db6 100644
>>>> --- a/include/linux/seccomp.h
>>>> +++ b/include/linux/seccomp.h
>>>> @@ -77,6 +77,8 @@ static inline int seccomp_mode(struct seccomp *s)
>>>> extern void put_seccomp_filter(struct task_struct *tsk);
>>>> extern void get_seccomp_filter(struct task_struct *tsk);
>>>> extern u32 seccomp_bpf_load(int off);
>>>> +extern long prctl_seccomp_ext(unsigned long, unsigned long,
>>>> + unsigned long, unsigned long);
>>>> #else /* CONFIG_SECCOMP_FILTER */
>>>> static inline void put_seccomp_filter(struct task_struct *tsk)
>>>> {
>>>> @@ -86,5 +88,10 @@ static inline void get_seccomp_filter(struct task_struct *tsk)
>>>> {
>>>> return;
>>>> }
>>>> +static inline long prctl_seccomp_ext(unsigned long arg2, unsigned long arg3,
>>>> + unsigned long arg4, unsigned long arg5)
>>>> +{
>>>> + return -EINVAL;
>>>> +}
>>>> #endif /* CONFIG_SECCOMP_FILTER */
>>>> #endif /* _LINUX_SECCOMP_H */
>>>> diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
>>>> index 289760f..5dcd5d3 100644
>>>> --- a/include/uapi/linux/prctl.h
>>>> +++ b/include/uapi/linux/prctl.h
>>>> @@ -149,4 +149,10 @@
>>>>
>>>> #define PR_GET_TID_ADDRESS 40
>>>>
>>>> +/*
>>>> + * Access seccomp extensions
>>>> + * See Documentation/prctl/seccomp_filter.txt for more details.
>>>> + */
>>>> +#define PR_SECCOMP_EXT 41
>>>> +
>>>> #endif /* _LINUX_PRCTL_H */
>>>> diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h
>>>> index ac2dc9f..49b5279 100644
>>>> --- a/include/uapi/linux/seccomp.h
>>>> +++ b/include/uapi/linux/seccomp.h
>>>> @@ -10,6 +10,12 @@
>>>> #define SECCOMP_MODE_STRICT 1 /* uses hard-coded filter. */
>>>> #define SECCOMP_MODE_FILTER 2 /* uses user-supplied filter. */
>>>>
>>>> +/* Valid extension types as arg2 for prctl(PR_SECCOMP_EXT) */
>>>> +#define SECCOMP_EXT_ACT 1
>>>> +
>>>> +/* Valid extension actions as arg3 to prctl(PR_SECCOMP_EXT, SECCOMP_EXT_ACT) */
>>>> +#define SECCOMP_EXT_ACT_TSYNC 1 /* attempt to synchronize thread filters */
>>>> +
>>>> /*
>>>> * All BPF programs must return a 32-bit value.
>>>> * The bottom 16-bits are for optional return data.
>>>> diff --git a/kernel/seccomp.c b/kernel/seccomp.c
>>>> index 71512e4..8a0de7b 100644
>>>> --- a/kernel/seccomp.c
>>>> +++ b/kernel/seccomp.c
>>>> @@ -24,6 +24,7 @@
>>>> #ifdef CONFIG_SECCOMP_FILTER
>>>> #include <asm/syscall.h>
>>>> #include <linux/filter.h>
>>>> +#include <linux/pid.h>
>>>> #include <linux/ptrace.h>
>>>> #include <linux/security.h>
>>>> #include <linux/slab.h>
>>>> @@ -220,6 +221,108 @@ static u32 seccomp_run_filters(int syscall)
>>>> return ret;
>>>> }
>>>>
>>>> +/* Returns 1 if the candidate is an ancestor. */
>>>> +static int is_ancestor(struct seccomp_filter *candidate,
>>>> + struct seccomp_filter *child)
>>>> +{
>>>> + /* NULL is the root ancestor. */
>>>> + if (candidate == NULL)
>>>> + return 1;
>>>> + for (; child; child = child->prev)
>>>> + if (child == candidate)
>>>> + return 1;
>>>> + return 0;
>>>> +}
>>>> +
>>>> +/**
>>>> + * seccomp_sync_threads: sets all threads to use current's filter
>>>> + *
>>>> + * Returns 0 on success or the pid of a thread which was either not
>>>> + * in the correct seccomp mode or it did not have an ancestral
>>>> + * seccomp filter. current must be in seccomp.mode=2 already.
>>>> + */
>>>> +static pid_t seccomp_sync_threads(void)
>>>> +{
>>>> + struct task_struct *thread, *caller;
>>>> + pid_t failed = 0;
>>>> + thread = caller = current;
>>>> +
>>>> + read_lock(&tasklist_lock);
>>>> + if (thread_group_empty(caller))
>>>> + goto done;
>>>> + while_each_thread(caller, thread) {
>>>> + task_lock(thread);
>>>> + /*
>>>> + * All threads must not be in SECCOMP_MODE_STRICT to
>>>> + * be eligible for synchronization.
>>>> + */
>>>> + if ((thread->seccomp.mode == SECCOMP_MODE_DISABLED ||
>>>> + thread->seccomp.mode == SECCOMP_MODE_FILTER) &&
>>>> + is_ancestor(thread->seccomp.filter,
>>>> + caller->seccomp.filter)) {
>>>> + /* Get a task reference for the new leaf node. */
>>>> + get_seccomp_filter(caller);
>>>> + /*
>>>> + * Drop the task reference to the shared ancestor since
>>>> + * current's path will hold a reference. (This also
>>>> + * allows a put before the assignment.)
>>>> + */
>>>> + put_seccomp_filter(thread);
>>>> + thread->seccomp.filter = caller->seccomp.filter;
>>>> + /* Opt the other thread into seccomp if needed.
>>>> + * As threads are considered to be trust-realm
>>>> + * equivalent (see ptrace_may_access), it is safe to
>>>> + * allow one thread to transition the other.
>>>> + */
>>>> + if (thread->seccomp.mode == SECCOMP_MODE_DISABLED) {
>>>> + thread->seccomp.mode = SECCOMP_MODE_FILTER;
>>>> + /*
>>>> + * Don't let an unprivileged task work around
>>>> + * the no_new_privs restriction by creating
>>>> + * a thread that sets it up, enters seccomp,
>>>> + * then dies.
>>>> + */
>>>> + if (caller->no_new_privs)
>>>> + thread->no_new_privs = 1;
>>>> + set_tsk_thread_flag(thread, TIF_SECCOMP);
>>>
>>> no_new_privs is a bitfield, and some of the other bits in there look
>>> like things that might not want to be read and written back from another
>>> thread.
>>
>> Ah :/ Good catch!
>>
>>> Would it be too annoying to require that the other threads already have
>>> no_new_privs set?
>>
>> Hrm, it's pretty painful in the edge cases where you don't control the
>> process initialization which might setup threads you need to ensnare.
>>
>> Would it be crazy to do something like below in sched.h?
>> - unsigned no_new_privs:1;
>> + unsigned no_new_privs;
>
> set_bit, etc. would also work. (Although there isn't a 32-bit set_bit
> AFAIK, or at least there isn't one that works on 64-bit BE archs.)
I wasn't sure if I could use set_bit() in a way that wouldn't get me
banned from submitting patches forever :)
> Also, is 'unsigned' actually safe for this purpose, on all supported
> archs/compilers? I'm pretty sure it's okay by C++11 rules, but those
> don't apply here. Maybe some day the kernel will move to C11 and life
> will be good.
>
>>
>> It feels like a big hammer though, but it also seems weird to wrap those
>> bitfields with task_lock. Any suggestions are welcome! I'll think about
>> this a bit more and see if there is a good way to do this transition
>> safely and cheaply.
>
> Hmm. I bet you could move no_new_privs somewhere else in task_lock
> where there's a bit free. It could also go in 'struct creds', but I
> think that's even worse from your perspective.
>
> Here's another dumb idea: Add an accessor task_no_new_privs(struct
> task_struct *) and move no_new_privs into struct seccomp (i.e. make it
> a bit in the seccomp mode). It kind of sucks on !CONFIG_SECCOMP, but
> it's free if CONFIG_SECCOMP.
That'd certainly be fine with me. I was considering adding a
"needs_transition" bit to struct seccomp, but moving nnp there could be tidy.
I'd need to make sure reading it locklessly still makes sense, but I really
don't want to put a lock on the syscall path...
> P.S. Have you seen the Linux Capsicum port? It fiddles with seccomp
> mode, too, and I suspect it needs a fair amount of work, but I really
> like the idea.
Yup - that's a whole other thread to be had! It sets a new mode as short-hand,
but what it needs really is a seccomp<->LSM interaction mechanism. That's on
my todo list next to help sort out.
thanks!
On Tue, Jan 14, 2014 at 1:21 PM, Oleg Nesterov <[email protected]> wrote:
> On 01/13, Will Drewry wrote:
>>
>> +static pid_t seccomp_sync_threads(void)
>> +{
>> + struct task_struct *thread, *caller;
>> + pid_t failed = 0;
>> + thread = caller = current;
>> +
>> + read_lock(&tasklist_lock);
>> + if (thread_group_empty(caller))
>> + goto done;
>> + while_each_thread(caller, thread) {
>> + task_lock(thread);
>
> perhaps we take task_lock() to serialize with another caller of
> seccomp_sync_threads()...
Sorry for the patch being unclear! The task_lock is meant to protect
against the assignment of a seccomp.filter pointer which was meant to
protect against the target task calling seccomp_attach_filter while
another task is changing its filter pointer.
> If yes, then perhaps you can use ->siglock instead of tasklist_lock
> and do not use task_lock(). It would be even better to rely on rcu,
> but:
Exactly. The siglock seems like it makes more sense.
>> + get_seccomp_filter(caller);
>> + /*
>> + * Drop the task reference to the shared ancestor since
>> + * current's path will hold a reference. (This also
>> + * allows a put before the assignment.)
>> + */
>> + put_seccomp_filter(thread);
>> + thread->seccomp.filter = caller->seccomp.filter;
>
> As I said, I do not understand this patch yet, but this looks suspicious.
>
> Why we can't race with this thread doing clone(CLONE_THREAD) ? We do
> not the the new thread yet, but its ->seccomp can be already copied
> by copy_process(), no?
Yeah I missed that. That said, I think the worst of it would be that
the new thread
gets the old filter. It should still get_seccomp_filter() its own
reference to the filter.
I'm not clear if it's possible to ensure that there is no pathological
condition where
a thread races and is created without being synchronized. I'll see if
the siglock helps
here and walk the clone() code again to see what else I missed.
thanks!
On Tue, Jan 14, 2014 at 12:59 PM, Will Drewry <[email protected]> wrote:
> On Tue, Jan 14, 2014 at 2:24 PM, Andy Lutomirski <[email protected]> wrote:
>> On Tue, Jan 14, 2014 at 10:59 AM, Will Drewry <[email protected]> wrote:
>>> On Mon, Jan 13, 2014 at 5:36 PM, Andy Lutomirski <[email protected]> wrote:
>>>> On 01/13/2014 12:30 PM, Will Drewry wrote:
>>>>> Applying restrictive seccomp filter programs to large or diverse
>>>>> codebases often requires handling threads which may be started early in
>>>>> the process lifetime (e.g., by code that is linked in). While it is
>>>>> possible to apply permissive programs prior to process start up, it is
>>>>> difficult to further restrict the kernel ABI to those threads after that
>>>>> point.
>>>>>
>>>>> This change adds a new seccomp "extension" for synchronizing thread
>>>>> group seccomp filters and a prctl() for accessing that functionality.
>>>>> The need for the added prctl() is due to the lack of reserved arguments
>>>>> in PR_SET_SECCOMP.
>>>>>
>>>>> When prctl(PR_SECCOMP_EXT, SECCOMP_EXT_ACT_TSYNC, 0, 0) is called, it
>>>>> will attempt to synchronize all threads in current's threadgroup to its
>>>>> seccomp filter program. This is possible iff all threads are using a
>>>>> filter that is an ancestor to the filter current is attempting to
>>>>> synchronize to. NULL filters (where the task is running as
>>>>> SECCOMP_MODE_NONE) are also treated as ancestors allowing threads to be
>>>>> transitioned into SECCOMP_MODE_FILTER. On success, 0 is returned. On
>>>>> failure, the pid of one of the failing threads will be returned.
>>>>>
>>>>> Suggested-by: Julien Tinnes <[email protected]>
>>>>> Signed-off-by: Will Drewry <[email protected]>
>>>>> ---
>>>>> include/linux/seccomp.h | 7 +++
>>>>> include/uapi/linux/prctl.h | 6 ++
>>>>> include/uapi/linux/seccomp.h | 6 ++
>>>>> kernel/seccomp.c | 128 ++++++++++++++++++++++++++++++++++++++++++
>>>>> kernel/sys.c | 3 +
>>>>> 5 files changed, 150 insertions(+)
>>>>>
>>>>> diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
>>>>> index 85c0895..3163db6 100644
>>>>> --- a/include/linux/seccomp.h
>>>>> +++ b/include/linux/seccomp.h
>>>>> @@ -77,6 +77,8 @@ static inline int seccomp_mode(struct seccomp *s)
>>>>> extern void put_seccomp_filter(struct task_struct *tsk);
>>>>> extern void get_seccomp_filter(struct task_struct *tsk);
>>>>> extern u32 seccomp_bpf_load(int off);
>>>>> +extern long prctl_seccomp_ext(unsigned long, unsigned long,
>>>>> + unsigned long, unsigned long);
>>>>> #else /* CONFIG_SECCOMP_FILTER */
>>>>> static inline void put_seccomp_filter(struct task_struct *tsk)
>>>>> {
>>>>> @@ -86,5 +88,10 @@ static inline void get_seccomp_filter(struct task_struct *tsk)
>>>>> {
>>>>> return;
>>>>> }
>>>>> +static inline long prctl_seccomp_ext(unsigned long arg2, unsigned long arg3,
>>>>> + unsigned long arg4, unsigned long arg5)
>>>>> +{
>>>>> + return -EINVAL;
>>>>> +}
>>>>> #endif /* CONFIG_SECCOMP_FILTER */
>>>>> #endif /* _LINUX_SECCOMP_H */
>>>>> diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
>>>>> index 289760f..5dcd5d3 100644
>>>>> --- a/include/uapi/linux/prctl.h
>>>>> +++ b/include/uapi/linux/prctl.h
>>>>> @@ -149,4 +149,10 @@
>>>>>
>>>>> #define PR_GET_TID_ADDRESS 40
>>>>>
>>>>> +/*
>>>>> + * Access seccomp extensions
>>>>> + * See Documentation/prctl/seccomp_filter.txt for more details.
>>>>> + */
>>>>> +#define PR_SECCOMP_EXT 41
>>>>> +
>>>>> #endif /* _LINUX_PRCTL_H */
>>>>> diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h
>>>>> index ac2dc9f..49b5279 100644
>>>>> --- a/include/uapi/linux/seccomp.h
>>>>> +++ b/include/uapi/linux/seccomp.h
>>>>> @@ -10,6 +10,12 @@
>>>>> #define SECCOMP_MODE_STRICT 1 /* uses hard-coded filter. */
>>>>> #define SECCOMP_MODE_FILTER 2 /* uses user-supplied filter. */
>>>>>
>>>>> +/* Valid extension types as arg2 for prctl(PR_SECCOMP_EXT) */
>>>>> +#define SECCOMP_EXT_ACT 1
>>>>> +
>>>>> +/* Valid extension actions as arg3 to prctl(PR_SECCOMP_EXT, SECCOMP_EXT_ACT) */
>>>>> +#define SECCOMP_EXT_ACT_TSYNC 1 /* attempt to synchronize thread filters */
>>>>> +
>>>>> /*
>>>>> * All BPF programs must return a 32-bit value.
>>>>> * The bottom 16-bits are for optional return data.
>>>>> diff --git a/kernel/seccomp.c b/kernel/seccomp.c
>>>>> index 71512e4..8a0de7b 100644
>>>>> --- a/kernel/seccomp.c
>>>>> +++ b/kernel/seccomp.c
>>>>> @@ -24,6 +24,7 @@
>>>>> #ifdef CONFIG_SECCOMP_FILTER
>>>>> #include <asm/syscall.h>
>>>>> #include <linux/filter.h>
>>>>> +#include <linux/pid.h>
>>>>> #include <linux/ptrace.h>
>>>>> #include <linux/security.h>
>>>>> #include <linux/slab.h>
>>>>> @@ -220,6 +221,108 @@ static u32 seccomp_run_filters(int syscall)
>>>>> return ret;
>>>>> }
>>>>>
>>>>> +/* Returns 1 if the candidate is an ancestor. */
>>>>> +static int is_ancestor(struct seccomp_filter *candidate,
>>>>> + struct seccomp_filter *child)
>>>>> +{
>>>>> + /* NULL is the root ancestor. */
>>>>> + if (candidate == NULL)
>>>>> + return 1;
>>>>> + for (; child; child = child->prev)
>>>>> + if (child == candidate)
>>>>> + return 1;
>>>>> + return 0;
>>>>> +}
>>>>> +
>>>>> +/**
>>>>> + * seccomp_sync_threads: sets all threads to use current's filter
>>>>> + *
>>>>> + * Returns 0 on success or the pid of a thread which was either not
>>>>> + * in the correct seccomp mode or it did not have an ancestral
>>>>> + * seccomp filter. current must be in seccomp.mode=2 already.
>>>>> + */
>>>>> +static pid_t seccomp_sync_threads(void)
>>>>> +{
>>>>> + struct task_struct *thread, *caller;
>>>>> + pid_t failed = 0;
>>>>> + thread = caller = current;
>>>>> +
>>>>> + read_lock(&tasklist_lock);
>>>>> + if (thread_group_empty(caller))
>>>>> + goto done;
>>>>> + while_each_thread(caller, thread) {
>>>>> + task_lock(thread);
>>>>> + /*
>>>>> + * All threads must not be in SECCOMP_MODE_STRICT to
>>>>> + * be eligible for synchronization.
>>>>> + */
>>>>> + if ((thread->seccomp.mode == SECCOMP_MODE_DISABLED ||
>>>>> + thread->seccomp.mode == SECCOMP_MODE_FILTER) &&
>>>>> + is_ancestor(thread->seccomp.filter,
>>>>> + caller->seccomp.filter)) {
>>>>> + /* Get a task reference for the new leaf node. */
>>>>> + get_seccomp_filter(caller);
>>>>> + /*
>>>>> + * Drop the task reference to the shared ancestor since
>>>>> + * current's path will hold a reference. (This also
>>>>> + * allows a put before the assignment.)
>>>>> + */
>>>>> + put_seccomp_filter(thread);
>>>>> + thread->seccomp.filter = caller->seccomp.filter;
>>>>> + /* Opt the other thread into seccomp if needed.
>>>>> + * As threads are considered to be trust-realm
>>>>> + * equivalent (see ptrace_may_access), it is safe to
>>>>> + * allow one thread to transition the other.
>>>>> + */
>>>>> + if (thread->seccomp.mode == SECCOMP_MODE_DISABLED) {
>>>>> + thread->seccomp.mode = SECCOMP_MODE_FILTER;
>>>>> + /*
>>>>> + * Don't let an unprivileged task work around
>>>>> + * the no_new_privs restriction by creating
>>>>> + * a thread that sets it up, enters seccomp,
>>>>> + * then dies.
>>>>> + */
>>>>> + if (caller->no_new_privs)
>>>>> + thread->no_new_privs = 1;
>>>>> + set_tsk_thread_flag(thread, TIF_SECCOMP);
>>>>
>>>> no_new_privs is a bitfield, and some of the other bits in there look
>>>> like things that might not want to be read and written back from another
>>>> thread.
>>>
>>> Ah :/ Good catch!
>>>
>>>> Would it be too annoying to require that the other threads already have
>>>> no_new_privs set?
>>>
>>> Hrm, it's pretty painful in the edge cases where you don't control the
>>> process initialization which might setup threads you need to ensnare.
>>>
>>> Would it be crazy to do something like below in sched.h?
>>> - unsigned no_new_privs:1;
>>> + unsigned no_new_privs;
>>
>> set_bit, etc. would also work. (Although there isn't a 32-bit set_bit
>> AFAIK, or at least there isn't one that works on 64-bit BE archs.)
>
> I wasn't sure if I could use set_bit() in a way that wouldn't get me
> banned from submitting patches forever :)
You'd at least have to get rid of the bitfield -- set_bit on a
bitfield would be... bad. :)
>
>> Also, is 'unsigned' actually safe for this purpose, on all supported
>> archs/compilers? I'm pretty sure it's okay by C++11 rules, but those
>> don't apply here. Maybe some day the kernel will move to C11 and life
>> will be good.
>>
>>>
>>> It feels like a big hammer though, but it also seems weird to wrap those
>>> bitfields with task_lock. Any suggestions are welcome! I'll think about
>>> this a bit more and see if there is a good way to do this transition
>>> safely and cheaply.
>>
>> Hmm. I bet you could move no_new_privs somewhere else in task_lock
>> where there's a bit free. It could also go in 'struct creds', but I
>> think that's even worse from your perspective.
>>
>> Here's another dumb idea: Add an accessor task_no_new_privs(struct
>> task_struct *) and move no_new_privs into struct seccomp (i.e. make it
>> a bit in the seccomp mode). It kind of sucks on !CONFIG_SECCOMP, but
>> it's free if CONFIG_SECCOMP.
>
> That'd certainly be fine with me. I was considering adding a
> "needs_transition" bit to struct seccomp, but moving nnp there could be tidy.
> I'd need to make sure reading it locklessly still makes sense, but I really
> don't want to put a lock on the syscall path...
You may be able to cheat a bit: I don't think that reading
no_new_privs needs to be fast -- it only matters in a small set of
security-related syscalls and in execve.
Have you considered RCU for the seccomp state?
--Andy
On Tue, Jan 14, 2014 at 3:09 PM, Andy Lutomirski <[email protected]> wrote:
> On Tue, Jan 14, 2014 at 12:59 PM, Will Drewry <[email protected]> wrote:
>> On Tue, Jan 14, 2014 at 2:24 PM, Andy Lutomirski <[email protected]> wrote:
>>> On Tue, Jan 14, 2014 at 10:59 AM, Will Drewry <[email protected]> wrote:
>>>> On Mon, Jan 13, 2014 at 5:36 PM, Andy Lutomirski <[email protected]> wrote:
>>>>> On 01/13/2014 12:30 PM, Will Drewry wrote:
>>>>>> Applying restrictive seccomp filter programs to large or diverse
>>>>>> codebases often requires handling threads which may be started early in
>>>>>> the process lifetime (e.g., by code that is linked in). While it is
>>>>>> possible to apply permissive programs prior to process start up, it is
>>>>>> difficult to further restrict the kernel ABI to those threads after that
>>>>>> point.
>>>>>>
>>>>>> This change adds a new seccomp "extension" for synchronizing thread
>>>>>> group seccomp filters and a prctl() for accessing that functionality.
>>>>>> The need for the added prctl() is due to the lack of reserved arguments
>>>>>> in PR_SET_SECCOMP.
>>>>>>
>>>>>> When prctl(PR_SECCOMP_EXT, SECCOMP_EXT_ACT_TSYNC, 0, 0) is called, it
>>>>>> will attempt to synchronize all threads in current's threadgroup to its
>>>>>> seccomp filter program. This is possible iff all threads are using a
>>>>>> filter that is an ancestor to the filter current is attempting to
>>>>>> synchronize to. NULL filters (where the task is running as
>>>>>> SECCOMP_MODE_NONE) are also treated as ancestors allowing threads to be
>>>>>> transitioned into SECCOMP_MODE_FILTER. On success, 0 is returned. On
>>>>>> failure, the pid of one of the failing threads will be returned.
>>>>>>
>>>>>> Suggested-by: Julien Tinnes <[email protected]>
>>>>>> Signed-off-by: Will Drewry <[email protected]>
>>>>>> ---
>>>>>> include/linux/seccomp.h | 7 +++
>>>>>> include/uapi/linux/prctl.h | 6 ++
>>>>>> include/uapi/linux/seccomp.h | 6 ++
>>>>>> kernel/seccomp.c | 128 ++++++++++++++++++++++++++++++++++++++++++
>>>>>> kernel/sys.c | 3 +
>>>>>> 5 files changed, 150 insertions(+)
>>>>>>
>>>>>> diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
>>>>>> index 85c0895..3163db6 100644
>>>>>> --- a/include/linux/seccomp.h
>>>>>> +++ b/include/linux/seccomp.h
>>>>>> @@ -77,6 +77,8 @@ static inline int seccomp_mode(struct seccomp *s)
>>>>>> extern void put_seccomp_filter(struct task_struct *tsk);
>>>>>> extern void get_seccomp_filter(struct task_struct *tsk);
>>>>>> extern u32 seccomp_bpf_load(int off);
>>>>>> +extern long prctl_seccomp_ext(unsigned long, unsigned long,
>>>>>> + unsigned long, unsigned long);
>>>>>> #else /* CONFIG_SECCOMP_FILTER */
>>>>>> static inline void put_seccomp_filter(struct task_struct *tsk)
>>>>>> {
>>>>>> @@ -86,5 +88,10 @@ static inline void get_seccomp_filter(struct task_struct *tsk)
>>>>>> {
>>>>>> return;
>>>>>> }
>>>>>> +static inline long prctl_seccomp_ext(unsigned long arg2, unsigned long arg3,
>>>>>> + unsigned long arg4, unsigned long arg5)
>>>>>> +{
>>>>>> + return -EINVAL;
>>>>>> +}
>>>>>> #endif /* CONFIG_SECCOMP_FILTER */
>>>>>> #endif /* _LINUX_SECCOMP_H */
>>>>>> diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
>>>>>> index 289760f..5dcd5d3 100644
>>>>>> --- a/include/uapi/linux/prctl.h
>>>>>> +++ b/include/uapi/linux/prctl.h
>>>>>> @@ -149,4 +149,10 @@
>>>>>>
>>>>>> #define PR_GET_TID_ADDRESS 40
>>>>>>
>>>>>> +/*
>>>>>> + * Access seccomp extensions
>>>>>> + * See Documentation/prctl/seccomp_filter.txt for more details.
>>>>>> + */
>>>>>> +#define PR_SECCOMP_EXT 41
>>>>>> +
>>>>>> #endif /* _LINUX_PRCTL_H */
>>>>>> diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h
>>>>>> index ac2dc9f..49b5279 100644
>>>>>> --- a/include/uapi/linux/seccomp.h
>>>>>> +++ b/include/uapi/linux/seccomp.h
>>>>>> @@ -10,6 +10,12 @@
>>>>>> #define SECCOMP_MODE_STRICT 1 /* uses hard-coded filter. */
>>>>>> #define SECCOMP_MODE_FILTER 2 /* uses user-supplied filter. */
>>>>>>
>>>>>> +/* Valid extension types as arg2 for prctl(PR_SECCOMP_EXT) */
>>>>>> +#define SECCOMP_EXT_ACT 1
>>>>>> +
>>>>>> +/* Valid extension actions as arg3 to prctl(PR_SECCOMP_EXT, SECCOMP_EXT_ACT) */
>>>>>> +#define SECCOMP_EXT_ACT_TSYNC 1 /* attempt to synchronize thread filters */
>>>>>> +
>>>>>> /*
>>>>>> * All BPF programs must return a 32-bit value.
>>>>>> * The bottom 16-bits are for optional return data.
>>>>>> diff --git a/kernel/seccomp.c b/kernel/seccomp.c
>>>>>> index 71512e4..8a0de7b 100644
>>>>>> --- a/kernel/seccomp.c
>>>>>> +++ b/kernel/seccomp.c
>>>>>> @@ -24,6 +24,7 @@
>>>>>> #ifdef CONFIG_SECCOMP_FILTER
>>>>>> #include <asm/syscall.h>
>>>>>> #include <linux/filter.h>
>>>>>> +#include <linux/pid.h>
>>>>>> #include <linux/ptrace.h>
>>>>>> #include <linux/security.h>
>>>>>> #include <linux/slab.h>
>>>>>> @@ -220,6 +221,108 @@ static u32 seccomp_run_filters(int syscall)
>>>>>> return ret;
>>>>>> }
>>>>>>
>>>>>> +/* Returns 1 if the candidate is an ancestor. */
>>>>>> +static int is_ancestor(struct seccomp_filter *candidate,
>>>>>> + struct seccomp_filter *child)
>>>>>> +{
>>>>>> + /* NULL is the root ancestor. */
>>>>>> + if (candidate == NULL)
>>>>>> + return 1;
>>>>>> + for (; child; child = child->prev)
>>>>>> + if (child == candidate)
>>>>>> + return 1;
>>>>>> + return 0;
>>>>>> +}
>>>>>> +
>>>>>> +/**
>>>>>> + * seccomp_sync_threads: sets all threads to use current's filter
>>>>>> + *
>>>>>> + * Returns 0 on success or the pid of a thread which was either not
>>>>>> + * in the correct seccomp mode or it did not have an ancestral
>>>>>> + * seccomp filter. current must be in seccomp.mode=2 already.
>>>>>> + */
>>>>>> +static pid_t seccomp_sync_threads(void)
>>>>>> +{
>>>>>> + struct task_struct *thread, *caller;
>>>>>> + pid_t failed = 0;
>>>>>> + thread = caller = current;
>>>>>> +
>>>>>> + read_lock(&tasklist_lock);
>>>>>> + if (thread_group_empty(caller))
>>>>>> + goto done;
>>>>>> + while_each_thread(caller, thread) {
>>>>>> + task_lock(thread);
>>>>>> + /*
>>>>>> + * All threads must not be in SECCOMP_MODE_STRICT to
>>>>>> + * be eligible for synchronization.
>>>>>> + */
>>>>>> + if ((thread->seccomp.mode == SECCOMP_MODE_DISABLED ||
>>>>>> + thread->seccomp.mode == SECCOMP_MODE_FILTER) &&
>>>>>> + is_ancestor(thread->seccomp.filter,
>>>>>> + caller->seccomp.filter)) {
>>>>>> + /* Get a task reference for the new leaf node. */
>>>>>> + get_seccomp_filter(caller);
>>>>>> + /*
>>>>>> + * Drop the task reference to the shared ancestor since
>>>>>> + * current's path will hold a reference. (This also
>>>>>> + * allows a put before the assignment.)
>>>>>> + */
>>>>>> + put_seccomp_filter(thread);
>>>>>> + thread->seccomp.filter = caller->seccomp.filter;
>>>>>> + /* Opt the other thread into seccomp if needed.
>>>>>> + * As threads are considered to be trust-realm
>>>>>> + * equivalent (see ptrace_may_access), it is safe to
>>>>>> + * allow one thread to transition the other.
>>>>>> + */
>>>>>> + if (thread->seccomp.mode == SECCOMP_MODE_DISABLED) {
>>>>>> + thread->seccomp.mode = SECCOMP_MODE_FILTER;
>>>>>> + /*
>>>>>> + * Don't let an unprivileged task work around
>>>>>> + * the no_new_privs restriction by creating
>>>>>> + * a thread that sets it up, enters seccomp,
>>>>>> + * then dies.
>>>>>> + */
>>>>>> + if (caller->no_new_privs)
>>>>>> + thread->no_new_privs = 1;
>>>>>> + set_tsk_thread_flag(thread, TIF_SECCOMP);
>>>>>
>>>>> no_new_privs is a bitfield, and some of the other bits in there look
>>>>> like things that might not want to be read and written back from another
>>>>> thread.
>>>>
>>>> Ah :/ Good catch!
>>>>
>>>>> Would it be too annoying to require that the other threads already have
>>>>> no_new_privs set?
>>>>
>>>> Hrm, it's pretty painful in the edge cases where you don't control the
>>>> process initialization which might setup threads you need to ensnare.
>>>>
>>>> Would it be crazy to do something like below in sched.h?
>>>> - unsigned no_new_privs:1;
>>>> + unsigned no_new_privs;
>>>
>>> set_bit, etc. would also work. (Although there isn't a 32-bit set_bit
>>> AFAIK, or at least there isn't one that works on 64-bit BE archs.)
>>
>> I wasn't sure if I could use set_bit() in a way that wouldn't get me
>> banned from submitting patches forever :)
>
> You'd at least have to get rid of the bitfield -- set_bit on a
> bitfield would be... bad. :)
>
>>
>>> Also, is 'unsigned' actually safe for this purpose, on all supported
>>> archs/compilers? I'm pretty sure it's okay by C++11 rules, but those
>>> don't apply here. Maybe some day the kernel will move to C11 and life
>>> will be good.
>>>
>>>>
>>>> It feels like a big hammer though, but it also seems weird to wrap those
>>>> bitfields with task_lock. Any suggestions are welcome! I'll think about
>>>> this a bit more and see if there is a good way to do this transition
>>>> safely and cheaply.
>>>
>>> Hmm. I bet you could move no_new_privs somewhere else in task_lock
>>> where there's a bit free. It could also go in 'struct creds', but I
>>> think that's even worse from your perspective.
>>>
>>> Here's another dumb idea: Add an accessor task_no_new_privs(struct
>>> task_struct *) and move no_new_privs into struct seccomp (i.e. make it
>>> a bit in the seccomp mode). It kind of sucks on !CONFIG_SECCOMP, but
>>> it's free if CONFIG_SECCOMP.
>>
>> That'd certainly be fine with me. I was considering adding a
>> "needs_transition" bit to struct seccomp, but moving nnp there could be tidy.
>> I'd need to make sure reading it locklessly still makes sense, but I really
>> don't want to put a lock on the syscall path...
>
> You may be able to cheat a bit: I don't think that reading
> no_new_privs needs to be fast -- it only matters in a small set of
> security-related syscalls and in execve.
Good point.
> Have you considered RCU for the seccomp state?
I was hoping to avoid it and as much locking as possible, but
it may end up being where this needs to be. Especially as Oleg
points out deficiencies in my current locking strategy!
On 01/14, Will Drewry wrote:
>
> On Tue, Jan 14, 2014 at 1:21 PM, Oleg Nesterov <[email protected]> wrote:
>
> >> + get_seccomp_filter(caller);
> >> + /*
> >> + * Drop the task reference to the shared ancestor since
> >> + * current's path will hold a reference. (This also
> >> + * allows a put before the assignment.)
> >> + */
> >> + put_seccomp_filter(thread);
> >> + thread->seccomp.filter = caller->seccomp.filter;
> >
> > As I said, I do not understand this patch yet, but this looks suspicious.
> >
> > Why we can't race with this thread doing clone(CLONE_THREAD) ? We do
> > not the the new thread yet, but its ->seccomp can be already copied
> > by copy_process(), no?
>
> Yeah I missed that. That said, I think the worst of it would be that
> the new thread
> gets the old filter.
Yes, but this means you can trust SECCOMP_EXT_ACT_TSYNC.
> I'll see if
> the siglock helps
> here and walk the clone() code again to see what else I missed.
No, siglock itself can't help to avoid this race. Unless you move
copy_process()->get_seccomp_filter() under the same lock, and in
this case it should also re-copy ->seccomp. Not nice.
But note task_lock() (or any other per-thread locking) is wrong.
Just look at the code above. We hold task_lock(thread) but not
task_lock(caller). What if another thread calls seccomp_sync_threads()
and changes caller->seccomp right after get_seccomp_filter(caller).
And even get_seccomp_filter() itself becomes racy. I think the
locking is seriously broken in this series.
Oleg.
On 01/15, Oleg Nesterov wrote:
>
> I think the
> locking is seriously broken in this series.
And imho seccomp_sync_threads() should fail "safely".
IOW, I think it should do while_each_thread() twice. The first
iteration should just check SECCOMP_MODE_FILTER/is_ancestor()
and fail if necessary. The 2nd one should change other threads.
Btw, it seems that is_ancestor() doesn't need the NULL check,
it is not called if SECCOMP_MODE_DISABLED ?
Oleg.
On Wed, Jan 15, 2014 at 1:04 PM, Oleg Nesterov <[email protected]> wrote:
> On 01/14, Will Drewry wrote:
>>
>> On Tue, Jan 14, 2014 at 1:21 PM, Oleg Nesterov <[email protected]> wrote:
>>
>> >> + get_seccomp_filter(caller);
>> >> + /*
>> >> + * Drop the task reference to the shared ancestor since
>> >> + * current's path will hold a reference. (This also
>> >> + * allows a put before the assignment.)
>> >> + */
>> >> + put_seccomp_filter(thread);
>> >> + thread->seccomp.filter = caller->seccomp.filter;
>> >
>> > As I said, I do not understand this patch yet, but this looks suspicious.
>> >
>> > Why we can't race with this thread doing clone(CLONE_THREAD) ? We do
>> > not the the new thread yet, but its ->seccomp can be already copied
>> > by copy_process(), no?
>>
>> Yeah I missed that. That said, I think the worst of it would be that
>> the new thread
>> gets the old filter.
>
> Yes, but this means you can trust SECCOMP_EXT_ACT_TSYNC.
>
>> I'll see if
>> the siglock helps
>> here and walk the clone() code again to see what else I missed.
>
> No, siglock itself can't help to avoid this race. Unless you move
> copy_process()->get_seccomp_filter() under the same lock, and in
> this case it should also re-copy ->seccomp. Not nice.
Yeah - not at all. I'll rethink it. I was too excited about how easy
is_ancestor works, but the locking is really the hard part.
> But note task_lock() (or any other per-thread locking) is wrong.
> Just look at the code above. We hold task_lock(thread) but not
> task_lock(caller). What if another thread calls seccomp_sync_threads()
> and changes caller->seccomp right after get_seccomp_filter(caller).
Yup - I was thinking of tasklist_lock as a non-multi-reader lock,
which is wrong.
The task_lock(current) would clearly cover that case, but I need to walk through
all the interactions paying more attention to the lock being used.
> And even get_seccomp_filter() itself becomes racy. I think the
> locking is seriously broken in this series.
It certainly needs to be better applied :)
thanks!