This series is the result of Fabricio and me going around a few times
on possible ways to enhance RET_KILL to kill the process group. There
are a lot of ways this could be done, but I wanted the one that felt
cleanest. As it happens, Tyler's recent patch series for logging
improvements also needs to know a little bit more during filter runs,
and the solution for both is to pass back the matched filter. This
lets us examine it here for RET_KILL and in the future for the
logging changes.
The filter passing is patch 1; the new flag for RET_KILL is patch 2.
Some test refactoring is in patch 3 for the RET_DATA ordering, and
patch 4 is the test for the new RET_KILL flag.
Please take a look!
Thanks,
-Kees
SECCOMP_RET_KILL is supposed to kill the current thread (and userspace
depends on this), so test for this, distinct from killing the entire
process. This also tests killing the entire process with the new
SECCOMP_FILTER_FLAG_KILL_PROCESS flag. (This also moves a bunch of
defines up earlier in the file to use them earlier.)
Signed-off-by: Kees Cook <[email protected]>
---
tools/testing/selftests/seccomp/seccomp_bpf.c | 182 ++++++++++++++++++++------
1 file changed, 141 insertions(+), 41 deletions(-)
diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c
index ee78a53da5d1..68b9faf23ca6 100644
--- a/tools/testing/selftests/seccomp/seccomp_bpf.c
+++ b/tools/testing/selftests/seccomp/seccomp_bpf.c
@@ -87,6 +87,51 @@ struct seccomp_data {
};
#endif
+#ifndef __NR_seccomp
+# if defined(__i386__)
+# define __NR_seccomp 354
+# elif defined(__x86_64__)
+# define __NR_seccomp 317
+# elif defined(__arm__)
+# define __NR_seccomp 383
+# elif defined(__aarch64__)
+# define __NR_seccomp 277
+# elif defined(__hppa__)
+# define __NR_seccomp 338
+# elif defined(__powerpc__)
+# define __NR_seccomp 358
+# elif defined(__s390__)
+# define __NR_seccomp 348
+# else
+# warning "seccomp syscall number unknown for this architecture"
+# define __NR_seccomp 0xffff
+# endif
+#endif
+
+#ifndef SECCOMP_SET_MODE_STRICT
+#define SECCOMP_SET_MODE_STRICT 0
+#endif
+
+#ifndef SECCOMP_SET_MODE_FILTER
+#define SECCOMP_SET_MODE_FILTER 1
+#endif
+
+#ifndef SECCOMP_FILTER_FLAG_TSYNC
+#define SECCOMP_FILTER_FLAG_TSYNC 1
+#endif
+
+#ifndef SECCOMP_FILTER_FLAG_KILL_PROCESS
+#define SECCOMP_FILTER_FLAG_KILL_PROCESS 2
+#endif
+
+#ifndef seccomp
+int seccomp(unsigned int op, unsigned int flags, void *args)
+{
+ errno = 0;
+ return syscall(__NR_seccomp, op, flags, args);
+}
+#endif
+
#if __BYTE_ORDER == __LITTLE_ENDIAN
#define syscall_arg(_n) (offsetof(struct seccomp_data, args[_n]))
#elif __BYTE_ORDER == __BIG_ENDIAN
@@ -520,6 +565,102 @@ TEST_SIGNAL(KILL_one_arg_six, SIGSYS)
close(fd);
}
+/* This is a thread task to die via seccomp filter violation. */
+void *kill_thread(void *data)
+{
+ bool die = (bool)data;
+
+ if (die) {
+ prctl(PR_GET_SECCOMP, 0, 0, 0, 0);
+ return (void *)SIBLING_EXIT_FAILURE;
+ }
+
+ return (void *)SIBLING_EXIT_UNKILLED;
+}
+
+/* Prepare a thread that will kill itself or both of us. */
+void kill_thread_or_group(struct __test_metadata *_metadata, bool kill_process)
+{
+ pthread_t thread;
+ void *status;
+ unsigned int flags;
+ /* Kill only when calling __NR_prctl. */
+ struct sock_filter filter[] = {
+ BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
+ offsetof(struct seccomp_data, nr)),
+ BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_prctl, 0, 1),
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
+ };
+ struct sock_fprog prog = {
+ .len = (unsigned short)ARRAY_SIZE(filter),
+ .filter = filter,
+ };
+
+ ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
+ TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
+ }
+
+ flags = kill_process ? SECCOMP_FILTER_FLAG_KILL_PROCESS : 0;
+ ASSERT_EQ(0, seccomp(SECCOMP_SET_MODE_FILTER, flags, &prog)) {
+ if (kill_process)
+ TH_LOG("Kernel does not support SECCOMP_FILTER_FLAG_KILL_PROCESS");
+ else
+ TH_LOG("Kernel does not support seccomp syscall");
+ }
+
+ /* Start a thread that will exit immediately. */
+ ASSERT_EQ(0, pthread_create(&thread, NULL, kill_thread, (void *)false));
+ ASSERT_EQ(0, pthread_join(thread, &status));
+ ASSERT_EQ(SIBLING_EXIT_UNKILLED, (unsigned long)status);
+
+ /* Start a thread that will die immediately. */
+ ASSERT_EQ(0, pthread_create(&thread, NULL, kill_thread, (void *)true));
+ ASSERT_EQ(0, pthread_join(thread, &status));
+ ASSERT_NE(SIBLING_EXIT_FAILURE, (unsigned long)status);
+
+ /* Only the thread died. Let parent know this thread didn't die. */
+ exit(42);
+}
+
+TEST(KILL_thread)
+{
+ int status;
+ pid_t child_pid;
+
+ child_pid = fork();
+ ASSERT_LE(0, child_pid);
+ if (child_pid == 0) {
+ kill_thread_or_group(_metadata, false);
+ _exit(38);
+ }
+
+ ASSERT_EQ(child_pid, waitpid(child_pid, &status, 0));
+
+ /* If only the thread was killed, we'll see exit 42. */
+ ASSERT_EQ(1, WIFEXITED(status));
+ ASSERT_EQ(42, WEXITSTATUS(status));
+}
+
+TEST(KILL_process)
+{
+ int status;
+ pid_t child_pid;
+
+ child_pid = fork();
+ ASSERT_LE(0, child_pid);
+ if (child_pid == 0) {
+ kill_thread_or_group(_metadata, true);
+ _exit(38);
+ }
+
+ ASSERT_EQ(child_pid, waitpid(child_pid, &status, 0));
+
+ /* If the entire process was killed, we'll see SIGSYS. */
+ ASSERT_EQ(1, WIFSIGNALED(status));
+ ASSERT_EQ(SIGSYS, WTERMSIG(status));
+}
+
/* TODO(wad) add 64-bit versus 32-bit arg tests. */
TEST(arg_out_of_range)
{
@@ -1675,47 +1816,6 @@ TEST_F_SIGNAL(TRACE_syscall, kill_after_ptrace, SIGSYS)
EXPECT_NE(self->mypid, syscall(__NR_getpid));
}
-#ifndef __NR_seccomp
-# if defined(__i386__)
-# define __NR_seccomp 354
-# elif defined(__x86_64__)
-# define __NR_seccomp 317
-# elif defined(__arm__)
-# define __NR_seccomp 383
-# elif defined(__aarch64__)
-# define __NR_seccomp 277
-# elif defined(__hppa__)
-# define __NR_seccomp 338
-# elif defined(__powerpc__)
-# define __NR_seccomp 358
-# elif defined(__s390__)
-# define __NR_seccomp 348
-# else
-# warning "seccomp syscall number unknown for this architecture"
-# define __NR_seccomp 0xffff
-# endif
-#endif
-
-#ifndef SECCOMP_SET_MODE_STRICT
-#define SECCOMP_SET_MODE_STRICT 0
-#endif
-
-#ifndef SECCOMP_SET_MODE_FILTER
-#define SECCOMP_SET_MODE_FILTER 1
-#endif
-
-#ifndef SECCOMP_FILTER_FLAG_TSYNC
-#define SECCOMP_FILTER_FLAG_TSYNC 1
-#endif
-
-#ifndef seccomp
-int seccomp(unsigned int op, unsigned int flags, void *args)
-{
- errno = 0;
- return syscall(__NR_seccomp, op, flags, args);
-}
-#endif
-
TEST(seccomp_syscall)
{
struct sock_filter filter[] = {
--
2.7.4
Both the upcoming logging improvements and changes to RET_KILL will need
to know which filter a given seccomp return value originated from. In
order to delay logic processing of the result until after the seccomp loop,
this adds a single pointer assignment on matches. This will allow both
log and RET_KILL logic to work off the filter rather than doing more
expensive tests inside the time-critical run_filters loop.
Running tight cycles of getpid() with filters attached shows no measurable
difference in speed.
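For the curious, the measurement was nothing fancy; a rough sketch of
the kind of loop used (not the exact harness, and with the actual
filter attachment elided):

    #include <stdio.h>
    #include <time.h>
    #include <unistd.h>
    #include <sys/syscall.h>

    int main(void)
    {
            struct timespec start, end;
            long i, iters = 10000000;

            /* ... attach one or more trivial RET_ALLOW filters here ... */

            clock_gettime(CLOCK_MONOTONIC, &start);
            for (i = 0; i < iters; i++)
                    syscall(__NR_getpid);
            clock_gettime(CLOCK_MONOTONIC, &end);

            printf("%.1f ns per call\n",
                   ((end.tv_sec - start.tv_sec) * 1e9 +
                    (end.tv_nsec - start.tv_nsec)) / iters);
            return 0;
    }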
Suggested-by: Tyler Hicks <[email protected]>
Signed-off-by: Kees Cook <[email protected]>
---
kernel/seccomp.c | 11 ++++++++---
1 file changed, 8 insertions(+), 3 deletions(-)
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 98b59b5db90b..8bdcf01379e4 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -171,10 +171,12 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)
/**
* seccomp_run_filters - evaluates all seccomp filters against @sd
* @sd: optional seccomp data to be passed to filters
+ * @match: stores struct seccomp_filter that resulted in the return value
*
* Returns valid seccomp BPF response codes.
*/
-static u32 seccomp_run_filters(const struct seccomp_data *sd)
+static u32 seccomp_run_filters(const struct seccomp_data *sd,
+ struct seccomp_filter **match)
{
struct seccomp_data sd_local;
u32 ret = SECCOMP_RET_ALLOW;
@@ -198,8 +200,10 @@ static u32 seccomp_run_filters(const struct seccomp_data *sd)
for (; f; f = f->prev) {
u32 cur_ret = BPF_PROG_RUN(f->prog, sd);
- if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION))
+ if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION)) {
ret = cur_ret;
+ *match = f;
+ }
}
return ret;
}
@@ -566,6 +570,7 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
const bool recheck_after_trace)
{
u32 filter_ret, action;
+ struct seccomp_filter *match = NULL;
int data;
/*
@@ -574,7 +579,7 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
*/
rmb();
- filter_ret = seccomp_run_filters(sd);
+ filter_ret = seccomp_run_filters(sd, &match);
data = filter_ret & SECCOMP_RET_DATA;
action = filter_ret & SECCOMP_RET_ACTION;
--
2.7.4
This refactors the errno tests (since they all use the same pattern for
their filter) and adds a RET_DATA field ordering test.
Signed-off-by: Kees Cook <[email protected]>
---
tools/testing/selftests/seccomp/seccomp_bpf.c | 95 ++++++++++++++++-----------
1 file changed, 58 insertions(+), 37 deletions(-)
diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c
index 73f5ea6778ce..ee78a53da5d1 100644
--- a/tools/testing/selftests/seccomp/seccomp_bpf.c
+++ b/tools/testing/selftests/seccomp/seccomp_bpf.c
@@ -136,7 +136,7 @@ TEST(no_new_privs_support)
}
}
-/* Tests kernel support by checking for a copy_from_user() fault on * NULL. */
+/* Tests kernel support by checking for a copy_from_user() fault on NULL. */
TEST(mode_filter_support)
{
long ret;
@@ -541,26 +541,30 @@ TEST(arg_out_of_range)
EXPECT_EQ(EINVAL, errno);
}
+#define ERRNO_FILTER(name, errno) \
+ struct sock_filter _read_filter_##name[] = { \
+ BPF_STMT(BPF_LD|BPF_W|BPF_ABS, \
+ offsetof(struct seccomp_data, nr)), \
+ BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_read, 0, 1), \
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ERRNO | errno), \
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW), \
+ }; \
+ struct sock_fprog prog_##name = { \
+ .len = (unsigned short)ARRAY_SIZE(_read_filter_##name), \
+ .filter = _read_filter_##name, \
+ }
+
+/* Make sure basic errno values are correctly passed through a filter. */
TEST(ERRNO_valid)
{
- struct sock_filter filter[] = {
- BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
- offsetof(struct seccomp_data, nr)),
- BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_read, 0, 1),
- BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ERRNO | E2BIG),
- BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
- };
- struct sock_fprog prog = {
- .len = (unsigned short)ARRAY_SIZE(filter),
- .filter = filter,
- };
+ ERRNO_FILTER(valid, E2BIG);
long ret;
pid_t parent = getppid();
ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
ASSERT_EQ(0, ret);
- ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog_valid);
ASSERT_EQ(0, ret);
EXPECT_EQ(parent, syscall(__NR_getppid));
@@ -568,26 +572,17 @@ TEST(ERRNO_valid)
EXPECT_EQ(E2BIG, errno);
}
+/* Make sure an errno of zero is correctly handled by the arch code. */
TEST(ERRNO_zero)
{
- struct sock_filter filter[] = {
- BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
- offsetof(struct seccomp_data, nr)),
- BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_read, 0, 1),
- BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ERRNO | 0),
- BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
- };
- struct sock_fprog prog = {
- .len = (unsigned short)ARRAY_SIZE(filter),
- .filter = filter,
- };
+ ERRNO_FILTER(zero, 0);
long ret;
pid_t parent = getppid();
ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
ASSERT_EQ(0, ret);
- ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog_zero);
ASSERT_EQ(0, ret);
EXPECT_EQ(parent, syscall(__NR_getppid));
@@ -595,26 +590,21 @@ TEST(ERRNO_zero)
EXPECT_EQ(0, read(0, NULL, 0));
}
+/*
+ * The SECCOMP_RET_DATA mask is 16 bits wide, but errno is smaller.
+ * This tests that the errno value gets capped correctly, fixed by
+ * 580c57f10768 ("seccomp: cap SECCOMP_RET_ERRNO data to MAX_ERRNO").
+ */
TEST(ERRNO_capped)
{
- struct sock_filter filter[] = {
- BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
- offsetof(struct seccomp_data, nr)),
- BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_read, 0, 1),
- BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ERRNO | 4096),
- BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
- };
- struct sock_fprog prog = {
- .len = (unsigned short)ARRAY_SIZE(filter),
- .filter = filter,
- };
+ ERRNO_FILTER(capped, 4096);
long ret;
pid_t parent = getppid();
ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
ASSERT_EQ(0, ret);
- ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog_capped);
ASSERT_EQ(0, ret);
EXPECT_EQ(parent, syscall(__NR_getppid));
@@ -622,6 +612,37 @@ TEST(ERRNO_capped)
EXPECT_EQ(4095, errno);
}
+/*
+ * Filters are processed in reverse order: last applied is executed first.
+ * Since only the SECCOMP_RET_ACTION mask is tested for return values, the
+ * SECCOMP_RET_DATA mask results will follow the most recently applied
+ * matching filter return (and not the lowest or highest value).
+ */
+TEST(ERRNO_order)
+{
+ ERRNO_FILTER(first, 11);
+ ERRNO_FILTER(second, 13);
+ ERRNO_FILTER(third, 12);
+ long ret;
+ pid_t parent = getppid();
+
+ ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
+ ASSERT_EQ(0, ret);
+
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog_first);
+ ASSERT_EQ(0, ret);
+
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog_second);
+ ASSERT_EQ(0, ret);
+
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog_third);
+ ASSERT_EQ(0, ret);
+
+ EXPECT_EQ(parent, syscall(__NR_getppid));
+ EXPECT_EQ(-1, read(0, NULL, 0));
+ EXPECT_EQ(12, errno);
+}
+
FIXTURE_DATA(TRAP) {
struct sock_fprog prog;
};
--
2.7.4
Right now, SECCOMP_RET_KILL kills the current thread. There have been
a few requests for RET_KILL to kill the entire process (the thread
group), but since seccomp's u32 return values are ABI, and ordered by
lowest value, with RET_KILL as 0, there isn't a trivial way to provide
an even smaller value that would mean the more restrictive action of
killing the thread group.
Instead, create a filter flag that indicates that a RET_KILL from this
filter must kill the process rather than the thread. This can be set
(and not cleared) via the new SECCOMP_FILTER_FLAG_KILL_PROCESS flag.
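As a side note for userspace: probing for the new flag can look
something like the sketch below (not part of this patch; the helper
name is made up, it assumes a seccomp() wrapper and errno/stdbool
includes like those in the selftests, and it relies on flags being
validated before the filter pointer is copied from userspace):

    /* Returns true if the running kernel recognizes the filter flag. */
    static bool have_filter_flag(unsigned int flag)
    {
            /*
             * An unknown flag fails with EINVAL before the (NULL)
             * filter argument is examined; a recognized flag gets as
             * far as copying the filter and fails with EFAULT.
             */
            return seccomp(SECCOMP_SET_MODE_FILTER, flag, NULL) < 0 &&
                   errno == EFAULT;
    }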
Pros:
- the logic for the filter action is contained in the filter.
- userspace can detect support for the feature since earlier kernels
will reject the new flag.
Cons:
- depends on adding an assignment to the seccomp_run_filters() loop
(previous patch).
Alternatives to this approach with pros/cons:
- Use a new test during seccomp_run_filters() that treats the RET_DATA
mask of a RET_KILL action as special. If a new bit is set in the data,
then treat the return value as -1 (lower than 0).
Pros:
- the logic for the filter action is contained in the filter.
Cons:
- added complexity to time-sensitive seccomp_run_filters() loop.
- there isn't a trivial way for userspace to detect if the kernel
supports the feature (earlier kernels will silently ignore the
RET_DATA and only kill the thread).
- Have SECCOMP_FILTER_FLAG_KILL_PROCESS attach to the seccomp struct
rather than the filter.
Pros:
- no change needed to seccomp_run_filters() loop.
Cons:
- the change in behavior technically originates external to the
filter, which allows for later filters to "enhance" a previously
applied filter's RET_KILL to kill the entire process, which may
be unexpected.
Signed-off-by: Kees Cook <[email protected]>
---
include/linux/seccomp.h | 3 ++-
include/uapi/linux/seccomp.h | 3 ++-
kernel/seccomp.c | 12 +++++++++++-
3 files changed, 15 insertions(+), 3 deletions(-)
diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
index ecc296c137cd..59d001ba655c 100644
--- a/include/linux/seccomp.h
+++ b/include/linux/seccomp.h
@@ -3,7 +3,8 @@
#include <uapi/linux/seccomp.h>
-#define SECCOMP_FILTER_FLAG_MASK (SECCOMP_FILTER_FLAG_TSYNC)
+#define SECCOMP_FILTER_FLAG_MASK (SECCOMP_FILTER_FLAG_TSYNC | \
+ SECCOMP_FILTER_FLAG_KILL_PROCESS)
#ifdef CONFIG_SECCOMP
diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h
index 0f238a43ff1e..4b75d8c297b6 100644
--- a/include/uapi/linux/seccomp.h
+++ b/include/uapi/linux/seccomp.h
@@ -15,7 +15,8 @@
#define SECCOMP_SET_MODE_FILTER 1
/* Valid flags for SECCOMP_SET_MODE_FILTER */
-#define SECCOMP_FILTER_FLAG_TSYNC 1
+#define SECCOMP_FILTER_FLAG_TSYNC 1
+#define SECCOMP_FILTER_FLAG_KILL_PROCESS 2
/*
* All BPF programs must return a 32-bit value.
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 8bdcf01379e4..931eb9cbd093 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -44,6 +44,7 @@
* is only needed for handling filters shared across tasks.
* @prev: points to a previously installed, or inherited, filter
* @prog: the BPF program to evaluate
+ * @kill_process: if true, RET_KILL will kill process rather than thread.
*
* seccomp_filter objects are organized in a tree linked via the @prev
* pointer. For any task, it appears to be a singly-linked list starting
@@ -59,6 +60,7 @@ struct seccomp_filter {
refcount_t usage;
struct seccomp_filter *prev;
struct bpf_prog *prog;
+ bool kill_process;
};
/* Limit any path through the tree to 256KB worth of instructions. */
@@ -448,6 +450,10 @@ static long seccomp_attach_filter(unsigned int flags,
return ret;
}
+ /* Set process-killing flag, if present. */
+ if (flags & SECCOMP_FILTER_FLAG_KILL_PROCESS)
+ filter->kill_process = true;
+
/*
* If there is an existing filter, make it the prev and don't drop its
* task reference.
@@ -658,7 +664,11 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
seccomp_init_siginfo(&info, this_syscall, data);
do_coredump(&info);
}
- do_exit(SIGSYS);
+ /* Kill entire thread group if requested (or went haywire). */
+ if (!match || match->kill_process)
+ do_group_exit(SIGSYS);
+ else
+ do_exit(SIGSYS);
}
unreachable();
--
2.7.4
On 08/02/2017 10:19 PM, Kees Cook wrote:
> Both the upcoming logging improvements and changes to RET_KILL will need
> to know which filter a given seccomp return value originated from. In
> order to delay logic processing of the result until after the seccomp loop,
> this adds a single pointer assignment on matches. This will allow both
> log and RET_KILL logic to work off the filter rather than doing more
> expensive tests inside the time-critical run_filters loop.
>
> Running tight cycles of getpid() with filters attached shows no measurable
> difference in speed.
>
> Suggested-by: Tyler Hicks <[email protected]>
> Signed-off-by: Kees Cook <[email protected]>
> ---
> kernel/seccomp.c | 11 ++++++++---
> 1 file changed, 8 insertions(+), 3 deletions(-)
>
> diff --git a/kernel/seccomp.c b/kernel/seccomp.c
> index 98b59b5db90b..8bdcf01379e4 100644
> --- a/kernel/seccomp.c
> +++ b/kernel/seccomp.c
> @@ -171,10 +171,12 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)
> /**
> * seccomp_run_filters - evaluates all seccomp filters against @sd
> * @sd: optional seccomp data to be passed to filters
> + * @match: stores struct seccomp_filter that resulted in the return value
> *
> * Returns valid seccomp BPF response codes.
> */
> -static u32 seccomp_run_filters(const struct seccomp_data *sd)
> +static u32 seccomp_run_filters(const struct seccomp_data *sd,
> + struct seccomp_filter **match)
> {
> struct seccomp_data sd_local;
> u32 ret = SECCOMP_RET_ALLOW;
My version of this patch initialized *match to f here. The reason I did
that is because if BPF_PROG_RUN() returns RET_ALLOW for all
filters, I didn't want *match to remain NULL when seccomp_run_filters()
returns. Neither FILTER_FLAG_LOG nor FILTER_FLAG_KILL_PROCESS would be
affected by this, because they don't care about RET_ALLOW actions, but
there could conceivably be a filter flag in the future that cares about
RET_ALLOW, and not initializing *match to the first filter could result
in a latent bug for that filter flag.
I'm fine with not adding the initialization since this is a hot path and
it doesn't help any of the currently existing/planned filter flags but I
wanted to at least mention it.
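For reference, what I had in mind was just one extra assignment before
the walk (an untested sketch; f is the same local that
seccomp_run_filters() already initializes from current->seccomp.filter):

    /*
     * Start with the newest filter as the match, so *match can never
     * be NULL on return, even if every filter returns RET_ALLOW.
     */
    *match = f;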
Reviewed-by: Tyler Hicks <[email protected]>
Tyler
> @@ -198,8 +200,10 @@ static u32 seccomp_run_filters(const struct seccomp_data *sd)
> for (; f; f = f->prev) {
> u32 cur_ret = BPF_PROG_RUN(f->prog, sd);
>
> - if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION))
> + if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION)) {
> ret = cur_ret;
> + *match = f;
> + }
> }
> return ret;
> }
> @@ -566,6 +570,7 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
> const bool recheck_after_trace)
> {
> u32 filter_ret, action;
> + struct seccomp_filter *match = NULL;
> int data;
>
> /*
> @@ -574,7 +579,7 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
> */
> rmb();
>
> - filter_ret = seccomp_run_filters(sd);
> + filter_ret = seccomp_run_filters(sd, &match);
> data = filter_ret & SECCOMP_RET_DATA;
> action = filter_ret & SECCOMP_RET_ACTION;
>
>
On 08/02/2017 10:19 PM, Kees Cook wrote:
> Right now, SECCOMP_RET_KILL kills the current thread. There have been
> a few requests for RET_KILL to kill the entire process (the thread
> group), but since seccomp's u32 return values are ABI, and ordered by
> lowest value, with RET_KILL as 0, there isn't a trivial way to provide
> an even smaller value that would mean the more restrictive action of
> killing the thread group.
>
> Instead, create a filter flag that indicates that a RET_KILL from this
> filter must kill the process rather than the thread. This can be set
> (and not cleared) via the new SECCOMP_FILTER_FLAG_KILL_PROCESS flag.
>
> Pros:
> - the logic for the filter action is contained in the filter.
> - userspace can detect support for the feature since earlier kernels
> will reject the new flag.
> Cons:
> - depends on adding an assignment to the seccomp_run_filters() loop
> (previous patch).
>
> Alternatives to this approach with pros/cons:
>
> - Use a new test during seccomp_run_filters() that treats the RET_DATA
> mask of a RET_KILL action as special. If a new bit is set in the data,
> then treat the return value as -1 (lower than 0).
> Pros:
> - the logic for the filter action is contained in the filter.
> Cons:
> - added complexity to time-sensitive seccomp_run_filters() loop.
> - there isn't a trivial way for userspace to detect if the kernel
> supports the feature (earlier kernels will silently ignore the
> RET_DATA and only kill the thread).
I prefer using a filter flag over a special RET_DATA mask but, for
completeness, I wanted to mention that SECCOMP_GET_ACTION_AVAIL
operation could be extended to validate special RET_DATA masks. However,
I don't think that is a clean design.
> - Have SECCOMP_FILTER_FLAG_KILL_PROCESS attach to the seccomp struct
> rather than the filter.
> Pros:
> - no change needed to seccomp_run_filters() loop.
> Cons:
> - the change in behavior technically originates external to the
> filter, which allows for later filters to "enhance" a previously
> applied filter's RET_KILL to kill the entire process, which may
> be unexpected.
>
> Signed-off-by: Kees Cook <[email protected]>
> ---
> include/linux/seccomp.h | 3 ++-
> include/uapi/linux/seccomp.h | 3 ++-
> kernel/seccomp.c | 12 +++++++++++-
> 3 files changed, 15 insertions(+), 3 deletions(-)
>
> diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
> index ecc296c137cd..59d001ba655c 100644
> --- a/include/linux/seccomp.h
> +++ b/include/linux/seccomp.h
> @@ -3,7 +3,8 @@
>
> #include <uapi/linux/seccomp.h>
>
> -#define SECCOMP_FILTER_FLAG_MASK (SECCOMP_FILTER_FLAG_TSYNC)
> +#define SECCOMP_FILTER_FLAG_MASK (SECCOMP_FILTER_FLAG_TSYNC | \
> + SECCOMP_FILTER_FLAG_KILL_PROCESS)
>
> #ifdef CONFIG_SECCOMP
>
> diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h
> index 0f238a43ff1e..4b75d8c297b6 100644
> --- a/include/uapi/linux/seccomp.h
> +++ b/include/uapi/linux/seccomp.h
> @@ -15,7 +15,8 @@
> #define SECCOMP_SET_MODE_FILTER 1
>
> /* Valid flags for SECCOMP_SET_MODE_FILTER */
> -#define SECCOMP_FILTER_FLAG_TSYNC 1
> +#define SECCOMP_FILTER_FLAG_TSYNC 1
> +#define SECCOMP_FILTER_FLAG_KILL_PROCESS 2
>
> /*
> * All BPF programs must return a 32-bit value.
> diff --git a/kernel/seccomp.c b/kernel/seccomp.c
> index 8bdcf01379e4..931eb9cbd093 100644
> --- a/kernel/seccomp.c
> +++ b/kernel/seccomp.c
> @@ -44,6 +44,7 @@
> * is only needed for handling filters shared across tasks.
> * @prev: points to a previously installed, or inherited, filter
> * @prog: the BPF program to evaluate
> + * @kill_process: if true, RET_KILL will kill process rather than thread.
> *
> * seccomp_filter objects are organized in a tree linked via the @prev
> * pointer. For any task, it appears to be a singly-linked list starting
> @@ -59,6 +60,7 @@ struct seccomp_filter {
> refcount_t usage;
> struct seccomp_filter *prev;
> struct bpf_prog *prog;
> + bool kill_process;
Just a reminder to move bool up to be the 2nd member of the struct for
improved struct packing. (You already said you were going to this while
you were reviewing my logging patches)
> };
>
> /* Limit any path through the tree to 256KB worth of instructions. */
> @@ -448,6 +450,10 @@ static long seccomp_attach_filter(unsigned int flags,
> return ret;
> }
>
> + /* Set process-killing flag, if present. */
> + if (flags & SECCOMP_FILTER_FLAG_KILL_PROCESS)
> + filter->kill_process = true;
> +
> /*
> * If there is an existing filter, make it the prev and don't drop its
> * task reference.
> @@ -658,7 +664,11 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
> seccomp_init_siginfo(&info, this_syscall, data);
> do_coredump(&info);
> }
> - do_exit(SIGSYS);
> + /* Kill entire thread group if requested (or went haywire). */
> + if (!match || match->kill_process)
I found this conditional to be a little surprising. I would have expected:
if (match && match->kill_process)
do_group_exit(SIGSYS);
else
do_exit(SIGSYS);
A resulting NULL match pointer is unexpected but callers of
seccomp_run_filters() are expected to handle such a situation so it is
possible. Are you preferring to fail closed as much as possible
(considering that killing the process is more restrictive than killing
the thread)?
I don't know the specific situations that would result in the match
pointer not being set in seccomp_run_filters() but I'm curious if
existing, old userspace code that only expected the thread to be killed
could potentially tickle such a situation and result in the entire
thread group being unexpectedly killed.
FWIW, I like the way the conditional is written and am only thinking out
loud about the possible side effects.
Side note: I initially expected to see
Documentation/userspace-api/seccomp_filter.rst being updated in this
patch and then remembered that seccomp(2) isn't documented in that file
and, therefore, it isn't trivial to add blurbs about filter flags in
there. We should fix that after the dust settles on these two patch sets.
Tyler
> + do_group_exit(SIGSYS);
> + else
> + do_exit(SIGSYS);
> }
>
> unreachable();
>
On 08/02/2017 10:19 PM, Kees Cook wrote:
> This refactors the errno tests (since they all use the same pattern for
> their filter) and adds a RET_DATA field ordering test.
>
> Signed-off-by: Kees Cook <[email protected]>
This all looks good and is a great idea.
Reviewed-by: Tyler Hicks <[email protected]>
Tyler
> ---
> tools/testing/selftests/seccomp/seccomp_bpf.c | 95 ++++++++++++++++-----------
> 1 file changed, 58 insertions(+), 37 deletions(-)
>
> diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c
> index 73f5ea6778ce..ee78a53da5d1 100644
> --- a/tools/testing/selftests/seccomp/seccomp_bpf.c
> +++ b/tools/testing/selftests/seccomp/seccomp_bpf.c
> @@ -136,7 +136,7 @@ TEST(no_new_privs_support)
> }
> }
>
> -/* Tests kernel support by checking for a copy_from_user() fault on * NULL. */
> +/* Tests kernel support by checking for a copy_from_user() fault on NULL. */
> TEST(mode_filter_support)
> {
> long ret;
> @@ -541,26 +541,30 @@ TEST(arg_out_of_range)
> EXPECT_EQ(EINVAL, errno);
> }
>
> +#define ERRNO_FILTER(name, errno) \
> + struct sock_filter _read_filter_##name[] = { \
> + BPF_STMT(BPF_LD|BPF_W|BPF_ABS, \
> + offsetof(struct seccomp_data, nr)), \
> + BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_read, 0, 1), \
> + BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ERRNO | errno), \
> + BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW), \
> + }; \
> + struct sock_fprog prog_##name = { \
> + .len = (unsigned short)ARRAY_SIZE(_read_filter_##name), \
> + .filter = _read_filter_##name, \
> + }
> +
> +/* Make sure basic errno values are correctly passed through a filter. */
> TEST(ERRNO_valid)
> {
> - struct sock_filter filter[] = {
> - BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
> - offsetof(struct seccomp_data, nr)),
> - BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_read, 0, 1),
> - BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ERRNO | E2BIG),
> - BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
> - };
> - struct sock_fprog prog = {
> - .len = (unsigned short)ARRAY_SIZE(filter),
> - .filter = filter,
> - };
> + ERRNO_FILTER(valid, E2BIG);
> long ret;
> pid_t parent = getppid();
>
> ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
> ASSERT_EQ(0, ret);
>
> - ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
> + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog_valid);
> ASSERT_EQ(0, ret);
>
> EXPECT_EQ(parent, syscall(__NR_getppid));
> @@ -568,26 +572,17 @@ TEST(ERRNO_valid)
> EXPECT_EQ(E2BIG, errno);
> }
>
> +/* Make sure an errno of zero is correctly handled by the arch code. */
> TEST(ERRNO_zero)
> {
> - struct sock_filter filter[] = {
> - BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
> - offsetof(struct seccomp_data, nr)),
> - BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_read, 0, 1),
> - BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ERRNO | 0),
> - BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
> - };
> - struct sock_fprog prog = {
> - .len = (unsigned short)ARRAY_SIZE(filter),
> - .filter = filter,
> - };
> + ERRNO_FILTER(zero, 0);
> long ret;
> pid_t parent = getppid();
>
> ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
> ASSERT_EQ(0, ret);
>
> - ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
> + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog_zero);
> ASSERT_EQ(0, ret);
>
> EXPECT_EQ(parent, syscall(__NR_getppid));
> @@ -595,26 +590,21 @@ TEST(ERRNO_zero)
> EXPECT_EQ(0, read(0, NULL, 0));
> }
>
> +/*
> + * The SECCOMP_RET_DATA mask is 16 bits wide, but errno is smaller.
> + * This tests that the errno value gets capped correctly, fixed by
> + * 580c57f10768 ("seccomp: cap SECCOMP_RET_ERRNO data to MAX_ERRNO").
> + */
> TEST(ERRNO_capped)
> {
> - struct sock_filter filter[] = {
> - BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
> - offsetof(struct seccomp_data, nr)),
> - BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_read, 0, 1),
> - BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ERRNO | 4096),
> - BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
> - };
> - struct sock_fprog prog = {
> - .len = (unsigned short)ARRAY_SIZE(filter),
> - .filter = filter,
> - };
> + ERRNO_FILTER(capped, 4096);
> long ret;
> pid_t parent = getppid();
>
> ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
> ASSERT_EQ(0, ret);
>
> - ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
> + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog_capped);
> ASSERT_EQ(0, ret);
>
> EXPECT_EQ(parent, syscall(__NR_getppid));
> @@ -622,6 +612,37 @@ TEST(ERRNO_capped)
> EXPECT_EQ(4095, errno);
> }
>
> +/*
> + * Filters are processed in reverse order: last applied is executed first.
> + * Since only the SECCOMP_RET_ACTION mask is tested for return values, the
> + * SECCOMP_RET_DATA mask results will follow the most recently applied
> + * matching filter return (and not the lowest or highest value).
> + */
> +TEST(ERRNO_order)
> +{
> + ERRNO_FILTER(first, 11);
> + ERRNO_FILTER(second, 13);
> + ERRNO_FILTER(third, 12);
> + long ret;
> + pid_t parent = getppid();
> +
> + ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
> + ASSERT_EQ(0, ret);
> +
> + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog_first);
> + ASSERT_EQ(0, ret);
> +
> + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog_second);
> + ASSERT_EQ(0, ret);
> +
> + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog_third);
> + ASSERT_EQ(0, ret);
> +
> + EXPECT_EQ(parent, syscall(__NR_getppid));
> + EXPECT_EQ(-1, read(0, NULL, 0));
> + EXPECT_EQ(12, errno);
> +}
> +
> FIXTURE_DATA(TRAP) {
> struct sock_fprog prog;
> };
>
On 08/02/2017 10:19 PM, Kees Cook wrote:
> SECCOMP_RET_KILL is supposed to kill the current thread (and userspace
> depends on this), so test for this, distinct from killing the entire
> process. This also tests killing the entire process with the new
> SECCOMP_FILTER_FLAG_KILL_PROCESS flag. (This also moves a bunch of
> defines up earlier in the file to use them earlier.)
>
> Signed-off-by: Kees Cook <[email protected]>
> ---
> tools/testing/selftests/seccomp/seccomp_bpf.c | 182 ++++++++++++++++++++------
> 1 file changed, 141 insertions(+), 41 deletions(-)
>
> diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c
> index ee78a53da5d1..68b9faf23ca6 100644
> --- a/tools/testing/selftests/seccomp/seccomp_bpf.c
> +++ b/tools/testing/selftests/seccomp/seccomp_bpf.c
> @@ -87,6 +87,51 @@ struct seccomp_data {
> };
> #endif
>
> +#ifndef __NR_seccomp
> +# if defined(__i386__)
> +# define __NR_seccomp 354
> +# elif defined(__x86_64__)
> +# define __NR_seccomp 317
> +# elif defined(__arm__)
> +# define __NR_seccomp 383
> +# elif defined(__aarch64__)
> +# define __NR_seccomp 277
> +# elif defined(__hppa__)
> +# define __NR_seccomp 338
> +# elif defined(__powerpc__)
> +# define __NR_seccomp 358
> +# elif defined(__s390__)
> +# define __NR_seccomp 348
> +# else
> +# warning "seccomp syscall number unknown for this architecture"
> +# define __NR_seccomp 0xffff
> +# endif
> +#endif
> +
> +#ifndef SECCOMP_SET_MODE_STRICT
> +#define SECCOMP_SET_MODE_STRICT 0
> +#endif
> +
> +#ifndef SECCOMP_SET_MODE_FILTER
> +#define SECCOMP_SET_MODE_FILTER 1
> +#endif
> +
> +#ifndef SECCOMP_FILTER_FLAG_TSYNC
> +#define SECCOMP_FILTER_FLAG_TSYNC 1
> +#endif
> +
> +#ifndef SECCOMP_FILTER_FLAG_KILL_PROCESS
> +#define SECCOMP_FILTER_FLAG_KILL_PROCESS 2
> +#endif
> +
> +#ifndef seccomp
> +int seccomp(unsigned int op, unsigned int flags, void *args)
> +{
> + errno = 0;
> + return syscall(__NR_seccomp, op, flags, args);
> +}
> +#endif
> +
> #if __BYTE_ORDER == __LITTLE_ENDIAN
> #define syscall_arg(_n) (offsetof(struct seccomp_data, args[_n]))
> #elif __BYTE_ORDER == __BIG_ENDIAN
> @@ -520,6 +565,102 @@ TEST_SIGNAL(KILL_one_arg_six, SIGSYS)
> close(fd);
> }
>
> +/* This is a thread task to die via seccomp filter violation. */
> +void *kill_thread(void *data)
> +{
> + bool die = (bool)data;
> +
> + if (die) {
> + prctl(PR_GET_SECCOMP, 0, 0, 0, 0);
> + return (void *)SIBLING_EXIT_FAILURE;
> + }
> +
> + return (void *)SIBLING_EXIT_UNKILLED;
> +}
> +
> +/* Prepare a thread that will kill itself or both of us. */
> +void kill_thread_or_group(struct __test_metadata *_metadata, bool kill_process)
> +{
> + pthread_t thread;
> + void *status;
> + unsigned int flags;
> + /* Kill only when calling __NR_prctl. */
> + struct sock_filter filter[] = {
> + BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
> + offsetof(struct seccomp_data, nr)),
> + BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_prctl, 0, 1),
> + BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
> + BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
> + };
> + struct sock_fprog prog = {
> + .len = (unsigned short)ARRAY_SIZE(filter),
> + .filter = filter,
> + };
> +
> + ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
> + TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
> + }
> +
> + flags = kill_process ? SECCOMP_FILTER_FLAG_KILL_PROCESS : 0;
> + ASSERT_EQ(0, seccomp(SECCOMP_SET_MODE_FILTER, flags, &prog)) {
> + if (kill_process)
> + TH_LOG("Kernel does not support SECCOMP_FILTER_FLAG_KILL_PROCESS");
> + else
> + TH_LOG("Kernel does not support seccomp syscall");
> + }
> +
> + /* Start a thread that will exit immediately. */
> + ASSERT_EQ(0, pthread_create(&thread, NULL, kill_thread, (void *)false));
> + ASSERT_EQ(0, pthread_join(thread, &status));
> + ASSERT_EQ(SIBLING_EXIT_UNKILLED, (unsigned long)status);
> +
> + /* Start a thread that will die immediately. */
> + ASSERT_EQ(0, pthread_create(&thread, NULL, kill_thread, (void *)true));
> + ASSERT_EQ(0, pthread_join(thread, &status));
> + ASSERT_NE(SIBLING_EXIT_FAILURE, (unsigned long)status);
> +
> + /* Only the thread died. Let parent know this thread didn't die. */
This read a little odd to me. How about, "Only the created thread died.
Let parent know that this creating thread didn't die."?
> + exit(42);
> +}
> +
> +TEST(KILL_thread)
> +{
> + int status;
> + pid_t child_pid;
> +
> + child_pid = fork();
> + ASSERT_LE(0, child_pid);
> + if (child_pid == 0) {
> + kill_thread_or_group(_metadata, false);
> + _exit(38);
> + }
> +
> + ASSERT_EQ(child_pid, waitpid(child_pid, &status, 0));
> +
> + /* If only the thread was killed, we'll see exit 42. */
> + ASSERT_EQ(1, WIFEXITED(status));
This is probably nitpicky but, after reading the wait(2) man page, I
feel like this should be ASSERT_TRUE(WIFEXITED(status)) instead of
comparing to 1. There's no documented guarantee that 1 will be returned.
> + ASSERT_EQ(42, WEXITSTATUS(status));
> +}
> +
> +TEST(KILL_process)
> +{
> + int status;
> + pid_t child_pid;
> +
> + child_pid = fork();
> + ASSERT_LE(0, child_pid);
> + if (child_pid == 0) {
> + kill_thread_or_group(_metadata, true);
> + _exit(38);
> + }
> +
> + ASSERT_EQ(child_pid, waitpid(child_pid, &status, 0));
> +
> + /* If the entire process was killed, we'll see SIGSYS. */
> + ASSERT_EQ(1, WIFSIGNALED(status));
Same here. ASSERT_TRUE(WIFSIGNALED(status)).
With these small changes,
Reviewed-by: Tyler Hicks <[email protected]>
Tyler
> + ASSERT_EQ(SIGSYS, WTERMSIG(status));
> +}
> +
> /* TODO(wad) add 64-bit versus 32-bit arg tests. */
> TEST(arg_out_of_range)
> {
> @@ -1675,47 +1816,6 @@ TEST_F_SIGNAL(TRACE_syscall, kill_after_ptrace, SIGSYS)
> EXPECT_NE(self->mypid, syscall(__NR_getpid));
> }
>
> -#ifndef __NR_seccomp
> -# if defined(__i386__)
> -# define __NR_seccomp 354
> -# elif defined(__x86_64__)
> -# define __NR_seccomp 317
> -# elif defined(__arm__)
> -# define __NR_seccomp 383
> -# elif defined(__aarch64__)
> -# define __NR_seccomp 277
> -# elif defined(__hppa__)
> -# define __NR_seccomp 338
> -# elif defined(__powerpc__)
> -# define __NR_seccomp 358
> -# elif defined(__s390__)
> -# define __NR_seccomp 348
> -# else
> -# warning "seccomp syscall number unknown for this architecture"
> -# define __NR_seccomp 0xffff
> -# endif
> -#endif
> -
> -#ifndef SECCOMP_SET_MODE_STRICT
> -#define SECCOMP_SET_MODE_STRICT 0
> -#endif
> -
> -#ifndef SECCOMP_SET_MODE_FILTER
> -#define SECCOMP_SET_MODE_FILTER 1
> -#endif
> -
> -#ifndef SECCOMP_FILTER_FLAG_TSYNC
> -#define SECCOMP_FILTER_FLAG_TSYNC 1
> -#endif
> -
> -#ifndef seccomp
> -int seccomp(unsigned int op, unsigned int flags, void *args)
> -{
> - errno = 0;
> - return syscall(__NR_seccomp, op, flags, args);
> -}
> -#endif
> -
> TEST(seccomp_syscall)
> {
> struct sock_filter filter[] = {
>
On 08/07/2017 08:03 PM, Tyler Hicks wrote:
> On 08/02/2017 10:19 PM, Kees Cook wrote:
>> Both the upcoming logging improvements and changes to RET_KILL will need
>> to know which filter a given seccomp return value originated from. In
>> order to delay logic processing of the result until after the seccomp loop,
>> this adds a single pointer assignment on matches. This will allow both
>> log and RET_KILL logic to work off the filter rather than doing more
>> expensive tests inside the time-critical run_filters loop.
>>
>> Running tight cycles of getpid() with filters attached shows no measurable
>> difference in speed.
>>
>> Suggested-by: Tyler Hicks <[email protected]>
>> Signed-off-by: Kees Cook <[email protected]>
>> ---
>> kernel/seccomp.c | 11 ++++++++---
>> 1 file changed, 8 insertions(+), 3 deletions(-)
>>
>> diff --git a/kernel/seccomp.c b/kernel/seccomp.c
>> index 98b59b5db90b..8bdcf01379e4 100644
>> --- a/kernel/seccomp.c
>> +++ b/kernel/seccomp.c
>> @@ -171,10 +171,12 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)
>> /**
>> * seccomp_run_filters - evaluates all seccomp filters against @sd
>> * @sd: optional seccomp data to be passed to filters
>> + * @match: stores struct seccomp_filter that resulted in the return value
Thinking just a bit more about this patch, can you document that @match
may be NULL upon return?
Tyler
>> *
>> * Returns valid seccomp BPF response codes.
>> */
>> -static u32 seccomp_run_filters(const struct seccomp_data *sd)
>> +static u32 seccomp_run_filters(const struct seccomp_data *sd,
>> + struct seccomp_filter **match)
>> {
>> struct seccomp_data sd_local;
>> u32 ret = SECCOMP_RET_ALLOW;
>
> My version of this patch initialized *match to f here. The reason I did
> that is because if BPF_PROG_RUN() returns RET_ALLOW for all
> filters, I didn't want *match to remain NULL when seccomp_run_filters()
> returns. Neither FILTER_FLAG_LOG nor FILTER_FLAG_KILL_PROCESS would be
> affected by this, because they don't care about RET_ALLOW actions, but
> there could conceivably be a filter flag in the future that cares about
> RET_ALLOW, and not initializing *match to the first filter could result
> in a latent bug for that filter flag.
>
> I'm fine with not adding the initialization since this is a hot path and
> it doesn't help any of the currently existing/planned filter flags but I
> wanted to at least mention it.
>
> Reviewed-by: Tyler Hicks <[email protected]>
>
> Tyler
>
>> @@ -198,8 +200,10 @@ static u32 seccomp_run_filters(const struct seccomp_data *sd)
>> for (; f; f = f->prev) {
>> u32 cur_ret = BPF_PROG_RUN(f->prog, sd);
>>
>> - if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION))
>> + if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION)) {
>> ret = cur_ret;
>> + *match = f;
>> + }
>> }
>> return ret;
>> }
>> @@ -566,6 +570,7 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
>> const bool recheck_after_trace)
>> {
>> u32 filter_ret, action;
>> + struct seccomp_filter *match = NULL;
>> int data;
>>
>> /*
>> @@ -574,7 +579,7 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
>> */
>> rmb();
>>
>> - filter_ret = seccomp_run_filters(sd);
>> + filter_ret = seccomp_run_filters(sd, &match);
>> data = filter_ret & SECCOMP_RET_DATA;
>> action = filter_ret & SECCOMP_RET_ACTION;
>>
>>
>
>
On Mon, Aug 7, 2017 at 6:29 PM, Tyler Hicks <[email protected]> wrote:
>> + /* Only the thread died. Let parent know this thread didn't die. */
>
> This read a little odd to me. How about, "Only the created thread died.
> Let parent know that this creating thread didn't die."?
Sounds good. I've updated this to be more descriptive.
>> + ASSERT_EQ(1, WIFEXITED(status));
>
> This is probably nitpicky but, after reading the wait(2) man page, I
> feel like this should be ASSERT_TRUE(WIFEXITED(status)) instead of
> comparing to 1. There's no documented guarantee that 1 will be returned.
That's a fair point. I've updated this and WIFSIGNALED now, thanks!
-Kees
--
Kees Cook
Pixel Security
On Mon, Aug 7, 2017 at 6:03 PM, Tyler Hicks <[email protected]> wrote:
>> -static u32 seccomp_run_filters(const struct seccomp_data *sd)
>> +static u32 seccomp_run_filters(const struct seccomp_data *sd,
>> + struct seccomp_filter **match)
>> {
>> struct seccomp_data sd_local;
>> u32 ret = SECCOMP_RET_ALLOW;
>
> My version of this patch initialized *match to f here. The reason I did
> that is because if BPF_PROG_RUN() returns RET_ALLOW for all
> filters, I didn't want *match to remain NULL when seccomp_run_filters()
> returns. Neither FILTER_FLAG_LOG nor FILTER_FLAG_KILL_PROCESS would be
> affected by this, because they don't care about RET_ALLOW actions, but
> there could conceivably be a filter flag in the future that cares about
> RET_ALLOW, and not initializing *match to the first filter could result
> in a latent bug for that filter flag.
Very true, yes. I did intentionally adjust this because I wanted to
keep the hot path as untouched as possible.
> I'm fine with not adding the initialization since this is a hot path and
> it doesn't help any of the currently existing/planned filter flags but I
> wanted to at least mention it.
Yeah, and while I doubt I'll want to ever check "match" for RET_ALLOW,
I'll add a big comment there to explain it.
> Reviewed-by: Tyler Hicks <[email protected]>
Thanks!
-Kees
--
Kees Cook
Pixel Security
On Mon, Aug 7, 2017 at 6:23 PM, Tyler Hicks <[email protected]> wrote:
> On 08/02/2017 10:19 PM, Kees Cook wrote:
>> Right now, SECCOMP_RET_KILL kills the current thread. There have been
>> a few requests for RET_KILL to kill the entire process (the thread
>> group), but since seccomp's u32 return values are ABI, and ordered by
>> lowest value, with RET_KILL as 0, there isn't a trivial way to provide
>> an even smaller value that would mean the more restrictive action of
>> killing the thread group.
>>
>> Instead, create a filter flag that indicates that a RET_KILL from this
>> filter must kill the process rather than the thread. This can be set
>> (and not cleared) via the new SECCOMP_FILTER_FLAG_KILL_PROCESS flag.
>>
>> Pros:
>> - the logic for the filter action is contained in the filter.
>> - userspace can detect support for the feature since earlier kernels
>> will reject the new flag.
>> Cons:
>> - depends on adding an assignment to the seccomp_run_filters() loop
>> (previous patch).
>>
>> Alternatives to this approach with pros/cons:
>>
>> - Use a new test during seccomp_run_filters() that treats the RET_DATA
>> mask of a RET_KILL action as special. If a new bit is set in the data,
>> then treat the return value as -1 (lower than 0).
>> Pros:
>> - the logic for the filter action is contained in the filter.
>> Cons:
>> - added complexity to time-sensitive seccomp_run_filters() loop.
>> - there isn't a trivial way for userspace to detect if the kernel
>> supports the feature (earlier kernels will silently ignore the
>> RET_DATA and only kill the thread).
>
> I prefer using a filter flag over a special RET_DATA mask but, for
> completeness, I wanted to mention that SECCOMP_GET_ACTION_AVAIL
> operation could be extended to validate special RET_DATA masks. However,
> I don't think that is a clean design.
>
>> - Have SECCOMP_FILTER_FLAG_KILL_PROCESS attach to the seccomp struct
>> rather than the filter.
>> Pros:
>> - no change needed to seccomp_run_filters() loop.
>> Cons:
>> - the change in behavior technically originates external to the
>> filter, which allows for later filters to "enhance" a previously
>> applied filter's RET_KILL to kill the entire process, which may
>> be unexpected.
>>
>> Signed-off-by: Kees Cook <[email protected]>
>> ---
>> include/linux/seccomp.h | 3 ++-
>> include/uapi/linux/seccomp.h | 3 ++-
>> kernel/seccomp.c | 12 +++++++++++-
>> 3 files changed, 15 insertions(+), 3 deletions(-)
>>
>> diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
>> index ecc296c137cd..59d001ba655c 100644
>> --- a/include/linux/seccomp.h
>> +++ b/include/linux/seccomp.h
>> @@ -3,7 +3,8 @@
>>
>> #include <uapi/linux/seccomp.h>
>>
>> -#define SECCOMP_FILTER_FLAG_MASK (SECCOMP_FILTER_FLAG_TSYNC)
>> +#define SECCOMP_FILTER_FLAG_MASK (SECCOMP_FILTER_FLAG_TSYNC | \
>> + SECCOMP_FILTER_FLAG_KILL_PROCESS)
>>
>> #ifdef CONFIG_SECCOMP
>>
>> diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h
>> index 0f238a43ff1e..4b75d8c297b6 100644
>> --- a/include/uapi/linux/seccomp.h
>> +++ b/include/uapi/linux/seccomp.h
>> @@ -15,7 +15,8 @@
>> #define SECCOMP_SET_MODE_FILTER 1
>>
>> /* Valid flags for SECCOMP_SET_MODE_FILTER */
>> -#define SECCOMP_FILTER_FLAG_TSYNC 1
>> +#define SECCOMP_FILTER_FLAG_TSYNC 1
>> +#define SECCOMP_FILTER_FLAG_KILL_PROCESS 2
>>
>> /*
>> * All BPF programs must return a 32-bit value.
>> diff --git a/kernel/seccomp.c b/kernel/seccomp.c
>> index 8bdcf01379e4..931eb9cbd093 100644
>> --- a/kernel/seccomp.c
>> +++ b/kernel/seccomp.c
>> @@ -44,6 +44,7 @@
>> * is only needed for handling filters shared across tasks.
>> * @prev: points to a previously installed, or inherited, filter
>> * @prog: the BPF program to evaluate
>> + * @kill_process: if true, RET_KILL will kill process rather than thread.
>> *
>> * seccomp_filter objects are organized in a tree linked via the @prev
>> * pointer. For any task, it appears to be a singly-linked list starting
>> @@ -59,6 +60,7 @@ struct seccomp_filter {
>> refcount_t usage;
>> struct seccomp_filter *prev;
>> struct bpf_prog *prog;
>> + bool kill_process;
>
> Just a reminder to move bool up to be the 2nd member of the struct for
> improved struct packing. (You already said you were going to this while
> you were reviewing my logging patches)
Thanks! Yeah, done now.
>> };
>>
>> /* Limit any path through the tree to 256KB worth of instructions. */
>> @@ -448,6 +450,10 @@ static long seccomp_attach_filter(unsigned int flags,
>> return ret;
>> }
>>
>> + /* Set process-killing flag, if present. */
>> + if (flags & SECCOMP_FILTER_FLAG_KILL_PROCESS)
>> + filter->kill_process = true;
>> +
>> /*
>> * If there is an existing filter, make it the prev and don't drop its
>> * task reference.
>> @@ -658,7 +664,11 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
>> seccomp_init_siginfo(&info, this_syscall, data);
>> do_coredump(&info);
>> }
>> - do_exit(SIGSYS);
>> + /* Kill entire thread group if requested (or went haywire). */
>> + if (!match || match->kill_process)
>
> I found this conditional to be a little surprising. I would have expected:
>
> if (match && match->kill_process)
> do_group_exit(SIGSYS);
> else
> do_exit(SIGSYS);
>
> A resulting NULL match pointer is unexpected but callers of
> seccomp_run_filters() are expected to handle such a situation so it is
> possible. Are you preferring to fail closed as much as possible
> (considering that killing the process is more restrictive than killing
> the thread)?
Yeah, that was my thinking. The only way for this to happen is if we
have a totally impossible state: NULL filter in struct seccomp but
seccomp mode is non-zero. I'll add a comment above this.
> I don't know the specific situations that would result in the match
> pointer not being set in seccomp_run_filters() but I'm curious if
> existing, old userspace code that only expected the thread to be killed
> could potentially tickle such a situation and result in the entire
> thread group being unexpectedly killed.
There should be no way for userspace to reach this condition. It
shouldn't ever be possible to set the seccomp mode without a filter
attached.
> FWIW, I like the way the conditional is written and am only thinking out
> loud about the possible side effects.
Yup, very appreciated!
> Side note: I initially expected to see
> Documentation/userspace-api/seccomp_filter.rst being updated in this
> patch and then remembered that seccomp(2) isn't documented in that file
> and, therefore, it isn't trivial to add blurbs about filter flags in
> there. We should fix that after the dust settles on these two patch sets.
Hm, excellent point. Agreed, let's fix that as a separate step.
-Kees
--
Kees Cook
Pixel Security