From: YiFei Zhu <[email protected]>
SECCOMP_CACHE_NR_ONLY will only operate on syscalls that do not
access any syscall arguments or instruction pointer. To facilitate
this we need a static analyser to know whether a filter will
return allow regardless of syscall arguments for a given
architecture number / syscall number pair. This is implemented
here with a pseudo-emulator, and stored in a per-filter bitmap.
Each common BPF instruction (stolen from Kees's list [1]) is
emulated. Any weirdness, or a load from a syscall argument, will
cause the emulator to bail.
The emulation is also halted if it reaches a return. In that case,
if it returns a SECCOMP_RET_ALLOW, the syscall is marked as good.
Filter dependency is resolved at attach time. If a filter depends
on other filters, we AND its bitmap with its dependee's; if the
dependee does not guarantee to allow the syscall, then the depender
is likewise marked as not guaranteed to allow the syscall.
[1] https://lore.kernel.org/lkml/[email protected]/
Signed-off-by: YiFei Zhu <[email protected]>
---
arch/Kconfig | 25 ++++++
kernel/seccomp.c | 194 ++++++++++++++++++++++++++++++++++++++++++++++-
2 files changed, 218 insertions(+), 1 deletion(-)
diff --git a/arch/Kconfig b/arch/Kconfig
index 6dfc5673215d..8cc3dc87f253 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -489,6 +489,31 @@ config SECCOMP_FILTER
See Documentation/userspace-api/seccomp_filter.rst for details.
+choice
+ prompt "Seccomp filter cache"
+ default SECCOMP_CACHE_NONE
+ depends on SECCOMP_FILTER
+ help
+ Seccomp filters can potentially incur large overhead for each
+ system call. A cache can alleviate some of that overhead.
+
+ If in doubt, select 'Syscall number only'.
+
+config SECCOMP_CACHE_NONE
+ bool "None"
+ help
+ No caching is done. Seccomp filters will be called each time
+ a system call occurs in a seccomp-guarded task.
+
+config SECCOMP_CACHE_NR_ONLY
+ bool "Syscall number only"
+ depends on !HAVE_SPARSE_SYSCALL_NR
+ help
+ For each syscall number, if the seccomp filter has a fixed
+ result, store that result in a bitmap to speed up system calls.
+
+endchoice
+
config HAVE_ARCH_STACKLEAK
bool
help
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 3ee59ce0a323..20d33378a092 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -143,6 +143,32 @@ struct notification {
struct list_head notifications;
};
+#ifdef CONFIG_SECCOMP_CACHE_NR_ONLY
+/**
+ * struct seccomp_cache_filter_data - container for cache's per-filter data
+ *
+ * @syscall_ok: A bitmap for each architecture number, where each bit
+ * represents whether the filter will always allow the syscall.
+ */
+struct seccomp_cache_filter_data {
+ DECLARE_BITMAP(syscall_ok[ARRAY_SIZE(syscall_arches)], NR_syscalls);
+};
+
+#define SECCOMP_EMU_MAX_PENDING_STATES 64
+#else
+struct seccomp_cache_filter_data { };
+
+static inline int seccomp_cache_prepare(struct seccomp_filter *sfilter)
+{
+ return 0;
+}
+
+static inline void seccomp_cache_inherit(struct seccomp_filter *sfilter,
+ const struct seccomp_filter *prev)
+{
+}
+#endif /* CONFIG_SECCOMP_CACHE_NR_ONLY */
+
/**
* struct seccomp_filter - container for seccomp BPF programs
*
@@ -185,6 +211,7 @@ struct seccomp_filter {
struct notification *notif;
struct mutex notify_lock;
wait_queue_head_t wqh;
+ struct seccomp_cache_filter_data cache;
};
/* Limit any path through the tree to 256KB worth of instructions. */
@@ -530,6 +557,139 @@ static inline void seccomp_sync_threads(unsigned long flags)
}
}
+#ifdef CONFIG_SECCOMP_CACHE_NR_ONLY
+/**
+ * struct seccomp_emu_env - container for seccomp emulator environment
+ *
+ * @filter: The cBPF filter instructions.
+ * @arch: The architecture number we are emulating.
+ * @nr: The syscall number we are emulating.
+ * @syscall_ok: Emulation result, whether it is okay for seccomp to cache the
+ * syscall.
+ */
+struct seccomp_emu_env {
+ struct sock_filter *filter;
+ int arch;
+ int nr;
+ bool syscall_ok;
+};
+
+/**
+ * struct seccomp_emu_state - container for seccomp emulator state
+ *
+ * @next: The next pending state. This structure is a linked list.
+ * @pc: The current program counter.
+ * @areg: The value of the A register.
+ */
+struct seccomp_emu_state {
+ struct seccomp_emu_state *next;
+ int pc;
+ u32 areg;
+};
+
+/**
+ * seccomp_emu_step - step one instruction in the emulator
+ * @env: The emulator environment
+ * @state: The emulator state
+ *
+ * Returns 1 to halt emulation, 0 to continue, or -errno if error occurred.
+ */
+static int seccomp_emu_step(struct seccomp_emu_env *env,
+ struct seccomp_emu_state *state)
+{
+ struct sock_filter *ftest = &env->filter[state->pc++];
+ u16 code = ftest->code;
+ u32 k = ftest->k;
+ bool compare;
+
+ switch (code) {
+ case BPF_LD | BPF_W | BPF_ABS:
+ if (k == offsetof(struct seccomp_data, nr))
+ state->areg = env->nr;
+ else if (k == offsetof(struct seccomp_data, arch))
+ state->areg = env->arch;
+ else
+ return 1;
+
+ return 0;
+ case BPF_JMP | BPF_JA:
+ state->pc += k;
+ return 0;
+ case BPF_JMP | BPF_JEQ | BPF_K:
+ case BPF_JMP | BPF_JGE | BPF_K:
+ case BPF_JMP | BPF_JGT | BPF_K:
+ case BPF_JMP | BPF_JSET | BPF_K:
+ switch (BPF_OP(code)) {
+ case BPF_JEQ:
+ compare = state->areg == k;
+ break;
+ case BPF_JGT:
+ compare = state->areg > k;
+ break;
+ case BPF_JGE:
+ compare = state->areg >= k;
+ break;
+ case BPF_JSET:
+ compare = state->areg & k;
+ break;
+ default:
+ WARN_ON(true);
+ return -EINVAL;
+ }
+
+ state->pc += compare ? ftest->jt : ftest->jf;
+ return 0;
+ case BPF_ALU | BPF_AND | BPF_K:
+ state->areg &= k;
+ return 0;
+ case BPF_RET | BPF_K:
+ env->syscall_ok = k == SECCOMP_RET_ALLOW;
+ return 1;
+ default:
+ return 1;
+ }
+}
+
+/**
+ * seccomp_cache_prepare - emulate the filter to find cachable syscalls
+ * @sfilter: The seccomp filter
+ *
+ * Returns 0 if successful or -errno if error occurred.
+ */
+static int seccomp_cache_prepare(struct seccomp_filter *sfilter)
+{
+ struct sock_fprog_kern *fprog = sfilter->prog->orig_prog;
+ struct sock_filter *filter = fprog->filter;
+ int arch, nr, res = 0;
+
+ for (arch = 0; arch < ARRAY_SIZE(syscall_arches); arch++) {
+ for (nr = 0; nr < NR_syscalls; nr++) {
+ struct seccomp_emu_env env = {0};
+ struct seccomp_emu_state state = {0};
+
+ env.filter = filter;
+ env.arch = syscall_arches[arch];
+ env.nr = nr;
+
+ while (true) {
+ res = seccomp_emu_step(&env, &state);
+ if (res)
+ break;
+ }
+
+ if (res < 0)
+ goto out;
+
+ if (env.syscall_ok)
+ set_bit(nr, sfilter->cache.syscall_ok[arch]);
+ }
+ }
+
+out:
+ return res < 0 ? res : 0;
+}
+#endif /* CONFIG_SECCOMP_CACHE_NR_ONLY */
+
/**
* seccomp_prepare_filter: Prepares a seccomp filter for use.
* @fprog: BPF program to install
@@ -540,7 +700,8 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog)
{
struct seccomp_filter *sfilter;
int ret;
- const bool save_orig = IS_ENABLED(CONFIG_CHECKPOINT_RESTORE);
+ const bool save_orig = IS_ENABLED(CONFIG_CHECKPOINT_RESTORE) ||
+ IS_ENABLED(CONFIG_SECCOMP_CACHE_NR_ONLY);
if (fprog->len == 0 || fprog->len > BPF_MAXINSNS)
return ERR_PTR(-EINVAL);
@@ -571,6 +732,13 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog)
return ERR_PTR(ret);
}
+ ret = seccomp_cache_prepare(sfilter);
+ if (ret < 0) {
+ bpf_prog_destroy(sfilter->prog);
+ kfree(sfilter);
+ return ERR_PTR(ret);
+ }
+
refcount_set(&sfilter->refs, 1);
refcount_set(&sfilter->users, 1);
init_waitqueue_head(&sfilter->wqh);
@@ -606,6 +774,29 @@ seccomp_prepare_user_filter(const char __user *user_filter)
return filter;
}
+#ifdef CONFIG_SECCOMP_CACHE_NR_ONLY
+/**
+ * seccomp_cache_inherit - mask accept bitmap against previous filter
+ * @sfilter: The seccomp filter
+ * @prev: The previous seccomp filter
+ */
+static void seccomp_cache_inherit(struct seccomp_filter *sfilter,
+ const struct seccomp_filter *prev)
+{
+ int arch;
+
+ if (!prev)
+ return;
+
+ for (arch = 0; arch < ARRAY_SIZE(syscall_arches); arch++) {
+ bitmap_and(sfilter->cache.syscall_ok[arch],
+ sfilter->cache.syscall_ok[arch],
+ prev->cache.syscall_ok[arch],
+ NR_syscalls);
+ }
+}
+#endif /* CONFIG_SECCOMP_CACHE_NR_ONLY */
+
/**
* seccomp_attach_filter: validate and attach filter
* @flags: flags to change filter behavior
@@ -655,6 +846,7 @@ static long seccomp_attach_filter(unsigned int flags,
* task reference.
*/
filter->prev = current->seccomp.filter;
+ seccomp_cache_inherit(filter, filter->prev);
current->seccomp.filter = filter;
atomic_inc(&current->seccomp.filter_count);
--
2.28.0
On Thu, Sep 24, 2020 at 07:44:18AM -0500, YiFei Zhu wrote:
> From: YiFei Zhu <[email protected]>
>
> SECCOMP_CACHE_NR_ONLY will only operate on syscalls that do not
> access any syscall arguments or instruction pointer. To facilitate
> this we need a static analyser to know whether a filter will
> return allow regardless of syscall arguments for a given
> architecture number / syscall number pair. This is implemented
> here with a pseudo-emulator, and stored in a per-filter bitmap.
>
> Each common BPF instruction (stolen from Kees's list [1]) is
> emulated. Any weirdness, or a load from a syscall argument, will
> cause the emulator to bail.
>
> The emulation is also halted if it reaches a return. In that case,
> if it returns a SECCOMP_RET_ALLOW, the syscall is marked as good.
>
> Filter dependency is resolved at attach time. If a filter depends
> on other filters, we AND its bitmap with its dependee's; if the
> dependee does not guarantee to allow the syscall, then the depender
> is likewise marked as not guaranteed to allow the syscall.
>
> [1] https://lore.kernel.org/lkml/[email protected]/
>
> Signed-off-by: YiFei Zhu <[email protected]>
> ---
> arch/Kconfig | 25 ++++++
> kernel/seccomp.c | 194 ++++++++++++++++++++++++++++++++++++++++++++++-
> 2 files changed, 218 insertions(+), 1 deletion(-)
>
> diff --git a/arch/Kconfig b/arch/Kconfig
> index 6dfc5673215d..8cc3dc87f253 100644
> --- a/arch/Kconfig
> +++ b/arch/Kconfig
> @@ -489,6 +489,31 @@ config SECCOMP_FILTER
>
> See Documentation/userspace-api/seccomp_filter.rst for details.
>
> +choice
> + prompt "Seccomp filter cache"
> + default SECCOMP_CACHE_NONE
> + depends on SECCOMP_FILTER
> + help
> + Seccomp filters can potentially incur large overhead for each
> + system call. A cache can alleviate some of that overhead.
> +
> + If in doubt, select 'Syscall number only'.
> +
> +config SECCOMP_CACHE_NONE
> + bool "None"
> + help
> + No caching is done. Seccomp filters will be called each time
> + a system call occurs in a seccomp-guarded task.
> +
> +config SECCOMP_CACHE_NR_ONLY
> + bool "Syscall number only"
> + depends on !HAVE_SPARSE_SYSCALL_NR
> + help
> + For each syscall number, if the seccomp filter has a fixed
> + result, store that result in a bitmap to speed up system calls.
> +
> +endchoice
I'm not interested in seccomp having a config option for this. It should
entirely exist or not, and that depends on the per-architecture support.
You mentioned in another thread that you wanted it to let people play
with this support in some way. Can you elaborate on this? My perspective
is that of distro and vendor kernels: there is _one_ config and end
users can't really do anything about it without rolling their own
kernels.
> +#ifdef CONFIG_SECCOMP_CACHE_NR_ONLY
> +/**
> + * struct seccomp_cache_filter_data - container for cache's per-filter data
> + *
> + * @syscall_ok: A bitmap for each architecture number, where each bit
> + * represents whether the filter will always allow the syscall.
> + */
> +struct seccomp_cache_filter_data {
> + DECLARE_BITMAP(syscall_ok[ARRAY_SIZE(syscall_arches)], NR_syscalls);
> +};
So, as Jann pointed out, using NR_syscalls only accidentally works --
they're actually different sizes and there isn't strictly any reason to
expect one to be smaller than another. So, we need to either choose the
max() in asm/linux/seccomp.h or be more efficient with space usage and
use explicitly named bitmaps (how my v1 does things).
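Roughly this shape, say (just a sketch; the compat-side names and the
compat syscall-count macro here are made up, since each arch would
provide its own sizes):

struct seccomp_cache_filter_data {
	/* native table, sized by the native syscall count */
	DECLARE_BITMAP(allow_native, NR_syscalls);
#ifdef CONFIG_COMPAT
	/* compat table gets its own, separately sized bitmap */
	DECLARE_BITMAP(allow_compat, NR_syscalls_compat);
#endif
};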
> +
> +#define SECCOMP_EMU_MAX_PENDING_STATES 64
This isn't used in this patch; likely leftover/in need of moving?
> +#else
> +struct seccomp_cache_filter_data { };
> +
> +static inline int seccomp_cache_prepare(struct seccomp_filter *sfilter)
> +{
> + return 0;
> +}
> +
> +static inline void seccomp_cache_inherit(struct seccomp_filter *sfilter,
> + const struct seccomp_filter *prev)
> +{
> +}
> +#endif /* CONFIG_SECCOMP_CACHE_NR_ONLY */
> +
> /**
> * struct seccomp_filter - container for seccomp BPF programs
> *
> @@ -185,6 +211,7 @@ struct seccomp_filter {
> struct notification *notif;
> struct mutex notify_lock;
> wait_queue_head_t wqh;
> + struct seccomp_cache_filter_data cache;
I moved this up in the structure to see if I could benefit from cache
line sharing. In either case, we must verify (with "pahole") that we do
not induce massive padding in the struct.
But yes, attaching this to the filter is the right way to go.
> };
>
> /* Limit any path through the tree to 256KB worth of instructions. */
> @@ -530,6 +557,139 @@ static inline void seccomp_sync_threads(unsigned long flags)
> }
> }
>
> +#ifdef CONFIG_SECCOMP_CACHE_NR_ONLY
> +/**
> + * struct seccomp_emu_env - container for seccomp emulator environment
> + *
> + * @filter: The cBPF filter instructions.
> + * @arch: The architecture number we are emulating.
> + * @nr: The syscall number we are emulating.
> + * @syscall_ok: Emulation result, whether it is okay for seccomp to cache the
> + * syscall.
> + */
> +struct seccomp_emu_env {
> + struct sock_filter *filter;
> + int arch;
> + int nr;
> + bool syscall_ok;
nit: "ok" is too vague. We mean either "constant action" or "allow" (or
"filter" in the negative case).
> +};
> +
> +/**
> + * struct seccomp_emu_state - container for seccomp emulator state
> + *
> + * @next: The next pending state. This structure is a linked list.
> + * @pc: The current program counter.
> + * @areg: The value of the A register.
> + */
> +struct seccomp_emu_state {
> + struct seccomp_emu_state *next;
> + int pc;
> + u32 areg;
> +};
Why is this split out? (i.e. why is it not just a self-contained loop
the way Jann wrote it?)
> +
> +/**
> + * seccomp_emu_step - step one instruction in the emulator
> + * @env: The emulator environment
> + * @state: The emulator state
> + *
> + * Returns 1 to halt emulation, 0 to continue, or -errno if error occurred.
I appreciate the -errno intent, but it actually risks making these
changes break existing userspace filters: if something is unhandled in
the emulator in a way we don't find during design and testing, the
filter load will actually _fail_ instead of just falling back to "run
filter". Failures should be reported (WARN_ON_ONCE()), but my v1
intentionally lets this continue.
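i.e. the emulator's failure mode should look something like this
(sketch, reusing the patch's own names):

	default:
		/*
		 * Unexpected instruction: report it once, but don't fail
		 * the filter load -- just give up on caching this syscall
		 * and let the filter run as it always has.
		 */
		WARN_ON_ONCE(1);
		env->syscall_ok = false;
		return 1;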
> + */
> +static int seccomp_emu_step(struct seccomp_emu_env *env,
> + struct seccomp_emu_state *state)
> +{
> + struct sock_filter *ftest = &env->filter[state->pc++];
> + u16 code = ftest->code;
> + u32 k = ftest->k;
> + bool compare;
> +
> + switch (code) {
> + case BPF_LD | BPF_W | BPF_ABS:
> + if (k == offsetof(struct seccomp_data, nr))
> + state->areg = env->nr;
> + else if (k == offsetof(struct seccomp_data, arch))
> + state->areg = env->arch;
> + else
> + return 1;
> +
> + return 0;
> + case BPF_JMP | BPF_JA:
> + state->pc += k;
> + return 0;
> + case BPF_JMP | BPF_JEQ | BPF_K:
> + case BPF_JMP | BPF_JGE | BPF_K:
> + case BPF_JMP | BPF_JGT | BPF_K:
> + case BPF_JMP | BPF_JSET | BPF_K:
> + switch (BPF_OP(code)) {
> + case BPF_JEQ:
> + compare = state->areg == k;
> + break;
> + case BPF_JGT:
> + compare = state->areg > k;
> + break;
> + case BPF_JGE:
> + compare = state->areg >= k;
> + break;
> + case BPF_JSET:
> + compare = state->areg & k;
> + break;
> + default:
> + WARN_ON(true);
> + return -EINVAL;
> + }
> +
> + state->pc += compare ? ftest->jt : ftest->jf;
> + return 0;
> + case BPF_ALU | BPF_AND | BPF_K:
> + state->areg &= k;
> + return 0;
> + case BPF_RET | BPF_K:
> + env->syscall_ok = k == SECCOMP_RET_ALLOW;
> + return 1;
> + default:
> + return 1;
> + }
> +}
This version appears to have removed all the comments; I liked Jann's
comments and I had rearranged things a bit to make it more readable
(IMO) for people that do not immediately understand BPF. :)
> +
> +/**
> + * seccomp_cache_prepare - emulate the filter to find cachable syscalls
> + * @sfilter: The seccomp filter
> + *
> + * Returns 0 if successful or -errno if error occurred.
> + */
> +static int seccomp_cache_prepare(struct seccomp_filter *sfilter)
> +{
> + struct sock_fprog_kern *fprog = sfilter->prog->orig_prog;
> + struct sock_filter *filter = fprog->filter;
> + int arch, nr, res = 0;
> +
> + for (arch = 0; arch < ARRAY_SIZE(syscall_arches); arch++) {
> + for (nr = 0; nr < NR_syscalls; nr++) {
> + struct seccomp_emu_env env = {0};
> + struct seccomp_emu_state state = {0};
> +
> + env.filter = filter;
> + env.arch = syscall_arches[arch];
> + env.nr = nr;
> +
> + while (true) {
> + res = seccomp_emu_step(&env, &state);
> + if (res)
> + break;
> + }
> +
> + if (res < 0)
> + goto out;
> +
> + if (env.syscall_ok)
> + set_bit(nr, sfilter->cache.syscall_ok[arch]);
I don't really like the complexity here, passing around syscall_ok, etc.
I feel like seccomp_emu_step() should be self-contained to say "allow or
filter" directly.
I also prefer an inversion to the logic: if we start bitmaps as "default
allow", we only ever increase the filtering cases: we can never
accidentally ADD an allow to the bitmap. (This was an intentional design
in the RFC and v1 to do as much as possible to fail safe.)
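Sketch of the inversion I mean, with a hypothetical
seccomp_emu_always_allows() helper wrapping the emulator loop:

static void seccomp_cache_prepare_arch(struct seccomp_filter *sfilter,
				       int arch)
{
	int nr;

	/* Start from "cache everything as allowed"... */
	bitmap_fill(sfilter->cache.syscall_ok[arch], NR_syscalls);

	/* ...so the code below can only ever revoke bits, never grant. */
	for (nr = 0; nr < NR_syscalls; nr++)
		if (!seccomp_emu_always_allows(sfilter, arch, nr))
			clear_bit(nr, sfilter->cache.syscall_ok[arch]);
}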
> + }
> + }
> +
> +out:
> + return res < 0 ? res : 0;
> +}
> +#endif /* CONFIG_SECCOMP_CACHE_NR_ONLY */
> +
> /**
> * seccomp_prepare_filter: Prepares a seccomp filter for use.
> * @fprog: BPF program to install
> @@ -540,7 +700,8 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog)
> {
> struct seccomp_filter *sfilter;
> int ret;
> - const bool save_orig = IS_ENABLED(CONFIG_CHECKPOINT_RESTORE);
> + const bool save_orig = IS_ENABLED(CONFIG_CHECKPOINT_RESTORE) ||
> + IS_ENABLED(CONFIG_SECCOMP_CACHE_NR_ONLY);
>
> if (fprog->len == 0 || fprog->len > BPF_MAXINSNS)
> return ERR_PTR(-EINVAL);
> @@ -571,6 +732,13 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog)
> return ERR_PTR(ret);
> }
>
> + ret = seccomp_cache_prepare(sfilter);
> + if (ret < 0) {
> + bpf_prog_destroy(sfilter->prog);
> + kfree(sfilter);
> + return ERR_PTR(ret);
> + }
Why do the prepare here instead of during attach? (And note that it
should not be written to fail.)
> +
> refcount_set(&sfilter->refs, 1);
> refcount_set(&sfilter->users, 1);
> init_waitqueue_head(&sfilter->wqh);
> @@ -606,6 +774,29 @@ seccomp_prepare_user_filter(const char __user *user_filter)
> return filter;
> }
>
> +#ifdef CONFIG_SECCOMP_CACHE_NR_ONLY
> +/**
> + * seccomp_cache_inherit - mask accept bitmap against previous filter
> + * @sfilter: The seccomp filter
> + * @prev: The previous seccomp filter
> + */
> +static void seccomp_cache_inherit(struct seccomp_filter *sfilter,
> + const struct seccomp_filter *prev)
> +{
> + int arch;
> +
> + if (!prev)
> + return;
> +
> + for (arch = 0; arch < ARRAY_SIZE(syscall_arches); arch++) {
> + bitmap_and(sfilter->cache.syscall_ok[arch],
> + sfilter->cache.syscall_ok[arch],
> + prev->cache.syscall_ok[arch],
> + NR_syscalls);
> + }
And, in the interest of being as defensive as I can imagine, this should be a
one-way mask: we can only remove bits from syscall_ok, never add them.
sfilter must be constructed so that it can only ever have fewer or the
same bits set as prev.
> +}
> +#endif /* CONFIG_SECCOMP_CACHE_NR_ONLY */
> +
> /**
> * seccomp_attach_filter: validate and attach filter
> * @flags: flags to change filter behavior
> @@ -655,6 +846,7 @@ static long seccomp_attach_filter(unsigned int flags,
> * task reference.
> */
> filter->prev = current->seccomp.filter;
> + seccomp_cache_inherit(filter, filter->prev);
In the RFC I did this inherit earlier (in the emulation stage) to
benefit from the RET_KILL results, but that's not very useful any more.
However, for code locality, I think it's still better to keep the bit
manipulation logic as close together as possible for readability.
> current->seccomp.filter = filter;
> atomic_inc(&current->seccomp.filter_count);
>
> --
> 2.28.0
>
--
Kees Cook
[resending this, forgot to hit reply all...]
On Thu, Sep 24, 2020 at 6:25 PM Kees Cook <[email protected]> wrote:
> I'm not interested in seccomp having a config option for this. It should
> entirely exist or not, and that depends on the per-architecture support.
> You mentioned in another thread that you wanted it to let people play
> with this support in some way. Can you elaborate on this? My perspective
> is that of distro and vendor kernels: there is _one_ config and end
> users can't really do anything about it without rolling their own
> kernels.
That's one. The other is to allow future optional extensions, like
syscall-argument-capable accelerators.
Distro / vendor kernels will keep defaults anyways, no?
> So, as Jann pointed out, using NR_syscalls only accidentally works --
> they're actually different sizes and there isn't strictly any reason to
> expect one to be smaller than another. So, we need to either choose the
> max() in asm/linux/seccomp.h or be more efficient with space usage and
> use explicitly named bitmaps (how my v1 does things).
Right.
> This isn't used in this patch; likely leftover/in need of moving?
Correct. Will remove.
> I moved this up in the structure to see if I could benefit from cache
> line sharing. In either case, we must verify (with "pahole") that we do
> not induce massive padding in the struct.
>
> But yes, attaching this to the filter is the right way to go.
Right. I don't think it would cause massive padding, based on what I've
learnt about padding from [1].
I'm used to using gdb to look at structure layout, and this is what I see:
(gdb) ptype /o struct seccomp_filter
/* offset | size */ type = struct seccomp_filter {
/* 0 | 4 */ refcount_t refs;
/* 4 | 4 */ refcount_t users;
/* 8 | 1 */ bool log;
/* XXX 7-byte hole */
/* 16 | 8 */ struct seccomp_filter *prev;
[...]
/* 264 | 112 */ struct seccomp_cache_filter_data {
/* 264 | 112 */ unsigned long syscall_ok[2][7];
/* total size (bytes): 112 */
} cache;
/* total size (bytes): 376 */
}
The bitmaps are long-aligned; so is the prev pointer. If we want, we
can put the cache struct right before prev, and that should not
introduce any new holes. It's the refcounts and the bool that are not
cooperative.
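i.e. something like this (sketch; remaining members unchanged):

struct seccomp_filter {
	refcount_t refs;
	refcount_t users;
	bool log;
	/* the 7-byte hole stays here either way */
	struct seccomp_cache_filter_data cache;	/* long-aligned, like prev */
	struct seccomp_filter *prev;
	/* ... */
};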
> nit: "ok" is too vague. We mean either "constant action" or "allow" (or
> "filter" in the negative case).
Right.
> Why is this split out? (i.e. why is it not just a self-contained loop
> the way Jann wrote it?)
Because my brain thinks like a finite state machine and this function
is a state transition. ;) Though yeah I agree a loop is probably more
readable.
> I appreciate the -errno intent, but it actually risks making these
> changes break existing userspace filters: if something is unhandled in
> the emulator in a way we don't find during design and testing, the
> filter load will actually _fail_ instead of just falling back to "run
> filter". Failures should be reported (WARN_ON_ONCE()), but my v1
> intentionally lets this continue.
Right.
> This version appears to have removed all the comments; I liked Jann's
> comments and I had rearranged things a bit to make it more readable
> (IMO) for people that do not immediately understand BPF. :)
Right.
> > +/**
> > + * seccomp_cache_prepare - emulate the filter to find cachable syscalls
> > + * @sfilter: The seccomp filter
> > + *
> > + * Returns 0 if successful or -errno if error occurred.
> > + */
> > +static int seccomp_cache_prepare(struct seccomp_filter *sfilter)
> > +{
> > + struct sock_fprog_kern *fprog = sfilter->prog->orig_prog;
> > + struct sock_filter *filter = fprog->filter;
> > + int arch, nr, res = 0;
> > +
> > + for (arch = 0; arch < ARRAY_SIZE(syscall_arches); arch++) {
> > + for (nr = 0; nr < NR_syscalls; nr++) {
> > + struct seccomp_emu_env env = {0};
Btw, do you know what the initial state of the A register is at the
start of BPF execution? In my RFC I assumed it's unknown, but in v1,
after the "reg_known" removal, the register is assumed to be 0. I don't
know if that assumption is correct.
> I don't really like the complexity here, passing around syscall_ok, etc.
> I feel like seccomp_emu_step() should be self-contained to say "allow or
> filter" directly.
Ok.
> I also prefer an inversion to the logic: if we start bitmaps as "default
> allow", we only ever increase the filtering cases: we can never
> accidentally ADD an allow to the bitmap. (This was an intentional design
> in the RFC and v1 to do as much as possible to fail safe.)
Wait, why? If it's default-allow, what happens if you hit an error? You
can accidentally fail to remove an allow from the bitmap, and that is
much more of an issue than accidentally failing to add one. I don't
understand your reasoning of "accidentally ADD an allow": an action
will only occur when everything is right, but an action might not
occur if some random shenanigans happen. Hence, the non-action /
default side should be the fail-safe side, rather than the action
side.
> Why do the prepare here instead of during attach? (And note that it
> should not be written to fail.)
Right.
> And, in the interest of being as defensive as I can imagine, this should be a
> one-way mask: we can only remove bits from syscall_ok, never add them.
> sfilter must be constructed so that it can only ever have fewer or the
> same bits set as prev.
Right.
> In the RFC I did this inherit earlier (in the emulation stage) to
> benefit from the RET_KILL results, but that's not very useful any more.
> However, for code locality, I think it's still better to keep the bit
> manipulation logic as close together as possible for readability.
Right.
[1] http://www.catb.org/esr/structure-packing/#_structure_alignment_and_padding
YiFei Zhu
On Thu, Sep 24, 2020 at 10:04 PM YiFei Zhu <[email protected]> wrote:
> > Why do the prepare here instead of during attach? (And note that it
> > should not be written to fail.)
>
> Right.
During attach a spinlock (current->sighand->siglock) is held. Do we
really want to put the emulator in the "atomic section"?
YiFei Zhu
> On Sep 25, 2020, at 12:42 PM, Kees Cook <[email protected]> wrote:
>
> I prefer the idea of the emulation step taking advantage of the bitmap
> optimization, since the kernel spends less time doing work over the life
> of the process tree. It's certainly marginal, but it also lets all the
> bitmap manipulation stay in one place (as opposed to being split between
> "prepare" and "attach").
>
> What do you think?
>
>
I’m wondering if we should be much much lazier. We could potentially wait until someone actually tries to do a given syscall before we try to evaluate whether the result is fixed.
On Fri, Sep 25, 2020 at 11:45:05AM -0500, YiFei Zhu wrote:
> On Thu, Sep 24, 2020 at 10:04 PM YiFei Zhu <[email protected]> wrote:
> > > Why do the prepare here instead of during attach? (And note that it
> > > should not be written to fail.)
> >
> > Right.
>
> During attach a spinlock (current->sighand->siglock) is held. Do we
> really want to put the emulator in the "atomic section"?
It's a good point, but I had some other ideas around it that led me to
a different conclusion. Here's what I've got in my head:
I don't view filter attach (nor the siglock) as fastpath: the lock is
rarely contested and the "long time" will only be during filter attach.
When performing filter emulation, all the syscalls that are already
marked as "must run filter" on the previous filter can be skipped for
the new filter, since it cannot change the outcome, which makes the
emulation step faster.
The previous filter's bitmap isn't "stable" until siglock is held.
If we do the emulation step before siglock, we have to always do full
evaluation of all syscalls, and then merge the bitmap during attach.
That means all filters ever attached will take maximal time to perform
emulation.
I prefer the idea of the emulation step taking advantage of the bitmap
optimization, since the kernel spends less time doing work over the life
of the process tree. It's certainly marginal, but it also lets all the
bitmap manipulation stay in one place (as opposed to being split between
"prepare" and "attach").
What do you think?
--
Kees Cook
On Fri, Sep 25, 2020 at 12:51:20PM -0700, Andy Lutomirski wrote:
> I’m wondering if we should be much much lazier. We could potentially wait until someone actually tries to do a given syscall before we try to evaluate whether the result is fixed.
That seems like we'd need to track yet another bitmap of "did we emulate
this yet?" And it means the filter isn't really "done" until you run
another syscall? eeh, I'm not a fan: it scratches at my desire for
determinism. ;) Or maybe my implementation imagination is missing
something?
--
Kees Cook
On Fri, Sep 25, 2020 at 1:37 PM Kees Cook <[email protected]> wrote:
> > I’m wondering if we should be much much lazier. We could potentially wait until someone actually tries to do a given syscall before we try to evaluate whether the result is fixed.
>
> That seems like we'd need to track yet another bitmap of "did we emulate
> this yet?" And it means the filter isn't really "done" until you run
> another syscall? eeh, I'm not a fan: it scratches at my desire for
> determinism. ;) Or maybe my implementation imagination is missing
> something?
>
We'd need at least three states per syscall: unknown, always-allow,
and need-to-run-filter.
The downsides are less determinism and a bit of an uglier
implementation. The upside is that we don't need to loop over all
syscalls at load -- instead the time that each operation takes is
independent of the total number of syscalls on the system. And we can
entirely avoid, say, evaluating the x32 case until the task tries an
x32 syscall.
I think it's at least worth considering.
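A sketch of the shape I have in mind (the two bitmaps and the emulator
helper are made-up names):

static bool seccomp_cache_check_lazy(struct seccomp_filter *f, int nr)
{
	/* !known = not evaluated yet; known && allow = cacheable allow */
	if (!test_bit(nr, f->cache.known)) {
		if (seccomp_emu_is_const_allow(f, nr))	/* made-up helper */
			set_bit(nr, f->cache.allow);
		/* publish "known" only once "allow" has settled */
		set_bit(nr, f->cache.known);
	}
	return test_bit(nr, f->cache.allow);
}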
--Andy
On Fri, Sep 25, 2020 at 02:07:46PM -0700, Andy Lutomirski wrote:
> > That seems like we'd need to track yet another bitmap of "did we emulate
> > this yet?" And it means the filter isn't really "done" until you run
> > another syscall? eeh, I'm not a fan: it scratches at my desire for
> > determinism. ;) Or maybe my implementation imagination is missing
> > something?
> >
>
> We'd need at least three states per syscall: unknown, always-allow,
> and need-to-run-filter.
>
> The downsides are less determinism and a bit of an uglier
> implementation. The upside is that we don't need to loop over all
> syscalls at load -- instead the time that each operation takes is
> independent of the total number of syscalls on the system. And we can
> entirely avoid, say, evaluating the x32 case until the task tries an
> x32 syscall.
>
> I think it's at least worth considering.
Yeah, worth considering. I do still think the time spent in emulation is
SO small that it doesn't matter to run all of the syscalls at attach
time. The filters are tiny and fail quickly if anything "interesting"
starts to happen. ;)
--
Kees Cook
> On Sep 25, 2020, at 4:49 PM, Kees Cook <[email protected]> wrote:
>
> Yeah, worth considering. I do still think the time spent in emulation is
> SO small that it doesn't matter to run all of the syscalls at attach
> time. The filters are tiny and fail quickly if anything "interesting"
> starts to happen. ;)
>
There’s a middle ground, too: do it lazily per arch. So we would allocate and populate the compat bitmap the first time a compat syscall is attempted and do the same for x32. This may help avoid the annoying extra memory usage and 3x startup overhead while retaining full functionality.
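(A sketch of that shape, with made-up names; each bitmap starts NULL and
gets filled on first use from that arch:)

static bool seccomp_cache_allows(struct seccomp_filter *f,
				 int arch_idx, int nr)
{
	unsigned long *bm = READ_ONCE(f->cache.bitmaps[arch_idx]);

	if (!bm)
		bm = seccomp_cache_populate(f, arch_idx); /* made-up helper */

	return bm && test_bit(nr, bm);
}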
On Fri, Sep 25, 2020 at 4:07 PM Andy Lutomirski <[email protected]> wrote:
> We'd need at least three states per syscall: unknown, always-allow,
> and need-to-run-filter.
>
> The downsides are less determinism and a bit of an uglier
> implementation. The upside is that we don't need to loop over all
> syscalls at load -- instead the time that each operation takes is
> independent of the total number of syscalls on the system. And we can
> entirely avoid, say, evaluating the x32 case until the task tries an
> x32 syscall.
I was really afraid of multiple tasks writing to the bitmaps at once,
hence I used bitmap-per-task. Now that I think about it, if this stays
lockless, the worst thing that can happen is that a write undoes a bit
set by another task. In that case, if the "known" bit is cleared, the
worst outcome is that the emulation is run many times. But if the
"always allow" bit is cleared while the "known" bit is not, then we have
an issue: the syscall will always be executed in BPF.
Is it worth holding a spinlock here?
Though I'll try to get the benchmark numbers for the emulator later tonight.
YiFei Zhu
> On Sep 25, 2020, at 6:23 PM, YiFei Zhu <[email protected]> wrote:
>
> I was really afraid of multiple tasks writing to the bitmaps at once,
> hence I used bitmap-per-task. Now that I think about it, if this stays
> lockless, the worst thing that can happen is that a write undoes a bit
> set by another task. In that case, if the "known" bit is cleared, the
> worst outcome is that the emulation is run many times. But if the
> "always allow" bit is cleared while the "known" bit is not, then we have
> an issue: the syscall will always be executed in BPF.
>
If you interleave the bits, then you can read and write them atomically — both bits for any given syscall will be in the same word.
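Sketch, assuming two adjacent bits per syscall ("known" at bit 2n,
"allow" at bit 2n+1); the function name is made up and the cast is
sketch-level hand-waving:

static void seccomp_cache_mark(unsigned long *cache, int nr, bool allow)
{
	unsigned long bit = 2UL * nr;
	unsigned long mask = (allow ? 3UL : 1UL) << (bit % BITS_PER_LONG);

	/* 2n is even, so both bits always land in the same long */
	atomic_long_or(mask, (atomic_long_t *)&cache[bit / BITS_PER_LONG]);
}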
> Is it worth holding a spinlock here?
>
> Though I'll try to get the benchmark numbers for the emulator later tonight.
>
> YiFei Zhu
On Fri, Sep 25, 2020 at 07:47:47PM -0700, Andy Lutomirski wrote:
>
> If you interleave the bits, then you can read and write them atomically — both bits for any given syscall will be in the same word.
I think we can just hold the spinlock. :)
--
Kees Cook