From: YiFei Zhu <[email protected]>
The fast (common) path for seccomp should be that the filter permits
the syscall to pass through, and failing seccomp is expected to be
an exceptional case; it is not expected for userspace to call a
denylisted syscall over and over.
This first finds the current allow bitmask by iterating through
syscall_arches[] array and comparing it to the one in struct
seccomp_data; this loop is expected to be unrolled. It then
does a test_bit against the bitmask. If the bit is set, then
there is no need to run the full filter; it returns
SECCOMP_RET_ALLOW immediately.
Co-developed-by: Dimitrios Skarlatos <[email protected]>
Signed-off-by: Dimitrios Skarlatos <[email protected]>
Signed-off-by: YiFei Zhu <[email protected]>
---
kernel/seccomp.c | 52 ++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 52 insertions(+)
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index f09c9e74ae05..bed3b2a7f6c8 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -172,6 +172,12 @@ struct seccomp_cache_filter_data { };
static inline void seccomp_cache_prepare(struct seccomp_filter *sfilter)
{
}
+
+static inline bool seccomp_cache_check(const struct seccomp_filter *sfilter,
+ const struct seccomp_data *sd)
+{
+ return false;
+}
#endif /* CONFIG_SECCOMP_CACHE_NR_ONLY */
/**
@@ -331,6 +337,49 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)
return 0;
}
+#ifdef CONFIG_SECCOMP_CACHE_NR_ONLY
+static bool seccomp_cache_check_bitmap(const void *bitmap, size_t bitmap_size,
+ int syscall_nr)
+{
+ if (unlikely(syscall_nr < 0 || syscall_nr >= bitmap_size))
+ return false;
+ syscall_nr = array_index_nospec(syscall_nr, bitmap_size);
+
+ return test_bit(syscall_nr, bitmap);
+}
+
+/**
+ * seccomp_cache_check - lookup seccomp cache
+ * @sfilter: The seccomp filter
+ * @sd: The seccomp data to lookup the cache with
+ *
+ * Returns true if the seccomp_data is cached and allowed.
+ */
+static bool seccomp_cache_check(const struct seccomp_filter *sfilter,
+ const struct seccomp_data *sd)
+{
+ int syscall_nr = sd->nr;
+ const struct seccomp_cache_filter_data *cache = &sfilter->cache;
+
+#ifdef SECCOMP_ARCH_DEFAULT
+ if (likely(sd->arch == SECCOMP_ARCH_DEFAULT))
+ return seccomp_cache_check_bitmap(cache->syscall_allow_default,
+ SECCOMP_ARCH_DEFAULT_NR,
+ syscall_nr);
+#endif /* SECCOMP_ARCH_DEFAULT */
+
+#ifdef SECCOMP_ARCH_COMPAT
+ if (likely(sd->arch == SECCOMP_ARCH_COMPAT))
+ return seccomp_cache_check_bitmap(cache->syscall_allow_compat,
+ SECCOMP_ARCH_COMPAT_NR,
+ syscall_nr);
+#endif /* SECCOMP_ARCH_COMPAT */
+
+ WARN_ON_ONCE(true);
+ return false;
+}
+#endif /* CONFIG_SECCOMP_CACHE_NR_ONLY */
+
/**
* seccomp_run_filters - evaluates all seccomp filters against @sd
* @sd: optional seccomp data to be passed to filters
@@ -353,6 +402,9 @@ static u32 seccomp_run_filters(const struct seccomp_data *sd,
if (WARN_ON(f == NULL))
return SECCOMP_RET_KILL_PROCESS;
+ if (seccomp_cache_check(f, sd))
+ return SECCOMP_RET_ALLOW;
+
/*
* All filters in the list are evaluated and the lowest BPF return
* value always takes priority (ignoring the DATA).
--
2.28.0
On Wed, Sep 30, 2020 at 10:19:14AM -0500, YiFei Zhu wrote:
> From: YiFei Zhu <[email protected]>
>
> The fast (common) path for seccomp should be that the filter permits
> the syscall to pass through, and failing seccomp is expected to be
> an exceptional case; it is not expected for userspace to call a
> denylisted syscall over and over.
>
> This first finds the current allow bitmask by iterating through
> syscall_arches[] array and comparing it to the one in struct
> seccomp_data; this loop is expected to be unrolled. It then
> does a test_bit against the bitmask. If the bit is set, then
> there is no need to run the full filter; it returns
> SECCOMP_RET_ALLOW immediately.
>
> Co-developed-by: Dimitrios Skarlatos <[email protected]>
> Signed-off-by: Dimitrios Skarlatos <[email protected]>
> Signed-off-by: YiFei Zhu <[email protected]>
I'd like the content/ordering of this and the emulator patch to be reorganized a bit.
I'd like to see the infrastructure of the cache added first (along with
the "always allow" test logic in this patch), with the emulator missing:
i.e. the patch is a logical no-op: no behavior changes because nothing
ever changes the cache bits, but all the operational logic, structure
changes, etc, is in place. Then the next patch would be replacing the
no-op with the emulator.
> ---
> kernel/seccomp.c | 52 ++++++++++++++++++++++++++++++++++++++++++++++++
> 1 file changed, 52 insertions(+)
>
> diff --git a/kernel/seccomp.c b/kernel/seccomp.c
> index f09c9e74ae05..bed3b2a7f6c8 100644
> --- a/kernel/seccomp.c
> +++ b/kernel/seccomp.c
> @@ -172,6 +172,12 @@ struct seccomp_cache_filter_data { };
> static inline void seccomp_cache_prepare(struct seccomp_filter *sfilter)
> {
> }
> +
> +static inline bool seccomp_cache_check(const struct seccomp_filter *sfilter,
bikeshedding: "cache check" doesn't tell me anything about what it's
actually checking for. How about calling this seccomp_is_constant_allow() or
something that reflects both the "bool" return ("is") and what that bool
means ("should always be allowed").
> + const struct seccomp_data *sd)
> +{
> + return false;
> +}
> #endif /* CONFIG_SECCOMP_CACHE_NR_ONLY */
>
> /**
> @@ -331,6 +337,49 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)
> return 0;
> }
>
> +#ifdef CONFIG_SECCOMP_CACHE_NR_ONLY
> +static bool seccomp_cache_check_bitmap(const void *bitmap, size_t bitmap_size,
Please also mark as "inline".
> + int syscall_nr)
> +{
> + if (unlikely(syscall_nr < 0 || syscall_nr >= bitmap_size))
> + return false;
> + syscall_nr = array_index_nospec(syscall_nr, bitmap_size);
> +
> + return test_bit(syscall_nr, bitmap);
> +}
> +
> +/**
> + * seccomp_cache_check - lookup seccomp cache
> + * @sfilter: The seccomp filter
> + * @sd: The seccomp data to lookup the cache with
> + *
> + * Returns true if the seccomp_data is cached and allowed.
> + */
> +static bool seccomp_cache_check(const struct seccomp_filter *sfilter,
inline too.
> + const struct seccomp_data *sd)
> +{
> + int syscall_nr = sd->nr;
> + const struct seccomp_cache_filter_data *cache = &sfilter->cache;
> +
> +#ifdef SECCOMP_ARCH_DEFAULT
> + if (likely(sd->arch == SECCOMP_ARCH_DEFAULT))
> + return seccomp_cache_check_bitmap(cache->syscall_allow_default,
> + SECCOMP_ARCH_DEFAULT_NR,
> + syscall_nr);
> +#endif /* SECCOMP_ARCH_DEFAULT */
> +
> +#ifdef SECCOMP_ARCH_COMPAT
> + if (likely(sd->arch == SECCOMP_ARCH_COMPAT))
> + return seccomp_cache_check_bitmap(cache->syscall_allow_compat,
> + SECCOMP_ARCH_COMPAT_NR,
> + syscall_nr);
> +#endif /* SECCOMP_ARCH_COMPAT */
> +
> + WARN_ON_ONCE(true);
> + return false;
> +}
> +#endif /* CONFIG_SECCOMP_CACHE_NR_ONLY */
> +
> /**
> * seccomp_run_filters - evaluates all seccomp filters against @sd
> * @sd: optional seccomp data to be passed to filters
> @@ -353,6 +402,9 @@ static u32 seccomp_run_filters(const struct seccomp_data *sd,
> if (WARN_ON(f == NULL))
> return SECCOMP_RET_KILL_PROCESS;
>
> + if (seccomp_cache_check(f, sd))
> + return SECCOMP_RET_ALLOW;
> +
> /*
> * All filters in the list are evaluated and the lowest BPF return
> * value always takes priority (ignoring the DATA).
> --
> 2.28.0
>
Otherwise, yup, looks good.
--
Kees Cook
On Wed, Sep 30, 2020 at 4:32 PM Kees Cook <[email protected]> wrote:
>
> On Wed, Sep 30, 2020 at 10:19:14AM -0500, YiFei Zhu wrote:
> > From: YiFei Zhu <[email protected]>
> >
> > The fast (common) path for seccomp should be that the filter permits
> > the syscall to pass through, and failing seccomp is expected to be
> > an exceptional case; it is not expected for userspace to call a
> > denylisted syscall over and over.
> >
> > This first finds the current allow bitmask by iterating through
> > syscall_arches[] array and comparing it to the one in struct
> > seccomp_data; this loop is expected to be unrolled. It then
> > does a test_bit against the bitmask. If the bit is set, then
> > there is no need to run the full filter; it returns
> > SECCOMP_RET_ALLOW immediately.
> >
> > Co-developed-by: Dimitrios Skarlatos <[email protected]>
> > Signed-off-by: Dimitrios Skarlatos <[email protected]>
> > Signed-off-by: YiFei Zhu <[email protected]>
>
> I'd like the content/ordering of this and the emulator patch to be reorganized a bit.
> I'd like to see the infrastructure of the cache added first (along with
> the "always allow" test logic in this patch), with the emulator missing:
> i.e. the patch is a logical no-op: no behavior changes because nothing
> ever changes the cache bits, but all the operational logic, structure
> changes, etc, is in place. Then the next patch would be replacing the
> no-op with the emulator.
>
> > ---
> > kernel/seccomp.c | 52 ++++++++++++++++++++++++++++++++++++++++++++++++
> > 1 file changed, 52 insertions(+)
> >
> > diff --git a/kernel/seccomp.c b/kernel/seccomp.c
> > index f09c9e74ae05..bed3b2a7f6c8 100644
> > --- a/kernel/seccomp.c
> > +++ b/kernel/seccomp.c
> > @@ -172,6 +172,12 @@ struct seccomp_cache_filter_data { };
> > static inline void seccomp_cache_prepare(struct seccomp_filter *sfilter)
> > {
> > }
> > +
> > +static inline bool seccomp_cache_check(const struct seccomp_filter *sfilter,
>
> bikeshedding: "cache check" doesn't tell me anything about what it's
> actually checking for. How about calling this seccomp_is_constant_allow() or
> something that reflects both the "bool" return ("is") and what that bool
> means ("should always be allowed").
We have a naming conflict here. I'm about to rename
seccomp_emu_is_const_allow to seccomp_is_const_allow. Adding another
seccomp_is_constant_allow is confusing. Suggestions?
I think I would prefer to change seccomp_cache_check to
seccomp_cache_check_allow. While in this patch set seccomp_cache_check
does imply the filter is "constant" allow, argument-processing cache
may change this, and specifying an "allow" in the name specifies the
'what that bool means ("should always be allowed")'.
YiFei Zhu
On Thu, Oct 08, 2020 at 07:17:39PM -0500, YiFei Zhu wrote:
> On Wed, Sep 30, 2020 at 4:32 PM Kees Cook <[email protected]> wrote:
> >
> > On Wed, Sep 30, 2020 at 10:19:14AM -0500, YiFei Zhu wrote:
> > > From: YiFei Zhu <[email protected]>
> > >
> > > The fast (common) path for seccomp should be that the filter permits
> > > the syscall to pass through, and failing seccomp is expected to be
> > > an exceptional case; it is not expected for userspace to call a
> > > denylisted syscall over and over.
> > >
> > > This first finds the current allow bitmask by iterating through
> > > syscall_arches[] array and comparing it to the one in struct
> > > seccomp_data; this loop is expected to be unrolled. It then
> > > does a test_bit against the bitmask. If the bit is set, then
> > > there is no need to run the full filter; it returns
> > > SECCOMP_RET_ALLOW immediately.
> > >
> > > Co-developed-by: Dimitrios Skarlatos <[email protected]>
> > > Signed-off-by: Dimitrios Skarlatos <[email protected]>
> > > Signed-off-by: YiFei Zhu <[email protected]>
> >
> > I'd like the content/ordering of this and the emulator patch to be reorganized a bit.
> > I'd like to see the infrastructure of the cache added first (along with
> > the "always allow" test logic in this patch), with the emulator missing:
> > i.e. the patch is a logical no-op: no behavior changes because nothing
> > ever changes the cache bits, but all the operational logic, structure
> > changes, etc, is in place. Then the next patch would be replacing the
> > no-op with the emulator.
> >
> > > ---
> > > kernel/seccomp.c | 52 ++++++++++++++++++++++++++++++++++++++++++++++++
> > > 1 file changed, 52 insertions(+)
> > >
> > > diff --git a/kernel/seccomp.c b/kernel/seccomp.c
> > > index f09c9e74ae05..bed3b2a7f6c8 100644
> > > --- a/kernel/seccomp.c
> > > +++ b/kernel/seccomp.c
> > > @@ -172,6 +172,12 @@ struct seccomp_cache_filter_data { };
> > > static inline void seccomp_cache_prepare(struct seccomp_filter *sfilter)
> > > {
> > > }
> > > +
> > > +static inline bool seccomp_cache_check(const struct seccomp_filter *sfilter,
> >
> > bikeshedding: "cache check" doesn't tell me anything about what it's
> > actually checking for. How about calling this seccomp_is_constant_allow() or
> > something that reflects both the "bool" return ("is") and what that bool
> > means ("should always be allowed").
>
> We have a naming conflict here. I'm about to rename
> seccomp_emu_is_const_allow to seccomp_is_const_allow. Adding another
> seccomp_is_constant_allow is confusing. Suggestions?
>
> I think I would prefer to change seccomp_cache_check to
> seccomp_cache_check_allow. While in this patch set seccomp_cache_check
> does imply the filter is "constant" allow, argument-processing cache
> may change this, and specifying an "allow" in the name specifies the
> 'what that bool means ("should always be allowed")'.
Yeah, that seems good.
--
Kees Cook