This patchset introduces two new operations which essentially
splits the SECCOMP_SET_MODE_FILTER process into two steps:
SECCOMP_LOAD_FILTER and SECCOMP_ATTACH_FILTER.
The SECCOMP_LOAD_FILTER loads the filter and returns a fd
which can be pinned to bpffs. This extends the lifetime of the
filter and thus can be reused by different processes.
With this new operation, we can eliminate a hot path of JITing
BPF program (the filter) where we apply the same seccomp filter
to thousands of micro VMs on a bare metal instance.
The SECCOMP_ATTACH_FILTER is used to attach a loaded filter.
The filter is represented by a fd which is either returned
from SECCOMP_LOAD_FILTER or obtained from bpffs using bpf syscall.
Hengqi Chen (2):
seccomp: Introduce SECCOMP_LOAD_FILTER operation
seccomp: Introduce SECCOMP_ATTACH_FILTER operation
include/uapi/linux/seccomp.h | 2 +
kernel/seccomp.c | 138 ++++++++++++++++++++++++++++++++++-
2 files changed, 136 insertions(+), 4 deletions(-)
--
2.34.1
This patch adds a new operation named SECCOMP_LOAD_FILTER.
It accepts the same arguments as SECCOMP_SET_MODE_FILTER
but only performs the loading process. If succeed, return a
new fd associated with the JITed BPF program (the filter).
The filter can then be pinned to bpffs using the returned
fd and reused for different processes.
Signed-off-by: Hengqi Chen <[email protected]>
---
include/uapi/linux/seccomp.h | 1 +
kernel/seccomp.c | 64 ++++++++++++++++++++++++++++++++++++
2 files changed, 65 insertions(+)
diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h
index dbfc9b37fcae..ee2c83697810 100644
--- a/include/uapi/linux/seccomp.h
+++ b/include/uapi/linux/seccomp.h
@@ -16,6 +16,7 @@
#define SECCOMP_SET_MODE_FILTER 1
#define SECCOMP_GET_ACTION_AVAIL 2
#define SECCOMP_GET_NOTIF_SIZES 3
+#define SECCOMP_LOAD_FILTER 4
/* Valid flags for SECCOMP_SET_MODE_FILTER */
#define SECCOMP_FILTER_FLAG_TSYNC (1UL << 0)
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 255999ba9190..7aff22f56a91 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -1996,12 +1996,71 @@ static long seccomp_set_mode_filter(unsigned int flags,
seccomp_filter_free(prepared);
return ret;
}
+
+static long seccomp_load_filter(const char __user *filter)
+{
+ struct sock_fprog fprog;
+ struct bpf_prog *prog;
+ int ret = -EFAULT;
+ const bool save_orig =
+#if defined(CONFIG_CHECKPOINT_RESTORE) || defined(SECCOMP_ARCH_NATIVE)
+ true;
+#else
+ false;
+#endif
+
+#ifdef CONFIG_COMPAT
+ if (in_compat_syscall()) {
+ struct compat_sock_fprog fprog32;
+ if (copy_from_user(&fprog32, filter, sizeof(fprog32)))
+ goto out;
+ fprog.len = fprog32.len;
+ fprog.filter = compat_ptr(fprog32.filter);
+ } else /* falls through to the if below. */
+#endif
+ if (copy_from_user(&fprog, filter, sizeof(fprog)))
+ goto out;
+
+ ret = -EINVAL;
+ if (fprog.len == 0 || fprog.len > BPF_MAXINSNS)
+ goto out;
+
+ BUG_ON(INT_MAX / fprog.len < sizeof(struct sock_filter));
+
+ ret = bpf_prog_create_from_user(&prog, &fprog, seccomp_check_filter, save_orig);
+ if (ret < 0)
+ goto out;
+
+ ret = security_bpf_prog_alloc(prog->aux);
+ if (ret) {
+ ret = -EINVAL;
+ goto prog_free;
+ }
+
+ prog->aux->user = get_current_user();
+ atomic64_set(&prog->aux->refcnt, 1);
+
+ ret = bpf_prog_new_fd(prog);
+ if (ret < 0)
+ bpf_prog_put(prog);
+ return ret;
+
+prog_free:
+ bpf_prog_free(prog);
+out:
+ return ret;
+}
#else
static inline long seccomp_set_mode_filter(unsigned int flags,
const char __user *filter)
{
return -EINVAL;
}
+
+static inline long seccomp_load_filter(const char __user *filter)
+{
+ return -EINVAL;
+}
#endif
static long seccomp_get_action_avail(const char __user *uaction)
@@ -2063,6 +2122,11 @@ static long do_seccomp(unsigned int op, unsigned int flags,
return -EINVAL;
return seccomp_get_notif_sizes(uargs);
+ case SECCOMP_LOAD_FILTER:
+ if (flags != 0)
+ return -EINVAL;
+
+ return seccomp_load_filter(uargs);
default:
return -EINVAL;
}
--
2.34.1
The SECCOMP_ATTACH_FILTER operation is used to attach
a loaded filter to the current process. The loaded filter
is represented by a fd which is either returned by the
SECCOMP_LOAD_FILTER operation or obtained from bpffs using
bpf syscall.
Signed-off-by: Hengqi Chen <[email protected]>
---
include/uapi/linux/seccomp.h | 1 +
kernel/seccomp.c | 74 ++++++++++++++++++++++++++++++++++--
2 files changed, 71 insertions(+), 4 deletions(-)
diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h
index ee2c83697810..fbe30262fdfc 100644
--- a/include/uapi/linux/seccomp.h
+++ b/include/uapi/linux/seccomp.h
@@ -17,6 +17,7 @@
#define SECCOMP_GET_ACTION_AVAIL 2
#define SECCOMP_GET_NOTIF_SIZES 3
#define SECCOMP_LOAD_FILTER 4
+#define SECCOMP_ATTACH_FILTER 5
/* Valid flags for SECCOMP_SET_MODE_FILTER */
#define SECCOMP_FILTER_FLAG_TSYNC (1UL << 0)
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 7aff22f56a91..adfafee4c3da 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -862,7 +862,7 @@ static void seccomp_cache_prepare(struct seccomp_filter *sfilter)
#endif /* SECCOMP_ARCH_NATIVE */
/**
- * seccomp_attach_filter: validate and attach filter
+ * seccomp_do_attach_filter: validate and attach filter
* @flags: flags to change filter behavior
* @filter: seccomp filter to add to the current process
*
@@ -873,8 +873,8 @@ static void seccomp_cache_prepare(struct seccomp_filter *sfilter)
* seccomp mode or did not have an ancestral seccomp filter
* - in NEW_LISTENER mode: the fd of the new listener
*/
-static long seccomp_attach_filter(unsigned int flags,
- struct seccomp_filter *filter)
+static long seccomp_do_attach_filter(unsigned int flags,
+ struct seccomp_filter *filter)
{
unsigned long total_insns;
struct seccomp_filter *walker;
@@ -1969,7 +1969,7 @@ static long seccomp_set_mode_filter(unsigned int flags,
goto out;
}
- ret = seccomp_attach_filter(flags, prepared);
+ ret = seccomp_do_attach_filter(flags, prepared);
if (ret)
goto out;
/* Do not free the successfully attached filter. */
@@ -2050,6 +2050,62 @@ static long seccomp_load_filter(const char __user *filter)
out:
return ret;
}
+
+static long seccomp_attach_filter(const char __user *ufd)
+{
+ const unsigned long seccomp_mode = SECCOMP_MODE_FILTER;
+ struct seccomp_filter *sfilter;
+ struct bpf_prog *prog;
+ struct file *filp;
+ int flags = 0;
+ int fd, ret;
+
+ if (copy_from_user(&fd, ufd, sizeof(fd)))
+ return -EFAULT;
+
+ filp = fget(fd);
+ if (!filp)
+ return -EBADF;
+
+ if (filp->f_op != &bpf_prog_fops) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ prog = filp->private_data;
+
+ sfilter = kzalloc(sizeof(*sfilter), GFP_KERNEL | __GFP_NOWARN);
+ if (!sfilter) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ sfilter->prog = prog;
+ refcount_set(&sfilter->refs, 1);
+ refcount_set(&sfilter->users, 1);
+ mutex_init(&sfilter->notify_lock);
+ init_waitqueue_head(&sfilter->wqh);
+
+ spin_lock_irq(¤t->sighand->siglock);
+
+ ret = -EINVAL;
+ if (!seccomp_may_assign_mode(seccomp_mode))
+ goto out_unlock;
+
+ ret = seccomp_do_attach_filter(flags, sfilter);
+ if (ret)
+ goto out_unlock;
+
+ sfilter = NULL;
+ seccomp_assign_mode(current, seccomp_mode, flags);
+
+out_unlock:
+ spin_unlock_irq(¤t->sighand->siglock);
+ seccomp_filter_free(sfilter);
+out:
+ fput(filp);
+ return ret;
+}
#else
static inline long seccomp_set_mode_filter(unsigned int flags,
const char __user *filter)
@@ -2061,6 +2117,11 @@ static inline long seccomp_load_filter(const char __user *filter)
{
return -EINVAL;
}
+
+static inline long seccomp_attach_filter(const char __user *ufd)
+{
+ return -EINVAL;
+}
#endif
static long seccomp_get_action_avail(const char __user *uaction)
@@ -2127,6 +2188,11 @@ static long do_seccomp(unsigned int op, unsigned int flags,
return -EINVAL;
return seccomp_load_filter(uargs);
+ case SECCOMP_ATTACH_FILTER:
+ if (flags != 0)
+ return -EINVAL;
+
+ return seccomp_attach_filter(uargs);
default:
return -EINVAL;
}
--
2.34.1
On Tue, Oct 03, 2023 at 08:38:34AM +0000, Hengqi Chen wrote:
> This patchset introduces two new operations which essentially
> splits the SECCOMP_SET_MODE_FILTER process into two steps:
> SECCOMP_LOAD_FILTER and SECCOMP_ATTACH_FILTER.
>
> The SECCOMP_LOAD_FILTER loads the filter and returns a fd
> which can be pinned to bpffs. This extends the lifetime of the
> filter and thus can be reused by different processes.
> With this new operation, we can eliminate a hot path of JITing
> BPF program (the filter) where we apply the same seccomp filter
> to thousands of micro VMs on a bare metal instance.
>
> The SECCOMP_ATTACH_FILTER is used to attach a loaded filter.
> The filter is represented by a fd which is either returned
> from SECCOMP_LOAD_FILTER or obtained from bpffs using bpf syscall.
Interesting! I like this idea, thanks for writing it up.
Two design notes:
- Can you reuse/refactor seccomp_prepare_filter() instead of duplicating
the logic into two new functions?
- Is there a way to make sure the BPF program coming from the fd is one
that was built via SECCOMP_LOAD_FILTER? (I want to make sure we can
never confuse a non-seccomp program into getting loaded into seccomp.)
-Kees
--
Kees Cook
On 10/3/23 10:38, Hengqi Chen wrote:
> This patchset introduces two new operations which essentially
> splits the SECCOMP_SET_MODE_FILTER process into two steps:
> SECCOMP_LOAD_FILTER and SECCOMP_ATTACH_FILTER.
>
> The SECCOMP_LOAD_FILTER loads the filter and returns a fd
> which can be pinned to bpffs. This extends the lifetime of the
> filter and thus can be reused by different processes.
A quick question to see if handling something else too is
possible/reasonable to do here too.
Let me explain our use case first.
For us (Alban in cc) it would be great if we can extend the lifetime of
the fd returned, so the process managing a seccomp notification in
userspace can easly crash or be updated. Today, if the agent that got
the fd crashes, all the "notify-syscalls" return ENOSYS in the target
process.
Our use case is we created a seccomp agent to use in Kubernetes
(github.com/kinvolk/seccompagent) and we need to handle either the agent
crashing or upgrading it. We were thinking tricks to have another
container that just stores fds and make sure that never crashes, but it
is not ideal (we checked tricks to use systemd to store our fds, but it
is not simpler either to use from containers).
If the agent crashes today, all the syscalls return ENOSYS. It will be
great if we can make the process doing the syscall just wait until a new
process to handle the notifications is up and the syscalls done in the
meantime are just queued. A mode of saying "if the agent crashes, just
queue notifications, one agent to pick them up will come back soon" (we
can of course limit reasonably the notification queue).
It seems the split here would not just work for that use case. I think
we would need to pin the attachment.
Do you think handling that is something reasonable to do in this series too?
I'll be afk until end next week. I'll catch up as soon as I'm back with
internet :)
Best,
Rodrigo
+ BPF maintainers
On Wed, Oct 4, 2023 at 2:02 AM Kees Cook <[email protected]> wrote:
>
> On Tue, Oct 03, 2023 at 08:38:34AM +0000, Hengqi Chen wrote:
> > This patchset introduces two new operations which essentially
> > splits the SECCOMP_SET_MODE_FILTER process into two steps:
> > SECCOMP_LOAD_FILTER and SECCOMP_ATTACH_FILTER.
> >
> > The SECCOMP_LOAD_FILTER loads the filter and returns a fd
> > which can be pinned to bpffs. This extends the lifetime of the
> > filter and thus can be reused by different processes.
> > With this new operation, we can eliminate a hot path of JITing
> > BPF program (the filter) where we apply the same seccomp filter
> > to thousands of micro VMs on a bare metal instance.
> >
> > The SECCOMP_ATTACH_FILTER is used to attach a loaded filter.
> > The filter is represented by a fd which is either returned
> > from SECCOMP_LOAD_FILTER or obtained from bpffs using bpf syscall.
>
> Interesting! I like this idea, thanks for writing it up.
>
> Two design notes:
>
> - Can you reuse/refactor seccomp_prepare_filter() instead of duplicating
> the logic into two new functions?
>
Sure, will do.
> - Is there a way to make sure the BPF program coming from the fd is one
> that was built via SECCOMP_LOAD_FILTER? (I want to make sure we can
> never confuse a non-seccomp program into getting loaded into seccomp.)
>
Maybe we can add a new prog type enum like BPF_PROG_TYPE_SECCOMP
for seccomp filter.
> -Kees
>
> --
> Kees Cook
>
Cheers,
--
Hengqi
On Wed, Oct 4, 2023 at 10:03 PM Rodrigo Campos <[email protected]> wrote:
>
> On 10/3/23 10:38, Hengqi Chen wrote:
> > This patchset introduces two new operations which essentially
> > splits the SECCOMP_SET_MODE_FILTER process into two steps:
> > SECCOMP_LOAD_FILTER and SECCOMP_ATTACH_FILTER.
> >
> > The SECCOMP_LOAD_FILTER loads the filter and returns a fd
> > which can be pinned to bpffs. This extends the lifetime of the
> > filter and thus can be reused by different processes.
>
> A quick question to see if handling something else too is
> possible/reasonable to do here too.
>
> Let me explain our use case first.
>
> For us (Alban in cc) it would be great if we can extend the lifetime of
> the fd returned, so the process managing a seccomp notification in
> userspace can easly crash or be updated. Today, if the agent that got
> the fd crashes, all the "notify-syscalls" return ENOSYS in the target
> process.
>
> Our use case is we created a seccomp agent to use in Kubernetes
> (github.com/kinvolk/seccompagent) and we need to handle either the agent
> crashing or upgrading it. We were thinking tricks to have another
> container that just stores fds and make sure that never crashes, but it
> is not ideal (we checked tricks to use systemd to store our fds, but it
> is not simpler either to use from containers).
>
> If the agent crashes today, all the syscalls return ENOSYS. It will be
> great if we can make the process doing the syscall just wait until a new
> process to handle the notifications is up and the syscalls done in the
> meantime are just queued. A mode of saying "if the agent crashes, just
> queue notifications, one agent to pick them up will come back soon" (we
> can of course limit reasonably the notification queue).
>
> It seems the split here would not just work for that use case. I think
> we would need to pin the attachment.
>
> Do you think handling that is something reasonable to do in this series too?
>
I am not familiar with this notification mechanism, but it seems unrelated.
This patchset is trying to reuse the seccomp filter itself.
> I'll be afk until end next week. I'll catch up as soon as I'm back with
> internet :)
>
>
>
> Best,
> Rodrigo
--
Hengqi