Introduce the BPF_F_CGROUP_DEVICE_GUARD flag for BPF_PROG_LOAD
which allows a cgroup device program to be set as a device guard.
Later this may be used to guard actions on device nodes in
non-initial userns. For this reason we provide the helper function
cgroup_bpf_device_guard_enabled() to check if a task has a cgroup
device program which is a device guard in its effective set of bpf
programs.
Signed-off-by: Michael Weiß <[email protected]>
---
include/linux/bpf-cgroup.h | 7 +++++++
include/linux/bpf.h | 1 +
include/uapi/linux/bpf.h | 5 +++++
kernel/bpf/cgroup.c | 30 ++++++++++++++++++++++++++++++
kernel/bpf/syscall.c | 5 ++++-
tools/include/uapi/linux/bpf.h | 5 +++++
6 files changed, 52 insertions(+), 1 deletion(-)
diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index 57e9e109257e..112b6093f9fd 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -184,6 +184,8 @@ static inline bool cgroup_bpf_sock_enabled(struct sock *sk,
return array != &bpf_empty_prog_array.hdr;
}
+bool cgroup_bpf_device_guard_enabled(struct task_struct *task);
+
/* Wrappers for __cgroup_bpf_run_filter_skb() guarded by cgroup_bpf_enabled. */
#define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb) \
({ \
@@ -476,6 +478,11 @@ static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map,
return 0;
}
+static bool cgroup_bpf_device_guard_enabled(struct task_struct *task)
+{
+ return false;
+}
+
#define cgroup_bpf_enabled(atype) (0)
#define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, atype, t_ctx) ({ 0; })
#define BPF_CGROUP_RUN_SA_PROG(sk, uaddr, atype) ({ 0; })
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index f58895830ada..313cce8aee05 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1384,6 +1384,7 @@ struct bpf_prog_aux {
bool sleepable;
bool tail_call_reachable;
bool xdp_has_frags;
+ bool cgroup_device_guard;
/* BTF_KIND_FUNC_PROTO for valid attach_btf_id */
const struct btf_type *attach_func_proto;
/* function name for valid attach_btf_id */
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 60a9d59beeab..3be57f7957b1 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1165,6 +1165,11 @@ enum bpf_link_type {
*/
#define BPF_F_XDP_DEV_BOUND_ONLY (1U << 6)
+/* If BPF_F_CGROUP_DEVICE_GUARD is used in BPF_PROG_LOAD command, the loaded
+ * program will be allowed to guard device access inside user namespaces.
+ */
+#define BPF_F_CGROUP_DEVICE_GUARD (1U << 7)
+
/* link_create.kprobe_multi.flags used in LINK_CREATE command for
* BPF_TRACE_KPROBE_MULTI attach type to create return probe.
*/
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 5b2741aa0d9b..230693ca4cdb 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -2505,6 +2505,36 @@ const struct bpf_verifier_ops cg_sockopt_verifier_ops = {
const struct bpf_prog_ops cg_sockopt_prog_ops = {
};
+bool
+cgroup_bpf_device_guard_enabled(struct task_struct *task)
+{
+ bool ret;
+ const struct bpf_prog_array *array;
+ const struct bpf_prog_array_item *item;
+ const struct bpf_prog *prog;
+ struct cgroup *cgrp = task_dfl_cgroup(task);
+
+ ret = false;
+
+ array = rcu_access_pointer(cgrp->bpf.effective[CGROUP_DEVICE]);
+ if (array == &bpf_empty_prog_array.hdr)
+ return ret;
+
+ mutex_lock(&cgroup_mutex);
+ array = rcu_dereference_protected(cgrp->bpf.effective[CGROUP_DEVICE],
+ lockdep_is_held(&cgroup_mutex));
+ item = &array->items[0];
+ while ((prog = READ_ONCE(item->prog))) {
+ if (prog->aux->cgroup_device_guard) {
+ ret = true;
+ break;
+ }
+ item++;
+ }
+ mutex_unlock(&cgroup_mutex);
+ return ret;
+}
+
/* Common helpers for cgroup hooks. */
const struct bpf_func_proto *
cgroup_common_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index a2aef900519c..33ea67c702c1 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2564,7 +2564,8 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
BPF_F_SLEEPABLE |
BPF_F_TEST_RND_HI32 |
BPF_F_XDP_HAS_FRAGS |
- BPF_F_XDP_DEV_BOUND_ONLY))
+ BPF_F_XDP_DEV_BOUND_ONLY |
+ BPF_F_CGROUP_DEVICE_GUARD))
return -EINVAL;
if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) &&
@@ -2651,6 +2652,8 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
prog->aux->dev_bound = !!attr->prog_ifindex;
prog->aux->sleepable = attr->prog_flags & BPF_F_SLEEPABLE;
prog->aux->xdp_has_frags = attr->prog_flags & BPF_F_XDP_HAS_FRAGS;
+ prog->aux->cgroup_device_guard =
+ attr->prog_flags & BPF_F_CGROUP_DEVICE_GUARD;
err = security_bpf_prog_alloc(prog->aux);
if (err)
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 60a9d59beeab..3be57f7957b1 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -1165,6 +1165,11 @@ enum bpf_link_type {
*/
#define BPF_F_XDP_DEV_BOUND_ONLY (1U << 6)
+/* If BPF_F_CGROUP_DEVICE_GUARD is used in BPF_PROG_LOAD command, the loaded
+ * program will be allowed to guard device access inside user namespaces.
+ */
+#define BPF_F_CGROUP_DEVICE_GUARD (1U << 7)
+
/* link_create.kprobe_multi.flags used in LINK_CREATE command for
* BPF_TRACE_KPROBE_MULTI attach type to create return probe.
*/
--
2.30.2
On Mon, Aug 14, 2023 at 4:32 PM Michael Weiß
<[email protected]> wrote:
>
> Introduce the BPF_F_CGROUP_DEVICE_GUARD flag for BPF_PROG_LOAD
> which allows a cgroup device program to be set as a device guard.
> Later this may be used to guard actions on device nodes in
> non-initial userns. For this reason we provide the helper function
> cgroup_bpf_device_guard_enabled() to check if a task has a cgroups
> device program which is a device guard in its effective set of bpf
> programs.
>
> Signed-off-by: Michael Weiß <[email protected]>
Hi Michael!
Thanks for working on this. It's also very useful for the LXC system
containers project.
> ---
> include/linux/bpf-cgroup.h | 7 +++++++
> include/linux/bpf.h | 1 +
> include/uapi/linux/bpf.h | 5 +++++
> kernel/bpf/cgroup.c | 30 ++++++++++++++++++++++++++++++
> kernel/bpf/syscall.c | 5 ++++-
> tools/include/uapi/linux/bpf.h | 5 +++++
> 6 files changed, 52 insertions(+), 1 deletion(-)
>
> diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
> index 57e9e109257e..112b6093f9fd 100644
> --- a/include/linux/bpf-cgroup.h
> +++ b/include/linux/bpf-cgroup.h
> @@ -184,6 +184,8 @@ static inline bool cgroup_bpf_sock_enabled(struct sock *sk,
> return array != &bpf_empty_prog_array.hdr;
> }
>
> +bool cgroup_bpf_device_guard_enabled(struct task_struct *task);
> +
> /* Wrappers for __cgroup_bpf_run_filter_skb() guarded by cgroup_bpf_enabled. */
> #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb) \
> ({ \
> @@ -476,6 +478,11 @@ static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map,
> return 0;
> }
>
> +static bool cgroup_bpf_device_guard_enabled(struct task_struct *task)
> +{
> + return false;
> +}
> +
> #define cgroup_bpf_enabled(atype) (0)
> #define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, atype, t_ctx) ({ 0; })
> #define BPF_CGROUP_RUN_SA_PROG(sk, uaddr, atype) ({ 0; })
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index f58895830ada..313cce8aee05 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -1384,6 +1384,7 @@ struct bpf_prog_aux {
> bool sleepable;
> bool tail_call_reachable;
> bool xdp_has_frags;
> + bool cgroup_device_guard;
> /* BTF_KIND_FUNC_PROTO for valid attach_btf_id */
> const struct btf_type *attach_func_proto;
> /* function name for valid attach_btf_id */
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 60a9d59beeab..3be57f7957b1 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -1165,6 +1165,11 @@ enum bpf_link_type {
> */
> #define BPF_F_XDP_DEV_BOUND_ONLY (1U << 6)
>
> +/* If BPF_F_CGROUP_DEVICE_GUARD is used in BPF_PROG_LOAD command, the loaded
> + * program will be allowed to guard device access inside user namespaces.
> + */
> +#define BPF_F_CGROUP_DEVICE_GUARD (1U << 7)
> +
> /* link_create.kprobe_multi.flags used in LINK_CREATE command for
> * BPF_TRACE_KPROBE_MULTI attach type to create return probe.
> */
> diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
> index 5b2741aa0d9b..230693ca4cdb 100644
> --- a/kernel/bpf/cgroup.c
> +++ b/kernel/bpf/cgroup.c
> @@ -2505,6 +2505,36 @@ const struct bpf_verifier_ops cg_sockopt_verifier_ops = {
> const struct bpf_prog_ops cg_sockopt_prog_ops = {
> };
>
> +bool
> +cgroup_bpf_device_guard_enabled(struct task_struct *task)
> +{
> + bool ret;
> + const struct bpf_prog_array *array;
> + const struct bpf_prog_array_item *item;
> + const struct bpf_prog *prog;
> + struct cgroup *cgrp = task_dfl_cgroup(task);
> +
> + ret = false;
> +
> + array = rcu_access_pointer(cgrp->bpf.effective[CGROUP_DEVICE]);
> + if (array == &bpf_empty_prog_array.hdr)
> + return ret;
> +
> + mutex_lock(&cgroup_mutex);
-> cgroup_lock();
> + array = rcu_dereference_protected(cgrp->bpf.effective[CGROUP_DEVICE],
> + lockdep_is_held(&cgroup_mutex));
> + item = &array->items[0];
> + while ((prog = READ_ONCE(item->prog))) {
> + if (prog->aux->cgroup_device_guard) {
> + ret = true;
> + break;
> + }
> + item++;
> + }
> + mutex_unlock(&cgroup_mutex);
Don't we want to make this function specific to "current" task? This
allows to make locking lighter like in
__cgroup_bpf_check_dev_permission
https://github.com/torvalds/linux/blob/2ccdd1b13c591d306f0401d98dedc4bdcd02b421/kernel/bpf/cgroup.c#L1531
Here we have only RCU read lock.
AFAIK, cgroup_mutex is a heavily-contended lock.
Kind regards,
Alex
> + return ret;
> +}
> +
> /* Common helpers for cgroup hooks. */
> const struct bpf_func_proto *
> cgroup_common_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> index a2aef900519c..33ea67c702c1 100644
> --- a/kernel/bpf/syscall.c
> +++ b/kernel/bpf/syscall.c
> @@ -2564,7 +2564,8 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
> BPF_F_SLEEPABLE |
> BPF_F_TEST_RND_HI32 |
> BPF_F_XDP_HAS_FRAGS |
> - BPF_F_XDP_DEV_BOUND_ONLY))
> + BPF_F_XDP_DEV_BOUND_ONLY |
> + BPF_F_CGROUP_DEVICE_GUARD))
> return -EINVAL;
>
> if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) &&
> @@ -2651,6 +2652,8 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
> prog->aux->dev_bound = !!attr->prog_ifindex;
> prog->aux->sleepable = attr->prog_flags & BPF_F_SLEEPABLE;
> prog->aux->xdp_has_frags = attr->prog_flags & BPF_F_XDP_HAS_FRAGS;
> + prog->aux->cgroup_device_guard =
> + attr->prog_flags & BPF_F_CGROUP_DEVICE_GUARD;
>
> err = security_bpf_prog_alloc(prog->aux);
> if (err)
> diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
> index 60a9d59beeab..3be57f7957b1 100644
> --- a/tools/include/uapi/linux/bpf.h
> +++ b/tools/include/uapi/linux/bpf.h
> @@ -1165,6 +1165,11 @@ enum bpf_link_type {
> */
> #define BPF_F_XDP_DEV_BOUND_ONLY (1U << 6)
>
> +/* If BPF_F_CGROUP_DEVICE_GUARD is used in BPF_PROG_LOAD command, the loaded
> + * program will be allowed to guard device access inside user namespaces.
> + */
> +#define BPF_F_CGROUP_DEVICE_GUARD (1U << 7)
> +
> /* link_create.kprobe_multi.flags used in LINK_CREATE command for
> * BPF_TRACE_KPROBE_MULTI attach type to create return probe.
> */
>
> --
> 2.30.2
>
On Mon, Aug 14, 2023 at 04:26:09PM +0200, Michael Weiß wrote:
> Introduce the BPF_F_CGROUP_DEVICE_GUARD flag for BPF_PROG_LOAD
> which allows to set a cgroup device program to be a device guard.
Currently we block access to devices unconditionally in may_open_dev().
Anything that's mounted by an unprivileged containers will get
SB_I_NODEV set in s_i_flags.
Then we currently mediate device access in:
* inode_permission()
-> devcgroup_inode_permission()
* vfs_mknod()
-> devcgroup_inode_mknod()
* blkdev_get_by_dev() // sget()/sget_fc(), other ways to open block devices and friends
-> devcgroup_check_permission()
* drivers/gpu/drm/amd/amdkfd // weird restrictions on showing gpu info afaict
-> devcgroup_check_permission()
All your new flag does is to bypass that SB_I_NODEV check afaict and let
it proceed to the devcgroup_*() checks for the vfs layer.
But I don't get the semantics yet.
Is that a flag which is set on BPF_PROG_TYPE_CGROUP_DEVICE programs or
is that a flag on random bpf programs? It looks like it would be the
latter but design-wise I would expect this to be a property of the
device program itself.
On Tue, Aug 29, 2023 at 03:35:46PM +0200, Alexander Mikhalitsyn wrote:
> On Fri, Aug 18, 2023 at 12:11 AM Alexei Starovoitov
> <[email protected]> wrote:
> >
> > On Tue, Aug 15, 2023 at 10:59:22AM +0200, Christian Brauner wrote:
> > > On Mon, Aug 14, 2023 at 04:26:09PM +0200, Michael Weiß wrote:
> > > > Introduce the BPF_F_CGROUP_DEVICE_GUARD flag for BPF_PROG_LOAD
> > > > which allows to set a cgroup device program to be a device guard.
> > >
> > > Currently we block access to devices unconditionally in may_open_dev().
> > > Anything that's mounted by an unprivileged containers will get
> > > SB_I_NODEV set in s_i_flags.
> > >
> > > Then we currently mediate device access in:
> > >
> > > * inode_permission()
> > > -> devcgroup_inode_permission()
> > > * vfs_mknod()
> > > -> devcgroup_inode_mknod()
> > > * blkdev_get_by_dev() // sget()/sget_fc(), other ways to open block devices and friends
> > > -> devcgroup_check_permission()
> > > * drivers/gpu/drm/amd/amdkfd // weird restrictions on showing gpu info afaict
> > > -> devcgroup_check_permission()
> > >
> > > All your new flag does is to bypass that SB_I_NODEV check afaict and let
> > > it proceed to the devcgroup_*() checks for the vfs layer.
> > >
> > > But I don't get the semantics yet.
> > > Is that a flag which is set on BPF_PROG_TYPE_CGROUP_DEVICE programs or
> > > is that a flag on random bpf programs? It looks like it would be the
> > > latter but design-wise I would expect this to be a property of the
> > > device program itself.
> >
> > Looks like patch 4 attempts to bypass usual permission checks with:
> > @@ -3976,9 +3979,19 @@ int vfs_mknod(struct mnt_idmap *idmap, struct inode *dir,
> > if (error)
> > return error;
> >
> > - if ((S_ISCHR(mode) || S_ISBLK(mode)) && !is_whiteout &&
> > - !capable(CAP_MKNOD))
> > - return -EPERM;
> > + /*
> > + * In case of a device cgroup restriction allow mknod in user
> > + * namespace. Otherwise just check global capability; thus,
> > + * mknod is also disabled for user namespace other than the
> > + * initial one.
> > + */
> > + if ((S_ISCHR(mode) || S_ISBLK(mode)) && !is_whiteout) {
> > + if (devcgroup_task_is_guarded(current)) {
> > + if (!ns_capable(current_user_ns(), CAP_MKNOD))
> > + return -EPERM;
> > + } else if (!capable(CAP_MKNOD))
> > + return -EPERM;
> > + }
> >
>
> Dear colleagues,
>
> > which pretty much sounds like authoritative LSM that was brought up in the past
> > and LSM folks didn't like it.
>
> Thanks for pointing this out, Alexei!
> I've searched through the LKML archives and found a thread about this:
> https://lore.kernel.org/all/CAEf4BzaBt0W3sWh_L4RRXEFYdBotzVEnQdqC7BO+PNWtD7eSUA@mail.gmail.com/
>
> As far as I understand, disagreement here is about a practice of
> skipping kernel-built capability checks based
> on LSM hooks, right?
>
> +CC Paul Moore <[email protected]>
>
> >
> > If vfs folks are ok with this special bypass of permissions in vfs_mknod()
> > we can talk about kernel->bpf api details.
> > The way it's done with BPF_F_CGROUP_DEVICE_GUARD flag is definitely no go,
> > but no point going into bpf details now until agreement on bypass is made.
Afaiu the original concern was specifically about an LSM allowing to
bypass other LSMs or DAC permissions. But this wouldn't be the case
here. The general inode access LSM permission mediation is separate from
specific device access management: the security_inode_permission() LSM
hook would still be called and thus LSMs restrictions would continue to
apply exactly as they do now.
For cgroup v1 device access management was a cgroup controller with
management interface through files. It then was ported to an eBPF
program attachable to cgroups for cgroup v2. Arguably, it should
probably have been ported to an LSM hook or a separate LSM and untied
from cgroups completely. The confusion here seems to indicate that that
would have been the right way to go.
Because right now device access management seems its own form of
mandatory access control.