2023-06-10 16:12:24

by Yi He

[permalink] [raw]
Subject: [PATCH] Add a sysctl option to disable bpf offensive helpers.

Some eBPF helper functions have long been regarded as problematic[1].
Beyond their use in powerful rootkits, these features can also be
exploited to harm containers by performing various attacks on
processes outside the container in the entire VM, such as process
DoS, information theft, and container escape.

When a container is permitted to run eBPF tracing programs (which
need CAP_SYS_ADMIN), it can use eBPF kprobe programs to hijack
processes outside the container and to escape the container. This kind
of risk is limited, as privileged containers are warned against and can
hardly be accessed by attackers.

Even without CAP_SYS_ADMIN, since Linux 5.8, programs with CAP_BPF +
CAP_PERFMON can use dangerous eBPF helpers such as bpf_probe_read_user to
steal sensitive data (e.g., sshd/nginx private keys) in other containers.

Currently, eBPF users just enable CAP_SYS_ADMIN, which also enables the
offensive features. Since lots of eBPF tools are distributed via
containers, attackers may perform supply chain attacks to create and
spread their eBPF malware. To prevent the abuse of these helpers, we
introduce a new sysctl option (sysctl_offensive_bpf_disabled) to
confine the usage of five dangerous helpers (together with their
_str and _thread variants):
- bpf_probe_write_user
- bpf_probe_read_user
- bpf_probe_read_kernel
- bpf_send_signal
- bpf_override_return

The default value of sysctl_offensive_bpf_disabled is 0, which means
all five helpers are enabled. When sysctl_offensive_bpf_disabled is set
to 1, these helpers cannot be used until a reboot. When it is set to 2,
these helpers cannot be used, but privileged users can set this flag
back to 0.

Benign eBPF programs such as Cilium do not need these features
and can set sysctl_offensive_bpf_disabled to 1 after initialization.


[1] https://embracethered.com/blog/posts/2021/offensive-bpf/


Signed-off-by: Yi He <[email protected]>
---
include/linux/bpf.h | 2 ++
kernel/bpf/syscall.c | 33 +++++++++++++++++++++++
kernel/configs/android-recommended.config | 1 +
kernel/trace/bpf_trace.c | 21 ++++++++-------
tools/testing/selftests/bpf/config | 1 +
5 files changed, 48 insertions(+), 10 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 456f33b9d205..61c723a589f8 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2043,6 +2043,8 @@ bpf_map_alloc_percpu(const struct bpf_map *map, size_t size, size_t align,

extern int sysctl_unprivileged_bpf_disabled;

+extern int sysctl_offensive_bpf_disabled;
+
static inline bool bpf_allow_ptr_leaks(void)
{
return perfmon_capable();
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 14f39c1e573e..6b8c8ee1ea22 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -58,6 +58,9 @@ static DEFINE_SPINLOCK(link_idr_lock);
int sysctl_unprivileged_bpf_disabled __read_mostly =
IS_BUILTIN(CONFIG_BPF_UNPRIV_DEFAULT_OFF) ? 2 : 0;

+int sysctl_offensive_bpf_disabled __read_mostly =
+ IS_BUILTIN(CONFIG_BPF_OFFENSIVE_BPF_OFF) ? 2 : 0;
+
static const struct bpf_map_ops * const bpf_map_types[] = {
#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type)
#define BPF_MAP_TYPE(_id, _ops) \
@@ -5385,6 +5388,27 @@ static int bpf_unpriv_handler(struct ctl_table *table, int write,
return ret;
}

+static int bpf_offensive_handler(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ int ret, offensive_enable = *(int *)table->data;
+ bool locked_state = offensive_enable == 1;
+ struct ctl_table tmp = *table;
+
+ if (write && !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ tmp.data = &offensive_enable;
+ ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
+ if (write && !ret) {
+ if (locked_state && offensive_enable != 1)
+ return -EPERM;
+ *(int *)table->data = offensive_enable;
+ }
+
+ return ret;
+}
+
static struct ctl_table bpf_syscall_table[] = {
{
.procname = "unprivileged_bpf_disabled",
@@ -5395,6 +5419,15 @@ static struct ctl_table bpf_syscall_table[] = {
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_TWO,
},
+ {
+ .procname = "offensive_bpf_disabled",
+ .data = &sysctl_offensive_bpf_disabled,
+ .maxlen = sizeof(sysctl_offensive_bpf_disabled),
+ .mode = 0644,
+ .proc_handler = bpf_offensive_handler,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_TWO,
+ },
{
.procname = "bpf_stats_enabled",
.data = &bpf_stats_enabled_key.key,
diff --git a/kernel/configs/android-recommended.config b/kernel/configs/android-recommended.config
index e400fbbc8aba..cca75258af72 100644
--- a/kernel/configs/android-recommended.config
+++ b/kernel/configs/android-recommended.config
@@ -1,5 +1,6 @@
# KEEP ALPHABETICALLY SORTED
# CONFIG_BPF_UNPRIV_DEFAULT_OFF is not set
+# CONFIG_BPF_OFFENSIVE_BPF_OFF is not set
# CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set
# CONFIG_INPUT_MOUSE is not set
# CONFIG_LEGACY_PTYS is not set
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 8deb22a99abe..5bdd0bee3e45 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -1432,17 +1432,18 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
case BPF_FUNC_get_prandom_u32:
return &bpf_get_prandom_u32_proto;
case BPF_FUNC_probe_write_user:
- return security_locked_down(LOCKDOWN_BPF_WRITE_USER) < 0 ?
- NULL : bpf_get_probe_write_proto();
+ return (security_locked_down(LOCKDOWN_BPF_WRITE_USER) < 0 ||
+ sysctl_offensive_bpf_disabled) ? NULL : bpf_get_probe_write_proto();
case BPF_FUNC_probe_read_user:
- return &bpf_probe_read_user_proto;
+ return sysctl_offensive_bpf_disabled ? NULL : &bpf_probe_read_user_proto;
case BPF_FUNC_probe_read_kernel:
- return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ?
- NULL : &bpf_probe_read_kernel_proto;
+ return (security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ||
+ sysctl_offensive_bpf_disabled) ? NULL : &bpf_probe_read_kernel_proto;
case BPF_FUNC_probe_read_user_str:
- return &bpf_probe_read_user_str_proto;
+ return sysctl_offensive_bpf_disabled ? NULL : &bpf_probe_read_user_str_proto;
case BPF_FUNC_probe_read_kernel_str:
- return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ?
+ return (security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ||
+ sysctl_offensive_bpf_disabled) ?
NULL : &bpf_probe_read_kernel_str_proto;
#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
case BPF_FUNC_probe_read:
@@ -1459,9 +1460,9 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_cgrp_storage_delete_proto;
#endif
case BPF_FUNC_send_signal:
- return &bpf_send_signal_proto;
+ return sysctl_offensive_bpf_disabled ? NULL : &bpf_send_signal_proto;
case BPF_FUNC_send_signal_thread:
- return &bpf_send_signal_thread_proto;
+ return sysctl_offensive_bpf_disabled ? NULL : &bpf_send_signal_thread_proto;
case BPF_FUNC_perf_event_read_value:
return &bpf_perf_event_read_value_proto;
case BPF_FUNC_get_ns_current_pid_tgid:
@@ -1527,7 +1528,7 @@ kprobe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_get_stack_proto;
#ifdef CONFIG_BPF_KPROBE_OVERRIDE
case BPF_FUNC_override_return:
- return &bpf_override_return_proto;
+ return sysctl_offensive_bpf_disabled ? NULL : &bpf_override_return_proto;
#endif
case BPF_FUNC_get_func_ip:
return prog->expected_attach_type == BPF_TRACE_KPROBE_MULTI ?
diff --git a/tools/testing/selftests/bpf/config b/tools/testing/selftests/bpf/config
index 63cd4ab70171..1a15d7451f19 100644
--- a/tools/testing/selftests/bpf/config
+++ b/tools/testing/selftests/bpf/config
@@ -9,6 +9,7 @@ CONFIG_BPF_LSM=y
CONFIG_BPF_STREAM_PARSER=y
CONFIG_BPF_SYSCALL=y
# CONFIG_BPF_UNPRIV_DEFAULT_OFF is not set
+# CONFIG_BPF_OFFENSIVE_BPF_OFF is not set
CONFIG_CGROUP_BPF=y
CONFIG_CRYPTO_HMAC=y
CONFIG_CRYPTO_SHA256=y
--
2.34.1



2023-06-12 04:27:59

by Christoph Hellwig

[permalink] [raw]
Subject: Re: [PATCH] Add a sysctl option to disable bpf offensive helpers.

On Sat, Jun 10, 2023 at 03:26:18PM +0000, Yi He wrote:
> The default value of sysctl_offensive_bpf_disabled is 0, which means
> all the five helpers are enabled. By setting sysctl_offensive_bpf_disabled
> to 1, these helpers cannot be used util a reboot. By setting it to 2,
> these helpers cannot be used but privieleged users can modify this flag
> to 0.

That's just a nightmare API. The right thing is to not allow
program types that can use the helpers from anything but a global
fully privileged context.

And "offensive" is in this context a really weird term. Nothing is
offensive here; "invasive" or "allowing to change kernel state" might be
better terms.

2023-06-12 15:28:47

by KP Singh

[permalink] [raw]
Subject: Re: [PATCH] Add a sysctl option to disable bpf offensive helpers.

On Sat, Jun 10, 2023 at 5:46 PM Yi He <[email protected]> wrote:
>
> Some eBPF helper functions have been long regarded as problematic[1].
> More than just used for powerful rootkit, these features can also be
> exploited to harm the containers by perform various attacks to the
> processes outside the container in the enrtire VM, such as process
> DoS, information theft, and container escape.
>
> When a container is granted to run eBPF tracing programs (which
> need CAP_SYS_ADMIN), it can use the eBPF KProbe programs to hijack the
> process outside the contianer and to escape the containers. This kind
> of risks is limited as privieleged containers are warned and can hardly
> be accessed by the attackers.
>
> Even without CAP_SYS_ADMIN, since Linux 5.6, programs with CAP_BPF +
> CAP_PERFMON can use dangerous eBPF helpers such as bpf_read_user to steal
> sensitive data (e.g., sshd/nginx private key) in other containers.
>
> Currently, eBPF users just enable CAP_SYS_ADMIN and also enable the
> offensive features. Since lots of eBPF tools are distributed via
> containers, attackers may perform supply chain attacks to create and

I don't understand "supply chain" here.

> spread their eBPF malware, To prevent the abuse of these helpers, we

Are you saying attackers will provide BPF programs that will be loaded
in privileged contexts (e.g. privileged containers)? Please understand
that this threat model does not hold well. Even without these helpers
a CAP_BPF + CAP_PERFMON container is a part of your trusted compute
base and needs to run trusted code.

> introduce a new sysctl option (sysctl_offensive_bpf_disabled) to
> cofine the usages of five dangerous helpers:
> - bpf_probe_write_user
> - bpf_probe_read_user
> - bpf_probe_read_kernel
> - bpf_send_signal
> - bpf_override_return
>
> The default value of sysctl_offensive_bpf_disabled is 0, which means
> all the five helpers are enabled. By setting sysctl_offensive_bpf_disabled
> to 1, these helpers cannot be used util a reboot. By setting it to 2,
> these helpers cannot be used but privieleged users can modify this flag
> to 0.
>
> For benign eBPF programs such as Cillium, they do not need these features
> and can set the sysctl_offensive_bpf_disabled to 1 after initialization.

Again, a container running Cilium needs to only run trusted code.
What's the threat model here? There are components in the cilium
container that are attacker controlled?

>
>
> [1] https://embracethered.com/blog/posts/2021/offensive-bpf/
>
>
> Signed-off-by: Yi He <[email protected]>
> ---
> include/linux/bpf.h | 2 ++
> kernel/bpf/syscall.c | 33 +++++++++++++++++++++++
> kernel/configs/android-recommended.config | 1 +
> kernel/trace/bpf_trace.c | 21 ++++++++-------
> tools/testing/selftests/bpf/config | 1 +
> 5 files changed, 48 insertions(+), 10 deletions(-)
>
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index 456f33b9d205..61c723a589f8 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -2043,6 +2043,8 @@ bpf_map_alloc_percpu(const struct bpf_map *map, size_t size, size_t align,
>
> extern int sysctl_unprivileged_bpf_disabled;
>
> +extern int sysctl_offensive_bpf_disabled;
> +
> static inline bool bpf_allow_ptr_leaks(void)
> {
> return perfmon_capable();
> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> index 14f39c1e573e..6b8c8ee1ea22 100644
> --- a/kernel/bpf/syscall.c
> +++ b/kernel/bpf/syscall.c
> @@ -58,6 +58,9 @@ static DEFINE_SPINLOCK(link_idr_lock);
> int sysctl_unprivileged_bpf_disabled __read_mostly =
> IS_BUILTIN(CONFIG_BPF_UNPRIV_DEFAULT_OFF) ? 2 : 0;
>
> +int sysctl_offensive_bpf_disabled __read_mostly =
> + IS_BUILTIN(CONFIG_BPF_OFFENSIVE_BPF_OFF) ? 2 : 0;
> +
> static const struct bpf_map_ops * const bpf_map_types[] = {
> #define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type)
> #define BPF_MAP_TYPE(_id, _ops) \
> @@ -5385,6 +5388,27 @@ static int bpf_unpriv_handler(struct ctl_table *table, int write,
> return ret;
> }
>
> +static int bpf_offensive_handler(struct ctl_table *table, int write,
> + void *buffer, size_t *lenp, loff_t *ppos)
> +{
> + int ret, offensive_enable = *(int *)table->data;
> + bool locked_state = offensive_enable == 1;
> + struct ctl_table tmp = *table;
> +
> + if (write && !capable(CAP_SYS_ADMIN))
> + return -EPERM;
> +
> + tmp.data = &offensive_enable;
> + ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
> + if (write && !ret) {
> + if (locked_state && offensive_enable != 1)
> + return -EPERM;
> + *(int *)table->data = offensive_enable;
> + }
> +
> + return ret;
> +}
> +
> static struct ctl_table bpf_syscall_table[] = {
> {
> .procname = "unprivileged_bpf_disabled",
> @@ -5395,6 +5419,15 @@ static struct ctl_table bpf_syscall_table[] = {
> .extra1 = SYSCTL_ZERO,
> .extra2 = SYSCTL_TWO,
> },
> + {
> + .procname = "offensive_bpf_disabled",
> + .data = &sysctl_offensive_bpf_disabled,
> + .maxlen = sizeof(sysctl_offensive_bpf_disabled),
> + .mode = 0644,
> + .proc_handler = bpf_offensive_handler,
> + .extra1 = SYSCTL_ZERO,
> + .extra2 = SYSCTL_TWO,
> + },
> {
> .procname = "bpf_stats_enabled",
> .data = &bpf_stats_enabled_key.key,
> diff --git a/kernel/configs/android-recommended.config b/kernel/configs/android-recommended.config
> index e400fbbc8aba..cca75258af72 100644
> --- a/kernel/configs/android-recommended.config
> +++ b/kernel/configs/android-recommended.config
> @@ -1,5 +1,6 @@
> # KEEP ALPHABETICALLY SORTED
> # CONFIG_BPF_UNPRIV_DEFAULT_OFF is not set
> +# CONFIG_BPF_OFFENSIVE_BPF_OFF is not set
> # CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set
> # CONFIG_INPUT_MOUSE is not set
> # CONFIG_LEGACY_PTYS is not set
> diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
> index 8deb22a99abe..5bdd0bee3e45 100644
> --- a/kernel/trace/bpf_trace.c
> +++ b/kernel/trace/bpf_trace.c
> @@ -1432,17 +1432,18 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
> case BPF_FUNC_get_prandom_u32:
> return &bpf_get_prandom_u32_proto;
> case BPF_FUNC_probe_write_user:
> - return security_locked_down(LOCKDOWN_BPF_WRITE_USER) < 0 ?
> - NULL : bpf_get_probe_write_proto();
> + return (security_locked_down(LOCKDOWN_BPF_WRITE_USER) < 0 ||
> + sysctl_offensive_bpf_disabled) ? NULL : bpf_get_probe_write_proto();
> case BPF_FUNC_probe_read_user:
> - return &bpf_probe_read_user_proto;
> + return sysctl_offensive_bpf_disabled ? NULL : &bpf_probe_read_user_proto;
> case BPF_FUNC_probe_read_kernel:
> - return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ?
> - NULL : &bpf_probe_read_kernel_proto;
> + return (security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ||
> + sysctl_offensive_bpf_disabled) ? NULL : &bpf_probe_read_kernel_proto;
> case BPF_FUNC_probe_read_user_str:
> - return &bpf_probe_read_user_str_proto;
> + return sysctl_offensive_bpf_disabled ? NULL : &bpf_probe_read_user_str_proto;
> case BPF_FUNC_probe_read_kernel_str:
> - return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ?
> + return (security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ||
> + sysctl_offensive_bpf_disabled) ?
> NULL : &bpf_probe_read_kernel_str_proto;
> #ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
> case BPF_FUNC_probe_read:
> @@ -1459,9 +1460,9 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
> return &bpf_cgrp_storage_delete_proto;
> #endif
> case BPF_FUNC_send_signal:
> - return &bpf_send_signal_proto;
> + return sysctl_offensive_bpf_disabled ? NULL : &bpf_send_signal_proto;
> case BPF_FUNC_send_signal_thread:
> - return &bpf_send_signal_thread_proto;
> + return sysctl_offensive_bpf_disabled ? NULL : &bpf_send_signal_thread_proto;
> case BPF_FUNC_perf_event_read_value:
> return &bpf_perf_event_read_value_proto;
> case BPF_FUNC_get_ns_current_pid_tgid:
> @@ -1527,7 +1528,7 @@ kprobe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
> return &bpf_get_stack_proto;
> #ifdef CONFIG_BPF_KPROBE_OVERRIDE
> case BPF_FUNC_override_return:
> - return &bpf_override_return_proto;
> + return sysctl_offensive_bpf_disabled ? NULL : &bpf_override_return_proto;
> #endif
> case BPF_FUNC_get_func_ip:
> return prog->expected_attach_type == BPF_TRACE_KPROBE_MULTI ?
> diff --git a/tools/testing/selftests/bpf/config b/tools/testing/selftests/bpf/config
> index 63cd4ab70171..1a15d7451f19 100644
> --- a/tools/testing/selftests/bpf/config
> +++ b/tools/testing/selftests/bpf/config
> @@ -9,6 +9,7 @@ CONFIG_BPF_LSM=y
> CONFIG_BPF_STREAM_PARSER=y
> CONFIG_BPF_SYSCALL=y
> # CONFIG_BPF_UNPRIV_DEFAULT_OFF is not set
> +# CONFIG_BPF_OFFENSIVE_BPF_OFF is not set
> CONFIG_CGROUP_BPF=y
> CONFIG_CRYPTO_HMAC=y
> CONFIG_CRYPTO_SHA256=y
> --
> 2.34.1
>